From 2fe1c59347369fa856ae259e2fac3c8c8dd9d335 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Tue, 3 Jun 2025 23:43:07 +0800 Subject: bpf: Add cookie to raw_tp bpf_link_info After commit 68ca5d4eebb8 ("bpf: support BPF cookie in raw tracepoint (raw_tp, tp_btf) programs"), we can show the cookie in bpf_link_info, as is already done for kprobe and other link types. Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20250603154309.3063644-1-chen.dylane@linux.dev --- include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 85180e4aaa5a..f1160ebbf526 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6651,6 +6651,8 @@ struct bpf_link_info { struct { __aligned_u64 tp_name; /* in/out: tp_name buffer ptr */ __u32 tp_name_len; /* in/out: tp_name buffer len */ + __u32 :32; + __u64 cookie; } raw_tracepoint; struct { __u32 attach_type; -- cgit v1.2.3 From 1209339844601ec1766f4ff430673fbcfe42bb51 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 6 Jun 2025 09:31:41 -0700 Subject: bpf: Implement mprog API on top of existing cgroup progs Currently, cgroup progs are simply appended at attachment time. This is not ideal: in some cases, users want a specific ordering at a particular cgroup level. To address this, the existing mprog API seems an ideal solution, since it supports the BPF_F_BEFORE and BPF_F_AFTER flags. But there are a few obstacles to using the kernel mprog interface directly. Cgroup bpf progs already support prog attach/detach/replace and link-based attach/detach/replace. For example, in struct bpf_prog_array_item, the cgroup_storage field needs to live alongside the bpf prog, but the mprog API struct bpf_mprog_fp only has the bpf_prog as a member, which makes it difficult to use the kernel mprog interface. In another case, the current cgroup prog detach tries to use the same flag as in attach, which differs from the kernel mprog interface, where the flags are passed in from user space. So, to avoid modifying existing behavior, I made the following changes to support the mprog API for cgroup progs: - The support is for the prog list at a single cgroup level. A cross-level prog list (a.k.a. the effective prog list) is not supported. - Previously, BPF_F_PREORDER was supported only for prog attach; now it is also supported for link-based attach. - For attach, BPF_F_BEFORE/BPF_F_AFTER/BPF_F_ID/BPF_F_LINK are supported similarly to kernel mprog, but with a different implementation. - For detach and replace, the existing implementation is reused. - For attach, detach and replace, the revision for a particular prog list, associated with a particular attach type, is bumped by 1.
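To make the new UAPI concrete, a minimal sketch of attaching a cgroup prog before an existing anchor prog via the raw bpf(2) syscall follows. This is illustrative only and not part of the patch; it assumes uapi headers carrying the new link_create.cgroup fields, and the fds and attach type are placeholders.

/*
 * Illustrative sketch (not from the patch): insert prog_fd before
 * anchor_prog_fd in the prog list of cgroup_fd, optionally guarded
 * by the list revision.
 */
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int cgroup_link_attach_before(int cgroup_fd, int prog_fd,
				     int anchor_prog_fd, __u64 revision)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.link_create.prog_fd = prog_fd;
	attr.link_create.target_fd = cgroup_fd;
	attr.link_create.attach_type = BPF_CGROUP_INET_INGRESS;
	attr.link_create.flags = BPF_F_BEFORE;
	/* the anchor is a prog fd here; BPF_F_ID + relative_id names it by id */
	attr.link_create.cgroup.relative_fd = anchor_prog_fd;
	/* fails with -ESTALE if the prog list changed; 0 skips the check */
	attr.link_create.cgroup.expected_revision = revision;

	return syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
}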
Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250606163141.2428937-1-yonghong.song@linux.dev --- include/uapi/linux/bpf.h | 7 ++ kernel/bpf/cgroup.c | 182 ++++++++++++++++++++++++++++++++++++----- kernel/bpf/syscall.c | 46 +++++++---- tools/include/uapi/linux/bpf.h | 7 ++ 4 files changed, 205 insertions(+), 37 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f1160ebbf526..25e9cf92ffaf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1794,6 +1794,13 @@ union bpf_attr { }; __u64 expected_revision; } netkit; + struct { + union { + __u32 relative_fd; + __u32 relative_id; + }; + __u64 expected_revision; + } cgroup; }; } link_create; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9122c39870bf..ffbafbef5010 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -658,6 +658,116 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs, return NULL; } +static struct bpf_link *bpf_get_anchor_link(u32 flags, u32 id_or_fd) +{ + struct bpf_link *link = ERR_PTR(-EINVAL); + + if (flags & BPF_F_ID) + link = bpf_link_by_id(id_or_fd); + else if (id_or_fd) + link = bpf_link_get_from_fd(id_or_fd); + return link; +} + +static struct bpf_prog *bpf_get_anchor_prog(u32 flags, u32 id_or_fd) +{ + struct bpf_prog *prog = ERR_PTR(-EINVAL); + + if (flags & BPF_F_ID) + prog = bpf_prog_by_id(id_or_fd); + else if (id_or_fd) + prog = bpf_prog_get(id_or_fd); + return prog; +} + +static struct bpf_prog_list *get_prog_list(struct hlist_head *progs, struct bpf_prog *prog, + struct bpf_cgroup_link *link, u32 flags, u32 id_or_fd) +{ + bool is_link = flags & BPF_F_LINK, is_id = flags & BPF_F_ID; + struct bpf_prog_list *pltmp, *pl = ERR_PTR(-EINVAL); + bool preorder = flags & BPF_F_PREORDER; + struct bpf_link *anchor_link = NULL; + struct bpf_prog *anchor_prog = NULL; + bool is_before, is_after; + + is_before = flags & BPF_F_BEFORE; + is_after = flags & BPF_F_AFTER; + if (is_link || is_id || id_or_fd) { + /* flags must have either BPF_F_BEFORE or BPF_F_AFTER */ + if (is_before == is_after) + return ERR_PTR(-EINVAL); + if ((is_link && !link) || (!is_link && !prog)) + return ERR_PTR(-EINVAL); + } else if (!hlist_empty(progs)) { + /* flags cannot have both BPF_F_BEFORE and BPF_F_AFTER */ + if (is_before && is_after) + return ERR_PTR(-EINVAL); + } + + if (is_link) { + anchor_link = bpf_get_anchor_link(flags, id_or_fd); + if (IS_ERR(anchor_link)) + return ERR_PTR(PTR_ERR(anchor_link)); + } else if (is_id || id_or_fd) { + anchor_prog = bpf_get_anchor_prog(flags, id_or_fd); + if (IS_ERR(anchor_prog)) + return ERR_PTR(PTR_ERR(anchor_prog)); + } + + if (!anchor_prog && !anchor_link) { + /* if there is no anchor_prog/anchor_link, then BPF_F_PREORDER + * doesn't matter since either prepend or append to a combined + * list of progs will end up with correct result. 
+ */ + hlist_for_each_entry(pltmp, progs, node) { + if (is_before) + return pltmp; + if (pltmp->node.next) + continue; + return pltmp; + } + return NULL; + } + + hlist_for_each_entry(pltmp, progs, node) { + if ((anchor_prog && anchor_prog == pltmp->prog) || + (anchor_link && anchor_link == &pltmp->link->link)) { + if (!!(pltmp->flags & BPF_F_PREORDER) != preorder) + goto out; + pl = pltmp; + goto out; + } + } + + pl = ERR_PTR(-ENOENT); +out: + if (anchor_link) + bpf_link_put(anchor_link); + else + bpf_prog_put(anchor_prog); + return pl; +} + +static int insert_pl_to_hlist(struct bpf_prog_list *pl, struct hlist_head *progs, + struct bpf_prog *prog, struct bpf_cgroup_link *link, + u32 flags, u32 id_or_fd) +{ + struct bpf_prog_list *pltmp; + + pltmp = get_prog_list(progs, prog, link, flags, id_or_fd); + if (IS_ERR(pltmp)) + return PTR_ERR(pltmp); + + if (!pltmp) + hlist_add_head(&pl->node, progs); + else if (flags & BPF_F_BEFORE) + hlist_add_before(&pl->node, &pltmp->node); + else + hlist_add_behind(&pl->node, &pltmp->node); + + return 0; +} + /** * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and * propagate the change to descendants @@ -667,6 +777,8 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs, * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set * @type: Type of attach operation * @flags: Option flags + * @id_or_fd: Relative prog id or fd + * @revision: bpf_prog_list revision * * Exactly one of @prog or @link can be non-null. * Must be called with cgroup_mutex held. @@ -674,7 +786,8 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs, static int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog *replace_prog, struct bpf_cgroup_link *link, - enum bpf_attach_type type, u32 flags) + enum bpf_attach_type type, u32 flags, u32 id_or_fd, + u64 revision) { u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); struct bpf_prog *old_prog = NULL; @@ -690,6 +803,9 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI))) /* invalid combination */ return -EINVAL; + if ((flags & BPF_F_REPLACE) && (flags & (BPF_F_BEFORE | BPF_F_AFTER))) + /* only either replace or insertion with before/after */ + return -EINVAL; if (link && (prog || replace_prog)) /* only either link or prog/replace_prog can be specified */ return -EINVAL; @@ -700,6 +816,8 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; + if (revision && revision != cgrp->bpf.revisions[atype]) + return -ESTALE; progs = &cgrp->bpf.progs[atype]; @@ -728,22 +846,18 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, if (pl) { old_prog = pl->prog; } else { - struct hlist_node *last = NULL; - pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { bpf_cgroup_storages_free(new_storage); return -ENOMEM; } - if (hlist_empty(progs)) - hlist_add_head(&pl->node, progs); - else - hlist_for_each(last, progs) { - if (last->next) - continue; - hlist_add_behind(&pl->node, last); - break; - } + + err = insert_pl_to_hlist(pl, progs, prog, link, flags, id_or_fd); + if (err) { + kfree(pl); + bpf_cgroup_storages_free(new_storage); + return err; + } } pl->prog = prog; @@ -762,6 +876,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, if (err) goto cleanup_trampoline; + cgrp->bpf.revisions[atype] += 1; if (old_prog) { if (type == BPF_LSM_CGROUP) 
bpf_trampoline_unlink_cgroup_shim(old_prog); @@ -793,12 +908,13 @@ static int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog *replace_prog, struct bpf_cgroup_link *link, enum bpf_attach_type type, - u32 flags) + u32 flags, u32 id_or_fd, u64 revision) { int ret; cgroup_lock(); - ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags); + ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags, + id_or_fd, revision); cgroup_unlock(); return ret; } @@ -886,6 +1002,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, if (!found) return -ENOENT; + cgrp->bpf.revisions[atype] += 1; old_prog = xchg(&link->link.prog, new_prog); replace_effective_prog(cgrp, atype, link); bpf_prog_put(old_prog); @@ -1011,12 +1128,14 @@ found: * @prog: A program to detach or NULL * @link: A link to detach or NULL * @type: Type of detach operation + * @revision: bpf_prog_list revision * * At most one of @prog or @link can be non-NULL. * Must be called with cgroup_mutex held. */ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - struct bpf_cgroup_link *link, enum bpf_attach_type type) + struct bpf_cgroup_link *link, enum bpf_attach_type type, + u64 revision) { enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; @@ -1034,6 +1153,9 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, if (atype < 0) return -EINVAL; + if (revision && revision != cgrp->bpf.revisions[atype]) + return -ESTALE; + progs = &cgrp->bpf.progs[atype]; flags = cgrp->bpf.flags[atype]; @@ -1059,6 +1181,7 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, /* now can actually delete it from this cgroup list */ hlist_del(&pl->node); + cgrp->bpf.revisions[atype] += 1; kfree(pl); if (hlist_empty(progs)) @@ -1074,12 +1197,12 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, } static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type) + enum bpf_attach_type type, u64 revision) { int ret; cgroup_lock(); - ret = __cgroup_bpf_detach(cgrp, prog, NULL, type); + ret = __cgroup_bpf_detach(cgrp, prog, NULL, type, revision); cgroup_unlock(); return ret; } @@ -1097,6 +1220,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, struct bpf_prog_array *effective; int cnt, ret = 0, i; int total_cnt = 0; + u64 revision = 0; u32 flags; if (effective_query && prog_attach_flags) @@ -1134,6 +1258,10 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, return -EFAULT; if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt))) return -EFAULT; + if (!effective_query && from_atype == to_atype) + revision = cgrp->bpf.revisions[from_atype]; + if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision))) + return -EFAULT; if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt) /* return early if user requested only program count + flags */ return 0; @@ -1216,7 +1344,8 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr, } ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL, - attr->attach_type, attr->attach_flags); + attr->attach_type, attr->attach_flags, + attr->relative_fd, attr->expected_revision); if (replace_prog) bpf_prog_put(replace_prog); @@ -1238,7 +1367,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) if (IS_ERR(prog)) prog = NULL; - ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type); + ret = cgroup_bpf_detach(cgrp, prog, 
attr->attach_type, attr->expected_revision); if (prog) bpf_prog_put(prog); @@ -1267,7 +1396,7 @@ static void bpf_cgroup_link_release(struct bpf_link *link) } WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, - cg_link->type)); + cg_link->type, 0)); if (cg_link->type == BPF_LSM_CGROUP) bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog); @@ -1339,6 +1468,13 @@ static const struct bpf_link_ops bpf_cgroup_link_lops = { .fill_link_info = bpf_cgroup_link_fill_link_info, }; +#define BPF_F_LINK_ATTACH_MASK \ + (BPF_F_ID | \ + BPF_F_BEFORE | \ + BPF_F_AFTER | \ + BPF_F_PREORDER | \ + BPF_F_LINK) + int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_link_primer link_primer; @@ -1346,7 +1482,7 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) struct cgroup *cgrp; int err; - if (attr->link_create.flags) + if (attr->link_create.flags & (~BPF_F_LINK_ATTACH_MASK)) return -EINVAL; cgrp = cgroup_get_from_fd(attr->link_create.target_fd); @@ -1370,7 +1506,9 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) } err = cgroup_bpf_attach(cgrp, NULL, NULL, link, - link->type, BPF_F_ALLOW_MULTI); + link->type, BPF_F_ALLOW_MULTI | attr->link_create.flags, + attr->link_create.cgroup.relative_fd, + attr->link_create.cgroup.expected_revision); if (err) { bpf_link_cleanup(&link_primer); goto out_put_cgroup; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 89d027cd7ca0..97ad57ffc404 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4186,6 +4186,25 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, } } +static bool is_cgroup_prog_type(enum bpf_prog_type ptype, enum bpf_attach_type atype, + bool check_atype) +{ + switch (ptype) { + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: + return true; + case BPF_PROG_TYPE_LSM: + return check_atype ? 
atype == BPF_LSM_CGROUP : true; + default: + return false; + } +} + #define BPF_PROG_ATTACH_LAST_FIELD expected_revision #define BPF_F_ATTACH_MASK_BASE \ @@ -4216,6 +4235,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (bpf_mprog_supported(ptype)) { if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG) return -EINVAL; + } else if (is_cgroup_prog_type(ptype, 0, false)) { + if (attr->attach_flags & ~(BPF_F_ATTACH_MASK_BASE | BPF_F_ATTACH_MASK_MPROG)) + return -EINVAL; } else { if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE) return -EINVAL; @@ -4233,6 +4255,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) return -EINVAL; } + if (is_cgroup_prog_type(ptype, prog->expected_attach_type, true)) { + ret = cgroup_bpf_prog_attach(attr, ptype, prog); + goto out; + } + switch (ptype) { case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: @@ -4244,20 +4271,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_PROG_TYPE_FLOW_DISSECTOR: ret = netns_bpf_prog_attach(attr, prog); break; - case BPF_PROG_TYPE_CGROUP_DEVICE: - case BPF_PROG_TYPE_CGROUP_SKB: - case BPF_PROG_TYPE_CGROUP_SOCK: - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: - case BPF_PROG_TYPE_CGROUP_SOCKOPT: - case BPF_PROG_TYPE_CGROUP_SYSCTL: - case BPF_PROG_TYPE_SOCK_OPS: - case BPF_PROG_TYPE_LSM: - if (ptype == BPF_PROG_TYPE_LSM && - prog->expected_attach_type != BPF_LSM_CGROUP) - ret = -EINVAL; - else - ret = cgroup_bpf_prog_attach(attr, ptype, prog); - break; case BPF_PROG_TYPE_SCHED_CLS: if (attr->attach_type == BPF_TCX_INGRESS || attr->attach_type == BPF_TCX_EGRESS) @@ -4268,7 +4281,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) default: ret = -EINVAL; } - +out: if (ret) bpf_prog_put(prog); return ret; @@ -4296,6 +4309,9 @@ static int bpf_prog_detach(const union bpf_attr *attr) if (IS_ERR(prog)) return PTR_ERR(prog); } + } else if (is_cgroup_prog_type(ptype, 0, false)) { + if (attr->attach_flags || attr->relative_fd) + return -EINVAL; } else if (attr->attach_flags || attr->relative_fd || attr->expected_revision) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f1160ebbf526..25e9cf92ffaf 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1794,6 +1794,13 @@ union bpf_attr { }; __u64 expected_revision; } netkit; + struct { + union { + __u32 relative_fd; + __u32 relative_id; + }; + __u64 expected_revision; + } cgroup; }; } link_create; -- cgit v1.2.3 From c7beb48344d2ea0f3f1869b078309dbeb2ed4c96 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Sat, 7 Jun 2025 00:58:14 +0800 Subject: bpf: Add cookie to tracing bpf_link_info bpf_tramp_link includes cookie info, we can add it in bpf_link_info. 
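As a usage illustration (not part of the patch), the cookie can be read back through BPF_OBJ_GET_INFO_BY_FD; a hedged sketch, assuming headers that already carry the new tracing.cookie field and a valid tracing link fd:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int get_tracing_cookie(int link_fd, __u64 *cookie)
{
	struct bpf_link_info info;
	union bpf_attr attr;

	memset(&info, 0, sizeof(info));
	memset(&attr, 0, sizeof(attr));
	attr.info.bpf_fd = link_fd;
	attr.info.info_len = sizeof(info);
	attr.info.info = (__u64)(unsigned long)&info;

	if (syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)))
		return -1;

	*cookie = info.tracing.cookie;	/* new field added by this patch */
	return 0;
}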
Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250606165818.3394397-1-chen.dylane@linux.dev --- include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 1 + tools/include/uapi/linux/bpf.h | 2 ++ 3 files changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 25e9cf92ffaf..194ed9891b40 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6665,6 +6665,8 @@ struct bpf_link_info { __u32 attach_type; __u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */ __u32 target_btf_id; /* BTF type id inside the object */ + __u32 :32; + __u64 cookie; } tracing; struct { __u64 cgroup_id; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0c267f37775b..85e080c3333d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3416,6 +3416,7 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, container_of(link, struct bpf_tracing_link, link.link); info->tracing.attach_type = tr_link->attach_type; + info->tracing.cookie = tr_link->link.cookie; bpf_trampoline_unpack_key(tr_link->trampoline->key, &info->tracing.target_obj_id, &info->tracing.target_btf_id); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 25e9cf92ffaf..194ed9891b40 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6665,6 +6665,8 @@ struct bpf_link_info { __u32 attach_type; __u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */ __u32 target_btf_id; /* BTF type id inside the object */ + __u32 :32; + __u64 cookie; } tracing; struct { __u64 cgroup_id; -- cgit v1.2.3 From 2d72dd14d77f31a7caa619fe0b889304844e612e Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 10 Jun 2025 16:07:56 +0200 Subject: bpf: adjust path to trace_output sample eBPF program The sample file was renamed from trace_output_kern.c to trace_output.bpf.c in commit d4fffba4d04b ("samples/bpf: Change _kern suffix to .bpf with syscall tracing program"). Adjust the path in the documentation comment for bpf_perf_event_output. Signed-off-by: Tobias Klauser Link: https://lore.kernel.org/r/20250610140756.16332-1-tklauser@distanz.ch Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 194ed9891b40..39e7818cca80 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2409,7 +2409,7 @@ union bpf_attr { * into it. An example is available in file * *samples/bpf/trace_output_user.c* in the Linux kernel source * tree (the eBPF program counterpart is in - * *samples/bpf/trace_output_kern.c*). + * *samples/bpf/trace_output.bpf.c*). * * **bpf_perf_event_output**\ () achieves better performance * than **bpf_trace_printk**\ () for sharing data with user diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 194ed9891b40..39e7818cca80 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2409,7 +2409,7 @@ union bpf_attr { * into it. An example is available in file * *samples/bpf/trace_output_user.c* in the Linux kernel source * tree (the eBPF program counterpart is in - * *samples/bpf/trace_output_kern.c*). + * *samples/bpf/trace_output.bpf.c*). 
* * **bpf_perf_event_output**\ () achieves better performance * than **bpf_trace_printk**\ () for sharing data with user -- cgit v1.2.3 From 31557b3487b349464daf42bc4366153743c1e727 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Jun 2025 07:39:33 -0700 Subject: uapi: in6: restore visibility of most IPv6 socket options MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A decade ago commit 6d08acd2d32e ("in6: fix conflict with glibc") hid the definitions of IPV6 options, because GCC was complaining about duplicates. The commit did not list the warnings seen, but trying to recreate them now I think they are (building iproute2): In file included from ./include/uapi/rdma/rdma_user_cm.h:39, from rdma.h:16, from res.h:9, from res-ctx.c:7: ../include/uapi/linux/in6.h:171:9: warning: ‘IPV6_ADD_MEMBERSHIP’ redefined 171 | #define IPV6_ADD_MEMBERSHIP 20 | ^~~~~~~~~~~~~~~~~~~ In file included from /usr/include/netinet/in.h:37, from rdma.h:13: /usr/include/bits/in.h:233:10: note: this is the location of the previous definition 233 | # define IPV6_ADD_MEMBERSHIP IPV6_JOIN_GROUP | ^~~~~~~~~~~~~~~~~~~ ../include/uapi/linux/in6.h:172:9: warning: ‘IPV6_DROP_MEMBERSHIP’ redefined 172 | #define IPV6_DROP_MEMBERSHIP 21 | ^~~~~~~~~~~~~~~~~~~~ /usr/include/bits/in.h:234:10: note: this is the location of the previous definition 234 | # define IPV6_DROP_MEMBERSHIP IPV6_LEAVE_GROUP | ^~~~~~~~~~~~~~~~~~~~ Compilers don't complain about redefinition if the defines are identical, but here we have the kernel using the literal value, and glibc using an indirection (defining to a name of another define, with the same numerical value). Problem is, the commit in question hid all the IPV6 socket options, and glibc has a pretty sparse list. For instance it lacks Flow Label related options. Willem called this out in commit 3fb321fde22d ("selftests/net: ipv6 flowlabel"): /* uapi/glibc weirdness may leave this undefined */ #ifndef IPV6_FLOWINFO #define IPV6_FLOWINFO 11 #endif More interestingly some applications (socat) use a #ifdef IPV6_FLOWINFO to gate compilation of their rudimentary flow label support. (For added confusion socat misspells it as IPV4_FLOWINFO in some places.) Hide only the two defines we know glibc has a problem with. If we discover more warnings we can hide more but we should avoid covering the entire block of defines for "IPV6 socket options".
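For reference, the complete fallback pattern quoted above might look as follows in application code (illustrative; the numeric value mirrors the uapi header):

#include <netinet/in.h>	/* glibc definitions first */

/* uapi/glibc weirdness may leave this undefined */
#ifndef IPV6_FLOWINFO
#define IPV6_FLOWINFO 11
#endif

/* e.g. int on = 1; setsockopt(fd, SOL_IPV6, IPV6_FLOWINFO, &on, sizeof(on)); */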
Link: https://patch.msgid.link/20250609143933.1654417-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/in6.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index ff8d21f9e95b..5a47339ef7d7 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -152,7 +152,6 @@ struct in6_flowlabel_req { /* * IPV6 socket options */ -#if __UAPI_DEF_IPV6_OPTIONS #define IPV6_ADDRFORM 1 #define IPV6_2292PKTINFO 2 #define IPV6_2292HOPOPTS 3 @@ -169,8 +168,10 @@ struct in6_flowlabel_req { #define IPV6_MULTICAST_IF 17 #define IPV6_MULTICAST_HOPS 18 #define IPV6_MULTICAST_LOOP 19 +#if __UAPI_DEF_IPV6_OPTIONS #define IPV6_ADD_MEMBERSHIP 20 #define IPV6_DROP_MEMBERSHIP 21 +#endif #define IPV6_ROUTER_ALERT 22 #define IPV6_MTU_DISCOVER 23 #define IPV6_MTU 24 @@ -203,7 +204,6 @@ struct in6_flowlabel_req { #define IPV6_IPSEC_POLICY 34 #define IPV6_XFRM_POLICY 35 #define IPV6_HDRINCL 36 -#endif /* * Multicast: -- cgit v1.2.3 From 6a9e2fb1bab53b54d02714a2ee3c6612d19629ce Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 6 Jun 2025 11:45:07 +0200 Subject: nsfs: move root inode number to uapi Userspace relies on the root inode numbers to identify the initial namespaces. That's already a hard dependency. So we cannot change that anymore. Move the initial inode numbers to a public header. Link: https://github.com/systemd/systemd/commit/d293fade24b34ccc2f5716b0ff5513e9533cf0c4 Link: https://lore.kernel.org/20250606-work-nsfs-v1-1-b8749c9a8844@kernel.org Signed-off-by: Christian Brauner --- include/linux/proc_ns.h | 13 +++++++------ include/uapi/linux/nsfs.h | 9 +++++++++ 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 5ea470eb4d76..e77a37b23ca7 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -6,6 +6,7 @@ #define _LINUX_PROC_NS_H #include <linux/ns_common.h> +#include <uapi/linux/nsfs.h> struct pid_namespace; struct nsset; @@ -40,12 +41,12 @@ extern const struct proc_ns_operations timens_for_children_operations; */ enum { PROC_ROOT_INO = 1, - PROC_IPC_INIT_INO = 0xEFFFFFFFU, - PROC_UTS_INIT_INO = 0xEFFFFFFEU, - PROC_USER_INIT_INO = 0xEFFFFFFDU, - PROC_PID_INIT_INO = 0xEFFFFFFCU, - PROC_CGROUP_INIT_INO = 0xEFFFFFFBU, - PROC_TIME_INIT_INO = 0xEFFFFFFAU, + PROC_IPC_INIT_INO = IPC_NS_INIT_INO, + PROC_UTS_INIT_INO = UTS_NS_INIT_INO, + PROC_USER_INIT_INO = USER_NS_INIT_INO, + PROC_PID_INIT_INO = PID_NS_INIT_INO, + PROC_CGROUP_INIT_INO = CGROUP_NS_INIT_INO, + PROC_TIME_INIT_INO = TIME_NS_INIT_INO, }; #ifdef CONFIG_PROC_FS diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index 34127653fd00..6683e7ca3996 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -42,4 +42,13 @@ struct mnt_ns_info { /* Get previous namespace.
*/ #define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info) +enum init_ns_ino { + IPC_NS_INIT_INO = 0xEFFFFFFFU, + UTS_NS_INIT_INO = 0xEFFFFFFEU, + USER_NS_INIT_INO = 0xEFFFFFFDU, + PID_NS_INIT_INO = 0xEFFFFFFCU, + CGROUP_NS_INIT_INO = 0xEFFFFFFBU, + TIME_NS_INIT_INO = 0xEFFFFFFAU, +}; + #endif /* __LINUX_NSFS_H */ -- cgit v1.2.3 From 9b0240b3ccc325c7a96cf362877180bc9e10d546 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 6 Jun 2025 11:45:08 +0200 Subject: netns: use stable inode number for initial mount ns Apart from the network and mount namespace all other namespaces expose a stable inode number and userspace has been relying on that for a very long time now. It's very much heavily used API. Align the network namespace and use a stable inode number from the reserved procfs inode number space so this is consistent across all namespaces. Link: https://lore.kernel.org/20250606-work-nsfs-v1-2-b8749c9a8844@kernel.org Reviewed-by: Jakub Kicinski Signed-off-by: Christian Brauner --- include/linux/proc_ns.h | 1 + include/uapi/linux/nsfs.h | 1 + net/core/net_namespace.c | 8 ++++++++ 3 files changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index e77a37b23ca7..3ff0bd381704 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -47,6 +47,7 @@ enum { PROC_PID_INIT_INO = PID_NS_INIT_INO, PROC_CGROUP_INIT_INO = CGROUP_NS_INIT_INO, PROC_TIME_INIT_INO = TIME_NS_INIT_INO, + PROC_NET_INIT_INO = NET_NS_INIT_INO, }; #ifdef CONFIG_PROC_FS diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index 6683e7ca3996..393778489d85 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -49,6 +49,7 @@ enum init_ns_ino { PID_NS_INIT_INO = 0xEFFFFFFCU, CGROUP_NS_INIT_INO = 0xEFFFFFFBU, TIME_NS_INIT_INO = 0xEFFFFFFAU, + NET_NS_INIT_INO = 0xEFFFFFF9U, }; #endif /* __LINUX_NSFS_H */ diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index ae54f26709ca..03cf87d3b380 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -796,11 +796,19 @@ static __net_init int net_ns_net_init(struct net *net) #ifdef CONFIG_NET_NS net->ns.ops = &netns_operations; #endif + if (net == &init_net) { + net->ns.inum = PROC_NET_INIT_INO; + return 0; + } return ns_alloc_inum(&net->ns); } static __net_exit void net_ns_net_exit(struct net *net) { + /* + * Initial network namespace doesn't exit so we don't need any + * special checks here. + */ ns_free_inum(&net->ns); } -- cgit v1.2.3 From 7f4f229195b73606ded77e56943f463b78adf635 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 6 Jun 2025 11:45:09 +0200 Subject: mntns: use stable inode number for initial mount ns Apart from the network and mount namespace all other namespaces expose a stable inode number and userspace has been relying on that for a very long time now. It's very much heavily used API. Align the mount namespace and use a stable inode number from the reserved procfs inode number space so this is consistent across all namespaces. 
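A hedged sketch of what the stable numbers enable (illustrative, not from the patch): userspace can detect the initial mount namespace by comparing the nsfs inode number against the constant reserved in the diff below.

#include <linux/nsfs.h>
#include <stdbool.h>
#include <sys/stat.h>

static bool in_init_mnt_ns(void)
{
	struct stat st;

	if (stat("/proc/self/ns/mnt", &st) < 0)
		return false;
	/* MNT_NS_INIT_INO is added to <linux/nsfs.h> by this patch */
	return st.st_ino == MNT_NS_INIT_INO;
}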
Link: https://lore.kernel.org/20250606-work-nsfs-v1-3-b8749c9a8844@kernel.org Signed-off-by: Christian Brauner --- fs/namespace.c | 4 +++- include/linux/proc_ns.h | 1 + include/uapi/linux/nsfs.h | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/fs/namespace.c b/fs/namespace.c index e13d9ab4f564..7ca4612c7ae9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -6203,9 +6203,11 @@ static void __init init_mount_tree(void) if (IS_ERR(mnt)) panic("Can't create rootfs"); - ns = alloc_mnt_ns(&init_user_ns, false); + ns = alloc_mnt_ns(&init_user_ns, true); if (IS_ERR(ns)) panic("Can't allocate initial namespace"); + ns->seq = atomic64_inc_return(&mnt_ns_seq); + ns->ns.inum = PROC_MNT_INIT_INO; m = real_mount(mnt); ns->root = m; ns->nr_mounts = 1; diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 3ff0bd381704..6258455e49a4 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -48,6 +48,7 @@ enum { PROC_CGROUP_INIT_INO = CGROUP_NS_INIT_INO, PROC_TIME_INIT_INO = TIME_NS_INIT_INO, PROC_NET_INIT_INO = NET_NS_INIT_INO, + PROC_MNT_INIT_INO = MNT_NS_INIT_INO, }; #ifdef CONFIG_PROC_FS diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index 393778489d85..97d8d80d139f 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -50,6 +50,7 @@ enum init_ns_ino { CGROUP_NS_INIT_INO = 0xEFFFFFFBU, TIME_NS_INIT_INO = 0xEFFFFFFAU, NET_NS_INIT_INO = 0xEFFFFFF9U, + MNT_NS_INIT_INO = 0xEFFFFFF8U, }; #endif /* __LINUX_NSFS_H */ -- cgit v1.2.3 From 12b5b138d111db0588492002fdd8089af61b80e5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 3 Jun 2025 15:31:55 +0200 Subject: coredump: allow for flexible coredump handling Extend the coredump socket to allow the coredump server to tell the kernel how to process individual coredumps. When the crashing task connects to the coredump socket the kernel will send a struct coredump_req to the coredump server. The kernel will set the size member of struct coredump_req, telling the coredump server how much data can be read. The coredump server uses MSG_PEEK to peek the size of struct coredump_req. If the kernel uses a newer struct coredump_req the coredump server just reads the size it knows and discards any remaining bytes in the buffer. If the kernel uses an older struct coredump_req the coredump server just reads the size the kernel knows. The returned struct coredump_req will inform the coredump server what features the kernel supports. The coredump_req->mask member is set to the currently known features. The coredump server may only use features whose bits were raised by the kernel in coredump_req->mask. In response to a coredump_req from the kernel the coredump server sends a struct coredump_ack to the kernel. The kernel informs the coredump server what version of struct coredump_ack it supports by setting struct coredump_req->size_ack to the size it knows about. The coredump server may only send as many bytes as coredump_req->size_ack indicates (a smaller size is fine of course). The coredump server must set coredump_ack->size accordingly. The coredump server sets the features it wants to use in struct coredump_ack->mask. Only bits returned in struct coredump_req->mask may be used. In case an invalid struct coredump_ack is sent to the kernel a non-zero u32 integer is sent indicating the reason for the failure. If it was successful a zero u32 integer is sent.
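To make the handshake concrete, here is a server-side sketch (illustrative only; it relies on the structures and feature bits this patch adds, listed next, and assumes fd is an accepted AF_UNIX connection):

#include <linux/coredump.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int coredump_handshake(int fd)
{
	struct coredump_req req;
	struct coredump_ack ack;
	__u32 mark;

	/* Peek the kernel's size, then consume only what this build knows. */
	memset(&req, 0, sizeof(req));
	if (recv(fd, &req.size, sizeof(req.size), MSG_PEEK | MSG_WAITALL) !=
	    sizeof(req.size))
		return -1;
	if (recv(fd, &req, req.size < sizeof(req) ? req.size : sizeof(req),
		 MSG_WAITALL) < 0)
		return -1;
	/* A newer kernel may send more; extra bytes would be drained here. */

	/* Pick exactly one of KERNEL/USERSPACE/REJECT, plus WAIT. */
	memset(&ack, 0, sizeof(ack));
	ack.size = sizeof(ack);
	ack.mask = COREDUMP_KERNEL | COREDUMP_WAIT;
	if (send(fd, &ack, sizeof(ack), MSG_NOSIGNAL) != sizeof(ack))
		return -1;

	/* The kernel answers with a single __u32 mark; zero means success. */
	if (recv(fd, &mark, sizeof(mark), MSG_WAITALL) != sizeof(mark))
		return -1;
	return mark == COREDUMP_MARK_REQACK ? 0 : -1;
}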
In the initial version the following features are supported in coredump_{req,ack}->mask: * COREDUMP_KERNEL The kernel will write the coredump data to the socket. * COREDUMP_USERSPACE The kernel will not write coredump data but will indicate to the parent that a coredump has been generated. This is used when userspace generates its own coredumps. * COREDUMP_REJECT The kernel will skip generating a coredump for this task. * COREDUMP_WAIT The kernel will prevent the task from exiting until the coredump server has shutdown the socket connection. The flexible coredump socket can be enabled by using the "@@" prefix instead of the single "@" prefix for the regular coredump socket: @@/run/systemd/coredump.socket will enable flexible coredump handling. Current kernels already enforce that "@" must be followed by "/" and will reject anything else. So extending this is backward and forward compatible. Link: https://lore.kernel.org/20250603-work-coredump-socket-protocol-v2-1-05a5f0c18ecc@kernel.org Acked-by: Lennart Poettering Reviewed-by: Alexander Mikhalitsyn Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/coredump.c | 195 ++++++++++++++++++++++++++++++++++++------ include/uapi/linux/coredump.h | 104 ++++++++++++++++++++++ 2 files changed, 272 insertions(+), 27 deletions(-) create mode 100644 include/uapi/linux/coredump.h (limited to 'include/uapi/linux') diff --git a/fs/coredump.c b/fs/coredump.c index f217ebf2b3b6..b3eaa8c27ced 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -83,15 +84,17 @@ static int core_name_size = CORENAME_MAX_SIZE; unsigned int core_file_note_size_limit = CORE_FILE_NOTE_SIZE_DEFAULT; enum coredump_type_t { - COREDUMP_FILE = 1, - COREDUMP_PIPE = 2, - COREDUMP_SOCK = 3, + COREDUMP_FILE = 1, + COREDUMP_PIPE = 2, + COREDUMP_SOCK = 3, + COREDUMP_SOCK_REQ = 4, }; struct core_name { char *corename; int used, size; enum coredump_type_t core_type; + u64 mask; }; static int expand_corename(struct core_name *cn, int size) @@ -235,6 +238,9 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, int pid_in_pattern = 0; int err = 0; + cn->mask = COREDUMP_KERNEL; + if (core_pipe_limit) + cn->mask |= COREDUMP_WAIT; cn->used = 0; cn->corename = NULL; if (*pat_ptr == '|') @@ -264,6 +270,13 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, pat_ptr++; if (!(*pat_ptr)) return -ENOMEM; + if (*pat_ptr == '@') { + pat_ptr++; + if (!(*pat_ptr)) + return -ENOMEM; + + cn->core_type = COREDUMP_SOCK_REQ; + } err = cn_printf(cn, "%s", pat_ptr); if (err) @@ -632,6 +645,135 @@ static int umh_coredump_setup(struct subprocess_info *info, struct cred *new) return 0; } +#ifdef CONFIG_UNIX +static inline bool coredump_sock_recv(struct file *file, struct coredump_ack *ack, size_t size, int flags) +{ + struct msghdr msg = {}; + struct kvec iov = { .iov_base = ack, .iov_len = size }; + ssize_t ret; + + memset(ack, 0, size); + ret = kernel_recvmsg(sock_from_file(file), &msg, &iov, 1, size, flags); + return ret == size; +} + +static inline bool coredump_sock_send(struct file *file, struct coredump_req *req) +{ + struct msghdr msg = { .msg_flags = MSG_NOSIGNAL }; + struct kvec iov = { .iov_base = req, .iov_len = sizeof(*req) }; + ssize_t ret; + + ret = kernel_sendmsg(sock_from_file(file), &msg, &iov, 1, sizeof(*req)); + return ret == sizeof(*req); +} + +static_assert(sizeof(enum coredump_mark) == sizeof(__u32)); + +static inline bool coredump_sock_mark(struct file 
*file, enum coredump_mark mark) +{ + struct msghdr msg = { .msg_flags = MSG_NOSIGNAL }; + struct kvec iov = { .iov_base = &mark, .iov_len = sizeof(mark) }; + ssize_t ret; + + ret = kernel_sendmsg(sock_from_file(file), &msg, &iov, 1, sizeof(mark)); + return ret == sizeof(mark); +} + +static inline void coredump_sock_wait(struct file *file) +{ + ssize_t n; + + /* + * We use a simple read to wait for the coredump processing to + * finish. Either the socket is closed or we get sent unexpected + * data. In both cases, we're done. + */ + n = __kernel_read(file, &(char){ 0 }, 1, NULL); + if (n > 0) + coredump_report_failure("Coredump socket had unexpected data"); + else if (n < 0) + coredump_report_failure("Coredump socket failed"); +} + +static inline void coredump_sock_shutdown(struct file *file) +{ + struct socket *socket; + + socket = sock_from_file(file); + if (!socket) + return; + + /* Let userspace know we're done processing the coredump. */ + kernel_sock_shutdown(socket, SHUT_WR); +} + +static bool coredump_request(struct core_name *cn, struct coredump_params *cprm) +{ + struct coredump_req req = { + .size = sizeof(struct coredump_req), + .mask = COREDUMP_KERNEL | COREDUMP_USERSPACE | + COREDUMP_REJECT | COREDUMP_WAIT, + .size_ack = sizeof(struct coredump_ack), + }; + struct coredump_ack ack = {}; + ssize_t usize; + + if (cn->core_type != COREDUMP_SOCK_REQ) + return true; + + /* Let userspace know what we support. */ + if (!coredump_sock_send(cprm->file, &req)) + return false; + + /* Peek the size of the coredump_ack. */ + if (!coredump_sock_recv(cprm->file, &ack, sizeof(ack.size), + MSG_PEEK | MSG_WAITALL)) + return false; + + /* Refuse unknown coredump_ack sizes. */ + usize = ack.size; + if (usize < COREDUMP_ACK_SIZE_VER0) { + coredump_sock_mark(cprm->file, COREDUMP_MARK_MINSIZE); + return false; + } + + if (usize > sizeof(ack)) { + coredump_sock_mark(cprm->file, COREDUMP_MARK_MAXSIZE); + return false; + } + + /* Now retrieve the coredump_ack. */ + if (!coredump_sock_recv(cprm->file, &ack, usize, MSG_WAITALL)) + return false; + if (ack.size != usize) + return false; + + /* Refuse unknown coredump_ack flags. */ + if (ack.mask & ~req.mask) { + coredump_sock_mark(cprm->file, COREDUMP_MARK_UNSUPPORTED); + return false; + } + + /* Refuse mutually exclusive options. */ + if (hweight64(ack.mask & (COREDUMP_USERSPACE | COREDUMP_KERNEL | + COREDUMP_REJECT)) != 1) { + coredump_sock_mark(cprm->file, COREDUMP_MARK_CONFLICTING); + return false; + } + + if (ack.spare) { + coredump_sock_mark(cprm->file, COREDUMP_MARK_UNSUPPORTED); + return false; + } + + cn->mask = ack.mask; + return coredump_sock_mark(cprm->file, COREDUMP_MARK_REQACK); +} +#else +static inline void coredump_sock_wait(struct file *file) { } +static inline void coredump_sock_shutdown(struct file *file) { } +#endif + void do_coredump(const kernel_siginfo_t *siginfo) { struct core_state core_state; @@ -850,6 +992,8 @@ void do_coredump(const kernel_siginfo_t *siginfo) } break; } + case COREDUMP_SOCK_REQ: + fallthrough; case COREDUMP_SOCK: { #ifdef CONFIG_UNIX struct file *file __free(fput) = NULL; @@ -918,6 +1062,9 @@ void do_coredump(const kernel_siginfo_t *siginfo) cprm.limit = RLIM_INFINITY; cprm.file = no_free_ptr(file); + + if (!coredump_request(&cn, &cprm)) + goto close_fail; #else coredump_report_failure("Core dump socket support %s disabled", cn.corename); goto close_fail; @@ -929,12 +1076,17 @@ void do_coredump(const kernel_siginfo_t *siginfo) goto close_fail; } + /* Don't even generate the coredump. 
*/ + if (cn.mask & COREDUMP_REJECT) + goto close_fail; + /* get us an unshared descriptor table; almost always a no-op */ /* The cell spufs coredump code reads the file descriptor tables */ retval = unshare_files(); if (retval) goto close_fail; - if (!dump_interrupted()) { + + if ((cn.mask & COREDUMP_KERNEL) && !dump_interrupted()) { /* * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would * have this set to NULL. @@ -962,38 +1114,27 @@ void do_coredump(const kernel_siginfo_t *siginfo) free_vma_snapshot(&cprm); } -#ifdef CONFIG_UNIX - /* Let userspace know we're done processing the coredump. */ - if (sock_from_file(cprm.file)) - kernel_sock_shutdown(sock_from_file(cprm.file), SHUT_WR); -#endif + coredump_sock_shutdown(cprm.file); + + /* Let the parent know that a coredump was generated. */ + if (cn.mask & COREDUMP_USERSPACE) + core_dumped = true; /* * When core_pipe_limit is set we wait for the coredump server * or usermodehelper to finish before exiting so it can e.g., * inspect /proc/. */ - if (core_pipe_limit) { + if (cn.mask & COREDUMP_WAIT) { switch (cn.core_type) { case COREDUMP_PIPE: wait_for_dump_helpers(cprm.file); break; -#ifdef CONFIG_UNIX - case COREDUMP_SOCK: { - ssize_t n; - - /* - * We use a simple read to wait for the coredump - * processing to finish. Either the socket is - * closed or we get sent unexpected data. In - * both cases, we're done. - */ - n = __kernel_read(cprm.file, &(char){ 0 }, 1, NULL); - if (n != 0) - coredump_report_failure("Unexpected data on coredump socket"); + case COREDUMP_SOCK_REQ: + fallthrough; + case COREDUMP_SOCK: + coredump_sock_wait(cprm.file); break; - } -#endif default: break; } @@ -1249,8 +1390,8 @@ static inline bool check_coredump_socket(void) if (current->nsproxy->mnt_ns != init_task.nsproxy->mnt_ns) return false; - /* Must be an absolute path. */ - if (*(core_pattern + 1) != '/') + /* Must be an absolute path or the socket request. */ + if (*(core_pattern + 1) != '/' && *(core_pattern + 1) != '@') return false; return true; diff --git a/include/uapi/linux/coredump.h b/include/uapi/linux/coredump.h new file mode 100644 index 000000000000..dc3789b78af0 --- /dev/null +++ b/include/uapi/linux/coredump.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + +#ifndef _UAPI_LINUX_COREDUMP_H +#define _UAPI_LINUX_COREDUMP_H + +#include <linux/types.h> + +/** + * coredump_{req,ack} flags + * @COREDUMP_KERNEL: kernel writes coredump + * @COREDUMP_USERSPACE: userspace writes coredump + * @COREDUMP_REJECT: don't generate coredump + * @COREDUMP_WAIT: wait for coredump server + */ +enum { + COREDUMP_KERNEL = (1ULL << 0), + COREDUMP_USERSPACE = (1ULL << 1), + COREDUMP_REJECT = (1ULL << 2), + COREDUMP_WAIT = (1ULL << 3), +}; + +/** + * struct coredump_req - message kernel sends to userspace + * @size: size of struct coredump_req + * @size_ack: known size of struct coredump_ack on this kernel + * @mask: supported features + * + * When a coredump happens the kernel will connect to the coredump + * socket and send a coredump request to the coredump server. The @size + * member is set to the size of struct coredump_req and provides a hint + * to userspace how much data can be read. Userspace may use MSG_PEEK to + * peek the size of struct coredump_req and then choose to consume it in + * one go. Userspace may also simply read a COREDUMP_ACK_SIZE_VER0 + * request. If the size the kernel sends is larger userspace simply + * discards any remaining data. + * + * The coredump_req->mask member is set to the currently known features.
+ * Userspace may only set coredump_ack->mask to the bits raised by the + * kernel in coredump_req->mask. + * + * The coredump_req->size_ack member is set by the kernel to the size of + * struct coredump_ack the kernel knows. Userspace may only send up to + * coredump_req->size_ack bytes to the kernel and must set + * coredump_ack->size accordingly. + */ +struct coredump_req { + __u32 size; + __u32 size_ack; + __u64 mask; +}; + +enum { + COREDUMP_REQ_SIZE_VER0 = 16U, /* size of first published struct */ +}; + +/** + * struct coredump_ack - message userspace sends to kernel + * @size: size of the struct + * @spare: unused + * @mask: features kernel is supposed to use + * + * The @size member must be set to the size of struct coredump_ack. It + * may never exceed what the kernel returned in coredump_req->size_ack + * but it may of course be smaller (>= COREDUMP_ACK_SIZE_VER0 and <= + * coredump_req->size_ack). + * + * The @mask member must be set to the features the coredump server + * wants the kernel to use. Only bits the kernel returned in + * coredump_req->mask may be set. + */ +struct coredump_ack { + __u32 size; + __u32 spare; + __u64 mask; +}; + +enum { + COREDUMP_ACK_SIZE_VER0 = 16U, /* size of first published struct */ +}; + +/** + * enum coredump_mark - Markers for the coredump socket + * + * The kernel will place a single byte on the coredump socket. The + * markers notify userspace whether the coredump ack succeeded or + * failed. + * + * @COREDUMP_MARK_MINSIZE: the provided coredump_ack size was too small + * @COREDUMP_MARK_MAXSIZE: the provided coredump_ack size was too big + * @COREDUMP_MARK_UNSUPPORTED: the provided coredump_ack mask was invalid + * @COREDUMP_MARK_CONFLICTING: the provided coredump_ack mask has conflicting options + * @COREDUMP_MARK_REQACK: the coredump request and ack was successful + * @__COREDUMP_MARK_MAX: the maximum coredump mark value + */ +enum coredump_mark { + COREDUMP_MARK_REQACK = 0U, + COREDUMP_MARK_MINSIZE = 1U, + COREDUMP_MARK_MAXSIZE = 2U, + COREDUMP_MARK_UNSUPPORTED = 3U, + COREDUMP_MARK_CONFLICTING = 4U, + __COREDUMP_MARK_MAX = (1U << 31), +}; + +#endif /* _UAPI_LINUX_COREDUMP_H */ -- cgit v1.2.3 From a2fc422ed75748eef2985454e97847fb22f873c2 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Wed, 21 May 2025 17:04:29 +0200 Subject: syscall_user_dispatch: Add PR_SYS_DISPATCH_INCLUSIVE_ON There are two possible scenarios for syscall filtering: - having a trusted/allowed range of PCs, and intercepting everything else - or the opposite: a single untrusted/intercepted range and allowing everything else (this is relevant for any kind of sandboxing scenario, or monitoring behavior of a single library) The current API only allows the former use case due to allowed range wrap-around check. Add PR_SYS_DISPATCH_INCLUSIVE_ON that enables the second use case. Add PR_SYS_DISPATCH_EXCLUSIVE_ON alias for PR_SYS_DISPATCH_ON to make it clear how it's different from the new PR_SYS_DISPATCH_INCLUSIVE_ON. 
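A hedged sketch of the new mode from userspace (illustrative; lib_start/lib_len are placeholders for the untrusted mapping, and a SIGSYS handler is assumed to be installed separately):

#include <linux/prctl.h>
#include <sys/prctl.h>

/* Consulted only for syscalls issued from inside the given range. */
static volatile char sud_selector = SYSCALL_DISPATCH_FILTER_BLOCK;

static int intercept_untrusted(unsigned long lib_start, unsigned long lib_len)
{
	/*
	 * Syscalls from [lib_start, lib_start + lib_len) raise SIGSYS while
	 * the selector is BLOCK; everything else runs natively.
	 */
	return prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_INCLUSIVE_ON,
		     lib_start, lib_len, &sud_selector);
}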
Signed-off-by: Dmitry Vyukov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/97947cc8e205ff49675826d7b0327ef2e2c66eea.1747839857.git.dvyukov@google.com --- .../admin-guide/syscall-user-dispatch.rst | 23 ++++++++------ include/uapi/linux/prctl.h | 7 ++++- kernel/entry/syscall_user_dispatch.c | 36 ++++++++++++++-------- tools/include/uapi/linux/prctl.h | 7 ++++- 4 files changed, 49 insertions(+), 24 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/admin-guide/syscall-user-dispatch.rst b/Documentation/admin-guide/syscall-user-dispatch.rst index e3cfffef5a63..c1768d9e80fa 100644 --- a/Documentation/admin-guide/syscall-user-dispatch.rst +++ b/Documentation/admin-guide/syscall-user-dispatch.rst @@ -53,20 +53,25 @@ following prctl: prctl(PR_SET_SYSCALL_USER_DISPATCH, <op>, <offset>, <length>, [selector]) -<op> is either PR_SYS_DISPATCH_ON or PR_SYS_DISPATCH_OFF, to enable and -disable the mechanism globally for that thread. When -PR_SYS_DISPATCH_OFF is used, the other fields must be zero. - -[<offset>, <offset>+<length>) delimit a memory region interval -from which syscalls are always executed directly, regardless of the -userspace selector. This provides a fast path for the C library, which -includes the most common syscall dispatchers in the native code -applications, and also provides a way for the signal handler to return +<op> is either PR_SYS_DISPATCH_EXCLUSIVE_ON/PR_SYS_DISPATCH_INCLUSIVE_ON +or PR_SYS_DISPATCH_OFF, to enable and disable the mechanism globally for +that thread. When PR_SYS_DISPATCH_OFF is used, the other fields must be zero. + +For PR_SYS_DISPATCH_EXCLUSIVE_ON [<offset>, <offset>+<length>) delimit +a memory region interval from which syscalls are always executed directly, +regardless of the userspace selector. This provides a fast path for the +C library, which includes the most common syscall dispatchers in the native +code applications, and also provides a way for the signal handler to return without triggering a nested SIGSYS on (rt\_)sigreturn. Users of this interface should make sure that at least the signal trampoline code is included in this region. In addition, for syscalls that implement the trampoline code on the vDSO, that trampoline is never intercepted. +For PR_SYS_DISPATCH_INCLUSIVE_ON [<offset>, <offset>+<length>) delimit +a memory region interval from which syscalls are dispatched based on +the userspace selector. Syscalls from outside of the range are always +executed directly. + [selector] is a pointer to a char-sized region in the process memory region, that provides a quick way to enable disable syscall redirection thread-wide, without the need to invoke the kernel directly.
selector diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 43dec6eed559..9785c1d49f05 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -255,7 +255,12 @@ struct prctl_mm_map { /* Dispatch syscalls to a userspace handler */ #define PR_SET_SYSCALL_USER_DISPATCH 59 # define PR_SYS_DISPATCH_OFF 0 -# define PR_SYS_DISPATCH_ON 1 +/* Enable dispatch except for the specified range */ +# define PR_SYS_DISPATCH_EXCLUSIVE_ON 1 +/* Enable dispatch for the specified range */ +# define PR_SYS_DISPATCH_INCLUSIVE_ON 2 +/* Legacy name for backwards compatibility */ +# define PR_SYS_DISPATCH_ON PR_SYS_DISPATCH_EXCLUSIVE_ON /* The control values for the user space selector when dispatch is enabled */ # define SYSCALL_DISPATCH_FILTER_ALLOW 0 # define SYSCALL_DISPATCH_FILTER_BLOCK 1 diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c index 5340c5aa89e7..a9055eccb27e 100644 --- a/kernel/entry/syscall_user_dispatch.c +++ b/kernel/entry/syscall_user_dispatch.c @@ -78,7 +78,7 @@ static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned lon if (offset || len || selector) return -EINVAL; break; - case PR_SYS_DISPATCH_ON: + case PR_SYS_DISPATCH_EXCLUSIVE_ON: /* * Validate the direct dispatcher region just for basic * sanity against overflow and a 0-sized dispatcher @@ -87,30 +87,40 @@ static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned lon */ if (offset && offset + len <= offset) return -EINVAL; - + break; + case PR_SYS_DISPATCH_INCLUSIVE_ON: + if (len == 0 || offset + len <= offset) + return -EINVAL; /* - * access_ok() will clear memory tags for tagged addresses - * if current has memory tagging enabled. - - * To enable a tracer to set a tracees selector the - * selector address must be untagged for access_ok(), - * otherwise an untagged tracer will always fail to set a - * tagged tracees selector. + * Invert the range, the check in syscall_user_dispatch() + * supports wrap-around. */ - if (selector && !access_ok(untagged_addr(selector), sizeof(*selector))) - return -EFAULT; - + offset = offset + len; + len = -len; break; default: return -EINVAL; } + /* + * access_ok() will clear memory tags for tagged addresses + * if current has memory tagging enabled. + * + * To enable a tracer to set a tracees selector the + * selector address must be untagged for access_ok(), + * otherwise an untagged tracer will always fail to set a + * tagged tracees selector. 
+ */ + if (mode != PR_SYS_DISPATCH_OFF && selector && + !access_ok(untagged_addr(selector), sizeof(*selector))) + return -EFAULT; + task->syscall_dispatch.selector = selector; task->syscall_dispatch.offset = offset; task->syscall_dispatch.len = len; task->syscall_dispatch.on_dispatch = false; - if (mode == PR_SYS_DISPATCH_ON) + if (mode != PR_SYS_DISPATCH_OFF) set_task_syscall_work(task, SYSCALL_USER_DISPATCH); else clear_task_syscall_work(task, SYSCALL_USER_DISPATCH); diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index 43dec6eed559..9785c1d49f05 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -255,7 +255,12 @@ struct prctl_mm_map { /* Dispatch syscalls to a userspace handler */ #define PR_SET_SYSCALL_USER_DISPATCH 59 # define PR_SYS_DISPATCH_OFF 0 -# define PR_SYS_DISPATCH_ON 1 +/* Enable dispatch except for the specified range */ +# define PR_SYS_DISPATCH_EXCLUSIVE_ON 1 +/* Enable dispatch for the specified range */ +# define PR_SYS_DISPATCH_INCLUSIVE_ON 2 +/* Legacy name for backwards compatibility */ +# define PR_SYS_DISPATCH_ON PR_SYS_DISPATCH_EXCLUSIVE_ON /* The control values for the user space selector when dispatch is enabled */ # define SYSCALL_DISPATCH_FILTER_ALLOW 0 # define SYSCALL_DISPATCH_FILTER_BLOCK 1 -- cgit v1.2.3 From c035e736038045b411cb368e63f07bc2f5dbc0e1 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Thu, 12 Jun 2025 17:28:33 +0200 Subject: dpll: add phase-offset-monitor feature to netlink spec Add enum dpll_feature_state for control over features. Add dpll device level attribute: DPLL_A_PHASE_OFFSET_MONITOR - to allow control over a phase offset monitor feature. The attribute is present and shall return the current state of the feature (enum dpll_feature_state) if the device driver provides such a capability; otherwise the attribute shall not be present. Reviewed-by: Aleksandr Loktionov Reviewed-by: Milena Olech Reviewed-by: Jiri Pirko Signed-off-by: Arkadiusz Kubalewski Acked-by: Vadim Fedorenko Link: https://patch.msgid.link/20250612152835.1703397-2-arkadiusz.kubalewski@intel.com Signed-off-by: Jakub Kicinski --- Documentation/driver-api/dpll.rst | 18 ++++++++++++++++++ Documentation/netlink/specs/dpll.yaml | 24 ++++++++++++++++++++++++ drivers/dpll/dpll_nl.c | 5 +++-- include/uapi/linux/dpll.h | 12 ++++++++++++ 4 files changed, 57 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/driver-api/dpll.rst b/Documentation/driver-api/dpll.rst index e6855cd37e85..195e1e5d9a58 100644 --- a/Documentation/driver-api/dpll.rst +++ b/Documentation/driver-api/dpll.rst @@ -214,6 +214,24 @@ offset values are fractional with 3-digit decimal places and shell be divided with ``DPLL_PIN_PHASE_OFFSET_DIVIDER`` to get integer part and modulo divided to get fractional part. +Phase offset monitor +==================== + +Phase offset measurement is typically performed against the current active +source. However, some DPLL (Digital Phase-Locked Loop) devices may offer +the capability to monitor phase offsets across all available inputs. +The attribute and current feature state shall be included in the response +message of the ``DPLL_CMD_DEVICE_GET`` command for supported DPLL devices. +In such cases, users can also control the feature using the +``DPLL_CMD_DEVICE_SET`` command by setting the ``enum dpll_feature_state`` +values for the attribute. +Once enabled the phase offset measurements for the input shall be returned +in the ``DPLL_A_PIN_PHASE_OFFSET`` attribute.
+ + =============================== ======================== + ``DPLL_A_PHASE_OFFSET_MONITOR`` attr state of a feature + =============================== ======================== + Embedded SYNC ============= diff --git a/Documentation/netlink/specs/dpll.yaml b/Documentation/netlink/specs/dpll.yaml index 115d1a8f50bd..3bd6851c1d3c 100644 --- a/Documentation/netlink/specs/dpll.yaml +++ b/Documentation/netlink/specs/dpll.yaml @@ -240,6 +240,20 @@ definitions: integer part of a measured phase offset value. Value of (DPLL_A_PHASE_OFFSET % DPLL_PHASE_OFFSET_DIVIDER) is a fractional part of a measured phase offset value. + - + type: enum + name: feature-state + doc: | + Allow control (enable/disable) and status checking over features. + entries: + - + name: disable + doc: | + feature shall be disabled + - + name: enable + doc: | + feature shall be enabled attribute-sets: - @@ -293,6 +307,14 @@ attribute-sets: be put to message multiple times to indicate possible parallel quality levels (e.g. one specified by ITU option 1 and another one specified by option 2). + - + name: phase-offset-monitor + type: u32 + enum: feature-state + doc: Receive or request state of phase offset monitor feature. + If enabled, dpll device shall monitor and notify all currently + available inputs for changes of their phase offset against the + dpll device. - name: pin enum-name: dpll_a_pin @@ -483,6 +505,7 @@ operations: - temp - clock-id - type + - phase-offset-monitor dump: reply: *dev-attrs @@ -499,6 +522,7 @@ operations: request: attributes: - id + - phase-offset-monitor - name: device-create-ntf doc: Notification about device appearing diff --git a/drivers/dpll/dpll_nl.c b/drivers/dpll/dpll_nl.c index fe9b6893d261..8de90310c3be 100644 --- a/drivers/dpll/dpll_nl.c +++ b/drivers/dpll/dpll_nl.c @@ -37,8 +37,9 @@ static const struct nla_policy dpll_device_get_nl_policy[DPLL_A_ID + 1] = { }; /* DPLL_CMD_DEVICE_SET - do */ -static const struct nla_policy dpll_device_set_nl_policy[DPLL_A_ID + 1] = { +static const struct nla_policy dpll_device_set_nl_policy[DPLL_A_PHASE_OFFSET_MONITOR + 1] = { [DPLL_A_ID] = { .type = NLA_U32, }, + [DPLL_A_PHASE_OFFSET_MONITOR] = NLA_POLICY_MAX(NLA_U32, 1), }; /* DPLL_CMD_PIN_ID_GET - do */ @@ -105,7 +106,7 @@ static const struct genl_split_ops dpll_nl_ops[] = { .doit = dpll_nl_device_set_doit, .post_doit = dpll_post_doit, .policy = dpll_device_set_nl_policy, - .maxattr = DPLL_A_ID, + .maxattr = DPLL_A_PHASE_OFFSET_MONITOR, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h index bf97d4b6d51f..349e1b3ca1ae 100644 --- a/include/uapi/linux/dpll.h +++ b/include/uapi/linux/dpll.h @@ -192,6 +192,17 @@ enum dpll_pin_capabilities { #define DPLL_PHASE_OFFSET_DIVIDER 1000 +/** + * enum dpll_feature_state - Allow control (enable/disable) and status checking + * over features. 
+ * @DPLL_FEATURE_STATE_DISABLE: feature shall be disabled
+ * @DPLL_FEATURE_STATE_ENABLE: feature shall be enabled
+ */
+enum dpll_feature_state {
+ DPLL_FEATURE_STATE_DISABLE,
+ DPLL_FEATURE_STATE_ENABLE,
+};
+
enum dpll_a {
DPLL_A_ID = 1,
DPLL_A_MODULE_NAME,
@@ -204,6 +215,7 @@ enum dpll_a {
DPLL_A_TYPE,
DPLL_A_LOCK_STATUS_ERROR,
DPLL_A_CLOCK_QUALITY_LEVEL,
+ DPLL_A_PHASE_OFFSET_MONITOR,
__DPLL_A_MAX,
DPLL_A_MAX = (__DPLL_A_MAX - 1)
-- cgit v1.2.3

From 866380bcf10c810c1c7097641170d53bbe5239ce Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)"
Date: Wed, 11 Jun 2025 12:02:52 +0200
Subject: tty: vt: use sane types for userspace API

As discussed earlier (see the Link below), use the preferred ioctl types
in vt.h (__u8, __u16, ...). These types are already used for the new
VT_GETCONSIZECSRPOS. Therefore, the necessary includes are already
present. The types are now used for every structure defined in the
header.

Note the kernel is built with -funsigned-char, therefore 'char' becomes
'__u8' here.

Signed-off-by: "Jiri Slaby (SUSE)"
Cc: Nicolas Pitre
Link: https://lore.kernel.org/all/p7p83sq1-4ro2-o924-s9o2-30spr74n076o@syhkavp.arg/
Reviewed-by: Nicolas Pitre
Link: https://lore.kernel.org/r/20250611100319.186924-7-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman
---
include/uapi/linux/vt.h | 44 ++++++++++++++++++++++----------------------
1 file changed, 22 insertions(+), 22 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/vt.h b/include/uapi/linux/vt.h
index e5b0c492aa18..714483d68c69 100644
--- a/include/uapi/linux/vt.h
+++ b/include/uapi/linux/vt.h
@@ -19,11 +19,11 @@
#define VT_OPENQRY 0x5600 /* find available vt */
struct vt_mode {
- char mode; /* vt mode */
- char waitv; /* if set, hang on writes if not active */
- short relsig; /* signal to raise on release req */
- short acqsig; /* signal to raise on acquisition */
- short frsig; /* unused (set to 0) */
+ __u8 mode; /* vt mode */
+ __u8 waitv; /* if set, hang on writes if not active */
+ __s16 relsig; /* signal to raise on release req */
+ __s16 acqsig; /* signal to raise on acquisition */
+ __s16 frsig; /* unused (set to 0) */
};
#define VT_GETMODE 0x5601 /* get mode of active vt */
#define VT_SETMODE 0x5602 /* set mode of active vt */
@@ -32,9 +32,9 @@ struct vt_mode {
#define VT_ACKACQ 0x02 /* acknowledge switch */
struct vt_stat {
- unsigned short v_active; /* active vt */
- unsigned short v_signal; /* signal to send */
- unsigned short v_state; /* vt bitmask */
+ __u16 v_active; /* active vt */
+ __u16 v_signal; /* signal to send */
+ __u16 v_state; /* vt bitmask */
};
#define VT_GETSTATE 0x5603 /* get global vt state info */
#define VT_SENDSIG 0x5604 /* signal to send to bitmask of vts */
@@ -46,19 +46,19 @@ struct vt_stat {
#define VT_DISALLOCATE 0x5608 /* free memory associated to vt */
struct vt_sizes {
- unsigned short v_rows; /* number of rows */
- unsigned short v_cols; /* number of columns */
- unsigned short v_scrollsize; /* number of lines of scrollback */
+ __u16 v_rows; /* number of rows */
+ __u16 v_cols; /* number of columns */
+ __u16 v_scrollsize; /* number of lines of scrollback */
};
#define VT_RESIZE 0x5609 /* set kernel's idea of screensize */
struct vt_consize {
- unsigned short v_rows; /* number of rows */
- unsigned short v_cols; /* number of columns */
- unsigned short v_vlin; /* number of pixel rows on screen */
- unsigned short v_clin; /* number of pixel rows per character */
- unsigned short v_vcol; /* number of pixel columns on screen */
- unsigned short
v_ccol; /* number of pixel columns per character */
+ __u16 v_rows; /* number of rows */
+ __u16 v_cols; /* number of columns */
+ __u16 v_vlin; /* number of pixel rows on screen */
+ __u16 v_clin; /* number of pixel rows per character */
+ __u16 v_vcol; /* number of pixel columns on screen */
+ __u16 v_ccol; /* number of pixel columns per character */
};
#define VT_RESIZEX 0x560A /* set kernel's idea of screensize + more */
#define VT_LOCKSWITCH 0x560B /* disallow vt switching */
@@ -66,21 +66,21 @@ struct vt_consize {
#define VT_GETHIFONTMASK 0x560D /* return hi font mask */
struct vt_event {
- unsigned int event;
+ __u32 event;
#define VT_EVENT_SWITCH 0x0001 /* Console switch */
#define VT_EVENT_BLANK 0x0002 /* Screen blank */
#define VT_EVENT_UNBLANK 0x0004 /* Screen unblank */
#define VT_EVENT_RESIZE 0x0008 /* Resize display */
#define VT_MAX_EVENT 0x000F
- unsigned int oldev; /* Old console */
- unsigned int newev; /* New console (if changing) */
- unsigned int pad[4]; /* Padding for expansion */
+ __u32 oldev; /* Old console */
+ __u32 newev; /* New console (if changing) */
+ __u32 pad[4]; /* Padding for expansion */
};
#define VT_WAITEVENT 0x560E /* Wait for an event */
struct vt_setactivate {
- unsigned int console;
+ __u32 console;
struct vt_mode mode;
};
-- cgit v1.2.3

From f1180ca37abe3d117e4a19be12142fe722612a7c Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)"
Date: Wed, 11 Jun 2025 12:02:53 +0200
Subject: tty: vt: use _IO() to define ioctl numbers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_IO*() is the proper way of defining ioctl numbers. All these vt numbers
were synthetically built up the same way the _IO() macro does. So instead
of implicit hex numbers, use _IO() properly.

To not change the pre-existing numbers, use only _IO() (and not _IOR() or
_IOW()). The latter would have changed the numbers. Objdump of vt_ioctl.o
reveals no difference with this patch.

Again, VT_GETCONSIZECSRPOS already uses _IOR(), so the way is already
paved for this patch.
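As a quick sanity check (illustrative only, not part of the patch), the
encoding is easy to verify from userspace; _IO('V', nr) must produce the
historical 0x56nn values:

#include <stdio.h>
#include <linux/ioctl.h>

int main(void)
{
	/* 'V' is 0x56, so _IO('V', 0x00) encodes to 0x5600 (VT_OPENQRY) */
	printf("_IO('V', 0x00) = %#x\n", (unsigned int)_IO('V', 0x00));
	/* ... and _IO('V', 0x0F) to 0x560f (VT_SETACTIVATE) */
	printf("_IO('V', 0x0F) = %#x\n", (unsigned int)_IO('V', 0x0F));
	return 0;
}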
Signed-off-by: "Jiri Slaby (SUSE)" Cc: Nicolas Pitre Reviewed-by: Nicolas Pitre Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20250611100319.186924-8-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/vt.h | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/vt.h b/include/uapi/linux/vt.h index 714483d68c69..b60fcdfb2746 100644 --- a/include/uapi/linux/vt.h +++ b/include/uapi/linux/vt.h @@ -14,9 +14,9 @@ /* Note: the ioctl VT_GETSTATE does not work for consoles 16 and higher (since it returns a short) */ -/* 0x56 is 'V', to avoid collision with termios and kd */ +/* 'V' to avoid collision with termios and kd */ -#define VT_OPENQRY 0x5600 /* find available vt */ +#define VT_OPENQRY _IO('V', 0x00) /* find available vt */ struct vt_mode { __u8 mode; /* vt mode */ @@ -25,8 +25,8 @@ struct vt_mode { __s16 acqsig; /* signal to raise on acquisition */ __s16 frsig; /* unused (set to 0) */ }; -#define VT_GETMODE 0x5601 /* get mode of active vt */ -#define VT_SETMODE 0x5602 /* set mode of active vt */ +#define VT_GETMODE _IO('V', 0x01) /* get mode of active vt */ +#define VT_SETMODE _IO('V', 0x02) /* set mode of active vt */ #define VT_AUTO 0x00 /* auto vt switching */ #define VT_PROCESS 0x01 /* process controls switching */ #define VT_ACKACQ 0x02 /* acknowledge switch */ @@ -36,21 +36,21 @@ struct vt_stat { __u16 v_signal; /* signal to send */ __u16 v_state; /* vt bitmask */ }; -#define VT_GETSTATE 0x5603 /* get global vt state info */ -#define VT_SENDSIG 0x5604 /* signal to send to bitmask of vts */ +#define VT_GETSTATE _IO('V', 0x03) /* get global vt state info */ +#define VT_SENDSIG _IO('V', 0x04) /* signal to send to bitmask of vts */ -#define VT_RELDISP 0x5605 /* release display */ +#define VT_RELDISP _IO('V', 0x05) /* release display */ -#define VT_ACTIVATE 0x5606 /* make vt active */ -#define VT_WAITACTIVE 0x5607 /* wait for vt active */ -#define VT_DISALLOCATE 0x5608 /* free memory associated to vt */ +#define VT_ACTIVATE _IO('V', 0x06) /* make vt active */ +#define VT_WAITACTIVE _IO('V', 0x07) /* wait for vt active */ +#define VT_DISALLOCATE _IO('V', 0x08) /* free memory associated to vt */ struct vt_sizes { __u16 v_rows; /* number of rows */ __u16 v_cols; /* number of columns */ __u16 v_scrollsize; /* number of lines of scrollback */ }; -#define VT_RESIZE 0x5609 /* set kernel's idea of screensize */ +#define VT_RESIZE _IO('V', 0x09) /* set kernel's idea of screensize */ struct vt_consize { __u16 v_rows; /* number of rows */ @@ -60,10 +60,10 @@ struct vt_consize { __u16 v_vcol; /* number of pixel columns on screen */ __u16 v_ccol; /* number of pixel columns per character */ }; -#define VT_RESIZEX 0x560A /* set kernel's idea of screensize + more */ -#define VT_LOCKSWITCH 0x560B /* disallow vt switching */ -#define VT_UNLOCKSWITCH 0x560C /* allow vt switching */ -#define VT_GETHIFONTMASK 0x560D /* return hi font mask */ +#define VT_RESIZEX _IO('V', 0x0A) /* set kernel's idea of screensize + more */ +#define VT_LOCKSWITCH _IO('V', 0x0B) /* disallow vt switching */ +#define VT_UNLOCKSWITCH _IO('V', 0x0C) /* allow vt switching */ +#define VT_GETHIFONTMASK _IO('V', 0x0D) /* return hi font mask */ struct vt_event { __u32 event; @@ -77,14 +77,14 @@ struct vt_event { __u32 pad[4]; /* Padding for expansion */ }; -#define VT_WAITEVENT 0x560E /* Wait for an event */ +#define VT_WAITEVENT _IO('V', 0x0E) /* Wait for an event */ struct vt_setactivate { __u32 
console; struct vt_mode mode; }; -#define VT_SETACTIVATE 0x560F /* Activate and set the mode of a console */ +#define VT_SETACTIVATE _IO('V', 0x0F) /* Activate and set the mode of a console */ /* get console size and cursor position */ struct vt_consizecsrpos { -- cgit v1.2.3 From fc92099902fbf21000554678a47654b029c15a4d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 12 Jun 2025 12:36:06 -0300 Subject: tools headers: Synchronize linux/bits.h with the kernel sources To pick up the changes in this cset: 1e7933a575ed8af4 ("uapi: Revert "bitops: avoid integer overflow in GENMASK(_ULL)"") 5b572e8a9f3dcd6e ("bits: introduce fixed-type BIT_U*()") 19408200c094858d ("bits: introduce fixed-type GENMASK_U*()") 31299a5e02112411 ("bits: add comments and newlines to #if, #else and #endif directives") This addresses these perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/linux/bits.h include/linux/bits.h Please see tools/include/uapi/README for further details. Acked-by: Vincent Mailhol Cc: I Hsin Cheng Cc: Yury Norov Cc: Adrian Hunter Cc: Ian Rogers Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Lucas De Marchi Cc: Namhyung Kim Cc: Yury Norov Link: https://lore.kernel.org/r/aEr0ZJ60EbshEy6p@x1 Signed-off-by: Arnaldo Carvalho de Melo --- include/uapi/linux/bits.h | 4 ++-- tools/include/linux/bits.h | 57 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 57 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bits.h b/include/uapi/linux/bits.h index a04afef9efca..682b406e1067 100644 --- a/include/uapi/linux/bits.h +++ b/include/uapi/linux/bits.h @@ -4,9 +4,9 @@ #ifndef _UAPI_LINUX_BITS_H #define _UAPI_LINUX_BITS_H -#define __GENMASK(h, l) (((~_UL(0)) << (l)) & (~_UL(0) >> (__BITS_PER_LONG - 1 - (h)))) +#define __GENMASK(h, l) (((~_UL(0)) << (l)) & (~_UL(0) >> (BITS_PER_LONG - 1 - (h)))) -#define __GENMASK_ULL(h, l) (((~_ULL(0)) << (l)) & (~_ULL(0) >> (__BITS_PER_LONG_LONG - 1 - (h)))) +#define __GENMASK_ULL(h, l) (((~_ULL(0)) << (l)) & (~_ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h)))) #define __GENMASK_U128(h, l) \ ((_BIT128((h)) << 1) - (_BIT128(l))) diff --git a/tools/include/linux/bits.h b/tools/include/linux/bits.h index 14fd0ca9a6cd..7ad056219115 100644 --- a/tools/include/linux/bits.h +++ b/tools/include/linux/bits.h @@ -12,6 +12,7 @@ #define BIT_ULL_MASK(nr) (ULL(1) << ((nr) % BITS_PER_LONG_LONG)) #define BIT_ULL_WORD(nr) ((nr) / BITS_PER_LONG_LONG) #define BITS_PER_BYTE 8 +#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) /* * Create a contiguous bitmask starting at bit position @l and ending at @@ -19,16 +20,68 @@ * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000. */ #if !defined(__ASSEMBLY__) + +/* + * Missing asm support + * + * GENMASK_U*() and BIT_U*() depend on BITS_PER_TYPE() which relies on sizeof(), + * something not available in asm. Nevertheless, fixed width integers is a C + * concept. Assembly code can rely on the long and long long versions instead. + */ + #include #include +#include + #define GENMASK_INPUT_CHECK(h, l) BUILD_BUG_ON_ZERO(const_true((l) > (h))) -#else + +/* + * Generate a mask for the specified type @t. Additional checks are made to + * guarantee the value returned fits in that type, relying on + * -Wshift-count-overflow compiler check to detect incompatible arguments. 
+ * For example, all these create build errors or warnings:
+ *
+ * - GENMASK(15, 20): wrong argument order
+ * - GENMASK(72, 15): doesn't fit unsigned long
+ * - GENMASK_U32(33, 15): doesn't fit in a u32
+ */
+#define GENMASK_TYPE(t, h, l) \
+ ((t)(GENMASK_INPUT_CHECK(h, l) + \
+ (type_max(t) << (l) & \
+ type_max(t) >> (BITS_PER_TYPE(t) - 1 - (h)))))
+
+#define GENMASK_U8(h, l) GENMASK_TYPE(u8, h, l)
+#define GENMASK_U16(h, l) GENMASK_TYPE(u16, h, l)
+#define GENMASK_U32(h, l) GENMASK_TYPE(u32, h, l)
+#define GENMASK_U64(h, l) GENMASK_TYPE(u64, h, l)
+
+/*
+ * Fixed-type variants of BIT(), with additional checks like GENMASK_TYPE(). The
+ * following examples generate compiler warnings due to -Wshift-count-overflow:
+ *
+ * - BIT_U8(8)
+ * - BIT_U32(-1)
+ * - BIT_U32(40)
+ */
+#define BIT_INPUT_CHECK(type, nr) \
+ BUILD_BUG_ON_ZERO(const_true((nr) >= BITS_PER_TYPE(type)))
+
+#define BIT_TYPE(type, nr) ((type)(BIT_INPUT_CHECK(type, nr) + BIT_ULL(nr)))
+
+#define BIT_U8(nr) BIT_TYPE(u8, nr)
+#define BIT_U16(nr) BIT_TYPE(u16, nr)
+#define BIT_U32(nr) BIT_TYPE(u32, nr)
+#define BIT_U64(nr) BIT_TYPE(u64, nr)
+
+#else /* defined(__ASSEMBLY__) */
+
/*
* BUILD_BUG_ON_ZERO is not available in h files included from asm files,
* disable the input check if that is the case.
*/
#define GENMASK_INPUT_CHECK(h, l) 0
-#endif
+
+#endif /* !defined(__ASSEMBLY__) */
#define GENMASK(h, l) \
(GENMASK_INPUT_CHECK(h, l) + __GENMASK(h, l))
-- cgit v1.2.3

From f8337efa4ff5a27e6c1d4e384166413eecd21a65 Mon Sep 17 00:00:00 2001
From: Petr Machata
Date: Tue, 17 Jun 2025 00:44:19 +0200
Subject: vxlan: Support MC routing in the underlay

Locally-generated MC packets have so far not been subject to MC routing.
Instead, an MC-enabled installation would maintain the MC routing tables,
and separately from that the list of interfaces to send packets to as part
of the VXLAN FDB and MDB.

In a previous patch, the ip_mr_output() and ip6_mr_output() routines were
added for IPv4 and IPv6. All locally generated MC traffic is now passed
through these functions. For reasons of backward compatibility, an SKB
(IPCB / IP6CB) flag guards the actual MC routing.

This patch adds logic to set the flag, and the UAPI to enable the
behavior.
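For illustration, a new VXLAN device could be opted in from userspace
roughly as follows (a hypothetical libmnl sketch, not part of the patch;
the device name and VNI are made up, and a real configuration would also
set a group/remote and an underlay device, and check for errors):

#include <time.h>
#include <sys/socket.h>
#include <libmnl/libmnl.h>
#include <linux/if_link.h>
#include <linux/rtnetlink.h>

int main(void)
{
	char buf[MNL_SOCKET_BUFFER_SIZE];
	struct nlattr *linkinfo, *data;
	struct mnl_socket *nl;
	struct ifinfomsg *ifm;
	struct nlmsghdr *nlh;

	nlh = mnl_nlmsg_put_header(buf);
	nlh->nlmsg_type = RTM_NEWLINK;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK;
	nlh->nlmsg_seq = time(NULL);
	ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
	ifm->ifi_family = AF_UNSPEC;

	mnl_attr_put_strz(nlh, IFLA_IFNAME, "vxlan0");
	linkinfo = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
	mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan");
	data = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
	mnl_attr_put_u32(nlh, IFLA_VXLAN_ID, 42);
	mnl_attr_put_u8(nlh, IFLA_VXLAN_MC_ROUTE, 1);	/* opt in to MC routing */
	mnl_attr_nest_end(nlh, data);
	mnl_attr_nest_end(nlh, linkinfo);

	nl = mnl_socket_open(NETLINK_ROUTE);
	mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID);
	mnl_socket_sendto(nl, nlh, nlh->nlmsg_len);
	mnl_socket_close(nl);
	return 0;
}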
Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/d899655bb7e9b2521ee8c793e67056b9fd02ba12.1750113335.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 22 ++++++++++++++++++++-- include/net/vxlan.h | 5 ++++- include/uapi/linux/if_link.h | 1 + 3 files changed, 25 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index b22f9866be8e..a6cc1de4d8b8 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2451,6 +2451,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, rcu_read_lock(); if (addr_family == AF_INET) { struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock); + u16 ipcb_flags = 0; struct rtable *rt; __be16 df = 0; __be32 saddr; @@ -2467,6 +2468,9 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, goto tx_error; } + if (flags & VXLAN_F_MC_ROUTE) + ipcb_flags |= IPSKB_MCROUTE; + if (!info) { /* Bypass encapsulation if the destination is local */ err = encap_bypass_if_local(skb, dev, vxlan, AF_INET, @@ -2522,11 +2526,13 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, saddr, pkey->u.ipv4.dst, tos, ttl, df, - src_port, dst_port, xnet, !udp_sum, 0); + src_port, dst_port, xnet, !udp_sum, + ipcb_flags); #if IS_ENABLED(CONFIG_IPV6) } else { struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); struct in6_addr saddr; + u16 ip6cb_flags = 0; if (!ifindex) ifindex = sock6->sock->sk->sk_bound_dev_if; @@ -2542,6 +2548,9 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, goto tx_error; } + if (flags & VXLAN_F_MC_ROUTE) + ip6cb_flags |= IP6SKB_MCROUTE; + if (!info) { u32 rt6i_flags = dst_rt6_info(ndst)->rt6i_flags; @@ -2587,7 +2596,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev, &saddr, &pkey->u.ipv6.dst, tos, ttl, pkey->label, src_port, dst_port, !udp_sum, - 0); + ip6cb_flags); #endif } vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX, pkt_len); @@ -3402,6 +3411,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_LOCALBYPASS] = NLA_POLICY_MAX(NLA_U8, 1), [IFLA_VXLAN_LABEL_POLICY] = NLA_POLICY_MAX(NLA_U32, VXLAN_LABEL_MAX), [IFLA_VXLAN_RESERVED_BITS] = NLA_POLICY_EXACT_LEN(sizeof(struct vxlanhdr)), + [IFLA_VXLAN_MC_ROUTE] = NLA_POLICY_MAX(NLA_U8, 1), }; static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], @@ -4315,6 +4325,14 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], return err; } + if (data[IFLA_VXLAN_MC_ROUTE]) { + err = vxlan_nl2flag(conf, data, IFLA_VXLAN_MC_ROUTE, + VXLAN_F_MC_ROUTE, changelink, + true, extack); + if (err) + return err; + } + if (tb[IFLA_MTU]) { if (changelink) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU], diff --git a/include/net/vxlan.h b/include/net/vxlan.h index e2f7ca045d3e..0ee50785f4f1 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -332,6 +332,7 @@ struct vxlan_dev { #define VXLAN_F_VNIFILTER 0x20000 #define VXLAN_F_MDB 0x40000 #define VXLAN_F_LOCALBYPASS 0x80000 +#define VXLAN_F_MC_ROUTE 0x100000 /* Flags that are used in the receive path. 
These flags must match in * order for a socket to be shareable @@ -353,7 +354,9 @@ struct vxlan_dev { VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_COLLECT_METADATA | \ VXLAN_F_VNIFILTER | \ - VXLAN_F_LOCALBYPASS) + VXLAN_F_LOCALBYPASS | \ + VXLAN_F_MC_ROUTE | \ + 0) struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 3ad2d5d98034..873c285996fe 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1398,6 +1398,7 @@ enum { IFLA_VXLAN_LOCALBYPASS, IFLA_VXLAN_LABEL_POLICY, /* IPv6 flow label policy; ifla_vxlan_label_policy */ IFLA_VXLAN_RESERVED_BITS, + IFLA_VXLAN_MC_ROUTE, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) -- cgit v1.2.3 From 7c8c957ef12c41968adb66d785ce1dd5fb2f96e7 Mon Sep 17 00:00:00 2001 From: Stefan Klug Date: Fri, 23 May 2025 17:14:31 +0200 Subject: media: rkisp1: Add RKISP1_CID_SUPPORTED_PARAMS_BLOCKS control Add a RKISP1_CID_SUPPORTED_PARAMS_BLOCKS V4L2 control to be able to query the parameters blocks supported by the current kernel on the current hardware from user space. Signed-off-by: Stefan Klug Reviewed-by: Paul Elder Reviewed-by: Laurent Pinchart Link: https://lore.kernel.org/r/20250523-supported-params-and-wdr-v3-2-7283b8536694@ideasonboard.com Signed-off-by: Laurent Pinchart Signed-off-by: Hans Verkuil --- .../media/platform/rockchip/rkisp1/rkisp1-common.h | 2 + .../media/platform/rockchip/rkisp1/rkisp1-params.c | 49 +++++++++++++++++++++- include/uapi/linux/rkisp1-config.h | 11 +++++ include/uapi/linux/v4l2-controls.h | 6 +++ 4 files changed, 67 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-common.h b/drivers/media/platform/rockchip/rkisp1/rkisp1-common.h index ca952fd0829b..5f187f9efc7b 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-common.h +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-common.h @@ -415,6 +415,8 @@ struct rkisp1_params { spinlock_t config_lock; /* locks the buffers list 'params' */ struct list_head params; + struct v4l2_ctrl_handler ctrls; + const struct v4l2_meta_format *metafmt; enum v4l2_quantization quantization; diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c index 24a8de697f2b..4db0ca8d86db 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c @@ -2736,6 +2736,44 @@ static int rkisp1_params_init_vb2_queue(struct vb2_queue *q, return vb2_queue_init(q); } +static int rkisp1_params_ctrl_init(struct rkisp1_params *params) +{ + struct v4l2_ctrl_config ctrl_config = { + .id = RKISP1_CID_SUPPORTED_PARAMS_BLOCKS, + .name = "Supported Params Blocks", + .type = V4L2_CTRL_TYPE_BITMASK, + .flags = V4L2_CTRL_FLAG_READ_ONLY, + }; + int ret; + + v4l2_ctrl_handler_init(¶ms->ctrls, 1); + + for (unsigned int i = 0; i < ARRAY_SIZE(rkisp1_ext_params_handlers); i++) { + const struct rkisp1_ext_params_handler *block_handler; + + block_handler = &rkisp1_ext_params_handlers[i]; + ctrl_config.max |= BIT(i); + + if ((params->rkisp1->info->features & block_handler->features) != + block_handler->features) + continue; + + ctrl_config.def |= BIT(i); + } + + v4l2_ctrl_new_custom(¶ms->ctrls, &ctrl_config, NULL); + + params->vnode.vdev.ctrl_handler = ¶ms->ctrls; + + if (params->ctrls.error) { + ret = params->ctrls.error; + 
v4l2_ctrl_handler_free(¶ms->ctrls); + return ret; + } + + return 0; +} + int rkisp1_params_register(struct rkisp1_device *rkisp1) { struct rkisp1_params *params = &rkisp1->params; @@ -2781,15 +2819,23 @@ int rkisp1_params_register(struct rkisp1_device *rkisp1) if (ret) goto err_media; + ret = rkisp1_params_ctrl_init(params); + if (ret) { + dev_err(rkisp1->dev, "Control initialization error %d\n", ret); + goto err_media; + } + ret = video_register_device(vdev, VFL_TYPE_VIDEO, -1); if (ret) { dev_err(rkisp1->dev, "failed to register %s, ret=%d\n", vdev->name, ret); - goto err_media; + goto err_ctrl; } return 0; +err_ctrl: + v4l2_ctrl_handler_free(¶ms->ctrls); err_media: media_entity_cleanup(&vdev->entity); mutex_destroy(&node->vlock); @@ -2806,6 +2852,7 @@ void rkisp1_params_unregister(struct rkisp1_device *rkisp1) return; vb2_video_unregister_device(vdev); + v4l2_ctrl_handler_free(¶ms->ctrls); media_entity_cleanup(&vdev->entity); mutex_destroy(&node->vlock); } diff --git a/include/uapi/linux/rkisp1-config.h b/include/uapi/linux/rkisp1-config.h index 2d995f3c1ca3..5ca4d5961c5b 100644 --- a/include/uapi/linux/rkisp1-config.h +++ b/include/uapi/linux/rkisp1-config.h @@ -1086,6 +1086,9 @@ enum rkisp1_ext_params_block_type { #define RKISP1_EXT_PARAMS_FL_BLOCK_DISABLE (1U << 0) #define RKISP1_EXT_PARAMS_FL_BLOCK_ENABLE (1U << 1) +/* A bitmask of parameters blocks supported on the current hardware. */ +#define RKISP1_CID_SUPPORTED_PARAMS_BLOCKS (V4L2_CID_USER_RKISP1_BASE + 0x01) + /** * struct rkisp1_ext_params_block_header - RkISP1 extensible parameters block * header @@ -1520,6 +1523,14 @@ enum rksip1_ext_param_buffer_version { * V4L2 control. If such control is not available, userspace should assume only * RKISP1_EXT_PARAM_BUFFER_V1 is supported by the driver. * + * The read-only V4L2 control ``RKISP1_CID_SUPPORTED_PARAMS_BLOCKS`` can be used + * to query the blocks supported by the device. It contains a bitmask where each + * bit represents the availability of the corresponding entry from the + * :c:type:`rkisp1_ext_params_block_type` enum. The current and default values + * of the control represents the blocks supported by the device instance, while + * the maximum value represents the blocks supported by the kernel driver, + * independently of the device instance. + * * For each ISP block that userspace wants to configure, a block-specific * structure is appended to the @data buffer, one after the other without gaps * in between nor overlaps. Userspace shall populate the @data_size field with diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 72e32814ea83..f836512e9deb 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -222,6 +222,12 @@ enum v4l2_colorfx { */ #define V4L2_CID_USER_UVC_BASE (V4L2_CID_USER_BASE + 0x11e0) +/* + * The base for Rockchip ISP1 driver controls. + * We reserve 16 controls for this driver. + */ +#define V4L2_CID_USER_RKISP1_BASE (V4L2_CID_USER_BASE + 0x1220) + /* MPEG-class control IDs */ /* The MPEG controls are applicable to all codec controls * and the 'MPEG' part of the define is historical */ -- cgit v1.2.3 From cd403e8aed6caad87967d2c135b57d92ba8e5544 Mon Sep 17 00:00:00 2001 From: Jai Luthra Date: Tue, 10 Jun 2025 17:55:27 +0530 Subject: media: rockchip: rkisp1: Add support for Wide Dynamic Range RKISP supports a basic Wide Dynamic Range (WDR) module since the first iteration (v1.0) of the ISP. Add support for enabling and configuring it using extensible parameters. 
Also, to ease programming, switch to using macro variables for defining the tonemapping curve register addresses. Reviewed-by: Stefan Klug Tested-by: Stefan Klug Reviewed-by: Paul Elder Reviewed-by: Laurent Pinchart Signed-off-by: Jai Luthra Link: https://lore.kernel.org/r/20250610-wdr-latest-v4-1-b69d0ac17ce9@ideasonboard.com Signed-off-by: Laurent Pinchart Signed-off-by: Hans Verkuil --- .../media/platform/rockchip/rkisp1/rkisp1-params.c | 93 ++++++++++++++++++++ .../media/platform/rockchip/rkisp1/rkisp1-regs.h | 99 ++++++---------------- include/uapi/linux/rkisp1-config.h | 95 ++++++++++++++++++++- 3 files changed, 212 insertions(+), 75 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c index 4db0ca8d86db..f1585f8fa0f4 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c @@ -5,6 +5,7 @@ * Copyright (C) 2017 Rockchip Electronics Co., Ltd. */ +#include #include #include @@ -60,6 +61,7 @@ union rkisp1_ext_params_config { struct rkisp1_ext_params_afc_config afc; struct rkisp1_ext_params_compand_bls_config compand_bls; struct rkisp1_ext_params_compand_curve_config compand_curve; + struct rkisp1_ext_params_wdr_config wdr; }; enum rkisp1_params_formats { @@ -1348,6 +1350,73 @@ rkisp1_compand_compress_config(struct rkisp1_params *params, arg->x); } +static void rkisp1_wdr_config(struct rkisp1_params *params, + const struct rkisp1_cif_isp_wdr_config *arg) +{ + unsigned int i; + u32 value; + + value = rkisp1_read(params->rkisp1, RKISP1_CIF_ISP_WDR_CTRL) + & ~(RKISP1_CIF_ISP_WDR_USE_IREF | + RKISP1_CIF_ISP_WDR_COLOR_SPACE_SELECT | + RKISP1_CIF_ISP_WDR_CR_MAPPING_DISABLE | + RKISP1_CIF_ISP_WDR_USE_Y9_8 | + RKISP1_CIF_ISP_WDR_USE_RGB7_8 | + RKISP1_CIF_ISP_WDR_DISABLE_TRANSIENT | + RKISP1_CIF_ISP_WDR_RGB_FACTOR_MASK); + + /* Colorspace and chrominance mapping */ + if (arg->use_rgb_colorspace) + value |= RKISP1_CIF_ISP_WDR_COLOR_SPACE_SELECT; + + if (!arg->use_rgb_colorspace && arg->bypass_chroma_mapping) + value |= RKISP1_CIF_ISP_WDR_CR_MAPPING_DISABLE; + + /* Illumination reference */ + if (arg->use_iref) { + value |= RKISP1_CIF_ISP_WDR_USE_IREF; + + if (arg->iref_config.use_y9_8) + value |= RKISP1_CIF_ISP_WDR_USE_Y9_8; + + if (arg->iref_config.use_rgb7_8) + value |= RKISP1_CIF_ISP_WDR_USE_RGB7_8; + + if (arg->iref_config.disable_transient) + value |= RKISP1_CIF_ISP_WDR_DISABLE_TRANSIENT; + + value |= FIELD_PREP(RKISP1_CIF_ISP_WDR_RGB_FACTOR_MASK, + min(arg->iref_config.rgb_factor, + RKISP1_CIF_ISP_WDR_RGB_FACTOR_MAX)); + } + + rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_WDR_CTRL, value); + + /* RGB and Luminance offsets */ + value = FIELD_PREP(RKISP1_CIF_ISP_WDR_RGB_OFFSET_MASK, + arg->rgb_offset) + | FIELD_PREP(RKISP1_CIF_ISP_WDR_LUM_OFFSET_MASK, + arg->luma_offset); + rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_WDR_OFFSET, value); + + /* DeltaMin */ + value = FIELD_PREP(RKISP1_CIF_ISP_WDR_DMIN_THRESH_MASK, + arg->dmin_thresh) + | FIELD_PREP(RKISP1_CIF_ISP_WDR_DMIN_STRENGTH_MASK, + min(arg->dmin_strength, + RKISP1_CIF_ISP_WDR_DMIN_STRENGTH_MAX)); + rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_WDR_DELTAMIN, value); + + /* Tone curve */ + for (i = 0; i < RKISP1_CIF_ISP_WDR_CURVE_NUM_DY_REGS; i++) + rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_WDR_TONECURVE(i), + arg->tone_curve.dY[i]); + for (i = 0; i < RKISP1_CIF_ISP_WDR_CURVE_NUM_COEFF; i++) + rkisp1_write(params->rkisp1, 
RKISP1_CIF_ISP_WDR_TONECURVE_YM(i), + arg->tone_curve.ym[i] & + RKISP1_CIF_ISP_WDR_TONE_CURVE_YM_MASK); +} + static void rkisp1_isp_isr_other_config(struct rkisp1_params *params, const struct rkisp1_params_cfg *new_params) @@ -2005,6 +2074,25 @@ static void rkisp1_ext_params_compand_compress(struct rkisp1_params *params, RKISP1_CIF_ISP_COMPAND_CTRL_COMPRESS_ENABLE); } +static void rkisp1_ext_params_wdr(struct rkisp1_params *params, + const union rkisp1_ext_params_config *block) +{ + const struct rkisp1_ext_params_wdr_config *wdr = &block->wdr; + + if (wdr->header.flags & RKISP1_EXT_PARAMS_FL_BLOCK_DISABLE) { + rkisp1_param_clear_bits(params, RKISP1_CIF_ISP_WDR_CTRL, + RKISP1_CIF_ISP_WDR_CTRL_ENABLE); + return; + } + + rkisp1_wdr_config(params, &wdr->config); + + if ((wdr->header.flags & RKISP1_EXT_PARAMS_FL_BLOCK_ENABLE) && + !(params->enabled_blocks & BIT(wdr->header.type))) + rkisp1_param_set_bits(params, RKISP1_CIF_ISP_WDR_CTRL, + RKISP1_CIF_ISP_WDR_CTRL_ENABLE); +} + typedef void (*rkisp1_block_handler)(struct rkisp1_params *params, const union rkisp1_ext_params_config *config); @@ -2118,6 +2206,11 @@ static const struct rkisp1_ext_params_handler { .group = RKISP1_EXT_PARAMS_BLOCK_GROUP_OTHERS, .features = RKISP1_FEATURE_COMPAND, }, + [RKISP1_EXT_PARAMS_BLOCK_TYPE_WDR] = { + .size = sizeof(struct rkisp1_ext_params_wdr_config), + .handler = rkisp1_ext_params_wdr, + .group = RKISP1_EXT_PARAMS_BLOCK_GROUP_OTHERS, + }, }; static void rkisp1_ext_params_config(struct rkisp1_params *params, diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h b/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h index 139177db9c6d..fbeb186cde0d 100644 --- a/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h +++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h @@ -703,6 +703,27 @@ #define RKISP1_CIF_ISP_COMPAND_CTRL_SOFT_RESET_FLAG BIT(2) #define RKISP1_CIF_ISP_COMPAND_CTRL_BLS_ENABLE BIT(3) +/* WDR */ +/* ISP_WDR_CTRL */ +#define RKISP1_CIF_ISP_WDR_CTRL_ENABLE BIT(0) +#define RKISP1_CIF_ISP_WDR_COLOR_SPACE_SELECT BIT(1) +#define RKISP1_CIF_ISP_WDR_CR_MAPPING_DISABLE BIT(2) +#define RKISP1_CIF_ISP_WDR_USE_IREF BIT(3) +#define RKISP1_CIF_ISP_WDR_USE_Y9_8 BIT(4) +#define RKISP1_CIF_ISP_WDR_USE_RGB7_8 BIT(5) +#define RKISP1_CIF_ISP_WDR_DISABLE_TRANSIENT BIT(6) +#define RKISP1_CIF_ISP_WDR_RGB_FACTOR_MASK GENMASK(11, 8) +#define RKISP1_CIF_ISP_WDR_RGB_FACTOR_MAX 8U +/* ISP_WDR_TONE_CURVE_YM */ +#define RKISP1_CIF_ISP_WDR_TONE_CURVE_YM_MASK GENMASK(12, 0) +/* ISP_WDR_OFFSET */ +#define RKISP1_CIF_ISP_WDR_RGB_OFFSET_MASK GENMASK(11, 0) +#define RKISP1_CIF_ISP_WDR_LUM_OFFSET_MASK GENMASK(27, 16) +/* ISP_WDR_DELTAMIN */ +#define RKISP1_CIF_ISP_WDR_DMIN_THRESH_MASK GENMASK(11, 0) +#define RKISP1_CIF_ISP_WDR_DMIN_STRENGTH_MASK GENMASK(20, 16) +#define RKISP1_CIF_ISP_WDR_DMIN_STRENGTH_MAX 16U + /* =================================================================== */ /* CIF Registers */ /* =================================================================== */ @@ -1295,82 +1316,12 @@ #define RKISP1_CIF_ISP_WDR_BASE 0x00002a00 #define RKISP1_CIF_ISP_WDR_CTRL (RKISP1_CIF_ISP_WDR_BASE + 0x00000000) -#define RKISP1_CIF_ISP_WDR_TONECURVE_1 (RKISP1_CIF_ISP_WDR_BASE + 0x00000004) -#define RKISP1_CIF_ISP_WDR_TONECURVE_2 (RKISP1_CIF_ISP_WDR_BASE + 0x00000008) -#define RKISP1_CIF_ISP_WDR_TONECURVE_3 (RKISP1_CIF_ISP_WDR_BASE + 0x0000000c) -#define RKISP1_CIF_ISP_WDR_TONECURVE_4 (RKISP1_CIF_ISP_WDR_BASE + 0x00000010) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_0 (RKISP1_CIF_ISP_WDR_BASE + 0x00000014) -#define 
RKISP1_CIF_ISP_WDR_TONECURVE_YM_1 (RKISP1_CIF_ISP_WDR_BASE + 0x00000018) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_2 (RKISP1_CIF_ISP_WDR_BASE + 0x0000001c) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_3 (RKISP1_CIF_ISP_WDR_BASE + 0x00000020) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_4 (RKISP1_CIF_ISP_WDR_BASE + 0x00000024) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_5 (RKISP1_CIF_ISP_WDR_BASE + 0x00000028) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_6 (RKISP1_CIF_ISP_WDR_BASE + 0x0000002c) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_7 (RKISP1_CIF_ISP_WDR_BASE + 0x00000030) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_8 (RKISP1_CIF_ISP_WDR_BASE + 0x00000034) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_9 (RKISP1_CIF_ISP_WDR_BASE + 0x00000038) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_10 (RKISP1_CIF_ISP_WDR_BASE + 0x0000003c) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_11 (RKISP1_CIF_ISP_WDR_BASE + 0x00000040) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_12 (RKISP1_CIF_ISP_WDR_BASE + 0x00000044) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_13 (RKISP1_CIF_ISP_WDR_BASE + 0x00000048) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_14 (RKISP1_CIF_ISP_WDR_BASE + 0x0000004c) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_15 (RKISP1_CIF_ISP_WDR_BASE + 0x00000050) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_16 (RKISP1_CIF_ISP_WDR_BASE + 0x00000054) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_17 (RKISP1_CIF_ISP_WDR_BASE + 0x00000058) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_18 (RKISP1_CIF_ISP_WDR_BASE + 0x0000005c) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_19 (RKISP1_CIF_ISP_WDR_BASE + 0x00000060) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_20 (RKISP1_CIF_ISP_WDR_BASE + 0x00000064) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_21 (RKISP1_CIF_ISP_WDR_BASE + 0x00000068) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_22 (RKISP1_CIF_ISP_WDR_BASE + 0x0000006c) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_23 (RKISP1_CIF_ISP_WDR_BASE + 0x00000070) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_24 (RKISP1_CIF_ISP_WDR_BASE + 0x00000074) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_25 (RKISP1_CIF_ISP_WDR_BASE + 0x00000078) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_26 (RKISP1_CIF_ISP_WDR_BASE + 0x0000007c) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_27 (RKISP1_CIF_ISP_WDR_BASE + 0x00000080) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_28 (RKISP1_CIF_ISP_WDR_BASE + 0x00000084) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_29 (RKISP1_CIF_ISP_WDR_BASE + 0x00000088) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_30 (RKISP1_CIF_ISP_WDR_BASE + 0x0000008c) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_31 (RKISP1_CIF_ISP_WDR_BASE + 0x00000090) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_32 (RKISP1_CIF_ISP_WDR_BASE + 0x00000094) +#define RKISP1_CIF_ISP_WDR_TONECURVE(n) (RKISP1_CIF_ISP_WDR_BASE + 0x00000004 + (n) * 4) +#define RKISP1_CIF_ISP_WDR_TONECURVE_YM(n) (RKISP1_CIF_ISP_WDR_BASE + 0x00000014 + (n) * 4) #define RKISP1_CIF_ISP_WDR_OFFSET (RKISP1_CIF_ISP_WDR_BASE + 0x00000098) #define RKISP1_CIF_ISP_WDR_DELTAMIN (RKISP1_CIF_ISP_WDR_BASE + 0x0000009c) -#define RKISP1_CIF_ISP_WDR_TONECURVE_1_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x000000a0) -#define RKISP1_CIF_ISP_WDR_TONECURVE_2_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x000000a4) -#define RKISP1_CIF_ISP_WDR_TONECURVE_3_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x000000a8) -#define RKISP1_CIF_ISP_WDR_TONECURVE_4_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x000000ac) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_0_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x000000b0) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_1_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x000000b4) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_2_SHD (RKISP1_CIF_ISP_WDR_BASE + 
0x000000b8) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_3_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x000000bc) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_4_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x000000c0) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_5_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x000000c4) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_6_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x000000c8) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_7_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x000000cc) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_8_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x000000d0) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_9_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x000000d4) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_10_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x000000d8) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_11_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x000000dc) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_12_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x000000e0) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_13_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x000000e4) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_14_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x000000e8) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_15_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x000000ec) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_16_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x000000f0) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_17_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x000000f4) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_18_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x000000f8) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_19_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x000000fc) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_20_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x00000100) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_21_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x00000104) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_22_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x00000108) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_23_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x0000010c) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_24_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x00000110) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_25_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x00000114) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_26_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x00000118) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_27_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x0000011c) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_28_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x00000120) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_29_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x00000124) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_30_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x00000128) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_31_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x0000012c) -#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_32_SHD (RKISP1_CIF_ISP_WDR_BASE + 0x00000130) +#define RKISP1_CIF_ISP_WDR_TONECURVE_SHD(n) (RKISP1_CIF_ISP_WDR_BASE + 0x000000a0 + (n) * 4) +#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_SHD(n) (RKISP1_CIF_ISP_WDR_BASE + 0x000000b0 + (n) * 4) #define RKISP1_CIF_ISP_HIST_BASE_V12 0x00002c00 #define RKISP1_CIF_ISP_HIST_CTRL_V12 (RKISP1_CIF_ISP_HIST_BASE_V12 + 0x00000000) diff --git a/include/uapi/linux/rkisp1-config.h b/include/uapi/linux/rkisp1-config.h index 5ca4d5961c5b..3b060ea6eed7 100644 --- a/include/uapi/linux/rkisp1-config.h +++ b/include/uapi/linux/rkisp1-config.h @@ -169,6 +169,13 @@ */ #define RKISP1_CIF_ISP_COMPAND_NUM_POINTS 64 +/* + * Wide Dynamic Range + */ +#define RKISP1_CIF_ISP_WDR_CURVE_NUM_INTERV 32 +#define RKISP1_CIF_ISP_WDR_CURVE_NUM_COEFF (RKISP1_CIF_ISP_WDR_CURVE_NUM_INTERV + 1) +#define RKISP1_CIF_ISP_WDR_CURVE_NUM_DY_REGS 4 + /* * Measurement types */ @@ -889,6 +896,72 @@ struct rkisp1_cif_isp_compand_curve_config { __u32 
y[RKISP1_CIF_ISP_COMPAND_NUM_POINTS]; }; +/** + * struct rkisp1_cif_isp_wdr_tone_curve - Tone mapping curve definition for WDR. + * + * @dY: the dYn increments for horizontal (input) axis of the tone curve. + * each 3-bit dY value represents an increment of 2**(value+3). + * dY[0] bits 0:2 is increment dY1, bit 3 unused + * dY[0] bits 4:6 is increment dY2, bit 7 unused + * ... + * dY[0] bits 28:30 is increment dY8, bit 31 unused + * ... and so on till dY[3] bits 28:30 is increment dY32, bit 31 unused. + * @ym: the Ym values for the vertical (output) axis of the tone curve. + * each value is 13 bit. + */ +struct rkisp1_cif_isp_wdr_tone_curve { + __u32 dY[RKISP1_CIF_ISP_WDR_CURVE_NUM_DY_REGS]; + __u16 ym[RKISP1_CIF_ISP_WDR_CURVE_NUM_COEFF]; +}; + +/** + * struct rkisp1_cif_isp_wdr_iref_config - Illumination reference config for WDR. + * + * Use illumination reference value as described below, instead of only the + * luminance (Y) value for tone mapping and gain calculations: + * IRef = (rgb_factor * RGBMax_tr + (8 - rgb_factor) * Y)/8 + * + * @rgb_factor: defines how much influence the RGBmax approach has in + * comparison to Y (valid values are 0..8). + * @use_y9_8: use Y*9/8 for maximum value calculation along with the + * default of R, G, B for noise reduction. + * @use_rgb7_8: decrease RGBMax by 7/8 for noise reduction. + * @disable_transient: disable transient calculation between Y and RGBY_max. + */ +struct rkisp1_cif_isp_wdr_iref_config { + __u8 rgb_factor; + __u8 use_y9_8; + __u8 use_rgb7_8; + __u8 disable_transient; +}; + +/** + * struct rkisp1_cif_isp_wdr_config - Configuration for wide dynamic range. + * + * @tone_curve: tone mapping curve. + * @iref_config: illumination reference configuration. (when use_iref is true) + * @rgb_offset: RGB offset value for RGB operation mode. (12 bits) + * @luma_offset: luminance offset value for RGB operation mode. (12 bits) + * @dmin_thresh: lower threshold for deltaMin value. (12 bits) + * @dmin_strength: strength factor for deltaMin. (valid range is 0x00..0x10) + * @use_rgb_colorspace: use RGB instead of luminance/chrominance colorspace. + * @bypass_chroma_mapping: disable chrominance mapping (only valid if + * use_rgb_colorspace = 0) + * @use_iref: use illumination reference instead of Y for tone mapping + * and gain calculations. 
+ */ +struct rkisp1_cif_isp_wdr_config { + struct rkisp1_cif_isp_wdr_tone_curve tone_curve; + struct rkisp1_cif_isp_wdr_iref_config iref_config; + __u16 rgb_offset; + __u16 luma_offset; + __u16 dmin_thresh; + __u8 dmin_strength; + __u8 use_rgb_colorspace; + __u8 bypass_chroma_mapping; + __u8 use_iref; +}; + /*---------- PART2: Measurement Statistics ------------*/ /** @@ -1059,6 +1132,7 @@ struct rkisp1_stat_buffer { * @RKISP1_EXT_PARAMS_BLOCK_TYPE_COMPAND_BLS: BLS in the compand block * @RKISP1_EXT_PARAMS_BLOCK_TYPE_COMPAND_EXPAND: Companding expand curve * @RKISP1_EXT_PARAMS_BLOCK_TYPE_COMPAND_COMPRESS: Companding compress curve + * @RKISP1_EXT_PARAMS_BLOCK_TYPE_WDR: Wide dynamic range */ enum rkisp1_ext_params_block_type { RKISP1_EXT_PARAMS_BLOCK_TYPE_BLS, @@ -1081,6 +1155,7 @@ enum rkisp1_ext_params_block_type { RKISP1_EXT_PARAMS_BLOCK_TYPE_COMPAND_BLS, RKISP1_EXT_PARAMS_BLOCK_TYPE_COMPAND_EXPAND, RKISP1_EXT_PARAMS_BLOCK_TYPE_COMPAND_COMPRESS, + RKISP1_EXT_PARAMS_BLOCK_TYPE_WDR, }; #define RKISP1_EXT_PARAMS_FL_BLOCK_DISABLE (1U << 0) @@ -1463,6 +1538,23 @@ struct rkisp1_ext_params_compand_curve_config { struct rkisp1_cif_isp_compand_curve_config config; } __attribute__((aligned(8))); +/** + * struct rkisp1_ext_params_wdr_config - RkISP1 extensible params + * Wide dynamic range config + * + * RkISP1 extensible parameters WDR block. + * Identified by :c:type:`RKISP1_EXT_PARAMS_BLOCK_TYPE_WDR` + * + * @header: The RkISP1 extensible parameters header, see + * :c:type:`rkisp1_ext_params_block_header` + * @config: WDR configuration, see + * :c:type:`rkisp1_cif_isp_wdr_config` + */ +struct rkisp1_ext_params_wdr_config { + struct rkisp1_ext_params_block_header header; + struct rkisp1_cif_isp_wdr_config config; +} __attribute__((aligned(8))); + /* * The rkisp1_ext_params_compand_curve_config structure is counted twice as it * is used for both the COMPAND_EXPAND and COMPAND_COMPRESS block types. @@ -1487,7 +1579,8 @@ struct rkisp1_ext_params_compand_curve_config { sizeof(struct rkisp1_ext_params_afc_config) +\ sizeof(struct rkisp1_ext_params_compand_bls_config) +\ sizeof(struct rkisp1_ext_params_compand_curve_config) +\ - sizeof(struct rkisp1_ext_params_compand_curve_config)) + sizeof(struct rkisp1_ext_params_compand_curve_config) +\ + sizeof(struct rkisp1_ext_params_wdr_config)) /** * enum rksip1_ext_param_buffer_version - RkISP1 extensible parameters version -- cgit v1.2.3 From c6d732c38f93c4aebd204a5656583142289c3a2e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jun 2025 13:22:40 -0700 Subject: net: ethtool: remove duplicate defines for family info Commit under fixes switched to uAPI generation from the YAML spec. A number of custom defines were left behind, mostly for commands very hard to express in YAML spec. Among what was left behind was the name and version of the generic netlink family. Problem is that the codegen always outputs those values so we ended up with a duplicated, differently named set of defines. Provide naming info in YAML and remove the incorrect defines. 
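For reference, userspace resolves the family by this name at runtime,
e.g. with libnl-genl (an illustrative sketch only, not part of the patch):

#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/ethtool_netlink_generated.h>

int resolve_ethtool_family(void)
{
	struct nl_sock *sk = nl_socket_alloc();
	int family = -1;

	if (!sk)
		return -1;
	if (!genl_connect(sk))
		family = genl_ctrl_resolve(sk, ETHTOOL_GENL_NAME);
	nl_socket_free(sk);
	return family;	/* numeric family id, or a negative error */
}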
Fixes: 8d0580c6ebdd ("ethtool: regenerate uapi header from the spec") Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250617202240.811179-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 3 +++ include/uapi/linux/ethtool_netlink.h | 4 ---- include/uapi/linux/ethtool_netlink_generated.h | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 9f98715a6512..72a076b0e1b5 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -7,6 +7,9 @@ protocol: genetlink-legacy doc: Partial family for Ethtool Netlink. uapi-header: linux/ethtool_netlink_generated.h +c-family-name: ethtool-genl-name +c-version-name: ethtool-genl-version + definitions: - name: udp-tunnel-type diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 9ff72cfb2e98..09a75bdb6560 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -208,10 +208,6 @@ enum { ETHTOOL_A_STATS_PHY_MAX = (__ETHTOOL_A_STATS_PHY_CNT - 1) }; -/* generic netlink info */ -#define ETHTOOL_GENL_NAME "ethtool" -#define ETHTOOL_GENL_VERSION 1 - #define ETHTOOL_MCGRP_MONITOR_NAME "monitor" #endif /* _UAPI_LINUX_ETHTOOL_NETLINK_H_ */ diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 9a02f579de22..aa8ab5227c1e 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -6,8 +6,8 @@ #ifndef _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H #define _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H -#define ETHTOOL_FAMILY_NAME "ethtool" -#define ETHTOOL_FAMILY_VERSION 1 +#define ETHTOOL_GENL_NAME "ethtool" +#define ETHTOOL_GENL_VERSION 1 enum { ETHTOOL_UDP_TUNNEL_TYPE_VXLAN, -- cgit v1.2.3 From fc0e6db30941a66e284b8516b82356f97f31061d Mon Sep 17 00:00:00 2001 From: "Kory Maincent (Dent Project)" Date: Tue, 17 Jun 2025 14:12:01 +0200 Subject: net: pse-pd: Add support for reporting events Add support for devm_pse_irq_helper() to register PSE interrupts and report events such as over-current or over-temperature conditions. This follows a similar approach to the regulator API but also sends notifications using a dedicated PSE ethtool netlink socket. 
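For illustration, a PSE driver would hook this up roughly as below (a
hypothetical kernel-side sketch, not taken from any in-tree driver; the
foo_* state and register layout are made up):

#include <linux/bitops.h>
#include <linux/io.h>
#include <linux/pse-pd/pse.h>

struct foo_pse {
	struct pse_controller_dev pcdev;
	void __iomem *base;
};

#define FOO_REG_EVENTS		0x10		/* made-up event status register */
#define FOO_OVER_CURRENT(i)	BIT(2 * (i))
#define FOO_OVER_TEMP(i)	BIT(2 * (i) + 1)

static int foo_pse_map_event(int irq, struct pse_controller_dev *pcdev,
			     unsigned long *notifs,
			     unsigned long *notifs_mask)
{
	struct foo_pse *priv = container_of(pcdev, struct foo_pse, pcdev);
	u32 status = readl(priv->base + FOO_REG_EVENTS);
	unsigned int i;

	for (i = 0; i < pcdev->nr_lines; i++) {
		if (status & FOO_OVER_CURRENT(i))
			notifs[i] |= ETHTOOL_PSE_EVENT_OVER_CURRENT;
		if (status & FOO_OVER_TEMP(i))
			notifs[i] |= ETHTOOL_PSE_EVENT_OVER_TEMP;
		if (notifs[i])
			*notifs_mask |= BIT(i);
	}

	return 0;
}

static const struct pse_irq_desc foo_pse_irq_desc = {
	.name = "events",
	.map_event = foo_pse_map_event,
};

/* in probe(): ret = devm_pse_irq_helper(&priv->pcdev, irq, 0, &foo_pse_irq_desc); */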
Signed-off-by: Kory Maincent (Dent Project) Link: https://patch.msgid.link/20250617-feature_poe_port_prio-v14-2-78a1a645e2ee@bootlin.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 34 +++++ Documentation/networking/ethtool-netlink.rst | 19 +++ drivers/net/pse-pd/pse_core.c | 179 +++++++++++++++++++++++++ include/linux/ethtool_netlink.h | 7 + include/linux/pse-pd/pse.h | 20 +++ include/uapi/linux/ethtool_netlink_generated.h | 19 +++ net/ethtool/pse-pd.c | 39 ++++++ 7 files changed, 317 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index ed9bcdec01cc..92b34a19f308 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -118,6 +118,17 @@ definitions: doc: | Hardware timestamp comes from one PHY device of the network topology + - + name: pse-event + doc: PSE event list for the PSE controller + type: flags + entries: + - + name: over-current + doc: PSE output current is too high + - + name: over-temp + doc: PSE in over temperature state attribute-sets: - @@ -1555,6 +1566,19 @@ attribute-sets: name: hwtstamp-flags type: nest nested-attributes: bitset + - + name: pse-ntf + attr-cnt-name: --ethtool-a-pse-ntf-cnt + attributes: + - + name: header + type: nest + nested-attributes: header + - + name: events + type: uint + enum: pse-event + doc: List of events reported by the PSE controller operations: enum-model: directional @@ -2413,3 +2437,13 @@ operations: attributes: *tsconfig reply: attributes: *tsconfig + - + name: pse-ntf + doc: Notification for PSE events. + + attribute-set: pse-ntf + + event: + attributes: + - header + - events diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index b6e9af4d0f1b..433737865bc2 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -290,6 +290,7 @@ Kernel to userspace: ``ETHTOOL_MSG_PHY_NTF`` Ethernet PHY information change ``ETHTOOL_MSG_TSCONFIG_GET_REPLY`` hw timestamping configuration ``ETHTOOL_MSG_TSCONFIG_SET_REPLY`` new hw timestamping configuration + ``ETHTOOL_MSG_PSE_NTF`` PSE events notification ======================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -1896,6 +1897,24 @@ various existing products that document power consumption in watts rather than classes. If power limit configuration based on classes is needed, the conversion can be done in user space, for example by ethtool. +PSE_NTF +======= + +Notify PSE events. + +Notification contents: + + =============================== ====== ======================== + ``ETHTOOL_A_PSE_HEADER`` nested request header + ``ETHTOOL_A_PSE_EVENTS`` bitset PSE events + =============================== ====== ======================== + +When set, the optional ``ETHTOOL_A_PSE_EVENTS`` attribute identifies the +PSE events. + +.. 
kernel-doc:: include/uapi/linux/ethtool_netlink_generated.h + :identifiers: ethtool_pse_event + RSS_GET ======= diff --git a/drivers/net/pse-pd/pse_core.c b/drivers/net/pse-pd/pse_core.c index 4610c1f0ddd6..16cc1dc07246 100644 --- a/drivers/net/pse-pd/pse_core.c +++ b/drivers/net/pse-pd/pse_core.c @@ -7,10 +7,14 @@ #include #include +#include #include +#include #include #include #include +#include +#include static DEFINE_MUTEX(pse_list_mutex); static LIST_HEAD(pse_controller_list); @@ -210,6 +214,48 @@ out: return ret; } +/** + * pse_control_find_net_by_id - Find net attached to the pse control id + * @pcdev: a pointer to the PSE + * @id: index of the PSE control + * + * Return: pse_control pointer or NULL. The device returned has had a + * reference added and the pointer is safe until the user calls + * pse_control_put() to indicate they have finished with it. + */ +static struct pse_control * +pse_control_find_by_id(struct pse_controller_dev *pcdev, int id) +{ + struct pse_control *psec; + + mutex_lock(&pse_list_mutex); + list_for_each_entry(psec, &pcdev->pse_control_head, list) { + if (psec->id == id) { + kref_get(&psec->refcnt); + mutex_unlock(&pse_list_mutex); + return psec; + } + } + mutex_unlock(&pse_list_mutex); + return NULL; +} + +/** + * pse_control_get_netdev - Return netdev associated to a PSE control + * @psec: PSE control pointer + * + * Return: netdev pointer or NULL + */ +static struct net_device *pse_control_get_netdev(struct pse_control *psec) +{ + ASSERT_RTNL(); + + if (!psec || !psec->attached_phydev) + return NULL; + + return psec->attached_phydev->attached_dev; +} + static int pse_pi_is_enabled(struct regulator_dev *rdev) { struct pse_controller_dev *pcdev = rdev_get_drvdata(rdev); @@ -559,6 +605,139 @@ int devm_pse_controller_register(struct device *dev, } EXPORT_SYMBOL_GPL(devm_pse_controller_register); +struct pse_irq { + struct pse_controller_dev *pcdev; + struct pse_irq_desc desc; + unsigned long *notifs; +}; + +/** + * pse_to_regulator_notifs - Convert PSE notifications to Regulator + * notifications + * @notifs: PSE notifications + * + * Return: Regulator notifications + */ +static unsigned long pse_to_regulator_notifs(unsigned long notifs) +{ + unsigned long rnotifs = 0; + + if (notifs & ETHTOOL_PSE_EVENT_OVER_CURRENT) + rnotifs |= REGULATOR_EVENT_OVER_CURRENT; + if (notifs & ETHTOOL_PSE_EVENT_OVER_TEMP) + rnotifs |= REGULATOR_EVENT_OVER_TEMP; + + return rnotifs; +} + +/** + * pse_isr - IRQ handler for PSE + * @irq: irq number + * @data: pointer to user interrupt structure + * + * Return: irqreturn_t - status of IRQ + */ +static irqreturn_t pse_isr(int irq, void *data) +{ + struct pse_controller_dev *pcdev; + unsigned long notifs_mask = 0; + struct pse_irq_desc *desc; + struct pse_irq *h = data; + int ret, i; + + desc = &h->desc; + pcdev = h->pcdev; + + /* Clear notifs mask */ + memset(h->notifs, 0, pcdev->nr_lines * sizeof(*h->notifs)); + mutex_lock(&pcdev->lock); + ret = desc->map_event(irq, pcdev, h->notifs, ¬ifs_mask); + mutex_unlock(&pcdev->lock); + if (ret || !notifs_mask) + return IRQ_NONE; + + for_each_set_bit(i, ¬ifs_mask, pcdev->nr_lines) { + unsigned long notifs, rnotifs; + struct net_device *netdev; + struct pse_control *psec; + + /* Do nothing PI not described */ + if (!pcdev->pi[i].rdev) + continue; + + notifs = h->notifs[i]; + dev_dbg(h->pcdev->dev, + "Sending PSE notification EVT 0x%lx\n", notifs); + + psec = pse_control_find_by_id(pcdev, i); + rtnl_lock(); + netdev = pse_control_get_netdev(psec); + if (netdev) + ethnl_pse_send_ntf(netdev, 
notifs); + rtnl_unlock(); + pse_control_put(psec); + + rnotifs = pse_to_regulator_notifs(notifs); + regulator_notifier_call_chain(pcdev->pi[i].rdev, rnotifs, + NULL); + } + + return IRQ_HANDLED; +} + +/** + * devm_pse_irq_helper - Register IRQ based PSE event notifier + * @pcdev: a pointer to the PSE + * @irq: the irq value to be passed to request_irq + * @irq_flags: the flags to be passed to request_irq + * @d: PSE interrupt description + * + * Return: 0 on success and errno on failure + */ +int devm_pse_irq_helper(struct pse_controller_dev *pcdev, int irq, + int irq_flags, const struct pse_irq_desc *d) +{ + struct device *dev = pcdev->dev; + size_t irq_name_len; + struct pse_irq *h; + char *irq_name; + int ret; + + if (!d || !d->map_event || !d->name) + return -EINVAL; + + h = devm_kzalloc(dev, sizeof(*h), GFP_KERNEL); + if (!h) + return -ENOMEM; + + h->pcdev = pcdev; + h->desc = *d; + + /* IRQ name len is pcdev dev name + 5 char + irq desc name + 1 */ + irq_name_len = strlen(dev_name(pcdev->dev)) + 5 + strlen(d->name) + 1; + irq_name = devm_kzalloc(dev, irq_name_len, GFP_KERNEL); + if (!irq_name) + return -ENOMEM; + + snprintf(irq_name, irq_name_len, "pse-%s:%s", dev_name(pcdev->dev), + d->name); + + h->notifs = devm_kcalloc(dev, pcdev->nr_lines, + sizeof(*h->notifs), GFP_KERNEL); + if (!h->notifs) + return -ENOMEM; + + ret = devm_request_threaded_irq(dev, irq, NULL, pse_isr, + IRQF_ONESHOT | irq_flags, + irq_name, h); + if (ret) + dev_err(pcdev->dev, "Failed to request IRQ %d\n", irq); + + pcdev->irq = irq; + return ret; +} +EXPORT_SYMBOL_GPL(devm_pse_irq_helper); + /* PSE control section */ static void __pse_control_release(struct kref *kref) diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index aba91335273a..1dcc4059b5ab 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -43,6 +43,8 @@ void ethtool_aggregate_rmon_stats(struct net_device *dev, struct ethtool_rmon_stats *rmon_stats); bool ethtool_dev_mm_supported(struct net_device *dev); +void ethnl_pse_send_ntf(struct net_device *netdev, unsigned long notif); + #else static inline int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd) { @@ -120,6 +122,11 @@ static inline bool ethtool_dev_mm_supported(struct net_device *dev) return false; } +static inline void ethnl_pse_send_ntf(struct phy_device *phydev, + unsigned long notif) +{ +} + #endif /* IS_ENABLED(CONFIG_ETHTOOL_NETLINK) */ static inline int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index 8b0866fad2ad..6eb064722aa8 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -7,12 +7,15 @@ #include #include +#include +#include /* Maximum current in uA according to IEEE 802.3-2022 Table 145-1 */ #define MAX_PI_CURRENT 1920000 /* Maximum power in mW according to IEEE 802.3-2022 Table 145-16 */ #define MAX_PI_PW 99900 +struct net_device; struct phy_device; struct pse_controller_dev; struct netlink_ext_ack; @@ -37,6 +40,19 @@ struct ethtool_c33_pse_pw_limit_range { u32 max; }; +/** + * struct pse_irq_desc - notification sender description for IRQ based events. + * + * @name: the visible name for the IRQ + * @map_event: driver callback to map IRQ status into PSE devices with events. 
+ */ +struct pse_irq_desc { + const char *name; + int (*map_event)(int irq, struct pse_controller_dev *pcdev, + unsigned long *notifs, + unsigned long *notifs_mask); +}; + /** * struct pse_control_config - PSE control/channel configuration. * @@ -228,6 +244,7 @@ struct pse_pi { * @types: types of the PSE controller * @pi: table of PSE PIs described in this controller device * @no_of_pse_pi: flag set if the pse_pis devicetree node is not used + * @irq: PSE interrupt */ struct pse_controller_dev { const struct pse_controller_ops *ops; @@ -241,6 +258,7 @@ struct pse_controller_dev { enum ethtool_pse_types types; struct pse_pi *pi; bool no_of_pse_pi; + int irq; }; #if IS_ENABLED(CONFIG_PSE_CONTROLLER) @@ -249,6 +267,8 @@ void pse_controller_unregister(struct pse_controller_dev *pcdev); struct device; int devm_pse_controller_register(struct device *dev, struct pse_controller_dev *pcdev); +int devm_pse_irq_helper(struct pse_controller_dev *pcdev, int irq, + int irq_flags, const struct pse_irq_desc *d); struct pse_control *of_pse_control_get(struct device_node *node, struct phy_device *phydev); diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 9a02f579de22..3864aa0de8c7 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -49,6 +49,16 @@ enum hwtstamp_source { HWTSTAMP_SOURCE_PHYLIB, }; +/** + * enum ethtool_pse_event - PSE event list for the PSE controller + * @ETHTOOL_PSE_EVENT_OVER_CURRENT: PSE output current is too high + * @ETHTOOL_PSE_EVENT_OVER_TEMP: PSE in over temperature state + */ +enum ethtool_pse_event { + ETHTOOL_PSE_EVENT_OVER_CURRENT = 1, + ETHTOOL_PSE_EVENT_OVER_TEMP = 2, +}; + enum { ETHTOOL_A_HEADER_UNSPEC, ETHTOOL_A_HEADER_DEV_INDEX, @@ -718,6 +728,14 @@ enum { ETHTOOL_A_TSCONFIG_MAX = (__ETHTOOL_A_TSCONFIG_CNT - 1) }; +enum { + ETHTOOL_A_PSE_NTF_HEADER = 1, + ETHTOOL_A_PSE_NTF_EVENTS, + + __ETHTOOL_A_PSE_NTF_CNT, + ETHTOOL_A_PSE_NTF_MAX = (__ETHTOOL_A_PSE_NTF_CNT - 1) +}; + enum { ETHTOOL_MSG_USER_NONE = 0, ETHTOOL_MSG_STRSET_GET = 1, @@ -822,6 +840,7 @@ enum { ETHTOOL_MSG_PHY_NTF, ETHTOOL_MSG_TSCONFIG_GET_REPLY, ETHTOOL_MSG_TSCONFIG_SET_REPLY, + ETHTOOL_MSG_PSE_NTF, __ETHTOOL_MSG_KERNEL_CNT, ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1) diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c index 4f6b99eab2a6..5443b4e0065a 100644 --- a/net/ethtool/pse-pd.c +++ b/net/ethtool/pse-pd.c @@ -315,3 +315,42 @@ const struct ethnl_request_ops ethnl_pse_request_ops = { .set = ethnl_set_pse, /* PSE has no notification */ }; + +void ethnl_pse_send_ntf(struct net_device *netdev, unsigned long notifs) +{ + void *reply_payload; + struct sk_buff *skb; + int reply_len; + int ret; + + ASSERT_RTNL(); + + if (!netdev || !notifs) + return; + + reply_len = ethnl_reply_header_size() + + nla_total_size(sizeof(u32)); /* _PSE_NTF_EVENTS */ + + skb = genlmsg_new(reply_len, GFP_KERNEL); + if (!skb) + return; + + reply_payload = ethnl_bcastmsg_put(skb, ETHTOOL_MSG_PSE_NTF); + if (!reply_payload) + goto err_skb; + + ret = ethnl_fill_reply_header(skb, netdev, ETHTOOL_A_PSE_NTF_HEADER); + if (ret < 0) + goto err_skb; + + if (nla_put_uint(skb, ETHTOOL_A_PSE_NTF_EVENTS, notifs)) + goto err_skb; + + genlmsg_end(skb, reply_payload); + ethnl_multicast(skb, netdev); + return; + +err_skb: + nlmsg_free(skb); +} +EXPORT_SYMBOL_GPL(ethnl_pse_send_ntf); -- cgit v1.2.3 From 1176978ed851952652ddea3685e2f71a0e5d61ff Mon Sep 17 00:00:00 2001 From: "Kory Maincent (Dent Project)" Date: 
Tue, 17 Jun 2025 14:12:04 +0200 Subject: net: ethtool: Add support for new power domains index description Report the index of the newly introduced PSE power domain to the user, enabling improved management of the power budget for PSE devices. Signed-off-by: Kory Maincent (Dent Project) Reviewed-by: Oleksij Rempel Link: https://patch.msgid.link/20250617-feature_poe_port_prio-v14-5-78a1a645e2ee@bootlin.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 5 +++++ Documentation/networking/ethtool-netlink.rst | 4 ++++ drivers/net/pse-pd/pse_core.c | 3 +++ include/linux/pse-pd/pse.h | 2 ++ include/uapi/linux/ethtool_netlink_generated.h | 1 + net/ethtool/pse-pd.c | 7 +++++++ 6 files changed, 22 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 92b34a19f308..dfd9b842a4e7 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -1406,6 +1406,10 @@ attribute-sets: type: nest multi-attr: true nested-attributes: c33-pse-pw-limit + - + name: pse-pw-d-id + type: u32 + name-prefix: ethtool-a- - name: rss attr-cnt-name: __ethtool-a-rss-cnt @@ -2229,6 +2233,7 @@ operations: - c33-pse-ext-substate - c33-pse-avail-pw-limit - c33-pse-pw-limit-ranges + - pse-pw-d-id dump: *pse-get-op - name: pse-set diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 433737865bc2..e9af8e58564c 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1789,6 +1789,7 @@ Kernel response contents: limit of the PoE PSE. ``ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES`` nested Supported power limit configuration ranges. + ``ETHTOOL_A_PSE_PW_D_ID`` u32 Index of the PSE power domain ========================================== ====== ============================= When set, the optional ``ETHTOOL_A_PODL_PSE_ADMIN_STATE`` attribute identifies @@ -1862,6 +1863,9 @@ identifies the C33 PSE power limit ranges through If the controller works with fixed classes, the min and max values will be equal. +The ``ETHTOOL_A_PSE_PW_D_ID`` attribute identifies the index of PSE power +domain. + PSE_SET ======= diff --git a/drivers/net/pse-pd/pse_core.c b/drivers/net/pse-pd/pse_core.c index f2fb7ccbc4c2..7d424c22225e 100644 --- a/drivers/net/pse-pd/pse_core.c +++ b/drivers/net/pse-pd/pse_core.c @@ -1098,6 +1098,9 @@ int pse_ethtool_get_status(struct pse_control *psec, pcdev = psec->pcdev; ops = pcdev->ops; mutex_lock(&pcdev->lock); + if (pcdev->pi[psec->id].pw_d) + status->pw_d_id = pcdev->pi[psec->id].pw_d->id; + ret = ops->pi_get_admin_state(pcdev, psec->id, &admin_state); if (ret) goto out; diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index f736b1677ea5..2f8ecfd87d43 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -114,6 +114,7 @@ struct pse_pw_limit_ranges { /** * struct ethtool_pse_control_status - PSE control/channel status. * + * @pw_d_id: PSE power domain index. * @podl_admin_state: operational state of the PoDL PSE * functions. IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState * @podl_pw_status: power detection status of the PoDL PSE. 
@@ -135,6 +136,7 @@ struct pse_pw_limit_ranges * ranges */ struct ethtool_pse_control_status { + u32 pw_d_id; enum ethtool_podl_pse_admin_state podl_admin_state; enum ethtool_podl_pse_pw_d_status podl_pw_status; enum ethtool_c33_pse_admin_state c33_admin_state; diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 3864aa0de8c7..ed344c8533eb 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -652,6 +652,7 @@ enum { ETHTOOL_A_C33_PSE_EXT_SUBSTATE, ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT, ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES, + ETHTOOL_A_PSE_PW_D_ID, __ETHTOOL_A_PSE_CNT, ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1) diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c index 5443b4e0065a..6a978a55959e 100644 --- a/net/ethtool/pse-pd.c +++ b/net/ethtool/pse-pd.c @@ -83,6 +83,8 @@ static int pse_reply_size(const struct ethnl_req_info *req_base, const struct ethtool_pse_control_status *st = &data->status; int len = 0; + if (st->pw_d_id) + len += nla_total_size(sizeof(u32)); /* _PSE_PW_D_ID */ if (st->podl_admin_state > 0) len += nla_total_size(sizeof(u32)); /* _PODL_PSE_ADMIN_STATE */ if (st->podl_pw_status > 0) @@ -148,6 +150,11 @@ static int pse_fill_reply(struct sk_buff *skb, const struct pse_reply_data *data = PSE_REPDATA(reply_base); const struct ethtool_pse_control_status *st = &data->status; + if (st->pw_d_id && + nla_put_u32(skb, ETHTOOL_A_PSE_PW_D_ID, + st->pw_d_id)) + return -EMSGSIZE; + if (st->podl_admin_state > 0 && nla_put_u32(skb, ETHTOOL_A_PODL_PSE_ADMIN_STATE, st->podl_admin_state)) -- cgit v1.2.3 From ffef61d6d27374542f1bce4452200d9bdd2e1edd Mon Sep 17 00:00:00 2001 From: "Kory Maincent (Dent Project)" Date: Tue, 17 Jun 2025 14:12:06 +0200 Subject: net: pse-pd: Add support for budget evaluation strategies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces the ability to configure the PSE PI budget evaluation strategies. Budget evaluation strategies are utilized by PSE controllers to determine which ports to turn off first in scenarios such as power budget exceedance. The pis_prio_max value is used to define the maximum priority level supported by the controller. Both the current priority and the maximum priority are exposed to the user through the pse_ethtool_get_status call. This patch adds support for two modes of budget evaluation strategies. 1. Static Method: This method involves distributing power based on PD classification. It’s straightforward and stable, with the PSE core keeping track of the budget and subtracting the power requested by each PD’s class. Advantages: Every PD gets its promised power at any time, which guarantees reliability. Disadvantages: PD classification steps are large, meaning devices request much more power than they actually need. As a result, the power supply may only operate at, say, 50% capacity, which is inefficient and wastes money. The priority max value matches the number of PSE PIs within the PSE. 2. Dynamic Method: To address the inefficiencies of the static method, vendors like Microchip have introduced dynamic power budgeting, as seen in the PD692x0 firmware. This method monitors the current consumption per port and subtracts it from the available power budget. When the budget is exceeded, lower-priority ports are shut down. Advantages: This method optimizes resource utilization, saving costs. Disadvantages: Low-priority devices may experience instability.
Priority max value is set by the PSE controller driver. For now, budget evaluation methods are not configurable and cannot be mixed. They are hardcoded in the PSE driver itself, as no current PSE controller supports both methods. Signed-off-by: Kory Maincent (Dent Project) Acked-by: Oleksij Rempel Link: https://patch.msgid.link/20250617-feature_poe_port_prio-v14-7-78a1a645e2ee@bootlin.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 30 +- drivers/net/pse-pd/pse_core.c | 731 +++++++++++++++++++++++-- include/linux/pse-pd/pse.h | 76 +++ include/uapi/linux/ethtool_netlink_generated.h | 18 + 4 files changed, 815 insertions(+), 40 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index dfd9b842a4e7..7a9a857370e2 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -122,13 +122,39 @@ definitions: name: pse-event doc: PSE event list for the PSE controller type: flags + name-prefix: ethtool- entries: - - name: over-current + name: pse-event-over-current doc: PSE output current is too high - - name: over-temp + name: pse-event-over-temp doc: PSE in over temperature state + - + name: c33-pse-event-detection + doc: | + detection process occur on the PSE. IEEE 802.3-2022 33.2.5 and + 145.2.6 PSE detection of PDs. IEEE 802.3-202 30.9.1.1.5 + aPSEPowerDetectionStatus + - + name: c33-pse-event-classification + doc: | + classification process occur on the PSE. IEEE 802.3-2022 33.2.6 + and 145.2.8 classification of PDs mutual identification. + IEEE 802.3-2022 30.9.1.1.8 aPSEPowerClassification. + - + name: c33-pse-event-disconnection + doc: | + PD has been disconnected on the PSE. IEEE 802.3-2022 33.3.8 + and 145.3.9 PD Maintain Power Signature. IEEE 802.3-2022 + 33.5.1.2.9 MPS Absent. IEEE 802.3-2022 30.9.1.1.20 + aPSEMPSAbsentCounter. + - + name: pse-event-over-budget + doc: PSE turned off due to over budget situation + - + name: pse-event-sw-pw-control-error + doc: PSE faced an error managing the power control from software attribute-sets: - diff --git a/drivers/net/pse-pd/pse_core.c b/drivers/net/pse-pd/pse_core.c index 495d72f98029..23eb3c9d0bcd 100644 --- a/drivers/net/pse-pd/pse_core.c +++ b/drivers/net/pse-pd/pse_core.c @@ -47,11 +47,14 @@ struct pse_control { * @id: ID of the power domain * @supply: Power supply the Power Domain * @refcnt: Number of gets of this pse_power_domain + * @budget_eval_strategy: Current power budget evaluation strategy of the + * power domain */ struct pse_power_domain { int id; struct regulator *supply; struct kref refcnt; + u32 budget_eval_strategy; }; static int of_load_single_pse_pi_pairset(struct device_node *node, @@ -297,6 +300,115 @@ static int pse_pi_is_hw_enabled(struct pse_controller_dev *pcdev, int id) return 0; } +/** + * pse_pi_is_admin_enable_pending - Check if PI is in admin enable pending state + * which mean the power is not yet being + * delivered + * @pcdev: a pointer to the PSE controller device + * @id: Index of the PI + * + * Detects if a PI is enabled in software with a PD detected, but the hardware + * admin state hasn't been applied yet. + * + * This function is used in the power delivery and retry mechanisms to determine + * which PIs need to have power delivery attempted again. + * + * Return: true if the PI has admin enable flag set in software but not yet + * reflected in the hardware admin state, false otherwise. 
+ */ +static bool +pse_pi_is_admin_enable_pending(struct pse_controller_dev *pcdev, int id) +{ + int ret; + + /* PI not enabled or nothing is plugged */ + if (!pcdev->pi[id].admin_state_enabled || + !pcdev->pi[id].isr_pd_detected) + return false; + + ret = pse_pi_is_hw_enabled(pcdev, id); + /* PSE PI is already enabled at hardware level */ + if (ret == 1) + return false; + + return true; +} + +static int _pse_pi_delivery_power_sw_pw_ctrl(struct pse_controller_dev *pcdev, + int id, + struct netlink_ext_ack *extack); + +/** + * pse_pw_d_retry_power_delivery - Retry power delivery for pending ports in a + * PSE power domain + * @pcdev: a pointer to the PSE controller device + * @pw_d: a pointer to the PSE power domain + * + * Scans all ports in the specified power domain and attempts to enable power + * delivery to any ports that have admin enable state set but don't yet have + * hardware power enabled. Used when there are changes in connection status, + * admin state, or priority that might allow previously unpowered ports to + * receive power, especially in over-budget conditions. + */ +static void pse_pw_d_retry_power_delivery(struct pse_controller_dev *pcdev, + struct pse_power_domain *pw_d) +{ + int i, ret = 0; + + for (i = 0; i < pcdev->nr_lines; i++) { + int prio_max = pcdev->nr_lines; + struct netlink_ext_ack extack; + + if (pcdev->pi[i].pw_d != pw_d) + continue; + + if (!pse_pi_is_admin_enable_pending(pcdev, i)) + continue; + + /* Do not try to enable PI with a lower prio (higher value) + * than one which already can't be enabled. + */ + if (pcdev->pi[i].prio > prio_max) + continue; + + ret = _pse_pi_delivery_power_sw_pw_ctrl(pcdev, i, &extack); + if (ret == -ERANGE) + prio_max = pcdev->pi[i].prio; + } +} + +/** + * pse_pw_d_is_sw_pw_control - Determine if power control is software managed + * @pcdev: a pointer to the PSE controller device + * @pw_d: a pointer to the PSE power domain + * + * This function determines whether the power control for a specific power + * domain is managed by software in the interrupt handler rather than directly + * by hardware. 
+ * + * Software power control is active in the following cases: + * - When the budget evaluation strategy is set to static + * - When the budget evaluation strategy is disabled but the PSE controller + * has an interrupt handler that can report if a Powered Device is connected + * + * Return: true if the power control of the power domain is managed by software, + * false otherwise + */ +static bool pse_pw_d_is_sw_pw_control(struct pse_controller_dev *pcdev, + struct pse_power_domain *pw_d) +{ + if (!pw_d) + return false; + + if (pw_d->budget_eval_strategy == PSE_BUDGET_EVAL_STRAT_STATIC) + return true; + if (pw_d->budget_eval_strategy == PSE_BUDGET_EVAL_STRAT_DISABLED && + pcdev->ops->pi_enable && pcdev->irq) + return true; + + return false; +} + static int pse_pi_is_enabled(struct regulator_dev *rdev) { struct pse_controller_dev *pcdev = rdev_get_drvdata(rdev); @@ -309,17 +421,252 @@ static int pse_pi_is_enabled(struct regulator_dev *rdev) id = rdev_get_id(rdev); mutex_lock(&pcdev->lock); + if (pse_pw_d_is_sw_pw_control(pcdev, pcdev->pi[id].pw_d)) { + ret = pcdev->pi[id].admin_state_enabled; + goto out; + } + ret = pse_pi_is_hw_enabled(pcdev, id); + +out: mutex_unlock(&pcdev->lock); return ret; } +/** + * pse_pi_deallocate_pw_budget - Deallocate power budget of the PI + * @pi: a pointer to the PSE PI + */ +static void pse_pi_deallocate_pw_budget(struct pse_pi *pi) +{ + if (!pi->pw_d || !pi->pw_allocated_mW) + return; + + regulator_free_power_budget(pi->pw_d->supply, pi->pw_allocated_mW); + pi->pw_allocated_mW = 0; +} + +/** + * _pse_pi_disable - Call disable operation. Assumes the PSE lock has been + * acquired. + * @pcdev: a pointer to the PSE + * @id: index of the PSE control + * + * Return: 0 on success and failure value on error + */ +static int _pse_pi_disable(struct pse_controller_dev *pcdev, int id) +{ + const struct pse_controller_ops *ops = pcdev->ops; + int ret; + + if (!ops->pi_disable) + return -EOPNOTSUPP; + + ret = ops->pi_disable(pcdev, id); + if (ret) + return ret; + + pse_pi_deallocate_pw_budget(&pcdev->pi[id]); + + if (pse_pw_d_is_sw_pw_control(pcdev, pcdev->pi[id].pw_d)) + pse_pw_d_retry_power_delivery(pcdev, pcdev->pi[id].pw_d); + + return 0; +} + +/** + * pse_disable_pi_pol - Disable a PI on a power budget policy + * @pcdev: a pointer to the PSE + * @id: index of the PSE PI + * + * Return: 0 on success and failure value on error + */ +static int pse_disable_pi_pol(struct pse_controller_dev *pcdev, int id) +{ + unsigned long notifs = ETHTOOL_PSE_EVENT_OVER_BUDGET; + struct pse_ntf ntf = {}; + int ret; + + dev_dbg(pcdev->dev, "Disabling PI %d to free power budget\n", id); + + ret = _pse_pi_disable(pcdev, id); + if (ret) + notifs |= ETHTOOL_PSE_EVENT_SW_PW_CONTROL_ERROR; + + ntf.notifs = notifs; + ntf.id = id; + kfifo_in_spinlocked(&pcdev->ntf_fifo, &ntf, 1, &pcdev->ntf_fifo_lock); + schedule_work(&pcdev->ntf_work); + + return ret; +} + +/** + * pse_disable_pi_prio - Disable all PIs of a given priority inside a PSE + * power domain + * @pcdev: a pointer to the PSE + * @pw_d: a pointer to the PSE power domain + * @prio: priority + * + * Return: 0 on success and failure value on error + */ +static int pse_disable_pi_prio(struct pse_controller_dev *pcdev, + struct pse_power_domain *pw_d, + int prio) +{ + int i; + + for (i = 0; i < pcdev->nr_lines; i++) { + int ret; + + if (pcdev->pi[i].prio != prio || + pcdev->pi[i].pw_d != pw_d || + pse_pi_is_hw_enabled(pcdev, i) <= 0) + continue; + + ret = pse_disable_pi_pol(pcdev, i); + if (ret) + return ret; + } + + return 0; +} + +/** 
+ * pse_pi_allocate_pw_budget_static_prio - Allocate power budget for the PI + * when the budget eval strategy is + * static + * @pcdev: a pointer to the PSE + * @id: index of the PSE control + * @pw_req: power requested in mW + * @extack: extack for error reporting + * + * Allocates power using static budget evaluation strategy, where allocation + * is based on PD classification. When insufficient budget is available, + * lower-priority ports (higher priority numbers) are turned off first. + * + * Return: 0 on success and failure value on error + */ +static int +pse_pi_allocate_pw_budget_static_prio(struct pse_controller_dev *pcdev, int id, + int pw_req, struct netlink_ext_ack *extack) +{ + struct pse_pi *pi = &pcdev->pi[id]; + int ret, _prio; + + _prio = pcdev->nr_lines; + while (regulator_request_power_budget(pi->pw_d->supply, pw_req) == -ERANGE) { + if (_prio <= pi->prio) { + NL_SET_ERR_MSG_FMT(extack, + "PI %d: not enough power budget available", + id); + return -ERANGE; + } + + ret = pse_disable_pi_prio(pcdev, pi->pw_d, _prio); + if (ret < 0) + return ret; + + _prio--; + } + + pi->pw_allocated_mW = pw_req; + return 0; +} + +/** + * pse_pi_allocate_pw_budget - Allocate power budget for the PI + * @pcdev: a pointer to the PSE + * @id: index of the PSE control + * @pw_req: power requested in mW + * @extack: extack for error reporting + * + * Return: 0 on success and failure value on error + */ +static int pse_pi_allocate_pw_budget(struct pse_controller_dev *pcdev, int id, + int pw_req, struct netlink_ext_ack *extack) +{ + struct pse_pi *pi = &pcdev->pi[id]; + + if (!pi->pw_d) + return 0; + + /* PSE_BUDGET_EVAL_STRAT_STATIC */ + if (pi->pw_d->budget_eval_strategy == PSE_BUDGET_EVAL_STRAT_STATIC) + return pse_pi_allocate_pw_budget_static_prio(pcdev, id, pw_req, + extack); + + return 0; +} + +/** + * _pse_pi_delivery_power_sw_pw_ctrl - Enable PSE PI in case of software power + * control. Assumes the PSE lock has been + * acquired. + * @pcdev: a pointer to the PSE + * @id: index of the PSE control + * @extack: extack for error reporting + * + * Return: 0 on success and failure value on error + */ +static int _pse_pi_delivery_power_sw_pw_ctrl(struct pse_controller_dev *pcdev, + int id, + struct netlink_ext_ack *extack) +{ + const struct pse_controller_ops *ops = pcdev->ops; + struct pse_pi *pi = &pcdev->pi[id]; + int ret, pw_req; + + if (!ops->pi_get_pw_req) { + /* No power allocation management */ + ret = ops->pi_enable(pcdev, id); + if (ret) + NL_SET_ERR_MSG_FMT(extack, + "PI %d: enable error %d", + id, ret); + return ret; + } + + ret = ops->pi_get_pw_req(pcdev, id); + if (ret < 0) + return ret; + + pw_req = ret; + + /* Compare requested power with port power limit and use the lowest + * one. 
+ */ + if (ops->pi_get_pw_limit) { + ret = ops->pi_get_pw_limit(pcdev, id); + if (ret < 0) + return ret; + + if (ret < pw_req) + pw_req = ret; + } + + ret = pse_pi_allocate_pw_budget(pcdev, id, pw_req, extack); + if (ret) + return ret; + + ret = ops->pi_enable(pcdev, id); + if (ret) { + pse_pi_deallocate_pw_budget(pi); + NL_SET_ERR_MSG_FMT(extack, + "PI %d: enable error %d", + id, ret); + return ret; + } + + return 0; +} + static int pse_pi_enable(struct regulator_dev *rdev) { struct pse_controller_dev *pcdev = rdev_get_drvdata(rdev); const struct pse_controller_ops *ops; - int id, ret; + int id, ret = 0; ops = pcdev->ops; if (!ops->pi_enable) @@ -327,6 +674,23 @@ static int pse_pi_enable(struct regulator_dev *rdev) id = rdev_get_id(rdev); mutex_lock(&pcdev->lock); + if (pse_pw_d_is_sw_pw_control(pcdev, pcdev->pi[id].pw_d)) { + /* Manage enabled status by software. + * Real enable process will happen if a port is connected. + */ + if (pcdev->pi[id].isr_pd_detected) { + struct netlink_ext_ack extack; + + ret = _pse_pi_delivery_power_sw_pw_ctrl(pcdev, id, &extack); + } + if (!ret || ret == -ERANGE) { + pcdev->pi[id].admin_state_enabled = 1; + ret = 0; + } + mutex_unlock(&pcdev->lock); + return ret; + } + ret = ops->pi_enable(pcdev, id); if (!ret) pcdev->pi[id].admin_state_enabled = 1; @@ -338,21 +702,18 @@ static int pse_pi_enable(struct regulator_dev *rdev) static int pse_pi_disable(struct regulator_dev *rdev) { struct pse_controller_dev *pcdev = rdev_get_drvdata(rdev); - const struct pse_controller_ops *ops; + struct pse_pi *pi; int id, ret; - ops = pcdev->ops; - if (!ops->pi_disable) - return -EOPNOTSUPP; - id = rdev_get_id(rdev); + pi = &pcdev->pi[id]; mutex_lock(&pcdev->lock); - ret = ops->pi_disable(pcdev, id); + ret = _pse_pi_disable(pcdev, id); if (!ret) - pcdev->pi[id].admin_state_enabled = 0; - mutex_unlock(&pcdev->lock); + pi->admin_state_enabled = 0; - return ret; + mutex_unlock(&pcdev->lock); + return 0; } static int _pse_pi_get_voltage(struct regulator_dev *rdev) @@ -628,6 +989,11 @@ static int pse_register_pw_ds(struct pse_controller_dev *pcdev) } pw_d->supply = supply; + if (pcdev->supp_budget_eval_strategies) + pw_d->budget_eval_strategy = pcdev->supp_budget_eval_strategies; + else + pw_d->budget_eval_strategy = PSE_BUDGET_EVAL_STRAT_DISABLED; + kref_init(&pw_d->refcnt); pcdev->pi[i].pw_d = pw_d; } @@ -636,6 +1002,34 @@ out: return ret; } +/** + * pse_send_ntf_worker - Worker to send PSE notifications + * @work: work object + * + * Manage and send PSE netlink notifications using a workqueue to avoid + * deadlock between pcdev_lock and pse_list_mutex. 
+ */ +static void pse_send_ntf_worker(struct work_struct *work) +{ + struct pse_controller_dev *pcdev; + struct pse_ntf ntf; + + pcdev = container_of(work, struct pse_controller_dev, ntf_work); + + while (kfifo_out(&pcdev->ntf_fifo, &ntf, 1)) { + struct net_device *netdev; + struct pse_control *psec; + + psec = pse_control_find_by_id(pcdev, ntf.id); + rtnl_lock(); + netdev = pse_control_get_netdev(psec); + if (netdev) + ethnl_pse_send_ntf(netdev, ntf.notifs); + rtnl_unlock(); + pse_control_put(psec); + } +} + /** * pse_controller_register - register a PSE controller device * @pcdev: a pointer to the initialized PSE controller device @@ -649,6 +1043,13 @@ int pse_controller_register(struct pse_controller_dev *pcdev) mutex_init(&pcdev->lock); INIT_LIST_HEAD(&pcdev->pse_control_head); + spin_lock_init(&pcdev->ntf_fifo_lock); + ret = kfifo_alloc(&pcdev->ntf_fifo, pcdev->nr_lines, GFP_KERNEL); + if (ret) { + dev_err(pcdev->dev, "failed to allocate kfifo notifications\n"); + return ret; + } + INIT_WORK(&pcdev->ntf_work, pse_send_ntf_worker); if (!pcdev->nr_lines) pcdev->nr_lines = 1; @@ -715,6 +1116,10 @@ void pse_controller_unregister(struct pse_controller_dev *pcdev) { pse_flush_pw_ds(pcdev); pse_release_pis(pcdev); + if (pcdev->irq) + disable_irq(pcdev->irq); + cancel_work_sync(&pcdev->ntf_work); + kfifo_free(&pcdev->ntf_fifo); mutex_lock(&pse_list_mutex); list_del(&pcdev->list); mutex_unlock(&pse_list_mutex); @@ -786,6 +1191,52 @@ static unsigned long pse_to_regulator_notifs(unsigned long notifs) return rnotifs; } +/** + * pse_set_config_isr - Set PSE control config according to the PSE + * notifications + * @pcdev: a pointer to the PSE + * @id: index of the PSE control + * @notifs: PSE event notifications + * + * Return: 0 on success and failure value on error + */ +static int pse_set_config_isr(struct pse_controller_dev *pcdev, int id, + unsigned long notifs) +{ + int ret = 0; + + if (notifs & PSE_BUDGET_EVAL_STRAT_DYNAMIC) + return 0; + + if ((notifs & ETHTOOL_C33_PSE_EVENT_DISCONNECTION) && + ((notifs & ETHTOOL_C33_PSE_EVENT_DETECTION) || + (notifs & ETHTOOL_C33_PSE_EVENT_CLASSIFICATION))) { + dev_dbg(pcdev->dev, + "PI %d: error, connection and disconnection reported simultaneously", + id); + return -EINVAL; + } + + if (notifs & ETHTOOL_C33_PSE_EVENT_CLASSIFICATION) { + struct netlink_ext_ack extack; + + pcdev->pi[id].isr_pd_detected = true; + if (pcdev->pi[id].admin_state_enabled) { + ret = _pse_pi_delivery_power_sw_pw_ctrl(pcdev, id, + &extack); + if (ret == -ERANGE) + ret = 0; + } + } else if (notifs & ETHTOOL_C33_PSE_EVENT_DISCONNECTION) { + if (pcdev->pi[id].admin_state_enabled && + pcdev->pi[id].isr_pd_detected) + ret = _pse_pi_disable(pcdev, id); + pcdev->pi[id].isr_pd_detected = false; + } + + return ret; +} + /** * pse_isr - IRQ handler for PSE * @irq: irq number @@ -808,36 +1259,42 @@ static irqreturn_t pse_isr(int irq, void *data) memset(h->notifs, 0, pcdev->nr_lines * sizeof(*h->notifs)); mutex_lock(&pcdev->lock); ret = desc->map_event(irq, pcdev, h->notifs, &notifs_mask); - mutex_unlock(&pcdev->lock); - if (ret || !notifs_mask) + if (ret || !notifs_mask) { + mutex_unlock(&pcdev->lock); return IRQ_NONE; + } for_each_set_bit(i, &notifs_mask, pcdev->nr_lines) { unsigned long notifs, rnotifs; - struct net_device *netdev; - struct pse_control *psec; + struct pse_ntf ntf = {}; /* Do nothing if the PI is not described */ if (!pcdev->pi[i].rdev) continue; notifs = h->notifs[i]; + if (pse_pw_d_is_sw_pw_control(pcdev, pcdev->pi[i].pw_d)) { + ret = pse_set_config_isr(pcdev, i, notifs); + if (ret) +
notifs |= ETHTOOL_PSE_EVENT_SW_PW_CONTROL_ERROR; + } + dev_dbg(h->pcdev->dev, "Sending PSE notification EVT 0x%lx\n", notifs); - psec = pse_control_find_by_id(pcdev, i); - rtnl_lock(); - netdev = pse_control_get_netdev(psec); - if (netdev) - ethnl_pse_send_ntf(netdev, notifs); - rtnl_unlock(); - pse_control_put(psec); + ntf.notifs = notifs; + ntf.id = i; + kfifo_in_spinlocked(&pcdev->ntf_fifo, &ntf, 1, + &pcdev->ntf_fifo_lock); + schedule_work(&pcdev->ntf_work); rnotifs = pse_to_regulator_notifs(notifs); regulator_notifier_call_chain(pcdev->pi[i].rdev, rnotifs, NULL); } + mutex_unlock(&pcdev->lock); + return IRQ_HANDLED; } @@ -960,6 +1417,20 @@ pse_control_get_internal(struct pse_controller_dev *pcdev, unsigned int index, goto free_psec; } + if (!pcdev->ops->pi_get_admin_state) { + ret = -EOPNOTSUPP; + goto free_psec; + } + + /* Initialize admin_state_enabled before the regulator_get. This + * aims to have the right value reported in the first is_enabled + * call in case of control managed by software. + */ + ret = pse_pi_is_hw_enabled(pcdev, index); + if (ret < 0) + goto free_psec; + + pcdev->pi[index].admin_state_enabled = ret; psec->ps = devm_regulator_get_exclusive(pcdev->dev, rdev_get_name(pcdev->pi[index].rdev)); if (IS_ERR(psec->ps)) { @@ -967,12 +1438,6 @@ pse_control_get_internal(struct pse_controller_dev *pcdev, unsigned int index, goto put_module; } - ret = regulator_is_enabled(psec->ps); - if (ret < 0) - goto regulator_put; - - pcdev->pi[index].admin_state_enabled = ret; - psec->pcdev = pcdev; list_add(&psec->list, &pcdev->pse_control_head); psec->id = index; @@ -981,8 +1446,6 @@ pse_control_get_internal(struct pse_controller_dev *pcdev, unsigned int index, return psec; -regulator_put: - devm_regulator_put(psec->ps); put_module: module_put(pcdev->owner); free_psec: @@ -1093,6 +1556,35 @@ out: } EXPORT_SYMBOL_GPL(of_pse_control_get); +/** + * pse_get_sw_admin_state - Convert the software admin state to c33 or podl + * admin state value used in the standard + * @psec: PSE control pointer + * @admin_state: a pointer to the admin_state structure + */ +static void pse_get_sw_admin_state(struct pse_control *psec, + struct pse_admin_state *admin_state) +{ + struct pse_pi *pi = &psec->pcdev->pi[psec->id]; + + if (pse_has_podl(psec)) { + if (pi->admin_state_enabled) + admin_state->podl_admin_state = + ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED; + else + admin_state->podl_admin_state = + ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED; + } + if (pse_has_c33(psec)) { + if (pi->admin_state_enabled) + admin_state->c33_admin_state = + ETHTOOL_C33_PSE_ADMIN_STATE_ENABLED; + else + admin_state->c33_admin_state = + ETHTOOL_C33_PSE_ADMIN_STATE_DISABLED; + } +} + /** * pse_ethtool_get_status - get status of PSE control * @psec: PSE control pointer @@ -1109,19 +1601,46 @@ int pse_ethtool_get_status(struct pse_control *psec, struct pse_pw_status pw_status = {0}; const struct pse_controller_ops *ops; struct pse_controller_dev *pcdev; + struct pse_pi *pi; int ret; pcdev = psec->pcdev; ops = pcdev->ops; + + pi = &pcdev->pi[psec->id]; mutex_lock(&pcdev->lock); - if (pcdev->pi[psec->id].pw_d) - status->pw_d_id = pcdev->pi[psec->id].pw_d->id; + if (pi->pw_d) { + status->pw_d_id = pi->pw_d->id; + if (pse_pw_d_is_sw_pw_control(pcdev, pi->pw_d)) { + pse_get_sw_admin_state(psec, &admin_state); + } else { + ret = ops->pi_get_admin_state(pcdev, psec->id, + &admin_state); + if (ret) + goto out; + } + status->podl_admin_state = admin_state.podl_admin_state; + status->c33_admin_state = admin_state.c33_admin_state; - ret = 
ops->pi_get_admin_state(pcdev, psec->id, &admin_state); - if (ret) - goto out; - status->podl_admin_state = admin_state.podl_admin_state; - status->c33_admin_state = admin_state.c33_admin_state; + switch (pi->pw_d->budget_eval_strategy) { + case PSE_BUDGET_EVAL_STRAT_STATIC: + status->prio_max = pcdev->nr_lines - 1; + status->prio = pi->prio; + break; + case PSE_BUDGET_EVAL_STRAT_DYNAMIC: + status->prio_max = pcdev->pis_prio_max; + if (ops->pi_get_prio) { + ret = ops->pi_get_prio(pcdev, psec->id); + if (ret < 0) + goto out; + + status->prio = ret; + } + break; + default: + break; + } + } ret = ops->pi_get_pw_status(pcdev, psec->id, &pw_status); if (ret) @@ -1270,6 +1789,52 @@ int pse_ethtool_set_config(struct pse_control *psec, } EXPORT_SYMBOL_GPL(pse_ethtool_set_config); +/** + * pse_pi_update_pw_budget - Update PSE power budget allocated with new + * power in mW + * @pcdev: a pointer to the PSE controller device + * @id: index of the PSE PI + * @pw_req: power requested + * @extack: extack for reporting useful error messages + * + * Return: Previous power allocated on success and failure value on error + */ +static int pse_pi_update_pw_budget(struct pse_controller_dev *pcdev, int id, + const unsigned int pw_req, + struct netlink_ext_ack *extack) +{ + struct pse_pi *pi = &pcdev->pi[id]; + int previous_pw_allocated; + int pw_diff, ret = 0; + + /* We don't want pw_allocated_mW value change in the middle of an + * power budget update + */ + mutex_lock(&pcdev->lock); + previous_pw_allocated = pi->pw_allocated_mW; + pw_diff = pw_req - previous_pw_allocated; + if (!pw_diff) { + goto out; + } else if (pw_diff > 0) { + ret = regulator_request_power_budget(pi->pw_d->supply, pw_diff); + if (ret) { + NL_SET_ERR_MSG_FMT(extack, + "PI %d: not enough power budget available", + id); + goto out; + } + + } else { + regulator_free_power_budget(pi->pw_d->supply, -pw_diff); + } + pi->pw_allocated_mW = pw_req; + ret = previous_pw_allocated; + +out: + mutex_unlock(&pcdev->lock); + return ret; +} + /** * pse_ethtool_set_pw_limit - set PSE control power limit * @psec: PSE control pointer @@ -1282,7 +1847,7 @@ int pse_ethtool_set_pw_limit(struct pse_control *psec, struct netlink_ext_ack *extack, const unsigned int pw_limit) { - int uV, uA, ret; + int uV, uA, ret, previous_pw_allocated = 0; s64 tmp_64; if (pw_limit > MAX_PI_PW) @@ -1306,10 +1871,100 @@ int pse_ethtool_set_pw_limit(struct pse_control *psec, /* uA = mW * 1000000000 / uV */ uA = DIV_ROUND_CLOSEST_ULL(tmp_64, uV); - return regulator_set_current_limit(psec->ps, 0, uA); + /* Update power budget only in software power control case and + * if a Power Device is powered. 
+ */ + if (pse_pw_d_is_sw_pw_control(psec->pcdev, + psec->pcdev->pi[psec->id].pw_d) && + psec->pcdev->pi[psec->id].admin_state_enabled && + psec->pcdev->pi[psec->id].isr_pd_detected) { + ret = pse_pi_update_pw_budget(psec->pcdev, psec->id, + pw_limit, extack); + if (ret < 0) + return ret; + previous_pw_allocated = ret; + } + + ret = regulator_set_current_limit(psec->ps, 0, uA); + if (ret < 0 && previous_pw_allocated) { + pse_pi_update_pw_budget(psec->pcdev, psec->id, + previous_pw_allocated, extack); + } + + return ret; } EXPORT_SYMBOL_GPL(pse_ethtool_set_pw_limit); +/** + * pse_ethtool_set_prio - Set PSE PI priority according to the budget + * evaluation strategy + * @psec: PSE control pointer + * @extack: extack for reporting useful error messages + * @prio: priority value + * + * Return: 0 on success and failure value on error + */ +int pse_ethtool_set_prio(struct pse_control *psec, + struct netlink_ext_ack *extack, + unsigned int prio) +{ + struct pse_controller_dev *pcdev = psec->pcdev; + const struct pse_controller_ops *ops; + int ret = 0; + + if (!pcdev->pi[psec->id].pw_d) { + NL_SET_ERR_MSG(extack, "no power domain attached"); + return -EOPNOTSUPP; + } + + /* We don't want priority change in the middle of an + * enable/disable call or a priority mode change + */ + mutex_lock(&pcdev->lock); + switch (pcdev->pi[psec->id].pw_d->budget_eval_strategy) { + case PSE_BUDGET_EVAL_STRAT_STATIC: + if (prio >= pcdev->nr_lines) { + NL_SET_ERR_MSG_FMT(extack, + "priority %d exceeds priority max %d", + prio, pcdev->nr_lines); + ret = -ERANGE; + goto out; + } + + pcdev->pi[psec->id].prio = prio; + pse_pw_d_retry_power_delivery(pcdev, pcdev->pi[psec->id].pw_d); + break; + + case PSE_BUDGET_EVAL_STRAT_DYNAMIC: + ops = psec->pcdev->ops; + if (!ops->pi_set_prio) { + NL_SET_ERR_MSG(extack, + "pse driver does not support setting port priority"); + ret = -EOPNOTSUPP; + goto out; + } + + if (prio > pcdev->pis_prio_max) { + NL_SET_ERR_MSG_FMT(extack, + "priority %d exceeds priority max %d", + prio, pcdev->pis_prio_max); + ret = -ERANGE; + goto out; + } + + ret = ops->pi_set_prio(pcdev, psec->id, prio); + break; + + default: + ret = -EOPNOTSUPP; + } + +out: + mutex_unlock(&pcdev->lock); + return ret; +} +EXPORT_SYMBOL_GPL(pse_ethtool_set_prio); + bool pse_has_podl(struct pse_control *psec) { return psec->pcdev->types & ETHTOOL_PSE_PODL; diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index 2f8ecfd87d43..e5f305cef82e 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -6,6 +6,8 @@ #define _LINUX_PSE_CONTROLLER_H #include +#include +#include #include #include #include @@ -134,6 +136,9 @@ struct pse_pw_limit_ranges { * is in charge of the memory allocation * @c33_pw_limit_nb_ranges: number of supported power limit configuration * ranges + * @prio_max: max priority allowed for the @prio variable value. + * @prio: priority of the PSE. Managed by PSE core in case of static budget + * evaluation strategy. */ struct ethtool_pse_control_status { u32 pw_d_id; @@ -147,6 +152,8 @@ struct ethtool_pse_control_status { u32 c33_avail_pw_limit; struct ethtool_c33_pse_pw_limit_range *c33_pw_limit_ranges; u32 c33_pw_limit_nb_ranges; + u32 prio_max; + u32 prio; }; /** @@ -170,6 +177,11 @@ struct ethtool_pse_control_status { * range. The driver is in charge of the memory * allocation and should return the number of * ranges. + * @pi_get_prio: Get the PSE PI priority. + * @pi_set_prio: Configure the PSE PI priority.
+ * @pi_get_pw_req: Get the power requested by a PD before enabling the PSE PI. + * This is only relevant when an interrupt is registered using + * devm_pse_irq_helper helper. */ struct pse_controller_ops { int (*setup_pi_matrix)(struct pse_controller_dev *pcdev); @@ -190,6 +202,10 @@ struct pse_controller_ops { int id, int max_mW); int (*pi_get_pw_limit_ranges)(struct pse_controller_dev *pcdev, int id, struct pse_pw_limit_ranges *pw_limit_ranges); + int (*pi_get_prio)(struct pse_controller_dev *pcdev, int id); + int (*pi_set_prio)(struct pse_controller_dev *pcdev, int id, + unsigned int prio); + int (*pi_get_pw_req)(struct pse_controller_dev *pcdev, int id); }; struct module; @@ -225,6 +241,13 @@ struct pse_pi_pairset { * @rdev: regulator represented by the PSE PI * @admin_state_enabled: PI enabled state * @pw_d: Power domain of the PSE PI + * @prio: Priority of the PSE PI. Used in static budget evaluation strategy + * @isr_pd_detected: PSE PI detection status managed by the interruption + * handler. This variable is relevant when the power enabled + * management is managed in software like the static + * budget evaluation strategy. + * @pw_allocated_mW: Power allocated to a PSE PI to manage power budget in + * static budget evaluation strategy. */ struct pse_pi { struct pse_pi_pairset pairset[2]; @@ -232,6 +255,20 @@ struct pse_pi { struct regulator_dev *rdev; bool admin_state_enabled; struct pse_power_domain *pw_d; + int prio; + bool isr_pd_detected; + int pw_allocated_mW; +}; + +/** + * struct pse_ntf - PSE notification element + * + * @id: ID of the PSE control + * @notifs: PSE notifications to be reported + */ +struct pse_ntf { + int id; + unsigned long notifs; }; /** @@ -249,6 +286,12 @@ struct pse_pi { * @pi: table of PSE PIs described in this controller device * @no_of_pse_pi: flag set if the pse_pis devicetree node is not used * @irq: PSE interrupt + * @pis_prio_max: Maximum value allowed for the PSE PIs priority + * @supp_budget_eval_strategies: budget evaluation strategies supported + * by the PSE + * @ntf_work: workqueue for PSE notification management + * @ntf_fifo: PSE notifications FIFO + * @ntf_fifo_lock: protect @ntf_fifo writer */ struct pse_controller_dev { const struct pse_controller_ops *ops; @@ -263,6 +306,29 @@ struct pse_controller_dev { struct pse_pi *pi; bool no_of_pse_pi; int irq; + unsigned int pis_prio_max; + u32 supp_budget_eval_strategies; + struct work_struct ntf_work; + DECLARE_KFIFO_PTR(ntf_fifo, struct pse_ntf); + spinlock_t ntf_fifo_lock; /* Protect @ntf_fifo writer */ +}; + +/** + * enum pse_budget_eval_strategies - PSE budget evaluation strategies. + * @PSE_BUDGET_EVAL_STRAT_DISABLED: Budget evaluation strategy disabled. + * @PSE_BUDGET_EVAL_STRAT_STATIC: PSE static budget evaluation strategy. + * Budget evaluation strategy based on the power requested during PD + * classification. This strategy is managed by the PSE core. + * @PSE_BUDGET_EVAL_STRAT_DYNAMIC: PSE dynamic budget evaluation + * strategy. Budget evaluation strategy based on the current consumption + * per ports compared to the total power budget. This mode is managed by + * the PSE controller. 
+ */ + +enum pse_budget_eval_strategies { + PSE_BUDGET_EVAL_STRAT_DISABLED = 1 << 0, + PSE_BUDGET_EVAL_STRAT_STATIC = 1 << 1, + PSE_BUDGET_EVAL_STRAT_DYNAMIC = 1 << 2, }; #if IS_ENABLED(CONFIG_PSE_CONTROLLER) @@ -287,6 +353,9 @@ int pse_ethtool_set_config(struct pse_control *psec, int pse_ethtool_set_pw_limit(struct pse_control *psec, struct netlink_ext_ack *extack, const unsigned int pw_limit); +int pse_ethtool_set_prio(struct pse_control *psec, + struct netlink_ext_ack *extack, + unsigned int prio); bool pse_has_podl(struct pse_control *psec); bool pse_has_c33(struct pse_control *psec); @@ -324,6 +393,13 @@ static inline int pse_ethtool_set_pw_limit(struct pse_control *psec, return -EOPNOTSUPP; } +static inline int pse_ethtool_set_prio(struct pse_control *psec, + struct netlink_ext_ack *extack, + unsigned int prio) +{ + return -EOPNOTSUPP; +} + static inline bool pse_has_podl(struct pse_control *psec) { return false; diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index ed344c8533eb..c6a95224be25 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -53,10 +53,28 @@ enum hwtstamp_source { * enum ethtool_pse_event - PSE event list for the PSE controller * @ETHTOOL_PSE_EVENT_OVER_CURRENT: PSE output current is too high * @ETHTOOL_PSE_EVENT_OVER_TEMP: PSE in over temperature state + * @ETHTOOL_C33_PSE_EVENT_DETECTION: detection process occur on the PSE. IEEE + * 802.3-2022 33.2.5 and 145.2.6 PSE detection of PDs. IEEE 802.3-202 + * 30.9.1.1.5 aPSEPowerDetectionStatus + * @ETHTOOL_C33_PSE_EVENT_CLASSIFICATION: classification process occur on the + * PSE. IEEE 802.3-2022 33.2.6 and 145.2.8 classification of PDs mutual + * identification. IEEE 802.3-2022 30.9.1.1.8 aPSEPowerClassification. + * @ETHTOOL_C33_PSE_EVENT_DISCONNECTION: PD has been disconnected on the PSE. + * IEEE 802.3-2022 33.3.8 and 145.3.9 PD Maintain Power Signature. IEEE + * 802.3-2022 33.5.1.2.9 MPS Absent. IEEE 802.3-2022 30.9.1.1.20 + * aPSEMPSAbsentCounter. + * @ETHTOOL_PSE_EVENT_OVER_BUDGET: PSE turned off due to over budget situation + * @ETHTOOL_PSE_EVENT_SW_PW_CONTROL_ERROR: PSE faced an error managing the + * power control from software */ enum ethtool_pse_event { ETHTOOL_PSE_EVENT_OVER_CURRENT = 1, ETHTOOL_PSE_EVENT_OVER_TEMP = 2, + ETHTOOL_C33_PSE_EVENT_DETECTION = 4, + ETHTOOL_C33_PSE_EVENT_CLASSIFICATION = 8, + ETHTOOL_C33_PSE_EVENT_DISCONNECTION = 16, + ETHTOOL_PSE_EVENT_OVER_BUDGET = 32, + ETHTOOL_PSE_EVENT_SW_PW_CONTROL_ERROR = 64, }; enum { -- cgit v1.2.3 From eeb0c8f72f49a21984981188404cfd3700edbaff Mon Sep 17 00:00:00 2001 From: "Kory Maincent (Dent Project)" Date: Tue, 17 Jun 2025 14:12:07 +0200 Subject: net: ethtool: Add PSE port priority support feature This patch expands the status information provided by ethtool for PSE c33 with current port priority and max port priority. It also adds a call to pse_ethtool_set_prio() to configure the PSE port priority. 
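As an illustration of how a controller driver plugs into this, the following is a hypothetical sketch, not taken from any in-tree driver: foo_pse, foo_pse_read_prio() and foo_pse_write_prio() are made-up names, and only the pse_controller_ops members and the pis_prio_max/supp_budget_eval_strategies fields come from this series. A dynamic-strategy driver would essentially forward priorities to its firmware:

static int foo_pi_get_prio(struct pse_controller_dev *pcdev, int id)
{
	struct foo_pse *priv = container_of(pcdev, struct foo_pse, pcdev);

	/* Return the current priority (0 = highest) or a negative errno */
	return foo_pse_read_prio(priv, id);
}

static int foo_pi_set_prio(struct pse_controller_dev *pcdev, int id,
			   unsigned int prio)
{
	struct foo_pse *priv = container_of(pcdev, struct foo_pse, pcdev);

	/* The core has already checked prio against pcdev->pis_prio_max */
	return foo_pse_write_prio(priv, id, prio);
}

Such a driver would set .pi_get_prio/.pi_set_prio in its pse_controller_ops and initialize pcdev->pis_prio_max and pcdev->supp_budget_eval_strategies (here PSE_BUDGET_EVAL_STRAT_DYNAMIC) before registering the controller.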
Signed-off-by: Kory Maincent (Dent Project) Reviewed-by: Oleksij Rempel Link: https://patch.msgid.link/20250617-feature_poe_port_prio-v14-8-78a1a645e2ee@bootlin.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 11 +++++++++++ Documentation/networking/ethtool-netlink.rst | 26 ++++++++++++++++++++++++++ include/uapi/linux/ethtool_netlink_generated.h | 2 ++ net/ethtool/pse-pd.c | 18 ++++++++++++++++++ 4 files changed, 57 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 7a9a857370e2..e6a77e8053a0 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -1436,6 +1436,14 @@ attribute-sets: name: pse-pw-d-id type: u32 name-prefix: ethtool-a- + - + name: pse-prio-max + type: u32 + name-prefix: ethtool-a- + - + name: pse-prio + type: u32 + name-prefix: ethtool-a- - name: rss attr-cnt-name: __ethtool-a-rss-cnt @@ -2260,6 +2268,8 @@ operations: - c33-pse-avail-pw-limit - c33-pse-pw-limit-ranges - pse-pw-d-id + - pse-prio-max + - pse-prio dump: *pse-get-op - name: pse-set @@ -2274,6 +2284,7 @@ operations: - podl-pse-admin-control - c33-pse-admin-control - c33-pse-avail-pw-limit + - pse-prio - name: rss-get doc: Get RSS params. diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index e9af8e58564c..e45bb555e909 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1790,6 +1790,10 @@ Kernel response contents: ``ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES`` nested Supported power limit configuration ranges. ``ETHTOOL_A_PSE_PW_D_ID`` u32 Index of the PSE power domain + ``ETHTOOL_A_PSE_PRIO_MAX`` u32 Priority maximum configurable + on the PoE PSE + ``ETHTOOL_A_PSE_PRIO`` u32 Priority of the PoE PSE + currently configured ========================================== ====== ============================= When set, the optional ``ETHTOOL_A_PODL_PSE_ADMIN_STATE`` attribute identifies @@ -1866,6 +1870,12 @@ equal. The ``ETHTOOL_A_PSE_PW_D_ID`` attribute identifies the index of PSE power domain. +When set, the optional ``ETHTOOL_A_PSE_PRIO_MAX`` attribute identifies +the PSE maximum priority value. +When set, the optional ``ETHTOOL_A_PSE_PRIO`` attribute is used to +identify the currently configured PSE priority. +For a description of PSE priority attributes, see ``PSE_SET``. + PSE_SET ======= @@ -1879,6 +1889,8 @@ Request contents: ``ETHTOOL_A_C33_PSE_ADMIN_CONTROL`` u32 Control PSE Admin state ``ETHTOOL_A_C33_PSE_AVAIL_PWR_LIMIT`` u32 Control PoE PSE available power limit + ``ETHTOOL_A_PSE_PRIO`` u32 Control priority of the + PoE PSE ====================================== ====== ============================= When set, the optional ``ETHTOOL_A_PODL_PSE_ADMIN_CONTROL`` attribute is used @@ -1901,6 +1913,20 @@ various existing products that document power consumption in watts rather than classes. If power limit configuration based on classes is needed, the conversion can be done in user space, for example by ethtool. +When set, the optional ``ETHTOOL_A_PSE_PRIO`` attribute is used to +control the PSE priority. Allowed priority values are between zero and +the value of the ``ETHTOOL_A_PSE_PRIO_MAX`` attribute. + +A lower value indicates a higher priority, meaning that a priority value +of 0 corresponds to the highest port priority.
+Port priority serves two functions: + + - Power-up Order: After a reset, ports are powered up in order of their + priority from highest to lowest. Ports with higher priority + (lower values) power up first. + - Shutdown Order: When the power budget is exceeded, ports with lower + priority (higher values) are turned off first. + PSE_NTF ======= diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index c6a95224be25..8e5d067e7ddf 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -671,6 +671,8 @@ enum { ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT, ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES, ETHTOOL_A_PSE_PW_D_ID, + ETHTOOL_A_PSE_PRIO_MAX, + ETHTOOL_A_PSE_PRIO, __ETHTOOL_A_PSE_CNT, ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1) diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c index 6a978a55959e..6c536dfe52da 100644 --- a/net/ethtool/pse-pd.c +++ b/net/ethtool/pse-pd.c @@ -111,6 +111,9 @@ static int pse_reply_size(const struct ethnl_req_info *req_base, len += st->c33_pw_limit_nb_ranges * (nla_total_size(0) + nla_total_size(sizeof(u32)) * 2); + if (st->prio_max) + /* _PSE_PRIO_MAX + _PSE_PRIO */ + len += nla_total_size(sizeof(u32)) * 2; return len; } @@ -205,6 +208,11 @@ static int pse_fill_reply(struct sk_buff *skb, pse_put_pw_limit_ranges(skb, st)) return -EMSGSIZE; + if (st->prio_max && + (nla_put_u32(skb, ETHTOOL_A_PSE_PRIO_MAX, st->prio_max) || + nla_put_u32(skb, ETHTOOL_A_PSE_PRIO, st->prio))) + return -EMSGSIZE; + return 0; } @@ -226,6 +234,7 @@ const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1] = { NLA_POLICY_RANGE(NLA_U32, ETHTOOL_C33_PSE_ADMIN_STATE_DISABLED, ETHTOOL_C33_PSE_ADMIN_STATE_ENABLED), [ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT] = { .type = NLA_U32 }, + [ETHTOOL_A_PSE_PRIO] = { .type = NLA_U32 }, }; static int @@ -274,6 +283,15 @@ ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info) if (ret) return ret; + if (tb[ETHTOOL_A_PSE_PRIO]) { + unsigned int prio; + + prio = nla_get_u32(tb[ETHTOOL_A_PSE_PRIO]); + ret = pse_ethtool_set_prio(phydev->psec, info->extack, prio); + if (ret) + return ret; + } + if (tb[ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT]) { unsigned int pw_limit; -- cgit v1.2.3 From 9094c72c3d81bf2416b7c79d12c8494ab8fbac20 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 19 May 2025 10:33:20 +0200 Subject: time: Introduce auxiliary POSIX clocks To support auxiliary timekeeping and the related user space interfaces, it's required to define a clock ID range for them. Reserve 8 auxiliary clock IDs after the regular timekeeping clock ID space. This is the maximum number of auxiliary clocks the kernel can support. The actual number of supported clocks obviously depends on the presence of related devices and might be constrained by the available VDSO space. Add the corresponding timekeeper IDs as well.
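From user space the reserved IDs are consumed through the regular POSIX clock syscalls. A minimal sketch, assuming the CLOCK_AUX definition from the uapi header below is visible and that the kernel was built with CONFIG_POSIX_AUX_CLOCKS and has the first auxiliary clock enabled:

#include <stdio.h>
#include <time.h>

#ifndef CLOCK_AUX
#define CLOCK_AUX 16	/* first auxiliary clock ID, == MAX_CLOCKS */
#endif

int main(void)
{
	struct timespec ts;

	/* Fails if the first auxiliary clock has not been enabled */
	if (clock_gettime(CLOCK_AUX, &ts)) {
		perror("clock_gettime(CLOCK_AUX)");
		return 1;
	}
	printf("aux0: %lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
	return 0;
}

The remaining auxiliary clocks, up to MAX_AUX_CLOCKS (8), would be addressed as CLOCK_AUX + 1 through CLOCK_AUX_LAST in the same way.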
Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250519083025.905800695@linutronix.de --- include/linux/timekeeper_internal.h | 10 ++++++++-- include/uapi/linux/time.h | 11 +++++++++++ kernel/time/Kconfig | 15 +++++++++++++-- 3 files changed, 32 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index bfcecad0e279..4201ae818f57 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -13,11 +13,17 @@ /** * timekeeper_ids - IDs for various time keepers in the kernel - * @TIMEKEEPER_CORE: The central core timekeeper managing system time - * @TIMEKEEPERS_MAX: The maximum number of timekeepers managed + * @TIMEKEEPER_CORE: The central core timekeeper managing system time + * @TIMEKEEPER_AUX_FIRST: The first AUX timekeeper + * @TIMEKEEPER_AUX_LAST: The last AUX timekeeper + * @TIMEKEEPERS_MAX: The maximum number of timekeepers managed */ enum timekeeper_ids { TIMEKEEPER_CORE, +#ifdef CONFIG_POSIX_AUX_CLOCKS + TIMEKEEPER_AUX_FIRST, + TIMEKEEPER_AUX_LAST = TIMEKEEPER_AUX_FIRST + MAX_AUX_CLOCKS - 1, +#endif TIMEKEEPERS_MAX, }; diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h index 4f4b6e48e01c..16ca1ac206fd 100644 --- a/include/uapi/linux/time.h +++ b/include/uapi/linux/time.h @@ -64,6 +64,17 @@ struct timezone { #define CLOCK_TAI 11 #define MAX_CLOCKS 16 + +/* + * AUX clock support. AUXiliary clocks are dynamically configured by + * enabling a clock ID. These clock can be steered independently of the + * core timekeeper. The kernel can support up to 8 auxiliary clocks, but + * the actual limit depends on eventual architecture constraints vs. VDSO. + */ +#define CLOCK_AUX MAX_CLOCKS +#define MAX_AUX_CLOCKS 8 +#define CLOCK_AUX_LAST (CLOCK_AUX + MAX_AUX_CLOCKS - 1) + #define CLOCKS_MASK (CLOCK_REALTIME | CLOCK_MONOTONIC) #define CLOCKS_MONO CLOCK_MONOTONIC diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index b0b97a60aaa6..7c6a52f7836c 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -82,9 +82,9 @@ config CONTEXT_TRACKING_IDLE help Tracks idle state on behalf of RCU. -if GENERIC_CLOCKEVENTS menu "Timers subsystem" +if GENERIC_CLOCKEVENTS # Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is # only related to the tick functionality. Oneshot clockevent devices # are supported independent of this. @@ -208,6 +208,17 @@ config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US interval and NTP's maximum frequency drift of 500 parts per million. If the clocksource is good enough for NTP, it is good enough for the clocksource watchdog! +endif + +config POSIX_AUX_CLOCKS + bool "Enable auxiliary POSIX clocks" + depends on POSIX_TIMERS + help + Auxiliary POSIX clocks are clocks which can be steered + independently of the core timekeeper, which controls the + MONOTONIC, REALTIME, BOOTTIME and TAI clocks. They are useful to + provide e.g. lockless time accessors to independent PTP clocks + and other clock domains, which are not correlated to the TAI/NTP + notion of time. endmenu -endif -- cgit v1.2.3 From 5ae1fc4069578f50798f3372f36a3c13ee565b66 Mon Sep 17 00:00:00 2001 From: Kavita Kavita Date: Wed, 4 Jun 2025 16:27:56 +0530 Subject: wifi: cfg80211: Improve the documentation for NL80211_CMD_ASSOC_MLO_RECONF The existing documentation for the NL80211_CMD_ASSOC_MLO_RECONF does not clearly explain handling of link reconfiguration request results from the driver. 
Add documentation to explain that the command is used as an event to notify userspace about added links information, and that the existing NL80211_CMD_LINKS_REMOVED command is used to notify userspace about removed links information. Signed-off-by: Kavita Kavita Link: https://patch.msgid.link/20250604105757.2542-2-quic_kkavita@quicinc.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index e9ccf43fe3c6..e53840d009d1 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1330,7 +1330,11 @@ * TID to Link mapping for downlink/uplink traffic. * * @NL80211_CMD_ASSOC_MLO_RECONF: For a non-AP MLD station, request to - * add/remove links to/from the association. + * add/remove links to/from the association. To indicate link + * reconfiguration request results from the driver, this command is also + * used as an event to notify userspace about the added links information. + * For notifying the removed links information, the existing + * %NL80211_CMD_LINKS_REMOVED command is used. * * @NL80211_CMD_EPCS_CFG: EPCS configuration for a station. Used by userland to * control EPCS configuration. Used to notify userland on the current state -- cgit v1.2.3 From 7c598c653ad465138ecc2fe64492633c541effef Mon Sep 17 00:00:00 2001 From: Kavita Kavita Date: Wed, 4 Jun 2025 16:27:57 +0530 Subject: wifi: cfg80211: Add support for link reconfiguration negotiation offload to driver In the case of SME-in-driver, the driver can internally choose to update the links based on the AP MLD recommendation and do link reconfiguration negotiation with the AP MLD (e.g., after processing a BSS Transition Management request frame received from the AP MLD, with a Neighbor Report containing a Multi-Link element with recommended links information, the driver may choose to do link reconfiguration negotiation with the AP MLD). To support this, extend cfg80211_mlo_reconf_add_done() and NL80211_CMD_ASSOC_MLO_RECONF to indicate added links information for driver-initiated link reconfiguration requests. For removed links, the driver indicates the links information using the NL80211_CMD_LINKS_REMOVED event in driver-initiated cases, the same as in supplicant-initiated cases. For the driver-initiated case, cfg80211 will receive the link reconfiguration result asynchronously from the driver, so holding the BSSes of the accepted added links is needed in the event path. Also, no unhold call is needed for the rejected add-link BSSes since no hold call happened previously. Once the supplicant receives the NL80211_CMD_ASSOC_MLO_RECONF event, it needs to process the information about newly added links and install per-link group keys (e.g., GTK/IGTK/BIGTK etc.). In the case of SME-in-driver, using a vendor interface etc. to notify the supplicant to initiate a link reconfiguration request, with the supplicant then sending a command to cfg80211, can lead to race conditions. The correct design to avoid this is for the driver to indicate the results of the link reconfiguration negotiation directly to cfg80211.
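To make the intended driver flow concrete, the following is a hypothetical sketch of the reporting path; drv_report_mlo_reconf() and its arguments are made up, IEEE80211_MLD_MAX_NUM_LINKS is assumed to bound the link IDs, and only the struct fields and cfg80211_mlo_reconf_add_done() come from this patch (which, per the cfg80211 code, asserts that the wiphy mutex is held):

static void drv_report_mlo_reconf(struct net_device *dev, const u8 *resp,
				  size_t len, u16 added_links,
				  struct cfg80211_bss *bss[],
				  const u8 addrs[][ETH_ALEN])
{
	struct cfg80211_mlo_reconf_done_data data = {
		.buf = resp,			/* MLO Reconf Response frame */
		.len = len,
		.added_links = added_links,
		.driver_initiated = true,	/* cfg80211 holds accepted BSSes */
	};
	unsigned int link_id;

	for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) {
		if (!(added_links & BIT(link_id)))
			continue;
		data.links[link_id].bss = bss[link_id];
		memcpy(data.links[link_id].addr, addrs[link_id], ETH_ALEN);
	}

	cfg80211_mlo_reconf_add_done(dev, &data);
}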
Signed-off-by: Kavita Kavita Link: https://patch.msgid.link/20250604105757.2542-3-quic_kkavita@quicinc.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 6 ++++++ include/uapi/linux/nl80211.h | 6 +++++- net/wireless/mlme.c | 10 ++++++++-- net/wireless/trace.h | 10 ++++++---- 4 files changed, 25 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7719a90ab4d7..47b4235eea59 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -9747,6 +9747,11 @@ void cfg80211_links_removed(struct net_device *dev, u16 link_mask); * struct cfg80211_mlo_reconf_done_data - MLO reconfiguration data * @buf: MLO Reconfiguration Response frame (header + body) * @len: length of the frame data + * @driver_initiated: Indicates whether the add links request is initiated by + * driver. This is set to true when the link reconfiguration request + * initiated by driver due to AP link recommendation requests + * (Ex: BTM (BSS Transition Management) request) handling offloaded to + * driver. * @added_links: BIT mask of links successfully added to the association * @links: per-link information indexed by link ID * @links.bss: the BSS that MLO reconfiguration was requested for, ownership of @@ -9759,6 +9764,7 @@ void cfg80211_links_removed(struct net_device *dev, u16 link_mask); struct cfg80211_mlo_reconf_done_data { const u8 *buf; size_t len; + bool driver_initiated; u16 added_links; struct { struct cfg80211_bss *bss; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index e53840d009d1..a289014abe37 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1334,7 +1334,11 @@ * reconfiguration request results from the driver, this command is also * used as an event to notify userspace about the added links information. * For notifying the removed links information, the existing - * %NL80211_CMD_LINKS_REMOVED command is used. + * %NL80211_CMD_LINKS_REMOVED command is used. This command is also used to + * notify userspace about newly added links for the current connection in + * case of AP-initiated link recommendation requests, received via + * a BTM (BSS Transition Management) request or a link reconfig notify + * frame, where the driver handles the link recommendation offload. * * @NL80211_CMD_EPCS_CFG: EPCS configuration for a station. Used by userland to * control EPCS configuration. 
Used to notify userland on the current state diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 05d44a443518..29e1ce8aff42 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -1331,7 +1331,8 @@ void cfg80211_mlo_reconf_add_done(struct net_device *dev, lockdep_assert_wiphy(wiphy); trace_cfg80211_mlo_reconf_add_done(dev, data->added_links, - data->buf, data->len); + data->buf, data->len, + data->driver_initiated); if (WARN_ON(!wdev->valid_links)) return; @@ -1361,11 +1362,16 @@ void cfg80211_mlo_reconf_add_done(struct net_device *dev, wdev->links[link_id].client.current_bss = bss_from_pub(bss); + if (data->driver_initiated) + cfg80211_hold_bss(bss_from_pub(bss)); + memcpy(wdev->links[link_id].addr, data->links[link_id].addr, ETH_ALEN); } else { - cfg80211_unhold_bss(bss_from_pub(bss)); + if (!data->driver_initiated) + cfg80211_unhold_bss(bss_from_pub(bss)); + cfg80211_put_bss(wiphy, bss); } } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 4ed9fada4ec0..61a5eca9c513 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -4126,20 +4126,22 @@ TRACE_EVENT(cfg80211_links_removed, TRACE_EVENT(cfg80211_mlo_reconf_add_done, TP_PROTO(struct net_device *netdev, u16 link_mask, - const u8 *buf, size_t len), - TP_ARGS(netdev, link_mask, buf, len), + const u8 *buf, size_t len, bool driver_initiated), + TP_ARGS(netdev, link_mask, buf, len, driver_initiated), TP_STRUCT__entry( NETDEV_ENTRY __field(u16, link_mask) __dynamic_array(u8, buf, len) + __field(bool, driver_initiated) ), TP_fast_assign( NETDEV_ASSIGN; __entry->link_mask = link_mask; memcpy(__get_dynamic_array(buf), buf, len); + __entry->driver_initiated = driver_initiated; ), - TP_printk(NETDEV_PR_FMT ", link_mask:0x%x", - NETDEV_PR_ARG, __entry->link_mask) + TP_printk(NETDEV_PR_FMT ", link_mask:0x%x, driver_initiated:%d", + NETDEV_PR_ARG, __entry->link_mask, __entry->driver_initiated) ); TRACE_EVENT(rdev_assoc_ml_reconf, -- cgit v1.2.3 From 3421d46440ebe0865bec71dbd2330b4e17a425ab Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 16 Jun 2025 19:49:07 +0800 Subject: HID: core: Add bus define for SoundWire bus SDCA (SoundWire Device Class for Audio) uses HID to convey input events from peripheral devices. Add a bus define for the SoundWire bus to prepare support for this. Signed-off-by: Charles Keepax Signed-off-by: Shuming Fan Acked-by: Jiri Kosina Link: https://patch.msgid.link/20250616114907.855452-1-shumingf@realtek.com Signed-off-by: Mark Brown --- drivers/hid/hid-core.c | 3 +++ include/uapi/linux/input.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index b348d0464314..b419e31005b8 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -2294,6 +2294,9 @@ int hid_connect(struct hid_device *hdev, unsigned int connect_mask) case BUS_I2C: bus = "I2C"; break; + case BUS_SDW: + bus = "SOUNDWIRE"; + break; case BUS_VIRTUAL: bus = "VIRTUAL"; break; diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h index 2557eb7b0561..127119c287cf 100644 --- a/include/uapi/linux/input.h +++ b/include/uapi/linux/input.h @@ -275,6 +275,7 @@ struct input_mask { #define BUS_CEC 0x1E #define BUS_INTEL_ISHTP 0x1F #define BUS_AMD_SFH 0x20 +#define BUS_SDW 0x21 /* * MT_TOOL types -- cgit v1.2.3 From cf207eac06f661fb692f405d5ab8230df884ee52 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Tue, 10 Jun 2025 10:14:20 +0800 Subject: KVM: TDX: Handle TDG.VP.VMCALL<GetQuote> Handle TDVMCALL for GetQuote to generate a TD-Quote.
GetQuote is a doorbell-like interface used by TDX guests to request that the VMM generate a TD-Quote signed by a service hosting a TD-Quoting Enclave operating on the host. A TDX guest passes a TD Report (TDREPORT_STRUCT) in a shared-memory area as a parameter. The host VMM can access it and queue the operation for a service hosting a TD-Quoting enclave. When completed, the Quote is returned via the same shared-memory area. KVM only checks that the GPA from the TDX guest has the shared-bit set, and drops the shared-bit before exiting to userspace to avoid bleeding the shared-bit into KVM's exit ABI. KVM forwards the request to the userspace VMM (e.g. QEMU), and the userspace VMM queues the operation asynchronously. KVM sets the return code according to the 'ret' field set by userspace to notify the TDX guest whether the request has been queued successfully or not. When the request has been queued successfully, the TDX guest can poll the status field in the shared-memory area to check whether the Quote generation is completed or not. When completed, the generated Quote is returned via the same buffer. Add KVM_EXIT_TDX as a new exit reason to userspace. As the initial support for TDX, userspace is required to handle this exit reason by reentering KVM to ensure that the TDVMCALL is complete. While at it, add a note that KVM_EXIT_HYPERCALL also requires reentry with KVM_RUN. Signed-off-by: Binbin Wu Tested-by: Mikko Ylinen Acked-by: Kai Huang [Adjust userspace API. - Paolo] Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 49 +++++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/vmx/tdx.c | 32 +++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 17 +++++++++++++++ 3 files changed, 97 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 1bd2d42e6424..115ec3c2b641 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6645,7 +6645,8 @@ to the byte array. .. note:: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, - KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding + KVM_EXIT_EPR, KVM_EXIT_HYPERCALL, KVM_EXIT_TDX, + KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding operations are complete (and guest state is consistent) only after userspace has re-entered the kernel with KVM_RUN. The kernel side will first finish incomplete operations and then check for pending signals. @@ -7174,6 +7175,52 @@ The valid value for 'flags' is: - KVM_NOTIFY_CONTEXT_INVALID -- the VM context is corrupted and not valid in VMCS. It would run into unknown result if resume the target VM. +:: + + /* KVM_EXIT_TDX */ + struct { + __u64 flags; + __u64 nr; + union { + struct { + u64 ret; + u64 data[5]; + } unknown; + struct { + u64 ret; + u64 gpa; + u64 size; + } get_quote; + }; + } tdx; +
Process a TDVMCALL from the guest. KVM forwards select TDVMCALL based
on the Guest-Hypervisor Communication Interface (GHCI) specification;
KVM bridges these requests to the userspace VMM with minimal changes,
placing the inputs in the union and copying them back to the guest
on re-entry.

Flags are currently always zero, whereas ``nr`` contains the TDVMCALL
number from register R11. The remaining field of the union provide the
inputs and outputs of the TDVMCALL. Currently the following values of
``nr`` are defined:

* ``TDVMCALL_GET_QUOTE``: the guest has requested to generate a TD-Quote
signed by a service hosting TD-Quoting Enclave operating on the host.
+Parameters and return value are in the ``get_quote`` field of the union. +The ``gpa`` field and ``size`` specify the guest physical address +(without the shared bit set) and the size of a shared-memory buffer, in +which the TDX guest passes a TD Report. The ``ret`` field represents +the return value of the GetQuote request. When the request has been +queued successfully, the TDX guest can poll the status field in the +shared-memory area to check whether the Quote generation is completed or +not. When completed, the generated Quote is returned via the same buffer. + +KVM may add support for more values in the future that may cause a userspace +exit, even without calls to ``KVM_ENABLE_CAP`` or similar. In this case, +it will enter with output fields already valid; in the common case, the +``unknown.ret`` field of the union will be ``TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED``. +Userspace need not do anything if it does not wish to support a TDVMCALL. :: /* Fix the size of the union. */ diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 5d100c240ab3..b619a3478983 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1465,6 +1465,36 @@ static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu) return 1; } +static int tdx_complete_simple(struct kvm_vcpu *vcpu) +{ + tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret); + return 1; +} + +static int tdx_get_quote(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u64 gpa = tdx->vp_enter_args.r12; + u64 size = tdx->vp_enter_args.r13; + + /* The gpa of buffer must have shared bit set. */ + if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; + } + + vcpu->run->exit_reason = KVM_EXIT_TDX; + vcpu->run->tdx.flags = 0; + vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE; + vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; + vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm)); + vcpu->run->tdx.get_quote.size = size; + + vcpu->arch.complete_userspace_io = tdx_complete_simple; + + return 0; +} + static int handle_tdvmcall(struct kvm_vcpu *vcpu) { switch (tdvmcall_leaf(vcpu)) { @@ -1474,6 +1504,8 @@ static int handle_tdvmcall(struct kvm_vcpu *vcpu) return tdx_report_fatal_error(vcpu); case TDVMCALL_GET_TD_VM_CALL_INFO: return tdx_get_td_vm_call_info(vcpu); + case TDVMCALL_GET_QUOTE: + return tdx_get_quote(vcpu); default: break; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d00b85cb168c..e23e7286ad1a 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -178,6 +178,7 @@ struct kvm_xen_exit { #define KVM_EXIT_NOTIFY 37 #define KVM_EXIT_LOONGARCH_IOCSR 38 #define KVM_EXIT_MEMORY_FAULT 39 +#define KVM_EXIT_TDX 40 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -447,6 +448,22 @@ struct kvm_run { __u64 gpa; __u64 size; } memory_fault; + /* KVM_EXIT_TDX */ + struct { + __u64 flags; + __u64 nr; + union { + struct { + __u64 ret; + __u64 data[5]; + } unknown; + struct { + __u64 ret; + __u64 gpa; + __u64 size; + } get_quote; + }; + } tdx; /* Fix the size of the union. 
*/ char padding[256]; }; -- cgit v1.2.3 From 25e8b1dd4883e6c251c3db5b347f3c8ae4ade921 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Tue, 10 Jun 2025 10:14:21 +0800 Subject: KVM: TDX: Exit to userspace for GetTdVmCallInfo Exit to userspace for TDG.VP.VMCALL<GetTdVmCallInfo> via KVM_EXIT_TDX, to allow userspace to provide information about the support of TDVMCALLs when r12 is 1 for the TDVMCALLs beyond the GHCI base API. The GHCI spec defines the GHCI base TDVMCALLs: <GetTdVmCallInfo>, <MapGPA>, <ReportFatalError>, <Instruction.CPUID>, <#VE.RequestMMIO>, <Instruction.HLT>, <Instruction.IO>, <Instruction.RDMSR> and <Instruction.WRMSR>. They must be supported by the VMM to support TDX guests. For GetTdVmCallInfo - When leaf (r12) to enumerate TDVMCALL functionality is set to 0, successful execution indicates all GHCI base TDVMCALLs listed above are supported. Update the KVM TDX document with the set of the GHCI base APIs. - When leaf (r12) to enumerate TDVMCALL functionality is set to 1, it indicates the TDX guest is querying the supported TDVMCALLs beyond the GHCI base TDVMCALLs. Exit to userspace to let userspace set the TDVMCALL sub-function bit(s) according to the leaf outputs. KVM could set the TDVMCALL bit(s) supported by itself when the TDVMCALLs don't need support from userspace, after returning from userspace and before entering the guest. Currently, no such TDVMCALLs are implemented; KVM just sets the values returned from userspace. Suggested-by: Paolo Bonzini Signed-off-by: Binbin Wu [Adjust userspace API. - Paolo] Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 10 ++++++++++ arch/x86/kvm/vmx/tdx.c | 43 ++++++++++++++++++++++++++++++++++++++---- include/uapi/linux/kvm.h | 5 +++++ 3 files changed, 54 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 115ec3c2b641..9abf93ee5f65 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7191,6 +7191,11 @@ The valid value for 'flags' is: u64 gpa; u64 size; } get_quote; + struct { + u64 ret; + u64 leaf; + u64 r11, r12, r13, r14; + } get_tdvmcall_info; }; } tdx; @@ -7216,6 +7221,11 @@ queued successfully, the TDX guest can poll the status field in the shared-memory area to check whether the Quote generation is completed or not. When completed, the generated Quote is returned via the same buffer. +* ``TDVMCALL_GET_TD_VM_CALL_INFO``: the guest has requested the support +status of TDVMCALLs. The output values for the given leaf should be +placed in fields from ``r11`` to ``r14`` of the ``get_tdvmcall_info`` +field of the union. + KVM may add support for more values in the future that may cause a userspace exit, even without calls to ``KVM_ENABLE_CAP`` or similar. In this case, it will enter with output fields already valid; in the common case, the diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index b619a3478983..1ad20c273f3b 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1451,18 +1451,53 @@ error: return 1; } +static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret); + + /* + * For now, there is no TDVMCALL beyond GHCI base API supported by KVM + * directly without the support from userspace, just set the value + * returned from userspace.
+ */ + tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11; + tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12; + tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13; + tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14; + + return 1; +} + static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu) { struct vcpu_tdx *tdx = to_tdx(vcpu); - if (tdx->vp_enter_args.r12) - tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); - else { + switch (tdx->vp_enter_args.r12) { + case 0: tdx->vp_enter_args.r11 = 0; + tdx->vp_enter_args.r12 = 0; tdx->vp_enter_args.r13 = 0; tdx->vp_enter_args.r14 = 0; + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS); + return 1; + case 1: + vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12; + vcpu->run->exit_reason = KVM_EXIT_TDX; + vcpu->run->tdx.flags = 0; + vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO; + vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS; + vcpu->run->tdx.get_tdvmcall_info.r11 = 0; + vcpu->run->tdx.get_tdvmcall_info.r12 = 0; + vcpu->run->tdx.get_tdvmcall_info.r13 = 0; + vcpu->run->tdx.get_tdvmcall_info.r14 = 0; + vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info; + return 0; + default: + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; } - return 1; } static int tdx_complete_simple(struct kvm_vcpu *vcpu) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index e23e7286ad1a..37891580d05d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -462,6 +462,11 @@ struct kvm_run { __u64 gpa; __u64 size; } get_quote; + struct { + __u64 ret; + __u64 leaf; + __u64 r11, r12, r13, r14; + } get_tdvmcall_info; }; } tdx; /* Fix the size of the union. */ -- cgit v1.2.3 From 4580dbef5ce0f95a4bd8ac2d007bc4fbf1539332 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 20 Jun 2025 13:28:08 -0400 Subject: KVM: TDX: Exit to userspace for SetupEventNotifyInterrupt Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 7 +++++++ arch/x86/include/asm/shared/tdx.h | 1 + arch/x86/kvm/vmx/tdx.c | 23 +++++++++++++++++++++++ include/uapi/linux/kvm.h | 4 ++++ 4 files changed, 35 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 9abf93ee5f65..f0d961436d0f 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7196,6 +7196,10 @@ The valid value for 'flags' is: u64 leaf; u64 r11, r12, r13, r14; } get_tdvmcall_info; + struct { + u64 ret; + u64 vector; + } setup_event_notify; }; } tdx; @@ -7226,6 +7230,9 @@ status of TDVMCALLs. The output values for the given leaf should be placed in fields from ``r11`` to ``r14`` of the ``get_tdvmcall_info`` field of the union. +* ``TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT``: the guest has requested to +set up a notification interrupt for vector ``vector``. + KVM may add support for more values in the future that may cause a userspace exit, even without calls to ``KVM_ENABLE_CAP`` or similar. 
In this case, it will enter with output fields already valid; in the common case, the diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index d8525e6ef50a..8bc074c8d7c6 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -72,6 +72,7 @@ #define TDVMCALL_MAP_GPA 0x10001 #define TDVMCALL_GET_QUOTE 0x10002 #define TDVMCALL_REPORT_FATAL_ERROR 0x10003 +#define TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT 0x10004ULL /* * TDG.VP.VMCALL Status Codes (returned in R10) diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 1ad20c273f3b..b4055a746ecd 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1530,6 +1530,27 @@ static int tdx_get_quote(struct kvm_vcpu *vcpu) return 0; } +static int tdx_setup_event_notify_interrupt(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u64 vector = tdx->vp_enter_args.r12; + + if (vector < 32 || vector > 255) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; + } + + vcpu->run->exit_reason = KVM_EXIT_TDX; + vcpu->run->tdx.flags = 0; + vcpu->run->tdx.nr = TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT; + vcpu->run->tdx.setup_event_notify.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; + vcpu->run->tdx.setup_event_notify.vector = vector; + + vcpu->arch.complete_userspace_io = tdx_complete_simple; + + return 0; +} + static int handle_tdvmcall(struct kvm_vcpu *vcpu) { switch (tdvmcall_leaf(vcpu)) { @@ -1541,6 +1562,8 @@ static int handle_tdvmcall(struct kvm_vcpu *vcpu) return tdx_get_td_vm_call_info(vcpu); case TDVMCALL_GET_QUOTE: return tdx_get_quote(vcpu); + case TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: + return tdx_setup_event_notify_interrupt(vcpu); default: break; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 37891580d05d..7a4c35ff03fe 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -467,6 +467,10 @@ struct kvm_run { __u64 leaf; __u64 r11, r12, r13, r14; } get_tdvmcall_info; + struct { + __u64 ret; + __u64 vector; + } setup_event_notify; }; } tdx; /* Fix the size of the union. */ -- cgit v1.2.3 From 7bd43cc79cab3850f34da0a31d5b042b701590ef Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 19 Jun 2025 19:18:03 +0800 Subject: fs: introduce FALLOC_FL_WRITE_ZEROES to fallocate With the development of flash-based storage devices, we can quickly write zeros to SSDs using the WRITE_ZERO command if the devices do not actually write physical zeroes to the media. Therefore, we can use this command to quickly preallocate a real all-zero file with written extents. This approach should be beneficial for subsequent pure overwriting within this file, as it can save on block allocation and, consequently, significant metadata changes, which should greatly improve overwrite performance on certain filesystems. Therefore, introduce a new operation FALLOC_FL_WRITE_ZEROES to fallocate. This flag is used to convert a specified range of a file to zeros by issuing a zeroing operation. Blocks should be allocated for the regions that span holes in the file, and the entire range is converted to written extents. If the underlying device supports the actual offload write zeroes command, the zeroing operation can be accelerated. If it does not, we currently don't prevent the file system from writing actual zeros to the device. This provides users with a new method to quickly generate a zeroed file; users no longer need to write zero data to create a file with written extents.
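As a quick illustration of the intended userspace usage (a minimal sketch; the file name is made up, error handling is trimmed, and it assumes a uapi falloc.h that already carries the new flag):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <linux/falloc.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("preallocated.img", O_WRONLY | O_CREAT, 0644);

            if (fd < 0)
                    return 1;

            /* Allocate 1 GiB of written, all-zero extents. The flag must
             * not be combined with FALLOC_FL_KEEP_SIZE; expect EOPNOTSUPP
             * from filesystems that do not support the operation.
             */
            if (fallocate(fd, FALLOC_FL_WRITE_ZEROES, 0, 1LL << 30) < 0)
                    perror("fallocate");

            close(fd);
            return 0;
    }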
Users can determine whether a disk supports the unmap write zeroes feature by querying this sysfs interface: /sys/block/<disk>/queue/write_zeroes_unmap_max_hw_bytes Users can also enable or disable the unmap write zeroes operation through this sysfs interface: /sys/block/<disk>/queue/write_zeroes_unmap_max_bytes Finally, this flag cannot be specified in conjunction with FALLOC_FL_KEEP_SIZE, since allocating written extents beyond file EOF is not permitted. In addition, filesystems that always require out-of-place writes should not support this flag, since they still need to allocate new blocks during subsequent overwrites. Signed-off-by: Zhang Yi Link: https://lore.kernel.org/20250619111806.3546162-7-yi.zhang@huaweicloud.com Reviewed-by: Christoph Hellwig Reviewed-by: "Martin K. Petersen" Signed-off-by: Christian Brauner --- fs/open.c | 1 + include/linux/falloc.h | 3 ++- include/uapi/linux/falloc.h | 17 +++++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/fs/open.c b/fs/open.c index 7828234a7caa..b777e11e5522 100644 --- a/fs/open.c +++ b/fs/open.c @@ -281,6 +281,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) break; case FALLOC_FL_COLLAPSE_RANGE: case FALLOC_FL_INSERT_RANGE: + case FALLOC_FL_WRITE_ZEROES: if (mode & FALLOC_FL_KEEP_SIZE) return -EOPNOTSUPP; break; diff --git a/include/linux/falloc.h b/include/linux/falloc.h index 3f49f3df6af5..7c38c6b76b60 100644 --- a/include/linux/falloc.h +++ b/include/linux/falloc.h @@ -36,7 +36,8 @@ struct space_resv { FALLOC_FL_COLLAPSE_RANGE | \ FALLOC_FL_ZERO_RANGE | \ FALLOC_FL_INSERT_RANGE | \ - FALLOC_FL_UNSHARE_RANGE) + FALLOC_FL_UNSHARE_RANGE | \ + FALLOC_FL_WRITE_ZEROES) /* on ia32 l_start is on a 32-bit boundary */ #if defined(CONFIG_X86_64) diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h index 5810371ed72b..1f9ca757d02d 100644 --- a/include/uapi/linux/falloc.h +++ b/include/uapi/linux/falloc.h @@ -78,4 +78,21 @@ */ #define FALLOC_FL_UNSHARE_RANGE 0x40 +/* + * FALLOC_FL_WRITE_ZEROES zeroes a specified file range in such a way that + * subsequent writes to that range do not require further changes to the file + * mapping metadata. This flag is beneficial for subsequent pure overwriting + * within this range, as it can save on block allocation and, consequently, + * significant metadata changes. Therefore, filesystems that always require + * out-of-place writes should not support this flag. + * + * Different filesystems may implement different limitations on the + * granularity of the zeroing operation. Most will preferably be accelerated + * by submitting write zeroes command if the backing storage supports, which + * may not physically write zeros to the media. + * + * This flag cannot be specified in conjunction with the FALLOC_FL_KEEP_SIZE. + */ +#define FALLOC_FL_WRITE_ZEROES 0x80 + #endif /* _UAPI_FALLOC_H_ */ -- cgit v1.2.3 From cb9ccfb404e700dc0db59d68242d79fe386bb3f0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 13 Jun 2025 17:05:19 -0600 Subject: io_uring/nop: add IORING_NOP_TW completion flag To test and profile the overhead of io_uring task_work and the various types of it, add IORING_NOP_TW which tells nop to signal completions through task_work rather than complete them inline.
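For reference, exercising the new flag from userspace could look roughly like this (a sketch using liburing; it assumes the sqe->nop_flags union member and the IORING_NOP_TW definition are visible through up-to-date headers):

    #include <liburing.h>

    static int nop_via_task_work(struct io_uring *ring)
    {
            struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
            struct io_uring_cqe *cqe;
            int ret;

            if (!sqe)
                    return -EBUSY;

            io_uring_prep_nop(sqe);
            sqe->nop_flags = IORING_NOP_TW;   /* complete via task_work */

            ret = io_uring_submit(ring);
            if (ret < 0)
                    return ret;

            /* The CQE looks identical to an inline NOP completion; only
             * the kernel-internal completion path differs.
             */
            ret = io_uring_wait_cqe(ring, &cqe);
            if (!ret)
                    io_uring_cqe_seen(ring, cqe);
            return ret;
    }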
Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + io_uring/nop.c | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index cfd17e382082..8c3d43caab02 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -449,6 +449,7 @@ enum io_uring_msg_ring_flags { #define IORING_NOP_FILE (1U << 1) #define IORING_NOP_FIXED_FILE (1U << 2) #define IORING_NOP_FIXED_BUFFER (1U << 3) +#define IORING_NOP_TW (1U << 4) /* * IO completion data structure (Completion Queue Entry) diff --git a/io_uring/nop.c b/io_uring/nop.c index 6ac2de761fd3..20ed0f85b1c2 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -20,7 +20,8 @@ struct io_nop { }; #define NOP_FLAGS (IORING_NOP_INJECT_RESULT | IORING_NOP_FIXED_FILE | \ - IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE) + IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE | \ + IORING_NOP_TW) int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -68,5 +69,10 @@ done: if (ret < 0) req_set_fail(req); io_req_set_res(req, nop->result, 0); + if (nop->flags & IORING_NOP_TW) { + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); + return IOU_ISSUE_SKIP_COMPLETE; + } return IOU_COMPLETE; } -- cgit v1.2.3 From 9e4ed359b8efad0e8ad4510d8ad22bf0b060526a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 16 Jun 2025 10:46:29 +0100 Subject: io_uring/netcmd: add tx timestamping cmd support Add a new socket command which returns tx time stamps to the user. It provides an alternative to the existing error queue recvmsg interface. The command works in a polled multishot mode, which means io_uring will poll the socket and keep posting timestamps until the request is cancelled or fails in any other way (e.g. with no space in the CQ). It reuses the net infra and grabs timestamps from the socket's error queue. The command requires IORING_SETUP_CQE32. All non-final CQEs (marked with IORING_CQE_F_MORE) have cqe->res set to the tskey, and the upper 16 bits of cqe->flags keep tstype (i.e. offset by IORING_CQE_BUFFER_SHIFT). The time value is stored in the upper part of the extended CQE. The final completion won't have IORING_CQE_F_MORE and will have cqe->res storing 0/error.
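Reading the resulting completions might look roughly like this (a sketch built from the definitions added by this patch; it assumes the ring was created with IORING_SETUP_CQE32, that the extra 16 bytes of each CQE are exposed as big_cqe[], and that the uapi header provides struct io_timespec):

    #include <liburing.h>
    #include <stdio.h>

    /* Decode one SOCKET_URING_OP_TX_TIMESTAMP completion. */
    static void handle_tx_tstamp_cqe(struct io_uring_cqe *cqe)
    {
            if (!(cqe->flags & IORING_CQE_F_MORE)) {
                    /* final CQE: cqe->res is 0 or a negative error */
                    return;
            }

            __u32 tskey  = cqe->res;
            __u32 tstype = cqe->flags >> IORING_TIMESTAMP_TYPE_SHIFT;
            /* open-coded hardware-timestamp bit test */
            int hw = !!(cqe->flags & ((__u32)1 << IORING_TIMESTAMP_HW_SHIFT));

            /* The time value sits in the upper half of the 32-byte CQE. */
            const struct io_timespec *ts =
                    (const struct io_timespec *)cqe->big_cqe;

            printf("key %u type %u hw %d: %llu.%09llu\n",
                   tskey, tstype, hw,
                   (unsigned long long)ts->tv_sec,
                   (unsigned long long)ts->tv_nsec);
    }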
Suggested-by: Vadim Fedorenko Acked-by: Willem de Bruijn Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/92ee66e6b33b8de062a977843d825f58f21ecd37.1750065793.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 16 +++++++++ io_uring/cmd_net.c | 82 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 8c3d43caab02..85600ad0ac08 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -969,6 +969,22 @@ enum io_uring_socket_op { SOCKET_URING_OP_SIOCOUTQ, SOCKET_URING_OP_GETSOCKOPT, SOCKET_URING_OP_SETSOCKOPT, + SOCKET_URING_OP_TX_TIMESTAMP, +}; + +/* + * SOCKET_URING_OP_TX_TIMESTAMP definitions + */ + +#define IORING_TIMESTAMP_HW_SHIFT 16 +/* The cqe->flags bit from which the timestamp type is stored */ +#define IORING_TIMESTAMP_TYPE_SHIFT (IORING_TIMESTAMP_HW_SHIFT + 1) +/* The cqe->flags flag signifying whether it's a hardware timestamp */ +#define IORING_CQE_F_TSTAMP_HW ((__u32)1 << IORING_TIMESTAMP_HW_SHIFT); + +struct io_timespec { + __u64 tv_sec; + __u64 tv_nsec; }; /* Zero copy receive refill queue entry */ diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c index e99170c7d41a..3866fe6ff541 100644 --- a/io_uring/cmd_net.c +++ b/io_uring/cmd_net.c @@ -1,5 +1,6 @@ #include #include +#include #include #include "uring_cmd.h" @@ -51,6 +52,85 @@ static inline int io_uring_cmd_setsockopt(struct socket *sock, optlen); } +static bool io_process_timestamp_skb(struct io_uring_cmd *cmd, struct sock *sk, + struct sk_buff *skb, unsigned issue_flags) +{ + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + struct io_uring_cqe cqe[2]; + struct io_timespec *iots; + struct timespec64 ts; + u32 tstype, tskey; + int ret; + + BUILD_BUG_ON(sizeof(struct io_uring_cqe) != sizeof(struct io_timespec)); + + ret = skb_get_tx_timestamp(skb, sk, &ts); + if (ret < 0) + return false; + + tskey = serr->ee.ee_data; + tstype = serr->ee.ee_info; + + cqe->user_data = 0; + cqe->res = tskey; + cqe->flags = IORING_CQE_F_MORE; + cqe->flags |= tstype << IORING_TIMESTAMP_TYPE_SHIFT; + if (ret == SOF_TIMESTAMPING_TX_HARDWARE) + cqe->flags |= IORING_CQE_F_TSTAMP_HW; + + iots = (struct io_timespec *)&cqe[1]; + iots->tv_sec = ts.tv_sec; + iots->tv_nsec = ts.tv_nsec; + return io_uring_cmd_post_mshot_cqe32(cmd, issue_flags, cqe); +} + +static int io_uring_cmd_timestamp(struct socket *sock, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct sock *sk = sock->sk; + struct sk_buff_head *q = &sk->sk_error_queue; + struct sk_buff *skb, *tmp; + struct sk_buff_head list; + int ret; + + if (!(issue_flags & IO_URING_F_CQE32)) + return -EINVAL; + ret = io_cmd_poll_multishot(cmd, issue_flags, EPOLLERR); + if (unlikely(ret)) + return ret; + + if (skb_queue_empty_lockless(q)) + return -EAGAIN; + __skb_queue_head_init(&list); + + scoped_guard(spinlock_irq, &q->lock) { + skb_queue_walk_safe(q, skb, tmp) { + /* don't support skbs with payload */ + if (!skb_has_tx_timestamp(skb, sk) || skb->len) + continue; + __skb_unlink(skb, q); + __skb_queue_tail(&list, skb); + } + } + + while (1) { + skb = skb_peek(&list); + if (!skb) + break; + if (!io_process_timestamp_skb(cmd, sk, skb, issue_flags)) + break; + __skb_dequeue(&list); + consume_skb(skb); + } + + if (!unlikely(skb_queue_empty(&list))) { + scoped_guard(spinlock_irqsave, &q->lock) + skb_queue_splice(q, &list); + } + return -EAGAIN; +} + int io_uring_cmd_sock(struct 
io_uring_cmd *cmd, unsigned int issue_flags) { struct socket *sock = cmd->file->private_data; @@ -76,6 +156,8 @@ int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) return io_uring_cmd_getsockopt(sock, cmd, issue_flags); case SOCKET_URING_OP_SETSOCKOPT: return io_uring_cmd_setsockopt(sock, cmd, issue_flags); + case SOCKET_URING_OP_TX_TIMESTAMP: + return io_uring_cmd_timestamp(sock, cmd, issue_flags); default: return -EOPNOTSUPP; } -- cgit v1.2.3 From b74947b4f6ff7c122a1bb6eb38bb7ecfbb1d3820 Mon Sep 17 00:00:00 2001 From: Roopni Devanathan Date: Sun, 15 Jun 2025 13:53:09 +0530 Subject: wifi: cfg80211/mac80211: Add support to get radio index Currently, per-radio attributes are set on a per-phy basis, i.e., all the radios present in a wiphy take the attribute values sent from userspace. But each radio in a wiphy can get different values from userspace based on its requirement. To extend support to set per-radio attributes, add support to get the radio index from userspace. Add an NL attribute - NL80211_ATTR_WIPHY_RADIO_INDEX - to get the user-specified radio index for which attributes should be changed. Pass this to individual drivers, so that the drivers can use this radio index to change per-radio attributes when necessary. Currently, per-radio attributes identified are: NL80211_ATTR_WIPHY_TX_POWER_LEVEL NL80211_ATTR_WIPHY_ANTENNA_TX NL80211_ATTR_WIPHY_ANTENNA_RX NL80211_ATTR_WIPHY_RETRY_SHORT NL80211_ATTR_WIPHY_RETRY_LONG NL80211_ATTR_WIPHY_FRAG_THRESHOLD NL80211_ATTR_WIPHY_RTS_THRESHOLD NL80211_ATTR_WIPHY_COVERAGE_CLASS NL80211_ATTR_TXQ_LIMIT NL80211_ATTR_TXQ_MEMORY_LIMIT NL80211_ATTR_TXQ_QUANTUM By default, the radio index is set to -1. This means the attribute should be treated as a global configuration. If the user has not specified any index, the radio index passed to individual drivers will be -1, indicating that the attribute applies to all radios in that wiphy.
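To make the -1 convention concrete, a driver callback could handle it along these lines (a minimal sketch of a hypothetical driver; drv_priv, n_radios and drv_apply_rts() are made-up names, but the radio_idx semantics mirror the description above and the updated callback signatures below):

    static int drv_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
                                     u32 value)
    {
            struct drv_priv *priv = hw->priv;
            int i, ret;

            /* radio_idx >= 0: apply to exactly one radio of the wiphy */
            if (radio_idx >= 0)
                    return drv_apply_rts(priv, radio_idx, value);

            /* radio_idx == -1: global configuration, apply to all radios */
            for (i = 0; i < priv->n_radios; i++) {
                    ret = drv_apply_rts(priv, i, value);
                    if (ret)
                            return ret;
            }
            return 0;
    }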
Signed-off-by: Roopni Devanathan Link: https://patch.msgid.link/20250615082312.619639-2-quic_rdevanat@quicinc.com Signed-off-by: Johannes Berg --- drivers/net/wireless/admtek/adm8211.c | 2 +- drivers/net/wireless/ath/ar5523/ar5523.c | 5 +- drivers/net/wireless/ath/ath10k/core.c | 2 +- drivers/net/wireless/ath/ath10k/hw.c | 1 + drivers/net/wireless/ath/ath10k/hw.h | 2 +- drivers/net/wireless/ath/ath10k/mac.c | 19 ++++-- drivers/net/wireless/ath/ath11k/mac.c | 14 ++-- drivers/net/wireless/ath/ath12k/mac.c | 14 ++-- drivers/net/wireless/ath/ath5k/mac80211-ops.c | 12 ++-- drivers/net/wireless/ath/ath6kl/cfg80211.c | 7 +- drivers/net/wireless/ath/ath9k/htc_drv_main.c | 10 +-- drivers/net/wireless/ath/ath9k/main.c | 9 ++- drivers/net/wireless/ath/carl9170/main.c | 2 +- drivers/net/wireless/ath/wcn36xx/main.c | 5 +- drivers/net/wireless/ath/wil6210/cfg80211.c | 3 +- drivers/net/wireless/atmel/at76c50x-usb.c | 2 +- drivers/net/wireless/broadcom/b43/main.c | 6 +- drivers/net/wireless/broadcom/b43legacy/main.c | 2 +- .../broadcom/brcm80211/brcmfmac/cfg80211.c | 8 ++- .../broadcom/brcm80211/brcmsmac/mac80211_if.c | 3 +- drivers/net/wireless/intel/iwlegacy/common.c | 2 +- drivers/net/wireless/intel/iwlegacy/common.h | 2 +- drivers/net/wireless/intel/iwlwifi/dvm/agn.h | 2 +- drivers/net/wireless/intel/iwlwifi/dvm/rxon.c | 2 +- drivers/net/wireless/intel/iwlwifi/mld/mac80211.c | 6 +- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 9 ++- drivers/net/wireless/intel/iwlwifi/mvm/mvm.h | 12 ++-- drivers/net/wireless/intersil/p54/main.c | 3 +- drivers/net/wireless/marvell/libertas_tf/main.c | 2 +- drivers/net/wireless/marvell/mwifiex/cfg80211.c | 11 ++- drivers/net/wireless/marvell/mwl8k.c | 12 ++-- drivers/net/wireless/mediatek/mt76/mac80211.c | 3 +- drivers/net/wireless/mediatek/mt76/mt76.h | 3 +- drivers/net/wireless/mediatek/mt76/mt7603/main.c | 5 +- drivers/net/wireless/mediatek/mt76/mt7615/main.c | 11 +-- drivers/net/wireless/mediatek/mt76/mt76x0/main.c | 2 +- drivers/net/wireless/mediatek/mt76/mt76x0/mt76x0.h | 2 +- drivers/net/wireless/mediatek/mt76/mt76x02.h | 4 +- drivers/net/wireless/mediatek/mt76/mt76x02_util.c | 4 +- .../net/wireless/mediatek/mt76/mt76x2/pci_main.c | 6 +- .../net/wireless/mediatek/mt76/mt76x2/usb_main.c | 2 +- drivers/net/wireless/mediatek/mt76/mt7915/main.c | 13 ++-- drivers/net/wireless/mediatek/mt76/mt7921/main.c | 8 ++- drivers/net/wireless/mediatek/mt76/mt7925/main.c | 8 ++- drivers/net/wireless/mediatek/mt76/mt792x.h | 3 +- drivers/net/wireless/mediatek/mt76/mt792x_core.c | 3 +- drivers/net/wireless/mediatek/mt76/mt7996/main.c | 11 +-- drivers/net/wireless/mediatek/mt7601u/main.c | 5 +- drivers/net/wireless/microchip/wilc1000/cfg80211.c | 7 +- drivers/net/wireless/purelifi/plfxlc/mac.c | 5 +- drivers/net/wireless/quantenna/qtnfmac/cfg80211.c | 8 ++- drivers/net/wireless/ralink/rt2x00/rt2800lib.c | 2 +- drivers/net/wireless/ralink/rt2x00/rt2800lib.h | 3 +- drivers/net/wireless/ralink/rt2x00/rt2x00.h | 8 ++- drivers/net/wireless/ralink/rt2x00/rt2x00mac.c | 8 ++- drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c | 2 +- drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c | 2 +- drivers/net/wireless/realtek/rtl8xxxu/core.c | 8 ++- drivers/net/wireless/realtek/rtlwifi/core.c | 2 +- drivers/net/wireless/realtek/rtw88/mac80211.c | 9 ++- drivers/net/wireless/realtek/rtw88/main.h | 2 +- drivers/net/wireless/realtek/rtw88/rtw8822b.c | 1 + drivers/net/wireless/realtek/rtw88/rtw8822c.c | 1 + drivers/net/wireless/realtek/rtw89/mac80211.c | 10 +-- 
drivers/net/wireless/rsi/rsi_91x_mac80211.c | 9 ++- drivers/net/wireless/silabs/wfx/sta.c | 4 +- drivers/net/wireless/silabs/wfx/sta.h | 4 +- drivers/net/wireless/st/cw1200/sta.c | 5 +- drivers/net/wireless/st/cw1200/sta.h | 5 +- drivers/net/wireless/ti/wl1251/main.c | 5 +- drivers/net/wireless/ti/wlcore/main.c | 8 ++- drivers/net/wireless/virtual/mac80211_hwsim.c | 6 +- drivers/net/wireless/zydas/zd1211rw/zd_mac.c | 2 +- drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c | 6 +- include/net/cfg80211.h | 12 ++-- include/net/mac80211.h | 17 +++-- include/uapi/linux/nl80211.h | 10 +++ net/mac80211/cfg.c | 30 +++++--- net/mac80211/chan.c | 2 +- net/mac80211/driver-ops.h | 36 +++++----- net/mac80211/ieee80211_i.h | 5 +- net/mac80211/iface.c | 6 +- net/mac80211/main.c | 9 +-- net/mac80211/mlme.c | 12 ++-- net/mac80211/offchannel.c | 2 +- net/mac80211/pm.c | 2 +- net/mac80211/trace.h | 78 ++++++++++++++++----- net/mac80211/tx.c | 4 +- net/mac80211/util.c | 16 ++--- net/wireless/nl80211.c | 26 +++++-- net/wireless/rdev-ops.h | 39 ++++++----- net/wireless/trace.h | 79 +++++++++++++++------- net/wireless/wext-compat.c | 10 +-- 93 files changed, 520 insertions(+), 291 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/wireless/admtek/adm8211.c b/drivers/net/wireless/admtek/adm8211.c index a2d87c3ad196..e94a6b180314 100644 --- a/drivers/net/wireless/admtek/adm8211.c +++ b/drivers/net/wireless/admtek/adm8211.c @@ -1293,7 +1293,7 @@ static void adm8211_set_bssid(struct ieee80211_hw *dev, const u8 *bssid) ADM8211_CSR_WRITE(ABDA1, reg); } -static int adm8211_config(struct ieee80211_hw *dev, u32 changed) +static int adm8211_config(struct ieee80211_hw *dev, int radio_idx, u32 changed) { struct adm8211_priv *priv = dev->priv; struct ieee80211_conf *conf = &dev->conf; diff --git a/drivers/net/wireless/ath/ar5523/ar5523.c b/drivers/net/wireless/ath/ar5523/ar5523.c index 343c9de2749c..1230e6278f23 100644 --- a/drivers/net/wireless/ath/ar5523/ar5523.c +++ b/drivers/net/wireless/ath/ar5523/ar5523.c @@ -1083,7 +1083,8 @@ static void ar5523_stop(struct ieee80211_hw *hw, bool suspend) mutex_unlock(&ar->mutex); } -static int ar5523_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int ar5523_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { struct ar5523 *ar = hw->priv; int ret; @@ -1137,7 +1138,7 @@ static void ar5523_remove_interface(struct ieee80211_hw *hw, ar->vif = NULL; } -static int ar5523_hwconfig(struct ieee80211_hw *hw, u32 changed) +static int ar5523_hwconfig(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct ar5523 *ar = hw->priv; diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c index fe3a8f4a1cc1..52163c2bfe7a 100644 --- a/drivers/net/wireless/ath/ath10k/core.c +++ b/drivers/net/wireless/ath/ath10k/core.c @@ -2606,7 +2606,7 @@ static void ath10k_core_set_coverage_class_work(struct work_struct *work) set_coverage_class_work); if (ar->hw_params.hw_ops->set_coverage_class) - ar->hw_params.hw_ops->set_coverage_class(ar, -1); + ar->hw_params.hw_ops->set_coverage_class(ar, -1, -1); } static int ath10k_core_init_firmware_features(struct ath10k *ar) diff --git a/drivers/net/wireless/ath/ath10k/hw.c b/drivers/net/wireless/ath/ath10k/hw.c index 84b35a22fc23..59b6cebfdd8f 100644 --- a/drivers/net/wireless/ath/ath10k/hw.c +++ b/drivers/net/wireless/ath/ath10k/hw.c @@ -590,6 +590,7 @@ void ath10k_hw_fill_survey_time(struct ath10k *ar, struct survey_info *survey, * function monitors and modifies the 
corresponding MAC registers. */ static void ath10k_hw_qca988x_set_coverage_class(struct ath10k *ar, + int radio_idx, s16 value) { u32 slottime_reg; diff --git a/drivers/net/wireless/ath/ath10k/hw.h b/drivers/net/wireless/ath/ath10k/hw.h index 7ffa1fbe2874..fec56b916497 100644 --- a/drivers/net/wireless/ath/ath10k/hw.h +++ b/drivers/net/wireless/ath/ath10k/hw.h @@ -646,7 +646,7 @@ struct htt_rx_ring_rx_desc_offsets; /* Defines needed for Rx descriptor abstraction */ struct ath10k_hw_ops { - void (*set_coverage_class)(struct ath10k *ar, s16 value); + void (*set_coverage_class)(struct ath10k *ar, int radio_idx, s16 value); int (*enable_pll_clk)(struct ath10k *ar); int (*tx_data_rssi_pad_bytes)(struct htt_resp *htt); int (*is_rssi_enable)(struct htt_resp *resp); diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 07fe05384cdf..590d7a8dd399 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -4820,7 +4820,8 @@ void ath10k_halt(struct ath10k *ar) spin_unlock_bh(&ar->data_lock); } -static int ath10k_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant) +static int ath10k_get_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 *tx_ant, u32 *rx_ant) { struct ath10k *ar = hw->priv; @@ -5067,7 +5068,8 @@ static int __ath10k_set_antenna(struct ath10k *ar, u32 tx_ant, u32 rx_ant) return 0; } -static int ath10k_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +static int ath10k_set_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant) { struct ath10k *ar = hw->priv; int ret; @@ -5437,7 +5439,7 @@ static int ath10k_config_ps(struct ath10k *ar) return ret; } -static int ath10k_config(struct ieee80211_hw *hw, u32 changed) +static int ath10k_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct ath10k *ar = hw->priv; struct ieee80211_conf *conf = &hw->conf; @@ -6336,7 +6338,8 @@ static void ath10k_bss_info_changed(struct ieee80211_hw *hw, mutex_unlock(&ar->conf_mutex); } -static void ath10k_mac_op_set_coverage_class(struct ieee80211_hw *hw, s16 value) +static void ath10k_mac_op_set_coverage_class(struct ieee80211_hw *hw, int radio_idx, + s16 value) { struct ath10k *ar = hw->priv; @@ -6347,7 +6350,7 @@ static void ath10k_mac_op_set_coverage_class(struct ieee80211_hw *hw, s16 value) WARN_ON_ONCE(1); return; } - ar->hw_params.hw_ops->set_coverage_class(ar, value); + ar->hw_params.hw_ops->set_coverage_class(ar, -1, value); } struct ath10k_mac_tdls_iter_data { @@ -8035,7 +8038,8 @@ static int ath10k_cancel_remain_on_channel(struct ieee80211_hw *hw, * in ath10k, but device-specific in mac80211. */ -static int ath10k_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int ath10k_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { struct ath10k *ar = hw->priv; struct ath10k_vif *arvif; @@ -8058,7 +8062,8 @@ static int ath10k_set_rts_threshold(struct ieee80211_hw *hw, u32 value) return ret; } -static int ath10k_mac_op_set_frag_threshold(struct ieee80211_hw *hw, u32 value) +static int ath10k_mac_op_set_frag_threshold(struct ieee80211_hw *hw, + int radio_idx, u32 value) { /* Even though there's a WMI enum for fragmentation threshold no known * firmware actually implements it. 
Moreover it is not possible to rely diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index 13301ca317a5..758ef6f26432 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -1283,7 +1283,7 @@ static int ath11k_mac_config_ps(struct ath11k *ar) return ret; } -static int ath11k_mac_op_config(struct ieee80211_hw *hw, u32 changed) +static int ath11k_mac_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct ath11k *ar = hw->priv; struct ieee80211_conf *conf = &hw->conf; @@ -7044,7 +7044,8 @@ static void ath11k_mac_op_configure_filter(struct ieee80211_hw *hw, mutex_unlock(&ar->conf_mutex); } -static int ath11k_mac_op_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant) +static int ath11k_mac_op_get_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 *tx_ant, u32 *rx_ant) { struct ath11k *ar = hw->priv; @@ -7058,7 +7059,8 @@ static int ath11k_mac_op_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 * return 0; } -static int ath11k_mac_op_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +static int ath11k_mac_op_set_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant) { struct ath11k *ar = hw->priv; int ret; @@ -8182,7 +8184,8 @@ ath11k_set_vdev_param_to_all_vifs(struct ath11k *ar, int param, u32 value) /* mac80211 stores device specific RTS/Fragmentation threshold value, * this is set interface specific to firmware from ath11k driver */ -static int ath11k_mac_op_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int ath11k_mac_op_set_rts_threshold(struct ieee80211_hw *hw, + int radio_idx, u32 value) { struct ath11k *ar = hw->priv; int param_id = WMI_VDEV_PARAM_RTS_THRESHOLD; @@ -8190,7 +8193,8 @@ static int ath11k_mac_op_set_rts_threshold(struct ieee80211_hw *hw, u32 value) return ath11k_set_vdev_param_to_all_vifs(ar, param_id, value); } -static int ath11k_mac_op_set_frag_threshold(struct ieee80211_hw *hw, u32 value) +static int ath11k_mac_op_set_frag_threshold(struct ieee80211_hw *hw, + int radio_idx, u32 value) { /* Even though there's a WMI vdev param for fragmentation threshold no * known firmware actually implements it. 
Moreover it is not possible to diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 59ec422992d3..81c6b80fa890 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -1392,7 +1392,7 @@ err: return ret; } -static int ath12k_mac_op_config(struct ieee80211_hw *hw, u32 changed) +static int ath12k_mac_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { return 0; } @@ -9354,7 +9354,8 @@ static void ath12k_mac_op_configure_filter(struct ieee80211_hw *hw, ar->filter_flags = *total_flags; } -static int ath12k_mac_op_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant) +static int ath12k_mac_op_get_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 *tx_ant, u32 *rx_ant) { struct ath12k_hw *ah = ath12k_hw_to_ah(hw); int antennas_rx = 0, antennas_tx = 0; @@ -9374,7 +9375,8 @@ static int ath12k_mac_op_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 * return 0; } -static int ath12k_mac_op_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +static int ath12k_mac_op_set_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant) { struct ath12k_hw *ah = ath12k_hw_to_ah(hw); struct ath12k *ar; @@ -10735,7 +10737,8 @@ ath12k_set_vdev_param_to_all_vifs(struct ath12k *ar, int param, u32 value) /* mac80211 stores device specific RTS/Fragmentation threshold value, * this is set interface specific to firmware from ath12k driver */ -static int ath12k_mac_op_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int ath12k_mac_op_set_rts_threshold(struct ieee80211_hw *hw, + int radio_idx, u32 value) { struct ath12k_hw *ah = ath12k_hw_to_ah(hw); struct ath12k *ar; @@ -10760,7 +10763,8 @@ static int ath12k_mac_op_set_rts_threshold(struct ieee80211_hw *hw, u32 value) return ret; } -static int ath12k_mac_op_set_frag_threshold(struct ieee80211_hw *hw, u32 value) +static int ath12k_mac_op_set_frag_threshold(struct ieee80211_hw *hw, + int radio_idx, u32 value) { /* Even though there's a WMI vdev param for fragmentation threshold no * known firmware actually implements it. Moreover it is not possible to diff --git a/drivers/net/wireless/ath/ath5k/mac80211-ops.c b/drivers/net/wireless/ath/ath5k/mac80211-ops.c index d81b2ad0b095..eca8145d3874 100644 --- a/drivers/net/wireless/ath/ath5k/mac80211-ops.c +++ b/drivers/net/wireless/ath/ath5k/mac80211-ops.c @@ -192,7 +192,7 @@ ath5k_remove_interface(struct ieee80211_hw *hw, * TODO: Phy disable/diversity etc */ static int -ath5k_config(struct ieee80211_hw *hw, u32 changed) +ath5k_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct ath5k_hw *ah = hw->priv; struct ieee80211_conf *conf = &hw->conf; @@ -686,6 +686,7 @@ ath5k_get_survey(struct ieee80211_hw *hw, int idx, struct survey_info *survey) * ath5k_set_coverage_class - Set IEEE 802.11 coverage class * * @hw: struct ieee80211_hw pointer + * @radio_idx: Radio index * @coverage_class: IEEE 802.11 coverage class number * * Mac80211 callback. Sets slot time, ACK timeout and CTS timeout for given @@ -693,7 +694,8 @@ ath5k_get_survey(struct ieee80211_hw *hw, int idx, struct survey_info *survey) * reset. 
*/ static void -ath5k_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class) +ath5k_set_coverage_class(struct ieee80211_hw *hw, int radio_idx, + s16 coverage_class) { struct ath5k_hw *ah = hw->priv; @@ -704,7 +706,8 @@ ath5k_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class) static int -ath5k_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +ath5k_set_antenna(struct ieee80211_hw *hw, int radio_idx, u32 tx_ant, + u32 rx_ant) { struct ath5k_hw *ah = hw->priv; @@ -721,7 +724,8 @@ ath5k_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) static int -ath5k_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant) +ath5k_get_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 *tx_ant, u32 *rx_ant) { struct ath5k_hw *ah = hw->priv; diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c index 8c2e8081112e..88f0197fc041 100644 --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c @@ -1376,7 +1376,8 @@ void ath6kl_cfg80211_tkip_micerr_event(struct ath6kl_vif *vif, u8 keyid, GFP_KERNEL); } -static int ath6kl_cfg80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) +static int ath6kl_cfg80211_set_wiphy_params(struct wiphy *wiphy, int radio_idx, + u32 changed) { struct ath6kl *ar = (struct ath6kl *)wiphy_priv(wiphy); struct ath6kl_vif *vif; @@ -1405,6 +1406,7 @@ static int ath6kl_cfg80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) static int ath6kl_cfg80211_set_txpower(struct wiphy *wiphy, struct wireless_dev *wdev, + int radio_idx, enum nl80211_tx_power_setting type, int mbm) { @@ -1441,6 +1443,7 @@ static int ath6kl_cfg80211_set_txpower(struct wiphy *wiphy, static int ath6kl_cfg80211_get_txpower(struct wiphy *wiphy, struct wireless_dev *wdev, + int radio_idx, unsigned int link_id, int *dbm) { @@ -3242,7 +3245,7 @@ static int ath6kl_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, wait, buf, len, no_cck); } -static int ath6kl_get_antenna(struct wiphy *wiphy, +static int ath6kl_get_antenna(struct wiphy *wiphy, int radio_idx, u32 *tx_ant, u32 *rx_ant) { struct ath6kl *ar = wiphy_priv(wiphy); diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_main.c b/drivers/net/wireless/ath/ath9k/htc_drv_main.c index 19600018e562..0d6272ac0dac 100644 --- a/drivers/net/wireless/ath/ath9k/htc_drv_main.c +++ b/drivers/net/wireless/ath/ath9k/htc_drv_main.c @@ -1172,7 +1172,7 @@ static void ath9k_htc_remove_interface(struct ieee80211_hw *hw, mutex_unlock(&priv->mutex); } -static int ath9k_htc_config(struct ieee80211_hw *hw, u32 changed) +static int ath9k_htc_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct ath9k_htc_priv *priv = hw->priv; struct ath_common *common = ath9k_hw_common(priv->ah); @@ -1737,12 +1737,14 @@ static void ath9k_htc_sw_scan_complete(struct ieee80211_hw *hw, mutex_unlock(&priv->mutex); } -static int ath9k_htc_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int ath9k_htc_set_rts_threshold(struct ieee80211_hw *hw, + int radio_idx, u32 value) { return 0; } static void ath9k_htc_set_coverage_class(struct ieee80211_hw *hw, + int radio_idx, s16 coverage_class) { struct ath9k_htc_priv *priv = hw->priv; @@ -1841,8 +1843,8 @@ struct base_eep_header *ath9k_htc_get_eeprom_base(struct ath9k_htc_priv *priv) } -static int ath9k_htc_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, - u32 *rx_ant) +static int ath9k_htc_get_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 *tx_ant, u32 *rx_ant) { struct ath9k_htc_priv *priv = 
hw->priv; struct base_eep_header *pBase = ath9k_htc_get_eeprom_base(priv); diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c index c56f4f3b8990..740a6fc7b067 100644 --- a/drivers/net/wireless/ath/ath9k/main.c +++ b/drivers/net/wireless/ath/ath9k/main.c @@ -1484,7 +1484,7 @@ static void ath9k_disable_ps(struct ath_softc *sc) ath_dbg(common, PS, "PowerSave disabled\n"); } -static int ath9k_config(struct ieee80211_hw *hw, u32 changed) +static int ath9k_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct ath_softc *sc = hw->priv; struct ath_hw *ah = sc->sc_ah; @@ -2114,6 +2114,7 @@ static void ath9k_enable_dynack(struct ath_softc *sc) } static void ath9k_set_coverage_class(struct ieee80211_hw *hw, + int radio_idx, s16 coverage_class) { struct ath_softc *sc = hw->priv; @@ -2338,7 +2339,8 @@ static bool validate_antenna_mask(struct ath_hw *ah, u32 val) } } -static int ath9k_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +static int ath9k_set_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant) { struct ath_softc *sc = hw->priv; struct ath_hw *ah = sc->sc_ah; @@ -2367,7 +2369,8 @@ static int ath9k_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) return 0; } -static int ath9k_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant) +static int ath9k_get_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 *tx_ant, u32 *rx_ant) { struct ath_softc *sc = hw->priv; diff --git a/drivers/net/wireless/ath/carl9170/main.c b/drivers/net/wireless/ath/carl9170/main.c index 755c068e4197..a7a9345f3483 100644 --- a/drivers/net/wireless/ath/carl9170/main.c +++ b/drivers/net/wireless/ath/carl9170/main.c @@ -890,7 +890,7 @@ static void carl9170_stat_work(struct work_struct *work) round_jiffies(msecs_to_jiffies(CARL9170_STAT_WORK))); } -static int carl9170_op_config(struct ieee80211_hw *hw, u32 changed) +static int carl9170_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct ar9170 *ar = hw->priv; int err = 0; diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c index 94d08d6ae1a3..02a525645bfa 100644 --- a/drivers/net/wireless/ath/wcn36xx/main.c +++ b/drivers/net/wireless/ath/wcn36xx/main.c @@ -361,7 +361,7 @@ static void wcn36xx_change_opchannel(struct wcn36xx *wcn, int ch) return; } -static int wcn36xx_config(struct ieee80211_hw *hw, u32 changed) +static int wcn36xx_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct wcn36xx *wcn = hw->priv; int ret; @@ -965,7 +965,8 @@ out: } /* this is required when using IEEE80211_HW_HAS_RATE_CONTROL */ -static int wcn36xx_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int wcn36xx_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { struct wcn36xx *wcn = hw->priv; wcn36xx_dbg(WCN36XX_DBG_MAC, "mac set RTS threshold %d\n", value); diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c index 5473c01cbe66..7703a0933a14 100644 --- a/drivers/net/wireless/ath/wil6210/cfg80211.c +++ b/drivers/net/wireless/ath/wil6210/cfg80211.c @@ -1408,7 +1408,8 @@ static int wil_cfg80211_disconnect(struct wiphy *wiphy, return rc; } -static int wil_cfg80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) +static int wil_cfg80211_set_wiphy_params(struct wiphy *wiphy, int radio_idx, + u32 changed) { struct wil6210_priv *wil = wiphy_to_wil(wiphy); int rc; diff --git a/drivers/net/wireless/atmel/at76c50x-usb.c 
b/drivers/net/wireless/atmel/at76c50x-usb.c index 6842c2b02b39..aa683eacaf38 100644 --- a/drivers/net/wireless/atmel/at76c50x-usb.c +++ b/drivers/net/wireless/atmel/at76c50x-usb.c @@ -2002,7 +2002,7 @@ exit: return 0; } -static int at76_config(struct ieee80211_hw *hw, u32 changed) +static int at76_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct at76_priv *priv = hw->priv; diff --git a/drivers/net/wireless/broadcom/b43/main.c b/drivers/net/wireless/broadcom/b43/main.c index 7529afd24aed..f1a77c4c445f 100644 --- a/drivers/net/wireless/broadcom/b43/main.c +++ b/drivers/net/wireless/broadcom/b43/main.c @@ -3975,7 +3975,7 @@ static void b43_set_retry_limits(struct b43_wldev *dev, long_retry); } -static int b43_op_config(struct ieee80211_hw *hw, u32 changed) +static int b43_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct b43_wl *wl = hw_to_b43_wl(hw); struct b43_wldev *dev = wl->current_dev; @@ -5073,7 +5073,7 @@ static int b43_op_start(struct ieee80211_hw *hw) * may hang the system. */ if (!err) - b43_op_config(hw, ~0); + b43_op_config(hw, -1, ~0); return err; } @@ -5248,7 +5248,7 @@ out: } /* reload configuration */ - b43_op_config(wl->hw, ~0); + b43_op_config(wl->hw, -1, ~0); if (wl->vif) b43_op_bss_info_changed(wl->hw, wl->vif, &wl->vif->bss_conf, ~0); diff --git a/drivers/net/wireless/broadcom/b43legacy/main.c b/drivers/net/wireless/broadcom/b43legacy/main.c index 2370a2e6a2e3..aada342e0b80 100644 --- a/drivers/net/wireless/broadcom/b43legacy/main.c +++ b/drivers/net/wireless/broadcom/b43legacy/main.c @@ -2662,7 +2662,7 @@ static void b43legacy_set_retry_limits(struct b43legacy_wldev *dev, b43legacy_shm_write16(dev, B43legacy_SHM_WIRELESS, 0x0007, long_retry); } -static int b43legacy_op_dev_config(struct ieee80211_hw *hw, +static int b43legacy_op_dev_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct b43legacy_wl *wl = hw_to_b43legacy_wl(hw); diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 5a0b252dfeaf..40a9a8177de6 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -1637,7 +1637,8 @@ static s32 brcmf_set_retry(struct net_device *ndev, u32 retry, bool l) return err; } -static s32 brcmf_cfg80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) +static s32 brcmf_cfg80211_set_wiphy_params(struct wiphy *wiphy, int radio_idx, + u32 changed) { struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy); struct net_device *ndev = cfg_to_ndev(cfg); @@ -2645,7 +2646,8 @@ brcmf_cfg80211_disconnect(struct wiphy *wiphy, struct net_device *ndev, static s32 brcmf_cfg80211_set_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, - enum nl80211_tx_power_setting type, s32 mbm) + int radio_idx, enum nl80211_tx_power_setting type, + s32 mbm) { struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy); struct net_device *ndev = cfg_to_ndev(cfg); @@ -2696,7 +2698,7 @@ done: static s32 brcmf_cfg80211_get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, - unsigned int link_id, s32 *dbm) + int radio_idx, unsigned int link_id, s32 *dbm) { struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy); struct brcmf_cfg80211_vif *vif = wdev_to_vif(wdev); diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c index 1c3d29dca424..8ab452cf48c4 100644 --- 
a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c @@ -525,7 +525,8 @@ brcms_ops_remove_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif) spin_unlock_bh(&wl->lock); } -static int brcms_ops_config(struct ieee80211_hw *hw, u32 changed) +static int brcms_ops_config(struct ieee80211_hw *hw, int radio_idx, + u32 changed) { struct ieee80211_conf *conf = &hw->conf; struct brcms_info *wl = hw->priv; diff --git a/drivers/net/wireless/intel/iwlegacy/common.c b/drivers/net/wireless/intel/iwlegacy/common.c index 9a86688aea67..b7bd3ec4cc50 100644 --- a/drivers/net/wireless/intel/iwlegacy/common.c +++ b/drivers/net/wireless/intel/iwlegacy/common.c @@ -4990,7 +4990,7 @@ il_update_qos(struct il_priv *il) * il_mac_config - mac80211 config callback */ int -il_mac_config(struct ieee80211_hw *hw, u32 changed) +il_mac_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct il_priv *il = hw->priv; const struct il_channel_info *ch_info; diff --git a/drivers/net/wireless/intel/iwlegacy/common.h b/drivers/net/wireless/intel/iwlegacy/common.h index 52610f5e57a3..4c9836ab11dd 100644 --- a/drivers/net/wireless/intel/iwlegacy/common.h +++ b/drivers/net/wireless/intel/iwlegacy/common.h @@ -1956,7 +1956,7 @@ il_get_hw_mode(struct il_priv *il, enum nl80211_band band) } /* mac80211 handlers */ -int il_mac_config(struct ieee80211_hw *hw, u32 changed); +int il_mac_config(struct ieee80211_hw *hw, int radio_idx, u32 changed); void il_mac_reset_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void il_mac_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *bss_conf, u64 changes); diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/agn.h b/drivers/net/wireless/intel/iwlwifi/dvm/agn.h index 1ebc7effcc2a..b39bf401567f 100644 --- a/drivers/net/wireless/intel/iwlwifi/dvm/agn.h +++ b/drivers/net/wireless/intel/iwlwifi/dvm/agn.h @@ -88,7 +88,7 @@ void iwl_connection_init_rx_config(struct iwl_priv *priv, int iwlagn_set_pan_params(struct iwl_priv *priv); int iwlagn_commit_rxon(struct iwl_priv *priv, struct iwl_rxon_context *ctx); void iwlagn_set_rxon_chain(struct iwl_priv *priv, struct iwl_rxon_context *ctx); -int iwlagn_mac_config(struct ieee80211_hw *hw, u32 changed); +int iwlagn_mac_config(struct ieee80211_hw *hw, int radio_idx, u32 changed); void iwlagn_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *bss_conf, diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c b/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c index 2d3c1627f283..e08e44cae434 100644 --- a/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c +++ b/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c @@ -1149,7 +1149,7 @@ void iwlagn_config_ht40(struct ieee80211_conf *conf, } } -int iwlagn_mac_config(struct ieee80211_hw *hw, u32 changed) +int iwlagn_mac_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct iwl_priv *priv = IWL_MAC80211_GET_DVM(hw); struct iwl_rxon_context *ctx; diff --git a/drivers/net/wireless/intel/iwlwifi/mld/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mld/mac80211.c index 4ba050397632..76e7e3fa2d13 100644 --- a/drivers/net/wireless/intel/iwlwifi/mld/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mld/mac80211.c @@ -574,7 +574,8 @@ void iwl_mld_mac80211_stop(struct ieee80211_hw *hw, bool suspend) } static -int iwl_mld_mac80211_config(struct ieee80211_hw *hw, u32 changed) +int iwl_mld_mac80211_config(struct ieee80211_hw *hw, int 
radio_idx, + u32 changed) { return 0; } @@ -1102,7 +1103,8 @@ void iwl_mld_unassign_vif_chanctx(struct ieee80211_hw *hw, } static -int iwl_mld_mac80211_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +int iwl_mld_mac80211_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { return 0; } diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 956b491ae5a4..619d822efa5b 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -298,7 +298,8 @@ static const struct wiphy_iftype_ext_capab add_iftypes_ext_capa[] = { }, }; -int iwl_mvm_op_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant) +int iwl_mvm_op_get_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 *tx_ant, u32 *rx_ant) { struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw); *tx_ant = iwl_mvm_get_valid_tx_ant(mvm); @@ -306,7 +307,8 @@ int iwl_mvm_op_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant) return 0; } -int iwl_mvm_op_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +int iwl_mvm_op_set_antenna(struct ieee80211_hw *hw, int radio_idx, u32 tx_ant, + u32 rx_ant) { struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw); @@ -4249,7 +4251,8 @@ int iwl_mvm_mac_sta_state_common(struct ieee80211_hw *hw, return ret; } -int iwl_mvm_mac_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +int iwl_mvm_mac_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h index a4f412e750d0..5c8eaf1eacff 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h @@ -2866,13 +2866,16 @@ void iwl_mvm_mac_wake_tx_queue(struct ieee80211_hw *hw, int iwl_mvm_mac_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_ampdu_params *params); -int iwl_mvm_op_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant); -int iwl_mvm_op_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant); +int iwl_mvm_op_get_antenna(struct ieee80211_hw *hw, int radio_idx, u32 *tx_ant, + u32 *rx_ant); +int iwl_mvm_op_set_antenna(struct ieee80211_hw *hw, int radio_idx, u32 tx_ant, + u32 rx_ant); int iwl_mvm_mac_start(struct ieee80211_hw *hw); void iwl_mvm_mac_reconfig_complete(struct ieee80211_hw *hw, enum ieee80211_reconfig_type reconfig_type); void iwl_mvm_mac_stop(struct ieee80211_hw *hw, bool suspend); -static inline int iwl_mvm_mac_config(struct ieee80211_hw *hw, u32 changed) +static inline int iwl_mvm_mac_config(struct ieee80211_hw *hw, int radio_idx, + u32 changed) { return 0; } @@ -2905,7 +2908,8 @@ iwl_mvm_mac_release_buffered_frames(struct ieee80211_hw *hw, int num_frames, enum ieee80211_frame_release_type reason, bool more_data); -int iwl_mvm_mac_set_rts_threshold(struct ieee80211_hw *hw, u32 value); +int iwl_mvm_mac_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value); void iwl_mvm_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_link_sta *link_sta, u32 changed); void iwl_mvm_mac_mgd_prepare_tx(struct ieee80211_hw *hw, diff --git a/drivers/net/wireless/intersil/p54/main.c b/drivers/net/wireless/intersil/p54/main.c index 42111bb53f58..2ec3655f1a9c 100644 --- a/drivers/net/wireless/intersil/p54/main.c +++ b/drivers/net/wireless/intersil/p54/main.c @@ -313,7 +313,7 @@ static void 
p54_reset_stats(struct p54_common *priv) priv->survey_raw.tx = 0; } -static int p54_config(struct ieee80211_hw *dev, u32 changed) +static int p54_config(struct ieee80211_hw *dev, int radio_idx, u32 changed) { int ret = 0; struct p54_common *priv = dev->priv; @@ -692,6 +692,7 @@ static void p54_flush(struct ieee80211_hw *dev, struct ieee80211_vif *vif, } static void p54_set_coverage_class(struct ieee80211_hw *dev, + int radio_idx, s16 coverage_class) { struct p54_common *priv = dev->priv; diff --git a/drivers/net/wireless/marvell/libertas_tf/main.c b/drivers/net/wireless/marvell/libertas_tf/main.c index 50c0f6179e2d..d1067874428f 100644 --- a/drivers/net/wireless/marvell/libertas_tf/main.c +++ b/drivers/net/wireless/marvell/libertas_tf/main.c @@ -337,7 +337,7 @@ static void lbtf_op_remove_interface(struct ieee80211_hw *hw, lbtf_deb_leave(LBTF_DEB_MACOPS); } -static int lbtf_op_config(struct ieee80211_hw *hw, u32 changed) +static int lbtf_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct lbtf_private *priv = hw->priv; struct ieee80211_conf *conf = &hw->conf; diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c index 60c12328c2f3..286378770e9e 100644 --- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c +++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c @@ -375,6 +375,7 @@ mwifiex_cfg80211_cancel_remain_on_channel(struct wiphy *wiphy, static int mwifiex_cfg80211_set_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, + int radio_idx, enum nl80211_tx_power_setting type, int mbm) { @@ -410,6 +411,7 @@ mwifiex_cfg80211_set_tx_power(struct wiphy *wiphy, static int mwifiex_cfg80211_get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, + int radio_idx, unsigned int link_id, int *dbm) { struct mwifiex_adapter *adapter = mwifiex_cfg80211_get_adapter(wiphy); @@ -737,7 +739,8 @@ mwifiex_set_rts(struct mwifiex_private *priv, u32 rts_thr) * Fragmentation threshold of the driver. 
*/ static int -mwifiex_cfg80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) +mwifiex_cfg80211_set_wiphy_params(struct wiphy *wiphy, int radio_idx, + u32 changed) { struct mwifiex_adapter *adapter = mwifiex_cfg80211_get_adapter(wiphy); struct mwifiex_private *priv; @@ -1939,7 +1942,8 @@ mwifiex_cfg80211_del_station(struct wiphy *wiphy, struct net_device *dev, } static int -mwifiex_cfg80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant) +mwifiex_cfg80211_set_antenna(struct wiphy *wiphy, int radio_idx, u32 tx_ant, + u32 rx_ant) { struct mwifiex_adapter *adapter = mwifiex_cfg80211_get_adapter(wiphy); struct mwifiex_private *priv = mwifiex_get_priv(adapter, @@ -2002,7 +2006,8 @@ mwifiex_cfg80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant) } static int -mwifiex_cfg80211_get_antenna(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant) +mwifiex_cfg80211_get_antenna(struct wiphy *wiphy, int radio_idx, u32 *tx_ant, + u32 *rx_ant) { struct mwifiex_adapter *adapter = mwifiex_cfg80211_get_adapter(wiphy); struct mwifiex_private *priv = mwifiex_get_priv(adapter, diff --git a/drivers/net/wireless/marvell/mwl8k.c b/drivers/net/wireless/marvell/mwl8k.c index bab9ef37a1ab..bc34a025acd6 100644 --- a/drivers/net/wireless/marvell/mwl8k.c +++ b/drivers/net/wireless/marvell/mwl8k.c @@ -3369,7 +3369,8 @@ struct mwl8k_cmd_set_rts_threshold { } __packed; static int -mwl8k_cmd_set_rts_threshold(struct ieee80211_hw *hw, int rts_thresh) +mwl8k_cmd_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + int rts_thresh) { struct mwl8k_cmd_set_rts_threshold *cmd; int rc; @@ -4955,7 +4956,7 @@ fail: wiphy_err(hw->wiphy, "Firmware restart failed\n"); } -static int mwl8k_config(struct ieee80211_hw *hw, u32 changed) +static int mwl8k_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct ieee80211_conf *conf = &hw->conf; struct mwl8k_priv *priv = hw->priv; @@ -5321,9 +5322,10 @@ static void mwl8k_configure_filter(struct ieee80211_hw *hw, mwl8k_fw_unlock(hw); } -static int mwl8k_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int mwl8k_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { - return mwl8k_cmd_set_rts_threshold(hw, value); + return mwl8k_cmd_set_rts_threshold(hw, radio_idx, value); } static int mwl8k_sta_remove(struct ieee80211_hw *hw, @@ -6056,7 +6058,7 @@ static int mwl8k_reload_firmware(struct ieee80211_hw *hw, char *fw_image) if (rc) goto fail; - rc = mwl8k_config(hw, ~0); + rc = mwl8k_config(hw, -1, ~0); if (rc) goto fail; diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c index 45c8db939d55..3afe4c4cd7bb 100644 --- a/drivers/net/wireless/mediatek/mt76/mac80211.c +++ b/drivers/net/wireless/mediatek/mt76/mac80211.c @@ -1892,7 +1892,8 @@ void mt76_sw_scan_complete(struct ieee80211_hw *hw, struct ieee80211_vif *vif) } EXPORT_SYMBOL_GPL(mt76_sw_scan_complete); -int mt76_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant) +int mt76_get_antenna(struct ieee80211_hw *hw, int radio_idx, u32 *tx_ant, + u32 *rx_ant) { struct mt76_phy *phy = hw->priv; struct mt76_dev *dev = phy->dev; diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h index 5f8d81cda6cd..14927a92f9d1 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76.h +++ b/drivers/net/wireless/mediatek/mt76/mt76.h @@ -1513,7 +1513,8 @@ int mt76_get_sar_power(struct mt76_phy *phy, void mt76_csa_check(struct mt76_dev *dev); void mt76_csa_finish(struct mt76_dev *dev); -int 
mt76_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant); +int mt76_get_antenna(struct ieee80211_hw *hw, int radio_idx, u32 *tx_ant, + u32 *rx_ant); int mt76_set_tim(struct ieee80211_hw *hw, struct ieee80211_sta *sta, bool set); void mt76_insert_ccmp_hdr(struct sk_buff *skb, u8 key_id); int mt76_get_rate(struct mt76_dev *dev, diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/main.c b/drivers/net/wireless/mediatek/mt76/mt7603/main.c index 3e8b1ec76169..0d7c84941cd0 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7603/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7603/main.c @@ -216,7 +216,7 @@ static int mt7603_set_sar_specs(struct ieee80211_hw *hw, } static int -mt7603_config(struct ieee80211_hw *hw, u32 changed) +mt7603_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct mt7603_dev *dev = hw->priv; int ret = 0; @@ -657,7 +657,8 @@ mt7603_sta_rate_tbl_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, } static void -mt7603_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class) +mt7603_set_coverage_class(struct ieee80211_hw *hw, int radio_idx, + s16 coverage_class) { struct mt7603_dev *dev = hw->priv; diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/main.c b/drivers/net/wireless/mediatek/mt76/mt7615/main.c index 8a37fb37f77d..15fe155ac3f3 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/main.c @@ -420,7 +420,7 @@ static int mt7615_set_sar_specs(struct ieee80211_hw *hw, return mt76_update_channel(phy->mt76); } -static int mt7615_config(struct ieee80211_hw *hw, u32 changed) +static int mt7615_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct mt7615_dev *dev = mt7615_hw_dev(hw); struct mt7615_phy *phy = mt7615_hw_phy(hw); @@ -784,7 +784,8 @@ static void mt7615_tx(struct ieee80211_hw *hw, mt76_connac_pm_queue_skb(hw, &dev->pm, wcid, skb); } -static int mt7615_set_rts_threshold(struct ieee80211_hw *hw, u32 val) +static int mt7615_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 val) { struct mt7615_dev *dev = mt7615_hw_dev(hw); struct mt7615_phy *phy = mt7615_hw_phy(hw); @@ -972,7 +973,8 @@ mt7615_offset_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif, } static void -mt7615_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class) +mt7615_set_coverage_class(struct ieee80211_hw *hw, int radio_idx, + s16 coverage_class) { struct mt7615_phy *phy = mt7615_hw_phy(hw); struct mt7615_dev *dev = phy->dev; @@ -984,7 +986,8 @@ mt7615_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class) } static int -mt7615_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +mt7615_set_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant) { struct mt7615_dev *dev = mt7615_hw_dev(hw); struct mt7615_phy *phy = mt7615_hw_phy(hw); diff --git a/drivers/net/wireless/mediatek/mt76/mt76x0/main.c b/drivers/net/wireless/mediatek/mt76/mt76x0/main.c index 4aa2dcedc874..a5c40d350612 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x0/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x0/main.c @@ -57,7 +57,7 @@ out: } EXPORT_SYMBOL_GPL(mt76x0_set_sar_specs); -int mt76x0_config(struct ieee80211_hw *hw, u32 changed) +int mt76x0_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct mt76x02_dev *dev = hw->priv; diff --git a/drivers/net/wireless/mediatek/mt76/mt76x0/mt76x0.h b/drivers/net/wireless/mediatek/mt76/mt76x0/mt76x0.h index 50f755344968..e5bc14d4c712 100644 --- 
a/drivers/net/wireless/mediatek/mt76/mt76x0/mt76x0.h +++ b/drivers/net/wireless/mediatek/mt76/mt76x0/mt76x0.h @@ -48,7 +48,7 @@ void mt76x0_chip_onoff(struct mt76x02_dev *dev, bool enable, bool reset); void mt76x0_mac_stop(struct mt76x02_dev *dev); -int mt76x0_config(struct ieee80211_hw *hw, u32 changed); +int mt76x0_config(struct ieee80211_hw *hw, int radio_idx, u32 changed); int mt76x0_set_channel(struct mt76_phy *mphy); int mt76x0_set_sar_specs(struct ieee80211_hw *hw, const struct cfg80211_sar_specs *sar); diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02.h b/drivers/net/wireless/mediatek/mt76/mt76x02.h index 4cd63bacd742..2094c7d2af81 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02.h +++ b/drivers/net/wireless/mediatek/mt76/mt76x02.h @@ -183,8 +183,8 @@ void mt76x02_wdt_work(struct work_struct *work); void mt76x02_tx_set_txpwr_auto(struct mt76x02_dev *dev, s8 txpwr); void mt76x02_set_tx_ackto(struct mt76x02_dev *dev); void mt76x02_set_coverage_class(struct ieee80211_hw *hw, - s16 coverage_class); -int mt76x02_set_rts_threshold(struct ieee80211_hw *hw, u32 val); + int radio_idx, s16 coverage_class); +int mt76x02_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, u32 val); void mt76x02_remove_hdr_pad(struct sk_buff *skb, int len); bool mt76x02_tx_status_data(struct mt76_dev *mdev, u8 *update); void mt76x02_queue_rx_skb(struct mt76_dev *mdev, enum mt76_rxq_id q, diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_util.c b/drivers/net/wireless/mediatek/mt76/mt76x02_util.c index 4fb30589fa7a..7dfcb20c692c 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_util.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_util.c @@ -548,7 +548,7 @@ void mt76x02_set_tx_ackto(struct mt76x02_dev *dev) EXPORT_SYMBOL_GPL(mt76x02_set_tx_ackto); void mt76x02_set_coverage_class(struct ieee80211_hw *hw, - s16 coverage_class) + int radio_idx, s16 coverage_class) { struct mt76x02_dev *dev = hw->priv; @@ -559,7 +559,7 @@ void mt76x02_set_coverage_class(struct ieee80211_hw *hw, } EXPORT_SYMBOL_GPL(mt76x02_set_coverage_class); -int mt76x02_set_rts_threshold(struct ieee80211_hw *hw, u32 val) +int mt76x02_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, u32 val) { struct mt76x02_dev *dev = hw->priv; diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/pci_main.c b/drivers/net/wireless/mediatek/mt76/mt76x2/pci_main.c index eb70130d2711..c5dfb06d81e8 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2/pci_main.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2/pci_main.c @@ -54,7 +54,7 @@ int mt76x2e_set_channel(struct mt76_phy *phy) } static int -mt76x2_config(struct ieee80211_hw *hw, u32 changed) +mt76x2_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct mt76x02_dev *dev = hw->priv; @@ -99,8 +99,8 @@ mt76x2_flush(struct ieee80211_hw *hw, struct ieee80211_vif *vif, { } -static int mt76x2_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, - u32 rx_ant) +static int mt76x2_set_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant) { struct mt76x02_dev *dev = hw->priv; diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/usb_main.c b/drivers/net/wireless/mediatek/mt76/mt76x2/usb_main.c index 83e7061b10e2..6671c53faf9f 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2/usb_main.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2/usb_main.c @@ -50,7 +50,7 @@ int mt76x2u_set_channel(struct mt76_phy *mphy) } static int -mt76x2u_config(struct ieee80211_hw *hw, u32 changed) +mt76x2u_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { 
struct mt76x02_dev *dev = hw->priv; int err = 0; diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/main.c b/drivers/net/wireless/mediatek/mt76/mt7915/main.c index 3aa31c5cefa6..fe0639c14bf9 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/main.c @@ -449,7 +449,8 @@ out: return err; } -static int mt7915_config(struct ieee80211_hw *hw, u32 changed) +static int mt7915_config(struct ieee80211_hw *hw, int radio_idx, + u32 changed) { struct mt7915_dev *dev = mt7915_hw_dev(hw); struct mt7915_phy *phy = mt7915_hw_phy(hw); @@ -906,7 +907,8 @@ static void mt7915_tx(struct ieee80211_hw *hw, mt76_tx(mphy, control->sta, wcid, skb); } -static int mt7915_set_rts_threshold(struct ieee80211_hw *hw, u32 val) +static int mt7915_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 val) { struct mt7915_dev *dev = mt7915_hw_dev(hw); struct mt7915_phy *phy = mt7915_hw_phy(hw); @@ -1102,7 +1104,8 @@ mt7915_offset_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif, } static void -mt7915_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class) +mt7915_set_coverage_class(struct ieee80211_hw *hw, int radio_idx, + s16 coverage_class) { struct mt7915_phy *phy = mt7915_hw_phy(hw); struct mt7915_dev *dev = phy->dev; @@ -1114,7 +1117,7 @@ mt7915_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class) } static int -mt7915_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +mt7915_set_antenna(struct ieee80211_hw *hw, int radio_idx, u32 tx_ant, u32 rx_ant) { struct mt7915_dev *dev = mt7915_hw_dev(hw); struct mt7915_phy *phy = mt7915_hw_phy(hw); @@ -1655,7 +1658,7 @@ mt7915_twt_teardown_request(struct ieee80211_hw *hw, } static int -mt7915_set_frag_threshold(struct ieee80211_hw *hw, u32 val) +mt7915_set_frag_threshold(struct ieee80211_hw *hw, int radio_idx, u32 val) { return 0; } diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c index 1fffa43379b2..1678204296d7 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c @@ -624,7 +624,7 @@ void mt7921_set_runtime_pm(struct mt792x_dev *dev) mt76_connac_mcu_set_deep_sleep(&dev->mt76, pm->ds_enable); } -static int mt7921_config(struct ieee80211_hw *hw, u32 changed) +static int mt7921_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct mt792x_dev *dev = mt792x_hw_dev(hw); struct mt792x_phy *phy = mt792x_hw_phy(hw); @@ -907,7 +907,8 @@ void mt7921_mac_sta_remove(struct mt76_dev *mdev, struct ieee80211_vif *vif, } EXPORT_SYMBOL_GPL(mt7921_mac_sta_remove); -static int mt7921_set_rts_threshold(struct ieee80211_hw *hw, u32 val) +static int mt7921_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 val) { struct mt792x_dev *dev = mt792x_hw_dev(hw); @@ -1088,7 +1089,8 @@ mt7921_stop_sched_scan(struct ieee80211_hw *hw, struct ieee80211_vif *vif) } static int -mt7921_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +mt7921_set_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant) { struct mt792x_dev *dev = mt792x_hw_dev(hw); struct mt792x_phy *phy = mt792x_hw_phy(hw); diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/main.c b/drivers/net/wireless/mediatek/mt76/mt7925/main.c index 94b0099dcd41..ed7cd75aa6bc 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7925/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7925/main.c @@ -757,7 +757,7 @@ void mt7925_set_runtime_pm(struct mt792x_dev *dev) 
mt7925_mcu_set_deep_sleep(dev, pm->ds_enable); } -static int mt7925_config(struct ieee80211_hw *hw, u32 changed) +static int mt7925_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct mt792x_dev *dev = mt792x_hw_dev(hw); int ret = 0; @@ -1265,7 +1265,8 @@ void mt7925_mac_sta_remove(struct mt76_dev *mdev, struct ieee80211_vif *vif, } EXPORT_SYMBOL_GPL(mt7925_mac_sta_remove); -static int mt7925_set_rts_threshold(struct ieee80211_hw *hw, u32 val) +static int mt7925_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 val) { struct mt792x_dev *dev = mt792x_hw_dev(hw); @@ -1507,7 +1508,8 @@ mt7925_stop_sched_scan(struct ieee80211_hw *hw, struct ieee80211_vif *vif) } static int -mt7925_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +mt7925_set_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant) { struct mt792x_dev *dev = mt792x_hw_dev(hw); struct mt792x_phy *phy = mt792x_hw_phy(hw); diff --git a/drivers/net/wireless/mediatek/mt76/mt792x.h b/drivers/net/wireless/mediatek/mt76/mt792x.h index e0359d431eca..443d397d9961 100644 --- a/drivers/net/wireless/mediatek/mt76/mt792x.h +++ b/drivers/net/wireless/mediatek/mt76/mt792x.h @@ -412,7 +412,8 @@ void mt792x_sta_statistics(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct station_info *sinfo); -void mt792x_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class); +void mt792x_set_coverage_class(struct ieee80211_hw *hw, int radio_idx, + s16 coverage_class); void mt792x_dma_cleanup(struct mt792x_dev *dev); int mt792x_dma_enable(struct mt792x_dev *dev); int mt792x_wpdma_reset(struct mt792x_dev *dev, bool force); diff --git a/drivers/net/wireless/mediatek/mt76/mt792x_core.c b/drivers/net/wireless/mediatek/mt76/mt792x_core.c index a50c1723ca29..43a7ac0f718e 100644 --- a/drivers/net/wireless/mediatek/mt76/mt792x_core.c +++ b/drivers/net/wireless/mediatek/mt76/mt792x_core.c @@ -579,7 +579,8 @@ void mt792x_sta_statistics(struct ieee80211_hw *hw, } EXPORT_SYMBOL_GPL(mt792x_sta_statistics); -void mt792x_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class) +void mt792x_set_coverage_class(struct ieee80211_hw *hw, int radio_idx, + s16 coverage_class) { struct mt792x_phy *phy = mt792x_hw_phy(hw); struct mt792x_dev *dev = phy->dev; diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/main.c b/drivers/net/wireless/mediatek/mt76/mt7996/main.c index 78ae9f5cb176..5283aee619a9 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7996/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7996/main.c @@ -591,7 +591,7 @@ static int mt7996_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd, return err; } -static int mt7996_config(struct ieee80211_hw *hw, u32 changed) +static int mt7996_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { return 0; } @@ -1251,7 +1251,8 @@ unlock: rcu_read_unlock(); } -static int mt7996_set_rts_threshold(struct ieee80211_hw *hw, u32 val) +static int mt7996_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 val) { struct mt7996_dev *dev = mt7996_hw_dev(hw); int i, ret = 0; @@ -1491,7 +1492,8 @@ unlock: } static void -mt7996_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class) +mt7996_set_coverage_class(struct ieee80211_hw *hw, int radio_idx, + s16 coverage_class) { struct mt7996_dev *dev = mt7996_hw_dev(hw); struct mt7996_phy *phy; @@ -1505,7 +1507,8 @@ mt7996_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class) } static int -mt7996_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 
rx_ant) +mt7996_set_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant) { struct mt7996_dev *dev = mt7996_hw_dev(hw); int i; diff --git a/drivers/net/wireless/mediatek/mt7601u/main.c b/drivers/net/wireless/mediatek/mt7601u/main.c index 7570c6ceecea..05ba43e1985c 100644 --- a/drivers/net/wireless/mediatek/mt7601u/main.c +++ b/drivers/net/wireless/mediatek/mt7601u/main.c @@ -78,7 +78,7 @@ static void mt7601u_remove_interface(struct ieee80211_hw *hw, dev->wcid_mask[wcid / BITS_PER_LONG] &= ~BIT(wcid % BITS_PER_LONG); } -static int mt7601u_config(struct ieee80211_hw *hw, u32 changed) +static int mt7601u_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct mt7601u_dev *dev = hw->priv; int ret = 0; @@ -334,7 +334,8 @@ mt7601u_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd, return mt76_mac_wcid_set_key(dev, msta->wcid.idx, key); } -static int mt7601u_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int mt7601u_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { struct mt7601u_dev *dev = hw->priv; diff --git a/drivers/net/wireless/microchip/wilc1000/cfg80211.c b/drivers/net/wireless/microchip/wilc1000/cfg80211.c index e7aa0f991923..a395829ebadf 100644 --- a/drivers/net/wireless/microchip/wilc1000/cfg80211.c +++ b/drivers/net/wireless/microchip/wilc1000/cfg80211.c @@ -800,7 +800,7 @@ static int change_bss(struct wiphy *wiphy, struct net_device *dev, return 0; } -static int set_wiphy_params(struct wiphy *wiphy, u32 changed) +static int set_wiphy_params(struct wiphy *wiphy, int radio_idx, u32 changed) { int ret = -EINVAL; struct cfg_param_attr cfg_param_val; @@ -1637,7 +1637,8 @@ static void wilc_set_wakeup(struct wiphy *wiphy, bool enabled) } static int set_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, - enum nl80211_tx_power_setting type, int mbm) + int radio_idx, enum nl80211_tx_power_setting type, + int mbm) { int ret; int srcu_idx; @@ -1669,7 +1670,7 @@ static int set_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, } static int get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, - unsigned int link_id, int *dbm) + int radio_idx, unsigned int link_id, int *dbm) { int ret; struct wilc_vif *vif = netdev_priv(wdev->netdev); diff --git a/drivers/net/wireless/purelifi/plfxlc/mac.c b/drivers/net/wireless/purelifi/plfxlc/mac.c index 82d1bf7edba2..d375ad60167f 100644 --- a/drivers/net/wireless/purelifi/plfxlc/mac.c +++ b/drivers/net/wireless/purelifi/plfxlc/mac.c @@ -531,7 +531,7 @@ static void plfxlc_op_remove_interface(struct ieee80211_hw *hw, mac->vif = NULL; } -static int plfxlc_op_config(struct ieee80211_hw *hw, u32 changed) +static int plfxlc_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { return 0; } @@ -677,7 +677,8 @@ static void plfxlc_get_et_stats(struct ieee80211_hw *hw, data[1] = mac->crc_errors; } -static int plfxlc_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int plfxlc_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { return 0; } diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c index 0b2282528342..f1188368e66b 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c +++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c @@ -370,7 +370,8 @@ static int qtnf_stop_ap(struct wiphy *wiphy, struct net_device *dev, return ret; } -static int qtnf_set_wiphy_params(struct wiphy *wiphy, u32 changed) +static int qtnf_set_wiphy_params(struct wiphy *wiphy, int 
radio_idx, + u32 changed) { struct qtnf_wmac *mac = wiphy_priv(wiphy); struct qtnf_vif *vif; @@ -881,7 +882,7 @@ static int qtnf_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, } static int qtnf_get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, - unsigned int link_id, int *dbm) + int radio_idx, unsigned int link_id, int *dbm) { struct qtnf_vif *vif = qtnf_netdev_get_priv(wdev->netdev); int ret; @@ -894,7 +895,8 @@ static int qtnf_get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, } static int qtnf_set_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, - enum nl80211_tx_power_setting type, int mbm) + int radio_idx, enum nl80211_tx_power_setting type, + int mbm) { struct qtnf_vif *vif; int ret; diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c index b7ea606bda08..4b5a7c9b6499 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c @@ -12100,7 +12100,7 @@ void rt2800_get_key_seq(struct ieee80211_hw *hw, } EXPORT_SYMBOL_GPL(rt2800_get_key_seq); -int rt2800_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +int rt2800_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, u32 value) { struct rt2x00_dev *rt2x00dev = hw->priv; u32 reg; diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800lib.h b/drivers/net/wireless/ralink/rt2x00/rt2800lib.h index 194de676df8f..620a3d9872ce 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2800lib.h +++ b/drivers/net/wireless/ralink/rt2x00/rt2800lib.h @@ -253,7 +253,8 @@ int rt2800_probe_hw(struct rt2x00_dev *rt2x00dev); void rt2800_get_key_seq(struct ieee80211_hw *hw, struct ieee80211_key_conf *key, struct ieee80211_key_seq *seq); -int rt2800_set_rts_threshold(struct ieee80211_hw *hw, u32 value); +int rt2800_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value); int rt2800_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif, unsigned int link_id, u16 queue_idx, diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00.h b/drivers/net/wireless/ralink/rt2x00/rt2x00.h index dfb4bb370f01..09b9d1f9f793 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2x00.h +++ b/drivers/net/wireless/ralink/rt2x00/rt2x00.h @@ -1457,7 +1457,7 @@ int rt2x00mac_add_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void rt2x00mac_remove_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif); -int rt2x00mac_config(struct ieee80211_hw *hw, u32 changed); +int rt2x00mac_config(struct ieee80211_hw *hw, int radio_idx, u32 changed); void rt2x00mac_configure_filter(struct ieee80211_hw *hw, unsigned int changed_flags, unsigned int *total_flags, @@ -1489,8 +1489,10 @@ int rt2x00mac_conf_tx(struct ieee80211_hw *hw, void rt2x00mac_rfkill_poll(struct ieee80211_hw *hw); void rt2x00mac_flush(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u32 queues, bool drop); -int rt2x00mac_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant); -int rt2x00mac_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant); +int rt2x00mac_set_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant); +int rt2x00mac_get_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 *tx_ant, u32 *rx_ant); void rt2x00mac_get_ringparam(struct ieee80211_hw *hw, u32 *tx, u32 *tx_max, u32 *rx, u32 *rx_max); bool rt2x00mac_tx_frames_pending(struct ieee80211_hw *hw); diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c b/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c index 451632488805..3bc0c1c906c9 
100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c @@ -304,7 +304,7 @@ void rt2x00mac_remove_interface(struct ieee80211_hw *hw, } EXPORT_SYMBOL_GPL(rt2x00mac_remove_interface); -int rt2x00mac_config(struct ieee80211_hw *hw, u32 changed) +int rt2x00mac_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct rt2x00_dev *rt2x00dev = hw->priv; struct ieee80211_conf *conf = &hw->conf; @@ -740,7 +740,8 @@ void rt2x00mac_flush(struct ieee80211_hw *hw, struct ieee80211_vif *vif, } EXPORT_SYMBOL_GPL(rt2x00mac_flush); -int rt2x00mac_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +int rt2x00mac_set_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant) { struct rt2x00_dev *rt2x00dev = hw->priv; struct link_ant *ant = &rt2x00dev->link.ant; @@ -785,7 +786,8 @@ int rt2x00mac_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) } EXPORT_SYMBOL_GPL(rt2x00mac_set_antenna); -int rt2x00mac_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant) +int rt2x00mac_get_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 *tx_ant, u32 *rx_ant) { struct rt2x00_dev *rt2x00dev = hw->priv; struct link_ant *ant = &rt2x00dev->link.ant; diff --git a/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c b/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c index ded8d4d59289..2905baea6239 100644 --- a/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c +++ b/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c @@ -1370,7 +1370,7 @@ static void rtl8180_remove_interface(struct ieee80211_hw *dev, priv->vif = NULL; } -static int rtl8180_config(struct ieee80211_hw *dev, u32 changed) +static int rtl8180_config(struct ieee80211_hw *dev, int radio_idx, u32 changed) { struct rtl8180_priv *priv = dev->priv; struct ieee80211_conf *conf = &dev->conf; diff --git a/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c b/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c index 220ac5bdf279..8857bb542c7f 100644 --- a/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c +++ b/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c @@ -1151,7 +1151,7 @@ static void rtl8187_remove_interface(struct ieee80211_hw *dev, mutex_unlock(&priv->conf_mutex); } -static int rtl8187_config(struct ieee80211_hw *dev, u32 changed) +static int rtl8187_config(struct ieee80211_hw *dev, int radio_idx, u32 changed) { struct rtl8187_priv *priv = dev->priv; struct ieee80211_conf *conf = &dev->conf; diff --git a/drivers/net/wireless/realtek/rtl8xxxu/core.c b/drivers/net/wireless/realtek/rtl8xxxu/core.c index 569856ca677f..496836f716aa 100644 --- a/drivers/net/wireless/realtek/rtl8xxxu/core.c +++ b/drivers/net/wireless/realtek/rtl8xxxu/core.c @@ -4552,7 +4552,8 @@ static void rtl8xxxu_cam_write(struct rtl8xxxu_priv *priv, } static -int rtl8xxxu_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant) +int rtl8xxxu_get_antenna(struct ieee80211_hw *hw, int radio_idx, u32 *tx_ant, + u32 *rx_ant) { struct rtl8xxxu_priv *priv = hw->priv; @@ -6839,7 +6840,7 @@ static void rtl8xxxu_remove_interface(struct ieee80211_hw *hw, priv->vifs[rtlvif->port_num] = NULL; } -static int rtl8xxxu_config(struct ieee80211_hw *hw, u32 changed) +static int rtl8xxxu_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct rtl8xxxu_priv *priv = hw->priv; struct device *dev = &priv->udev->dev; @@ -6988,7 +6989,8 @@ static void rtl8xxxu_configure_filter(struct ieee80211_hw *hw, FIF_PROBE_REQ); } -static int rtl8xxxu_set_rts_threshold(struct ieee80211_hw *hw, u32 rts) 
+static int rtl8xxxu_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 rts) { if (rts > 2347 && rts != (u32)-1) return -EINVAL; diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c b/drivers/net/wireless/realtek/rtlwifi/core.c index 819cf519e66e..22633c301564 100644 --- a/drivers/net/wireless/realtek/rtlwifi/core.c +++ b/drivers/net/wireless/realtek/rtlwifi/core.c @@ -566,7 +566,7 @@ static int rtl_op_resume(struct ieee80211_hw *hw) } #endif -static int rtl_op_config(struct ieee80211_hw *hw, u32 changed) +static int rtl_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct rtl_priv *rtlpriv = rtl_priv(hw); struct rtl_phy *rtlphy = &(rtlpriv->phy); diff --git a/drivers/net/wireless/realtek/rtw88/mac80211.c b/drivers/net/wireless/realtek/rtw88/mac80211.c index 77f9fbe1870c..766f22d31079 100644 --- a/drivers/net/wireless/realtek/rtw88/mac80211.c +++ b/drivers/net/wireless/realtek/rtw88/mac80211.c @@ -71,7 +71,7 @@ static void rtw_ops_stop(struct ieee80211_hw *hw, bool suspend) mutex_unlock(&rtwdev->mutex); } -static int rtw_ops_config(struct ieee80211_hw *hw, u32 changed) +static int rtw_ops_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct rtw_dev *rtwdev = hw->priv; int ret = 0; @@ -708,7 +708,8 @@ static void rtw_ops_mgd_prepare_tx(struct ieee80211_hw *hw, mutex_unlock(&rtwdev->mutex); } -static int rtw_ops_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int rtw_ops_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { struct rtw_dev *rtwdev = hw->priv; @@ -797,6 +798,7 @@ static int rtw_ops_set_bitrate_mask(struct ieee80211_hw *hw, } static int rtw_ops_set_antenna(struct ieee80211_hw *hw, + int radio_idx, u32 tx_antenna, u32 rx_antenna) { @@ -808,13 +810,14 @@ static int rtw_ops_set_antenna(struct ieee80211_hw *hw, return -EOPNOTSUPP; mutex_lock(&rtwdev->mutex); - ret = chip->ops->set_antenna(rtwdev, tx_antenna, rx_antenna); + ret = chip->ops->set_antenna(rtwdev, -1, tx_antenna, rx_antenna); mutex_unlock(&rtwdev->mutex); return ret; } static int rtw_ops_get_antenna(struct ieee80211_hw *hw, + int radio_idx, u32 *tx_antenna, u32 *rx_antenna) { diff --git a/drivers/net/wireless/realtek/rtw88/main.h b/drivers/net/wireless/realtek/rtw88/main.h index b0f1fabe9554..7ae67143e909 100644 --- a/drivers/net/wireless/realtek/rtw88/main.h +++ b/drivers/net/wireless/realtek/rtw88/main.h @@ -873,7 +873,7 @@ struct rtw_chip_ops { void (*set_tx_power_index)(struct rtw_dev *rtwdev); int (*rsvd_page_dump)(struct rtw_dev *rtwdev, u8 *buf, u32 offset, u32 size); - int (*set_antenna)(struct rtw_dev *rtwdev, + int (*set_antenna)(struct rtw_dev *rtwdev, int radio_idx, u32 antenna_tx, u32 antenna_rx); void (*cfg_ldo25)(struct rtw_dev *rtwdev, bool enable); diff --git a/drivers/net/wireless/realtek/rtw88/rtw8822b.c b/drivers/net/wireless/realtek/rtw88/rtw8822b.c index ab199eaea3c7..710126379e77 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8822b.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8822b.c @@ -983,6 +983,7 @@ static bool rtw8822b_check_rf_path(u8 antenna) } static int rtw8822b_set_antenna(struct rtw_dev *rtwdev, + int radio_idx, u32 antenna_tx, u32 antenna_rx) { diff --git a/drivers/net/wireless/realtek/rtw88/rtw8822c.c b/drivers/net/wireless/realtek/rtw88/rtw8822c.c index 017d959de3ce..0ce6aa10493e 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8822c.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8822c.c @@ -2767,6 +2767,7 @@ static void rtw8822c_set_tx_power_index(struct rtw_dev *rtwdev) } static int 
rtw8822c_set_antenna(struct rtw_dev *rtwdev, + int radio_idx, u32 antenna_tx, u32 antenna_rx) { diff --git a/drivers/net/wireless/realtek/rtw89/mac80211.c b/drivers/net/wireless/realtek/rtw89/mac80211.c index a47971003bd4..b9e046208424 100644 --- a/drivers/net/wireless/realtek/rtw89/mac80211.c +++ b/drivers/net/wireless/realtek/rtw89/mac80211.c @@ -72,7 +72,7 @@ static void rtw89_ops_stop(struct ieee80211_hw *hw, bool suspend) rtw89_core_stop(rtwdev); } -static int rtw89_ops_config(struct ieee80211_hw *hw, u32 changed) +static int rtw89_ops_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct rtw89_dev *rtwdev = hw->priv; @@ -1007,7 +1007,8 @@ static int rtw89_ops_ampdu_action(struct ieee80211_hw *hw, return 0; } -static int rtw89_ops_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int rtw89_ops_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { struct rtw89_dev *rtwdev = hw->priv; @@ -1119,7 +1120,7 @@ static int rtw89_ops_set_bitrate_mask(struct ieee80211_hw *hw, } static -int rtw89_ops_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +int rtw89_ops_set_antenna(struct ieee80211_hw *hw, int radio_idx, u32 tx_ant, u32 rx_ant) { struct rtw89_dev *rtwdev = hw->priv; struct rtw89_hal *hal = &rtwdev->hal; @@ -1142,7 +1143,8 @@ int rtw89_ops_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) } static -int rtw89_ops_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant) +int rtw89_ops_get_antenna(struct ieee80211_hw *hw, int radio_idx, u32 *tx_ant, + u32 *rx_ant) { struct rtw89_dev *rtwdev = hw->priv; struct rtw89_hal *hal = &rtwdev->hal; diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c index 0e115b428f96..f3a853edfc11 100644 --- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c +++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c @@ -656,11 +656,13 @@ static int rsi_config_power(struct ieee80211_hw *hw) * requests. The stack calls this function to * change hardware configuration, e.g., channel. * @hw: Pointer to the ieee80211_hw structure. + * @radio_idx: Radio index. * @changed: Changed flags set. * * Return: 0 on success, negative error code on failure. */ static int rsi_mac80211_config(struct ieee80211_hw *hw, + int radio_idx, u32 changed) { struct rsi_hw *adapter = hw->priv; @@ -1201,12 +1203,13 @@ unlock: /** * rsi_mac80211_set_rts_threshold() - This function sets rts threshold value. * @hw: Pointer to the ieee80211_hw structure. + * @radio_idx: Radio index. * @value: Rts threshold value. * * Return: 0 on success. */ static int rsi_mac80211_set_rts_threshold(struct ieee80211_hw *hw, - u32 value) + int radio_idx, u32 value) { struct rsi_hw *adapter = hw->priv; struct rsi_common *common = adapter->priv; @@ -1583,12 +1586,14 @@ static int rsi_mac80211_sta_remove(struct ieee80211_hw *hw, * rsi_mac80211_set_antenna() - This function is used to configure * tx and rx antennas. * @hw: Pointer to the ieee80211_hw structure. + * @radio_idx: Radio index * @tx_ant: Bitmap for tx antenna * @rx_ant: Bitmap for rx antenna * * Return: 0 on success, Negative error code on failure. */ static int rsi_mac80211_set_antenna(struct ieee80211_hw *hw, + int radio_idx, u32 tx_ant, u32 rx_ant) { struct rsi_hw *adapter = hw->priv; @@ -1634,12 +1639,14 @@ fail_set_antenna: * tx and rx antennas. * * @hw: Pointer to the ieee80211_hw structure. + * @radio_idx: Radio index * @tx_ant: Bitmap for tx antenna * @rx_ant: Bitmap for rx antenna * * Return: 0 on success, negative error codes on failure. 
*/ static int rsi_mac80211_get_antenna(struct ieee80211_hw *hw, + int radio_idx, u32 *tx_ant, u32 *rx_ant) { struct rsi_hw *adapter = hw->priv; diff --git a/drivers/net/wireless/silabs/wfx/sta.c b/drivers/net/wireless/silabs/wfx/sta.c index e95b9ded17d9..d12fcc755701 100644 --- a/drivers/net/wireless/silabs/wfx/sta.c +++ b/drivers/net/wireless/silabs/wfx/sta.c @@ -220,7 +220,7 @@ int wfx_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif, return 0; } -int wfx_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +int wfx_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, u32 value) { struct wfx_dev *wdev = hw->priv; struct wfx_vif *wvif = NULL; @@ -706,7 +706,7 @@ void wfx_unassign_vif_chanctx(struct ieee80211_hw *hw, struct ieee80211_vif *vif wvif->channel = NULL; } -int wfx_config(struct ieee80211_hw *hw, u32 changed) +int wfx_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { return 0; } diff --git a/drivers/net/wireless/silabs/wfx/sta.h b/drivers/net/wireless/silabs/wfx/sta.h index 8702eed5267f..b4812b294f3c 100644 --- a/drivers/net/wireless/silabs/wfx/sta.h +++ b/drivers/net/wireless/silabs/wfx/sta.h @@ -21,8 +21,8 @@ struct wfx_sta_priv { /* mac80211 interface */ int wfx_start(struct ieee80211_hw *hw); void wfx_stop(struct ieee80211_hw *hw, bool suspend); -int wfx_config(struct ieee80211_hw *hw, u32 changed); -int wfx_set_rts_threshold(struct ieee80211_hw *hw, u32 value); +int wfx_config(struct ieee80211_hw *hw, int radio_idx, u32 changed); +int wfx_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, u32 value); void wfx_set_default_unicast_key(struct ieee80211_hw *hw, struct ieee80211_vif *vif, int idx); void wfx_configure_filter(struct ieee80211_hw *hw, unsigned int changed_flags, unsigned int *total_flags, u64 unused); diff --git a/drivers/net/wireless/st/cw1200/sta.c b/drivers/net/wireless/st/cw1200/sta.c index 5dd7f6a38900..b1dd76e8aecb 100644 --- a/drivers/net/wireless/st/cw1200/sta.c +++ b/drivers/net/wireless/st/cw1200/sta.c @@ -321,7 +321,7 @@ int cw1200_change_interface(struct ieee80211_hw *dev, return ret; } -int cw1200_config(struct ieee80211_hw *dev, u32 changed) +int cw1200_config(struct ieee80211_hw *dev, int radio_idx, u32 changed) { int ret = 0; struct cw1200_common *priv = dev->priv; @@ -857,7 +857,8 @@ void cw1200_wep_key_work(struct work_struct *work) wsm_unlock_tx(priv); } -int cw1200_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +int cw1200_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { int ret = 0; __le32 val32; diff --git a/drivers/net/wireless/st/cw1200/sta.h b/drivers/net/wireless/st/cw1200/sta.h index b955b92cfd73..b4f04371668d 100644 --- a/drivers/net/wireless/st/cw1200/sta.h +++ b/drivers/net/wireless/st/cw1200/sta.h @@ -22,7 +22,7 @@ int cw1200_change_interface(struct ieee80211_hw *dev, struct ieee80211_vif *vif, enum nl80211_iftype new_type, bool p2p); -int cw1200_config(struct ieee80211_hw *dev, u32 changed); +int cw1200_config(struct ieee80211_hw *dev, int radio_idx, u32 changed); void cw1200_configure_filter(struct ieee80211_hw *dev, unsigned int changed_flags, unsigned int *total_flags, @@ -36,7 +36,8 @@ int cw1200_set_key(struct ieee80211_hw *dev, enum set_key_cmd cmd, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct ieee80211_key_conf *key); -int cw1200_set_rts_threshold(struct ieee80211_hw *hw, u32 value); +int cw1200_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value); void cw1200_flush(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u32 queues, 
bool drop); diff --git a/drivers/net/wireless/ti/wl1251/main.c b/drivers/net/wireless/ti/wl1251/main.c index bb53d681c11b..69fc51f183ad 100644 --- a/drivers/net/wireless/ti/wl1251/main.c +++ b/drivers/net/wireless/ti/wl1251/main.c @@ -589,7 +589,7 @@ static bool wl1251_can_do_pm(struct ieee80211_conf *conf, struct wl1251 *wl) return (conf->flags & IEEE80211_CONF_PS) && !wl->monitor_present; } -static int wl1251_op_config(struct ieee80211_hw *hw, u32 changed) +static int wl1251_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct wl1251 *wl = hw->priv; struct ieee80211_conf *conf = &hw->conf; @@ -1051,7 +1051,8 @@ out: return ret; } -static int wl1251_op_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int wl1251_op_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { struct wl1251 *wl = hw->priv; int ret; diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c index f93c95edd991..6116a8522d96 100644 --- a/drivers/net/wireless/ti/wlcore/main.c +++ b/drivers/net/wireless/ti/wlcore/main.c @@ -3166,7 +3166,7 @@ static int wl12xx_config_vif(struct wl1271 *wl, struct wl12xx_vif *wlvif, return 0; } -static int wl1271_op_config(struct ieee80211_hw *hw, u32 changed) +static int wl1271_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct wl1271 *wl = hw->priv; struct wl12xx_vif *wlvif; @@ -3895,7 +3895,8 @@ out: return 0; } -static int wl1271_op_set_frag_threshold(struct ieee80211_hw *hw, u32 value) +static int wl1271_op_set_frag_threshold(struct ieee80211_hw *hw, + int radio_idx, u32 value) { struct wl1271 *wl = hw->priv; int ret = 0; @@ -3924,7 +3925,8 @@ out: return ret; } -static int wl1271_op_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int wl1271_op_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { struct wl1271 *wl = hw->priv; struct wl12xx_vif *wlvif; diff --git a/drivers/net/wireless/virtual/mac80211_hwsim.c b/drivers/net/wireless/virtual/mac80211_hwsim.c index f6add19d1da1..eefe8da3b14d 100644 --- a/drivers/net/wireless/virtual/mac80211_hwsim.c +++ b/drivers/net/wireless/virtual/mac80211_hwsim.c @@ -2381,7 +2381,8 @@ static const char * const hwsim_chanwidths[] = { [NL80211_CHAN_WIDTH_320] = "eht320", }; -static int mac80211_hwsim_config(struct ieee80211_hw *hw, u32 changed) +static int mac80211_hwsim_config(struct ieee80211_hw *hw, int radio_idx, + u32 changed) { struct mac80211_hwsim_data *data = hw->priv; struct ieee80211_conf *conf = &hw->conf; @@ -3338,7 +3339,8 @@ static int mac80211_hwsim_tx_last_beacon(struct ieee80211_hw *hw) return 1; } -static int mac80211_hwsim_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int mac80211_hwsim_set_rts_threshold(struct ieee80211_hw *hw, + int radio_idx, u32 value) { return -EOPNOTSUPP; } diff --git a/drivers/net/wireless/zydas/zd1211rw/zd_mac.c b/drivers/net/wireless/zydas/zd1211rw/zd_mac.c index 9653dbaac3c0..f7c56174424d 100644 --- a/drivers/net/wireless/zydas/zd1211rw/zd_mac.c +++ b/drivers/net/wireless/zydas/zd1211rw/zd_mac.c @@ -1133,7 +1133,7 @@ static void zd_op_remove_interface(struct ieee80211_hw *hw, zd_mac_free_cur_beacon(mac); } -static int zd_op_config(struct ieee80211_hw *hw, u32 changed) +static int zd_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct zd_mac *mac = zd_hw_mac(hw); struct ieee80211_conf *conf = &hw->conf; diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c index 
7fcc46a0bb48..4e29652f8ee7 100644 --- a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c +++ b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c @@ -1298,7 +1298,8 @@ exit: return ret; } -static int cfg80211_rtw_set_wiphy_params(struct wiphy *wiphy, u32 changed) +static int cfg80211_rtw_set_wiphy_params(struct wiphy *wiphy, int radio_idx, + u32 changed) { return 0; } @@ -1795,7 +1796,7 @@ static int cfg80211_rtw_disconnect(struct wiphy *wiphy, struct net_device *ndev, } static int cfg80211_rtw_set_txpower(struct wiphy *wiphy, - struct wireless_dev *wdev, + struct wireless_dev *wdev, int radio_idx, enum nl80211_tx_power_setting type, int mbm) { return 0; @@ -1803,6 +1804,7 @@ static int cfg80211_rtw_set_txpower(struct wiphy *wiphy, static int cfg80211_rtw_get_txpower(struct wiphy *wiphy, struct wireless_dev *wdev, + int radio_idx, unsigned int link_id, int *dbm) { *dbm = (12); diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index eec066f4738a..ffd9564fc840 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4853,12 +4853,14 @@ struct cfg80211_ops { int (*set_mcast_rate)(struct wiphy *wiphy, struct net_device *dev, int rate[NUM_NL80211_BANDS]); - int (*set_wiphy_params)(struct wiphy *wiphy, u32 changed); + int (*set_wiphy_params)(struct wiphy *wiphy, int radio_idx, + u32 changed); int (*set_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev, + int radio_idx, enum nl80211_tx_power_setting type, int mbm); int (*get_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev, - unsigned int link_id, int *dbm); + int radio_idx, unsigned int link_id, int *dbm); void (*rfkill_poll)(struct wiphy *wiphy); @@ -4920,8 +4922,10 @@ struct cfg80211_ops { struct wireless_dev *wdev, struct mgmt_frame_regs *upd); - int (*set_antenna)(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant); - int (*get_antenna)(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant); + int (*set_antenna)(struct wiphy *wiphy, int radio_idx, + u32 tx_ant, u32 rx_ant); + int (*get_antenna)(struct wiphy *wiphy, int radio_idx, + u32 *tx_ant, u32 *rx_ant); int (*sched_scan_start)(struct wiphy *wiphy, struct net_device *dev, diff --git a/include/net/mac80211.h b/include/net/mac80211.h index fa2325692abf..a0de0da4d79b 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4517,7 +4517,7 @@ struct ieee80211_ops { enum nl80211_iftype new_type, bool p2p); void (*remove_interface)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); - int (*config)(struct ieee80211_hw *hw, u32 changed); + int (*config)(struct ieee80211_hw *hw, int radio_idx, u32 changed); void (*bss_info_changed)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *info, @@ -4580,8 +4580,10 @@ struct ieee80211_ops { void (*get_key_seq)(struct ieee80211_hw *hw, struct ieee80211_key_conf *key, struct ieee80211_key_seq *seq); - int (*set_frag_threshold)(struct ieee80211_hw *hw, u32 value); - int (*set_rts_threshold)(struct ieee80211_hw *hw, u32 value); + int (*set_frag_threshold)(struct ieee80211_hw *hw, int radio_idx, + u32 value); + int (*set_rts_threshold)(struct ieee80211_hw *hw, int radio_idx, + u32 value); int (*sta_add)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); int (*sta_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, @@ -4678,7 +4680,8 @@ struct ieee80211_ops { int (*get_survey)(struct ieee80211_hw *hw, int idx, struct survey_info *survey); void (*rfkill_poll)(struct ieee80211_hw *hw); - void (*set_coverage_class)(struct ieee80211_hw *hw, s16 coverage_class); + 
void (*set_coverage_class)(struct ieee80211_hw *hw, int radio_idx, + s16 coverage_class); #ifdef CONFIG_NL80211_TESTMODE int (*testmode_cmd)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, void *data, int len); @@ -4693,8 +4696,10 @@ struct ieee80211_ops { void (*channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_channel_switch *ch_switch); - int (*set_antenna)(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant); - int (*get_antenna)(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant); + int (*set_antenna)(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant); + int (*get_antenna)(struct ieee80211_hw *hw, int radio_idx, + u32 *tx_ant, u32 *rx_ant); int (*remain_on_channel)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index a289014abe37..2a71149c3065 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2907,6 +2907,14 @@ enum nl80211_commands { * APs Support". Drivers may set additional flags that they support * in the kernel or device. * + * @NL80211_ATTR_WIPHY_RADIO_INDEX: (int) Integer attribute denoting the index + * of the radio of interest. Internally, a value of -1 is used to + * indicate that the radio id is not given in user-space. This means + * that all the attributes are applicable to all the radios. If there is + * a radio index provided in user-space, the attributes will be + * applicable to that specific radio only. If the radio id is greater + * than the number of radios, an error denoting an invalid value is returned. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3464,6 +3472,8 @@ enum nl80211_attrs { NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS, + NL80211_ATTR_WIPHY_RADIO_INDEX, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST,
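The attribute documentation above boils down to one validation rule: no index from user-space (internally -1) means the request applies to all radios, while an explicit index must fall inside the wiphy's 0..n_radio-1 range. A minimal sketch of that rule follows; the helper name and the s32 attribute encoding are assumptions for illustration only, since the real parsing and policy entry live in net/wireless/nl80211.c, outside this diff:

/* Hypothetical sketch, not part of this patch: map the optional
 * NL80211_ATTR_WIPHY_RADIO_INDEX attribute to a radio index, using -1
 * for "not given, apply to all radios" and rejecting out-of-range ids.
 */
static int nl80211_radio_idx_from_attrs(struct genl_info *info,
					struct wiphy *wiphy, int *radio_idx)
{
	struct nlattr *attr = info->attrs[NL80211_ATTR_WIPHY_RADIO_INDEX];

	if (!attr) {
		/* no index given in user-space: apply to all radios */
		*radio_idx = -1;
		return 0;
	}

	*radio_idx = nla_get_s32(attr);	/* encoding assumed, see above */
	if (*radio_idx < 0 || *radio_idx >= wiphy->n_radio)
		return -EINVAL;	/* invalid radio id */

	return 0;
}

Defaulting to -1 rather than 0 is what lets the plumbing below pass the value straight through: single-radio drivers, and internal callers such as ieee80211_hw_config(local, -1, changed), can treat any negative index as "the whole device" without further checks.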
wiphy->coverage_class : -1; - err = drv_set_coverage_class(local, coverage_class); + err = drv_set_coverage_class(local, radio_idx, + coverage_class); if (err) return err; } if (changed & WIPHY_PARAM_RTS_THRESHOLD) { - err = drv_set_rts_threshold(local, wiphy->rts_threshold); + err = drv_set_rts_threshold(local, radio_idx, + wiphy->rts_threshold); if (err) return err; @@ -3092,18 +3096,19 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) } if (changed & (WIPHY_PARAM_RETRY_SHORT | WIPHY_PARAM_RETRY_LONG)) - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_RETRY_LIMITS); + ieee80211_hw_config(local, radio_idx, + IEEE80211_CONF_CHANGE_RETRY_LIMITS); if (changed & (WIPHY_PARAM_TXQ_LIMIT | WIPHY_PARAM_TXQ_MEMORY_LIMIT | WIPHY_PARAM_TXQ_QUANTUM)) - ieee80211_txq_set_params(local); + ieee80211_txq_set_params(local, radio_idx); return 0; } static int ieee80211_set_tx_power(struct wiphy *wiphy, - struct wireless_dev *wdev, + struct wireless_dev *wdev, int radio_idx, enum nl80211_tx_power_setting type, int mbm) { struct ieee80211_local *local = wiphy_priv(wiphy); @@ -3231,6 +3236,7 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, static int ieee80211_get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, + int radio_idx, unsigned int link_id, int *dbm) { @@ -3409,7 +3415,7 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, } if (ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); ieee80211_recalc_ps(local); ieee80211_recalc_ps_vif(sdata); @@ -4305,7 +4311,8 @@ ieee80211_update_mgmt_frame_registrations(struct wiphy *wiphy, ieee80211_configure_filter(local); } -static int ieee80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant) +static int ieee80211_set_antenna(struct wiphy *wiphy, int radio_idx, + u32 tx_ant, u32 rx_ant) { struct ieee80211_local *local = wiphy_priv(wiphy); int ret; @@ -4321,11 +4328,12 @@ static int ieee80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant) return 0; } -static int ieee80211_get_antenna(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant) +static int ieee80211_get_antenna(struct wiphy *wiphy, int radio_idx, + u32 *tx_ant, u32 *rx_ant) { struct ieee80211_local *local = wiphy_priv(wiphy); - return drv_get_antenna(local, tx_ant, rx_ant); + return drv_get_antenna(local, radio_idx, tx_ant, rx_ant); } static int ieee80211_set_rekey_data(struct wiphy *wiphy, diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index d62f91656a19..4bcbcf9d98b5 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -744,7 +744,7 @@ static int ieee80211_add_chanctx(struct ieee80211_local *local, /* turn idle off *before* setting channel -- some drivers need that */ changed = ieee80211_idle_off(local); if (changed) - ieee80211_hw_config(local, changed); + ieee80211_hw_config(local, -1, changed); err = drv_add_chanctx(local, ctx); if (err) { diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index ba017bf3fd15..8baebb5636ec 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -143,15 +143,16 @@ int drv_change_interface(struct ieee80211_local *local, void drv_remove_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); -static inline int drv_config(struct ieee80211_local *local, u32 changed) +static inline int drv_config(struct ieee80211_local *local, int radio_idx, + u32 changed) { int ret; might_sleep(); 
lockdep_assert_wiphy(local->hw.wiphy); - trace_drv_config(local, changed); - ret = local->ops->config(&local->hw, changed); + trace_drv_config(local, radio_idx, changed); + ret = local->ops->config(&local->hw, radio_idx, changed); trace_drv_return_int(local, ret); return ret; } @@ -387,45 +388,47 @@ static inline void drv_get_key_seq(struct ieee80211_local *local, } static inline int drv_set_frag_threshold(struct ieee80211_local *local, - u32 value) + int radio_idx, u32 value) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); - trace_drv_set_frag_threshold(local, value); + trace_drv_set_frag_threshold(local, radio_idx, value); if (local->ops->set_frag_threshold) - ret = local->ops->set_frag_threshold(&local->hw, value); + ret = local->ops->set_frag_threshold(&local->hw, radio_idx, + value); trace_drv_return_int(local, ret); return ret; } static inline int drv_set_rts_threshold(struct ieee80211_local *local, - u32 value) + int radio_idx, u32 value) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); - trace_drv_set_rts_threshold(local, value); + trace_drv_set_rts_threshold(local, radio_idx, value); if (local->ops->set_rts_threshold) - ret = local->ops->set_rts_threshold(&local->hw, value); + ret = local->ops->set_rts_threshold(&local->hw, radio_idx, + value); trace_drv_return_int(local, ret); return ret; } static inline int drv_set_coverage_class(struct ieee80211_local *local, - s16 value) + int radio_idx, s16 value) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); - trace_drv_set_coverage_class(local, value); + trace_drv_set_coverage_class(local, radio_idx, value); if (local->ops->set_coverage_class) - local->ops->set_coverage_class(&local->hw, value); + local->ops->set_coverage_class(&local->hw, radio_idx, value); else ret = -EOPNOTSUPP; @@ -772,20 +775,21 @@ static inline int drv_set_antenna(struct ieee80211_local *local, might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (local->ops->set_antenna) - ret = local->ops->set_antenna(&local->hw, tx_ant, rx_ant); + ret = local->ops->set_antenna(&local->hw, -1, tx_ant, rx_ant); trace_drv_set_antenna(local, tx_ant, rx_ant, ret); return ret; } -static inline int drv_get_antenna(struct ieee80211_local *local, +static inline int drv_get_antenna(struct ieee80211_local *local, int radio_idx, u32 *tx_ant, u32 *rx_ant) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (local->ops->get_antenna) - ret = local->ops->get_antenna(&local->hw, tx_ant, rx_ant); - trace_drv_get_antenna(local, *tx_ant, *rx_ant, ret); + ret = local->ops->get_antenna(&local->hw, radio_idx, + tx_ant, rx_ant); + trace_drv_get_antenna(local, radio_idx, *tx_ant, *rx_ant, ret); return ret; } diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 9b9c7209878b..f59a5b38e6f2 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1872,7 +1872,8 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, struct ieee80211_rx_status *status, unsigned int mpdu_len, unsigned int mpdu_offset); -int ieee80211_hw_config(struct ieee80211_local *local, u32 changed); +int ieee80211_hw_config(struct ieee80211_local *local, int radio_idx, + u32 changed); int ieee80211_hw_conf_chan(struct ieee80211_local *local); void ieee80211_hw_conf_init(struct ieee80211_local *local); void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx); @@ -2542,7 +2543,7 @@ static inline bool ieee80211_can_run_worker(struct ieee80211_local *local) } int 
ieee80211_txq_setup_flows(struct ieee80211_local *local); -void ieee80211_txq_set_params(struct ieee80211_local *local); +void ieee80211_txq_set_params(struct ieee80211_local *local, int radio_idx); void ieee80211_txq_teardown_flows(struct ieee80211_local *local); void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 7c27f3cd841c..7b2baebb8644 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -146,7 +146,7 @@ void ieee80211_recalc_idle(struct ieee80211_local *local) { u32 change = __ieee80211_recalc_idle(local, false); if (change) - ieee80211_hw_config(local, change); + ieee80211_hw_config(local, -1, change); } static int ieee80211_verify_mac(struct ieee80211_sub_if_data *sdata, u8 *addr, @@ -726,7 +726,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do /* do after stop to avoid reconfiguring when we stop anyway */ ieee80211_configure_filter(local); - ieee80211_hw_config(local, hw_reconf_flags); + ieee80211_hw_config(local, -1, hw_reconf_flags); if (local->virt_monitors == local->open_count) ieee80211_add_virtual_monitor(local); @@ -1491,7 +1491,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) if (local->open_count == 1) ieee80211_hw_conf_init(local); else if (hw_reconf_flags) - ieee80211_hw_config(local, hw_reconf_flags); + ieee80211_hw_config(local, -1, hw_reconf_flags); ieee80211_recalc_ps(local); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 6b6de43d9420..c1c758e76d2e 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -190,7 +190,8 @@ static u32 ieee80211_calc_hw_conf_chan(struct ieee80211_local *local, return changed; } -int ieee80211_hw_config(struct ieee80211_local *local, u32 changed) +int ieee80211_hw_config(struct ieee80211_local *local, int radio_idx, + u32 changed) { int ret = 0; @@ -201,7 +202,7 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed) IEEE80211_CONF_CHANGE_SMPS)); if (changed && local->open_count) { - ret = drv_config(local, changed); + ret = drv_config(local, radio_idx, changed); /* * Goal: * HW reconfiguration should never fail, the driver has told @@ -235,7 +236,7 @@ static int _ieee80211_hw_conf_chan(struct ieee80211_local *local, if (!changed) return 0; - return drv_config(local, changed); + return drv_config(local, -1, changed); } int ieee80211_hw_conf_chan(struct ieee80211_local *local) @@ -269,7 +270,7 @@ void ieee80211_hw_conf_init(struct ieee80211_local *local) ctx ? 
&ctx->conf : NULL); } - WARN_ON(drv_config(local, changed)); + WARN_ON(drv_config(local, -1, changed)); } int ieee80211_emulate_add_chanctx(struct ieee80211_hw *hw, diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 2d46d4af60d7..d526f2fe9fe5 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3181,7 +3181,7 @@ static void ieee80211_enable_ps(struct ieee80211_local *local, return; conf->flags |= IEEE80211_CONF_PS; - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } } @@ -3193,7 +3193,7 @@ static void ieee80211_change_ps(struct ieee80211_local *local) ieee80211_enable_ps(local, local->ps_sdata); } else if (conf->flags & IEEE80211_CONF_PS) { conf->flags &= ~IEEE80211_CONF_PS; - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); timer_delete_sync(&local->dynamic_ps_timer); wiphy_work_cancel(local->hw.wiphy, &local->dynamic_ps_enable_work); @@ -3302,7 +3302,7 @@ void ieee80211_dynamic_ps_disable_work(struct wiphy *wiphy, if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } ieee80211_wake_queues_by_reason(&local->hw, @@ -3377,7 +3377,7 @@ void ieee80211_dynamic_ps_enable_work(struct wiphy *wiphy, (ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED)) { ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED; local->hw.conf.flags |= IEEE80211_CONF_PS; - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } } @@ -3986,7 +3986,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, */ if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } local->ps_sdata = NULL; @@ -7340,7 +7340,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, if (local->hw.conf.dynamic_ps_timeout > 0) { if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; - ieee80211_hw_config(local, + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } ieee80211_send_nullfunc(local, sdata, false); diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 686d9f6e9b52..13df6321634d 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -39,7 +39,7 @@ static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata) if (local->hw.conf.flags & IEEE80211_CONF_PS) { offchannel_ps_enabled = true; local->hw.conf.flags &= ~IEEE80211_CONF_PS; - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } if (!offchannel_ps_enabled || diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index a9cc832240a5..5a508d99e84f 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -108,7 +108,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) sdata->u.mgd.powersave && !(local->hw.conf.flags & IEEE80211_CONF_PS)) { local->hw.conf.flags |= IEEE80211_CONF_PS; - ieee80211_hw_config(local, + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } } diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 8215ca58ce5e..0bfbce157486 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -384,12 +384,14 @@ 
DEFINE_EVENT(local_sdata_addr_evt, drv_remove_interface, TRACE_EVENT(drv_config, TP_PROTO(struct ieee80211_local *local, + int radio_idx, u32 changed), - TP_ARGS(local, changed), + TP_ARGS(local, radio_idx, changed), TP_STRUCT__entry( LOCAL_ENTRY + __field(int, radio_idx) __field(u32, changed) __field(u32, flags) __field(int, power_level) @@ -403,6 +405,7 @@ TRACE_EVENT(drv_config, TP_fast_assign( LOCAL_ASSIGN; + __entry->radio_idx = radio_idx; __entry->changed = changed; __entry->flags = local->hw.conf.flags; __entry->power_level = local->hw.conf.power_level; @@ -417,8 +420,8 @@ TRACE_EVENT(drv_config, ), TP_printk( - LOCAL_PR_FMT " ch:%#x" CHANDEF_PR_FMT, - LOCAL_PR_ARG, __entry->changed, CHANDEF_PR_ARG + LOCAL_PR_FMT " radio_idx:%d ch:%#x" CHANDEF_PR_FMT, + LOCAL_PR_ARG, __entry->radio_idx, __entry->changed, CHANDEF_PR_ARG ) ); @@ -818,34 +821,71 @@ TRACE_EVENT(drv_get_key_seq, ) ); -DEFINE_EVENT(local_u32_evt, drv_set_frag_threshold, - TP_PROTO(struct ieee80211_local *local, u32 value), - TP_ARGS(local, value) +TRACE_EVENT(drv_set_frag_threshold, + TP_PROTO(struct ieee80211_local *local, int radio_idx, u32 value), + + TP_ARGS(local, radio_idx, value), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(int, radio_idx) + __field(u32, value) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->radio_idx = radio_idx; + __entry->value = value; + ), + + TP_printk( + LOCAL_PR_FMT " radio_id:%d value:%u", + LOCAL_PR_ARG, __entry->radio_idx, __entry->value + ) ); -DEFINE_EVENT(local_u32_evt, drv_set_rts_threshold, - TP_PROTO(struct ieee80211_local *local, u32 value), - TP_ARGS(local, value) +TRACE_EVENT(drv_set_rts_threshold, + TP_PROTO(struct ieee80211_local *local, int radio_idx, u32 value), + + TP_ARGS(local, radio_idx, value), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(int, radio_idx) + __field(u32, value) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->radio_idx = radio_idx; + __entry->value = value; + ), + + TP_printk( + LOCAL_PR_FMT " radio_id:%d value:%u", + LOCAL_PR_ARG, __entry->radio_idx, __entry->value + ) ); TRACE_EVENT(drv_set_coverage_class, - TP_PROTO(struct ieee80211_local *local, s16 value), + TP_PROTO(struct ieee80211_local *local, int radio_idx, s16 value), - TP_ARGS(local, value), + TP_ARGS(local, radio_idx, value), TP_STRUCT__entry( LOCAL_ENTRY + __field(int, radio_idx) __field(s16, value) ), TP_fast_assign( LOCAL_ASSIGN; + __entry->radio_idx = radio_idx; __entry->value = value; ), TP_printk( - LOCAL_PR_FMT " value:%d", - LOCAL_PR_ARG, __entry->value + LOCAL_PR_FMT " radio_id:%d value:%d", + LOCAL_PR_ARG, __entry->radio_idx, __entry->value ) ); @@ -1318,12 +1358,14 @@ TRACE_EVENT(drv_set_antenna, ); TRACE_EVENT(drv_get_antenna, - TP_PROTO(struct ieee80211_local *local, u32 tx_ant, u32 rx_ant, int ret), + TP_PROTO(struct ieee80211_local *local, int radio_idx, u32 tx_ant, + u32 rx_ant, int ret), - TP_ARGS(local, tx_ant, rx_ant, ret), + TP_ARGS(local, radio_idx, tx_ant, rx_ant, ret), TP_STRUCT__entry( LOCAL_ENTRY + __field(int, radio_idx) __field(u32, tx_ant) __field(u32, rx_ant) __field(int, ret) @@ -1331,14 +1373,16 @@ TRACE_EVENT(drv_get_antenna, TP_fast_assign( LOCAL_ASSIGN; + __entry->radio_idx = radio_idx; __entry->tx_ant = tx_ant; __entry->rx_ant = rx_ant; __entry->ret = ret; ), TP_printk( - LOCAL_PR_FMT " tx_ant:%d rx_ant:%d ret:%d", - LOCAL_PR_ARG, __entry->tx_ant, __entry->rx_ant, __entry->ret + LOCAL_PR_FMT " radio_idx:%d tx_ant:%d rx_ant:%d ret:%d", + LOCAL_PR_ARG, __entry->radio_idx, __entry->tx_ant, + __entry->rx_ant, __entry->ret ) ); diff --git 
a/net/mac80211/tx.c b/net/mac80211/tx.c index d58b80813bdd..6278d55aeb2e 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1541,7 +1541,7 @@ void ieee80211_txq_purge(struct ieee80211_local *local, spin_unlock_bh(&local->active_txq_lock[txqi->txq.ac]); } -void ieee80211_txq_set_params(struct ieee80211_local *local) +void ieee80211_txq_set_params(struct ieee80211_local *local, int radio_idx) { if (local->hw.wiphy->txq_limit) local->fq.limit = local->hw.wiphy->txq_limit; @@ -1605,7 +1605,7 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local) for (i = 0; i < fq->flows_cnt; i++) codel_vars_init(&local->cvars[i]); - ieee80211_txq_set_params(local); + ieee80211_txq_set_params(local, -1); return 0; } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 24c43a1ef2aa..773c8da0acc9 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1826,13 +1826,13 @@ int ieee80211_reconfig(struct ieee80211_local *local) } /* setup fragmentation threshold */ - drv_set_frag_threshold(local, hw->wiphy->frag_threshold); + drv_set_frag_threshold(local, -1, hw->wiphy->frag_threshold); /* setup RTS threshold */ - drv_set_rts_threshold(local, hw->wiphy->rts_threshold); + drv_set_rts_threshold(local, -1, hw->wiphy->rts_threshold); /* reset coverage class */ - drv_set_coverage_class(local, hw->wiphy->coverage_class); + drv_set_coverage_class(local, -1, hw->wiphy->coverage_class); ieee80211_led_radio(local, true); ieee80211_mod_tpt_led_trig(local, @@ -1890,11 +1890,11 @@ int ieee80211_reconfig(struct ieee80211_local *local) ieee80211_assign_chanctx(local, sdata, &sdata->deflink); /* reconfigure hardware */ - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_LISTEN_INTERVAL | - IEEE80211_CONF_CHANGE_MONITOR | - IEEE80211_CONF_CHANGE_PS | - IEEE80211_CONF_CHANGE_RETRY_LIMITS | - IEEE80211_CONF_CHANGE_IDLE); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_LISTEN_INTERVAL | + IEEE80211_CONF_CHANGE_MONITOR | + IEEE80211_CONF_CHANGE_PS | + IEEE80211_CONF_CHANGE_RETRY_LIMITS | + IEEE80211_CONF_CHANGE_IDLE); ieee80211_configure_filter(local); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9ef618baac9e..b40978549790 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -854,6 +854,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MLO_RECONF_REM_LINKS] = { .type = NLA_U16 }, [NL80211_ATTR_EPCS] = { .type = NLA_FLAG }, [NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS] = { .type = NLA_U16 }, + [NL80211_ATTR_WIPHY_RADIO_INDEX] = { .type = NLA_U8 }, }; /* policy for the key attributes */ @@ -2639,7 +2640,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, u32 tx_ant = 0, rx_ant = 0; int res; - res = rdev_get_antenna(rdev, &tx_ant, &rx_ant); + res = rdev_get_antenna(rdev, -1, &tx_ant, &rx_ant); if (!res) { if (nla_put_u32(msg, NL80211_ATTR_WIPHY_ANTENNA_TX, @@ -3620,6 +3621,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) u32 frag_threshold = 0, rts_threshold = 0; u8 coverage_class = 0; u32 txq_limit = 0, txq_memory_limit = 0, txq_quantum = 0; + int radio_idx = -1; rtnl_lock(); /* @@ -3670,6 +3672,17 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (result) return result; + if (info->attrs[NL80211_ATTR_WIPHY_RADIO_INDEX]) { + /* Radio idx is not expected for non-multi radio wiphy */ + if (rdev->wiphy.n_radio <= 0) + return -EINVAL; + + radio_idx = nla_get_u8( + info->attrs[NL80211_ATTR_WIPHY_RADIO_INDEX]); + if (radio_idx >= rdev->wiphy.n_radio) + return 
-EINVAL; + } + if (info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS]) { struct ieee80211_txq_params txq_params; struct nlattr *tb[NL80211_TXQ_ATTR_MAX + 1]; @@ -3759,7 +3772,8 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) mbm = nla_get_u32(info->attrs[idx]); } - result = rdev_set_tx_power(rdev, txp_wdev, type, mbm); + result = rdev_set_tx_power(rdev, txp_wdev, radio_idx, type, + mbm); if (result) return result; } @@ -3785,7 +3799,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) tx_ant = tx_ant & rdev->wiphy.available_antennas_tx; rx_ant = rx_ant & rdev->wiphy.available_antennas_rx; - result = rdev_set_antenna(rdev, tx_ant, rx_ant); + result = rdev_set_antenna(rdev, radio_idx, tx_ant, rx_ant); if (result) return result; } @@ -3911,7 +3925,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (changed & WIPHY_PARAM_TXQ_QUANTUM) rdev->wiphy.txq_quantum = txq_quantum; - result = rdev_set_wiphy_params(rdev, changed); + result = rdev_set_wiphy_params(rdev, radio_idx, changed); if (result) { rdev->wiphy.retry_short = old_retry_short; rdev->wiphy.retry_long = old_retry_long; @@ -4012,7 +4026,7 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag if (rdev->ops->get_tx_power && !wdev->valid_links) { int dbm, ret; - ret = rdev_get_tx_power(rdev, wdev, 0, &dbm); + ret = rdev_get_tx_power(rdev, wdev, -1, 0, &dbm); if (ret == 0 && nla_put_u32(msg, NL80211_ATTR_WIPHY_TX_POWER_LEVEL, DBM_TO_MBM(dbm))) @@ -4084,7 +4098,7 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag if (rdev->ops->get_tx_power) { int dbm, ret; - ret = rdev_get_tx_power(rdev, wdev, link_id, &dbm); + ret = rdev_get_tx_power(rdev, wdev, -1, link_id, &dbm); if (ret == 0 && nla_put_u32(msg, NL80211_ATTR_WIPHY_TX_POWER_LEVEL, DBM_TO_MBM(dbm))) diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 9f4783c2354c..803b39c26587 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -577,35 +577,40 @@ static inline int rdev_leave_ibss(struct cfg80211_registered_device *rdev, } static inline int -rdev_set_wiphy_params(struct cfg80211_registered_device *rdev, u32 changed) +rdev_set_wiphy_params(struct cfg80211_registered_device *rdev, int radio_idx, + u32 changed) { int ret = -EOPNOTSUPP; - trace_rdev_set_wiphy_params(&rdev->wiphy, changed); + trace_rdev_set_wiphy_params(&rdev->wiphy, radio_idx, changed); if (rdev->ops->set_wiphy_params) - ret = rdev->ops->set_wiphy_params(&rdev->wiphy, changed); + ret = rdev->ops->set_wiphy_params(&rdev->wiphy, radio_idx, + changed); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_tx_power(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev, - enum nl80211_tx_power_setting type, int mbm) + struct wireless_dev *wdev, int radio_idx, + enum nl80211_tx_power_setting type, + int mbm) { int ret; - trace_rdev_set_tx_power(&rdev->wiphy, wdev, type, mbm); - ret = rdev->ops->set_tx_power(&rdev->wiphy, wdev, type, mbm); + trace_rdev_set_tx_power(&rdev->wiphy, wdev, radio_idx, type, mbm); + ret = rdev->ops->set_tx_power(&rdev->wiphy, wdev, radio_idx, type, + mbm); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_tx_power(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev, unsigned int link_id, - int *dbm) + struct wireless_dev *wdev, int radio_idx, + unsigned int link_id, int *dbm) { int ret; - trace_rdev_get_tx_power(&rdev->wiphy, wdev, link_id); - 
ret = rdev->ops->get_tx_power(&rdev->wiphy, wdev, link_id, dbm); + trace_rdev_get_tx_power(&rdev->wiphy, wdev, radio_idx, link_id); + ret = rdev->ops->get_tx_power(&rdev->wiphy, wdev, radio_idx, link_id, + dbm); trace_rdev_return_int_int(&rdev->wiphy, ret, *dbm); return ret; } @@ -857,21 +862,21 @@ rdev_update_mgmt_frame_registrations(struct cfg80211_registered_device *rdev, } static inline int rdev_set_antenna(struct cfg80211_registered_device *rdev, - u32 tx_ant, u32 rx_ant) + int radio_idx, u32 tx_ant, u32 rx_ant) { int ret; - trace_rdev_set_antenna(&rdev->wiphy, tx_ant, rx_ant); - ret = rdev->ops->set_antenna(&rdev->wiphy, tx_ant, rx_ant); + trace_rdev_set_antenna(&rdev->wiphy, radio_idx, tx_ant, rx_ant); + ret = rdev->ops->set_antenna(&rdev->wiphy, radio_idx, tx_ant, rx_ant); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_antenna(struct cfg80211_registered_device *rdev, - u32 *tx_ant, u32 *rx_ant) + int radio_idx, u32 *tx_ant, u32 *rx_ant) { int ret; - trace_rdev_get_antenna(&rdev->wiphy); - ret = rdev->ops->get_antenna(&rdev->wiphy, tx_ant, rx_ant); + trace_rdev_get_antenna(&rdev->wiphy, radio_idx); + ret = rdev->ops->get_antenna(&rdev->wiphy, radio_idx, tx_ant, rx_ant); if (ret) trace_rdev_return_int(&rdev->wiphy, ret); else diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 61a5eca9c513..7e43ab9de923 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -406,9 +406,19 @@ DEFINE_EVENT(wiphy_only_evt, rdev_return_void, TP_ARGS(wiphy) ); -DEFINE_EVENT(wiphy_only_evt, rdev_get_antenna, - TP_PROTO(struct wiphy *wiphy), - TP_ARGS(wiphy) +TRACE_EVENT(rdev_get_antenna, + TP_PROTO(struct wiphy *wiphy, int radio_idx), + TP_ARGS(wiphy, radio_idx), + TP_STRUCT__entry( + WIPHY_ENTRY + __field(int, radio_idx) + ), + TP_fast_assign( + WIPHY_ASSIGN; + __entry->radio_idx = radio_idx; + ), + TP_printk(WIPHY_PR_FMT ", radio_idx: %d", + WIPHY_PR_ARG, __entry->radio_idx) ); DEFINE_EVENT(wiphy_only_evt, rdev_rfkill_poll, @@ -1678,18 +1688,20 @@ TRACE_EVENT(rdev_join_ocb, ); TRACE_EVENT(rdev_set_wiphy_params, - TP_PROTO(struct wiphy *wiphy, u32 changed), - TP_ARGS(wiphy, changed), + TP_PROTO(struct wiphy *wiphy, int radio_idx, u32 changed), + TP_ARGS(wiphy, radio_idx, changed), TP_STRUCT__entry( WIPHY_ENTRY + __field(int, radio_idx) __field(u32, changed) ), TP_fast_assign( WIPHY_ASSIGN; + __entry->radio_idx = radio_idx; __entry->changed = changed; ), - TP_printk(WIPHY_PR_FMT ", changed: %u", - WIPHY_PR_ARG, __entry->changed) + TP_printk(WIPHY_PR_FMT ", radio_idx: %d, changed: %u", + WIPHY_PR_ARG, __entry->radio_idx, __entry->changed) ); DECLARE_EVENT_CLASS(wiphy_wdev_link_evt, @@ -1710,30 +1722,51 @@ DECLARE_EVENT_CLASS(wiphy_wdev_link_evt, WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id) ); -DEFINE_EVENT(wiphy_wdev_link_evt, rdev_get_tx_power, +TRACE_EVENT(rdev_get_tx_power, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, - unsigned int link_id), - TP_ARGS(wiphy, wdev, link_id) + int radio_idx, unsigned int link_id), + TP_ARGS(wiphy, wdev, radio_idx, link_id), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + __field(int, radio_idx) + __field(unsigned int, link_id) + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + __entry->radio_idx = radio_idx; + __entry->link_id = link_id; + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT + ", radio_idx: %d, link_id: %u", + WIPHY_PR_ARG, WDEV_PR_ARG, + __entry->radio_idx, __entry->link_id) ); TRACE_EVENT(rdev_set_tx_power, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, - enum nl80211_tx_power_setting
type, int mbm), - TP_ARGS(wiphy, wdev, type, mbm), + int radio_idx, enum nl80211_tx_power_setting type, + int mbm), + TP_ARGS(wiphy, wdev, radio_idx, type, mbm), TP_STRUCT__entry( WIPHY_ENTRY WDEV_ENTRY + __field(int, radio_idx) __field(enum nl80211_tx_power_setting, type) __field(int, mbm) ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; + __entry->radio_idx = radio_idx; __entry->type = type; __entry->mbm = mbm; ), - TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", type: %u, mbm: %d", - WIPHY_PR_ARG, WDEV_PR_ARG,__entry->type, __entry->mbm) + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT + ", radio_idx: %d, type: %u, mbm: %d", + WIPHY_PR_ARG, WDEV_PR_ARG, + __entry->radio_idx, __entry->type, __entry->mbm) ); TRACE_EVENT(rdev_return_int_int, @@ -1866,26 +1899,24 @@ TRACE_EVENT(rdev_return_void_tx_rx, __entry->rx_max) ); -DECLARE_EVENT_CLASS(tx_rx_evt, - TP_PROTO(struct wiphy *wiphy, u32 tx, u32 rx), - TP_ARGS(wiphy, tx, rx), +TRACE_EVENT(rdev_set_antenna, + TP_PROTO(struct wiphy *wiphy, int radio_idx, u32 tx, u32 rx), + TP_ARGS(wiphy, radio_idx, tx, rx), TP_STRUCT__entry( WIPHY_ENTRY + __field(int, radio_idx) __field(u32, tx) __field(u32, rx) ), TP_fast_assign( WIPHY_ASSIGN; + __entry->radio_idx = radio_idx; __entry->tx = tx; __entry->rx = rx; ), - TP_printk(WIPHY_PR_FMT ", tx: %u, rx: %u ", - WIPHY_PR_ARG, __entry->tx, __entry->rx) -); - -DEFINE_EVENT(tx_rx_evt, rdev_set_antenna, - TP_PROTO(struct wiphy *wiphy, u32 tx, u32 rx), - TP_ARGS(wiphy, tx, rx) + TP_printk(WIPHY_PR_FMT ", radio_idx: %d, tx: %u, rx: %u ", + WIPHY_PR_ARG, __entry->radio_idx, + __entry->tx, __entry->rx) ); DECLARE_EVENT_CLASS(wiphy_netdev_id_evt, diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index a74b1afc594e..1241fda78a68 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -263,7 +263,7 @@ int cfg80211_wext_siwrts(struct net_device *dev, else wdev->wiphy->rts_threshold = rts->value; - err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_RTS_THRESHOLD); + err = rdev_set_wiphy_params(rdev, -1, WIPHY_PARAM_RTS_THRESHOLD); if (err) wdev->wiphy->rts_threshold = orts; return err; @@ -304,7 +304,7 @@ int cfg80211_wext_siwfrag(struct net_device *dev, wdev->wiphy->frag_threshold = frag->value & ~0x1; } - err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_FRAG_THRESHOLD); + err = rdev_set_wiphy_params(rdev, -1, WIPHY_PARAM_FRAG_THRESHOLD); if (err) wdev->wiphy->frag_threshold = ofrag; return err; @@ -355,7 +355,7 @@ static int cfg80211_wext_siwretry(struct net_device *dev, changed |= WIPHY_PARAM_RETRY_SHORT; } - err = rdev_set_wiphy_params(rdev, changed); + err = rdev_set_wiphy_params(rdev, -1, changed); if (err) { wdev->wiphy->retry_short = oshort; wdev->wiphy->retry_long = olong; @@ -890,7 +890,7 @@ static int cfg80211_wext_siwtxpower(struct net_device *dev, guard(wiphy)(&rdev->wiphy); - return rdev_set_tx_power(rdev, wdev, type, DBM_TO_MBM(dbm)); + return rdev_set_tx_power(rdev, wdev, -1, type, DBM_TO_MBM(dbm)); } static int cfg80211_wext_giwtxpower(struct net_device *dev, @@ -910,7 +910,7 @@ static int cfg80211_wext_giwtxpower(struct net_device *dev, return -EOPNOTSUPP; scoped_guard(wiphy, &rdev->wiphy) { - err = rdev_get_tx_power(rdev, wdev, 0, &val); + err = rdev_get_tx_power(rdev, wdev, -1, 0, &val); } if (err) return err; -- cgit v1.2.3 From 89595190058c6e9ca4a8ca7d49be3fc8d2395e79 Mon Sep 17 00:00:00 2001 From: Roopni Devanathan Date: Sun, 15 Jun 2025 13:53:11 +0530 Subject: wifi: cfg80211: Report per-radio RTS threshold to userspace In case of multi-radio wiphys, with per-radio RTS threshold 
brought into use, RTS threshold for each radio in a wiphy can be recorded in wiphy parameter - wiphy_radio_cfg, as an array. Add a new attribute - NL80211_WIPHY_RADIO_ATTR_RTS_THRESHOLD in nested parameter - NL80211_ATTR_WIPHY_RADIOS. When a request for getting RTS threshold for a particular radio is received, parse the radio id and get the required data. Add this data to the newly added nested attribute NL80211_WIPHY_RADIO_ATTR_RTS_THRESHOLD. Add support to report this data to userspace. Signed-off-by: Roopni Devanathan Link: https://patch.msgid.link/20250615082312.619639-4-quic_rdevanat@quicinc.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ net/wireless/nl80211.c | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 2a71149c3065..39460334dafb 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -8106,6 +8106,7 @@ enum nl80211_ap_settings_flags { * and contains attributes defined in &enum nl80211_if_combination_attrs. * @NL80211_WIPHY_RADIO_ATTR_ANTENNA_MASK: bitmask (u32) of antennas * connected to this radio. + * @NL80211_WIPHY_RADIO_ATTR_RTS_THRESHOLD: RTS threshold (u32) of this radio. * * @__NL80211_WIPHY_RADIO_ATTR_LAST: Internal * @NL80211_WIPHY_RADIO_ATTR_MAX: Highest attribute @@ -8117,6 +8118,7 @@ enum nl80211_wiphy_radio_attrs { NL80211_WIPHY_RADIO_ATTR_FREQ_RANGE, NL80211_WIPHY_RADIO_ATTR_INTERFACE_COMBINATION, NL80211_WIPHY_RADIO_ATTR_ANTENNA_MASK, + NL80211_WIPHY_RADIO_ATTR_RTS_THRESHOLD, /* keep last */ __NL80211_WIPHY_RADIO_ATTR_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b0176090182c..70bfe2bfdcc7 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2447,6 +2447,7 @@ fail: static int nl80211_put_radio(struct wiphy *wiphy, struct sk_buff *msg, int idx) { const struct wiphy_radio *r = &wiphy->radio[idx]; + const struct wiphy_radio_cfg *rcfg = &wiphy->radio_cfg[idx]; struct nlattr *radio, *freq; int i; @@ -2457,6 +2458,11 @@ static int nl80211_put_radio(struct wiphy *wiphy, struct sk_buff *msg, int idx) if (nla_put_u32(msg, NL80211_WIPHY_RADIO_ATTR_INDEX, idx)) goto nla_put_failure; + if (rcfg->rts_threshold && + nla_put_u32(msg, NL80211_WIPHY_RADIO_ATTR_RTS_THRESHOLD, + rcfg->rts_threshold)) + goto nla_put_failure; + if (r->antenna_mask && nla_put_u32(msg, NL80211_WIPHY_RADIO_ATTR_ANTENNA_MASK, r->antenna_mask)) -- cgit v1.2.3 From a4c746f06853f91d3759ae8aca514d135b6aa56d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Jun 2025 15:48:42 +0200 Subject: uapi/fcntl: mark range as reserved Mark the range from -10000 to -40000 as a range reserved for special in-kernel values. Move the PIDFD_SELF_*/PIDFD_THREAD_* sentinels over so all the special values are in one place. 
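For illustration only (this sketch is not part of the patch): with the ranges documented above, userspace that wants to guard against accidentally passing a sentinel to close() or fcntl() can treat AT_FDCWD (-100) and the reserved block as non-fd values. The helper name below is hypothetical.

	#include <stdbool.h>

	/* Hypothetical helper: mirrors the reserved ranges noted in the
	 * comment this patch adds, [-100] and [-10000, -40000].
	 */
	static bool fd_is_reserved_sentinel(int fd)
	{
		return fd == -100 || (fd <= -10000 && fd >= -40000);
	}
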
Link: https://lore.kernel.org/20250624-work-pidfs-fhandle-v2-6-d02a04858fe3@kernel.org Reviewed-by: Jan Kara Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- include/uapi/linux/fcntl.h | 16 ++++++++++++++++ include/uapi/linux/pidfd.h | 15 --------------- 2 files changed, 16 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index a15ac2fa4b20..ed02d8ae0667 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -90,10 +90,26 @@ #define DN_ATTRIB 0x00000020 /* File changed attibutes */ #define DN_MULTISHOT 0x80000000 /* Don't remove notifier */ +/* Reserved kernel ranges [-100], [-10000, -40000]. */ #define AT_FDCWD -100 /* Special value for dirfd used to indicate openat should use the current working directory. */ +/* + * The concept of process and threads in userland and the kernel is a confusing + * one - within the kernel every thread is a 'task' with its own individual PID, + * however from userland's point of view threads are grouped by a single PID, + * which is that of the 'thread group leader', typically the first thread + * spawned. + * + * To cut the Gordian knot, for internal kernel usage, we refer to + * PIDFD_SELF_THREAD to refer to the current thread (or task from a kernel + * perspective), and PIDFD_SELF_THREAD_GROUP to refer to the current thread + * group leader... + */ +#define PIDFD_SELF_THREAD -10000 /* Current thread. */ +#define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */ + /* Generic flags for the *at(2) family of syscalls. */ diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index c27a4e238e4b..957db425d459 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -42,21 +42,6 @@ #define PIDFD_COREDUMP_USER (1U << 2) /* coredump was done as the user. */ #define PIDFD_COREDUMP_ROOT (1U << 3) /* coredump was done as root. */ -/* - * The concept of process and threads in userland and the kernel is a confusing - * one - within the kernel every thread is a 'task' with its own individual PID, - * however from userland's point of view threads are grouped by a single PID, - * which is that of the 'thread group leader', typically the first thread - * spawned. - * - * To cut the Gordian knot, for internal kernel usage, we refer to - * PIDFD_SELF_THREAD to refer to the current thread (or task from a kernel - * perspective), and PIDFD_SELF_THREAD_GROUP to refer to the current thread - * group leader... - */ -#define PIDFD_SELF_THREAD -10000 /* Current thread. */ -#define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */ - /* * ...and for userland we make life simpler - PIDFD_SELF refers to the current * thread, PIDFD_SELF_PROCESS refers to the process thread group leader. -- cgit v1.2.3 From 67fcec2919e4ed31ab845eb456ad7d6f1e85505c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Jun 2025 15:48:49 +0200 Subject: fcntl/pidfd: redefine PIDFD_SELF_THREAD_GROUP Don't jump somewhere into the middle of the reserved range. We're still able to change that value as it won't be that widely used yet. If not, we can revert.
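As a rough illustration (not part of this patch), userspace that refers to the sentinel via the macro rather than the raw value is unaffected by this renumbering. The sketch below assumes pidfd_send_signal() accepts the PIDFD_SELF_* sentinels, as introduced alongside them:

	#include <signal.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef PIDFD_SELF_THREAD_GROUP
	#define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */
	#endif

	/* Sketch: signal our own thread group without opening a pidfd. */
	static int signal_own_thread_group(int sig)
	{
		return syscall(SYS_pidfd_send_signal,
			       PIDFD_SELF_THREAD_GROUP, sig, NULL, 0);
	}
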
Signed-off-by: Christian Brauner --- include/uapi/linux/fcntl.h | 2 +- tools/testing/selftests/pidfd/pidfd.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index ed02d8ae0667..ba4a698d2f33 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -108,7 +108,7 @@ * group leader... */ #define PIDFD_SELF_THREAD -10000 /* Current thread. */ -#define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */ +#define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */ /* Generic flags for the *at(2) family of syscalls. */ diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h index efd74063126e..5dfeb1bdf399 100644 --- a/tools/testing/selftests/pidfd/pidfd.h +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -56,7 +56,7 @@ #endif #ifndef PIDFD_SELF_THREAD_GROUP -#define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */ +#define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */ #endif #ifndef PIDFD_SELF -- cgit v1.2.3 From cd5d2006327b6d8488612cb8c03ad7304417c8f2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Jun 2025 10:29:10 +0200 Subject: uapi/fcntl: add FD_INVALID Add a marker for an invalid file descriptor. Link: https://lore.kernel.org/20250624-work-pidfs-fhandle-v2-7-d02a04858fe3@kernel.org Reviewed-by: Jan Kara Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- include/uapi/linux/fcntl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index ba4a698d2f33..a5bebe7c4400 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -110,6 +110,7 @@ #define PIDFD_SELF_THREAD -10000 /* Current thread. */ #define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */ +#define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */ /* Generic flags for the *at(2) family of syscalls. */ -- cgit v1.2.3 From 3941e37f62fe2c3c8b8675c12183185f20450539 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Jun 2025 16:57:51 +0200 Subject: uapi/fcntl: add FD_PIDFS_ROOT Add a special file descriptor indicating the root of the pidfs filesystem. Signed-off-by: Christian Brauner --- include/uapi/linux/fcntl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index a5bebe7c4400..f291ab4f94eb 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -110,6 +110,7 @@ #define PIDFD_SELF_THREAD -10000 /* Current thread. */ #define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */ +#define FD_PIDFS_ROOT -10002 /* Root of the pidfs filesystem */ #define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */ /* Generic flags for the *at(2) family of syscalls. 
*/ -- cgit v1.2.3 From 22bbc1dcd0d6785fb390c41f0dd5b5e218d23bdd Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 23 Jun 2025 12:00:53 +0200 Subject: vsock/uapi: fix linux/vm_sockets.h userspace compilation errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a userspace application just includes <linux/vm_sockets.h>, it will fail to build with the following errors: /usr/include/linux/vm_sockets.h:182:39: error: invalid application of ‘sizeof’ to incomplete type ‘struct sockaddr’ 182 | unsigned char svm_zero[sizeof(struct sockaddr) - | ^~~~~~ /usr/include/linux/vm_sockets.h:183:39: error: ‘sa_family_t’ undeclared here (not in a function) 183 | sizeof(sa_family_t) - | Include <sys/socket.h> for userspace (guarded by ifndef __KERNEL__), where `struct sockaddr` and `sa_family_t` are defined. We already do something similar in other UAPI headers such as <linux/mptcp.h>. Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Reported-by: Daan De Meyer Signed-off-by: Stefano Garzarella Link: https://patch.msgid.link/20250623100053.40979-1-sgarzare@redhat.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/vm_sockets.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h index ed07181d4eff..e05280e41522 100644 --- a/include/uapi/linux/vm_sockets.h +++ b/include/uapi/linux/vm_sockets.h @@ -17,6 +17,10 @@ #ifndef _UAPI_VM_SOCKETS_H #define _UAPI_VM_SOCKETS_H +#ifndef __KERNEL__ +#include <sys/socket.h> /* for struct sockaddr and sa_family_t */ +#endif + #include <linux/socket.h> #include <linux/types.h> -- cgit v1.2.3 From 67caa528ae08cd05e485c0ea6aea0baaf6579b06 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Sat, 21 Jun 2025 10:28:41 -0600 Subject: ublk: fix narrowing warnings in UAPI header When a C++ file compiled with -Wc++11-narrowing includes the UAPI header linux/ublk_cmd.h, ublk_sqe_addr_to_auto_buf_reg()'s assignments of u64 values to u8, u16, and u32 fields result in compiler warnings. Add explicit casts to the intended types to avoid these warnings. Drop the unnecessary bitmasks. Reported-by: Uday Shankar Signed-off-by: Caleb Sander Mateos Fixes: 99c1e4eb6a3f ("ublk: register buffer to local io_uring with provided buf index via UBLK_F_AUTO_BUF_REG") Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250621162842.337452-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/uapi/linux/ublk_cmd.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 77d9d6af46da..c062109cb686 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -450,10 +450,10 @@ static inline struct ublk_auto_buf_reg ublk_sqe_addr_to_auto_buf_reg( __u64 sqe_addr) { struct ublk_auto_buf_reg reg = { - .index = sqe_addr & 0xffff, - .flags = (sqe_addr >> 16) & 0xff, - .reserved0 = (sqe_addr >> 24) & 0xff, - .reserved1 = sqe_addr >> 32, + .index = (__u16)sqe_addr, + .flags = (__u8)(sqe_addr >> 16), + .reserved0 = (__u8)(sqe_addr >> 24), + .reserved1 = (__u32)(sqe_addr >> 32), }; return reg; -- cgit v1.2.3 From 81b4d1a1d03301dcca8af5c58eded9e535f1f6ed Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Sat, 21 Jun 2025 11:10:14 -0600 Subject: ublk: update UBLK_F_SUPPORT_ZERO_COPY comment in UAPI header UBLK_F_SUPPORT_ZERO_COPY has a very old comment describing the initial idea for how zero-copy would be implemented.
The actual implementation added in commit 1f6540e2aabb ("ublk: zc register/unregister bvec") uses io_uring registered buffers rather than shared memory mapping. Remove the inaccurate remarks about mapping ublk request memory into the ublk server's address space and requiring 4K block size. Replace them with a description of the current zero-copy mechanism. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250621171015.354932-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/uapi/linux/ublk_cmd.h | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index c062109cb686..c9751bdfd937 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -135,8 +135,28 @@ #define UBLKSRV_IO_BUF_TOTAL_SIZE (1ULL << UBLKSRV_IO_BUF_TOTAL_BITS) /* - * zero copy requires 4k block size, and can remap ublk driver's io - * request into ublksrv's vm space + * ublk server can register data buffers for incoming I/O requests with a sparse + * io_uring buffer table. The request buffer can then be used as the data buffer + * for io_uring operations via the fixed buffer index. + * Note that the ublk server can never directly access the request data memory. + * + * To use this feature, the ublk server must first register a sparse buffer + * table on an io_uring instance. + * When an incoming ublk request is received, the ublk server submits a + * UBLK_U_IO_REGISTER_IO_BUF command to that io_uring instance. The + * ublksrv_io_cmd's q_id and tag specify the request whose buffer to register + * and addr is the index in the io_uring's buffer table to install the buffer. + * SQEs can now be submitted to the io_uring to read/write the request's buffer + * by enabling fixed buffers (e.g. using IORING_OP_{READ,WRITE}_FIXED or + * IORING_URING_CMD_FIXED) and passing the registered buffer index in buf_index. + * Once the last io_uring operation using the request's buffer has completed, + * the ublk server submits a UBLK_U_IO_UNREGISTER_IO_BUF command with q_id, tag, + * and addr again specifying the request buffer to unregister. + * The ublk request is completed when its buffer is unregistered from all + * io_uring instances and the ublk server issues UBLK_U_IO_COMMIT_AND_FETCH_REQ. + * + * Not available for UBLK_F_UNPRIVILEGED_DEV, as a ublk server can leak + * uninitialized kernel memory by not reading into the full request buffer. */ #define UBLK_F_SUPPORT_ZERO_COPY (1ULL << 0) -- cgit v1.2.3 From 826334359eacc1b70e9752ebc4954ed775dd40ca Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 23 Jun 2025 16:17:13 -0700 Subject: netlink: specs: add the multicast group name to spec Add the multicast group's name to the YAML spec. Without it YNL doesn't know how to subscribe to notifications. 
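For context, a minimal hand-written C sketch (not part of this patch) of the subscription that the spec change lets YNL derive generically; it assumes libnl-3, whose genl_ctrl_resolve_grp() resolves a multicast group by family and group name:

	#include <netlink/genl/ctrl.h>
	#include <netlink/socket.h>

	/* Sketch: join the ethtool "monitor" multicast group by name. */
	static int join_ethtool_monitor(struct nl_sock *sk)
	{
		int grp = genl_ctrl_resolve_grp(sk, "ethtool", "monitor");

		if (grp < 0)
			return grp;
		return nl_socket_add_membership(sk, grp);
	}
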
Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250623231720.3124717-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 6 ++++++ include/uapi/linux/ethtool_netlink.h | 2 -- include/uapi/linux/ethtool_netlink_generated.h | 2 ++ 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index c1651e175e8b..cfe84f84ba29 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -2492,3 +2492,9 @@ operations: attributes: - header - events + +mcast-groups: + list: + - + name: monitor + c-define-name: ethtool-mcgrp-monitor-name diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 09a75bdb6560..fa5d645140a4 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -208,6 +208,4 @@ enum { ETHTOOL_A_STATS_PHY_MAX = (__ETHTOOL_A_STATS_PHY_CNT - 1) }; -#define ETHTOOL_MCGRP_MONITOR_NAME "monitor" - #endif /* _UAPI_LINUX_ETHTOOL_NETLINK_H_ */ diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 4944badf9fba..859e28c8a91a 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -867,4 +867,6 @@ enum { ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1) }; +#define ETHTOOL_MCGRP_MONITOR_NAME "monitor" + #endif /* _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H */ -- cgit v1.2.3 From 46837be5afc6ea70bc827ca4439410e069e2ee37 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 23 Jun 2025 16:17:18 -0700 Subject: net: ethtool: rss: add notifications In preparation for RSS_SET handling in ethnl introduce Netlink notifications for RSS. Only cover modifications, not creation and not removal of a context, because the latter may deserve a different notification type. We should cross that bridge when we add the support for context add / remove via Netlink. Link: https://patch.msgid.link/20250623231720.3124717-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 7 +++++++ Documentation/networking/ethtool-netlink.rst | 1 + include/uapi/linux/ethtool_netlink_generated.h | 1 + net/ethtool/common.h | 8 ++++++++ net/ethtool/ioctl.c | 4 ++++ net/ethtool/netlink.c | 2 ++ net/ethtool/rss.c | 11 +++++++++++ 7 files changed, 34 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index cfe84f84ba29..19a32229772a 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -2492,6 +2492,13 @@ operations: attributes: - header - events + - + name: rss-ntf + doc: | + Notification for change in RSS configuration. + For additional contexts only modifications are notified, not creation + or removal of the contexts.
+ notify: rss-get mcast-groups: list: diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index e45bb555e909..08abca99a6dc 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -281,6 +281,7 @@ Kernel to userspace: ``ETHTOOL_MSG_MODULE_GET_REPLY`` transceiver module parameters ``ETHTOOL_MSG_PSE_GET_REPLY`` PSE parameters ``ETHTOOL_MSG_RSS_GET_REPLY`` RSS settings + ``ETHTOOL_MSG_RSS_NTF`` RSS settings ``ETHTOOL_MSG_PLCA_GET_CFG_REPLY`` PLCA RS parameters ``ETHTOOL_MSG_PLCA_GET_STATUS_REPLY`` PLCA RS status ``ETHTOOL_MSG_PLCA_NTF`` PLCA RS parameters diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 859e28c8a91a..8f30ffa1cd14 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -862,6 +862,7 @@ enum { ETHTOOL_MSG_TSCONFIG_GET_REPLY, ETHTOOL_MSG_TSCONFIG_SET_REPLY, ETHTOOL_MSG_PSE_NTF, + ETHTOOL_MSG_RSS_NTF, __ETHTOOL_MSG_KERNEL_CNT, ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1) diff --git a/net/ethtool/common.h b/net/ethtool/common.h index b4683d286a5a..c41db1595621 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -74,4 +74,12 @@ int ethtool_get_module_eeprom_call(struct net_device *dev, bool __ethtool_dev_mm_supported(struct net_device *dev); +#if IS_ENABLED(CONFIG_ETHTOOL_NETLINK) +void ethtool_rss_notify(struct net_device *dev, u32 rss_context); +#else +static inline void ethtool_rss_notify(struct net_device *dev, u32 rss_context) +{ +} +#endif + #endif /* _ETHTOOL_COMMON_H */ diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 96da9d18789b..c34bac7bffd8 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1502,6 +1502,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, struct ethtool_rxfh rxfh; bool locked = false; /* dev->ethtool->rss_lock taken */ bool create = false; + bool mod = false; u8 *rss_config; int ret; @@ -1688,6 +1689,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, } goto out; } + mod = !create && !rxfh_dev.rss_delete; if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_context), &rxfh_dev.rss_context, sizeof(rxfh_dev.rss_context))) @@ -1757,6 +1759,8 @@ out: if (locked) mutex_unlock(&dev->ethtool->rss_lock); kfree(rss_config); + if (mod) + ethtool_rss_notify(dev, rxfh.rss_context); return ret; } diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 60b3c07507d2..09c81cc9a08f 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -946,6 +946,7 @@ ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_MODULE_NTF] = ðnl_module_request_ops, [ETHTOOL_MSG_PLCA_NTF] = ðnl_plca_cfg_request_ops, [ETHTOOL_MSG_MM_NTF] = ðnl_mm_request_ops, + [ETHTOOL_MSG_RSS_NTF] = ðnl_rss_request_ops, }; /* default notification handler */ @@ -1052,6 +1053,7 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_MODULE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_PLCA_NTF] = ethnl_default_notify, [ETHTOOL_MSG_MM_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_RSS_NTF] = ethnl_default_notify, }; void ethnl_notify(struct net_device *dev, unsigned int cmd, diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 6d9b1769896b..3adddca7e215 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -358,6 +358,17 @@ int ethnl_rss_dumpit(struct sk_buff *skb, struct netlink_callback *cb) return ret; } +/* RSS_NTF */ 
+ +void ethtool_rss_notify(struct net_device *dev, u32 rss_context) +{ + struct rss_req_info req_info = { + .rss_context = rss_context, + }; + + ethnl_notify(dev, ETHTOOL_MSG_RSS_NTF, &req_info.base); +} + const struct ethnl_request_ops ethnl_rss_request_ops = { .request_cmd = ETHTOOL_MSG_RSS_GET, .reply_cmd = ETHTOOL_MSG_RSS_GET_REPLY, -- cgit v1.2.3 From 2855e43c6bb154a9b8e27abda8df364aed574b22 Mon Sep 17 00:00:00 2001 From: RubenKelevra Date: Tue, 24 Jun 2025 18:57:11 +0200 Subject: uapi: net_dropmon: drop unused is_drop_point_hw macro Commit 4ea7e38696c7 ("dropmon: add ability to detect when hardware drops rx packets") introduced is_drop_point_hw, but the symbol was never referenced anywhere in the kernel tree and is currently not used by dropwatch. I could not find, to the best of my abilities, a current out-of-tree user of this macro. The definition also contains a syntax error in its for-loop, so any project that tried to compile against it would fail. Removing the macro therefore eliminates dead code without breaking existing users. Signed-off-by: RubenKelevra Link: https://patch.msgid.link/20250624165711.1188691-1-rubenkelevra@gmail.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/net_dropmon.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h index 9dd41c2f58a6..87cbef48d4c7 100644 --- a/include/uapi/linux/net_dropmon.h +++ b/include/uapi/linux/net_dropmon.h @@ -10,13 +10,6 @@ struct net_dm_drop_point { __u32 count; }; -#define is_drop_point_hw(x) do {\ - int ____i, ____j;\ - for (____i = 0; ____i < 8; i ____i++)\ - ____j |= x[____i];\ - ____j;\ -} while (0) - #define NET_DM_CFG_VERSION 0 #define NET_DM_CFG_ALERT_COUNT 1 #define NET_DM_CFG_ALERT_DELAY 2 -- cgit v1.2.3 From 9e6dd4c256d0774701637b958ba682eff4991277 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 14:09:59 -0700 Subject: netlink: specs: mptcp: replace underscores with dashes in names We're trying to add a strict regexp for the name format in the spec. Underscores will not be allowed, dashes should be used instead. This makes no difference to C (codegen, if used, replaces special chars in names) but it gives more uniform naming in Python. Fixes: bc8aeb2045e2 ("Documentation: netlink: add a YAML spec for mptcp") Reviewed-by: Davide Caratti Reviewed-by: Donald Hunter Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250624211002.3475021-8-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 8 ++++---- include/uapi/linux/mptcp_pm.h | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index dfd017780d2f..fb57860fe778 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -57,21 +57,21 @@ definitions: doc: >- A new subflow has been established. 'error' should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | - daddr6, sport, dport, backup, if_idx [, error]. + daddr6, sport, dport, backup, if-idx [, error]. - name: sub-closed doc: >- A subflow has been closed. An error (copy of sk_err) could be set if an error has been detected for this subflow. Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | - daddr6, sport, dport, backup, if_idx [, error]. + daddr6, sport, dport, backup, if-idx [, error]. 
- name: sub-priority value: 13 doc: >- The priority of a subflow has changed. 'error' should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | - daddr6, sport, dport, backup, if_idx [, error]. + daddr6, sport, dport, backup, if-idx [, error]. - name: listener-created value: 15 @@ -255,7 +255,7 @@ attribute-sets: name: timeout type: u32 - - name: if_idx + name: if-idx type: u32 - name: reset-reason diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index 84fa8a21dfd0..6ac84b2f636c 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -27,14 +27,14 @@ * token, rem_id. * @MPTCP_EVENT_SUB_ESTABLISHED: A new subflow has been established. 'error' * should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | - * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error]. + * saddr6, daddr4 | daddr6, sport, dport, backup, if-idx [, error]. * @MPTCP_EVENT_SUB_CLOSED: A subflow has been closed. An error (copy of * sk_err) could be set if an error has been detected for this subflow. * Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | - * daddr6, sport, dport, backup, if_idx [, error]. + * daddr6, sport, dport, backup, if-idx [, error]. * @MPTCP_EVENT_SUB_PRIORITY: The priority of a subflow has changed. 'error' * should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | - * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error]. + * saddr6, daddr4 | daddr6, sport, dport, backup, if-idx [, error]. * @MPTCP_EVENT_LISTENER_CREATED: A new PM listener is created. Attributes: * family, sport, saddr4 | saddr6. * @MPTCP_EVENT_LISTENER_CLOSED: A PM listener is closed. Attributes: family, -- cgit v1.2.3 From 7f15ee35972dd3dee37704bfd0f136290f6d63d9 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Thu, 26 Jun 2025 15:52:17 +0200 Subject: dpll: add reference-sync netlink attribute Add new netlink attribute to allow user space configuration of reference sync pin pairs, where both pins are used to provide one clock signal consisting of both: base frequency and sync signal. Reviewed-by: Przemek Kitszel Reviewed-by: Milena Olech Reviewed-by: Jiri Pirko Signed-off-by: Arkadiusz Kubalewski Link: https://patch.msgid.link/20250626135219.1769350-2-arkadiusz.kubalewski@intel.com Signed-off-by: Jakub Kicinski --- Documentation/driver-api/dpll.rst | 25 +++++++++++++++++++++++++ Documentation/netlink/specs/dpll.yaml | 19 +++++++++++++++++++ drivers/dpll/dpll_nl.c | 10 ++++++++-- drivers/dpll/dpll_nl.h | 1 + include/uapi/linux/dpll.h | 1 + 5 files changed, 54 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/driver-api/dpll.rst b/Documentation/driver-api/dpll.rst index 195e1e5d9a58..eca72d9b9ed8 100644 --- a/Documentation/driver-api/dpll.rst +++ b/Documentation/driver-api/dpll.rst @@ -253,6 +253,31 @@ the pin. ``DPLL_A_PIN_ESYNC_PULSE`` pulse type of Embedded SYNC ========================================= ================================= +Reference SYNC +============== + +The device may support the Reference SYNC feature, which allows the combination +of two inputs into an input pair. In this configuration, clock signals +from both inputs are used to synchronize the DPLL device. The higher frequency +signal is utilized for the loop bandwidth of the DPLL, while the lower frequency
signal is used to syntonize the output signal of the DPLL device.
This feature +enables the provision of a high-quality loop bandwidth signal from an external +source. + +A capable input provides a list of inputs that can be bound with to create +Reference SYNC. To control this feature, the user must request a desired +state for a target pin: use ``DPLL_PIN_STATE_CONNECTED`` to enable or +``DPLL_PIN_STATE_DISCONNECTED`` to disable the feature. An input pin can be +bound to only one other pin at any given time. + + ============================== ========================================== + ``DPLL_A_PIN_REFERENCE_SYNC`` nested attribute for providing info or + requesting configuration of the Reference + SYNC feature + ``DPLL_A_PIN_ID`` target pin id for Reference SYNC feature + ``DPLL_A_PIN_STATE`` state of Reference SYNC connection + ============================== ========================================== + Configuration commands group ============================ diff --git a/Documentation/netlink/specs/dpll.yaml b/Documentation/netlink/specs/dpll.yaml index c13440efab24..5decee61a2c4 100644 --- a/Documentation/netlink/specs/dpll.yaml +++ b/Documentation/netlink/specs/dpll.yaml @@ -428,6 +428,15 @@ attribute-sets: doc: | A ratio of high to low state of a SYNC signal pulse embedded into base clock frequency. Value is in percents. + - + name: reference-sync + type: nest + multi-attr: true + nested-attributes: reference-sync + doc: | + Capable pin provides list of pins that can be bound to create a + reference-sync pin pair. + - name: pin-parent-device subset-of: pin @@ -458,6 +467,14 @@ attribute-sets: name: frequency-min - name: frequency-max + - + name: reference-sync + subset-of: pin + attributes: + - + name: id + - + name: state operations: enum-name: dpll_cmd @@ -598,6 +615,7 @@ operations: - esync-frequency - esync-frequency-supported - esync-pulse + - reference-sync dump: request: @@ -625,6 +643,7 @@ operations: - parent-pin - phase-adjust - esync-frequency + - reference-sync - name: pin-create-ntf doc: Notification about pin appearing diff --git a/drivers/dpll/dpll_nl.c b/drivers/dpll/dpll_nl.c index 8de90310c3be..9f2efaf25268 100644 --- a/drivers/dpll/dpll_nl.c +++ b/drivers/dpll/dpll_nl.c @@ -24,6 +24,11 @@ const struct nla_policy dpll_pin_parent_pin_nl_policy[DPLL_A_PIN_STATE + 1] = { [DPLL_A_PIN_STATE] = NLA_POLICY_RANGE(NLA_U32, 1, 3), }; +const struct nla_policy dpll_reference_sync_nl_policy[DPLL_A_PIN_STATE + 1] = { + [DPLL_A_PIN_ID] = { .type = NLA_U32, }, + [DPLL_A_PIN_STATE] = NLA_POLICY_RANGE(NLA_U32, 1, 3), +}; + /* DPLL_CMD_DEVICE_ID_GET - do */ static const struct nla_policy dpll_device_id_get_nl_policy[DPLL_A_TYPE + 1] = { [DPLL_A_MODULE_NAME] = { .type = NLA_NUL_STRING, }, @@ -63,7 +68,7 @@ static const struct nla_policy dpll_pin_get_dump_nl_policy[DPLL_A_PIN_ID + 1] = }; /* DPLL_CMD_PIN_SET - do */ -static const struct nla_policy dpll_pin_set_nl_policy[DPLL_A_PIN_ESYNC_FREQUENCY + 1] = { +static const struct nla_policy dpll_pin_set_nl_policy[DPLL_A_PIN_REFERENCE_SYNC + 1] = { [DPLL_A_PIN_ID] = { .type = NLA_U32, }, [DPLL_A_PIN_FREQUENCY] = { .type = NLA_U64, }, [DPLL_A_PIN_DIRECTION] = NLA_POLICY_RANGE(NLA_U32, 1, 2), @@ -73,6 +78,7 @@ static const struct nla_policy dpll_pin_set_nl_policy[DPLL_A_PIN_ESYNC_FREQUENCY [DPLL_A_PIN_PARENT_PIN] = NLA_POLICY_NESTED(dpll_pin_parent_pin_nl_policy), [DPLL_A_PIN_PHASE_ADJUST] = { .type = NLA_S32, }, [DPLL_A_PIN_ESYNC_FREQUENCY] = { .type = NLA_U64, }, + [DPLL_A_PIN_REFERENCE_SYNC] = NLA_POLICY_NESTED(dpll_reference_sync_nl_policy), }; /* Ops table for dpll */ @@ -140,7 +146,7 @@ static const 
struct genl_split_ops dpll_nl_ops[] = { .doit = dpll_nl_pin_set_doit, .post_doit = dpll_pin_post_doit, .policy = dpll_pin_set_nl_policy, - .maxattr = DPLL_A_PIN_ESYNC_FREQUENCY, + .maxattr = DPLL_A_PIN_REFERENCE_SYNC, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, }; diff --git a/drivers/dpll/dpll_nl.h b/drivers/dpll/dpll_nl.h index f491262bee4f..3da10cfe9a6e 100644 --- a/drivers/dpll/dpll_nl.h +++ b/drivers/dpll/dpll_nl.h @@ -14,6 +14,7 @@ /* Common nested types */ extern const struct nla_policy dpll_pin_parent_device_nl_policy[DPLL_A_PIN_PHASE_OFFSET + 1]; extern const struct nla_policy dpll_pin_parent_pin_nl_policy[DPLL_A_PIN_STATE + 1]; +extern const struct nla_policy dpll_reference_sync_nl_policy[DPLL_A_PIN_STATE + 1]; int dpll_lock_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h index 349e1b3ca1ae..37b438ce8efc 100644 --- a/include/uapi/linux/dpll.h +++ b/include/uapi/linux/dpll.h @@ -249,6 +249,7 @@ enum dpll_a_pin { DPLL_A_PIN_ESYNC_FREQUENCY, DPLL_A_PIN_ESYNC_FREQUENCY_SUPPORTED, DPLL_A_PIN_ESYNC_PULSE, + DPLL_A_PIN_REFERENCE_SYNC, __DPLL_A_PIN_MAX, DPLL_A_PIN_MAX = (__DPLL_A_PIN_MAX - 1) -- cgit v1.2.3 From 94b2030968be70b33fed9a5514a5967c7f20aebc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 30 Jun 2025 11:36:54 -0600 Subject: io_uring: remove errant ';' from IORING_CQE_F_TSTAMP_HW definition An errant ';' slipped into that definition, which will cause some compilers to complain when it's used in an application: timestamp.c:257:45: error: empty expression statement has no effect; remove unnecessary ';' to silence this warning [-Werror,-Wextra-semi-stmt] 257 | hwts = cqe->flags & IORING_CQE_F_TSTAMP_HW; | ^ Fixes: 9e4ed359b8ef ("io_uring/netcmd: add tx timestamping cmd support") Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 85600ad0ac08..b6be063693c8 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -980,7 +980,7 @@ enum io_uring_socket_op { /* The cqe->flags bit from which the timestamp type is stored */ #define IORING_TIMESTAMP_TYPE_SHIFT (IORING_TIMESTAMP_HW_SHIFT + 1) /* The cqe->flags flag signifying whether it's a hardware timestamp */ -#define IORING_CQE_F_TSTAMP_HW ((__u32)1 << IORING_TIMESTAMP_HW_SHIFT); +#define IORING_CQE_F_TSTAMP_HW ((__u32)1 << IORING_TIMESTAMP_HW_SHIFT) struct io_timespec { __u64 tv_sec; -- cgit v1.2.3 From 03dc03fa0432a9160c4fcbdb86f274e6b4587972 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 26 Jun 2025 10:31:10 +0300 Subject: neighbor: Add NTF_EXT_VALIDATED flag for externally validated entries tl;dr ===== Add a new neighbor flag ("extern_valid") that can be used to indicate to the kernel that a neighbor entry was learned and determined to be valid externally. The kernel will not try to remove or invalidate such an entry, leaving these decisions to the user space control plane. This is needed for EVPN multi-homing where a neighbor entry for a multi-homed host needs to be synced across all the VTEPs among which the host is multi-homed. Background ========== In a typical EVPN multi-homing setup each host is multi-homed using a set of links called ES (Ethernet Segment, i.e., LAG) to multiple leaf switches (VTEPs). VTEPs that are connected to the same ES are called ES peers. 
When a neighbor entry is learned on a VTEP, it is distributed to both ES peers and remote VTEPs using EVPN MAC/IP advertisement routes. ES peers use the neighbor entry when routing traffic towards the multi-homed host and remote VTEPs use it for ARP/NS suppression. Motivation ========== If the ES link between a host and the VTEP on which the neighbor entry was locally learned goes down, the EVPN MAC/IP advertisement route will be withdrawn and the neighbor entries will be removed from both ES peers and remote VTEPs. Routing towards the multi-homed host and ARP/NS suppression can fail until another ES peer locally learns the neighbor entry and distributes it via an EVPN MAC/IP advertisement route. "draft-rbickhart-evpn-ip-mac-proxy-adv-03" [1] suggests avoiding these intermittent failures by having the ES peers install the neighbor entries as before, but also injecting EVPN MAC/IP advertisement routes with a proxy indication. When the previously mentioned ES link goes down and the original EVPN MAC/IP advertisement route is withdrawn, the ES peers will not withdraw their neighbor entries, but instead start aging timers for the proxy indication. If an ES peer locally learns the neighbor entry (i.e., it becomes "reachable"), it will restart its aging timer for the entry and emit an EVPN MAC/IP advertisement route without a proxy indication. An ES peer will stop its aging timer for the proxy indication if it observes the removal of the proxy indication from at least one of the ES peers advertising the entry. In the event that the aging timer for the proxy indication expired, an ES peer will withdraw its EVPN MAC/IP advertisement route. If the timer expired on all ES peers and they all withdrew their proxy advertisements, the neighbor entry will be completely removed from the EVPN fabric. Implementation ============== In the above scheme, when the control plane (e.g., FRR) advertises a neighbor entry with a proxy indication, it expects the corresponding entry in the data plane (i.e., the kernel) to remain valid and not be removed due to garbage collection or loss of carrier. The control plane also expects the kernel to notify it if the entry was learned locally (i.e., became "reachable") so that it will remove the proxy indication from the EVPN MAC/IP advertisement route. That is why these entries cannot be programmed with dummy states such as "permanent" or "noarp". Instead, add a new neighbor flag ("extern_valid") which indicates that the entry was learned and determined to be valid externally and should not be removed or invalidated by the kernel. The kernel can probe the entry and notify user space when it becomes "reachable" (it is initially installed as "stale"). However, if the kernel does not receive a confirmation, have it return the entry to the "stale" state instead of the "failed" state. In other words, an entry marked with the "extern_valid" flag behaves like any other dynamically learned entry other than the fact that the kernel cannot remove or invalidate it. One can argue that the "extern_valid" flag should not prevent garbage collection and that instead a neighbor entry should be programmed with both the "extern_valid" and "extern_learn" flags. There are two reasons for not doing that: 1. Unclear why a control plane would like to program an entry that the kernel cannot invalidate but can completely remove. 2. The "extern_learn" flag is used by FRR for neighbor entries learned on remote VTEPs (for ARP/NS suppression) whereas here we are concerned with local entries. 
This distinction is currently irrelevant for the kernel, but might be relevant in the future. Given that the flag only makes sense when the neighbor has a valid state, reject attempts to add a neighbor with an invalid state and with this flag set. For example: # ip neigh add 192.0.2.1 nud none dev br0.10 extern_valid Error: Cannot create externally validated neighbor with an invalid state. # ip neigh add 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid # ip neigh replace 192.0.2.1 nud failed dev br0.10 extern_valid Error: Cannot mark neighbor as externally validated with an invalid state. The above means that a neighbor cannot be created with the "extern_valid" flag and flags such as "use" or "managed" as they result in a neighbor being created with an invalid state ("none") and immediately getting probed: # ip neigh add 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid use Error: Cannot create externally validated neighbor with an invalid state. However, these flags can be used together with "extern_valid" after the neighbor was created with a valid state: # ip neigh add 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid # ip neigh replace 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid use One consequence of preventing the kernel from invalidating a neighbor entry is that by default it will only try to determine reachability using unicast probes. This can be changed using the "mcast_resolicit" sysctl: # sysctl net.ipv4.neigh.br0/10.mcast_resolicit 0 # tcpdump -nn -e -i br0.10 -Q out arp & # ip neigh replace 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid use 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 # sysctl -wq net.ipv4.neigh.br0/10.mcast_resolicit=3 # ip neigh replace 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid use 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 62:50:1d:11:93:6f > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 62:50:1d:11:93:6f > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 62:50:1d:11:93:6f > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 iproute2 patches can be found here [2]. 
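For reference, a minimal C sketch of how a control plane could program such an entry directly over rtnetlink, i.e. the programmatic equivalent of the "ip neigh add ... extern_valid" command above. This is illustrative only: netlink socket setup, ACK handling and error checking are omitted, and only the uapi definitions added by this patch are assumed.

#include <arpa/inet.h>
#include <linux/neighbour.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <string.h>
#include <sys/socket.h>

/* Equivalent of "ip neigh add 192.0.2.1 lladdr 00:11:22:33:44:55
 * nud stale dev <ifindex> extern_valid"; nlfd is a bound NETLINK_ROUTE socket.
 */
static int add_extern_valid_neigh(int nlfd, int ifindex)
{
	struct {
		struct nlmsghdr nh;
		struct ndmsg ndm;
		char attrs[64];
	} req;
	unsigned char lladdr[6] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };
	__u32 ext_flags = NTF_EXT_EXT_VALIDATED;	/* sent as NDA_FLAGS_EXT */
	struct in_addr dst;
	struct rtattr *rta;

	inet_pton(AF_INET, "192.0.2.1", &dst);
	memset(&req, 0, sizeof(req));
	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg));
	req.nh.nlmsg_type = RTM_NEWNEIGH;
	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK;
	req.ndm.ndm_family = AF_INET;
	req.ndm.ndm_ifindex = ifindex;
	req.ndm.ndm_state = NUD_STALE;	/* must be a valid state, see above */

	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nh.nlmsg_len));
	rta->rta_type = NDA_DST;
	rta->rta_len = RTA_LENGTH(sizeof(dst));
	memcpy(RTA_DATA(rta), &dst, sizeof(dst));
	req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + rta->rta_len;

	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nh.nlmsg_len));
	rta->rta_type = NDA_LLADDR;
	rta->rta_len = RTA_LENGTH(sizeof(lladdr));
	memcpy(RTA_DATA(rta), lladdr, sizeof(lladdr));
	req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + rta->rta_len;

	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nh.nlmsg_len));
	rta->rta_type = NDA_FLAGS_EXT;
	rta->rta_len = RTA_LENGTH(sizeof(ext_flags));
	memcpy(RTA_DATA(rta), &ext_flags, sizeof(ext_flags));
	req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + rta->rta_len;

	return send(nlfd, &req, req.nh.nlmsg_len, 0) < 0 ? -1 : 0;
}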
[1] https://datatracker.ietf.org/doc/html/draft-rbickhart-evpn-ip-mac-proxy-adv-03 [2] https://github.com/idosch/iproute2/tree/submit/extern_valid_v1 Signed-off-by: Ido Schimmel Acked-by: Daniel Borkmann Link: https://patch.msgid.link/20250626073111.244534-2-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-neigh.yaml | 1 + include/net/neighbour.h | 4 +- include/uapi/linux/neighbour.h | 5 ++ net/core/neighbour.c | 79 +++++++++++++++++++++++++++---- 4 files changed, 78 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/rt-neigh.yaml b/Documentation/netlink/specs/rt-neigh.yaml index 25cc2d528d2f..30a9ee16f128 100644 --- a/Documentation/netlink/specs/rt-neigh.yaml +++ b/Documentation/netlink/specs/rt-neigh.yaml @@ -79,6 +79,7 @@ definitions: entries: - managed - locked + - ext-validated - name: rtm-type type: enum diff --git a/include/net/neighbour.h b/include/net/neighbour.h index c7ce5ec7be23..7e865b14749d 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -261,13 +261,15 @@ static inline void *neighbour_priv(const struct neighbour *n) #define NEIGH_UPDATE_F_EXT_LEARNED BIT(5) #define NEIGH_UPDATE_F_ISROUTER BIT(6) #define NEIGH_UPDATE_F_ADMIN BIT(7) +#define NEIGH_UPDATE_F_EXT_VALIDATED BIT(8) /* In-kernel representation for NDA_FLAGS_EXT flags: */ #define NTF_OLD_MASK 0xff #define NTF_EXT_SHIFT 8 -#define NTF_EXT_MASK (NTF_EXT_MANAGED) +#define NTF_EXT_MASK (NTF_EXT_MANAGED | NTF_EXT_EXT_VALIDATED) #define NTF_MANAGED (NTF_EXT_MANAGED << NTF_EXT_SHIFT) +#define NTF_EXT_VALIDATED (NTF_EXT_EXT_VALIDATED << NTF_EXT_SHIFT) extern const struct nla_policy nda_policy[]; diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index b851c36ad25d..c34a81245f87 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -54,6 +54,7 @@ enum { /* Extended flags under NDA_FLAGS_EXT: */ #define NTF_EXT_MANAGED (1 << 0) #define NTF_EXT_LOCKED (1 << 1) +#define NTF_EXT_EXT_VALIDATED (1 << 2) /* * Neighbor Cache Entry States. @@ -92,6 +93,10 @@ enum { * bridge in response to a host trying to communicate via a locked bridge port * with MAB enabled. Their purpose is to notify user space that a host requires * authentication. + * + * NTF_EXT_EXT_VALIDATED flagged neighbor entries were externally validated by + * a user space control plane. The kernel will not remove or invalidate them, + * but it can probe them and notify user space when they become reachable. */ struct nda_cacheinfo { diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8ad9898f8e42..e5f0992ac364 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -154,11 +154,12 @@ static void neigh_update_gc_list(struct neighbour *n) if (n->dead) goto out; - /* remove from the gc list if new state is permanent or if neighbor - * is externally learned; otherwise entry should be on the gc list + /* remove from the gc list if new state is permanent or if neighbor is + * externally learned / validated; otherwise entry should be on the gc + * list */ exempt_from_gc = n->nud_state & NUD_PERMANENT || - n->flags & NTF_EXT_LEARNED; + n->flags & (NTF_EXT_LEARNED | NTF_EXT_VALIDATED); on_gc_list = !list_empty(&n->gc_list); if (exempt_from_gc && on_gc_list) { @@ -205,6 +206,7 @@ static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; ndm_flags |= (flags & NEIGH_UPDATE_F_MANAGED) ? 
NTF_MANAGED : 0; + ndm_flags |= (flags & NEIGH_UPDATE_F_EXT_VALIDATED) ? NTF_EXT_VALIDATED : 0; if ((old_flags ^ ndm_flags) & NTF_EXT_LEARNED) { if (ndm_flags & NTF_EXT_LEARNED) @@ -222,6 +224,14 @@ static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, *notify = 1; *managed_update = true; } + if ((old_flags ^ ndm_flags) & NTF_EXT_VALIDATED) { + if (ndm_flags & NTF_EXT_VALIDATED) + neigh->flags |= NTF_EXT_VALIDATED; + else + neigh->flags &= ~NTF_EXT_VALIDATED; + *notify = 1; + *gc_update = true; + } } bool neigh_remove_one(struct neighbour *n) @@ -379,7 +389,9 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, dev_head = neigh_get_dev_table(dev, tbl->family); hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) { - if (skip_perm && n->nud_state & NUD_PERMANENT) + if (skip_perm && + (n->nud_state & NUD_PERMANENT || + n->flags & NTF_EXT_VALIDATED)) continue; hlist_del_rcu(&n->hash); @@ -942,7 +954,8 @@ static void neigh_periodic_work(struct work_struct *work) state = n->nud_state; if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) || - (n->flags & NTF_EXT_LEARNED)) { + (n->flags & + (NTF_EXT_LEARNED | NTF_EXT_VALIDATED))) { write_unlock(&n->lock); continue; } @@ -1095,9 +1108,15 @@ static void neigh_timer_handler(struct timer_list *t) if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { - WRITE_ONCE(neigh->nud_state, NUD_FAILED); + if (neigh->nud_state == NUD_PROBE && + neigh->flags & NTF_EXT_VALIDATED) { + WRITE_ONCE(neigh->nud_state, NUD_STALE); + neigh->updated = jiffies; + } else { + WRITE_ONCE(neigh->nud_state, NUD_FAILED); + neigh_invalidate(neigh); + } notify = 1; - neigh_invalidate(neigh); goto out; } @@ -1245,6 +1264,8 @@ static void neigh_update_hhs(struct neighbour *neigh) NTF_ROUTER flag. NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as a router. + NEIGH_UPDATE_F_EXT_VALIDATED means that the entry will not be removed + or invalidated. Caller MUST hold reference count on the entry. */ @@ -1979,7 +2000,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; - if (ndm_flags & NTF_MANAGED) { + if (ndm_flags & (NTF_MANAGED | NTF_EXT_VALIDATED)) { NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination"); goto out; } @@ -2010,7 +2031,8 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (neigh == NULL) { bool ndm_permanent = ndm->ndm_state & NUD_PERMANENT; bool exempt_from_gc = ndm_permanent || - ndm_flags & NTF_EXT_LEARNED; + ndm_flags & (NTF_EXT_LEARNED | + NTF_EXT_VALIDATED); if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { err = -ENOENT; @@ -2021,10 +2043,27 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, err = -EINVAL; goto out; } + if (ndm_flags & NTF_EXT_VALIDATED) { + u8 state = ndm->ndm_state; + + /* NTF_USE and NTF_MANAGED will result in the neighbor + * being created with an invalid state (NUD_NONE). 
+ */ + if (ndm_flags & (NTF_USE | NTF_MANAGED)) + state = NUD_NONE; + + if (!(state & NUD_VALID)) { + NL_SET_ERR_MSG(extack, + "Cannot create externally validated neighbor with an invalid state"); + err = -EINVAL; + goto out; + } + } neigh = ___neigh_create(tbl, dst, dev, ndm_flags & - (NTF_EXT_LEARNED | NTF_MANAGED), + (NTF_EXT_LEARNED | NTF_MANAGED | + NTF_EXT_VALIDATED), exempt_from_gc, true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); @@ -2036,6 +2075,24 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, neigh_release(neigh); goto out; } + if (ndm_flags & NTF_EXT_VALIDATED) { + u8 state = ndm->ndm_state; + + /* NTF_USE and NTF_MANAGED do not update the existing + * state other than clearing it if it was + * NUD_PERMANENT. + */ + if (ndm_flags & (NTF_USE | NTF_MANAGED)) + state = READ_ONCE(neigh->nud_state) & ~NUD_PERMANENT; + + if (!(state & NUD_VALID)) { + NL_SET_ERR_MSG(extack, + "Cannot mark neighbor as externally validated with an invalid state"); + err = -EINVAL; + neigh_release(neigh); + goto out; + } + } if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) flags &= ~(NEIGH_UPDATE_F_OVERRIDE | @@ -2052,6 +2109,8 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, flags |= NEIGH_UPDATE_F_MANAGED; if (ndm_flags & NTF_USE) flags |= NEIGH_UPDATE_F_USE; + if (ndm_flags & NTF_EXT_VALIDATED) + flags |= NEIGH_UPDATE_F_EXT_VALIDATED; err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, NETLINK_CB(skb).portid, extack); -- cgit v1.2.3 From 763ff02ce287c2e5c8a012d40bd2f3dab99ae5d5 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 20 Jun 2025 09:10:03 -0600 Subject: ublk: allow UBLK_IO_(UN)REGISTER_IO_BUF on any task Currently, UBLK_IO_REGISTER_IO_BUF and UBLK_IO_UNREGISTER_IO_BUF are only permitted on the ublk_io's daemon task. But this restriction is unnecessary. ublk_register_io_buf() calls __ublk_check_and_get_req() to look up the request from the tagset and atomically take a reference on the request without accessing the ublk_io. ublk_unregister_io_buf() doesn't use the q_id or tag at all. So allow these opcodes even on tasks other than io->task. Handle UBLK_IO_UNREGISTER_IO_BUF before obtaining the ubq and io since the buffer index being unregistered is not necessarily related to the specified q_id and tag. Add a feature flag UBLK_F_BUF_REG_OFF_DAEMON that userspace can use to determine whether the kernel supports off-daemon buffer registration. 
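As a usage note, a userspace server can gate off-daemon registration on the new feature bit. A hedged sketch, assuming 'info' was already filled in by a UBLK_U_CMD_GET_DEV_INFO control command:

#include <stdbool.h>
#include <linux/ublk_cmd.h>

/* Only route buffer (un)registration to arbitrary threads if supported */
static bool buf_reg_off_daemon_ok(const struct ublksrv_ctrl_dev_info *info)
{
	return info->flags & UBLK_F_BUF_REG_OFF_DAEMON;
}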
Suggested-by: Ming Lei Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250620151008.3976463-10-csander@purestorage.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 27 ++++++++++++++++++++++----- include/uapi/linux/ublk_cmd.h | 10 ++++++++++ 2 files changed, 32 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 0c244fe76d27..fa1859c06211 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -70,7 +70,8 @@ | UBLK_F_UPDATE_SIZE \ | UBLK_F_AUTO_BUF_REG \ | UBLK_F_QUIESCE \ - | UBLK_F_PER_IO_DAEMON) + | UBLK_F_PER_IO_DAEMON \ + | UBLK_F_BUF_REG_OFF_DAEMON) #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ | UBLK_F_USER_RECOVERY_REISSUE \ @@ -2204,6 +2205,14 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, if (ret) goto out; + /* + * io_buffer_unregister_bvec() doesn't access the ubq or io, + * so no need to validate the q_id, tag, or task + */ + if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF) + return ublk_unregister_io_buf(cmd, ub, ub_cmd->addr, + issue_flags); + ret = -EINVAL; if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues) goto out; @@ -2224,8 +2233,17 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, return -EIOCBQUEUED; } - if (READ_ONCE(io->task) != current) + if (READ_ONCE(io->task) != current) { + /* + * ublk_register_io_buf() accesses only the io's refcount, + * so can be handled on any task + */ + if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF) + return ublk_register_io_buf(cmd, ubq, io, ub_cmd->addr, + issue_flags); + goto out; + } /* there is pending io cmd, something must be wrong */ if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) { @@ -2244,8 +2262,6 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, switch (_IOC_NR(cmd_op)) { case UBLK_IO_REGISTER_IO_BUF: return ublk_register_io_buf(cmd, ubq, io, ub_cmd->addr, issue_flags); - case UBLK_IO_UNREGISTER_IO_BUF: - return ublk_unregister_io_buf(cmd, ub, ub_cmd->addr, issue_flags); case UBLK_IO_COMMIT_AND_FETCH_REQ: ret = ublk_commit_and_fetch(ubq, io, cmd, ub_cmd, issue_flags); if (ret) @@ -2961,7 +2977,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE | UBLK_F_URING_CMD_COMP_IN_TASK | - UBLK_F_PER_IO_DAEMON; + UBLK_F_PER_IO_DAEMON | + UBLK_F_BUF_REG_OFF_DAEMON; /* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */ if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY | diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index c9751bdfd937..ec77dabba45b 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -301,6 +301,16 @@ */ #define UBLK_F_PER_IO_DAEMON (1ULL << 13) +/* + * If this feature is set, UBLK_U_IO_REGISTER_IO_BUF/UBLK_U_IO_UNREGISTER_IO_BUF + * can be issued for an I/O on any task. q_id and tag are also ignored in + * UBLK_U_IO_UNREGISTER_IO_BUF's ublksrv_io_cmd. + * If it is unset, zero-copy buffers can only be registered and unregistered by + * the I/O's daemon task. The q_id and tag of the registered buffer are required + * in UBLK_U_IO_UNREGISTER_IO_BUF's ublksrv_io_cmd. 
+ */ +#define UBLK_F_BUF_REG_OFF_DAEMON (1ULL << 14) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 -- cgit v1.2.3 From 9eb22f7fedfc9eb1b7f431a5359abd4d15b0b0cd Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Mon, 30 Jun 2025 14:35:48 +0530 Subject: fs: add ioctl to query metadata and protection info capabilities Add a new ioctl, FS_IOC_GETLBMD_CAP, to query metadata and protection info (PI) capabilities. This ioctl returns information about the file's integrity profile. This is useful for userspace applications to understand a file's end-to-end data protection support and configure the I/O accordingly. For now this interface is only supported by block devices. However, the design and placement of this ioctl in the generic FS ioctl space allows us to extend it to work over files as well. This may be useful when filesystems start supporting PI-aware layouts. A new structure, struct logical_block_metadata_cap, is introduced, which contains the following fields: 1. lbmd_flags: bitmask of logical block metadata capability flags 2. lbmd_interval: the amount of data described by each unit of logical block metadata 3. lbmd_size: size in bytes of the logical block metadata associated with each interval 4. lbmd_opaque_size: size in bytes of the opaque block tag associated with each interval 5. lbmd_opaque_offset: offset in bytes of the opaque block tag within the logical block metadata 6. lbmd_pi_size: size in bytes of the T10 PI tuple associated with each interval 7. lbmd_pi_offset: offset in bytes of the T10 PI tuple within the logical block metadata 8. lbmd_pi_guard_tag_type: T10 PI guard tag type 9. lbmd_pi_app_tag_size: size in bytes of the T10 PI application tag 10. lbmd_pi_ref_tag_size: size in bytes of the T10 PI reference tag 11. lbmd_pi_storage_tag_size: size in bytes of the T10 PI storage tag The internal logic to fetch the capability is encapsulated in a helper function, blk_get_meta_cap(), which uses the blk_integrity profile associated with the device. The ioctl returns -EOPNOTSUPP if CONFIG_BLK_DEV_INTEGRITY is not enabled. Suggested-by: Martin K. Petersen Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/20250630090548.3317-5-anuj20.g@samsung.com Reviewed-by: Martin K.
Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- block/blk-integrity.c | 52 ++++++++++++++++++++++++++++++++++++++ block/ioctl.c | 4 +++ include/linux/blk-integrity.h | 7 +++++ include/uapi/linux/fs.h | 59 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 122 insertions(+) (limited to 'include/uapi/linux') diff --git a/block/blk-integrity.c b/block/blk-integrity.c index c1102bf4cd8d..9d9dc9c32083 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "blk.h" @@ -54,6 +55,57 @@ new_segment: return segments; } +int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd, + struct logical_block_metadata_cap __user *argp) +{ + struct blk_integrity *bi = blk_get_integrity(bdev->bd_disk); + struct logical_block_metadata_cap meta_cap = {}; + size_t usize = _IOC_SIZE(cmd); + + if (!argp) + return -EINVAL; + if (usize < LBMD_SIZE_VER0) + return -EINVAL; + if (!bi) + goto out; + + if (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) + meta_cap.lbmd_flags |= LBMD_PI_CAP_INTEGRITY; + if (bi->flags & BLK_INTEGRITY_REF_TAG) + meta_cap.lbmd_flags |= LBMD_PI_CAP_REFTAG; + meta_cap.lbmd_interval = 1 << bi->interval_exp; + meta_cap.lbmd_size = bi->metadata_size; + meta_cap.lbmd_pi_size = bi->pi_tuple_size; + meta_cap.lbmd_pi_offset = bi->pi_offset; + meta_cap.lbmd_opaque_size = bi->metadata_size - bi->pi_tuple_size; + if (meta_cap.lbmd_opaque_size && !bi->pi_offset) + meta_cap.lbmd_opaque_offset = bi->pi_tuple_size; + + meta_cap.lbmd_guard_tag_type = bi->csum_type; + if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE) + meta_cap.lbmd_app_tag_size = 2; + + if (bi->flags & BLK_INTEGRITY_REF_TAG) { + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + meta_cap.lbmd_ref_tag_size = + sizeof_field(struct crc64_pi_tuple, ref_tag); + break; + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + meta_cap.lbmd_ref_tag_size = + sizeof_field(struct t10_pi_tuple, ref_tag); + break; + default: + break; + } + } + +out: + return copy_struct_to_user(argp, usize, &meta_cap, sizeof(meta_cap), + NULL); +} + /** * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist * @rq: request to map diff --git a/block/ioctl.c b/block/ioctl.c index e472cc1030c6..9ad403733e19 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "blk.h" #include "blk-crypto-internal.h" @@ -566,6 +567,9 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, { unsigned int max_sectors; + if (_IOC_NR(cmd) == _IOC_NR(FS_IOC_GETLBMD_CAP)) + return blk_get_meta_cap(bdev, cmd, argp); + switch (cmd) { case BLKFLSBUF: return blkdev_flushbuf(bdev, cmd, arg); diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index d27730da47f3..e04c6e5bf1c6 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -29,6 +29,8 @@ int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes); +int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd, + struct logical_block_metadata_cap __user *argp); static inline bool blk_integrity_queue_supports_integrity(struct request_queue *q) @@ -92,6 +94,11 @@ static inline struct bio_vec rq_integrity_vec(struct request *rq) rq->bio->bi_integrity->bip_iter); } #else /* CONFIG_BLK_DEV_INTEGRITY */ +static inline int 
blk_get_meta_cap(struct block_device *bdev, unsigned int cmd, + struct logical_block_metadata_cap __user *argp) +{ + return -EOPNOTSUPP; +} static inline int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *b) { diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 0098b0ce8ccb..83720a2fd20d 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -91,6 +91,63 @@ struct fs_sysfs_path { __u8 name[128]; }; +/* Protection info capability flags */ +#define LBMD_PI_CAP_INTEGRITY (1 << 0) +#define LBMD_PI_CAP_REFTAG (1 << 1) + +/* Checksum types for Protection Information */ +#define LBMD_PI_CSUM_NONE 0 +#define LBMD_PI_CSUM_IP 1 +#define LBMD_PI_CSUM_CRC16_T10DIF 2 +#define LBMD_PI_CSUM_CRC64_NVME 4 + +/* sizeof first published struct */ +#define LBMD_SIZE_VER0 16 + +/* + * Logical block metadata capability descriptor + * If the device does not support metadata, all the fields will be zero. + * Applications must check lbmd_flags to determine whether metadata is + * supported or not. + */ +struct logical_block_metadata_cap { + /* Bitmask of logical block metadata capability flags */ + __u32 lbmd_flags; + /* + * The amount of data described by each unit of logical block + * metadata + */ + __u16 lbmd_interval; + /* + * Size in bytes of the logical block metadata associated with each + * interval + */ + __u8 lbmd_size; + /* + * Size in bytes of the opaque block tag associated with each + * interval + */ + __u8 lbmd_opaque_size; + /* + * Offset in bytes of the opaque block tag within the logical block + * metadata + */ + __u8 lbmd_opaque_offset; + /* Size in bytes of the T10 PI tuple associated with each interval */ + __u8 lbmd_pi_size; + /* Offset in bytes of T10 PI tuple within the logical block metadata */ + __u8 lbmd_pi_offset; + /* T10 PI guard tag type */ + __u8 lbmd_guard_tag_type; + /* Size in bytes of the T10 PI application tag */ + __u8 lbmd_app_tag_size; + /* Size in bytes of the T10 PI reference tag */ + __u8 lbmd_ref_tag_size; + /* Size in bytes of the T10 PI storage tag */ + __u8 lbmd_storage_tag_size; + __u8 pad; +}; + /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */ #define FILE_DEDUPE_RANGE_SAME 0 #define FILE_DEDUPE_RANGE_DIFFERS 1 @@ -247,6 +304,8 @@ struct fsxattr { * also /sys/kernel/debug/ for filesystems with debugfs exports */ #define FS_IOC_GETFSSYSFSPATH _IOR(0x15, 1, struct fs_sysfs_path) +/* Get logical block metadata capability details */ +#define FS_IOC_GETLBMD_CAP _IOWR(0x15, 2, struct logical_block_metadata_cap) /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) -- cgit v1.2.3 From 3a0ae385f69e9b2d87c9b017c4ffb5567c015197 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Jun 2025 19:16:51 +0100 Subject: io_uring/mock: add basic infra for test mock files io_uring commands provide an ioctl-style interface for files to implement file-specific operations. io_uring provides many features and an advanced API to commands, and it's getting hard to test as it requires specific files/devices. Add basic infrastructure for creating special mock files that will be implementing the cmd API and using various io_uring features we want to test. It'll also be useful to test some more obscure read/write/polling edge cases in the future.
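As a rough usage sketch of the manager interface added below (requires CAP_SYS_ADMIN; assumes the miscdevice surfaces as /dev/io_uring_mock, matching the device name in the patch, and that the new uapi header is installed):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <liburing.h>
#include <linux/io_uring/mock_file.h>

int main(void)
{
	struct io_uring_mock_probe mp;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct io_uring ring;
	int fd = open("/dev/io_uring_mock", O_RDWR);

	if (fd < 0 || io_uring_queue_init(8, &ring, 0))
		return 1;

	/* the probe argument must be zeroed, see io_probe_mock() below */
	memset(&mp, 0, sizeof(mp));

	sqe = io_uring_get_sqe(&ring);
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = fd;
	sqe->cmd_op = IORING_MOCK_MGR_CMD_PROBE;
	sqe->addr = (unsigned long)&mp;
	sqe->len = sizeof(mp);

	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	printf("res %d, features 0x%llx\n", cqe->res,
	       (unsigned long long)mp.features);
	io_uring_cqe_seen(&ring, cqe);
	return 0;
}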
Suggested-by: chase xd Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/93f21b0af58c1367a2b22635d5a7d694ad0272fc.1750599274.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- MAINTAINERS | 1 + include/uapi/linux/io_uring/mock_file.h | 22 +++++ init/Kconfig | 11 +++ io_uring/Makefile | 1 + io_uring/mock_file.c | 148 ++++++++++++++++++++++++++++ 5 files changed, 183 insertions(+) create mode 100644 include/uapi/linux/io_uring/mock_file.h create mode 100644 io_uring/mock_file.c (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index c3f7fbd0d67a..24e11687f8b6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12679,6 +12679,7 @@ F: include/linux/io_uring.h F: include/linux/io_uring_types.h F: include/trace/events/io_uring.h F: include/uapi/linux/io_uring.h +F: include/uapi/linux/io_uring/ F: io_uring/ IPMI SUBSYSTEM diff --git a/include/uapi/linux/io_uring/mock_file.h b/include/uapi/linux/io_uring/mock_file.h new file mode 100644 index 000000000000..a44273fd526d --- /dev/null +++ b/include/uapi/linux/io_uring/mock_file.h @@ -0,0 +1,22 @@ +#ifndef LINUX_IO_URING_MOCK_FILE_H +#define LINUX_IO_URING_MOCK_FILE_H + +#include + +struct io_uring_mock_probe { + __u64 features; + __u64 __resv[9]; +}; + +struct io_uring_mock_create { + __u32 out_fd; + __u32 flags; + __u64 __resv[15]; +}; + +enum { + IORING_MOCK_MGR_CMD_PROBE, + IORING_MOCK_MGR_CMD_CREATE, +}; + +#endif diff --git a/init/Kconfig b/init/Kconfig index af4c2f085455..c40a7c65fb4c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1801,6 +1801,17 @@ config GCOV_PROFILE_URING the io_uring subsystem, hence this should only be enabled for specific test purposes. +config IO_URING_MOCK_FILE + tristate "Enable io_uring mock files (Experimental)" if EXPERT + default n + depends on IO_URING + help + Enable mock files for io_uring subsystem testing. The ABI might + still change, so it's still experimental and should only be enabled + for specific test purposes. + + If unsure, say N. + config ADVISE_SYSCALLS bool "Enable madvise/fadvise syscalls" if EXPERT default y diff --git a/io_uring/Makefile b/io_uring/Makefile index d97c6b51d584..b3f1bd492804 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -21,3 +21,4 @@ obj-$(CONFIG_EPOLL) += epoll.o obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o obj-$(CONFIG_NET) += net.o cmd_net.o obj-$(CONFIG_PROC_FS) += fdinfo.o +obj-$(CONFIG_IO_URING_MOCK_FILE) += mock_file.o diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c new file mode 100644 index 000000000000..3681d0b8d8de --- /dev/null +++ b/io_uring/mock_file.c @@ -0,0 +1,148 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static int io_mock_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + return -ENOTSUPP; +} + +static const struct file_operations io_mock_fops = { + .owner = THIS_MODULE, + .uring_cmd = io_mock_cmd, +}; + +static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + struct io_uring_mock_create mc, __user *uarg; + struct file *file = NULL; + size_t uarg_size; + int fd, ret; + + /* + * It's a testing only driver that allows exercising edge cases + * that wouldn't be possible to hit otherwise.
+ */ + add_taint(TAINT_TEST, LOCKDEP_STILL_OK); + + uarg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + uarg_size = READ_ONCE(sqe->len); + + if (sqe->ioprio || sqe->__pad1 || sqe->addr3 || sqe->file_index) + return -EINVAL; + if (uarg_size != sizeof(mc)) + return -EINVAL; + + memset(&mc, 0, sizeof(mc)); + if (copy_from_user(&mc, uarg, uarg_size)) + return -EFAULT; + if (!mem_is_zero(mc.__resv, sizeof(mc.__resv)) || mc.flags) + return -EINVAL; + + fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + if (fd < 0) + return fd; + + file = anon_inode_create_getfile("[io_uring_mock]", &io_mock_fops, + NULL, O_RDWR | O_CLOEXEC, NULL); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto fail; + } + + mc.out_fd = fd; + if (copy_to_user(uarg, &mc, uarg_size)) { + fput(file); + ret = -EFAULT; + goto fail; + } + + fd_install(fd, file); + return 0; +fail: + put_unused_fd(fd); + return ret; +} + +static int io_probe_mock(struct io_uring_cmd *cmd) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + struct io_uring_mock_probe mp, __user *uarg; + size_t uarg_size; + + uarg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + uarg_size = READ_ONCE(sqe->len); + + if (sqe->ioprio || sqe->__pad1 || sqe->addr3 || sqe->file_index || + uarg_size != sizeof(mp)) + return -EINVAL; + + memset(&mp, 0, sizeof(mp)); + if (copy_from_user(&mp, uarg, uarg_size)) + return -EFAULT; + if (!mem_is_zero(&mp, sizeof(mp))) + return -EINVAL; + + mp.features = 0; + + if (copy_to_user(uarg, &mp, uarg_size)) + return -EFAULT; + return 0; +} + +static int iou_mock_mgr_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd->cmd_op) { + case IORING_MOCK_MGR_CMD_PROBE: + return io_probe_mock(cmd); + case IORING_MOCK_MGR_CMD_CREATE: + return io_create_mock_file(cmd, issue_flags); + } + return -EOPNOTSUPP; +} + +static const struct file_operations iou_mock_dev_fops = { + .owner = THIS_MODULE, + .uring_cmd = iou_mock_mgr_cmd, +}; + +static struct miscdevice iou_mock_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "io_uring_mock", + .fops = &iou_mock_dev_fops, +}; + +static int __init io_mock_init(void) +{ + int ret; + + ret = misc_register(&iou_mock_miscdev); + if (ret < 0) { + pr_err("Could not initialize io_uring mock device\n"); + return ret; + } + return 0; +} + +static void __exit io_mock_exit(void) +{ + misc_deregister(&iou_mock_miscdev); +} + +module_init(io_mock_init) +module_exit(io_mock_exit) + +MODULE_AUTHOR("Pavel Begunkov "); +MODULE_DESCRIPTION("io_uring mock file"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 4aac001f780388b252534396feaf49b250eae27f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Jun 2025 19:16:52 +0100 Subject: io_uring/mock: add cmd using vectored regbufs There is a command API that allows importing vectored registered buffers. Add a new mock command that uses the feature and simply copies the specified registered buffer into user space, or vice versa.
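A hedged userspace sketch of the new command, with field placement taken from the parsing above (sqe->addr carries the iovec array, sqe->len the iovec count, sqe->addr3 the plain user buffer, sqe->file_index the flags). Selecting the registered buffer through sqe->buf_index, and whether IORING_URING_CMD_FIXED must also be set in sqe->uring_cmd_flags, are assumptions here; the buffer must have been registered with io_uring_register_buffers() beforehand.

#include <sys/uio.h>
#include <string.h>
#include <liburing.h>
#include <linux/io_uring/mock_file.h>

/* Copy 'len' bytes of registered buffer 0 out of the mock file into 'dst' */
static int mock_copy_from_regbuf(struct io_uring *ring, int mock_fd,
				 void *reg_base, size_t len, void *dst)
{
	struct iovec vec = { .iov_base = reg_base, .iov_len = len };
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = mock_fd;
	sqe->cmd_op = IORING_MOCK_CMD_COPY_REGBUF;
	sqe->addr = (unsigned long)&vec;	 /* iovec array */
	sqe->len = 1;				 /* iovec count */
	sqe->addr3 = (unsigned long)dst;	 /* plain user destination */
	sqe->file_index = IORING_MOCK_COPY_FROM; /* regbuf -> user memory */
	sqe->buf_index = 0;			 /* registered buffer 0 (assumed) */

	io_uring_submit(ring);
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;				 /* bytes copied on success */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}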
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/229a113fd7de6b27dbef9567f7c0bf4475c9017d.1750599274.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring/mock_file.h | 14 +++++++ io_uring/mock_file.c | 70 ++++++++++++++++++++++++++++++++- 2 files changed, 83 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring/mock_file.h b/include/uapi/linux/io_uring/mock_file.h index a44273fd526d..73aca477d5c8 100644 --- a/include/uapi/linux/io_uring/mock_file.h +++ b/include/uapi/linux/io_uring/mock_file.h @@ -3,6 +3,12 @@ #include +enum { + IORING_MOCK_FEAT_CMD_COPY, + + IORING_MOCK_FEAT_END, +}; + struct io_uring_mock_probe { __u64 features; __u64 __resv[9]; @@ -19,4 +25,12 @@ enum { IORING_MOCK_MGR_CMD_CREATE, }; +enum { + IORING_MOCK_CMD_COPY_REGBUF, +}; + +enum { + IORING_MOCK_COPY_FROM = 1, +}; + #endif diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c index 3681d0b8d8de..8285393f4a5b 100644 --- a/io_uring/mock_file.c +++ b/io_uring/mock_file.c @@ -9,8 +9,76 @@ #include #include +#define IO_VALID_COPY_CMD_FLAGS IORING_MOCK_COPY_FROM + +static int io_copy_regbuf(struct iov_iter *reg_iter, void __user *ubuf) +{ + size_t ret, copied = 0; + size_t buflen = PAGE_SIZE; + void *tmp_buf; + + tmp_buf = kzalloc(buflen, GFP_KERNEL); + if (!tmp_buf) + return -ENOMEM; + + while (iov_iter_count(reg_iter)) { + size_t len = min(iov_iter_count(reg_iter), buflen); + + if (iov_iter_rw(reg_iter) == ITER_SOURCE) { + ret = copy_from_iter(tmp_buf, len, reg_iter); + if (ret <= 0) + break; + if (copy_to_user(ubuf, tmp_buf, ret)) + break; + } else { + if (copy_from_user(tmp_buf, ubuf, len)) + break; + ret = copy_to_iter(tmp_buf, len, reg_iter); + if (ret <= 0) + break; + } + ubuf += ret; + copied += ret; + } + + kfree(tmp_buf); + return copied; +} + +static int io_cmd_copy_regbuf(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + const struct iovec __user *iovec; + unsigned flags, iovec_len; + struct iov_iter iter; + void __user *ubuf; + int dir, ret; + + ubuf = u64_to_user_ptr(READ_ONCE(sqe->addr3)); + iovec = u64_to_user_ptr(READ_ONCE(sqe->addr)); + iovec_len = READ_ONCE(sqe->len); + flags = READ_ONCE(sqe->file_index); + + if (unlikely(sqe->ioprio || sqe->__pad1)) + return -EINVAL; + if (flags & ~IO_VALID_COPY_CMD_FLAGS) + return -EINVAL; + + dir = (flags & IORING_MOCK_COPY_FROM) ? ITER_SOURCE : ITER_DEST; + ret = io_uring_cmd_import_fixed_vec(cmd, iovec, iovec_len, dir, &iter, + issue_flags); + if (ret) + return ret; + ret = io_copy_regbuf(&iter, ubuf); + return ret ? ret : -EFAULT; +} + static int io_mock_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { + switch (cmd->cmd_op) { + case IORING_MOCK_CMD_COPY_REGBUF: + return io_cmd_copy_regbuf(cmd, issue_flags); + } return -ENOTSUPP; } @@ -91,7 +159,7 @@ static int io_probe_mock(struct io_uring_cmd *cmd) if (!mem_is_zero(&mp, sizeof(mp))) return -EINVAL; - mp.features = 0; + mp.features = IORING_MOCK_FEAT_END; if (copy_to_user(uarg, &mp, uarg_size)) return -EFAULT; -- cgit v1.2.3 From d1aa0346571436203a24cc3fc0c80f14cabbd630 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Jun 2025 19:16:53 +0100 Subject: io_uring/mock: add sync read/write Add support for synchronous zero read/write for mock files. 
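In other words, plain reads on a mock file zero-fill the buffer and writes are accepted and discarded, both bounded by file_size. A hedged fragment (mock_fd is a file created via IORING_MOCK_MGR_CMD_CREATE with file_size >= 512):

#include <unistd.h>

/* Reads within bounds zero-fill; I/O crossing file_size fails with -EINVAL */
static int exercise_mock_rw(int mock_fd)
{
	char buf[512];
	ssize_t r = pread(mock_fd, buf, sizeof(buf), 0);  /* buf now all zeroes */
	ssize_t w = pwrite(mock_fd, buf, sizeof(buf), 0); /* data is dropped */

	return (r == sizeof(buf) && w == sizeof(buf)) ? 0 : -1;
}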
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/571f3c9fe688e918256a06a722d3db6ced9ca3d5.1750599274.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring/mock_file.h | 4 +- io_uring/mock_file.c | 67 ++++++++++++++++++++++++++++++--- 2 files changed, 65 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring/mock_file.h b/include/uapi/linux/io_uring/mock_file.h index 73aca477d5c8..de27295bb365 100644 --- a/include/uapi/linux/io_uring/mock_file.h +++ b/include/uapi/linux/io_uring/mock_file.h @@ -5,6 +5,7 @@ enum { IORING_MOCK_FEAT_CMD_COPY, + IORING_MOCK_FEAT_RW_ZERO, IORING_MOCK_FEAT_END, }; @@ -17,7 +18,8 @@ struct io_uring_mock_probe { struct io_uring_mock_create { __u32 out_fd; __u32 flags; - __u64 __resv[15]; + __u64 file_size; + __u64 __resv[14]; }; enum { diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c index 8285393f4a5b..90160ccb50f0 100644 --- a/io_uring/mock_file.c +++ b/io_uring/mock_file.c @@ -9,6 +9,10 @@ #include #include +struct io_mock_file { + size_t size; +}; + #define IO_VALID_COPY_CMD_FLAGS IORING_MOCK_COPY_FROM static int io_copy_regbuf(struct iov_iter *reg_iter, void __user *ubuf) @@ -82,18 +86,59 @@ static int io_mock_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return -ENOTSUPP; } +static ssize_t io_mock_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct io_mock_file *mf = iocb->ki_filp->private_data; + size_t len = iov_iter_count(to); + + if (iocb->ki_pos + len > mf->size) + return -EINVAL; + return iov_iter_zero(len, to); +} + +static ssize_t io_mock_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct io_mock_file *mf = iocb->ki_filp->private_data; + size_t len = iov_iter_count(from); + + if (iocb->ki_pos + len > mf->size) + return -EINVAL; + iov_iter_advance(from, len); + return len; +} + +static loff_t io_mock_llseek(struct file *file, loff_t offset, int whence) +{ + struct io_mock_file *mf = file->private_data; + + return fixed_size_llseek(file, offset, whence, mf->size); +} + +static int io_mock_release(struct inode *inode, struct file *file) +{ + struct io_mock_file *mf = file->private_data; + + kfree(mf); + return 0; +} + static const struct file_operations io_mock_fops = { .owner = THIS_MODULE, + .release = io_mock_release, .uring_cmd = io_mock_cmd, + .read_iter = io_mock_read_iter, + .write_iter = io_mock_write_iter, + .llseek = io_mock_llseek, }; static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flags) { const struct io_uring_sqe *sqe = cmd->sqe; struct io_uring_mock_create mc, __user *uarg; + struct io_mock_file *mf = NULL; struct file *file = NULL; size_t uarg_size; - int fd, ret; + int fd = -1, ret; /* * It's a testing only driver that allows exercising edge cases @@ -114,18 +159,28 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag return -EFAULT; if (!mem_is_zero(mc.__resv, sizeof(mc.__resv)) || mc.flags) return -EINVAL; + if (mc.file_size > SZ_1G) + return -EINVAL; + mf = kzalloc(sizeof(*mf), GFP_KERNEL_ACCOUNT); + if (!mf) + return -ENOMEM; - fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + ret = fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); if (fd < 0) - return fd; + goto fail; + mf->size = mc.file_size; file = anon_inode_create_getfile("[io_uring_mock]", &io_mock_fops, - NULL, O_RDWR | O_CLOEXEC, NULL); + mf, O_RDWR | O_CLOEXEC, NULL); if (IS_ERR(file)) { ret = PTR_ERR(file); goto fail; } + file->f_mode |= FMODE_READ | FMODE_CAN_READ | + FMODE_WRITE | 
FMODE_CAN_WRITE | + FMODE_LSEEK; + mc.out_fd = fd; if (copy_to_user(uarg, &mc, uarg_size)) { fput(file); @@ -136,7 +191,9 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag fd_install(fd, file); return 0; fail: - put_unused_fd(fd); + if (fd >= 0) + put_unused_fd(fd); + kfree(mf); return ret; } -- cgit v1.2.3 From 2f71d2386f4feed5bfb9ee7b3d2c0ad953db1fa5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Jun 2025 19:16:54 +0100 Subject: io_uring/mock: allow to choose FMODE_NOWAIT Add an option to choose whether the file supports FMODE_NOWAIT, which changes the execution path an io_uring request takes. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1e532565b05a05b23589d237c24ee1a3d90c2fd9.1750599274.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring/mock_file.h | 5 +++++ io_uring/mock_file.c | 8 +++++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring/mock_file.h b/include/uapi/linux/io_uring/mock_file.h index de27295bb365..125949d2b5ce 100644 --- a/include/uapi/linux/io_uring/mock_file.h +++ b/include/uapi/linux/io_uring/mock_file.h @@ -6,6 +6,7 @@ enum { IORING_MOCK_FEAT_CMD_COPY, IORING_MOCK_FEAT_RW_ZERO, + IORING_MOCK_FEAT_RW_NOWAIT, IORING_MOCK_FEAT_END, }; @@ -15,6 +16,10 @@ struct io_uring_mock_probe { __u64 __resv[9]; }; +enum { + IORING_MOCK_CREATE_F_SUPPORT_NOWAIT = 1, +}; + struct io_uring_mock_create { __u32 out_fd; __u32 flags; diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c index 90160ccb50f0..0eb1d3bd6368 100644 --- a/io_uring/mock_file.c +++ b/io_uring/mock_file.c @@ -131,6 +131,8 @@ static const struct file_operations io_mock_fops = { .llseek = io_mock_llseek, }; +#define IO_VALID_CREATE_FLAGS (IORING_MOCK_CREATE_F_SUPPORT_NOWAIT) + static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flags) { const struct io_uring_sqe *sqe = cmd->sqe; @@ -157,7 +159,9 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag memset(&mc, 0, sizeof(mc)); if (copy_from_user(&mc, uarg, uarg_size)) return -EFAULT; - if (!mem_is_zero(mc.__resv, sizeof(mc.__resv)) || mc.flags) + if (!mem_is_zero(mc.__resv, sizeof(mc.__resv))) + return -EINVAL; + if (mc.flags & ~IO_VALID_CREATE_FLAGS) return -EINVAL; if (mc.file_size > SZ_1G) return -EINVAL; @@ -180,6 +184,8 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag file->f_mode |= FMODE_READ | FMODE_CAN_READ | FMODE_WRITE | FMODE_CAN_WRITE | FMODE_LSEEK; + if (mc.flags & IORING_MOCK_CREATE_F_SUPPORT_NOWAIT) + file->f_mode |= FMODE_NOWAIT; mc.out_fd = fd; if (copy_to_user(uarg, &mc, uarg_size)) { -- cgit v1.2.3 From 0c98a44329c10bf904434524425cb42043513bd6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Jun 2025 19:16:55 +0100 Subject: io_uring/mock: support for async read/write Let the user specify a delay for read/write requests. io_uring will start a timer, return -EIOCBQUEUED, and complete the request asynchronously after the delay passes.
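A hedged sketch of requesting the delayed mode: the delay is configured once at file creation time through the create descriptor, submitted with IORING_MOCK_MGR_CMD_CREATE as in the earlier probe sketch.

#include <string.h>
#include <linux/io_uring/mock_file.h>

/* Ask for ~1ms timer-driven async completions on the new mock file */
static void mock_create_delayed(struct io_uring_mock_create *mc)
{
	memset(mc, 0, sizeof(*mc));
	mc->file_size = 4096;
	mc->rw_delay_ns = 1000000;	/* rejected if above NSEC_PER_SEC */
}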
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/38f9d2e143fda8522c90a724b74630e68f9bbd16.1750599274.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring/mock_file.h | 4 ++- io_uring/mock_file.c | 59 ++++++++++++++++++++++++++++++--- 2 files changed, 58 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring/mock_file.h b/include/uapi/linux/io_uring/mock_file.h index 125949d2b5ce..c8fa77e39c68 100644 --- a/include/uapi/linux/io_uring/mock_file.h +++ b/include/uapi/linux/io_uring/mock_file.h @@ -7,6 +7,7 @@ enum { IORING_MOCK_FEAT_CMD_COPY, IORING_MOCK_FEAT_RW_ZERO, IORING_MOCK_FEAT_RW_NOWAIT, + IORING_MOCK_FEAT_RW_ASYNC, IORING_MOCK_FEAT_END, }; @@ -24,7 +25,8 @@ struct io_uring_mock_create { __u32 out_fd; __u32 flags; __u64 file_size; - __u64 __resv[14]; + __u64 rw_delay_ns; + __u64 __resv[13]; }; enum { diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c index 0eb1d3bd6368..ed6a5505763e 100644 --- a/io_uring/mock_file.c +++ b/io_uring/mock_file.c @@ -4,13 +4,22 @@ #include #include #include +#include +#include #include #include #include +struct io_mock_iocb { + struct kiocb *iocb; + struct hrtimer timer; + int res; +}; + struct io_mock_file { - size_t size; + size_t size; + u64 rw_delay_ns; }; #define IO_VALID_COPY_CMD_FLAGS IORING_MOCK_COPY_FROM @@ -86,14 +95,48 @@ static int io_mock_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return -ENOTSUPP; } +static enum hrtimer_restart io_mock_rw_timer_expired(struct hrtimer *timer) +{ + struct io_mock_iocb *mio = container_of(timer, struct io_mock_iocb, timer); + struct kiocb *iocb = mio->iocb; + + WRITE_ONCE(iocb->private, NULL); + iocb->ki_complete(iocb, mio->res); + kfree(mio); + return HRTIMER_NORESTART; +} + +static ssize_t io_mock_delay_rw(struct kiocb *iocb, size_t len) +{ + struct io_mock_file *mf = iocb->ki_filp->private_data; + struct io_mock_iocb *mio; + + mio = kzalloc(sizeof(*mio), GFP_KERNEL); + if (!mio) + return -ENOMEM; + + mio->iocb = iocb; + mio->res = len; + hrtimer_setup(&mio->timer, io_mock_rw_timer_expired, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_start(&mio->timer, ns_to_ktime(mf->rw_delay_ns), + HRTIMER_MODE_REL); + return -EIOCBQUEUED; +} + static ssize_t io_mock_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct io_mock_file *mf = iocb->ki_filp->private_data; size_t len = iov_iter_count(to); + size_t nr_zeroed; if (iocb->ki_pos + len > mf->size) return -EINVAL; - return iov_iter_zero(len, to); + nr_zeroed = iov_iter_zero(len, to); + if (!mf->rw_delay_ns || nr_zeroed != len) + return nr_zeroed; + + return io_mock_delay_rw(iocb, len); } static ssize_t io_mock_write_iter(struct kiocb *iocb, struct iov_iter *from) @@ -103,8 +146,12 @@ static ssize_t io_mock_write_iter(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_pos + len > mf->size) return -EINVAL; - iov_iter_advance(from, len); - return len; + if (!mf->rw_delay_ns) { + iov_iter_advance(from, len); + return len; + } + + return io_mock_delay_rw(iocb, len); } static loff_t io_mock_llseek(struct file *file, loff_t offset, int whence) @@ -165,6 +212,9 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag return -EINVAL; if (mc.file_size > SZ_1G) return -EINVAL; + if (mc.rw_delay_ns > NSEC_PER_SEC) + return -EINVAL; + mf = kzalloc(sizeof(*mf), GFP_KERNEL_ACCOUNT); if (!mf) return -ENOMEM; @@ -174,6 +224,7 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag goto fail; mf->size = 
mc.file_size; + mf->rw_delay_ns = mc.rw_delay_ns; file = anon_inode_create_getfile("[io_uring_mock]", &io_mock_fops, mf, O_RDWR | O_CLOEXEC, NULL); if (IS_ERR(file)) { ret = PTR_ERR(file); -- cgit v1.2.3 From e448d578264a9512d38deb8c418954d5f3e20712 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Jun 2025 19:16:56 +0100 Subject: io_uring/mock: add trivial poll handler Add a flag that enables polling on the mock file. For now it trivially says that there is always data available; it'll be extended in the future. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f16de043ec4876d65fae294fc99ade57415fba0c.1750599274.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring/mock_file.h | 2 ++ io_uring/mock_file.c | 37 +++++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring/mock_file.h b/include/uapi/linux/io_uring/mock_file.h index c8fa77e39c68..debeee8e4527 100644 --- a/include/uapi/linux/io_uring/mock_file.h +++ b/include/uapi/linux/io_uring/mock_file.h @@ -8,6 +8,7 @@ enum { IORING_MOCK_FEAT_RW_ZERO, IORING_MOCK_FEAT_RW_NOWAIT, IORING_MOCK_FEAT_RW_ASYNC, + IORING_MOCK_FEAT_POLL, IORING_MOCK_FEAT_END, }; @@ -19,6 +20,7 @@ struct io_uring_mock_probe { enum { IORING_MOCK_CREATE_F_SUPPORT_NOWAIT = 1, + IORING_MOCK_CREATE_F_POLL = 2, }; struct io_uring_mock_create { diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c index ed6a5505763e..45d3735b2708 100644 --- a/io_uring/mock_file.c +++ b/io_uring/mock_file.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -20,6 +21,8 @@ struct io_mock_iocb { struct io_mock_file { size_t size; u64 rw_delay_ns; + bool pollable; + struct wait_queue_head poll_wq; }; #define IO_VALID_COPY_CMD_FLAGS IORING_MOCK_COPY_FROM @@ -161,6 +164,18 @@ static loff_t io_mock_llseek(struct file *file, loff_t offset, int whence) return fixed_size_llseek(file, offset, whence, mf->size); } +static __poll_t io_mock_poll(struct file *file, struct poll_table_struct *pt) +{ + struct io_mock_file *mf = file->private_data; + __poll_t mask = 0; + + poll_wait(file, &mf->poll_wq, pt); + + mask |= EPOLLOUT | EPOLLWRNORM; + mask |= EPOLLIN | EPOLLRDNORM; + return mask; +} + static int io_mock_release(struct inode *inode, struct file *file) { struct io_mock_file *mf = file->private_data; @@ -178,10 +193,22 @@ static const struct file_operations io_mock_fops = { .llseek = io_mock_llseek, }; -#define IO_VALID_CREATE_FLAGS (IORING_MOCK_CREATE_F_SUPPORT_NOWAIT) +static const struct file_operations io_mock_poll_fops = { + .owner = THIS_MODULE, + .release = io_mock_release, + .uring_cmd = io_mock_cmd, + .read_iter = io_mock_read_iter, + .write_iter = io_mock_write_iter, + .llseek = io_mock_llseek, + .poll = io_mock_poll, +}; + +#define IO_VALID_CREATE_FLAGS (IORING_MOCK_CREATE_F_SUPPORT_NOWAIT | \ + IORING_MOCK_CREATE_F_POLL) static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flags) { + const struct file_operations *fops = &io_mock_fops; const struct io_uring_sqe *sqe = cmd->sqe; struct io_uring_mock_create mc, __user *uarg; struct io_mock_file *mf = NULL; @@ -223,9 +250,15 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag if (fd < 0) goto fail; + init_waitqueue_head(&mf->poll_wq); mf->size = mc.file_size; mf->rw_delay_ns = mc.rw_delay_ns; - file = anon_inode_create_getfile("[io_uring_mock]", &io_mock_fops, + if (mc.flags & IORING_MOCK_CREATE_F_POLL) { + fops = &io_mock_poll_fops;
+ mf->pollable = true; + } + + file = anon_inode_create_getfile("[io_uring_mock]", fops, mf, O_RDWR | O_CLOEXEC, NULL); if (IS_ERR(file)) { ret = PTR_ERR(file); -- cgit v1.2.3 From be7efb2d20d67f334a7de2aef77ae6c69367e646 Mon Sep 17 00:00:00 2001 From: Andrey Albershteyn Date: Mon, 30 Jun 2025 18:20:16 +0200 Subject: fs: introduce file_getattr and file_setattr syscalls Introduce file_getattr() and file_setattr() syscalls to manipulate inode extended attributes. The syscalls take a pair of a file descriptor and a pathname, and operate on the inode opened according to openat() semantics. The struct file_attr is passed to obtain/change extended attributes. This is an alternative to the FS_IOC_FSSETXATTR ioctl, with the difference that the file doesn't need to be opened, as we can reference it with a path instead of an fd. By having this we can manipulate inode extended attributes not only on regular files but also on special ones. This is not possible with the FS_IOC_FSSETXATTR ioctl, as with special files we cannot call ioctl() directly on the filesystem inode using an fd. This patch adds two new syscalls which allow userspace to get/set extended inode attributes on special files by using a parent directory and a path - an *at()-like syscall. CC: linux-api@vger.kernel.org CC: linux-fsdevel@vger.kernel.org CC: linux-xfs@vger.kernel.org Signed-off-by: Andrey Albershteyn Link: https://lore.kernel.org/20250630-xattrat-syscall-v6-6-c4e3bc35227b@kernel.org Acked-by: Arnd Bergmann Signed-off-by: Christian Brauner --- arch/alpha/kernel/syscalls/syscall.tbl | 2 + arch/arm/tools/syscall.tbl | 2 + arch/arm64/tools/syscall_32.tbl | 2 + arch/m68k/kernel/syscalls/syscall.tbl | 2 + arch/microblaze/kernel/syscalls/syscall.tbl | 2 + arch/mips/kernel/syscalls/syscall_n32.tbl | 2 + arch/mips/kernel/syscalls/syscall_n64.tbl | 2 + arch/mips/kernel/syscalls/syscall_o32.tbl | 2 + arch/parisc/kernel/syscalls/syscall.tbl | 2 + arch/powerpc/kernel/syscalls/syscall.tbl | 2 + arch/s390/kernel/syscalls/syscall.tbl | 2 + arch/sh/kernel/syscalls/syscall.tbl | 2 + arch/sparc/kernel/syscalls/syscall.tbl | 2 + arch/x86/entry/syscalls/syscall_32.tbl | 2 + arch/x86/entry/syscalls/syscall_64.tbl | 2 + arch/xtensa/kernel/syscalls/syscall.tbl | 2 + fs/file_attr.c | 152 ++++++++++++++++++++++++++++ include/linux/syscalls.h | 7 ++ include/uapi/asm-generic/unistd.h | 8 +- include/uapi/linux/fs.h | 18 ++++ scripts/syscall.tbl | 2 + 21 files changed, 218 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 2dd6340de6b4..16dca28ebf17 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -507,3 +507,5 @@ 575 common listxattrat sys_listxattrat 576 common removexattrat sys_removexattrat 577 common open_tree_attr sys_open_tree_attr +578 common file_getattr sys_file_getattr +579 common file_setattr sys_file_setattr diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 27c1d5ebcd91..b07e699aaa3c 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -482,3 +482,5 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr diff --git a/arch/arm64/tools/syscall_32.tbl b/arch/arm64/tools/syscall_32.tbl index 0765b3a8d6d6..8d9088bc577d 100644 --- a/arch/arm64/tools/syscall_32.tbl +++ b/arch/arm64/tools/syscall_32.tbl @@ -479,3 +479,5
@@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 9fe47112c586..f41d38dfbf13 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -467,3 +467,5 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 7b6e97828e55..580af574fe73 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -473,3 +473,5 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index aa70e371bb54..d824ffe9a014 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -406,3 +406,5 @@ 465 n32 listxattrat sys_listxattrat 466 n32 removexattrat sys_removexattrat 467 n32 open_tree_attr sys_open_tree_attr +468 n32 file_getattr sys_file_getattr +469 n32 file_setattr sys_file_setattr diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 1e8c44c7b614..7a7049c2c307 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -382,3 +382,5 @@ 465 n64 listxattrat sys_listxattrat 466 n64 removexattrat sys_removexattrat 467 n64 open_tree_attr sys_open_tree_attr +468 n64 file_getattr sys_file_getattr +469 n64 file_setattr sys_file_setattr diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 114a5a1a6230..d330274f0601 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -455,3 +455,5 @@ 465 o32 listxattrat sys_listxattrat 466 o32 removexattrat sys_removexattrat 467 o32 open_tree_attr sys_open_tree_attr +468 o32 file_getattr sys_file_getattr +469 o32 file_setattr sys_file_setattr diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 94df3cb957e9..88a788a7b18d 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -466,3 +466,5 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 9a084bdb8926..b453e80dfc00 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -558,3 +558,5 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index a4569b96ef06..8a6744d658db 100644 --- 
a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -470,3 +470,5 @@ 465 common listxattrat sys_listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr sys_file_setattr diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 52a7652fcff6..5e9c9eff5539 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -471,3 +471,5 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 83e45eb6c095..ebb7d06d1044 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -513,3 +513,5 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index ac007ea00979..4877e16da69a 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -473,3 +473,5 @@ 465 i386 listxattrat sys_listxattrat 466 i386 removexattrat sys_removexattrat 467 i386 open_tree_attr sys_open_tree_attr +468 i386 file_getattr sys_file_getattr +469 i386 file_setattr sys_file_setattr diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index cfb5ca41e30d..92cf0fe2291e 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -391,6 +391,8 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index f657a77314f8..374e4cb788d8 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -438,3 +438,5 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr diff --git a/fs/file_attr.c b/fs/file_attr.c index 775f43fc9687..21d6a0607345 100644 --- a/fs/file_attr.c +++ b/fs/file_attr.c @@ -4,6 +4,10 @@ #include #include #include +#include +#include + +#include "internal.h" /** * fileattr_fill_xflags - initialize fileattr with xflags @@ -90,6 +94,19 @@ int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) } EXPORT_SYMBOL(vfs_fileattr_get); +static void fileattr_to_file_attr(const struct fileattr *fa, + struct file_attr *fattr) +{ + __u32 mask = FS_XFLAGS_MASK; + + memset(fattr, 0, sizeof(struct file_attr)); + fattr->fa_xflags = fa->fsx_xflags & mask; + fattr->fa_extsize = fa->fsx_extsize; + fattr->fa_nextents = fa->fsx_nextents; + fattr->fa_projid = fa->fsx_projid; + fattr->fa_cowextsize = fa->fsx_cowextsize; +} + /** * copy_fsxattr_to_user - copy fsxattr 
to userspace. * @fa: fileattr pointer @@ -116,6 +133,23 @@ int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa) } EXPORT_SYMBOL(copy_fsxattr_to_user); +static int file_attr_to_fileattr(const struct file_attr *fattr, + struct fileattr *fa) +{ + __u32 mask = FS_XFLAGS_MASK; + + if (fattr->fa_xflags & ~mask) + return -EINVAL; + + fileattr_fill_xflags(fa, fattr->fa_xflags); + fa->fsx_xflags &= ~FS_XFLAG_RDONLY_MASK; + fa->fsx_extsize = fattr->fa_extsize; + fa->fsx_projid = fattr->fa_projid; + fa->fsx_cowextsize = fattr->fa_cowextsize; + + return 0; +} + static int copy_fsxattr_from_user(struct fileattr *fa, struct fsxattr __user *ufa) { @@ -344,3 +378,121 @@ int ioctl_fssetxattr(struct file *file, void __user *argp) return err; } EXPORT_SYMBOL(ioctl_fssetxattr); + +SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename, + struct file_attr __user *, ufattr, size_t, usize, + unsigned int, at_flags) +{ + struct path filepath __free(path_put) = {}; + struct filename *name __free(putname) = NULL; + unsigned int lookup_flags = 0; + struct file_attr fattr; + struct fileattr fa; + int error; + + BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0); + BUILD_BUG_ON(sizeof(struct file_attr) != FILE_ATTR_SIZE_LATEST); + + if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + + if (!(at_flags & AT_SYMLINK_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + + if (usize > PAGE_SIZE) + return -E2BIG; + + if (usize < FILE_ATTR_SIZE_VER0) + return -EINVAL; + + name = getname_maybe_null(filename, at_flags); + if (IS_ERR(name)) + return PTR_ERR(name); + + if (!name && dfd >= 0) { + CLASS(fd, f)(dfd); + if (fd_empty(f)) + return -EBADF; + + filepath = fd_file(f)->f_path; + path_get(&filepath); + } else { + error = filename_lookup(dfd, name, lookup_flags, &filepath, + NULL); + if (error) + return error; + } + + error = vfs_fileattr_get(filepath.dentry, &fa); + if (error) + return error; + + fileattr_to_file_attr(&fa, &fattr); + error = copy_struct_to_user(ufattr, usize, &fattr, + sizeof(struct file_attr), NULL); + + return error; +} + +SYSCALL_DEFINE5(file_setattr, int, dfd, const char __user *, filename, + struct file_attr __user *, ufattr, size_t, usize, + unsigned int, at_flags) +{ + struct path filepath __free(path_put) = {}; + struct filename *name __free(putname) = NULL; + unsigned int lookup_flags = 0; + struct file_attr fattr; + struct fileattr fa; + int error; + + BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0); + BUILD_BUG_ON(sizeof(struct file_attr) != FILE_ATTR_SIZE_LATEST); + + if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + + if (!(at_flags & AT_SYMLINK_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + + if (usize > PAGE_SIZE) + return -E2BIG; + + if (usize < FILE_ATTR_SIZE_VER0) + return -EINVAL; + + error = copy_struct_from_user(&fattr, sizeof(struct file_attr), ufattr, + usize); + if (error) + return error; + + error = file_attr_to_fileattr(&fattr, &fa); + if (error) + return error; + + name = getname_maybe_null(filename, at_flags); + if (IS_ERR(name)) + return PTR_ERR(name); + + if (!name && dfd >= 0) { + CLASS(fd, f)(dfd); + if (fd_empty(f)) + return -EBADF; + + filepath = fd_file(f)->f_path; + path_get(&filepath); + } else { + error = filename_lookup(dfd, name, lookup_flags, &filepath, + NULL); + if (error) + return error; + } + + error = mnt_want_write(filepath.mnt); + if (!error) { + error = vfs_fileattr_set(mnt_idmap(filepath.mnt), + filepath.dentry, &fa); + 
mnt_drop_write(filepath.mnt); + } + + return error; +} diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index e5603cc91963..77f45e5d4413 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -78,6 +78,7 @@ struct cachestat; struct statmount; struct mnt_id_req; struct xattr_args; +struct file_attr; #include #include @@ -371,6 +372,12 @@ asmlinkage long sys_removexattrat(int dfd, const char __user *path, asmlinkage long sys_lremovexattr(const char __user *path, const char __user *name); asmlinkage long sys_fremovexattr(int fd, const char __user *name); +asmlinkage long sys_file_getattr(int dfd, const char __user *filename, + struct file_attr __user *attr, size_t usize, + unsigned int at_flags); +asmlinkage long sys_file_setattr(int dfd, const char __user *filename, + struct file_attr __user *attr, size_t usize, + unsigned int at_flags); asmlinkage long sys_getcwd(char __user *buf, unsigned long size); asmlinkage long sys_eventfd2(unsigned int count, int flags); asmlinkage long sys_epoll_create1(int flags); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 2892a45023af..04e0077fb4c9 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -852,8 +852,14 @@ __SYSCALL(__NR_removexattrat, sys_removexattrat) #define __NR_open_tree_attr 467 __SYSCALL(__NR_open_tree_attr, sys_open_tree_attr) +/* fs/inode.c */ +#define __NR_file_getattr 468 +__SYSCALL(__NR_file_getattr, sys_file_getattr) +#define __NR_file_setattr 469 +__SYSCALL(__NR_file_setattr, sys_file_setattr) + #undef __NR_syscalls -#define __NR_syscalls 468 +#define __NR_syscalls 470 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 0098b0ce8ccb..9663dbdda181 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -148,6 +148,24 @@ struct fsxattr { unsigned char fsx_pad[8]; }; +/* + * Variable size structure for file_getattr()/file_setattr(). + * + * Note: this is an alternative to 'struct fileattr'/'struct fsxattr'. + * As this structure is passed to/from userspace with its size, this can + * be versioned based on the size. + */ +struct file_attr { + __u64 fa_xflags; /* xflags field value (get/set) */ + __u32 fa_extsize; /* extsize field value (get/set) */ + __u32 fa_nextents; /* nextents field value (get) */ + __u32 fa_projid; /* project identifier (get/set) */ + __u32 fa_cowextsize; /* CoW extsize field value (get/set) */ +}; + +#define FILE_ATTR_SIZE_VER0 24 +#define FILE_ATTR_SIZE_LATEST FILE_ATTR_SIZE_VER0 + /* * Flags for the fsx_xflags field */ diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl index 580b4e246aec..d1ae5e92c615 100644 --- a/scripts/syscall.tbl +++ b/scripts/syscall.tbl @@ -408,3 +408,5 @@ 465 common listxattrat sys_listxattrat 466 common removexattrat sys_removexattrat 467 common open_tree_attr sys_open_tree_attr +468 common file_getattr sys_file_getattr +469 common file_setattr sys_file_setattr -- cgit v1.2.3 From b1fabef37bd504f378a203fd8b9227b8fa65b193 Mon Sep 17 00:00:00 2001 From: Yeoreum Yun Date: Wed, 18 Jun 2025 10:29:51 +0100 Subject: prctl: Introduce PR_MTE_STORE_ONLY PR_MTE_STORE_ONLY is used to restrict the MTE tag check to store operations only.
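As an illustrative sketch (not part of this patch), userspace would combine the new bit with the existing MTE control bits passed via prctl(PR_SET_TAGGED_ADDR_CTRL); how the rest of the control word is composed is assumed here for demonstration:

	#include <sys/prctl.h>

	#ifndef PR_MTE_STORE_ONLY
	#define PR_MTE_STORE_ONLY	(1UL << 19)
	#endif

	/* Request that only store operations are tag-checked, keeping
	 * whatever tag-check-fault mode is already encoded in mte_ctrl. */
	int enable_mte_store_only(unsigned long mte_ctrl)
	{
		return prctl(PR_SET_TAGGED_ADDR_CTRL,
			     mte_ctrl | PR_MTE_STORE_ONLY, 0, 0, 0);
	}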
Signed-off-by: Yeoreum Yun Reviewed-by: Mark Brown Tested-by: Mark Brown Link: https://lore.kernel.org/r/20250618092957.2069907-3-yeoreum.yun@arm.com Signed-off-by: Catalin Marinas --- include/uapi/linux/prctl.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 43dec6eed559..f6fb137c407f 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -244,6 +244,8 @@ struct prctl_mm_map { # define PR_MTE_TAG_MASK (0xffffUL << PR_MTE_TAG_SHIFT) /* Unused; kept only for source compatibility */ # define PR_MTE_TCF_SHIFT 1 +/* MTE tag check store only */ +# define PR_MTE_STORE_ONLY (1UL << 19) /* RISC-V pointer masking tag length */ # define PR_PMLEN_SHIFT 24 # define PR_PMLEN_MASK (0x7fUL << PR_PMLEN_SHIFT) -- cgit v1.2.3 From 566e8f108fc7847f2a8676ec6a101d37b7dd0fb4 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Sun, 29 Jun 2025 17:21:32 +0300 Subject: devlink: Extend devlink rate API with traffic classes bandwidth management Introduce support for specifying relative bandwidth shares between traffic classes (TC) in the devlink-rate API. This new option allows users to allocate bandwidth across multiple traffic classes in a single command. This feature provides more granular control over traffic management, especially for scenarios requiring Enhanced Transmission Selection. Users can now define a relative bandwidth share for each traffic class. For example, assigning share values of 20 to TC0 (TCP/UDP) and 80 to TC5 (RoCE) will result in TC0 receiving 20% and TC5 receiving 80% of the total bandwidth. The actual percentage each class receives depends on the ratio of its share value to the sum of all shares. Example: DEV=pci/0000:08:00.0 $ devlink port function rate add $DEV/vfs_group tx_share 10Gbit \ tx_max 50Gbit tc-bw 0:20 1:0 2:0 3:0 4:0 5:80 6:0 7:0 $ devlink port function rate set $DEV/vfs_group \ tc-bw 0:20 1:0 2:0 3:0 4:0 5:20 6:60 7:0 Example usage with ynl: ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \ --do rate-set --json '{ "bus-name": "pci", "dev-name": "0000:08:00.0", "port-index": 1, "rate-tc-bws": [ {"rate-tc-index": 0, "rate-tc-bw": 50}, {"rate-tc-index": 1, "rate-tc-bw": 50}, {"rate-tc-index": 2, "rate-tc-bw": 0}, {"rate-tc-index": 3, "rate-tc-bw": 0}, {"rate-tc-index": 4, "rate-tc-bw": 0}, {"rate-tc-index": 5, "rate-tc-bw": 0}, {"rate-tc-index": 6, "rate-tc-bw": 0}, {"rate-tc-index": 7, "rate-tc-bw": 0} ] }' ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \ --do rate-get --json '{ "bus-name": "pci", "dev-name": "0000:08:00.0", "port-index": 1 }' output for rate-get: {'bus-name': 'pci', 'dev-name': '0000:08:00.0', 'port-index': 1, 'rate-tc-bws': [{'rate-tc-bw': 50, 'rate-tc-index': 0}, {'rate-tc-bw': 50, 'rate-tc-index': 1}, {'rate-tc-bw': 0, 'rate-tc-index': 2}, {'rate-tc-bw': 0, 'rate-tc-index': 3}, {'rate-tc-bw': 0, 'rate-tc-index': 4}, {'rate-tc-bw': 0, 'rate-tc-index': 5}, {'rate-tc-bw': 0, 'rate-tc-index': 6}, {'rate-tc-bw': 0, 'rate-tc-index': 7}], 'rate-tx-max': 0, 'rate-tx-priority': 0, 'rate-tx-share': 0, 'rate-tx-weight': 0, 'rate-type': 'leaf'} Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Reviewed-by: Jiri Pirko Signed-off-by: Tariq Toukan Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250629142138.361537-3-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/devlink.yaml | 32 +++++- Documentation/networking/devlink/devlink-port.rst | 8 ++
include/net/devlink.h | 8 ++ include/uapi/linux/devlink.h | 9 ++ net/devlink/netlink_gen.c | 15 ++- net/devlink/netlink_gen.h | 1 + net/devlink/rate.c | 127 ++++++++++++++++++++++ 7 files changed, 195 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml index bfba466d694a..1c4bb0cbe5f0 100644 --- a/Documentation/netlink/specs/devlink.yaml +++ b/Documentation/netlink/specs/devlink.yaml @@ -224,6 +224,10 @@ definitions: value: 10 - name: binary + - + name: rate-tc-index-max + type: const + value: 7 attribute-sets: - @@ -844,7 +848,23 @@ attribute-sets: - name: region-direct type: flag - + - + name: rate-tc-bws + type: nest + multi-attr: true + nested-attributes: dl-rate-tc-bws + - + name: rate-tc-index + type: u8 + checks: + max: rate-tc-index-max + - + name: rate-tc-bw + type: u32 + doc: | + Specifies the bandwidth share assigned to the Traffic Class. + The bandwidth for the traffic class is determined + in proportion to the sum of the shares of all configured classes. - name: dl-dev-stats subset-of: devlink @@ -1249,6 +1269,14 @@ attribute-sets: - name: flash type: flag + - + name: dl-rate-tc-bws + subset-of: devlink + attributes: + - + name: rate-tc-index + - + name: rate-tc-bw operations: enum-model: directional @@ -2176,6 +2204,7 @@ operations: - rate-tx-priority - rate-tx-weight - rate-parent-node-name + - rate-tc-bws - name: rate-new @@ -2196,6 +2225,7 @@ operations: - rate-tx-priority - rate-tx-weight - rate-parent-node-name + - rate-tc-bws - name: rate-del diff --git a/Documentation/networking/devlink/devlink-port.rst b/Documentation/networking/devlink/devlink-port.rst index 9d22d41a7cd1..5e397798a402 100644 --- a/Documentation/networking/devlink/devlink-port.rst +++ b/Documentation/networking/devlink/devlink-port.rst @@ -418,6 +418,14 @@ API allows to configure following rate object's parameters: to all node children limits. ``tx_max`` is an upper limit for children. ``tx_share`` is a total bandwidth distributed among children. +``tc_bw`` + Allow users to set the bandwidth allocation per traffic class on rate + objects. This enables fine-grained QoS configurations by assigning a relative + share value to each traffic class. The bandwidth is distributed in proportion + to the share value for each class, relative to the sum of all shares. + When applied to a non-leaf node, tc_bw determines how bandwidth is shared + among its child elements. + ``tx_priority`` and ``tx_weight`` can be used simultaneously. In that case nodes with the same priority form a WFQ subgroup in the sibling group and arbitration among them is based on assigned weights. 
diff --git a/include/net/devlink.h b/include/net/devlink.h index 63517646a497..d0ce5a7e984c 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -118,6 +118,8 @@ struct devlink_rate { u32 tx_priority; u32 tx_weight; + + u32 tc_bw[DEVLINK_RATE_TCS_MAX]; }; struct devlink_port { @@ -1486,6 +1488,9 @@ struct devlink_ops { u32 tx_priority, struct netlink_ext_ack *extack); int (*rate_leaf_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, u32 tx_weight, struct netlink_ext_ack *extack); + int (*rate_leaf_tc_bw_set)(struct devlink_rate *devlink_rate, + void *priv, u32 *tc_bw, + struct netlink_ext_ack *extack); int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_share, struct netlink_ext_ack *extack); int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, @@ -1494,6 +1499,9 @@ struct devlink_ops { u32 tx_priority, struct netlink_ext_ack *extack); int (*rate_node_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, u32 tx_weight, struct netlink_ext_ack *extack); + int (*rate_node_tc_bw_set)(struct devlink_rate *devlink_rate, + void *priv, u32 *tc_bw, + struct netlink_ext_ack *extack); int (*rate_node_new)(struct devlink_rate *rate_node, void **priv, struct netlink_ext_ack *extack); int (*rate_node_del)(struct devlink_rate *rate_node, void *priv, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index a5ee0f13740a..e72bcc239afd 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -221,6 +221,11 @@ enum devlink_port_flavour { */ }; +/* IEEE 802.1Qaz standard supported values. */ + +#define DEVLINK_RATE_TCS_MAX 8 +#define DEVLINK_RATE_TC_INDEX_MAX (DEVLINK_RATE_TCS_MAX - 1) + enum devlink_rate_type { DEVLINK_RATE_TYPE_LEAF, DEVLINK_RATE_TYPE_NODE, @@ -629,6 +634,10 @@ enum devlink_attr { DEVLINK_ATTR_REGION_DIRECT, /* flag */ + DEVLINK_ATTR_RATE_TC_BWS, /* nested */ + DEVLINK_ATTR_RATE_TC_INDEX, /* u8 */ + DEVLINK_ATTR_RATE_TC_BW, /* u32 */ + /* Add new attributes above here, update the spec in * Documentation/netlink/specs/devlink.yaml and re-generate * net/devlink/netlink_gen.c. 
diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c index e340d955cf3b..c50436433c18 100644 --- a/net/devlink/netlink_gen.c +++ b/net/devlink/netlink_gen.c @@ -45,6 +45,11 @@ const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_ [DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(15), }; +const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1] = { + [DEVLINK_ATTR_RATE_TC_INDEX] = NLA_POLICY_MAX(NLA_U8, DEVLINK_RATE_TC_INDEX_MAX), + [DEVLINK_ATTR_RATE_TC_BW] = { .type = NLA_U32, }, +}; + const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1] = { [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG, }, }; @@ -523,7 +528,7 @@ static const struct nla_policy devlink_rate_get_dump_nl_policy[DEVLINK_ATTR_DEV_ }; /* DEVLINK_CMD_RATE_SET - do */ -static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = { +static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TC_BWS + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, }, @@ -532,10 +537,11 @@ static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_W [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, }, [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, }, [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_RATE_TC_BWS] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bws_nl_policy), }; /* DEVLINK_CMD_RATE_NEW - do */ -static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = { +static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TC_BWS + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, }, @@ -544,6 +550,7 @@ static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_W [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, }, [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, }, [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_RATE_TC_BWS] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bws_nl_policy), }; /* DEVLINK_CMD_RATE_DEL - do */ @@ -1191,7 +1198,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { .doit = devlink_nl_rate_set_doit, .post_doit = devlink_nl_post_doit, .policy = devlink_rate_set_nl_policy, - .maxattr = DEVLINK_ATTR_RATE_TX_WEIGHT, + .maxattr = DEVLINK_ATTR_RATE_TC_BWS, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { @@ -1201,7 +1208,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { .doit = devlink_nl_rate_new_doit, .post_doit = devlink_nl_post_doit, .policy = devlink_rate_new_nl_policy, - .maxattr = DEVLINK_ATTR_RATE_TX_WEIGHT, + .maxattr = DEVLINK_ATTR_RATE_TC_BWS, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h index 8f2bd50ddf5e..fb733b5d4ff1 100644 --- a/net/devlink/netlink_gen.h +++ b/net/devlink/netlink_gen.h @@ -13,6 +13,7 @@ /* Common nested types */ extern const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1]; +extern const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1]; extern const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1]; /* Ops table for devlink */ diff --git 
a/net/devlink/rate.c b/net/devlink/rate.c index 8828ffaf6cbc..d39300a9b3d4 100644 --- a/net/devlink/rate.c +++ b/net/devlink/rate.c @@ -80,6 +80,29 @@ devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info) return ERR_PTR(-EINVAL); } +static int devlink_rate_put_tc_bws(struct sk_buff *msg, u32 *tc_bw) +{ + struct nlattr *nla_tc_bw; + int i; + + for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) { + nla_tc_bw = nla_nest_start(msg, DEVLINK_ATTR_RATE_TC_BWS); + if (!nla_tc_bw) + return -EMSGSIZE; + + if (nla_put_u8(msg, DEVLINK_ATTR_RATE_TC_INDEX, i) || + nla_put_u32(msg, DEVLINK_ATTR_RATE_TC_BW, tc_bw[i])) + goto nla_put_failure; + + nla_nest_end(msg, nla_tc_bw); + } + return 0; + +nla_put_failure: + nla_nest_cancel(msg, nla_tc_bw); + return -EMSGSIZE; +} + static int devlink_nl_rate_fill(struct sk_buff *msg, struct devlink_rate *devlink_rate, enum devlink_command cmd, u32 portid, u32 seq, @@ -129,6 +152,9 @@ static int devlink_nl_rate_fill(struct sk_buff *msg, devlink_rate->parent->name)) goto nla_put_failure; + if (devlink_rate_put_tc_bws(msg, devlink_rate->tc_bw)) + goto nla_put_failure; + genlmsg_end(msg, hdr); return 0; @@ -316,6 +342,87 @@ devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate, return 0; } +static int devlink_nl_rate_tc_bw_parse(struct nlattr *parent_nest, u32 *tc_bw, + unsigned long *bitmap, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[DEVLINK_ATTR_MAX + 1]; + u8 tc_index; + int err; + + err = nla_parse_nested(tb, DEVLINK_ATTR_MAX, parent_nest, + devlink_dl_rate_tc_bws_nl_policy, extack); + if (err) + return err; + + if (!tb[DEVLINK_ATTR_RATE_TC_INDEX]) { + NL_SET_ERR_ATTR_MISS(extack, parent_nest, + DEVLINK_ATTR_RATE_TC_INDEX); + return -EINVAL; + } + + tc_index = nla_get_u8(tb[DEVLINK_ATTR_RATE_TC_INDEX]); + + if (!tb[DEVLINK_ATTR_RATE_TC_BW]) { + NL_SET_ERR_ATTR_MISS(extack, parent_nest, + DEVLINK_ATTR_RATE_TC_BW); + return -EINVAL; + } + + if (test_and_set_bit(tc_index, bitmap)) { + NL_SET_ERR_MSG_FMT(extack, + "Duplicate traffic class index specified (%u)", + tc_index); + return -EINVAL; + } + + tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_ATTR_RATE_TC_BW]); + + return 0; +} + +static int devlink_nl_rate_tc_bw_set(struct devlink_rate *devlink_rate, + struct genl_info *info) +{ + DECLARE_BITMAP(bitmap, DEVLINK_RATE_TCS_MAX) = {}; + struct devlink *devlink = devlink_rate->devlink; + const struct devlink_ops *ops = devlink->ops; + u32 tc_bw[DEVLINK_RATE_TCS_MAX] = {}; + int rem, err = -EOPNOTSUPP, i; + struct nlattr *attr; + + nlmsg_for_each_attr_type(attr, DEVLINK_ATTR_RATE_TC_BWS, info->nlhdr, + GENL_HDRLEN, rem) { + err = devlink_nl_rate_tc_bw_parse(attr, tc_bw, bitmap, + info->extack); + if (err) + return err; + } + + for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) { + if (!test_bit(i, bitmap)) { + NL_SET_ERR_MSG_FMT(info->extack, + "Bandwidth values must be specified for all %u traffic classes", + DEVLINK_RATE_TCS_MAX); + return -EINVAL; + } + } + + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_tc_bw_set(devlink_rate, devlink_rate->priv, + tc_bw, info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_tc_bw_set(devlink_rate, devlink_rate->priv, + tc_bw, info->extack); + + if (err) + return err; + + memcpy(devlink_rate->tc_bw, tc_bw, sizeof(tc_bw)); + + return 0; +} + static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, const struct devlink_ops *ops, struct genl_info *info) @@ -388,6 +495,12 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, return err; } + if 
(attrs[DEVLINK_ATTR_RATE_TC_BWS]) { + err = devlink_nl_rate_tc_bw_set(devlink_rate, info); + if (err) + return err; + } + + return 0; } @@ -423,6 +536,13 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, "TX weight set isn't supported for the leafs"); return false; } + if (attrs[DEVLINK_ATTR_RATE_TC_BWS] && + !ops->rate_leaf_tc_bw_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TC_BWS], + "TC bandwidth set isn't supported for the leafs"); + return false; + } } else if (type == DEVLINK_RATE_TYPE_NODE) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes"); @@ -449,6 +569,13 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, "TX weight set isn't supported for the nodes"); return false; } + if (attrs[DEVLINK_ATTR_RATE_TC_BWS] && + !ops->rate_node_tc_bw_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TC_BWS], + "TC bandwidth set isn't supported for the nodes"); + return false; + } } else { WARN(1, "Unknown type of rate object"); return false; -- cgit v1.2.3 From cf73d9970ea4f8cace5d8f02d2565a2723003112 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 2 Jul 2025 21:31:54 +0100 Subject: io_uring: don't use int for ABI __kernel_rwf_t is defined as int, the actual size of which is implementation-defined. It won't go well if some compiler or arch ever defines it as i64, so replace it with __u32, hoping that no one is using i16 for it. Cc: stable@vger.kernel.org Fixes: 2b188cc1bb857 ("Add io_uring IO interface") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/47c666c4ee1df2018863af3a2028af18feef11ed.1751412511.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index b6be063693c8..b8a0e70ee2fd 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -50,7 +50,7 @@ struct io_uring_sqe { }; __u32 len; /* buffer size or number of iovecs */ union { - __kernel_rwf_t rw_flags; + __u32 rw_flags; __u32 fsync_flags; __u16 poll_events; /* compatibility */ __u32 poll32_events; /* word-reversed for BE */ -- cgit v1.2.3 From aa89281bbc0b61610c96074c6390aed44474ebd0 Mon Sep 17 00:00:00 2001 From: Jacopo Mondi Date: Thu, 12 Jun 2025 16:58:21 +0200 Subject: media: pisp_be: Use clamp() and define max sizes Use the clamp() function from minmax.h and provide a define for the max sizes as they will be used in subsequent patches.
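For reference, a minimal sketch of the transformation this commit applies (identifiers taken from the diff below):

	/* Open-coded bounding of a dimension to [min, max] ... */
	width = max(min(width, PISP_BACK_END_MAX_TILE_WIDTH),
		    PISP_BACK_END_MIN_TILE_WIDTH);

	/* ... is equivalent to, and more readable as: */
	width = clamp(width, PISP_BACK_END_MIN_TILE_WIDTH,
		      PISP_BACK_END_MAX_TILE_WIDTH);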
Reviewed-by: Daniel Scally Reviewed-by: Stefan Klug Signed-off-by: Jacopo Mondi Signed-off-by: Hans Verkuil --- drivers/media/platform/raspberrypi/pisp_be/pisp_be.c | 11 +++++++---- include/uapi/linux/media/raspberrypi/pisp_be_config.h | 9 +++++---- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/media/platform/raspberrypi/pisp_be/pisp_be.c b/drivers/media/platform/raspberrypi/pisp_be/pisp_be.c index be794a123620..b30891718d8d 100644 --- a/drivers/media/platform/raspberrypi/pisp_be/pisp_be.c +++ b/drivers/media/platform/raspberrypi/pisp_be/pisp_be.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -1112,10 +1113,12 @@ static void pispbe_try_format(struct v4l2_format *f, struct pispbe_node *node) f->fmt.pix_mp.pixelformat = fmt->fourcc; f->fmt.pix_mp.num_planes = fmt->num_planes; f->fmt.pix_mp.field = V4L2_FIELD_NONE; - f->fmt.pix_mp.width = max(min(f->fmt.pix_mp.width, 65536u), - PISP_BACK_END_MIN_TILE_WIDTH); - f->fmt.pix_mp.height = max(min(f->fmt.pix_mp.height, 65536u), - PISP_BACK_END_MIN_TILE_HEIGHT); + f->fmt.pix_mp.width = clamp(f->fmt.pix_mp.width, + PISP_BACK_END_MIN_TILE_WIDTH, + PISP_BACK_END_MAX_TILE_WIDTH); + f->fmt.pix_mp.height = clamp(f->fmt.pix_mp.height, + PISP_BACK_END_MIN_TILE_HEIGHT, + PISP_BACK_END_MAX_TILE_HEIGHT); /* * Fill in the actual colour space when the requested one was diff --git a/include/uapi/linux/media/raspberrypi/pisp_be_config.h b/include/uapi/linux/media/raspberrypi/pisp_be_config.h index cbeb714f4d61..2ad3b90684d7 100644 --- a/include/uapi/linux/media/raspberrypi/pisp_be_config.h +++ b/include/uapi/linux/media/raspberrypi/pisp_be_config.h @@ -21,10 +21,11 @@ /* preferred byte alignment for outputs */ #define PISP_BACK_END_OUTPUT_MAX_ALIGN 64u -/* minimum allowed tile width anywhere in the pipeline */ -#define PISP_BACK_END_MIN_TILE_WIDTH 16u -/* minimum allowed tile width anywhere in the pipeline */ -#define PISP_BACK_END_MIN_TILE_HEIGHT 16u +/* minimum allowed tile sizes anywhere in the pipeline */ +#define PISP_BACK_END_MIN_TILE_WIDTH 16u +#define PISP_BACK_END_MIN_TILE_HEIGHT 16u +#define PISP_BACK_END_MAX_TILE_WIDTH 65536u +#define PISP_BACK_END_MAX_TILE_HEIGHT 65536u #define PISP_BACK_END_NUM_OUTPUTS 2 #define PISP_BACK_END_HOG_OUTPUT 1 -- cgit v1.2.3 From 78584431e2cea6b60909cfa23c90ac8b33ab4198 Mon Sep 17 00:00:00 2001 From: Daniel Scally Date: Mon, 30 Jun 2025 23:27:34 +0100 Subject: media: v4l2: Add Renesas Camera Receiver Unit pixel formats The Renesas Camera Receiver Unit in the RZ/V2H SoC can output RAW data captured from an image sensor without conversion to an RGB/YUV format. In that case the data are packed into 64-bit blocks, with a variable amount of padding in the most significant bits depending on the bitdepth of the data. Add new V4L2 pixel format codes for the new formats, along with documentation to describe them. 
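As a hypothetical userspace sketch (not part of this series), an application could select one of the new packed formats through the usual multiplanar format negotiation; only the fourcc below comes from this patch:

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/videodev2.h>

	/* Ask the capture node for 12-bit packed CRU RAW frames; the
	 * driver adjusts the format if the request cannot be met exactly. */
	int set_raw_cru12(int fd, unsigned int width, unsigned int height)
	{
		struct v4l2_format fmt;

		memset(&fmt, 0, sizeof(fmt));
		fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
		fmt.fmt.pix_mp.width = width;
		fmt.fmt.pix_mp.height = height;
		fmt.fmt.pix_mp.pixelformat = V4L2_PIX_FMT_RAW_CRU12;
		fmt.fmt.pix_mp.num_planes = 1;
		return ioctl(fd, VIDIOC_S_FMT, &fmt);
	}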
Reviewed-by: Lad Prabhakar Reviewed-by: Laurent Pinchart Reviewed-by: Jacopo Mondi Signed-off-by: Daniel Scally Link: https://lore.kernel.org/r/20250630222734.2712390-1-dan.scally@ideasonboard.com Signed-off-by: Laurent Pinchart Signed-off-by: Hans Verkuil --- .../userspace-api/media/v4l/pixfmt-bayer.rst | 1 + .../userspace-api/media/v4l/pixfmt-rawnn-cru.rst | 143 +++++++++++++++++++++ drivers/media/v4l2-core/v4l2-common.c | 6 + drivers/media/v4l2-core/v4l2-ioctl.c | 4 + include/uapi/linux/videodev2.h | 6 + 5 files changed, 160 insertions(+) create mode 100644 Documentation/userspace-api/media/v4l/pixfmt-rawnn-cru.rst (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/pixfmt-bayer.rst b/Documentation/userspace-api/media/v4l/pixfmt-bayer.rst index ed3eb432967d..b5ca501842b0 100644 --- a/Documentation/userspace-api/media/v4l/pixfmt-bayer.rst +++ b/Documentation/userspace-api/media/v4l/pixfmt-bayer.rst @@ -19,6 +19,7 @@ orders. See also `the Wikipedia article on Bayer filter .. toctree:: :maxdepth: 1 + pixfmt-rawnn-cru pixfmt-srggb8 pixfmt-srggb8-pisp-comp pixfmt-srggb10 diff --git a/Documentation/userspace-api/media/v4l/pixfmt-rawnn-cru.rst b/Documentation/userspace-api/media/v4l/pixfmt-rawnn-cru.rst new file mode 100644 index 000000000000..db81f1cfe0f5 --- /dev/null +++ b/Documentation/userspace-api/media/v4l/pixfmt-rawnn-cru.rst @@ -0,0 +1,143 @@ +.. SPDX-License-Identifier: GFDL-1.1-no-invariants-or-later + +.. _v4l2-pix-fmt-raw-cru10: +.. _v4l2-pix-fmt-raw-cru12: +.. _v4l2-pix-fmt-raw-cru14: +.. _v4l2-pix-fmt-raw-cru20: + +********************************************************************************************************************************** +V4L2_PIX_FMT_RAW_CRU10 ('CR10'), V4L2_PIX_FMT_RAW_CRU12 ('CR12'), V4L2_PIX_FMT_RAW_CRU14 ('CR14'), V4L2_PIX_FMT_RAW_CRU20 ('CR20') +********************************************************************************************************************************** + +=============================================================== +Renesas RZ/V2H Camera Receiver Unit 64-bit packed pixel formats +=============================================================== + +| V4L2_PIX_FMT_RAW_CRU10 (CR10) +| V4L2_PIX_FMT_RAW_CRU12 (CR12) +| V4L2_PIX_FMT_RAW_CRU14 (CR14) +| V4L2_PIX_FMT_RAW_CRU20 (CR20) + +Description +=========== + +These pixel formats are some of the RAW outputs for the Camera Receiver Unit in +the Renesas RZ/V2H SoC. They are raw formats which pack pixels contiguously into +64-bit units, with the 4 or 8 most significant bits padded. + +**Byte Order** + +.. 
flat-table:: RAW formats + :header-rows: 2 + :stub-columns: 0 + :widths: 36 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 + :fill-cells: + + * - :rspan:`1` Pixel Format Code + - :cspan:`63` Data organization + * - 63 + - 62 + - 61 + - 60 + - 59 + - 58 + - 57 + - 56 + - 55 + - 54 + - 53 + - 52 + - 51 + - 50 + - 49 + - 48 + - 47 + - 46 + - 45 + - 44 + - 43 + - 42 + - 41 + - 40 + - 39 + - 38 + - 37 + - 36 + - 35 + - 34 + - 33 + - 32 + - 31 + - 30 + - 29 + - 28 + - 27 + - 26 + - 25 + - 24 + - 23 + - 22 + - 21 + - 20 + - 19 + - 18 + - 17 + - 16 + - 15 + - 14 + - 13 + - 12 + - 11 + - 10 + - 9 + - 8 + - 7 + - 6 + - 5 + - 4 + - 3 + - 2 + - 1 + - 0 + * - V4L2_PIX_FMT_RAW_CRU10 + - 0 + - 0 + - 0 + - 0 + - :cspan:`9` P5 + - :cspan:`9` P4 + - :cspan:`9` P3 + - :cspan:`9` P2 + - :cspan:`9` P1 + - :cspan:`9` P0 + * - V4L2_PIX_FMT_RAW_CRU12 + - 0 + - 0 + - 0 + - 0 + - :cspan:`11` P4 + - :cspan:`11` P3 + - :cspan:`11` P2 + - :cspan:`11` P1 + - :cspan:`11` P0 + * - V4L2_PIX_FMT_RAW_CRU14 + - 0 + - 0 + - 0 + - 0 + - 0 + - 0 + - 0 + - 0 + - :cspan:`13` P3 + - :cspan:`13` P2 + - :cspan:`13` P1 + - :cspan:`13` P0 + * - V4L2_PIX_FMT_RAW_CRU20 + - 0 + - 0 + - 0 + - 0 + - :cspan:`19` P2 + - :cspan:`19` P1 + - :cspan:`19` P0 diff --git a/drivers/media/v4l2-core/v4l2-common.c b/drivers/media/v4l2-core/v4l2-common.c index e1fc8fe43b74..df40756ac8f6 100644 --- a/drivers/media/v4l2-core/v4l2-common.c +++ b/drivers/media/v4l2-core/v4l2-common.c @@ -344,6 +344,12 @@ const struct v4l2_format_info *v4l2_format_info(u32 format) { .format = V4L2_PIX_FMT_SGBRG12, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .bpp_div = { 1, 1, 1, 1 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_SGRBG12, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .bpp_div = { 1, 1, 1, 1 }, .hdiv = 1, .vdiv = 1 }, { .format = V4L2_PIX_FMT_SRGGB12, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .bpp_div = { 1, 1, 1, 1 }, .hdiv = 1, .vdiv = 1 }, + + /* Renesas Camera Data Receiver Unit formats, bayer order agnostic */ + { .format = V4L2_PIX_FMT_RAW_CRU10, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 8, 0, 0, 0 }, .bpp_div = { 6, 1, 1, 1 }, .hdiv = 1, .vdiv = 1 }, + { .format = V4L2_PIX_FMT_RAW_CRU12, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 8, 0, 0, 0 }, .bpp_div = { 5, 1, 1, 1 }, .hdiv = 1, .vdiv = 1 }, + { .format = V4L2_PIX_FMT_RAW_CRU14, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 8, 0, 0, 0 }, .bpp_div = { 4, 1, 1, 1 }, .hdiv = 1, .vdiv = 1 }, + { .format = V4L2_PIX_FMT_RAW_CRU20, .pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 8, 0, 0, 0 }, .bpp_div = { 3, 1, 1, 1 }, .hdiv = 1, .vdiv = 1 }, }; unsigned int i; diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index 650dc1956f73..be94a79b976e 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -1413,6 +1413,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt) case V4L2_PIX_FMT_SGBRG10DPCM8: descr = "8-bit Bayer GBGB/RGRG (DPCM)"; break; case V4L2_PIX_FMT_SGRBG10DPCM8: descr = "8-bit Bayer GRGR/BGBG (DPCM)"; break; case V4L2_PIX_FMT_SRGGB10DPCM8: descr = "8-bit Bayer RGRG/GBGB (DPCM)"; break; + case V4L2_PIX_FMT_RAW_CRU10: descr = "10-bit Raw CRU Packed"; 
break; case V4L2_PIX_FMT_SBGGR12: descr = "12-bit Bayer BGBG/GRGR"; break; case V4L2_PIX_FMT_SGBRG12: descr = "12-bit Bayer GBGB/RGRG"; break; case V4L2_PIX_FMT_SGRBG12: descr = "12-bit Bayer GRGR/BGBG"; break; @@ -1421,6 +1422,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt) case V4L2_PIX_FMT_SGBRG12P: descr = "12-bit Bayer GBGB/RGRG Packed"; break; case V4L2_PIX_FMT_SGRBG12P: descr = "12-bit Bayer GRGR/BGBG Packed"; break; case V4L2_PIX_FMT_SRGGB12P: descr = "12-bit Bayer RGRG/GBGB Packed"; break; + case V4L2_PIX_FMT_RAW_CRU12: descr = "12-bit Raw CRU Packed"; break; case V4L2_PIX_FMT_SBGGR14: descr = "14-bit Bayer BGBG/GRGR"; break; case V4L2_PIX_FMT_SGBRG14: descr = "14-bit Bayer GBGB/RGRG"; break; case V4L2_PIX_FMT_SGRBG14: descr = "14-bit Bayer GRGR/BGBG"; break; @@ -1429,10 +1431,12 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt) case V4L2_PIX_FMT_SGBRG14P: descr = "14-bit Bayer GBGB/RGRG Packed"; break; case V4L2_PIX_FMT_SGRBG14P: descr = "14-bit Bayer GRGR/BGBG Packed"; break; case V4L2_PIX_FMT_SRGGB14P: descr = "14-bit Bayer RGRG/GBGB Packed"; break; + case V4L2_PIX_FMT_RAW_CRU14: descr = "14-bit Raw CRU Packed"; break; case V4L2_PIX_FMT_SBGGR16: descr = "16-bit Bayer BGBG/GRGR"; break; case V4L2_PIX_FMT_SGBRG16: descr = "16-bit Bayer GBGB/RGRG"; break; case V4L2_PIX_FMT_SGRBG16: descr = "16-bit Bayer GRGR/BGBG"; break; case V4L2_PIX_FMT_SRGGB16: descr = "16-bit Bayer RGRG/GBGB"; break; + case V4L2_PIX_FMT_RAW_CRU20: descr = "20-bit Raw CRU Packed"; break; case V4L2_PIX_FMT_SN9C20X_I420: descr = "GSPCA SN9C20X I420"; break; case V4L2_PIX_FMT_SPCA501: descr = "GSPCA SPCA501"; break; case V4L2_PIX_FMT_SPCA505: descr = "GSPCA SPCA505"; break; diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 9e3b366d5fc7..6f7bd38dd5aa 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -840,6 +840,12 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_PISP_COMP2_BGGR v4l2_fourcc('P', 'C', '2', 'B') /* PiSP 8-bit mode 2 compressed BGGR bayer */ #define V4L2_PIX_FMT_PISP_COMP2_MONO v4l2_fourcc('P', 'C', '2', 'M') /* PiSP 8-bit mode 2 compressed monochrome */ +/* Renesas RZ/V2H CRU packed formats. 64-bit units with contiguous pixels */ +#define V4L2_PIX_FMT_RAW_CRU10 v4l2_fourcc('C', 'R', '1', '0') +#define V4L2_PIX_FMT_RAW_CRU12 v4l2_fourcc('C', 'R', '1', '2') +#define V4L2_PIX_FMT_RAW_CRU14 v4l2_fourcc('C', 'R', '1', '4') +#define V4L2_PIX_FMT_RAW_CRU20 v4l2_fourcc('C', 'R', '2', '0') + /* SDR formats - used only for Software Defined Radio devices */ #define V4L2_SDR_FMT_CU8 v4l2_fourcc('C', 'U', '0', '8') /* IQ u8 */ #define V4L2_SDR_FMT_CU16LE v4l2_fourcc('C', 'U', '1', '6') /* IQ u16le */ -- cgit v1.2.3 From 5ab154f1463a111e1dc8fd5d31eaa7a2a71fe2e6 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 3 Jul 2025 13:48:08 -0700 Subject: bpf: Introduce BPF standard streams Add support for a stream API to the kernel and expose related kfuncs to BPF programs. Two streams are exposed, BPF_STDOUT and BPF_STDERR. These can be used for printing messages that can be consumed from user space, thus it's similar in spirit to the existing trace_pipe interface. The kernel will use the BPF_STDERR stream to notify the program of any errors encountered at runtime. BPF programs themselves may use both streams for writing debug messages. BPF library-like code may use BPF_STDERR to print warnings or errors on misuse at runtime. The implementation of a stream is as follows.
Every time a message is emitted from the kernel (directly, or through a BPF program), a record is allocated by bump allocating from a per-cpu region backed by a page obtained using alloc_pages_nolock(). This ensures that we can allocate memory from any context. The eventual plan is to discard this scheme in favor of Alexei's kmalloc_nolock() [0]. This record is then locklessly inserted into a list (llist_add()) so that the printing side doesn't require holding any locks, and works in any context. Each stream has a maximum capacity of 4MB of text, and each printed message is accounted against this limit. Messages from a program are emitted using the bpf_stream_vprintk kfunc, which takes a stream_id argument in addition to otherwise working similarly to bpf_trace_vprintk. The bprintf buffer helpers are extracted out to be reused for printing the string into them before copying it into the stream, so that we can (with the defined max limit) format a string and know its true length before performing allocations of the stream element. For consuming elements from a stream, we expose a bpf(2) syscall command named BPF_PROG_STREAM_READ_BY_FD, which allows reading data from the stream of a given prog_fd into a user space buffer. The main logic is implemented in bpf_stream_read(). The log messages are queued in bpf_stream::log by the bpf_stream_vprintk kfunc, and then pulled and ordered correctly in the stream backlog. For this purpose, we hold a lock around bpf_stream_backlog_peek(), as llist_del_first() (if we maintained a second lockless list for the backlog) wouldn't be safe from multiple threads anyway. Then, if we fail to find something in the backlog log, we splice out everything from the lockless log, and place it in the backlog log, and then return the head of the backlog. Once the full length of the element is consumed, we will pop it and free it. The lockless list bpf_stream::log is a LIFO stack. Elements obtained using an llist_del_all() operation are in LIFO order, and thus would break the chronological ordering if printed directly. Hence, this batch of messages is first reversed. Then, it is stashed into a separate list in the stream, i.e. the backlog_log. The head of this list is the actual message that should always be returned to the caller. All of this is done in bpf_stream_backlog_fill(). From the kernel side, writing into the stream is a bit more involved than a typical printk. First, the kernel may print a collection of messages into the stream, and parallel writers into the stream may suffer from interleaving of messages. To ensure each group of messages is visible atomically, we can leverage the lockless list used for pushing in messages. To enable this, we add a bpf_stream_stage() macro, and require kernel users to use bpf_stream_printk statements for the passed expression to write into the stream. Underneath the macro, we have a message staging API, where a bpf_stream_stage object on the stack accumulates the messages being printed into a local llist_head, and then a commit operation splices the whole batch into the stream's lockless log list. This is especially pertinent for rqspinlock deadlock messages printed to program streams. After this change, we see each deadlock invocation as a non-interleaving contiguous message without any confusion on the reader's part, improving their user experience in debugging the fault.
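To make the staging flow concrete, here is a sketch of how a kernel-side report might be emitted with the macro added below (the message contents are invented for illustration):

	struct bpf_stream_stage ss;

	/* Both lines are staged locally and spliced into the stream in one
	 * commit, so concurrent writers cannot interleave inside the report. */
	bpf_stream_stage(ss, prog, BPF_STDERR, ({
		bpf_stream_printk(ss, "ERROR: rqspinlock deadlock detected\n");
		bpf_stream_printk(ss, "CPU: %d\n", raw_smp_processor_id());
	}));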
While programs cannot benefit from this staged stream writing API, they could just as well hold an rqspinlock around their print statements to serialize messages, hence this is kept kernel-internal for now. Overall, this infrastructure provides NMI-safe any context printing of messages to two dedicated streams. Later patches will add support for printing splats in case of BPF arena page faults, rqspinlock deadlocks, and cond_break timeouts, and integration of this facility into bpftool for dumping messages to user space. [0]: https://lore.kernel.org/bpf/20250501032718.65476-1-alexei.starovoitov@gmail.com Reviewed-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250703204818.925464-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 52 +++++ include/uapi/linux/bpf.h | 24 +++ kernel/bpf/Makefile | 2 +- kernel/bpf/core.c | 5 + kernel/bpf/helpers.c | 1 + kernel/bpf/stream.c | 478 +++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 25 +++ kernel/bpf/verifier.c | 1 + tools/include/uapi/linux/bpf.h | 24 +++ 9 files changed, 611 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/stream.c (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a07451457d9e..f61aeccb23c3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1538,6 +1538,37 @@ struct btf_mod_pair { struct bpf_kfunc_desc_tab; +enum bpf_stream_id { + BPF_STDOUT = 1, + BPF_STDERR = 2, +}; + +struct bpf_stream_elem { + struct llist_node node; + int total_len; + int consumed_len; + char str[]; +}; + +enum { + /* 100k bytes */ + BPF_STREAM_MAX_CAPACITY = 100000ULL, +}; + +struct bpf_stream { + atomic_t capacity; + struct llist_head log; /* list of in-flight stream elements in LIFO order */ + + struct mutex lock; /* lock protecting backlog_{head,tail} */ + struct llist_node *backlog_head; /* list of in-flight stream elements in FIFO order */ + struct llist_node *backlog_tail; /* tail of the list above */ +}; + +struct bpf_stream_stage { + struct llist_head log; + int len; +}; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; @@ -1646,6 +1677,7 @@ struct bpf_prog_aux { struct work_struct work; struct rcu_head rcu; }; + struct bpf_stream stream[2]; }; struct bpf_prog { @@ -2409,6 +2441,7 @@ int generic_map_delete_batch(struct bpf_map *map, struct bpf_map *bpf_map_get_curr_or_next(u32 *id); struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); + int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long nr_pages, struct page **page_array); #ifdef CONFIG_MEMCG @@ -3574,6 +3607,25 @@ void bpf_bprintf_cleanup(struct bpf_bprintf_data *data); int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs); void bpf_put_buffers(void); +void bpf_prog_stream_init(struct bpf_prog *prog); +void bpf_prog_stream_free(struct bpf_prog *prog); +int bpf_prog_stream_read(struct bpf_prog *prog, enum bpf_stream_id stream_id, void __user *buf, int len); +void bpf_stream_stage_init(struct bpf_stream_stage *ss); +void bpf_stream_stage_free(struct bpf_stream_stage *ss); +__printf(2, 3) +int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...); +int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog, + enum bpf_stream_id stream_id); + +#define bpf_stream_printk(ss, ...) 
bpf_stream_stage_printk(&ss, __VA_ARGS__) + +#define bpf_stream_stage(ss, prog, stream_id, expr) \ + ({ \ + bpf_stream_stage_init(&ss); \ + (expr); \ + bpf_stream_stage_commit(&ss, prog, stream_id); \ + bpf_stream_stage_free(&ss); \ + }) #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 719ba230032f..0670e15a6100 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -906,6 +906,17 @@ union bpf_iter_link_info { * A new file descriptor (a nonnegative integer), or -1 if an * error occurred (in which case, *errno* is set appropriately). * + * BPF_PROG_STREAM_READ_BY_FD + * Description + * Read data of a program's BPF stream. The program is identified + * by *prog_fd*, and the stream is identified by the *stream_id*. + * The data is copied to a buffer pointed to by *stream_buf*, and + * filled less than or equal to *stream_buf_len* bytes. + * + * Return + * Number of bytes read from the stream on success, or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * * NOTES * eBPF objects (maps and programs) can be shared between processes. * @@ -961,6 +972,7 @@ enum bpf_cmd { BPF_LINK_DETACH, BPF_PROG_BIND_MAP, BPF_TOKEN_CREATE, + BPF_PROG_STREAM_READ_BY_FD, __MAX_BPF_CMD, }; @@ -1463,6 +1475,11 @@ struct bpf_stack_build_id { #define BPF_OBJ_NAME_LEN 16U +enum { + BPF_STREAM_STDOUT = 1, + BPF_STREAM_STDERR = 2, +}; + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -1849,6 +1866,13 @@ union bpf_attr { __u32 bpffs_fd; } token_create; + struct { + __aligned_u64 stream_buf; + __u32 stream_buf_len; + __u32 stream_id; + __u32 prog_fd; + } prog_stream_read; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 3a335c50e6e3..269c04a24664 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o obj-$(CONFIG_BPF_JIT) += trampoline.o -obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o +obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o stream.o ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy) obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o endif diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e536a34a32c8..f0def24573ae 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -134,6 +134,10 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag mutex_init(&fp->aux->ext_mutex); mutex_init(&fp->aux->dst_mutex); +#ifdef CONFIG_BPF_SYSCALL + bpf_prog_stream_init(fp); +#endif + return fp; } @@ -2862,6 +2866,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) aux = container_of(work, struct bpf_prog_aux, work); #ifdef CONFIG_BPF_SYSCALL bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab); + bpf_prog_stream_free(aux->prog); #endif #ifdef CONFIG_CGROUP_BPF if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index da66ce307e75..6bfcbcdf6588 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3825,6 +3825,7 @@ BTF_ID_FLAGS(func, bpf_strnstr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif +BTF_ID_FLAGS(func, 
bpf_stream_vprintk, KF_TRUSTED_ARGS) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c new file mode 100644 index 000000000000..e434541358db --- /dev/null +++ b/kernel/bpf/stream.c @@ -0,0 +1,478 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Simple per-CPU NMI-safe bump allocation mechanism, backed by the NMI-safe + * try_alloc_pages()/free_pages_nolock() primitives. We allocate a page and + * stash it in a local per-CPU variable, and bump allocate from the page + * whenever items need to be printed to a stream. Each page holds a global + * atomic refcount in its first 4 bytes, and then records of variable length + * that describe the printed messages. Once the global refcount has dropped to + * zero, it is a signal to free the page back to the kernel's page allocator, + * given all the individual records in it have been consumed. + * + * It is possible the same page is used to serve allocations across different + * programs, which may be consumed at different times individually, hence + * maintaining a reference count per-page is critical for correct lifetime + * tracking. + * + * The bpf_stream_page code will be replaced to use kmalloc_nolock() once it + * lands. + */ +struct bpf_stream_page { + refcount_t ref; + u32 consumed; + char buf[]; +}; + +/* Available room to add data to a refcounted page. */ +#define BPF_STREAM_PAGE_SZ (PAGE_SIZE - offsetofend(struct bpf_stream_page, consumed)) + +static DEFINE_PER_CPU(local_trylock_t, stream_local_lock) = INIT_LOCAL_TRYLOCK(stream_local_lock); +static DEFINE_PER_CPU(struct bpf_stream_page *, stream_pcpu_page); + +static bool bpf_stream_page_local_lock(unsigned long *flags) +{ + return local_trylock_irqsave(&stream_local_lock, *flags); +} + +static void bpf_stream_page_local_unlock(unsigned long *flags) +{ + local_unlock_irqrestore(&stream_local_lock, *flags); +} + +static void bpf_stream_page_free(struct bpf_stream_page *stream_page) +{ + struct page *p; + + if (!stream_page) + return; + p = virt_to_page(stream_page); + free_pages_nolock(p, 0); +} + +static void bpf_stream_page_get(struct bpf_stream_page *stream_page) +{ + refcount_inc(&stream_page->ref); +} + +static void bpf_stream_page_put(struct bpf_stream_page *stream_page) +{ + if (refcount_dec_and_test(&stream_page->ref)) + bpf_stream_page_free(stream_page); +} + +static void bpf_stream_page_init(struct bpf_stream_page *stream_page) +{ + refcount_set(&stream_page->ref, 1); + stream_page->consumed = 0; +} + +static struct bpf_stream_page *bpf_stream_page_replace(void) +{ + struct bpf_stream_page *stream_page, *old_stream_page; + struct page *page; + + page = alloc_pages_nolock(NUMA_NO_NODE, 0); + if (!page) + return NULL; + stream_page = page_address(page); + bpf_stream_page_init(stream_page); + + old_stream_page = this_cpu_read(stream_pcpu_page); + if (old_stream_page) + bpf_stream_page_put(old_stream_page); + this_cpu_write(stream_pcpu_page, stream_page); + return stream_page; +} + +static int bpf_stream_page_check_room(struct bpf_stream_page *stream_page, int len) +{ + int min = offsetof(struct bpf_stream_elem, str[0]); + int consumed = stream_page->consumed; + int total = BPF_STREAM_PAGE_SZ; + int rem = max(0, total - consumed - min); + + /* Let's give room of at least 8 bytes. */ + WARN_ON_ONCE(rem % 8 != 0); + rem = rem < 8 ? 
0 : rem; + return min(len, rem); +} + +static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len) +{ + init_llist_node(&elem->node); + elem->total_len = len; + elem->consumed_len = 0; +} + +static struct bpf_stream_page *bpf_stream_page_from_elem(struct bpf_stream_elem *elem) +{ + unsigned long addr = (unsigned long)elem; + + return (struct bpf_stream_page *)PAGE_ALIGN_DOWN(addr); +} + +static struct bpf_stream_elem *bpf_stream_page_push_elem(struct bpf_stream_page *stream_page, int len) +{ + u32 consumed = stream_page->consumed; + + stream_page->consumed += round_up(offsetof(struct bpf_stream_elem, str[len]), 8); + return (struct bpf_stream_elem *)&stream_page->buf[consumed]; +} + +static struct bpf_stream_elem *bpf_stream_page_reserve_elem(int len) +{ + struct bpf_stream_elem *elem = NULL; + struct bpf_stream_page *page; + int room = 0; + + page = this_cpu_read(stream_pcpu_page); + if (!page) + page = bpf_stream_page_replace(); + if (!page) + return NULL; + + room = bpf_stream_page_check_room(page, len); + if (room != len) + page = bpf_stream_page_replace(); + if (!page) + return NULL; + bpf_stream_page_get(page); + room = bpf_stream_page_check_room(page, len); + WARN_ON_ONCE(room != len); + + elem = bpf_stream_page_push_elem(page, room); + bpf_stream_elem_init(elem, room); + return elem; +} + +static struct bpf_stream_elem *bpf_stream_elem_alloc(int len) +{ + const int max_len = ARRAY_SIZE((struct bpf_bprintf_buffers){}.buf); + struct bpf_stream_elem *elem; + unsigned long flags; + + BUILD_BUG_ON(max_len > BPF_STREAM_PAGE_SZ); + /* + * Length denotes the amount of data to be written as part of the stream + * element, thus includes the '\0' byte. We're capped by how much the + * bpf_bprintf_buffers can accommodate, therefore deny allocations that + * won't fit into them. + */ + if (len < 0 || len > max_len) + return NULL; + + if (!bpf_stream_page_local_lock(&flags)) + return NULL; + elem = bpf_stream_page_reserve_elem(len); + bpf_stream_page_local_unlock(&flags); + return elem; +} + +static int __bpf_stream_push_str(struct llist_head *log, const char *str, int len) +{ + struct bpf_stream_elem *elem = NULL; + + /* + * Allocate a bpf_stream_elem and push it to the per-program stream + * log; elements will be popped at once and reversed to print the log.
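To make the record layout concrete: each printed message becomes a struct bpf_stream_elem header followed by its text, bump-allocated from the per-CPU page and rounded up to 8 bytes, exactly as bpf_stream_page_push_elem() above computes. A small stand-alone sketch (illustrative only, not part of the patch; the structs are re-declared locally and a 64-bit build with 4096-byte pages is assumed):

	#include <stddef.h>
	#include <stdio.h>

	struct llist_node { struct llist_node *next; };	/* local stand-in */

	struct bpf_stream_elem {
		struct llist_node node;
		int total_len;
		int consumed_len;
		char str[];
	};

	#define ROUND_UP(x, a) (((x) + (a) - 1) / (a) * (a))

	int main(void)
	{
		size_t len = 13;	/* "hello stream" plus its '\0' byte */
		/* 16-byte header plus the text, kept 8-byte aligned on the page */
		size_t used = ROUND_UP(offsetof(struct bpf_stream_elem, str) + len, 8);

		/* prints "32 of 4088": round_up(16 + 13, 8) out of PAGE_SIZE - 8 */
		printf("%zu of %lu usable page bytes\n", used, 4096UL - 8);
		return 0;
	}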
+ */ + elem = bpf_stream_elem_alloc(len); + if (!elem) + return -ENOMEM; + + memcpy(elem->str, str, len); + llist_add(&elem->node, log); + + return 0; +} + +static int bpf_stream_consume_capacity(struct bpf_stream *stream, int len) +{ + if (atomic_read(&stream->capacity) >= BPF_STREAM_MAX_CAPACITY) + return -ENOSPC; + if (atomic_add_return(len, &stream->capacity) >= BPF_STREAM_MAX_CAPACITY) { + atomic_sub(len, &stream->capacity); + return -ENOSPC; + } + return 0; +} + +static void bpf_stream_release_capacity(struct bpf_stream *stream, struct bpf_stream_elem *elem) +{ + int len = elem->total_len; + + atomic_sub(len, &stream->capacity); +} + +static int bpf_stream_push_str(struct bpf_stream *stream, const char *str, int len) +{ + int ret = bpf_stream_consume_capacity(stream, len); + + return ret ?: __bpf_stream_push_str(&stream->log, str, len); +} + +static struct bpf_stream *bpf_stream_get(enum bpf_stream_id stream_id, struct bpf_prog_aux *aux) +{ + if (stream_id != BPF_STDOUT && stream_id != BPF_STDERR) + return NULL; + return &aux->stream[stream_id - 1]; +} + +static void bpf_stream_free_elem(struct bpf_stream_elem *elem) +{ + struct bpf_stream_page *p; + + p = bpf_stream_page_from_elem(elem); + bpf_stream_page_put(p); +} + +static void bpf_stream_free_list(struct llist_node *list) +{ + struct bpf_stream_elem *elem, *tmp; + + llist_for_each_entry_safe(elem, tmp, list, node) + bpf_stream_free_elem(elem); +} + +static struct llist_node *bpf_stream_backlog_peek(struct bpf_stream *stream) +{ + return stream->backlog_head; +} + +static struct llist_node *bpf_stream_backlog_pop(struct bpf_stream *stream) +{ + struct llist_node *node; + + node = stream->backlog_head; + if (stream->backlog_head == stream->backlog_tail) + stream->backlog_head = stream->backlog_tail = NULL; + else + stream->backlog_head = node->next; + return node; +} + +static void bpf_stream_backlog_fill(struct bpf_stream *stream) +{ + struct llist_node *head, *tail; + + if (llist_empty(&stream->log)) + return; + tail = llist_del_all(&stream->log); + if (!tail) + return; + head = llist_reverse_order(tail); + + if (!stream->backlog_head) { + stream->backlog_head = head; + stream->backlog_tail = tail; + } else { + stream->backlog_tail->next = head; + stream->backlog_tail = tail; + } + + return; +} + +static bool bpf_stream_consume_elem(struct bpf_stream_elem *elem, int *len) +{ + int rem = elem->total_len - elem->consumed_len; + int used = min(rem, *len); + + elem->consumed_len += used; + *len -= used; + + return elem->consumed_len == elem->total_len; +} + +static int bpf_stream_read(struct bpf_stream *stream, void __user *buf, int len) +{ + int rem_len = len, cons_len, ret = 0; + struct bpf_stream_elem *elem = NULL; + struct llist_node *node; + + mutex_lock(&stream->lock); + + while (rem_len) { + int pos = len - rem_len; + bool cont; + + node = bpf_stream_backlog_peek(stream); + if (!node) { + bpf_stream_backlog_fill(stream); + node = bpf_stream_backlog_peek(stream); + } + if (!node) + break; + elem = container_of(node, typeof(*elem), node); + + cons_len = elem->consumed_len; + cont = bpf_stream_consume_elem(elem, &rem_len) == false; + + ret = copy_to_user(buf + pos, elem->str + cons_len, + elem->consumed_len - cons_len); + /* Restore in case of error. */ + if (ret) { + ret = -EFAULT; + elem->consumed_len = cons_len; + break; + } + + if (cont) + continue; + bpf_stream_backlog_pop(stream); + bpf_stream_release_capacity(stream, elem); + bpf_stream_free_elem(elem); + } + + mutex_unlock(&stream->lock); + return ret ? 
ret : len - rem_len; +} + +int bpf_prog_stream_read(struct bpf_prog *prog, enum bpf_stream_id stream_id, void __user *buf, int len) +{ + struct bpf_stream *stream; + + stream = bpf_stream_get(stream_id, prog->aux); + if (!stream) + return -ENOENT; + return bpf_stream_read(stream, buf, len); +} + +__bpf_kfunc_start_defs(); + +/* + * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the + * enum in headers. + */ +__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, u32 len__sz, void *aux__prog) +{ + struct bpf_bprintf_data data = { + .get_bin_args = true, + .get_buf = true, + }; + struct bpf_prog_aux *aux = aux__prog; + u32 fmt_size = strlen(fmt__str) + 1; + struct bpf_stream *stream; + u32 data_len = len__sz; + int ret, num_args; + + stream = bpf_stream_get(stream_id, aux); + if (!stream) + return -ENOENT; + + if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || + (data_len && !args)) + return -EINVAL; + num_args = data_len / 8; + + ret = bpf_bprintf_prepare(fmt__str, fmt_size, args, num_args, &data); + if (ret < 0) + return ret; + + ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt__str, data.bin_args); + /* Exclude NULL byte during push. */ + ret = bpf_stream_push_str(stream, data.buf, ret); + bpf_bprintf_cleanup(&data); + + return ret; +} + +__bpf_kfunc_end_defs(); + +/* Added kfunc to common_btf_ids */ + +void bpf_prog_stream_init(struct bpf_prog *prog) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) { + atomic_set(&prog->aux->stream[i].capacity, 0); + init_llist_head(&prog->aux->stream[i].log); + mutex_init(&prog->aux->stream[i].lock); + prog->aux->stream[i].backlog_head = NULL; + prog->aux->stream[i].backlog_tail = NULL; + } +} + +void bpf_prog_stream_free(struct bpf_prog *prog) +{ + struct llist_node *list; + int i; + + for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) { + list = llist_del_all(&prog->aux->stream[i].log); + bpf_stream_free_list(list); + bpf_stream_free_list(prog->aux->stream[i].backlog_head); + } +} + +void bpf_stream_stage_init(struct bpf_stream_stage *ss) +{ + init_llist_head(&ss->log); + ss->len = 0; +} + +void bpf_stream_stage_free(struct bpf_stream_stage *ss) +{ + struct llist_node *node; + + node = llist_del_all(&ss->log); + bpf_stream_free_list(node); +} + +int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...) +{ + struct bpf_bprintf_buffers *buf; + va_list args; + int ret; + + if (bpf_try_get_buffers(&buf)) + return -EBUSY; + + va_start(args, fmt); + ret = vsnprintf(buf->buf, ARRAY_SIZE(buf->buf), fmt, args); + va_end(args); + ss->len += ret; + /* Exclude NULL byte during push. 
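To show how the staged API composes, here is a hedged sketch of the kind of in-kernel caller the commit message promises for later patches; the function name, call site and message text are hypothetical, only the bpf_stream_stage()/bpf_stream_printk() pattern comes from the bpf.h additions above:

	/* hypothetical splat reporter built on the staged stream API */
	static void report_rqspinlock_splat(struct bpf_prog *prog, void *lock)
	{
		struct bpf_stream_stage ss;

		/* stage both lines, then commit them to the prog's stderr stream */
		bpf_stream_stage(ss, prog, BPF_STDERR, ({
			bpf_stream_printk(ss, "ERROR: AA-deadlock on rqspinlock %p\n", lock);
			bpf_stream_printk(ss, "in BPF program %s\n", prog->aux->name);
		}));
	}

Because bpf_stream_stage_commit() splices the whole staged list with one llist_add_batch(), the two lines stay contiguous in the stream even if other CPUs print concurrently.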
*/ + ret = __bpf_stream_push_str(&ss->log, buf->buf, ret); + bpf_put_buffers(); + return ret; +} + +int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog, + enum bpf_stream_id stream_id) +{ + struct llist_node *list, *head, *tail; + struct bpf_stream *stream; + int ret; + + stream = bpf_stream_get(stream_id, prog->aux); + if (!stream) + return -EINVAL; + + ret = bpf_stream_consume_capacity(stream, ss->len); + if (ret) + return ret; + + list = llist_del_all(&ss->log); + head = tail = list; + + if (!list) + return 0; + while (llist_next(list)) { + tail = llist_next(list); + list = tail; + } + llist_add_batch(head, tail, &stream->log); + return 0; +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f1d9ee9717a1..7db7182a3057 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5943,6 +5943,28 @@ static int token_create(union bpf_attr *attr) return bpf_token_create(attr); } +#define BPF_PROG_STREAM_READ_BY_FD_LAST_FIELD prog_stream_read.prog_fd + +static int prog_stream_read(union bpf_attr *attr) +{ + char __user *buf = u64_to_user_ptr(attr->prog_stream_read.stream_buf); + u32 len = attr->prog_stream_read.stream_buf_len; + struct bpf_prog *prog; + int ret; + + if (CHECK_ATTR(BPF_PROG_STREAM_READ_BY_FD)) + return -EINVAL; + + prog = bpf_prog_get(attr->prog_stream_read.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + ret = bpf_prog_stream_read(prog, attr->prog_stream_read.stream_id, buf, len); + bpf_prog_put(prog); + + return ret; +} + static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; @@ -6079,6 +6101,9 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) case BPF_TOKEN_CREATE: err = token_create(&attr); break; + case BPF_PROG_STREAM_READ_BY_FD: + err = prog_stream_read(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 52e36fd23f40..9f09dcd2eabb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -46,6 +46,7 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { enum bpf_features { BPF_FEAT_RDONLY_CAST_TO_VOID = 0, + BPF_FEAT_STREAMS = 1, __MAX_BPF_FEAT, }; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 719ba230032f..0670e15a6100 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -906,6 +906,17 @@ union bpf_iter_link_info { * A new file descriptor (a nonnegative integer), or -1 if an * error occurred (in which case, *errno* is set appropriately). * + * BPF_PROG_STREAM_READ_BY_FD + * Description + * Read data of a program's BPF stream. The program is identified + * by *prog_fd*, and the stream is identified by the *stream_id*. + * The data is copied to a buffer pointed to by *stream_buf*, and + * at most *stream_buf_len* bytes are filled. + * + * Return + * Number of bytes read from the stream on success, or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * * NOTES * eBPF objects (maps and programs) can be shared between processes.
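On the consumer side, a minimal user-space sketch of draining a program's stderr stream via the new command (a hedged sketch, assuming uapi headers updated by this patch and a prog_fd obtained elsewhere; error handling elided):

	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/bpf.h>

	static long stream_read(int prog_fd, __u32 stream_id, char *buf, __u32 len)
	{
		union bpf_attr attr;

		memset(&attr, 0, sizeof(attr));	/* unused attr fields must be zero */
		attr.prog_stream_read.prog_fd = prog_fd;
		attr.prog_stream_read.stream_id = stream_id;
		attr.prog_stream_read.stream_buf = (__u64)(unsigned long)buf;
		attr.prog_stream_read.stream_buf_len = len;

		/* returns bytes read, 0 once the stream is fully drained */
		return syscall(__NR_bpf, BPF_PROG_STREAM_READ_BY_FD, &attr, sizeof(attr));
	}

	int main(void)
	{
		int prog_fd = 3;	/* hypothetical: fd of a loaded BPF program */
		char buf[4096];
		long n;

		while ((n = stream_read(prog_fd, BPF_STREAM_STDERR, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, n, stderr);
		return n < 0;
	}

A short read is fine: an element consumed only partially resumes at its consumed_len on the next call, as bpf_stream_read() above implements.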
* @@ -961,6 +972,7 @@ enum bpf_cmd { BPF_LINK_DETACH, BPF_PROG_BIND_MAP, BPF_TOKEN_CREATE, + BPF_PROG_STREAM_READ_BY_FD, __MAX_BPF_CMD, }; @@ -1463,6 +1475,11 @@ struct bpf_stack_build_id { #define BPF_OBJ_NAME_LEN 16U +enum { + BPF_STREAM_STDOUT = 1, + BPF_STREAM_STDERR = 2, +}; + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -1849,6 +1866,13 @@ union bpf_attr { __u32 bpffs_fd; } token_create; + struct { + __aligned_u64 stream_buf; + __u32 stream_buf_len; + __u32 stream_id; + __u32 prog_fd; + } prog_stream_read; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF -- cgit v1.2.3 From ca115d7e754691c0219eec95ec94dbac7f87daef Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 3 Jul 2025 09:36:41 +0200 Subject: tree-wide: s/struct fileattr/struct file_kattr/g Now that we expose struct file_attr as our uapi struct, rename the internal struct to struct file_kattr to clearly communicate that it is a kernel-internal struct. This is similar to struct mount_{k}attr and others. Link: https://lore.kernel.org/20250703-restlaufzeit-baurecht-9ed44552b481@brauner Signed-off-by: Christian Brauner --- Documentation/filesystems/locking.rst | 4 ++-- Documentation/filesystems/vfs.rst | 4 ++-- fs/bcachefs/fs.c | 4 ++-- fs/btrfs/ioctl.c | 4 ++-- fs/btrfs/ioctl.h | 6 +++--- fs/ecryptfs/inode.c | 4 ++-- fs/efivarfs/inode.c | 4 ++-- fs/ext2/ext2.h | 4 ++-- fs/ext2/ioctl.c | 4 ++-- fs/ext4/ext4.h | 4 ++-- fs/ext4/ioctl.c | 4 ++-- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/file.c | 4 ++-- fs/file_attr.c | 34 +++++++++++++++++----------------- fs/fuse/fuse_i.h | 4 ++-- fs/fuse/ioctl.c | 4 ++-- fs/gfs2/file.c | 4 ++-- fs/gfs2/inode.h | 4 ++-- fs/hfsplus/hfsplus_fs.h | 4 ++-- fs/hfsplus/inode.c | 4 ++-- fs/jfs/ioctl.c | 4 ++-- fs/jfs/jfs_inode.h | 4 ++-- fs/nilfs2/ioctl.c | 4 ++-- fs/nilfs2/nilfs.h | 4 ++-- fs/ocfs2/ioctl.c | 4 ++-- fs/ocfs2/ioctl.h | 4 ++-- fs/orangefs/inode.c | 4 ++-- fs/overlayfs/copy_up.c | 4 ++-- fs/overlayfs/inode.c | 12 ++++++------ fs/overlayfs/overlayfs.h | 10 +++++----- fs/overlayfs/util.c | 2 +- fs/ubifs/ioctl.c | 4 ++-- fs/ubifs/ubifs.h | 4 ++-- fs/xfs/xfs_ioctl.c | 18 +++++++++--------- fs/xfs/xfs_ioctl.h | 4 ++-- include/linux/fileattr.h | 14 +++++++------- include/linux/fs.h | 6 +++--- include/linux/lsm_hook_defs.h | 4 ++-- include/linux/security.h | 8 ++++---- include/uapi/linux/fs.h | 2 +- mm/shmem.c | 4 ++-- security/security.c | 4 ++-- security/selinux/hooks.c | 4 ++-- 43 files changed, 122 insertions(+), 122 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 2e567e341c3b..2ff02653d7cc 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -87,8 +87,8 @@ prototypes:: int (*tmpfile) (struct mnt_idmap *, struct inode *, struct file *, umode_t); int (*fileattr_set)(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); - int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); + int (*fileattr_get)(struct dentry *dentry, struct file_kattr *fa); struct posix_acl * (*get_acl)(struct mnt_idmap *, struct dentry *, int); struct offset_ctx *(*get_offset_ctx)(struct inode *inode); diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index fd32a9a17bfb..f2bbf4def123 100644 --- a/Documentation/filesystems/vfs.rst +++
b/Documentation/filesystems/vfs.rst @@ -515,8 +515,8 @@ As of kernel 2.6.22, the following members are defined: struct posix_acl * (*get_acl)(struct mnt_idmap *, struct dentry *, int); int (*set_acl)(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); int (*fileattr_set)(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); - int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); + int (*fileattr_get)(struct dentry *dentry, struct file_kattr *fa); struct offset_ctx *(*get_offset_ctx)(struct inode *inode); }; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 85d13f800165..7c4de887629c 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1619,7 +1619,7 @@ static const __maybe_unused unsigned bch_flags_to_xflags[] = { }; static int bch2_fileattr_get(struct dentry *dentry, - struct fileattr *fa) + struct file_kattr *fa) { struct bch_inode_info *inode = to_bch_ei(d_inode(dentry)); struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -1682,7 +1682,7 @@ static int fssetxattr_inode_update_fn(struct btree_trans *trans, static int bch2_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, - struct fileattr *fa) + struct file_kattr *fa) { struct bch_inode_info *inode = to_bch_ei(d_inode(dentry)); struct bch_fs *c = inode->v.i_sb->s_fs_info; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 913acef3f0a9..ffb28bfba4fa 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -245,7 +245,7 @@ static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_ * Set flags/xflags from the internal inode flags. The remaining items of * fsxattr are zeroed. */ -int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int btrfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { const struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); @@ -254,7 +254,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) } int btrfs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); struct btrfs_root *root = inode->root; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index e08ea446cf48..ccf6bed9cc24 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -8,7 +8,7 @@ struct file; struct dentry; struct mnt_idmap; -struct fileattr; +struct file_kattr; struct io_uring_cmd; struct btrfs_inode; struct btrfs_fs_info; @@ -16,9 +16,9 @@ struct btrfs_ioctl_balance_args; long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int btrfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa); int btrfs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 493d7f194956..d83416af17b4 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -1124,13 +1124,13 @@ out: return rc; } -static int ecryptfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +static int ecryptfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { return 
vfs_fileattr_get(ecryptfs_dentry_to_lower(dentry), fa); } static int ecryptfs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); int rc; diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 98a7299a9ee9..2891614abf8d 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -138,7 +138,7 @@ const struct inode_operations efivarfs_dir_inode_operations = { }; static int -efivarfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +efivarfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { unsigned int i_flags; unsigned int flags = 0; @@ -154,7 +154,7 @@ efivarfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) static int efivarfs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { unsigned int i_flags = 0; diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 4025f875252a..cf97b76e9fd3 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -750,9 +750,9 @@ extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); /* ioctl.c */ -extern int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa); +extern int ext2_fileattr_get(struct dentry *dentry, struct file_kattr *fa); extern int ext2_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); extern long ext2_ioctl(struct file *, unsigned int, unsigned long); extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 44e04484e570..c3fea55b8efa 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -18,7 +18,7 @@ #include #include -int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int ext2_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct ext2_inode_info *ei = EXT2_I(d_inode(dentry)); @@ -28,7 +28,7 @@ int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa) } int ext2_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct ext2_inode_info *ei = EXT2_I(inode); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 18373de980f2..7d962e7f388a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3103,8 +3103,8 @@ extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); int ext4_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); -int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); +int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa); extern void ext4_reset_inode_seed(struct inode *inode); int ext4_update_overhead(struct super_block *sb, bool force); int ext4_force_shutdown(struct super_block *sb, u32 flags); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 5668a17458ae..84e3c73952d7 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -980,7 +980,7 @@ group_add_out: return err; } -int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct 
ext4_inode_info *ei = EXT4_I(inode); @@ -997,7 +997,7 @@ int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa) } int ext4_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); u32 flags = fa->flags; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9333a22b9a01..c78464792ceb 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3615,9 +3615,9 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count); int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag, bool readonly, bool need_lock); int f2fs_precache_extents(struct inode *inode); -int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int f2fs_fileattr_get(struct dentry *dentry, struct file_kattr *fa); int f2fs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6bd3de64f2a8..90180ca22abd 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3356,7 +3356,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) } #endif -int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int f2fs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -3380,7 +3380,7 @@ int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa) } int f2fs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); u32 fsflags = fa->flags, mask = F2FS_SETTABLE_FS_FL; diff --git a/fs/file_attr.c b/fs/file_attr.c index 21d6a0607345..17745c89e2be 100644 --- a/fs/file_attr.c +++ b/fs/file_attr.c @@ -17,7 +17,7 @@ * Set ->fsx_xflags, ->fsx_valid and ->flags (translated xflags). All * other fields are zeroed. */ -void fileattr_fill_xflags(struct fileattr *fa, u32 xflags) +void fileattr_fill_xflags(struct file_kattr *fa, u32 xflags) { memset(fa, 0, sizeof(*fa)); fa->fsx_valid = true; @@ -47,7 +47,7 @@ EXPORT_SYMBOL(fileattr_fill_xflags); * Set ->flags, ->flags_valid and ->fsx_xflags (translated flags). * All other fields are zeroed. */ -void fileattr_fill_flags(struct fileattr *fa, u32 flags) +void fileattr_fill_flags(struct file_kattr *fa, u32 flags) { memset(fa, 0, sizeof(*fa)); fa->flags_valid = true; @@ -78,7 +78,7 @@ EXPORT_SYMBOL(fileattr_fill_flags); * * Return: 0 on success, or a negative error on failure. */ -int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int vfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); int error; @@ -94,7 +94,7 @@ int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) } EXPORT_SYMBOL(vfs_fileattr_get); -static void fileattr_to_file_attr(const struct fileattr *fa, +static void fileattr_to_file_attr(const struct file_kattr *fa, struct file_attr *fattr) { __u32 mask = FS_XFLAGS_MASK; @@ -114,7 +114,7 @@ static void fileattr_to_file_attr(const struct fileattr *fa, * * Return: 0 on success, or -EFAULT on failure. 
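As a hedged sketch of the get side after the rename (foofs_* is a made-up filesystem; the helper and its semantics are the fileattr_fill_flags() shown above), a flags-only fileattr_get implementation now looks like this:

	static int foofs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
	{
		struct inode *inode = d_inode(dentry);
		u32 flags = 0;

		if (IS_IMMUTABLE(inode))
			flags |= FS_IMMUTABLE_FL;
		if (IS_APPEND(inode))
			flags |= FS_APPEND_FL;

		/* sets ->flags/->flags_valid and translated ->fsx_xflags, zeroes the rest */
		fileattr_fill_flags(fa, flags);
		return 0;
	}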
*/ -int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa) +int copy_fsxattr_to_user(const struct file_kattr *fa, struct fsxattr __user *ufa) { struct fsxattr xfa; __u32 mask = FS_XFLAGS_MASK; @@ -134,7 +134,7 @@ int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa) EXPORT_SYMBOL(copy_fsxattr_to_user); static int file_attr_to_fileattr(const struct file_attr *fattr, - struct fileattr *fa) + struct file_kattr *fa) { __u32 mask = FS_XFLAGS_MASK; @@ -150,7 +150,7 @@ static int file_attr_to_fileattr(const struct file_attr *fattr, return 0; } -static int copy_fsxattr_from_user(struct fileattr *fa, +static int copy_fsxattr_from_user(struct file_kattr *fa, struct fsxattr __user *ufa) { struct fsxattr xfa; @@ -179,8 +179,8 @@ static int copy_fsxattr_from_user(struct fileattr *fa, * Note: must be called with inode lock held. */ static int fileattr_set_prepare(struct inode *inode, - const struct fileattr *old_ma, - struct fileattr *fa) + const struct file_kattr *old_ma, + struct file_kattr *fa) { int err; @@ -263,10 +263,10 @@ static int fileattr_set_prepare(struct inode *inode, * Return: 0 on success, or a negative error on failure. */ int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, - struct fileattr *fa) + struct file_kattr *fa) { struct inode *inode = d_inode(dentry); - struct fileattr old_ma = {}; + struct file_kattr old_ma = {}; int err; if (!inode->i_op->fileattr_set) @@ -308,7 +308,7 @@ EXPORT_SYMBOL(vfs_fileattr_set); int ioctl_getflags(struct file *file, unsigned int __user *argp) { - struct fileattr fa = { .flags_valid = true }; /* hint only */ + struct file_kattr fa = { .flags_valid = true }; /* hint only */ int err; err = vfs_fileattr_get(file->f_path.dentry, &fa); @@ -324,7 +324,7 @@ int ioctl_setflags(struct file *file, unsigned int __user *argp) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct dentry *dentry = file->f_path.dentry; - struct fileattr fa; + struct file_kattr fa; unsigned int flags; int err; @@ -345,7 +345,7 @@ EXPORT_SYMBOL(ioctl_setflags); int ioctl_fsgetxattr(struct file *file, void __user *argp) { - struct fileattr fa = { .fsx_valid = true }; /* hint only */ + struct file_kattr fa = { .fsx_valid = true }; /* hint only */ int err; err = vfs_fileattr_get(file->f_path.dentry, &fa); @@ -362,7 +362,7 @@ int ioctl_fssetxattr(struct file *file, void __user *argp) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct dentry *dentry = file->f_path.dentry; - struct fileattr fa; + struct file_kattr fa; int err; err = copy_fsxattr_from_user(&fa, argp); @@ -387,7 +387,7 @@ SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename, struct filename *name __free(putname) = NULL; unsigned int lookup_flags = 0; struct file_attr fattr; - struct fileattr fa; + struct file_kattr fa; int error; BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0); @@ -442,7 +442,7 @@ SYSCALL_DEFINE5(file_setattr, int, dfd, const char __user *, filename, struct filename *name __free(putname) = NULL; unsigned int lookup_flags = 0; struct file_attr fattr; - struct fileattr fa; + struct file_kattr fa; int error; BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index b54f4f57789f..501f64ceeab3 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1486,9 +1486,9 @@ void fuse_dax_cancel_work(struct fuse_conn *fc); long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long fuse_file_compat_ioctl(struct file *file, unsigned int 
cmd, unsigned long arg); -int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int fuse_fileattr_get(struct dentry *dentry, struct file_kattr *fa); int fuse_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); /* iomode.c */ int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff); diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index f2692f7d5932..57032eadca6c 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -502,7 +502,7 @@ static void fuse_priv_ioctl_cleanup(struct inode *inode, struct fuse_file *ff) fuse_file_release(inode, ff, O_RDONLY, NULL, S_ISDIR(inode->i_mode)); } -int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int fuse_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct fuse_file *ff; @@ -542,7 +542,7 @@ cleanup: } int fuse_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct fuse_file *ff; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index fd1147aa3891..65f4371f428c 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -155,7 +155,7 @@ static inline u32 gfs2_gfsflags_to_fsflags(struct inode *inode, u32 gfsflags) return fsflags; } -int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int gfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); @@ -276,7 +276,7 @@ out: } int gfs2_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); u32 fsflags = fa->flags, gfsflags = 0; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index eafe123617e6..dd970e644fe0 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -107,9 +107,9 @@ loff_t gfs2_seek_hole(struct file *file, loff_t offset); extern const struct file_operations gfs2_file_fops_nolock; extern const struct file_operations gfs2_dir_fops_nolock; -int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int gfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa); int gfs2_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); void gfs2_set_inode_flags(struct inode *inode); #ifdef CONFIG_GFS2_FS_LOCKING_DLM diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 2f089bff0095..927db2b8b17c 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -489,9 +489,9 @@ int hfsplus_getattr(struct mnt_idmap *idmap, const struct path *path, unsigned int query_flags); int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); -int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int hfsplus_fileattr_get(struct dentry *dentry, struct file_kattr *fa); int hfsplus_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); /* ioctl.c */ long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index f331e9574217..3ec0b33808c0 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -654,7 +654,7 @@ out: return res; } -int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int hfsplus_fileattr_get(struct 
dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct hfsplus_inode_info *hip = HFSPLUS_I(inode); @@ -673,7 +673,7 @@ int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa) } int hfsplus_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct hfsplus_inode_info *hip = HFSPLUS_I(inode); diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index f7bd7e8f5be4..563f148be8af 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -57,7 +57,7 @@ static long jfs_map_ext2(unsigned long flags, int from) return mapped; } -int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int jfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct jfs_inode_info *jfs_inode = JFS_IP(d_inode(dentry)); unsigned int flags = jfs_inode->mode2 & JFS_FL_USER_VISIBLE; @@ -71,7 +71,7 @@ int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) } int jfs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct jfs_inode_info *jfs_inode = JFS_IP(inode); diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index ea80661597ac..2c6c81c8cb9f 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -9,9 +9,9 @@ struct fid; extern struct inode *ialloc(struct inode *, umode_t); extern int jfs_fsync(struct file *, loff_t, loff_t, int); -extern int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +extern int jfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa); extern int jfs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); extern long jfs_ioctl(struct file *, unsigned int, unsigned long); extern struct inode *jfs_iget(struct super_block *, unsigned long); extern int jfs_commit_inode(struct inode *, int); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index a66d62a51f77..3288c3b4be9e 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -118,7 +118,7 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, * * Return: always 0 as success. */ -int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int nilfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); @@ -136,7 +136,7 @@ int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) * Return: 0 on success, or a negative error code on failure. 
*/ int nilfs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct nilfs_transaction_info ti; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index cb6ed54accd7..f466daa39440 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -268,9 +268,9 @@ int nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, extern int nilfs_sync_file(struct file *, loff_t, loff_t, int); /* ioctl.c */ -int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *m); +int nilfs_fileattr_get(struct dentry *dentry, struct file_kattr *m); int nilfs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); long nilfs_ioctl(struct file *, unsigned int, unsigned long); long nilfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *, diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 7ae96fb8807a..db14c92302a1 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -62,7 +62,7 @@ static inline int o2info_coherent(struct ocfs2_info_request *req) return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT)); } -int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int ocfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); unsigned int flags; @@ -83,7 +83,7 @@ int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa) } int ocfs2_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); unsigned int flags = fa->flags; diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h index 48a5fdfe87a1..4a1c2313b429 100644 --- a/fs/ocfs2/ioctl.h +++ b/fs/ocfs2/ioctl.h @@ -11,9 +11,9 @@ #ifndef OCFS2_IOCTL_PROTO_H #define OCFS2_IOCTL_PROTO_H -int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int ocfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa); int ocfs2_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 08a6f372a352..926d1659902d 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -887,7 +887,7 @@ int orangefs_update_time(struct inode *inode, int flags) return __orangefs_setattr(inode, &iattr); } -static int orangefs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +static int orangefs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { u64 val = 0; int ret; @@ -908,7 +908,7 @@ static int orangefs_fileattr_get(struct dentry *dentry, struct fileattr *fa) } static int orangefs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { u64 val = 0; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 2c646b7076d0..74817e1ece19 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -171,8 +171,8 @@ out: static int ovl_copy_fileattr(struct inode *inode, const struct path *old, const struct path *new) { - struct fileattr oldfa = { .flags_valid = true }; - struct fileattr newfa = { .flags_valid = true }; + 
struct file_kattr oldfa = { .flags_valid = true }; + struct file_kattr newfa = { .flags_valid = true }; int err; err = ovl_real_fileattr_get(old, &oldfa); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index cf3581dc1034..ecb9f2019395 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -610,7 +610,7 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * Introducing security_inode_fileattr_get/set() hooks would solve this issue * properly. */ -static int ovl_security_fileattr(const struct path *realpath, struct fileattr *fa, +static int ovl_security_fileattr(const struct path *realpath, struct file_kattr *fa, bool set) { struct file *file; @@ -637,7 +637,7 @@ static int ovl_security_fileattr(const struct path *realpath, struct fileattr *f return err; } -int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa) +int ovl_real_fileattr_set(const struct path *realpath, struct file_kattr *fa) { int err; @@ -649,7 +649,7 @@ int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa) } int ovl_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct path upperpath; @@ -697,7 +697,7 @@ out: } /* Convert inode protection flags to fileattr flags */ -static void ovl_fileattr_prot_flags(struct inode *inode, struct fileattr *fa) +static void ovl_fileattr_prot_flags(struct inode *inode, struct file_kattr *fa) { BUILD_BUG_ON(OVL_PROT_FS_FLAGS_MASK & ~FS_COMMON_FL); BUILD_BUG_ON(OVL_PROT_FSX_FLAGS_MASK & ~FS_XFLAG_COMMON); @@ -712,7 +712,7 @@ static void ovl_fileattr_prot_flags(struct inode *inode, struct fileattr *fa) } } -int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa) +int ovl_real_fileattr_get(const struct path *realpath, struct file_kattr *fa) { int err; @@ -723,7 +723,7 @@ int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa) return vfs_fileattr_get(realpath->dentry, fa); } -int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int ovl_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct path realpath; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 8baaba0a3fe5..e19d91f22186 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -815,7 +815,7 @@ void ovl_copyattr(struct inode *to); void ovl_check_protattr(struct inode *inode, struct dentry *upper); int ovl_set_protattr(struct inode *inode, struct dentry *upper, - struct fileattr *fa); + struct file_kattr *fa); static inline void ovl_copyflags(struct inode *from, struct inode *to) { @@ -847,11 +847,11 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, /* file.c */ extern const struct file_operations ovl_file_operations; -int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa); -int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa); -int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int ovl_real_fileattr_get(const struct path *realpath, struct file_kattr *fa); +int ovl_real_fileattr_set(const struct path *realpath, struct file_kattr *fa); +int ovl_fileattr_get(struct dentry *dentry, struct file_kattr *fa); int ovl_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); struct ovl_file; struct ovl_file *ovl_file_alloc(struct file 
*realfile); void ovl_file_free(struct ovl_file *of); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index dcccb4b4a66c..607860f199a8 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -959,7 +959,7 @@ void ovl_check_protattr(struct inode *inode, struct dentry *upper) } int ovl_set_protattr(struct inode *inode, struct dentry *upper, - struct fileattr *fa) + struct file_kattr *fa) { struct ovl_fs *ofs = OVL_FS(inode->i_sb); char buf[OVL_PROTATTR_MAX]; diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 2c99349cf537..79536b2e3d7a 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -130,7 +130,7 @@ static int setflags(struct inode *inode, int flags) return err; } -int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int ubifs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); int flags = ubifs2ioctl(ubifs_inode(inode)->flags); @@ -145,7 +145,7 @@ int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa) } int ubifs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); int flags = fa->flags; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 256dbaeeb0de..5db45c9e26ee 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -2073,9 +2073,9 @@ int ubifs_recover_size(struct ubifs_info *c, bool in_place); void ubifs_destroy_size_tree(struct ubifs_info *c); /* ioctl.c */ -int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int ubifs_fileattr_get(struct dentry *dentry, struct file_kattr *fa); int ubifs_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); void ubifs_set_inode_flags(struct inode *inode); #ifdef CONFIG_COMPAT diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index d250f7f74e3b..911b68175ad8 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -444,7 +444,7 @@ static void xfs_fill_fsxattr( struct xfs_inode *ip, int whichfork, - struct fileattr *fa) + struct file_kattr *fa) { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); @@ -496,7 +496,7 @@ xfs_ioc_fsgetxattra( xfs_inode_t *ip, void __user *arg) { - struct fileattr fa; + struct file_kattr fa; xfs_ilock(ip, XFS_ILOCK_SHARED); xfs_fill_fsxattr(ip, XFS_ATTR_FORK, &fa); @@ -508,7 +508,7 @@ xfs_ioc_fsgetxattra( int xfs_fileattr_get( struct dentry *dentry, - struct fileattr *fa) + struct file_kattr *fa) { struct xfs_inode *ip = XFS_I(d_inode(dentry)); @@ -526,7 +526,7 @@ static int xfs_ioctl_setattr_xflags( struct xfs_trans *tp, struct xfs_inode *ip, - struct fileattr *fa) + struct file_kattr *fa) { struct xfs_mount *mp = ip->i_mount; bool rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME); @@ -582,7 +582,7 @@ xfs_ioctl_setattr_xflags( static void xfs_ioctl_setattr_prepare_dax( struct xfs_inode *ip, - struct fileattr *fa) + struct file_kattr *fa) { struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); @@ -642,7 +642,7 @@ out_error: static int xfs_ioctl_setattr_check_extsize( struct xfs_inode *ip, - struct fileattr *fa) + struct file_kattr *fa) { struct xfs_mount *mp = ip->i_mount; xfs_failaddr_t failaddr; @@ -684,7 +684,7 @@ xfs_ioctl_setattr_check_extsize( static int xfs_ioctl_setattr_check_cowextsize( struct xfs_inode *ip, - struct fileattr *fa) + struct file_kattr *fa) { struct xfs_mount *mp 
= ip->i_mount; xfs_failaddr_t failaddr; @@ -709,7 +709,7 @@ xfs_ioctl_setattr_check_cowextsize( static int xfs_ioctl_setattr_check_projid( struct xfs_inode *ip, - struct fileattr *fa) + struct file_kattr *fa) { if (!fa->fsx_valid) return 0; @@ -725,7 +725,7 @@ int xfs_fileattr_set( struct mnt_idmap *idmap, struct dentry *dentry, - struct fileattr *fa) + struct file_kattr *fa) { struct xfs_inode *ip = XFS_I(d_inode(dentry)); struct xfs_mount *mp = ip->i_mount; diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 12124946f347..f5ed5cf9d3df 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -17,13 +17,13 @@ xfs_ioc_swapext( extern int xfs_fileattr_get( struct dentry *dentry, - struct fileattr *fa); + struct file_kattr *fa); extern int xfs_fileattr_set( struct mnt_idmap *idmap, struct dentry *dentry, - struct fileattr *fa); + struct file_kattr *fa); extern long xfs_file_ioctl( diff --git a/include/linux/fileattr.h b/include/linux/fileattr.h index e2a2f4ae242d..f89dcfad3f8f 100644 --- a/include/linux/fileattr.h +++ b/include/linux/fileattr.h @@ -40,7 +40,7 @@ * is handled by the VFS helpers, so filesystems are free to implement just one * or both of these sub-interfaces. */ -struct fileattr { +struct file_kattr { u32 flags; /* flags (FS_IOC_GETFLAGS/FS_IOC_SETFLAGS) */ /* struct fsxattr: */ u32 fsx_xflags; /* xflags field value (get/set) */ @@ -53,10 +53,10 @@ struct fileattr { bool fsx_valid:1; }; -int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa); +int copy_fsxattr_to_user(const struct file_kattr *fa, struct fsxattr __user *ufa); -void fileattr_fill_xflags(struct fileattr *fa, u32 xflags); -void fileattr_fill_flags(struct fileattr *fa, u32 flags); +void fileattr_fill_xflags(struct file_kattr *fa, u32 xflags); +void fileattr_fill_flags(struct file_kattr *fa, u32 flags); /** * fileattr_has_fsx - check for extended flags/attributes @@ -65,16 +65,16 @@ void fileattr_fill_flags(struct fileattr *fa, u32 flags); * Return: true if any attributes are present that are not represented in * ->flags. 
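Conversely, on the set side fileattr_has_fsx() lets a filesystem that only supports the classic flags interface reject extended attributes up front; a hedged sketch with the same hypothetical foofs:

	static int foofs_fileattr_set(struct mnt_idmap *idmap,
				      struct dentry *dentry, struct file_kattr *fa)
	{
		/* anything beyond plain FS_IOC_SETFLAGS-style flags is unsupported */
		if (fileattr_has_fsx(fa))
			return -EOPNOTSUPP;

		/* ... validate and apply fa->flags to the inode here ... */
		return 0;
	}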
*/ -static inline bool fileattr_has_fsx(const struct fileattr *fa) +static inline bool fileattr_has_fsx(const struct file_kattr *fa) { return fa->fsx_valid && ((fa->fsx_xflags & ~FS_XFLAG_COMMON) || fa->fsx_extsize != 0 || fa->fsx_projid != 0 || fa->fsx_cowextsize != 0); } -int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int vfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa); int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, - struct fileattr *fa); + struct file_kattr *fa); int ioctl_getflags(struct file *file, unsigned int __user *argp); int ioctl_setflags(struct file *file, unsigned int __user *argp); int ioctl_fsgetxattr(struct file *file, void __user *argp); diff --git a/include/linux/fs.h b/include/linux/fs.h index 96c7925a6551..0c58617645ea 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -80,7 +80,7 @@ struct fsnotify_mark_connector; struct fsnotify_sb_info; struct fs_context; struct fs_parameter_spec; -struct fileattr; +struct file_kattr; struct iomap_ops; extern void __init inode_init(void); @@ -2254,8 +2254,8 @@ struct inode_operations { int (*set_acl)(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); int (*fileattr_set)(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); - int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); + int (*fileattr_get)(struct dentry *dentry, struct file_kattr *fa); struct offset_ctx *(*get_offset_ctx)(struct inode *inode); } ____cacheline_aligned; diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 9600a4350e79..fd11fffdd3c3 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -157,8 +157,8 @@ LSM_HOOK(int, 0, inode_removexattr, struct mnt_idmap *idmap, struct dentry *dentry, const char *name) LSM_HOOK(void, LSM_RET_VOID, inode_post_removexattr, struct dentry *dentry, const char *name) -LSM_HOOK(int, 0, inode_file_setattr, struct dentry *dentry, struct fileattr *fa) -LSM_HOOK(int, 0, inode_file_getattr, struct dentry *dentry, struct fileattr *fa) +LSM_HOOK(int, 0, inode_file_setattr, struct dentry *dentry, struct file_kattr *fa) +LSM_HOOK(int, 0, inode_file_getattr, struct dentry *dentry, struct file_kattr *fa) LSM_HOOK(int, 0, inode_set_acl, struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) LSM_HOOK(void, LSM_RET_VOID, inode_post_set_acl, struct dentry *dentry, diff --git a/include/linux/security.h b/include/linux/security.h index 9ed0d0e0c81f..b95b5540c429 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -452,9 +452,9 @@ int security_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name); void security_inode_post_removexattr(struct dentry *dentry, const char *name); int security_inode_file_setattr(struct dentry *dentry, - struct fileattr *fa); + struct file_kattr *fa); int security_inode_file_getattr(struct dentry *dentry, - struct fileattr *fa); + struct file_kattr *fa); int security_inode_need_killpriv(struct dentry *dentry); int security_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry); int security_inode_getsecurity(struct mnt_idmap *idmap, @@ -1057,13 +1057,13 @@ static inline void security_inode_post_removexattr(struct dentry *dentry, { } static inline int security_inode_file_setattr(struct dentry *dentry, - struct fileattr *fa) + struct file_kattr *fa) { return 0; } static inline int security_inode_file_getattr(struct dentry 
*dentry, - struct fileattr *fa) + struct file_kattr *fa) { return 0; } diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 9663dbdda181..6e136c9c6a22 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -151,7 +151,7 @@ struct fsxattr { /* * Variable size structure for file_[sg]et_attr(). * - * Note. This is alternative to the structure 'struct fileattr'/'struct fsxattr'. + * Note. This is alternative to the structure 'struct file_kattr'/'struct fsxattr'. * As this structure is passed to/from userspace with its size, this can * be versioned based on the size. */ diff --git a/mm/shmem.c b/mm/shmem.c index 0c5fb4ffa03a..6311fe35c577 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4183,7 +4183,7 @@ static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, #ifdef CONFIG_TMPFS_XATTR -static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) +static int shmem_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); @@ -4193,7 +4193,7 @@ static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) } static int shmem_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); diff --git a/security/security.c b/security/security.c index 711b4de40b8d..a5766cbf6f7c 100644 --- a/security/security.c +++ b/security/security.c @@ -2632,7 +2632,7 @@ void security_inode_post_removexattr(struct dentry *dentry, const char *name) * * Return: Returns 0 if permission is granted. */ -int security_inode_file_setattr(struct dentry *dentry, struct fileattr *fa) +int security_inode_file_setattr(struct dentry *dentry, struct file_kattr *fa) { return call_int_hook(inode_file_setattr, dentry, fa); } @@ -2647,7 +2647,7 @@ int security_inode_file_setattr(struct dentry *dentry, struct fileattr *fa) * * Return: Returns 0 if permission is granted. */ -int security_inode_file_getattr(struct dentry *dentry, struct fileattr *fa) +int security_inode_file_getattr(struct dentry *dentry, struct file_kattr *fa) { return call_int_hook(inode_file_getattr, dentry, fa); } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index be7aca2269fa..0dadce2267c1 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3481,13 +3481,13 @@ static int selinux_inode_removexattr(struct mnt_idmap *idmap, } static int selinux_inode_file_setattr(struct dentry *dentry, - struct fileattr *fa) + struct file_kattr *fa) { return dentry_has_perm(current_cred(), dentry, FILE__SETATTR); } static int selinux_inode_file_getattr(struct dentry *dentry, - struct fileattr *fa) + struct file_kattr *fa) { return dentry_has_perm(current_cred(), dentry, FILE__GETATTR); } -- cgit v1.2.3 From cdd73b1666079a73d061396f361df55d59fe96e6 Mon Sep 17 00:00:00 2001 From: Ariel Otilibili Date: Wed, 2 Jul 2025 12:00:21 +0200 Subject: uapi: fix broken link in linux/capability.h The link to the libcap library is outdated. Instead, use a link to the libcap2 library. Also, give the complete reference for the POSIX compliance. Signed-off-by: Ariel Otilibili Acked-by: Andrew G.
Morgan Reviewed-by: Paul Moore Signed-off-by: Serge Hallyn --- include/uapi/linux/capability.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 2e21b5594f81..ea5a0899ecf0 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -6,9 +6,10 @@ * Alexander Kjeldaas * with help from Aleph1, Roland Buresund and Andrew Main. * - * See here for the libcap library ("POSIX draft" compliance): + * See here for the libcap2 library (compliant with Section 25 of + * the withdrawn POSIX 1003.1e Draft 17): * - * ftp://www.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.6/ + * https://www.kernel.org/pub/linux/libs/security/linux-privs/libcap2/ */ #ifndef _UAPI_LINUX_CAPABILITY_H -- cgit v1.2.3 From 9c06f26ba5f5da14bcac405c7a652dcf578a785d Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Wed, 30 Apr 2025 13:56:01 +0200 Subject: pwm: Add support for pwmchip devices for faster and easier userspace access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With this change each pwmchip defining the new-style waveform callbacks can be accessed from userspace via a character device. Compared to the sysfs-API this is faster and allows passing the whole configuration in a single ioctl, allowing atomic application and thus reducing glitches. On an STM32MP13 I see: root@DistroKit:~ time pwmtestperf real 0m 1.27s user 0m 0.02s sys 0m 1.21s root@DistroKit:~ rm /dev/pwmchip0 root@DistroKit:~ time pwmtestperf real 0m 3.61s user 0m 0.27s sys 0m 3.26s pwmtestperf does essentially: for i in 0 .. 50000: pwm_set_waveform(duty_length_ns=i, period_length_ns=50000, duty_offset_ns=0) and in the presence of /dev/pwmchip0 it uses the ioctls introduced here, without that device it uses /sys/class/pwm/pwmchip0. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/ad4a4e49ae3f8ea81e23cac1ac12b338c3bf5c5b.1746010245.git.u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/core.c | 322 ++++++++++++++++++++++++++++++++++++++++++++--- include/linux/pwm.h | 3 + include/uapi/linux/pwm.h | 53 ++++++++ 3 files changed, 363 insertions(+), 15 deletions(-) create mode 100644 include/uapi/linux/pwm.h (limited to 'include/uapi/linux') diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index edf776b8ad53..50aa0528a265 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -23,9 +23,13 @@ #include +#include + #define CREATE_TRACE_POINTS #include +#define PWM_MINOR_COUNT 256 + /* protects access to pwm_chips */ static DEFINE_MUTEX(pwm_lock); @@ -2007,20 +2011,9 @@ struct pwm_device *pwm_get(struct device *dev, const char *con_id) } EXPORT_SYMBOL_GPL(pwm_get); -/** - * pwm_put() - release a PWM device - * @pwm: PWM device - */ -void pwm_put(struct pwm_device *pwm) +static void __pwm_put(struct pwm_device *pwm) { - struct pwm_chip *chip; - - if (!pwm) - return; - - chip = pwm->chip; - - guard(mutex)(&pwm_lock); + struct pwm_chip *chip = pwm->chip; /* * Trigger a warning if a consumer called pwm_put() twice.
@@ -2041,6 +2034,20 @@ void pwm_put(struct pwm_device *pwm) module_put(chip->owner); } + +/** + * pwm_put() - release a PWM device + * @pwm: PWM device + */ +void pwm_put(struct pwm_device *pwm) +{ + if (!pwm) + return; + + guard(mutex)(&pwm_lock); + + __pwm_put(pwm); +} EXPORT_SYMBOL_GPL(pwm_put); static void devm_pwm_release(void *pwm) @@ -2110,6 +2117,274 @@ struct pwm_device *devm_fwnode_pwm_get(struct device *dev, } EXPORT_SYMBOL_GPL(devm_fwnode_pwm_get); +struct pwm_cdev_data { + struct pwm_chip *chip; + struct pwm_device *pwm[]; +}; + +static int pwm_cdev_open(struct inode *inode, struct file *file) +{ + struct pwm_chip *chip = container_of(inode->i_cdev, struct pwm_chip, cdev); + struct pwm_cdev_data *cdata; + + guard(mutex)(&pwm_lock); + + if (!chip->operational) + return -ENXIO; + + cdata = kzalloc(struct_size(cdata, pwm, chip->npwm), GFP_KERNEL); + if (!cdata) + return -ENOMEM; + + cdata->chip = chip; + + file->private_data = cdata; + + return nonseekable_open(inode, file); +} + +static int pwm_cdev_release(struct inode *inode, struct file *file) +{ + struct pwm_cdev_data *cdata = file->private_data; + unsigned int i; + + for (i = 0; i < cdata->chip->npwm; ++i) { + struct pwm_device *pwm = cdata->pwm[i]; + + if (pwm) { + const char *label = pwm->label; + + pwm_put(cdata->pwm[i]); + kfree(label); + } + } + kfree(cdata); + + return 0; +} + +static int pwm_cdev_request(struct pwm_cdev_data *cdata, unsigned int hwpwm) +{ + struct pwm_chip *chip = cdata->chip; + + if (hwpwm >= chip->npwm) + return -EINVAL; + + if (!cdata->pwm[hwpwm]) { + struct pwm_device *pwm = &chip->pwms[hwpwm]; + const char *label; + int ret; + + label = kasprintf(GFP_KERNEL, "pwm-cdev (pid=%d)", current->pid); + if (!label) + return -ENOMEM; + + ret = pwm_device_request(pwm, label); + if (ret < 0) { + kfree(label); + return ret; + } + + cdata->pwm[hwpwm] = pwm; + } + + return 0; +} + +static int pwm_cdev_free(struct pwm_cdev_data *cdata, unsigned int hwpwm) +{ + struct pwm_chip *chip = cdata->chip; + + if (hwpwm >= chip->npwm) + return -EINVAL; + + if (cdata->pwm[hwpwm]) { + struct pwm_device *pwm = cdata->pwm[hwpwm]; + const char *label = pwm->label; + + __pwm_put(pwm); + + kfree(label); + + cdata->pwm[hwpwm] = NULL; + } + + return 0; +} + +static struct pwm_device *pwm_cdev_get_requested_pwm(struct pwm_cdev_data *cdata, + u32 hwpwm) +{ + struct pwm_chip *chip = cdata->chip; + + if (hwpwm >= chip->npwm) + return ERR_PTR(-EINVAL); + + if (cdata->pwm[hwpwm]) + return cdata->pwm[hwpwm]; + + return ERR_PTR(-EINVAL); +} + +static long pwm_cdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int ret = 0; + struct pwm_cdev_data *cdata = file->private_data; + struct pwm_chip *chip = cdata->chip; + + guard(mutex)(&pwm_lock); + + if (!chip->operational) + return -ENODEV; + + switch (cmd) { + case PWM_IOCTL_REQUEST: + { + unsigned int hwpwm = arg; + + return pwm_cdev_request(cdata, hwpwm); + } + + case PWM_IOCTL_FREE: + { + unsigned int hwpwm = arg; + + return pwm_cdev_free(cdata, hwpwm); + } + + case PWM_IOCTL_ROUNDWF: + { + struct pwmchip_waveform cwf; + struct pwm_waveform wf; + struct pwm_device *pwm; + + ret = copy_from_user(&cwf, + (struct pwmchip_waveform __user *)arg, + sizeof(cwf)); + if (ret) + return -EFAULT; + + if (cwf.__pad != 0) + return -EINVAL; + + pwm = pwm_cdev_get_requested_pwm(cdata, cwf.hwpwm); + if (IS_ERR(pwm)) + return PTR_ERR(pwm); + + wf = (struct pwm_waveform) { + .period_length_ns = cwf.period_length_ns, + .duty_length_ns = cwf.duty_length_ns, + .duty_offset_ns = 
cwf.duty_offset_ns, + }; + + ret = pwm_round_waveform_might_sleep(pwm, &wf); + if (ret < 0) + return ret; + + cwf = (struct pwmchip_waveform) { + .hwpwm = cwf.hwpwm, + .period_length_ns = wf.period_length_ns, + .duty_length_ns = wf.duty_length_ns, + .duty_offset_ns = wf.duty_offset_ns, + }; + + return copy_to_user((struct pwmchip_waveform __user *)arg, + &cwf, sizeof(cwf)); + } + + case PWM_IOCTL_GETWF: + { + struct pwmchip_waveform cwf; + struct pwm_waveform wf; + struct pwm_device *pwm; + + ret = copy_from_user(&cwf, + (struct pwmchip_waveform __user *)arg, + sizeof(cwf)); + if (ret) + return -EFAULT; + + if (cwf.__pad != 0) + return -EINVAL; + + pwm = pwm_cdev_get_requested_pwm(cdata, cwf.hwpwm); + if (IS_ERR(pwm)) + return PTR_ERR(pwm); + + ret = pwm_get_waveform_might_sleep(pwm, &wf); + if (ret) + return ret; + + cwf = (struct pwmchip_waveform) { + .hwpwm = cwf.hwpwm, + .period_length_ns = wf.period_length_ns, + .duty_length_ns = wf.duty_length_ns, + .duty_offset_ns = wf.duty_offset_ns, + }; + + return copy_to_user((struct pwmchip_waveform __user *)arg, + &cwf, sizeof(cwf)); + } + + case PWM_IOCTL_SETROUNDEDWF: + case PWM_IOCTL_SETEXACTWF: + { + struct pwmchip_waveform cwf; + struct pwm_waveform wf; + struct pwm_device *pwm; + + ret = copy_from_user(&cwf, + (struct pwmchip_waveform __user *)arg, + sizeof(cwf)); + if (ret) + return -EFAULT; + + if (cwf.__pad != 0) + return -EINVAL; + + wf = (struct pwm_waveform){ + .period_length_ns = cwf.period_length_ns, + .duty_length_ns = cwf.duty_length_ns, + .duty_offset_ns = cwf.duty_offset_ns, + }; + + if (!pwm_wf_valid(&wf)) + return -EINVAL; + + pwm = pwm_cdev_get_requested_pwm(cdata, cwf.hwpwm); + if (IS_ERR(pwm)) + return PTR_ERR(pwm); + + ret = pwm_set_waveform_might_sleep(pwm, &wf, + cmd == PWM_IOCTL_SETEXACTWF); + + /* + * If userspace cares about rounding deviations it has + * to check the values anyhow, so simplify handling for + * them and don't signal uprounding. This matches the + * behaviour of PWM_IOCTL_ROUNDWF which also returns 0 + * in that case. 
+ */ + if (ret == 1) + ret = 0; + + return ret; + } + + default: + return -ENOTTY; + } +} + +static const struct file_operations pwm_cdev_fileops = { + .open = pwm_cdev_open, + .release = pwm_cdev_release, + .owner = THIS_MODULE, + .unlocked_ioctl = pwm_cdev_ioctl, +}; + +static dev_t pwm_devt; + /** * __pwmchip_add() - register a new PWM chip * @chip: the PWM chip to add @@ -2162,7 +2437,17 @@ int __pwmchip_add(struct pwm_chip *chip, struct module *owner) scoped_guard(pwmchip, chip) chip->operational = true; - ret = device_add(&chip->dev); + if (chip->ops->write_waveform) { + if (chip->id < PWM_MINOR_COUNT) + chip->dev.devt = MKDEV(MAJOR(pwm_devt), chip->id); + else + dev_warn(&chip->dev, "chip id too high to create a chardev\n"); + } + + cdev_init(&chip->cdev, &pwm_cdev_fileops); + chip->cdev.owner = owner; + + ret = cdev_device_add(&chip->cdev, &chip->dev); if (ret) goto err_device_add; @@ -2213,7 +2498,7 @@ void pwmchip_remove(struct pwm_chip *chip) idr_remove(&pwm_chips, chip->id); } - device_del(&chip->dev); + cdev_device_del(&chip->cdev, &chip->dev); } EXPORT_SYMBOL_GPL(pwmchip_remove); @@ -2357,9 +2642,16 @@ static int __init pwm_init(void) { int ret; + ret = alloc_chrdev_region(&pwm_devt, 0, PWM_MINOR_COUNT, "pwm"); + if (ret) { + pr_err("Failed to initialize chrdev region for PWM usage\n"); + return ret; + } + ret = class_register(&pwm_class); if (ret) { pr_err("Failed to initialize PWM class (%pe)\n", ERR_PTR(ret)); + unregister_chrdev_region(pwm_devt, 256); return ret; } diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 63a17d2b4ec8..2492c91452f9 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -2,6 +2,7 @@ #ifndef __LINUX_PWM_H #define __LINUX_PWM_H +#include #include #include #include @@ -311,6 +312,7 @@ struct pwm_ops { /** * struct pwm_chip - abstract a PWM controller * @dev: device providing the PWMs + * @cdev: &struct cdev for this device * @ops: callbacks for this PWM controller * @owner: module providing this chip * @id: unique number of this PWM chip @@ -325,6 +327,7 @@ struct pwm_ops { */ struct pwm_chip { struct device dev; + struct cdev cdev; const struct pwm_ops *ops; struct module *owner; unsigned int id; diff --git a/include/uapi/linux/pwm.h b/include/uapi/linux/pwm.h new file mode 100644 index 000000000000..182d59cc07ee --- /dev/null +++ b/include/uapi/linux/pwm.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ + +#ifndef _UAPI_PWM_H_ +#define _UAPI_PWM_H_ + +#include +#include + +/** + * struct pwmchip_waveform - Describe a PWM waveform for a pwm_chip's PWM channel + * @hwpwm: per-chip relative index of the PWM device + * @__pad: padding, must be zero + * @period_length_ns: duration of the repeating period. + * A value of 0 represents a disabled PWM. + * @duty_length_ns: duration of the active part in each period + * @duty_offset_ns: offset of the rising edge from a period's start + */ +struct pwmchip_waveform { + __u32 hwpwm; + __u32 __pad; + __u64 period_length_ns; + __u64 duty_length_ns; + __u64 duty_offset_ns; +}; + +/* Reserves the passed hwpwm for exclusive control. */ +#define PWM_IOCTL_REQUEST _IO(0x75, 1) + +/* counter part to PWM_IOCTL_REQUEST */ +#define PWM_IOCTL_FREE _IO(0x75, 2) + +/* + * Modifies the passed wf according to hardware constraints. All parameters are + * rounded down to the next possible value, unless there is no such value, then + * values are rounded up. Note that zero isn't considered for rounding down + * period_length_ns. 
+ */ +#define PWM_IOCTL_ROUNDWF _IOWR(0x75, 3, struct pwmchip_waveform) + +/* Get the currently implemented waveform */ +#define PWM_IOCTL_GETWF _IOWR(0x75, 4, struct pwmchip_waveform) + +/* Like PWM_IOCTL_ROUNDWF + PWM_IOCTL_SETEXACTWF in one go. */ +#define PWM_IOCTL_SETROUNDEDWF _IOW(0x75, 5, struct pwmchip_waveform) + +/* + * Program the PWM to emit exactly the passed waveform, subject only to rounding + * down each value less than 1 ns. Returns 0 on success, -EDOM if the waveform + * cannot be implemented exactly, or other negative error codes. + */ +#define PWM_IOCTL_SETEXACTWF _IOW(0x75, 6, struct pwmchip_waveform) + +#endif /* _UAPI_PWM_H_ */ -- cgit v1.2.3 From 59f44c9ccc3bb68aa3b062b8e57ce0e1ee2fca75 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 2 Jul 2025 17:50:34 +0200 Subject: net: openvswitch: allow providing upcall pid for the 'execute' command When a packet enters the OVS datapath and there is no flow to handle it, the packet goes to userspace through a MISS upcall. With the per-CPU upcall dispatch mechanism, we're using the current CPU id to select the Netlink PID on which to send this packet. This allows us to send packets from the same traffic flow through the same handler. The handler will process the packet, install the required flow into the kernel and re-inject the original packet via OVS_PACKET_CMD_EXECUTE. While handling OVS_PACKET_CMD_EXECUTE, however, we may hit a recirculation action that will pass the (likely modified) packet through the flow lookup again. And if the flow is not found, the packet will be sent to userspace again through another MISS upcall. However, the handler thread in userspace is likely running on a different CPU core, and the OVS_PACKET_CMD_EXECUTE request is handled in the syscall context of that thread. So, when the time comes to send the packet through another upcall, the per-CPU dispatch will choose a different Netlink PID, and this packet will end up processed by a different handler thread on a different CPU. The process continues as long as there are new recirculations, each time the packet goes to a different handler thread before it is sent out of the OVS datapath to the destination port. In real setups the number of recirculations can go up to 4 or 5, sometimes more. There is always a chance to re-order packets while processing upcalls, because userspace will first install the flow and then re-inject the original packet. So, there is a race window when the flow is already installed and the second packet can match it and be forwarded to the destination before the first packet is re-injected. But the fact that packets are going through multiple upcalls handled by different userspace threads makes the reordering noticeably more likely, because we not only have a race between the kernel and a userspace handler (which is hard to avoid), but also between multiple userspace handlers. For example, let's assume that 10 packets got enqueued through a MISS upcall for handler-1. It will start processing them, install the flow into the kernel and start re-injecting packets, from where they will go through another MISS to handler-2. Handler-2 will install the flow into the kernel and start re-injecting the packets, while handler-1 continues to re-inject the last of the 10 packets; these will hit the flow installed by handler-2 and be forwarded without going to handler-2, while handler-2 is still re-injecting the first of these 10 packets.
Given multiple recirculations and misses, these 10 packets may end up completely mixed up on the output from the datapath. Let's allow userspace to specify on which Netlink PID the packets should be upcalled while processing OVS_PACKET_CMD_EXECUTE. This makes it possible to ensure that all the packets are processed by the same handler thread in userspace even when they are upcalled multiple times in the process. Packets will remain in order since they will be enqueued to the same socket and re-injected in the same order. This doesn't eliminate re-ordering as stated above, since we still have a race between the kernel and the userspace thread, but it allows eliminating races between multiple userspace threads. Userspace knows the PID of the socket on which the original upcall is received, so there is no need to send it up from the kernel. The solution requires storing the value somewhere for the duration of the packet processing. There are two potential places for this: our skb extension or the per-CPU storage. It's not clear which is better, so just follow the currently used scheme of storing this kind of thing along with the skb. We still have a decent amount of space in the cb. Signed-off-by: Ilya Maximets Acked-by: Flavio Leitner Acked-by: Eelco Chaudron Acked-by: Aaron Conole Link: https://patch.msgid.link/20250702155043.2331772-1-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/openvswitch.h | 6 ++++++ net/openvswitch/actions.c | 6 ++++-- net/openvswitch/datapath.c | 8 +++++++- net/openvswitch/datapath.h | 3 +++ net/openvswitch/vport.c | 1 + 5 files changed, 21 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 3a701bd1f31b..3092c2c6f1d2 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -186,6 +186,11 @@ enum ovs_packet_cmd { * %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment * size. * @OVS_PACKET_ATTR_HASH: Packet hash info (e.g. hash, sw_hash and l4_hash in skb). + * @OVS_PACKET_ATTR_UPCALL_PID: Netlink PID to use for upcalls while + * processing %OVS_PACKET_CMD_EXECUTE. Takes precedence over all other ways + * to determine the Netlink PID including %OVS_USERSPACE_ATTR_PID, + * %OVS_DP_ATTR_UPCALL_PID, %OVS_DP_ATTR_PER_CPU_PIDS and the + * %OVS_VPORT_ATTR_UPCALL_PID. * * These attributes follow the &struct ovs_header within the Generic Netlink * payload for %OVS_PACKET_* commands. @@ -205,6 +210,7 @@ enum ovs_packet_attr { OVS_PACKET_ATTR_MRU, /* Maximum received IP fragment size. */ OVS_PACKET_ATTR_LEN, /* Packet size before truncation. */ OVS_PACKET_ATTR_HASH, /* Packet hash. */ + OVS_PACKET_ATTR_UPCALL_PID, /* u32 Netlink PID.
*/ __OVS_PACKET_ATTR_MAX }; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 3add108340bf..2832e0794197 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -941,8 +941,10 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, break; case OVS_USERSPACE_ATTR_PID: - if (dp->user_features & - OVS_DP_F_DISPATCH_UPCALL_PER_CPU) + if (OVS_CB(skb)->upcall_pid) + upcall.portid = OVS_CB(skb)->upcall_pid; + else if (dp->user_features & + OVS_DP_F_DISPATCH_UPCALL_PER_CPU) upcall.portid = ovs_dp_get_upcall_portid(dp, smp_processor_id()); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index b990dc83504f..d5b6e2002bc1 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -267,7 +267,9 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_MISS; - if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU) + if (OVS_CB(skb)->upcall_pid) + upcall.portid = OVS_CB(skb)->upcall_pid; + else if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU) upcall.portid = ovs_dp_get_upcall_portid(dp, smp_processor_id()); else @@ -651,6 +653,9 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) !!(hash & OVS_PACKET_HASH_L4_BIT)); } + OVS_CB(packet)->upcall_pid = + nla_get_u32_default(a[OVS_PACKET_ATTR_UPCALL_PID], 0); + /* Build an sw_flow for sending this packet. */ flow = ovs_flow_alloc(); err = PTR_ERR(flow); @@ -719,6 +724,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG }, [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 }, [OVS_PACKET_ATTR_HASH] = { .type = NLA_U64 }, + [OVS_PACKET_ATTR_UPCALL_PID] = { .type = NLA_U32 }, }; static const struct genl_small_ops dp_packet_genl_ops[] = { diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index cfeb817a1889..db0c3e69d66c 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -121,6 +121,8 @@ struct datapath { * @cutlen: The number of bytes from the packet end to be removed. * @probability: The sampling probability that was applied to this skb; 0 means * no sampling has occurred; U32_MAX means 100% probability. + * @upcall_pid: Netlink socket PID to use for sending this packet to userspace; + * 0 means "not set" and default per-CPU or per-vport dispatch should be used. */ struct ovs_skb_cb { struct vport *input_vport; @@ -128,6 +130,7 @@ struct ovs_skb_cb { u16 acts_origlen; u32 cutlen; u32 probability; + u32 upcall_pid; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 8732f6e51ae5..6bbbc16ab778 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -501,6 +501,7 @@ int ovs_vport_receive(struct vport *vport, struct sk_buff *skb, OVS_CB(skb)->mru = 0; OVS_CB(skb)->cutlen = 0; OVS_CB(skb)->probability = 0; + OVS_CB(skb)->upcall_pid = 0; if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) { u32 mark; -- cgit v1.2.3 From f55ce5a6cd33211c8cc5bce0554b6ac710a6a28b Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Sat, 5 Jul 2025 07:17:17 +0000 Subject: KVM: arm64: Expose new KVM cap for cacheable PFNMAP Introduce a new KVM capability to expose to the userspace whether cacheable mapping of PFNMAP is supported. The ability to safely do the cacheable mapping of PFNMAP is contingent on S2FWB and ARM64_HAS_CACHE_DIC. 
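As an illustration, a minimal userspace sketch of the discovery step (error handling elided; note that, per the documentation added below, the cap is a per-VM check, so it is issued on a VM fd rather than on the system-wide /dev/kvm fd):

	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Returns >0 when cacheable PFNMAP mappings are supported for this
	 * VM, 0 when they are not; on kernels with this patch, querying a
	 * non-VM fd fails with EINVAL. */
	static int vm_supports_cacheable_pfnmap(int vm_fd)
	{
		return ioctl(vm_fd, KVM_CHECK_EXTENSION,
			     KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED);
	}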
S2FWB allows KVM to avoid flushing the D cache, while ARM64_HAS_CACHE_DIC allows KVM to avoid flushing the icache and turns icache_inval_pou() into a NOP. The cap is false if those requirements are missing; this is checked by making use of kvm_arch_supports_cacheable_pfnmap. This capability would allow userspace to discover the support. It could for instance be used by userspace to prevent live-migration across FWB and non-FWB hosts. CC: Catalin Marinas CC: Jason Gunthorpe CC: Oliver Upton CC: David Hildenbrand Suggested-by: Marc Zyngier Reviewed-by: Jason Gunthorpe Tested-by: Donald Dutile Signed-off-by: Ankit Agrawal Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20250705071717.5062-7-ankita@nvidia.com Signed-off-by: Oliver Upton --- Documentation/virt/kvm/api.rst | 13 ++++++++++++- arch/arm64/kvm/arm.c | 7 +++++++ include/uapi/linux/kvm.h | 1 + 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 9abf93ee5f65..53e0179d5294 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8585,7 +8585,7 @@ ENOSYS for the others. When enabled, KVM will exit to userspace with KVM_EXIT_SYSTEM_EVENT of type KVM_SYSTEM_EVENT_SUSPEND to process the guest suspend request. -7.37 KVM_CAP_ARM_WRITABLE_IMP_ID_REGS +7.42 KVM_CAP_ARM_WRITABLE_IMP_ID_REGS ------------------------------------- :Architectures: arm64 @@ -8614,6 +8614,17 @@ given VM. When this capability is enabled, KVM resets the VCPU when setting MP_STATE_INIT_RECEIVED through IOCTL. The original MP_STATE is preserved. +7.43 KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED +------------------------------------------- + +:Architectures: arm64 +:Target: VM +:Parameters: None + +This capability indicate to the userspace whether a PFNMAP memory region +can be safely mapped as cacheable. This relies on the presence of +force write back (FWB) feature support on the hardware. + 8. Other capabilities. ====================== diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 38a91bb5d4c7..25609aab032c 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -408,6 +408,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES: r = BIT(0); break; + case KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED: + if (!kvm) + r = -EINVAL; + else + r = kvm_supports_cacheable_pfnmap(); + break; + default: r = 0; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 37891580d05d..e4e566ff348b 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -956,6 +956,7 @@ struct kvm_enable_cap { #define KVM_CAP_ARM_EL2 240 #define KVM_CAP_ARM_EL2_E2H0 241 #define KVM_CAP_RISCV_MP_STATE_RESET 242 +#define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243 struct kvm_irq_routing_irqchip { __u32 irqchip; -- cgit v1.2.3 From 3d98ee52659c3f1d3913ae5b97f7743c5247752c Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Fri, 27 Jun 2025 21:49:29 +0800 Subject: net: bonding: add broadcast_neighbor netlink option Users can configure or display the bonding broadcast_neighbor option via iproute2/netlink. Cc: Jay Vosburgh Cc: "David S.
Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Cc: Jonathan Corbet Cc: Andrew Lunn Cc: Steven Rostedt Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nikolay Aleksandrov Signed-off-by: Tonghao Zhang Signed-off-by: Zengbing Tu Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/76b90700ba5b98027dfb51a2f3c5cfea0440a21b.1751031306.git.tonghao@bamaicloud.com Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_netlink.c | 16 ++++++++++++++++ include/uapi/linux/if_link.h | 1 + 2 files changed, 17 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index ac5e402c34bc..57fff2421f1b 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -124,6 +124,7 @@ static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = { [IFLA_BOND_MISSED_MAX] = { .type = NLA_U8 }, [IFLA_BOND_NS_IP6_TARGET] = { .type = NLA_NESTED }, [IFLA_BOND_COUPLED_CONTROL] = { .type = NLA_U8 }, + [IFLA_BOND_BROADCAST_NEIGH] = { .type = NLA_U8 }, }; static const struct nla_policy bond_slave_policy[IFLA_BOND_SLAVE_MAX + 1] = { @@ -561,6 +562,16 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[], return err; } + if (data[IFLA_BOND_BROADCAST_NEIGH]) { + int broadcast_neigh = nla_get_u8(data[IFLA_BOND_BROADCAST_NEIGH]); + + bond_opt_initval(&newval, broadcast_neigh); + err = __bond_opt_set(bond, BOND_OPT_BROADCAST_NEIGH, &newval, + data[IFLA_BOND_BROADCAST_NEIGH], extack); + if (err) + return err; + } + return 0; } @@ -630,6 +641,7 @@ static size_t bond_get_size(const struct net_device *bond_dev) nla_total_size(sizeof(struct nlattr)) + nla_total_size(sizeof(struct in6_addr)) * BOND_MAX_NS_TARGETS + nla_total_size(sizeof(u8)) + /* IFLA_BOND_COUPLED_CONTROL */ + nla_total_size(sizeof(u8)) + /* IFLA_BOND_BROADCAST_NEIGH */ 0; } @@ -793,6 +805,10 @@ static int bond_fill_info(struct sk_buff *skb, bond->params.coupled_control)) goto nla_put_failure; + if (nla_put_u8(skb, IFLA_BOND_BROADCAST_NEIGH, + bond->params.broadcast_neighbor)) + goto nla_put_failure; + if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct ad_info info; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 873c285996fe..784ace3a519c 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1535,6 +1535,7 @@ enum { IFLA_BOND_MISSED_MAX, IFLA_BOND_NS_IP6_TARGET, IFLA_BOND_COUPLED_CONTROL, + IFLA_BOND_BROADCAST_NEIGH, __IFLA_BOND_MAX, }; -- cgit v1.2.3 From ad39c12fcee34b8980a80ad5c803bca9906fbd4e Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 2 Jul 2025 14:20:13 +0800 Subject: net: mctp: add gateway routing support This change allows for gateway routing, where a route table entry may reference a routable endpoint (by network and EID), instead of routing directly to a netdevice. We add support for a RTM_GATEWAY attribute for netlink route updates, with an attribute format of: struct mctp_fq_addr { unsigned int net; mctp_eid_t eid; } - we need the net here to uniquely identify the target EID, as we no longer have the device reference directly (which would provide the net id in the case of direct routes). This makes route lookups recursive, as a route lookup that returns a gateway route must be resolved into a direct route (ie, to a device) eventually. We provide a limit to the route lookups, to prevent infinite loop routing. 
The route lookup populates a new 'nexthop' field in the dst structure, which now specifies the key for the neighbour table lookup on device output, rather than using the packet destination address directly. Signed-off-by: Jeremy Kerr Link: https://patch.msgid.link/20250702-dev-forwarding-v5-13-1468191da8a4@codeconstruct.com.au Signed-off-by: Paolo Abeni --- include/net/mctp.h | 13 ++- include/uapi/linux/mctp.h | 8 ++ net/mctp/route.c | 206 +++++++++++++++++++++++++++++++++------------- net/mctp/test/utils.c | 3 +- 4 files changed, 173 insertions(+), 57 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/mctp.h b/include/net/mctp.h index b3af0690f607..ac4f4ecdfc24 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -237,8 +237,18 @@ struct mctp_route { mctp_eid_t min, max; unsigned char type; + unsigned int mtu; - struct mctp_dev *dev; + + enum { + MCTP_ROUTE_DIRECT, + MCTP_ROUTE_GATEWAY, + } dst_type; + union { + struct mctp_dev *dev; + struct mctp_fq_addr gateway; + }; + int (*output)(struct mctp_dst *dst, struct sk_buff *skb); @@ -256,6 +266,7 @@ struct mctp_route { struct mctp_dst { struct mctp_dev *dev; unsigned int mtu; + mctp_eid_t nexthop; /* set for direct addressing */ unsigned char halen; diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h index e1db65df9359..19ad12a0cd4b 100644 --- a/include/uapi/linux/mctp.h +++ b/include/uapi/linux/mctp.h @@ -37,6 +37,14 @@ struct sockaddr_mctp_ext { __u8 smctp_haddr[MAX_ADDR_LEN]; }; +/* A "fully qualified" MCTP address, which includes the system-local network ID, + * required to uniquely resolve a routable EID. + */ +struct mctp_fq_addr { + unsigned int net; + mctp_eid_t eid; +}; + #define MCTP_NET_ANY 0x0 #define MCTP_ADDR_NULL 0x00 diff --git a/net/mctp/route.c b/net/mctp/route.c index 5eca3ce0ba3c..a20d6b11d418 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -563,7 +563,6 @@ out: static int mctp_dst_output(struct mctp_dst *dst, struct sk_buff *skb) { - struct mctp_hdr *hdr = mctp_hdr(skb); char daddr_buf[MAX_ADDR_LEN]; char *daddr = NULL; int rc; @@ -586,7 +585,7 @@ static int mctp_dst_output(struct mctp_dst *dst, struct sk_buff *skb) daddr = dst->haddr; } else { /* If lookup fails let the device handle daddr==NULL */ - if (mctp_neigh_lookup(dst->dev, hdr->dest, daddr_buf) == 0) + if (mctp_neigh_lookup(dst->dev, dst->nexthop, daddr_buf) == 0) daddr = daddr_buf; } @@ -610,7 +609,8 @@ static int mctp_dst_output(struct mctp_dst *dst, struct sk_buff *skb) static void mctp_route_release(struct mctp_route *rt) { if (refcount_dec_and_test(&rt->refs)) { - mctp_dev_put(rt->dev); + if (rt->dst_type == MCTP_ROUTE_DIRECT) + mctp_dev_put(rt->dev); kfree_rcu(rt, rcu); } } @@ -799,10 +799,16 @@ static struct mctp_sk_key *mctp_lookup_prealloc_tag(struct mctp_sock *msk, } /* routing lookups */ +static unsigned int mctp_route_netid(struct mctp_route *rt) +{ + return rt->dst_type == MCTP_ROUTE_DIRECT ? 
+ READ_ONCE(rt->dev->net) : rt->gateway.net; +} + static bool mctp_rt_match_eid(struct mctp_route *rt, unsigned int net, mctp_eid_t eid) { - return READ_ONCE(rt->dev->net) == net && + return mctp_route_netid(rt) == net && rt->min <= eid && rt->max >= eid; } @@ -811,16 +817,21 @@ static bool mctp_rt_compare_exact(struct mctp_route *rt1, struct mctp_route *rt2) { ASSERT_RTNL(); - return rt1->dev->net == rt2->dev->net && + return mctp_route_netid(rt1) == mctp_route_netid(rt2) && rt1->min == rt2->min && rt1->max == rt2->max; } -static void mctp_dst_from_route(struct mctp_dst *dst, struct mctp_route *route) +/* must only be called on a direct route, as the final output hop */ +static void mctp_dst_from_route(struct mctp_dst *dst, mctp_eid_t eid, + unsigned int mtu, struct mctp_route *route) { mctp_dev_hold(route->dev); + dst->nexthop = eid; dst->dev = route->dev; - dst->mtu = route->mtu ?: READ_ONCE(dst->dev->dev->mtu); + dst->mtu = READ_ONCE(dst->dev->dev->mtu); + if (mtu) + dst->mtu = min(dst->mtu, mtu); dst->halen = 0; dst->output = route->output; } @@ -854,6 +865,7 @@ int mctp_dst_from_extaddr(struct mctp_dst *dst, struct net *net, int ifindex, dst->mtu = READ_ONCE(netdev->mtu); dst->halen = halen; dst->output = mctp_dst_output; + dst->nexthop = 0; memcpy(dst->haddr, haddr, halen); rc = 0; @@ -868,24 +880,54 @@ void mctp_dst_release(struct mctp_dst *dst) mctp_dev_put(dst->dev); } +static struct mctp_route *mctp_route_lookup_single(struct net *net, + unsigned int dnet, + mctp_eid_t daddr) +{ + struct mctp_route *rt; + + list_for_each_entry_rcu(rt, &net->mctp.routes, list) { + if (mctp_rt_match_eid(rt, dnet, daddr)) + return rt; + } + + return NULL; +} + /* populates *dst on successful lookup, if set */ int mctp_route_lookup(struct net *net, unsigned int dnet, mctp_eid_t daddr, struct mctp_dst *dst) { + const unsigned int max_depth = 32; + unsigned int depth, mtu = 0; int rc = -EHOSTUNREACH; - struct mctp_route *rt; rcu_read_lock(); - list_for_each_entry_rcu(rt, &net->mctp.routes, list) { - /* TODO: add metrics */ - if (!mctp_rt_match_eid(rt, dnet, daddr)) - continue; + for (depth = 0; depth < max_depth; depth++) { + struct mctp_route *rt; - if (dst) - mctp_dst_from_route(dst, rt); - rc = 0; - break; + rt = mctp_route_lookup_single(net, dnet, daddr); + if (!rt) + break; + + /* clamp mtu to the smallest in the path, allowing 0 + * to specify no restrictions + */ + if (mtu && rt->mtu) + mtu = min(mtu, rt->mtu); + else + mtu = mtu ?: rt->mtu; + + if (rt->dst_type == MCTP_ROUTE_DIRECT) { + if (dst) + mctp_dst_from_route(dst, daddr, mtu, rt); + rc = 0; + break; + + } else if (rt->dst_type == MCTP_ROUTE_GATEWAY) { + daddr = rt->gateway.eid; + } } rcu_read_unlock(); @@ -902,10 +944,13 @@ static int mctp_route_lookup_null(struct net *net, struct net_device *dev, rcu_read_lock(); list_for_each_entry_rcu(rt, &net->mctp.routes, list) { - if (rt->dev->dev != dev || rt->type != RTN_LOCAL) + if (rt->dst_type != MCTP_ROUTE_DIRECT || rt->type != RTN_LOCAL) + continue; + + if (rt->dev->dev != dev) continue; - mctp_dst_from_route(dst, rt); + mctp_dst_from_route(dst, 0, 0, rt); rc = 0; break; } @@ -1085,11 +1130,6 @@ out_release: return rc; } -static unsigned int mctp_route_netid(struct mctp_route *rt) -{ - return rt->dev->net; -} - /* route management */ /* mctp_route_add(): Add the provided route, previously allocated via @@ -1097,9 +1137,9 @@ static unsigned int mctp_route_netid(struct mctp_route *rt) * hold on rt->dev for usage in the route table. On failure a caller will want * to mctp_route_release(). 
* - * We expect that the caller has set rt->type, rt->min, rt->max, rt->dev and - * rt->mtu, and that the route holds a reference to rt->dev (via mctp_dev_hold). - * Other fields will be populated. + * We expect that the caller has set rt->type, rt->dst_type, rt->min, rt->max, + * rt->mtu and either rt->dev (with a reference held appropriately) or + * rt->gateway. Other fields will be populated. */ static int mctp_route_add(struct net *net, struct mctp_route *rt) { @@ -1108,7 +1148,10 @@ static int mctp_route_add(struct net *net, struct mctp_route *rt) if (!mctp_address_unicast(rt->min) || !mctp_address_unicast(rt->max)) return -EINVAL; - if (!rt->dev) + if (rt->dst_type == MCTP_ROUTE_DIRECT && !rt->dev) + return -EINVAL; + + if (rt->dst_type == MCTP_ROUTE_GATEWAY && !rt->gateway.eid) return -EINVAL; switch (rt->type) { @@ -1177,6 +1220,7 @@ int mctp_route_add_local(struct mctp_dev *mdev, mctp_eid_t addr) rt->min = addr; rt->max = addr; + rt->dst_type = MCTP_ROUTE_DIRECT; rt->dev = mdev; rt->type = RTN_LOCAL; @@ -1203,7 +1247,7 @@ void mctp_route_remove_dev(struct mctp_dev *mdev) ASSERT_RTNL(); list_for_each_entry_safe(rt, tmp, &net->mctp.routes, list) { - if (rt->dev == mdev) { + if (rt->dst_type == MCTP_ROUTE_DIRECT && rt->dev == mdev) { list_del_rcu(&rt->list); /* TODO: immediate RTM_DELROUTE */ mctp_route_release(rt); @@ -1296,21 +1340,28 @@ static const struct nla_policy rta_mctp_policy[RTA_MAX + 1] = { [RTA_DST] = { .type = NLA_U8 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_OIF] = { .type = NLA_U32 }, + [RTA_GATEWAY] = NLA_POLICY_EXACT_LEN(sizeof(struct mctp_fq_addr)), }; static const struct nla_policy rta_metrics_policy[RTAX_MAX + 1] = { [RTAX_MTU] = { .type = NLA_U32 }, }; -/* base parsing; common to both _lookup and _populate variants */ +/* base parsing; common to both _lookup and _populate variants. + * + * For gateway routes (which have a RTA_GATEWAY, and no RTA_OIF), we populate + * *gatweayp. for direct routes (RTA_OIF, no RTA_GATEWAY), we populate *mdev. + */ static int mctp_route_nlparse_common(struct net *net, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, struct nlattr **tb, struct rtmsg **rtm, struct mctp_dev **mdev, + struct mctp_fq_addr *gatewayp, mctp_eid_t *daddr_start) { + struct mctp_fq_addr *gateway = NULL; + unsigned int ifindex = 0; struct net_device *dev; - unsigned int ifindex; int rc; rc = nlmsg_parse(nlh, sizeof(struct rtmsg), tb, RTA_MAX, @@ -1326,11 +1377,44 @@ static int mctp_route_nlparse_common(struct net *net, struct nlmsghdr *nlh, } *daddr_start = nla_get_u8(tb[RTA_DST]); - if (!tb[RTA_OIF]) { - NL_SET_ERR_MSG(extack, "ifindex missing"); + if (tb[RTA_OIF]) + ifindex = nla_get_u32(tb[RTA_OIF]); + + if (tb[RTA_GATEWAY]) + gateway = nla_data(tb[RTA_GATEWAY]); + + if (ifindex && gateway) { + NL_SET_ERR_MSG(extack, + "cannot specify both ifindex and gateway"); + return -EINVAL; + + } else if (ifindex) { + dev = __dev_get_by_index(net, ifindex); + if (!dev) { + NL_SET_ERR_MSG(extack, "bad ifindex"); + return -ENODEV; + } + *mdev = mctp_dev_get_rtnl(dev); + if (!*mdev) + return -ENODEV; + gatewayp->eid = 0; + + } else if (gateway) { + if (!mctp_address_unicast(gateway->eid)) { + NL_SET_ERR_MSG(extack, "bad gateway"); + return -EINVAL; + } + + gatewayp->eid = gateway->eid; + gatewayp->net = gateway->net != MCTP_NET_ANY ? 
+ gateway->net : + READ_ONCE(net->mctp.default_net); + *mdev = NULL; + + } else { + NL_SET_ERR_MSG(extack, "no route output provided"); return -EINVAL; } - ifindex = nla_get_u32(tb[RTA_OIF]); *rtm = nlmsg_data(nlh); if ((*rtm)->rtm_family != AF_MCTP) { @@ -1343,16 +1427,6 @@ static int mctp_route_nlparse_common(struct net *net, struct nlmsghdr *nlh, return -EINVAL; } - dev = __dev_get_by_index(net, ifindex); - if (!dev) { - NL_SET_ERR_MSG(extack, "bad ifindex"); - return -ENODEV; - } - - *mdev = mctp_dev_get_rtnl(dev); - if (!*mdev) - return -ENODEV; - return 0; } @@ -1366,24 +1440,34 @@ static int mctp_route_nlparse_lookup(struct net *net, struct nlmsghdr *nlh, unsigned int *daddr_extent) { struct nlattr *tb[RTA_MAX + 1]; + struct mctp_fq_addr gw; struct mctp_dev *mdev; struct rtmsg *rtm; int rc; rc = mctp_route_nlparse_common(net, nlh, extack, tb, &rtm, - &mdev, daddr_start); + &mdev, &gw, daddr_start); if (rc) return rc; - *netid = mdev->net; + if (mdev) { + *netid = mdev->net; + } else if (gw.eid) { + *netid = gw.net; + } else { + /* bug: _nlparse_common should not allow this */ + return -1; + } + *type = rtm->rtm_type; *daddr_extent = rtm->rtm_dst_len; return 0; } -/* Full route parse for RTM_NEWROUTE: populate @rt. On success, the route will - * hold a reference to the dev. +/* Full route parse for RTM_NEWROUTE: populate @rt. On success, + * MCTP_ROUTE_DIRECT routes (ie, those with a direct dev) will hold a reference + * to that dev. */ static int mctp_route_nlparse_populate(struct net *net, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, @@ -1392,6 +1476,7 @@ static int mctp_route_nlparse_populate(struct net *net, struct nlmsghdr *nlh, struct nlattr *tbx[RTAX_MAX + 1]; struct nlattr *tb[RTA_MAX + 1]; unsigned int daddr_extent; + struct mctp_fq_addr gw; mctp_eid_t daddr_start; struct mctp_dev *dev; struct rtmsg *rtm; @@ -1399,7 +1484,7 @@ static int mctp_route_nlparse_populate(struct net *net, struct nlmsghdr *nlh, int rc; rc = mctp_route_nlparse_common(net, nlh, extack, tb, &rtm, - &dev, &daddr_start); + &dev, &gw, &daddr_start); if (rc) return rc; @@ -1425,8 +1510,15 @@ static int mctp_route_nlparse_populate(struct net *net, struct nlmsghdr *nlh, rt->min = daddr_start; rt->max = daddr_start + daddr_extent; rt->mtu = mtu; - rt->dev = dev; - mctp_dev_hold(rt->dev); + if (gw.eid) { + rt->dst_type = MCTP_ROUTE_GATEWAY; + rt->gateway.eid = gw.eid; + rt->gateway.net = gw.net; + } else { + rt->dst_type = MCTP_ROUTE_DIRECT; + rt->dev = dev; + mctp_dev_hold(rt->dev); + } return 0; } @@ -1446,7 +1538,8 @@ static int mctp_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (rc < 0) goto err_free; - if (rt->dev->dev->flags & IFF_LOOPBACK) { + if (rt->dst_type == MCTP_ROUTE_DIRECT && + rt->dev->dev->flags & IFF_LOOPBACK) { NL_SET_ERR_MSG(extack, "no routes to loopback"); rc = -EINVAL; goto err_free; @@ -1505,7 +1598,6 @@ static int mctp_fill_rtinfo(struct sk_buff *skb, struct mctp_route *rt, hdr->rtm_tos = 0; hdr->rtm_table = RT_TABLE_DEFAULT; hdr->rtm_protocol = RTPROT_STATIC; /* everything is user-defined */ - hdr->rtm_scope = RT_SCOPE_LINK; /* TODO: scope in mctp_route? 
*/ hdr->rtm_type = rt->type; if (nla_put_u8(skb, RTA_DST, rt->min)) @@ -1522,13 +1614,17 @@ static int mctp_fill_rtinfo(struct sk_buff *skb, struct mctp_route *rt, nla_nest_end(skb, metrics); - if (rt->dev) { + if (rt->dst_type == MCTP_ROUTE_DIRECT) { + hdr->rtm_scope = RT_SCOPE_LINK; if (nla_put_u32(skb, RTA_OIF, rt->dev->dev->ifindex)) goto cancel; + } else if (rt->dst_type == MCTP_ROUTE_GATEWAY) { + hdr->rtm_scope = RT_SCOPE_UNIVERSE; + if (nla_put(skb, RTA_GATEWAY, + sizeof(rt->gateway), &rt->gateway)) + goto cancel; } - /* TODO: conditional neighbour physaddr? */ - nlmsg_end(skb, nlh); return 0; diff --git a/net/mctp/test/utils.c b/net/mctp/test/utils.c index 6b4dc40d882c..97b05e340586 100644 --- a/net/mctp/test/utils.c +++ b/net/mctp/test/utils.c @@ -134,6 +134,7 @@ struct mctp_test_route *mctp_test_create_route(struct net *net, rt->rt.max = eid; rt->rt.mtu = mtu; rt->rt.type = RTN_UNSPEC; + rt->rt.dst_type = MCTP_ROUTE_DIRECT; if (dev) mctp_dev_hold(dev); rt->rt.dev = dev; @@ -176,7 +177,7 @@ void mctp_test_route_destroy(struct kunit *test, struct mctp_test_route *rt) list_del_rcu(&rt->rt.list); rtnl_unlock(); - if (rt->rt.dev) + if (rt->rt.dst_type == MCTP_ROUTE_DIRECT && rt->rt.dev) mctp_dev_put(rt->rt.dev); refs = refcount_read(&rt->rt.refs); -- cgit v1.2.3 From e22da4685013922ade8e7b5e727ac6804fe5b51e Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 1 Jul 2025 16:46:57 +0200 Subject: net/handshake: Add new parameter 'HANDSHAKE_A_ACCEPT_KEYRING' Add a new netlink parameter 'HANDSHAKE_A_ACCEPT_KEYRING' to provide the serial number of the keyring to use. Signed-off-by: Hannes Reinecke Reviewed-by: Chuck Lever Acked-by: Jakub Kicinski Link: https://patch.msgid.link/20250701144657.104401-1-hare@kernel.org Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/handshake.yaml | 4 ++++ include/uapi/linux/handshake.h | 1 + net/handshake/tlshd.c | 6 ++++++ 3 files changed, 11 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/handshake.yaml b/Documentation/netlink/specs/handshake.yaml index 39ed1661c7f1..95c3fade7a8d 100644 --- a/Documentation/netlink/specs/handshake.yaml +++ b/Documentation/netlink/specs/handshake.yaml @@ -71,6 +71,9 @@ attribute-sets: - name: peername type: string + - + name: keyring + type: u32 - name: done attributes: @@ -109,6 +112,7 @@ operations: - peer-identity - certificate - peername + - keyring - name: done doc: Handler reports handshake completion diff --git a/include/uapi/linux/handshake.h b/include/uapi/linux/handshake.h index 3d7ea58778c9..662e7de46c54 100644 --- a/include/uapi/linux/handshake.h +++ b/include/uapi/linux/handshake.h @@ -45,6 +45,7 @@ enum { HANDSHAKE_A_ACCEPT_PEER_IDENTITY, HANDSHAKE_A_ACCEPT_CERTIFICATE, HANDSHAKE_A_ACCEPT_PEERNAME, + HANDSHAKE_A_ACCEPT_KEYRING, __HANDSHAKE_A_ACCEPT_MAX, HANDSHAKE_A_ACCEPT_MAX = (__HANDSHAKE_A_ACCEPT_MAX - 1) diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c index d6f52839827e..081093dfd553 100644 --- a/net/handshake/tlshd.c +++ b/net/handshake/tlshd.c @@ -230,6 +230,12 @@ static int tls_handshake_accept(struct handshake_req *req, if (ret < 0) goto out_cancel; } + if (treq->th_keyring) { + ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_KEYRING, + treq->th_keyring); + if (ret < 0) + goto out_cancel; + } ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_AUTH_MODE, treq->th_auth_mode); -- cgit v1.2.3 From 70b9c0c11e55167b9552ef395bc00f4920299177 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 30 Jun 2025 15:02:18 +0200 Subject: uapi: bitops: use 
UAPI-safe variant of BITS_PER_LONG again (2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BITS_PER_LONG does not exist in UAPI headers, so can't be used by the UAPI __GENMASK(). Instead __BITS_PER_LONG needs to be used. When __GENMASK() was introduced in commit 3c7a8e190bc5 ("uapi: introduce uapi-friendly macros for GENMASK"), the code was fine. A broken revert in 1e7933a575ed ("uapi: Revert "bitops: avoid integer overflow in GENMASK(_ULL)"") introduced the incorrect usage of BITS_PER_LONG. That was fixed in commit 11fcf368506d ("uapi: bitops: use UAPI-safe variant of BITS_PER_LONG again"). But a broken sync of the kernel headers with the tools/ headers in commit fc92099902fb ("tools headers: Synchronize linux/bits.h with the kernel sources") undid the fix. Reapply the fix and while at it also fix the tools header. Fixes: fc92099902fb ("tools headers: Synchronize linux/bits.h with the kernel sources") Signed-off-by: Thomas Weißschuh Acked-by: Yury Norov (NVIDIA) Signed-off-by: Yury Norov (NVIDIA) --- include/uapi/linux/bits.h | 4 ++-- tools/include/uapi/linux/bits.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bits.h b/include/uapi/linux/bits.h index 682b406e1067..a04afef9efca 100644 --- a/include/uapi/linux/bits.h +++ b/include/uapi/linux/bits.h @@ -4,9 +4,9 @@ #ifndef _UAPI_LINUX_BITS_H #define _UAPI_LINUX_BITS_H -#define __GENMASK(h, l) (((~_UL(0)) << (l)) & (~_UL(0) >> (BITS_PER_LONG - 1 - (h)))) +#define __GENMASK(h, l) (((~_UL(0)) << (l)) & (~_UL(0) >> (__BITS_PER_LONG - 1 - (h)))) -#define __GENMASK_ULL(h, l) (((~_ULL(0)) << (l)) & (~_ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h)))) +#define __GENMASK_ULL(h, l) (((~_ULL(0)) << (l)) & (~_ULL(0) >> (__BITS_PER_LONG_LONG - 1 - (h)))) #define __GENMASK_U128(h, l) \ ((_BIT128((h)) << 1) - (_BIT128(l))) diff --git a/tools/include/uapi/linux/bits.h b/tools/include/uapi/linux/bits.h index 682b406e1067..a04afef9efca 100644 --- a/tools/include/uapi/linux/bits.h +++ b/tools/include/uapi/linux/bits.h @@ -4,9 +4,9 @@ #ifndef _UAPI_LINUX_BITS_H #define _UAPI_LINUX_BITS_H -#define __GENMASK(h, l) (((~_UL(0)) << (l)) & (~_UL(0) >> (BITS_PER_LONG - 1 - (h)))) +#define __GENMASK(h, l) (((~_UL(0)) << (l)) & (~_UL(0) >> (__BITS_PER_LONG - 1 - (h)))) -#define __GENMASK_ULL(h, l) (((~_ULL(0)) << (l)) & (~_ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h)))) +#define __GENMASK_ULL(h, l) (((~_ULL(0)) << (l)) & (~_ULL(0) >> (__BITS_PER_LONG_LONG - 1 - (h)))) #define __GENMASK_U128(h, l) \ ((_BIT128((h)) << 1) - (_BIT128(l))) -- cgit v1.2.3 From 333c515d189657c934470c9b0b8a8fedb016ce6f Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 8 Jul 2025 17:54:54 +0200 Subject: vhost-net: allow configuring extended features Use the extended feature type for 'acked_features' and implement two new ioctl operations allowing user-space to set/query an unbounded number of features. The actual number of processed features is limited by VIRTIO_FEATURES_MAX and attempts to set features above that limit fail with EOPNOTSUPP. Note that the legacy ioctls implicitly truncate the negotiated features to the lower 64-bit range, and that the 'acked_backend_features' field doesn't need conversion, as the only feature negotiated there is in the low 64-bit range.
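For illustration, a minimal userspace sketch of the new ioctl pair (assuming an open vhost-net fd and a kernel where VIRTIO_FEATURES_MAX spans 128 bits; SUPPORTED_FEATURES_LO/HI are placeholder masks for whatever the caller actually implements):

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/vhost.h>

	/* Placeholder masks for the feature bits this caller understands. */
	#define SUPPORTED_FEATURES_LO	(~0ULL)
	#define SUPPORTED_FEATURES_HI	0ULL

	/* Query the device's full feature space, then ack the supported
	 * subset; layout matches struct vhost_features_array. */
	static int negotiate_extended_features(int vhost_fd)
	{
		struct {
			uint64_t count;
			uint64_t features[2];	/* room for 128 feature bits */
		} fa = { .count = 2 };

		if (ioctl(vhost_fd, VHOST_GET_FEATURES_ARRAY, &fa) < 0)
			return -1;

		/* Bits we don't know about must not be acked; setting a
		 * feature the device doesn't offer fails with EOPNOTSUPP. */
		fa.features[0] &= SUPPORTED_FEATURES_LO;
		fa.features[1] &= SUPPORTED_FEATURES_HI;

		return ioctl(vhost_fd, VHOST_SET_FEATURES_ARRAY, &fa);
	}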
Acked-by: Jason Wang Signed-off-by: Paolo Abeni --- drivers/vhost/net.c | 87 +++++++++++++++++++++++++++++++--------- drivers/vhost/vhost.c | 2 +- drivers/vhost/vhost.h | 4 +- include/uapi/linux/vhost.h | 7 ++++ include/uapi/linux/vhost_types.h | 5 +++ 5 files changed, 82 insertions(+), 23 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 7cbfc7d718b3..67d011b0d4f7 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -69,12 +69,12 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;" #define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN) -enum { - VHOST_NET_FEATURES = VHOST_FEATURES | - (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) | - (1ULL << VIRTIO_NET_F_MRG_RXBUF) | - (1ULL << VIRTIO_F_ACCESS_PLATFORM) | - (1ULL << VIRTIO_F_RING_RESET) +static const u64 vhost_net_features[VIRTIO_FEATURES_DWORDS] = { + VHOST_FEATURES | + (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) | + (1ULL << VIRTIO_NET_F_MRG_RXBUF) | + (1ULL << VIRTIO_F_ACCESS_PLATFORM) | + (1ULL << VIRTIO_F_RING_RESET), }; enum { @@ -1614,16 +1614,17 @@ done: return err; } -static int vhost_net_set_features(struct vhost_net *n, u64 features) +static int vhost_net_set_features(struct vhost_net *n, const u64 *features) { size_t vhost_hlen, sock_hlen, hdr_len; int i; - hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | - (1ULL << VIRTIO_F_VERSION_1))) ? - sizeof(struct virtio_net_hdr_mrg_rxbuf) : - sizeof(struct virtio_net_hdr); - if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) { + hdr_len = virtio_features_test_bit(features, VIRTIO_NET_F_MRG_RXBUF) || + virtio_features_test_bit(features, VIRTIO_F_VERSION_1) ? + sizeof(struct virtio_net_hdr_mrg_rxbuf) : + sizeof(struct virtio_net_hdr); + + if (virtio_features_test_bit(features, VHOST_NET_F_VIRTIO_NET_HDR)) { /* vhost provides vnet_hdr */ vhost_hlen = hdr_len; sock_hlen = 0; @@ -1633,18 +1634,19 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features) sock_hlen = hdr_len; } mutex_lock(&n->dev.mutex); - if ((features & (1 << VHOST_F_LOG_ALL)) && + if (virtio_features_test_bit(features, VHOST_F_LOG_ALL) && !vhost_log_access_ok(&n->dev)) goto out_unlock; - if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) { + if (virtio_features_test_bit(features, VIRTIO_F_ACCESS_PLATFORM)) { if (vhost_init_device_iotlb(&n->dev)) goto out_unlock; } for (i = 0; i < VHOST_NET_VQ_MAX; ++i) { mutex_lock(&n->vqs[i].vq.mutex); - n->vqs[i].vq.acked_features = features; + virtio_features_copy(n->vqs[i].vq.acked_features_array, + features); n->vqs[i].vhost_hlen = vhost_hlen; n->vqs[i].sock_hlen = sock_hlen; mutex_unlock(&n->vqs[i].vq.mutex); @@ -1681,12 +1683,13 @@ out: static long vhost_net_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) { + u64 all_features[VIRTIO_FEATURES_DWORDS]; struct vhost_net *n = f->private_data; void __user *argp = (void __user *)arg; u64 __user *featurep = argp; struct vhost_vring_file backend; - u64 features; - int r; + u64 features, count, copied; + int r, i; switch (ioctl) { case VHOST_NET_SET_BACKEND: @@ -1694,16 +1697,60 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl, return -EFAULT; return vhost_net_set_backend(n, backend.index, backend.fd); case VHOST_GET_FEATURES: - features = VHOST_NET_FEATURES; + features = vhost_net_features[0]; if (copy_to_user(featurep, &features, sizeof features)) return -EFAULT; return 0; case VHOST_SET_FEATURES: if (copy_from_user(&features, featurep, sizeof features)) return -EFAULT; - if (features & 
~VHOST_NET_FEATURES) + if (features & ~vhost_net_features[0]) return -EOPNOTSUPP; - return vhost_net_set_features(n, features); + + virtio_features_from_u64(all_features, features); + return vhost_net_set_features(n, all_features); + case VHOST_GET_FEATURES_ARRAY: + if (copy_from_user(&count, featurep, sizeof(count))) + return -EFAULT; + + /* Copy the net features, up to the user-provided buffer size */ + argp += sizeof(u64); + copied = min(count, VIRTIO_FEATURES_DWORDS); + if (copy_to_user(argp, vhost_net_features, + copied * sizeof(u64))) + return -EFAULT; + + /* Zero the trailing space provided by user-space, if any */ + if (clear_user(argp, size_mul(count - copied, sizeof(u64)))) + return -EFAULT; + return 0; + case VHOST_SET_FEATURES_ARRAY: + if (copy_from_user(&count, featurep, sizeof(count))) + return -EFAULT; + + virtio_features_zero(all_features); + argp += sizeof(u64); + copied = min(count, VIRTIO_FEATURES_DWORDS); + if (copy_from_user(all_features, argp, copied * sizeof(u64))) + return -EFAULT; + + /* + * Any feature specified by user-space above + * VIRTIO_FEATURES_MAX is not supported by definition. + */ + for (i = copied; i < count; ++i) { + if (copy_from_user(&features, featurep + 1 + i, + sizeof(features))) + return -EFAULT; + if (features) + return -EOPNOTSUPP; + } + + for (i = 0; i < VIRTIO_FEATURES_DWORDS; i++) + if (all_features[i] & ~vhost_net_features[i]) + return -EOPNOTSUPP; + + return vhost_net_set_features(n, all_features); case VHOST_GET_BACKEND_FEATURES: features = VHOST_NET_BACKEND_FEATURES; if (copy_to_user(featurep, &features, sizeof(features))) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 3a5ebb973dba..1094256a943c 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -372,7 +372,7 @@ static void vhost_vq_reset(struct vhost_dev *dev, vq->log_used = false; vq->log_addr = -1ull; vq->private_data = NULL; - vq->acked_features = 0; + virtio_features_zero(vq->acked_features_array); vq->acked_backend_features = 0; vq->log_base = NULL; vq->error_ctx = NULL; diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index bb75a292d50c..d1aed35c4b07 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -133,7 +133,7 @@ struct vhost_virtqueue { struct vhost_iotlb *umem; struct vhost_iotlb *iotlb; void *private_data; - u64 acked_features; + VIRTIO_DECLARE_FEATURES(acked_features); u64 acked_backend_features; /* Log write descriptors */ void __user *log_base; @@ -291,7 +291,7 @@ static inline void *vhost_vq_get_backend(struct vhost_virtqueue *vq) static inline bool vhost_has_feature(struct vhost_virtqueue *vq, int bit) { - return vq->acked_features & (1ULL << bit); + return virtio_features_test_bit(vq->acked_features_array, bit); } static inline bool vhost_backend_has_feature(struct vhost_virtqueue *vq, int bit) diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index d4b3e2ae1314..d6ad01fbb8d2 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -235,4 +235,11 @@ */ #define VHOST_VDPA_GET_VRING_SIZE _IOWR(VHOST_VIRTIO, 0x82, \ struct vhost_vring_state) + +/* Extended features manipulation */ +#define VHOST_GET_FEATURES_ARRAY _IOR(VHOST_VIRTIO, 0x83, \ + struct vhost_features_array) +#define VHOST_SET_FEATURES_ARRAY _IOW(VHOST_VIRTIO, 0x83, \ + struct vhost_features_array) + #endif diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h index d7656908f730..1c39cc5f5a31 100644 --- a/include/uapi/linux/vhost_types.h +++ b/include/uapi/linux/vhost_types.h @@ -110,6 
+110,11 @@ struct vhost_msg_v2 { }; }; +struct vhost_features_array { + __u64 count; /* number of entries present in features array */ + __u64 features[] __counted_by(count); +}; + struct vhost_memory_region { __u64 guest_phys_addr; __u64 memory_size; /* bytes */ -- cgit v1.2.3 From a2fb4bc4e2a6a031683910d85b278c1d25ae5420 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 8 Jul 2025 17:54:59 +0200 Subject: net: implement virtio helpers to handle UDP GSO tunneling. The virtio specification is introducing support for GSO over UDP tunnel. This patch brings in the needed defines and the additional virtio hdr parsing/building helpers. The UDP tunnel support uses additional fields in the virtio hdr, and the location of those fields can change depending on other negotiated features - specifically VIRTIO_NET_F_HASH_REPORT. Try to be as conservative as possible with the new field validation. The existing implementation for plain GSO offloads allows invalid/self-contradictory values of such fields. With GSO over UDP tunnel we can be more strict, with no need to deal with a legacy implementation. Since the checksum-related field validation is asymmetric in the driver and in the device, introduce a separate helper to implement the new checks (to be used only on the driver side). Note that while the feature space exceeds the 64-bit boundaries, the guest offload space is fixed by the specification of the VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET command to a 64-bit size. Prior to the UDP tunnel GSO support, each guest offload bit corresponded to the feature bit with the same value and vice versa. Due to the limited 'guest offload' space, relevant features in the high 64 bits are 'mapped' to free bits in the lower range. That is simpler than defining a new command (and associated features) to exchange an extended guest offloads set. As a consequence, the uAPIs also specify the mapped guest offload value corresponding to the UDP tunnel GSO features. Acked-by: Jason Wang Signed-off-by: Paolo Abeni -- v4 -> v5: - avoid lines above 80 chars v3 -> v4: - fixed offset for UDP GSO tunnel, updated the helpers accordingly - tried to clarify the vlan_hlen semantics - virtio_net_chk_data_valid() -> virtio_net_handle_csum_offload() v2 -> v3: - add definitions for possible vnet hdr layouts with tunnel support v1 -> v2: - 'relay' -> 'rely' typo - less unclear comment WRT enforced inner GSO checks - inner header fields are allowed only with 'modern' virtio, thus are always le - clarified in the commit message the need for 'mapped features' defines - assume little_endian is true when UDP GSO is enabled.
- fix inner proto type value --- include/linux/virtio_net.h | 197 ++++++++++++++++++++++++++++++++++++++-- include/uapi/linux/virtio_net.h | 33 +++++++ 2 files changed, 222 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 02a9f4dc594d..20e0584db1dd 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -47,9 +47,9 @@ static inline int virtio_net_hdr_set_proto(struct sk_buff *skb, return 0; } -static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, - const struct virtio_net_hdr *hdr, - bool little_endian) +static inline int __virtio_net_hdr_to_skb(struct sk_buff *skb, + const struct virtio_net_hdr *hdr, + bool little_endian, u8 hdr_gso_type) { unsigned int nh_min_len = sizeof(struct iphdr); unsigned int gso_type = 0; @@ -57,8 +57,8 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, unsigned int p_off = 0; unsigned int ip_proto; - if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { - switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { + if (hdr_gso_type != VIRTIO_NET_HDR_GSO_NONE) { + switch (hdr_gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: gso_type = SKB_GSO_TCPV4; ip_proto = IPPROTO_TCP; @@ -84,7 +84,7 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, return -EINVAL; } - if (hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN) + if (hdr_gso_type & VIRTIO_NET_HDR_GSO_ECN) gso_type |= SKB_GSO_TCP_ECN; if (hdr->gso_size == 0) @@ -122,7 +122,8 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, if (!protocol) virtio_net_hdr_set_proto(skb, hdr); - else if (!virtio_net_hdr_match_proto(protocol, hdr->gso_type)) + else if (!virtio_net_hdr_match_proto(protocol, + hdr_gso_type)) return -EINVAL; else skb->protocol = protocol; @@ -153,7 +154,7 @@ retry: } } - if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { + if (hdr_gso_type != VIRTIO_NET_HDR_GSO_NONE) { u16 gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size); unsigned int nh_off = p_off; struct skb_shared_info *shinfo = skb_shinfo(skb); @@ -199,6 +200,13 @@ retry: return 0; } +static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, + const struct virtio_net_hdr *hdr, + bool little_endian) +{ + return __virtio_net_hdr_to_skb(skb, hdr, little_endian, hdr->gso_type); +} + static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, struct virtio_net_hdr *hdr, bool little_endian, @@ -242,4 +250,177 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, return 0; } +static inline unsigned int virtio_l3min(bool is_ipv6) +{ + return is_ipv6 ? sizeof(struct ipv6hdr) : sizeof(struct iphdr); +} + +static inline int +virtio_net_hdr_tnl_to_skb(struct sk_buff *skb, + const struct virtio_net_hdr_v1_hash_tunnel *vhdr, + bool tnl_hdr_negotiated, + bool tnl_csum_negotiated, + bool little_endian) +{ + const struct virtio_net_hdr *hdr = (const struct virtio_net_hdr *)vhdr; + unsigned int inner_nh, outer_th, inner_th; + unsigned int inner_l3min, outer_l3min; + u8 gso_inner_type, gso_tunnel_type; + bool outer_isv6, inner_isv6; + int ret; + + gso_tunnel_type = hdr->gso_type & VIRTIO_NET_HDR_GSO_UDP_TUNNEL; + if (!gso_tunnel_type) + return virtio_net_hdr_to_skb(skb, hdr, little_endian); + + /* Tunnel not supported/negotiated, but the hdr asks for it. */ + if (!tnl_hdr_negotiated) + return -EINVAL; + + /* Either ipv4 or ipv6. */ + if (gso_tunnel_type == VIRTIO_NET_HDR_GSO_UDP_TUNNEL) + return -EINVAL; + + /* The UDP tunnel must carry a GSO packet, but no UFO. 
*/ + gso_inner_type = hdr->gso_type & ~(VIRTIO_NET_HDR_GSO_ECN | + VIRTIO_NET_HDR_GSO_UDP_TUNNEL); + if (!gso_inner_type || gso_inner_type == VIRTIO_NET_HDR_GSO_UDP) + return -EINVAL; + + /* Rely on csum being present. */ + if (!(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) + return -EINVAL; + + /* Validate offsets. */ + outer_isv6 = gso_tunnel_type & VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6; + inner_isv6 = gso_inner_type == VIRTIO_NET_HDR_GSO_TCPV6; + inner_l3min = virtio_l3min(inner_isv6); + outer_l3min = ETH_HLEN + virtio_l3min(outer_isv6); + + inner_th = __virtio16_to_cpu(little_endian, hdr->csum_start); + inner_nh = le16_to_cpu(vhdr->inner_nh_offset); + outer_th = le16_to_cpu(vhdr->outer_th_offset); + if (outer_th < outer_l3min || + inner_nh < outer_th + sizeof(struct udphdr) || + inner_th < inner_nh + inner_l3min) + return -EINVAL; + + /* Let the basic parsing deal with plain GSO features. */ + ret = __virtio_net_hdr_to_skb(skb, hdr, true, + hdr->gso_type & ~gso_tunnel_type); + if (ret) + return ret; + + /* In case of USO, the inner protocol is still unknown and + * `inner_isv6` is just a guess, additional parsing is needed. + * The previous validation ensures that accessing an ipv4 inner + * network header is safe. + */ + if (gso_inner_type == VIRTIO_NET_HDR_GSO_UDP_L4) { + struct iphdr *iphdr = (struct iphdr *)(skb->data + inner_nh); + + inner_isv6 = iphdr->version == 6; + inner_l3min = virtio_l3min(inner_isv6); + if (inner_th < inner_nh + inner_l3min) + return -EINVAL; + } + + skb_set_inner_protocol(skb, inner_isv6 ? htons(ETH_P_IPV6) : + htons(ETH_P_IP)); + if (hdr->flags & VIRTIO_NET_HDR_F_UDP_TUNNEL_CSUM) { + if (!tnl_csum_negotiated) + return -EINVAL; + + skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; + } else { + skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL; + } + + skb->inner_transport_header = inner_th + skb_headroom(skb); + skb->inner_network_header = inner_nh + skb_headroom(skb); + skb->inner_mac_header = inner_nh + skb_headroom(skb); + skb->transport_header = outer_th + skb_headroom(skb); + skb->encapsulation = 1; + return 0; +} + +/* Checksum-related fields validation for the driver */ +static inline int virtio_net_handle_csum_offload(struct sk_buff *skb, + struct virtio_net_hdr *hdr, + bool tnl_csum_negotiated) +{ + if (!(hdr->gso_type & VIRTIO_NET_HDR_GSO_UDP_TUNNEL)) { + if (!(hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID)) + return 0; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + if (!(hdr->flags & VIRTIO_NET_HDR_F_UDP_TUNNEL_CSUM)) + return 0; + + /* tunnel csum packets are invalid when the related + * feature has not been negotiated + */ + if (!tnl_csum_negotiated) + return -EINVAL; + skb->csum_level = 1; + return 0; + } + + /* DATA_VALID is mutually exclusive with NEEDS_CSUM, and GSO + * over UDP tunnel requires the latter + */ + if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID) + return -EINVAL; + return 0; +} + +/* + * vlan_hlen always refers to the outermost MAC header. That also + * means it refers to the only MAC header, if the packet does not carry + * any encapsulation. 
+ */ +static inline int +virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb, + struct virtio_net_hdr_v1_hash_tunnel *vhdr, + bool tnl_hdr_negotiated, + bool little_endian, + int vlan_hlen) +{ + struct virtio_net_hdr *hdr = (struct virtio_net_hdr *)vhdr; + unsigned int inner_nh, outer_th; + int tnl_gso_type; + int ret; + + tnl_gso_type = skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL | + SKB_GSO_UDP_TUNNEL_CSUM); + if (!tnl_gso_type) + return virtio_net_hdr_from_skb(skb, hdr, little_endian, false, + vlan_hlen); + + /* Tunnel support not negotiated but skb ask for it. */ + if (!tnl_hdr_negotiated) + return -EINVAL; + + /* Let the basic parsing deal with plain GSO features. */ + skb_shinfo(skb)->gso_type &= ~tnl_gso_type; + ret = virtio_net_hdr_from_skb(skb, hdr, true, false, vlan_hlen); + skb_shinfo(skb)->gso_type |= tnl_gso_type; + if (ret) + return ret; + + if (skb->protocol == htons(ETH_P_IPV6)) + hdr->gso_type |= VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6; + else + hdr->gso_type |= VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4; + + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM) + hdr->flags |= VIRTIO_NET_HDR_F_UDP_TUNNEL_CSUM; + + inner_nh = skb->inner_network_header - skb_headroom(skb); + outer_th = skb->transport_header - skb_headroom(skb); + vhdr->inner_nh_offset = cpu_to_le16(inner_nh); + vhdr->outer_th_offset = cpu_to_le16(outer_th); + return 0; +} + #endif /* _LINUX_VIRTIO_NET_H */ diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 963540deae66..8bf27ab8bcb4 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -70,6 +70,28 @@ * with the same MAC. */ #define VIRTIO_NET_F_SPEED_DUPLEX 63 /* Device set linkspeed and duplex */ +#define VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO 65 /* Driver can receive + * GSO-over-UDP-tunnel packets + */ +#define VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM 66 /* Driver handles + * GSO-over-UDP-tunnel + * packets with partial csum + * for the outer header + */ +#define VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO 67 /* Device can receive + * GSO-over-UDP-tunnel packets + */ +#define VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM 68 /* Device handles + * GSO-over-UDP-tunnel + * packets with partial csum + * for the outer header + */ + +/* Offloads bits corresponding to VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO{,_CSUM} + * features + */ +#define VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_MAPPED 46 +#define VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM_MAPPED 47 #ifndef VIRTIO_NET_NO_LEGACY #define VIRTIO_NET_F_GSO 6 /* Host handles pkts w/ any GSO type */ @@ -131,12 +153,17 @@ struct virtio_net_hdr_v1 { #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start, csum_offset */ #define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */ #define VIRTIO_NET_HDR_F_RSC_INFO 4 /* rsc info in csum_ fields */ +#define VIRTIO_NET_HDR_F_UDP_TUNNEL_CSUM 8 /* UDP tunnel csum offload */ __u8 flags; #define VIRTIO_NET_HDR_GSO_NONE 0 /* Not a GSO frame */ #define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */ #define VIRTIO_NET_HDR_GSO_UDP 3 /* GSO frame, IPv4 UDP (UFO) */ #define VIRTIO_NET_HDR_GSO_TCPV6 4 /* GSO frame, IPv6 TCP */ #define VIRTIO_NET_HDR_GSO_UDP_L4 5 /* GSO frame, IPv4& IPv6 UDP (USO) */ +#define VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4 0x20 /* UDPv4 tunnel present */ +#define VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6 0x40 /* UDPv6 tunnel present */ +#define VIRTIO_NET_HDR_GSO_UDP_TUNNEL (VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4 | \ + VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6) #define VIRTIO_NET_HDR_GSO_ECN 0x80 /* TCP has ECN set */ __u8 gso_type; __virtio16 hdr_len; 
/* Ethernet + IP + tcp/udp hdrs */ @@ -181,6 +208,12 @@ struct virtio_net_hdr_v1_hash { __le16 padding; }; +struct virtio_net_hdr_v1_hash_tunnel { + struct virtio_net_hdr_v1_hash hash_hdr; + __le16 outer_th_offset; + __le16 inner_nh_offset; +}; + #ifndef VIRTIO_NET_NO_LEGACY /* This header comes first in the scatter-gather list. * For legacy virtio, if VIRTIO_F_ANY_LAYOUT is not negotiated, it must -- cgit v1.2.3 From 288f30435132d2f9e7a29ec9b9745a4f9dc7fd37 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 8 Jul 2025 17:55:30 +0200 Subject: tun: enable gso over UDP tunnel support. Add new tun features to represent the newly introduced virtio GSO over UDP tunnel offload. Allow detection and selection of such features via the existing TUNSETOFFLOAD ioctl, and compute the expected virtio header size and tunnel header offset using the current netdev features, so that the newly introduced virtio helpers serializing the extended virtio header can be plugged in almost seamlessly. Acked-by: Jason Wang Signed-off-by: Paolo Abeni --- v6 -> v7: - rebased v4 -> v5: - encapsulate the guest feature guessing in a tun helper - dropped irrelevant check on xdp buff headroom - do not remove unrelated blank line - avoid line len > 80 char v3 -> v4: - virtio tnl-related fields are at fixed offset, cleanup the code accordingly. - use netdev features instead of flags bit to check for the configured offload - drop packet in case of enabled features/configured hdr size mismatch v2 -> v3: - cleaned-up uAPI comments - use explicit struct layout instead of raw buf. --- drivers/net/tun.c | 58 +++++++++++++++++++++---- drivers/net/tun_vnet.h | 101 ++++++++++++++++++++++++++++++++++++++++---- include/uapi/linux/if_tun.h | 9 ++++ 3 files changed, 150 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index f8c5e2fd04df..abc91f28dac4 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -186,7 +186,8 @@ struct tun_struct { struct net_device *dev; netdev_features_t set_features; #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \ - NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4) + NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4 | \ + NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_UDP_TUNNEL_CSUM) int align; int vnet_hdr_sz; @@ -925,6 +926,7 @@ static int tun_net_init(struct net_device *dev) dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; + dev->hw_enc_features = dev->hw_features; dev->features = dev->hw_features; dev->vlan_features = dev->features & ~(NETIF_F_HW_VLAN_CTAG_TX | @@ -1698,7 +1700,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb; size_t total_len = iov_iter_count(from); size_t len = total_len, align = tun->align, linear; - struct virtio_net_hdr gso = { 0 }; + struct virtio_net_hdr_v1_hash_tunnel hdr; + struct virtio_net_hdr *gso; int good_linear; int copylen; int hdr_len = 0; @@ -1708,6 +1711,15 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, int skb_xdp = 1; bool frags = tun_napi_frags_enabled(tfile); enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; + netdev_features_t features = 0; + + /* + * Keep it easy and always zero the whole buffer, even if the + * tunnel-related field will be touched only when the feature + * is enabled and the hdr size is compatible.
+ */ + memset(&hdr, 0, sizeof(hdr)); + gso = (struct virtio_net_hdr *)&hdr; if (!(tun->flags & IFF_NO_PI)) { if (len < sizeof(pi)) @@ -1721,7 +1733,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, if (tun->flags & IFF_VNET_HDR) { int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz); - hdr_len = tun_vnet_hdr_get(vnet_hdr_sz, tun->flags, from, &gso); + features = tun_vnet_hdr_guest_features(vnet_hdr_sz); + hdr_len = __tun_vnet_hdr_get(vnet_hdr_sz, tun->flags, + features, from, gso); if (hdr_len < 0) return hdr_len; @@ -1755,7 +1769,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, * (e.g gso or jumbo packet), we will do it at after * skb was created with generic XDP routine. */ - skb = tun_build_skb(tun, tfile, from, &gso, len, &skb_xdp); + skb = tun_build_skb(tun, tfile, from, gso, len, &skb_xdp); err = PTR_ERR_OR_ZERO(skb); if (err) goto drop; @@ -1799,7 +1813,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, } } - if (tun_vnet_hdr_to_skb(tun->flags, skb, &gso)) { + if (tun_vnet_hdr_tnl_to_skb(tun->flags, features, skb, &hdr)) { atomic_long_inc(&tun->rx_frame_errors); err = -EINVAL; goto free_skb; @@ -2050,13 +2064,21 @@ static ssize_t tun_put_user(struct tun_struct *tun, } if (vnet_hdr_sz) { - struct virtio_net_hdr gso; + struct virtio_net_hdr_v1_hash_tunnel hdr; + struct virtio_net_hdr *gso; - ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso); + ret = tun_vnet_hdr_tnl_from_skb(tun->flags, tun->dev, skb, + &hdr); if (ret) return ret; - ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso); + /* + * Drop the packet if the configured header size is too small + * WRT the enabled offloads. + */ + gso = (struct virtio_net_hdr *)&hdr; + ret = __tun_vnet_hdr_put(vnet_hdr_sz, tun->dev->features, + iter, gso); if (ret) return ret; } @@ -2357,10 +2379,12 @@ static int tun_xdp_one(struct tun_struct *tun, { unsigned int datasize = xdp->data_end - xdp->data; struct tun_xdp_hdr *hdr = xdp->data_hard_start; + struct virtio_net_hdr_v1_hash_tunnel *tnl_hdr; struct virtio_net_hdr *gso = &hdr->gso; struct bpf_prog *xdp_prog; struct sk_buff *skb = NULL; struct sk_buff_head *queue; + netdev_features_t features; u32 rxhash = 0, act; int buflen = hdr->buflen; int metasize = 0; @@ -2426,7 +2450,9 @@ build: if (metasize > 0) skb_metadata_set(skb, metasize); - if (tun_vnet_hdr_to_skb(tun->flags, skb, gso)) { + features = tun_vnet_hdr_guest_features(READ_ONCE(tun->vnet_hdr_sz)); + tnl_hdr = (struct virtio_net_hdr_v1_hash_tunnel *)gso; + if (tun_vnet_hdr_tnl_to_skb(tun->flags, features, skb, tnl_hdr)) { atomic_long_inc(&tun->rx_frame_errors); kfree_skb(skb); ret = -EINVAL; @@ -2812,6 +2838,8 @@ static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr) } +#define PLAIN_GSO (NETIF_F_GSO_UDP_L4 | NETIF_F_TSO | NETIF_F_TSO6) + /* This is like a cut-down ethtool ops, except done via tun fd so no * privs required. */ static int set_offload(struct tun_struct *tun, unsigned long arg) @@ -2841,6 +2869,18 @@ static int set_offload(struct tun_struct *tun, unsigned long arg) features |= NETIF_F_GSO_UDP_L4; arg &= ~(TUN_F_USO4 | TUN_F_USO6); } + + /* + * Tunnel offload is allowed only if some plain offload is + * available, too. 
+ */ + if (features & PLAIN_GSO && arg & TUN_F_UDP_TUNNEL_GSO) { + features |= NETIF_F_GSO_UDP_TUNNEL; + if (arg & TUN_F_UDP_TUNNEL_GSO_CSUM) + features |= NETIF_F_GSO_UDP_TUNNEL_CSUM; + arg &= ~(TUN_F_UDP_TUNNEL_GSO | + TUN_F_UDP_TUNNEL_GSO_CSUM); + } } /* This gives the user a way to test for new features in future by diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h index 58b9ac7a5fc4..81662328b2c7 100644 --- a/drivers/net/tun_vnet.h +++ b/drivers/net/tun_vnet.h @@ -6,6 +6,8 @@ #define TUN_VNET_LE 0x80000000 #define TUN_VNET_BE 0x40000000 +#define TUN_VNET_TNL_SIZE sizeof(struct virtio_net_hdr_v1_hash_tunnel) + static inline bool tun_vnet_legacy_is_little_endian(unsigned int flags) { bool be = IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && @@ -107,16 +109,26 @@ static inline long tun_vnet_ioctl(int *vnet_hdr_sz, unsigned int *flags, } } -static inline int tun_vnet_hdr_get(int sz, unsigned int flags, - struct iov_iter *from, - struct virtio_net_hdr *hdr) +static inline unsigned int tun_vnet_parse_size(netdev_features_t features) +{ + if (!(features & NETIF_F_GSO_UDP_TUNNEL)) + return sizeof(struct virtio_net_hdr); + + return TUN_VNET_TNL_SIZE; +} + +static inline int __tun_vnet_hdr_get(int sz, unsigned int flags, + netdev_features_t features, + struct iov_iter *from, + struct virtio_net_hdr *hdr) { + unsigned int parsed_size = tun_vnet_parse_size(features); u16 hdr_len; if (iov_iter_count(from) < sz) return -EINVAL; - if (!copy_from_iter_full(hdr, sizeof(*hdr), from)) + if (!copy_from_iter_full(hdr, parsed_size, from)) return -EFAULT; hdr_len = tun_vnet16_to_cpu(flags, hdr->hdr_len); @@ -129,32 +141,70 @@ static inline int tun_vnet_hdr_get(int sz, unsigned int flags, if (hdr_len > iov_iter_count(from)) return -EINVAL; - iov_iter_advance(from, sz - sizeof(*hdr)); + iov_iter_advance(from, sz - parsed_size); return hdr_len; } -static inline int tun_vnet_hdr_put(int sz, struct iov_iter *iter, - const struct virtio_net_hdr *hdr) +static inline int tun_vnet_hdr_get(int sz, unsigned int flags, + struct iov_iter *from, + struct virtio_net_hdr *hdr) +{ + return __tun_vnet_hdr_get(sz, flags, 0, from, hdr); +} + +static inline int __tun_vnet_hdr_put(int sz, netdev_features_t features, + struct iov_iter *iter, + const struct virtio_net_hdr *hdr) { + unsigned int parsed_size = tun_vnet_parse_size(features); + if (unlikely(iov_iter_count(iter) < sz)) return -EINVAL; - if (unlikely(copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr))) + if (unlikely(copy_to_iter(hdr, parsed_size, iter) != parsed_size)) return -EFAULT; - if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr)) + if (iov_iter_zero(sz - parsed_size, iter) != sz - parsed_size) return -EFAULT; return 0; } +static inline int tun_vnet_hdr_put(int sz, struct iov_iter *iter, + const struct virtio_net_hdr *hdr) +{ + return __tun_vnet_hdr_put(sz, 0, iter, hdr); +} + static inline int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, const struct virtio_net_hdr *hdr) { return virtio_net_hdr_to_skb(skb, hdr, tun_vnet_is_little_endian(flags)); } +/* + * Tun is not aware of the negotiated guest features, guess them from the + * virtio net hdr size + */ +static inline netdev_features_t tun_vnet_hdr_guest_features(int vnet_hdr_sz) +{ + if (vnet_hdr_sz >= TUN_VNET_TNL_SIZE) + return NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_UDP_TUNNEL_CSUM; + return 0; +} + +static inline int +tun_vnet_hdr_tnl_to_skb(unsigned int flags, netdev_features_t features, + struct sk_buff *skb, + const struct virtio_net_hdr_v1_hash_tunnel *hdr) +{ + return 
virtio_net_hdr_tnl_to_skb(skb, hdr, + features & NETIF_F_GSO_UDP_TUNNEL, + features & NETIF_F_GSO_UDP_TUNNEL_CSUM, + tun_vnet_is_little_endian(flags)); +} + static inline int tun_vnet_hdr_from_skb(unsigned int flags, const struct net_device *dev, const struct sk_buff *skb, @@ -183,4 +233,37 @@ static inline int tun_vnet_hdr_from_skb(unsigned int flags, return 0; } +static inline int +tun_vnet_hdr_tnl_from_skb(unsigned int flags, + const struct net_device *dev, + const struct sk_buff *skb, + struct virtio_net_hdr_v1_hash_tunnel *tnl_hdr) +{ + bool has_tnl_offload = !!(dev->features & NETIF_F_GSO_UDP_TUNNEL); + int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0; + + if (virtio_net_hdr_tnl_from_skb(skb, tnl_hdr, has_tnl_offload, + tun_vnet_is_little_endian(flags), + vlan_hlen)) { + struct virtio_net_hdr_v1 *hdr = &tnl_hdr->hash_hdr.hdr; + struct skb_shared_info *sinfo = skb_shinfo(skb); + + if (net_ratelimit()) { + int hdr_len = tun_vnet16_to_cpu(flags, hdr->hdr_len); + + netdev_err(dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", + sinfo->gso_type, + tun_vnet16_to_cpu(flags, hdr->gso_size), + tun_vnet16_to_cpu(flags, hdr->hdr_len)); + print_hex_dump(KERN_ERR, "tun: ", DUMP_PREFIX_NONE, + 16, 1, skb->head, min(hdr_len, 64), + true); + } + WARN_ON_ONCE(1); + return -EINVAL; + } + + return 0; +} + #endif /* TUN_VNET_H */ diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 287cdc81c939..79d53c7a1ebd 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -93,6 +93,15 @@ #define TUN_F_USO4 0x20 /* I can handle USO for IPv4 packets */ #define TUN_F_USO6 0x40 /* I can handle USO for IPv6 packets */ +/* I can handle TSO/USO for UDP tunneled packets */ +#define TUN_F_UDP_TUNNEL_GSO 0x080 + +/* + * I can handle TSO/USO for UDP tunneled packets requiring csum offload for + * the outer header + */ +#define TUN_F_UDP_TUNNEL_GSO_CSUM 0x100 + /* Protocol info prepended to the packets (when IFF_NO_PI is not set) */ #define TUN_PKT_STRIP 0x0001 struct tun_pi { -- cgit v1.2.3 From a7cec20845a67ff4f3c924255519341f37d993f9 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 25 Jun 2025 17:12:22 -0700 Subject: KVM: x86: Provide a capability to disable APERF/MPERF read intercepts Allow a guest to read the physical IA32_APERF and IA32_MPERF MSRs without interception. The IA32_APERF and IA32_MPERF MSRs are not virtualized. Writes are not handled at all. The MSR values are not zeroed on vCPU creation, saved on suspend, or restored on resume. No accommodation is made for processor migration or for sharing a logical processor with other tasks. No adjustments are made for non-unit TSC multipliers. The MSRs do not account for time the same way as the comparable PMU events, whether the PMU is virtualized by the traditional emulation method or the new mediated pass-through approach. Nonetheless, in a properly constrained environment, this capability can be combined with a guest CPUID table that advertises support for CPUID.6:ECX.APERFMPERF[bit 0] to induce a Linux guest to report the effective physical CPU frequency in /proc/cpuinfo. Moreover, there is no performance cost for this capability. 
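For illustration, here is a minimal userspace sketch of how a VMM might opt in to this capability; the vm_fd variable and the error handling are illustrative only, and the guest CPUID bit still has to be set separately (e.g. via KVM_SET_CPUID2):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: let the guest read IA32_APERF/IA32_MPERF without exits. */
static int enable_aperfmperf_passthrough(int vm_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_X86_DISABLE_EXITS,
		.args = { KVM_X86_DISABLE_EXITS_APERFMPERF },
	};

	/*
	 * KVM does not set CPUID.6:ECX.APERFMPERF[bit 0] on our behalf;
	 * the VMM must advertise it in the guest CPUID table explicitly.
	 */
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}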
Signed-off-by: Jim Mattson Link: https://lore.kernel.org/r/20250530185239.2335185-3-jmattson@google.com Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250626001225.744268-3-seanjc@google.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 23 +++++++++++++++++++++++ arch/x86/kvm/svm/nested.c | 4 +++- arch/x86/kvm/svm/svm.c | 5 +++++ arch/x86/kvm/vmx/nested.c | 6 ++++++ arch/x86/kvm/vmx/vmx.c | 4 ++++ arch/x86/kvm/x86.c | 6 +++++- arch/x86/kvm/x86.h | 5 +++++ include/uapi/linux/kvm.h | 1 + tools/include/uapi/linux/kvm.h | 1 + 9 files changed, 53 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index f0d961436d0f..6be1ddedec49 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7844,6 +7844,7 @@ Valid bits in args[0] are:: #define KVM_X86_DISABLE_EXITS_HLT (1 << 1) #define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) #define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) + #define KVM_X86_DISABLE_EXITS_APERFMPERF (1 << 4) Enabling this capability on a VM provides userspace with a way to no longer intercept some instructions for improved latency in some @@ -7854,6 +7855,28 @@ all such vmexits. Do not enable KVM_FEATURE_PV_UNHALT if you disable HLT exits. +Virtualizing the ``IA32_APERF`` and ``IA32_MPERF`` MSRs requires more +than just disabling APERF/MPERF exits. While both Intel and AMD +document strict usage conditions for these MSRs--emphasizing that only +the ratio of their deltas over a time interval (T0 to T1) is +architecturally defined--simply passing through the MSRs can still +produce an incorrect ratio. + +This erroneous ratio can occur if, between T0 and T1: + +1. The vCPU thread migrates between logical processors. +2. Live migration or suspend/resume operations take place. +3. Another task shares the vCPU's logical processor. +4. C-states lower than C0 are emulated (e.g., via HLT interception). +5. The guest TSC frequency doesn't match the host TSC frequency. + +Due to these complexities, KVM does not automatically associate this +passthrough capability with the guest CPUID bit, +``CPUID.6:ECX.APERFMPERF[bit 0]``. Userspace VMMs that deem this +mechanism adequate for virtualizing the ``IA32_APERF`` and +``IA32_MPERF`` MSRs must set the guest CPUID bit explicitly. + + 7.14 KVM_CAP_S390_HPAGE_1M -------------------------- diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 749f7b866ac8..b7fd2e869998 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -194,7 +194,7 @@ void recalc_intercepts(struct vcpu_svm *svm) * Hardcode the capacity of the array based on the maximum number of _offsets_. * MSRs are batched together, so there are fewer offsets than MSRs. 
*/ -static int nested_svm_msrpm_merge_offsets[6] __ro_after_init; +static int nested_svm_msrpm_merge_offsets[7] __ro_after_init; static int nested_svm_nr_msrpm_merge_offsets __ro_after_init; typedef unsigned long nsvm_msrpm_merge_t; @@ -216,6 +216,8 @@ int __init nested_svm_init_msrpm_merge_offsets(void) MSR_IA32_SPEC_CTRL, MSR_IA32_PRED_CMD, MSR_IA32_FLUSH_CMD, + MSR_IA32_APERF, + MSR_IA32_MPERF, MSR_IA32_LASTBRANCHFROMIP, MSR_IA32_LASTBRANCHTOIP, MSR_IA32_LASTINTFROMIP, diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 1261447ffcdd..fedf68c63318 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -838,6 +838,11 @@ static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu) svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW, guest_cpuid_is_intel_compatible(vcpu)); + if (kvm_aperfmperf_in_guest(vcpu->kvm)) { + svm_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R); + svm_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R); + } + if (sev_es_guest(vcpu->kvm)) sev_es_recalc_msr_intercepts(vcpu); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index c69df3aba8d1..b8ea1969113d 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -715,6 +715,12 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_FLUSH_CMD, MSR_TYPE_W); + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_APERF, MSR_TYPE_R); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_MPERF, MSR_TYPE_R); + kvm_vcpu_unmap(vcpu, &map); vmx->nested.force_msr_bitmap_recalc = false; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b064e50c6e64..77bbb2b93418 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4084,6 +4084,10 @@ void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); } + if (kvm_aperfmperf_in_guest(vcpu->kvm)) { + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R); + } /* PT MSRs can be passed through iff PT is exposed to the guest. 
*/ if (vmx_pt_mode_is_host_guest()) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6dda7bf4c44c..912260e3725d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4577,6 +4577,9 @@ static u64 kvm_get_allowed_disable_exits(void) { u64 r = KVM_X86_DISABLE_EXITS_PAUSE; + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) + r |= KVM_X86_DISABLE_EXITS_APERFMPERF; + if (!mitigate_smt_rsb) { r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_CSTATE; @@ -6613,7 +6616,8 @@ split_irqchip_unlock: if (!mitigate_smt_rsb && boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible() && - (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE)) + (cap->args[0] & ~(KVM_X86_DISABLE_EXITS_PAUSE | + KVM_X86_DISABLE_EXITS_APERFMPERF))) pr_warn_once(SMT_RSB_MSG); kvm_disable_exits(kvm, cap->args[0]); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 17ec8436e565..e77281b6e2b2 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -524,6 +524,11 @@ static inline bool kvm_cstate_in_guest(struct kvm *kvm) { return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_CSTATE; } +static inline bool kvm_aperfmperf_in_guest(struct kvm *kvm) +{ + return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_APERFMPERF; +} + static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm) { return kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_ENABLED; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7a4c35ff03fe..aeb2ca10b190 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -644,6 +644,7 @@ struct kvm_ioeventfd { #define KVM_X86_DISABLE_EXITS_HLT (1 << 1) #define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) #define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) +#define KVM_X86_DISABLE_EXITS_APERFMPERF (1 << 4) /* for KVM_ENABLE_CAP */ struct kvm_enable_cap { diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index b6ae8ad8934b..eef57c117140 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -617,6 +617,7 @@ struct kvm_ioeventfd { #define KVM_X86_DISABLE_EXITS_HLT (1 << 1) #define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) #define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) +#define KVM_X86_DISABLE_EXITS_APERFMPERF (1 << 4) /* for KVM_ENABLE_CAP */ struct kvm_enable_cap { -- cgit v1.2.3 From 76fdb7eb4e1c91086ce9c3db6972c2ed48c96afb Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 8 Jul 2025 23:21:51 +1000 Subject: uapi: export PROCFS_ROOT_INO The root inode of /proc having a fixed inode number has been part of the core kernel ABI since its inception, and recently some userspace programs (mainly container runtimes) have started to explicitly depend on this behaviour. The main reason this is useful to userspace is that by checking that a suspect /proc handle has fstype PROC_SUPER_MAGIC and is PROCFS_ROOT_INO, they can then use openat2(RESOLVE_{NO_{XDEV,MAGICLINKS},BENEATH}) to ensure that there isn't a bind-mount that replaces some procfs file with a different one. This kind of attack has led to security issues in container runtimes in the past (such as CVE-2019-19921) and libraries like libpathrs[1] use this feature of procfs to provide safe procfs handling functions. There was also some trailing whitespace in the "struct proc_dir_entry" initialiser, so fix that up as well.
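As an illustration, a minimal sketch of the check described above (error handling elided; PROC_SUPER_MAGIC comes from <linux/magic.h>, PROCFS_ROOT_INO from the new uapi definition below):

#include <sys/stat.h>
#include <sys/vfs.h>
#include <linux/magic.h>
#include <linux/fs.h>

/* Return non-zero iff fd refers to the real procfs root. */
static int is_procfs_root(int fd)
{
	struct statfs sfs;
	struct stat st;

	if (fstatfs(fd, &sfs) != 0 || fstat(fd, &st) != 0)
		return 0;
	/*
	 * Both checks are needed: the magic alone would also match a
	 * procfs file bind-mounted over the expected root.
	 */
	return sfs.f_type == PROC_SUPER_MAGIC && st.st_ino == PROCFS_ROOT_INO;
}

Once this check passes, subsequent lookups can be done with openat2() and the RESOLVE flags mentioned above.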
[1]: https://github.com/openSUSE/libpathrs Signed-off-by: Aleksa Sarai Link: https://lore.kernel.org/20250708-uapi-procfs-root-ino-v1-1-6ae61e97c79b@cyphar.com Signed-off-by: Christian Brauner --- fs/proc/root.c | 10 +++++----- include/linux/proc_ns.h | 1 - include/uapi/linux/fs.h | 11 +++++++++++ 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/proc/root.c b/fs/proc/root.c index 06a297a27ba3..ed86ac710384 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -363,12 +363,12 @@ static const struct inode_operations proc_root_inode_operations = { * This is the root "inode" in the /proc tree.. */ struct proc_dir_entry proc_root = { - .low_ino = PROC_ROOT_INO, - .namelen = 5, - .mode = S_IFDIR | S_IRUGO | S_IXUGO, - .nlink = 2, + .low_ino = PROCFS_ROOT_INO, + .namelen = 5, + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .nlink = 2, .refcnt = REFCOUNT_INIT(1), - .proc_iops = &proc_root_inode_operations, + .proc_iops = &proc_root_inode_operations, .proc_dir_ops = &proc_root_operations, .parent = &proc_root, .subdir = RB_ROOT, diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 6258455e49a4..4b20375f3783 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -40,7 +40,6 @@ extern const struct proc_ns_operations timens_for_children_operations; * We always define these enumerators */ enum { - PROC_ROOT_INO = 1, PROC_IPC_INIT_INO = IPC_NS_INIT_INO, PROC_UTS_INIT_INO = UTS_NS_INIT_INO, PROC_USER_INIT_INO = USER_NS_INIT_INO, diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 0098b0ce8ccb..28238a3edbc1 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -60,6 +60,17 @@ #define RENAME_EXCHANGE (1 << 1) /* Exchange source and dest */ #define RENAME_WHITEOUT (1 << 2) /* Whiteout source */ +/* + * The root inode of procfs is guaranteed to always have the same inode number. + * For programs that make heavy use of procfs, verifying that the root is a + * real procfs root and using openat2(RESOLVE_{NO_{XDEV,MAGICLINKS},BENEATH}) + * will allow you to make sure you are never tricked into operating on the + * wrong procfs file. + */ +enum procfs_ino { + PROCFS_ROOT_INO = 1, +}; + struct file_clone_range { __s64 src_fd; __u64 src_offset; -- cgit v1.2.3 From 1fff2ee377e1c2230054e65092def460dd40b587 Mon Sep 17 00:00:00 2001 From: Mehdi Djait Date: Mon, 23 Jun 2025 15:51:15 +0200 Subject: media: uapi: videodev2: Fix comment for 12-bit packed Bayer formats For 12-bit packed Bayer formats: every two consecutive samples are packed into three bytes. Fix the corresponding comment. Signed-off-by: Mehdi Djait Reviewed-by: Laurent Pinchart Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- include/uapi/linux/videodev2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 6f7bd38dd5aa..1bb1979f6c18 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -726,7 +726,7 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_SGBRG12 v4l2_fourcc('G', 'B', '1', '2') /* 12 GBGB.. RGRG.. */ #define V4L2_PIX_FMT_SGRBG12 v4l2_fourcc('B', 'A', '1', '2') /* 12 GRGR.. BGBG.. */ #define V4L2_PIX_FMT_SRGGB12 v4l2_fourcc('R', 'G', '1', '2') /* 12 RGRG.. GBGB.. 
*/ - /* 12bit raw bayer packed, 6 bytes for every 4 pixels */ + /* 12bit raw bayer packed, 3 bytes for every 2 pixels */ #define V4L2_PIX_FMT_SBGGR12P v4l2_fourcc('p', 'B', 'C', 'C') #define V4L2_PIX_FMT_SGBRG12P v4l2_fourcc('p', 'G', 'C', 'C') #define V4L2_PIX_FMT_SGRBG12P v4l2_fourcc('p', 'g', 'C', 'C') -- cgit v1.2.3 From 45e359be1ce88fb22e61fa3aa23b2e450a6cae03 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sat, 5 Jul 2025 00:01:38 +0800 Subject: net: xsk: introduce XDP_MAX_TX_SKB_BUDGET setsockopt This patch provides a setsockopt method to let applications adjust how many descs are handled at most in one send syscall. It mitigates the situation where the default value (32) is too small and leads to a higher frequency of send syscalls. Considering the variety/complexity of applications, there is no absolutely ideal value fitting all cases. So keep 32 as the default value like before. The patch does the following things: - Add XDP_MAX_TX_SKB_BUDGET socket option. - Set max_tx_budget to 32 by default in the initialization phase as a per-socket granular control. - Set the range of max_tx_budget as [32, xs->tx->nentries]. The idea behind this comes out of real workloads in production. We use a user-level stack with xsk support to accelerate sending packets and minimize triggering syscalls. When the packets are aggregated, it's not hard to hit the upper bound (namely, 32). The moment the user-space stack fetches the -EAGAIN error number returned from sendto(), it will loop to try again until all the expected descs from the tx ring are sent out to the driver. Enlarging the XDP_MAX_TX_SKB_BUDGET value contributes to a lower frequency of sendto() calls and higher throughput/PPS. Here is what I did in production, along with some numbers: For one application I saw lately, I suggested using 128 as max_tx_budget because I saw two limitations without changing any default configuration: 1) XDP_MAX_TX_SKB_BUDGET, 2) socket sndbuf which is 212992 decided by net.core.wmem_default. As to XDP_MAX_TX_SKB_BUDGET, the scenario behind this was that I counted how many descs are transmitted to the driver at one time of sendto() based on the [1] patch and then calculated the probability of hitting the upper bound. Finally I chose 128 as a suitable value because 1) it covers most of the cases, 2) a higher number would not bring evident improvement. After tuning the parameters, a stable improvement of around 4% for both PPS and throughput and lower resource consumption were observed via strace -c -p xxx: 1) %time was decreased by 7.8% 2) error counter was decreased from 18367 to 572 [1]: https://lore.kernel.org/all/20250619093641.70700-1-kerneljasonxing@gmail.com/ Signed-off-by: Jason Xing Acked-by: Maciej Fijalkowski Link: https://patch.msgid.link/20250704160138.48677-1-kerneljasonxing@gmail.com Signed-off-by: Paolo Abeni --- Documentation/networking/af_xdp.rst | 9 +++++++++ include/net/xdp_sock.h | 1 + include/uapi/linux/if_xdp.h | 1 + net/xdp/xsk.c | 21 +++++++++++++++++++-- tools/include/uapi/linux/if_xdp.h | 1 + 5 files changed, 31 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst index d486014bb31d..50d92084a49c 100644 --- a/Documentation/networking/af_xdp.rst +++ b/Documentation/networking/af_xdp.rst @@ -438,6 +438,15 @@ is created by a privileged process and passed to a non-privileged one.
Once the option is set, kernel will refuse attempts to bind that socket to a different interface. Updating the value requires CAP_NET_RAW. +XDP_MAX_TX_SKB_BUDGET setsockopt +-------------------------------- + +This setsockopt sets the maximum number of descriptors that can be handled +and passed to the driver at one send syscall. It is applied in the copy +mode to allow the application to tune the per-socket maximum iteration count +for better throughput and a lower frequency of send syscalls. +Allowed range is [32, xs->tx->nentries]. + XDP_STATISTICS getsockopt ------------------------- diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index e8bd6ddb7b12..ce587a225661 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -84,6 +84,7 @@ struct xdp_sock { struct list_head map_list; /* Protects map_list */ spinlock_t map_list_lock; + u32 max_tx_budget; /* Protects multiple processes in the control path */ struct mutex mutex; struct xsk_queue *fq_tmp; /* Only as tmp storage before bind */ diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 44f2bb93e7e6..23a062781468 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -79,6 +79,7 @@ struct xdp_mmap_offsets { #define XDP_UMEM_COMPLETION_RING 6 #define XDP_STATISTICS 7 #define XDP_OPTIONS 8 +#define XDP_MAX_TX_SKB_BUDGET 9 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index bd61b0bc9c24..9c3acecc14b1 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -34,7 +34,7 @@ #include "xsk.h" #define TX_BATCH_SIZE 32 -#define MAX_PER_SOCKET_BUDGET (TX_BATCH_SIZE) +#define MAX_PER_SOCKET_BUDGET 32 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { @@ -783,10 +783,10 @@ free_err: static int __xsk_generic_xmit(struct sock *sk) { struct xdp_sock *xs = xdp_sk(sk); - u32 max_batch = TX_BATCH_SIZE; bool sent_frame = false; struct xdp_desc desc; struct sk_buff *skb; + u32 max_batch; int err = 0; mutex_lock(&xs->mutex); @@ -800,6 +800,7 @@ static int __xsk_generic_xmit(struct sock *sk) if (xs->queue_id >= xs->dev->real_num_tx_queues) goto out; + max_batch = READ_ONCE(xs->max_tx_budget); while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) { if (max_batch-- == 0) { err = -EAGAIN; @@ -1440,6 +1441,21 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, mutex_unlock(&xs->mutex); return err; } + case XDP_MAX_TX_SKB_BUDGET: + { + unsigned int budget; + + if (optlen != sizeof(budget)) + return -EINVAL; + if (copy_from_sockptr(&budget, optval, sizeof(budget))) + return -EFAULT; + if (!xs->tx || + budget < TX_BATCH_SIZE || budget > xs->tx->nentries) + return -EACCES; + + WRITE_ONCE(xs->max_tx_budget, budget); + return 0; + } default: break; } @@ -1737,6 +1753,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, xs = xdp_sk(sk); xs->state = XSK_READY; + xs->max_tx_budget = TX_BATCH_SIZE; mutex_init(&xs->mutex); INIT_LIST_HEAD(&xs->map_list); diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h index 44f2bb93e7e6..23a062781468 100644 --- a/tools/include/uapi/linux/if_xdp.h +++ b/tools/include/uapi/linux/if_xdp.h @@ -79,6 +79,7 @@ struct xdp_mmap_offsets { #define XDP_UMEM_COMPLETION_RING 6 #define XDP_STATISTICS 7 #define XDP_OPTIONS 8 +#define XDP_MAX_TX_SKB_BUDGET 9 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ -- cgit v1.2.3 From fca02263f27eee093379844ac0fb280bf70e6aed Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:58:54
-0700 Subject: iommufd: Correct virt_id kdoc at struct iommu_vdevice_alloc The userspace-api iommufd.rst has described it correctly but the uAPI doc remained uncorrected. Thus, fix it. Link: https://patch.msgid.link/r/2cdcecaf2babee16fda7545ccad4e5bed7a5032d.1752126748.git.nicolinc@nvidia.com Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/uapi/linux/iommufd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index f29b6c44655e..4bc05e4621c1 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -995,7 +995,7 @@ struct iommu_viommu_alloc { * @dev_id: The physical device to allocate a virtual instance on the vIOMMU * @out_vdevice_id: Object handle for the vDevice. Pass to IOMMU_DESTORY * @virt_id: Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID - of AMD IOMMU, and vRID of a nested Intel VT-d to a Context Table + of AMD IOMMU, and vRID of Intel VT-d * * Allocate a virtual device instance (for a physical device) against a vIOMMU. This instance holds the device's information (related to its vIOMMU) in a VM. -- cgit v1.2.3 From 1976cdf61ce9b6f97b5212676a3b9f74c68f6073 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:58:59 -0700 Subject: iommufd/viommu: Allow driver-specific user data for a vIOMMU object The new type of vIOMMU for the tegra241-cmdqv driver needs driver-specific user data. So, add data_len/uptr to the iommu_viommu_alloc uAPI and pass it in via the viommu_init iommu op. Link: https://patch.msgid.link/r/2315b0e164b355746387e960745ac9154caec124.1752126748.git.nicolinc@nvidia.com Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Acked-by: Pranjal Shrivastava Acked-by: Alok Tiwari Reviewed-by: Vasant Hegde Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/viommu.c | 8 +++++++- include/uapi/linux/iommufd.h | 6 ++++++ 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c index 2009a421efae..c0365849f849 100644 --- a/drivers/iommu/iommufd/viommu.c +++ b/drivers/iommu/iommufd/viommu.c @@ -17,6 +17,11 @@ void iommufd_viommu_destroy(struct iommufd_object *obj) int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) { struct iommu_viommu_alloc *cmd = ucmd->cmd; + const struct iommu_user_data user_data = { + .type = cmd->type, + .uptr = u64_to_user_ptr(cmd->data_uptr), + .len = cmd->data_len, + }; struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_viommu *viommu; struct iommufd_device *idev; @@ -84,7 +89,8 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) */ viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev); - rc = ops->viommu_init(viommu, hwpt_paging->common.domain, NULL); + rc = ops->viommu_init(viommu, hwpt_paging->common.domain, + user_data.len ?
&user_data : NULL); if (rc) goto out_put_hwpt; diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 4bc05e4621c1..04eee77335cf 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -965,6 +965,9 @@ enum iommu_viommu_type { * @dev_id: The device's physical IOMMU will be used to back the virtual IOMMU * @hwpt_id: ID of a nesting parent HWPT to associate to * @out_viommu_id: Output virtual IOMMU ID for the allocated object + * @data_len: Length of the type specific data + * @__reserved: Must be 0 + * @data_uptr: User pointer to a driver-specific virtual IOMMU data * * Allocate a virtual IOMMU object, representing the underlying physical IOMMU's * virtualization support that is a security-isolated slice of the real IOMMU HW @@ -985,6 +988,9 @@ struct iommu_viommu_alloc { __u32 dev_id; __u32 hwpt_id; __u32 out_viommu_id; + __u32 data_len; + __u32 __reserved; + __aligned_u64 data_uptr; }; #define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC) -- cgit v1.2.3 From d7974697de4d6fa1a1ed9ca43616a8500046f25a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Jul 2025 15:06:38 -0700 Subject: ethtool: mark ETHER_FLOW as usable for Rx hash Looks like some drivers (ena, enetc, fbnic.. there's probably more) consider ETHER_FLOW to be a legitimate target for flow hashing. I'm not sure how intentional that is from the uAPI perspective vs just an effect of the ethtool IOCTL doing minimal input validation. But Netlink will do strict validation, so we need to decide whether we allow this use case or not. I don't see a strong reason against it, and rejecting it would potentially regress a number of drivers. So update the comments and flow_type_hashable(). Link: https://patch.msgid.link/20250708220640.2738464-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool.h | 4 ++-- net/ethtool/ioctl.c | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 707c1844010c..9e9afdd1238a 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -2314,7 +2314,7 @@ enum { IPV6_USER_FLOW = 0x0e, /* spec only (usr_ip6_spec; nfc only) */ IPV4_FLOW = 0x10, /* hash only */ IPV6_FLOW = 0x11, /* hash only */ - ETHER_FLOW = 0x12, /* spec only (ether_spec) */ + ETHER_FLOW = 0x12, /* hash or spec (ether_spec) */ /* Used for GTP-U IPv4 and IPv6. * The format of GTP packets only includes @@ -2371,7 +2371,7 @@ enum { /* Flag to enable RSS spreading of traffic matching rule (nfc only) */ #define FLOW_RSS 0x20000000 -/* L3-L4 network traffic flow hash options */ +/* L2-L4 network traffic flow hash options */ #define RXH_L2DA (1 << 1) #define RXH_VLAN (1 << 2) #define RXH_L3_PROTO (1 << 3) diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 139f95620cdd..67f6d900a4ee 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -981,6 +981,7 @@ static int ethtool_rxnfc_copy_to_user(void __user *useraddr, static bool flow_type_hashable(u32 flow_type) { switch (flow_type) { + case ETHER_FLOW: case TCP_V4_FLOW: case UDP_V4_FLOW: case SCTP_V4_FLOW: -- cgit v1.2.3 From 178331743ca860561f60d04a7797a2fce13f0784 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Jul 2025 15:06:39 -0700 Subject: ethtool: rss: report which fields are configured for hashing Implement ETHTOOL_GRXFH over Netlink. The number of flow types is reasonable (around 20) so report all of them at once for simplicity.
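For reference, a minimal sketch of the legacy ioctl query being mapped to Netlink here (fd is assumed to be any AF_INET datagram socket; interface name and error handling are illustrative only):

#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

/* Ask which header fields feed the RSS hash for IPv4 TCP. */
static long long get_rxfh_fields(int fd, const char *ifname)
{
	struct ethtool_rxnfc nfc = {
		.cmd = ETHTOOL_GRXFH,
		.flow_type = TCP_V4_FLOW,
	};
	struct ifreq ifr;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
	ifr.ifr_data = (void *)&nfc;
	if (ioctl(fd, SIOCETHTOOL, &ifr) != 0)
		return -1;
	return nfc.data;	/* RXH_* bits, e.g. RXH_IP_SRC | RXH_L4_B_0_1 */
}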
Do not maintain the flow ID mapping with ioctl at the uAPI level. This gives us a chance to clean up the confusion that come from RxNFC vs RxFH (flow direction vs hashing) in the ioctl. Try to align with the names used in ethtool CLI, they seem to have stood the test of time just fine. One annoyance is that we still call L4 ports the weird names, but I guess they also apply to IPSec (where they cover the SPI) so it is what it is. $ ynl --family ethtool --dump rss-get { "header": { "dev-index": 1, "dev-name": "enp1s0" }, "hfunc": 1, "hkey": b"...", "indir": [0, 1, ...], "flow-hash": { "ether": {"l2da"}, "ah-esp4": {"ip-src", "ip-dst"}, "ah-esp6": {"ip-src", "ip-dst"}, "ah4": {"ip-src", "ip-dst"}, "ah6": {"ip-src", "ip-dst"}, "esp4": {"ip-src", "ip-dst"}, "esp6": {"ip-src", "ip-dst"}, "ip4": {"ip-src", "ip-dst"}, "ip6": {"ip-src", "ip-dst"}, "sctp4": {"ip-src", "ip-dst"}, "sctp6": {"ip-src", "ip-dst"}, "udp4": {"ip-src", "ip-dst"}, "udp6": {"ip-src", "ip-dst"} "tcp4": {"l4-b-0-1", "l4-b-2-3", "ip-src", "ip-dst"}, "tcp6": {"l4-b-0-1", "l4-b-2-3", "ip-src", "ip-dst"}, }, } Link: https://patch.msgid.link/20250708220640.2738464-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 151 +++++++++++++++++++++++++ Documentation/networking/ethtool-netlink.rst | 9 +- include/uapi/linux/ethtool_netlink_generated.h | 34 ++++++ net/ethtool/ioctl.c | 6 +- net/ethtool/rss.c | 105 +++++++++++++++-- 5 files changed, 291 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 49e782a33eb6..c38c03c624f0 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -158,6 +158,35 @@ definitions: - name: pse-event-sw-pw-control-error doc: PSE faced an error managing the power control from software + - + name: rxfh-fields + name-prefix: rxh- + enum-name: + header: linux/ethtool.h + type: flags + entries: + - + name: l2da + value: 1 + - + name: vlan + - + name: l3-proto + - + name: ip-src + - + name: ip-dst + - + name: l4-b-0-1 + doc: src port in case of TCP/UDP/SCTP + - + name: l4-b-2-3 + doc: dst port in case of TCP/UDP/SCTP + - + name: gtp-teid + - + name: discard + value: 31 attribute-sets: - @@ -1447,6 +1476,123 @@ attribute-sets: name: pse-prio type: u32 name-prefix: ethtool-a- + - + name: flow + attr-cnt-name: --ethtool-a-flow-cnt + doc: | + Flow types, corresponding to those defined in the old + ethtool header for RXFH and RXNFC as ${PROTO}_FLOW. + The values are not matching the old ones to avoid carrying + into Netlink the IP_USER_FLOW vs IPV4_FLOW vs IPV4_USER_FLOW confusion. 
+ attributes: + - + name: ether + type: uint + enum: rxfh-fields + - + name: ip4 + type: uint + enum: rxfh-fields + - + name: ip6 + type: uint + enum: rxfh-fields + - + name: tcp4 + type: uint + enum: rxfh-fields + - + name: tcp6 + type: uint + enum: rxfh-fields + - + name: udp4 + type: uint + enum: rxfh-fields + - + name: udp6 + type: uint + enum: rxfh-fields + - + name: sctp4 + type: uint + enum: rxfh-fields + - + name: sctp6 + type: uint + enum: rxfh-fields + - + name: ah4 + type: uint + enum: rxfh-fields + - + name: ah6 + type: uint + enum: rxfh-fields + - + name: esp4 + type: uint + enum: rxfh-fields + - + name: esp6 + type: uint + enum: rxfh-fields + - + name: ah-esp4 + type: uint + enum: rxfh-fields + - + name: ah-esp6 + type: uint + enum: rxfh-fields + - + name: gtpu4 + type: uint + enum: rxfh-fields + - + name: gtpu6 + type: uint + enum: rxfh-fields + - + name: gtpc4 + type: uint + enum: rxfh-fields + - + name: gtpc6 + type: uint + enum: rxfh-fields + - + name: gtpc-teid4 + type: uint + enum: rxfh-fields + - + name: gtpc-teid6 + type: uint + enum: rxfh-fields + - + name: gtpu-eh4 + type: uint + enum: rxfh-fields + - + name: gtpu-eh6 + type: uint + enum: rxfh-fields + - + name: gtpu-ul4 + type: uint + enum: rxfh-fields + - + name: gtpu-ul6 + type: uint + enum: rxfh-fields + - + name: gtpu-dl4 + type: uint + enum: rxfh-fields + - + name: gtpu-dl6 + type: uint + enum: rxfh-fields - name: rss attr-cnt-name: __ethtool-a-rss-cnt @@ -1478,6 +1624,10 @@ attribute-sets: - name: start-context type: u32 + - + name: flow-hash + type: nest + nested-attributes: flow - name: plca attr-cnt-name: __ethtool-a-plca-cnt @@ -2307,6 +2457,7 @@ operations: - indir - hkey - input-xfrm + - flow-hash dump: request: attributes: diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 07e9808ebd2c..248bc3d93da9 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1969,14 +1969,15 @@ used to ignore context 0s and only dump additional contexts). Kernel response contents: -===================================== ====== ========================== +===================================== ====== =============================== ``ETHTOOL_A_RSS_HEADER`` nested reply header ``ETHTOOL_A_RSS_CONTEXT`` u32 context number ``ETHTOOL_A_RSS_HFUNC`` u32 RSS hash func ``ETHTOOL_A_RSS_INDIR`` binary Indir table bytes ``ETHTOOL_A_RSS_HKEY`` binary Hash key bytes ``ETHTOOL_A_RSS_INPUT_XFRM`` u32 RSS input data transformation -===================================== ====== ========================== + ``ETHTOOL_A_RSS_FLOW_HASH`` nested Header fields included in hash +===================================== ====== =============================== ETHTOOL_A_RSS_HFUNC attribute is bitmap indicating the hash function being used. Current supported options are toeplitz, xor or crc32. @@ -1985,6 +1986,8 @@ indicates queue number. ETHTOOL_A_RSS_INPUT_XFRM attribute is a bitmap indicating the type of transformation applied to the input protocol fields before given to the RSS hfunc. Current supported options are symmetric-xor and symmetric-or-xor. +ETHTOOL_A_RSS_FLOW_HASH carries per-flow type bitmask of which header +fields are included in the hash calculation. PLCA_GET_CFG ============ @@ -2436,7 +2439,7 @@ are netlink only. 
``ETHTOOL_SFLAGS`` ``ETHTOOL_MSG_FEATURES_SET`` ``ETHTOOL_GPFLAGS`` ``ETHTOOL_MSG_PRIVFLAGS_GET`` ``ETHTOOL_SPFLAGS`` ``ETHTOOL_MSG_PRIVFLAGS_SET`` - ``ETHTOOL_GRXFH`` n/a + ``ETHTOOL_GRXFH`` ``ETHTOOL_MSG_RSS_GET`` ``ETHTOOL_SRXFH`` n/a ``ETHTOOL_GGRO`` ``ETHTOOL_MSG_FEATURES_GET`` ``ETHTOOL_SGRO`` ``ETHTOOL_MSG_FEATURES_SET`` diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 8f30ffa1cd14..96027e26ffba 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -678,6 +678,39 @@ enum { ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1) }; +enum { + ETHTOOL_A_FLOW_ETHER = 1, + ETHTOOL_A_FLOW_IP4, + ETHTOOL_A_FLOW_IP6, + ETHTOOL_A_FLOW_TCP4, + ETHTOOL_A_FLOW_TCP6, + ETHTOOL_A_FLOW_UDP4, + ETHTOOL_A_FLOW_UDP6, + ETHTOOL_A_FLOW_SCTP4, + ETHTOOL_A_FLOW_SCTP6, + ETHTOOL_A_FLOW_AH4, + ETHTOOL_A_FLOW_AH6, + ETHTOOL_A_FLOW_ESP4, + ETHTOOL_A_FLOW_ESP6, + ETHTOOL_A_FLOW_AH_ESP4, + ETHTOOL_A_FLOW_AH_ESP6, + ETHTOOL_A_FLOW_GTPU4, + ETHTOOL_A_FLOW_GTPU6, + ETHTOOL_A_FLOW_GTPC4, + ETHTOOL_A_FLOW_GTPC6, + ETHTOOL_A_FLOW_GTPC_TEID4, + ETHTOOL_A_FLOW_GTPC_TEID6, + ETHTOOL_A_FLOW_GTPU_EH4, + ETHTOOL_A_FLOW_GTPU_EH6, + ETHTOOL_A_FLOW_GTPU_UL4, + ETHTOOL_A_FLOW_GTPU_UL6, + ETHTOOL_A_FLOW_GTPU_DL4, + ETHTOOL_A_FLOW_GTPU_DL6, + + __ETHTOOL_A_FLOW_CNT, + ETHTOOL_A_FLOW_MAX = (__ETHTOOL_A_FLOW_CNT - 1) +}; + enum { ETHTOOL_A_RSS_UNSPEC, ETHTOOL_A_RSS_HEADER, @@ -687,6 +720,7 @@ enum { ETHTOOL_A_RSS_HKEY, ETHTOOL_A_RSS_INPUT_XFRM, ETHTOOL_A_RSS_START_CONTEXT, + ETHTOOL_A_RSS_FLOW_HASH, __ETHTOOL_A_RSS_CNT, ETHTOOL_A_RSS_MAX = (__ETHTOOL_A_RSS_CNT - 1) diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 67f6d900a4ee..cccb4694f5e1 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1101,7 +1101,11 @@ ethtool_set_rxfh_fields(struct net_device *dev, u32 cmd, void __user *useraddr) rc = ops->set_rxfh_fields(dev, &fields, NULL); exit_unlock: mutex_unlock(&dev->ethtool->rss_lock); - return rc; + if (rc) + return rc; + + ethtool_rss_notify(dev, fields.rss_context); + return 0; } static noinline_for_stack int diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 37a7b20fcd07..41ab9fc67652 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -12,6 +12,7 @@ struct rss_req_info { struct rss_reply_data { struct ethnl_reply_data base; + bool has_flow_hash; bool no_key_fields; u32 indir_size; u32 hkey_size; @@ -19,6 +20,37 @@ struct rss_reply_data { u32 input_xfrm; u32 *indir_table; u8 *hkey; + int flow_hash[__ETHTOOL_A_FLOW_CNT]; +}; + +static const u8 ethtool_rxfh_ft_nl2ioctl[] = { + [ETHTOOL_A_FLOW_ETHER] = ETHER_FLOW, + [ETHTOOL_A_FLOW_IP4] = IPV4_FLOW, + [ETHTOOL_A_FLOW_IP6] = IPV6_FLOW, + [ETHTOOL_A_FLOW_TCP4] = TCP_V4_FLOW, + [ETHTOOL_A_FLOW_UDP4] = UDP_V4_FLOW, + [ETHTOOL_A_FLOW_SCTP4] = SCTP_V4_FLOW, + [ETHTOOL_A_FLOW_AH_ESP4] = AH_ESP_V4_FLOW, + [ETHTOOL_A_FLOW_TCP6] = TCP_V6_FLOW, + [ETHTOOL_A_FLOW_UDP6] = UDP_V6_FLOW, + [ETHTOOL_A_FLOW_SCTP6] = SCTP_V6_FLOW, + [ETHTOOL_A_FLOW_AH_ESP6] = AH_ESP_V6_FLOW, + [ETHTOOL_A_FLOW_AH4] = AH_V4_FLOW, + [ETHTOOL_A_FLOW_ESP4] = ESP_V4_FLOW, + [ETHTOOL_A_FLOW_AH6] = AH_V6_FLOW, + [ETHTOOL_A_FLOW_ESP6] = ESP_V6_FLOW, + [ETHTOOL_A_FLOW_GTPU4] = GTPU_V4_FLOW, + [ETHTOOL_A_FLOW_GTPU6] = GTPU_V6_FLOW, + [ETHTOOL_A_FLOW_GTPC4] = GTPC_V4_FLOW, + [ETHTOOL_A_FLOW_GTPC6] = GTPC_V6_FLOW, + [ETHTOOL_A_FLOW_GTPC_TEID4] = GTPC_TEID_V4_FLOW, + [ETHTOOL_A_FLOW_GTPC_TEID6] = GTPC_TEID_V6_FLOW, + [ETHTOOL_A_FLOW_GTPU_EH4] = GTPU_EH_V4_FLOW, + 
[ETHTOOL_A_FLOW_GTPU_EH6] = GTPU_EH_V6_FLOW, + [ETHTOOL_A_FLOW_GTPU_UL4] = GTPU_UL_V4_FLOW, + [ETHTOOL_A_FLOW_GTPU_UL6] = GTPU_UL_V6_FLOW, + [ETHTOOL_A_FLOW_GTPU_DL4] = GTPU_DL_V4_FLOW, + [ETHTOOL_A_FLOW_GTPU_DL6] = GTPU_DL_V6_FLOW, }; #define RSS_REQINFO(__req_base) \ @@ -49,6 +81,37 @@ rss_parse_request(struct ethnl_req_info *req_info, struct nlattr **tb, return 0; } +static void +rss_prepare_flow_hash(const struct rss_req_info *req, struct net_device *dev, + struct rss_reply_data *data, const struct genl_info *info) +{ + int i; + + data->has_flow_hash = false; + + if (!dev->ethtool_ops->get_rxfh_fields) + return; + if (req->rss_context && !dev->ethtool_ops->rxfh_per_ctx_fields) + return; + + mutex_lock(&dev->ethtool->rss_lock); + for (i = 1; i < __ETHTOOL_A_FLOW_CNT; i++) { + struct ethtool_rxfh_fields fields = { + .flow_type = ethtool_rxfh_ft_nl2ioctl[i], + .rss_context = req->rss_context, + }; + + if (dev->ethtool_ops->get_rxfh_fields(dev, &fields)) { + data->flow_hash[i] = -1; /* Unsupported */ + continue; + } + + data->flow_hash[i] = fields.data; + data->has_flow_hash = true; + } + mutex_unlock(&dev->ethtool->rss_lock); +} + static int rss_prepare_get(const struct rss_req_info *request, struct net_device *dev, struct rss_reply_data *data, const struct genl_info *info) @@ -153,6 +216,8 @@ static int rss_prepare(const struct rss_req_info *request, struct net_device *dev, struct rss_reply_data *data, const struct genl_info *info) { + rss_prepare_flow_hash(request, dev, data, info); + if (request->rss_context) return rss_prepare_ctx(request, dev, data, info); return rss_prepare_get(request, dev, data, info); @@ -190,7 +255,10 @@ rss_reply_size(const struct ethnl_req_info *req_base, nla_total_size(sizeof(u32)) + /* _RSS_HFUNC */ nla_total_size(sizeof(u32)) + /* _RSS_INPUT_XFRM */ nla_total_size(sizeof(u32) * data->indir_size) + /* _RSS_INDIR */ - nla_total_size(data->hkey_size); /* _RSS_HKEY */ + nla_total_size(data->hkey_size) + /* _RSS_HKEY */ + nla_total_size(0) + /* _RSS_FLOW_HASH */ + nla_total_size(sizeof(u32)) * ETHTOOL_A_FLOW_MAX + + 0; return len; } @@ -211,17 +279,34 @@ rss_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, sizeof(u32) * data->indir_size, data->indir_table))) return -EMSGSIZE; - if (data->no_key_fields) - return 0; - - if ((data->hfunc && - nla_put_u32(skb, ETHTOOL_A_RSS_HFUNC, data->hfunc)) || - (data->input_xfrm && - nla_put_u32(skb, ETHTOOL_A_RSS_INPUT_XFRM, data->input_xfrm)) || - (data->hkey_size && - nla_put(skb, ETHTOOL_A_RSS_HKEY, data->hkey_size, data->hkey))) + if (!data->no_key_fields && + ((data->hfunc && + nla_put_u32(skb, ETHTOOL_A_RSS_HFUNC, data->hfunc)) || + (data->input_xfrm && + nla_put_u32(skb, ETHTOOL_A_RSS_INPUT_XFRM, data->input_xfrm)) || + (data->hkey_size && + nla_put(skb, ETHTOOL_A_RSS_HKEY, data->hkey_size, data->hkey)))) return -EMSGSIZE; + if (data->has_flow_hash) { + struct nlattr *nest; + int i; + + nest = nla_nest_start(skb, ETHTOOL_A_RSS_FLOW_HASH); + if (!nest) + return -EMSGSIZE; + + for (i = 1; i < __ETHTOOL_A_FLOW_CNT; i++) { + if (data->flow_hash[i] >= 0 && + nla_put_uint(skb, i, data->flow_hash[i])) { + nla_nest_cancel(skb, nest); + return -EMSGSIZE; + } + } + + nla_nest_end(skb, nest); + } + return 0; } -- cgit v1.2.3 From 760e6f7befbab9a84c54457a8ee45313b7b91ee5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 10 Jul 2025 13:00:09 +0200 Subject: futex: Remove support for IMMUTABLE The FH_FLAG_IMMUTABLE flag was meant to avoid the reference counting on the private hash and so to 
avoid the performance regression on big machines. With the switch to per-CPU counter this is no longer needed. That flag was never usable on any released kernel. Remove any support for IMMUTABLE while preserving the flags argument and enforcing it to be zero. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250710110011.384614-5-bigeasy@linutronix.de --- include/uapi/linux/prctl.h | 2 -- kernel/futex/core.c | 36 +++--------------------------------- 2 files changed, 3 insertions(+), 35 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 43dec6eed559..3b93fb906e3c 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -367,8 +367,6 @@ struct prctl_mm_map { /* FUTEX hash management */ #define PR_FUTEX_HASH 78 # define PR_FUTEX_HASH_SET_SLOTS 1 -# define FH_FLAG_IMMUTABLE (1ULL << 0) # define PR_FUTEX_HASH_GET_SLOTS 2 -# define PR_FUTEX_HASH_GET_IMMUTABLE 3 #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 1981574a459d..d9bb5567af0c 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -69,7 +69,6 @@ struct futex_private_hash { struct rcu_head rcu; void *mm; bool custom; - bool immutable; struct futex_hash_bucket queues[]; }; @@ -145,15 +144,11 @@ static inline bool futex_key_is_private(union futex_key *key) static bool futex_private_hash_get(struct futex_private_hash *fph) { - if (fph->immutable) - return true; return futex_ref_get(fph); } void futex_private_hash_put(struct futex_private_hash *fph) { - if (fph->immutable) - return; if (futex_ref_put(fph)) wake_up_var(fph->mm); } @@ -1530,7 +1525,6 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb, } #define FH_CUSTOM 0x01 -#define FH_IMMUTABLE 0x02 #ifdef CONFIG_FUTEX_PRIVATE_HASH @@ -1800,7 +1794,7 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) */ scoped_guard(rcu) { fph = rcu_dereference(mm->futex_phash); - if (fph && (!fph->hash_mask || fph->immutable)) { + if (fph && !fph->hash_mask) { if (custom) return -EBUSY; return 0; } @@ -1814,7 +1808,6 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) fph->hash_mask = hash_slots ?
hash_slots - 1 : 0; fph->custom = custom; - fph->immutable = !!(flags & FH_IMMUTABLE); fph->mm = mm; for (i = 0; i < hash_slots; i++) @@ -1838,7 +1831,7 @@ again: mm->futex_phash_new = NULL; if (fph) { - if (cur && (!cur->hash_mask || cur->immutable)) { + if (cur && !cur->hash_mask) { /* * If two threads simultaneously request the global * hash then the first one performs the switch, @@ -1931,19 +1924,6 @@ static int futex_hash_get_slots(void) return 0; } -static int futex_hash_get_immutable(void) -{ - struct futex_private_hash *fph; - - guard(rcu)(); - fph = rcu_dereference(current->mm->futex_phash); - if (fph && fph->immutable) - return 1; - if (fph && !fph->hash_mask) - return 1; - return 0; -} - #else static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) @@ -1956,10 +1936,6 @@ static int futex_hash_get_slots(void) return 0; } -static int futex_hash_get_immutable(void) -{ - return 0; -} #endif int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) @@ -1969,10 +1945,8 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) switch (arg2) { case PR_FUTEX_HASH_SET_SLOTS: - if (arg4 & ~FH_FLAG_IMMUTABLE) + if (arg4) return -EINVAL; - if (arg4 & FH_FLAG_IMMUTABLE) - flags |= FH_IMMUTABLE; ret = futex_hash_allocate(arg3, flags); break; @@ -1980,10 +1954,6 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) ret = futex_hash_get_slots(); break; - case PR_FUTEX_HASH_GET_IMMUTABLE: - ret = futex_hash_get_immutable(); - break; - default: ret = -EINVAL; break; -- cgit v1.2.3 From e2e9360022585c21dc30d2b19f5866c252f40806 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:05 -0700 Subject: iommufd/viommu: Introduce IOMMUFD_OBJ_HW_QUEUE and its related struct Add IOMMUFD_OBJ_HW_QUEUE with an iommufd_hw_queue structure, representing a HW-accelerated type of IOMMU physical queue that can be passed through to a user space VM for direct hardware control, such as: - NVIDIA's Virtual Command Queue - AMD vIOMMU's Command Buffer, Event Log Buffers, and PPR Log Buffers Add new viommu ops for iommufd to communicate with IOMMU drivers to fetch supported HW queue structure size and to forward user space ioctls to the IOMMU drivers for initialization/destruction. Among the existing HWs, NVIDIA's VCMDQs access the guest memory via physical addresses, while AMD's Buffers access the guest memory via guest physical addresses (i.e. iova of the nesting parent HWPT). Separate two mutually exclusive hw_queue_init and hw_queue_init_phys ops to indicate whether a vIOMMU HW accesses the guest queue in the guest physical space (via iova) or the host physical space (via pa). In the latter case, the iommufd core will validate the physical pages of a given guest queue, to ensure the underlying physical pages are contiguous and pinned. Since this is introduced with NVIDIA's VCMDQs, add hw_queue_init_phys for now, and leave some notes for hw_queue_init in the near future (for AMD). Both NVIDIA's and AMD's HWs use a multi-queue model: NVIDIA's will be only one type in enum iommu_hw_queue_type, while AMD's will be three different types (two of which will have multiple queues). Compared to letting the core manage multiple queues with three types per vIOMMU object, it'd be easier for the driver to manage that by having three different driver-structure arrays per vIOMMU object. Thus, pass in the index to the init op.
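For illustration, a driver-side user of these ops might look like the following sketch (my_hw_queue, my_get_hw_queue_size, my_hw_queue_init_phys, and IOMMU_HW_QUEUE_TYPE_MY_HW are hypothetical names, not part of this series):

struct my_hw_queue {
        struct iommufd_hw_queue core;   /* must sit at offset 0 */
        void __iomem *mmio;             /* driver-private queue state */
};

static size_t my_get_hw_queue_size(struct iommufd_viommu *viommu,
                                   enum iommu_hw_queue_type queue_type)
{
        /* Returning 0 tells the core this queue type is unsupported */
        if (queue_type != IOMMU_HW_QUEUE_TYPE_MY_HW)
                return 0;
        /* Compile-time checks that @core is first and correctly typed */
        return HW_QUEUE_STRUCT_SIZE(struct my_hw_queue, core);
}

static int my_hw_queue_init_phys(struct iommufd_hw_queue *hw_queue,
                                 u32 index, phys_addr_t base_addr_pa)
{
        struct my_hw_queue *q =
                container_of(hw_queue, struct my_hw_queue, core);

        /* Program the HW queue slot @index with the contiguous, pinned
         * queue memory at @base_addr_pa; stubbed out in this sketch. */
        q->mmio = NULL;
        return 0;
}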
Link: https://patch.msgid.link/r/6939b73699e278e60ce167e911b3d9be68882bad.1752126748.git.nicolinc@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Pranjal Shrivastava Reviewed-by: Vasant Hegde Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 42 ++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/iommufd.h | 9 +++++++++ 2 files changed, 51 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index bdd10a85eeef..f13f3ca6adb5 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -37,6 +37,7 @@ enum iommufd_object_type { IOMMUFD_OBJ_VIOMMU, IOMMUFD_OBJ_VDEVICE, IOMMUFD_OBJ_VEVENTQ, + IOMMUFD_OBJ_HW_QUEUE, #ifdef CONFIG_IOMMUFD_TEST IOMMUFD_OBJ_SELFTEST, #endif @@ -119,6 +120,19 @@ struct iommufd_vdevice { void (*destroy)(struct iommufd_vdevice *vdev); }; +struct iommufd_hw_queue { + struct iommufd_object obj; + struct iommufd_viommu *viommu; + + u64 base_addr; /* in guest physical address space */ + size_t length; + + enum iommu_hw_queue_type type; + + /* Clean up all driver-specific parts of an iommufd_hw_queue */ + void (*destroy)(struct iommufd_hw_queue *hw_queue); +}; + /** * struct iommufd_viommu_ops - vIOMMU specific operations * @destroy: Clean up all driver-specific parts of an iommufd_viommu. The memory @@ -143,6 +157,22 @@ struct iommufd_vdevice { * include/uapi/linux/iommufd.h) * If driver has a deinit function to revert what vdevice_init op * does, it should set it to the @vdev->destroy function pointer + * @get_hw_queue_size: Get the size of a driver-defined HW queue structure for a + * given @viommu corresponding to @queue_type. Driver should + * return 0 if the HW queue type isn't supported. The driver + * is required to use the HW_QUEUE_STRUCT_SIZE macro + * to sanitize the driver-level HW queue structure relative + * to the core one + * @hw_queue_init_phys: Initialize the driver-level structure of a HW queue that + * is initialized with its core-level structure that holds + * all the info about the guest queue memory. + * Driver providing this op indicates that HW accesses the + * guest queue memory via physical addresses. + * @index carries the logical HW QUEUE ID per vIOMMU in a + * guest VM, for a multi-queue model.
@base_addr_pa carries + the physical location of the guest queue + If driver has a deinit function to revert what this op + does, it should set it to the @hw_queue->destroy pointer */ struct iommufd_viommu_ops { void (*destroy)(struct iommufd_viommu *viommu); @@ -153,6 +183,11 @@ struct iommufd_viommu_ops { struct iommu_user_data_array *array); const size_t vdevice_size; int (*vdevice_init)(struct iommufd_vdevice *vdev); + size_t (*get_hw_queue_size)(struct iommufd_viommu *viommu, + enum iommu_hw_queue_type queue_type); + /* AMD's HW will add hw_queue_init simply using @hw_queue->base_addr */ + int (*hw_queue_init_phys)(struct iommufd_hw_queue *hw_queue, u32 index, + phys_addr_t base_addr_pa); }; #if IS_ENABLED(CONFIG_IOMMUFD) @@ -255,4 +290,11 @@ static inline int iommufd_viommu_report_event(struct iommufd_viommu *viommu, BUILD_BUG_ON_ZERO(offsetof(drv_struct, member)) + \ BUILD_BUG_ON_ZERO(!__same_type(struct iommufd_vdevice, \ ((drv_struct *)NULL)->member))) + +#define HW_QUEUE_STRUCT_SIZE(drv_struct, member) \ + (sizeof(drv_struct) + \ + BUILD_BUG_ON_ZERO(offsetof(drv_struct, member)) + \ + BUILD_BUG_ON_ZERO(!__same_type(struct iommufd_hw_queue, \ + ((drv_struct *)NULL)->member))) + #endif diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 04eee77335cf..640a8b5147c2 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -1147,4 +1147,13 @@ struct iommu_veventq_alloc { __u32 __reserved; }; #define IOMMU_VEVENTQ_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VEVENTQ_ALLOC) + +/** + * enum iommu_hw_queue_type - HW Queue Type + * @IOMMU_HW_QUEUE_TYPE_DEFAULT: Reserved for future use + */ +enum iommu_hw_queue_type { + IOMMU_HW_QUEUE_TYPE_DEFAULT = 0, +}; + #endif -- cgit v1.2.3 From 2238ddc2b0560734c2dabb1c1fb4b342b5193625 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:06 -0700 Subject: iommufd/viommu: Add IOMMUFD_CMD_HW_QUEUE_ALLOC ioctl Introduce a new IOMMUFD_CMD_HW_QUEUE_ALLOC ioctl for user space to allocate a HW QUEUE object for a vIOMMU-specific HW-accelerated queue, e.g.: - NVIDIA's Virtual Command Queue - AMD vIOMMU's Command Buffer, Event Log Buffers, and PPR Log Buffers Since this is introduced with NVIDIA's VCMDQs that access the guest memory in the physical address space, add an iommufd_hw_queue_alloc_phys() helper that will create an access object to the queue memory in the IOAS, to prevent the mappings of the guest memory from being unmapped during the life cycle of the HW queue object. AMD's HW will need an hw_queue_init op that is mutually exclusive with the hw_queue_init_phys op, and their case will bypass the access part, i.e. no iommufd_hw_queue_alloc_phys() call.
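A rough user-space sketch of the new ioctl (error handling elided; the iommufd fd and viommu_id are assumed to come from prior iommufd/vIOMMU setup):

#include <sys/ioctl.h>
#include <linux/iommufd.h>

/* Returns the new HW queue object ID, or -1 on failure */
static int hw_queue_alloc(int iommufd, __u32 viommu_id, __u32 type,
                          __u32 index, __u64 queue_gpa, __u64 queue_len)
{
        struct iommu_hw_queue_alloc cmd = {
                .size = sizeof(cmd),
                .flags = 0,                       /* must be 0 */
                .viommu_id = viommu_id,
                .type = type,                     /* driver-specific queue type */
                .index = index,                   /* logical queue index per type */
                .nesting_parent_iova = queue_gpa, /* guest PA of the queue memory */
                .length = queue_len,
        };

        if (ioctl(iommufd, IOMMU_HW_QUEUE_ALLOC, &cmd) < 0)
                return -1;
        return (int)cmd.out_hw_queue_id;
}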
Link: https://patch.msgid.link/r/dab4ace747deb46c1fe70a5c663307f46990ae56.1752126748.git.nicolinc@nvidia.com Reviewed-by: Pranjal Shrivastava Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/iommufd_private.h | 2 + drivers/iommu/iommufd/main.c | 6 ++ drivers/iommu/iommufd/viommu.c | 180 ++++++++++++++++++++++++++++++++ include/linux/iommufd.h | 1 + include/uapi/linux/iommufd.h | 34 ++++++ 5 files changed, 223 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 06b8c2e2d9e6..dcd609573244 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -652,6 +652,8 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd); void iommufd_viommu_destroy(struct iommufd_object *obj); int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd); void iommufd_vdevice_destroy(struct iommufd_object *obj); +int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd); +void iommufd_hw_queue_destroy(struct iommufd_object *obj); #ifdef CONFIG_IOMMUFD_TEST int iommufd_test(struct iommufd_ucmd *ucmd); diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 778694d7c207..4e8dbbfac890 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -354,6 +354,7 @@ union ucmd_buffer { struct iommu_destroy destroy; struct iommu_fault_alloc fault; struct iommu_hw_info info; + struct iommu_hw_queue_alloc hw_queue; struct iommu_hwpt_alloc hwpt; struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap; struct iommu_hwpt_invalidate cache; @@ -396,6 +397,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { struct iommu_fault_alloc, out_fault_fd), IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info, __reserved), + IOCTL_OP(IOMMU_HW_QUEUE_ALLOC, iommufd_hw_queue_alloc_ioctl, + struct iommu_hw_queue_alloc, length), IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc, __reserved), IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap, @@ -559,6 +562,9 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_FAULT] = { .destroy = iommufd_fault_destroy, }, + [IOMMUFD_OBJ_HW_QUEUE] = { + .destroy = iommufd_hw_queue_destroy, + }, [IOMMUFD_OBJ_HWPT_PAGING] = { .destroy = iommufd_hwpt_paging_destroy, .abort = iommufd_hwpt_paging_abort, diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c index 081ee6697a11..91339f799916 100644 --- a/drivers/iommu/iommufd/viommu.c +++ b/drivers/iommu/iommufd/viommu.c @@ -201,3 +201,183 @@ out_put_viommu: iommufd_put_object(ucmd->ictx, &viommu->obj); return rc; } + +static void iommufd_hw_queue_destroy_access(struct iommufd_ctx *ictx, + struct iommufd_access *access, + u64 base_iova, size_t length) +{ + u64 aligned_iova = PAGE_ALIGN_DOWN(base_iova); + u64 offset = base_iova - aligned_iova; + + iommufd_access_unpin_pages(access, aligned_iova, + PAGE_ALIGN(length + offset)); + iommufd_access_detach_internal(access); + iommufd_access_destroy_internal(ictx, access); +} + +void iommufd_hw_queue_destroy(struct iommufd_object *obj) +{ + struct iommufd_hw_queue *hw_queue = + container_of(obj, struct iommufd_hw_queue, obj); + + if (hw_queue->destroy) + hw_queue->destroy(hw_queue); + if (hw_queue->access) + iommufd_hw_queue_destroy_access(hw_queue->viommu->ictx, + hw_queue->access, + hw_queue->base_addr, + hw_queue->length); + if (hw_queue->viommu) 
+ refcount_dec(&hw_queue->viommu->obj.users); +} + +/* + * When the HW accesses the guest queue via physical addresses, the underlying + * physical pages of the guest queue must be contiguous. Also, for the security + * concern that IOMMUFD_CMD_IOAS_UNMAP could potentially remove the mappings of + * the guest queue from the nesting parent iopt while the HW is still accessing + * the guest queue memory physically, such a HW queue must require an access to + * pin the underlying pages and prevent that from happening. + */ +static struct iommufd_access * +iommufd_hw_queue_alloc_phys(struct iommu_hw_queue_alloc *cmd, + struct iommufd_viommu *viommu, phys_addr_t *base_pa) +{ + u64 aligned_iova = PAGE_ALIGN_DOWN(cmd->nesting_parent_iova); + u64 offset = cmd->nesting_parent_iova - aligned_iova; + struct iommufd_access *access; + struct page **pages; + size_t max_npages; + size_t length; + size_t i; + int rc; + + /* max_npages = DIV_ROUND_UP(offset + cmd->length, PAGE_SIZE) */ + if (check_add_overflow(offset, cmd->length, &length)) + return ERR_PTR(-ERANGE); + if (check_add_overflow(length, PAGE_SIZE - 1, &length)) + return ERR_PTR(-ERANGE); + max_npages = length / PAGE_SIZE; + /* length needs to be page aligned too */ + length = max_npages * PAGE_SIZE; + + /* + * Use kvcalloc() to avoid memory fragmentation for a large page array. + * Set __GFP_NOWARN to avoid syzkaller blowups + */ + pages = kvcalloc(max_npages, sizeof(*pages), GFP_KERNEL | __GFP_NOWARN); + if (!pages) + return ERR_PTR(-ENOMEM); + + access = iommufd_access_create_internal(viommu->ictx); + if (IS_ERR(access)) { + rc = PTR_ERR(access); + goto out_free; + } + + rc = iommufd_access_attach_internal(access, viommu->hwpt->ioas); + if (rc) + goto out_destroy; + + rc = iommufd_access_pin_pages(access, aligned_iova, length, pages, 0); + if (rc) + goto out_detach; + + /* Validate if the underlying physical pages are contiguous */ + for (i = 1; i < max_npages; i++) { + if (page_to_pfn(pages[i]) == page_to_pfn(pages[i - 1]) + 1) + continue; + rc = -EFAULT; + goto out_unpin; + } + + *base_pa = (page_to_pfn(pages[0]) << PAGE_SHIFT) + offset; + kfree(pages); + return access; + +out_unpin: + iommufd_access_unpin_pages(access, aligned_iova, length); +out_detach: + iommufd_access_detach_internal(access); +out_destroy: + iommufd_access_destroy_internal(viommu->ictx, access); +out_free: + kfree(pages); + return ERR_PTR(rc); +} + +int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd) +{ + struct iommu_hw_queue_alloc *cmd = ucmd->cmd; + struct iommufd_hw_queue *hw_queue; + struct iommufd_viommu *viommu; + struct iommufd_access *access; + size_t hw_queue_size; + phys_addr_t base_pa; + u64 last; + int rc; + + if (cmd->flags || cmd->type == IOMMU_HW_QUEUE_TYPE_DEFAULT) + return -EOPNOTSUPP; + if (!cmd->length) + return -EINVAL; + if (check_add_overflow(cmd->nesting_parent_iova, cmd->length - 1, + &last)) + return -EOVERFLOW; + + viommu = iommufd_get_viommu(ucmd, cmd->viommu_id); + if (IS_ERR(viommu)) + return PTR_ERR(viommu); + + if (!viommu->ops || !viommu->ops->get_hw_queue_size || + !viommu->ops->hw_queue_init_phys) { + rc = -EOPNOTSUPP; + goto out_put_viommu; + } + + hw_queue_size = viommu->ops->get_hw_queue_size(viommu, cmd->type); + if (!hw_queue_size) { + rc = -EOPNOTSUPP; + goto out_put_viommu; + } + + /* + * It is a driver bug for providing a hw_queue_size smaller than the + * core HW queue structure size + */ + if (WARN_ON_ONCE(hw_queue_size < sizeof(*hw_queue))) { + rc = -EOPNOTSUPP; + goto out_put_viommu; + } + + hw_queue = (struct 
iommufd_hw_queue *)_iommufd_object_alloc_ucmd( + ucmd, hw_queue_size, IOMMUFD_OBJ_HW_QUEUE); + if (IS_ERR(hw_queue)) { + rc = PTR_ERR(hw_queue); + goto out_put_viommu; + } + + access = iommufd_hw_queue_alloc_phys(cmd, viommu, &base_pa); + if (IS_ERR(access)) { + rc = PTR_ERR(access); + goto out_put_viommu; + } + + hw_queue->viommu = viommu; + refcount_inc(&viommu->obj.users); + hw_queue->access = access; + hw_queue->type = cmd->type; + hw_queue->length = cmd->length; + hw_queue->base_addr = cmd->nesting_parent_iova; + + rc = viommu->ops->hw_queue_init_phys(hw_queue, cmd->index, base_pa); + if (rc) + goto out_put_viommu; + + cmd->out_hw_queue_id = hw_queue->obj.id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + +out_put_viommu: + iommufd_put_object(ucmd->ictx, &viommu->obj); + return rc; +} diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index f13f3ca6adb5..ce4011a2fc27 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -123,6 +123,7 @@ struct iommufd_vdevice { struct iommufd_hw_queue { struct iommufd_object obj; struct iommufd_viommu *viommu; + struct iommufd_access *access; u64 base_addr; /* in guest physical address space */ size_t length; diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 640a8b5147c2..b928c1ed2395 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -56,6 +56,7 @@ enum { IOMMUFD_CMD_VDEVICE_ALLOC = 0x91, IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92, IOMMUFD_CMD_VEVENTQ_ALLOC = 0x93, + IOMMUFD_CMD_HW_QUEUE_ALLOC = 0x94, }; /** @@ -1156,4 +1157,37 @@ enum iommu_hw_queue_type { IOMMU_HW_QUEUE_TYPE_DEFAULT = 0, }; +/** + * struct iommu_hw_queue_alloc - ioctl(IOMMU_HW_QUEUE_ALLOC) + * @size: sizeof(struct iommu_hw_queue_alloc) + * @flags: Must be 0 + * @viommu_id: Virtual IOMMU ID to associate the HW queue with + * @type: One of enum iommu_hw_queue_type + * @index: The logical index to the HW queue per virtual IOMMU for a multi-queue + * model + * @out_hw_queue_id: The ID of the new HW queue + * @nesting_parent_iova: Base address of the queue memory in the guest physical + * address space + * @length: Length of the queue memory + * + * Allocate a HW queue object for a vIOMMU-specific HW-accelerated queue, which + * allows HW to access a guest queue memory described using @nesting_parent_iova + * and @length. + * + * A vIOMMU can allocate multiple queues, but it must use a different @index per + * type to separate each allocation, e.g:: + * + * Type1 HW queue0, Type1 HW queue1, Type2 HW queue0, ... + */ +struct iommu_hw_queue_alloc { + __u32 size; + __u32 flags; + __u32 viommu_id; + __u32 type; + __u32 index; + __u32 out_hw_queue_id; + __aligned_u64 nesting_parent_iova; + __aligned_u64 length; +}; +#define IOMMU_HW_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HW_QUEUE_ALLOC) #endif -- cgit v1.2.3 From 2ab4019aa34dc2aec4a0824fbf1e49300884fbbf Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Mon, 7 Jul 2025 18:34:04 +0000 Subject: media: uvcvideo: Introduce V4L2_META_FMT_UVC_MSXU_1_5 The UVC driver provides two metadata types V4L2_META_FMT_UVC, and V4L2_META_FMT_D4XX. The only difference between the two of them is that V4L2_META_FMT_UVC only copies PTS, SCR, size and flags, and V4L2_META_FMT_D4XX copies the whole metadata section. 
Now we only enable V4L2_META_FMT_D4XX for the Intel D4xx family of devices, but it is useful to have the whole metadata payload for any device where vendors include other metadata, such as the one described by Microsoft: https://learn.microsoft.com/en-us/windows-hardware/drivers/stream/mf-capture-metadata This patch introduces a new format, V4L2_META_FMT_UVC_MSXU_1_5, which is identical to V4L2_META_FMT_D4XX. Let the user enable this format with a quirk for now. This way they can test if their devices provide useful metadata without rebuilding the kernel. They can later contribute patches to auto-quirk their devices. We will also work on methods to auto-detect devices compatible with this new metadata format. Suggested-by: Hans de Goede Reviewed-by: Hans de Goede Signed-off-by: Ricardo Ribalda Link: https://lore.kernel.org/r/20250707-uvc-meta-v8-4-ed17f8b1218b@chromium.org Signed-off-by: Hans de Goede Signed-off-by: Hans Verkuil --- .../userspace-api/media/v4l/meta-formats.rst | 1 + .../media/v4l/metafmt-uvc-msxu-1-5.rst | 23 ++++++++++++++++++++++ MAINTAINERS | 1 + drivers/media/usb/uvc/uvc_metadata.c | 4 ++++ drivers/media/usb/uvc/uvcvideo.h | 1 + drivers/media/v4l2-core/v4l2-ioctl.c | 1 + include/uapi/linux/videodev2.h | 1 + 7 files changed, 32 insertions(+) create mode 100644 Documentation/userspace-api/media/v4l/metafmt-uvc-msxu-1-5.rst (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/meta-formats.rst b/Documentation/userspace-api/media/v4l/meta-formats.rst index bb6876cfc271..0de80328c36b 100644 --- a/Documentation/userspace-api/media/v4l/meta-formats.rst +++ b/Documentation/userspace-api/media/v4l/meta-formats.rst @@ -20,6 +20,7 @@ These formats are used for the :ref:`metadata` interface only. metafmt-pisp-fe metafmt-rkisp1 metafmt-uvc + metafmt-uvc-msxu-1-5 metafmt-vivid metafmt-vsp1-hgo metafmt-vsp1-hgt diff --git a/Documentation/userspace-api/media/v4l/metafmt-uvc-msxu-1-5.rst b/Documentation/userspace-api/media/v4l/metafmt-uvc-msxu-1-5.rst new file mode 100644 index 000000000000..dd1c3076df24 --- /dev/null +++ b/Documentation/userspace-api/media/v4l/metafmt-uvc-msxu-1-5.rst @@ -0,0 +1,23 @@ +.. SPDX-License-Identifier: GFDL-1.1-no-invariants-or-later + +.. _v4l2-meta-fmt-uvc-msxu-1-5: + +*********************************** +V4L2_META_FMT_UVC_MSXU_1_5 ('UVCM') +*********************************** + +Microsoft(R)'s UVC Payload Metadata. + + +Description +=========== + +V4L2_META_FMT_UVC_MSXU_1_5 buffers follow the metadata buffer layout of +V4L2_META_FMT_UVC with the only difference that it includes all the UVC +metadata in the `buffer[]` field, not just the first 2-12 bytes. + +The metadata format follows the specification from Microsoft(R) [1]. + +..
_1: + +[1] https://docs.microsoft.com/en-us/windows-hardware/drivers/stream/uvc-extensions-1-5 diff --git a/MAINTAINERS b/MAINTAINERS index 1ef99240a57e..5f69c82d53ee 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -25837,6 +25837,7 @@ S: Maintained W: http://www.ideasonboard.org/uvc/ T: git git://linuxtv.org/media.git F: Documentation/userspace-api/media/drivers/uvcvideo.rst +F: Documentation/userspace-api/media/v4l/metafmt-uvc-msxu-1-5.rst F: Documentation/userspace-api/media/v4l/metafmt-uvc.rst F: drivers/media/common/uvc.c F: drivers/media/usb/uvc/ diff --git a/drivers/media/usb/uvc/uvc_metadata.c b/drivers/media/usb/uvc/uvc_metadata.c index 58691df60dd3..d0ee139dbda7 100644 --- a/drivers/media/usb/uvc/uvc_metadata.c +++ b/drivers/media/usb/uvc/uvc_metadata.c @@ -189,6 +189,10 @@ void uvc_meta_init(struct uvc_device *dev) !WARN_ON(dev->info->meta_format == V4L2_META_FMT_UVC)) dev->meta_formats[i++] = dev->info->meta_format; + if (dev->quirks & UVC_QUIRK_MSXU_META && + !WARN_ON(dev->info->meta_format == V4L2_META_FMT_UVC_MSXU_1_5)) + dev->meta_formats[i++] = V4L2_META_FMT_UVC_MSXU_1_5; + /* IMPORTANT: for new meta-formats update UVC_MAX_META_DATA_FORMATS. */ dev->meta_formats[i++] = 0; } diff --git a/drivers/media/usb/uvc/uvcvideo.h b/drivers/media/usb/uvc/uvcvideo.h index 81ec171fdfde..eb164d063199 100644 --- a/drivers/media/usb/uvc/uvcvideo.h +++ b/drivers/media/usb/uvc/uvcvideo.h @@ -77,6 +77,7 @@ #define UVC_QUIRK_DISABLE_AUTOSUSPEND 0x00008000 #define UVC_QUIRK_INVALID_DEVICE_SOF 0x00010000 #define UVC_QUIRK_MJPEG_NO_EOF 0x00020000 +#define UVC_QUIRK_MSXU_META 0x00040000 /* Format flags */ #define UVC_FMT_FLAG_COMPRESSED 0x00000001 diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index 0db05f9c7117..46da373066f4 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -1463,6 +1463,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt) case V4L2_META_FMT_VSP1_HGO: descr = "R-Car VSP1 1-D Histogram"; break; case V4L2_META_FMT_VSP1_HGT: descr = "R-Car VSP1 2-D Histogram"; break; case V4L2_META_FMT_UVC: descr = "UVC Payload Header Metadata"; break; + case V4L2_META_FMT_UVC_MSXU_1_5: descr = "UVC MSXU Metadata"; break; case V4L2_META_FMT_D4XX: descr = "Intel D4xx UVC Metadata"; break; case V4L2_META_FMT_VIVID: descr = "Vivid Metadata"; break; case V4L2_META_FMT_RK_ISP1_PARAMS: descr = "Rockchip ISP1 3A Parameters"; break; diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 1bb1979f6c18..3dd9fa45dde1 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -867,6 +867,7 @@ struct v4l2_pix_format { #define V4L2_META_FMT_VSP1_HGT v4l2_fourcc('V', 'S', 'P', 'T') /* R-Car VSP1 2-D Histogram */ #define V4L2_META_FMT_UVC v4l2_fourcc('U', 'V', 'C', 'H') /* UVC Payload Header metadata */ #define V4L2_META_FMT_D4XX v4l2_fourcc('D', '4', 'X', 'X') /* D4XX Payload Header metadata */ +#define V4L2_META_FMT_UVC_MSXU_1_5 v4l2_fourcc('U', 'V', 'C', 'M') /* UVC MSXU metadata */ #define V4L2_META_FMT_VIVID v4l2_fourcc('V', 'I', 'V', 'D') /* Vivid Metadata */ /* Vendor specific - used for RK_ISP1 camera sub-system */ -- cgit v1.2.3 From 62622a8753fa6af3c104f9552863e6473b92fb31 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:12 -0700 Subject: iommu: Allow an input type in hw_info op The hw_info uAPI will support a bidirectional data_type field that can be used as an input field for user space to request for a specific info data. 
To prepare for the uAPI update, change the iommu layer first: - Add a new IOMMU_HW_INFO_TYPE_DEFAULT as an input, for which driver can output its only (or firstly) supported type - Update the kdoc accordingly - Roll out the type validation in the existing drivers Link: https://patch.msgid.link/r/00f4a2d3d930721f61367014717b3ba2d1e82a81.1752126748.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Reviewed-by: Pranjal Shrivastava Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 4 ++++ drivers/iommu/intel/iommu.c | 4 ++++ drivers/iommu/iommufd/device.c | 3 +++ drivers/iommu/iommufd/selftest.c | 4 ++++ include/linux/iommu.h | 3 ++- include/uapi/linux/iommufd.h | 4 +++- 6 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index 170d69162848..eb9fe1f6311a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -15,6 +15,10 @@ void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 __iomem *base_idr; unsigned int i; + if (*type != IOMMU_HW_INFO_TYPE_DEFAULT && + *type != IOMMU_HW_INFO_TYPE_ARM_SMMUV3) + return ERR_PTR(-EOPNOTSUPP); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 850f1a6f548c..5f75faffca15 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4098,6 +4098,10 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, struct intel_iommu *iommu = info->iommu; struct iommu_hw_info_vtd *vtd; + if (*type != IOMMU_HW_INFO_TYPE_DEFAULT && + *type != IOMMU_HW_INFO_TYPE_INTEL_VTD) + return ERR_PTR(-EOPNOTSUPP); + vtd = kzalloc(sizeof(*vtd), GFP_KERNEL); if (!vtd) return ERR_PTR(-ENOMEM); diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 0567faff5680..14955dc43892 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -1512,6 +1512,9 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) cmd->__reserved[2]) return -EOPNOTSUPP; + /* Clear the type field since drivers don't support a random input */ + cmd->out_data_type = IOMMU_HW_INFO_TYPE_DEFAULT; + idev = iommufd_get_device(ucmd, cmd->dev_id); if (IS_ERR(idev)) return PTR_ERR(idev); diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 8b2c44b32530..a5dc36219a90 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -310,6 +310,10 @@ static void *mock_domain_hw_info(struct device *dev, u32 *length, { struct iommu_test_hw_info *info; + if (*type != IOMMU_HW_INFO_TYPE_DEFAULT && + *type != IOMMU_HW_INFO_TYPE_SELFTEST) + return ERR_PTR(-EOPNOTSUPP); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index e06a0fbe4bc7..e8b59ef54e48 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -603,7 +603,8 @@ __iommu_copy_struct_to_user(const struct iommu_user_data *dst_data, * @capable: check capability * @hw_info: report iommu hardware information. The data buffer returned by this * op is allocated in the iommu driver and freed by the caller after - * use. + * use. @type can input a requested type and output a supported type. 
+ * Driver should reject an unsupported data @type input * @domain_alloc: Do not use in new drivers * @domain_alloc_identity: allocate an IDENTITY domain. Drivers should prefer to * use identity_domain instead. This should only be used diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index b928c1ed2395..9c8c304b5de2 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -593,13 +593,15 @@ struct iommu_hw_info_arm_smmuv3 { /** * enum iommu_hw_info_type - IOMMU Hardware Info Types - * @IOMMU_HW_INFO_TYPE_NONE: Used by the drivers that do not report hardware + * @IOMMU_HW_INFO_TYPE_NONE: Output by the drivers that do not report hardware * info + * @IOMMU_HW_INFO_TYPE_DEFAULT: Input to request a default type * @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type */ enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_NONE = 0, + IOMMU_HW_INFO_TYPE_DEFAULT = 0, IOMMU_HW_INFO_TYPE_INTEL_VTD = 1, IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2, }; -- cgit v1.2.3 From a9f10bab2e5084d6746391fccd7bef6ac87620b8 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:13 -0700 Subject: iommufd: Allow an input data_type via iommu_hw_info The iommu_hw_info ioctl can output the vendor data type from a driver via the out_data_type field, but this only allows the driver to report one data type. Now, with SMMUv3 having a Tegra241 CMDQV implementation, it has two sets of types and data structs to report. One way to support that is to use the same type field bidirectionally. Reuse the same field by adding an "in_data_type", allowing user space to request a specific type and to get the corresponding data. For backward compatibility, since the ioctl handler has never checked an input value, add an IOMMU_HW_INFO_FLAG_INPUT_TYPE to switch between the old output-only field and the new bidirectional field.
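In user space, requesting a specific info type could look like this minimal sketch (illustrative only; the caller's dev_id and output buffer are assumed to exist):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

/* Returns the reported info type (>= 0), or -1 on failure */
static int get_hw_info_typed(int iommufd, __u32 dev_id, __u32 wanted_type,
                             void *buf, __u32 buf_len)
{
        struct iommu_hw_info cmd = {
                .size = sizeof(cmd),
                .flags = IOMMU_HW_INFO_FLAG_INPUT_TYPE, /* honor in_data_type */
                .dev_id = dev_id,
                .data_len = buf_len,
                .data_uptr = (uintptr_t)buf,
                .in_data_type = wanted_type, /* e.g. IOMMU_HW_INFO_TYPE_ARM_SMMUV3 */
        };

        if (ioctl(iommufd, IOMMU_GET_HW_INFO, &cmd) < 0)
                return -1;
        /* On success the same union member reads back as the output type */
        return (int)cmd.out_data_type;
}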
Link: https://patch.msgid.link/r/887378a7167e1786d9d13cde0c36263ed61823d7.1752126748.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Reviewed-by: Pranjal Shrivastava Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 9 ++++++--- include/uapi/linux/iommufd.h | 20 +++++++++++++++++++- 2 files changed, 25 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 14955dc43892..e2ba21c43ad2 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -1499,6 +1499,7 @@ EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, "IOMMUFD"); int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) { + const u32 SUPPORTED_FLAGS = IOMMU_HW_INFO_FLAG_INPUT_TYPE; struct iommu_hw_info *cmd = ucmd->cmd; void __user *user_ptr = u64_to_user_ptr(cmd->data_uptr); const struct iommu_ops *ops; @@ -1508,12 +1509,14 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) void *data; int rc; - if (cmd->flags || cmd->__reserved[0] || cmd->__reserved[1] || - cmd->__reserved[2]) + if (cmd->flags & ~SUPPORTED_FLAGS) + return -EOPNOTSUPP; + if (cmd->__reserved[0] || cmd->__reserved[1] || cmd->__reserved[2]) return -EOPNOTSUPP; /* Clear the type field since drivers don't support a random input */ - cmd->out_data_type = IOMMU_HW_INFO_TYPE_DEFAULT; + if (!(cmd->flags & IOMMU_HW_INFO_FLAG_INPUT_TYPE)) + cmd->in_data_type = IOMMU_HW_INFO_TYPE_DEFAULT; idev = iommufd_get_device(ucmd, cmd->dev_id); if (IS_ERR(idev)) diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 9c8c304b5de2..32ee02380912 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -628,6 +628,15 @@ enum iommufd_hw_capabilities { IOMMU_HW_CAP_PCI_PASID_PRIV = 1 << 2, }; +/** + * enum iommufd_hw_info_flags - Flags for iommu_hw_info + * @IOMMU_HW_INFO_FLAG_INPUT_TYPE: If set, @in_data_type carries an input type + * for user space to request for a specific info + */ +enum iommufd_hw_info_flags { + IOMMU_HW_INFO_FLAG_INPUT_TYPE = 1 << 0, +}; + /** * struct iommu_hw_info - ioctl(IOMMU_GET_HW_INFO) * @size: sizeof(struct iommu_hw_info) @@ -637,6 +646,12 @@ enum iommufd_hw_capabilities { * data that kernel supports * @data_uptr: User pointer to a user-space buffer used by the kernel to fill * the iommu type specific hardware information data + * @in_data_type: This shares the same field with @out_data_type, making it be + * a bidirectional field. When IOMMU_HW_INFO_FLAG_INPUT_TYPE is + * set, an input type carried via this @in_data_type field will + * be valid, requesting for the info data to the given type. If + * IOMMU_HW_INFO_FLAG_INPUT_TYPE is unset, any input value will + * be seen as IOMMU_HW_INFO_TYPE_DEFAULT * @out_data_type: Output the iommu hardware info type as defined in the enum * iommu_hw_info_type. * @out_capabilities: Output the generic iommu capability info type as defined @@ -666,7 +681,10 @@ struct iommu_hw_info { __u32 dev_id; __u32 data_len; __aligned_u64 data_uptr; - __u32 out_data_type; + union { + __u32 in_data_type; + __u32 out_data_type; + }; __u8 out_max_pasid_log2; __u8 __reserved[3]; __aligned_u64 out_capabilities; -- cgit v1.2.3 From 4dc0d12474f9d4833c3dd96b73d61e406d3f5dc7 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:20 -0700 Subject: iommu/tegra241-cmdqv: Add user-space use support The CMDQV HW supports a user-space use for virtualization cases. 
It allows the VM to issue guest-level TLBI or ATC_INV commands directly to the queue and execute them without a VMEXIT, as HW will replace the VMID field in a TLBI command and the SID field in an ATC_INV command with the preset VMID and SID. This is built upon the vIOMMU infrastructure by allowing VMM to allocate a VINTF (as a vIOMMU object) and assign VCMDQs (HW QUEUE objs) to the VINTF. So first, replace the standard vSMMU model with the VINTF implementation but reuse the standard cache_invalidate op (for unsupported commands) and the standard alloc_domain_nested op (for standard nested STE). Each VINTF has two 64KB MMIO pages (128B per logical VCMDQ): - Page0 (directly accessed by guest) has all the control and status bits. - Page1 (trapped by VMM) has guest-owned queue memory location/size info. VMM should trap the emulated VINTF0's page1 of the guest VM for the guest-level VCMDQ location/size info and forward that to the kernel to translate to a physical memory location to program the VCMDQ HW during an allocation call. Then, it should mmap the assigned VINTF's page0 to the VINTF0 page0 of the guest VM. This allows the guest OS to read and write the guest-owned VINTF's page0 for direct control of the VCMDQ HW. For ATC invalidation commands that hold an SID, the HW requires all devices to register their virtual SIDs to the SID_MATCH registers and their physical SIDs to the pairing SID_REPLACE registers, so that HW can use those as a lookup table to replace those virtual SIDs with the correct physical SIDs. Thus, implement the driver-allocated vDEVICE op with a tegra241_vintf_sid structure to allocate SID_REPLACE and to program the SIDs accordingly. This enables the HW accelerated feature for NVIDIA Grace CPU. Compared to the standard SMMUv3 operating in the nested translation mode trapping CMDQ for TLBI and ATC_INV commands, this gives a huge performance improvement: 70% to 90% reductions of invalidation time were measured by various DMA unmap tests running in a guest OS.
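As a rough sketch of the VMM flow above (illustrative; the IOMMU_VIOMMU_ALLOC call that fills the tegra241 payload and all error handling are elided):

#include <sys/mman.h>
#include <linux/iommufd.h>

/* Map the kernel-assigned VINTF page0 into the VMM so it can be exposed
 * as the guest's VINTF0 page0. The out_vintf_mmap_{offset,length} fields
 * come back from IOMMU_VIOMMU_ALLOC with IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV. */
static void *map_vintf_page0(int iommufd,
                             const struct iommu_viommu_tegra241_cmdqv *out)
{
        return mmap(NULL, out->out_vintf_mmap_length,
                    PROT_READ | PROT_WRITE, MAP_SHARED, iommufd,
                    out->out_vintf_mmap_offset);
}

/* A trapped guest write to page1 (queue base/size) would then be forwarded
 * via IOMMU_HW_QUEUE_ALLOC with IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV and the
 * logical LVCMDQ index, along the lines of the earlier HW queue example. */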
Link: https://patch.msgid.link/r/fb0eab83f529440b6aa181798912a6f0afa21eb0.1752126748.git.nicolinc@nvidia.com Reviewed-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 6 +- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 7 + drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 400 ++++++++++++++++++++- include/uapi/linux/iommufd.h | 59 +++ 4 files changed, 466 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index 1cf9646e776f..d9bea8f1f636 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -225,7 +225,7 @@ static int arm_smmu_validate_vste(struct iommu_hwpt_arm_smmuv3 *arg, return 0; } -static struct iommu_domain * +struct iommu_domain * arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, const struct iommu_user_data *user_data) { @@ -336,8 +336,8 @@ static int arm_vsmmu_convert_user_cmd(struct arm_vsmmu *vsmmu, return 0; } -static int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu, - struct iommu_user_data_array *array) +int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu, + struct iommu_user_data_array *array) { struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core); struct arm_smmu_device *smmu = vsmmu->smmu; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index b7c7fdef531e..3fa02c51df9f 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -1057,10 +1057,17 @@ int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, void arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state); void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master); int arm_vmaster_report_event(struct arm_smmu_vmaster *vmaster, u64 *evt); +struct iommu_domain * +arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, + const struct iommu_user_data *user_data); +int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu, + struct iommu_user_data_array *array); #else #define arm_smmu_get_viommu_size NULL #define arm_smmu_hw_info NULL #define arm_vsmmu_init NULL +#define arm_vsmmu_alloc_domain_nested NULL +#define arm_vsmmu_cache_invalidate NULL static inline int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c index 869c90b660c1..3eeb8444fadf 100644 --- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -8,7 +8,9 @@ #include #include #include +#include #include +#include #include @@ -26,8 +28,10 @@ #define CMDQV_EN BIT(0) #define TEGRA241_CMDQV_PARAM 0x0004 +#define CMDQV_NUM_SID_PER_VM_LOG2 GENMASK(15, 12) #define CMDQV_NUM_VINTF_LOG2 GENMASK(11, 8) #define CMDQV_NUM_VCMDQ_LOG2 GENMASK(7, 4) +#define CMDQV_VER GENMASK(3, 0) #define TEGRA241_CMDQV_STATUS 0x0008 #define CMDQV_ENABLED BIT(0) @@ -53,6 +57,9 @@ #define VINTF_STATUS GENMASK(3, 1) #define VINTF_ENABLED BIT(0) +#define TEGRA241_VINTF_SID_MATCH(s) (0x0040 + 0x4*(s)) +#define TEGRA241_VINTF_SID_REPLACE(s) (0x0080 + 0x4*(s)) + #define TEGRA241_VINTF_LVCMDQ_ERR_MAP_64(m) \ (0x00C0 + 0x8*(m)) #define LVCMDQ_ERR_MAP_NUM_64 2 @@ -114,16 +121,20 @@ 
MODULE_PARM_DESC(bypass_vcmdq, /** * struct tegra241_vcmdq - Virtual Command Queue + * @core: Embedded iommufd_hw_queue structure * @idx: Global index in the CMDQV * @lidx: Local index in the VINTF * @enabled: Enable status * @cmdqv: Parent CMDQV pointer * @vintf: Parent VINTF pointer + * @prev: Previous LVCMDQ to depend on * @cmdq: Command Queue struct * @page0: MMIO Page0 base address * @page1: MMIO Page1 base address */ struct tegra241_vcmdq { + struct iommufd_hw_queue core; + u16 idx; u16 lidx; @@ -131,22 +142,30 @@ struct tegra241_vcmdq { struct tegra241_cmdqv *cmdqv; struct tegra241_vintf *vintf; + struct tegra241_vcmdq *prev; struct arm_smmu_cmdq cmdq; void __iomem *page0; void __iomem *page1; }; +#define hw_queue_to_vcmdq(v) container_of(v, struct tegra241_vcmdq, core) /** * struct tegra241_vintf - Virtual Interface + * @vsmmu: Embedded arm_vsmmu structure * @idx: Global index in the CMDQV * @enabled: Enable status * @hyp_own: Owned by hypervisor (in-kernel) * @cmdqv: Parent CMDQV pointer * @lvcmdqs: List of logical VCMDQ pointers + * @lvcmdq_mutex: Lock to serialize user-allocated lvcmdqs * @base: MMIO base address + * @mmap_offset: Offset argument for mmap() syscall + * @sids: Stream ID mapping resources */ struct tegra241_vintf { + struct arm_vsmmu vsmmu; + u16 idx; bool enabled; @@ -154,19 +173,41 @@ struct tegra241_vintf { struct tegra241_cmdqv *cmdqv; struct tegra241_vcmdq **lvcmdqs; + struct mutex lvcmdq_mutex; /* user space race */ void __iomem *base; + unsigned long mmap_offset; + + struct ida sids; +}; +#define viommu_to_vintf(v) container_of(v, struct tegra241_vintf, vsmmu.core) + +/** + * struct tegra241_vintf_sid - Virtual Interface Stream ID Mapping + * @core: Embedded iommufd_vdevice structure, holding virtual Stream ID + * @vintf: Parent VINTF pointer + * @sid: Physical Stream ID + * @idx: Mapping index in the VINTF + */ +struct tegra241_vintf_sid { + struct iommufd_vdevice core; + struct tegra241_vintf *vintf; + u32 sid; + u8 idx; }; +#define vdev_to_vsid(v) container_of(v, struct tegra241_vintf_sid, core) /** * struct tegra241_cmdqv - CMDQ-V for SMMUv3 * @smmu: SMMUv3 device * @dev: CMDQV device * @base: MMIO base address + * @base_phys: MMIO physical base address, for mmap * @irq: IRQ number * @num_vintfs: Total number of VINTFs * @num_vcmdqs: Total number of VCMDQs * @num_lvcmdqs_per_vintf: Number of logical VCMDQs per VINTF + * @num_sids_per_vintf: Total number of SID mappings per VINTF * @vintf_ids: VINTF id allocator * @vintfs: List of VINTFs */ @@ -175,12 +216,14 @@ struct tegra241_cmdqv { struct device *dev; void __iomem *base; + phys_addr_t base_phys; int irq; /* CMDQV Hardware Params */ u16 num_vintfs; u16 num_vcmdqs; u16 num_lvcmdqs_per_vintf; + u16 num_sids_per_vintf; struct ida vintf_ids; @@ -351,6 +394,29 @@ tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu, /* HW Reset Functions */ +/* + * When a guest-owned VCMDQ is disabled, if the guest did not enqueue a CMD_SYNC + * following an ATC_INV command at the end of the guest queue while this ATC_INV + * is timed out, the TIMEOUT will not be reported until this VCMDQ gets assigned + * to the next VM, which will be a false alarm potentially causing some unwanted + * behavior in the new VM. Thus, a guest-owned VCMDQ must flush the TIMEOUT when + * it gets disabled. This can be done by just issuing a CMD_SYNC to SMMU CMDQ. 
+ */ +static void tegra241_vcmdq_hw_flush_timeout(struct tegra241_vcmdq *vcmdq) +{ + struct arm_smmu_device *smmu = &vcmdq->cmdqv->smmu; + u64 cmd_sync[CMDQ_ENT_DWORDS] = {}; + + cmd_sync[0] = FIELD_PREP(CMDQ_0_OP, CMDQ_OP_CMD_SYNC) | + FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_NONE); + + /* + * It does not hurt to insert another CMD_SYNC, taking advantage of the + * arm_smmu_cmdq_issue_cmdlist() that waits for the CMD_SYNC completion. + */ + arm_smmu_cmdq_issue_cmdlist(smmu, &smmu->cmdq, cmd_sync, 1, true); +} + /* This function is for LVCMDQ, so @vcmdq must not be unmapped yet */ static void tegra241_vcmdq_hw_deinit(struct tegra241_vcmdq *vcmdq) { @@ -364,6 +430,8 @@ static void tegra241_vcmdq_hw_deinit(struct tegra241_vcmdq *vcmdq) readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERROR)), readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, CONS))); } + tegra241_vcmdq_hw_flush_timeout(vcmdq); + writel_relaxed(0, REG_VCMDQ_PAGE0(vcmdq, PROD)); writel_relaxed(0, REG_VCMDQ_PAGE0(vcmdq, CONS)); writeq_relaxed(0, REG_VCMDQ_PAGE1(vcmdq, BASE)); @@ -420,6 +488,7 @@ static void tegra241_vcmdq_unmap_lvcmdq(struct tegra241_vcmdq *vcmdq) static void tegra241_vintf_hw_deinit(struct tegra241_vintf *vintf) { u16 lidx = vintf->cmdqv->num_lvcmdqs_per_vintf; + int sidx; /* HW requires to unmap LVCMDQs in descending order */ while (lidx--) { @@ -429,6 +498,10 @@ static void tegra241_vintf_hw_deinit(struct tegra241_vintf *vintf) } } vintf_write_config(vintf, 0); + for (sidx = 0; sidx < vintf->cmdqv->num_sids_per_vintf; sidx++) { + writel(0, REG_VINTF(vintf, SID_MATCH(sidx))); + writel(0, REG_VINTF(vintf, SID_REPLACE(sidx))); + } } /* Map a global VCMDQ to the pre-assigned LVCMDQ */ @@ -457,7 +530,8 @@ static int tegra241_vintf_hw_init(struct tegra241_vintf *vintf, bool hyp_own) * whether enabling it here or not, as !HYP_OWN cmdq HWs only support a * restricted set of supported commands. 
*/ - regval = FIELD_PREP(VINTF_HYP_OWN, hyp_own); + regval = FIELD_PREP(VINTF_HYP_OWN, hyp_own) | + FIELD_PREP(VINTF_VMID, vintf->vsmmu.vmid); writel(regval, REG_VINTF(vintf, CONFIG)); ret = vintf_write_config(vintf, regval | VINTF_EN); @@ -584,7 +658,9 @@ static void tegra241_vintf_free_lvcmdq(struct tegra241_vintf *vintf, u16 lidx) dev_dbg(vintf->cmdqv->dev, "%sdeallocated\n", lvcmdq_error_header(vcmdq, header, 64)); - kfree(vcmdq); + /* Guest-owned VCMDQ is free-ed with hw_queue by iommufd core */ + if (vcmdq->vintf->hyp_own) + kfree(vcmdq); } static struct tegra241_vcmdq * @@ -671,7 +747,13 @@ static void tegra241_cmdqv_remove_vintf(struct tegra241_cmdqv *cmdqv, u16 idx) dev_dbg(cmdqv->dev, "VINTF%u: deallocated\n", vintf->idx); tegra241_cmdqv_deinit_vintf(cmdqv, idx); - kfree(vintf); + if (!vintf->hyp_own) { + mutex_destroy(&vintf->lvcmdq_mutex); + ida_destroy(&vintf->sids); + /* Guest-owned VINTF is free-ed with viommu by iommufd core */ + } else { + kfree(vintf); + } } static void tegra241_cmdqv_remove(struct arm_smmu_device *smmu) @@ -699,10 +781,45 @@ static void tegra241_cmdqv_remove(struct arm_smmu_device *smmu) put_device(cmdqv->dev); /* smmu->impl_dev */ } +static int +tegra241_cmdqv_init_vintf_user(struct arm_vsmmu *vsmmu, + const struct iommu_user_data *user_data); + +static void *tegra241_cmdqv_hw_info(struct arm_smmu_device *smmu, u32 *length, + enum iommu_hw_info_type *type) +{ + struct tegra241_cmdqv *cmdqv = + container_of(smmu, struct tegra241_cmdqv, smmu); + struct iommu_hw_info_tegra241_cmdqv *info; + u32 regval; + + if (*type != IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV) + return ERR_PTR(-EOPNOTSUPP); + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + regval = readl_relaxed(REG_CMDQV(cmdqv, PARAM)); + info->log2vcmdqs = ilog2(cmdqv->num_lvcmdqs_per_vintf); + info->log2vsids = ilog2(cmdqv->num_sids_per_vintf); + info->version = FIELD_GET(CMDQV_VER, regval); + + *length = sizeof(*info); + *type = IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV; + return info; +} + static struct arm_smmu_impl_ops tegra241_cmdqv_impl_ops = { + /* For in-kernel use */ .get_secondary_cmdq = tegra241_cmdqv_get_cmdq, .device_reset = tegra241_cmdqv_hw_reset, .device_remove = tegra241_cmdqv_remove, + /* For user-space use */ + .hw_info = tegra241_cmdqv_hw_info, + .vsmmu_size = VIOMMU_STRUCT_SIZE(struct tegra241_vintf, vsmmu.core), + .vsmmu_type = IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV, + .vsmmu_init = tegra241_cmdqv_init_vintf_user, }; /* Probe Functions */ @@ -844,6 +961,7 @@ __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res, cmdqv->irq = irq; cmdqv->base = base; cmdqv->dev = smmu->impl_dev; + cmdqv->base_phys = res->start; if (cmdqv->irq > 0) { ret = request_threaded_irq(irq, NULL, tegra241_cmdqv_isr, @@ -860,6 +978,8 @@ __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res, cmdqv->num_vintfs = 1 << FIELD_GET(CMDQV_NUM_VINTF_LOG2, regval); cmdqv->num_vcmdqs = 1 << FIELD_GET(CMDQV_NUM_VCMDQ_LOG2, regval); cmdqv->num_lvcmdqs_per_vintf = cmdqv->num_vcmdqs / cmdqv->num_vintfs; + cmdqv->num_sids_per_vintf = + 1 << FIELD_GET(CMDQV_NUM_SID_PER_VM_LOG2, regval); cmdqv->vintfs = kcalloc(cmdqv->num_vintfs, sizeof(*cmdqv->vintfs), GFP_KERNEL); @@ -913,3 +1033,277 @@ out_fallback: put_device(smmu->impl_dev); return ERR_PTR(-ENODEV); } + +/* User space VINTF and VCMDQ Functions */ + +static size_t tegra241_vintf_get_vcmdq_size(struct iommufd_viommu *viommu, + enum iommu_hw_queue_type queue_type) +{ + if (queue_type != 
IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV) + return 0; + return HW_QUEUE_STRUCT_SIZE(struct tegra241_vcmdq, core); +} + +static int tegra241_vcmdq_hw_init_user(struct tegra241_vcmdq *vcmdq) +{ + char header[64]; + + /* Configure the vcmdq only; User space does the enabling */ + writeq_relaxed(vcmdq->cmdq.q.q_base, REG_VCMDQ_PAGE1(vcmdq, BASE)); + + dev_dbg(vcmdq->cmdqv->dev, "%sinited at host PA 0x%llx size 0x%lx\n", + lvcmdq_error_header(vcmdq, header, 64), + vcmdq->cmdq.q.q_base & VCMDQ_ADDR, + 1UL << (vcmdq->cmdq.q.q_base & VCMDQ_LOG2SIZE)); + return 0; +} + +static void +tegra241_vintf_destroy_lvcmdq_user(struct iommufd_hw_queue *hw_queue) +{ + struct tegra241_vcmdq *vcmdq = hw_queue_to_vcmdq(hw_queue); + + mutex_lock(&vcmdq->vintf->lvcmdq_mutex); + tegra241_vcmdq_hw_deinit(vcmdq); + tegra241_vcmdq_unmap_lvcmdq(vcmdq); + tegra241_vintf_free_lvcmdq(vcmdq->vintf, vcmdq->lidx); + if (vcmdq->prev) + iommufd_hw_queue_undepend(vcmdq, vcmdq->prev, core); + mutex_unlock(&vcmdq->vintf->lvcmdq_mutex); +} + +static int tegra241_vintf_alloc_lvcmdq_user(struct iommufd_hw_queue *hw_queue, + u32 lidx, phys_addr_t base_addr_pa) +{ + struct tegra241_vintf *vintf = viommu_to_vintf(hw_queue->viommu); + struct tegra241_vcmdq *vcmdq = hw_queue_to_vcmdq(hw_queue); + struct tegra241_cmdqv *cmdqv = vintf->cmdqv; + struct arm_smmu_device *smmu = &cmdqv->smmu; + struct tegra241_vcmdq *prev = NULL; + u32 log2size, max_n_shift; + char header[64]; + int ret; + + if (hw_queue->type != IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV) + return -EOPNOTSUPP; + if (lidx >= cmdqv->num_lvcmdqs_per_vintf) + return -EINVAL; + + mutex_lock(&vintf->lvcmdq_mutex); + + if (vintf->lvcmdqs[lidx]) { + ret = -EEXIST; + goto unlock; + } + + /* + * HW requires to map LVCMDQs in ascending order, so reject if the + * previous lvcmdqs is not allocated yet. + */ + if (lidx) { + prev = vintf->lvcmdqs[lidx - 1]; + if (!prev) { + ret = -EIO; + goto unlock; + } + } + + /* + * hw_queue->length must be a power of 2, in range of + * [ 32, 2 ^ (idr[1].CMDQS + CMDQ_ENT_SZ_SHIFT) ] + */ + max_n_shift = FIELD_GET(IDR1_CMDQS, + readl_relaxed(smmu->base + ARM_SMMU_IDR1)); + if (!is_power_of_2(hw_queue->length) || hw_queue->length < 32 || + hw_queue->length > (1 << (max_n_shift + CMDQ_ENT_SZ_SHIFT))) { + ret = -EINVAL; + goto unlock; + } + log2size = ilog2(hw_queue->length) - CMDQ_ENT_SZ_SHIFT; + + /* base_addr_pa must be aligned to hw_queue->length */ + if (base_addr_pa & ~VCMDQ_ADDR || + base_addr_pa & (hw_queue->length - 1)) { + ret = -EINVAL; + goto unlock; + } + + /* + * HW requires to unmap LVCMDQs in descending order, so destroy() must + * follow this rule. Set a dependency on its previous LVCMDQ so iommufd + * core will help enforce it. 
+ */ + if (prev) { + ret = iommufd_hw_queue_depend(vcmdq, prev, core); + if (ret) + goto unlock; + } + vcmdq->prev = prev; + + ret = tegra241_vintf_init_lvcmdq(vintf, lidx, vcmdq); + if (ret) + goto undepend_vcmdq; + + dev_dbg(cmdqv->dev, "%sallocated\n", + lvcmdq_error_header(vcmdq, header, 64)); + + tegra241_vcmdq_map_lvcmdq(vcmdq); + + vcmdq->cmdq.q.q_base = base_addr_pa & VCMDQ_ADDR; + vcmdq->cmdq.q.q_base |= log2size; + + ret = tegra241_vcmdq_hw_init_user(vcmdq); + if (ret) + goto unmap_lvcmdq; + + hw_queue->destroy = &tegra241_vintf_destroy_lvcmdq_user; + mutex_unlock(&vintf->lvcmdq_mutex); + return 0; + +unmap_lvcmdq: + tegra241_vcmdq_unmap_lvcmdq(vcmdq); + tegra241_vintf_deinit_lvcmdq(vintf, lidx); +undepend_vcmdq: + if (vcmdq->prev) + iommufd_hw_queue_undepend(vcmdq, vcmdq->prev, core); +unlock: + mutex_unlock(&vintf->lvcmdq_mutex); + return ret; +} + +static void tegra241_cmdqv_destroy_vintf_user(struct iommufd_viommu *viommu) +{ + struct tegra241_vintf *vintf = viommu_to_vintf(viommu); + + if (vintf->mmap_offset) + iommufd_viommu_destroy_mmap(&vintf->vsmmu.core, + vintf->mmap_offset); + tegra241_cmdqv_remove_vintf(vintf->cmdqv, vintf->idx); +} + +static void tegra241_vintf_destroy_vsid(struct iommufd_vdevice *vdev) +{ + struct tegra241_vintf_sid *vsid = vdev_to_vsid(vdev); + struct tegra241_vintf *vintf = vsid->vintf; + + writel(0, REG_VINTF(vintf, SID_MATCH(vsid->idx))); + writel(0, REG_VINTF(vintf, SID_REPLACE(vsid->idx))); + ida_free(&vintf->sids, vsid->idx); + dev_dbg(vintf->cmdqv->dev, + "VINTF%u: deallocated SID_REPLACE%d for pSID=%x\n", vintf->idx, + vsid->idx, vsid->sid); +} + +static int tegra241_vintf_init_vsid(struct iommufd_vdevice *vdev) +{ + struct arm_smmu_master *master = dev_iommu_priv_get(vdev->dev); + struct tegra241_vintf *vintf = viommu_to_vintf(vdev->viommu); + struct tegra241_vintf_sid *vsid = vdev_to_vsid(vdev); + struct arm_smmu_stream *stream = &master->streams[0]; + u64 virt_sid = vdev->virt_id; + int sidx; + + if (virt_sid > UINT_MAX) + return -EINVAL; + + WARN_ON_ONCE(master->num_streams != 1); + + /* Find an empty pair of SID_REPLACE and SID_MATCH */ + sidx = ida_alloc_max(&vintf->sids, vintf->cmdqv->num_sids_per_vintf - 1, + GFP_KERNEL); + if (sidx < 0) + return sidx; + + writel(stream->id, REG_VINTF(vintf, SID_REPLACE(sidx))); + writel(virt_sid << 1 | 0x1, REG_VINTF(vintf, SID_MATCH(sidx))); + dev_dbg(vintf->cmdqv->dev, + "VINTF%u: allocated SID_REPLACE%d for pSID=%x, vSID=%x\n", + vintf->idx, sidx, stream->id, (u32)virt_sid); + + vsid->idx = sidx; + vsid->vintf = vintf; + vsid->sid = stream->id; + + vdev->destroy = &tegra241_vintf_destroy_vsid; + return 0; +} + +static struct iommufd_viommu_ops tegra241_cmdqv_viommu_ops = { + .destroy = tegra241_cmdqv_destroy_vintf_user, + .alloc_domain_nested = arm_vsmmu_alloc_domain_nested, + /* Non-accelerated commands will still be handled by the kernel */ + .cache_invalidate = arm_vsmmu_cache_invalidate, + .vdevice_size = VDEVICE_STRUCT_SIZE(struct tegra241_vintf_sid, core), + .vdevice_init = tegra241_vintf_init_vsid, + .get_hw_queue_size = tegra241_vintf_get_vcmdq_size, + .hw_queue_init_phys = tegra241_vintf_alloc_lvcmdq_user, +}; + +static int +tegra241_cmdqv_init_vintf_user(struct arm_vsmmu *vsmmu, + const struct iommu_user_data *user_data) +{ + struct tegra241_cmdqv *cmdqv = + container_of(vsmmu->smmu, struct tegra241_cmdqv, smmu); + struct tegra241_vintf *vintf = viommu_to_vintf(&vsmmu->core); + struct iommu_viommu_tegra241_cmdqv data; + phys_addr_t page0_base; + int ret; + + if (!user_data) + return
-EINVAL; + + ret = iommu_copy_struct_from_user(&data, user_data, + IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV, + out_vintf_mmap_length); + if (ret) + return ret; + + ret = tegra241_cmdqv_init_vintf(cmdqv, cmdqv->num_vintfs - 1, vintf); + if (ret < 0) { + dev_err(cmdqv->dev, "no more available vintf\n"); + return ret; + } + + /* + * Initialize the user-owned VINTF without an LVCMDQ, as it cannot pre- + * allocate an LVCMDQ until user space wants one, for security reasons. + * It is different from the kernel-owned VINTF0, which has pre-assigned + * and pre-allocated global VCMDQs that would be mapped to the LVCMDQs + * by the tegra241_vintf_hw_init() call. + */ + ret = tegra241_vintf_hw_init(vintf, false); + if (ret) + goto deinit_vintf; + + page0_base = cmdqv->base_phys + TEGRA241_VINTFi_PAGE0(vintf->idx); + ret = iommufd_viommu_alloc_mmap(&vintf->vsmmu.core, page0_base, SZ_64K, + &vintf->mmap_offset); + if (ret) + goto hw_deinit_vintf; + + data.out_vintf_mmap_length = SZ_64K; + data.out_vintf_mmap_offset = vintf->mmap_offset; + ret = iommu_copy_struct_to_user(user_data, &data, + IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV, + out_vintf_mmap_length); + if (ret) + goto free_mmap; + + ida_init(&vintf->sids); + mutex_init(&vintf->lvcmdq_mutex); + + dev_dbg(cmdqv->dev, "VINTF%u: allocated with vmid (%d)\n", vintf->idx, + vintf->vsmmu.vmid); + + vsmmu->core.ops = &tegra241_cmdqv_viommu_ops; + return 0; + +free_mmap: + iommufd_viommu_destroy_mmap(&vintf->vsmmu.core, vintf->mmap_offset); +hw_deinit_vintf: + tegra241_vintf_hw_deinit(vintf); +deinit_vintf: + tegra241_cmdqv_deinit_vintf(cmdqv, vintf->idx); + return ret; +} diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 32ee02380912..2fecea1973bc 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -591,6 +591,28 @@ struct iommu_hw_info_arm_smmuv3 { __u32 aidr; }; +/** + * struct iommu_hw_info_tegra241_cmdqv - NVIDIA Tegra241 CMDQV Hardware + * Information (IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV) + * + * @flags: Must be 0 + * @version: Version number for the CMDQ-V HW for PARAM bits[03:00] + * @log2vcmdqs: Log2 of the total number of VCMDQs for PARAM bits[07:04] + * @log2vsids: Log2 of the total number of SID replacements for PARAM bits[15:12] + * @__reserved: Must be 0 + * + * A VMM can use these fields directly in its emulated global PARAM register. Note + * that only one Virtual Interface (VINTF) should be exposed to a VM, i.e. PARAM + * bits[11:08] should be set to 0 for log2 of the total number of VINTFs.
+ */ +struct iommu_hw_info_tegra241_cmdqv { + __u32 flags; + __u8 version; + __u8 log2vcmdqs; + __u8 log2vsids; + __u8 __reserved; +}; + /** * enum iommu_hw_info_type - IOMMU Hardware Info Types * @IOMMU_HW_INFO_TYPE_NONE: Output by the drivers that do not report hardware @@ -598,12 +620,15 @@ struct iommu_hw_info_arm_smmuv3 { * @IOMMU_HW_INFO_TYPE_DEFAULT: Input to request for a default type * @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type + * @IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM + * SMMUv3) info type */ enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_NONE = 0, IOMMU_HW_INFO_TYPE_DEFAULT = 0, IOMMU_HW_INFO_TYPE_INTEL_VTD = 1, IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2, + IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV = 3, }; /** @@ -972,10 +997,29 @@ struct iommu_fault_alloc { * enum iommu_viommu_type - Virtual IOMMU Type * @IOMMU_VIOMMU_TYPE_DEFAULT: Reserved for future use * @IOMMU_VIOMMU_TYPE_ARM_SMMUV3: ARM SMMUv3 driver specific type + * @IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM + * SMMUv3) enabled ARM SMMUv3 type */ enum iommu_viommu_type { IOMMU_VIOMMU_TYPE_DEFAULT = 0, IOMMU_VIOMMU_TYPE_ARM_SMMUV3 = 1, + IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV = 2, +}; + +/** + * struct iommu_viommu_tegra241_cmdqv - NVIDIA Tegra241 CMDQV Virtual Interface + * (IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV) + * @out_vintf_mmap_offset: mmap offset argument for VINTF's page0 + * @out_vintf_mmap_length: mmap length argument for VINTF's page0 + * + * Both @out_vintf_mmap_offset and @out_vintf_mmap_length are reported by the kernel + * for user space to mmap the VINTF page0 from the host physical address space + * to the guest physical address space so that a guest kernel can have direct R/W + * access to the VINTF page0 in order to control its virtual command queues. + */ +struct iommu_viommu_tegra241_cmdqv { + __aligned_u64 out_vintf_mmap_offset; + __aligned_u64 out_vintf_mmap_length; }; /** @@ -1172,9 +1216,24 @@ struct iommu_veventq_alloc { /** * enum iommu_hw_queue_type - HW Queue Type * @IOMMU_HW_QUEUE_TYPE_DEFAULT: Reserved for future use + * @IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM + * SMMUv3) Virtual Command Queue (VCMDQ) */ enum iommu_hw_queue_type { IOMMU_HW_QUEUE_TYPE_DEFAULT = 0, + /* + * TEGRA241_CMDQV requirements (otherwise, allocation will fail) + * - alloc starts from the lowest @index=0 in ascending order + * - destroy starts from the last allocated @index in descending order + * - @base_addr must be aligned to @length in bytes and mapped in IOAS + * - @length must be a power of 2, with a minimum 32 bytes and a maximum + * 2 ^ idr[1].CMDQS * 16 bytes (use GET_HW_INFO call to read idr[1] + * from struct iommu_hw_info_arm_smmuv3) + * - it is suggested to back the queue memory with contiguous physical pages or + * a single huge page with alignment of the queue size, and limit the + * emulated vSMMU's IDR1.CMDQS to log2(huge page size / 16 bytes) + */ + IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV = 1, }; /** -- cgit v1.2.3 From 32b2d3a57e26804ca96d82a222667ac0fa226cb7 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:21 -0700 Subject: iommu/tegra241-cmdqv: Add IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV support Add a new vEVENTQ type for VINTFs that are assigned to user space. Simply report the two 64-bit LVCMDQ_ERR_MAPs register values.
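For reference, a VMM consuming this vEVENTQ could decode the payload roughly as below (an illustrative sketch only; the read-side framing of the vEVENTQ fd is defined by the iommufd uAPI and omitted here, and decode_err_map() is a hypothetical helper):

	#include <stdint.h>
	#include <stdio.h>

	/* Matches the uAPI payload: a 128-bit LVCMDQ error map as two LE 64-bit words */
	struct iommu_vevent_tegra241_cmdqv {
		uint64_t lvcmdq_err_map[2];
	};

	/* Hypothetical helper: report which logical VCMDQs flagged an error */
	static void decode_err_map(const struct iommu_vevent_tegra241_cmdqv *ev)
	{
		for (int i = 0; i < 128; i++)
			if (ev->lvcmdq_err_map[i / 64] & (1ULL << (i % 64)))
				printf("LVCMDQ%d reported an error\n", i);
	}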
Link: https://patch.msgid.link/r/68161a980da41fa5022841209638aeff258557b5.1752126748.git.nicolinc@nvidia.com Reviewed-by: Alok Tiwari Reviewed-by: Pranjal Shrivastava Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 22 ++++++++++++++++++++++ include/uapi/linux/iommufd.h | 15 +++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c index 3eeb8444fadf..d5d43a1c7708 100644 --- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -295,6 +295,20 @@ static inline int vcmdq_write_config(struct tegra241_vcmdq *vcmdq, u32 regval) /* ISR Functions */ +static void tegra241_vintf_user_handle_error(struct tegra241_vintf *vintf) +{ + struct iommufd_viommu *viommu = &vintf->vsmmu.core; + struct iommu_vevent_tegra241_cmdqv vevent_data; + int i; + + for (i = 0; i < LVCMDQ_ERR_MAP_NUM_64; i++) + vevent_data.lvcmdq_err_map[i] = + readq_relaxed(REG_VINTF(vintf, LVCMDQ_ERR_MAP_64(i))); + + iommufd_viommu_report_event(viommu, IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV, + &vevent_data, sizeof(vevent_data)); +} + static void tegra241_vintf0_handle_error(struct tegra241_vintf *vintf) { int i; @@ -340,6 +354,14 @@ static irqreturn_t tegra241_cmdqv_isr(int irq, void *devid) vintf_map &= ~BIT_ULL(0); } + /* Handle other user VINTFs and their LVCMDQs */ + while (vintf_map) { + unsigned long idx = __ffs64(vintf_map); + + tegra241_vintf_user_handle_error(cmdqv->vintfs[idx]); + vintf_map &= ~BIT_ULL(idx); + } + return IRQ_HANDLED; } diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 2fecea1973bc..554aacf89ea7 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -1146,10 +1146,12 @@ struct iommufd_vevent_header { * enum iommu_veventq_type - Virtual Event Queue Type * @IOMMU_VEVENTQ_TYPE_DEFAULT: Reserved for future use * @IOMMU_VEVENTQ_TYPE_ARM_SMMUV3: ARM SMMUv3 Virtual Event Queue + * @IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV Extension IRQ */ enum iommu_veventq_type { IOMMU_VEVENTQ_TYPE_DEFAULT = 0, IOMMU_VEVENTQ_TYPE_ARM_SMMUV3 = 1, + IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV = 2, }; /** @@ -1173,6 +1175,19 @@ struct iommu_vevent_arm_smmuv3 { __aligned_le64 evt[4]; }; +/** + * struct iommu_vevent_tegra241_cmdqv - Tegra241 CMDQV IRQ + * (IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV) + * @lvcmdq_err_map: 128-bit logical vcmdq error map, little-endian. + * (Refer to register LVCMDQ_ERR_MAPs per VINTF) + * + * The 128-bit register value from HW exclusively reflects the error bits for a + * Virtual Interface represented by a vIOMMU object. Read and report directly.
+ */ +struct iommu_vevent_tegra241_cmdqv { + __aligned_le64 lvcmdq_err_map[2]; +}; + /** * struct iommu_veventq_alloc - ioctl(IOMMU_VEVENTQ_ALLOC) * @size: sizeof(struct iommu_veventq_alloc) -- cgit v1.2.3 From c3ff7f06c7876bc292cac1c7d4df3d0bfd74f3b7 Mon Sep 17 00:00:00 2001 From: I Viswanath Date: Wed, 9 Jul 2025 20:37:18 +0530 Subject: i2c: Clarify behavior of I2C_M_RD flag Update the description of I2C_M_RD to clarify that not setting it signals a write transaction. Signed-off-by: I Viswanath Signed-off-by: Wolfram Sang --- include/uapi/linux/i2c.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/i2c.h b/include/uapi/linux/i2c.h index 92326ebde350..a2db2a56c8b0 100644 --- a/include/uapi/linux/i2c.h +++ b/include/uapi/linux/i2c.h @@ -21,7 +21,8 @@ * * @flags: * Supported by all adapters: - * %I2C_M_RD: read data (from slave to master). Guaranteed to be 0x0001! + * %I2C_M_RD: read data (from slave to master). Guaranteed to be 0x0001! If + * not set, the transaction is interpreted as a write. * * Optional: * %I2C_M_DMA_SAFE: the buffer of this message is DMA safe. Makes only sense -- cgit v1.2.3 From 36a686c0784fcccdaa4f38b498a9ef0d42ea7cb8 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 10 Jul 2025 18:43:42 +0200 Subject: Revert "netfilter: nf_tables: Add notifications for hook changes" This reverts commit 465b9ee0ee7bc268d7f261356afd6c4262e48d82. Such notifications fit better into core or nfnetlink_hook code, following the NFNL_MSG_HOOK_GET message format. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 --- include/uapi/linux/netfilter/nf_tables.h | 10 ------ include/uapi/linux/netfilter/nfnetlink.h | 2 -- net/netfilter/nf_tables_api.c | 59 -------------------------------- net/netfilter/nfnetlink.c | 1 - net/netfilter/nft_chain_filter.c | 2 -- 6 files changed, 79 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index e4d8e451e935..5e49619ae49c 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1142,11 +1142,6 @@ int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); -struct nft_hook; -void nf_tables_chain_device_notify(const struct nft_chain *chain, - const struct nft_hook *hook, - const struct net_device *dev, int event); - enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, NFT_CHAIN_T_ROUTE, diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 518ba144544c..2beb30be2c5f 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -142,8 +142,6 @@ enum nf_tables_msg_types { NFT_MSG_DESTROYOBJ, NFT_MSG_DESTROYFLOWTABLE, NFT_MSG_GETSETELEM_RESET, - NFT_MSG_NEWDEV, - NFT_MSG_DELDEV, NFT_MSG_MAX, }; @@ -1786,18 +1784,10 @@ enum nft_synproxy_attributes { * enum nft_device_attributes - nf_tables device netlink attributes * * @NFTA_DEVICE_NAME: name of this device (NLA_STRING) - * @NFTA_DEVICE_TABLE: table containing the flowtable or chain hooking into the device (NLA_STRING) - * @NFTA_DEVICE_FLOWTABLE: flowtable hooking into the device (NLA_STRING) - * @NFTA_DEVICE_CHAIN: chain hooking into the device (NLA_STRING) - * @NFTA_DEVICE_SPEC: hook spec matching the device (NLA_STRING)
*/ enum nft_devices_attributes { NFTA_DEVICE_UNSPEC, NFTA_DEVICE_NAME, - NFTA_DEVICE_TABLE, - NFTA_DEVICE_FLOWTABLE, - NFTA_DEVICE_CHAIN, - NFTA_DEVICE_SPEC, __NFTA_DEVICE_MAX }; #define NFTA_DEVICE_MAX (__NFTA_DEVICE_MAX - 1) diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h index 50d807af2649..6cd58cd2a6f0 100644 --- a/include/uapi/linux/netfilter/nfnetlink.h +++ b/include/uapi/linux/netfilter/nfnetlink.h @@ -25,8 +25,6 @@ enum nfnetlink_groups { #define NFNLGRP_ACCT_QUOTA NFNLGRP_ACCT_QUOTA NFNLGRP_NFTRACE, #define NFNLGRP_NFTRACE NFNLGRP_NFTRACE - NFNLGRP_NFT_DEV, -#define NFNLGRP_NFT_DEV NFNLGRP_NFT_DEV __NFNLGRP_MAX, }; #define NFNLGRP_MAX (__NFNLGRP_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 24c71ecb2179..a7240736f98e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9686,64 +9686,6 @@ struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook, } EXPORT_SYMBOL_GPL(nft_hook_find_ops_rcu); -static void -nf_tables_device_notify(const struct nft_table *table, int attr, - const char *name, const struct nft_hook *hook, - const struct net_device *dev, int event) -{ - struct net *net = dev_net(dev); - struct nlmsghdr *nlh; - struct sk_buff *skb; - u16 flags = 0; - - if (!nfnetlink_has_listeners(net, NFNLGRP_NFT_DEV)) - return; - - skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!skb) - goto err; - - event = event == NETDEV_REGISTER ? NFT_MSG_NEWDEV : NFT_MSG_DELDEV; - event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nfnl_msg_put(skb, 0, 0, event, flags, table->family, - NFNETLINK_V0, nft_base_seq(net)); - if (!nlh) - goto err; - - if (nla_put_string(skb, NFTA_DEVICE_TABLE, table->name) || - nla_put_string(skb, attr, name) || - nla_put(skb, NFTA_DEVICE_SPEC, hook->ifnamelen, hook->ifname) || - nla_put_string(skb, NFTA_DEVICE_NAME, dev->name)) - goto err; - - nlmsg_end(skb, nlh); - nfnetlink_send(skb, net, 0, NFNLGRP_NFT_DEV, - nlmsg_report(nlh), GFP_KERNEL); - return; -err: - if (skb) - kfree_skb(skb); - nfnetlink_set_err(net, 0, NFNLGRP_NFT_DEV, -ENOBUFS); -} - -void -nf_tables_chain_device_notify(const struct nft_chain *chain, - const struct nft_hook *hook, - const struct net_device *dev, int event) -{ - nf_tables_device_notify(chain->table, NFTA_DEVICE_CHAIN, - chain->name, hook, dev, event); -} - -static void -nf_tables_flowtable_device_notify(const struct nft_flowtable *ft, - const struct nft_hook *hook, - const struct net_device *dev, int event) -{ - nf_tables_device_notify(ft->table, NFTA_DEVICE_FLOWTABLE, - ft->name, hook, dev, event); -} - static int nft_flowtable_event(unsigned long event, struct net_device *dev, struct nft_flowtable *flowtable, bool changename) { @@ -9791,7 +9733,6 @@ static int nft_flowtable_event(unsigned long event, struct net_device *dev, list_add_tail_rcu(&ops->list, &hook->ops_list); break; } - nf_tables_flowtable_device_notify(flowtable, hook, dev, event); break; } return 0; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index ac77fc21632d..e598a2a252b0 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -86,7 +86,6 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = { [NFNLGRP_NFTABLES] = NFNL_SUBSYS_NFTABLES, [NFNLGRP_ACCT_QUOTA] = NFNL_SUBSYS_ACCT, [NFNLGRP_NFTRACE] = NFNL_SUBSYS_NFTABLES, - [NFNLGRP_NFT_DEV] = NFNL_SUBSYS_NFTABLES, }; static struct nfnl_net *nfnl_pernet(struct net *net) diff --git a/net/netfilter/nft_chain_filter.c 
b/net/netfilter/nft_chain_filter.c index 846d48ba8965..b16185e9a6dd 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -363,8 +363,6 @@ static int nft_netdev_event(unsigned long event, struct net_device *dev, list_add_tail_rcu(&ops->list, &hook->ops_list); break; } - nf_tables_chain_device_notify(&basechain->chain, - hook, dev, event); break; } return 0; -- cgit v1.2.3 From 5a8f77e24a30bbce2fa57926f3dede84894fd10a Mon Sep 17 00:00:00 2001 From: Michał Winiarski Date: Wed, 2 Jul 2025 11:35:18 +0200 Subject: PCI/IOV: Restore VF resizable BAR state after reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similar to regular resizable BARs, VF BARs can also be resized, e.g. by the system firmware or the PCI subsystem itself. The capability layout is the same as PCI_EXT_CAP_ID_REBAR. Add the capability ID and restore it as a part of IOV state. See PCIe r6.2, sec 7.8.7. Signed-off-by: Michał Winiarski Signed-off-by: Bjorn Helgaas Reviewed-by: Ilpo Järvinen Reviewed-by: Christian König Link: https://patch.msgid.link/20250702093522.518099-2-michal.winiarski@intel.com --- drivers/pci/iov.c | 30 +++++++++++++++++++++++++++++- drivers/pci/pci.h | 12 ++++++++++++ include/uapi/linux/pci_regs.h | 9 +++++++++ 3 files changed, 50 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 10693b5d7eb6..10ccef8afe14 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -7,6 +7,7 @@ * Copyright (C) 2009 Intel Corporation, Yu Zhao */ +#include #include #include #include @@ -850,6 +851,7 @@ found: pci_read_config_byte(dev, pos + PCI_SRIOV_FUNC_LINK, &iov->link); if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END) iov->link = PCI_DEVFN(PCI_SLOT(dev->devfn), iov->link); + iov->vf_rebar_cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_VF_REBAR); if (pdev) iov->dev = pci_dev_get(pdev); @@ -888,6 +890,30 @@ static void sriov_release(struct pci_dev *dev) dev->sriov = NULL; } +static void sriov_restore_vf_rebar_state(struct pci_dev *dev) +{ + unsigned int pos, nbars, i; + u32 ctrl; + + pos = pci_iov_vf_rebar_cap(dev); + if (!pos) + return; + + pci_read_config_dword(dev, pos + PCI_VF_REBAR_CTRL, &ctrl); + nbars = FIELD_GET(PCI_VF_REBAR_CTRL_NBAR_MASK, ctrl); + + for (i = 0; i < nbars; i++, pos += 8) { + int bar_idx, size; + + pci_read_config_dword(dev, pos + PCI_VF_REBAR_CTRL, &ctrl); + bar_idx = FIELD_GET(PCI_VF_REBAR_CTRL_BAR_IDX, ctrl); + size = pci_rebar_bytes_to_size(dev->sriov->barsz[bar_idx]); + ctrl &= ~PCI_VF_REBAR_CTRL_BAR_SIZE; + ctrl |= FIELD_PREP(PCI_VF_REBAR_CTRL_BAR_SIZE, size); + pci_write_config_dword(dev, pos + PCI_VF_REBAR_CTRL, ctrl); + } +} + static void sriov_restore_state(struct pci_dev *dev) { int i; @@ -1047,8 +1073,10 @@ resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno) */ void pci_restore_iov_state(struct pci_dev *dev) { - if (dev->is_physfn) + if (dev->is_physfn) { + sriov_restore_vf_rebar_state(dev); sriov_restore_state(dev); + } } /** diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 12215ee72afb..69258c445b28 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -486,6 +486,7 @@ struct pci_sriov { u16 subsystem_vendor; /* VF subsystem vendor */ u16 subsystem_device; /* VF subsystem device */ resource_size_t barsz[PCI_SRIOV_NUM_BARS]; /* VF BAR size */ + u16 vf_rebar_cap; /* VF Resizable BAR capability offset */ bool drivers_autoprobe; /* Auto probing of VFs by driver */ }; @@ -710,6 +711,13 @@ void 
pci_iov_update_resource(struct pci_dev *dev, int resno); resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno); void pci_restore_iov_state(struct pci_dev *dev); int pci_iov_bus_range(struct pci_bus *bus); +static inline u16 pci_iov_vf_rebar_cap(struct pci_dev *dev) +{ + if (!dev->is_physfn) + return 0; + + return dev->sriov->vf_rebar_cap; +} static inline bool pci_resource_is_iov(int resno) { return resno >= PCI_IOV_RESOURCES && resno <= PCI_IOV_RESOURCE_END; @@ -734,6 +742,10 @@ static inline int pci_iov_bus_range(struct pci_bus *bus) { return 0; } +static inline u16 pci_iov_vf_rebar_cap(struct pci_dev *dev) +{ + return 0; +} static inline bool pci_resource_is_iov(int resno) { return false; diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index a3a3e942dedf..f5b17745de60 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -745,6 +745,7 @@ #define PCI_EXT_CAP_ID_L1SS 0x1E /* L1 PM Substates */ #define PCI_EXT_CAP_ID_PTM 0x1F /* Precision Time Measurement */ #define PCI_EXT_CAP_ID_DVSEC 0x23 /* Designated Vendor-Specific */ +#define PCI_EXT_CAP_ID_VF_REBAR 0x24 /* VF Resizable BAR */ #define PCI_EXT_CAP_ID_DLF 0x25 /* Data Link Feature */ #define PCI_EXT_CAP_ID_PL_16GT 0x26 /* Physical Layer 16.0 GT/s */ #define PCI_EXT_CAP_ID_NPEM 0x29 /* Native PCIe Enclosure Management */ @@ -1141,6 +1142,14 @@ #define PCI_DVSEC_HEADER2 0x8 /* Designated Vendor-Specific Header2 */ #define PCI_DVSEC_HEADER2_ID(x) ((x) & 0xffff) +/* VF Resizable BARs, same layout as PCI_REBAR */ +#define PCI_VF_REBAR_CAP PCI_REBAR_CAP +#define PCI_VF_REBAR_CAP_SIZES PCI_REBAR_CAP_SIZES +#define PCI_VF_REBAR_CTRL PCI_REBAR_CTRL +#define PCI_VF_REBAR_CTRL_BAR_IDX PCI_REBAR_CTRL_BAR_IDX +#define PCI_VF_REBAR_CTRL_NBAR_MASK PCI_REBAR_CTRL_NBAR_MASK +#define PCI_VF_REBAR_CTRL_BAR_SIZE PCI_REBAR_CTRL_BAR_SIZE + /* Data Link Feature */ #define PCI_DLF_CAP 0x04 /* Capabilities Register */ #define PCI_DLF_EXCHANGE_ENABLE 0x80000000 /* Data Link Feature Exchange Enable */ -- cgit v1.2.3 From 2677010e7793451c20d895c477c4dc76f6e6a10e Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Thu, 10 Jul 2025 21:12:03 +0000 Subject: Add support to set NAPI threaded for individual NAPI A net device has a threaded sysctl that can be used to enable threaded NAPI polling on all of the NAPI contexts under that device. Allow enabling threaded NAPI polling at individual NAPI level using netlink. Extend the netlink operation `napi-set` and allow setting the threaded attribute of a NAPI. This will enable the threaded polling on a NAPI context. Add a test in `nl_netdev.py` that verifies various cases of threaded NAPI being set at NAPI and at device level. 
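For reference, a user space application could flip the new attribute with the YNL C helpers along these lines (a sketch only; helper names assume the standard YNL code generator output for the netdev family):

	#include <ynl.h>
	#include "netdev-user.h"	/* generated from netdev.yaml */

	/* Enable or disable threaded polling for one NAPI instance */
	static int set_napi_threaded(struct ynl_sock *ys, __u32 napi_id, bool on)
	{
		struct netdev_napi_set_req *req;
		int ret;

		req = netdev_napi_set_req_alloc();
		if (!req)
			return -1;
		netdev_napi_set_req_set_id(req, napi_id);
		netdev_napi_set_req_set_threaded(req, on ? 1 : 0);
		ret = netdev_napi_set(ys, req);	/* 0 on success */
		netdev_napi_set_req_free(req);
		return ret;
	}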
Tested ./tools/testing/selftests/net/nl_netdev.py TAP version 13 1..7 ok 1 nl_netdev.empty_check ok 2 nl_netdev.lo_check ok 3 nl_netdev.page_pool_check ok 4 nl_netdev.napi_list_check ok 5 nl_netdev.dev_set_threaded ok 6 nl_netdev.napi_set_threaded ok 7 nl_netdev.nsim_rxq_reset_down # Totals: pass:7 fail:0 xfail:0 xpass:0 skip:0 error:0 Signed-off-by: Samiullah Khawaja Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250710211203.3979655-1-skhawaja@google.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 10 ++++ Documentation/networking/napi.rst | 9 +++- include/linux/netdevice.h | 1 + include/uapi/linux/netdev.h | 1 + net/core/dev.c | 30 +++++++++-- net/core/dev.h | 7 +++ net/core/netdev-genl-gen.c | 5 +- net/core/netdev-genl.c | 14 +++++ tools/include/uapi/linux/netdev.h | 1 + tools/testing/selftests/net/nl_netdev.py | 91 +++++++++++++++++++++++++++++++- 10 files changed, 162 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index ce4cfec82100..85d0ea6ac426 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -283,6 +283,14 @@ attribute-sets: doc: The timeout, in nanoseconds, of how long to suspend irq processing, if event polling finds events type: uint + - + name: threaded + doc: Whether the NAPI is configured to operate in threaded polling + mode. If this is set to 1 then the NAPI context operates in + threaded polling mode. + type: uint + checks: + max: 1 - name: xsk-info attributes: [] @@ -694,6 +702,7 @@ operations: - defer-hard-irqs - gro-flush-timeout - irq-suspend-timeout + - threaded dump: request: attributes: @@ -746,6 +755,7 @@ operations: - defer-hard-irqs - gro-flush-timeout - irq-suspend-timeout + - threaded - name: bind-tx doc: Bind dmabuf to netdev for TX diff --git a/Documentation/networking/napi.rst b/Documentation/networking/napi.rst index d0e3953cae6a..a15754adb041 100644 --- a/Documentation/networking/napi.rst +++ b/Documentation/networking/napi.rst @@ -444,7 +444,14 @@ dependent). The NAPI instance IDs will be assigned in the opposite order than the process IDs of the kernel threads. Threaded NAPI is controlled by writing 0/1 to the ``threaded`` file in -netdev's sysfs directory. +netdev's sysfs directory. It can also be enabled for a specific NAPI using +the netlink interface. + +For example, using the script: + +.. code-block:: bash + + $ ynl --family netdev --do napi-set --json='{"id": 66, "threaded": 1}' ..
rubric:: Footnotes diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ec23cee5245d..e49d8c98d284 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -369,6 +369,7 @@ struct napi_config { u64 irq_suspend_timeout; u32 defer_hard_irqs; cpumask_t affinity_mask; + bool threaded; unsigned int napi_id; }; diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 7eb9571786b8..1f3719a9a0eb 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -134,6 +134,7 @@ enum { NETDEV_A_NAPI_DEFER_HARD_IRQS, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, + NETDEV_A_NAPI_THREADED, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) diff --git a/net/core/dev.c b/net/core/dev.c index 19ddc3e6990a..621a639aeba1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6961,6 +6961,31 @@ static void napi_stop_kthread(struct napi_struct *napi) napi->thread = NULL; } +int napi_set_threaded(struct napi_struct *napi, bool threaded) +{ + if (threaded) { + if (!napi->thread) { + int err = napi_kthread_create(napi); + + if (err) + return err; + } + } + + if (napi->config) + napi->config->threaded = threaded; + + if (!threaded && napi->thread) { + napi_stop_kthread(napi); + } else { + /* Make sure kthread is created before THREADED bit is set. */ + smp_mb__before_atomic(); + assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); + } + + return 0; +} + int dev_set_threaded(struct net_device *dev, bool threaded) { struct napi_struct *napi; @@ -6968,9 +6993,6 @@ int dev_set_threaded(struct net_device *dev, bool threaded) netdev_assert_locked_or_invisible(dev); - if (dev->threaded == threaded) - return 0; - if (threaded) { list_for_each_entry(napi, &dev->napi_list, dev_list) { if (!napi->thread) { @@ -7221,6 +7243,8 @@ static void napi_restore_config(struct napi_struct *n) napi_hash_add(n); n->config->napi_id = n->napi_id; } + + WARN_ON_ONCE(napi_set_threaded(n, n->config->threaded)); } static void napi_save_config(struct napi_struct *n) diff --git a/net/core/dev.h b/net/core/dev.h index e93f36b7ddf3..a603387fb566 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -315,6 +315,13 @@ static inline void napi_set_irq_suspend_timeout(struct napi_struct *n, WRITE_ONCE(n->irq_suspend_timeout, timeout); } +static inline bool napi_get_threaded(struct napi_struct *n) +{ + return test_bit(NAPI_STATE_THREADED, &n->state); +} + +int napi_set_threaded(struct napi_struct *n, bool threaded); + int rps_cpumask_housekeeping(struct cpumask *mask); #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 4fc44587f493..0994bd68a7e6 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -92,11 +92,12 @@ static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] }; /* NETDEV_CMD_NAPI_SET - do */ -static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT + 1] = { +static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_THREADED + 1] = { [NETDEV_A_NAPI_ID] = { .type = NLA_U32, }, [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range), [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, }, [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, }, + [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_UINT, 1), }; /* NETDEV_CMD_BIND_TX - do */ @@ -193,7 +194,7 @@ static const struct genl_split_ops netdev_nl_ops[] = { 
.cmd = NETDEV_CMD_NAPI_SET, .doit = netdev_nl_napi_set_doit, .policy = netdev_napi_set_nl_policy, - .maxattr = NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, + .maxattr = NETDEV_A_NAPI_THREADED, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 2afa7b2141aa..5875df372415 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -184,6 +184,10 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, if (napi->irq >= 0 && nla_put_u32(rsp, NETDEV_A_NAPI_IRQ, napi->irq)) goto nla_put_failure; + if (nla_put_uint(rsp, NETDEV_A_NAPI_THREADED, + napi_get_threaded(napi))) + goto nla_put_failure; + if (napi->thread) { pid = task_pid_nr(napi->thread); if (nla_put_u32(rsp, NETDEV_A_NAPI_PID, pid)) @@ -322,8 +326,18 @@ netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) { u64 irq_suspend_timeout = 0; u64 gro_flush_timeout = 0; + u8 threaded = 0; u32 defer = 0; + if (info->attrs[NETDEV_A_NAPI_THREADED]) { + int ret; + + threaded = nla_get_uint(info->attrs[NETDEV_A_NAPI_THREADED]); + ret = napi_set_threaded(napi, !!threaded); + if (ret) + return ret; + } + if (info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]) { defer = nla_get_u32(info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]); napi_set_defer_hard_irqs(napi, defer); diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 7eb9571786b8..1f3719a9a0eb 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -134,6 +134,7 @@ enum { NETDEV_A_NAPI_DEFER_HARD_IRQS, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, + NETDEV_A_NAPI_THREADED, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) diff --git a/tools/testing/selftests/net/nl_netdev.py b/tools/testing/selftests/net/nl_netdev.py index c9109627a741..c8ffade79a52 100755 --- a/tools/testing/selftests/net/nl_netdev.py +++ b/tools/testing/selftests/net/nl_netdev.py @@ -35,6 +35,91 @@ def napi_list_check(nf) -> None: ksft_eq(len(napis), 100, comment=f"queue count after reset queue {q} mode {i}") +def napi_set_threaded(nf) -> None: + """ + Test that verifies various cases of napi threaded + set and unset at napi and device level. 
+ """ + with NetdevSimDev(queue_count=2) as nsimdev: + nsim = nsimdev.nsims[0] + + ip(f"link set dev {nsim.ifname} up") + + napis = nf.napi_get({'ifindex': nsim.ifindex}, dump=True) + ksft_eq(len(napis), 2) + + napi0_id = napis[0]['id'] + napi1_id = napis[1]['id'] + + # set napi threaded and verify + nf.napi_set({'id': napi0_id, 'threaded': 1}) + napi0 = nf.napi_get({'id': napi0_id}) + ksft_eq(napi0['threaded'], 1) + ksft_ne(napi0.get('pid'), None) + + # check it is not set for napi1 + napi1 = nf.napi_get({'id': napi1_id}) + ksft_eq(napi1['threaded'], 0) + ksft_eq(napi1.get('pid'), None) + + ip(f"link set dev {nsim.ifname} down") + ip(f"link set dev {nsim.ifname} up") + + # verify if napi threaded is still set + napi0 = nf.napi_get({'id': napi0_id}) + ksft_eq(napi0['threaded'], 1) + ksft_ne(napi0.get('pid'), None) + + # check it is still not set for napi1 + napi1 = nf.napi_get({'id': napi1_id}) + ksft_eq(napi1['threaded'], 0) + ksft_eq(napi1.get('pid'), None) + + # unset napi threaded and verify + nf.napi_set({'id': napi0_id, 'threaded': 0}) + napi0 = nf.napi_get({'id': napi0_id}) + ksft_eq(napi0['threaded'], 0) + ksft_eq(napi0.get('pid'), None) + + # set threaded at device level + system(f"echo 1 > /sys/class/net/{nsim.ifname}/threaded") + + # check napi threaded is set for both napis + napi0 = nf.napi_get({'id': napi0_id}) + ksft_eq(napi0['threaded'], 1) + ksft_ne(napi0.get('pid'), None) + napi1 = nf.napi_get({'id': napi1_id}) + ksft_eq(napi1['threaded'], 1) + ksft_ne(napi1.get('pid'), None) + + # unset threaded at device level + system(f"echo 0 > /sys/class/net/{nsim.ifname}/threaded") + + # check napi threaded is unset for both napis + napi0 = nf.napi_get({'id': napi0_id}) + ksft_eq(napi0['threaded'], 0) + ksft_eq(napi0.get('pid'), None) + napi1 = nf.napi_get({'id': napi1_id}) + ksft_eq(napi1['threaded'], 0) + ksft_eq(napi1.get('pid'), None) + + # set napi threaded for napi0 + nf.napi_set({'id': napi0_id, 'threaded': 1}) + napi0 = nf.napi_get({'id': napi0_id}) + ksft_eq(napi0['threaded'], 1) + ksft_ne(napi0.get('pid'), None) + + # unset threaded at device level + system(f"echo 0 > /sys/class/net/{nsim.ifname}/threaded") + + # check napi threaded is unset for both napis + napi0 = nf.napi_get({'id': napi0_id}) + ksft_eq(napi0['threaded'], 0) + ksft_eq(napi0.get('pid'), None) + napi1 = nf.napi_get({'id': napi1_id}) + ksft_eq(napi1['threaded'], 0) + ksft_eq(napi1.get('pid'), None) + def dev_set_threaded(nf) -> None: """ Test that verifies various cases of napi threaded @@ -56,8 +141,10 @@ def dev_set_threaded(nf) -> None: # check napi threaded is set for both napis napi0 = nf.napi_get({'id': napi0_id}) + ksft_eq(napi0['threaded'], 1) ksft_ne(napi0.get('pid'), None) napi1 = nf.napi_get({'id': napi1_id}) + ksft_eq(napi1['threaded'], 1) ksft_ne(napi1.get('pid'), None) # unset threaded @@ -65,8 +152,10 @@ def dev_set_threaded(nf) -> None: # check napi threaded is unset for both napis napi0 = nf.napi_get({'id': napi0_id}) + ksft_eq(napi0['threaded'], 0) ksft_eq(napi0.get('pid'), None) napi1 = nf.napi_get({'id': napi1_id}) + ksft_eq(napi1['threaded'], 0) ksft_eq(napi1.get('pid'), None) def nsim_rxq_reset_down(nf) -> None: @@ -156,7 +245,7 @@ def page_pool_check(nf) -> None: def main() -> None: nf = NetdevFamily() ksft_run([empty_check, lo_check, page_pool_check, napi_list_check, - dev_set_threaded, nsim_rxq_reset_down], + dev_set_threaded, napi_set_threaded, nsim_rxq_reset_down], args=(nf, )) ksft_exit() -- cgit v1.2.3 From 6c758062c64dfbd61862801fbde4e0702f4f3a23 Mon Sep 17 00:00:00 2001 From: 
Eric Dumazet Date: Fri, 11 Jul 2025 11:40:00 +0000 Subject: tcp: add LINUX_MIB_BEYOND_WINDOW Add a new SNMP MIB: LINUX_MIB_BEYOND_WINDOW Incremented when an incoming packet is received beyond the receiver window. nstat -az | grep TcpExtBeyondWindow Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250711114006.480026-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- Documentation/networking/net_cachelines/snmp.rst | 1 + include/net/dropreason-core.h | 1 + include/uapi/linux/snmp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp_input.c | 1 + 5 files changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/net_cachelines/snmp.rst b/Documentation/networking/net_cachelines/snmp.rst index bd44b3eebbef..bce4eb35ec48 100644 --- a/Documentation/networking/net_cachelines/snmp.rst +++ b/Documentation/networking/net_cachelines/snmp.rst @@ -36,6 +36,7 @@ unsigned_long LINUX_MIB_TIMEWAITRECYCLED unsigned_long LINUX_MIB_TIMEWAITKILLED unsigned_long LINUX_MIB_PAWSACTIVEREJECTED unsigned_long LINUX_MIB_PAWSESTABREJECTED +unsigned_long LINUX_MIB_BEYOND_WINDOW unsigned_long LINUX_MIB_TSECR_REJECTED unsigned_long LINUX_MIB_PAWS_OLD_ACK unsigned_long LINUX_MIB_PAWS_TW_REJECTED diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index beb134d55747..229bb1826f2a 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -309,6 +309,7 @@ enum skb_drop_reason { /** * @SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE: * Not acceptable END_SEQ field. + * Corresponds to LINUX_MIB_BEYOND_WINDOW. */ SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE, /** diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 1d234d7e1892..49f5640092a0 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -186,6 +186,7 @@ enum LINUX_MIB_TIMEWAITKILLED, /* TimeWaitKilled */ LINUX_MIB_PAWSACTIVEREJECTED, /* PAWSActiveRejected */ LINUX_MIB_PAWSESTABREJECTED, /* PAWSEstabRejected */ + LINUX_MIB_BEYOND_WINDOW, /* BeyondWindow */ LINUX_MIB_TSECRREJECTED, /* TSEcrRejected */ LINUX_MIB_PAWS_OLD_ACK, /* PAWSOldAck */ LINUX_MIB_PAWS_TW_REJECTED, /* PAWSTimewait */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index ea2f01584379..65b0d0ab0084 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -189,6 +189,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED), SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED), SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED), + SNMP_MIB_ITEM("BeyondWindow", LINUX_MIB_BEYOND_WINDOW), SNMP_MIB_ITEM("TSEcrRejected", LINUX_MIB_TSECRREJECTED), SNMP_MIB_ITEM("PAWSOldAck", LINUX_MIB_PAWS_OLD_ACK), SNMP_MIB_ITEM("PAWSTimewait", LINUX_MIB_PAWS_TW_REJECTED), diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f0f9c78654b4..5e2d82c273e2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5900,6 +5900,7 @@ step1: if (!th->rst) { if (th->syn) goto syn_challenge; + NET_INC_STATS(sock_net(sk), LINUX_MIB_BEYOND_WINDOW); if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSEQ, &tp->last_oow_ack_time)) -- cgit v1.2.3 From 19d18fdfc79217c86802271c9ce5b4ed174628cc Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Wed, 16 Jul 2025 21:46:53 +0800 Subject: bpf: Add struct bpf_token_info Commit 35f96de04127 ("bpf: Introduce BPF token object") added BPF token as a new kind of BPF kernel object.
And BPF_OBJ_GET_INFO_BY_FD is already used to get BPF object info, so we can also get token info with this cmd. One usage scenario: when a program fails to run with a token because of a permission failure, we can report what the BPF token allows with this API for debugging. Acked-by: Andrii Nakryiko Signed-off-by: Tao Chen Link: https://lore.kernel.org/r/20250716134654.1162635-1-chen.dylane@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 +++++++++++ include/uapi/linux/bpf.h | 8 ++++++++ kernel/bpf/syscall.c | 18 ++++++++++++++++++ kernel/bpf/token.c | 25 ++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 8 ++++++++ 5 files changed, 69 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bc887831eaa5..f9cd2164ed23 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2354,6 +2354,7 @@ extern const struct super_operations bpf_super_ops; extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; +extern const struct file_operations bpf_token_fops; #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ extern const struct bpf_prog_ops _name ## _prog_ops; \ @@ -2551,6 +2552,9 @@ void bpf_token_inc(struct bpf_token *token); void bpf_token_put(struct bpf_token *token); int bpf_token_create(union bpf_attr *attr); struct bpf_token *bpf_token_get_from_fd(u32 ufd); +int bpf_token_get_info_by_fd(struct bpf_token *token, + const union bpf_attr *attr, + union bpf_attr __user *uattr); bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd); bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type); @@ -2949,6 +2953,13 @@ static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd) return ERR_PTR(-EOPNOTSUPP); } +static inline int bpf_token_get_info_by_fd(struct bpf_token *token, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EOPNOTSUPP; +} + static inline void __dev_flush(struct list_head *flush_list) { } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0670e15a6100..233de8677382 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -450,6 +450,7 @@ union bpf_iter_link_info { * * **struct bpf_map_info** * * **struct bpf_btf_info** * * **struct bpf_link_info** + * * **struct bpf_token_info** * * Return * Returns zero on success. On error, -1 is returned and *errno* @@ -6803,6 +6804,13 @@ struct bpf_link_info { }; } __attribute__((aligned(8))); +struct bpf_token_info { + __u64 allowed_cmds; + __u64 allowed_maps; + __u64 allowed_progs; + __u64 allowed_attachs; +} __attribute__((aligned(8))); + /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on * attach type).
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1a26d17536be..e63039817af3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5239,6 +5239,21 @@ static int bpf_link_get_info_by_fd(struct file *file, } +static int token_get_info_by_fd(struct file *file, + struct bpf_token *token, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info); + u32 info_len = attr->info.info_len; + int err; + + err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len); + if (err) + return err; + return bpf_token_get_info_by_fd(token, attr, uattr); +} + #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, @@ -5262,6 +5277,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, else if (fd_file(f)->f_op == &bpf_link_fops || fd_file(f)->f_op == &bpf_link_fops_poll) return bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); + else if (fd_file(f)->f_op == &bpf_token_fops) + return token_get_info_by_fd(fd_file(f), fd_file(f)->private_data, + attr, uattr); return -EINVAL; } diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index 26057aa13503..0bbe412f854e 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -103,7 +103,7 @@ static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp) static const struct inode_operations bpf_token_iops = { }; -static const struct file_operations bpf_token_fops = { +const struct file_operations bpf_token_fops = { .release = bpf_token_release, .show_fdinfo = bpf_token_show_fdinfo, }; @@ -210,6 +210,29 @@ out_file: return err; } +int bpf_token_get_info_by_fd(struct bpf_token *token, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_token_info info; + u32 info_len = attr->info.info_len; + + info_len = min_t(u32, info_len, sizeof(info)); + memset(&info, 0, sizeof(info)); + + info.allowed_cmds = token->allowed_cmds; + info.allowed_maps = token->allowed_maps; + info.allowed_progs = token->allowed_progs; + info.allowed_attachs = token->allowed_attachs; + + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + struct bpf_token *bpf_token_get_from_fd(u32 ufd) { CLASS(fd, f)(ufd); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0670e15a6100..233de8677382 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -450,6 +450,7 @@ union bpf_iter_link_info { * * **struct bpf_map_info** * * **struct bpf_btf_info** * * **struct bpf_link_info** + * * **struct bpf_token_info** * * Return * Returns zero on success. On error, -1 is returned and *errno* @@ -6803,6 +6804,13 @@ struct bpf_link_info { }; } __attribute__((aligned(8))); +struct bpf_token_info { + __u64 allowed_cmds; + __u64 allowed_maps; + __u64 allowed_progs; + __u64 allowed_attachs; +} __attribute__((aligned(8))); + /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed * by user and intended to be used by socket (e.g. to bind to, depends on * attach type). 
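For reference, user space can retrieve the new info along these lines (an illustrative sketch; assumes a token fd obtained via BPF_TOKEN_CREATE):

	#include <linux/bpf.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Query which cmds/maps/progs/attach types a BPF token allows */
	static int bpf_token_query(int token_fd, struct bpf_token_info *info)
	{
		union bpf_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.info.bpf_fd = token_fd;
		attr.info.info_len = sizeof(*info);
		attr.info.info = (__u64)(unsigned long)info;

		return syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
	}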
-- cgit v1.2.3 From c0ae03588bbb95378758fe80e7436a9b4cfc71f6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 15 Jul 2025 17:03:21 -0700 Subject: ethtool: rss: initial RSS_SET (indirection table handling) Add initial support for RSS_SET, for now only operations on the indirection table are supported. Unlike the ioctl, don't check whether at least one parameter is being changed. This is how other ethtool-nl ops behave, so pick ethtool-nl consistency over copying the ioctl behavior. There are two special cases here: 1) resetting the table to defaults; 2) support for tables of different size. For (1) I use an empty Netlink attribute (array of size 0). (2) may require some background. AFAICT a lot of modern devices allow allocating RSS tables of different sizes. mlx5 can upsize its tables, bnxt has some "table size calculation", and Intel folks asked about RSS table sizing in the context of resource allocation in the past. The ethtool IOCTL API has a concept of table size, but right now the user is expected to provide a table exactly the size the device requests. Some drivers may change the table size at runtime (in response to queue count changes) but the user is not in control of this. What's not great is that all RSS contexts share the same table size. For example, a device with 128 queues enabled and 16 RSS contexts with 8 queues each will likely have 256-entry tables for each of the 16 contexts, while 32 would be more than enough given each context only has 8 queues. To address this, the Netlink API should avoid enforcing table size at the uAPI level, and should allow the user to express the min table size they expect. To fully solve (2) we will need more driver plumbing but at the uAPI level this patch allows the user to specify a table size smaller than what the device advertises. The device table size must be a multiple of the user-requested table size. We then replicate the user-provided table to fill the full device size table. This addresses the "allow the user to express the min table size" objective, while not enforcing any fixed size. From the Netlink perspective .get_rxfh_indir_size() is now de facto the "max" table size supported by the device. We may choose to support table replication in ethtool, too, when we actually plumb this through the device APIs. Initially I was considering moving full pattern generation to the kernel (which queues to use, at which frequency and what min sequence length). I don't think this complexity would buy us much and most if not all devices have pow-2 table sizes, which simplifies the replication a lot. Reviewed-by: Gal Pressman Link: https://patch.msgid.link/20250716000331.1378807-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 12 ++ Documentation/networking/ethtool-netlink.rst | 26 +++- include/uapi/linux/ethtool_netlink_generated.h | 1 + net/ethtool/netlink.c | 8 + net/ethtool/netlink.h | 1 + net/ethtool/rss.c | 195 +++++++++++++++++++++++++ 6 files changed, 242 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index c38c03c624f0..1eca88a508a0 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -2643,6 +2643,18 @@ operations: attributes: - header - events + - + name: rss-set + doc: Set RSS params.
+ + attribute-set: rss + + do: + request: + attributes: + - header + - context + - indir - name: rss-ntf doc: | diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 248bc3d93da9..27db7540e60e 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -239,6 +239,7 @@ Userspace to kernel: ``ETHTOOL_MSG_PHY_GET`` get Ethernet PHY information ``ETHTOOL_MSG_TSCONFIG_GET`` get hw timestamping configuration ``ETHTOOL_MSG_TSCONFIG_SET`` set hw timestamping configuration + ``ETHTOOL_MSG_RSS_SET`` set RSS settings ===================================== ================================= Kernel to userspace: @@ -292,6 +293,7 @@ Kernel to userspace: ``ETHTOOL_MSG_TSCONFIG_GET_REPLY`` hw timestamping configuration ``ETHTOOL_MSG_TSCONFIG_SET_REPLY`` new hw timestamping configuration ``ETHTOOL_MSG_PSE_NTF`` PSE events notification + ``ETHTOOL_MSG_RSS_NTF`` RSS settings notification ======================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -1989,6 +1991,28 @@ hfunc. Current supported options are symmetric-xor and symmetric-or-xor. ETHTOOL_A_RSS_FLOW_HASH carries per-flow type bitmask of which header fields are included in the hash calculation. +RSS_SET +======= + +Request contents: + +===================================== ====== ============================== + ``ETHTOOL_A_RSS_HEADER`` nested request header + ``ETHTOOL_A_RSS_CONTEXT`` u32 context number + ``ETHTOOL_A_RSS_INDIR`` binary Indir table bytes +===================================== ====== ============================== + +``ETHTOOL_A_RSS_INDIR`` is the minimal RSS table the user expects. The kernel +and the device driver may replicate the table if it is smaller than the smallest +table size supported by the device. For example, if the user requests ``[0, 1]`` but the +device needs at least 8 entries - the real table in use will end up being +``[0, 1, 0, 1, 0, 1, 0, 1]``. Most devices require the table size to be a power +of 2, so tables whose size is not a power of 2 will likely be rejected. +Using a table of size 0 will reset the indirection table to the default. + +Note that, at present, only a subset of RSS configuration can be accomplished +over Netlink. + PLCA_GET_CFG ============ @@ -2455,7 +2479,7 @@ are netlink only.
``ETHTOOL_GRXNTUPLE`` n/a ``ETHTOOL_GSSET_INFO`` ``ETHTOOL_MSG_STRSET_GET`` ``ETHTOOL_GRXFHINDIR`` ``ETHTOOL_MSG_RSS_GET`` - ``ETHTOOL_SRXFHINDIR`` n/a + ``ETHTOOL_SRXFHINDIR`` ``ETHTOOL_MSG_RSS_SET`` ``ETHTOOL_GFEATURES`` ``ETHTOOL_MSG_FEATURES_GET`` ``ETHTOOL_SFEATURES`` ``ETHTOOL_MSG_FEATURES_SET`` ``ETHTOOL_GCHANNELS`` ``ETHTOOL_MSG_CHANNELS_GET`` diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 96027e26ffba..130bdf5c3516 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -840,6 +840,7 @@ enum { ETHTOOL_MSG_PHY_GET, ETHTOOL_MSG_TSCONFIG_GET, ETHTOOL_MSG_TSCONFIG_SET, + ETHTOOL_MSG_RSS_SET, __ETHTOOL_MSG_USER_CNT, ETHTOOL_MSG_USER_MAX = (__ETHTOOL_MSG_USER_CNT - 1) diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index b1f8999c1adc..0ae0d7a9667c 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -405,6 +405,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_PSE_GET] = ðnl_pse_request_ops, [ETHTOOL_MSG_PSE_SET] = ðnl_pse_request_ops, [ETHTOOL_MSG_RSS_GET] = ðnl_rss_request_ops, + [ETHTOOL_MSG_RSS_SET] = ðnl_rss_request_ops, [ETHTOOL_MSG_PLCA_GET_CFG] = ðnl_plca_cfg_request_ops, [ETHTOOL_MSG_PLCA_SET_CFG] = ðnl_plca_cfg_request_ops, [ETHTOOL_MSG_PLCA_GET_STATUS] = ðnl_plca_status_request_ops, @@ -1504,6 +1505,13 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_tsconfig_set_policy, .maxattr = ARRAY_SIZE(ethnl_tsconfig_set_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_RSS_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_default_set_doit, + .policy = ethnl_rss_set_policy, + .maxattr = ARRAY_SIZE(ethnl_rss_set_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 94a7eb402022..620dd1ab9b3b 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -484,6 +484,7 @@ extern const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MO extern const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1]; extern const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1]; extern const struct nla_policy ethnl_rss_get_policy[ETHTOOL_A_RSS_START_CONTEXT + 1]; +extern const struct nla_policy ethnl_rss_set_policy[ETHTOOL_A_RSS_START_CONTEXT + 1]; extern const struct nla_policy ethnl_plca_get_cfg_policy[ETHTOOL_A_PLCA_HEADER + 1]; extern const struct nla_policy ethnl_plca_set_cfg_policy[ETHTOOL_A_PLCA_MAX + 1]; extern const struct nla_policy ethnl_plca_get_status_policy[ETHTOOL_A_PLCA_HEADER + 1]; diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 41ab9fc67652..c8db523671de 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -218,6 +218,10 @@ rss_prepare(const struct rss_req_info *request, struct net_device *dev, { rss_prepare_flow_hash(request, dev, data, info); + /* Coming from RSS_SET, driver may only have flow_hash_fields ops */ + if (!dev->ethtool_ops->get_rxfh) + return 0; + if (request->rss_context) return rss_prepare_ctx(request, dev, data, info); return rss_prepare_get(request, dev, data, info); @@ -466,6 +470,193 @@ void ethtool_rss_notify(struct net_device *dev, u32 rss_context) ethnl_notify(dev, ETHTOOL_MSG_RSS_NTF, &req_info.base); } +/* RSS_SET */ + +const struct nla_policy ethnl_rss_set_policy[ETHTOOL_A_RSS_START_CONTEXT + 1] = { + [ETHTOOL_A_RSS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_RSS_CONTEXT] = { .type = NLA_U32, }, + 
[ETHTOOL_A_RSS_INDIR] = { .type = NLA_BINARY, }, +}; + +static int +ethnl_rss_set_validate(struct ethnl_req_info *req_info, struct genl_info *info) +{ + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; + struct rss_req_info *request = RSS_REQINFO(req_info); + struct nlattr **tb = info->attrs; + struct nlattr *bad_attr = NULL; + + if (request->rss_context && !ops->create_rxfh_context) + bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_CONTEXT]; + + if (bad_attr) { + NL_SET_BAD_ATTR(info->extack, bad_attr); + return -EOPNOTSUPP; + } + + return 1; +} + +static int +rss_set_prep_indir(struct net_device *dev, struct genl_info *info, + struct rss_reply_data *data, struct ethtool_rxfh_param *rxfh, + bool *reset, bool *mod) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct netlink_ext_ack *extack = info->extack; + struct nlattr **tb = info->attrs; + struct ethtool_rxnfc rx_rings; + size_t alloc_size; + u32 user_size; + int i, err; + + if (!tb[ETHTOOL_A_RSS_INDIR]) + return 0; + if (!data->indir_size || !ops->get_rxnfc) + return -EOPNOTSUPP; + + rx_rings.cmd = ETHTOOL_GRXRINGS; + err = ops->get_rxnfc(dev, &rx_rings, NULL); + if (err) + return err; + + if (nla_len(tb[ETHTOOL_A_RSS_INDIR]) % 4) { + NL_SET_BAD_ATTR(info->extack, tb[ETHTOOL_A_RSS_INDIR]); + return -EINVAL; + } + user_size = nla_len(tb[ETHTOOL_A_RSS_INDIR]) / 4; + if (!user_size) { + if (rxfh->rss_context) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_RSS_INDIR], + "can't reset table for a context"); + return -EINVAL; + } + *reset = true; + } else if (data->indir_size % user_size) { + NL_SET_ERR_MSG_ATTR_FMT(extack, tb[ETHTOOL_A_RSS_INDIR], + "size (%d) mismatch with device indir table (%d)", + user_size, data->indir_size); + return -EINVAL; + } + + rxfh->indir_size = data->indir_size; + alloc_size = array_size(data->indir_size, sizeof(rxfh->indir[0])); + rxfh->indir = kzalloc(alloc_size, GFP_KERNEL); + if (!rxfh->indir) + return -ENOMEM; + + nla_memcpy(rxfh->indir, tb[ETHTOOL_A_RSS_INDIR], alloc_size); + for (i = 0; i < user_size; i++) { + if (rxfh->indir[i] < rx_rings.data) + continue; + + NL_SET_ERR_MSG_ATTR_FMT(extack, tb[ETHTOOL_A_RSS_INDIR], + "entry %d: queue out of range (%d)", + i, rxfh->indir[i]); + err = -EINVAL; + goto err_free; + } + + if (user_size) { + /* Replicate the user-provided table to fill the device table */ + for (i = user_size; i < data->indir_size; i++) + rxfh->indir[i] = rxfh->indir[i % user_size]; + } else { + for (i = 0; i < data->indir_size; i++) + rxfh->indir[i] = + ethtool_rxfh_indir_default(i, rx_rings.data); + } + + *mod |= memcmp(rxfh->indir, data->indir_table, data->indir_size); + + return 0; + +err_free: + kfree(rxfh->indir); + rxfh->indir = NULL; + return err; +} + +static void +rss_set_ctx_update(struct ethtool_rxfh_context *ctx, struct nlattr **tb, + struct rss_reply_data *data, struct ethtool_rxfh_param *rxfh) +{ + int i; + + if (rxfh->indir) { + for (i = 0; i < data->indir_size; i++) + ethtool_rxfh_context_indir(ctx)[i] = rxfh->indir[i]; + ctx->indir_configured = !!nla_len(tb[ETHTOOL_A_RSS_INDIR]); + } +} + +static int +ethnl_rss_set(struct ethnl_req_info *req_info, struct genl_info *info) +{ + struct rss_req_info *request = RSS_REQINFO(req_info); + struct ethtool_rxfh_context *ctx = NULL; + struct net_device *dev = req_info->dev; + struct ethtool_rxfh_param rxfh = {}; + bool indir_reset = false, indir_mod; + struct nlattr **tb = info->attrs; + struct rss_reply_data data = {}; + const struct ethtool_ops *ops; + bool mod = false; + int ret; + + ops = dev->ethtool_ops; + 
data.base.dev = dev;
+
+ ret = rss_prepare(request, dev, &data, info);
+ if (ret)
+ return ret;
+
+ rxfh.rss_context = request->rss_context;
+
+ ret = rss_set_prep_indir(dev, info, &data, &rxfh, &indir_reset, &mod);
+ if (ret)
+ goto exit_clean_data;
+ indir_mod = !!tb[ETHTOOL_A_RSS_INDIR];
+
+ rxfh.hfunc = ETH_RSS_HASH_NO_CHANGE;
+ rxfh.input_xfrm = RXH_XFRM_NO_CHANGE;
+
+ mutex_lock(&dev->ethtool->rss_lock);
+ if (request->rss_context) {
+ ctx = xa_load(&dev->ethtool->rss_ctx, request->rss_context);
+ if (!ctx) {
+ ret = -ENOENT;
+ goto exit_unlock;
+ }
+ }
+
+ if (!mod)
+ ret = 0; /* nothing to tell the driver */
+ else if (!ops->set_rxfh)
+ ret = -EOPNOTSUPP;
+ else if (!rxfh.rss_context)
+ ret = ops->set_rxfh(dev, &rxfh, info->extack);
+ else
+ ret = ops->modify_rxfh_context(dev, ctx, &rxfh, info->extack);
+ if (ret)
+ goto exit_unlock;
+
+ if (ctx)
+ rss_set_ctx_update(ctx, tb, &data, &rxfh);
+ else if (indir_reset)
+ dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
+ else if (indir_mod)
+ dev->priv_flags |= IFF_RXFH_CONFIGURED;
+
+exit_unlock:
+ mutex_unlock(&dev->ethtool->rss_lock);
+ kfree(rxfh.indir);
+exit_clean_data:
+ rss_cleanup_data(&data.base);
+
+ return ret ?: mod;
+}
+
const struct ethnl_request_ops ethnl_rss_request_ops = {
 .request_cmd = ETHTOOL_MSG_RSS_GET,
 .reply_cmd = ETHTOOL_MSG_RSS_GET_REPLY,
@@ -478,4 +669,8 @@ const struct ethnl_request_ops ethnl_rss_request_ops = {
 .reply_size = rss_reply_size,
 .fill_reply = rss_fill_reply,
 .cleanup_data = rss_cleanup_data,
+
+ .set_validate = ethnl_rss_set_validate,
+ .set = ethnl_rss_set,
+ .set_ntf_cmd = ETHTOOL_MSG_RSS_NTF,
};
-- cgit v1.2.3

From 6624a0af82a6e3a4d3609264ef591a8fa3467139 Mon Sep 17 00:00:00 2001
From: Lachlan Hodges
Date: Thu, 17 Jul 2025 17:42:02 +1000
Subject: wifi: cfg80211: support configuring an S1G short beaconing BSS

S1G short beacons are an optional frame type used in an S1G BSS that contain a limited set of elements. While they are optional, they are a fundamental part of S1G that enables significant power saving.

Expose 2 additional netlink attributes, NL80211_ATTR_S1G_LONG_BEACON_PERIOD which denotes the number of beacon intervals between each long beacon and NL80211_ATTR_S1G_SHORT_BEACON which is a nested attribute containing the short beacon tail and head.

We split them as the long beacon period cannot be updated, and is only used when initialising the interface, whereas the short beacon data can be used to both initialise and update the templates. This follows how things such as the beacon interval and DTIM period currently operate. During the initialisation path, we ensure we have the long beacon period if the short beacon data is being passed down, whereas the update path will simply update the template if it's sent down.

The short beacon data is validated using the same routines for regular beacons as they support correctly parsing the short beacon format while ensuring the frame is well-formed.
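For illustration only (not part of this patch), a minimal libnl-3 sketch of how userspace might assemble the new attributes for an NL80211_CMD_START_AP request; the helper name, the beacon buffers and the period value of 10 are hypothetical:

    #include <errno.h>
    #include <netlink/genl/genl.h>
    #include <netlink/attr.h>
    #include <linux/nl80211.h>  /* needs the attributes added by this patch */

    static int put_s1g_short_beacon(struct nl_msg *msg,
                                    const void *head, size_t head_len,
                                    const void *tail, size_t tail_len)
    {
            struct nlattr *nest;

            /* Required when bringing up the BSS, not for later updates. */
            NLA_PUT_U8(msg, NL80211_ATTR_S1G_LONG_BEACON_PERIOD, 10);

            nest = nla_nest_start(msg, NL80211_ATTR_S1G_SHORT_BEACON);
            if (!nest)
                    goto nla_put_failure;
            /* Head is mandatory; tail is optional (may carry only the TIM). */
            NLA_PUT(msg, NL80211_S1G_SHORT_BEACON_ATTR_HEAD, head_len, head);
            if (tail_len)
                    NLA_PUT(msg, NL80211_S1G_SHORT_BEACON_ATTR_TAIL,
                            tail_len, tail);
            nla_nest_end(msg, nest);
            return 0;

    nla_put_failure:
            return -ENOBUFS;
    }

The NLA_PUT*() macros jump to the nla_put_failure label when the message buffer overflows, which is why the label is part of the sketch.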
Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250717074205.312577-2-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 23 ++++++++++++++ include/uapi/linux/nl80211.h | 39 ++++++++++++++++++++++++ net/wireless/nl80211.c | 72 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 134 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 17f2a665dce6..44a1055a81ba 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1423,6 +1423,23 @@ struct cfg80211_unsol_bcast_probe_resp { const u8 *tmpl; }; +/** + * struct cfg80211_s1g_short_beacon - S1G short beacon data. + * + * @update: Set to true if the feature configuration should be updated. + * @short_head: Short beacon head. + * @short_tail: Short beacon tail. + * @short_head_len: Short beacon head len. + * @short_tail_len: Short beacon tail len. + */ +struct cfg80211_s1g_short_beacon { + bool update; + const u8 *short_head; + const u8 *short_tail; + size_t short_head_len; + size_t short_tail_len; +}; + /** * struct cfg80211_ap_settings - AP configuration * @@ -1463,6 +1480,8 @@ struct cfg80211_unsol_bcast_probe_resp { * @fils_discovery: FILS discovery transmission parameters * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters * @mbssid_config: AP settings for multiple bssid + * @s1g_long_beacon_period: S1G long beacon period + * @s1g_short_beacon: S1G short beacon data */ struct cfg80211_ap_settings { struct cfg80211_chan_def chandef; @@ -1496,6 +1515,8 @@ struct cfg80211_ap_settings { struct cfg80211_fils_discovery fils_discovery; struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; struct cfg80211_mbssid_config mbssid_config; + u8 s1g_long_beacon_period; + struct cfg80211_s1g_short_beacon s1g_short_beacon; }; @@ -1507,11 +1528,13 @@ struct cfg80211_ap_settings { * @beacon: beacon data * @fils_discovery: FILS discovery transmission parameters * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters + * @s1g_short_beacon: S1G short beacon data */ struct cfg80211_ap_update { struct cfg80211_beacon_data beacon; struct cfg80211_fils_discovery fils_discovery; struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; + struct cfg80211_s1g_short_beacon s1g_short_beacon; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 39460334dafb..d1a14f2892d9 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2915,6 +2915,19 @@ enum nl80211_commands { * applicable to that specific radio only. If the radio id is greater * thank the number of radios, error denoting invalid value is returned. * + * @NL80211_ATTR_S1G_LONG_BEACON_PERIOD: (u8) Integer attribute that represents + * the number of beacon intervals between each long beacon transmission + * for an S1G BSS with short beaconing enabled. This is a required + * attribute for initialising an S1G short beaconing BSS. When updating + * the short beacon data, this is not required. It has a minimum value of + * 2 (i.e 2 beacon intervals). + * + * @NL80211_ATTR_S1G_SHORT_BEACON: Nested attribute containing the short beacon + * head and tail used to set or update the short beacon templates. When + * bringing up a new interface, %NL80211_ATTR_S1G_LONG_BEACON_PERIOD is + * required alongside this attribute. Refer to + * @enum nl80211_s1g_short_beacon_attrs for the attribute definitions. 
+ * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3474,6 +3487,9 @@ enum nl80211_attrs { NL80211_ATTR_WIPHY_RADIO_INDEX, + NL80211_ATTR_S1G_LONG_BEACON_PERIOD, + NL80211_ATTR_S1G_SHORT_BEACON, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -8148,4 +8164,27 @@ enum nl80211_wiphy_radio_freq_range { NL80211_WIPHY_RADIO_FREQ_ATTR_MAX = __NL80211_WIPHY_RADIO_FREQ_ATTR_LAST - 1, }; +/** + * enum nl80211_s1g_short_beacon_attrs - S1G short beacon data + * + * @__NL80211_S1G_SHORT_BEACON_ATTR_INVALID: Invalid + * + * @NL80211_S1G_SHORT_BEACON_ATTR_HEAD: Short beacon head (binary). + * @NL80211_S1G_SHORT_BEACON_ATTR_TAIL: Short beacon tail (binary). + * + * @__NL80211_S1G_SHORT_BEACON_ATTR_LAST: Internal + * @NL80211_S1G_SHORT_BEACON_ATTR_MAX: Highest attribute + */ +enum nl80211_s1g_short_beacon_attrs { + __NL80211_S1G_SHORT_BEACON_ATTR_INVALID, + + NL80211_S1G_SHORT_BEACON_ATTR_HEAD, + NL80211_S1G_SHORT_BEACON_ATTR_TAIL, + + /* keep last */ + __NL80211_S1G_SHORT_BEACON_ATTR_LAST, + NL80211_S1G_SHORT_BEACON_ATTR_MAX = + __NL80211_S1G_SHORT_BEACON_ATTR_LAST - 1 +}; + #endif /* __LINUX_NL80211_H */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 20bc0f052c16..1c808b08b747 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -482,6 +482,16 @@ nl80211_sta_wme_policy[NL80211_STA_WME_MAX + 1] = { [NL80211_STA_WME_MAX_SP] = { .type = NLA_U8 }, }; +static const struct nla_policy +nl80211_s1g_short_beacon[NL80211_S1G_SHORT_BEACON_ATTR_MAX + 1] = { + [NL80211_S1G_SHORT_BEACON_ATTR_HEAD] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_beacon_head, + IEEE80211_MAX_DATA_LEN), + [NL80211_S1G_SHORT_BEACON_ATTR_TAIL] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr, + IEEE80211_MAX_DATA_LEN), +}; + static const struct netlink_range_validation nl80211_punct_bitmap_range = { .min = 0, .max = 0xffff, @@ -858,6 +868,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_EPCS] = { .type = NLA_FLAG }, [NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS] = { .type = NLA_U16 }, [NL80211_ATTR_WIPHY_RADIO_INDEX] = { .type = NLA_U8 }, + [NL80211_ATTR_S1G_LONG_BEACON_PERIOD] = NLA_POLICY_MIN(NLA_U8, 2), + [NL80211_ATTR_S1G_SHORT_BEACON] = + NLA_POLICY_NESTED(nl80211_s1g_short_beacon), }; /* policy for the key attributes */ @@ -6202,6 +6215,41 @@ static int nl80211_validate_ap_phy_operation(struct cfg80211_ap_settings *params return 0; } +static int +nl80211_parse_s1g_short_beacon(struct cfg80211_registered_device *rdev, + struct nlattr *attrs, + struct cfg80211_s1g_short_beacon *sb) +{ + struct nlattr *tb[NL80211_S1G_SHORT_BEACON_ATTR_MAX + 1]; + int ret; + + if (!rdev->wiphy.bands[NL80211_BAND_S1GHZ]) + return -EINVAL; + + ret = nla_parse_nested(tb, NL80211_S1G_SHORT_BEACON_ATTR_MAX, attrs, + NULL, NULL); + if (ret) + return ret; + + /* Short beacon tail is optional (i.e might only include the TIM) */ + if (!tb[NL80211_S1G_SHORT_BEACON_ATTR_HEAD]) + return -EINVAL; + + sb->short_head = nla_data(tb[NL80211_S1G_SHORT_BEACON_ATTR_HEAD]); + sb->short_head_len = nla_len(tb[NL80211_S1G_SHORT_BEACON_ATTR_HEAD]); + sb->short_tail_len = 0; + + if (tb[NL80211_S1G_SHORT_BEACON_ATTR_TAIL]) { + sb->short_tail = + nla_data(tb[NL80211_S1G_SHORT_BEACON_ATTR_TAIL]); + sb->short_tail_len = + nla_len(tb[NL80211_S1G_SHORT_BEACON_ATTR_TAIL]); + } + + sb->update = true; + return 0; +} + static int nl80211_start_ap(struct 
sk_buff *skb, struct genl_info *info)
{
 struct cfg80211_registered_device *rdev = info->user_ptr[0];
@@ -6442,6 +6490,22 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 goto out;
 }
+ if (info->attrs[NL80211_ATTR_S1G_SHORT_BEACON]) {
+ if (!info->attrs[NL80211_ATTR_S1G_LONG_BEACON_PERIOD]) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ params->s1g_long_beacon_period = nla_get_u8(
+ info->attrs[NL80211_ATTR_S1G_LONG_BEACON_PERIOD]);
+
+ err = nl80211_parse_s1g_short_beacon(
+ rdev, info->attrs[NL80211_ATTR_S1G_SHORT_BEACON],
+ &params->s1g_short_beacon);
+ if (err)
+ goto out;
+ }
+
 err = nl80211_calculate_ap_params(params);
 if (err)
 goto out;
@@ -6550,6 +6614,14 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info)
 goto out;
 }
+ attr = info->attrs[NL80211_ATTR_S1G_SHORT_BEACON];
+ if (attr) {
+ err = nl80211_parse_s1g_short_beacon(rdev, attr,
+ &params->s1g_short_beacon);
+ if (err)
+ goto out;
+ }
+
 err = rdev_change_beacon(rdev, dev, params);
 out:
-- cgit v1.2.3

From 850f14f5b91986e586b66565c9c75bdd4c834571 Mon Sep 17 00:00:00 2001
From: Xu Yilun
Date: Wed, 16 Jul 2025 15:03:45 +0800
Subject: iommufd: Destroy vdevice on idevice destroy

Destroy iommufd_vdevice (vdev) on iommufd_idevice (idev) destruction so that the vdev can't outlive the idev.

The idev represents the physical device bound to iommufd, while the vdev represents the virtual instance of the physical device in the VM. The lifecycle of the vdev should not be longer than that of the idev. This doesn't cause real problems in existing use cases because the vdev doesn't impact the physical device; it only provides virtualization information. But to extend the vdev for Confidential Computing (CC), there is a need to do secure configuration for the vdev, e.g. TSM Bind/Unbind. These configurations should be rolled back on idev destroy, or the external driver (VFIO) functionality may be impacted.

The idev is created by an external driver, so its destruction can't fail. The idev implements the pre_destroy() op to actively remove its associated vdev before destroying itself. There are 3 cases on idev pre_destroy():

1. vdev is already destroyed by userspace. No extra handling needed.
2. vdev is still alive. Use iommufd_object_tombstone_user() to destroy the vdev and tombstone the vdev ID.
3. vdev is being destroyed by userspace. The vdev ID is already freed, but the vdev destroy handler has not completed. This requires multi-thread syncing - the vdev holds the idev's short term users reference until vdev destruction completes, and the idev leverages the existing wait_shortterm mechanism for syncing.

The idev should also block any new reference to it after pre_destroy(), or the following wait shortterm would time out. Introduce a 'destroying' flag, set it to true on idev pre_destroy(). Any attempt to reference the idev should honor this flag under the protection of idev->igroup->lock.
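To illustrate the resulting uAPI contract (not part of this patch), a hypothetical VMM teardown sketch; the fd and ID variables are placeholders:

    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/iommufd.h>

    static void teardown(int iommufd, __u32 vdev_id, int vfio_cdev_fd)
    {
            struct iommu_destroy destroy = {
                    .size = sizeof(destroy),
                    .id = vdev_id,  /* ID returned by IOMMU_VDEVICE_ALLOC */
            };

            /* Case 1 above: userspace destroys the vdev first, so idev
             * pre_destroy() finds nothing left to tombstone. */
            ioctl(iommufd, IOMMU_DESTROY, &destroy);

            /* Closing the cdev fd after the vdev is gone cannot leak an
             * ID; in the reverse order the vdev would be force-destroyed
             * and its ID tombstoned until the iommufd is closed. */
            close(vfio_cdev_fd);
    }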
Link: https://patch.msgid.link/r/20250716070349.1807226-5-yilun.xu@linux.intel.com Originally-by: Nicolin Chen Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Co-developed-by: "Aneesh Kumar K.V (Arm)" Signed-off-by: "Aneesh Kumar K.V (Arm)" Tested-by: Nicolin Chen Signed-off-by: Xu Yilun Signed-off-by: Jason Gunthorpe --- drivers/iommu/iommufd/device.c | 51 ++++++++++++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 12 ++++++++ drivers/iommu/iommufd/main.c | 2 ++ drivers/iommu/iommufd/viommu.c | 52 ++++++++++++++++++++++++++++++--- include/linux/iommufd.h | 1 + include/uapi/linux/iommufd.h | 5 ++++ 6 files changed, 119 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index e2ba21c43ad2..ee6ff4caf398 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -137,6 +137,57 @@ static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx, } } +static void iommufd_device_remove_vdev(struct iommufd_device *idev) +{ + struct iommufd_vdevice *vdev; + + mutex_lock(&idev->igroup->lock); + /* prevent new references from vdev */ + idev->destroying = true; + /* vdev has been completely destroyed by userspace */ + if (!idev->vdev) + goto out_unlock; + + vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id); + /* + * An ongoing vdev destroy ioctl has removed the vdev from the object + * xarray, but has not finished iommufd_vdevice_destroy() yet as it + * needs the same mutex. We exit the locking then wait on short term + * users for the vdev destruction. + */ + if (IS_ERR(vdev)) + goto out_unlock; + + /* Should never happen */ + if (WARN_ON(vdev != idev->vdev)) { + iommufd_put_object(idev->ictx, &vdev->obj); + goto out_unlock; + } + + /* + * vdev is still alive. Hold a users refcount to prevent racing with + * userspace destruction, then use iommufd_object_tombstone_user() to + * destroy it and leave a tombstone. 
+ */ + refcount_inc(&vdev->obj.users); + iommufd_put_object(idev->ictx, &vdev->obj); + mutex_unlock(&idev->igroup->lock); + iommufd_object_tombstone_user(idev->ictx, &vdev->obj); + return; + +out_unlock: + mutex_unlock(&idev->igroup->lock); +} + +void iommufd_device_pre_destroy(struct iommufd_object *obj) +{ + struct iommufd_device *idev = + container_of(obj, struct iommufd_device, obj); + + /* Release the short term users on this */ + iommufd_device_remove_vdev(idev); +} + void iommufd_device_destroy(struct iommufd_object *obj) { struct iommufd_device *idev = diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 149545060029..5d6ea5395cfe 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -489,6 +489,8 @@ struct iommufd_device { /* always the physical device */ struct device *dev; bool enforce_cache_coherency; + struct iommufd_vdevice *vdev; + bool destroying; }; static inline struct iommufd_device * @@ -499,6 +501,7 @@ iommufd_get_device(struct iommufd_ucmd *ucmd, u32 id) struct iommufd_device, obj); } +void iommufd_device_pre_destroy(struct iommufd_object *obj); void iommufd_device_destroy(struct iommufd_object *obj); int iommufd_get_hw_info(struct iommufd_ucmd *ucmd); @@ -687,9 +690,18 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd); void iommufd_viommu_destroy(struct iommufd_object *obj); int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd); void iommufd_vdevice_destroy(struct iommufd_object *obj); +void iommufd_vdevice_abort(struct iommufd_object *obj); int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd); void iommufd_hw_queue_destroy(struct iommufd_object *obj); +static inline struct iommufd_vdevice * +iommufd_get_vdevice(struct iommufd_ctx *ictx, u32 id) +{ + return container_of(iommufd_get_object(ictx, id, + IOMMUFD_OBJ_VDEVICE), + struct iommufd_vdevice, obj); +} + #ifdef CONFIG_IOMMUFD_TEST int iommufd_test(struct iommufd_ucmd *ucmd); void iommufd_selftest_destroy(struct iommufd_object *obj); diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 53085d24ce4a..99c1aab3d396 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -655,6 +655,7 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { .destroy = iommufd_access_destroy_object, }, [IOMMUFD_OBJ_DEVICE] = { + .pre_destroy = iommufd_device_pre_destroy, .destroy = iommufd_device_destroy, }, [IOMMUFD_OBJ_FAULT] = { @@ -676,6 +677,7 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { }, [IOMMUFD_OBJ_VDEVICE] = { .destroy = iommufd_vdevice_destroy, + .abort = iommufd_vdevice_abort, }, [IOMMUFD_OBJ_VEVENTQ] = { .destroy = iommufd_veventq_destroy, diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c index dcf8a85b9f6e..ecbae5091ffe 100644 --- a/drivers/iommu/iommufd/viommu.c +++ b/drivers/iommu/iommufd/viommu.c @@ -110,20 +110,37 @@ out_put_idev: return rc; } -void iommufd_vdevice_destroy(struct iommufd_object *obj) +void iommufd_vdevice_abort(struct iommufd_object *obj) { struct iommufd_vdevice *vdev = container_of(obj, struct iommufd_vdevice, obj); struct iommufd_viommu *viommu = vdev->viommu; + struct iommufd_device *idev = vdev->idev; + + lockdep_assert_held(&idev->igroup->lock); if (vdev->destroy) vdev->destroy(vdev); /* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */ xa_cmpxchg(&viommu->vdevs, vdev->virt_id, vdev, NULL, GFP_KERNEL); refcount_dec(&viommu->obj.users); + idev->vdev = 
NULL; put_device(vdev->dev); } +void iommufd_vdevice_destroy(struct iommufd_object *obj) +{ + struct iommufd_vdevice *vdev = + container_of(obj, struct iommufd_vdevice, obj); + struct iommufd_device *idev = vdev->idev; + struct iommufd_ctx *ictx = idev->ictx; + + mutex_lock(&idev->igroup->lock); + iommufd_vdevice_abort(obj); + mutex_unlock(&idev->igroup->lock); + iommufd_put_object(ictx, &idev->obj); +} + int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) { struct iommu_vdevice_alloc *cmd = ucmd->cmd; @@ -153,6 +170,17 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) goto out_put_idev; } + mutex_lock(&idev->igroup->lock); + if (idev->destroying) { + rc = -ENOENT; + goto out_unlock_igroup; + } + + if (idev->vdev) { + rc = -EEXIST; + goto out_unlock_igroup; + } + if (viommu->ops && viommu->ops->vdevice_size) { /* * It is a driver bug for: @@ -171,7 +199,7 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) ucmd->ictx, vdev_size, IOMMUFD_OBJ_VDEVICE); if (IS_ERR(vdev)) { rc = PTR_ERR(vdev); - goto out_put_idev; + goto out_unlock_igroup; } vdev->virt_id = virt_id; @@ -179,6 +207,19 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) get_device(idev->dev); vdev->viommu = viommu; refcount_inc(&viommu->obj.users); + /* + * A short term users reference is held on the idev so long as we have + * the pointer. iommufd_device_pre_destroy() will revoke it before the + * idev real destruction. + */ + vdev->idev = idev; + + /* + * iommufd_device_destroy() delays until idev->vdev is NULL before + * freeing the idev, which only happens once the vdev is finished + * destruction. + */ + idev->vdev = vdev; curr = xa_cmpxchg(&viommu->vdevs, virt_id, NULL, vdev, GFP_KERNEL); if (curr) { @@ -197,12 +238,15 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) if (rc) goto out_abort; iommufd_object_finalize(ucmd->ictx, &vdev->obj); - goto out_put_idev; + goto out_unlock_igroup; out_abort: iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj); +out_unlock_igroup: + mutex_unlock(&idev->igroup->lock); out_put_idev: - iommufd_put_object(ucmd->ictx, &idev->obj); + if (rc) + iommufd_put_object(ucmd->ictx, &idev->obj); out_put_viommu: iommufd_put_object(ucmd->ictx, &viommu->obj); return rc; diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index e3a0cd47384d..b88911026bc4 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -108,6 +108,7 @@ struct iommufd_viommu { struct iommufd_vdevice { struct iommufd_object obj; struct iommufd_viommu *viommu; + struct iommufd_device *idev; struct device *dev; /* diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 554aacf89ea7..c218c89e0e2e 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -1070,6 +1070,11 @@ struct iommu_viommu_alloc { * * Allocate a virtual device instance (for a physical device) against a vIOMMU. * This instance holds the device's information (related to its vIOMMU) in a VM. + * User should use IOMMU_DESTROY to destroy the virtual device before + * destroying the physical device (by closing vfio_cdev fd). Otherwise the + * virtual device would be forcibly destroyed on physical device destruction, + * its vdevice_id would be permanently leaked (unremovable & unreusable) until + * iommu fd closed. 
*/ struct iommu_vdevice_alloc { __u32 size;
-- cgit v1.2.3

From 009b2056cb259c90426b3c57e5b145d1cd9fa9e2 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Wed, 9 Jul 2025 16:29:17 +0200
Subject: btrfs: defrag: add flag to force no-compression

Currently the defrag ioctl cannot rewrite the extents without compression. Add a new flag for that, as setting compression to 0 (or "no compression") means making no changes to compression, i.e. taking the current default, such as mount options or properties. The defrag setting overrides mount options or properties.

The compression type BTRFS_DEFRAG_DONT_COMPRESS is only used for in-memory operations and does not need to have a fixed value.

Mount with zstd:9, copy test file from /usr/bin/ (about 260KB):

$ mount -o compress=zstd:9 /dev/vda /mnt
$ filefrag -vsb testfile
filefrag: -b needs a blocksize option, assuming 1024-byte blocks.
Filesystem type is: 9123683e
File size of testfile is 297704 (292 blocks of 1024 bytes)
ext: logical_offset: physical_offset: length: expected: flags:
0: 0.. 127: 13312.. 13439: 128: encoded
1: 128.. 255: 13364.. 13491: 128: 13440: encoded
2: 256.. 291: 13424.. 13459: 36: 13492: last,encoded,eof
testfile: 3 extents found

$ compsize testfile
Processed 1 file, 3 regular extents (3 refs), 0 inline, 1 fragments.
Type Perc Disk Usage Uncompressed Referenced
TOTAL 42% 124K 292K 292K
zstd 42% 124K 292K 292K

Defrag to uncompressed:

$ btrfs fi defrag --nocomp testfile
$ filefrag -vsb testfile
filefrag: -b needs a blocksize option, assuming 1024-byte blocks.
Filesystem type is: 9123683e
File size of testfile is 297704 (292 blocks of 1024 bytes)
ext: logical_offset: physical_offset: length: expected: flags:
0: 0.. 291: 291840.. 292131: 292: last,eof
testfile: 1 extent found

$ compsize testfile
Processed 1 file, 1 regular extents (1 refs), 0 inline, 1 fragments.
Type Perc Disk Usage Uncompressed Referenced
TOTAL 100% 292K 292K 292K
none 100% 292K 292K 292K

Compress again with LZO:

$ btrfs fi defrag -clzo testfile
$ filefrag -vsb testfile
filefrag: -b needs a blocksize option, assuming 1024-byte blocks.
Filesystem type is: 9123683e
File size of testfile is 297704 (292 blocks of 1024 bytes)
ext: logical_offset: physical_offset: length: expected: flags:
0: 0.. 127: 13312.. 13439: 128: encoded
1: 128.. 255: 13392.. 13519: 128: 13440: encoded
2: 256.. 291: 13480.. 13515: 36: 13520: last,encoded,eof
testfile: 3 extents found

$ compsize testfile
Processed 1 file, 3 regular extents (3 refs), 0 inline, 1 fragments.
Type Perc Disk Usage Uncompressed Referenced
TOTAL 64% 188K 292K 292K
lzo 64% 188K 292K 292K

Signed-off-by: David Sterba
---
 fs/btrfs/compression.h | 2 ++
 fs/btrfs/defrag.c | 13 +++++++++----
 fs/btrfs/inode.c | 11 +++++++----
 fs/btrfs/ioctl.c | 10 ++++++++--
 include/uapi/linux/btrfs.h | 3 +++
 5 files changed, 29 insertions(+), 10 deletions(-)
(limited to 'include/uapi/linux')

diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 1df3c8dec40a..1b38e707bbd9 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -113,6 +113,8 @@ enum btrfs_compression_type {
 BTRFS_COMPRESS_LZO = 2,
 BTRFS_COMPRESS_ZSTD = 3,
 BTRFS_NR_COMPRESS_TYPES = 4,
+
+ BTRFS_DEFRAG_DONT_COMPRESS,
};
struct workspace_manager {
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 701b6b51ea85..738179a5e170 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -947,7 +947,7 @@ struct defrag_target_range {
 * @extent_thresh: file extent size threshold, any extent size >= this value
 * will be ignored
 * @newer_than: only defrag extents newer than this value
- * @do_compress: whether the defrag is doing compression
+ * @do_compress: whether the defrag is doing compression or no-compression
 * if true, @extent_thresh will be ignored and all regular
 * file extents meeting @newer_than will be targets.
 * @locked: if the range has already held extent lock
@@ -1364,6 +1364,7 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
 u64 cur;
 u64 last_byte;
 bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
+ bool no_compress = (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS);
 int compress_type = BTRFS_COMPRESS_ZLIB;
 int compress_level = 0;
 int ret = 0;
@@ -1394,6 +1395,9 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
 if (range->compress_type)
 compress_type = range->compress_type;
 }
+ } else if (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS) {
+ compress_type = BTRFS_DEFRAG_DONT_COMPRESS;
+ compress_level = 1;
 }
 if (extent_thresh == 0)
@@ -1444,13 +1448,14 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
 btrfs_inode_unlock(inode, 0);
 break;
 }
- if (do_compress) {
+ if (do_compress || no_compress) {
 inode->defrag_compress = compress_type;
 inode->defrag_compress_level = compress_level;
 }
 ret = defrag_one_cluster(inode, ra, cur, cluster_end + 1 - cur, extent_thresh,
- newer_than, do_compress, &sectors_defragged,
+ newer_than, do_compress || no_compress,
+ &sectors_defragged,
 max_to_defrag, &last_scanned);
 if (sectors_defragged > prev_sectors_defragged)
@@ -1489,7 +1494,7 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
 btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
 ret = sectors_defragged;
 }
- if (do_compress) {
+ if (do_compress || no_compress) {
 btrfs_inode_lock(inode, 0);
 inode->defrag_compress = BTRFS_COMPRESS_NONE;
 btrfs_inode_unlock(inode, 0);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7ed340cac33f..b77dd22b8cdb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -781,12 +781,15 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
 return 0;
 }
+ /* Defrag ioctl takes precedence over mount options and properties.
*/
+ if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS)
+ return 0;
+ if (BTRFS_COMPRESS_NONE < inode->defrag_compress &&
+ inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES)
+ return 1;
 /* force compress */
 if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
 return 1;
- /* defrag ioctl */
- if (inode->defrag_compress)
- return 1;
 /* bad compression ratios */
 if (inode->flags & BTRFS_INODE_NOCOMPRESS)
 return 0;
@@ -942,7 +945,7 @@ again:
 goto cleanup_and_bail_uncompressed;
 }
- if (inode->defrag_compress) {
+ if (0 < inode->defrag_compress && inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) {
 compress_type = inode->defrag_compress;
 compress_level = inode->defrag_compress_level;
 } else if (inode->prop_compress) {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 680c4e794e67..bf561be18885 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2554,8 +2554,14 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 ret = -EOPNOTSUPP;
 goto out;
 }
- /* compression requires us to start the IO */
- if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS) &&
+ (range.flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ /* Compression or no-compression require to start the IO. */
+ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS) ||
+ (range.flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS)) {
 range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
 range.extent_thresh = (u32)-1;
 }
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index dd02160015b2..8e710bbb688e 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -616,8 +616,11 @@ struct btrfs_ioctl_clone_range_args {
 #define BTRFS_DEFRAG_RANGE_COMPRESS 1
 #define BTRFS_DEFRAG_RANGE_START_IO 2
 #define BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL 4
+/* Request no compression on the range (uncompress if necessary). */
+#define BTRFS_DEFRAG_RANGE_NOCOMPRESS 8
 #define BTRFS_DEFRAG_RANGE_FLAGS_SUPP (BTRFS_DEFRAG_RANGE_COMPRESS | \
 BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL | \
 BTRFS_DEFRAG_RANGE_NOCOMPRESS | \
 BTRFS_DEFRAG_RANGE_START_IO)
struct btrfs_ioctl_defrag_range_args {
-- cgit v1.2.3

From a166ab7816c534973745b0fe7bce3c8cefc5426f Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Thu, 17 Jul 2025 16:43:41 -0700
Subject: ethtool: rss: support creating contexts via Netlink

Support creating contexts via Netlink. Setting flow hashing fields on the new context is not supported at this stage; it can be added later.

An empty indirection table is not supported. This is a carry-over from the IOCTL interface, where an empty indirection table meant delete. We could repurpose an empty indirection table in Netlink, but for now reject it using the policy to avoid confusion.

Support letting the user choose the ID for the new context. This was not possible in IOCTL since the context ID field for the create action had to be set to the ETH_RXFH_CONTEXT_ALLOC magic value.
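As a usage illustration (not part of this patch; the device name is hypothetical), the new op can be exercised with the ynl CLI, either letting the kernel pick the context ID or passing one explicitly:

./tools/net/ynl/cli.py --spec Documentation/netlink/specs/ethtool.yaml \
    --do rss-create-act --json '{"header": {"dev-name": "eth0"}}'

./tools/net/ynl/cli.py --spec Documentation/netlink/specs/ethtool.yaml \
    --do rss-create-act --json '{"header": {"dev-name": "eth0"}, "context": 1}'

In both cases the reply carries the context number actually allocated.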
Link: https://patch.msgid.link/20250717234343.2328602-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 23 ++- Documentation/networking/ethtool-netlink.rst | 27 ++++ include/uapi/linux/ethtool_netlink_generated.h | 3 + net/ethtool/ioctl.c | 1 + net/ethtool/netlink.c | 15 ++ net/ethtool/netlink.h | 3 + net/ethtool/rss.c | 203 +++++++++++++++++++++++++ 7 files changed, 273 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 069269edde01..25ffed5fddd5 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -2684,9 +2684,28 @@ operations: name: rss-ntf doc: | Notification for change in RSS configuration. - For additional contexts only modifications are modified, not creation - or removal of the contexts. + For additional contexts only modifications use this notification, + creation and deletion have dedicated messages. notify: rss-get + - + name: rss-create-act + doc: Create an RSS context. + attribute-set: rss + do: + request: &rss-create-attrs + attributes: + - header + - context + - hfunc + - indir + - hkey + - input-xfrm + reply: *rss-create-attrs + - + name: rss-create-ntf + doc: | + Notification for creation of an additional RSS context. + notify: rss-create-act mcast-groups: list: diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 056832c77ffd..2646fafb8512 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -240,6 +240,7 @@ Userspace to kernel: ``ETHTOOL_MSG_TSCONFIG_GET`` get hw timestamping configuration ``ETHTOOL_MSG_TSCONFIG_SET`` set hw timestamping configuration ``ETHTOOL_MSG_RSS_SET`` set RSS settings + ``ETHTOOL_MSG_RSS_CREATE_ACT`` create an additional RSS context ===================================== ================================= Kernel to userspace: @@ -294,6 +295,8 @@ Kernel to userspace: ``ETHTOOL_MSG_TSCONFIG_SET_REPLY`` new hw timestamping configuration ``ETHTOOL_MSG_PSE_NTF`` PSE events notification ``ETHTOOL_MSG_RSS_NTF`` RSS settings notification + ``ETHTOOL_MSG_RSS_CREATE_ACT_REPLY`` create an additional RSS context + ``ETHTOOL_MSG_RSS_CREATE_NTF`` additional RSS context created ======================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -2014,6 +2017,30 @@ device needs at least 8 entries - the real table in use will end up being of 2, so tables which size is not a power of 2 will likely be rejected. Using table of size 0 will reset the indirection table to the default. 
+RSS_CREATE_ACT
+==============
+
+Request contents:
+
+===================================== ====== ==============================
+ ``ETHTOOL_A_RSS_HEADER`` nested request header
+ ``ETHTOOL_A_RSS_CONTEXT`` u32 context number
+ ``ETHTOOL_A_RSS_HFUNC`` u32 RSS hash func
+ ``ETHTOOL_A_RSS_INDIR`` binary Indir table bytes
+ ``ETHTOOL_A_RSS_HKEY`` binary Hash key bytes
+ ``ETHTOOL_A_RSS_INPUT_XFRM`` u32 RSS input data transformation
+===================================== ====== ==============================
+
+Kernel response contents:
+
+===================================== ====== ==============================
+ ``ETHTOOL_A_RSS_HEADER`` nested request header
+ ``ETHTOOL_A_RSS_CONTEXT`` u32 context number
+===================================== ====== ==============================
+
+Create an additional RSS context, if ``ETHTOOL_A_RSS_CONTEXT`` is not
+specified kernel will allocate one automatically.
+
 PLCA_GET_CFG
 ============
diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index 130bdf5c3516..dea77abd295f 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -841,6 +841,7 @@ enum {
 ETHTOOL_MSG_TSCONFIG_GET,
 ETHTOOL_MSG_TSCONFIG_SET,
 ETHTOOL_MSG_RSS_SET,
+ ETHTOOL_MSG_RSS_CREATE_ACT,
 __ETHTOOL_MSG_USER_CNT,
 ETHTOOL_MSG_USER_MAX = (__ETHTOOL_MSG_USER_CNT - 1)
@@ -898,6 +899,8 @@ enum {
 ETHTOOL_MSG_TSCONFIG_SET_REPLY,
 ETHTOOL_MSG_PSE_NTF,
 ETHTOOL_MSG_RSS_NTF,
+ ETHTOOL_MSG_RSS_CREATE_ACT_REPLY,
+ ETHTOOL_MSG_RSS_CREATE_NTF,
 __ETHTOOL_MSG_KERNEL_CNT,
 ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1)
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index c53868889969..4b586b0f18e8 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1640,6 +1640,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 ntf = ETHTOOL_MSG_RSS_NTF;
 ret = ops->set_rxfh(dev, &rxfh_dev, extack);
 } else if (create) {
+ ntf = ETHTOOL_MSG_RSS_CREATE_NTF;
 ret = ops->create_rxfh_context(dev, ctx, &rxfh_dev, extack);
 /* Make sure driver populates defaults */
 WARN_ON_ONCE(!ret && !rxfh_dev.key && ops->rxfh_per_ctx_key &&
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index 0ae0d7a9667c..e9696113a96b 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -81,6 +81,12 @@ static void ethnl_sock_priv_destroy(void *priv)
 }
}
+u32 ethnl_bcast_seq_next(void)
+{
+ ASSERT_RTNL();
+ return ++ethnl_bcast_seq;
+}
+
int ethnl_ops_begin(struct net_device *dev)
{
 int ret;
@@ -954,6 +960,7 @@ ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = {
 [ETHTOOL_MSG_PLCA_NTF] = &ethnl_plca_cfg_request_ops,
 [ETHTOOL_MSG_MM_NTF] = &ethnl_mm_request_ops,
 [ETHTOOL_MSG_RSS_NTF] = &ethnl_rss_request_ops,
+ [ETHTOOL_MSG_RSS_CREATE_NTF] = &ethnl_rss_request_ops,
};
/* default notification handler */
@@ -1061,6 +1068,7 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = {
 [ETHTOOL_MSG_PLCA_NTF] = ethnl_default_notify,
 [ETHTOOL_MSG_MM_NTF] = ethnl_default_notify,
 [ETHTOOL_MSG_RSS_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_RSS_CREATE_NTF] = ethnl_default_notify,
};
void ethnl_notify(struct net_device *dev, unsigned int cmd,
@@ -1512,6 +1520,13 @@ static const struct genl_ops ethtool_genl_ops[] = {
 .policy = ethnl_rss_set_policy,
 .maxattr = ARRAY_SIZE(ethnl_rss_set_policy) - 1,
 },
+ {
+ .cmd = ETHTOOL_MSG_RSS_CREATE_ACT,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_rss_create_doit,
+ .policy = ethnl_rss_create_policy,
+ .maxattr =
ARRAY_SIZE(ethnl_rss_create_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index ddb2fb00f929..b530bf9f85ee 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -10,6 +10,7 @@ struct ethnl_req_info; +u32 ethnl_bcast_seq_next(void); int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, const struct nlattr *nest, struct net *net, struct netlink_ext_ack *extack, @@ -485,6 +486,7 @@ extern const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1]; extern const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1]; extern const struct nla_policy ethnl_rss_get_policy[ETHTOOL_A_RSS_START_CONTEXT + 1]; extern const struct nla_policy ethnl_rss_set_policy[ETHTOOL_A_RSS_FLOW_HASH + 1]; +extern const struct nla_policy ethnl_rss_create_policy[ETHTOOL_A_RSS_INPUT_XFRM + 1]; extern const struct nla_policy ethnl_plca_get_cfg_policy[ETHTOOL_A_PLCA_HEADER + 1]; extern const struct nla_policy ethnl_plca_set_cfg_policy[ETHTOOL_A_PLCA_MAX + 1]; extern const struct nla_policy ethnl_plca_get_status_policy[ETHTOOL_A_PLCA_HEADER + 1]; @@ -507,6 +509,7 @@ int ethnl_rss_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_tsinfo_start(struct netlink_callback *cb); int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_tsinfo_done(struct netlink_callback *cb); +int ethnl_rss_create_doit(struct sk_buff *skb, struct genl_info *info); extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index e5516e529b4a..be092dfa4407 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -893,3 +893,206 @@ const struct ethnl_request_ops ethnl_rss_request_ops = { .set = ethnl_rss_set, .set_ntf_cmd = ETHTOOL_MSG_RSS_NTF, }; + +/* RSS_CREATE */ + +const struct nla_policy ethnl_rss_create_policy[ETHTOOL_A_RSS_INPUT_XFRM + 1] = { + [ETHTOOL_A_RSS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_RSS_CONTEXT] = NLA_POLICY_MIN(NLA_U32, 1), + [ETHTOOL_A_RSS_HFUNC] = NLA_POLICY_MIN(NLA_U32, 1), + [ETHTOOL_A_RSS_INDIR] = NLA_POLICY_MIN(NLA_BINARY, 1), + [ETHTOOL_A_RSS_HKEY] = NLA_POLICY_MIN(NLA_BINARY, 1), + [ETHTOOL_A_RSS_INPUT_XFRM] = + NLA_POLICY_MAX(NLA_U32, RXH_XFRM_SYM_OR_XOR), +}; + +static int +ethnl_rss_create_validate(struct net_device *dev, struct genl_info *info) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct nlattr **tb = info->attrs; + struct nlattr *bad_attr = NULL; + u32 rss_context, input_xfrm; + + if (!ops->create_rxfh_context) + return -EOPNOTSUPP; + + rss_context = nla_get_u32_default(tb[ETHTOOL_A_RSS_CONTEXT], 0); + if (ops->rxfh_max_num_contexts && + ops->rxfh_max_num_contexts <= rss_context) { + NL_SET_BAD_ATTR(info->extack, tb[ETHTOOL_A_RSS_CONTEXT]); + return -ERANGE; + } + + if (!ops->rxfh_per_ctx_key) { + bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_HFUNC]; + bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_HKEY]; + bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_INPUT_XFRM]; + } + + input_xfrm = nla_get_u32_default(tb[ETHTOOL_A_RSS_INPUT_XFRM], 0); + if (input_xfrm & ~ops->supported_input_xfrm) + bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_INPUT_XFRM]; + + if (bad_attr) { + NL_SET_BAD_ATTR(info->extack, bad_attr); + return -EOPNOTSUPP; + } + + return 0; +} + +static void +ethnl_rss_create_send_ntf(struct sk_buff *rsp, struct net_device *dev) +{ + struct nlmsghdr *nlh = 
(void *)rsp->data; + struct genlmsghdr *genl_hdr; + + /* Convert the reply into a notification */ + nlh->nlmsg_pid = 0; + nlh->nlmsg_seq = ethnl_bcast_seq_next(); + + genl_hdr = nlmsg_data(nlh); + genl_hdr->cmd = ETHTOOL_MSG_RSS_CREATE_NTF; + + ethnl_multicast(rsp, dev); +} + +int ethnl_rss_create_doit(struct sk_buff *skb, struct genl_info *info) +{ + bool indir_dflt = false, mod = false, ntf_fail = false; + struct ethtool_rxfh_param rxfh = {}; + struct ethtool_rxfh_context *ctx; + struct nlattr **tb = info->attrs; + struct rss_reply_data data = {}; + const struct ethtool_ops *ops; + struct rss_req_info req = {}; + struct net_device *dev; + struct sk_buff *rsp; + void *hdr; + u32 limit; + int ret; + + rsp = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + ret = ethnl_parse_header_dev_get(&req.base, tb[ETHTOOL_A_RSS_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + goto exit_free_rsp; + + dev = req.base.dev; + ops = dev->ethtool_ops; + + req.rss_context = nla_get_u32_default(tb[ETHTOOL_A_RSS_CONTEXT], 0); + + ret = ethnl_rss_create_validate(dev, info); + if (ret) + goto exit_free_dev; + + rtnl_lock(); + netdev_lock_ops(dev); + + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto exit_dev_unlock; + + ret = rss_get_data_alloc(dev, &data); + if (ret) + goto exit_ops; + + ret = rss_set_prep_indir(dev, info, &data, &rxfh, &indir_dflt, &mod); + if (ret) + goto exit_clean_data; + + ethnl_update_u8(&rxfh.hfunc, tb[ETHTOOL_A_RSS_HFUNC], &mod); + + ret = rss_set_prep_hkey(dev, info, &data, &rxfh, &mod); + if (ret) + goto exit_free_indir; + + rxfh.input_xfrm = RXH_XFRM_NO_CHANGE; + ethnl_update_u8(&rxfh.input_xfrm, tb[ETHTOOL_A_RSS_INPUT_XFRM], &mod); + + ctx = ethtool_rxfh_ctx_alloc(ops, data.indir_size, data.hkey_size); + if (!ctx) { + ret = -ENOMEM; + goto exit_free_hkey; + } + + mutex_lock(&dev->ethtool->rss_lock); + if (!req.rss_context) { + limit = ops->rxfh_max_num_contexts ?: U32_MAX; + ret = xa_alloc(&dev->ethtool->rss_ctx, &req.rss_context, ctx, + XA_LIMIT(1, limit - 1), GFP_KERNEL_ACCOUNT); + } else { + ret = xa_insert(&dev->ethtool->rss_ctx, + req.rss_context, ctx, GFP_KERNEL_ACCOUNT); + } + if (ret < 0) { + NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RSS_CONTEXT], + "error allocating context ID"); + goto err_unlock_free_ctx; + } + rxfh.rss_context = req.rss_context; + + ret = ops->create_rxfh_context(dev, ctx, &rxfh, info->extack); + if (ret) + goto err_ctx_id_free; + + /* Make sure driver populates defaults */ + WARN_ON_ONCE(!rxfh.key && ops->rxfh_per_ctx_key && + !memchr_inv(ethtool_rxfh_context_key(ctx), 0, + ctx->key_size)); + + /* Store the config from rxfh to Xarray.. */ + rss_set_ctx_update(ctx, tb, &data, &rxfh); + /* .. copy from Xarray to data. */ + __rss_prepare_ctx(dev, &data, ctx); + + hdr = ethnl_unicast_put(rsp, info->snd_portid, info->snd_seq, + ETHTOOL_MSG_RSS_CREATE_ACT_REPLY); + ntf_fail = ethnl_fill_reply_header(rsp, dev, ETHTOOL_A_RSS_HEADER); + ntf_fail |= rss_fill_reply(rsp, &req.base, &data.base); + if (WARN_ON(!hdr || ntf_fail)) { + ret = -EMSGSIZE; + goto exit_unlock; + } + + genlmsg_end(rsp, hdr); + + /* Use the same skb for the response and the notification, + * genlmsg_reply() will copy the skb if it has elevated user count. 
+ */
+ skb_get(rsp);
+ ret = genlmsg_reply(rsp, info);
+ ethnl_rss_create_send_ntf(rsp, dev);
+ rsp = NULL;
+
+exit_unlock:
+ mutex_unlock(&dev->ethtool->rss_lock);
+exit_free_hkey:
+ kfree(rxfh.key);
+exit_free_indir:
+ kfree(rxfh.indir);
+exit_clean_data:
+ rss_get_data_free(&data);
+exit_ops:
+ ethnl_ops_complete(dev);
+exit_dev_unlock:
+ netdev_unlock_ops(dev);
+ rtnl_unlock();
+exit_free_dev:
+ ethnl_parse_header_dev_put(&req.base);
+exit_free_rsp:
+ nlmsg_free(rsp);
+ return ret;
+
+err_ctx_id_free:
+ xa_erase(&dev->ethtool->rss_ctx, req.rss_context);
+err_unlock_free_ctx:
+ kfree(ctx);
+ goto exit_unlock;
+}
-- cgit v1.2.3

From fbe09277fa6324b50cc4eedb4d99498cf7dad897 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Thu, 17 Jul 2025 16:43:42 -0700
Subject: ethtool: rss: support removing contexts via Netlink

Implement removing additional RSS contexts via Netlink. Technically it'd be possible to shoehorn the delete operation into an ethnl_request_ops-compatible handler. The code ends up longer than the open-coded version, and I think we'll need a custom way of sending notifications at some stage (if we allow tying the context lifetime to the netlink socket, in the future).

Link: https://patch.msgid.link/20250717234343.2328602-8-kuba@kernel.org
Signed-off-by: Jakub Kicinski
---
 Documentation/netlink/specs/ethtool.yaml | 18 ++++
 Documentation/networking/ethtool-netlink.rst | 14 ++++
 include/uapi/linux/ethtool_netlink_generated.h | 2 +
 net/ethtool/common.c | 1 +
 net/ethtool/ioctl.c | 1 +
 net/ethtool/netlink.c | 7 ++
 net/ethtool/netlink.h | 2 +
 net/ethtool/rss.c | 109 ++++++++++++++++++++++++-
 8 files changed, 153 insertions(+), 1 deletion(-)
(limited to 'include/uapi/linux')

diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml
index 25ffed5fddd5..1063d5d32fea 100644
--- a/Documentation/netlink/specs/ethtool.yaml
+++ b/Documentation/netlink/specs/ethtool.yaml
@@ -2706,6 +2706,24 @@ operations:
 doc: |
 Notification for creation of an additional RSS context.
 notify: rss-create-act
+ -
+ name: rss-delete-act
+ doc: Delete an RSS context.
+ attribute-set: rss
+ do:
+ request:
+ attributes:
+ - header
+ - context
+ -
+ name: rss-delete-ntf
+ doc: |
+ Notification for deletion of an additional RSS context.
+ attribute-set: rss + event: + attributes: + - header + - context mcast-groups: list: diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 2646fafb8512..ab20c644af24 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -241,6 +241,7 @@ Userspace to kernel: ``ETHTOOL_MSG_TSCONFIG_SET`` set hw timestamping configuration ``ETHTOOL_MSG_RSS_SET`` set RSS settings ``ETHTOOL_MSG_RSS_CREATE_ACT`` create an additional RSS context + ``ETHTOOL_MSG_RSS_DELETE_ACT`` delete an additional RSS context ===================================== ================================= Kernel to userspace: @@ -297,6 +298,7 @@ Kernel to userspace: ``ETHTOOL_MSG_RSS_NTF`` RSS settings notification ``ETHTOOL_MSG_RSS_CREATE_ACT_REPLY`` create an additional RSS context ``ETHTOOL_MSG_RSS_CREATE_NTF`` additional RSS context created + ``ETHTOOL_MSG_RSS_DELETE_NTF`` additional RSS context deleted ======================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -2041,6 +2043,18 @@ Kernel response contents: Create an additional RSS context, if ``ETHTOOL_A_RSS_CONTEXT`` is not specified kernel will allocate one automatically. +RSS_DELETE_ACT +============== + +Request contents: + +===================================== ====== ============================== + ``ETHTOOL_A_RSS_HEADER`` nested request header + ``ETHTOOL_A_RSS_CONTEXT`` u32 context number +===================================== ====== ============================== + +Delete an additional RSS context. + PLCA_GET_CFG ============ diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index dea77abd295f..e3b8813465d7 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -842,6 +842,7 @@ enum { ETHTOOL_MSG_TSCONFIG_SET, ETHTOOL_MSG_RSS_SET, ETHTOOL_MSG_RSS_CREATE_ACT, + ETHTOOL_MSG_RSS_DELETE_ACT, __ETHTOOL_MSG_USER_CNT, ETHTOOL_MSG_USER_MAX = (__ETHTOOL_MSG_USER_CNT - 1) @@ -901,6 +902,7 @@ enum { ETHTOOL_MSG_RSS_NTF, ETHTOOL_MSG_RSS_CREATE_ACT_REPLY, ETHTOOL_MSG_RSS_CREATE_NTF, + ETHTOOL_MSG_RSS_DELETE_NTF, __ETHTOOL_MSG_KERNEL_CNT, ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1) diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 2a1d40efb1fc..4f58648a27ad 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -1136,5 +1136,6 @@ void ethtool_rxfh_context_lost(struct net_device *dev, u32 context_id) netdev_err(dev, "device error, RSS context %d lost\n", context_id); ctx = xa_erase(&dev->ethtool->rss_ctx, context_id); kfree(ctx); + ethtool_rss_notify(dev, ETHTOOL_MSG_RSS_DELETE_NTF, context_id); } EXPORT_SYMBOL(ethtool_rxfh_context_lost); diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 4b586b0f18e8..43a7854e784e 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1647,6 +1647,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, !memchr_inv(ethtool_rxfh_context_key(ctx), 0, ctx->key_size)); } else if (rxfh_dev.rss_delete) { + ntf = ETHTOOL_MSG_RSS_DELETE_NTF; ret = ops->remove_rxfh_context(dev, ctx, rxfh.rss_context, extack); } else { diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index e9696113a96b..2f813f25f07e 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -1527,6 +1527,13 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = 
ethnl_rss_create_policy, .maxattr = ARRAY_SIZE(ethnl_rss_create_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_RSS_DELETE_ACT, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_rss_delete_doit, + .policy = ethnl_rss_delete_policy, + .maxattr = ARRAY_SIZE(ethnl_rss_delete_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index b530bf9f85ee..1d4f9ecb3d26 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -487,6 +487,7 @@ extern const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1]; extern const struct nla_policy ethnl_rss_get_policy[ETHTOOL_A_RSS_START_CONTEXT + 1]; extern const struct nla_policy ethnl_rss_set_policy[ETHTOOL_A_RSS_FLOW_HASH + 1]; extern const struct nla_policy ethnl_rss_create_policy[ETHTOOL_A_RSS_INPUT_XFRM + 1]; +extern const struct nla_policy ethnl_rss_delete_policy[ETHTOOL_A_RSS_CONTEXT + 1]; extern const struct nla_policy ethnl_plca_get_cfg_policy[ETHTOOL_A_PLCA_HEADER + 1]; extern const struct nla_policy ethnl_plca_set_cfg_policy[ETHTOOL_A_PLCA_MAX + 1]; extern const struct nla_policy ethnl_plca_get_status_policy[ETHTOOL_A_PLCA_HEADER + 1]; @@ -510,6 +511,7 @@ int ethnl_tsinfo_start(struct netlink_callback *cb); int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_tsinfo_done(struct netlink_callback *cb); int ethnl_rss_create_doit(struct sk_buff *skb, struct genl_info *info); +int ethnl_rss_delete_doit(struct sk_buff *skb, struct genl_info *info); extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index be092dfa4407..992e98abe9dd 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -486,13 +486,49 @@ int ethnl_rss_dumpit(struct sk_buff *skb, struct netlink_callback *cb) /* RSS_NTF */ +static void ethnl_rss_delete_notify(struct net_device *dev, u32 rss_context) +{ + struct sk_buff *ntf; + size_t ntf_size; + void *hdr; + + ntf_size = ethnl_reply_header_size() + + nla_total_size(sizeof(u32)); /* _RSS_CONTEXT */ + + ntf = genlmsg_new(ntf_size, GFP_KERNEL); + if (!ntf) + goto out_warn; + + hdr = ethnl_bcastmsg_put(ntf, ETHTOOL_MSG_RSS_DELETE_NTF); + if (!hdr) + goto out_free_ntf; + + if (ethnl_fill_reply_header(ntf, dev, ETHTOOL_A_RSS_HEADER) || + nla_put_u32(ntf, ETHTOOL_A_RSS_CONTEXT, rss_context)) + goto out_free_ntf; + + genlmsg_end(ntf, hdr); + if (ethnl_multicast(ntf, dev)) + goto out_warn; + + return; + +out_free_ntf: + nlmsg_free(ntf); +out_warn: + pr_warn_once("Failed to send a RSS delete notification"); +} + void ethtool_rss_notify(struct net_device *dev, u32 type, u32 rss_context) { struct rss_req_info req_info = { .rss_context = rss_context, }; - ethnl_notify(dev, type, &req_info.base); + if (type == ETHTOOL_MSG_RSS_DELETE_NTF) + ethnl_rss_delete_notify(dev, rss_context); + else + ethnl_notify(dev, type, &req_info.base); } /* RSS_SET */ @@ -1096,3 +1132,74 @@ err_unlock_free_ctx: kfree(ctx); goto exit_unlock; } + +/* RSS_DELETE */ + +const struct nla_policy ethnl_rss_delete_policy[ETHTOOL_A_RSS_CONTEXT + 1] = { + [ETHTOOL_A_RSS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_RSS_CONTEXT] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +int ethnl_rss_delete_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct ethtool_rxfh_context *ctx; + struct nlattr **tb = info->attrs; + struct ethnl_req_info req = {}; + const struct ethtool_ops *ops; + 
struct net_device *dev; + u32 rss_context; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, ETHTOOL_A_RSS_CONTEXT)) + return -EINVAL; + rss_context = nla_get_u32(tb[ETHTOOL_A_RSS_CONTEXT]); + + ret = ethnl_parse_header_dev_get(&req, tb[ETHTOOL_A_RSS_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + + dev = req.dev; + ops = dev->ethtool_ops; + + if (!ops->create_rxfh_context) + goto exit_free_dev; + + rtnl_lock(); + netdev_lock_ops(dev); + + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto exit_dev_unlock; + + mutex_lock(&dev->ethtool->rss_lock); + ret = ethtool_check_rss_ctx_busy(dev, rss_context); + if (ret) + goto exit_unlock; + + ctx = xa_load(&dev->ethtool->rss_ctx, rss_context); + if (!ctx) { + ret = -ENOENT; + goto exit_unlock; + } + + ret = ops->remove_rxfh_context(dev, ctx, rss_context, info->extack); + if (ret) + goto exit_unlock; + + WARN_ON(xa_erase(&dev->ethtool->rss_ctx, rss_context) != ctx); + kfree(ctx); + + ethnl_rss_delete_notify(dev, rss_context); + +exit_unlock: + mutex_unlock(&dev->ethtool->rss_lock); + ethnl_ops_complete(dev); +exit_dev_unlock: + netdev_unlock_ops(dev); + rtnl_unlock(); +exit_free_dev: + ethnl_parse_header_dev_put(&req); + return ret; +} -- cgit v1.2.3 From 1bbdb81a98363fd5cd0c2ac16ad5346bdf814dff Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Tue, 22 Jul 2025 12:13:29 +0300 Subject: devlink: Fix excessive stack usage in rate TC bandwidth parsing The devlink_nl_rate_tc_bw_parse function uses a large stack array for devlink attributes, which triggers a warning about excessive stack usage: net/devlink/rate.c: In function 'devlink_nl_rate_tc_bw_parse': net/devlink/rate.c:382:1: error: the frame size of 1648 bytes is larger than 1536 bytes [-Werror=frame-larger-than=] Introduce a separate attribute set specifically for rate TC bandwidth parsing that only contains the two attributes actually used: index and bandwidth. This reduces the stack array from DEVLINK_ATTR_MAX entries to just 2 entries, solving the stack usage issue. Update devlink selftest to use the new 'index' and 'bw' attribute names consistent with the YAML spec. 
Example usage with ynl with the new spec: ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \ --do rate-set --json '{ "bus-name": "pci", "dev-name": "0000:08:00.0", "port-index": 1, "rate-tc-bws": [ {"index": 0, "bw": 50}, {"index": 1, "bw": 50}, {"index": 2, "bw": 0}, {"index": 3, "bw": 0}, {"index": 4, "bw": 0}, {"index": 5, "bw": 0}, {"index": 6, "bw": 0}, {"index": 7, "bw": 0} ] }' ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \ --do rate-get --json '{ "bus-name": "pci", "dev-name": "0000:08:00.0", "port-index": 1 }' output for rate-get: {'bus-name': 'pci', 'dev-name': '0000:08:00.0', 'port-index': 1, 'rate-tc-bws': [{'bw': 50, 'index': 0}, {'bw': 50, 'index': 1}, {'bw': 0, 'index': 2}, {'bw': 0, 'index': 3}, {'bw': 0, 'index': 4}, {'bw': 0, 'index': 5}, {'bw': 0, 'index': 6}, {'bw': 0, 'index': 7}], 'rate-tx-max': 0, 'rate-tx-priority': 0, 'rate-tx-share': 0, 'rate-tx-weight': 0, 'rate-type': 'leaf'} Fixes: 566e8f108fc7 ("devlink: Extend devlink rate API with traffic classes bandwidth management") Reported-by: Arnd Bergmann Closes: https://lore.kernel.org/netdev/20250708160652.1810573-1-arnd@kernel.org/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202507171943.W7DJcs6Y-lkp@intel.com/ Suggested-by: Jakub Kicinski Signed-off-by: Carolina Jubran Tested-by: Carolina Jubran Signed-off-by: Tariq Toukan Reviewed-by: Jiri Pirko Link: https://patch.msgid.link/1753175609-330621-1-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/devlink.yaml | 26 +++++++++------------- include/uapi/linux/devlink.h | 11 +++++++-- net/devlink/netlink_gen.c | 6 ++--- net/devlink/netlink_gen.h | 2 +- net/devlink/rate.c | 20 ++++++++--------- .../selftests/drivers/net/hw/devlink_rate_tc_bw.py | 16 ++++++------- 6 files changed, 42 insertions(+), 39 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml index 1c4bb0cbe5f0..bb87111d5e16 100644 --- a/Documentation/netlink/specs/devlink.yaml +++ b/Documentation/netlink/specs/devlink.yaml @@ -853,18 +853,6 @@ attribute-sets: type: nest multi-attr: true nested-attributes: dl-rate-tc-bws - - - name: rate-tc-index - type: u8 - checks: - max: rate-tc-index-max - - - name: rate-tc-bw - type: u32 - doc: | - Specifies the bandwidth share assigned to the Traffic Class. - The bandwidth for the traffic class is determined - in proportion to the sum of the shares of all configured classes. - name: dl-dev-stats subset-of: devlink @@ -1271,12 +1259,20 @@ attribute-sets: type: flag - name: dl-rate-tc-bws - subset-of: devlink + name-prefix: devlink-rate-tc-attr- attributes: - - name: rate-tc-index + name: index + type: u8 + checks: + max: rate-tc-index-max - - name: rate-tc-bw + name: bw + type: u32 + doc: | + Specifies the bandwidth share assigned to the Traffic Class. + The bandwidth for the traffic class is determined + in proportion to the sum of the shares of all configured classes. 
operations: enum-model: directional diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index e72bcc239afd..9fcb25a0f447 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -635,8 +635,6 @@ enum devlink_attr { DEVLINK_ATTR_REGION_DIRECT, /* flag */ DEVLINK_ATTR_RATE_TC_BWS, /* nested */ - DEVLINK_ATTR_RATE_TC_INDEX, /* u8 */ - DEVLINK_ATTR_RATE_TC_BW, /* u32 */ /* Add new attributes above here, update the spec in * Documentation/netlink/specs/devlink.yaml and re-generate @@ -647,6 +645,15 @@ enum devlink_attr { DEVLINK_ATTR_MAX = __DEVLINK_ATTR_MAX - 1 }; +enum devlink_rate_tc_attr { + DEVLINK_RATE_TC_ATTR_UNSPEC, + DEVLINK_RATE_TC_ATTR_INDEX, /* u8 */ + DEVLINK_RATE_TC_ATTR_BW, /* u32 */ + + __DEVLINK_RATE_TC_ATTR_MAX, + DEVLINK_RATE_TC_ATTR_MAX = __DEVLINK_RATE_TC_ATTR_MAX - 1 +}; + /* Mapping between internal resource described by the field and system * structure */ diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c index c50436433c18..d97c326a9045 100644 --- a/net/devlink/netlink_gen.c +++ b/net/devlink/netlink_gen.c @@ -45,9 +45,9 @@ const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_ [DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(15), }; -const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1] = { - [DEVLINK_ATTR_RATE_TC_INDEX] = NLA_POLICY_MAX(NLA_U8, DEVLINK_RATE_TC_INDEX_MAX), - [DEVLINK_ATTR_RATE_TC_BW] = { .type = NLA_U32, }, +const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_RATE_TC_ATTR_BW + 1] = { + [DEVLINK_RATE_TC_ATTR_INDEX] = NLA_POLICY_MAX(NLA_U8, DEVLINK_RATE_TC_INDEX_MAX), + [DEVLINK_RATE_TC_ATTR_BW] = { .type = NLA_U32, }, }; const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1] = { diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h index fb733b5d4ff1..09cc6f264ccf 100644 --- a/net/devlink/netlink_gen.h +++ b/net/devlink/netlink_gen.h @@ -13,7 +13,7 @@ /* Common nested types */ extern const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1]; -extern const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1]; +extern const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_RATE_TC_ATTR_BW + 1]; extern const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1]; /* Ops table for devlink */ diff --git a/net/devlink/rate.c b/net/devlink/rate.c index d39300a9b3d4..110b3fa8a0b1 100644 --- a/net/devlink/rate.c +++ b/net/devlink/rate.c @@ -90,8 +90,8 @@ static int devlink_rate_put_tc_bws(struct sk_buff *msg, u32 *tc_bw) if (!nla_tc_bw) return -EMSGSIZE; - if (nla_put_u8(msg, DEVLINK_ATTR_RATE_TC_INDEX, i) || - nla_put_u32(msg, DEVLINK_ATTR_RATE_TC_BW, tc_bw[i])) + if (nla_put_u8(msg, DEVLINK_RATE_TC_ATTR_INDEX, i) || + nla_put_u32(msg, DEVLINK_RATE_TC_ATTR_BW, tc_bw[i])) goto nla_put_failure; nla_nest_end(msg, nla_tc_bw); @@ -346,26 +346,26 @@ static int devlink_nl_rate_tc_bw_parse(struct nlattr *parent_nest, u32 *tc_bw, unsigned long *bitmap, struct netlink_ext_ack *extack) { - struct nlattr *tb[DEVLINK_ATTR_MAX + 1]; + struct nlattr *tb[DEVLINK_RATE_TC_ATTR_MAX + 1]; u8 tc_index; int err; - err = nla_parse_nested(tb, DEVLINK_ATTR_MAX, parent_nest, + err = nla_parse_nested(tb, DEVLINK_RATE_TC_ATTR_MAX, parent_nest, devlink_dl_rate_tc_bws_nl_policy, extack); if (err) return err; - if (!tb[DEVLINK_ATTR_RATE_TC_INDEX]) { + if (!tb[DEVLINK_RATE_TC_ATTR_INDEX]) { 
NL_SET_ERR_ATTR_MISS(extack, parent_nest, - DEVLINK_ATTR_RATE_TC_INDEX); + DEVLINK_RATE_TC_ATTR_INDEX); return -EINVAL; } - tc_index = nla_get_u8(tb[DEVLINK_ATTR_RATE_TC_INDEX]); + tc_index = nla_get_u8(tb[DEVLINK_RATE_TC_ATTR_INDEX]); - if (!tb[DEVLINK_ATTR_RATE_TC_BW]) { + if (!tb[DEVLINK_RATE_TC_ATTR_BW]) { NL_SET_ERR_ATTR_MISS(extack, parent_nest, - DEVLINK_ATTR_RATE_TC_BW); + DEVLINK_RATE_TC_ATTR_BW); return -EINVAL; } @@ -376,7 +376,7 @@ static int devlink_nl_rate_tc_bw_parse(struct nlattr *parent_nest, u32 *tc_bw, return -EINVAL; } - tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_ATTR_RATE_TC_BW]); + tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_RATE_TC_ATTR_BW]); return 0; } diff --git a/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py b/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py index 820d8a03becc..835c357919a8 100755 --- a/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py +++ b/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py @@ -208,14 +208,14 @@ def setup_devlink_rate(cfg): "port-index": port_index, "rate-tx-max": 125000000, "rate-tc-bws": [ - {"rate-tc-index": 0, "rate-tc-bw": 0}, - {"rate-tc-index": 1, "rate-tc-bw": 0}, - {"rate-tc-index": 2, "rate-tc-bw": 0}, - {"rate-tc-index": 3, "rate-tc-bw": 20}, - {"rate-tc-index": 4, "rate-tc-bw": 80}, - {"rate-tc-index": 5, "rate-tc-bw": 0}, - {"rate-tc-index": 6, "rate-tc-bw": 0}, - {"rate-tc-index": 7, "rate-tc-bw": 0}, + {"index": 0, "bw": 0}, + {"index": 1, "bw": 0}, + {"index": 2, "bw": 0}, + {"index": 3, "bw": 20}, + {"index": 4, "bw": 80}, + {"index": 5, "bw": 0}, + {"index": 6, "bw": 0}, + {"index": 7, "bw": 0}, ] }) except NlError as exc: -- cgit v1.2.3 From 320d031ad6e4d67e8e1ab08ac71efda02bc85683 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Tue, 22 Jul 2025 11:59:10 +0200 Subject: sched: Struct definition and parsing of dualpi2 qdisc DualPI2 is the reference implementation of IETF RFC9332 DualQ Coupled AQM (https://datatracker.ietf.org/doc/html/rfc9332) providing two queues called low latency (L-queue) and classic (C-queue). By default, it enqueues non-ECN and ECT(0) packets into the C-queue and ECT(1) and CE packets into the low latency queue (L-queue), as per IETF RFC9332 spec. This patch defines the dualpi2 Qdisc structure and parsing, and the following two patches include dumping and enqueue/dequeue for the DualPI2. 
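For illustration, the default classification rule described above reduces to a two-bit mask test on the ECN codepoint. A hedged userspace sketch (not this series' enqueue path, which lands in a later patch), using the default mask TC_DUALPI2_ECN_MASK_L4S_ECT:

#include <assert.h>
#include <stdbool.h>

#define INET_ECN_MASK               3  /* low two bits of the TOS/TC field */
#define TC_DUALPI2_ECN_MASK_L4S_ECT 1  /* default ecn_mask */

/* ECT(1) (0b01) and CE (0b11) match the mask and select the L-queue;
 * Not-ECT (0b00) and ECT(0) (0b10) fall through to the C-queue.
 */
static bool selects_l_queue(unsigned char tos, unsigned char ecn_mask)
{
	return ((tos & INET_ECN_MASK) & ecn_mask) != 0;
}

int main(void)
{
	unsigned char mask = TC_DUALPI2_ECN_MASK_L4S_ECT;

	assert(!selects_l_queue(0x00, mask)); /* Not-ECT -> C-queue */
	assert(selects_l_queue(0x01, mask));  /* ECT(1)  -> L-queue */
	assert(!selects_l_queue(0x02, mask)); /* ECT(0)  -> C-queue */
	assert(selects_l_queue(0x03, mask));  /* CE      -> L-queue */
	return 0;
}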
Signed-off-by: Chia-Yu Chang Link: https://patch.msgid.link/20250722095915.24485-2-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_sched.h | 53 ++++ net/sched/sch_dualpi2.c | 591 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 644 insertions(+) create mode 100644 net/sched/sch_dualpi2.c (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 3e41349f3fa2..75d685ea8368 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1211,4 +1211,57 @@ enum { #define TCA_ETS_MAX (__TCA_ETS_MAX - 1) +/* DUALPI2 */ +enum tc_dualpi2_drop_overload { + TC_DUALPI2_DROP_OVERLOAD_OVERFLOW = 0, + TC_DUALPI2_DROP_OVERLOAD_DROP = 1, + __TCA_DUALPI2_DROP_OVERLOAD_MAX, +}; +#define TCA_DUALPI2_DROP_OVERLOAD_MAX (__TCA_DUALPI2_DROP_OVERLOAD_MAX - 1) + +enum tc_dualpi2_drop_early { + TC_DUALPI2_DROP_EARLY_DROP_DEQUEUE = 0, + TC_DUALPI2_DROP_EARLY_DROP_ENQUEUE = 1, + __TCA_DUALPI2_DROP_EARLY_MAX, +}; +#define TCA_DUALPI2_DROP_EARLY_MAX (__TCA_DUALPI2_DROP_EARLY_MAX - 1) + +enum tc_dualpi2_ecn_mask { + TC_DUALPI2_ECN_MASK_L4S_ECT = 1, + TC_DUALPI2_ECN_MASK_CLA_ECT = 2, + TC_DUALPI2_ECN_MASK_ANY_ECT = 3, + __TCA_DUALPI2_ECN_MASK_MAX, +}; +#define TCA_DUALPI2_ECN_MASK_MAX (__TCA_DUALPI2_ECN_MASK_MAX - 1) + +enum tc_dualpi2_split_gso { + TC_DUALPI2_SPLIT_GSO_NO_SPLIT_GSO = 0, + TC_DUALPI2_SPLIT_GSO_SPLIT_GSO = 1, + __TCA_DUALPI2_SPLIT_GSO_MAX, +}; +#define TCA_DUALPI2_SPLIT_GSO_MAX (__TCA_DUALPI2_SPLIT_GSO_MAX - 1) + +enum { + TCA_DUALPI2_UNSPEC, + TCA_DUALPI2_LIMIT, /* Packets */ + TCA_DUALPI2_MEMORY_LIMIT, /* Bytes */ + TCA_DUALPI2_TARGET, /* us */ + TCA_DUALPI2_TUPDATE, /* us */ + TCA_DUALPI2_ALPHA, /* Hz scaled up by 256 */ + TCA_DUALPI2_BETA, /* Hz scaled up by 256 */ + TCA_DUALPI2_STEP_THRESH_PKTS, /* Step threshold in packets */ + TCA_DUALPI2_STEP_THRESH_US, /* Step threshold in microseconds */ + TCA_DUALPI2_MIN_QLEN_STEP, /* Minimum qlen to apply STEP_THRESH */ + TCA_DUALPI2_COUPLING, /* Coupling factor between queues */ + TCA_DUALPI2_DROP_OVERLOAD, /* Whether to drop on overload */ + TCA_DUALPI2_DROP_EARLY, /* Whether to drop on enqueue */ + TCA_DUALPI2_C_PROTECTION, /* Percentage */ + TCA_DUALPI2_ECN_MASK, /* L4S queue classification mask */ + TCA_DUALPI2_SPLIT_GSO, /* Split GSO packets at enqueue */ + TCA_DUALPI2_PAD, + __TCA_DUALPI2_MAX +}; + +#define TCA_DUALPI2_MAX (__TCA_DUALPI2_MAX - 1) + #endif diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c new file mode 100644 index 000000000000..c11ec66786d4 --- /dev/null +++ b/net/sched/sch_dualpi2.c @@ -0,0 +1,591 @@ +// SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +/* Copyright (C) 2024 Nokia + * + * Author: Koen De Schepper + * Author: Olga Albisser + * Author: Henrik Steen + * Author: Olivier Tilmans + * Author: Chia-Yu Chang + * + * DualPI Improved with a Square (dualpi2): + * - Supports congestion controls that comply with the Prague requirements + * in RFC9331 (e.g. TCP-Prague) + * - Supports coupled dual-queue with PI2 as defined in RFC9332 + * - Supports ECN L4S-identifier (IP.ECN==0b*1) + * + * note: Although DCTCP and BBRv3 can use shallow-threshold ECN marks, + * they do not meet the 'Prague L4S Requirements' listed in RFC 9331 + * Section 4, so they can only be used with DualPI2 in a datacenter + * context. + * + * References: + * - RFC9332: https://datatracker.ietf.org/doc/html/rfc9332 + * - De Schepper, Koen, et al. "PI 2: A linearized AQM for both classic and + * scalable TCP." 
in proc. ACM CoNEXT'16, 2016. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* Using 32b probabilities enables support for flows with windows up to * ~8.6 * 1e9 packets, i.e., twice the maximal snd_cwnd. + * MAX_PROB must be consistent with the RNG in dualpi2_roll(). + */ +#define MAX_PROB U32_MAX + +/* alpha/beta values exchanged over netlink are in units of 256ns */ +#define ALPHA_BETA_SHIFT 8 + +/* Scaled values of alpha/beta must fit in 32b to avoid overflow in later + * computations. Consequently (see dualpi2_scale_alpha_beta()), their + * netlink-provided values can use at most 31b, i.e. be at most (2^31)-1 + * (~8.4MHz) as those are given in 1/256 Hz units. This enables tuning + * alpha/beta to control flows whose maximal RTTs can range from usecs up to + * a few secs. + */ +#define ALPHA_BETA_MAX ((1U << 31) - 1) + +/* Internal alpha/beta are in units of 64ns. + * This makes it possible to use all alpha/beta values in the allowed range + * without loss of precision due to rounding when scaling them internally, + * e.g., scale_alpha_beta(1) will not round down to 0. + */ +#define ALPHA_BETA_GRANULARITY 6 + +#define ALPHA_BETA_SCALING (ALPHA_BETA_SHIFT - ALPHA_BETA_GRANULARITY) + +/* We express the weights (wc, wl) in %, i.e., wc + wl = 100 */ +#define MAX_WC 100 + +struct dualpi2_sched_data { + struct Qdisc *l_queue; /* The L4S Low latency queue (L-queue) */ + struct Qdisc *sch; /* The Classic queue (C-queue) */ + + /* Registered tc filters */ + struct tcf_proto __rcu *tcf_filters; + struct tcf_block *tcf_block; + + /* PI2 parameters */ + u64 pi2_target; /* Target delay in nanoseconds */ + u32 pi2_tupdate; /* Timer frequency in nanoseconds */ + u32 pi2_prob; /* Base PI probability */ + u32 pi2_alpha; /* Gain factor for the integral rate response */ + u32 pi2_beta; /* Gain factor for the proportional response */ + struct hrtimer pi2_timer; /* prob update timer */ + + /* Step AQM (L-queue only) parameters */ + u32 step_thresh; /* Step threshold */ + bool step_in_packets; /* Step thresh in packets (1) or time (0) */ + + /* C-queue starvation protection */ + s32 c_protection_credit; /* Credit (sign indicates which queue) */ + s32 c_protection_init; /* Reset value of the credit */ + u8 c_protection_wc; /* C-queue weight (between 0 and MAX_WC) */ + u8 c_protection_wl; /* L-queue weight (MAX_WC - wc) */ + + /* General dualQ parameters */ + u32 memory_limit; /* Memory limit of both queues */ + u8 coupling_factor;/* Coupling factor (k) between both queues */ + u8 ecn_mask; /* Mask to match packets into L-queue */ + u32 min_qlen_step; /* Minimum queue length to apply step thresh */ + bool drop_early; /* Drop at enqueue (1) instead of dequeue (0) */ + bool drop_overload; /* Drop (1) on overload, or overflow (0) */ + bool split_gso; /* Split aggregated skb (1) or leave as is (0) */ + + /* Statistics */ + u64 c_head_ts; /* Enqueue timestamp of the C-queue head */ + u64 l_head_ts; /* Enqueue timestamp of the L-queue head */ + u64 last_qdelay; /* Q delay val at the last probability update */ + u32 packets_in_c; /* Enqueue packet counter of the C-queue */ + u32 packets_in_l; /* Enqueue packet counter of the L-queue */ + u32 maxq; /* Maximum queue size of the C-queue */ + u32 ecn_mark; /* ECN mark pkt counter due to PI probability */ + u32 step_marks; /* ECN mark pkt counter due to step AQM */ + u32 memory_used; /* Memory used of both queues */ + u32 max_memory_used;/* Maximum used memory */ +}; + +static u32 dualpi2_scale_alpha_beta(u32 param) +{ + u64 tmp = 
((u64)param * MAX_PROB >> ALPHA_BETA_SCALING); + + do_div(tmp, NSEC_PER_SEC); + return tmp; +} + +static ktime_t next_pi2_timeout(struct dualpi2_sched_data *q) +{ + return ktime_add_ns(ktime_get_ns(), q->pi2_tupdate); +} + +static void dualpi2_reset_c_protection(struct dualpi2_sched_data *q) +{ + q->c_protection_credit = q->c_protection_init; +} + +/* This computes the initial credit value and WRR weight for the L queue (wl) + * from the weight of the C queue (wc). + * If wl > wc, the scheduler will start with the L queue when reset. + */ +static void dualpi2_calculate_c_protection(struct Qdisc *sch, + struct dualpi2_sched_data *q, u32 wc) +{ + q->c_protection_wc = wc; + q->c_protection_wl = MAX_WC - wc; + q->c_protection_init = (s32)psched_mtu(qdisc_dev(sch)) * + ((int)q->c_protection_wc - (int)q->c_protection_wl); + dualpi2_reset_c_protection(q); +} + +static s64 __scale_delta(u64 diff) +{ + do_div(diff, 1 << ALPHA_BETA_GRANULARITY); + return diff; +} + +static void get_queue_delays(struct dualpi2_sched_data *q, u64 *qdelay_c, + u64 *qdelay_l) +{ + u64 now, qc, ql; + + now = ktime_get_ns(); + qc = q->c_head_ts; + ql = q->l_head_ts; + + *qdelay_c = qc ? now - qc : 0; + *qdelay_l = ql ? now - ql : 0; +} + +static u32 calculate_probability(struct Qdisc *sch) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + u32 new_prob; + u64 qdelay_c; + u64 qdelay_l; + u64 qdelay; + s64 delta; + + get_queue_delays(q, &qdelay_c, &qdelay_l); + qdelay = max(qdelay_l, qdelay_c); + + /* Alpha and beta take at most 32b, i.e, the delay difference would + * overflow for queuing delay differences > ~4.2sec. + */ + delta = ((s64)qdelay - (s64)q->pi2_target) * q->pi2_alpha; + delta += ((s64)qdelay - (s64)q->last_qdelay) * q->pi2_beta; + q->last_qdelay = qdelay; + + /* Bound new_prob between 0 and MAX_PROB */ + if (delta > 0) { + new_prob = __scale_delta(delta) + q->pi2_prob; + if (new_prob < q->pi2_prob) + new_prob = MAX_PROB; + } else { + new_prob = q->pi2_prob - __scale_delta(~delta + 1); + if (new_prob > q->pi2_prob) + new_prob = 0; + } + + /* If we do not drop on overload, ensure we cap the L4S probability to + * 100% to keep window fairness when overflowing. + */ + if (!q->drop_overload) + return min_t(u32, new_prob, MAX_PROB / q->coupling_factor); + return new_prob; +} + +static u32 get_memory_limit(struct Qdisc *sch, u32 limit) +{ + /* Apply rule of thumb, i.e., doubling the packet length, + * to further include per packet overhead in memory_limit. 
+ */ + u64 memlim = mul_u32_u32(limit, 2 * psched_mtu(qdisc_dev(sch))); + + if (upper_32_bits(memlim)) + return U32_MAX; + else + return lower_32_bits(memlim); +} + +static u32 convert_us_to_nsec(u32 us) +{ + u64 ns = mul_u32_u32(us, NSEC_PER_USEC); + + if (upper_32_bits(ns)) + return U32_MAX; + + return lower_32_bits(ns); +} + +static enum hrtimer_restart dualpi2_timer(struct hrtimer *timer) +{ + struct dualpi2_sched_data *q = timer_container_of(q, timer, pi2_timer); + struct Qdisc *sch = q->sch; + spinlock_t *root_lock; /* to lock qdisc for probability calculations */ + + rcu_read_lock(); + root_lock = qdisc_lock(qdisc_root_sleeping(sch)); + spin_lock(root_lock); + + q->pi2_prob = calculate_probability(sch); + hrtimer_set_expires(&q->pi2_timer, next_pi2_timeout(q)); + + spin_unlock(root_lock); + rcu_read_unlock(); + return HRTIMER_RESTART; +} + +static struct netlink_range_validation dualpi2_alpha_beta_range = { + .min = 1, + .max = ALPHA_BETA_MAX, +}; + +static const struct nla_policy dualpi2_policy[TCA_DUALPI2_MAX + 1] = { + [TCA_DUALPI2_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1), + [TCA_DUALPI2_MEMORY_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1), + [TCA_DUALPI2_TARGET] = { .type = NLA_U32 }, + [TCA_DUALPI2_TUPDATE] = NLA_POLICY_MIN(NLA_U32, 1), + [TCA_DUALPI2_ALPHA] = + NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range), + [TCA_DUALPI2_BETA] = + NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range), + [TCA_DUALPI2_STEP_THRESH_PKTS] = { .type = NLA_U32 }, + [TCA_DUALPI2_STEP_THRESH_US] = { .type = NLA_U32 }, + [TCA_DUALPI2_MIN_QLEN_STEP] = { .type = NLA_U32 }, + [TCA_DUALPI2_COUPLING] = NLA_POLICY_MIN(NLA_U8, 1), + [TCA_DUALPI2_DROP_OVERLOAD] = + NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_OVERLOAD_MAX), + [TCA_DUALPI2_DROP_EARLY] = + NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_EARLY_MAX), + [TCA_DUALPI2_C_PROTECTION] = + NLA_POLICY_RANGE(NLA_U8, 0, MAX_WC), + [TCA_DUALPI2_ECN_MASK] = + NLA_POLICY_RANGE(NLA_U8, TC_DUALPI2_ECN_MASK_L4S_ECT, + TCA_DUALPI2_ECN_MASK_MAX), + [TCA_DUALPI2_SPLIT_GSO] = + NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_SPLIT_GSO_MAX), +}; + +static int dualpi2_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_DUALPI2_MAX + 1]; + struct dualpi2_sched_data *q; + int old_backlog; + int old_qlen; + int err; + + if (!opt || !nla_len(opt)) { + NL_SET_ERR_MSG_MOD(extack, "Dualpi2 options are required"); + return -EINVAL; + } + err = nla_parse_nested(tb, TCA_DUALPI2_MAX, opt, dualpi2_policy, + extack); + if (err < 0) + return err; + if (tb[TCA_DUALPI2_STEP_THRESH_PKTS] && tb[TCA_DUALPI2_STEP_THRESH_US]) { + NL_SET_ERR_MSG_MOD(extack, "multiple step thresh attributes"); + return -EINVAL; + } + + q = qdisc_priv(sch); + sch_tree_lock(sch); + + if (tb[TCA_DUALPI2_LIMIT]) { + u32 limit = nla_get_u32(tb[TCA_DUALPI2_LIMIT]); + + sch->limit = limit; + q->memory_limit = get_memory_limit(sch, limit); + } + + if (tb[TCA_DUALPI2_MEMORY_LIMIT]) + q->memory_limit = nla_get_u32(tb[TCA_DUALPI2_MEMORY_LIMIT]); + + if (tb[TCA_DUALPI2_TARGET]) { + u64 target = nla_get_u32(tb[TCA_DUALPI2_TARGET]); + + q->pi2_target = target * NSEC_PER_USEC; + } + + if (tb[TCA_DUALPI2_TUPDATE]) { + u64 tupdate = nla_get_u32(tb[TCA_DUALPI2_TUPDATE]); + + q->pi2_tupdate = convert_us_to_nsec(tupdate); + } + + if (tb[TCA_DUALPI2_ALPHA]) { + u32 alpha = nla_get_u32(tb[TCA_DUALPI2_ALPHA]); + + q->pi2_alpha = dualpi2_scale_alpha_beta(alpha); + } + + if (tb[TCA_DUALPI2_BETA]) { + u32 beta = nla_get_u32(tb[TCA_DUALPI2_BETA]); + + q->pi2_beta = dualpi2_scale_alpha_beta(beta); + } + + 
if (tb[TCA_DUALPI2_STEP_THRESH_PKTS]) { + u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_PKTS]); + + q->step_in_packets = true; + q->step_thresh = step_th; + } else if (tb[TCA_DUALPI2_STEP_THRESH_US]) { + u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_US]); + + q->step_in_packets = false; + q->step_thresh = convert_us_to_nsec(step_th); + } + + if (tb[TCA_DUALPI2_MIN_QLEN_STEP]) + q->min_qlen_step = nla_get_u32(tb[TCA_DUALPI2_MIN_QLEN_STEP]); + + if (tb[TCA_DUALPI2_COUPLING]) { + u8 coupling = nla_get_u8(tb[TCA_DUALPI2_COUPLING]); + + q->coupling_factor = coupling; + } + + if (tb[TCA_DUALPI2_DROP_OVERLOAD]) { + u8 drop_overload = nla_get_u8(tb[TCA_DUALPI2_DROP_OVERLOAD]); + + q->drop_overload = (bool)drop_overload; + } + + if (tb[TCA_DUALPI2_DROP_EARLY]) { + u8 drop_early = nla_get_u8(tb[TCA_DUALPI2_DROP_EARLY]); + + q->drop_early = (bool)drop_early; + } + + if (tb[TCA_DUALPI2_C_PROTECTION]) { + u8 wc = nla_get_u8(tb[TCA_DUALPI2_C_PROTECTION]); + + dualpi2_calculate_c_protection(sch, q, wc); + } + + if (tb[TCA_DUALPI2_ECN_MASK]) { + u8 ecn_mask = nla_get_u8(tb[TCA_DUALPI2_ECN_MASK]); + + q->ecn_mask = ecn_mask; + } + + if (tb[TCA_DUALPI2_SPLIT_GSO]) { + u8 split_gso = nla_get_u8(tb[TCA_DUALPI2_SPLIT_GSO]); + + q->split_gso = (bool)split_gso; + } + + old_qlen = qdisc_qlen(sch); + old_backlog = sch->qstats.backlog; + while (qdisc_qlen(sch) > sch->limit || + q->memory_used > q->memory_limit) { + struct sk_buff *skb = qdisc_dequeue_internal(sch, true); + + q->memory_used -= skb->truesize; + qdisc_qstats_backlog_dec(sch, skb); + rtnl_qdisc_drop(skb, sch); + } + qdisc_tree_reduce_backlog(sch, old_qlen - qdisc_qlen(sch), + old_backlog - sch->qstats.backlog); + + sch_tree_unlock(sch); + return 0; +} + +/* Default alpha/beta values give a 10dB stability margin with max_rtt=100ms. 
*/ +static void dualpi2_reset_default(struct Qdisc *sch) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + + q->sch->limit = 10000; /* Max 125ms at 1Gbps */ + q->memory_limit = get_memory_limit(sch, q->sch->limit); + + q->pi2_target = 15 * NSEC_PER_MSEC; + q->pi2_tupdate = 16 * NSEC_PER_MSEC; + q->pi2_alpha = dualpi2_scale_alpha_beta(41); /* ~0.16 Hz * 256 */ + q->pi2_beta = dualpi2_scale_alpha_beta(819); /* ~3.20 Hz * 256 */ + + q->step_thresh = 1 * NSEC_PER_MSEC; + q->step_in_packets = false; + + dualpi2_calculate_c_protection(q->sch, q, 10); /* wc=10%, wl=90% */ + + q->ecn_mask = TC_DUALPI2_ECN_MASK_L4S_ECT; /* INET_ECN_ECT_1 */ + q->min_qlen_step = 0; /* Always apply step mark in L-queue */ + q->coupling_factor = 2; /* window fairness for equal RTTs */ + q->drop_overload = TC_DUALPI2_DROP_OVERLOAD_DROP; /* Drop overload */ + q->drop_early = TC_DUALPI2_DROP_EARLY_DROP_DEQUEUE; /* Drop dequeue */ + q->split_gso = TC_DUALPI2_SPLIT_GSO_SPLIT_GSO; /* Split GSO */ +} + +static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + int err; + + q->l_queue = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + TC_H_MAKE(sch->handle, 1), extack); + if (!q->l_queue) + return -ENOMEM; + + err = tcf_block_get(&q->tcf_block, &q->tcf_filters, sch, extack); + if (err) + return err; + + q->sch = sch; + dualpi2_reset_default(sch); + hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + + if (opt && nla_len(opt)) { + err = dualpi2_change(sch, opt, extack); + + if (err) + return err; + } + + hrtimer_start(&q->pi2_timer, next_pi2_timeout(q), + HRTIMER_MODE_ABS_PINNED); + return 0; +} + +/* Reset both L-queue and C-queue, internal packet counters, PI probability, + * C-queue protection credit, and timestamps, while preserving current + * configuration of DUALPI2. 
+ */ +static void dualpi2_reset(struct Qdisc *sch) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + + qdisc_reset_queue(sch); + qdisc_reset_queue(q->l_queue); + q->c_head_ts = 0; + q->l_head_ts = 0; + q->pi2_prob = 0; + q->packets_in_c = 0; + q->packets_in_l = 0; + q->maxq = 0; + q->ecn_mark = 0; + q->step_marks = 0; + q->memory_used = 0; + q->max_memory_used = 0; + dualpi2_reset_c_protection(q); +} + +static void dualpi2_destroy(struct Qdisc *sch) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + + q->pi2_tupdate = 0; + hrtimer_cancel(&q->pi2_timer); + if (q->l_queue) + qdisc_put(q->l_queue); + tcf_block_put(q->tcf_block); +} + +static struct Qdisc *dualpi2_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long dualpi2_find(struct Qdisc *sch, u32 classid) +{ + return 0; +} + +static unsigned long dualpi2_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return 0; +} + +static void dualpi2_unbind(struct Qdisc *q, unsigned long cl) +{ +} + +static struct tcf_block *dualpi2_tcf_block(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return q->tcf_block; +} + +static void dualpi2_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + unsigned int i; + + if (arg->stop) + return; + + /* We statically define only 2 queues */ + for (i = 0; i < 2; i++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, i + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +/* Minimal class support to handle tc filters */ +static const struct Qdisc_class_ops dualpi2_class_ops = { + .leaf = dualpi2_leaf, + .find = dualpi2_find, + .tcf_block = dualpi2_tcf_block, + .bind_tcf = dualpi2_bind, + .unbind_tcf = dualpi2_unbind, + .walk = dualpi2_walk, +}; + +static struct Qdisc_ops dualpi2_qdisc_ops __read_mostly = { + .id = "dualpi2", + .cl_ops = &dualpi2_class_ops, + .priv_size = sizeof(struct dualpi2_sched_data), + .peek = qdisc_peek_dequeued, + .init = dualpi2_init, + .destroy = dualpi2_destroy, + .reset = dualpi2_reset, + .change = dualpi2_change, + .owner = THIS_MODULE, +}; + +static int __init dualpi2_module_init(void) +{ + return register_qdisc(&dualpi2_qdisc_ops); +} + +static void __exit dualpi2_module_exit(void) +{ + unregister_qdisc(&dualpi2_qdisc_ops); +} + +module_init(dualpi2_module_init); +module_exit(dualpi2_module_exit); + +MODULE_DESCRIPTION("Dual Queue with Proportional Integral controller Improved with a Square (dualpi2) scheduler"); +MODULE_AUTHOR("Koen De Schepper "); +MODULE_AUTHOR("Chia-Yu Chang "); +MODULE_AUTHOR("Olga Albisser "); +MODULE_AUTHOR("Henrik Steen "); +MODULE_AUTHOR("Olivier Tilmans "); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION("1.0"); -- cgit v1.2.3 From d4de8bffbef4a7e4ad14b9fd2ff8e2d0e06b3fa5 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Tue, 22 Jul 2025 11:59:11 +0200 Subject: sched: Dump configuration and statistics of dualpi2 qdisc The configuration and statistics dump of the DualPI2 Qdisc provides information related to both queues, such as packet numbers and queuing delays in the L-queue and C-queue, as well as general information such as probability value, WRR credits, memory usage, packet marking counters, max queue size, etc. The following patch includes enqueue/dequeue for DualPI2. 
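Since gnet_stats_copy_app() copies tc_dualpi2_xstats verbatim into the stats attribute, a dumper such as tc can cast the payload straight back. A hedged consumer sketch (the struct is mirrored locally so the snippet is self-contained; real code would take it from <linux/pkt_sched.h> and would first locate and length-check the attribute):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors the tc_dualpi2_xstats layout added above; with this patch
 * applied it comes from <linux/pkt_sched.h> instead.
 */
struct tc_dualpi2_xstats {
	uint32_t prob, delay_c, delay_l;
	uint32_t packets_in_c, packets_in_l, maxq, ecn_mark, step_marks;
	int32_t  credit;
	uint32_t memory_used, max_memory_used, memory_limit;
};

/* The caller is assumed to have extracted the xstats payload from the
 * netlink stats attribute; the kernel emits the struct verbatim.
 */
static void print_dualpi2_xstats(const void *payload, size_t len)
{
	const struct tc_dualpi2_xstats *st = payload;

	if (len < sizeof(*st))
		return;
	printf("prob %u delay_c %uus delay_l %uus credit %d\n",
	       st->prob, st->delay_c, st->delay_l, st->credit);
	printf("pkts_in_c %u pkts_in_l %u ecn_mark %u step_marks %u\n",
	       st->packets_in_c, st->packets_in_l, st->ecn_mark,
	       st->step_marks);
}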
Signed-off-by: Chia-Yu Chang Link: https://patch.msgid.link/20250722095915.24485-3-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_sched.h | 15 ++++ net/sched/sch_dualpi2.c | 154 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 152 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 75d685ea8368..c2da76e78bad 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1264,4 +1264,19 @@ enum { #define TCA_DUALPI2_MAX (__TCA_DUALPI2_MAX - 1) +struct tc_dualpi2_xstats { + __u32 prob; /* current probability */ + __u32 delay_c; /* current delay in C queue */ + __u32 delay_l; /* current delay in L queue */ + __u32 packets_in_c; /* number of packets enqueued in C queue */ + __u32 packets_in_l; /* number of packets enqueued in L queue */ + __u32 maxq; /* maximum queue size */ + __u32 ecn_mark; /* packets marked with ecn*/ + __u32 step_marks; /* ECN marks due to the step AQM */ + __s32 credit; /* current c_protection credit */ + __u32 memory_used; /* Memory used by both queues */ + __u32 max_memory_used; /* Maximum used memory */ + __u32 memory_limit; /* Memory limit of both queues */ +}; + #endif diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c index c11ec66786d4..0a96d57c40d1 100644 --- a/net/sched/sch_dualpi2.c +++ b/net/sched/sch_dualpi2.c @@ -123,6 +123,14 @@ static u32 dualpi2_scale_alpha_beta(u32 param) return tmp; } +static u32 dualpi2_unscale_alpha_beta(u32 param) +{ + u64 tmp = ((u64)param * NSEC_PER_SEC << ALPHA_BETA_SCALING); + + do_div(tmp, MAX_PROB); + return tmp; +} + static ktime_t next_pi2_timeout(struct dualpi2_sched_data *q) { return ktime_add_ns(ktime_get_ns(), q->pi2_tupdate); @@ -227,6 +235,15 @@ static u32 convert_us_to_nsec(u32 us) return lower_32_bits(ns); } +static u32 convert_ns_to_usec(u64 ns) +{ + do_div(ns, NSEC_PER_USEC); + if (upper_32_bits(ns)) + return U32_MAX; + + return lower_32_bits(ns); +} + static enum hrtimer_restart dualpi2_timer(struct hrtimer *timer) { struct dualpi2_sched_data *q = timer_container_of(q, timer, pi2_timer); @@ -304,68 +321,70 @@ static int dualpi2_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_DUALPI2_LIMIT]) { u32 limit = nla_get_u32(tb[TCA_DUALPI2_LIMIT]); - sch->limit = limit; - q->memory_limit = get_memory_limit(sch, limit); + WRITE_ONCE(sch->limit, limit); + WRITE_ONCE(q->memory_limit, get_memory_limit(sch, limit)); } if (tb[TCA_DUALPI2_MEMORY_LIMIT]) - q->memory_limit = nla_get_u32(tb[TCA_DUALPI2_MEMORY_LIMIT]); + WRITE_ONCE(q->memory_limit, + nla_get_u32(tb[TCA_DUALPI2_MEMORY_LIMIT])); if (tb[TCA_DUALPI2_TARGET]) { u64 target = nla_get_u32(tb[TCA_DUALPI2_TARGET]); - q->pi2_target = target * NSEC_PER_USEC; + WRITE_ONCE(q->pi2_target, target * NSEC_PER_USEC); } if (tb[TCA_DUALPI2_TUPDATE]) { u64 tupdate = nla_get_u32(tb[TCA_DUALPI2_TUPDATE]); - q->pi2_tupdate = convert_us_to_nsec(tupdate); + WRITE_ONCE(q->pi2_tupdate, convert_us_to_nsec(tupdate)); } if (tb[TCA_DUALPI2_ALPHA]) { u32 alpha = nla_get_u32(tb[TCA_DUALPI2_ALPHA]); - q->pi2_alpha = dualpi2_scale_alpha_beta(alpha); + WRITE_ONCE(q->pi2_alpha, dualpi2_scale_alpha_beta(alpha)); } if (tb[TCA_DUALPI2_BETA]) { u32 beta = nla_get_u32(tb[TCA_DUALPI2_BETA]); - q->pi2_beta = dualpi2_scale_alpha_beta(beta); + WRITE_ONCE(q->pi2_beta, dualpi2_scale_alpha_beta(beta)); } if (tb[TCA_DUALPI2_STEP_THRESH_PKTS]) { u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_PKTS]); - q->step_in_packets = 
true; - q->step_thresh = step_th; + WRITE_ONCE(q->step_in_packets, true); + WRITE_ONCE(q->step_thresh, step_th); } else if (tb[TCA_DUALPI2_STEP_THRESH_US]) { u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_US]); - q->step_in_packets = false; - q->step_thresh = convert_us_to_nsec(step_th); + WRITE_ONCE(q->step_in_packets, false); + WRITE_ONCE(q->step_thresh, convert_us_to_nsec(step_th)); } if (tb[TCA_DUALPI2_MIN_QLEN_STEP]) - q->min_qlen_step = nla_get_u32(tb[TCA_DUALPI2_MIN_QLEN_STEP]); + WRITE_ONCE(q->min_qlen_step, + nla_get_u32(tb[TCA_DUALPI2_MIN_QLEN_STEP])); if (tb[TCA_DUALPI2_COUPLING]) { u8 coupling = nla_get_u8(tb[TCA_DUALPI2_COUPLING]); - q->coupling_factor = coupling; + WRITE_ONCE(q->coupling_factor, coupling); } if (tb[TCA_DUALPI2_DROP_OVERLOAD]) { u8 drop_overload = nla_get_u8(tb[TCA_DUALPI2_DROP_OVERLOAD]); - q->drop_overload = (bool)drop_overload; + WRITE_ONCE(q->drop_overload, (bool)drop_overload); } if (tb[TCA_DUALPI2_DROP_EARLY]) { u8 drop_early = nla_get_u8(tb[TCA_DUALPI2_DROP_EARLY]); - q->drop_early = (bool)drop_early; + WRITE_ONCE(q->drop_early, (bool)drop_early); } if (tb[TCA_DUALPI2_C_PROTECTION]) { @@ -377,13 +396,13 @@ static int dualpi2_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_DUALPI2_ECN_MASK]) { u8 ecn_mask = nla_get_u8(tb[TCA_DUALPI2_ECN_MASK]); - q->ecn_mask = ecn_mask; + WRITE_ONCE(q->ecn_mask, ecn_mask); } if (tb[TCA_DUALPI2_SPLIT_GSO]) { u8 split_gso = nla_get_u8(tb[TCA_DUALPI2_SPLIT_GSO]); - q->split_gso = (bool)split_gso; + WRITE_ONCE(q->split_gso, (bool)split_gso); } old_qlen = qdisc_qlen(sch); @@ -460,6 +479,105 @@ static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, return 0; } +static int dualpi2_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + bool step_in_pkts; + u32 step_th; + + step_in_pkts = READ_ONCE(q->step_in_packets); + step_th = READ_ONCE(q->step_thresh); + + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); + if (!opts) + goto nla_put_failure; + + if (step_in_pkts && + (nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) || + nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT, + READ_ONCE(q->memory_limit)) || + nla_put_u32(skb, TCA_DUALPI2_TARGET, + convert_ns_to_usec(READ_ONCE(q->pi2_target))) || + nla_put_u32(skb, TCA_DUALPI2_TUPDATE, + convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) || + nla_put_u32(skb, TCA_DUALPI2_ALPHA, + dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) || + nla_put_u32(skb, TCA_DUALPI2_BETA, + dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) || + nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_PKTS, step_th) || + nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP, + READ_ONCE(q->min_qlen_step)) || + nla_put_u8(skb, TCA_DUALPI2_COUPLING, + READ_ONCE(q->coupling_factor)) || + nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD, + READ_ONCE(q->drop_overload)) || + nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY, + READ_ONCE(q->drop_early)) || + nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION, + READ_ONCE(q->c_protection_wc)) || + nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) || + nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso)))) + goto nla_put_failure; + + if (!step_in_pkts && + (nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) || + nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT, + READ_ONCE(q->memory_limit)) || + nla_put_u32(skb, TCA_DUALPI2_TARGET, + convert_ns_to_usec(READ_ONCE(q->pi2_target))) || + nla_put_u32(skb, TCA_DUALPI2_TUPDATE, + convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) || + nla_put_u32(skb, 
TCA_DUALPI2_ALPHA, + dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) || + nla_put_u32(skb, TCA_DUALPI2_BETA, + dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) || + nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_US, + convert_ns_to_usec(step_th)) || + nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP, + READ_ONCE(q->min_qlen_step)) || + nla_put_u8(skb, TCA_DUALPI2_COUPLING, + READ_ONCE(q->coupling_factor)) || + nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD, + READ_ONCE(q->drop_overload)) || + nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY, + READ_ONCE(q->drop_early)) || + nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION, + READ_ONCE(q->c_protection_wc)) || + nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) || + nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso)))) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -1; +} + +static int dualpi2_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + struct tc_dualpi2_xstats st = { + .prob = q->pi2_prob, + .packets_in_c = q->packets_in_c, + .packets_in_l = q->packets_in_l, + .maxq = q->maxq, + .ecn_mark = q->ecn_mark, + .credit = q->c_protection_credit, + .step_marks = q->step_marks, + .memory_used = q->memory_used, + .max_memory_used = q->max_memory_used, + .memory_limit = q->memory_limit, + }; + u64 qc, ql; + + get_queue_delays(q, &qc, &ql); + st.delay_l = convert_ns_to_usec(ql); + st.delay_c = convert_ns_to_usec(qc); + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + /* Reset both L-queue and C-queue, internal packet counters, PI probability, * C-queue protection credit, and timestamps, while preserving current * configuration of DUALPI2. @@ -564,6 +682,8 @@ static struct Qdisc_ops dualpi2_qdisc_ops __read_mostly = { .destroy = dualpi2_destroy, .reset = dualpi2_reset, .change = dualpi2_change, + .dump = dualpi2_dump, + .dump_stats = dualpi2_dump_stats, .owner = THIS_MODULE, }; -- cgit v1.2.3 From eefb83790a0dda112d1755e4f5e213738d717e76 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Thu, 10 Jul 2025 15:13:53 -0400 Subject: misc: pci_endpoint_test: Add doorbell test case Add doorbell support with the help of three new registers: PCI_ENDPOINT_TEST_DB_BAR, PCI_ENDPOINT_TEST_DB_OFFSET, and PCI_ENDPOINT_TEST_DB_DATA. The testcase triggers the doorbell in the Endpoint by writing the value from the PCI_ENDPOINT_TEST_DB_DATA register to the offset provided by the PCI_ENDPOINT_TEST_DB_OFFSET register within the BAR indicated by the PCI_ENDPOINT_TEST_DB_BAR register, and then waits for the completion status from the Endpoint.
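From userspace the new test is reachable through the driver's existing character device; a minimal hedged sketch, assuming the usual /dev/pci-endpoint-test.0 node and the PCITEST_DOORBELL ioctl added to include/uapi/linux/pcitest.h by this patch:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/pcitest.h>  /* provides PCITEST_DOORBELL after this patch */

/* Assumes the misc device node created by pci_endpoint_test; a zero
 * ioctl return means the endpoint acknowledged the doorbell.
 */
int main(void)
{
	int fd = open("/dev/pci-endpoint-test.0", O_RDWR);
	int ret;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	ret = ioctl(fd, PCITEST_DOORBELL);
	printf("doorbell test: %s\n", ret ? "FAILED" : "OKAY");
	close(fd);
	return ret ? 1 : 0;
}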
Signed-off-by: Frank Li [mani: removed one spurious change and reworded the commit message] Signed-off-by: Manivannan Sadhasivam Signed-off-by: Bjorn Helgaas Tested-by: Niklas Cassel Link: https://patch.msgid.link/20250710-ep-msi-v21-7-57683fc7fb25@nxp.com --- drivers/misc/pci_endpoint_test.c | 83 ++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/pcitest.h | 1 + 2 files changed, 84 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c index c4e5e2c977be..1c156a3f845e 100644 --- a/drivers/misc/pci_endpoint_test.c +++ b/drivers/misc/pci_endpoint_test.c @@ -37,6 +37,8 @@ #define COMMAND_READ BIT(3) #define COMMAND_WRITE BIT(4) #define COMMAND_COPY BIT(5) +#define COMMAND_ENABLE_DOORBELL BIT(6) +#define COMMAND_DISABLE_DOORBELL BIT(7) #define PCI_ENDPOINT_TEST_STATUS 0x8 #define STATUS_READ_SUCCESS BIT(0) @@ -48,6 +50,11 @@ #define STATUS_IRQ_RAISED BIT(6) #define STATUS_SRC_ADDR_INVALID BIT(7) #define STATUS_DST_ADDR_INVALID BIT(8) +#define STATUS_DOORBELL_SUCCESS BIT(9) +#define STATUS_DOORBELL_ENABLE_SUCCESS BIT(10) +#define STATUS_DOORBELL_ENABLE_FAIL BIT(11) +#define STATUS_DOORBELL_DISABLE_SUCCESS BIT(12) +#define STATUS_DOORBELL_DISABLE_FAIL BIT(13) #define PCI_ENDPOINT_TEST_LOWER_SRC_ADDR 0x0c #define PCI_ENDPOINT_TEST_UPPER_SRC_ADDR 0x10 @@ -62,6 +69,7 @@ #define PCI_ENDPOINT_TEST_IRQ_NUMBER 0x28 #define PCI_ENDPOINT_TEST_FLAGS 0x2c + #define FLAG_USE_DMA BIT(0) #define PCI_ENDPOINT_TEST_CAPS 0x30 @@ -70,6 +78,10 @@ #define CAP_MSIX BIT(2) #define CAP_INTX BIT(3) +#define PCI_ENDPOINT_TEST_DB_BAR 0x34 +#define PCI_ENDPOINT_TEST_DB_OFFSET 0x38 +#define PCI_ENDPOINT_TEST_DB_DATA 0x3c + #define PCI_DEVICE_ID_TI_AM654 0xb00c #define PCI_DEVICE_ID_TI_J7200 0xb00f #define PCI_DEVICE_ID_TI_AM64 0xb010 @@ -100,6 +112,7 @@ enum pci_barno { BAR_3, BAR_4, BAR_5, + NO_BAR = -1, }; struct pci_endpoint_test { @@ -841,6 +854,73 @@ static int pci_endpoint_test_set_irq(struct pci_endpoint_test *test, return 0; } +static int pci_endpoint_test_doorbell(struct pci_endpoint_test *test) +{ + struct pci_dev *pdev = test->pdev; + struct device *dev = &pdev->dev; + int irq_type = test->irq_type; + enum pci_barno bar; + u32 data, status; + u32 addr; + int left; + + if (irq_type < PCITEST_IRQ_TYPE_INTX || + irq_type > PCITEST_IRQ_TYPE_MSIX) { + dev_err(dev, "Invalid IRQ type\n"); + return -EINVAL; + } + + pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_IRQ_TYPE, irq_type); + pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_IRQ_NUMBER, 1); + pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_COMMAND, + COMMAND_ENABLE_DOORBELL); + + left = wait_for_completion_timeout(&test->irq_raised, msecs_to_jiffies(1000)); + + status = pci_endpoint_test_readl(test, PCI_ENDPOINT_TEST_STATUS); + if (!left || (status & STATUS_DOORBELL_ENABLE_FAIL)) { + dev_err(dev, "Failed to enable doorbell\n"); + return -EINVAL; + } + + data = pci_endpoint_test_readl(test, PCI_ENDPOINT_TEST_DB_DATA); + addr = pci_endpoint_test_readl(test, PCI_ENDPOINT_TEST_DB_OFFSET); + bar = pci_endpoint_test_readl(test, PCI_ENDPOINT_TEST_DB_BAR); + + pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_IRQ_TYPE, irq_type); + pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_IRQ_NUMBER, 1); + + pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_STATUS, 0); + + bar = pci_endpoint_test_readl(test, PCI_ENDPOINT_TEST_DB_BAR); + + writel(data, test->bar[bar] + addr); + + left = wait_for_completion_timeout(&test->irq_raised, msecs_to_jiffies(1000)); + + status = 
pci_endpoint_test_readl(test, PCI_ENDPOINT_TEST_STATUS); + + if (!left || !(status & STATUS_DOORBELL_SUCCESS)) + dev_err(dev, "Failed to trigger doorbell in endpoint\n"); + + pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_COMMAND, + COMMAND_DISABLE_DOORBELL); + + wait_for_completion_timeout(&test->irq_raised, msecs_to_jiffies(1000)); + + status |= pci_endpoint_test_readl(test, PCI_ENDPOINT_TEST_STATUS); + + if (status & STATUS_DOORBELL_DISABLE_FAIL) { + dev_err(dev, "Failed to disable doorbell\n"); + return -EINVAL; + } + + if (!(status & STATUS_DOORBELL_SUCCESS)) + return -EINVAL; + + return 0; +} + static long pci_endpoint_test_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -891,6 +971,9 @@ static long pci_endpoint_test_ioctl(struct file *file, unsigned int cmd, case PCITEST_CLEAR_IRQ: ret = pci_endpoint_test_clear_irq(test); break; + case PCITEST_DOORBELL: + ret = pci_endpoint_test_doorbell(test); + break; } ret: diff --git a/include/uapi/linux/pcitest.h b/include/uapi/linux/pcitest.h index d3aa8715a525..d6023a45a9d0 100644 --- a/include/uapi/linux/pcitest.h +++ b/include/uapi/linux/pcitest.h @@ -21,6 +21,7 @@ #define PCITEST_SET_IRQTYPE _IOW('P', 0x8, int) #define PCITEST_GET_IRQTYPE _IO('P', 0x9) #define PCITEST_BARS _IO('P', 0xa) +#define PCITEST_DOORBELL _IO('P', 0xb) #define PCITEST_CLEAR_IRQ _IO('P', 0x10) #define PCITEST_IRQ_TYPE_UNDEFINED -1 -- cgit v1.2.3 From 8e7583a4f65f3dbf3e8deb4e60f3679c276bef62 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Wed, 23 Jul 2025 01:30:31 +0000 Subject: net: define an enum for the napi threaded state Instead of using '0' and '1' for napi threaded state use an enum with 'disabled' and 'enabled' states. Tested: ./tools/testing/selftests/net/nl_netdev.py TAP version 13 1..7 ok 1 nl_netdev.empty_check ok 2 nl_netdev.lo_check ok 3 nl_netdev.page_pool_check ok 4 nl_netdev.napi_list_check ok 5 nl_netdev.dev_set_threaded ok 6 nl_netdev.napi_set_threaded ok 7 nl_netdev.nsim_rxq_reset_down # Totals: pass:7 fail:0 xfail:0 xpass:0 skip:0 error:0 Signed-off-by: Samiullah Khawaja Link: https://patch.msgid.link/20250723013031.2911384-4-skhawaja@google.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 13 +++++--- .../networking/net_cachelines/net_device.rst | 2 +- include/linux/netdevice.h | 10 +++--- include/uapi/linux/netdev.h | 5 +++ net/core/dev.c | 12 +++++--- net/core/dev.h | 13 +++++--- net/core/dev_api.c | 3 +- net/core/netdev-genl-gen.c | 2 +- net/core/netdev-genl.c | 2 +- tools/include/uapi/linux/netdev.h | 5 +++ tools/testing/selftests/net/nl_netdev.py | 36 +++++++++++----------- 11 files changed, 62 insertions(+), 41 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 85d0ea6ac426..c035dc0f64fd 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -85,6 +85,10 @@ definitions: name: qstats-scope type: flags entries: [queue] + - + name: napi-threaded + type: enum + entries: [disabled, enabled] attribute-sets: - @@ -286,11 +290,10 @@ attribute-sets: - name: threaded doc: Whether the NAPI is configured to operate in threaded polling - mode. If this is set to 1 then the NAPI context operates in - threaded polling mode. - type: uint - checks: - max: 1 + mode. If this is set to enabled then the NAPI context operates + in threaded polling mode. 
+ type: u32 + enum: napi-threaded - name: xsk-info attributes: [] diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index 2d3dc4692d20..1c19bb7705df 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -68,6 +68,7 @@ unsigned_char addr_assign_type unsigned_char addr_len unsigned_char upper_level unsigned_char lower_level +u8 threaded napi_poll(napi_enable,netif_set_threaded) unsigned_short neigh_priv_len unsigned_short padded unsigned_short dev_id @@ -165,7 +166,6 @@ struct sfp_bus* sfp_bus struct lock_class_key* qdisc_tx_busylock bool proto_down unsigned:1 wol_enabled -unsigned:1 threaded napi_poll(napi_enable,netif_set_threaded) unsigned_long:1 see_all_hwtstamp_requests unsigned_long:1 change_proto_down unsigned_long:1 netns_immutable diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a97c9a337d6b..5e5de4b0a433 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -369,7 +369,7 @@ struct napi_config { u64 irq_suspend_timeout; u32 defer_hard_irqs; cpumask_t affinity_mask; - bool threaded; + u8 threaded; unsigned int napi_id; }; @@ -590,7 +590,8 @@ static inline bool napi_complete(struct napi_struct *n) } void netif_threaded_enable(struct net_device *dev); -int dev_set_threaded(struct net_device *dev, bool threaded); +int dev_set_threaded(struct net_device *dev, + enum netdev_napi_threaded threaded); void napi_disable(struct napi_struct *n); void napi_disable_locked(struct napi_struct *n); @@ -1872,6 +1873,7 @@ enum netdev_reg_state { * @addr_len: Hardware address length * @upper_level: Maximum depth level of upper devices. * @lower_level: Maximum depth level of lower devices. + * @threaded: napi threaded state. * @neigh_priv_len: Used in neigh_alloc() * @dev_id: Used to differentiate devices that share * the same link layer address @@ -2011,8 +2013,6 @@ enum netdev_reg_state { * switch driver and used to set the phys state of the * switch port. * - * @threaded: napi threaded mode is enabled - * * @irq_affinity_auto: driver wants the core to store and re-assign the IRQ * affinity. 
Set by netif_enable_irq_affinity(), then * the driver must create a persistent napi by @@ -2248,6 +2248,7 @@ struct net_device { unsigned char addr_len; unsigned char upper_level; unsigned char lower_level; + u8 threaded; unsigned short neigh_priv_len; unsigned short dev_id; @@ -2429,7 +2430,6 @@ struct net_device { struct sfp_bus *sfp_bus; struct lock_class_key *qdisc_tx_busylock; bool proto_down; - bool threaded; bool irq_affinity_auto; bool rx_cpu_rmap_auto; diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 1f3719a9a0eb..48eb49aa03d4 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -77,6 +77,11 @@ enum netdev_qstats_scope { NETDEV_QSTATS_SCOPE_QUEUE = 1, }; +enum netdev_napi_threaded { + NETDEV_NAPI_THREADED_DISABLED, + NETDEV_NAPI_THREADED_ENABLED, +}; + enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, diff --git a/net/core/dev.c b/net/core/dev.c index f28661d6f5ea..1c6e755841ce 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6963,7 +6963,8 @@ static void napi_stop_kthread(struct napi_struct *napi) napi->thread = NULL; } -int napi_set_threaded(struct napi_struct *napi, bool threaded) +int napi_set_threaded(struct napi_struct *napi, + enum netdev_napi_threaded threaded) { if (threaded) { if (!napi->thread) { @@ -6988,7 +6989,8 @@ int napi_set_threaded(struct napi_struct *napi, bool threaded) return 0; } -int netif_set_threaded(struct net_device *dev, bool threaded) +int netif_set_threaded(struct net_device *dev, + enum netdev_napi_threaded threaded) { struct napi_struct *napi; int err = 0; @@ -7000,7 +7002,7 @@ int netif_set_threaded(struct net_device *dev, bool threaded) if (!napi->thread) { err = napi_kthread_create(napi); if (err) { - threaded = false; + threaded = NETDEV_NAPI_THREADED_DISABLED; break; } } @@ -7043,7 +7045,7 @@ int netif_set_threaded(struct net_device *dev, bool threaded) */ void netif_threaded_enable(struct net_device *dev) { - WARN_ON_ONCE(netif_set_threaded(dev, true)); + WARN_ON_ONCE(netif_set_threaded(dev, NETDEV_NAPI_THREADED_ENABLED)); } EXPORT_SYMBOL(netif_threaded_enable); @@ -7360,7 +7362,7 @@ void netif_napi_add_weight_locked(struct net_device *dev, * threaded mode will not be enabled in napi_enable(). 
*/ if (dev->threaded && napi_kthread_create(napi)) - dev->threaded = false; + dev->threaded = NETDEV_NAPI_THREADED_DISABLED; netif_napi_set_irq_locked(napi, -1); } EXPORT_SYMBOL(netif_napi_add_weight_locked); diff --git a/net/core/dev.h b/net/core/dev.h index f5b567310908..ab69edc0c3e3 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -315,14 +315,19 @@ static inline void napi_set_irq_suspend_timeout(struct napi_struct *n, WRITE_ONCE(n->irq_suspend_timeout, timeout); } -static inline bool napi_get_threaded(struct napi_struct *n) +static inline enum netdev_napi_threaded napi_get_threaded(struct napi_struct *n) { - return test_bit(NAPI_STATE_THREADED, &n->state); + if (test_bit(NAPI_STATE_THREADED, &n->state)) + return NETDEV_NAPI_THREADED_ENABLED; + + return NETDEV_NAPI_THREADED_DISABLED; } -int napi_set_threaded(struct napi_struct *n, bool threaded); +int napi_set_threaded(struct napi_struct *n, + enum netdev_napi_threaded threaded); -int netif_set_threaded(struct net_device *dev, bool threaded); +int netif_set_threaded(struct net_device *dev, + enum netdev_napi_threaded threaded); int rps_cpumask_housekeeping(struct cpumask *mask); diff --git a/net/core/dev_api.c b/net/core/dev_api.c index dd7f57013ce5..f28852078aa6 100644 --- a/net/core/dev_api.c +++ b/net/core/dev_api.c @@ -368,7 +368,8 @@ void netdev_state_change(struct net_device *dev) } EXPORT_SYMBOL(netdev_state_change); -int dev_set_threaded(struct net_device *dev, bool threaded) +int dev_set_threaded(struct net_device *dev, + enum netdev_napi_threaded threaded) { int ret; diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 0994bd68a7e6..e9a2a6f26cb7 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -97,7 +97,7 @@ static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_THREADED [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range), [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, }, [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, }, - [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_UINT, 1), + [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 1), }; /* NETDEV_CMD_BIND_TX - do */ diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 5875df372415..6314eb7bdf69 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -333,7 +333,7 @@ netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) int ret; threaded = nla_get_uint(info->attrs[NETDEV_A_NAPI_THREADED]); - ret = napi_set_threaded(napi, !!threaded); + ret = napi_set_threaded(napi, threaded); if (ret) return ret; } diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 1f3719a9a0eb..48eb49aa03d4 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -77,6 +77,11 @@ enum netdev_qstats_scope { NETDEV_QSTATS_SCOPE_QUEUE = 1, }; +enum netdev_napi_threaded { + NETDEV_NAPI_THREADED_DISABLED, + NETDEV_NAPI_THREADED_ENABLED, +}; + enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, diff --git a/tools/testing/selftests/net/nl_netdev.py b/tools/testing/selftests/net/nl_netdev.py index c8ffade79a52..5c66421ab8aa 100755 --- a/tools/testing/selftests/net/nl_netdev.py +++ b/tools/testing/selftests/net/nl_netdev.py @@ -52,14 +52,14 @@ def napi_set_threaded(nf) -> None: napi1_id = napis[1]['id'] # set napi threaded and verify - nf.napi_set({'id': napi0_id, 'threaded': 1}) + nf.napi_set({'id': napi0_id, 'threaded': "enabled"}) napi0 = nf.napi_get({'id': 
napi0_id}) - ksft_eq(napi0['threaded'], 1) + ksft_eq(napi0['threaded'], "enabled") ksft_ne(napi0.get('pid'), None) # check it is not set for napi1 napi1 = nf.napi_get({'id': napi1_id}) - ksft_eq(napi1['threaded'], 0) + ksft_eq(napi1['threaded'], "disabled") ksft_eq(napi1.get('pid'), None) ip(f"link set dev {nsim.ifname} down") @@ -67,18 +67,18 @@ def napi_set_threaded(nf) -> None: # verify if napi threaded is still set napi0 = nf.napi_get({'id': napi0_id}) - ksft_eq(napi0['threaded'], 1) + ksft_eq(napi0['threaded'], "enabled") ksft_ne(napi0.get('pid'), None) # check it is still not set for napi1 napi1 = nf.napi_get({'id': napi1_id}) - ksft_eq(napi1['threaded'], 0) + ksft_eq(napi1['threaded'], "disabled") ksft_eq(napi1.get('pid'), None) # unset napi threaded and verify - nf.napi_set({'id': napi0_id, 'threaded': 0}) + nf.napi_set({'id': napi0_id, 'threaded': "disabled"}) napi0 = nf.napi_get({'id': napi0_id}) - ksft_eq(napi0['threaded'], 0) + ksft_eq(napi0['threaded'], "disabled") ksft_eq(napi0.get('pid'), None) # set threaded at device level @@ -86,10 +86,10 @@ def napi_set_threaded(nf) -> None: # check napi threaded is set for both napis napi0 = nf.napi_get({'id': napi0_id}) - ksft_eq(napi0['threaded'], 1) + ksft_eq(napi0['threaded'], "enabled") ksft_ne(napi0.get('pid'), None) napi1 = nf.napi_get({'id': napi1_id}) - ksft_eq(napi1['threaded'], 1) + ksft_eq(napi1['threaded'], "enabled") ksft_ne(napi1.get('pid'), None) # unset threaded at device level @@ -97,16 +97,16 @@ def napi_set_threaded(nf) -> None: # check napi threaded is unset for both napis napi0 = nf.napi_get({'id': napi0_id}) - ksft_eq(napi0['threaded'], 0) + ksft_eq(napi0['threaded'], "disabled") ksft_eq(napi0.get('pid'), None) napi1 = nf.napi_get({'id': napi1_id}) - ksft_eq(napi1['threaded'], 0) + ksft_eq(napi1['threaded'], "disabled") ksft_eq(napi1.get('pid'), None) # set napi threaded for napi0 nf.napi_set({'id': napi0_id, 'threaded': 1}) napi0 = nf.napi_get({'id': napi0_id}) - ksft_eq(napi0['threaded'], 1) + ksft_eq(napi0['threaded'], "enabled") ksft_ne(napi0.get('pid'), None) # unset threaded at device level @@ -114,10 +114,10 @@ def napi_set_threaded(nf) -> None: # check napi threaded is unset for both napis napi0 = nf.napi_get({'id': napi0_id}) - ksft_eq(napi0['threaded'], 0) + ksft_eq(napi0['threaded'], "disabled") ksft_eq(napi0.get('pid'), None) napi1 = nf.napi_get({'id': napi1_id}) - ksft_eq(napi1['threaded'], 0) + ksft_eq(napi1['threaded'], "disabled") ksft_eq(napi1.get('pid'), None) def dev_set_threaded(nf) -> None: @@ -141,10 +141,10 @@ def dev_set_threaded(nf) -> None: # check napi threaded is set for both napis napi0 = nf.napi_get({'id': napi0_id}) - ksft_eq(napi0['threaded'], 1) + ksft_eq(napi0['threaded'], "enabled") ksft_ne(napi0.get('pid'), None) napi1 = nf.napi_get({'id': napi1_id}) - ksft_eq(napi1['threaded'], 1) + ksft_eq(napi1['threaded'], "enabled") ksft_ne(napi1.get('pid'), None) # unset threaded @@ -152,10 +152,10 @@ def dev_set_threaded(nf) -> None: # check napi threaded is unset for both napis napi0 = nf.napi_get({'id': napi0_id}) - ksft_eq(napi0['threaded'], 0) + ksft_eq(napi0['threaded'], "disabled") ksft_eq(napi0.get('pid'), None) napi1 = nf.napi_get({'id': napi1_id}) - ksft_eq(napi1['threaded'], 0) + ksft_eq(napi1['threaded'], "disabled") ksft_eq(napi1.get('pid'), None) def nsim_rxq_reset_down(nf) -> None: -- cgit v1.2.3 From bc8c43adfdc57c8253884fc1853cb6679cd5953d Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 8 Jul 2025 15:04:02 +0200 Subject: netfilter: nfnetlink_hook: Dump flowtable info 
Introduce NFNL_HOOK_TYPE_NFT_FLOWTABLE to distinguish flowtable hooks from base chain ones. Nested attributes are shared with the old NFTABLES hook info type since they fit apart from their misleading name. Old nftables in user space will ignore this new hook type and thus continue to print flowtable hooks just like before, e.g.: | family netdev { | hook ingress device test0 { | 0000000000 nf_flow_offload_ip_hook [nf_flow_table] | } | } With this patch in place and support for the new hook info type, output becomes more useful: | family netdev { | hook ingress device test0 { | 0000000000 flowtable ip mytable myft [nf_flow_table] | } | } Suggested-by: Florian Westphal Signed-off-by: Phil Sutter Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 1 + include/uapi/linux/netfilter/nfnetlink_hook.h | 2 ++ net/netfilter/nf_tables_api.c | 24 +++++++++--------- net/netfilter/nfnetlink_hook.c | 35 +++++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 5f896fcc074d..efbbfa770d66 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -92,6 +92,7 @@ enum nf_hook_ops_type { NF_HOOK_OP_UNDEFINED, NF_HOOK_OP_NF_TABLES, NF_HOOK_OP_BPF, + NF_HOOK_OP_NFT_FT, }; struct nf_hook_ops { diff --git a/include/uapi/linux/netfilter/nfnetlink_hook.h b/include/uapi/linux/netfilter/nfnetlink_hook.h index 84a561a74b98..1a2c4d6424b5 100644 --- a/include/uapi/linux/netfilter/nfnetlink_hook.h +++ b/include/uapi/linux/netfilter/nfnetlink_hook.h @@ -61,10 +61,12 @@ enum nfnl_hook_chain_desc_attributes { * * @NFNL_HOOK_TYPE_NFTABLES: nf_tables base chain * @NFNL_HOOK_TYPE_BPF: bpf program + * @NFNL_HOOK_TYPE_NFT_FLOWTABLE: nf_tables flowtable */ enum nfnl_hook_chaintype { NFNL_HOOK_TYPE_NFTABLES = 0x1, NFNL_HOOK_TYPE_BPF, + NFNL_HOOK_TYPE_NFT_FLOWTABLE, }; /** diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 04795af6e586..13d0ed9d1895 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8895,11 +8895,12 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, list_for_each_entry(hook, &flowtable_hook->list, list) { list_for_each_entry(ops, &hook->ops_list, list) { - ops->pf = NFPROTO_NETDEV; - ops->hooknum = flowtable_hook->num; - ops->priority = flowtable_hook->priority; - ops->priv = &flowtable->data; - ops->hook = flowtable->data.type->hook; + ops->pf = NFPROTO_NETDEV; + ops->hooknum = flowtable_hook->num; + ops->priority = flowtable_hook->priority; + ops->priv = &flowtable->data; + ops->hook = flowtable->data.type->hook; + ops->hook_ops_type = NF_HOOK_OP_NFT_FT; } } @@ -9727,12 +9728,13 @@ static int nft_flowtable_event(unsigned long event, struct net_device *dev, if (!ops) return 1; - ops->pf = NFPROTO_NETDEV; - ops->hooknum = flowtable->hooknum; - ops->priority = flowtable->data.priority; - ops->priv = &flowtable->data; - ops->hook = flowtable->data.type->hook; - ops->dev = dev; + ops->pf = NFPROTO_NETDEV; + ops->hooknum = flowtable->hooknum; + ops->priority = flowtable->data.priority; + ops->priv = &flowtable->data; + ops->hook = flowtable->data.type->hook; + ops->hook_ops_type = NF_HOOK_OP_NFT_FT; + ops->dev = dev; if (nft_register_flowtable_ops(dev_net(dev), flowtable, ops)) { kfree(ops); diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c index cd4056527ede..92d869317cba 100644 --- a/net/netfilter/nfnetlink_hook.c +++ 
b/net/netfilter/nfnetlink_hook.c @@ -156,6 +156,38 @@ static int nfnl_hook_put_nft_chain_info(struct sk_buff *nlskb, return 0; } +static int nfnl_hook_put_nft_ft_info(struct sk_buff *nlskb, + const struct nfnl_dump_hook_data *ctx, + unsigned int seq, + struct nf_flowtable *nf_ft) +{ + struct nft_flowtable *ft = + container_of(nf_ft, struct nft_flowtable, data); + struct net *net = sock_net(nlskb->sk); + struct nlattr *nest; + int ret = 0; + + if (WARN_ON_ONCE(!nf_ft)) + return 0; + + if (!nft_is_active(net, ft)) + return 0; + + nest = nfnl_start_info_type(nlskb, NFNL_HOOK_TYPE_NFT_FLOWTABLE); + if (!nest) + return -EMSGSIZE; + + ret = nfnl_hook_put_nft_info_desc(nlskb, ft->table->name, + ft->name, ft->table->family); + if (ret) { + nla_nest_cancel(nlskb, nest); + return ret; + } + + nla_nest_end(nlskb, nest); + return 0; +} + static int nfnl_hook_dump_one(struct sk_buff *nlskb, const struct nfnl_dump_hook_data *ctx, const struct nf_hook_ops *ops, @@ -223,6 +255,9 @@ static int nfnl_hook_dump_one(struct sk_buff *nlskb, case NF_HOOK_OP_BPF: ret = nfnl_hook_put_bpf_prog_info(nlskb, ctx, seq, ops->priv); break; + case NF_HOOK_OP_NFT_FT: + ret = nfnl_hook_put_nft_ft_info(nlskb, ctx, seq, ops->priv); + break; case NF_HOOK_OP_UNDEFINED: break; default: -- cgit v1.2.3 From f24987ef6959a7efaf79bffd265522c3df18d431 Mon Sep 17 00:00:00 2001 From: Gabriel Goller Date: Tue, 22 Jul 2025 10:18:45 +0200 Subject: ipv6: add `force_forwarding` sysctl to enable per-interface forwarding It is currently impossible to enable ipv6 forwarding on a per-interface basis like in ipv4. To enable forwarding on an ipv6 interface we need to enable it on all interfaces and disable it on the other interfaces using a netfilter rule. This is especially cumbersome if you have lots of interfaces and only want to enable forwarding on a few. According to the sysctl docs [0] the `net.ipv6.conf.all.forwarding` enables forwarding for all interfaces, while the interface-specific `net.ipv6.conf.<interface>.forwarding` configures the interface Host/Router configuration. Introduce a new sysctl flag `force_forwarding`, which can be set on every interface. The ip6_forward() function will then check if the global forwarding flag OR the force_forwarding flag is active and forward the packet. To preserve backwards-compatibility, reset the flag (on all interfaces) to 0 if the net.ipv6.conf.all.forwarding flag is set to 0. Add a short selftest that checks if a packet gets forwarded with and without `force_forwarding`.
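[Editor's note: for illustration, a minimal userspace sketch of the workflow this enables: leave the global switch off and force forwarding on one interface. The interface name eth0 and the direct /proc/sys writes are assumptions for the example; the selftest below exercises the same flow with sysctl(8).]

#include <stdio.h>

/* Write a value to a sysctl file under /proc/sys; returns 0 on success.
 * Needs CAP_NET_ADMIN (or root) in the target network namespace. */
static int write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	if (fputs(val, f) == EOF) {
		fclose(f);
		return -1;
	}
	return fclose(f);
}

int main(void)
{
	/* Keep the global switch off ... */
	if (write_sysctl("/proc/sys/net/ipv6/conf/all/forwarding", "0"))
		return 1;
	/* ... and force forwarding on a single interface
	 * ("eth0" is a placeholder name). */
	if (write_sysctl("/proc/sys/net/ipv6/conf/eth0/force_forwarding", "1"))
		return 1;
	return 0;
}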
[0]: https://www.kernel.org/doc/Documentation/networking/ip-sysctl.txt Acked-by: Nicolas Dichtel Signed-off-by: Gabriel Goller Link: https://patch.msgid.link/20250722081847.132632-1-g.goller@proxmox.com Signed-off-by: Jakub Kicinski --- Documentation/networking/ip-sysctl.rst | 8 +- include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + include/uapi/linux/netconf.h | 1 + include/uapi/linux/sysctl.h | 1 + net/ipv6/addrconf.c | 82 ++++++++++++++++ net/ipv6/ip6_output.c | 3 +- tools/testing/selftests/net/Makefile | 1 + .../testing/selftests/net/ipv6_force_forwarding.sh | 105 +++++++++++++++++++++ 9 files changed, 200 insertions(+), 3 deletions(-) create mode 100755 tools/testing/selftests/net/ipv6_force_forwarding.sh (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 14700ea77e75..bb620f554598 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -2543,8 +2543,8 @@ conf/all/disable_ipv6 - BOOLEAN conf/all/forwarding - BOOLEAN Enable global IPv6 forwarding between all interfaces. - IPv4 and IPv6 work differently here; e.g. netfilter must be used - to control which interfaces may forward packets and which not. + IPv4 and IPv6 work differently here; the ``force_forwarding`` flag must + be used to control which interfaces may forward packets. This also sets all interfaces' Host/Router setting 'forwarding' to the specified value. See below for details. @@ -2561,6 +2561,10 @@ proxy_ndp - BOOLEAN Default: 0 (disabled) +force_forwarding - BOOLEAN + Enable forwarding on this interface only -- regardless of the setting on + ``conf/all/forwarding``. When setting ``conf.all.forwarding`` to 0, + the ``force_forwarding`` flag will be reset on all interfaces. 
fwmark_reflect - BOOLEAN Controls the fwmark of kernel-generated IPv6 reply packets that are not diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index db0eb0d86b64..bc6ec2959173 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -17,6 +17,7 @@ struct ipv6_devconf { __s32 hop_limit; __s32 mtu6; __s32 forwarding; + __s32 force_forwarding; __s32 disable_policy; __s32 proxy_ndp; __cacheline_group_end(ipv6_devconf_read_txrx); diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index cf592d7b630f..d4d3ae774b26 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -199,6 +199,7 @@ enum { DEVCONF_NDISC_EVICT_NOCARRIER, DEVCONF_ACCEPT_UNTRACKED_NA, DEVCONF_ACCEPT_RA_MIN_LFT, + DEVCONF_FORCE_FORWARDING, DEVCONF_MAX }; diff --git a/include/uapi/linux/netconf.h b/include/uapi/linux/netconf.h index fac4edd55379..1c8c84d65ae3 100644 --- a/include/uapi/linux/netconf.h +++ b/include/uapi/linux/netconf.h @@ -19,6 +19,7 @@ enum { NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, NETCONFA_INPUT, NETCONFA_BC_FORWARDING, + NETCONFA_FORCE_FORWARDING, __NETCONFA_MAX }; #define NETCONFA_MAX (__NETCONFA_MAX - 1) diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 8981f00204db..63d1464cb71c 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -573,6 +573,7 @@ enum { NET_IPV6_ACCEPT_RA_FROM_LOCAL=26, NET_IPV6_ACCEPT_RA_RT_INFO_MIN_PLEN=27, NET_IPV6_RA_DEFRTR_METRIC=28, + NET_IPV6_FORCE_FORWARDING=29, __NET_IPV6_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4f1d7d110302..81a067a2e526 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -239,6 +239,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .ndisc_evict_nocarrier = 1, .ra_honor_pio_life = 0, .ra_honor_pio_pflag = 0, + .force_forwarding = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -303,6 +304,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .ndisc_evict_nocarrier = 1, .ra_honor_pio_life = 0, .ra_honor_pio_pflag = 0, + .force_forwarding = 0, }; /* Check if link is ready: is it up and is a valid qdisc available */ @@ -857,6 +859,9 @@ static void addrconf_forward_change(struct net *net, __s32 newf) idev = __in6_dev_get_rtnl_net(dev); if (idev) { int changed = (!idev->cnf.forwarding) ^ (!newf); + /* Disabling all.forwarding sets 0 to force_forwarding for all interfaces */ + if (newf == 0) + WRITE_ONCE(idev->cnf.force_forwarding, 0); WRITE_ONCE(idev->cnf.forwarding, newf); if (changed) @@ -5710,6 +5715,7 @@ static void ipv6_store_devconf(const struct ipv6_devconf *cnf, array[DEVCONF_ACCEPT_UNTRACKED_NA] = READ_ONCE(cnf->accept_untracked_na); array[DEVCONF_ACCEPT_RA_MIN_LFT] = READ_ONCE(cnf->accept_ra_min_lft); + array[DEVCONF_FORCE_FORWARDING] = READ_ONCE(cnf->force_forwarding); } static inline size_t inet6_ifla6_size(void) @@ -6738,6 +6744,75 @@ static int addrconf_sysctl_disable_policy(const struct ctl_table *ctl, int write return ret; } +static void addrconf_force_forward_change(struct net *net, __s32 newf) +{ + struct net_device *dev; + struct inet6_dev *idev; + + for_each_netdev(net, dev) { + idev = __in6_dev_get_rtnl_net(dev); + if (idev) { + int changed = (!idev->cnf.force_forwarding) ^ (!newf); + + WRITE_ONCE(idev->cnf.force_forwarding, newf); + if (changed) + inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, + NETCONFA_FORCE_FORWARDING, + dev->ifindex, &idev->cnf); + } + } +} + +static int addrconf_sysctl_force_forwarding(const struct ctl_table *ctl, int write, + 
void *buffer, size_t *lenp, loff_t *ppos) +{ + struct inet6_dev *idev = ctl->extra1; + struct ctl_table tmp_ctl = *ctl; + struct net *net = ctl->extra2; + int *valp = ctl->data; + int new_val = *valp; + int old_val = *valp; + loff_t pos = *ppos; + int ret; + + tmp_ctl.extra1 = SYSCTL_ZERO; + tmp_ctl.extra2 = SYSCTL_ONE; + tmp_ctl.data = &new_val; + + ret = proc_douintvec_minmax(&tmp_ctl, write, buffer, lenp, ppos); + + if (write && old_val != new_val) { + if (!rtnl_net_trylock(net)) + return restart_syscall(); + + WRITE_ONCE(*valp, new_val); + + if (valp == &net->ipv6.devconf_dflt->force_forwarding) { + inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORCE_FORWARDING, + NETCONFA_IFINDEX_DEFAULT, + net->ipv6.devconf_dflt); + } else if (valp == &net->ipv6.devconf_all->force_forwarding) { + inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORCE_FORWARDING, + NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all); + + addrconf_force_forward_change(net, new_val); + } else { + inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORCE_FORWARDING, + idev->dev->ifindex, + &idev->cnf); + } + rtnl_net_unlock(net); + } + + if (ret) + *ppos = pos; + return ret; +} + static int minus_one = -1; static const int two_five_five = 255; static u32 ioam6_if_id_max = U16_MAX; @@ -7208,6 +7283,13 @@ static const struct ctl_table addrconf_sysctl[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, + { + .procname = "force_forwarding", + .data = &ipv6_devconf.force_forwarding, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_force_forwarding, + }, }; static int __addrconf_sysctl_register(struct net *net, char *dev_name, diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 0412f8544695..1e1410237b6e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -511,7 +511,8 @@ int ip6_forward(struct sk_buff *skb) u32 mtu; idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); - if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0) + if (!READ_ONCE(net->ipv6.devconf_all->forwarding) && + (!idev || !READ_ONCE(idev->cnf.force_forwarding))) goto error; if (skb->pkt_type != PACKET_HOST) diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 13e2678d418b..b31a71f2b372 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -116,6 +116,7 @@ TEST_GEN_FILES += skf_net_off TEST_GEN_FILES += tfo TEST_PROGS += tfo_passive.sh TEST_PROGS += broadcast_pmtu.sh +TEST_PROGS += ipv6_force_forwarding.sh # YNL files, must be before "include ..lib.mk" YNL_GEN_FILES := busy_poller netlink-dumps diff --git a/tools/testing/selftests/net/ipv6_force_forwarding.sh b/tools/testing/selftests/net/ipv6_force_forwarding.sh new file mode 100755 index 000000000000..bf0243366caa --- /dev/null +++ b/tools/testing/selftests/net/ipv6_force_forwarding.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test IPv6 force_forwarding interface property +# +# This test verifies that the force_forwarding property works correctly: +# - When global forwarding is disabled, packets are not forwarded normally +# - When force_forwarding is enabled on an interface, packets are forwarded +# regardless of the global forwarding setting + +source lib.sh + +cleanup() { + cleanup_ns $ns1 $ns2 $ns3 +} + +trap cleanup EXIT + +setup_test() { + # Create three namespaces: sender, router, receiver + setup_ns ns1 ns2 ns3 + + # Create veth pairs: ns1 <-> ns2 <-> ns3 + ip link add name veth12 
type veth peer name veth21 + ip link add name veth23 type veth peer name veth32 + + # Move interfaces to namespaces + ip link set veth12 netns $ns1 + ip link set veth21 netns $ns2 + ip link set veth23 netns $ns2 + ip link set veth32 netns $ns3 + + # Configure interfaces + ip -n $ns1 addr add 2001:db8:1::1/64 dev veth12 nodad + ip -n $ns2 addr add 2001:db8:1::2/64 dev veth21 nodad + ip -n $ns2 addr add 2001:db8:2::1/64 dev veth23 nodad + ip -n $ns3 addr add 2001:db8:2::2/64 dev veth32 nodad + + # Bring up interfaces + ip -n $ns1 link set veth12 up + ip -n $ns2 link set veth21 up + ip -n $ns2 link set veth23 up + ip -n $ns3 link set veth32 up + + # Add routes + ip -n $ns1 route add 2001:db8:2::/64 via 2001:db8:1::2 + ip -n $ns3 route add 2001:db8:1::/64 via 2001:db8:2::1 + + # Disable global forwarding + ip netns exec $ns2 sysctl -qw net.ipv6.conf.all.forwarding=0 +} + +test_force_forwarding() { + local ret=0 + + echo "TEST: force_forwarding functionality" + + # Check if force_forwarding sysctl exists + if ! ip netns exec $ns2 test -f /proc/sys/net/ipv6/conf/veth21/force_forwarding; then + echo "SKIP: force_forwarding not available" + return $ksft_skip + fi + + # Test 1: Without force_forwarding, ping should fail + ip netns exec $ns2 sysctl -qw net.ipv6.conf.veth21.force_forwarding=0 + ip netns exec $ns2 sysctl -qw net.ipv6.conf.veth23.force_forwarding=0 + + if ip netns exec $ns1 ping -6 -c 1 -W 2 2001:db8:2::2 &>/dev/null; then + echo "FAIL: ping succeeded when forwarding disabled" + ret=1 + else + echo "PASS: forwarding disabled correctly" + fi + + # Test 2: With force_forwarding enabled, ping should succeed + ip netns exec $ns2 sysctl -qw net.ipv6.conf.veth21.force_forwarding=1 + ip netns exec $ns2 sysctl -qw net.ipv6.conf.veth23.force_forwarding=1 + + if ip netns exec $ns1 ping -6 -c 1 -W 2 2001:db8:2::2 &>/dev/null; then + echo "PASS: force_forwarding enabled forwarding" + else + echo "FAIL: ping failed with force_forwarding enabled" + ret=1 + fi + + return $ret +} + +echo "IPv6 force_forwarding test" +echo "==========================" + +setup_test +test_force_forwarding +ret=$? + +if [ $ret -eq 0 ]; then + echo "OK" + exit 0 +elif [ $ret -eq $ksft_skip ]; then + echo "SKIP" + exit $ksft_skip +else + echo "FAIL" + exit 1 +fi -- cgit v1.2.3 From 97c01e65ef4c1878532be245b2899fc4363cc453 Mon Sep 17 00:00:00 2001 From: Vicki Pfau Date: Sun, 27 Jul 2025 01:15:17 -0700 Subject: Input: Add and document BTN_GRIP* Many controllers these days have started including grip buttons. As no particular BTN_* constants have been assigned for these, they've been haphazardly assigned to BTN_TRIGGER_HAPPY*. Unfortunately, the assignment of these has varied significantly between drivers. Add and document new constants for these grip buttons. Signed-off-by: Vicki Pfau Link: https://lore.kernel.org/r/20250702040102.125432-2-vi@endrift.com Signed-off-by: Dmitry Torokhov --- Documentation/input/gamepad.rst | 13 +++++++++++++ drivers/hid/hid-debug.c | 2 ++ include/uapi/linux/input-event-codes.h | 5 +++++ 3 files changed, 20 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/input/gamepad.rst b/Documentation/input/gamepad.rst index 2bba721aa20b..0c918b6f288b 100644 --- a/Documentation/input/gamepad.rst +++ b/Documentation/input/gamepad.rst @@ -190,6 +190,19 @@ Gamepads report the following events: Rumble is advertised as FF_RUMBLE. +- Grip buttons: + + Many pads include buttons on the rear, usually referred to as either grip or + rear buttons, or paddles.
These are often reprogrammable by the firmware to + appear as "normal" buttons, but are sometimes exposed to software too. Some + notable examples of this are the Steam Deck, which has R4, R5, L4, and L5 on + the back; the Xbox Elite pads, which have P1-P4; and the Switch 2 Pro + Controller, which has GL and GR. + + For these controllers, BTN_GRIPR and BTN_GRIPR2 should be used for the top + and bottom (if present) right grip button(s), and BTN_GRIPL and BTN_GRIPL2 + should be used for the top and bottom (if present) left grip button(s). + - Profile: Some pads provide a multi-value profile selection switch. Examples include diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index 8433306148d5..3cd9c1150cdf 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -3291,6 +3291,8 @@ static const char *keys[KEY_MAX + 1] = { [BTN_TR2] = "BtnTR2", [BTN_SELECT] = "BtnSelect", [BTN_START] = "BtnStart", [BTN_MODE] = "BtnMode", [BTN_THUMBL] = "BtnThumbL", [BTN_THUMBR] = "BtnThumbR", + [BTN_GRIPL] = "BtnGripL", [BTN_GRIPR] = "BtnGripR", + [BTN_GRIPL2] = "BtnGripL2", [BTN_GRIPR2] = "BtnGripR2", [BTN_TOOL_PEN] = "ToolPen", [BTN_TOOL_RUBBER] = "ToolRubber", [BTN_TOOL_BRUSH] = "ToolBrush", [BTN_TOOL_PENCIL] = "ToolPencil", [BTN_TOOL_AIRBRUSH] = "ToolAirbrush", [BTN_TOOL_FINGER] = "ToolFinger", diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 5a199f3d4a26..5426297d93fd 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -601,6 +601,11 @@ #define BTN_DPAD_LEFT 0x222 #define BTN_DPAD_RIGHT 0x223 +#define BTN_GRIPL 0x224 +#define BTN_GRIPR 0x225 +#define BTN_GRIPL2 0x226 +#define BTN_GRIPR2 0x227 + #define KEY_ALS_TOGGLE 0x230 /* Ambient light sensor */ #define KEY_ROTATE_LOCK_TOGGLE 0x231 /* Display rotation lock */ #define KEY_REFRESH_RATE_TOGGLE 0x232 /* Display refresh rate toggle */ -- cgit v1.2.3 From 6f02527729bd31ca4e473bff19fda4ccd5889148 Mon Sep 17 00:00:00 2001 From: Norman Maurer Date: Mon, 28 Jul 2025 20:59:53 -1000 Subject: io_uring/net: Allow to do vectorized send At the moment you have to use sendmsg for vectorized send. While this works, it's suboptimal as it also means you need to allocate a struct msghdr that needs to be kept alive until a submission happens. We can remove this limitation by just allowing send to be used directly. Signed-off-by: Norman Maurer Link: https://lore.kernel.org/r/20250729065952.26646-1-norman_maurer@apple.com [axboe: remove -EINVAL return for SENDMSG and SEND_VECTORIZED] [axboe: allow send_zc to set SEND_VECTORIZED too] Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 4 ++++ io_uring/net.c | 9 +++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index b8a0e70ee2fd..6957dc539d83 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -392,12 +392,16 @@ enum io_uring_op { * the starting buffer ID in cqe->flags as per * usual for provided buffer usage. The buffers * will be contiguous from the starting buffer ID. + * + * IORING_SEND_VECTORIZED If set, SEND[_ZC] will take a pointer to a io_vec + * to allow vectorized send operations.
*/ #define IORING_RECVSEND_POLL_FIRST (1U << 0) #define IORING_RECV_MULTISHOT (1U << 1) #define IORING_RECVSEND_FIXED_BUF (1U << 2) #define IORING_SEND_ZC_REPORT_USAGE (1U << 3) #define IORING_RECVSEND_BUNDLE (1U << 4) +#define IORING_SEND_VECTORIZED (1U << 5) /* * cqe.res for IORING_CQE_F_NOTIF if diff --git a/io_uring/net.c b/io_uring/net.c index 35585bdc59f3..dd96e355982f 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -382,6 +382,10 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) } if (req->flags & REQ_F_BUFFER_SELECT) return 0; + + if (sr->flags & IORING_SEND_VECTORIZED) + return io_net_import_vec(req, kmsg, sr->buf, sr->len, ITER_SOURCE); + return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); } @@ -409,7 +413,7 @@ static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe return io_net_import_vec(req, kmsg, msg.msg_iov, msg.msg_iovlen, ITER_SOURCE); } -#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE) +#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE | IORING_SEND_VECTORIZED) int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -1318,7 +1322,8 @@ void io_send_zc_cleanup(struct io_kiocb *req) } #define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF) -#define IO_ZC_FLAGS_VALID (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE) +#define IO_ZC_FLAGS_VALID (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE | \ + IORING_SEND_VECTORIZED) int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { -- cgit v1.2.3 From 907a99c314a5a695e35acff78ac61f4ec950a6d3 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Tue, 22 Jul 2025 11:33:40 +0800 Subject: md: rename recovery_cp to resync_offset 'recovery_cp' was used to represent the progress of sync, but its name contains recovery, which can cause confusion. Replaces 'recovery_cp' with 'resync_offset' for clarity. 
Signed-off-by: Li Nan Link: https://lore.kernel.org/linux-raid/20250722033340.1933388-1-linan666@huaweicloud.com Signed-off-by: Yu Kuai --- drivers/md/dm-raid.c | 42 +++++++++++++++++------------------ drivers/md/md-bitmap.c | 8 +++---- drivers/md/md-cluster.c | 16 +++++++------- drivers/md/md.c | 50 +++++++++++++++++++++--------------------- drivers/md/md.h | 2 +- drivers/md/raid0.c | 6 ++--- drivers/md/raid1-10.c | 2 +- drivers/md/raid1.c | 10 ++++----- drivers/md/raid10.c | 16 +++++++------- drivers/md/raid5-ppl.c | 6 ++--- drivers/md/raid5.c | 30 ++++++++++++------------- include/uapi/linux/raid/md_p.h | 2 +- 12 files changed, 95 insertions(+), 95 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index e8c0a8c6fb51..9835f2fe26e9 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -439,7 +439,7 @@ static bool rs_is_reshapable(struct raid_set *rs) /* Return true, if raid set in @rs is recovering */ static bool rs_is_recovering(struct raid_set *rs) { - return rs->md.recovery_cp < rs->md.dev_sectors; + return rs->md.resync_offset < rs->md.dev_sectors; } /* Return true, if raid set in @rs is reshaping */ @@ -769,7 +769,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r rs->md.layout = raid_type->algorithm; rs->md.new_layout = rs->md.layout; rs->md.delta_disks = 0; - rs->md.recovery_cp = MaxSector; + rs->md.resync_offset = MaxSector; for (i = 0; i < raid_devs; i++) md_rdev_init(&rs->dev[i].rdev); @@ -913,7 +913,7 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as) rs->md.external = 0; rs->md.persistent = 1; rs->md.major_version = 2; - } else if (rebuild && !rs->md.recovery_cp) { + } else if (rebuild && !rs->md.resync_offset) { /* * Without metadata, we will not be able to tell if the array * is in-sync or not - we must assume it is not. Therefore, @@ -1696,20 +1696,20 @@ static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) { /* raid0 does not recover */ if (rs_is_raid0(rs)) - rs->md.recovery_cp = MaxSector; + rs->md.resync_offset = MaxSector; /* * A raid6 set has to be recovered either * completely or for the grown part to * ensure proper parity and Q-Syndrome */ else if (rs_is_raid6(rs)) - rs->md.recovery_cp = dev_sectors; + rs->md.resync_offset = dev_sectors; /* * Other raid set types may skip recovery * depending on the 'nosync' flag. */ else - rs->md.recovery_cp = test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags) + rs->md.resync_offset = test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags) ? MaxSector : dev_sectors; } @@ -2144,7 +2144,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev) sb->events = cpu_to_le64(mddev->events); sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset); - sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp); + sb->array_resync_offset = cpu_to_le64(mddev->resync_offset); sb->level = cpu_to_le32(mddev->level); sb->layout = cpu_to_le32(mddev->layout); @@ -2335,18 +2335,18 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) } if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) - mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset); + mddev->resync_offset = le64_to_cpu(sb->array_resync_offset); /* * During load, we set FirstUse if a new superblock was written. * There are two reasons we might not have a superblock: * 1) The raid set is brand new - in which case, all of the * devices must have their In_sync bit set. Also, - * recovery_cp must be 0, unless forced. 
+ * resync_offset must be 0, unless forced. * 2) This is a new device being added to an old raid set * and the new device needs to be rebuilt - in which * case the In_sync bit will /not/ be set and - * recovery_cp must be MaxSector. + * resync_offset must be MaxSector. * 3) This is/are a new device(s) being added to an old * raid set during takeover to a higher raid level * to provide capacity for redundancy or during reshape @@ -2391,8 +2391,8 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) new_devs > 1 ? "s" : ""); return -EINVAL; } else if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) && rs_is_recovering(rs)) { - DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)", - (unsigned long long) mddev->recovery_cp); + DMERR("'rebuild' specified while raid set is not in-sync (resync_offset=%llu)", + (unsigned long long) mddev->resync_offset); return -EINVAL; } else if (rs_is_reshaping(rs)) { DMERR("'rebuild' specified while raid set is being reshaped (reshape_position=%llu)", @@ -2697,11 +2697,11 @@ static int rs_adjust_data_offsets(struct raid_set *rs) } out: /* - * Raise recovery_cp in case data_offset != 0 to + * Raise resync_offset in case data_offset != 0 to * avoid false recovery positives in the constructor. */ - if (rs->md.recovery_cp < rs->md.dev_sectors) - rs->md.recovery_cp += rs->dev[0].rdev.data_offset; + if (rs->md.resync_offset < rs->md.dev_sectors) + rs->md.resync_offset += rs->dev[0].rdev.data_offset; /* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */ rdev_for_each(rdev, &rs->md) { @@ -2756,7 +2756,7 @@ static int rs_setup_takeover(struct raid_set *rs) } clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags); - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; while (d--) { rdev = &rs->dev[d].rdev; @@ -2764,7 +2764,7 @@ static int rs_setup_takeover(struct raid_set *rs) if (test_bit(d, (void *) rs->rebuild_disks)) { clear_bit(In_sync, &rdev->flags); clear_bit(Faulty, &rdev->flags); - mddev->recovery_cp = rdev->recovery_offset = 0; + mddev->resync_offset = rdev->recovery_offset = 0; /* Bitmap has to be created when we do an "up" takeover */ set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); } @@ -3222,7 +3222,7 @@ size_check: if (r) goto bad; - rs_setup_recovery(rs, rs->md.recovery_cp < rs->md.dev_sectors ? rs->md.recovery_cp : rs->md.dev_sectors); + rs_setup_recovery(rs, rs->md.resync_offset < rs->md.dev_sectors ? 
rs->md.resync_offset : rs->md.dev_sectors); } else { /* This is no size change or it is shrinking, update size and record in superblocks */ r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false); @@ -3446,7 +3446,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, } else { if (state == st_idle && !test_bit(MD_RECOVERY_INTR, &recovery)) - r = mddev->recovery_cp; + r = mddev->resync_offset; else r = mddev->curr_resync_completed; @@ -4074,9 +4074,9 @@ static int raid_preresume(struct dm_target *ti) } /* Check for any resize/reshape on @rs and adjust/initiate */ - if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) { + if (mddev->resync_offset && mddev->resync_offset < MaxSector) { set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - mddev->resync_min = mddev->recovery_cp; + mddev->resync_min = mddev->resync_offset; if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) mddev->resync_max_sectors = mddev->dev_sectors; } diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 7f524a26cebc..334b71404930 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1987,12 +1987,12 @@ static void bitmap_dirty_bits(struct mddev *mddev, unsigned long s, md_bitmap_set_memory_bits(bitmap, sec, 1); md_bitmap_file_set_bit(bitmap, sec); - if (sec < bitmap->mddev->recovery_cp) + if (sec < bitmap->mddev->resync_offset) /* We are asserting that the array is dirty, - * so move the recovery_cp address back so + * so move the resync_offset address back so * that it is obvious that it is dirty */ - bitmap->mddev->recovery_cp = sec; + bitmap->mddev->resync_offset = sec; } } @@ -2258,7 +2258,7 @@ static int bitmap_load(struct mddev *mddev) || bitmap->events_cleared == mddev->events) /* no need to keep dirty bits to optimise a * re-add of a missing device */ - start = mddev->recovery_cp; + start = mddev->resync_offset; mutex_lock(&mddev->bitmap_info.mutex); err = md_bitmap_init_from_disk(bitmap, start); diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 94221d964d4f..5497eaee96e7 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -337,11 +337,11 @@ static void recover_bitmaps(struct md_thread *thread) md_wakeup_thread(mddev->sync_thread); if (hi > 0) { - if (lo < mddev->recovery_cp) - mddev->recovery_cp = lo; + if (lo < mddev->resync_offset) + mddev->resync_offset = lo; /* wake up thread to continue resync in case resync * is not finished */ - if (mddev->recovery_cp != MaxSector) { + if (mddev->resync_offset != MaxSector) { /* * clear the REMOTE flag since we will launch * resync thread in current node. @@ -863,9 +863,9 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) lockres_free(bm_lockres); continue; } - if ((hi > 0) && (lo < mddev->recovery_cp)) { + if ((hi > 0) && (lo < mddev->resync_offset)) { set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - mddev->recovery_cp = lo; + mddev->resync_offset = lo; md_check_recovery(mddev); } @@ -1027,7 +1027,7 @@ static int leave(struct mddev *mddev) * Also, we should send BITMAP_NEEDS_SYNC message in * case reshaping is interrupted. 
*/ - if ((cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) || + if ((cinfo->slot_number > 0 && mddev->resync_offset != MaxSector) || (mddev->reshape_position != MaxSector && test_bit(MD_CLOSING, &mddev->flags))) resync_bitmap(mddev); @@ -1605,8 +1605,8 @@ static int gather_bitmaps(struct md_rdev *rdev) pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn); goto out; } - if ((hi > 0) && (lo < mddev->recovery_cp)) - mddev->recovery_cp = lo; + if ((hi > 0) && (lo < mddev->resync_offset)) + mddev->resync_offset = lo; } out: return err; diff --git a/drivers/md/md.c b/drivers/md/md.c index 8af97ef80ec5..9c7ed23c45ad 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1415,13 +1415,13 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, stru mddev->layout = -1; if (sb->state & (1<<MD_SB_CLEAN)) - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; else { if (sb->events_hi == sb->cp_events_hi && sb->events_lo == sb->cp_events_lo) { - mddev->recovery_cp = sb->recovery_cp; + mddev->resync_offset = sb->resync_offset; } else - mddev->recovery_cp = 0; + mddev->resync_offset = 0; } memcpy(mddev->uuid+0, &sb->set_uuid0, 4); @@ -1547,13 +1547,13 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) mddev->minor_version = sb->minor_version; if (mddev->in_sync) { - sb->recovery_cp = mddev->recovery_cp; + sb->resync_offset = mddev->resync_offset; sb->cp_events_hi = (mddev->events>>32); sb->cp_events_lo = (u32)mddev->events; - if (mddev->recovery_cp == MaxSector) + if (mddev->resync_offset == MaxSector) sb->state = (1<< MD_SB_CLEAN); } else - sb->recovery_cp = 0; + sb->resync_offset = 0; sb->layout = mddev->layout; sb->chunk_size = mddev->chunk_sectors << 9; @@ -1901,7 +1901,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc mddev->bitmap_info.default_space = (4096-1024) >> 9; mddev->reshape_backwards = 0; - mddev->recovery_cp = le64_to_cpu(sb->resync_offset); + mddev->resync_offset = le64_to_cpu(sb->resync_offset); memcpy(mddev->uuid, sb->set_uuid, 16); mddev->max_disks = (4096-256)/2; @@ -2087,7 +2087,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->utime = cpu_to_le64((__u64)mddev->utime); sb->events = cpu_to_le64(mddev->events); if (mddev->in_sync) - sb->resync_offset = cpu_to_le64(mddev->recovery_cp); + sb->resync_offset = cpu_to_le64(mddev->resync_offset); else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) sb->resync_offset = cpu_to_le64(MaxSector); else @@ -2767,7 +2767,7 @@ repeat: /* If this is just a dirty<->clean transition, and the array is clean * and 'events' is odd, we can roll back to the previous clean state */ if (nospares - && (mddev->in_sync && mddev->recovery_cp == MaxSector) + && (mddev->in_sync && mddev->resync_offset == MaxSector) && mddev->can_decrease_events && mddev->events != 1) { mddev->events--; @@ -4303,9 +4303,9 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); static ssize_t resync_start_show(struct mddev *mddev, char *page) { - if (mddev->recovery_cp == MaxSector) + if (mddev->resync_offset == MaxSector) return sprintf(page, "none\n"); - return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); + return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_offset); } static ssize_t @@ -4331,7 +4331,7 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len) err = -EBUSY; if (!err) { - mddev->recovery_cp = n; + mddev->resync_offset = n; if (mddev->pers) set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
} @@ -6423,7 +6423,7 @@ static void md_clean(struct mddev *mddev) mddev->external_size = 0; mddev->dev_sectors = 0; mddev->raid_disks = 0; - mddev->recovery_cp = 0; + mddev->resync_offset = 0; mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->reshape_position = MaxSector; @@ -7368,9 +7368,9 @@ int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info) * openned */ if (info->state & (1<<MD_SB_CLEAN)) - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; else - mddev->recovery_cp = 0; + mddev->resync_offset = 0; mddev->persistent = ! info->not_persistent; mddev->external = 0; @@ -8309,7 +8309,7 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) seq_printf(seq, "\tresync=REMOTE"); return 1; } - if (mddev->recovery_cp < MaxSector) { + if (mddev->resync_offset < MaxSector) { seq_printf(seq, "\tresync=PENDING"); return 1; } @@ -8952,7 +8952,7 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) return mddev->resync_min; case ACTION_RESYNC: if (!mddev->bitmap) - return mddev->recovery_cp; + return mddev->resync_offset; return 0; case ACTION_RESHAPE: /* @@ -9190,8 +9190,8 @@ void md_do_sync(struct md_thread *thread) atomic_read(&mddev->recovery_active) == 0); mddev->curr_resync_completed = j; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && - j > mddev->recovery_cp) - mddev->recovery_cp = j; + j > mddev->resync_offset) + mddev->resync_offset = j; update_time = jiffies; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); sysfs_notify_dirent_safe(mddev->sysfs_completed); @@ -9311,19 +9311,19 @@ void md_do_sync(struct md_thread *thread) mddev->curr_resync > MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { - if (mddev->curr_resync >= mddev->recovery_cp) { + if (mddev->curr_resync >= mddev->resync_offset) { pr_debug("md: checkpointing %s of %s.\n", desc, mdname(mddev)); if (test_bit(MD_RECOVERY_ERROR, &mddev->recovery)) - mddev->recovery_cp = + mddev->resync_offset = mddev->curr_resync_completed; else - mddev->recovery_cp = + mddev->resync_offset = mddev->curr_resync; } } else - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; } else { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; @@ -9539,7 +9539,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) } /* Check if resync is in progress.
*/ - if (mddev->recovery_cp < MaxSector) { + if (mddev->resync_offset < MaxSector) { remove_spares(mddev, NULL); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); @@ -9720,7 +9720,7 @@ void md_check_recovery(struct mddev *mddev) test_bit(MD_RECOVERY_DONE, &mddev->recovery) || (mddev->external == 0 && mddev->safemode == 1) || (mddev->safemode == 2 - && !mddev->in_sync && mddev->recovery_cp == MaxSector) + && !mddev->in_sync && mddev->resync_offset == MaxSector) )) return; diff --git a/drivers/md/md.h b/drivers/md/md.h index 67b365621507..51af29a03079 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -523,7 +523,7 @@ struct mddev { unsigned long normal_io_events; /* IO event timestamp */ atomic_t recovery_active; /* blocks scheduled, but not written */ wait_queue_head_t recovery_wait; - sector_t recovery_cp; + sector_t resync_offset; sector_t resync_min; /* user requested sync * starts here */ sector_t resync_max; /* resync should pause diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index cbe2a9054cb9..f1d8811a542a 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -674,7 +674,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev) mddev->raid_disks--; mddev->delta_disks = -1; /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); create_strip_zones(mddev, &priv_conf); @@ -717,7 +717,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev) mddev->raid_disks += mddev->delta_disks; mddev->degraded = 0; /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); create_strip_zones(mddev, &priv_conf); @@ -760,7 +760,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev) mddev->delta_disks = 1 - mddev->raid_disks; mddev->raid_disks = 1; /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); create_strip_zones(mddev, &priv_conf); diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index b8b3a9069701..52881e6032da 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -283,7 +283,7 @@ static inline int raid1_check_read_range(struct md_rdev *rdev, static inline bool raid1_should_read_first(struct mddev *mddev, sector_t this_sector, int len) { - if ((mddev->recovery_cp < this_sector + len)) + if ((mddev->resync_offset < this_sector + len)) return true; if (mddev_is_clustered(mddev) && diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 64b8176907a9..6cee738a645f 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2822,7 +2822,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, } if (mddev->bitmap == NULL && - mddev->recovery_cp == MaxSector && + mddev->resync_offset == MaxSector && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && conf->fullsync == 0) { *skipped = 1; @@ -3282,9 +3282,9 @@ static int raid1_run(struct mddev *mddev) } if (conf->raid_disks - mddev->degraded == 1) - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; - if (mddev->recovery_cp != MaxSector) + if (mddev->resync_offset != MaxSector) pr_info("md/raid1:%s: not clean -- starting background reconstruction\n", mdname(mddev)); pr_info("md/raid1:%s: active with %d out of %d mirrors\n", @@ -3345,8 +3345,8 @@ 
static int raid1_resize(struct mddev *mddev, sector_t sectors) md_set_array_sectors(mddev, newsize); if (sectors > mddev->dev_sectors && - mddev->recovery_cp > mddev->dev_sectors) { - mddev->recovery_cp = mddev->dev_sectors; + mddev->resync_offset > mddev->dev_sectors) { + mddev->resync_offset = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } mddev->dev_sectors = sectors; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 95dc354a86a0..b60c30bfb6c7 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2117,7 +2117,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) int last = conf->geo.raid_disks - 1; struct raid10_info *p; - if (mddev->recovery_cp < MaxSector) + if (mddev->resync_offset < MaxSector) /* only hot-add to in-sync arrays, as recovery is * very different from resync */ @@ -3185,7 +3185,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * of a clean array, like RAID1 does. */ if (mddev->bitmap == NULL && - mddev->recovery_cp == MaxSector && + mddev->resync_offset == MaxSector && mddev->reshape_position == MaxSector && !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && @@ -4145,7 +4145,7 @@ static int raid10_run(struct mddev *mddev) disk->recovery_disabled = mddev->recovery_disabled - 1; } - if (mddev->recovery_cp != MaxSector) + if (mddev->resync_offset != MaxSector) pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n", mdname(mddev)); pr_info("md/raid10:%s: active with %d out of %d devices\n", @@ -4245,8 +4245,8 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) md_set_array_sectors(mddev, size); if (sectors > mddev->dev_sectors && - mddev->recovery_cp > oldsize) { - mddev->recovery_cp = oldsize; + mddev->resync_offset > oldsize) { + mddev->resync_offset = oldsize; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } calc_sectors(conf, sectors); @@ -4275,7 +4275,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs) mddev->delta_disks = mddev->raid_disks; mddev->raid_disks *= 2; /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; mddev->dev_sectors = size; conf = setup_conf(mddev); @@ -5087,8 +5087,8 @@ static void raid10_finish_reshape(struct mddev *mddev) return; if (mddev->delta_disks > 0) { - if (mddev->recovery_cp > mddev->resync_max_sectors) { - mddev->recovery_cp = mddev->resync_max_sectors; + if (mddev->resync_offset > mddev->resync_max_sectors) { + mddev->resync_offset = mddev->resync_max_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } mddev->resync_max_sectors = mddev->array_sectors; diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index c0fb335311aa..56b234683ee6 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -1163,7 +1163,7 @@ static int ppl_load_distributed(struct ppl_log *log) le64_to_cpu(pplhdr->generation)); /* attempt to recover from log if we are starting a dirty array */ - if (pplhdr && !mddev->pers && mddev->recovery_cp != MaxSector) + if (pplhdr && !mddev->pers && mddev->resync_offset != MaxSector) ret = ppl_recover(log, pplhdr, pplhdr_offset); /* write empty header if we are starting the array */ @@ -1422,14 +1422,14 @@ int ppl_init_log(struct r5conf *conf) if (ret) { goto err; - } else if (!mddev->pers && mddev->recovery_cp == 0 && + } else if (!mddev->pers && mddev->resync_offset == 0 && ppl_conf->recovered_entries > 0 && 
ppl_conf->mismatch_count == 0) { /* * If we are starting a dirty array and the recovery succeeds * without any issues, set the array as clean. */ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); } else if (mddev->pers && ppl_conf->mismatch_count > 0) { /* no mismatch allowed when enabling PPL for a running array */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7ec61ee7b218..023649fe2476 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3740,7 +3740,7 @@ static int want_replace(struct stripe_head *sh, int disk_idx) && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && (rdev->recovery_offset <= sh->sector - || rdev->mddev->recovery_cp <= sh->sector)) + || rdev->mddev->resync_offset <= sh->sector)) rv = 1; return rv; } @@ -3832,7 +3832,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, * is missing/faulty, then we need to read everything we can. */ if (!force_rcw && - sh->sector < sh->raid_conf->mddev->recovery_cp) + sh->sector < sh->raid_conf->mddev->resync_offset) /* reconstruct-write isn't being forced */ return 0; for (i = 0; i < s->failed && i < 2; i++) { @@ -4097,7 +4097,7 @@ static int handle_stripe_dirtying(struct r5conf *conf, int disks) { int rmw = 0, rcw = 0, i; - sector_t recovery_cp = conf->mddev->recovery_cp; + sector_t resync_offset = conf->mddev->resync_offset; /* Check whether resync is now happening or should start. * If yes, then the array is dirty (after unclean shutdown or @@ -4107,14 +4107,14 @@ static int handle_stripe_dirtying(struct r5conf *conf, * generate correct data from the parity. */ if (conf->rmw_level == PARITY_DISABLE_RMW || - (recovery_cp < MaxSector && sh->sector >= recovery_cp && + (resync_offset < MaxSector && sh->sector >= resync_offset && s->failed == 0)) { /* Calculate the real rcw later - for now make it * look like rcw is cheaper */ rcw = 1; rmw = 2; - pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", - conf->rmw_level, (unsigned long long)recovery_cp, + pr_debug("force RCW rmw_level=%u, resync_offset=%llu sh->sector=%llu\n", + conf->rmw_level, (unsigned long long)resync_offset, (unsigned long long)sh->sector); } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ @@ -4770,14 +4770,14 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (test_bit(STRIPE_SYNCING, &sh->state)) { /* If there is a failed device being replaced, * we must be recovering. - * else if we are after recovery_cp, we must be syncing + * else if we are after resync_offset, we must be syncing * else if MD_RECOVERY_REQUESTED is set, we also are syncing. * else we can only be replacing * sync and recovery both need to read all devices, and so * use the same flag. 
*/ if (do_recovery || - sh->sector >= conf->mddev->recovery_cp || + sh->sector >= conf->mddev->resync_offset || test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) s->syncing = 1; else @@ -7780,7 +7780,7 @@ static int raid5_run(struct mddev *mddev) int first = 1; int ret = -EIO; - if (mddev->recovery_cp != MaxSector) + if (mddev->resync_offset != MaxSector) pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", mdname(mddev)); @@ -7921,7 +7921,7 @@ static int raid5_run(struct mddev *mddev) mdname(mddev)); mddev->ro = 1; set_disk_ro(mddev->gendisk, 1); - } else if (mddev->recovery_cp == MaxSector) + } else if (mddev->resync_offset == MaxSector) set_bit(MD_JOURNAL_CLEAN, &mddev->flags); } @@ -7988,7 +7988,7 @@ static int raid5_run(struct mddev *mddev) mddev->resync_max_sectors = mddev->dev_sectors; if (mddev->degraded > dirty_parity_disks && - mddev->recovery_cp != MaxSector) { + mddev->resync_offset != MaxSector) { if (test_bit(MD_HAS_PPL, &mddev->flags)) pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n", mdname(mddev)); @@ -8328,8 +8328,8 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) md_set_array_sectors(mddev, newsize); if (sectors > mddev->dev_sectors && - mddev->recovery_cp > mddev->dev_sectors) { - mddev->recovery_cp = mddev->dev_sectors; + mddev->resync_offset > mddev->dev_sectors) { + mddev->resync_offset = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } mddev->dev_sectors = sectors; @@ -8423,7 +8423,7 @@ static int raid5_start_reshape(struct mddev *mddev) return -EINVAL; /* raid5 can't handle concurrent reshape and recovery */ - if (mddev->recovery_cp < MaxSector) + if (mddev->resync_offset < MaxSector) return -EBUSY; for (i = 0; i < conf->raid_disks; i++) if (conf->disks[i].replacement) @@ -8648,7 +8648,7 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level) mddev->raid_disks += 1; mddev->delta_disks = 1; /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; return setup_conf(mddev); } diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index ff47b6f0ba0f..b13946287277 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -173,7 +173,7 @@ typedef struct mdp_superblock_s { #else #error unspecified endianness #endif - __u32 recovery_cp; /* 11 recovery checkpoint sector count */ + __u32 resync_offset; /* 11 resync checkpoint sector count */ /* There are only valid for minor_version > 90 */ __u64 reshape_position; /* 12,13 next address in array-space for reshape */ __u32 new_level; /* 14 new level we are reshaping to */ -- cgit v1.2.3 From 55a984928bfa30c7877e28f16910e6de1c170f1f Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Fri, 1 Aug 2025 10:26:13 +0200 Subject: Revert "tty: vt: use _IO() to define ioctl numbers" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit f1180ca37abe3d117e4a19be12142fe722612a7c. Since the commit, the vt ioctl numbers are defined differently on platforms where _IOC_NONE is non-zero: alpha, mips, powerpc, sparc. 
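[Editor's note: the breakage is easy to demonstrate in isolation. The sketch below packs an ioctl number the way the generic and the affected architectures' asm/ioctl.h do; the 14-bit versus 13-bit size field and the _IOC_NONE values of 0 versus 1 are assumptions taken from those headers.]

#include <stdio.h>

/* Pack dir/type/nr the way _IOC() does; size is 0 for _IO().
 * Field layout, low to high: nr (8 bits), type (8 bits), size, dir. */
static unsigned int io_nr(unsigned int dir, char type, unsigned int nr,
			  int size_bits)
{
	int dir_shift = 8 + 8 + size_bits;

	return (dir << dir_shift) | ((unsigned int)type << 8) | nr;
}

int main(void)
{
	/* Generic layout: _IOC_NONE == 0, 14 size bits -> 0x5600 */
	printf("generic  _IO('V', 0): %#x\n", io_nr(0, 'V', 0, 14));
	/* alpha/mips/powerpc/sparc: _IOC_NONE == 1, 13 size bits
	 * -> 0x20005600, no longer the historic 0x5600 */
	printf("affected _IO('V', 0): %#x\n", io_nr(1, 'V', 0, 13));
	return 0;
}

With _IOC_NONE == 1 the direction field makes _IO('V', 0x00) come out as 0x20005600 on those platforms, which no longer matches the 0x5600 that existing userspace was built against; hence the revert to plain hex constants.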
Signed-off-by: "Jiri Slaby (SUSE)" Reported-by: Christophe Leroy Link: https://lore.kernel.org/all/436489B9-E67B-4630-909F-386C30A2AAC9@xenosoft.de/ Link: https://lore.kernel.org/all/97ec2636-915a-498c-903b-d66957420d21@csgroup.eu/ Cc: Nicolas Pitre Cc: Ilpo Järvinen Link: https://lore.kernel.org/r/20250801082613.2564584-1-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/vt.h | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/vt.h b/include/uapi/linux/vt.h index b60fcdfb2746..714483d68c69 100644 --- a/include/uapi/linux/vt.h +++ b/include/uapi/linux/vt.h @@ -14,9 +14,9 @@ /* Note: the ioctl VT_GETSTATE does not work for consoles 16 and higher (since it returns a short) */ -/* 'V' to avoid collision with termios and kd */ +/* 0x56 is 'V', to avoid collision with termios and kd */ -#define VT_OPENQRY _IO('V', 0x00) /* find available vt */ +#define VT_OPENQRY 0x5600 /* find available vt */ struct vt_mode { __u8 mode; /* vt mode */ @@ -25,8 +25,8 @@ struct vt_mode { __s16 acqsig; /* signal to raise on acquisition */ __s16 frsig; /* unused (set to 0) */ }; -#define VT_GETMODE _IO('V', 0x01) /* get mode of active vt */ -#define VT_SETMODE _IO('V', 0x02) /* set mode of active vt */ +#define VT_GETMODE 0x5601 /* get mode of active vt */ +#define VT_SETMODE 0x5602 /* set mode of active vt */ #define VT_AUTO 0x00 /* auto vt switching */ #define VT_PROCESS 0x01 /* process controls switching */ #define VT_ACKACQ 0x02 /* acknowledge switch */ @@ -36,21 +36,21 @@ struct vt_stat { __u16 v_signal; /* signal to send */ __u16 v_state; /* vt bitmask */ }; -#define VT_GETSTATE _IO('V', 0x03) /* get global vt state info */ -#define VT_SENDSIG _IO('V', 0x04) /* signal to send to bitmask of vts */ +#define VT_GETSTATE 0x5603 /* get global vt state info */ +#define VT_SENDSIG 0x5604 /* signal to send to bitmask of vts */ -#define VT_RELDISP _IO('V', 0x05) /* release display */ +#define VT_RELDISP 0x5605 /* release display */ -#define VT_ACTIVATE _IO('V', 0x06) /* make vt active */ -#define VT_WAITACTIVE _IO('V', 0x07) /* wait for vt active */ -#define VT_DISALLOCATE _IO('V', 0x08) /* free memory associated to vt */ +#define VT_ACTIVATE 0x5606 /* make vt active */ +#define VT_WAITACTIVE 0x5607 /* wait for vt active */ +#define VT_DISALLOCATE 0x5608 /* free memory associated to vt */ struct vt_sizes { __u16 v_rows; /* number of rows */ __u16 v_cols; /* number of columns */ __u16 v_scrollsize; /* number of lines of scrollback */ }; -#define VT_RESIZE _IO('V', 0x09) /* set kernel's idea of screensize */ +#define VT_RESIZE 0x5609 /* set kernel's idea of screensize */ struct vt_consize { __u16 v_rows; /* number of rows */ @@ -60,10 +60,10 @@ struct vt_consize { __u16 v_vcol; /* number of pixel columns on screen */ __u16 v_ccol; /* number of pixel columns per character */ }; -#define VT_RESIZEX _IO('V', 0x0A) /* set kernel's idea of screensize + more */ -#define VT_LOCKSWITCH _IO('V', 0x0B) /* disallow vt switching */ -#define VT_UNLOCKSWITCH _IO('V', 0x0C) /* allow vt switching */ -#define VT_GETHIFONTMASK _IO('V', 0x0D) /* return hi font mask */ +#define VT_RESIZEX 0x560A /* set kernel's idea of screensize + more */ +#define VT_LOCKSWITCH 0x560B /* disallow vt switching */ +#define VT_UNLOCKSWITCH 0x560C /* allow vt switching */ +#define VT_GETHIFONTMASK 0x560D /* return hi font mask */ struct vt_event { __u32 event; @@ -77,14 +77,14 @@ struct vt_event { __u32 pad[4]; /* Padding for 
expansion */ }; -#define VT_WAITEVENT _IO('V', 0x0E) /* Wait for an event */ +#define VT_WAITEVENT 0x560E /* Wait for an event */ struct vt_setactivate { __u32 console; struct vt_mode mode; }; -#define VT_SETACTIVATE _IO('V', 0x0F) /* Activate and set the mode of a console */ +#define VT_SETACTIVATE 0x560F /* Activate and set the mode of a console */ /* get console size and cursor position */ struct vt_consizecsrpos { -- cgit v1.2.3 From 7d9896e9f6d02d8aa85e63f736871f96c59a5263 Mon Sep 17 00:00:00 2001 From: Cindy Lu Date: Mon, 14 Jul 2025 15:12:32 +0800 Subject: vhost: Reintroduce kthread API and add mode selection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 6e890c5d5021 ("vhost: use vhost_tasks for worker threads"), vhost uses vhost_task and operates as a child of the owner thread. This is required for correct CPU usage accounting, especially when using containers. However, this change has caused confusion for some legacy userspace applications, and we didn't notice until it was too late. Unfortunately, it's too late to revert - we now have userspace depending both on old and new behaviour :( To address the issue, reintroduce kthread mode for vhost workers and provide a configuration to select between kthread and task worker. - Add 'fork_owner' parameter to vhost_dev to let users select kthread or task mode. Default mode is task mode (VHOST_FORK_OWNER_TASK). - Reintroduce kthread mode support: * Bring back the original vhost_worker() implementation, renamed to vhost_run_work_kthread_list(). * Add cgroup support for the kthread * Introduce struct vhost_worker_ops: - Encapsulates create / stop / wake-up callbacks. - vhost_worker_create() selects the proper ops according to fork_owner. - Userspace configuration interface: * New IOCTLs: - VHOST_SET_FORK_FROM_OWNER lets userspace select task mode (VHOST_FORK_OWNER_TASK) or kthread mode (VHOST_FORK_OWNER_KTHREAD) - VHOST_GET_FORK_FROM_OWNER reads the current worker mode * Expose module parameter 'fork_from_owner_default' to allow system administrators to configure the default mode for vhost workers * Kconfig option CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL controls whether these IOCTLs and the parameter are available - The VHOST_NEW_WORKER functionality requires fork_owner to be set to true, with validation added to ensure proper configuration This partially reverts or improves upon: commit 6e890c5d5021 ("vhost: use vhost_tasks for worker threads") commit 1cdaafa1b8b4 ("vhost: replace single worker pointer with xarray") Fixes: 6e890c5d5021 ("vhost: use vhost_tasks for worker threads") Signed-off-by: Cindy Lu Message-Id: <20250714071333.59794-2-lulu@redhat.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Tested-by: Lei Yang --- drivers/vhost/Kconfig | 18 ++++ drivers/vhost/vhost.c | 244 +++++++++++++++++++++++++++++++++++++++++---- drivers/vhost/vhost.h | 22 ++++ include/uapi/linux/vhost.h | 29 ++++++ 4 files changed, 295 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 020d4fbb947c..bc0f38574497 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -95,4 +95,22 @@ config VHOST_CROSS_ENDIAN_LEGACY If unsure, say "N". +config VHOST_ENABLE_FORK_OWNER_CONTROL + bool "Enable VHOST_ENABLE_FORK_OWNER_CONTROL" + default y + help + This option enables two IOCTLs: VHOST_SET_FORK_FROM_OWNER and + VHOST_GET_FORK_FROM_OWNER.
These allow userspace applications + to modify the vhost worker mode for vhost devices. + + Also expose module parameter 'fork_from_owner_default' to allow users + to configure the default mode for vhost workers. + + By default, `VHOST_ENABLE_FORK_OWNER_CONTROL` is set to `y`, + users can change the worker thread mode as needed. + If this config is disabled (n),the related IOCTLs and parameters will + be unavailable. + + If unsure, say "Y". + endif diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 4390e3a14218..f4c1bc6adeda 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -22,6 +22,7 @@ #include #include #include +#include <linux/kthread.h> #include #include #include @@ -41,6 +42,13 @@ static int max_iotlb_entries = 2048; module_param(max_iotlb_entries, int, 0444); MODULE_PARM_DESC(max_iotlb_entries, "Maximum number of iotlb entries. (default: 2048)"); +static bool fork_from_owner_default = VHOST_FORK_OWNER_TASK; + +#ifdef CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL +module_param(fork_from_owner_default, bool, 0444); +MODULE_PARM_DESC(fork_from_owner_default, + "Set task mode as the default(default: Y)"); +#endif enum { VHOST_MEMORY_F_LOG = 0x1, @@ -242,7 +250,7 @@ static void vhost_worker_queue(struct vhost_worker *worker, * test_and_set_bit() implies a memory barrier. */ llist_add(&work->node, &worker->work_list); - vhost_task_wake(worker->vtsk); + worker->ops->wakeup(worker); } } @@ -388,6 +396,44 @@ static void vhost_vq_reset(struct vhost_dev *dev, __vhost_vq_meta_reset(vq); } +static int vhost_run_work_kthread_list(void *data) +{ + struct vhost_worker *worker = data; + struct vhost_work *work, *work_next; + struct vhost_dev *dev = worker->dev; + struct llist_node *node; + + kthread_use_mm(dev->mm); + + for (;;) { + /* mb paired w/ kthread_stop */ + set_current_state(TASK_INTERRUPTIBLE); + + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + break; + } + node = llist_del_all(&worker->work_list); + if (!node) + schedule(); + + node = llist_reverse_order(node); + /* make sure flag is seen after deletion */ + smp_wmb(); + llist_for_each_entry_safe(work, work_next, node, node) { + clear_bit(VHOST_WORK_QUEUED, &work->flags); + __set_current_state(TASK_RUNNING); + kcov_remote_start_common(worker->kcov_handle); + work->fn(work); + kcov_remote_stop(); + cond_resched(); + } + } + kthread_unuse_mm(dev->mm); + + return 0; +} + static bool vhost_run_work_list(void *data) { struct vhost_worker *worker = data; @@ -552,6 +598,7 @@ void vhost_dev_init(struct vhost_dev *dev, dev->byte_weight = byte_weight; dev->use_worker = use_worker; dev->msg_handler = msg_handler; + dev->fork_owner = fork_from_owner_default; init_waitqueue_head(&dev->wait); INIT_LIST_HEAD(&dev->read_list); INIT_LIST_HEAD(&dev->pending_list); @@ -581,6 +628,46 @@ long vhost_dev_check_owner(struct vhost_dev *dev) } EXPORT_SYMBOL_GPL(vhost_dev_check_owner); +struct vhost_attach_cgroups_struct { + struct vhost_work work; + struct task_struct *owner; + int ret; +}; + +static void vhost_attach_cgroups_work(struct vhost_work *work) +{ + struct vhost_attach_cgroups_struct *s; + + s = container_of(work, struct vhost_attach_cgroups_struct, work); + s->ret = cgroup_attach_task_all(s->owner, current); +} + +static int vhost_attach_task_to_cgroups(struct vhost_worker *worker) +{ + struct vhost_attach_cgroups_struct attach; + int saved_cnt; + + attach.owner = current; + + vhost_work_init(&attach.work, vhost_attach_cgroups_work); + vhost_worker_queue(worker, &attach.work); + + mutex_lock(&worker->mutex); + + /* + * Bypass
attachment_cnt check in __vhost_worker_flush: + * Temporarily change it to INT_MAX to bypass the check + */ + saved_cnt = worker->attachment_cnt; + worker->attachment_cnt = INT_MAX; + __vhost_worker_flush(worker); + worker->attachment_cnt = saved_cnt; + + mutex_unlock(&worker->mutex); + + return attach.ret; +} + /* Caller should have device mutex */ bool vhost_dev_has_owner(struct vhost_dev *dev) { @@ -626,7 +713,7 @@ static void vhost_worker_destroy(struct vhost_dev *dev, WARN_ON(!llist_empty(&worker->work_list)); xa_erase(&dev->worker_xa, worker->id); - vhost_task_stop(worker->vtsk); + worker->ops->stop(worker); kfree(worker); } @@ -649,42 +736,115 @@ static void vhost_workers_free(struct vhost_dev *dev) xa_destroy(&dev->worker_xa); } +static void vhost_task_wakeup(struct vhost_worker *worker) +{ + return vhost_task_wake(worker->vtsk); +} + +static void vhost_kthread_wakeup(struct vhost_worker *worker) +{ + wake_up_process(worker->kthread_task); +} + +static void vhost_task_do_stop(struct vhost_worker *worker) +{ + return vhost_task_stop(worker->vtsk); +} + +static void vhost_kthread_do_stop(struct vhost_worker *worker) +{ + kthread_stop(worker->kthread_task); +} + +static int vhost_task_worker_create(struct vhost_worker *worker, + struct vhost_dev *dev, const char *name) +{ + struct vhost_task *vtsk; + u32 id; + int ret; + + vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed, + worker, name); + if (IS_ERR(vtsk)) + return PTR_ERR(vtsk); + + worker->vtsk = vtsk; + vhost_task_start(vtsk); + ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL); + if (ret < 0) { + vhost_task_do_stop(worker); + return ret; + } + worker->id = id; + return 0; +} + +static int vhost_kthread_worker_create(struct vhost_worker *worker, + struct vhost_dev *dev, const char *name) +{ + struct task_struct *task; + u32 id; + int ret; + + task = kthread_create(vhost_run_work_kthread_list, worker, "%s", name); + if (IS_ERR(task)) + return PTR_ERR(task); + + worker->kthread_task = task; + wake_up_process(task); + ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL); + if (ret < 0) + goto stop_worker; + + ret = vhost_attach_task_to_cgroups(worker); + if (ret) + goto stop_worker; + + worker->id = id; + return 0; + +stop_worker: + vhost_kthread_do_stop(worker); + return ret; +} + +static const struct vhost_worker_ops kthread_ops = { + .create = vhost_kthread_worker_create, + .stop = vhost_kthread_do_stop, + .wakeup = vhost_kthread_wakeup, +}; + +static const struct vhost_worker_ops vhost_task_ops = { + .create = vhost_task_worker_create, + .stop = vhost_task_do_stop, + .wakeup = vhost_task_wakeup, +}; + static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev) { struct vhost_worker *worker; - struct vhost_task *vtsk; char name[TASK_COMM_LEN]; int ret; - u32 id; + const struct vhost_worker_ops *ops = dev->fork_owner ? 
&vhost_task_ops : + &kthread_ops; worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT); if (!worker) return NULL; worker->dev = dev; + worker->ops = ops; snprintf(name, sizeof(name), "vhost-%d", current->pid); - vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed, - worker, name); - if (IS_ERR(vtsk)) - goto free_worker; - mutex_init(&worker->mutex); init_llist_head(&worker->work_list); worker->kcov_handle = kcov_common_handle(); - worker->vtsk = vtsk; - - vhost_task_start(vtsk); - - ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL); + ret = ops->create(worker, dev, name); if (ret < 0) - goto stop_worker; - worker->id = id; + goto free_worker; return worker; -stop_worker: - vhost_task_stop(vtsk); free_worker: kfree(worker); return NULL; @@ -865,6 +1025,14 @@ long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl, switch (ioctl) { /* dev worker ioctls */ case VHOST_NEW_WORKER: + /* + * vhost_tasks will account for worker threads under the parent's + * NPROC value but kthreads do not. To avoid userspace overflowing + * the system with worker threads fork_owner must be true. + */ + if (!dev->fork_owner) + return -EFAULT; + ret = vhost_new_worker(dev, &state); if (!ret && copy_to_user(argp, &state, sizeof(state))) ret = -EFAULT; @@ -982,6 +1150,7 @@ void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *umem) vhost_dev_cleanup(dev); + dev->fork_owner = fork_from_owner_default; dev->umem = umem; /* We don't need VQ locks below since vhost_dev_cleanup makes sure * VQs aren't running. @@ -2135,6 +2304,45 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp) goto done; } +#ifdef CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL + if (ioctl == VHOST_SET_FORK_FROM_OWNER) { + /* Only allow modification before owner is set */ + if (vhost_dev_has_owner(d)) { + r = -EBUSY; + goto done; + } + u8 fork_owner_val; + + if (get_user(fork_owner_val, (u8 __user *)argp)) { + r = -EFAULT; + goto done; + } + if (fork_owner_val != VHOST_FORK_OWNER_TASK && + fork_owner_val != VHOST_FORK_OWNER_KTHREAD) { + r = -EINVAL; + goto done; + } + d->fork_owner = !!fork_owner_val; + r = 0; + goto done; + } + if (ioctl == VHOST_GET_FORK_FROM_OWNER) { + u8 fork_owner_val = d->fork_owner; + + if (fork_owner_val != VHOST_FORK_OWNER_TASK && + fork_owner_val != VHOST_FORK_OWNER_KTHREAD) { + r = -EINVAL; + goto done; + } + if (put_user(fork_owner_val, (u8 __user *)argp)) { + r = -EFAULT; + goto done; + } + r = 0; + goto done; + } +#endif + /* You must be the owner to do anything else */ r = vhost_dev_check_owner(d); if (r) diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index bb75a292d50c..ab704d84fb34 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -26,7 +26,18 @@ struct vhost_work { unsigned long flags; }; +struct vhost_worker; +struct vhost_dev; + +struct vhost_worker_ops { + int (*create)(struct vhost_worker *worker, struct vhost_dev *dev, + const char *name); + void (*stop)(struct vhost_worker *worker); + void (*wakeup)(struct vhost_worker *worker); +}; + struct vhost_worker { + struct task_struct *kthread_task; struct vhost_task *vtsk; struct vhost_dev *dev; /* Used to serialize device wide flushing with worker swapping. 
*/ @@ -36,6 +47,7 @@ struct vhost_worker { u32 id; int attachment_cnt; bool killed; + const struct vhost_worker_ops *ops; }; /* Poll a file (eventfd or socket) */ @@ -176,6 +188,16 @@ struct vhost_dev { int byte_weight; struct xarray worker_xa; bool use_worker; + /* + * If fork_owner is true we use vhost_tasks to create + * the worker so all settings/limits like cgroups, NPROC, + * scheduler, etc are inherited from the owner. If false, + * we use kthreads and only attach to the same cgroups + * as the owner for compat with older kernels. + * Here we use true as the default value, + * which is set by fork_from_owner_default. + */ + bool fork_owner; int (*msg_handler)(struct vhost_dev *dev, u32 asid, struct vhost_iotlb_msg *msg); }; diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index d4b3e2ae1314..e72f2655459e 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -235,4 +235,33 @@ */ #define VHOST_VDPA_GET_VRING_SIZE _IOWR(VHOST_VIRTIO, 0x82, \ struct vhost_vring_state) + +/* fork_owner values for vhost */ +#define VHOST_FORK_OWNER_KTHREAD 0 +#define VHOST_FORK_OWNER_TASK 1 + +/** + * VHOST_SET_FORK_FROM_OWNER - Set the fork_owner flag for the vhost device. + * This ioctl must be called before VHOST_SET_OWNER. + * Only available when CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL=y + * + * @param fork_owner: An 8-bit value that determines the vhost thread mode + * + * When fork_owner is set to VHOST_FORK_OWNER_TASK (the default value): + * - Vhost will create vhost workers as tasks forked from the owner, + * inheriting all of the owner's attributes. + * + * When fork_owner is set to VHOST_FORK_OWNER_KTHREAD: + * - Vhost will create vhost workers as kernel threads. + */ +#define VHOST_SET_FORK_FROM_OWNER _IOW(VHOST_VIRTIO, 0x83, __u8) + +/** + * VHOST_GET_FORK_FROM_OWNER - Get the current fork_owner flag for the vhost + * device. + * Only available when CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL=y + * + * @return: An 8-bit value indicating the current thread mode. + */ +#define VHOST_GET_FORK_FROM_OWNER _IOR(VHOST_VIRTIO, 0x84, __u8) + #endif -- cgit v1.2.3 From 07d24902977e4704fab8472981e73a0ad6dfa1fd Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 10 Jun 2025 08:53:27 +0000 Subject: kexec: enable CMA based contiguous allocation When booting a new kernel with kexec_file, the kernel picks a target location that the kernel should live at, then allocates random pages, checks whether any of those pages magically happens to coincide with a target address range and if so, uses them for that range. For every page allocated this way, it then creates a page list that the relocation code - code that executes while all CPUs are off and we are just about to jump into the new kernel - copies to their final memory location. We cannot put them there before, because chances are pretty good that at least some page in the target range is already in use by the currently running Linux environment. Copying happens from a single CPU at RAM rate, which takes around 4-50 ms per 100 MiB. All of this is inefficient and error-prone. To successfully kexec, we need to quiesce all devices of the outgoing kernel so they don't scribble over the new kernel's memory. We have seen cases where that does not happen properly (*cough* GIC *cough*) and hence the new kernel was corrupted.
This started a month-long journey to root-cause failing kexecs and eventually find the memory corruption, because the new kernel was corrupted severely enough that it could not even emit output to tell us about the fact that it was corrupted. By allocating memory for the next kernel from a memory range that is guaranteed to be free of such scribbling, we can boot the next kernel up to a point where it is at least able to detect corruption and maybe even stop it before it becomes severe. This increases the chance for successful kexecs. Since kexec got introduced, Linux has gained the CMA framework, which can perform physically contiguous memory allocations while keeping that memory available as movable memory when it is not needed for contiguous allocations. The default CMA allocator is for DMA allocations. This patch adds logic to the kexec file loader to attempt to place the target payload at a location allocated from CMA. If successful, it uses that memory range directly instead of creating copy instructions during the hot phase. To ensure that there is a safety net in case anything goes wrong with the CMA allocation, it also adds a flag for user space to force-disable CMA allocations. Using CMA allocations has two advantages: 1) Faster by 4-50 ms per 100 MiB. There is no more need to copy in the hot phase. 2) More robust. Even if by accident some page is still in use for DMA, the new kernel image will be safe from that access because it resides in a memory region that is considered allocated in the old kernel, so the new kernel has a chance to reinitialize the offending component. Link: https://lkml.kernel.org/r/20250610085327.51817-1-graf@amazon.com Signed-off-by: Alexander Graf Acked-by: Baoquan He Reviewed-by: Pasha Tatashin Cc: Zhongkun He Signed-off-by: Andrew Morton --- arch/riscv/kernel/kexec_elf.c | 1 + include/linux/kexec.h | 10 +++++ include/uapi/linux/kexec.h | 1 + kernel/kexec.c | 2 +- kernel/kexec_core.c | 100 ++++++++++++++++++++++++++++++++++++++---- kernel/kexec_file.c | 51 ++++++++++++++++++++- kernel/kexec_internal.h | 2 +- 7 files changed, 156 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/riscv/kernel/kexec_elf.c b/arch/riscv/kernel/kexec_elf.c index f4755d49b89e..56444c7bd34e 100644 --- a/arch/riscv/kernel/kexec_elf.c +++ b/arch/riscv/kernel/kexec_elf.c @@ -95,6 +95,7 @@ static int elf_find_pbase(struct kimage *image, unsigned long kernel_len, kbuf.buf_align = PMD_SIZE; kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; kbuf.memsz = ALIGN(kernel_len, PAGE_SIZE); + kbuf.cma = NULL; kbuf.top_down = false; ret = arch_kexec_locate_mem_hole(&kbuf); if (!ret) { diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 03f85ad03025..1b10a5d84b68 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -79,6 +79,12 @@ extern note_buf_t __percpu *crash_notes; typedef unsigned long kimage_entry_t; +/* + * This is a copy of the UAPI struct kexec_segment and must be identical + * to it because it gets copied straight from user space into kernel + * memory. Do not modify this structure unless you change the way segments + * get ingested from user space. + */ struct kexec_segment { /* * This pointer can point to user memory if kexec_load() system @@ -172,6 +178,7 @@ int kexec_image_post_load_cleanup_default(struct kimage *image); * @buf_align: Minimum alignment needed. * @buf_min: The buffer can't be placed below this address. * @buf_max: The buffer can't be placed above this address. + * @cma: CMA page if the buffer is backed by CMA. * @top_down: Allocate from top of memory.
* @random: Place the buffer at a random position. */ @@ -184,6 +191,7 @@ struct kexec_buf { unsigned long buf_align; unsigned long buf_min; unsigned long buf_max; + struct page *cma; bool top_down; #ifdef CONFIG_CRASH_DUMP bool random; @@ -340,6 +348,7 @@ struct kimage { unsigned long nr_segments; struct kexec_segment segment[KEXEC_SEGMENT_MAX]; + struct page *segment_cma[KEXEC_SEGMENT_MAX]; struct list_head control_pages; struct list_head dest_pages; @@ -361,6 +370,7 @@ struct kimage { */ unsigned int hotplug_support:1; #endif + unsigned int no_cma:1; #ifdef ARCH_HAS_KIMAGE_ARCH struct kimage_arch arch; diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index 5ae1741ea8ea..8958ebfcff94 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -27,6 +27,7 @@ #define KEXEC_FILE_ON_CRASH 0x00000002 #define KEXEC_FILE_NO_INITRAMFS 0x00000004 #define KEXEC_FILE_DEBUG 0x00000008 +#define KEXEC_FILE_NO_CMA 0x00000010 /* These values match the ELF architecture values. * Unless there is a good reason that should continue to be the case. diff --git a/kernel/kexec.c b/kernel/kexec.c index a6b3f96bb50c..28008e3d462e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -152,7 +152,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, goto out; for (i = 0; i < nr_segments; i++) { - ret = kimage_load_segment(image, &image->segment[i]); + ret = kimage_load_segment(image, i); if (ret) goto out; } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 3a9a9f240dbc..e390c0df6d55 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -553,6 +554,24 @@ static void kimage_free_entry(kimage_entry_t entry) kimage_free_pages(page); } +static void kimage_free_cma(struct kimage *image) +{ + unsigned long i; + + for (i = 0; i < image->nr_segments; i++) { + struct page *cma = image->segment_cma[i]; + u32 nr_pages = image->segment[i].memsz >> PAGE_SHIFT; + + if (!cma) + continue; + + arch_kexec_pre_free_pages(page_address(cma), nr_pages); + dma_release_from_contiguous(NULL, cma, nr_pages); + image->segment_cma[i] = NULL; + } + +} + void kimage_free(struct kimage *image) { kimage_entry_t *ptr, entry; @@ -591,6 +610,9 @@ void kimage_free(struct kimage *image) /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); + /* Free CMA allocations */ + kimage_free_cma(image); + /* * Free up any temporary buffers allocated. This might hit if * error occurred much later after buffer allocation. 
@@ -716,9 +738,69 @@ static struct page *kimage_alloc_page(struct kimage *image, return page; } -static int kimage_load_normal_segment(struct kimage *image, - struct kexec_segment *segment) +static int kimage_load_cma_segment(struct kimage *image, int idx) +{ + struct kexec_segment *segment = &image->segment[idx]; + struct page *cma = image->segment_cma[idx]; + char *ptr = page_address(cma); + unsigned long maddr; + size_t ubytes, mbytes; + int result = 0; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; + + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; + ubytes = segment->bufsz; + mbytes = segment->memsz; + maddr = segment->mem; + + /* Then copy from source buffer to the CMA one */ + while (mbytes) { + size_t uchunk, mchunk; + + ptr += maddr & ~PAGE_MASK; + mchunk = min_t(size_t, mbytes, + PAGE_SIZE - (maddr & ~PAGE_MASK)); + uchunk = min(ubytes, mchunk); + + if (uchunk) { + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); + ubytes -= uchunk; + if (image->file_mode) + kbuf += uchunk; + else + buf += uchunk; + } + + if (result) { + result = -EFAULT; + goto out; + } + + ptr += mchunk; + maddr += mchunk; + mbytes -= mchunk; + + cond_resched(); + } + + /* Clear any remainder */ + memset(ptr, 0, mbytes); + +out: + return result; +} + +static int kimage_load_normal_segment(struct kimage *image, int idx) { + struct kexec_segment *segment = &image->segment[idx]; unsigned long maddr; size_t ubytes, mbytes; int result; @@ -733,6 +815,9 @@ static int kimage_load_normal_segment(struct kimage *image, mbytes = segment->memsz; maddr = segment->mem; + if (image->segment_cma[idx]) + return kimage_load_cma_segment(image, idx); + result = kimage_set_destination(image, maddr); if (result < 0) goto out; @@ -787,13 +872,13 @@ out: } #ifdef CONFIG_CRASH_DUMP -static int kimage_load_crash_segment(struct kimage *image, - struct kexec_segment *segment) +static int kimage_load_crash_segment(struct kimage *image, int idx) { /* For crash dumps kernels we simply copy the data from * user space to it's destination. * We do things a page at a time for the sake of kmap. 
*/ + struct kexec_segment *segment = &image->segment[idx]; unsigned long maddr; size_t ubytes, mbytes; int result; @@ -858,18 +943,17 @@ out: } #endif -int kimage_load_segment(struct kimage *image, - struct kexec_segment *segment) +int kimage_load_segment(struct kimage *image, int idx) { int result = -ENOMEM; switch (image->type) { case KEXEC_TYPE_DEFAULT: - result = kimage_load_normal_segment(image, segment); + result = kimage_load_normal_segment(image, idx); break; #ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: - result = kimage_load_crash_segment(image, segment); + result = kimage_load_crash_segment(image, idx); break; #endif } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 69fe76fd9233..41271eee0f99 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "kexec_internal.h" #ifdef CONFIG_KEXEC_SIG @@ -253,6 +254,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, ret = 0; } + image->no_cma = !!(flags & KEXEC_FILE_NO_CMA); + if (cmdline_len) { image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len); if (IS_ERR(image->cmdline_buf)) { @@ -434,7 +437,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, i, ksegment->buf, ksegment->bufsz, ksegment->mem, ksegment->memsz); - ret = kimage_load_segment(image, &image->segment[i]); + ret = kimage_load_segment(image, i); if (ret) goto out; } @@ -663,6 +666,43 @@ static int kexec_walk_resources(struct kexec_buf *kbuf, return walk_system_ram_res(0, ULONG_MAX, kbuf, func); } +static int kexec_alloc_contig(struct kexec_buf *kbuf) +{ + size_t nr_pages = kbuf->memsz >> PAGE_SHIFT; + unsigned long mem; + struct page *p; + + /* User space disabled CMA allocations, bail out. */ + if (kbuf->image->no_cma) + return -EPERM; + + /* Skip CMA logic for crash kernel */ + if (kbuf->image->type == KEXEC_TYPE_CRASH) + return -EPERM; + + p = dma_alloc_from_contiguous(NULL, nr_pages, get_order(kbuf->buf_align), true); + if (!p) + return -ENOMEM; + + pr_debug("allocated %zu DMA pages at 0x%lx", nr_pages, page_to_boot_pfn(p)); + + mem = page_to_boot_pfn(p) << PAGE_SHIFT; + + if (kimage_is_destination_range(kbuf->image, mem, mem + kbuf->memsz)) { + /* Our region is already in use by a statically defined one. Bail out. */ + pr_debug("CMA overlaps existing mem: 0x%lx+0x%lx\n", mem, kbuf->memsz); + dma_release_from_contiguous(NULL, p, nr_pages); + return -EBUSY; + } + + kbuf->mem = page_to_boot_pfn(p) << PAGE_SHIFT; + kbuf->cma = p; + + arch_kexec_post_alloc_pages(page_address(p), (int)nr_pages, 0); + + return 0; +} + /** * kexec_locate_mem_hole - find free memory for the purgatory or the next kernel * @kbuf: Parameters for the memory search. @@ -687,6 +727,13 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (ret <= 0) return ret; + /* + * Try to find a free physically contiguous block of memory first. With that, we + * can avoid any copying at kexec time. + */ + if (!kexec_alloc_contig(kbuf)) + return 0; + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); else @@ -732,6 +779,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf) /* Ensure minimum alignment needed for segments. 
*/ kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE); kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE); + kbuf->cma = NULL; /* Walk the RAM ranges and allocate a suitable range for the buffer */ ret = arch_kexec_locate_mem_hole(kbuf); @@ -744,6 +792,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf) ksegment->bufsz = kbuf->bufsz; ksegment->mem = kbuf->mem; ksegment->memsz = kbuf->memsz; + kbuf->image->segment_cma[kbuf->image->nr_segments] = kbuf->cma; kbuf->image->nr_segments++; return 0; } diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 30a733a55a67..228bb88c018b 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -10,7 +10,7 @@ struct kimage *do_kimage_alloc_init(void); int sanity_check_segment_list(struct kimage *image); void kimage_free_page_list(struct list_head *list); void kimage_free(struct kimage *image); -int kimage_load_segment(struct kimage *image, struct kexec_segment *segment); +int kimage_load_segment(struct kimage *image, int idx); void kimage_terminate(struct kimage *image); int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end); -- cgit v1.2.3 From 89c52146392948f4cdda3853da9d82ec6d1dd1f4 Mon Sep 17 00:00:00 2001 From: Marcos Alano Date: Tue, 5 Aug 2025 13:44:29 -0700 Subject: Input: add keycode for performance mode key Alienware calls this key "Performance Boost". Dell calls it "G-Mode". The goal is to have a specific keycode to detect when this key is pressed, so userspace can act upon it and do what it has to do, usually starting the power profile for performance. Signed-off-by: Marcos Alano Link: https://lore.kernel.org/r/20250509193708.2190586-1-marcoshalano@gmail.com Signed-off-by: Dmitry Torokhov --- include/uapi/linux/input-event-codes.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 08cb157ab593..ca5851e97fac 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -770,6 +770,9 @@ #define KEY_KBD_LCD_MENU4 0x2bb #define KEY_KBD_LCD_MENU5 0x2bc +/* Performance Boost key (Alienware)/G-Mode key (Dell) */ +#define KEY_PERFORMANCE 0x2bd + #define BTN_TRIGGER_HAPPY 0x2c0 #define BTN_TRIGGER_HAPPY1 0x2c0 #define BTN_TRIGGER_HAPPY2 0x2c1 -- cgit v1.2.3 From 86624ba3b522b6512def25534341da93356c8da4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 14 Jul 2025 13:08:25 -0300 Subject: vfio/pci: Do vf_token checks for VFIO_DEVICE_BIND_IOMMUFD This was missed during the initial implementation. The VFIO PCI driver encodes the vf_token inside the device name when opening the device from the group FD, something like: "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3" This is used to control access to a VF unless there is co-ordination with the owner of the PF. Since we no longer have a device name in the cdev path, pass the token directly through VFIO_DEVICE_BIND_IOMMUFD using an optional field indicated by VFIO_DEVICE_BIND_FLAG_TOKEN.
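For illustration, a minimal cdev-based bind might look like the sketch below; it assumes the updated <linux/vfio.h> and libuuid's uuid_parse(), the helper name is hypothetical, and error handling is elided:

  #include <stdint.h>
  #include <sys/ioctl.h>
  #include <uuid/uuid.h>
  #include <linux/vfio.h>

  /* Hypothetical helper: bind an already-open vfio cdev fd to an iommufd,
   * passing the PF's VF token as a 16-byte UUID. */
  static int bind_with_token(int devfd, int iommufd, const char *token)
  {
          uuid_t uuid;    /* 16 bytes, same format as the PF token */
          struct vfio_device_bind_iommufd bind = {
                  .argsz = sizeof(bind),
                  .flags = VFIO_DEVICE_BIND_FLAG_TOKEN,
                  .iommufd = iommufd,
          };

          if (uuid_parse(token, uuid))
                  return -1;
          bind.token_uuid_ptr = (uintptr_t)uuid;
          return ioctl(devfd, VFIO_DEVICE_BIND_IOMMUFD, &bind);
  }

On success the kernel fills bind.out_devid with the handle for this device/iommufd bond.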
Fixes: 5fcc26969a16 ("vfio: Add VFIO_DEVICE_BIND_IOMMUFD") Tested-by: Shameer Kolothum Reviewed-by: Yi Liu Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/0-v3-bdd8716e85fe+3978a-vfio_token_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/device_cdev.c | 38 ++++++++++++++++++++++++-- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 1 + drivers/vfio/pci/mlx5/main.c | 1 + drivers/vfio/pci/nvgrace-gpu/main.c | 2 ++ drivers/vfio/pci/pds/vfio_dev.c | 1 + drivers/vfio/pci/qat/main.c | 1 + drivers/vfio/pci/vfio_pci.c | 1 + drivers/vfio/pci/vfio_pci_core.c | 22 +++++++++------ drivers/vfio/pci/virtio/main.c | 3 ++ include/linux/vfio.h | 4 +++ include/linux/vfio_pci_core.h | 2 ++ include/uapi/linux/vfio.h | 12 +++++++- 12 files changed, 76 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c index 281a8dc3ed49..480cac3a0c27 100644 --- a/drivers/vfio/device_cdev.c +++ b/drivers/vfio/device_cdev.c @@ -60,22 +60,50 @@ static void vfio_df_get_kvm_safe(struct vfio_device_file *df) spin_unlock(&df->kvm_ref_lock); } +static int vfio_df_check_token(struct vfio_device *device, + const struct vfio_device_bind_iommufd *bind) +{ + uuid_t uuid; + + if (!device->ops->match_token_uuid) { + if (bind->flags & VFIO_DEVICE_BIND_FLAG_TOKEN) + return -EINVAL; + return 0; + } + + if (!(bind->flags & VFIO_DEVICE_BIND_FLAG_TOKEN)) + return device->ops->match_token_uuid(device, NULL); + + if (copy_from_user(&uuid, u64_to_user_ptr(bind->token_uuid_ptr), + sizeof(uuid))) + return -EFAULT; + return device->ops->match_token_uuid(device, &uuid); +} + long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df, struct vfio_device_bind_iommufd __user *arg) { + const u32 VALID_FLAGS = VFIO_DEVICE_BIND_FLAG_TOKEN; struct vfio_device *device = df->device; struct vfio_device_bind_iommufd bind; unsigned long minsz; + u32 user_size; int ret; static_assert(__same_type(arg->out_devid, df->devid)); minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid); - if (copy_from_user(&bind, arg, minsz)) - return -EFAULT; + ret = get_user(user_size, &arg->argsz); + if (ret) + return ret; + if (user_size < minsz) + return -EINVAL; + ret = copy_struct_from_user(&bind, minsz, arg, user_size); + if (ret) + return ret; - if (bind.argsz < minsz || bind.flags || bind.iommufd < 0) + if (bind.iommufd < 0 || bind.flags & ~VALID_FLAGS) return -EINVAL; /* BIND_IOMMUFD only allowed for cdev fds */ @@ -93,6 +121,10 @@ long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df, goto out_unlock; } + ret = vfio_df_check_token(device, &bind); + if (ret) + goto out_unlock; + df->iommufd = iommufd_ctx_from_fd(bind.iommufd); if (IS_ERR(df->iommufd)) { ret = PTR_ERR(df->iommufd); diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 2149f49aeec7..397f5e445136 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1583,6 +1583,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 93f894fe60d2..7ec47e736a8e 100644 --- 
a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -1372,6 +1372,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index e5ac39c4cc6b..d95761dcdd58 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -696,6 +696,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .mmap = nvgrace_gpu_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, @@ -715,6 +716,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c index f6e0253a8a14..f3ccb0008f67 100644 --- a/drivers/vfio/pci/pds/vfio_dev.c +++ b/drivers/vfio/pci/pds/vfio_dev.c @@ -201,6 +201,7 @@ static const struct vfio_device_ops pds_vfio_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c index 845ed15b6771..5cce6b0b8d2f 100644 --- a/drivers/vfio/pci/qat/main.c +++ b/drivers/vfio/pci/qat/main.c @@ -614,6 +614,7 @@ static const struct vfio_device_ops qat_vf_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 5ba39f7623bb..ac10f14417f2 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -138,6 +138,7 @@ static const struct vfio_device_ops vfio_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 261a6dc5a5fc..fad410cf91bc 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1821,9 +1821,13 @@ void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count) } EXPORT_SYMBOL_GPL(vfio_pci_core_request); -static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, - bool vf_token, uuid_t *uuid) +int vfio_pci_core_match_token_uuid(struct vfio_device *core_vdev, + const uuid_t 
*uuid) + { + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + /* * There's always some degree of trust or collaboration between SR-IOV * PF and VFs, even if just that the PF hosts the SR-IOV capability and @@ -1854,7 +1858,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, bool match; if (!pf_vdev) { - if (!vf_token) + if (!uuid) return 0; /* PF is not vfio-pci, no VF token */ pci_info_ratelimited(vdev->pdev, @@ -1862,7 +1866,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, return -EINVAL; } - if (!vf_token) { + if (!uuid) { pci_info_ratelimited(vdev->pdev, "VF token required to access device\n"); return -EACCES; @@ -1880,7 +1884,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, } else if (vdev->vf_token) { mutex_lock(&vdev->vf_token->lock); if (vdev->vf_token->users) { - if (!vf_token) { + if (!uuid) { mutex_unlock(&vdev->vf_token->lock); pci_info_ratelimited(vdev->pdev, "VF token required to access device\n"); @@ -1893,12 +1897,12 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, "Incorrect VF token provided for device\n"); return -EACCES; } - } else if (vf_token) { + } else if (uuid) { uuid_copy(&vdev->vf_token->uuid, uuid); } mutex_unlock(&vdev->vf_token->lock); - } else if (vf_token) { + } else if (uuid) { pci_info_ratelimited(vdev->pdev, "VF token incorrectly provided, not a PF or VF\n"); return -EINVAL; @@ -1906,6 +1910,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, return 0; } +EXPORT_SYMBOL_GPL(vfio_pci_core_match_token_uuid); #define VF_TOKEN_ARG "vf_token=" @@ -1952,7 +1957,8 @@ int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf) } } - ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid); + ret = core_vdev->ops->match_token_uuid(core_vdev, + vf_token ? &uuid : NULL); if (ret) return ret; diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c index 515fe1b9f94d..8084f3e36a9f 100644 --- a/drivers/vfio/pci/virtio/main.c +++ b/drivers/vfio/pci/virtio/main.c @@ -94,6 +94,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, @@ -114,6 +115,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, @@ -134,6 +136,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 707b00772ce1..eb563f538dee 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -105,6 +105,9 @@ struct vfio_device { * @match: Optional device name match callback (return: 0 for no-match, >0 for * match, -errno for abort (ex. 
match with insufficient or incorrect * additional args) + * @match_token_uuid: Optional device token match/validation. Return 0 + * if the uuid is valid for the device, -errno otherwise. uuid is NULL + * if none was provided. * @dma_unmap: Called when userspace unmaps IOVA from the container * this device is attached to. * @device_feature: Optional, fill in the VFIO_DEVICE_FEATURE ioctl @@ -132,6 +135,7 @@ struct vfio_device_ops { int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma); void (*request)(struct vfio_device *vdev, unsigned int count); int (*match)(struct vfio_device *vdev, char *buf); + int (*match_token_uuid)(struct vfio_device *vdev, const uuid_t *uuid); void (*dma_unmap)(struct vfio_device *vdev, u64 iova, u64 length); int (*device_feature)(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz); diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index fbb472dd99b3..f541044e42a2 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -122,6 +122,8 @@ ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *bu int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma); void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count); int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf); +int vfio_pci_core_match_token_uuid(struct vfio_device *core_vdev, + const uuid_t *uuid); int vfio_pci_core_enable(struct vfio_pci_core_device *vdev); void vfio_pci_core_disable(struct vfio_pci_core_device *vdev); void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 5764f315137f..75100bf009ba 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -905,10 +905,12 @@ struct vfio_device_feature { * VFIO_DEVICE_BIND_IOMMUFD - _IOR(VFIO_TYPE, VFIO_BASE + 18, * struct vfio_device_bind_iommufd) * @argsz: User filled size of this data. - * @flags: Must be 0. + * @flags: Must be 0 or a combination of VFIO_DEVICE_BIND_* flags * @iommufd: iommufd to bind. * @out_devid: The device id generated by this bind. devid is a handle for * this device/iommufd bond and can be used in IOMMUFD commands. + * @token_uuid_ptr: Valid if VFIO_DEVICE_BIND_FLAG_TOKEN. Points to a 16-byte + * UUID in the same format as VFIO_DEVICE_FEATURE_PCI_VF_TOKEN. * * Bind a vfio_device to the specified iommufd. * * Unbind is automatically conducted when device fd is closed. * + * A token is sometimes required to open the device. Unless this is known to be + * needed, VFIO_DEVICE_BIND_FLAG_TOKEN should not be set and token_uuid_ptr is + * ignored. The only case today is a PF/VF relationship where the VF bind must + * be provided the same token as VFIO_DEVICE_FEATURE_PCI_VF_TOKEN provided to + * the PF. + * * Return: 0 on success, -errno on failure. */ struct vfio_device_bind_iommufd { __u32 argsz; __u32 flags; +#define VFIO_DEVICE_BIND_FLAG_TOKEN (1 << 0) __s32 iommufd; __u32 out_devid; + __aligned_u64 token_uuid_ptr; }; #define VFIO_DEVICE_BIND_IOMMUFD _IO(VFIO_TYPE, VFIO_BASE + 18) -- cgit v1.2.3 From f22cc6f766f84496b260347d4f0d92cf95f30699 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 11 Aug 2025 16:42:09 -0700 Subject: net: ethtool: support including Flow Label in the flow hash for RSS Some modern NICs support including the IPv6 Flow Label in the flow hash for RSS queue selection.
This is outside the old "Microsoft spec", but was included in the OCP NIC spec: [ ] RSS include flow label in the hash (configurable) https://www.opencompute.org/w/index.php?title=Core_Offloads#Receive_Side_Scaling RSS Flow Label hashing allows TCP Protective Load Balancing (PLB) to recover from receiver congestion / overload. Rx CPU/queue hotspots are relatively common for data ingest workloads, and so far we had to try to detect the condition at the RPC layer and reopen the connection. PLB lets us change the Flow Label and therefore the Rx CPU on RTO, with minimal packet reordering. PLB reaction times are much faster, and can happen at any point in the connection, not just at RPC boundaries. Due to the nature of host processing (relatively long queues, other kernel subsystems masking IRQs for 100s of msecs) the risk of reordering within the host is higher than in the network. But for applications which need it - it is far preferable to potentially persistent overload of a subset of queues. It is expected that the hash communicated to the host may change if the Flow Label changes. This may be surprising to some host software, but I don't expect the devices can compute two Toeplitz hashes, one with the Flow Label for queue selection and one without for the rx hash communicated to the host. Besides, changing the hash may potentially help to change the path through host queues. Users can disable NETIF_F_RXHASH if they require a stable flow hash. The name RXH_IP6_FL was chosen based on what we call Flow Label variables in IPv6 processing (fl). I prefer fl_lbl but that appears to be an fbnic-only spelling. We could spell out RXH_IP6_FLOW_LABEL but existing RXH_ defines are a lot more terse. Willem notes [1] that Flow Label is defined as identifying the flow and therefore including both the flow label _and_ the L4 header fields is not generally necessary. But it should not hurt, so it's not explicitly prevented if the driver supports hashing on both at the same time.
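As a rough userspace sketch (not part of the patch), the legacy ioctl path could opt TCP over IPv6 into Flow Label hashing as follows; the interface name and helper are illustrative, and 'sock' is any AF_INET datagram socket:

  #include <string.h>
  #include <sys/ioctl.h>
  #include <net/if.h>
  #include <linux/ethtool.h>
  #include <linux/sockios.h>

  /* Enable RXH_IP6_FL for TCP/IPv6 on top of the usual 4-tuple hash. */
  static int enable_fl_hash(int sock, const char *ifname)
  {
          struct ethtool_rxnfc nfc = {
                  .cmd = ETHTOOL_SRXFH,
                  .flow_type = TCP_V6_FLOW,
                  .data = RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 |
                          RXH_L4_B_2_3 | RXH_IP6_FL,
          };
          struct ifreq ifr;

          memset(&ifr, 0, sizeof(ifr));
          strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
          ifr.ifr_data = (void *)&nfc;
          return ioctl(sock, SIOCETHTOOL, &ifr);
  }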
Link: https://lore.kernel.org/68483433b45e2_3cd66f29440@willemb.c.googlers.com.notmuch [1] Signed-off-by: Jakub Kicinski Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250811234212.580748-2-kuba@kernel.org Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/ethtool.yaml | 3 +++ include/uapi/linux/ethtool.h | 1 + net/ethtool/ioctl.c | 25 +++++++++++++++++++++++++ net/ethtool/rss.c | 27 ++++++++++++++------------- 4 files changed, 43 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 1bc1bd7d33c2..7a7594713f1f 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -204,6 +204,9 @@ definitions: doc: dst port in case of TCP/UDP/SCTP - name: gtp-teid + - + name: ip6-fl + doc: IPv6 Flow Label - name: discard value: 31 diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 9e9afdd1238a..8bd5ea5469d9 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -2380,6 +2380,7 @@ enum { #define RXH_L4_B_0_1 (1 << 6) /* src port in case of TCP/UDP/SCTP */ #define RXH_L4_B_2_3 (1 << 7) /* dst port in case of TCP/UDP/SCTP */ #define RXH_GTP_TEID (1 << 8) /* teid in case of GTP */ +#define RXH_IP6_FL (1 << 9) /* IPv6 flow label */ #define RXH_DISCARD (1 << 31) #define RX_CLS_FLOW_DISC 0xffffffffffffffffULL diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 43a7854e784e..0b2a4d0573b3 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1014,6 +1014,28 @@ static bool flow_type_hashable(u32 flow_type) return false; } +static bool flow_type_v6(u32 flow_type) +{ + switch (flow_type) { + case TCP_V6_FLOW: + case UDP_V6_FLOW: + case SCTP_V6_FLOW: + case AH_ESP_V6_FLOW: + case AH_V6_FLOW: + case ESP_V6_FLOW: + case IPV6_FLOW: + case GTPU_V6_FLOW: + case GTPC_V6_FLOW: + case GTPC_TEID_V6_FLOW: + case GTPU_EH_V6_FLOW: + case GTPU_UL_V6_FLOW: + case GTPU_DL_V6_FLOW: + return true; + } + + return false; +} + /* When adding a new type, update the assert and, if it's hashable, add it to * the flow_type_hashable switch case. 
*/ @@ -1077,6 +1099,9 @@ ethtool_set_rxfh_fields(struct net_device *dev, u32 cmd, void __user *useraddr) if (rc) return rc; + if (info.data & RXH_IP6_FL && !flow_type_v6(info.flow_type)) + return -EINVAL; + if (info.flow_type & FLOW_RSS && info.rss_context && !ops->rxfh_per_ctx_fields) return -EINVAL; diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 992e98abe9dd..202d95e8bf3e 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -536,35 +536,36 @@ void ethtool_rss_notify(struct net_device *dev, u32 type, u32 rss_context) #define RFH_MASK (RXH_L2DA | RXH_VLAN | RXH_IP_SRC | RXH_IP_DST | \ RXH_L3_PROTO | RXH_L4_B_0_1 | RXH_L4_B_2_3 | \ RXH_GTP_TEID | RXH_DISCARD) +#define RFH_MASKv6 (RFH_MASK | RXH_IP6_FL) static const struct nla_policy ethnl_rss_flows_policy[] = { [ETHTOOL_A_FLOW_ETHER] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_IP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_IP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_IP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_TCP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_UDP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_SCTP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_AH_ESP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_TCP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_UDP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_SCTP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_AH_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_TCP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_UDP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_SCTP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_AH_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_AH4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_ESP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_AH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_AH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPC4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPC6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPC6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPC_TEID4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPC_TEID6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPC_TEID6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU_EH4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU_EH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_EH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU_UL4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU_UL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_UL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU_DL4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU_DL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_DL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), }; const struct nla_policy ethnl_rss_set_policy[ETHTOOL_A_RSS_FLOW_HASH + 1] = { -- cgit v1.2.3 From 37d1ade89606875c9cd6eb3b4ee416b7e1800fc4 Mon Sep 17 00:00:00 2001 From: Hans Zhang 
<18255117159@163.com> Date: Wed, 13 Aug 2025 22:45:24 +0800 Subject: PCI: Clean up __pci_find_next_cap_ttl() readability Refactor __pci_find_next_cap_ttl() to improve code clarity: - Replace magic number 0x40 with PCI_STD_HEADER_SIZEOF. - Use ALIGN_DOWN() for position alignment instead of a manual bitmask. - Extract PCI capability fields via FIELD_GET() with standardized masks. - Add necessary headers (linux/align.h). No functional changes intended. Signed-off-by: Hans Zhang <18255117159@163.com> Signed-off-by: Bjorn Helgaas Tested-by: Niklas Schnelle Acked-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20250813144529.303548-2-18255117159@163.com --- drivers/pci/pci.c | 9 +++++---- include/uapi/linux/pci_regs.h | 3 +++ 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index b0f4d98036cd..40a5c87d9a6b 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -432,17 +433,17 @@ static u8 __pci_find_next_cap_ttl(struct pci_bus *bus, unsigned int devfn, pci_bus_read_config_byte(bus, devfn, pos, &pos); while ((*ttl)--) { - if (pos < 0x40) + if (pos < PCI_STD_HEADER_SIZEOF) break; - pos &= ~3; + pos = ALIGN_DOWN(pos, 4); pci_bus_read_config_word(bus, devfn, pos, &ent); - id = ent & 0xff; + id = FIELD_GET(PCI_CAP_ID_MASK, ent); if (id == 0xff) break; if (id == cap) return pos; - pos = (ent >> 8); + pos = FIELD_GET(PCI_CAP_LIST_NEXT_MASK, ent); } return 0; } diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index f5b17745de60..1bba99b46227 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -207,6 +207,9 @@ /* Capability lists */ +#define PCI_CAP_ID_MASK 0x00ff /* Capability ID mask */ +#define PCI_CAP_LIST_NEXT_MASK 0xff00 /* Next Capability Pointer mask */ + #define PCI_CAP_LIST_ID 0 /* Capability ID */ #define PCI_CAP_ID_PM 0x01 /* Power Management */ #define PCI_CAP_ID_AGP 0x02 /* Accelerated Graphics Port */ -- cgit v1.2.3 From c27973211ffcdf0a092eec265d5993e64b89adaf Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Fri, 15 Aug 2025 12:00:28 +0800 Subject: md: keep recovery_cp in mdp_superblock_s commit 907a99c314a5 ("md: rename recovery_cp to resync_offset") replaces recovery_cp with resync_offset in mdp_superblock_s which is in md_p.h. md_p.h is used in userspace too, so mdadm fails to build because of this. This patch reverts that change.
Fixes: 907a99c314a5 ("md: rename recovery_cp to resync_offset") Signed-off-by: Xiao Ni Link: https://lore.kernel.org/linux-raid/20250815040028.18085-1-xni@redhat.com Signed-off-by: Yu Kuai --- drivers/md/md.c | 6 +++--- include/uapi/linux/raid/md_p.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/md/md.c b/drivers/md/md.c index 772cffe02ff5..3836fc7eff67 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1423,7 +1423,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, stru else { if (sb->events_hi == sb->cp_events_hi && sb->events_lo == sb->cp_events_lo) { - mddev->resync_offset = sb->resync_offset; + mddev->resync_offset = sb->recovery_cp; } else mddev->resync_offset = 0; } @@ -1551,13 +1551,13 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) mddev->minor_version = sb->minor_version; if (mddev->in_sync) { - sb->resync_offset = mddev->resync_offset; + sb->recovery_cp = mddev->resync_offset; sb->cp_events_hi = (mddev->events>>32); sb->cp_events_lo = (u32)mddev->events; if (mddev->resync_offset == MaxSector) sb->state = (1<< MD_SB_CLEAN); } else - sb->resync_offset = 0; + sb->recovery_cp = 0; sb->layout = mddev->layout; sb->chunk_size = mddev->chunk_sectors << 9; diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index b13946287277..ac74133a4768 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -173,7 +173,7 @@ typedef struct mdp_superblock_s { #else #error unspecified endianness #endif - __u32 resync_offset; /* 11 resync checkpoint sector count */ + __u32 recovery_cp; /* 11 resync checkpoint sector count */ /* There are only valid for minor_version > 90 */ __u64 reshape_position; /* 12,13 next address in array-space for reshape */ __u32 new_level; /* 14 new level we are reshaping to */ -- cgit v1.2.3 From 450bbe43ef90a213d66fac1def64050d9d9ada8e Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Mon, 21 Jul 2025 14:12:32 +0000 Subject: crypto: ccp - New bit-field definitions for SNP_PLATFORM_STATUS command Define the new bit fields returned by the SNP_PLATFORM_STATUS command, covering new capabilities such as SNP_FEATURE_INFO command availability and whether ciphertext hiding is supported and enabled.
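For reference, a hedged userspace sketch that reads the new bits through the existing /dev/sev character device, assuming SEV_ISSUE_CMD and SNP_PLATFORM_STATUS as exposed by <linux/psp-sev.h>; error reporting is minimal:

  #include <fcntl.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <sys/ioctl.h>
  #include <linux/psp-sev.h>

  int main(void)
  {
          struct sev_user_data_snp_status st = { 0 };
          struct sev_issue_cmd cmd = {
                  .cmd = SNP_PLATFORM_STATUS,
                  .data = (uintptr_t)&st,
          };
          int fd = open("/dev/sev", O_RDWR);

          if (fd < 0 || ioctl(fd, SEV_ISSUE_CMD, &cmd))
                  return 1;
          /* New fields added by this patch */
          printf("feature_info=%u rapl_dis=%u ct_hiding cap=%u en=%u\n",
                 st.feature_info, st.rapl_dis,
                 st.ciphertext_hiding_cap, st.ciphertext_hiding_en);
          return 0;
  }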
Reviewed-by: Tom Lendacky Signed-off-by: Ashish Kalra Reviewed-by: Kim Phillips Signed-off-by: Herbert Xu --- include/uapi/linux/psp-sev.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h index eeb20dfb1fda..c2fd324623c4 100644 --- a/include/uapi/linux/psp-sev.h +++ b/include/uapi/linux/psp-sev.h @@ -185,6 +185,10 @@ struct sev_user_data_get_id2 { * @mask_chip_id: whether chip id is present in attestation reports or not * @mask_chip_key: whether attestation reports are signed or not * @vlek_en: VLEK (Version Loaded Endorsement Key) hashstick is loaded + * @feature_info: whether SNP_FEATURE_INFO command is available + * @rapl_dis: whether RAPL is disabled + * @ciphertext_hiding_cap: whether platform has ciphertext hiding capability + * @ciphertext_hiding_en: whether ciphertext hiding is enabled * @rsvd1: reserved * @guest_count: the number of guest currently managed by the firmware * @current_tcb_version: current TCB version @@ -200,7 +204,11 @@ struct sev_user_data_snp_status { __u32 mask_chip_id:1; /* Out */ __u32 mask_chip_key:1; /* Out */ __u32 vlek_en:1; /* Out */ - __u32 rsvd1:29; + __u32 feature_info:1; /* Out */ + __u32 rapl_dis:1; /* Out */ + __u32 ciphertext_hiding_cap:1; /* Out */ + __u32 ciphertext_hiding_en:1; /* Out */ + __u32 rsvd1:25; __u32 guest_count; /* Out */ __u64 current_tcb_version; /* Out */ __u64 reported_tcb_version; /* Out */ -- cgit v1.2.3 From 63740349eba78f242bcbf60d5244d7f2b2600853 Mon Sep 17 00:00:00 2001 From: Li Li Date: Sun, 27 Jul 2025 18:29:06 +0000 Subject: binder: introduce transaction reports via netlink Introduce a generic netlink multicast event to report binder transaction failures to userspace. This allows subscribers to monitor these events and take appropriate actions, such as stopping a misbehaving application that is spamming a service with a huge amount of transactions. The multicast event contains the full details of the failed transaction, including the sender/target PIDs, the payload size, and the specific error code. This interface is defined using a YAML spec, from which the UAPI and kernel headers and source are auto-generated.
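As a consumer-side illustration, a minimal libnl-3 sketch that joins the new multicast group; a real monitor would install a callback with nl_socket_modify_cb() to parse the BINDER_A_REPORT_* attributes:

  #include <netlink/netlink.h>
  #include <netlink/genl/genl.h>
  #include <netlink/genl/ctrl.h>

  int main(void)
  {
          struct nl_sock *sk = nl_socket_alloc();
          int grp;

          if (!sk || genl_connect(sk))
                  return 1;
          /* Resolve the "report" group of the "binder" genl family */
          grp = genl_ctrl_resolve_grp(sk, "binder", "report");
          if (grp < 0 || nl_socket_add_membership(sk, grp))
                  return 1;
          nl_socket_disable_seq_check(sk);  /* multicast events */
          for (;;)
                  nl_recvmsgs_default(sk);  /* dispatch incoming reports */
  }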
Signed-off-by: Li Li Signed-off-by: Carlos Llamas Link: https://lore.kernel.org/r/20250727182932.2499194-4-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- Documentation/netlink/specs/binder.yaml | 93 +++++++++++++++++++++++++++++ MAINTAINERS | 1 + drivers/android/Kconfig | 1 + drivers/android/Makefile | 2 +- drivers/android/binder.c | 85 ++++++++++++++++++++++++-- drivers/android/binder_netlink.c | 31 ++++++++++ drivers/android/binder_netlink.h | 20 +++++++ include/uapi/linux/android/binder_netlink.h | 37 ++++++++++++ 8 files changed, 265 insertions(+), 5 deletions(-) create mode 100644 Documentation/netlink/specs/binder.yaml create mode 100644 drivers/android/binder_netlink.c create mode 100644 drivers/android/binder_netlink.h create mode 100644 include/uapi/linux/android/binder_netlink.h (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/binder.yaml b/Documentation/netlink/specs/binder.yaml new file mode 100644 index 000000000000..140b77a6afee --- /dev/null +++ b/Documentation/netlink/specs/binder.yaml @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +# +# Copyright 2025 Google LLC +# +--- +name: binder +protocol: genetlink +uapi-header: linux/android/binder_netlink.h +doc: Binder interface over generic netlink + +attribute-sets: + - + name: report + doc: | + Attributes included within a transaction failure report. The elements + correspond directly with the specific transaction that failed, along + with the error returned to the sender e.g. BR_DEAD_REPLY. + + attributes: + - + name: error + type: u32 + doc: The enum binder_driver_return_protocol returned to the sender. + - + name: context + type: string + doc: The binder context where the transaction occurred. + - + name: from_pid + type: u32 + doc: The PID of the sender process. + - + name: from_tid + type: u32 + doc: The TID of the sender thread. + - + name: to_pid + type: u32 + doc: | + The PID of the recipient process. This attribute may not be present + if the target could not be determined. + - + name: to_tid + type: u32 + doc: | + The TID of the recipient thread. This attribute may not be present + if the target could not be determined. + - + name: is_reply + type: flag + doc: When present, indicates the failed transaction is a reply. + - + name: flags + type: u32 + doc: The bitmask of enum transaction_flags from the transaction. + - + name: code + type: u32 + doc: The application-defined code from the transaction. + - + name: data_size + type: u32 + doc: The transaction payload size in bytes. + +operations: + list: + - + name: report + doc: | + A multicast event sent to userspace subscribers to notify them about + binder transaction failures. The generated report provides the full + details of the specific transaction that failed. The intention is for + programs to monitor these events and react to the failures as needed. 
+ + attribute-set: report + mcgrp: report + event: + attributes: + - error + - context + - from_pid + - from_tid + - to_pid + - to_tid + - is_reply + - flags + - code + - data_size + +mcast-groups: + list: + - + name: report diff --git a/MAINTAINERS b/MAINTAINERS index daf520a13bdf..febc92cca659 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1790,6 +1790,7 @@ M: Suren Baghdasaryan L: linux-kernel@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git +F: Documentation/netlink/specs/binder.yaml F: drivers/android/ ANDROID GOLDFISH PIC DRIVER diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index 5b3b8041f827..75af3cf472c8 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -4,6 +4,7 @@ menu "Android" config ANDROID_BINDER_IPC bool "Android Binder IPC Driver" depends on MMU + depends on NET default n help Binder is used in Android for both communication between processes, diff --git a/drivers/android/Makefile b/drivers/android/Makefile index c5d47be0276c..f422f91e026b 100644 --- a/drivers/android/Makefile +++ b/drivers/android/Makefile @@ -2,5 +2,5 @@ ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_ANDROID_BINDERFS) += binderfs.o -obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o +obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o binder_netlink.o obj-$(CONFIG_ANDROID_BINDER_ALLOC_KUNIT_TEST) += tests/ diff --git a/drivers/android/binder.c b/drivers/android/binder.c index e4f9cbc6c93f..edfa15d39d6f 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -74,6 +74,7 @@ #include +#include "binder_netlink.h" #include "binder_internal.h" #include "binder_trace.h" @@ -2993,6 +2994,67 @@ static void binder_set_txn_from_error(struct binder_transaction *t, int id, binder_thread_dec_tmpref(from); } +/** + * binder_netlink_report() - report a transaction failure via netlink + * @proc: the binder proc sending the transaction + * @t: the binder transaction that failed + * @data_size: the user provided data size for the transaction + * @error: enum binder_driver_return_protocol returned to sender + */ +static void binder_netlink_report(struct binder_proc *proc, + struct binder_transaction *t, + u32 data_size, + u32 error) +{ + const char *context = proc->context->name; + struct sk_buff *skb; + void *hdr; + + if (!genl_has_listeners(&binder_nl_family, &init_net, + BINDER_NLGRP_REPORT)) + return; + + skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return; + + hdr = genlmsg_put(skb, 0, 0, &binder_nl_family, 0, BINDER_CMD_REPORT); + if (!hdr) + goto free_skb; + + if (nla_put_u32(skb, BINDER_A_REPORT_ERROR, error) || + nla_put_string(skb, BINDER_A_REPORT_CONTEXT, context) || + nla_put_u32(skb, BINDER_A_REPORT_FROM_PID, t->from_pid) || + nla_put_u32(skb, BINDER_A_REPORT_FROM_TID, t->from_tid)) + goto cancel_skb; + + if (t->to_proc && + nla_put_u32(skb, BINDER_A_REPORT_TO_PID, t->to_proc->pid)) + goto cancel_skb; + + if (t->to_thread && + nla_put_u32(skb, BINDER_A_REPORT_TO_TID, t->to_thread->pid)) + goto cancel_skb; + + if (t->is_reply && nla_put_flag(skb, BINDER_A_REPORT_IS_REPLY)) + goto cancel_skb; + + if (nla_put_u32(skb, BINDER_A_REPORT_FLAGS, t->flags) || + nla_put_u32(skb, BINDER_A_REPORT_CODE, t->code) || + nla_put_u32(skb, BINDER_A_REPORT_DATA_SIZE, data_size)) + goto cancel_skb; + + genlmsg_end(skb, hdr); + genlmsg_multicast(&binder_nl_family, skb, 0, BINDER_NLGRP_REPORT, + GFP_KERNEL); + return; + +cancel_skb: + genlmsg_cancel(skb, hdr); +free_skb: + 
nlmsg_free(skb); +} + static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, struct binder_transaction_data *tr, int reply, @@ -3679,10 +3741,13 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_copy_data_failed; } - if (t->buffer->oneway_spam_suspect) + if (t->buffer->oneway_spam_suspect) { tcomplete->type = BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT; - else + binder_netlink_report(proc, t, tr->data_size, + BR_ONEWAY_SPAM_SUSPECT); + } else { tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; + } if (reply) { binder_enqueue_thread_work(thread, tcomplete); @@ -3730,8 +3795,11 @@ static void binder_transaction(struct binder_proc *proc, * process and is put in a pending queue, waiting for the target * process to be unfrozen. */ - if (return_error == BR_TRANSACTION_PENDING_FROZEN) + if (return_error == BR_TRANSACTION_PENDING_FROZEN) { tcomplete->type = BINDER_WORK_TRANSACTION_PENDING; + binder_netlink_report(proc, t, tr->data_size, + return_error); + } binder_enqueue_thread_work(thread, tcomplete); if (return_error && return_error != BR_TRANSACTION_PENDING_FROZEN) @@ -3789,6 +3857,8 @@ err_invalid_target_handle: binder_dec_node(target_node, 1, 0); binder_dec_node_tmpref(target_node); } + + binder_netlink_report(proc, t, tr->data_size, return_error); kfree(t); binder_stats_deleted(BINDER_STAT_TRANSACTION); err_alloc_t_failed: @@ -7059,12 +7129,19 @@ static int __init binder_init(void) } } - ret = init_binderfs(); + ret = genl_register_family(&binder_nl_family); if (ret) goto err_init_binder_device_failed; + ret = init_binderfs(); + if (ret) + goto err_init_binderfs_failed; + return ret; +err_init_binderfs_failed: + genl_unregister_family(&binder_nl_family); + err_init_binder_device_failed: hlist_for_each_entry_safe(device, tmp, &binder_devices, hlist) { misc_deregister(&device->miscdev); diff --git a/drivers/android/binder_netlink.c b/drivers/android/binder_netlink.c new file mode 100644 index 000000000000..d05397a50ca6 --- /dev/null +++ b/drivers/android/binder_netlink.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/binder.yaml */ +/* YNL-GEN kernel source */ + +#include +#include + +#include "binder_netlink.h" + +#include + +/* Ops table for binder */ +static const struct genl_split_ops binder_nl_ops[] = { +}; + +static const struct genl_multicast_group binder_nl_mcgrps[] = { + [BINDER_NLGRP_REPORT] = { "report", }, +}; + +struct genl_family binder_nl_family __ro_after_init = { + .name = BINDER_FAMILY_NAME, + .version = BINDER_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = binder_nl_ops, + .n_split_ops = ARRAY_SIZE(binder_nl_ops), + .mcgrps = binder_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(binder_nl_mcgrps), +}; diff --git a/drivers/android/binder_netlink.h b/drivers/android/binder_netlink.h new file mode 100644 index 000000000000..882c7a6b537e --- /dev/null +++ b/drivers/android/binder_netlink.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/binder.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_BINDER_GEN_H +#define _LINUX_BINDER_GEN_H + +#include +#include + +#include + +enum { + BINDER_NLGRP_REPORT, +}; + +extern struct genl_family binder_nl_family; + +#endif /* _LINUX_BINDER_GEN_H */ 
diff --git a/include/uapi/linux/android/binder_netlink.h b/include/uapi/linux/android/binder_netlink.h new file mode 100644 index 000000000000..b218f96d6668 --- /dev/null +++ b/include/uapi/linux/android/binder_netlink.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/binder.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_ANDROID_BINDER_NETLINK_H +#define _UAPI_LINUX_ANDROID_BINDER_NETLINK_H + +#define BINDER_FAMILY_NAME "binder" +#define BINDER_FAMILY_VERSION 1 + +enum { + BINDER_A_REPORT_ERROR = 1, + BINDER_A_REPORT_CONTEXT, + BINDER_A_REPORT_FROM_PID, + BINDER_A_REPORT_FROM_TID, + BINDER_A_REPORT_TO_PID, + BINDER_A_REPORT_TO_TID, + BINDER_A_REPORT_IS_REPLY, + BINDER_A_REPORT_FLAGS, + BINDER_A_REPORT_CODE, + BINDER_A_REPORT_DATA_SIZE, + + __BINDER_A_REPORT_MAX, + BINDER_A_REPORT_MAX = (__BINDER_A_REPORT_MAX - 1) +}; + +enum { + BINDER_CMD_REPORT = 1, + + __BINDER_CMD_MAX, + BINDER_CMD_MAX = (__BINDER_CMD_MAX - 1) +}; + +#define BINDER_MCGRP_REPORT "report" + +#endif /* _UAPI_LINUX_ANDROID_BINDER_NETLINK_H */ -- cgit v1.2.3 From 8151320c747efb22d30b035af989fed0d502176e Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Tue, 22 Jul 2025 22:32:33 +0800 Subject: ACPI: pfr_update: Fix the driver update version check The security-version-number check should be used rather than the runtime version check for driver updates. Otherwise, the firmware update would fail when the update binary had a lower runtime version number than the current one. Fixes: 0db89fa243e5 ("ACPI: Introduce Platform Firmware Runtime Update device driver") Cc: 5.17+ # 5.17+ Reported-by: "Govindarajulu, Hariganesh" Signed-off-by: Chen Yu Link: https://patch.msgid.link/20250722143233.3970607-1-yu.c.chen@intel.com [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/pfr_update.c | 2 +- include/uapi/linux/pfrut.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/acpi/pfr_update.c b/drivers/acpi/pfr_update.c index 318683744ed1..11b1c2828005 100644 --- a/drivers/acpi/pfr_update.c +++ b/drivers/acpi/pfr_update.c @@ -329,7 +329,7 @@ static bool applicable_image(const void *data, struct pfru_update_cap_info *cap, if (type == PFRU_CODE_INJECT_TYPE) return payload_hdr->rt_ver >= cap->code_rt_version; - return payload_hdr->rt_ver >= cap->drv_rt_version; + return payload_hdr->svn_ver >= cap->drv_svn; } static void print_update_debug_info(struct pfru_updated_result *result, diff --git a/include/uapi/linux/pfrut.h b/include/uapi/linux/pfrut.h index 42fa15f8310d..b77d5c210c26 100644 --- a/include/uapi/linux/pfrut.h +++ b/include/uapi/linux/pfrut.h @@ -89,6 +89,7 @@ struct pfru_payload_hdr { __u32 hw_ver; __u32 rt_ver; __u8 platform_id[16]; + __u32 svn_ver; }; enum pfru_dsm_status { -- cgit v1.2.3 From 620a50c927004f5c9420a7ca9b1a55673dbf3941 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 21 Aug 2025 12:02:07 +0800 Subject: io_uring: uring_cmd: add multishot support Add UAPI flag IORING_URING_CMD_MULTISHOT for supporting multishot uring_cmd operations with provided buffer. 
This enables drivers to post multiple completion events from a single uring_cmd submission, which is useful for: - Notifying userspace of device events (e.g., interrupt handling) - Supporting devices with multiple event sources (e.g., multi-queue devices) - Avoiding the need for device poll() support when events originate from multiple sources device-wide The implementation adds two new APIs: - io_uring_cmd_buffer_select(): selects a buffer from the provided buffer group for multishot uring_cmd - io_uring_mshot_cmd_post_cqe(): posts a CQE after event data is pushed to the provided buffer Multishot uring_cmd must be used with buffer select (IOSQE_BUFFER_SELECT) and is mutually exclusive with IORING_URING_CMD_FIXED for now. The ublk driver will be the first user of this functionality: https://github.com/ming1/linux/commits/ublk-devel/ Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250821040210.1152145-3-ming.lei@redhat.com [axboe: fold in fix for !CONFIG_IO_URING] Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 26 ++++++++++++++++ include/uapi/linux/io_uring.h | 6 +++- io_uring/opdef.c | 1 + io_uring/uring_cmd.c | 71 ++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 102 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index cfa6d0c0c322..4bd3a7339243 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -70,6 +70,21 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, /* Execute the request from a blocking context */ void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd); +/* + * Select a buffer from the provided buffer group for multishot uring_cmd. + * Returns the selected buffer address and size. + */ +struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, + unsigned buf_group, size_t *len, + unsigned int issue_flags); + +/* + * Complete a multishot uring_cmd event. This will post a CQE to the completion + * queue and update the provided buffer. + */ +bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, + struct io_br_sel *sel, unsigned int issue_flags); + #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, @@ -102,6 +117,17 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, static inline void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd) { } +static inline struct io_br_sel +io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, unsigned buf_group, + size_t *len, unsigned int issue_flags) +{ + return (struct io_br_sel) { .val = -EOPNOTSUPP }; +} +static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, + ssize_t ret, unsigned int issue_flags) +{ + return true; +} #endif /* diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 6957dc539d83..1e935f8901c5 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -298,9 +298,13 @@ enum io_uring_op { * sqe->uring_cmd_flags top 8bits aren't available for userspace * IORING_URING_CMD_FIXED use registered buffer; pass this flag * along with setting sqe->buf_index. + * IORING_URING_CMD_MULTISHOT must be used with buffer select, like other + * multishot commands. Not compatible with + * IORING_URING_CMD_FIXED, for now.
*/ #define IORING_URING_CMD_FIXED (1U << 0) -#define IORING_URING_CMD_MASK IORING_URING_CMD_FIXED +#define IORING_URING_CMD_MULTISHOT (1U << 1) +#define IORING_URING_CMD_MASK (IORING_URING_CMD_FIXED | IORING_URING_CMD_MULTISHOT) /* diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 9568785810d9..932319633eac 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -413,6 +413,7 @@ const struct io_issue_def io_issue_defs[] = { #endif }, [IORING_OP_URING_CMD] = { + .buffer_select = 1, .needs_file = 1, .plug = 1, .iopoll = 1, diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 053bac89b6c0..3cfb5d51b88a 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -11,6 +11,7 @@ #include "io_uring.h" #include "alloc_cache.h" #include "rsrc.h" +#include "kbuf.h" #include "uring_cmd.h" #include "poll.h" @@ -194,8 +195,21 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (ioucmd->flags & ~IORING_URING_CMD_MASK) return -EINVAL; - if (ioucmd->flags & IORING_URING_CMD_FIXED) + if (ioucmd->flags & IORING_URING_CMD_FIXED) { + if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) + return -EINVAL; req->buf_index = READ_ONCE(sqe->buf_index); + } + + if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) { + if (ioucmd->flags & IORING_URING_CMD_FIXED) + return -EINVAL; + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return -EINVAL; + } else { + if (req->flags & REQ_F_BUFFER_SELECT) + return -EINVAL; + } ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); @@ -251,6 +265,10 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) } ret = file->f_op->uring_cmd(ioucmd, issue_flags); + if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) { + if (ret >= 0) + return IOU_ISSUE_SKIP_COMPLETE; + } if (ret == -EAGAIN) { ioucmd->flags |= IORING_URING_CMD_REISSUE; return ret; @@ -333,3 +351,54 @@ bool io_uring_cmd_post_mshot_cqe32(struct io_uring_cmd *cmd, return false; return io_req_post_cqe32(req, cqe); } + +/* + * Work with io_uring_mshot_cmd_post_cqe() together for committing the + * provided buffer upfront + */ +struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, + unsigned buf_group, size_t *len, + unsigned int issue_flags) +{ + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + + if (!(ioucmd->flags & IORING_URING_CMD_MULTISHOT)) + return (struct io_br_sel) { .val = -EINVAL }; + + if (WARN_ON_ONCE(!io_do_buffer_select(req))) + return (struct io_br_sel) { .val = -EINVAL }; + + return io_buffer_select(req, len, buf_group, issue_flags); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_buffer_select); + +/* + * Return true if this multishot uring_cmd needs to be completed, otherwise + * the event CQE is posted successfully. + * + * This function must use `struct io_br_sel` returned from + * io_uring_cmd_buffer_select() for committing the buffer in the same + * uring_cmd submission context. 
+ */ +bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, + struct io_br_sel *sel, unsigned int issue_flags) +{ + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + unsigned int cflags = 0; + + if (!(ioucmd->flags & IORING_URING_CMD_MULTISHOT)) + return true; + + if (sel->val > 0) { + cflags = io_put_kbuf(req, sel->val, sel->buf_list); + if (io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) + return false; + } + + io_kbuf_recycle(req, sel->buf_list, issue_flags); + if (sel->val < 0) + req_set_fail(req); + io_req_set_res(req, sel->val, cflags); + return true; +} +EXPORT_SYMBOL_GPL(io_uring_mshot_cmd_post_cqe); -- cgit v1.2.3 From b69458735d826f0676585623d028a0fd474f3e4f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Aug 2025 14:08:14 -0600 Subject: io_uring: add UAPI definitions for mixed CQE postings This adds the CQE flags related to supporting a mixed CQ ring mode, where both normal (16b) and big (32b) CQEs may be posted. No functional changes in this patch. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 1e935f8901c5..7af8d10b3aba 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -491,12 +491,22 @@ struct io_uring_cqe { * other provided buffer type, all completions with a * buffer passed back is automatically returned to the * application. + * IORING_CQE_F_SKIP If set, then the application/liburing must ignore this + * CQE. It's only purpose is to fill a gap in the ring, + * if a large CQE is attempted posted when the ring has + * just a single small CQE worth of space left before + * wrapping. + * IORING_CQE_F_32 If set, this is a 32b/big-cqe posting. Use with rings + * setup in a mixed CQE mode, where both 16b and 32b + * CQEs may be posted to the CQ ring. */ #define IORING_CQE_F_BUFFER (1U << 0) #define IORING_CQE_F_MORE (1U << 1) #define IORING_CQE_F_SOCK_NONEMPTY (1U << 2) #define IORING_CQE_F_NOTIF (1U << 3) #define IORING_CQE_F_BUF_MORE (1U << 4) +#define IORING_CQE_F_SKIP (1U << 5) +#define IORING_CQE_F_32 (1U << 15) #define IORING_CQE_BUFFER_SHIFT 16 -- cgit v1.2.3 From 24fc631539cc78225f5c61f99c7666fcff48024d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Aug 2025 23:39:57 -0700 Subject: vhost: Fix ioctl # for VHOST_[GS]ET_FORK_FROM_OWNER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The VHOST_[GS]ET_FEATURES_ARRAY ioctl already took 0x83 and it would result in a build error when the vhost uapi header is used for perf tool build like below. In file included from trace/beauty/ioctl.c:93: tools/perf/trace/beauty/generated/ioctl/vhost_virtio_ioctl_array.c: In function ‘ioctl__scnprintf_vhost_virtio_cmd’: tools/perf/trace/beauty/generated/ioctl/vhost_virtio_ioctl_array.c:36:18: error: initialized field overwritten [-Werror=override-init] 36 | [0x83] = "SET_FORK_FROM_OWNER", | ^~~~~~~~~~~~~~~~~~~~~ tools/perf/trace/beauty/generated/ioctl/vhost_virtio_ioctl_array.c:36:18: note: (near initialization for ‘vhost_virtio_ioctl_cmds[131]’) Fixes: 7d9896e9f6d02d8a ("vhost: Reintroduce kthread API and add mode selection") Signed-off-by: Namhyung Kim Message-Id: <20250819063958.833770-1-namhyung@kernel.org> Signed-off-by: Michael S. 
Tsirkin Tested-by: Lei Yang --- include/uapi/linux/vhost.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 283348b64af9..c57674a6aa0d 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -260,7 +260,7 @@ * When fork_owner is set to VHOST_FORK_OWNER_KTHREAD: * - Vhost will create vhost workers as kernel threads. */ -#define VHOST_SET_FORK_FROM_OWNER _IOW(VHOST_VIRTIO, 0x83, __u8) +#define VHOST_SET_FORK_FROM_OWNER _IOW(VHOST_VIRTIO, 0x84, __u8) /** * VHOST_GET_FORK_OWNER - Get the current fork_owner flag for the vhost device. @@ -268,6 +268,6 @@ * * @return: An 8-bit value indicating the current thread mode. */ -#define VHOST_GET_FORK_FROM_OWNER _IOR(VHOST_VIRTIO, 0x84, __u8) +#define VHOST_GET_FORK_FROM_OWNER _IOR(VHOST_VIRTIO, 0x85, __u8) #endif -- cgit v1.2.3 From da0e2197645c8e01bb6080c7a2b86d9a56cc64a9 Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Sun, 24 Aug 2025 11:43:53 +0300 Subject: devlink: Make health reporter burst period configurable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable configuration of the burst period — a time window starting from the first error recovery, during which the reporter allows recovery attempts for each reported error. This feature is helpful when a single underlying issue causes multiple errors, as it delays the start of the grace period to allow sufficient time for recovering all related errors. For example, if multiple TX queues time out simultaneously, a sufficient burst period could allow all affected TX queues to be recovered within that window. Without this period, only the first TX queue that reports a timeout will undergo recovery, while the remaining TX queues will be blocked once the grace period begins. Configuration example: $ devlink health set pci/0000:00:09.0 reporter tx burst_period 500 Configuration example with ynl: ./tools/net/ynl/pyynl/cli.py \ --spec Documentation/netlink/specs/devlink.yaml \ --do health-reporter-set --json '{ "bus-name": "auxiliary", "dev-name": "mlx5_core.eth.0", "port-index": 65535, "health-reporter-name": "tx", "health-reporter-burst-period": 500 }' Signed-off-by: Shahar Shitrit Reviewed-by: Jiri Pirko Reviewed-by: Dragos Tatulea Reviewed-by: Carolina Jubran Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250824084354.533182-5-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/devlink.yaml | 7 ++++++ .../networking/devlink/devlink-health.rst | 2 +- include/uapi/linux/devlink.h | 2 ++ net/devlink/health.c | 28 ++++++++++++++++++++-- net/devlink/netlink_gen.c | 5 ++-- 5 files changed, 39 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml index bb87111d5e16..3db59c965869 100644 --- a/Documentation/netlink/specs/devlink.yaml +++ b/Documentation/netlink/specs/devlink.yaml @@ -853,6 +853,10 @@ attribute-sets: type: nest multi-attr: true nested-attributes: dl-rate-tc-bws + - + name: health-reporter-burst-period + type: u64 + doc: Time (in msec) for recoveries before starting the grace period. 
- name: dl-dev-stats subset-of: devlink @@ -1216,6 +1220,8 @@ attribute-sets: name: health-reporter-dump-ts-ns - name: health-reporter-auto-dump + - + name: health-reporter-burst-period - name: dl-attr-stats @@ -1961,6 +1967,7 @@ operations: - health-reporter-graceful-period - health-reporter-auto-recover - health-reporter-auto-dump + - health-reporter-burst-period - name: health-reporter-recover diff --git a/Documentation/networking/devlink/devlink-health.rst b/Documentation/networking/devlink/devlink-health.rst index e0b8cfed610a..4d10536377ab 100644 --- a/Documentation/networking/devlink/devlink-health.rst +++ b/Documentation/networking/devlink/devlink-health.rst @@ -50,7 +50,7 @@ Once an error is reported, devlink health will perform the following actions: * Auto recovery attempt is being done. Depends on: - Auto-recovery configuration - - Grace period vs. time passed since last recover + - Grace period (and burst period) vs. time passed since last recover Devlink formatted message ========================= diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 9fcb25a0f447..bcad11a787a5 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -636,6 +636,8 @@ enum devlink_attr { DEVLINK_ATTR_RATE_TC_BWS, /* nested */ + DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD, /* u64 */ + /* Add new attributes above here, update the spec in * Documentation/netlink/specs/devlink.yaml and re-generate * net/devlink/netlink_gen.c. diff --git a/net/devlink/health.c b/net/devlink/health.c index 94ab77f77add..136a67c36a20 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -116,6 +116,9 @@ __devlink_health_reporter_create(struct devlink *devlink, if (WARN_ON(ops->default_graceful_period && !ops->recover)) return ERR_PTR(-EINVAL); + if (WARN_ON(ops->default_burst_period && !ops->default_graceful_period)) + return ERR_PTR(-EINVAL); + reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); if (!reporter) return ERR_PTR(-ENOMEM); @@ -293,6 +296,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, reporter->graceful_period)) goto reporter_nest_cancel; + if (reporter->ops->recover && + devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD, + reporter->burst_period)) + goto reporter_nest_cancel; if (reporter->ops->recover && nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, reporter->auto_recover)) @@ -458,16 +465,33 @@ int devlink_nl_health_reporter_set_doit(struct sk_buff *skb, if (!reporter->ops->recover && (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] || - info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] || + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD])) return -EOPNOTSUPP; if (!reporter->ops->dump && info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) return -EOPNOTSUPP; - if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) { reporter->graceful_period = nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]); + if (!reporter->graceful_period) + reporter->burst_period = 0; + } + + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]) { + u64 burst_period = + nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]); + + if (!reporter->graceful_period && burst_period) { + NL_SET_ERR_MSG_MOD(info->extack, + "Cannot set burst period without a grace period."); + return 
-EINVAL; + } + + reporter->burst_period = burst_period; + } if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]) reporter->auto_recover = diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c index d97c326a9045..9fd00977d59e 100644 --- a/net/devlink/netlink_gen.c +++ b/net/devlink/netlink_gen.c @@ -389,7 +389,7 @@ static const struct nla_policy devlink_health_reporter_get_dump_nl_policy[DEVLIN }; /* DEVLINK_CMD_HEALTH_REPORTER_SET - do */ -static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP + 1] = { +static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, @@ -397,6 +397,7 @@ static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATT [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64, }, [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8, }, [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8, }, + [DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD] = { .type = NLA_U64, }, }; /* DEVLINK_CMD_HEALTH_REPORTER_RECOVER - do */ @@ -1032,7 +1033,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { .doit = devlink_nl_health_reporter_set_doit, .post_doit = devlink_nl_post_doit, .policy = devlink_health_reporter_set_nl_policy, - .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, + .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { -- cgit v1.2.3 From 3d3a04fad25a6621828518a2abe536142d2c1a7d Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 29 Jul 2025 15:54:52 -0700 Subject: KVM: Allow and advertise support for host mmap() on guest_memfd files Now that all the x86 and arm64 plumbing for mmap() on guest_memfd is in place, allow userspace to set GUEST_MEMFD_FLAG_MMAP and advertise support via a new capability, KVM_CAP_GUEST_MEMFD_MMAP. The availability of this capability is determined per architecture, and its enablement for a specific guest_memfd instance is controlled by the GUEST_MEMFD_FLAG_MMAP flag at creation time. Update the KVM API documentation to detail the KVM_CAP_GUEST_MEMFD_MMAP capability, the associated GUEST_MEMFD_FLAG_MMAP, and provide essential information regarding support for mmap in guest_memfd. Reviewed-by: David Hildenbrand Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Reviewed-by: Xiaoyao Li Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson Message-ID: <20250729225455.670324-22-seanjc@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 9 +++++++++ include/uapi/linux/kvm.h | 2 ++ virt/kvm/guest_memfd.c | 7 ++++++- virt/kvm/kvm_main.c | 2 ++ 4 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 6aa40ee05a4a..c17a87a0a5ac 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6414,6 +6414,15 @@ most one mapping per page, i.e. binding multiple memory regions to a single guest_memfd range is not allowed (any number of memory regions can be bound to a single guest_memfd file, but the bound ranges must not overlap). +When the capability KVM_CAP_GUEST_MEMFD_MMAP is supported, the 'flags' field +supports GUEST_MEMFD_FLAG_MMAP. 
Setting this flag on guest_memfd creation +enables mmap() and faulting of guest_memfd memory to host userspace. + +When the KVM MMU performs a PFN lookup to service a guest fault and the backing +guest_memfd has the GUEST_MEMFD_FLAG_MMAP set, then the fault will always be +consumed from guest_memfd, regardless of whether it is a shared or a private +fault. + See KVM_SET_USER_MEMORY_REGION2 for additional details. 4.143 KVM_PRE_FAULT_MEMORY diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f0f0d49d2544..6efa98a57ec1 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -962,6 +962,7 @@ struct kvm_enable_cap { #define KVM_CAP_ARM_EL2_E2H0 241 #define KVM_CAP_RISCV_MP_STATE_RESET 242 #define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243 +#define KVM_CAP_GUEST_MEMFD_MMAP 244 struct kvm_irq_routing_irqchip { __u32 irqchip; @@ -1598,6 +1599,7 @@ struct kvm_memory_attributes { #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3) #define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd) +#define GUEST_MEMFD_FLAG_MMAP (1ULL << 0) struct kvm_create_guest_memfd { __u64 size; diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index d5b445548af4..08a6bc7d25b6 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -314,7 +314,9 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) static bool kvm_gmem_supports_mmap(struct inode *inode) { - return false; + const u64 flags = (u64)inode->i_private; + + return flags & GUEST_MEMFD_FLAG_MMAP; } static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) @@ -522,6 +524,9 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args) u64 flags = args->flags; u64 valid_flags = 0; + if (kvm_arch_supports_gmem_mmap(kvm)) + valid_flags |= GUEST_MEMFD_FLAG_MMAP; + if (flags & ~valid_flags) return -EINVAL; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4f57cb92e109..18f29ef93543 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4918,6 +4918,8 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #ifdef CONFIG_KVM_GUEST_MEMFD case KVM_CAP_GUEST_MEMFD: return 1; + case KVM_CAP_GUEST_MEMFD_MMAP: + return !kvm || kvm_arch_supports_gmem_mmap(kvm); #endif default: break; -- cgit v1.2.3 From 7a37f55af7af868119b4fb69285f5fa03ba8cf35 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 5 Aug 2025 17:10:56 +0200 Subject: fuse: add COPY_FILE_RANGE_64 that allows large copies The FUSE protocol uses struct fuse_write_out to convey the return value of copy_file_range, which is restricted to uint32_t. But the COPY_FILE_RANGE interface supports a 64-bit size copies and there's no reason why copies should be limited to 32-bit. Introduce a new op COPY_FILE_RANGE_64, which is identical, except the number of bytes copied is returned in a 64-bit value. If the fuse server does not support COPY_FILE_RANGE_64, fall back to COPY_FILE_RANGE. 
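As a sketch only (not part of this patch): a libfuse-style server might implement the new opcode roughly as below. The handler name and the resolution of src_fd/dst_fd from fh_in/fh_out are assumed for illustration; the request/reply layouts follow the uapi structs added by this patch, and copy_file_range(2) needs _GNU_SOURCE and <unistd.h>:

	/* Hypothetical server-side handler; src_fd/dst_fd are assumed to have
	 * been looked up from in->fh_in and in->fh_out by the server. */
	static int handle_copy_file_range_64(int src_fd, int dst_fd,
					     const struct fuse_copy_file_range_in *in,
					     struct fuse_copy_file_range_out *out)
	{
		off_t off_in = in->off_in;
		off_t off_out = in->off_out;
		/* in->len is a full 64-bit length; no "UINT_MAX & PAGE_MASK"
		 * clamp as forced by the 32-bit fuse_write_out reply */
		ssize_t n = copy_file_range(src_fd, &off_in, dst_fd, &off_out,
					    in->len, in->flags);

		if (n < 0)
			return -errno;
		out->bytes_copied = n;	/* 64-bit reply field */
		return 0;
	}
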
Reported-by: Florian Weimer Closes: https://lore.kernel.org/all/lhuh5ynl8z5.fsf@oldenburg.str.redhat.com/ Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 44 ++++++++++++++++++++++++++++++++------------ fs/fuse/fuse_i.h | 3 +++ include/uapi/linux/fuse.h | 12 +++++++++++- 3 files changed, 46 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4adcf09d4b01..867b5fde1237 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2960,10 +2960,12 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, .nodeid_out = ff_out->nodeid, .fh_out = ff_out->fh, .off_out = pos_out, - .len = min_t(size_t, len, UINT_MAX & PAGE_MASK), + .len = len, .flags = flags }; struct fuse_write_out outarg; + struct fuse_copy_file_range_out outarg_64; + u64 bytes_copied; ssize_t err; /* mark unstable when write-back is not used, and file_out gets * extended */ @@ -3013,33 +3015,51 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (is_unstable) set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); - args.opcode = FUSE_COPY_FILE_RANGE; + args.opcode = FUSE_COPY_FILE_RANGE_64; args.nodeid = ff_in->nodeid; args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.out_numargs = 1; - args.out_args[0].size = sizeof(outarg); - args.out_args[0].value = &outarg; + args.out_args[0].size = sizeof(outarg_64); + args.out_args[0].value = &outarg_64; + if (fc->no_copy_file_range_64) { +fallback: + /* Fall back to old op that can't handle large copy length */ + args.opcode = FUSE_COPY_FILE_RANGE; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + inarg.len = len = min_t(size_t, len, UINT_MAX & PAGE_MASK); + } err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_copy_file_range = 1; - err = -EOPNOTSUPP; + if (fc->no_copy_file_range_64) { + fc->no_copy_file_range = 1; + err = -EOPNOTSUPP; + } else { + fc->no_copy_file_range_64 = 1; + goto fallback; + } } - if (!err && outarg.size > len) - err = -EIO; - if (err) goto out; + bytes_copied = fc->no_copy_file_range_64 ? + outarg.size : outarg_64.bytes_copied; + + if (bytes_copied > len) { + err = -EIO; + goto out; + } + truncate_inode_pages_range(inode_out->i_mapping, ALIGN_DOWN(pos_out, PAGE_SIZE), - ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1); + ALIGN(pos_out + bytes_copied, PAGE_SIZE) - 1); file_update_time(file_out); - fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size); + fuse_write_update_attr(inode_out, pos_out + bytes_copied, bytes_copied); - err = outarg.size; + err = bytes_copied; out: if (is_unstable) clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index cc428d04be3e..d68eea4dd743 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -856,6 +856,9 @@ struct fuse_conn { /** Does the filesystem support copy_file_range? */ unsigned no_copy_file_range:1; + /** Does the filesystem support copy_file_range_64? 
*/ + unsigned no_copy_file_range_64:1; + + /* Send DESTROY request */ unsigned int destroy:1; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 122d6586e8d4..94621f68a5cc 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -235,6 +235,10 @@ * * 7.44 * - add FUSE_NOTIFY_INC_EPOCH + * + * 7.45 + * - add FUSE_COPY_FILE_RANGE_64 + * - add struct fuse_copy_file_range_out */ #ifndef _LINUX_FUSE_H @@ -270,7 +274,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 44 +#define FUSE_KERNEL_MINOR_VERSION 45 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -657,6 +661,7 @@ enum fuse_opcode { FUSE_SYNCFS = 50, FUSE_TMPFILE = 51, FUSE_STATX = 52, + FUSE_COPY_FILE_RANGE_64 = 53, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -1148,6 +1153,11 @@ struct fuse_copy_file_range_in { uint64_t flags; }; +/* For FUSE_COPY_FILE_RANGE_64 */ +struct fuse_copy_file_range_out { + uint64_t bytes_copied; +}; + #define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0) #define FUSE_SETUPMAPPING_FLAG_READ (1ull << 1) struct fuse_setupmapping_in { -- cgit v1.2.3 From e26dca67fde194340582cfbb0c0bf661825e9e46 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Aug 2025 14:14:41 -0600 Subject: io_uring: add support for IORING_SETUP_CQE_MIXED Normal rings support 16b CQEs for posting completions, while certain features require the ring to be configured with IORING_SETUP_CQE32, as they need to convey more information per completion. This, in turn, makes ALL the CQEs be 32b in size. This is somewhat wasteful and inefficient, particularly when only certain CQEs need to be of the bigger variant. This adds support for setting up a ring with mixed CQE sizes, using IORING_SETUP_CQE_MIXED. When set up in this mode, CQEs posted to the ring may be either 16b or 32b in size. If a CQE is 32b in size, then IORING_CQE_F_32 is set in the CQE flags to indicate that this is the case. If this flag isn't set, the CQE is the normal 16b variant. CQEs on these types of mixed rings may also have IORING_CQE_F_SKIP set. This can happen if the ring is one (small) CQE entry away from wrapping, and an attempt is made to post a 32b CQE. As CQEs must be contiguous in the CQ ring, a 32b CQE cannot wrap the ring. For this case, a single dummy CQE is posted with the SKIP flag set. The application should simply ignore those. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 6 ++++ io_uring/io_uring.c | 78 ++++++++++++++++++++++++++++++++++--------- io_uring/io_uring.h | 49 +++++++++++++++++++-------- io_uring/register.c | 3 +- 4 files changed, 105 insertions(+), 31 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 7af8d10b3aba..5135e1be0390 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -225,6 +225,12 @@ enum io_uring_sqe_flags_bit { /* Use hybrid poll in iopoll process */ #define IORING_SETUP_HYBRID_IOPOLL (1U << 17) +/* + * Allow both 16b and 32b CQEs. If a 32b CQE is posted, it will have + * IORING_CQE_F_32 set in cqe->flags.
+ */ +#define IORING_SETUP_CQE_MIXED (1U << 18) + enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 5166f11f07c7..6c07efac977c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -620,27 +620,29 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx) static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying) { - size_t cqe_size = sizeof(struct io_uring_cqe); - lockdep_assert_held(&ctx->uring_lock); /* don't abort if we're dying, entries must get freed */ if (!dying && __io_cqring_events(ctx) == ctx->cq_entries) return; - if (ctx->flags & IORING_SETUP_CQE32) - cqe_size <<= 1; - io_cq_lock(ctx); while (!list_empty(&ctx->cq_overflow_list)) { + size_t cqe_size = sizeof(struct io_uring_cqe); struct io_uring_cqe *cqe; struct io_overflow_cqe *ocqe; + bool is_cqe32 = false; ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list); + if (ocqe->cqe.flags & IORING_CQE_F_32 || + ctx->flags & IORING_SETUP_CQE32) { + is_cqe32 = true; + cqe_size <<= 1; + } if (!dying) { - if (!io_get_cqe_overflow(ctx, &cqe, true)) + if (!io_get_cqe_overflow(ctx, &cqe, true, is_cqe32)) break; memcpy(cqe, &ocqe->cqe, cqe_size); } @@ -752,10 +754,12 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, { struct io_overflow_cqe *ocqe; size_t ocq_size = sizeof(struct io_overflow_cqe); - bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); + bool is_cqe32 = false; - if (is_cqe32) - ocq_size += sizeof(struct io_uring_cqe); + if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) { + is_cqe32 = true; + ocq_size <<= 1; + } ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT); trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe); @@ -773,12 +777,30 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, return ocqe; } +/* + * Fill an empty dummy CQE, in case alignment is off for posting a 32b CQE + * because the ring is a single 16b entry away from wrapping. + */ +static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off) +{ + if (__io_cqring_events(ctx) < ctx->cq_entries) { + struct io_uring_cqe *cqe = &ctx->rings->cqes[off]; + + cqe->user_data = 0; + cqe->res = 0; + cqe->flags = IORING_CQE_F_SKIP; + ctx->cached_cq_tail++; + return true; + } + return false; +} + /* * writes to the cq entry need to come after reading head; the * control dependency is enough as we're using WRITE_ONCE to * fill the cq entry */ -bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) +bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32) { struct io_rings *rings = ctx->rings; unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); @@ -792,12 +814,22 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))) return false; + /* + * Post dummy CQE if a 32b CQE is needed and there's only room for a + * 16b CQE before the ring wraps. 
+ */ + if (cqe32 && off + 1 == ctx->cq_entries) { + if (!io_fill_nop_cqe(ctx, off)) + return false; + off = 0; + } + /* userspace may cheat modifying the tail, be safe and do min */ queued = min(__io_cqring_events(ctx), ctx->cq_entries); free = ctx->cq_entries - queued; /* we need a contiguous range, limit based on the current array offset */ len = min(free, ctx->cq_entries - off); - if (!len) + if (len < (cqe32 + 1)) return false; if (ctx->flags & IORING_SETUP_CQE32) { @@ -815,9 +847,9 @@ static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx, { struct io_uring_cqe *cqe; - if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32))) + if (WARN_ON_ONCE(!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))) return false; - if (unlikely(!io_get_cqe(ctx, &cqe))) + if (unlikely(!io_get_cqe(ctx, &cqe, true))) return false; memcpy(cqe, src_cqe, 2 * sizeof(*cqe)); @@ -828,14 +860,15 @@ static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx, static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { + bool cqe32 = cflags & IORING_CQE_F_32; struct io_uring_cqe *cqe; - if (likely(io_get_cqe(ctx, &cqe))) { + if (likely(io_get_cqe(ctx, &cqe, cqe32))) { WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); - if (ctx->flags & IORING_SETUP_CQE32) { + if (cqe32) { WRITE_ONCE(cqe->big_cqe[0], 0); WRITE_ONCE(cqe->big_cqe[1], 0); } @@ -2756,6 +2789,10 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries, if (check_shl_overflow(off, 1, &off)) return SIZE_MAX; } + if (flags & IORING_SETUP_CQE_MIXED) { + if (cq_entries < 2) + return SIZE_MAX; + } #ifdef CONFIG_SMP off = ALIGN(off, SMP_CACHE_BYTES); @@ -3680,6 +3717,14 @@ static int io_uring_sanitise_params(struct io_uring_params *p) !(flags & IORING_SETUP_SINGLE_ISSUER)) return -EINVAL; + /* + * Nonsensical to ask for CQE32 and mixed CQE support, it's not + * supported to post 16b CQEs on a ring setup with CQE32. 
+ */ + if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) == + (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) + return -EINVAL; + return 0; } @@ -3906,7 +3951,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY | - IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL)) + IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL | + IORING_SETUP_CQE_MIXED)) return -EINVAL; return io_uring_create(entries, &p, params); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index abc6de227f74..2bcb565d9de6 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -75,7 +75,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq) unsigned long rings_size(unsigned int flags, unsigned int sq_entries, unsigned int cq_entries, size_t *sq_offset); int io_uring_fill_params(unsigned entries, struct io_uring_params *p); -bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); +bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); @@ -169,25 +169,31 @@ static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx, struct io_uring_cqe **ret, - bool overflow) + bool overflow, bool cqe32) { io_lockdep_assert_cq_locked(ctx); - if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) { - if (unlikely(!io_cqe_cache_refill(ctx, overflow))) + if (unlikely(ctx->cqe_sentinel - ctx->cqe_cached < (cqe32 + 1))) { + if (unlikely(!io_cqe_cache_refill(ctx, overflow, cqe32))) return false; } *ret = ctx->cqe_cached; ctx->cached_cq_tail++; ctx->cqe_cached++; - if (ctx->flags & IORING_SETUP_CQE32) + if (ctx->flags & IORING_SETUP_CQE32) { + ctx->cqe_cached++; + } else if (cqe32 && ctx->flags & IORING_SETUP_CQE_MIXED) { ctx->cqe_cached++; + ctx->cached_cq_tail++; + } + WARN_ON_ONCE(ctx->cqe_cached > ctx->cqe_sentinel); return true; } -static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret) +static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret, + bool cqe32) { - return io_get_cqe_overflow(ctx, ret, false); + return io_get_cqe_overflow(ctx, ret, false, cqe32); } static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx, @@ -196,25 +202,24 @@ static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx, io_lockdep_assert_cq_locked(ctx); ctx->submit_state.cq_flush = true; - return io_get_cqe(ctx, cqe_ret); + return io_get_cqe(ctx, cqe_ret, ctx->flags & IORING_SETUP_CQE_MIXED); } static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, struct io_kiocb *req) { + bool is_cqe32 = req->cqe.flags & IORING_CQE_F_32; struct io_uring_cqe *cqe; /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. + * If we can't get a cq entry, userspace overflowed the submission + * (by quite a lot). 
*/ - if (unlikely(!io_get_cqe(ctx, &cqe))) + if (unlikely(!io_get_cqe(ctx, &cqe, is_cqe32))) return false; - memcpy(cqe, &req->cqe, sizeof(*cqe)); - if (ctx->flags & IORING_SETUP_CQE32) { + if (is_cqe32) { memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe)); memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } @@ -239,6 +244,22 @@ static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) req->cqe.flags = cflags; } +static inline u32 ctx_cqe32_flags(struct io_ring_ctx *ctx) +{ + if (ctx->flags & IORING_SETUP_CQE_MIXED) + return IORING_CQE_F_32; + return 0; +} + +static inline void io_req_set_res32(struct io_kiocb *req, s32 res, u32 cflags, + __u64 extra1, __u64 extra2) +{ + req->cqe.res = res; + req->cqe.flags = cflags | ctx_cqe32_flags(req->ctx); + req->big_cqe.extra1 = extra1; + req->big_cqe.extra2 = extra2; +} + static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache, struct io_kiocb *req) { diff --git a/io_uring/register.c b/io_uring/register.c index a59589249fce..a1a9b2884eae 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -396,7 +396,8 @@ static void io_register_free_rings(struct io_ring_ctx *ctx, #define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP) #define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \ - IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP) + IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \ + IORING_SETUP_CQE_MIXED) static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) { -- cgit v1.2.3 From 806ecb209aa86fcc1d92bc9f10323cf773f64d6d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Aug 2025 14:22:16 -0600 Subject: io_uring/nop: add support for IORING_SETUP_CQE_MIXED This adds support for setting IORING_NOP_CQE32 as a flag for a NOP command, in which case a 32b CQE will be posted rather than a regular one. This is the default if the ring has been setup with IORING_SETUP_CQE32. If the ring has been setup with IORING_SETUP_CQE_MIXED, then 16b CQEs will be posted without this flag set, and 32b CQEs if this flag is set. For the latter case, sqe->off is what will be posted as cqe->big_cqe[0] and sqe->addr is what will be posted as cqe->big_cqe[1]. 
Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + io_uring/nop.c | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 5135e1be0390..04ebff33d0e6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -464,6 +464,7 @@ enum io_uring_msg_ring_flags { #define IORING_NOP_FIXED_FILE (1U << 2) #define IORING_NOP_FIXED_BUFFER (1U << 3) #define IORING_NOP_TW (1U << 4) +#define IORING_NOP_CQE32 (1U << 5) /* * IO completion data structure (Completion Queue Entry) diff --git a/io_uring/nop.c b/io_uring/nop.c index 20ed0f85b1c2..3caf07878f8a 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -17,11 +17,13 @@ struct io_nop { int result; int fd; unsigned int flags; + __u64 extra1; + __u64 extra2; }; #define NOP_FLAGS (IORING_NOP_INJECT_RESULT | IORING_NOP_FIXED_FILE | \ IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE | \ - IORING_NOP_TW) + IORING_NOP_TW | IORING_NOP_CQE32) int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -41,6 +43,14 @@ int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) nop->fd = -1; if (nop->flags & IORING_NOP_FIXED_BUFFER) req->buf_index = READ_ONCE(sqe->buf_index); + if (nop->flags & IORING_NOP_CQE32) { + struct io_ring_ctx *ctx = req->ctx; + + if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))) + return -EINVAL; + nop->extra1 = READ_ONCE(sqe->off); + nop->extra2 = READ_ONCE(sqe->addr); + } return 0; } @@ -68,7 +78,10 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags) done: if (ret < 0) req_set_fail(req); - io_req_set_res(req, nop->result, 0); + if (nop->flags & IORING_NOP_CQE32) + io_req_set_res32(req, nop->result, 0, nop->extra1, nop->extra2); + else + io_req_set_res(req, nop->result, 0); if (nop->flags & IORING_NOP_TW) { req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); -- cgit v1.2.3 From c2a756891bb428104fa8899998ba277042274cdb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 25 Aug 2025 13:18:28 -0700 Subject: uapi: wrap compiler_types.h in an ifdef instead of the implicit strip The uAPI stddef header includes compiler_types.h, a kernel-only header, to make sure that kernel definitions of annotations like __counted_by() take precedence. There is a hack in scripts/headers_install.sh which strips includes of compiler.h and compiler_types.h when installing uAPI headers. While explicit handling makes sense for compiler.h, which is included all over the uAPI, compiler_types.h is only included by stddef.h (within the uAPI, obviously it's included in kernel code a lot). Remove the stripping from scripts/headers_install.sh and wrap the include of compiler_types.h in #ifdef __KERNEL__ instead. This should be equivalent functionally, but is easier to understand to a casual reader of the code. 
It also makes it easier to work with kernel headers directly from under tools/ Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250825201828.2370083-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- include/uapi/linux/stddef.h | 2 ++ scripts/headers_install.sh | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/stddef.h b/include/uapi/linux/stddef.h index b87df1b485c2..9a28f7d9a334 100644 --- a/include/uapi/linux/stddef.h +++ b/include/uapi/linux/stddef.h @@ -2,7 +2,9 @@ #ifndef _UAPI_LINUX_STDDEF_H #define _UAPI_LINUX_STDDEF_H +#ifdef __KERNEL__ #include +#endif #ifndef __always_inline #define __always_inline inline diff --git a/scripts/headers_install.sh b/scripts/headers_install.sh index 6bbccb43f7e7..4c20c62c4faf 100755 --- a/scripts/headers_install.sh +++ b/scripts/headers_install.sh @@ -32,7 +32,7 @@ fi sed -E -e ' s/([[:space:](])(__user|__force|__iomem)[[:space:]]/\1/g s/__attribute_const__([[:space:]]|$)/\1/g - s@^#include @@ + s@^#include @@ s/(^|[^a-zA-Z0-9])__packed([^a-zA-Z0-9_]|$)/\1__attribute__((packed))\2/g s/(^|[[:space:](])(inline|asm|volatile)([[:space:](]|$)/\1__\2__\3/g s@#(ifndef|define|endif[[:space:]]*/[*])[[:space:]]*_UAPI@#\1 @ -- cgit v1.2.3 From 4247053aaacda75480e1918e0d58a687ae5a266a Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Tue, 5 Aug 2025 22:47:17 +0200 Subject: media: uapi: Move colorimetry controls at the end of the file The colorimetry controls class is defined after the stateless codec class at the top of the controls header. It is currently defined in the middle of stateless codec controls. Move the colorimetry controls after the stateless codec controls, at the end of the file. Signed-off-by: Paul Kocialkowski Signed-off-by: Hans Verkuil --- include/uapi/linux/v4l2-controls.h | 68 +++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 34 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index f836512e9deb..4a483ff1c418 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -2549,40 +2549,6 @@ struct v4l2_ctrl_hevc_scaling_matrix { __u8 scaling_list_dc_coef_32x32[2]; }; -#define V4L2_CID_COLORIMETRY_CLASS_BASE (V4L2_CTRL_CLASS_COLORIMETRY | 0x900) -#define V4L2_CID_COLORIMETRY_CLASS (V4L2_CTRL_CLASS_COLORIMETRY | 1) - -#define V4L2_CID_COLORIMETRY_HDR10_CLL_INFO (V4L2_CID_COLORIMETRY_CLASS_BASE + 0) - -struct v4l2_ctrl_hdr10_cll_info { - __u16 max_content_light_level; - __u16 max_pic_average_light_level; -}; - -#define V4L2_CID_COLORIMETRY_HDR10_MASTERING_DISPLAY (V4L2_CID_COLORIMETRY_CLASS_BASE + 1) - -#define V4L2_HDR10_MASTERING_PRIMARIES_X_LOW 5 -#define V4L2_HDR10_MASTERING_PRIMARIES_X_HIGH 37000 -#define V4L2_HDR10_MASTERING_PRIMARIES_Y_LOW 5 -#define V4L2_HDR10_MASTERING_PRIMARIES_Y_HIGH 42000 -#define V4L2_HDR10_MASTERING_WHITE_POINT_X_LOW 5 -#define V4L2_HDR10_MASTERING_WHITE_POINT_X_HIGH 37000 -#define V4L2_HDR10_MASTERING_WHITE_POINT_Y_LOW 5 -#define V4L2_HDR10_MASTERING_WHITE_POINT_Y_HIGH 42000 -#define V4L2_HDR10_MASTERING_MAX_LUMA_LOW 50000 -#define V4L2_HDR10_MASTERING_MAX_LUMA_HIGH 100000000 -#define V4L2_HDR10_MASTERING_MIN_LUMA_LOW 1 -#define V4L2_HDR10_MASTERING_MIN_LUMA_HIGH 50000 - -struct v4l2_ctrl_hdr10_mastering_display { - __u16 display_primaries_x[3]; - __u16 display_primaries_y[3]; - __u16 white_point_x; - __u16 white_point_y; - __u32 max_display_mastering_luminance; - 
__u32 min_display_mastering_luminance; -}; - /* Stateless VP9 controls */ #define V4L2_VP9_LOOP_FILTER_FLAG_DELTA_ENABLED 0x1 @@ -3515,4 +3481,38 @@ struct v4l2_ctrl_av1_film_grain { #define V4L2_CID_MPEG_MFC51_BASE V4L2_CID_CODEC_MFC51_BASE #endif +#define V4L2_CID_COLORIMETRY_CLASS_BASE (V4L2_CTRL_CLASS_COLORIMETRY | 0x900) +#define V4L2_CID_COLORIMETRY_CLASS (V4L2_CTRL_CLASS_COLORIMETRY | 1) + +#define V4L2_CID_COLORIMETRY_HDR10_CLL_INFO (V4L2_CID_COLORIMETRY_CLASS_BASE + 0) + +struct v4l2_ctrl_hdr10_cll_info { + __u16 max_content_light_level; + __u16 max_pic_average_light_level; +}; + +#define V4L2_CID_COLORIMETRY_HDR10_MASTERING_DISPLAY (V4L2_CID_COLORIMETRY_CLASS_BASE + 1) + +#define V4L2_HDR10_MASTERING_PRIMARIES_X_LOW 5 +#define V4L2_HDR10_MASTERING_PRIMARIES_X_HIGH 37000 +#define V4L2_HDR10_MASTERING_PRIMARIES_Y_LOW 5 +#define V4L2_HDR10_MASTERING_PRIMARIES_Y_HIGH 42000 +#define V4L2_HDR10_MASTERING_WHITE_POINT_X_LOW 5 +#define V4L2_HDR10_MASTERING_WHITE_POINT_X_HIGH 37000 +#define V4L2_HDR10_MASTERING_WHITE_POINT_Y_LOW 5 +#define V4L2_HDR10_MASTERING_WHITE_POINT_Y_HIGH 42000 +#define V4L2_HDR10_MASTERING_MAX_LUMA_LOW 50000 +#define V4L2_HDR10_MASTERING_MAX_LUMA_HIGH 100000000 +#define V4L2_HDR10_MASTERING_MIN_LUMA_LOW 1 +#define V4L2_HDR10_MASTERING_MIN_LUMA_HIGH 50000 + +struct v4l2_ctrl_hdr10_mastering_display { + __u16 display_primaries_x[3]; + __u16 display_primaries_y[3]; + __u16 white_point_x; + __u16 white_point_y; + __u32 max_display_mastering_luminance; + __u32 min_display_mastering_luminance; +}; + #endif -- cgit v1.2.3 From 481c12018c252f7fc88b4bd05e882b9e1bf260c3 Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Tue, 5 Aug 2025 22:47:18 +0200 Subject: media: uapi: Cleanup tab after define in headers Some definitions use a tab after the define keyword instead of the usual single space. Replace it for better consistency. 
Signed-off-by: Paul Kocialkowski Signed-off-by: Hans Verkuil --- include/uapi/linux/v4l2-controls.h | 30 +++++++++++++++--------------- include/uapi/linux/videodev2.h | 18 +++++++++--------- 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 4a483ff1c418..7aef88465d04 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -1193,7 +1193,7 @@ enum v4l2_flash_strobe_source { #define V4L2_CID_JPEG_CLASS_BASE (V4L2_CTRL_CLASS_JPEG | 0x900) #define V4L2_CID_JPEG_CLASS (V4L2_CTRL_CLASS_JPEG | 1) -#define V4L2_CID_JPEG_CHROMA_SUBSAMPLING (V4L2_CID_JPEG_CLASS_BASE + 1) +#define V4L2_CID_JPEG_CHROMA_SUBSAMPLING (V4L2_CID_JPEG_CLASS_BASE + 1) enum v4l2_jpeg_chroma_subsampling { V4L2_JPEG_CHROMA_SUBSAMPLING_444 = 0, V4L2_JPEG_CHROMA_SUBSAMPLING_422 = 1, @@ -1202,15 +1202,15 @@ enum v4l2_jpeg_chroma_subsampling { V4L2_JPEG_CHROMA_SUBSAMPLING_410 = 4, V4L2_JPEG_CHROMA_SUBSAMPLING_GRAY = 5, }; -#define V4L2_CID_JPEG_RESTART_INTERVAL (V4L2_CID_JPEG_CLASS_BASE + 2) -#define V4L2_CID_JPEG_COMPRESSION_QUALITY (V4L2_CID_JPEG_CLASS_BASE + 3) +#define V4L2_CID_JPEG_RESTART_INTERVAL (V4L2_CID_JPEG_CLASS_BASE + 2) +#define V4L2_CID_JPEG_COMPRESSION_QUALITY (V4L2_CID_JPEG_CLASS_BASE + 3) -#define V4L2_CID_JPEG_ACTIVE_MARKER (V4L2_CID_JPEG_CLASS_BASE + 4) -#define V4L2_JPEG_ACTIVE_MARKER_APP0 (1 << 0) -#define V4L2_JPEG_ACTIVE_MARKER_APP1 (1 << 1) -#define V4L2_JPEG_ACTIVE_MARKER_COM (1 << 16) -#define V4L2_JPEG_ACTIVE_MARKER_DQT (1 << 17) -#define V4L2_JPEG_ACTIVE_MARKER_DHT (1 << 18) +#define V4L2_CID_JPEG_ACTIVE_MARKER (V4L2_CID_JPEG_CLASS_BASE + 4) +#define V4L2_JPEG_ACTIVE_MARKER_APP0 (1 << 0) +#define V4L2_JPEG_ACTIVE_MARKER_APP1 (1 << 1) +#define V4L2_JPEG_ACTIVE_MARKER_COM (1 << 16) +#define V4L2_JPEG_ACTIVE_MARKER_DQT (1 << 17) +#define V4L2_JPEG_ACTIVE_MARKER_DHT (1 << 18) /* Image source controls */ @@ -1243,10 +1243,10 @@ enum v4l2_jpeg_chroma_subsampling { #define V4L2_CID_DV_CLASS_BASE (V4L2_CTRL_CLASS_DV | 0x900) #define V4L2_CID_DV_CLASS (V4L2_CTRL_CLASS_DV | 1) -#define V4L2_CID_DV_TX_HOTPLUG (V4L2_CID_DV_CLASS_BASE + 1) -#define V4L2_CID_DV_TX_RXSENSE (V4L2_CID_DV_CLASS_BASE + 2) -#define V4L2_CID_DV_TX_EDID_PRESENT (V4L2_CID_DV_CLASS_BASE + 3) -#define V4L2_CID_DV_TX_MODE (V4L2_CID_DV_CLASS_BASE + 4) +#define V4L2_CID_DV_TX_HOTPLUG (V4L2_CID_DV_CLASS_BASE + 1) +#define V4L2_CID_DV_TX_RXSENSE (V4L2_CID_DV_CLASS_BASE + 2) +#define V4L2_CID_DV_TX_EDID_PRESENT (V4L2_CID_DV_CLASS_BASE + 3) +#define V4L2_CID_DV_TX_MODE (V4L2_CID_DV_CLASS_BASE + 4) enum v4l2_dv_tx_mode { V4L2_DV_TX_MODE_DVI_D = 0, V4L2_DV_TX_MODE_HDMI = 1, @@ -1267,7 +1267,7 @@ enum v4l2_dv_it_content_type { V4L2_DV_IT_CONTENT_TYPE_NO_ITC = 4, }; -#define V4L2_CID_DV_RX_POWER_PRESENT (V4L2_CID_DV_CLASS_BASE + 100) +#define V4L2_CID_DV_RX_POWER_PRESENT (V4L2_CID_DV_CLASS_BASE + 100) #define V4L2_CID_DV_RX_RGB_RANGE (V4L2_CID_DV_CLASS_BASE + 101) #define V4L2_CID_DV_RX_IT_CONTENT_TYPE (V4L2_CID_DV_CLASS_BASE + 102) @@ -2552,7 +2552,7 @@ struct v4l2_ctrl_hevc_scaling_matrix { /* Stateless VP9 controls */ #define V4L2_VP9_LOOP_FILTER_FLAG_DELTA_ENABLED 0x1 -#define V4L2_VP9_LOOP_FILTER_FLAG_DELTA_UPDATE 0x2 +#define V4L2_VP9_LOOP_FILTER_FLAG_DELTA_UPDATE 0x2 /** * struct v4l2_vp9_loop_filter - VP9 loop filter parameters diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 3dd9fa45dde1..64943f1a6149 100644 --- a/include/uapi/linux/videodev2.h +++ 
b/include/uapi/linux/videodev2.h @@ -1607,8 +1607,8 @@ struct v4l2_bt_timings { } __attribute__ ((packed)); /* Interlaced or progressive format */ -#define V4L2_DV_PROGRESSIVE 0 -#define V4L2_DV_INTERLACED 1 +#define V4L2_DV_PROGRESSIVE 0 +#define V4L2_DV_INTERLACED 1 /* Polarities. If bit is not set, it is assumed to be negative polarity */ #define V4L2_DV_VSYNC_POS_POL 0x00000001 @@ -2788,15 +2788,15 @@ struct v4l2_remove_buffers { * Only implemented if CONFIG_VIDEO_ADV_DEBUG is defined. * You must be root to use these ioctls. Never use these in applications! */ -#define VIDIOC_DBG_S_REGISTER _IOW('V', 79, struct v4l2_dbg_register) -#define VIDIOC_DBG_G_REGISTER _IOWR('V', 80, struct v4l2_dbg_register) +#define VIDIOC_DBG_S_REGISTER _IOW('V', 79, struct v4l2_dbg_register) +#define VIDIOC_DBG_G_REGISTER _IOWR('V', 80, struct v4l2_dbg_register) #define VIDIOC_S_HW_FREQ_SEEK _IOW('V', 82, struct v4l2_hw_freq_seek) -#define VIDIOC_S_DV_TIMINGS _IOWR('V', 87, struct v4l2_dv_timings) -#define VIDIOC_G_DV_TIMINGS _IOWR('V', 88, struct v4l2_dv_timings) -#define VIDIOC_DQEVENT _IOR('V', 89, struct v4l2_event) -#define VIDIOC_SUBSCRIBE_EVENT _IOW('V', 90, struct v4l2_event_subscription) -#define VIDIOC_UNSUBSCRIBE_EVENT _IOW('V', 91, struct v4l2_event_subscription) +#define VIDIOC_S_DV_TIMINGS _IOWR('V', 87, struct v4l2_dv_timings) +#define VIDIOC_G_DV_TIMINGS _IOWR('V', 88, struct v4l2_dv_timings) +#define VIDIOC_DQEVENT _IOR('V', 89, struct v4l2_event) +#define VIDIOC_SUBSCRIBE_EVENT _IOW('V', 90, struct v4l2_event_subscription) +#define VIDIOC_UNSUBSCRIBE_EVENT _IOW('V', 91, struct v4l2_event_subscription) #define VIDIOC_CREATE_BUFS _IOWR('V', 92, struct v4l2_create_buffers) #define VIDIOC_PREPARE_BUF _IOWR('V', 93, struct v4l2_buffer) #define VIDIOC_G_SELECTION _IOWR('V', 94, struct v4l2_selection) -- cgit v1.2.3 From 039b9302d64ec35f70c91919cd7bcdbc1aef3707 Mon Sep 17 00:00:00 2001 From: Jammy Huang Date: Tue, 26 Aug 2025 10:25:01 +0800 Subject: media: aspeed: Allow to capture from SoC display (GFX) The ASPEED BMC IC has two different display engines; see the AST2600 datasheet for detailed information. 1. VGA on PCIe 2. SoC Display (GFX) By default, the video engine (VE) captures video from VGA. This patch adds an option to capture video from GFX with the standard ioctl VIDIOC_S_INPUT. An enum, aspeed_video_input, is added for this purpose. enum aspeed_video_input { VIDEO_INPUT_VGA = 0, VIDEO_INPUT_GFX, VIDEO_INPUT_MAX }; To test this feature, you will need to enable GFX first. Please refer to ASPEED's SDK_User_Guide, 6.3.x Soc Display driver, for more information. In your application, use the V4L2 ioctl VIDIOC_S_INPUT, as below, to select the input before streaming starts. Note that VIDIOC_S_INPUT takes a pointer to the input index (an int), not a struct v4l2_input: int rc; int input = VIDEO_INPUT_GFX; rc = ioctl(fd, VIDIOC_S_INPUT, &input); if (rc < 0) { ...
} Link: https://github.com/AspeedTech-BMC/openbmc/releases Signed-off-by: Jammy Huang Signed-off-by: Hans Verkuil [hverkuil: split up three overly long lines] --- drivers/media/platform/aspeed/aspeed-video.c | 199 +++++++++++++++++++++++---- include/uapi/linux/aspeed-video.h | 7 + 2 files changed, 178 insertions(+), 28 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/media/platform/aspeed/aspeed-video.c b/drivers/media/platform/aspeed/aspeed-video.c index 54cae0da9aca..b83e43245277 100644 --- a/drivers/media/platform/aspeed/aspeed-video.c +++ b/drivers/media/platform/aspeed/aspeed-video.c @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -25,6 +26,8 @@ #include #include #include +#include +#include #include #include #include @@ -203,6 +206,25 @@ #define VE_MEM_RESTRICT_START 0x310 #define VE_MEM_RESTRICT_END 0x314 +/* SCU's registers */ +#define SCU_MISC_CTRL 0xC0 +#define SCU_DPLL_SOURCE BIT(20) + +/* GFX's registers */ +#define GFX_CTRL 0x60 +#define GFX_CTRL_ENABLE BIT(0) +#define GFX_CTRL_FMT GENMASK(9, 7) + +#define GFX_H_DISPLAY 0x70 +#define GFX_H_DISPLAY_DE GENMASK(28, 16) +#define GFX_H_DISPLAY_TOTAL GENMASK(12, 0) + +#define GFX_V_DISPLAY 0x78 +#define GFX_V_DISPLAY_DE GENMASK(27, 16) +#define GFX_V_DISPLAY_TOTAL GENMASK(11, 0) + +#define GFX_DISPLAY_ADDR 0x80 + /* * VIDEO_MODE_DETECT_DONE: a flag raised if signal lock * VIDEO_RES_CHANGE: a flag raised if res_change work on-going @@ -262,6 +284,7 @@ struct aspeed_video_perf { /* * struct aspeed_video - driver data * + * version: holds the version of aspeed SoC * res_work: holds the delayed_work for res-detection if unlock * buffers: holds the list of buffer queued from user * flags: holds the state of video @@ -273,6 +296,7 @@ struct aspeed_video_perf { * yuv420: a flag raised if JPEG subsampling is 420 * format: holds the video format * hq_mode: a flag raised if HQ is enabled. 
Only for VIDEO_FMT_ASPEED + * input: holds the video input * frame_rate: holds the frame_rate * jpeg_quality: holds jpeq's quality (0~11) * jpeg_hq_quality: holds hq's quality (1~12) only if hq_mode enabled @@ -298,6 +322,9 @@ struct aspeed_video { struct video_device vdev; struct mutex video_lock; /* v4l2 and videobuf2 lock */ + struct regmap *scu; + struct regmap *gfx; + u32 version; u32 jpeg_mode; u32 comp_size_read; @@ -316,6 +343,7 @@ struct aspeed_video { bool yuv420; enum aspeed_video_format format; bool hq_mode; + enum aspeed_video_input input; unsigned int frame_rate; unsigned int jpeg_quality; unsigned int jpeg_hq_quality; @@ -331,21 +359,25 @@ struct aspeed_video { #define to_aspeed_video(x) container_of((x), struct aspeed_video, v4l2_dev) struct aspeed_video_config { + u32 version; u32 jpeg_mode; u32 comp_size_read; }; static const struct aspeed_video_config ast2400_config = { + .version = 4, .jpeg_mode = AST2400_VE_SEQ_CTRL_JPEG_MODE, .comp_size_read = AST2400_VE_COMP_SIZE_READ_BACK, }; static const struct aspeed_video_config ast2500_config = { + .version = 5, .jpeg_mode = AST2500_VE_SEQ_CTRL_JPEG_MODE, .comp_size_read = AST2400_VE_COMP_SIZE_READ_BACK, }; static const struct aspeed_video_config ast2600_config = { + .version = 6, .jpeg_mode = AST2500_VE_SEQ_CTRL_JPEG_MODE, .comp_size_read = AST2600_VE_COMP_SIZE_READ_BACK, }; @@ -485,6 +517,7 @@ static const struct v4l2_dv_timings_cap aspeed_video_timings_cap = { static const char * const format_str[] = {"Standard JPEG", "Aspeed JPEG"}; +static const char * const input_str[] = {"HOST VGA", "BMC GFX"}; static unsigned int debug; @@ -609,6 +642,14 @@ static int aspeed_video_start_frame(struct aspeed_video *video) aspeed_video_free_buf(video, &video->bcd); } + if (video->input == VIDEO_INPUT_GFX) { + u32 val; + + // update input buffer address as gfx's + regmap_read(video->gfx, GFX_DISPLAY_ADDR, &val); + aspeed_video_write(video, VE_TGS_0, val); + } + spin_lock_irqsave(&video->lock, flags); buf = list_first_entry_or_null(&video->buffers, struct aspeed_video_buffer, link); @@ -1026,9 +1067,23 @@ static void aspeed_video_get_timings(struct aspeed_video *v, } } +static void aspeed_video_get_resolution_gfx(struct aspeed_video *video, + struct v4l2_bt_timings *det) +{ + u32 h_val, v_val; + + regmap_read(video->gfx, GFX_H_DISPLAY, &h_val); + regmap_read(video->gfx, GFX_V_DISPLAY, &v_val); + + det->width = FIELD_GET(GFX_H_DISPLAY_DE, h_val) + 1; + det->height = FIELD_GET(GFX_V_DISPLAY_DE, v_val) + 1; + video->v4l2_input_status = 0; +} + #define res_check(v) test_and_clear_bit(VIDEO_MODE_DETECT_DONE, &(v)->flags) -static void aspeed_video_get_resolution(struct aspeed_video *video) +static void aspeed_video_get_resolution_vga(struct aspeed_video *video, + struct v4l2_bt_timings *det) { bool invalid_resolution = true; int rc; @@ -1036,7 +1091,6 @@ static void aspeed_video_get_resolution(struct aspeed_video *video) u32 mds; u32 src_lr_edge; u32 src_tb_edge; - struct v4l2_bt_timings *det = &video->detected_timings; det->width = MIN_WIDTH; det->height = MIN_HEIGHT; @@ -1113,14 +1167,20 @@ static void aspeed_video_get_resolution(struct aspeed_video *video) aspeed_video_get_timings(video, det); - /* - * Enable mode-detect watchdog, resolution-change watchdog and - * automatic compression after frame capture. 
- */ + /* Enable mode-detect watchdog, resolution-change watchdog */ aspeed_video_update(video, VE_INTERRUPT_CTRL, 0, VE_INTERRUPT_MODE_DETECT_WD); - aspeed_video_update(video, VE_SEQ_CTRL, 0, - VE_SEQ_CTRL_AUTO_COMP | VE_SEQ_CTRL_EN_WATCHDOG); + aspeed_video_update(video, VE_SEQ_CTRL, 0, VE_SEQ_CTRL_EN_WATCHDOG); +} + +static void aspeed_video_get_resolution(struct aspeed_video *video) +{ + struct v4l2_bt_timings *det = &video->detected_timings; + + if (video->input == VIDEO_INPUT_GFX) + aspeed_video_get_resolution_gfx(video, det); + else + aspeed_video_get_resolution_vga(video, det); v4l2_dbg(1, debug, &video->v4l2_dev, "Got resolution: %dx%d\n", det->width, det->height); @@ -1156,7 +1216,7 @@ static void aspeed_video_set_resolution(struct aspeed_video *video) aspeed_video_write(video, VE_SRC_SCANLINE_OFFSET, act->width * 4); /* Don't use direct mode below 1024 x 768 (irqs don't fire) */ - if (size < DIRECT_FETCH_THRESHOLD) { + if (video->input == VIDEO_INPUT_VGA && size < DIRECT_FETCH_THRESHOLD) { v4l2_dbg(1, debug, &video->v4l2_dev, "Capture: Sync Mode\n"); aspeed_video_write(video, VE_TGS_0, FIELD_PREP(VE_TGS_FIRST, @@ -1171,10 +1231,20 @@ static void aspeed_video_set_resolution(struct aspeed_video *video) VE_CTRL_INT_DE | VE_CTRL_DIRECT_FETCH, VE_CTRL_INT_DE); } else { + u32 ctrl, val, bpp; + v4l2_dbg(1, debug, &video->v4l2_dev, "Capture: Direct Mode\n"); + ctrl = VE_CTRL_DIRECT_FETCH; + if (video->input == VIDEO_INPUT_GFX) { + regmap_read(video->gfx, GFX_CTRL, &val); + bpp = FIELD_GET(GFX_CTRL_FMT, val) ? 32 : 16; + if (bpp == 16) + ctrl |= VE_CTRL_INT_DE; + aspeed_video_write(video, VE_TGS_1, act->width * (bpp >> 3)); + } aspeed_video_update(video, VE_CTRL, VE_CTRL_INT_DE | VE_CTRL_DIRECT_FETCH, - VE_CTRL_DIRECT_FETCH); + ctrl); } size *= 4; @@ -1207,6 +1277,22 @@ err_mem: aspeed_video_free_buf(video, &video->srcs[0]); } +/* + * Update relative parameters when timing changed. 
+ * + * @video: the struct of aspeed_video + * @timings: the new timings + */ +static void aspeed_video_update_timings(struct aspeed_video *video, struct v4l2_bt_timings *timings) +{ + video->active_timings = *timings; + aspeed_video_set_resolution(video); + + video->pix_fmt.width = timings->width; + video->pix_fmt.height = timings->height; + video->pix_fmt.sizeimage = video->max_compressed_size; +} + static void aspeed_video_update_regs(struct aspeed_video *video) { u8 jpeg_hq_quality = clamp((int)video->jpeg_hq_quality - 1, 0, @@ -1219,6 +1305,8 @@ static void aspeed_video_update_regs(struct aspeed_video *video) u32 ctrl = 0; u32 seq_ctrl = 0; + v4l2_dbg(1, debug, &video->v4l2_dev, "input(%s)\n", + input_str[video->input]); v4l2_dbg(1, debug, &video->v4l2_dev, "framerate(%d)\n", video->frame_rate); v4l2_dbg(1, debug, &video->v4l2_dev, "jpeg format(%s) subsample(%s)\n", @@ -1234,6 +1322,9 @@ static void aspeed_video_update_regs(struct aspeed_video *video) else aspeed_video_update(video, VE_BCD_CTRL, VE_BCD_CTRL_EN_BCD, 0); + if (video->input == VIDEO_INPUT_VGA) + ctrl |= VE_CTRL_AUTO_OR_CURSOR; + if (video->frame_rate) ctrl |= FIELD_PREP(VE_CTRL_FRC, video->frame_rate); @@ -1252,7 +1343,9 @@ static void aspeed_video_update_regs(struct aspeed_video *video) aspeed_video_update(video, VE_SEQ_CTRL, video->jpeg_mode | VE_SEQ_CTRL_YUV420, seq_ctrl); - aspeed_video_update(video, VE_CTRL, VE_CTRL_FRC, ctrl); + aspeed_video_update(video, VE_CTRL, + VE_CTRL_FRC | VE_CTRL_AUTO_OR_CURSOR | + VE_CTRL_SOURCE, ctrl); aspeed_video_update(video, VE_COMP_CTRL, VE_COMP_CTRL_DCT_LUM | VE_COMP_CTRL_DCT_CHR | VE_COMP_CTRL_EN_HQ | VE_COMP_CTRL_HQ_DCT_LUM | @@ -1280,6 +1373,7 @@ static void aspeed_video_init_regs(struct aspeed_video *video) aspeed_video_write(video, VE_JPEG_ADDR, video->jpeg.dma); /* Set control registers */ + aspeed_video_write(video, VE_SEQ_CTRL, VE_SEQ_CTRL_AUTO_COMP); aspeed_video_write(video, VE_CTRL, ctrl); aspeed_video_write(video, VE_COMP_CTRL, VE_COMP_CTRL_RSVD); @@ -1311,12 +1405,7 @@ static void aspeed_video_start(struct aspeed_video *video) aspeed_video_get_resolution(video); /* Set timings since the device is being opened for the first time */ - video->active_timings = video->detected_timings; - aspeed_video_set_resolution(video); - - video->pix_fmt.width = video->active_timings.width; - video->pix_fmt.height = video->active_timings.height; - video->pix_fmt.sizeimage = video->max_compressed_size; + aspeed_video_update_timings(video, &video->detected_timings); } static void aspeed_video_stop(struct aspeed_video *video) @@ -1401,10 +1490,10 @@ static int aspeed_video_enum_input(struct file *file, void *fh, { struct aspeed_video *video = video_drvdata(file); - if (inp->index) + if (inp->index >= VIDEO_INPUT_MAX) return -EINVAL; - strscpy(inp->name, "Host VGA capture", sizeof(inp->name)); + sprintf(inp->name, "%s capture", input_str[inp->index]); inp->type = V4L2_INPUT_TYPE_CAMERA; inp->capabilities = V4L2_IN_CAP_DV_TIMINGS; inp->status = video->v4l2_input_status; @@ -1414,16 +1503,57 @@ static int aspeed_video_enum_input(struct file *file, void *fh, static int aspeed_video_get_input(struct file *file, void *fh, unsigned int *i) { - *i = 0; + struct aspeed_video *video = video_drvdata(file); + + *i = video->input; return 0; } static int aspeed_video_set_input(struct file *file, void *fh, unsigned int i) { - if (i) + struct aspeed_video *video = video_drvdata(file); + + if (i >= VIDEO_INPUT_MAX) return -EINVAL; + if (i == video->input) + return 0; + + if (vb2_is_busy(&video->queue)) + 
return -EBUSY; + + if (IS_ERR(video->scu)) { + v4l2_dbg(1, debug, &video->v4l2_dev, + "%s: scu isn't ready for input-control\n", __func__); + return -EINVAL; + } + + if (IS_ERR(video->gfx) && i == VIDEO_INPUT_GFX) { + v4l2_dbg(1, debug, &video->v4l2_dev, + "%s: gfx isn't ready for GFX input\n", __func__); + return -EINVAL; + } + + video->input = i; + + if (video->version == 6) { + /* modify dpll source per current input */ + if (video->input == VIDEO_INPUT_VGA) + regmap_update_bits(video->scu, SCU_MISC_CTRL, + SCU_DPLL_SOURCE, 0); + else + regmap_update_bits(video->scu, SCU_MISC_CTRL, + SCU_DPLL_SOURCE, SCU_DPLL_SOURCE); + } + + aspeed_video_update_regs(video); + + /* update signal status */ + aspeed_video_get_resolution(video); + if (!video->v4l2_input_status) + aspeed_video_update_timings(video, &video->detected_timings); + return 0; } @@ -1527,13 +1657,7 @@ static int aspeed_video_set_dv_timings(struct file *file, void *fh, if (vb2_is_busy(&video->queue)) return -EBUSY; - video->active_timings = timings->bt; - - aspeed_video_set_resolution(video); - - video->pix_fmt.width = timings->bt.width; - video->pix_fmt.height = timings->bt.height; - video->pix_fmt.sizeimage = video->max_compressed_size; + aspeed_video_update_timings(video, &timings->bt); timings->type = V4L2_DV_BT_656_1120; @@ -1909,6 +2033,7 @@ static int aspeed_video_debugfs_show(struct seq_file *s, void *data) val08 = aspeed_video_read(v, VE_CTRL); if (FIELD_GET(VE_CTRL_DIRECT_FETCH, val08)) { seq_printf(s, " %-20s:\tDirect fetch\n", "Mode"); + seq_printf(s, " %-20s:\t%s\n", "Input", input_str[v->input]); seq_printf(s, " %-20s:\t%s\n", "VGA bpp mode", FIELD_GET(VE_CTRL_INT_DE, val08) ? "16" : "32"); } else { @@ -2068,12 +2193,29 @@ static int aspeed_video_setup_video(struct aspeed_video *video) return 0; } +/* + * Get regmap without checking res, such as clk/reset, that could lead to + * conflict. + */ +static struct regmap *aspeed_regmap_lookup(struct device_node *np, const char *property) +{ + struct device_node *syscon_np __free(device_node) = of_parse_phandle(np, property, 0); + + if (!syscon_np) + return ERR_PTR(-ENODEV); + + return device_node_to_regmap(syscon_np); +} + static int aspeed_video_init(struct aspeed_video *video) { int irq; int rc; struct device *dev = video->dev; + video->scu = aspeed_regmap_lookup(dev->of_node, "aspeed,scu"); + video->gfx = aspeed_regmap_lookup(dev->of_node, "aspeed,gfx"); + irq = irq_of_parse_and_map(dev->of_node, 0); if (!irq) { dev_err(dev, "Unable to find IRQ\n"); @@ -2165,6 +2307,7 @@ static int aspeed_video_probe(struct platform_device *pdev) if (!config) return -ENODEV; + video->version = config->version; video->jpeg_mode = config->jpeg_mode; video->comp_size_read = config->comp_size_read; diff --git a/include/uapi/linux/aspeed-video.h b/include/uapi/linux/aspeed-video.h index 6586a65548c4..15168e8c931e 100644 --- a/include/uapi/linux/aspeed-video.h +++ b/include/uapi/linux/aspeed-video.h @@ -8,6 +8,13 @@ #include +/* aspeed video's input types */ +enum aspeed_video_input { + VIDEO_INPUT_VGA = 0, + VIDEO_INPUT_GFX, + VIDEO_INPUT_MAX +}; + #define V4L2_CID_ASPEED_HQ_MODE (V4L2_CID_USER_ASPEED_BASE + 1) #define V4L2_CID_ASPEED_HQ_JPEG_QUALITY (V4L2_CID_USER_ASPEED_BASE + 2) -- cgit v1.2.3 From db2ab24a341ce89351a1bede37a96a3e3ce1726a Mon Sep 17 00:00:00 2001 From: Lauri Vasama Date: Wed, 27 Aug 2025 16:39:00 +0300 Subject: Add RWF_NOSIGNAL flag for pwritev2 For a user mode library to avoid generating SIGPIPE signals (e.g. 
because this behaviour is not portable across operating systems) is cumbersome. It is generally bad form to change the process-wide signal mask in a library, so a local solution is needed instead. For I/O performed directly using system calls (synchronous or readiness based asynchronous) this currently involves applying a thread-specific signal mask before the operation and reverting it afterwards. This can be avoided when it is known that the file descriptor refers to neither a pipe nor a socket, but a conservative implementation must always apply the mask. This incurs the cost of two additional system calls. In the case of sockets, the existing MSG_NOSIGNAL flag can be used with send. For asynchronous I/O performed using io_uring, currently the only option (apart from MSG_NOSIGNAL for sockets), is to mask SIGPIPE entirely in the call to io_uring_enter. Thankfully io_uring_enter takes a signal mask, so only a single syscall is needed. However, copying the signal mask on every call incurs a non-zero performance penalty. Furthermore, this mask applies to all completions, meaning that if the non-signaling behaviour is desired only for some subset of operations, the desired signals must be raised manually from user-mode depending on the completed operation. Add RWF_NOSIGNAL flag for pwritev2. This flag prevents the SIGPIPE signal from being raised when writing on disconnected pipes or sockets. The flag is handled directly by the pipe filesystem and converted to the existing MSG_NOSIGNAL flag for sockets. Signed-off-by: Lauri Vasama Link: https://lore.kernel.org/20250827133901.1820771-1-git@vasama.org Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/pipe.c | 6 ++++-- include/linux/fs.h | 1 + include/uapi/linux/fs.h | 5 ++++- net/socket.c | 3 +++ 4 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/pipe.c b/fs/pipe.c index 731622d0738d..42fead1efe52 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -458,7 +458,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) mutex_lock(&pipe->mutex); if (!pipe->readers) { - send_sig(SIGPIPE, current, 0); + if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0) + send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; } @@ -498,7 +499,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) for (;;) { if (!pipe->readers) { - send_sig(SIGPIPE, current, 0); + if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0) + send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; diff --git a/include/linux/fs.h b/include/linux/fs.h index 780e9c774c54..34693cae15a2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -356,6 +356,7 @@ struct readahead_control; #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC #define IOCB_DONTCACHE (__force int) RWF_DONTCACHE +#define IOCB_NOSIGNAL (__force int) RWF_NOSIGNAL /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 0bd678a4a10e..beb4c2d1e41c 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -430,10 +430,13 @@ typedef int __bitwise __kernel_rwf_t; /* buffered IO that drops the cache after reading or writing data */ #define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080) +/* prevent pipe and socket writes from raising SIGPIPE */ +#define RWF_NOSIGNAL ((__force __kernel_rwf_t)0x00000100) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ RWF_APPEND | 
RWF_NOAPPEND | RWF_ATOMIC |\ - RWF_DONTCACHE) + RWF_DONTCACHE | RWF_NOSIGNAL) #define PROCFS_IOCTL_MAGIC 'f' diff --git a/net/socket.c b/net/socket.c index 682969deaed3..bac335ecee4c 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1176,6 +1176,9 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; + if (iocb->ki_flags & IOCB_NOSIGNAL) + msg.msg_flags |= MSG_NOSIGNAL; + res = __sock_sendmsg(sock, &msg); *from = msg.msg_iter; return res; -- cgit v1.2.3 From eb59d494eebd4c5414728a35cdea6a0ba78ff26e Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 16 Aug 2025 10:28:58 -0700 Subject: audit: add record for multiple task security contexts Replace the single skb pointer in an audit_buffer with a list of skb pointers. Add the audit_stamp information to the audit_buffer as there's no guarantee that there will be an audit_context containing the stamp associated with the event. At audit_log_end() time create auxiliary records as have been added to the list. Functions are created to manage the skb list in the audit_buffer. Create a new audit record AUDIT_MAC_TASK_CONTEXTS. An example of the MAC_TASK_CONTEXTS record is: type=MAC_TASK_CONTEXTS msg=audit(1600880931.832:113) subj_apparmor=unconfined subj_smack=_ When an audit event includes a AUDIT_MAC_TASK_CONTEXTS record the "subj=" field in other records in the event will be "subj=?". An AUDIT_MAC_TASK_CONTEXTS record is supplied when the system has multiple security modules that may make access decisions based on a subject security context. Refactor audit_log_task_context(), creating a new audit_log_subj_ctx(). This is used in netlabel auditing to provide multiple subject security contexts as necessary. Suggested-by: Paul Moore Signed-off-by: Casey Schaufler [PM: subj tweak, audit example readability indents] Signed-off-by: Paul Moore --- include/linux/audit.h | 16 ++++ include/uapi/linux/audit.h | 1 + kernel/audit.c | 208 ++++++++++++++++++++++++++++++++++++------- net/netlabel/netlabel_user.c | 9 +- security/apparmor/lsm.c | 3 + security/selinux/hooks.c | 3 + security/smack/smack_lsm.c | 3 + 7 files changed, 202 insertions(+), 41 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index e3f06eba9c6e..a1f068bcb3a0 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -37,6 +37,8 @@ struct audit_watch; struct audit_tree; struct sk_buff; struct kern_ipc_perm; +struct lsm_id; +struct lsm_prop; struct audit_krule { u32 pflags; @@ -147,6 +149,9 @@ extern unsigned compat_signal_class[]; #define AUDIT_TTY_ENABLE BIT(0) #define AUDIT_TTY_LOG_PASSWD BIT(1) +/* bit values for audit_cfg_lsm */ +#define AUDIT_CFG_LSM_SECCTX_SUBJECT BIT(0) + struct filename; #define AUDIT_OFF 0 @@ -185,6 +190,7 @@ extern void audit_log_path_denied(int type, const char *operation); extern void audit_log_lost(const char *message); +extern int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); @@ -210,6 +216,8 @@ extern u32 audit_enabled; extern int audit_signal_info(int sig, struct task_struct *t); +extern void audit_cfg_lsm(const struct lsm_id *lsmid, int flags); + #else /* CONFIG_AUDIT */ static inline __printf(4, 5) void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, @@ -245,6 +253,11 @@ static inline void audit_log_key(struct audit_buffer *ab, char *key) { } static inline 
void audit_log_path_denied(int type, const char *operation) { } +static inline int audit_log_subj_ctx(struct audit_buffer *ab, + struct lsm_prop *prop) +{ + return 0; +} static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; @@ -269,6 +282,9 @@ static inline int audit_signal_info(int sig, struct task_struct *t) return 0; } +static inline void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) +{ } + #endif /* CONFIG_AUDIT */ #ifdef CONFIG_AUDIT_COMPAT_GENERIC diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 9a4ecc9f6dc5..8cad2f307719 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -148,6 +148,7 @@ #define AUDIT_IPE_POLICY_LOAD 1422 /* IPE policy load */ #define AUDIT_LANDLOCK_ACCESS 1423 /* Landlock denial */ #define AUDIT_LANDLOCK_DOMAIN 1424 /* Landlock domain status */ +#define AUDIT_MAC_TASK_CONTEXTS 1425 /* Multiple LSM task contexts */ #define AUDIT_FIRST_KERN_ANOM_MSG 1700 #define AUDIT_LAST_KERN_ANOM_MSG 1799 diff --git a/kernel/audit.c b/kernel/audit.c index 226c8ae00d04..c924b30f2524 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -81,6 +82,11 @@ static u32 audit_failure = AUDIT_FAIL_PRINTK; /* private audit network namespace index */ static unsigned int audit_net_id; +/* Number of modules that provide a security context. + List of lsms that provide a security context */ +static u32 audit_subj_secctx_cnt; +static const struct lsm_id *audit_subj_lsms[MAX_LSM_COUNT]; + /** * struct audit_net - audit private network namespace data * @sk: communication socket @@ -195,8 +201,10 @@ static struct audit_ctl_mutex { * to place it on a transmit queue. Multiple audit_buffers can be in * use simultaneously. */ struct audit_buffer { - struct sk_buff *skb; /* formatted skb ready to send */ + struct sk_buff *skb; /* the skb for audit_log functions */ + struct sk_buff_head skb_list; /* formatted skbs, ready to send */ struct audit_context *ctx; /* NULL or associated context */ + struct audit_stamp stamp; /* audit stamp for these records */ gfp_t gfp_mask; }; @@ -278,6 +286,27 @@ static pid_t auditd_pid_vnr(void) return pid; } +/** + * audit_cfg_lsm - Identify a security module as providing a secctx. + * @lsmid: LSM identity + * @flags: which contexts are provided + * + * Description: + * Increments the count of the security modules providing a secctx. + * If the LSM id is already in the list leave it alone. 
+ */ +void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) +{ + int i; + + if (flags & AUDIT_CFG_LSM_SECCTX_SUBJECT) { + for (i = 0 ; i < audit_subj_secctx_cnt; i++) + if (audit_subj_lsms[i] == lsmid) + return; + audit_subj_lsms[audit_subj_secctx_cnt++] = lsmid; + } +} + /** * audit_get_sk - Return the audit socket for the given network namespace * @net: the destination network namespace @@ -1776,10 +1805,13 @@ __setup("audit_backlog_limit=", audit_backlog_limit_set); static void audit_buffer_free(struct audit_buffer *ab) { + struct sk_buff *skb; + if (!ab) return; - kfree_skb(ab->skb); + while ((skb = skb_dequeue(&ab->skb_list))) + kfree_skb(skb); kmem_cache_free(audit_buffer_cache, ab); } @@ -1795,6 +1827,10 @@ static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx, ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); if (!ab->skb) goto err; + + skb_queue_head_init(&ab->skb_list); + skb_queue_tail(&ab->skb_list, ab->skb); + if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) goto err; @@ -1860,7 +1896,6 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; - struct audit_stamp stamp; if (audit_initialized != AUDIT_INITIALIZED) return NULL; @@ -1915,14 +1950,14 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, return NULL; } - audit_get_stamp(ab->ctx, &stamp); + audit_get_stamp(ab->ctx, &ab->stamp); /* cancel dummy context to enable supporting records */ if (ctx) ctx->dummy = 0; audit_log_format(ab, "audit(%llu.%03lu:%u): ", - (unsigned long long)stamp.ctime.tv_sec, - stamp.ctime.tv_nsec/1000000, - stamp.serial); + (unsigned long long)ab->stamp.ctime.tv_sec, + ab->stamp.ctime.tv_nsec/1000000, + ab->stamp.serial); return ab; } @@ -2178,31 +2213,128 @@ void audit_log_key(struct audit_buffer *ab, char *key) audit_log_format(ab, "(null)"); } -int audit_log_task_context(struct audit_buffer *ab) +/** + * audit_buffer_aux_new - Add an aux record buffer to the skb list + * @ab: audit_buffer + * @type: message type + * + * Aux records are allocated and added to the skb list of + * the "main" record. The ab->skb is reset to point to the + * aux record on its creation. When the aux record in complete + * ab->skb has to be reset to point to the "main" record. + * This allows the audit_log_ functions to be ignorant of + * which kind of record it is logging to. It also avoids adding + * special data for aux records. + * + * On success ab->skb will point to the new aux record. + * Returns 0 on success, -ENOMEM should allocation fail. + */ +static int audit_buffer_aux_new(struct audit_buffer *ab, int type) +{ + WARN_ON(ab->skb != skb_peek(&ab->skb_list)); + + ab->skb = nlmsg_new(AUDIT_BUFSIZ, ab->gfp_mask); + if (!ab->skb) + goto err; + if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) + goto err; + skb_queue_tail(&ab->skb_list, ab->skb); + + audit_log_format(ab, "audit(%llu.%03lu:%u): ", + (unsigned long long)ab->stamp.ctime.tv_sec, + ab->stamp.ctime.tv_nsec/1000000, + ab->stamp.serial); + + return 0; + +err: + kfree_skb(ab->skb); + ab->skb = skb_peek(&ab->skb_list); + return -ENOMEM; +} + +/** + * audit_buffer_aux_end - Switch back to the "main" record from an aux record + * @ab: audit_buffer + * + * Restores the "main" audit record to ab->skb. + */ +static void audit_buffer_aux_end(struct audit_buffer *ab) +{ + ab->skb = skb_peek(&ab->skb_list); +} + +/** + * audit_log_subj_ctx - Add LSM subject information + * @ab: audit_buffer + * @prop: LSM subject properties. 
+ * + * Add a subj= field and, if necessary, a AUDIT_MAC_TASK_CONTEXTS record. + */ +int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) { - struct lsm_prop prop; struct lsm_context ctx; + char *space = ""; int error; + int i; - security_current_getlsmprop_subj(&prop); - if (!lsmprop_is_set(&prop)) + security_current_getlsmprop_subj(prop); + if (!lsmprop_is_set(prop)) return 0; - error = security_lsmprop_to_secctx(&prop, &ctx, LSM_ID_UNDEF); - if (error < 0) { - if (error != -EINVAL) - goto error_path; + if (audit_subj_secctx_cnt < 2) { + error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF); + if (error < 0) { + if (error != -EINVAL) + goto error_path; + return 0; + } + audit_log_format(ab, " subj=%s", ctx.context); + security_release_secctx(&ctx); return 0; } - - audit_log_format(ab, " subj=%s", ctx.context); - security_release_secctx(&ctx); + /* Multiple LSMs provide contexts. Include an aux record. */ + audit_log_format(ab, " subj=?"); + error = audit_buffer_aux_new(ab, AUDIT_MAC_TASK_CONTEXTS); + if (error) + goto error_path; + + for (i = 0; i < audit_subj_secctx_cnt; i++) { + error = security_lsmprop_to_secctx(prop, &ctx, + audit_subj_lsms[i]->id); + if (error < 0) { + /* + * Don't print anything. An LSM like BPF could + * claim to support contexts, but only do so under + * certain conditions. + */ + if (error == -EOPNOTSUPP) + continue; + if (error != -EINVAL) + audit_panic("error in audit_log_subj_ctx"); + } else { + audit_log_format(ab, "%ssubj_%s=%s", space, + audit_subj_lsms[i]->name, ctx.context); + space = " "; + security_release_secctx(&ctx); + } + } + audit_buffer_aux_end(ab); return 0; error_path: - audit_panic("error in audit_log_task_context"); + audit_panic("error in audit_log_subj_ctx"); return error; } +EXPORT_SYMBOL(audit_log_subj_ctx); + +int audit_log_task_context(struct audit_buffer *ab) +{ + struct lsm_prop prop; + + security_current_getlsmprop_subj(&prop); + return audit_log_subj_ctx(ab, &prop); +} EXPORT_SYMBOL(audit_log_task_context); void audit_log_d_path_exe(struct audit_buffer *ab, @@ -2411,6 +2543,26 @@ int audit_signal_info(int sig, struct task_struct *t) return audit_signal_info_syscall(t); } +/** + * __audit_log_end - enqueue one audit record + * @skb: the buffer to send + */ +static void __audit_log_end(struct sk_buff *skb) +{ + struct nlmsghdr *nlh; + + if (audit_rate_check()) { + /* setup the netlink header, see the comments in + * kauditd_send_multicast_skb() for length quirks */ + nlh = nlmsg_hdr(skb); + nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; + + /* queue the netlink packet */ + skb_queue_tail(&audit_queue, skb); + } else + audit_log_lost("rate limit exceeded"); +} + /** * audit_log_end - end one audit record * @ab: the audit_buffer @@ -2423,25 +2575,15 @@ int audit_signal_info(int sig, struct task_struct *t) void audit_log_end(struct audit_buffer *ab) { struct sk_buff *skb; - struct nlmsghdr *nlh; if (!ab) return; - if (audit_rate_check()) { - skb = ab->skb; - ab->skb = NULL; + while ((skb = skb_dequeue(&ab->skb_list))) + __audit_log_end(skb); - /* setup the netlink header, see the comments in - * kauditd_send_multicast_skb() for length quirks */ - nlh = nlmsg_hdr(skb); - nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; - - /* queue the netlink packet and poke the kauditd thread */ - skb_queue_tail(&audit_queue, skb); - wake_up_interruptible(&kauditd_wait); - } else - audit_log_lost("rate limit exceeded"); + /* poke the kauditd thread */ + wake_up_interruptible(&kauditd_wait); audit_buffer_free(ab); } diff --git 
a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index 6d6545297ee3..0da652844dd6 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -84,7 +84,6 @@ struct audit_buffer *netlbl_audit_start_common(int type, struct netlbl_audit *audit_info) { struct audit_buffer *audit_buf; - struct lsm_context ctx; if (audit_enabled == AUDIT_OFF) return NULL; @@ -96,13 +95,7 @@ struct audit_buffer *netlbl_audit_start_common(int type, audit_log_format(audit_buf, "netlabel: auid=%u ses=%u", from_kuid(&init_user_ns, audit_info->loginuid), audit_info->sessionid); - - if (lsmprop_is_set(&audit_info->prop) && - security_lsmprop_to_secctx(&audit_info->prop, &ctx, - LSM_ID_UNDEF) > 0) { - audit_log_format(audit_buf, " subj=%s", ctx.context); - security_release_secctx(&ctx); - } + audit_log_subj_ctx(audit_buf, &audit_info->prop); return audit_buf; } diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 8e1cc229b41b..220d1684b8d4 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -2530,6 +2530,9 @@ static int __init apparmor_init(void) security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks), &apparmor_lsmid); + /* Inform the audit system that secctx is used */ + audit_cfg_lsm(&apparmor_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + /* Report that AppArmor successfully initialized */ apparmor_initialized = 1; if (aa_g_profile_mode == APPARMOR_COMPLAIN) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c95a5874bf7d..975b84b466b4 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -7618,6 +7618,9 @@ static __init int selinux_init(void) /* Set the security state for the initial task. */ cred_init_security(); + /* Inform the audit system that secctx is used */ + audit_cfg_lsm(&selinux_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC); if (!default_noexec) pr_notice("SELinux: virtual memory is executable by default\n"); diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index fc340a6f0dde..eaff9b8901a7 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -5267,6 +5267,9 @@ static __init int smack_init(void) /* initialize the smack_known_list */ init_smack_known_list(); + /* Inform the audit system that secctx is used */ + audit_cfg_lsm(&smack_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + return 0; } -- cgit v1.2.3 From 0ffbc876d03c80b83d70aeefac7bbb94a9f4e135 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 16 Aug 2025 10:28:59 -0700 Subject: audit: add record for multiple object contexts Create a new audit record AUDIT_MAC_OBJ_CONTEXTS. An example of the MAC_OBJ_CONTEXTS record is: type=MAC_OBJ_CONTEXTS msg=audit(1601152467.009:1050): obj_selinux=unconfined_u:object_r:user_home_t:s0 When an audit event includes a AUDIT_MAC_OBJ_CONTEXTS record the "obj=" field in other records in the event will be "obj=?". An AUDIT_MAC_OBJ_CONTEXTS record is supplied when the system has multiple security modules that may make access decisions based on an object security context. 
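As an illustration of the registration this record depends on, a security module that provides both subject and object contexts would opt in at init time roughly as follows (a sketch built on the audit_cfg_lsm() interface from the previous patch; example_lsm_init and example_lsmid are placeholders, not real kernel symbols):

	/* Sketch: opt in to both subject and object secctx reporting.
	 * example_lsmid stands in for the module's real lsm_id. */
	static __init int example_lsm_init(void)
	{
		audit_cfg_lsm(&example_lsmid,
			      AUDIT_CFG_LSM_SECCTX_SUBJECT |
			      AUDIT_CFG_LSM_SECCTX_OBJECT);
		return 0;
	}

The selinux and smack hunks below follow exactly this pattern.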
Signed-off-by: Casey Schaufler [PM: subj tweak, audit example readability indents] Signed-off-by: Paul Moore --- include/linux/audit.h | 7 ++++++ include/uapi/linux/audit.h | 1 + kernel/audit.c | 58 +++++++++++++++++++++++++++++++++++++++++++++- kernel/auditsc.c | 38 ++++++------------------------ security/selinux/hooks.c | 4 +++- security/smack/smack_lsm.c | 4 +++- 6 files changed, 78 insertions(+), 34 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index a1f068bcb3a0..536f8ee8da81 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -151,6 +151,7 @@ extern unsigned compat_signal_class[]; /* bit values for audit_cfg_lsm */ #define AUDIT_CFG_LSM_SECCTX_SUBJECT BIT(0) +#define AUDIT_CFG_LSM_SECCTX_OBJECT BIT(1) struct filename; @@ -191,6 +192,7 @@ extern void audit_log_path_denied(int type, extern void audit_log_lost(const char *message); extern int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); +extern int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); @@ -258,6 +260,11 @@ static inline int audit_log_subj_ctx(struct audit_buffer *ab, { return 0; } +static inline int audit_log_obj_ctx(struct audit_buffer *ab, + struct lsm_prop *prop) +{ + return 0; +} static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 8cad2f307719..14a1c1fe013a 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -149,6 +149,7 @@ #define AUDIT_LANDLOCK_ACCESS 1423 /* Landlock denial */ #define AUDIT_LANDLOCK_DOMAIN 1424 /* Landlock domain status */ #define AUDIT_MAC_TASK_CONTEXTS 1425 /* Multiple LSM task contexts */ +#define AUDIT_MAC_OBJ_CONTEXTS 1426 /* Multiple LSM object contexts */ #define AUDIT_FIRST_KERN_ANOM_MSG 1700 #define AUDIT_LAST_KERN_ANOM_MSG 1799 diff --git a/kernel/audit.c b/kernel/audit.c index c924b30f2524..bd7474fd8d2c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -85,7 +85,9 @@ static unsigned int audit_net_id; /* Number of modules that provide a security context.
List of lsms that provide a security context */ static u32 audit_subj_secctx_cnt; +static u32 audit_obj_secctx_cnt; static const struct lsm_id *audit_subj_lsms[MAX_LSM_COUNT]; +static const struct lsm_id *audit_obj_lsms[MAX_LSM_COUNT]; /** * struct audit_net - audit private network namespace data @@ -305,6 +307,12 @@ void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) return; audit_subj_lsms[audit_subj_secctx_cnt++] = lsmid; } + if (flags & AUDIT_CFG_LSM_SECCTX_OBJECT) { + for (i = 0 ; i < audit_obj_secctx_cnt; i++) + if (audit_obj_lsms[i] == lsmid) + return; + audit_obj_lsms[audit_obj_secctx_cnt++] = lsmid; + } } /** @@ -1142,7 +1150,6 @@ static int is_audit_feature_set(int i) return af.features & AUDIT_FEATURE_TO_MASK(i); } - static int audit_get_feature(struct sk_buff *skb) { u32 seq; @@ -2337,6 +2344,55 @@ int audit_log_task_context(struct audit_buffer *ab) } EXPORT_SYMBOL(audit_log_task_context); +int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) +{ + int i; + int rc; + int error = 0; + char *space = ""; + struct lsm_context ctx; + + if (audit_obj_secctx_cnt < 2) { + error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF); + if (error < 0) { + if (error != -EINVAL) + goto error_path; + return error; + } + audit_log_format(ab, " obj=%s", ctx.context); + security_release_secctx(&ctx); + return 0; + } + audit_log_format(ab, " obj=?"); + error = audit_buffer_aux_new(ab, AUDIT_MAC_OBJ_CONTEXTS); + if (error) + goto error_path; + + for (i = 0; i < audit_obj_secctx_cnt; i++) { + rc = security_lsmprop_to_secctx(prop, &ctx, + audit_obj_lsms[i]->id); + if (rc < 0) { + audit_log_format(ab, "%sobj_%s=?", space, + audit_obj_lsms[i]->name); + if (rc != -EINVAL) + audit_panic("error in audit_log_obj_ctx"); + error = rc; + } else { + audit_log_format(ab, "%sobj_%s=%s", space, + audit_obj_lsms[i]->name, ctx.context); + security_release_secctx(&ctx); + } + space = " "; + } + + audit_buffer_aux_end(ab); + return error; + +error_path: + audit_panic("error in audit_log_obj_ctx"); + return error; +} + void audit_log_d_path_exe(struct audit_buffer *ab, struct mm_struct *mm) { diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3b606fd4ae8e..d1966144bdfe 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1098,7 +1098,6 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, char *comm) { struct audit_buffer *ab; - struct lsm_context ctx; int rc = 0; ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); @@ -1108,15 +1107,9 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), sessionid); - if (lsmprop_is_set(prop)) { - if (security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF) < 0) { - audit_log_format(ab, " obj=(none)"); - rc = 1; - } else { - audit_log_format(ab, " obj=%s", ctx.context); - security_release_secctx(&ctx); - } - } + if (lsmprop_is_set(prop) && audit_log_obj_ctx(ab, prop)) + rc = 1; + audit_log_format(ab, " ocomm="); audit_log_untrustedstring(ab, comm); audit_log_end(ab); @@ -1392,16 +1385,8 @@ static void show_special(struct audit_context *context, int *call_panic) from_kgid(&init_user_ns, context->ipc.gid), context->ipc.mode); if (lsmprop_is_set(&context->ipc.oprop)) { - struct lsm_context lsmctx; - - if (security_lsmprop_to_secctx(&context->ipc.oprop, - &lsmctx, - LSM_ID_UNDEF) < 0) { + if (audit_log_obj_ctx(ab, &context->ipc.oprop)) *call_panic = 1; - } else { - 
audit_log_format(ab, " obj=%s", lsmctx.context); - security_release_secctx(&lsmctx); - } } if (context->ipc.has_perm) { audit_log_end(ab); @@ -1558,18 +1543,9 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, from_kgid(&init_user_ns, n->gid), MAJOR(n->rdev), MINOR(n->rdev)); - if (lsmprop_is_set(&n->oprop)) { - struct lsm_context ctx; - - if (security_lsmprop_to_secctx(&n->oprop, &ctx, - LSM_ID_UNDEF) < 0) { - if (call_panic) - *call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx.context); - security_release_secctx(&ctx); - } - } + if (lsmprop_is_set(&n->oprop) && + audit_log_obj_ctx(ab, &n->oprop)) + *call_panic = 2; /* log the audit_names record type */ switch (n->type) { diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 975b84b466b4..3999f58a1842 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -7619,7 +7619,9 @@ static __init int selinux_init(void) cred_init_security(); /* Inform the audit system that secctx is used */ - audit_cfg_lsm(&selinux_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + audit_cfg_lsm(&selinux_lsmid, + AUDIT_CFG_LSM_SECCTX_SUBJECT | + AUDIT_CFG_LSM_SECCTX_OBJECT); default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC); if (!default_noexec) pr_notice("SELinux: virtual memory is executable by default\n"); diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index eaff9b8901a7..fdf2f193a291 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -5268,7 +5268,9 @@ static __init int smack_init(void) init_smack_known_list(); /* Inform the audit system that secctx is used */ - audit_cfg_lsm(&smack_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + audit_cfg_lsm(&smack_lsmid, + AUDIT_CFG_LSM_SECCTX_SUBJECT | + AUDIT_CFG_LSM_SECCTX_OBJECT); return 0; } -- cgit v1.2.3 From dfb84c33079497bf27058b15780e1c7bba4c371b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Aug 2025 13:10:44 +0200 Subject: fuse: allow synchronous FUSE_INIT FUSE_INIT has always been asynchronous with mount. That means that the server processes this request only after the mount syscall has returned. This means that FUSE_INIT can't supply the root inode's ID, hence it currently has a hardcoded value. There are other limitations, such as not being able to perform getxattr during mount, which is needed by selinux. To remove these limitations, allow the server to process FUSE_INIT while the kernel is initializing the in-core super block for the fuse filesystem. This can only be done if the server is prepared to handle it, so add the FUSE_DEV_IOC_SYNC_INIT ioctl, which a) lets the server know whether this feature is supported, returning ENOTTY otherwise, and b) lets the kernel know to perform a synchronous initialization. The implementation is slightly tricky, since fuse_dev/fuse_conn are set up only during super block creation. This is solved by setting the private data of the fuse device file to a special value ((struct fuse_dev *) 1) and waiting for this to be turned into a proper fuse_dev before commencing with operations on the device file.
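A plausible server-side sequence for using this (an illustrative sketch, not from this patch; sync_init, mountpoint and opts are placeholders, error handling is omitted, and FUSE_INIT must be answered from a thread other than the one calling mount(), since mount() now blocks on it):

	int fd = open("/dev/fuse", O_RDWR | O_CLOEXEC);

	/* Probe for the feature; ENOTTY means an old kernel, so fall
	 * back to the classic asynchronous FUSE_INIT. */
	if (ioctl(fd, FUSE_DEV_IOC_SYNC_INIT) == -1 && errno == ENOTTY)
		sync_init = false;

	/* ... start a worker that reads and answers FUSE_INIT on fd ... */

	/* With sync init armed, this returns only after FUSE_INIT has
	 * been answered. */
	mount("fsname", mountpoint, "fuse", 0, opts /* "fd=...,rootmode=..." */);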
Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 3 +- fs/fuse/dev.c | 71 +++++++++++++++++++++++++++++++++++------------ fs/fuse/dev_uring.c | 4 +-- fs/fuse/fuse_dev_i.h | 13 +++++++-- fs/fuse/fuse_i.h | 5 +++- fs/fuse/inode.c | 51 +++++++++++++++++++++++++++------- include/uapi/linux/fuse.h | 1 + 7 files changed, 114 insertions(+), 34 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b39844d75a80..28c96961e85d 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -52,6 +52,7 @@ #include #include "fuse_i.h" +#include "fuse_dev_i.h" #define CUSE_CONNTBL_LEN 64 @@ -547,7 +548,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file) */ static int cuse_channel_release(struct inode *inode, struct file *file) { - struct fuse_dev *fud = file->private_data; + struct fuse_dev *fud = __fuse_get_dev(file); struct cuse_conn *cc = fc_to_cc(fud->fc); /* remove from the conntbl, no more access from this point on */ diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index f6259711ca1c..2f1619e266e1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1530,14 +1530,34 @@ static int fuse_dev_open(struct inode *inode, struct file *file) return 0; } +struct fuse_dev *fuse_get_dev(struct file *file) +{ + struct fuse_dev *fud = __fuse_get_dev(file); + int err; + + if (likely(fud)) + return fud; + + err = wait_event_interruptible(fuse_dev_waitq, + READ_ONCE(file->private_data) != FUSE_DEV_SYNC_INIT); + if (err) + return ERR_PTR(err); + + fud = __fuse_get_dev(file); + if (!fud) + return ERR_PTR(-EPERM); + + return fud; +} + static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to) { struct fuse_copy_state cs; struct file *file = iocb->ki_filp; struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!user_backed_iter(to)) return -EINVAL; @@ -1557,8 +1577,8 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, struct fuse_copy_state cs; struct fuse_dev *fud = fuse_get_dev(in); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); bufs = kvmalloc_array(pipe->max_usage, sizeof(struct pipe_buffer), GFP_KERNEL); @@ -2231,7 +2251,7 @@ copy_finish: static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) { struct fuse_copy_state cs; - struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp); + struct fuse_dev *fud = __fuse_get_dev(iocb->ki_filp); if (!fud) return -EPERM; @@ -2253,11 +2273,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, unsigned idx; struct pipe_buffer *bufs; struct fuse_copy_state cs; - struct fuse_dev *fud; + struct fuse_dev *fud = __fuse_get_dev(out); size_t rem; ssize_t ret; - fud = fuse_get_dev(out); if (!fud) return -EPERM; @@ -2343,7 +2362,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) struct fuse_iqueue *fiq; struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) + if (IS_ERR(fud)) return EPOLLERR; fiq = &fud->fc->iq; @@ -2490,7 +2509,7 @@ void fuse_wait_aborted(struct fuse_conn *fc) int fuse_dev_release(struct inode *inode, struct file *file) { - struct fuse_dev *fud = fuse_get_dev(file); + struct fuse_dev *fud = __fuse_get_dev(file); if (fud) { struct fuse_conn *fc = fud->fc; @@ -2521,8 +2540,8 @@ static int fuse_dev_fasync(int fd, struct file *file, int on) { struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); /* No locking - fasync_helper does its own locking */ return fasync_helper(fd, file, on, 
&fud->fc->iq.fasync); @@ -2532,7 +2551,7 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) { struct fuse_dev *fud; - if (new->private_data) + if (__fuse_get_dev(new)) return -EINVAL; fud = fuse_dev_alloc_install(fc); @@ -2563,7 +2582,7 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) * uses the same ioctl handler. */ if (fd_file(f)->f_op == file->f_op) - fud = fuse_get_dev(fd_file(f)); + fud = __fuse_get_dev(fd_file(f)); res = -EINVAL; if (fud) { @@ -2581,8 +2600,8 @@ static long fuse_dev_ioctl_backing_open(struct file *file, struct fuse_dev *fud = fuse_get_dev(file); struct fuse_backing_map map; - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) return -EOPNOTSUPP; @@ -2598,8 +2617,8 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp) struct fuse_dev *fud = fuse_get_dev(file); int backing_id; - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) return -EOPNOTSUPP; @@ -2610,6 +2629,19 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp) return fuse_backing_close(fud->fc, backing_id); } +static long fuse_dev_ioctl_sync_init(struct file *file) +{ + int err = -EINVAL; + + mutex_lock(&fuse_mutex); + if (!__fuse_get_dev(file)) { + WRITE_ONCE(file->private_data, FUSE_DEV_SYNC_INIT); + err = 0; + } + mutex_unlock(&fuse_mutex); + return err; +} + static long fuse_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2625,6 +2657,9 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, case FUSE_DEV_IOC_BACKING_CLOSE: return fuse_dev_ioctl_backing_close(file, argp); + case FUSE_DEV_IOC_SYNC_INIT: + return fuse_dev_ioctl_sync_init(file); + default: return -ENOTTY; } @@ -2633,7 +2668,7 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, #ifdef CONFIG_PROC_FS static void fuse_dev_show_fdinfo(struct seq_file *seq, struct file *file) { - struct fuse_dev *fud = fuse_get_dev(file); + struct fuse_dev *fud = __fuse_get_dev(file); if (!fud) return; diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 249b210becb1..bef38ed78249 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1139,9 +1139,9 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return -EINVAL; fud = fuse_get_dev(cmd->file); - if (!fud) { + if (IS_ERR(fud)) { pr_info_ratelimited("No fuse device found\n"); - return -ENOTCONN; + return PTR_ERR(fud); } fc = fud->fc; diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 5a9bd771a319..6e8373f97040 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -12,6 +12,8 @@ #define FUSE_INT_REQ_BIT (1ULL << 0) #define FUSE_REQ_ID_STEP (1ULL << 1) +extern struct wait_queue_head fuse_dev_waitq; + struct fuse_arg; struct fuse_args; struct fuse_pqueue; @@ -37,15 +39,22 @@ struct fuse_copy_state { } ring; }; -static inline struct fuse_dev *fuse_get_dev(struct file *file) +#define FUSE_DEV_SYNC_INIT ((struct fuse_dev *) 1) +#define FUSE_DEV_PTR_MASK (~1UL) + +static inline struct fuse_dev *__fuse_get_dev(struct file *file) { /* * Lockless access is OK, because file->private data is set * once during mount and is valid until the file is released. 
*/ - return READ_ONCE(file->private_data); + struct fuse_dev *fud = READ_ONCE(file->private_data); + + return (typeof(fud)) ((unsigned long) fud & FUSE_DEV_PTR_MASK); } +struct fuse_dev *fuse_get_dev(struct file *file); + unsigned int fuse_req_hash(u64 unique); struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 486fa550c951..233c6111f768 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -904,6 +904,9 @@ struct fuse_conn { /* Is link not implemented by fs? */ unsigned int no_link:1; + /* Is synchronous FUSE_INIT allowed? */ + unsigned int sync_init:1; + /* Use io_uring for communication */ unsigned int io_uring; @@ -1318,7 +1321,7 @@ struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc); struct fuse_dev *fuse_dev_alloc(void); void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc); void fuse_dev_free(struct fuse_dev *fud); -void fuse_send_init(struct fuse_mount *fm); +int fuse_send_init(struct fuse_mount *fm); /** * Fill in superblock and initialize fuse connection diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 9d26a5bc394d..5b7897bf7e45 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "fuse_dev_i.h" #include "dev_uring_i.h" #include @@ -34,6 +35,7 @@ MODULE_LICENSE("GPL"); static struct kmem_cache *fuse_inode_cachep; struct list_head fuse_conn_list; DEFINE_MUTEX(fuse_mutex); +DECLARE_WAIT_QUEUE_HEAD(fuse_dev_waitq); static int set_global_limit(const char *val, const struct kernel_param *kp); @@ -1466,7 +1468,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, wake_up_all(&fc->blocked_waitq); } -void fuse_send_init(struct fuse_mount *fm) +static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm) { struct fuse_init_args *ia; u64 flags; @@ -1525,10 +1527,30 @@ void fuse_send_init(struct fuse_mount *fm) ia->args.out_args[0].value = &ia->out; ia->args.force = true; ia->args.nocreds = true; - ia->args.end = process_init_reply; - if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0) - process_init_reply(fm, &ia->args, -ENOTCONN); + return ia; +} + +int fuse_send_init(struct fuse_mount *fm) +{ + struct fuse_init_args *ia = fuse_new_init(fm); + int err; + + if (fm->fc->sync_init) { + err = fuse_simple_request(fm, &ia->args); + /* Ignore size of init reply */ + if (err > 0) + err = 0; + } else { + ia->args.end = process_init_reply; + err = fuse_simple_background(fm, &ia->args, GFP_KERNEL); + if (!err) + return 0; + } + process_init_reply(fm, &ia->args, err); + if (fm->fc->conn_error) + return -ENOTCONN; + return 0; } EXPORT_SYMBOL_GPL(fuse_send_init); @@ -1867,8 +1889,12 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) mutex_lock(&fuse_mutex); err = -EINVAL; - if (ctx->fudptr && *ctx->fudptr) - goto err_unlock; + if (ctx->fudptr && *ctx->fudptr) { + if (*ctx->fudptr == FUSE_DEV_SYNC_INIT) + fc->sync_init = 1; + else + goto err_unlock; + } err = fuse_ctl_add_conn(fc); if (err) @@ -1876,8 +1902,10 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) list_add_tail(&fc->entry, &fuse_conn_list); sb->s_root = root_dentry; - if (ctx->fudptr) + if (ctx->fudptr) { *ctx->fudptr = fud; + wake_up_all(&fuse_dev_waitq); + } mutex_unlock(&fuse_mutex); return 0; @@ -1898,6 +1926,7 @@ EXPORT_SYMBOL_GPL(fuse_fill_super_common); static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; + 
struct fuse_mount *fm; int err; if (!ctx->file || !ctx->rootmode_present || @@ -1918,8 +1947,10 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) return err; /* file->private_data shall be visible on all CPUs after this */ smp_mb(); - fuse_send_init(get_fuse_mount_super(sb)); - return 0; + + fm = get_fuse_mount_super(sb); + + return fuse_send_init(fm); } /* @@ -1980,7 +2011,7 @@ static int fuse_get_tree(struct fs_context *fsc) * Allow creating a fuse mount with an already initialized fuse * connection */ - fud = READ_ONCE(ctx->file->private_data); + fud = __fuse_get_dev(ctx->file); if (ctx->file->f_op == &fuse_dev_operations && fud) { fsc->sget_key = fud->fc; sb = sget_fc(fsc, fuse_test_super, fuse_set_no_super); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 94621f68a5cc..6b9fb8b08768 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -1131,6 +1131,7 @@ struct fuse_backing_map { #define FUSE_DEV_IOC_BACKING_OPEN _IOW(FUSE_DEV_IOC_MAGIC, 1, \ struct fuse_backing_map) #define FUSE_DEV_IOC_BACKING_CLOSE _IOW(FUSE_DEV_IOC_MAGIC, 2, uint32_t) +#define FUSE_DEV_IOC_SYNC_INIT _IO(FUSE_DEV_IOC_MAGIC, 3) struct fuse_lseek_in { uint64_t fh; -- cgit v1.2.3 From 4039ce7ef40474d5ba46f414c50cc7020b9cf8ae Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 7 Aug 2025 15:49:59 +0200 Subject: netfilter: nf_tables: Introduce NFTA_DEVICE_PREFIX This new attribute is intended to be used instead of NFTA_DEVICE_NAME for simple wildcard interface specs. It holds a NUL-terminated string representing an interface name prefix to match on. While kernel code to distinguish full names from prefixes in NFTA_DEVICE_NAME is simpler than this solution, reusing the existing attribute with different semantics leads to confusion between different versions of kernel and user space: * With old kernels, wildcards submitted by user space are accepted yet silently treated as regular names. * With old user space, wildcards submitted by the kernel may cause crashes since libnftnl expects NUL-termination when there is none. Using a distinct attribute type sanitizes these situations as the receiving side detects and rejects the unexpected attribute nested in *_HOOK_DEVS attributes.
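To see why the length convention in the hunks below works: hook->ifnamelen is set to strlen(name) + 1 for exact names, so the terminating NUL participates in the comparison, while a prefix's ifnamelen stops short of the NUL. A single bounded compare then serves both cases (a sketch of the idea, not the kernel's actual matcher):

	static bool hook_matches(const struct nft_hook *hook, const char *devname)
	{
		/* exact names also compare the NUL; prefixes stop short of it */
		return !memcmp(devname, hook->ifname, hook->ifnamelen);
	}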
Fixes: 6d07a289504a ("netfilter: nf_tables: Support wildcard netdev hook specs") Signed-off-by: Phil Sutter Signed-off-by: Florian Westphal --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 42 +++++++++++++++++++++++--------- 2 files changed, 33 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 2beb30be2c5f..8e0eb832bc01 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1784,10 +1784,12 @@ enum nft_synproxy_attributes { * enum nft_device_attributes - nf_tables device netlink attributes * * @NFTA_DEVICE_NAME: name of this device (NLA_STRING) + * @NFTA_DEVICE_PREFIX: device name prefix, a simple wildcard (NLA_STRING) */ enum nft_devices_attributes { NFTA_DEVICE_UNSPEC, NFTA_DEVICE_NAME, + NFTA_DEVICE_PREFIX, __NFTA_DEVICE_MAX }; #define NFTA_DEVICE_MAX (__NFTA_DEVICE_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 58c5425d61c2..c1082de09656 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1959,6 +1959,18 @@ nla_put_failure: return -ENOSPC; } +static bool hook_is_prefix(struct nft_hook *hook) +{ + return strlen(hook->ifname) >= hook->ifnamelen; +} + +static int nft_nla_put_hook_dev(struct sk_buff *skb, struct nft_hook *hook) +{ + int attr = hook_is_prefix(hook) ? NFTA_DEVICE_PREFIX : NFTA_DEVICE_NAME; + + return nla_put_string(skb, attr, hook->ifname); +} + static int nft_dump_basechain_hook(struct sk_buff *skb, const struct net *net, int family, const struct nft_base_chain *basechain, @@ -1990,16 +2002,15 @@ static int nft_dump_basechain_hook(struct sk_buff *skb, if (!first) first = hook; - if (nla_put(skb, NFTA_DEVICE_NAME, - hook->ifnamelen, hook->ifname)) + if (nft_nla_put_hook_dev(skb, hook)) goto nla_put_failure; n++; } nla_nest_end(skb, nest_devs); if (n == 1 && - nla_put(skb, NFTA_HOOK_DEV, - first->ifnamelen, first->ifname)) + !hook_is_prefix(first) && + nla_put_string(skb, NFTA_HOOK_DEV, first->ifname)) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -2310,7 +2321,8 @@ void nf_tables_chain_destroy(struct nft_chain *chain) } static struct nft_hook *nft_netdev_hook_alloc(struct net *net, - const struct nlattr *attr) + const struct nlattr *attr, + bool prefix) { struct nf_hook_ops *ops; struct net_device *dev; @@ -2327,7 +2339,8 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, if (err < 0) goto err_hook_free; - hook->ifnamelen = nla_len(attr); + /* include the terminating NUL-char when comparing non-prefixes */ + hook->ifnamelen = strlen(hook->ifname) + !prefix; /* nf_tables_netdev_event() is called under rtnl_mutex, this is * indirectly serializing all the other holders of the commit_mutex with @@ -2374,14 +2387,22 @@ static int nf_tables_parse_netdev_hooks(struct net *net, struct nft_hook *hook, *next; const struct nlattr *tmp; int rem, n = 0, err; + bool prefix; nla_for_each_nested(tmp, attr, rem) { - if (nla_type(tmp) != NFTA_DEVICE_NAME) { + switch (nla_type(tmp)) { + case NFTA_DEVICE_NAME: + prefix = false; + break; + case NFTA_DEVICE_PREFIX: + prefix = true; + break; + default: err = -EINVAL; goto err_hook; } - hook = nft_netdev_hook_alloc(net, tmp); + hook = nft_netdev_hook_alloc(net, tmp, prefix); if (IS_ERR(hook)) { NL_SET_BAD_ATTR(extack, tmp); err = PTR_ERR(hook); @@ -2427,7 +2448,7 @@ static int nft_chain_parse_netdev(struct net *net, struct nlattr *tb[], int err; if 
(tb[NFTA_HOOK_DEV]) { - hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV]); + hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV], false); if (IS_ERR(hook)) { NL_SET_BAD_ATTR(extack, tb[NFTA_HOOK_DEV]); return PTR_ERR(hook); } @@ -9458,8 +9479,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, list_for_each_entry_rcu(hook, hook_list, list, lockdep_commit_lock_is_held(net)) { - if (nla_put(skb, NFTA_DEVICE_NAME, - hook->ifnamelen, hook->ifname)) + if (nft_nla_put_hook_dev(skb, hook)) goto nla_put_failure; } nla_nest_end(skb, nest_devs); -- cgit v1.2.3 From 34837c444cd42236b2b43ce871f30d83776a3431 Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Sun, 24 Aug 2025 20:07:34 +0200 Subject: media: uapi: v4l2-controls: Cleanup codec definitions Move some fields closer to where they are used, add missing tabs and remove an extra newline. Signed-off-by: Paul Kocialkowski Reviewed-by: Nicolas Dufresne Signed-off-by: Nicolas Dufresne Signed-off-by: Hans Verkuil --- include/uapi/linux/v4l2-controls.h | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 7aef88465d04..2d30107e047e 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -1537,15 +1537,6 @@ struct v4l2_ctrl_h264_pred_weights { struct v4l2_h264_weight_factors weight_factors[2]; }; -#define V4L2_H264_SLICE_TYPE_P 0 -#define V4L2_H264_SLICE_TYPE_B 1 -#define V4L2_H264_SLICE_TYPE_I 2 -#define V4L2_H264_SLICE_TYPE_SP 3 -#define V4L2_H264_SLICE_TYPE_SI 4 - -#define V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED 0x01 -#define V4L2_H264_SLICE_FLAG_SP_FOR_SWITCH 0x02 - #define V4L2_H264_TOP_FIELD_REF 0x1 #define V4L2_H264_BOTTOM_FIELD_REF 0x2 #define V4L2_H264_FRAME_REF 0x3 @@ -1566,8 +1557,17 @@ struct v4l2_h264_reference { * Maximum DPB size, as specified by section 'A.3.1 Level limits * common to the Baseline, Main, and Extended profiles'. */ -#define V4L2_H264_NUM_DPB_ENTRIES 16 -#define V4L2_H264_REF_LIST_LEN (2 * V4L2_H264_NUM_DPB_ENTRIES) +#define V4L2_H264_NUM_DPB_ENTRIES 16 +#define V4L2_H264_REF_LIST_LEN (2 * V4L2_H264_NUM_DPB_ENTRIES) + +#define V4L2_H264_SLICE_TYPE_P 0 +#define V4L2_H264_SLICE_TYPE_B 1 +#define V4L2_H264_SLICE_TYPE_I 2 +#define V4L2_H264_SLICE_TYPE_SP 3 +#define V4L2_H264_SLICE_TYPE_SI 4 + +#define V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED 0x01 +#define V4L2_H264_SLICE_FLAG_SP_FOR_SWITCH 0x02 #define V4L2_CID_STATELESS_H264_SLICE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 6) /** @@ -1707,7 +1707,6 @@ struct v4l2_ctrl_h264_decode_params { __u32 flags; }; - /* Stateless FWHT control, used by the vicodec driver */ /* Current FWHT version */ -- cgit v1.2.3 From ee63609454838ea2b108f96f74a287be72d281ee Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Fri, 25 Jul 2025 23:22:19 +1000 Subject: wifi: mac80211: support block bitmap S1G TIM encoding An S1G TIM PVB is encoded differently compared to a non-S1G TIM PVB. As the AP dictates which encoding mode it uses, here we only implement block bitmap encoding. This is the default encoding mode used by all current vendor implementations. Additionally, S1G has a maximum AID count of 8192; however, we are limiting the current implementation to 1600. This has no resemblance to the standard and is purely an implementation detail. The reason for this is the TIM element's maximum length of 255.
This allows for, at most, 25 encoded blocks for a PVB encoded with block bitmap. Support for the maximum of 8192 AIDs will require an implementation of page slicing to be added to mac80211. As a result, we perform extra validation on both the STA and AP side when receiving an AID as an S1G interface. Add support for block bitmap encoding for an S1G AP and limit the maximum AID count to 1600 for the current mac80211 implementations. Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250725132221.258217-2-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 3 +- net/mac80211/cfg.c | 10 ++- net/mac80211/ieee80211_i.h | 8 +++ net/mac80211/mlme.c | 19 ++--- net/mac80211/tx.c | 166 +++++++++++++++++++++++++++++++++---------- 5 files changed, 156 insertions(+), 50 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d1a14f2892d9..a4bc0c2729f6 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2283,7 +2283,8 @@ enum nl80211_commands { * @NL80211_ATTR_PEER_AID: Association ID for the peer TDLS station (u16). * This is similar to @NL80211_ATTR_STA_AID but with a difference of being * allowed to be used with the first @NL80211_CMD_SET_STATION command to - * update a TDLS peer STA entry. + * update a TDLS peer STA entry. For S1G interfaces, this is limited to + * 1600 for the current mac80211 implementation. * * @NL80211_ATTR_COALESCE_RULE: Coalesce rule information. * diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 2ed07fa121ab..4603350989c8 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2171,10 +2171,16 @@ static int sta_apply_parameters(struct ieee80211_local *local, /* * cfg80211 validates this (1-2007) and allows setting the AID - * only when creating a new station entry + * only when creating a new station entry. For S1G APs, the current + * implementation supports a maximum of 1600 AIDs. */ - if (params->aid) + if (params->aid) { + if (sdata->vif.cfg.s1g && + params->aid > IEEE80211_MAX_SUPPORTED_S1G_AID) + return -EINVAL; + sta->sta.aid = params->aid; + } /* * Some of the following updates would be racy if called on an diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 8afa2404eaa8..07f5fb11569b 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -86,6 +86,14 @@ extern const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS]; #define IEEE80211_MAX_NAN_INSTANCE_ID 255 +/* + * Current mac80211 implementation supports a maximum of 1600 AIDS + * for S1G interfaces. With regards to an S1G TIM, this covers 25 blocks + * as each block is 64 AIDs. 
+ */ +#define IEEE80211_MAX_SUPPORTED_S1G_AID 1600 +#define IEEE80211_MAX_SUPPORTED_S1G_TIM_BLOCKS 25 + enum ieee80211_status_data { IEEE80211_STATUS_TYPE_MASK = 0x00f, IEEE80211_STATUS_TYPE_INVALID = 0, diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 353e89973d1e..f5d09ded9827 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -6351,6 +6351,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, }; u8 ap_mld_addr[ETH_ALEN] __aligned(2); unsigned int link_id; + u16 max_aid = IEEE80211_MAX_AID; lockdep_assert_wiphy(sdata->local->hw.wiphy); @@ -6377,10 +6378,12 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, reassoc = ieee80211_is_reassoc_resp(mgmt->frame_control); capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); status_code = le16_to_cpu(mgmt->u.assoc_resp.status_code); - if (assoc_data->s1g) + if (assoc_data->s1g) { elem_start = mgmt->u.s1g_assoc_resp.variable; - else + max_aid = IEEE80211_MAX_SUPPORTED_S1G_AID; + } else { elem_start = mgmt->u.assoc_resp.variable; + } /* * Note: this may not be perfect, AP might misbehave - if @@ -6404,16 +6407,15 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, if (elems->aid_resp) aid = le16_to_cpu(elems->aid_resp->aid); - else if (assoc_data->s1g) - aid = 0; /* TODO */ else aid = le16_to_cpu(mgmt->u.assoc_resp.aid); /* - * The 5 MSB of the AID field are reserved - * (802.11-2016 9.4.1.8 AID field) + * The 5 MSB of the AID field are reserved for a non-S1G STA. For + * an S1G STA the 3 MSBs are reserved. + * (802.11-2016 9.4.1.8 AID field). */ - aid &= 0x7ff; + aid &= assoc_data->s1g ? 0x1fff : 0x7ff; sdata_info(sdata, "RX %sssocResp from %pM (capab=0x%x status=%d aid=%d)\n", @@ -6450,7 +6452,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, event.u.mlme.reason = status_code; drv_event_callback(sdata->local, sdata, &event); } else { - if (aid == 0 || aid > IEEE80211_MAX_AID) { + if (aid == 0 || aid > max_aid) { sdata_info(sdata, "invalid AID value %d (out of range), turn off PS\n", aid); @@ -6488,6 +6490,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, } sdata->vif.cfg.aid = aid; + sdata->vif.cfg.s1g = assoc_data->s1g; if (!ieee80211_assoc_success(sdata, mgmt, elems, elem_start, elem_len)) { diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 00671ae45b2f..0ece8d89e094 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -4882,15 +4882,114 @@ void ieee80211_tx_pending(struct tasklet_struct *t) /* functions for drivers to get certain frames */ +static void ieee80211_beacon_add_tim_pvb(struct ps_data *ps, + struct sk_buff *skb, + bool mcast_traffic) +{ + int i, n1 = 0, n2; + + /* + * Find largest even number N1 so that bits numbered 1 through + * (N1 x 8) - 1 in the bitmap are 0 and number N2 so that bits + * (N2 + 1) x 8 through 2007 are 0. + */ + for (i = 0; i < IEEE80211_MAX_TIM_LEN; i++) { + if (ps->tim[i]) { + n1 = i & 0xfe; + break; + } + } + n2 = n1; + for (i = IEEE80211_MAX_TIM_LEN - 1; i >= n1; i--) { + if (ps->tim[i]) { + n2 = i; + break; + } + } + + /* Bitmap control */ + skb_put_u8(skb, n1 | mcast_traffic); + /* Part Virt Bitmap */ + skb_put_data(skb, ps->tim + n1, n2 - n1 + 1); +} + +/* + * mac80211 currently supports encoding using block bitmap mode, non + * inversed. The current implementation supports up to 1600 AIDs. + * + * Block bitmap encoding breaks down the AID bitmap into blocks of 64 + * AIDs. 
Each block contains between 0 and 8 subblocks. Each subblock + * describes 8 AIDs and the presence of a subblock is determined by + * the block bitmap. + */ +static void ieee80211_s1g_beacon_add_tim_pvb(struct ps_data *ps, + struct sk_buff *skb, + bool mcast_traffic) +{ + int blk; + + /* + * Emit a bitmap control block with a page slice number of 31 and a + * page index of 0 which indicates as per IEEE 802.11-2024 9.4.2.5.1 + * that the entire page (2048 bits) indicated by the page index + * is encoded in the partial virtual bitmap. + */ + skb_put_u8(skb, mcast_traffic | (31 << 1)); + + /* Emit an encoded block for each non-zero subblock */ + for (blk = 0; blk < IEEE80211_MAX_SUPPORTED_S1G_TIM_BLOCKS; blk++) { + u8 blk_bmap = 0; + int sblk; + + for (sblk = 0; sblk < 8; sblk++) { + int sblk_idx = blk * 8 + sblk; + + /* + * If the current subblock is non-zero, increase the + * number of subblocks to emit for the current block. + */ + if (ps->tim[sblk_idx]) + blk_bmap |= BIT(sblk); + } + + /* If the current block contains no non-zero subblocks */ + if (!blk_bmap) + continue; + + /* + * Emit a block control byte for the current encoded block + * with an encoding mode of block bitmap (0x0), not inverse + * (0x0) and the current block offset (5 bits) + */ + skb_put_u8(skb, blk << 3); + + /* + * Emit the block bitmap for the current encoded block which + * contains the present subblocks. + */ + skb_put_u8(skb, blk_bmap); + + /* Emit the present subblocks */ + for (sblk = 0; sblk < 8; sblk++) { + int sblk_idx = blk * 8 + sblk; + + if (!(blk_bmap & BIT(sblk))) + continue; + + skb_put_u8(skb, ps->tim[sblk_idx]); + } + } +} + static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, struct ps_data *ps, struct sk_buff *skb, bool is_template) { - u8 *pos, *tim; - int aid0 = 0; - int i, have_bits = 0, n1, n2; + struct element *tim; + bool mcast_traffic = false, have_bits = false; struct ieee80211_bss_conf *link_conf = link->conf; + bool s1g = ieee80211_get_link_sband(link)->band == NL80211_BAND_S1GHZ; /* Generate bitmap for TIM only if there are any STAs in power save * mode. */ @@ -4898,7 +4997,8 @@ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, /* in the hope that this is faster than * checking byte-for-byte */ have_bits = !bitmap_empty((unsigned long *)ps->tim, - IEEE80211_MAX_AID+1); + IEEE80211_MAX_AID + 1); + if (!is_template) { if (ps->dtim_count == 0) ps->dtim_count = link_conf->dtim_period - 1; @@ -4906,51 +5006,39 @@ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, ps->dtim_count--; } - tim = pos = skb_put(skb, 5); - *pos++ = WLAN_EID_TIM; - *pos++ = 3; - *pos++ = ps->dtim_count; - *pos++ = link_conf->dtim_period; + /* Length is set after parsing the AID bitmap */ + tim = skb_put(skb, sizeof(struct element)); + tim->id = WLAN_EID_TIM; + skb_put_u8(skb, ps->dtim_count); + skb_put_u8(skb, link_conf->dtim_period); if (ps->dtim_count == 0 && !skb_queue_empty(&ps->bc_buf)) - aid0 = 1; + mcast_traffic = true; - ps->dtim_bc_mc = aid0 == 1; + ps->dtim_bc_mc = mcast_traffic; if (have_bits) { - /* Find largest even number N1 so that bits numbered 1 through - * (N1 x 8) - 1 in the bitmap are 0 and number N2 so that bits - * (N2 + 1) x 8 through 2007 are 0.
*/ - n1 = 0; - for (i = 0; i < IEEE80211_MAX_TIM_LEN; i++) { - if (ps->tim[i]) { - n1 = i & 0xfe; - break; - } - } - n2 = n1; - for (i = IEEE80211_MAX_TIM_LEN - 1; i >= n1; i--) { - if (ps->tim[i]) { - n2 = i; - break; - } - } - - /* Bitmap control */ - *pos++ = n1 | aid0; - /* Part Virt Bitmap */ - skb_put_data(skb, ps->tim + n1, n2 - n1 + 1); - - tim[1] = n2 - n1 + 4; + if (s1g) + ieee80211_s1g_beacon_add_tim_pvb(ps, skb, + mcast_traffic); + else + ieee80211_beacon_add_tim_pvb(ps, skb, mcast_traffic); } else { - *pos++ = aid0; /* Bitmap control */ - - if (ieee80211_get_link_sband(link)->band != NL80211_BAND_S1GHZ) { - tim[1] = 4; + /* + * If there is no buffered unicast traffic for an S1G + * interface, we can exclude the bitmap control. This is in + * contrast to other phy types as they do include the bitmap + * control and pvb even when there is no buffered traffic. + */ + if (!s1g) { + /* Bitmap control */ + skb_put_u8(skb, mcast_traffic); /* Part Virt Bitmap */ skb_put_u8(skb, 0); } } + + tim->datalen = skb_tail_pointer(skb) - tim->data; } static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, -- cgit v1.2.3 From d0bf06158c39e7129524dd8b43b82aed84d68faa Mon Sep 17 00:00:00 2001 From: Muna Sinada Date: Fri, 15 Aug 2025 14:30:11 -0700 Subject: wifi: nl80211: Add EHT fixed Tx rate support Add new attributes to support EHT MCS/NSS Tx rates and EHT GI/LTF. Parse EHT fixed MCS/NSS Tx rates and EHT GI/LTF values passed by the userspace, validate and add as part of cfg80211_bitrate_mask. MCS mask is constructed by new function, eht_build_mcs_mask(). Max NSS supported for MCS rates of 7, 9, 11 and 13 is utilized to set MCS bitmask for each NSS. MCS rates 14, and 15 if supported, are set only for NSS = 0. Co-developed-by: Aloka Dixit Signed-off-by: Aloka Dixit Signed-off-by: Muna Sinada Link: https://patch.msgid.link/20250815213011.2704803-1-muna.sinada@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 + include/uapi/linux/nl80211.h | 41 +++++++- net/wireless/nl80211.c | 229 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 266 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index cb1c36be2749..7d881aa7e48b 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -841,9 +841,12 @@ struct cfg80211_bitrate_mask { u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN]; u16 vht_mcs[NL80211_VHT_NSS_MAX]; u16 he_mcs[NL80211_HE_NSS_MAX]; + u16 eht_mcs[NL80211_EHT_NSS_MAX]; enum nl80211_txrate_gi gi; enum nl80211_he_gi he_gi; + enum nl80211_eht_gi eht_gi; enum nl80211_he_ltf he_ltf; + enum nl80211_eht_ltf eht_ltf; } control[NUM_NL80211_BANDS]; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index a4bc0c2729f6..4f08264bbc8e 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1943,8 +1943,9 @@ enum nl80211_commands { * The driver must also specify support for this with the extended * features NL80211_EXT_FEATURE_BEACON_RATE_LEGACY, * NL80211_EXT_FEATURE_BEACON_RATE_HT, - * NL80211_EXT_FEATURE_BEACON_RATE_VHT and - * NL80211_EXT_FEATURE_BEACON_RATE_HE. + * NL80211_EXT_FEATURE_BEACON_RATE_VHT, + * NL80211_EXT_FEATURE_BEACON_RATE_HE and + * NL80211_EXT_FEATURE_BEACON_RATE_EHT. * * @NL80211_ATTR_FRAME_MATCH: A binary attribute which typically must contain * at least one byte, currently used with @NL80211_CMD_REGISTER_FRAME. 
@@ -3736,6 +3737,22 @@ enum nl80211_eht_gi { NL80211_RATE_INFO_EHT_GI_3_2, }; +/** + * enum nl80211_eht_ltf - EHT long training field + * @NL80211_RATE_INFO_EHT_1XLTF: 3.2 usec + * @NL80211_RATE_INFO_EHT_2XLTF: 6.4 usec + * @NL80211_RATE_INFO_EHT_4XLTF: 12.8 usec + * @NL80211_RATE_INFO_EHT_6XLTF: 19.2 usec + * @NL80211_RATE_INFO_EHT_8XLTF: 25.6 usec + */ +enum nl80211_eht_ltf { + NL80211_RATE_INFO_EHT_1XLTF, + NL80211_RATE_INFO_EHT_2XLTF, + NL80211_RATE_INFO_EHT_4XLTF, + NL80211_RATE_INFO_EHT_6XLTF, + NL80211_RATE_INFO_EHT_8XLTF, +}; + /** * enum nl80211_eht_ru_alloc - EHT RU allocation values * @NL80211_RATE_INFO_EHT_RU_ALLOC_26: 26-tone RU allocation @@ -5482,6 +5499,10 @@ enum nl80211_key_attributes { * see &struct nl80211_txrate_he * @NL80211_TXRATE_HE_GI: configure HE GI, 0.8us, 1.6us and 3.2us. * @NL80211_TXRATE_HE_LTF: configure HE LTF, 1XLTF, 2XLTF and 4XLTF. + * @NL80211_TXRATE_EHT: EHT rates allowed for TX rate selection, + * see &struct nl80211_txrate_eht + * @NL80211_TXRATE_EHT_GI: configure EHT GI, (u8, see &enum nl80211_eht_gi) + * @NL80211_TXRATE_EHT_LTF: configure EHT LTF, (u8, see &enum nl80211_eht_ltf) * @__NL80211_TXRATE_AFTER_LAST: internal * @NL80211_TXRATE_MAX: highest TX rate attribute */ @@ -5494,6 +5515,9 @@ enum nl80211_tx_rate_attributes { NL80211_TXRATE_HE, NL80211_TXRATE_HE_GI, NL80211_TXRATE_HE_LTF, + NL80211_TXRATE_EHT, + NL80211_TXRATE_EHT_GI, + NL80211_TXRATE_EHT_LTF, /* keep last */ __NL80211_TXRATE_AFTER_LAST, @@ -5526,6 +5550,15 @@ enum nl80211_txrate_gi { NL80211_TXRATE_FORCE_LGI, }; +#define NL80211_EHT_NSS_MAX 16 +/** + * struct nl80211_txrate_eht - EHT MCS/NSS txrate bitmap + * @mcs: MCS bitmap table for each NSS (array index 0 for 1 stream, etc.) + */ +struct nl80211_txrate_eht { + __u16 mcs[NL80211_EHT_NSS_MAX]; +}; + /** * enum nl80211_band - Frequency band * @NL80211_BAND_2GHZ: 2.4 GHz ISM band @@ -6650,6 +6683,9 @@ enum nl80211_feature_flags { * (signaling and payload protected) A-MSDUs and this shall be advertised * in the RSNXE. * + * @NL80211_EXT_FEATURE_BEACON_RATE_EHT: Driver supports beacon rate + * configuration (AP/mesh) with EHT rates. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. 
*/ @@ -6725,6 +6761,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_OWE_OFFLOAD_AP, NL80211_EXT_FEATURE_DFS_CONCURRENT, NL80211_EXT_FEATURE_SPP_AMSDU_SUPPORT, + NL80211_EXT_FEATURE_BEACON_RATE_EHT, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 89519aa52893..364d83fb9992 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -411,6 +411,14 @@ static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { [NL80211_TXRATE_HE_LTF] = NLA_POLICY_RANGE(NLA_U8, NL80211_RATE_INFO_HE_1XLTF, NL80211_RATE_INFO_HE_4XLTF), + [NL80211_TXRATE_EHT] = NLA_POLICY_EXACT_LEN(sizeof(struct nl80211_txrate_eht)), + [NL80211_TXRATE_EHT_GI] = NLA_POLICY_RANGE(NLA_U8, + NL80211_RATE_INFO_EHT_GI_0_8, + NL80211_RATE_INFO_EHT_GI_3_2), + [NL80211_TXRATE_EHT_LTF] = NLA_POLICY_RANGE(NLA_U8, + NL80211_RATE_INFO_EHT_1XLTF, + NL80211_RATE_INFO_EHT_8XLTF), + }; static const struct nla_policy @@ -5393,6 +5401,164 @@ static bool he_set_mcs_mask(struct genl_info *info, return true; } +static void eht_build_mcs_mask(struct genl_info *info, + const struct ieee80211_sta_eht_cap *eht_cap, + u8 mcs_nss_len, u16 *mcs_mask) +{ + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + u8 nss, mcs_7 = 0, mcs_9 = 0, mcs_11 = 0, mcs_13 = 0; + unsigned int link_id = nl80211_link_id(info->attrs); + + if (mcs_nss_len == 4) { + const struct ieee80211_eht_mcs_nss_supp_20mhz_only *mcs = + &eht_cap->eht_mcs_nss_supp.only_20mhz; + + mcs_7 = u8_get_bits(mcs->rx_tx_mcs7_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_9 = u8_get_bits(mcs->rx_tx_mcs9_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_11 = u8_get_bits(mcs->rx_tx_mcs11_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_13 = u8_get_bits(mcs->rx_tx_mcs13_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + + } else { + const struct ieee80211_eht_mcs_nss_supp_bw *mcs; + enum nl80211_chan_width width; + + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + width = wdev->u.ibss.chandef.width; + break; + case NL80211_IFTYPE_MESH_POINT: + width = wdev->u.mesh.chandef.width; + break; + case NL80211_IFTYPE_OCB: + width = wdev->u.ocb.chandef.width; + break; + default: + if (wdev->valid_links) + width = wdev->links[link_id].ap.chandef.width; + else + width = wdev->u.ap.preset_chandef.width; + break; + } + + switch (width) { + case NL80211_CHAN_WIDTH_320: + mcs = &eht_cap->eht_mcs_nss_supp.bw._320; + break; + case NL80211_CHAN_WIDTH_160: + mcs = &eht_cap->eht_mcs_nss_supp.bw._160; + break; + default: + mcs = &eht_cap->eht_mcs_nss_supp.bw._80; + break; + } + + mcs_7 = u8_get_bits(mcs->rx_tx_mcs9_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_9 = u8_get_bits(mcs->rx_tx_mcs9_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_11 = u8_get_bits(mcs->rx_tx_mcs11_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_13 = u8_get_bits(mcs->rx_tx_mcs13_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + } + + /* Enable MCS 14 for NSS 0 */ + if (eht_cap->eht_cap_elem.phy_cap_info[6] & + IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP) + mcs_mask[0] |= 0x4000; + + /* Enable MCS 15 for NSS 0 */ + mcs_mask[0] |= 0x8000; + + for (nss = 0; nss < NL80211_EHT_NSS_MAX; nss++) { + if (!mcs_7) + continue; + mcs_mask[nss] |= 0x00FF; + mcs_7--; + + if (!mcs_9) + continue; + mcs_mask[nss] |= 0x0300; + mcs_9--; + + if (!mcs_11) + continue; + mcs_mask[nss] |= 0x0C00; + mcs_11--; + + if (!mcs_13) + continue; + mcs_mask[nss] |= 0x3000; + mcs_13--; + } +} + +static bool eht_set_mcs_mask(struct genl_info *info, struct 
wireless_dev *wdev, + struct ieee80211_supported_band *sband, + struct nl80211_txrate_eht *txrate, + u16 mcs[NL80211_EHT_NSS_MAX]) +{ + const struct ieee80211_sta_he_cap *he_cap; + const struct ieee80211_sta_eht_cap *eht_cap; + u16 tx_mcs_mask[NL80211_EHT_NSS_MAX] = { 0 }; + u8 i, mcs_nss_len; + + he_cap = ieee80211_get_he_iftype_cap(sband, wdev->iftype); + if (!he_cap) + return false; + + eht_cap = ieee80211_get_eht_iftype_cap(sband, wdev->iftype); + if (!eht_cap) + return false; + + /* Checks for MCS 14 */ + if (txrate->mcs[0] & 0x4000) { + if (sband->band != NL80211_BAND_6GHZ) + return false; + + if (!(eht_cap->eht_cap_elem.phy_cap_info[6] & + IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP)) + return false; + } + + mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, + &eht_cap->eht_cap_elem, + wdev->iftype == + NL80211_IFTYPE_STATION); + + if (mcs_nss_len == 3) { + /* Supported iftypes for setting non-20 MHZ only EHT MCS */ + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_MESH_POINT: + case NL80211_IFTYPE_OCB: + break; + default: + return false; + } + } + + /* Build eht_mcs_mask from EHT and HE capabilities */ + eht_build_mcs_mask(info, eht_cap, mcs_nss_len, tx_mcs_mask); + + memset(mcs, 0, sizeof(u16) * NL80211_EHT_NSS_MAX); + for (i = 0; i < NL80211_EHT_NSS_MAX; i++) { + if ((tx_mcs_mask[i] & txrate->mcs[i]) == txrate->mcs[i]) + mcs[i] = txrate->mcs[i]; + else + return false; + } + + return true; +} + static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, struct nlattr *attrs[], enum nl80211_attrs attr, @@ -5413,6 +5579,8 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, /* Default to all rates enabled */ for (i = 0; i < NUM_NL80211_BANDS; i++) { const struct ieee80211_sta_he_cap *he_cap; + const struct ieee80211_sta_eht_cap *eht_cap; + u8 mcs_nss_len; if (!default_all_enabled) break; @@ -5441,6 +5609,21 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, mask->control[i].he_gi = 0xFF; mask->control[i].he_ltf = 0xFF; + + eht_cap = ieee80211_get_eht_iftype_cap(sband, wdev->iftype); + if (!eht_cap) + continue; + + mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, + &eht_cap->eht_cap_elem, + wdev->iftype == + NL80211_IFTYPE_STATION); + + eht_build_mcs_mask(info, eht_cap, mcs_nss_len, + mask->control[i].eht_mcs); + + mask->control[i].eht_gi = 0xFF; + mask->control[i].eht_ltf = 0xFF; } /* if no rates are given set it back to the defaults */ @@ -5512,13 +5695,27 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, mask->control[band].he_ltf = nla_get_u8(tb[NL80211_TXRATE_HE_LTF]); + if (tb[NL80211_TXRATE_EHT] && + !eht_set_mcs_mask(info, wdev, sband, + nla_data(tb[NL80211_TXRATE_EHT]), + mask->control[band].eht_mcs)) + return -EINVAL; + + if (tb[NL80211_TXRATE_EHT_GI]) + mask->control[band].eht_gi = + nla_get_u8(tb[NL80211_TXRATE_EHT_GI]); + if (tb[NL80211_TXRATE_EHT_LTF]) + mask->control[band].eht_ltf = + nla_get_u8(tb[NL80211_TXRATE_EHT_LTF]); + if (mask->control[band].legacy == 0) { - /* don't allow empty legacy rates if HT, VHT or HE + /* don't allow empty legacy rates if HT, VHT, HE or EHT * are not even supported. 
*/ if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported || rdev->wiphy.bands[band]->vht_cap.vht_supported || - ieee80211_get_he_iftype_cap(sband, wdev->iftype))) + ieee80211_get_he_iftype_cap(sband, wdev->iftype) || + ieee80211_get_eht_iftype_cap(sband, wdev->iftype))) return -EINVAL; for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) @@ -5533,6 +5730,10 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, if (mask->control[band].he_mcs[i]) goto out; + for (i = 0; i < NL80211_EHT_NSS_MAX; i++) + if (mask->control[band].eht_mcs[i]) + goto out; + /* legacy and mcs rates may not be both empty */ return -EINVAL; } @@ -5546,7 +5747,7 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, enum nl80211_band band, struct cfg80211_bitrate_mask *beacon_rate) { - u32 count_ht, count_vht, count_he, i; + u32 count_ht, count_vht, count_he, count_eht, i; u32 rate = beacon_rate->control[band].legacy; /* Allow only one rate */ @@ -5592,8 +5793,21 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, return -EINVAL; } - if ((count_ht && count_vht && count_he) || - (!rate && !count_ht && !count_vht && !count_he)) + count_eht = 0; + for (i = 0; i < NL80211_EHT_NSS_MAX; i++) { + if (hweight16(beacon_rate->control[band].eht_mcs[i]) > 1) { + return -EINVAL; + } else if (beacon_rate->control[band].eht_mcs[i]) { + count_eht++; + if (count_eht > 1) + return -EINVAL; + } + if (count_eht && rate) + return -EINVAL; + } + + if ((count_ht && count_vht && count_he && count_eht) || + (!rate && !count_ht && !count_vht && !count_he && !count_eht)) return -EINVAL; if (rate && @@ -5613,6 +5827,11 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, NL80211_EXT_FEATURE_BEACON_RATE_HE)) return -EINVAL; + if (count_eht && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_EHT)) + return -EINVAL; + return 0; } -- cgit v1.2.3 From 24185534915b5d926ded098336f47bdcca333aec Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Sun, 17 Aug 2025 21:04:32 +0200 Subject: wifi: nl80211: allow drivers to support subset of NL80211_CMD_SET_BSS The so-called fullmac devices rely on firmware functionality and/or API to change BSS parameters. Today there are limited drivers supporting the nl80211 primitive, but they only handle a subset of the bss parameters passed if any. The mac80211 driver does handle all parameters and stores their configured values. Some of the BSS parameters were already conditional by wiphy->features. For these the wiphy->bss_param_support and wiphy->features fields are silently aligned in wiphy_register(). Maybe better to issue a warning instead when they are misaligned. Signed-off-by: Arend van Spriel Link: https://patch.msgid.link/20250817190435.1495094-2-arend.vanspriel@broadcom.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 29 +++++++++++++++++++++++++++++ include/uapi/linux/nl80211.h | 4 ++++ net/wireless/core.c | 9 +++++++++ net/wireless/nl80211.c | 39 +++++++++++++++++++++++++++++++++++++-- 4 files changed, 79 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7d881aa7e48b..4072a67c9cc9 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2459,6 +2459,29 @@ struct mpath_info { int generation; }; +/** + * enum wiphy_bss_param_flags - bit positions for supported bss parameters. + * + * @WIPHY_BSS_PARAM_CTS_PROT: support changing CTS protection. 
+ * @WIPHY_BSS_PARAM_SHORT_PREAMBLE: support changing short preamble usage. + * @WIPHY_BSS_PARAM_SHORT_SLOT_TIME: support changing short slot time usage. + * @WIPHY_BSS_PARAM_BASIC_RATES: support reconfiguring basic rates. + * @WIPHY_BSS_PARAM_AP_ISOLATE: support changing AP isolation. + * @WIPHY_BSS_PARAM_HT_OPMODE: support changing HT operating mode. + * @WIPHY_BSS_PARAM_P2P_CTWINDOW: support reconfiguring ctwindow. + * @WIPHY_BSS_PARAM_P2P_OPPPS: support changing P2P opportunistic power-save. + */ +enum wiphy_bss_param_flags { + WIPHY_BSS_PARAM_CTS_PROT = BIT(0), + WIPHY_BSS_PARAM_SHORT_PREAMBLE = BIT(1), + WIPHY_BSS_PARAM_SHORT_SLOT_TIME = BIT(2), + WIPHY_BSS_PARAM_BASIC_RATES = BIT(3), + WIPHY_BSS_PARAM_AP_ISOLATE = BIT(4), + WIPHY_BSS_PARAM_HT_OPMODE = BIT(5), + WIPHY_BSS_PARAM_P2P_CTWINDOW = BIT(6), + WIPHY_BSS_PARAM_P2P_OPPPS = BIT(7), +}; + /** * struct bss_parameters - BSS parameters * @@ -5785,6 +5808,11 @@ struct wiphy_radio { * and probe responses. This value should be set if the driver * wishes to limit the number of csa counters. Default (0) means * infinite. + * @bss_param_support: bitmask indicating which bss_parameters as defined in + * &struct bss_parameters the driver can actually handle in the + * .change_bss() callback. The bit positions are defined in &enum + * wiphy_bss_param_flags. + * * @bss_select_support: bitmask indicating the BSS selection criteria supported * by the driver in the .connect() callback. The bit position maps to the * attribute indices defined in &enum nl80211_bss_select_attr. @@ -5970,6 +5998,7 @@ struct wiphy { u8 max_num_csa_counters; + u32 bss_param_support; u32 bss_select_support; u8 nan_supported_bands; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 4f08264bbc8e..6c07100fc01f 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2930,6 +2930,9 @@ enum nl80211_commands { * required alongside this attribute. Refer to * @enum nl80211_s1g_short_beacon_attrs for the attribute definitions. * + * @NL80211_ATTR_BSS_PARAM: nested attribute used with %NL80211_CMD_GET_WIPHY + * which indicates which BSS parameters can be modified. 
+ * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3491,6 +3494,7 @@ enum nl80211_attrs { NL80211_ATTR_S1G_LONG_BEACON_PERIOD, NL80211_ATTR_S1G_SHORT_BEACON, + NL80211_ATTR_BSS_PARAM, /* add attributes here, update the policy in nl80211.c */ diff --git a/net/wireless/core.c b/net/wireless/core.c index a7e2931ffb2e..797f9f2004a6 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1018,6 +1018,15 @@ int wiphy_register(struct wiphy *wiphy) rdev->wiphy.features |= NL80211_FEATURE_SCAN_FLUSH; + if (rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_CTWINDOW) + rdev->wiphy.features |= NL80211_FEATURE_P2P_GO_CTWIN; + else if (rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN) + rdev->wiphy.bss_param_support |= WIPHY_BSS_PARAM_P2P_CTWINDOW; + if (rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS) + rdev->wiphy.features |= NL80211_FEATURE_P2P_GO_OPPPS; + else if (rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS) + rdev->wiphy.bss_param_support |= WIPHY_BSS_PARAM_P2P_OPPPS; + rtnl_lock(); wiphy_lock(&rdev->wiphy); res = device_add(&rdev->wiphy.dev); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 364d83fb9992..153644a04072 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3027,6 +3027,40 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, rdev->wiphy.ext_features)) goto nla_put_failure; + if (rdev->wiphy.bss_param_support) { + struct nlattr *nested; + u32 parsup = rdev->wiphy.bss_param_support; + + nested = nla_nest_start(msg, NL80211_ATTR_BSS_PARAM); + if (!nested) + goto nla_put_failure; + + if ((parsup & WIPHY_BSS_PARAM_CTS_PROT) && + nla_put_flag(msg, NL80211_ATTR_BSS_CTS_PROT)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_SHORT_PREAMBLE) && + nla_put_flag(msg, NL80211_ATTR_BSS_SHORT_PREAMBLE)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_SHORT_SLOT_TIME) && + nla_put_flag(msg, NL80211_ATTR_BSS_SHORT_SLOT_TIME)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_BASIC_RATES) && + nla_put_flag(msg, NL80211_ATTR_BSS_BASIC_RATES)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_AP_ISOLATE) && + nla_put_flag(msg, NL80211_ATTR_AP_ISOLATE)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_HT_OPMODE) && + nla_put_flag(msg, NL80211_ATTR_BSS_HT_OPMODE)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_P2P_CTWINDOW) && + nla_put_flag(msg, NL80211_ATTR_P2P_CTWINDOW)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_P2P_OPPPS) && + nla_put_flag(msg, NL80211_ATTR_P2P_OPPPS)) + goto nla_put_failure; + nla_nest_end(msg, nested); + } if (rdev->wiphy.bss_select_support) { struct nlattr *nested; u32 bss_select_support = rdev->wiphy.bss_select_support; @@ -9048,6 +9082,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct bss_parameters params; + u32 bss_param_support = rdev->wiphy.bss_param_support; memset(¶ms, 0, sizeof(params)); params.link_id = nl80211_link_id_or_invalid(info->attrs); @@ -9087,7 +9122,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) params.p2p_ctwindow = nla_get_u8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]); if (params.p2p_ctwindow != 0 && - !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN)) + !(bss_param_support & WIPHY_BSS_PARAM_P2P_CTWINDOW)) return -EINVAL; } @@ 
-9099,7 +9134,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]); params.p2p_opp_ps = tmp; if (params.p2p_opp_ps && - !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS)) + !(bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS)) return -EINVAL; } -- cgit v1.2.3 From 4f652a390db4246c5d3c51bf25d03ed0e4178fdc Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Sun, 17 Aug 2025 21:04:34 +0200 Subject: wifi: nl80211: strict checking attributes for NL80211_CMD_SET_BSS Assure user-space only modifies attributes for NL80211_CMD_SET_BSS that are supported by the driver. This stricter checking is only done when user-space commits to it by including NL80211_ATTR_BSS_PARAM. Signed-off-by: Arend van Spriel Link: https://patch.msgid.link/20250817190435.1495094-4-arend.vanspriel@broadcom.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 5 ++++- net/wireless/nl80211.c | 52 ++++++++++++++++++++++++++++++++++++++------ 2 files changed, 49 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 6c07100fc01f..aed0b4c5d5e8 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2931,7 +2931,10 @@ enum nl80211_commands { * @enum nl80211_s1g_short_beacon_attrs for the attribute definitions. * * @NL80211_ATTR_BSS_PARAM: nested attribute used with %NL80211_CMD_GET_WIPHY - * which indicates which BSS parameters can be modified. + * which indicates which BSS parameters can be modified. The attribute can + * also be used as flag attribute by user-space in %NL80211_CMD_SET_BSS to + * indicate that it wants strict checking on the BSS parameters to be + * modified. * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 153644a04072..99e2aadc65f7 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -879,6 +879,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_S1G_LONG_BEACON_PERIOD] = NLA_POLICY_MIN(NLA_U8, 2), [NL80211_ATTR_S1G_SHORT_BEACON] = NLA_POLICY_NESTED(nl80211_s1g_short_beacon), + [NL80211_ATTR_BSS_PARAM] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -9083,6 +9084,8 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) struct net_device *dev = info->user_ptr[1]; struct bss_parameters params; u32 bss_param_support = rdev->wiphy.bss_param_support; + u32 changed = 0; + bool strict; memset(¶ms, 0, sizeof(params)); params.link_id = nl80211_link_id_or_invalid(info->attrs); @@ -9095,26 +9098,54 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) params.p2p_ctwindow = -1; params.p2p_opp_ps = -1; - if (info->attrs[NL80211_ATTR_BSS_CTS_PROT]) + strict = nla_get_flag(info->attrs[NL80211_ATTR_BSS_PARAM]); + if (info->attrs[NL80211_ATTR_BSS_CTS_PROT]) { + if (strict && !(bss_param_support & WIPHY_BSS_PARAM_CTS_PROT)) + return -EINVAL; params.use_cts_prot = nla_get_u8(info->attrs[NL80211_ATTR_BSS_CTS_PROT]); - if (info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]) + changed |= WIPHY_BSS_PARAM_CTS_PROT; + } + if (info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]) { + if (strict && + !(bss_param_support & WIPHY_BSS_PARAM_SHORT_PREAMBLE)) + return -EINVAL; params.use_short_preamble = nla_get_u8(info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]); - if 
(info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]) + changed |= WIPHY_BSS_PARAM_SHORT_PREAMBLE; + } + if (info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]) { + if (strict && + !(bss_param_support & WIPHY_BSS_PARAM_SHORT_SLOT_TIME)) + return -EINVAL; params.use_short_slot_time = nla_get_u8(info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]); + changed |= WIPHY_BSS_PARAM_SHORT_SLOT_TIME; + } if (info->attrs[NL80211_ATTR_BSS_BASIC_RATES]) { + if (strict && + !(bss_param_support & WIPHY_BSS_PARAM_BASIC_RATES)) + return -EINVAL; params.basic_rates = nla_data(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]); params.basic_rates_len = nla_len(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]); + changed |= WIPHY_BSS_PARAM_BASIC_RATES; } - if (info->attrs[NL80211_ATTR_AP_ISOLATE]) - params.ap_isolate = !!nla_get_u8(info->attrs[NL80211_ATTR_AP_ISOLATE]); - if (info->attrs[NL80211_ATTR_BSS_HT_OPMODE]) + if (info->attrs[NL80211_ATTR_AP_ISOLATE]) { + if (strict && !(bss_param_support & WIPHY_BSS_PARAM_AP_ISOLATE)) + return -EINVAL; + params.ap_isolate = + !!nla_get_u8(info->attrs[NL80211_ATTR_AP_ISOLATE]); + changed |= WIPHY_BSS_PARAM_AP_ISOLATE; + } + if (info->attrs[NL80211_ATTR_BSS_HT_OPMODE]) { + if (strict && !(bss_param_support & WIPHY_BSS_PARAM_HT_OPMODE)) + return -EINVAL; params.ht_opmode = nla_get_u16(info->attrs[NL80211_ATTR_BSS_HT_OPMODE]); + changed |= WIPHY_BSS_PARAM_HT_OPMODE; + } if (info->attrs[NL80211_ATTR_P2P_CTWINDOW]) { if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) @@ -9124,6 +9155,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) if (params.p2p_ctwindow != 0 && !(bss_param_support & WIPHY_BSS_PARAM_P2P_CTWINDOW)) return -EINVAL; + changed |= WIPHY_BSS_PARAM_P2P_CTWINDOW; } if (info->attrs[NL80211_ATTR_P2P_OPPPS]) { @@ -9132,9 +9164,11 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EINVAL; tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]); + if (tmp && !(bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS)) + return -EINVAL; params.p2p_opp_ps = tmp; if (params.p2p_opp_ps && - !(bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS)) + !(rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS)) return -EINVAL; } @@ -9145,6 +9179,10 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EOPNOTSUPP; + changed &= rdev->wiphy.bss_param_support; + if (!changed) + return 0; + return rdev_change_bss(rdev, dev, ¶ms); } -- cgit v1.2.3 From c8ab5e888bb6721e6e084881e6e24ef2678832c3 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Mon, 1 Sep 2025 09:44:52 +0200 Subject: PCI/AER: Print TLP Log for errors introduced since PCIe r1.1 When reporting an error, the AER driver prints the TLP Header / Prefix Log only for errors enumerated in the AER_LOG_TLP_MASKS macro. The macro was never amended since its introduction in 2006 with commit 6c2b374d7485 ("PCI-Express AER implemetation: AER core and aerdriver"). At the time, PCIe r1.1 was the latest spec revision. Amend the macro with errors defined since then to avoid omitting the TLP Header / Prefix Log for newer errors. The order of the errors in AER_LOG_TLP_MASKS follows PCIe r1.1 sec 6.2.7 rather than 7.10.2, because only the former documents for which errors a TLP Header / Prefix is logged. Retain this order. The section number is still 6.2.7 in today's PCIe r7.0. 
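As a rough standalone illustration of what the amended macro changes (a sketch using bit values from the uapi header modified below; both masks abridged for illustration, not the kernel's full definitions): an error defined after PCIe r1.1, such as an IDE Check failure, previously missed the mask, so its logged TLP header was not printed:

    #include <assert.h>
    #include <stdint.h>

    /* Bit values from include/uapi/linux/pci_regs.h */
    #define PCI_ERR_UNC_MALF_TLP    0x00040000 /* Malformed TLP */
    #define PCI_ERR_UNC_IDE_CHECK   0x10000000 /* IDE Check Failed */

    /* Old r1.1-era mask vs amended mask (abridged) */
    #define OLD_LOG_TLP_MASKS       (PCI_ERR_UNC_MALF_TLP)
    #define NEW_LOG_TLP_MASKS       (PCI_ERR_UNC_MALF_TLP | PCI_ERR_UNC_IDE_CHECK)

    int main(void)
    {
            uint32_t status = PCI_ERR_UNC_IDE_CHECK;

            assert(!(status & OLD_LOG_TLP_MASKS)); /* header log was omitted */
            assert(status & NEW_LOG_TLP_MASKS);    /* header log is now printed */
            return 0;
    }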
For Completion Timeouts, the TLP Header / Prefix is only logged if the Completion Timeout Prefix / Header Log Capable bit is set in the AER Capabilities and Control register. Introduce a tlp_header_logged() helper to check whether the TLP Header / Prefix Log is populated and use it in the two places which currently match against AER_LOG_TLP_MASKS directly. For Uncorrectable Internal Errors, logging of the TLP Header / Prefix is optional per PCIe r7.0 sec 6.2.7. If needed, drivers could indicate through a flag whether devices are capable and tlp_header_logged() could then check that flag. pciutils introduced macros for newer errors with commit 144b0911cc0b ("ls-ecaps: extend decode support for more fields for AER CE and UE status"): https://git.kernel.org/pub/scm/utils/pciutils/pciutils.git/commit/?id=144b0911cc0b Unfortunately some of those macros are overly long: PCI_ERR_UNC_POISONED_TLP_EGRESS PCI_ERR_UNC_DMWR_REQ_EGRESS_BLOCKED PCI_ERR_UNC_IDE_CHECK PCI_ERR_UNC_MISR_IDE_TLP PCI_ERR_UNC_PCRC_CHECK PCI_ERR_UNC_TLP_XLAT_EGRESS_BLOCKED This seems unsuitable for <linux/pci_regs.h>, so shorten to: PCI_ERR_UNC_POISON_BLK PCI_ERR_UNC_DMWR_BLK PCI_ERR_UNC_IDE_CHECK PCI_ERR_UNC_MISR_IDE PCI_ERR_UNC_PCRC_CHECK PCI_ERR_UNC_XLAT_BLK Note that some of the existing macros in <linux/pci_regs.h> do not match exactly with pciutils (e.g. PCI_ERR_UNC_SDES versus PCI_ERR_UNC_SURPDN), so it does not seem mandatory for them to be identical. Signed-off-by: Lukas Wunner Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/5f707caf1260bd8f15012bb032f7da9a9b898aba.1756712066.git.lukas@wunner.de --- drivers/pci/pcie/aer.c | 30 +++++++++++++++++++++++++++--- include/uapi/linux/pci_regs.h | 7 +++++++ 2 files changed, 34 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 15ed541d2fbe..62c74b5f99ae 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -96,11 +96,21 @@ struct aer_info { }; #define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \ + PCI_ERR_UNC_POISON_BLK | \ PCI_ERR_UNC_ECRC| \ PCI_ERR_UNC_UNSUP| \ PCI_ERR_UNC_COMP_ABORT| \ PCI_ERR_UNC_UNX_COMP| \ - PCI_ERR_UNC_MALF_TLP) + PCI_ERR_UNC_ACSV | \ + PCI_ERR_UNC_MCBTLP | \ + PCI_ERR_UNC_ATOMEG | \ + PCI_ERR_UNC_DMWR_BLK | \ + PCI_ERR_UNC_XLAT_BLK | \ + PCI_ERR_UNC_TLPPRE | \ + PCI_ERR_UNC_MALF_TLP | \ + PCI_ERR_UNC_IDE_CHECK | \ + PCI_ERR_UNC_MISR_IDE | \ + PCI_ERR_UNC_PCRC_CHECK) #define SYSTEM_ERROR_INTR_ON_MESG_MASK (PCI_EXP_RTCTL_SECEE| \ PCI_EXP_RTCTL_SENFEE| \ @@ -796,6 +806,20 @@ static int aer_ratelimit(struct pci_dev *dev, unsigned int severity) } } +static bool tlp_header_logged(u32 status, u32 capctl) +{ + /* Errors for which a header is always logged (PCIe r7.0 sec 6.2.7) */ + if (status & AER_LOG_TLP_MASKS) + return true; + + /* Completion Timeout header is only logged on capable devices */ + if (status & PCI_ERR_UNC_COMP_TIME && + capctl & PCI_ERR_CAP_COMP_TIME_LOG) + return true; + + return false; +} + static void __aer_print_error(struct pci_dev *dev, struct aer_err_info *info) { const char **strings; @@ -910,7 +934,7 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, status = aer->uncor_status; mask = aer->uncor_mask; info.level = KERN_ERR; - tlp_header_valid = status & AER_LOG_TLP_MASKS; + tlp_header_valid = tlp_header_logged(status, aer->cap_control); } info.status = status; @@ -1401,7 +1425,7 @@ int aer_get_device_error_info(struct aer_err_info *info, int i) pci_read_config_dword(dev, aer + PCI_ERR_CAP, &aercc); info->first_error = PCI_ERR_CAP_FEP(aercc); - if
(info->status & AER_LOG_TLP_MASKS) { + if (tlp_header_logged(info->status, aercc)) { info->tlp_header_valid = 1; pcie_read_tlp_log(dev, aer + PCI_ERR_HEADER_LOG, aer + PCI_ERR_PREFIX_LOG, diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index f5b17745de60..ae1f52e8d515 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -776,6 +776,12 @@ #define PCI_ERR_UNC_MCBTLP 0x00800000 /* MC blocked TLP */ #define PCI_ERR_UNC_ATOMEG 0x01000000 /* Atomic egress blocked */ #define PCI_ERR_UNC_TLPPRE 0x02000000 /* TLP prefix blocked */ +#define PCI_ERR_UNC_POISON_BLK 0x04000000 /* Poisoned TLP Egress Blocked */ +#define PCI_ERR_UNC_DMWR_BLK 0x08000000 /* DMWr Request Egress Blocked */ +#define PCI_ERR_UNC_IDE_CHECK 0x10000000 /* IDE Check Failed */ +#define PCI_ERR_UNC_MISR_IDE 0x20000000 /* Misrouted IDE TLP */ +#define PCI_ERR_UNC_PCRC_CHECK 0x40000000 /* PCRC Check Failed */ +#define PCI_ERR_UNC_XLAT_BLK 0x80000000 /* TLP Translation Egress Blocked */ #define PCI_ERR_UNCOR_MASK 0x08 /* Uncorrectable Error Mask */ /* Same bits as above */ #define PCI_ERR_UNCOR_SEVER 0x0c /* Uncorrectable Error Severity */ @@ -798,6 +804,7 @@ #define PCI_ERR_CAP_ECRC_CHKC 0x00000080 /* ECRC Check Capable */ #define PCI_ERR_CAP_ECRC_CHKE 0x00000100 /* ECRC Check Enable */ #define PCI_ERR_CAP_PREFIX_LOG_PRESENT 0x00000800 /* TLP Prefix Log Present */ +#define PCI_ERR_CAP_COMP_TIME_LOG 0x00001000 /* Completion Timeout Prefix/Header Log Capable */ #define PCI_ERR_CAP_TLP_LOG_FLIT 0x00040000 /* TLP was logged in Flit Mode */ #define PCI_ERR_CAP_TLP_LOG_SIZE 0x00f80000 /* Logged TLP Size (only in Flit mode) */ #define PCI_ERR_HEADER_LOG 0x1c /* Header Log Register (16 bytes) */ -- cgit v1.2.3 From 0a0fdb98d16e334e259352893462030f15fb887f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 26 Aug 2025 17:08:19 +0200 Subject: fuse: remove FUSE_NOTIFY_CODE_MAX from <uapi/linux/fuse.h> Constants that change value from version to version have no place in an interface definition. Hopefully this won't break anything. Reviewed-by: Joanne Koong Signed-off-by: Miklos Szeredi --- include/uapi/linux/fuse.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 6b9fb8b08768..30bf0846547f 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -680,7 +680,6 @@ enum fuse_notify_code { FUSE_NOTIFY_DELETE = 6, FUSE_NOTIFY_RESEND = 7, FUSE_NOTIFY_INC_EPOCH = 8, - FUSE_NOTIFY_CODE_MAX, }; /* The read buffer is required to be at least 8k, but may be much larger */ -- cgit v1.2.3 From 3f29d59e92a96d843c2ff10ebfed92ac26878658 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 2 Sep 2025 10:22:06 +0200 Subject: fuse: add prune notification Some fuse servers need to prune their caches, which can only be done if the kernel's own dentry/inode caches are pruned first to avoid dangling references. Add FUSE_NOTIFY_PRUNE, which takes an array of node IDs to try and get rid of. Inodes with active references are skipped. A similar functionality is already provided by FUSE_NOTIFY_INVAL_ENTRY with the FUSE_EXPIRE_ONLY flag.
Differences in the interface are FUSE_NOTIFY_INVAL_ENTRY: - can only prune one dentry - dentry is determined by parent ID and name - if inode has multiple aliases (cached hard links), then they would have to be invalidated individually to be able to get rid of the inode FUSE_NOTIFY_PRUNE: - can prune multiple inodes - inodes determined by their node ID - aliases are taken care of automatically Reviewed-by: Joanne Koong Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 39 +++++++++++++++++++++++++++++++++++++++ fs/fuse/fuse_i.h | 6 ++++++ fs/fuse/inode.c | 11 +++++++++++ include/uapi/linux/fuse.h | 8 ++++++++ 4 files changed, 64 insertions(+) (limited to 'include/uapi/linux') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1258acee9704..4229b38546bb 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2034,6 +2034,42 @@ static int fuse_notify_inc_epoch(struct fuse_conn *fc) return 0; } +static int fuse_notify_prune(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_prune_out outarg; + const unsigned int batch = 512; + u64 *nodeids __free(kfree) = kmalloc(sizeof(u64) * batch, GFP_KERNEL); + unsigned int num, i; + int err; + + if (!nodeids) + return -ENOMEM; + + if (size < sizeof(outarg)) + return -EINVAL; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + return err; + + if (size - sizeof(outarg) != outarg.count * sizeof(u64)) + return -EINVAL; + + for (; outarg.count; outarg.count -= num) { + num = min(batch, outarg.count); + err = fuse_copy_one(cs, nodeids, num * sizeof(u64)); + if (err) + return err; + + scoped_guard(rwsem_read, &fc->killsb) { + for (i = 0; i < num; i++) + fuse_try_prune_one_inode(fc, nodeids[i]); + } + } + return 0; +} + static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { @@ -2065,6 +2101,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_INC_EPOCH: return fuse_notify_inc_epoch(fc); + case FUSE_NOTIFY_PRUNE: + return fuse_notify_prune(fc, size, cs); + default: return -EINVAL; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 233c6111f768..fb6604120b53 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1413,6 +1413,12 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, u64 child_nodeid, struct qstr *name, u32 flags); +/* + * Try to prune this inode. If neither the inode itself nor dentries associated + * with this inode have any external reference, then the inode can be freed. 
+ */ +void fuse_try_prune_one_inode(struct fuse_conn *fc, u64 nodeid); + int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 5b7897bf7e45..a4d361b34d06 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -585,6 +585,17 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, return 0; } +void fuse_try_prune_one_inode(struct fuse_conn *fc, u64 nodeid) +{ + struct inode *inode; + + inode = fuse_ilookup(fc, nodeid, NULL); + if (!inode) + return; + d_prune_aliases(inode); + iput(inode); +} + bool fuse_lock_inode(struct inode *inode) { bool locked = false; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 30bf0846547f..c13e1f9a2f12 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -239,6 +239,7 @@ * 7.45 * - add FUSE_COPY_FILE_RANGE_64 * - add struct fuse_copy_file_range_out + * - add FUSE_NOTIFY_PRUNE */ #ifndef _LINUX_FUSE_H @@ -680,6 +681,7 @@ enum fuse_notify_code { FUSE_NOTIFY_DELETE = 6, FUSE_NOTIFY_RESEND = 7, FUSE_NOTIFY_INC_EPOCH = 8, + FUSE_NOTIFY_PRUNE = 9, }; /* The read buffer is required to be at least 8k, but may be much larger */ @@ -1118,6 +1120,12 @@ struct fuse_notify_retrieve_in { uint64_t dummy4; }; +struct fuse_notify_prune_out { + uint32_t count; + uint32_t padding; + uint64_t spare; +}; + struct fuse_backing_map { int32_t fd; uint32_t flags; -- cgit v1.2.3 From c265ae75f900cea4e415230a77b5d152377627dd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 8 Sep 2025 00:03:00 +0100 Subject: io_uring: introduce io_uring querying There are many parameters users might want to query about io_uring like available request types or the ring sizes. This patch introduces an interface for such slow path queries. It was written with several requirements in mind: - Can be used with or without an io_uring instance. Asking for supported setup flags before creating an instance as well as querying info about an already created ring are valid use cases. - Should be moderately fast. For example, users might use it to periodically retrieve ring attributes at runtime. As a consequence, it should be able to query multiple attributes in a single syscall. - Backward and forward compatible. - Should be reasonably easy to use. - Reduce the kernel code size for introducing new query types. It's implemented as a new registration opcode IORING_REGISTER_QUERY. The user passes one or more query structures linked together, each represented by struct io_uring_query_hdr. The header stores common control fields needed for processing and points to query type specific information. The header contains: - The query type - The result field, which on return contains the error code for the query - Pointer to the query type specific information - The size of the query structure. The kernel will only populate up to the size, which helps with backward compatibility. The kernel can also reduce the size, so if the current kernel is older than the interface the user tries to use, it'll get only the supported bits. - next_entry field is used to chain multiple queries. Apart from common registration syscall failures, it can only immediately return an error code in case when the headers are incorrect or any other addresses are invalid. That usually means that the userspace doesn't use the API right and should be corrected. All query type specific errors are returned in the header's result field. As an example, the patch adds a single query type for now, i.e.
IO_URING_QUERY_OPCODES, which tells what register / request / etc. opcodes are supported, but there are plans to extend it.

Note: there is a request probing interface via IORING_REGISTER_PROBE, but it's a mess. It requires the user to create a ring first, it only works for requests, and requires dynamic allocations.

Reviewed-by: Martin K. Petersen Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 3 ++ include/uapi/linux/io_uring/query.h | 41 ++++++++++++++++ io_uring/Makefile | 2 +- io_uring/query.c | 93 +++++++++++++++++++++++++++++++++++++ io_uring/query.h | 9 ++++ io_uring/register.c | 6 +++ 6 files changed, 153 insertions(+), 1 deletion(-) create mode 100644 include/uapi/linux/io_uring/query.h create mode 100644 io_uring/query.c create mode 100644 io_uring/query.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 04ebff33d0e6..1ce17c535944 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -686,6 +686,9 @@ enum io_uring_register_op { IORING_REGISTER_MEM_REGION = 34, + /* query various aspects of io_uring, see linux/io_uring/query.h */ + IORING_REGISTER_QUERY = 35, + /* this goes last */ IORING_REGISTER_LAST, diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h new file mode 100644 index 000000000000..5d754322a27c --- /dev/null +++ b/include/uapi/linux/io_uring/query.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */ +/* + * Header file for the io_uring query interface. + */ +#ifndef LINUX_IO_URING_QUERY_H +#define LINUX_IO_URING_QUERY_H + +#include + +struct io_uring_query_hdr { + __u64 next_entry; + __u64 query_data; + __u32 query_op; + __u32 size; + __s32 result; + __u32 __resv[3]; +}; + +enum { + IO_URING_QUERY_OPCODES = 0, + + __IO_URING_QUERY_MAX, }; + +/* Doesn't require a ring */ +struct io_uring_query_opcode { + /* The number of supported IORING_OP_* opcodes */ + __u32 nr_request_opcodes; + /* The number of supported IORING_[UN]REGISTER_* opcodes */ + __u32 nr_register_opcodes; + /* Bitmask of all supported IORING_FEAT_* flags */ + __u64 feature_flags; + /* Bitmask of all supported IORING_SETUP_* flags */ + __u64 ring_setup_flags; + /* Bitmask of all supported IORING_ENTER_* flags */ + __u64 enter_flags; + /* Bitmask of all supported IOSQE_* flags */ + __u64 sqe_flags; +}; + +#endif diff --git a/io_uring/Makefile b/io_uring/Makefile index b3f1bd492804..bc4e4a3fa0a5 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ sync.o msg_ring.o advise.o openclose.o \ statx.o timeout.o cancel.o \ waitid.o register.o truncate.o \ - memmap.o alloc_cache.o + memmap.o alloc_cache.o query.o obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o diff --git a/io_uring/query.c b/io_uring/query.c new file mode 100644 index 000000000000..9eed0f371956 --- /dev/null +++ b/io_uring/query.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "linux/io_uring/query.h" + +#include "query.h" + #include "io_uring.h" + +#define IO_MAX_QUERY_SIZE (sizeof(struct io_uring_query_opcode)) + +static ssize_t io_query_ops(void *data) +{ + struct io_uring_query_opcode *e = data; + + BUILD_BUG_ON(sizeof(*e) > IO_MAX_QUERY_SIZE); + + e->nr_request_opcodes = IORING_OP_LAST; + e->nr_register_opcodes = IORING_REGISTER_LAST; + e->feature_flags =
IORING_FEAT_FLAGS; + e->ring_setup_flags = IORING_SETUP_FLAGS; + e->enter_flags = IORING_ENTER_FLAGS; + e->sqe_flags = SQE_VALID_FLAGS; + return sizeof(*e); +} + +static int io_handle_query_entry(struct io_ring_ctx *ctx, + void *data, void __user *uhdr, + u64 *next_entry) +{ + struct io_uring_query_hdr hdr; + size_t usize, res_size = 0; + ssize_t ret = -EINVAL; + void __user *udata; + + if (copy_from_user(&hdr, uhdr, sizeof(hdr))) + return -EFAULT; + usize = hdr.size; + hdr.size = min(hdr.size, IO_MAX_QUERY_SIZE); + udata = u64_to_user_ptr(hdr.query_data); + + if (hdr.query_op >= __IO_URING_QUERY_MAX) { + ret = -EOPNOTSUPP; + goto out; + } + if (!mem_is_zero(hdr.__resv, sizeof(hdr.__resv)) || hdr.result || !hdr.size) + goto out; + if (copy_from_user(data, udata, hdr.size)) + return -EFAULT; + + switch (hdr.query_op) { + case IO_URING_QUERY_OPCODES: + ret = io_query_ops(data); + break; + } + + if (ret >= 0) { + if (WARN_ON_ONCE(ret > IO_MAX_QUERY_SIZE)) + return -EFAULT; + res_size = ret; + ret = 0; + } +out: + hdr.result = ret; + hdr.size = min_t(size_t, usize, res_size); + + if (copy_struct_to_user(udata, usize, data, hdr.size, NULL)) + return -EFAULT; + if (copy_to_user(uhdr, &hdr, sizeof(hdr))) + return -EFAULT; + *next_entry = hdr.next_entry; + return 0; +} + +int io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) +{ + char entry_buffer[IO_MAX_QUERY_SIZE]; + void __user *uhdr = arg; + int ret; + + memset(entry_buffer, 0, sizeof(entry_buffer)); + + if (nr_args) + return -EINVAL; + + while (uhdr) { + u64 next_hdr; + + ret = io_handle_query_entry(ctx, entry_buffer, uhdr, &next_hdr); + if (ret) + return ret; + uhdr = u64_to_user_ptr(next_hdr); + } + return 0; +} diff --git a/io_uring/query.h b/io_uring/query.h new file mode 100644 index 000000000000..171d47ccaaba --- /dev/null +++ b/io_uring/query.h @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IORING_QUERY_H +#define IORING_QUERY_H + +#include + +int io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args); + +#endif diff --git a/io_uring/register.c b/io_uring/register.c index 50ce87928be0..9c31a8afb83d 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -31,6 +31,7 @@ #include "msg_ring.h" #include "memmap.h" #include "zcrx.h" +#include "query.h" #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) @@ -832,6 +833,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_mem_region(ctx, arg); break; + case IORING_REGISTER_QUERY: + ret = io_query(ctx, arg, nr_args); + break; default: ret = -EINVAL; break; @@ -901,6 +905,8 @@ static int io_uring_register_blind(unsigned int opcode, void __user *arg, switch (opcode) { case IORING_REGISTER_SEND_MSG_RING: return io_uring_register_send_msg_ring(arg, nr_args); + case IORING_REGISTER_QUERY: + return io_query(NULL, arg, nr_args); } return -EINVAL; } -- cgit v1.2.3 From faf23f54d366467bb449a7b2c39b382db9f92e80 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Tue, 12 Aug 2025 17:17:06 +0300 Subject: ptp: Add ioctl commands to expose raw cycle counter values Introduce two new ioctl commands, PTP_SYS_OFFSET_PRECISE_CYCLES and PTP_SYS_OFFSET_EXTENDED_CYCLES, to allow user space to access the raw free-running cycle counter from PTP devices. These ioctls are variants of the existing PRECISE and EXTENDED offset queries, but instead of returning the device time, they return the raw cycle counter value.
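As a usage sketch (illustrative only, not from the patch): the following user-space program reads the raw counter via the new ioctl. It assumes a PHC at /dev/ptp0 whose driver implements getcrosscycles, and how the cycle count is packed into the sec/nsec fields is device specific.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ptp_clock.h>

int main(void)
{
	struct ptp_sys_offset_precise off;
	int fd = open("/dev/ptp0", O_RDWR);	/* hypothetical PHC node */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&off, 0, sizeof(off));
	/* Same struct as PTP_SYS_OFFSET_PRECISE; 'device' now carries raw cycles */
	if (ioctl(fd, PTP_SYS_OFFSET_PRECISE_CYCLES, &off) < 0) {
		/* EOPNOTSUPP if the driver lacks getcrosscycles */
		perror("PTP_SYS_OFFSET_PRECISE_CYCLES");
		close(fd);
		return 1;
	}
	printf("cycles %lld.%09u at sys time %lld.%09u\n",
	       (long long)off.device.sec, off.device.nsec,
	       (long long)off.sys_realtime.sec, off.sys_realtime.nsec);
	close(fd);
	return 0;
}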
Signed-off-by: Carolina Jubran Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Acked-by: Richard Cochran Link: https://patch.msgid.link/1755008228-88881-2-git-send-email-tariqt@nvidia.com Signed-off-by: Paolo Abeni --- drivers/ptp/ptp_chardev.c | 34 ++++++++++++++++++++++++++-------- include/uapi/linux/ptp_clock.h | 4 ++++ 2 files changed, 30 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 4ca5a464a46a..e9719f365aab 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -285,17 +285,21 @@ static long ptp_enable_pps(struct ptp_clock *ptp, bool enable) return ops->enable(ops, &req, enable); } -static long ptp_sys_offset_precise(struct ptp_clock *ptp, void __user *arg) +typedef int (*ptp_crosststamp_fn)(struct ptp_clock_info *, + struct system_device_crosststamp *); + +static long ptp_sys_offset_precise(struct ptp_clock *ptp, void __user *arg, + ptp_crosststamp_fn crosststamp_fn) { struct ptp_sys_offset_precise precise_offset; struct system_device_crosststamp xtstamp; struct timespec64 ts; int err; - if (!ptp->info->getcrosststamp) + if (!crosststamp_fn) return -EOPNOTSUPP; - err = ptp->info->getcrosststamp(ptp->info, &xtstamp); + err = crosststamp_fn(ptp->info, &xtstamp); if (err) return err; @@ -313,12 +317,17 @@ static long ptp_sys_offset_precise(struct ptp_clock *ptp, void __user *arg) return copy_to_user(arg, &precise_offset, sizeof(precise_offset)) ? -EFAULT : 0; } -static long ptp_sys_offset_extended(struct ptp_clock *ptp, void __user *arg) +typedef int (*ptp_gettimex_fn)(struct ptp_clock_info *, + struct timespec64 *, + struct ptp_system_timestamp *); + +static long ptp_sys_offset_extended(struct ptp_clock *ptp, void __user *arg, + ptp_gettimex_fn gettimex_fn) { struct ptp_sys_offset_extended *extoff __free(kfree) = NULL; struct ptp_system_timestamp sts; - if (!ptp->info->gettimex64) + if (!gettimex_fn) return -EOPNOTSUPP; extoff = memdup_user(arg, sizeof(*extoff)); @@ -346,7 +355,7 @@ static long ptp_sys_offset_extended(struct ptp_clock *ptp, void __user *arg) struct timespec64 ts; int err; - err = ptp->info->gettimex64(ptp->info, &ts, &sts); + err = gettimex_fn(ptp->info, &ts, &sts); if (err) return err; @@ -497,11 +506,13 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, case PTP_SYS_OFFSET_PRECISE: case PTP_SYS_OFFSET_PRECISE2: - return ptp_sys_offset_precise(ptp, argptr); + return ptp_sys_offset_precise(ptp, argptr, + ptp->info->getcrosststamp); case PTP_SYS_OFFSET_EXTENDED: case PTP_SYS_OFFSET_EXTENDED2: - return ptp_sys_offset_extended(ptp, argptr); + return ptp_sys_offset_extended(ptp, argptr, + ptp->info->gettimex64); case PTP_SYS_OFFSET: case PTP_SYS_OFFSET2: @@ -523,6 +534,13 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, case PTP_MASK_EN_SINGLE: return ptp_mask_en_single(pccontext->private_clkdata, argptr); + case PTP_SYS_OFFSET_PRECISE_CYCLES: + return ptp_sys_offset_precise(ptp, argptr, + ptp->info->getcrosscycles); + + case PTP_SYS_OFFSET_EXTENDED_CYCLES: + return ptp_sys_offset_extended(ptp, argptr, + ptp->info->getcyclesx64); default: return -ENOTTY; } diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index 18eefa6d93d6..65f187b5f0d0 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -245,6 +245,10 @@ struct ptp_pin_desc { _IOWR(PTP_CLK_MAGIC, 18, struct ptp_sys_offset_extended) #define PTP_MASK_CLEAR_ALL _IO(PTP_CLK_MAGIC, 19) #define 
PTP_MASK_EN_SINGLE _IOW(PTP_CLK_MAGIC, 20, unsigned int) +#define PTP_SYS_OFFSET_PRECISE_CYCLES \ + _IOWR(PTP_CLK_MAGIC, 21, struct ptp_sys_offset_precise) +#define PTP_SYS_OFFSET_EXTENDED_CYCLES \ + _IOWR(PTP_CLK_MAGIC, 22, struct ptp_sys_offset_extended) struct ptp_extts_event { struct ptp_clock_time t; /* Time event occurred. */ -- cgit v1.2.3 From 6b6dc81ee7e8ca87c71a533e1d69cf96a4f1e986 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 2 Sep 2025 06:44:59 +0000 Subject: bonding: add support for per-port LACP actor priority Introduce a new netlink attribute 'actor_port_prio' to allow setting the LACP actor port priority on a per-slave basis. This extends the existing bonding infrastructure to support more granular control over LACP negotiations. The priority value is embedded in LACPDU packets and will be used by subsequent patches to influence aggregator selection policies. Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250902064501.360822-2-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- Documentation/networking/bonding.rst | 9 +++++++++ drivers/net/bonding/bond_3ad.c | 4 ++++ drivers/net/bonding/bond_netlink.c | 16 ++++++++++++++++ drivers/net/bonding/bond_options.c | 36 ++++++++++++++++++++++++++++++++++++ include/net/bond_3ad.h | 1 + include/net/bond_options.h | 1 + include/uapi/linux/if_link.h | 1 + 7 files changed, 68 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst index a2b42ae719d2..345b60992446 100644 --- a/Documentation/networking/bonding.rst +++ b/Documentation/networking/bonding.rst @@ -193,6 +193,15 @@ ad_actor_sys_prio This parameter has effect only in 802.3ad mode and is available through SysFs interface. +actor_port_prio + + In an AD system, this specifies the port priority. The allowed range + is 1 - 65535. If the value is not specified, it takes 255 as the + default value. + + This parameter has effect only in 802.3ad mode and is available through + netlink interface. + ad_actor_system In an AD system, this specifies the mac-address for the actor in diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 4edc8e6b6b64..67ca78923b04 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -436,6 +436,7 @@ static void __ad_actor_update_port(struct port *port) port->actor_system = BOND_AD_INFO(bond).system.sys_mac_addr; port->actor_system_priority = BOND_AD_INFO(bond).system.sys_priority; + port->actor_port_priority = SLAVE_AD_INFO(port->slave)->port_priority; } /* Conversions */ @@ -2209,6 +2210,9 @@ void bond_3ad_bind_slave(struct slave *slave) ad_initialize_port(port, &bond->params); + /* Port priority is initialized. 
Update it to slave's ad info */ + SLAVE_AD_INFO(slave)->port_priority = port->actor_port_priority; + port->slave = slave; port->actor_port_number = SLAVE_AD_INFO(slave)->id; /* key is determined according to the link speed, duplex and diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index e573b34a1bbc..ba71d95a82d2 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -28,6 +28,7 @@ static size_t bond_get_slave_size(const struct net_device *bond_dev, nla_total_size(sizeof(u8)) + /* IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE */ nla_total_size(sizeof(s32)) + /* IFLA_BOND_SLAVE_PRIO */ + nla_total_size(sizeof(u16)) + /* IFLA_BOND_SLAVE_ACTOR_PORT_PRIO */ 0; } @@ -77,6 +78,10 @@ static int bond_fill_slave_info(struct sk_buff *skb, ad_port->partner_oper.port_state)) goto nla_put_failure; } + + if (nla_put_u16(skb, IFLA_BOND_SLAVE_ACTOR_PORT_PRIO, + SLAVE_AD_INFO(slave)->port_priority)) + goto nla_put_failure; } return 0; @@ -130,6 +135,7 @@ static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = { static const struct nla_policy bond_slave_policy[IFLA_BOND_SLAVE_MAX + 1] = { [IFLA_BOND_SLAVE_QUEUE_ID] = { .type = NLA_U16 }, [IFLA_BOND_SLAVE_PRIO] = { .type = NLA_S32 }, + [IFLA_BOND_SLAVE_ACTOR_PORT_PRIO] = { .type = NLA_U16 }, }; static int bond_validate(struct nlattr *tb[], struct nlattr *data[], @@ -180,6 +186,16 @@ static int bond_slave_changelink(struct net_device *bond_dev, return err; } + if (data[IFLA_BOND_SLAVE_ACTOR_PORT_PRIO]) { + u16 ad_prio = nla_get_u16(data[IFLA_BOND_SLAVE_ACTOR_PORT_PRIO]); + + bond_opt_slave_initval(&newval, &slave_dev, ad_prio); + err = __bond_opt_set(bond, BOND_OPT_ACTOR_PORT_PRIO, &newval, + data[IFLA_BOND_SLAVE_ACTOR_PORT_PRIO], extack); + if (err) + return err; + } + return 0; } diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index c0a5eb8766b5..897022272334 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -79,6 +79,8 @@ static int bond_option_tlb_dynamic_lb_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_ad_actor_sys_prio_set(struct bonding *bond, const struct bond_opt_value *newval); +static int bond_option_actor_port_prio_set(struct bonding *bond, + const struct bond_opt_value *newval); static int bond_option_ad_actor_system_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_ad_user_port_key_set(struct bonding *bond, @@ -222,6 +224,13 @@ static const struct bond_opt_value bond_ad_actor_sys_prio_tbl[] = { { NULL, -1, 0}, }; +static const struct bond_opt_value bond_actor_port_prio_tbl[] = { + { "minval", 0, BOND_VALFLAG_MIN}, + { "maxval", 65535, BOND_VALFLAG_MAX}, + { "default", 255, BOND_VALFLAG_DEFAULT}, + { NULL, -1, 0}, +}; + static const struct bond_opt_value bond_ad_user_port_key_tbl[] = { { "minval", 0, BOND_VALFLAG_MIN | BOND_VALFLAG_DEFAULT}, { "maxval", 1023, BOND_VALFLAG_MAX}, @@ -483,6 +492,13 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = { .values = bond_ad_actor_sys_prio_tbl, .set = bond_option_ad_actor_sys_prio_set, }, + [BOND_OPT_ACTOR_PORT_PRIO] = { + .id = BOND_OPT_ACTOR_PORT_PRIO, + .name = "actor_port_prio", + .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), + .values = bond_actor_port_prio_tbl, + .set = bond_option_actor_port_prio_set, + }, [BOND_OPT_AD_ACTOR_SYSTEM] = { .id = BOND_OPT_AD_ACTOR_SYSTEM, 
.name = "ad_actor_system", @@ -1812,6 +1828,26 @@ static int bond_option_ad_actor_sys_prio_set(struct bonding *bond, return 0; } +static int bond_option_actor_port_prio_set(struct bonding *bond, + const struct bond_opt_value *newval) +{ + struct slave *slave; + + slave = bond_slave_get_rtnl(newval->slave_dev); + if (!slave) { + netdev_dbg(bond->dev, "%s called on NULL slave\n", __func__); + return -ENODEV; + } + + netdev_dbg(newval->slave_dev, "Setting actor_port_prio to %llu\n", + newval->value); + + SLAVE_AD_INFO(slave)->port_priority = newval->value; + bond_3ad_update_ad_actor_settings(bond); + + return 0; +} + static int bond_option_ad_actor_system_set(struct bonding *bond, const struct bond_opt_value *newval) { diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index dba369a2cf27..e9188646e22e 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -274,6 +274,7 @@ struct ad_slave_info { struct port port; /* 802.3ad port structure */ struct bond_3ad_stats stats; u16 id; + u16 port_priority; }; static inline const char *bond_3ad_churn_desc(churn_state_t state) diff --git a/include/net/bond_options.h b/include/net/bond_options.h index 022b122a9fb6..e6eedf23aea1 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -78,6 +78,7 @@ enum { BOND_OPT_PRIO, BOND_OPT_COUPLED_CONTROL, BOND_OPT_BROADCAST_NEIGH, + BOND_OPT_ACTOR_PORT_PRIO, BOND_OPT_LAST }; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 784ace3a519c..45f56c9f95d9 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1564,6 +1564,7 @@ enum { IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE, IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, IFLA_BOND_SLAVE_PRIO, + IFLA_BOND_SLAVE_ACTOR_PORT_PRIO, __IFLA_BOND_SLAVE_MAX, }; -- cgit v1.2.3 From ce4c356d760f6fb22d69f0c5091542891ba6f394 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 24 Jun 2025 08:36:16 +0200 Subject: media: update Hans Verkuil's email address Replace hansverk@cisco.com by hverkuil@kernel.org. 
Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec/core/cec-core.c | 2 +- drivers/media/cec/platform/cec-gpio/cec-gpio.c | 2 +- drivers/media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c | 2 +- drivers/media/i2c/adv7604.c | 2 +- drivers/media/i2c/adv7842.c | 2 +- drivers/media/i2c/ths7303.c | 2 +- drivers/media/mc/mc-request.c | 2 +- drivers/media/pci/cobalt/cobalt-driver.c | 2 +- drivers/media/radio/radio-aimslab.c | 2 +- drivers/media/radio/radio-gemtek.c | 2 +- drivers/media/radio/radio-isa.c | 2 +- drivers/media/radio/radio-isa.h | 2 +- drivers/media/radio/radio-miropcm20.c | 2 +- drivers/media/radio/radio-rtrack2.c | 2 +- drivers/media/radio/radio-terratec.c | 2 +- drivers/media/radio/radio-zoltrix.c | 2 +- drivers/media/test-drivers/vicodec/vicodec-core.c | 2 +- include/media/i2c/ths7303.h | 2 +- include/media/media-request.h | 2 +- include/uapi/linux/v4l2-dv-timings.h | 2 +- 20 files changed, 20 insertions(+), 20 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/media/cec/core/cec-core.c b/drivers/media/cec/core/cec-core.c index e10bd588a586..d7259599029f 100644 --- a/drivers/media/cec/core/cec-core.c +++ b/drivers/media/cec/core/cec-core.c @@ -439,6 +439,6 @@ static void __exit cec_devnode_exit(void) subsys_initcall(cec_devnode_init); module_exit(cec_devnode_exit) -MODULE_AUTHOR("Hans Verkuil "); +MODULE_AUTHOR("Hans Verkuil "); MODULE_DESCRIPTION("Device node registration for cec drivers"); MODULE_LICENSE("GPL"); diff --git a/drivers/media/cec/platform/cec-gpio/cec-gpio.c b/drivers/media/cec/platform/cec-gpio/cec-gpio.c index 3c27789d8657..842555ed42c7 100644 --- a/drivers/media/cec/platform/cec-gpio/cec-gpio.c +++ b/drivers/media/cec/platform/cec-gpio/cec-gpio.c @@ -291,6 +291,6 @@ static struct platform_driver cec_gpio_pdrv = { module_platform_driver(cec_gpio_pdrv); -MODULE_AUTHOR("Hans Verkuil "); +MODULE_AUTHOR("Hans Verkuil "); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("CEC GPIO driver"); diff --git a/drivers/media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c b/drivers/media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c index c476e8d1279e..e2eff17952ab 100644 --- a/drivers/media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c +++ b/drivers/media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c @@ -28,7 +28,7 @@ #include "extron-da-hd-4k-plus.h" -MODULE_AUTHOR("Hans Verkuil "); +MODULE_AUTHOR("Hans Verkuil "); MODULE_DESCRIPTION("Extron DA HD 4K PLUS HDMI CEC driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/media/i2c/adv7604.c b/drivers/media/i2c/adv7604.c index afed38596362..8fe7c2f72883 100644 --- a/drivers/media/i2c/adv7604.c +++ b/drivers/media/i2c/adv7604.c @@ -42,7 +42,7 @@ module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "debug level (0-2)"); MODULE_DESCRIPTION("Analog Devices ADV7604/10/11/12 video decoder driver"); -MODULE_AUTHOR("Hans Verkuil "); +MODULE_AUTHOR("Hans Verkuil "); MODULE_AUTHOR("Mats Randgaard "); MODULE_LICENSE("GPL"); diff --git a/drivers/media/i2c/adv7842.c b/drivers/media/i2c/adv7842.c index 5545cd23e113..9780082db841 100644 --- a/drivers/media/i2c/adv7842.c +++ b/drivers/media/i2c/adv7842.c @@ -38,7 +38,7 @@ module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "debug level (0-2)"); MODULE_DESCRIPTION("Analog Devices ADV7842 video decoder driver"); -MODULE_AUTHOR("Hans Verkuil "); +MODULE_AUTHOR("Hans Verkuil "); MODULE_AUTHOR("Martin Bugge "); MODULE_LICENSE("GPL"); diff --git a/drivers/media/i2c/ths7303.c b/drivers/media/i2c/ths7303.c index 
b7cedc5b3e8e..ff268ebeb4d9 100644 --- a/drivers/media/i2c/ths7303.c +++ b/drivers/media/i2c/ths7303.c @@ -7,7 +7,7 @@ * Author: Chaithrika U S * * Contributors: - * Hans Verkuil + * Hans Verkuil * Lad, Prabhakar * Martin Bugge * diff --git a/drivers/media/mc/mc-request.c b/drivers/media/mc/mc-request.c index 5edfc2791ce7..f66f728b1b43 100644 --- a/drivers/media/mc/mc-request.c +++ b/drivers/media/mc/mc-request.c @@ -6,7 +6,7 @@ * Copyright (C) 2018 Intel Corporation * Copyright (C) 2018 Google, Inc. * - * Author: Hans Verkuil + * Author: Hans Verkuil * Author: Sakari Ailus */ diff --git a/drivers/media/pci/cobalt/cobalt-driver.c b/drivers/media/pci/cobalt/cobalt-driver.c index 39e25cc53edb..b7695705fdee 100644 --- a/drivers/media/pci/cobalt/cobalt-driver.c +++ b/drivers/media/pci/cobalt/cobalt-driver.c @@ -44,7 +44,7 @@ module_param_named(ignore_err, cobalt_ignore_err, int, 0644); MODULE_PARM_DESC(ignore_err, "If set then ignore missing i2c adapters/receivers. Default: 0\n"); -MODULE_AUTHOR("Hans Verkuil & Morten Hestnes"); +MODULE_AUTHOR("Hans Verkuil & Morten Hestnes"); MODULE_DESCRIPTION("cobalt driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/media/radio/radio-aimslab.c b/drivers/media/radio/radio-aimslab.c index 3c8c17d64821..2c1d413e8636 100644 --- a/drivers/media/radio/radio-aimslab.c +++ b/drivers/media/radio/radio-aimslab.c @@ -4,7 +4,7 @@ * * Copyright 1997 M. Kirkwood * - * Converted to the radio-isa framework by Hans Verkuil + * Converted to the radio-isa framework by Hans Verkuil * Converted to V4L2 API by Mauro Carvalho Chehab * Converted to new API by Alan Cox * Various bugfixes and enhancements by Russell Kroll diff --git a/drivers/media/radio/radio-gemtek.c b/drivers/media/radio/radio-gemtek.c index 5ca6274c45bd..a3265f1dd189 100644 --- a/drivers/media/radio/radio-gemtek.c +++ b/drivers/media/radio/radio-gemtek.c @@ -15,7 +15,7 @@ * Converted to new API by Alan Cox * Various bugfixes and enhancements by Russell Kroll * - * Converted to the radio-isa framework by Hans Verkuil + * Converted to the radio-isa framework by Hans Verkuil * Converted to V4L2 API by Mauro Carvalho Chehab * * Note: this card seems to swap the left and right audio channels! diff --git a/drivers/media/radio/radio-isa.c b/drivers/media/radio/radio-isa.c index 4f87c76a2a96..1a144536ffa7 100644 --- a/drivers/media/radio/radio-isa.c +++ b/drivers/media/radio/radio-isa.c @@ -4,7 +4,7 @@ * This takes care of all the V4L2 scaffolding, allowing the ISA drivers * to concentrate on the actual hardware operation. * - * Copyright (C) 2012 Hans Verkuil + * Copyright (C) 2012 Hans Verkuil */ #include diff --git a/drivers/media/radio/radio-isa.h b/drivers/media/radio/radio-isa.h index 0f3db473da5e..62ff5c3fb5d5 100644 --- a/drivers/media/radio/radio-isa.h +++ b/drivers/media/radio/radio-isa.h @@ -4,7 +4,7 @@ * This takes care of all the V4L2 scaffolding, allowing the ISA drivers * to concentrate on the actual hardware operation. 
* - * Copyright (C) 2012 Hans Verkuil + * Copyright (C) 2012 Hans Verkuil */ #ifndef _RADIO_ISA_H_ diff --git a/drivers/media/radio/radio-miropcm20.c b/drivers/media/radio/radio-miropcm20.c index 27f058c5e677..67712ab3d564 100644 --- a/drivers/media/radio/radio-miropcm20.c +++ b/drivers/media/radio/radio-miropcm20.c @@ -23,7 +23,7 @@ * This code has been reintroduced and converted to use * the new V4L2 RDS API by: * - * Hans Verkuil + * Hans Verkuil */ #include diff --git a/drivers/media/radio/radio-rtrack2.c b/drivers/media/radio/radio-rtrack2.c index 16b13a63bfed..efc02069bf9d 100644 --- a/drivers/media/radio/radio-rtrack2.c +++ b/drivers/media/radio/radio-rtrack2.c @@ -7,7 +7,7 @@ * Converted to new API by Alan Cox * Various bugfixes and enhancements by Russell Kroll * - * Converted to the radio-isa framework by Hans Verkuil + * Converted to the radio-isa framework by Hans Verkuil * Converted to V4L2 API by Mauro Carvalho Chehab * * Fully tested with actual hardware and the v4l2-compliance tool. diff --git a/drivers/media/radio/radio-terratec.c b/drivers/media/radio/radio-terratec.c index 720080634454..43817dd0a0fe 100644 --- a/drivers/media/radio/radio-terratec.c +++ b/drivers/media/radio/radio-terratec.c @@ -17,7 +17,7 @@ * Frequency control is done digitally -- ie out(port,encodefreq(95.8)); * Volume Control is done digitally * - * Converted to the radio-isa framework by Hans Verkuil + * Converted to the radio-isa framework by Hans Verkuil * Converted to V4L2 API by Mauro Carvalho Chehab */ diff --git a/drivers/media/radio/radio-zoltrix.c b/drivers/media/radio/radio-zoltrix.c index 099b7af6a410..e043bee52384 100644 --- a/drivers/media/radio/radio-zoltrix.c +++ b/drivers/media/radio/radio-zoltrix.c @@ -30,7 +30,7 @@ * 2006-07-24 - Converted to V4L2 API * by Mauro Carvalho Chehab * - * Converted to the radio-isa framework by Hans Verkuil + * Converted to the radio-isa framework by Hans Verkuil * * Note that this is the driver for the Zoltrix Radio Plus. * This driver does not work for the Zoltrix Radio Plus 108 or the diff --git a/drivers/media/test-drivers/vicodec/vicodec-core.c b/drivers/media/test-drivers/vicodec/vicodec-core.c index 174ea761d09a..a3df3a33237e 100644 --- a/drivers/media/test-drivers/vicodec/vicodec-core.c +++ b/drivers/media/test-drivers/vicodec/vicodec-core.c @@ -26,7 +26,7 @@ #include "codec-v4l2-fwht.h" MODULE_DESCRIPTION("Virtual codec device"); -MODULE_AUTHOR("Hans Verkuil "); +MODULE_AUTHOR("Hans Verkuil "); MODULE_LICENSE("GPL v2"); static bool multiplanar; diff --git a/include/media/i2c/ths7303.h b/include/media/i2c/ths7303.h index fc937025cdb4..7eda467b6725 100644 --- a/include/media/i2c/ths7303.h +++ b/include/media/i2c/ths7303.h @@ -5,7 +5,7 @@ * Copyright 2013 Cisco Systems, Inc. and/or its affiliates. * * Contributors: - * Hans Verkuil + * Hans Verkuil * Lad, Prabhakar * Martin Bugge */ diff --git a/include/media/media-request.h b/include/media/media-request.h index d4ac557678a7..bb500b2f9da4 100644 --- a/include/media/media-request.h +++ b/include/media/media-request.h @@ -5,7 +5,7 @@ * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved. * Copyright (C) 2018 Intel Corporation * - * Author: Hans Verkuil + * Author: Hans Verkuil * Author: Sakari Ailus */ diff --git a/include/uapi/linux/v4l2-dv-timings.h b/include/uapi/linux/v4l2-dv-timings.h index 44a16e0e5a12..58f478f98a35 100644 --- a/include/uapi/linux/v4l2-dv-timings.h +++ b/include/uapi/linux/v4l2-dv-timings.h @@ -2,7 +2,7 @@ /* * V4L2 DV timings header. 
* - * Copyright (C) 2012-2016 Hans Verkuil + * Copyright (C) 2012-2016 Hans Verkuil */ #ifndef _V4L2_DV_TIMINGS_H -- cgit v1.2.3 From e8c8d961d8ad53d8d104d7474c08d8e4626c7767 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 24 Jun 2025 08:41:15 +0200 Subject: media: include: update Hans Verkuil's email address Replace hverkuil@xs4all.nl by hverkuil@kernel.org. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/dt-bindings/media/tvp5150.h | 2 +- include/linux/videodev2.h | 2 +- include/media/drv-intf/cx25840.h | 2 +- include/media/drv-intf/msp3400.h | 2 +- include/media/i2c/bt819.h | 2 +- include/media/i2c/cs5345.h | 2 +- include/media/i2c/cs53l32a.h | 2 +- include/media/i2c/m52790.h | 2 +- include/media/i2c/mt9v011.h | 2 +- include/media/i2c/saa7115.h | 2 +- include/media/i2c/saa7127.h | 2 +- include/media/i2c/tvaudio.h | 2 +- include/media/i2c/upd64031a.h | 2 +- include/media/i2c/upd64083.h | 2 +- include/media/i2c/wm8775.h | 2 +- include/media/v4l2-common.h | 2 +- include/media/v4l2-ctrls.h | 2 +- include/media/v4l2-device.h | 2 +- include/media/v4l2-subdev.h | 2 +- include/uapi/linux/ivtv.h | 2 +- include/uapi/linux/videodev2.h | 2 +- 21 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/dt-bindings/media/tvp5150.h b/include/dt-bindings/media/tvp5150.h index dda00c038530..ba34c420c303 100644 --- a/include/dt-bindings/media/tvp5150.h +++ b/include/dt-bindings/media/tvp5150.h @@ -2,7 +2,7 @@ /* tvp5150.h - definition for tvp5150 inputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index 219037f4c08d..9609cf365e8e 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -50,7 +50,7 @@ * * Author: Bill Dirks * Justin Schoeman - * Hans Verkuil + * Hans Verkuil * et al. 
*/ #ifndef __LINUX_VIDEODEV2_H diff --git a/include/media/drv-intf/cx25840.h b/include/media/drv-intf/cx25840.h index ba69bc525382..8b455d9dd5ca 100644 --- a/include/media/drv-intf/cx25840.h +++ b/include/media/drv-intf/cx25840.h @@ -3,7 +3,7 @@ /* * cx25840.h - definition for cx25840/1/2/3 inputs * - * Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + * Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ #ifndef _CX25840_H_ diff --git a/include/media/drv-intf/msp3400.h b/include/media/drv-intf/msp3400.h index d6dfae104a6f..853258ee6bbd 100644 --- a/include/media/drv-intf/msp3400.h +++ b/include/media/drv-intf/msp3400.h @@ -2,7 +2,7 @@ /* msp3400.h - definition for msp3400 inputs and outputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/bt819.h b/include/media/i2c/bt819.h index 70aa46bd5182..2277a7eb9548 100644 --- a/include/media/i2c/bt819.h +++ b/include/media/i2c/bt819.h @@ -2,7 +2,7 @@ /* bt819.h - bt819 notifications - Copyright (C) 2009 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2009 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/cs5345.h b/include/media/i2c/cs5345.h index d41e4dca3fcc..39e1cf6c1a2f 100644 --- a/include/media/i2c/cs5345.h +++ b/include/media/i2c/cs5345.h @@ -2,7 +2,7 @@ /* cs5345.h - definition for cs5345 inputs and outputs - Copyright (C) 2007 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2007 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/cs53l32a.h b/include/media/i2c/cs53l32a.h index 52ceb2f916d3..777f667855cb 100644 --- a/include/media/i2c/cs53l32a.h +++ b/include/media/i2c/cs53l32a.h @@ -2,7 +2,7 @@ /* cs53l32a.h - definition for cs53l32a inputs and outputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/m52790.h b/include/media/i2c/m52790.h index 3f214fa9bc64..cedaaf215273 100644 --- a/include/media/i2c/m52790.h +++ b/include/media/i2c/m52790.h @@ -2,7 +2,7 @@ /* m52790.h - definition for m52790 inputs and outputs - Copyright (C) 2007 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2007 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/mt9v011.h b/include/media/i2c/mt9v011.h index 41c00b3e7184..552839756e64 100644 --- a/include/media/i2c/mt9v011.h +++ b/include/media/i2c/mt9v011.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* mt9v011 sensor * - * Copyright (C) 2011 Hans Verkuil + * Copyright (C) 2011 Hans Verkuil */ #ifndef __MT9V011_H__ diff --git a/include/media/i2c/saa7115.h b/include/media/i2c/saa7115.h index 0cd6080d7cb1..a607c91ef5f3 100644 --- a/include/media/i2c/saa7115.h +++ b/include/media/i2c/saa7115.h @@ -2,7 +2,7 @@ /* saa7115.h - definition for saa7111/3/4/5 inputs and frequency flags - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/saa7127.h b/include/media/i2c/saa7127.h index 53ee999e6090..c81ee1743df1 100644 --- a/include/media/i2c/saa7127.h +++ b/include/media/i2c/saa7127.h @@ -2,7 +2,7 @@ /* saa7127.h - definition for saa7126/7/8/9 inputs/outputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/tvaudio.h b/include/media/i2c/tvaudio.h index 42cd3206fb6c..206f42ed4e69 100644 --- a/include/media/i2c/tvaudio.h +++ b/include/media/i2c/tvaudio.h @@ -2,7 +2,7 @@ /* 
tvaudio.h - definition for tvaudio inputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/upd64031a.h b/include/media/i2c/upd64031a.h index b6570abc84ef..d39b2b7f0cf3 100644 --- a/include/media/i2c/upd64031a.h +++ b/include/media/i2c/upd64031a.h @@ -2,7 +2,7 @@ /* * upd64031a - NEC Electronics Ghost Reduction input defines * - * 2006 by Hans Verkuil (hverkuil@xs4all.nl) + * 2006 by Hans Verkuil (hverkuil@kernel.org) */ #ifndef _UPD64031A_H_ diff --git a/include/media/i2c/upd64083.h b/include/media/i2c/upd64083.h index 17fb7b5201cc..72cf547c25fc 100644 --- a/include/media/i2c/upd64083.h +++ b/include/media/i2c/upd64083.h @@ -2,7 +2,7 @@ /* * upd6408x - NEC Electronics 3-Dimensional Y/C separation input defines * - * 2006 by Hans Verkuil (hverkuil@xs4all.nl) + * 2006 by Hans Verkuil (hverkuil@kernel.org) */ #ifndef _UPD64083_H_ diff --git a/include/media/i2c/wm8775.h b/include/media/i2c/wm8775.h index 6ccdeb3817ab..a02695ee3a58 100644 --- a/include/media/i2c/wm8775.h +++ b/include/media/i2c/wm8775.h @@ -2,7 +2,7 @@ /* wm8775.h - definition for wm8775 inputs and outputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h index d8e23991a656..594314459333 100644 --- a/include/media/v4l2-common.h +++ b/include/media/v4l2-common.h @@ -7,7 +7,7 @@ Each ioctl begins with VIDIOC_INT_ to clearly mark that it is an internal define, - Copyright (C) 2005 Hans Verkuil + Copyright (C) 2005 Hans Verkuil */ diff --git a/include/media/v4l2-ctrls.h b/include/media/v4l2-ctrls.h index 4a294a5c7bdd..31fc1bee3797 100644 --- a/include/media/v4l2-ctrls.h +++ b/include/media/v4l2-ctrls.h @@ -2,7 +2,7 @@ /* * V4L2 controls support header. * - * Copyright (C) 2010 Hans Verkuil + * Copyright (C) 2010 Hans Verkuil */ #ifndef _V4L2_CTRLS_H diff --git a/include/media/v4l2-device.h b/include/media/v4l2-device.h index dd897a362f36..25f69b1b8db0 100644 --- a/include/media/v4l2-device.h +++ b/include/media/v4l2-device.h @@ -2,7 +2,7 @@ /* V4L2 device support header. - Copyright (C) 2008 Hans Verkuil + Copyright (C) 2008 Hans Verkuil */ diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h index 4b28086808c9..e0bb58cb6d04 100644 --- a/include/media/v4l2-subdev.h +++ b/include/media/v4l2-subdev.h @@ -2,7 +2,7 @@ /* * V4L2 sub-device support header. * - * Copyright (C) 2008 Hans Verkuil + * Copyright (C) 2008 Hans Verkuil */ #ifndef _V4L2_SUBDEV_H diff --git a/include/uapi/linux/ivtv.h b/include/uapi/linux/ivtv.h index e74f18642b11..c9241f7271c4 100644 --- a/include/uapi/linux/ivtv.h +++ b/include/uapi/linux/ivtv.h @@ -2,7 +2,7 @@ /* Public ivtv API header Copyright (C) 2003-2004 Kevin Thayer - Copyright (C) 2004-2007 Hans Verkuil + Copyright (C) 2004-2007 Hans Verkuil This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 64943f1a6149..becd08fdbddb 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -51,7 +51,7 @@ * * Author: Bill Dirks * Justin Schoeman - * Hans Verkuil + * Hans Verkuil * et al. 
*/ #ifndef _UAPI__LINUX_VIDEODEV2_H -- cgit v1.2.3 From 146bf4e75ecab9759ed78c9d167e860042d627fb Mon Sep 17 00:00:00 2001 From: Etienne Carriere Date: Wed, 13 Aug 2025 08:02:54 +0200 Subject: tee: new ioctl to register a tee_shm from a dmabuf file descriptor Add a userspace API to create a tee_shm object that refers to a dmabuf reference. Userspace registers the dmabuf file descriptor in a tee_shm object. The registration is completed with a returned tee_shm file descriptor. Userspace is free to close the dmabuf file descriptor after it has been registered since all the resources are now held via the new tee_shm object. Closing the tee_shm file descriptor will eventually release all resources used by the tee_shm object when all references are released. The new IOCTL, TEE_IOC_SHM_REGISTER_FD, supports dmabuf references to physically contiguous memory buffers. Dmabuf references acquired from the TEE DMA-heap can be used as protected memory for Secure Video Path and such use cases. It depends on the TEE and the TEE driver whether dmabuf references acquired by other means can be used. A new tee_shm flag is added to identify tee_shm objects built from a registered dmabuf, TEE_SHM_DMA_BUF. Signed-off-by: Etienne Carriere Signed-off-by: Olivier Masse Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- drivers/tee/tee_core.c | 62 ++++++++++++++++++++++++++++++++++++++- drivers/tee/tee_private.h | 8 +++++ drivers/tee/tee_shm.c | 74 +++++++++++++++++++++++++++++++++++++++++++++-- include/linux/tee_core.h | 1 + include/linux/tee_drv.h | 10 +++++++ include/uapi/linux/tee.h | 31 ++++++++++++++++++++ 6 files changed, 182 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index 4996f194cd9e..cea4da3ea46f 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -354,11 +354,49 @@ tee_ioctl_shm_register(struct tee_context *ctx, return ret; } +static int +tee_ioctl_shm_register_fd(struct tee_context *ctx, + struct tee_ioctl_shm_register_fd_data __user *udata) +{ + struct tee_ioctl_shm_register_fd_data data; + struct tee_shm *shm; + long ret; + + if (copy_from_user(&data, udata, sizeof(data))) + return -EFAULT; + + /* Currently no input flags are supported */ + if (data.flags) + return -EINVAL; + + shm = tee_shm_register_fd(ctx, data.fd); + if (IS_ERR(shm)) + return -EINVAL; + + data.id = shm->id; + data.flags = shm->flags; + data.size = shm->size; + + if (copy_to_user(udata, &data, sizeof(data))) + ret = -EFAULT; + else + ret = tee_shm_get_fd(shm); + + /* + * When user space closes the file descriptor the shared memory + * should be freed or if tee_shm_get_fd() failed then it will + * be freed immediately. + */ + tee_shm_put(shm); + return ret; +} + static int param_from_user_memref(struct tee_context *ctx, struct tee_param_memref *memref, struct tee_ioctl_param *ip) { struct tee_shm *shm; + size_t offs = 0; /* * If a NULL pointer is passed to a TA in the TEE, @@ -389,6 +427,26 @@ static int param_from_user_memref(struct tee_context *ctx, tee_shm_put(shm); return -EINVAL; } + + if (shm->flags & TEE_SHM_DMA_BUF) { + struct tee_shm_dmabuf_ref *ref; + + ref = container_of(shm, struct tee_shm_dmabuf_ref, shm); + if (ref->parent_shm) { + /* + * The shm already has one reference to + * ref->parent_shm so we are clear of 0. + * We're getting another reference since + * this shm will be used in the parameter + * list instead of the shm we got with + * tee_shm_get_from_id() above.
+ */ + refcount_inc(&ref->parent_shm->refcount); + tee_shm_put(shm); + shm = ref->parent_shm; + offs = ref->offset; + } + } } else if (ctx->cap_memref_null) { /* Pass NULL pointer to OP-TEE */ shm = NULL; @@ -396,7 +454,7 @@ static int param_from_user_memref(struct tee_context *ctx, return -EINVAL; } - memref->shm_offs = ip->a; + memref->shm_offs = ip->a + offs; memref->size = ip->b; memref->shm = shm; @@ -842,6 +900,8 @@ static long tee_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return tee_ioctl_shm_alloc(ctx, uarg); case TEE_IOC_SHM_REGISTER: return tee_ioctl_shm_register(ctx, uarg); + case TEE_IOC_SHM_REGISTER_FD: + return tee_ioctl_shm_register_fd(ctx, uarg); case TEE_IOC_OPEN_SESSION: return tee_ioctl_open_session(ctx, uarg); case TEE_IOC_INVOKE: diff --git a/drivers/tee/tee_private.h b/drivers/tee/tee_private.h index 6c6ff5d5eed2..a9b5e4a6a8f7 100644 --- a/drivers/tee/tee_private.h +++ b/drivers/tee/tee_private.h @@ -13,6 +13,14 @@ #include #include +/* extra references appended to shm object for registered shared memory */ +struct tee_shm_dmabuf_ref { + struct tee_shm shm; + size_t offset; + struct dma_buf *dmabuf; + struct tee_shm *parent_shm; +}; + int tee_shm_get_fd(struct tee_shm *shm); bool tee_device_get(struct tee_device *teedev); diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c index daf6e5cfd59a..76195a398c89 100644 --- a/drivers/tee/tee_shm.c +++ b/drivers/tee/tee_shm.c @@ -4,6 +4,7 @@ */ #include #include +#include #include #include #include @@ -45,7 +46,15 @@ static void release_registered_pages(struct tee_shm *shm) static void tee_shm_release(struct tee_device *teedev, struct tee_shm *shm) { - if (shm->flags & TEE_SHM_POOL) { + void *p = shm; + + if (shm->flags & TEE_SHM_DMA_BUF) { + struct tee_shm_dmabuf_ref *ref; + + ref = container_of(shm, struct tee_shm_dmabuf_ref, shm); + p = ref; + dma_buf_put(ref->dmabuf); + } else if (shm->flags & TEE_SHM_POOL) { teedev->pool->ops->free(teedev->pool, shm); } else if (shm->flags & TEE_SHM_DYNAMIC) { int rc = teedev->desc->ops->shm_unregister(shm->ctx, shm); @@ -59,7 +68,7 @@ static void tee_shm_release(struct tee_device *teedev, struct tee_shm *shm) teedev_ctx_put(shm->ctx); - kfree(shm); + kfree(p); tee_device_put(teedev); } @@ -169,7 +178,7 @@ struct tee_shm *tee_shm_alloc_user_buf(struct tee_context *ctx, size_t size) * tee_client_invoke_func(). The memory allocated is later freed with a * call to tee_shm_free(). 
* - * @returns a pointer to 'struct tee_shm' + * @returns a pointer to 'struct tee_shm' on success, and ERR_PTR on failure */ struct tee_shm *tee_shm_alloc_kernel_buf(struct tee_context *ctx, size_t size) { @@ -179,6 +188,62 @@ struct tee_shm *tee_shm_alloc_kernel_buf(struct tee_context *ctx, size_t size) } EXPORT_SYMBOL_GPL(tee_shm_alloc_kernel_buf); +struct tee_shm *tee_shm_register_fd(struct tee_context *ctx, int fd) +{ + struct tee_shm_dmabuf_ref *ref; + int rc; + + if (!tee_device_get(ctx->teedev)) + return ERR_PTR(-EINVAL); + + teedev_ctx_get(ctx); + + ref = kzalloc(sizeof(*ref), GFP_KERNEL); + if (!ref) { + rc = -ENOMEM; + goto err_put_tee; + } + + refcount_set(&ref->shm.refcount, 1); + ref->shm.ctx = ctx; + ref->shm.id = -1; + ref->shm.flags = TEE_SHM_DMA_BUF; + + ref->dmabuf = dma_buf_get(fd); + if (IS_ERR(ref->dmabuf)) { + rc = PTR_ERR(ref->dmabuf); + goto err_kfree_ref; + } + + rc = tee_heap_update_from_dma_buf(ctx->teedev, ref->dmabuf, + &ref->offset, &ref->shm, + &ref->parent_shm); + if (rc) + goto err_put_dmabuf; + + mutex_lock(&ref->shm.ctx->teedev->mutex); + ref->shm.id = idr_alloc(&ref->shm.ctx->teedev->idr, &ref->shm, + 1, 0, GFP_KERNEL); + mutex_unlock(&ref->shm.ctx->teedev->mutex); + if (ref->shm.id < 0) { + rc = ref->shm.id; + goto err_put_dmabuf; + } + + return &ref->shm; + +err_put_dmabuf: + dma_buf_put(ref->dmabuf); +err_kfree_ref: + kfree(ref); +err_put_tee: + teedev_ctx_put(ctx); + tee_device_put(ctx->teedev); + + return ERR_PTR(rc); +} +EXPORT_SYMBOL_GPL(tee_shm_register_fd); + /** * tee_shm_alloc_priv_buf() - Allocate shared memory for a privately shared * kernel buffer @@ -442,6 +507,9 @@ static int tee_shm_fop_mmap(struct file *filp, struct vm_area_struct *vma) /* Refuse sharing shared memory provided by application */ if (shm->flags & TEE_SHM_USER_MAPPED) return -EINVAL; + /* Refuse sharing registered DMA_bufs with the application */ + if (shm->flags & TEE_SHM_DMA_BUF) + return -EINVAL; /* check for overflowing the buffer's size */ if (vma->vm_pgoff + vma_pages(vma) > shm->size >> PAGE_SHIFT) diff --git a/include/linux/tee_core.h b/include/linux/tee_core.h index 28b65010b9ed..b6c54b34a8b5 100644 --- a/include/linux/tee_core.h +++ b/include/linux/tee_core.h @@ -28,6 +28,7 @@ #define TEE_SHM_USER_MAPPED BIT(1) /* Memory mapped in user space */ #define TEE_SHM_POOL BIT(2) /* Memory allocated from pool */ #define TEE_SHM_PRIV BIT(3) /* Memory private to TEE driver */ +#define TEE_SHM_DMA_BUF BIT(4) /* Memory with dma-buf handle */ #define TEE_DEVICE_FLAG_REGISTERED 0x1 #define TEE_MAX_DEV_NAME_LEN 32 diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index a54c203000ed..824f1251de60 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -116,6 +116,16 @@ struct tee_shm *tee_shm_alloc_kernel_buf(struct tee_context *ctx, size_t size); struct tee_shm *tee_shm_register_kernel_buf(struct tee_context *ctx, void *addr, size_t length); +/** + * tee_shm_register_fd() - Register shared memory from file descriptor + * + * @ctx: Context that allocates the shared memory + * @fd: Shared memory file descriptor reference + * + * @returns a pointer to 'struct tee_shm' on success, and ERR_PTR on failure + */ +struct tee_shm *tee_shm_register_fd(struct tee_context *ctx, int fd); + /** * tee_shm_free() - Free shared memory * @shm: Handle to shared memory to free diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index d0430bee8292..d843cf980d98 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -378,6 +378,37 @@ struct 
tee_ioctl_shm_register_data { __s32 id; }; +/** + * struct tee_ioctl_shm_register_fd_data - Shared memory registering argument + * @fd: [in] File descriptor identifying dmabuf reference + * @size: [out] Size of referenced memory + * @flags: [in] Flags to/from allocation. + * @id: [out] Identifier of the shared memory + * + * The flags field should currently be zero as input. Updated by the call + * with actual flags as defined by TEE_IOCTL_SHM_* above. + * This structure is used as argument for TEE_IOC_SHM_REGISTER_FD below. + */ +struct tee_ioctl_shm_register_fd_data { + __s64 fd; + __u64 size; + __u32 flags; + __s32 id; +}; + +/** + * TEE_IOC_SHM_REGISTER_FD - register shared memory from a file descriptor + * + * Returns a file descriptor on success or < 0 on failure + * + * The returned file descriptor refers to the shared memory object in the + * kernel. The supplied file descriptor can be closed if it's not needed + * for other purposes. The shared memory is freed when the descriptor is + * closed. + */ +#define TEE_IOC_SHM_REGISTER_FD _IOWR(TEE_IOC_MAGIC, TEE_IOC_BASE + 8, \ + struct tee_ioctl_shm_register_fd_data) + /** * TEE_IOC_SHM_REGISTER - Register shared memory argument * -- cgit v1.2.3 From cbd2257dc96e3e46217540fcb095a757ffa20d96 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Tue, 2 Sep 2025 13:28:08 +0200 Subject: netfilter: nft_meta_bridge: introduce NFT_META_BRI_IIFHWADDR support Expose the input bridge interface ethernet address so it can be used to redirect the packet to the receiving physical device for processing. Tested with the nft command line tool:

table bridge nat {
	chain PREROUTING {
		type filter hook prerouting priority 0; policy accept;
		ether daddr de:ad:00:00:be:ef meta pkttype set host ether daddr set meta ibrhwdr accept
	}
}

Joint work with Pablo Neira.
Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Florian Westphal --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/bridge/netfilter/nft_meta_bridge.c | 11 +++++++++++ 2 files changed, 13 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 8e0eb832bc01..7c0c915f0306 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -959,6 +959,7 @@ enum nft_exthdr_attributes { * @NFT_META_SDIF: slave device interface index * @NFT_META_SDIFNAME: slave device interface name * @NFT_META_BRI_BROUTE: packet br_netfilter_broute bit + * @NFT_META_BRI_IIFHWADDR: packet input bridge interface ethernet address */ enum nft_meta_keys { NFT_META_LEN, @@ -999,6 +1000,7 @@ enum nft_meta_keys { NFT_META_SDIFNAME, NFT_META_BRI_BROUTE, __NFT_META_IIFTYPE, + NFT_META_BRI_IIFHWADDR, }; /** diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index 5adced1e7d0c..b7af36bbd306 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -59,6 +59,13 @@ static void nft_meta_bridge_get_eval(const struct nft_expr *expr, nft_reg_store_be16(dest, htons(p_proto)); return; } + case NFT_META_BRI_IIFHWADDR: + br_dev = nft_meta_get_bridge(in); + if (!br_dev) + goto err; + + memcpy(dest, br_dev->dev_addr, ETH_ALEN); + return; default: return nft_meta_get_eval(expr, regs, pkt); } @@ -86,6 +93,9 @@ static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, case NFT_META_BRI_IIFVPROTO: len = sizeof(u16); break; + case NFT_META_BRI_IIFHWADDR: + len = ETH_ALEN; + break; default: return nft_meta_get_init(ctx, expr, tb); } @@ -175,6 +185,7 @@ static int nft_meta_bridge_set_validate(const struct nft_ctx *ctx, switch (priv->key) { case NFT_META_BRI_BROUTE: + case NFT_META_BRI_IIFHWADDR: hooks = 1 << NF_BR_PRE_ROUTING; break; default: -- cgit v1.2.3 From 21446c06b441b9c993870efae71aef4e9aa72ec7 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 4 Sep 2025 19:07:23 +0200 Subject: net: bridge: Introduce UAPI for BR_BOOLOPT_FDB_LOCAL_VLAN_0 The previous patches introduced a new option, BR_BOOLOPT_FDB_LOCAL_VLAN_0. When enabled, it has local FDB entries installed only on VLAN 0, instead of duplicating them across all VLANs. In this patch, add the corresponding UAPI toggle, and the code for turning the feature on and off. 
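As a usage sketch (illustrative only, not from the patch): the new boolopt rides on the existing IFLA_BR_MULTI_BOOLOPT attribute, so user space can flip it with a plain rtnetlink message. The bridge name br0 is hypothetical and ACK parsing is omitted for brevity.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <net/if.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_link.h>
#include <linux/if_bridge.h>

/* Append one attribute to the message and return its header. */
static struct rtattr *add_attr(struct nlmsghdr *nh, unsigned short type,
			       const void *data, int len)
{
	struct rtattr *rta = (struct rtattr *)((char *)nh + NLMSG_ALIGN(nh->nlmsg_len));

	rta->rta_type = type;
	rta->rta_len = RTA_LENGTH(len);
	if (len)
		memcpy(RTA_DATA(rta), data, len);
	nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
	return rta;
}

int main(void)
{
	struct {
		struct nlmsghdr nh;
		struct ifinfomsg ifi;
		char buf[256];
	} req = {
		.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
		.nh.nlmsg_type = RTM_NEWLINK,
		.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
		.ifi.ifi_family = AF_UNSPEC,
	};
	struct br_boolopt_multi bm = {
		.optmask = 1 << BR_BOOLOPT_FDB_LOCAL_VLAN_0,	/* option to change */
		.optval = 1 << BR_BOOLOPT_FDB_LOCAL_VLAN_0,	/* new value: on */
	};
	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
	struct rtattr *linkinfo, *data;
	int fd;

	req.ifi.ifi_index = if_nametoindex("br0");	/* hypothetical bridge */
	linkinfo = add_attr(&req.nh, IFLA_LINKINFO, NULL, 0);
	add_attr(&req.nh, IFLA_INFO_KIND, "bridge", strlen("bridge") + 1);
	data = add_attr(&req.nh, IFLA_INFO_DATA, NULL, 0);
	add_attr(&req.nh, IFLA_BR_MULTI_BOOLOPT, &bm, sizeof(bm));
	/* close both nests now that their payload lengths are known */
	data->rta_len = (char *)&req + req.nh.nlmsg_len - (char *)data;
	linkinfo->rta_len = (char *)&req + req.nh.nlmsg_len - (char *)linkinfo;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0 || sendto(fd, &req, req.nh.nlmsg_len, 0,
			     (struct sockaddr *)&kernel, sizeof(kernel)) < 0) {
		perror("rtnetlink");
		return 1;
	}
	close(fd);	/* a real tool would read back and check the ACK */
	return 0;
}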
Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/ea99bfb10f687fa58091e6e1c2f8acc33f47ca45.1757004393.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 3 ++ net/bridge/br.c | 22 ++++++++++ net/bridge/br_fdb.c | 96 ++++++++++++++++++++++++++++++++++++++++++ net/bridge/br_private.h | 2 + 4 files changed, 123 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 73876c0e2bba..e52f8207ab27 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -823,6 +823,8 @@ struct br_mcast_stats { /* bridge boolean options * BR_BOOLOPT_NO_LL_LEARN - disable learning from link-local packets * BR_BOOLOPT_MCAST_VLAN_SNOOPING - control vlan multicast snooping + * BR_BOOLOPT_FDB_LOCAL_VLAN_0 - local FDB entries installed by the bridge + * driver itself should only be added on VLAN 0 * * IMPORTANT: if adding a new option do not forget to handle * it in br_boolopt_toggle/get and bridge sysfs @@ -832,6 +834,7 @@ enum br_boolopt_id { BR_BOOLOPT_MCAST_VLAN_SNOOPING, BR_BOOLOPT_MST_ENABLE, BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION, + BR_BOOLOPT_FDB_LOCAL_VLAN_0, BR_BOOLOPT_MAX }; diff --git a/net/bridge/br.c b/net/bridge/br.c index c683baa3847f..512872a2ef81 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -259,6 +259,23 @@ static struct notifier_block br_switchdev_blocking_notifier = { .notifier_call = br_switchdev_blocking_event, }; +static int +br_toggle_fdb_local_vlan_0(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack) +{ + int err; + + if (br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0) == on) + return 0; + + err = br_fdb_toggle_local_vlan_0(br, on, extack); + if (err) + return err; + + br_opt_toggle(br, BROPT_FDB_LOCAL_VLAN_0, on); + return 0; +} + /* br_boolopt_toggle - change user-controlled boolean option * * @br: bridge device @@ -287,6 +304,9 @@ int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: br_opt_toggle(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, on); break; + case BR_BOOLOPT_FDB_LOCAL_VLAN_0: + err = br_toggle_fdb_local_vlan_0(br, on, extack); + break; default: /* shouldn't be called with unsupported options */ WARN_ON(1); @@ -307,6 +327,8 @@ int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) return br_opt_get(br, BROPT_MST_ENABLED); case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: return br_opt_get(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION); + case BR_BOOLOPT_FDB_LOCAL_VLAN_0: + return br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0); default: /* shouldn't be called with unsupported options */ WARN_ON(1); diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 4a20578517a5..58d22e2b85fc 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -582,6 +582,102 @@ void br_fdb_cleanup(struct work_struct *work) mod_delayed_work(system_long_wq, &br->gc_work, work_delay); } +static void br_fdb_delete_locals_per_vlan_port(struct net_bridge *br, + struct net_bridge_port *p) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct net_device *dev; + + if (p) { + vg = nbp_vlan_group(p); + dev = p->dev; + } else { + vg = br_vlan_group(br); + dev = br->dev; + } + + list_for_each_entry(v, &vg->vlan_list, vlist) + br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid); +} + +static void br_fdb_delete_locals_per_vlan(struct net_bridge *br) +{ + struct net_bridge_port *p; + + ASSERT_RTNL(); + + 
list_for_each_entry(p, &br->port_list, list) + br_fdb_delete_locals_per_vlan_port(br, p); + + br_fdb_delete_locals_per_vlan_port(br, NULL); +} + +static int br_fdb_insert_locals_per_vlan_port(struct net_bridge *br, + struct net_bridge_port *p, + struct netlink_ext_ack *extack) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct net_device *dev; + int err; + + if (p) { + vg = nbp_vlan_group(p); + dev = p->dev; + } else { + vg = br_vlan_group(br); + dev = br->dev; + } + + list_for_each_entry(v, &vg->vlan_list, vlist) { + if (!br_vlan_should_use(v)) + continue; + + err = br_fdb_add_local(br, p, dev->dev_addr, v->vid); + if (err) + return err; + } + + return 0; +} + +static int br_fdb_insert_locals_per_vlan(struct net_bridge *br, + struct netlink_ext_ack *extack) +{ + struct net_bridge_port *p; + int err; + + ASSERT_RTNL(); + + list_for_each_entry(p, &br->port_list, list) { + err = br_fdb_insert_locals_per_vlan_port(br, p, extack); + if (err) + goto rollback; + } + + err = br_fdb_insert_locals_per_vlan_port(br, NULL, extack); + if (err) + goto rollback; + + return 0; + +rollback: + NL_SET_ERR_MSG_MOD(extack, "fdb_local_vlan_0 toggle: FDB entry insertion failed"); + br_fdb_delete_locals_per_vlan(br); + return err; +} + +int br_fdb_toggle_local_vlan_0(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack) +{ + if (!on) + return br_fdb_insert_locals_per_vlan(br, extack); + + br_fdb_delete_locals_per_vlan(br); + return 0; +} + static bool __fdb_flush_matches(const struct net_bridge *br, const struct net_bridge_fdb_entry *f, const struct net_bridge_fdb_flush_desc *desc) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 87da287f19fe..16be5d250402 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -844,6 +844,8 @@ void br_fdb_find_delete_local(struct net_bridge *br, void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr); void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr); void br_fdb_cleanup(struct work_struct *work); +int br_fdb_toggle_local_vlan_0(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack); void br_fdb_delete_by_port(struct net_bridge *br, const struct net_bridge_port *p, u16 vid, int do_all); struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br, -- cgit v1.2.3 From f15bc37d8c336e79491209b268e73868c44733c4 Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Mon, 8 Sep 2025 07:35:21 +0000 Subject: iio: add IIO_ALTCURRENT channel type Add support for IIO_ALTCURRENT channel type to distinguish AC current measurements from DC current measurements. This follows the same pattern as IIO_VOLTAGE and IIO_ALTVOLTAGE. 
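As a usage sketch (illustrative only, not from the patch): a driver exposing the new channel type surfaces it in sysfs under the "altcurrent" name added above. The device index, channel number and the rms attribute (which also relies on the IIO_MOD_RMS modifier introduced in the next patch) are hypothetical.

#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/bus/iio/devices/iio:device0/in_altcurrent0_rms_raw";
	FILE *f = fopen(path, "r");
	long raw;

	if (!f || fscanf(f, "%ld", &raw) != 1) {
		perror(path);
		return 1;
	}
	fclose(f);
	/* raw reading; milliamps only after applying the channel's scale/offset */
	printf("AC current (rms, raw): %ld\n", raw);
	return 0;
}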
Reviewed-by: David Lechner Signed-off-by: Antoniu Miclaus Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-core.c | 1 + include/uapi/linux/iio/types.h | 1 + tools/iio/iio_event_monitor.c | 2 ++ 3 files changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 38848d753a33..4ff2c44f9324 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -97,6 +97,7 @@ static const char * const iio_chan_type_name_spec[] = { [IIO_COLORTEMP] = "colortemp", [IIO_CHROMATICITY] = "chromaticity", [IIO_ATTENTION] = "attention", + [IIO_ALTCURRENT] = "altcurrent", }; static const char * const iio_modifier_names[] = { diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h index 3eb0821af7a4..3c3cc1497a1e 100644 --- a/include/uapi/linux/iio/types.h +++ b/include/uapi/linux/iio/types.h @@ -52,6 +52,7 @@ enum iio_chan_type { IIO_COLORTEMP, IIO_CHROMATICITY, IIO_ATTENTION, + IIO_ALTCURRENT, }; enum iio_modifier { diff --git a/tools/iio/iio_event_monitor.c b/tools/iio/iio_event_monitor.c index eab7b082f19d..d26aff649f3f 100644 --- a/tools/iio/iio_event_monitor.c +++ b/tools/iio/iio_event_monitor.c @@ -64,6 +64,7 @@ static const char * const iio_chan_type_name_spec[] = { [IIO_COLORTEMP] = "colortemp", [IIO_CHROMATICITY] = "chromaticity", [IIO_ATTENTION] = "attention", + [IIO_ALTCURRENT] = "altcurrent", }; static const char * const iio_ev_type_text[] = { @@ -187,6 +188,7 @@ static bool event_is_known(struct iio_event_data *event) case IIO_COLORTEMP: case IIO_CHROMATICITY: case IIO_ATTENTION: + case IIO_ALTCURRENT: break; default: return false; -- cgit v1.2.3 From 70da02061499ca89ab92f7c4310f815d5fe674ec Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Mon, 8 Sep 2025 07:35:22 +0000 Subject: iio: add power and energy measurement modifiers Add new IIO modifiers to support power and energy measurement devices: Power modifiers: - IIO_MOD_ACTIVE: Real power consumed by the load - IIO_MOD_REACTIVE: Power that oscillates between source and load - IIO_MOD_APPARENT: Magnitude of complex power Signal quality modifiers: - IIO_MOD_RMS: Root Mean Square value Additionally adds: - IIO_CHAN_INFO_POWERFACTOR: Power factor channel info type for representing the ratio of active power to apparent power These modifiers enable proper representation of power measurement devices like energy meters and power analyzers. Signed-off-by: Antoniu Miclaus Signed-off-by: Jonathan Cameron --- Documentation/ABI/testing/sysfs-bus-iio | 29 +++++++++++++++++++++++++++++ drivers/iio/industrialio-core.c | 5 +++++ include/linux/iio/types.h | 1 + include/uapi/linux/iio/types.h | 4 ++++ tools/iio/iio_event_monitor.c | 8 ++++++++ 5 files changed, 47 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index 2fb2cea4b192..78da68826307 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -167,7 +167,18 @@ Description: is required is a consistent labeling. Units after application of scale and offset are millivolts. +What: /sys/bus/iio/devices/iio:deviceX/in_altvoltageY_rms_raw +KernelVersion: 6.18 +Contact: linux-iio@vger.kernel.org +Description: + Raw (unscaled) Root Mean Square (RMS) voltage measurement from + channel Y. Units after application of scale and offset are + millivolts. 
+ What: /sys/bus/iio/devices/iio:deviceX/in_powerY_raw +What: /sys/bus/iio/devices/iio:deviceX/in_powerY_active_raw +What: /sys/bus/iio/devices/iio:deviceX/in_powerY_reactive_raw +What: /sys/bus/iio/devices/iio:deviceX/in_powerY_apparent_raw KernelVersion: 4.5 Contact: linux-iio@vger.kernel.org Description: @@ -176,6 +187,13 @@ Description: unique to allow association with event codes. Units after application of scale and offset are milliwatts. +What: /sys/bus/iio/devices/iio:deviceX/in_powerY_powerfactor +KernelVersion: 6.18 +Contact: linux-iio@vger.kernel.org +Description: + Power factor measurement from channel Y. Power factor is the + ratio of active power to apparent power. The value is unitless. + What: /sys/bus/iio/devices/iio:deviceX/in_capacitanceY_raw KernelVersion: 3.2 Contact: linux-iio@vger.kernel.org @@ -1569,6 +1587,9 @@ Description: What: /sys/.../iio:deviceX/in_energy_input What: /sys/.../iio:deviceX/in_energy_raw +What: /sys/.../iio:deviceX/in_energyY_active_raw +What: /sys/.../iio:deviceX/in_energyY_reactive_raw +What: /sys/.../iio:deviceX/in_energyY_apparent_raw KernelVersion: 4.0 Contact: linux-iio@vger.kernel.org Description: @@ -1707,6 +1728,14 @@ Description: component of the signal while the 'q' channel contains the quadrature component. +What: /sys/bus/iio/devices/iio:deviceX/in_altcurrentY_rms_raw +KernelVersion: 6.18 +Contact: linux-iio@vger.kernel.org +Description: + Raw (unscaled no bias removal etc.) Root Mean Square (RMS) current + measurement from channel Y. Units after application of scale and + offset are milliamps. + What: /sys/.../iio:deviceX/in_energy_en What: /sys/.../iio:deviceX/in_distance_en What: /sys/.../iio:deviceX/in_velocity_sqrt(x^2+y^2+z^2)_en diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 4ff2c44f9324..88c3d585a1bd 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -153,6 +153,10 @@ static const char * const iio_modifier_names[] = { [IIO_MOD_PITCH] = "pitch", [IIO_MOD_YAW] = "yaw", [IIO_MOD_ROLL] = "roll", + [IIO_MOD_RMS] = "rms", + [IIO_MOD_ACTIVE] = "active", + [IIO_MOD_REACTIVE] = "reactive", + [IIO_MOD_APPARENT] = "apparent", }; /* relies on pairs of these shared then separate */ @@ -190,6 +194,7 @@ static const char * const iio_chan_info_postfix[] = { [IIO_CHAN_INFO_ZEROPOINT] = "zeropoint", [IIO_CHAN_INFO_TROUGH] = "trough_raw", [IIO_CHAN_INFO_CONVDELAY] = "convdelay", + [IIO_CHAN_INFO_POWERFACTOR] = "powerfactor", }; /** * iio_device_id() - query the unique ID for the device diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index ad2761efcc83..34eebad12d2c 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -70,6 +70,7 @@ enum iio_chan_info_enum { IIO_CHAN_INFO_ZEROPOINT, IIO_CHAN_INFO_TROUGH, IIO_CHAN_INFO_CONVDELAY, + IIO_CHAN_INFO_POWERFACTOR, }; #endif /* _IIO_TYPES_H_ */ diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h index 3c3cc1497a1e..6d269b844271 100644 --- a/include/uapi/linux/iio/types.h +++ b/include/uapi/linux/iio/types.h @@ -109,6 +109,10 @@ enum iio_modifier { IIO_MOD_ROLL, IIO_MOD_LIGHT_UVA, IIO_MOD_LIGHT_UVB, + IIO_MOD_RMS, + IIO_MOD_ACTIVE, + IIO_MOD_REACTIVE, + IIO_MOD_APPARENT, }; enum iio_event_type { diff --git a/tools/iio/iio_event_monitor.c b/tools/iio/iio_event_monitor.c index d26aff649f3f..03ca33869ce8 100644 --- a/tools/iio/iio_event_monitor.c +++ b/tools/iio/iio_event_monitor.c @@ -141,6 +141,10 @@ static const char * const iio_modifier_names[] = { [IIO_MOD_PITCH] = 
"pitch", [IIO_MOD_YAW] = "yaw", [IIO_MOD_ROLL] = "roll", + [IIO_MOD_RMS] = "rms", + [IIO_MOD_ACTIVE] = "active", + [IIO_MOD_REACTIVE] = "reactive", + [IIO_MOD_APPARENT] = "apparent", }; static bool event_is_known(struct iio_event_data *event) @@ -240,6 +244,10 @@ static bool event_is_known(struct iio_event_data *event) case IIO_MOD_PM4: case IIO_MOD_PM10: case IIO_MOD_O2: + case IIO_MOD_RMS: + case IIO_MOD_ACTIVE: + case IIO_MOD_REACTIVE: + case IIO_MOD_APPARENT: break; default: return false; -- cgit v1.2.3 From 56b060d0a1d3b1fd0429daeac366f00c030fca59 Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Tue, 5 Aug 2025 13:50:47 -0700 Subject: mempolicy: clarify what zone reclaim means The zone_reclaim_mode API controls the reclaim behavior when a node runs out of memory. Contrary to its user-facing name, it is internally referred to as "node_reclaim_mode". This can be confusing. But because we cannot change the name of the API since it has been in place since at least 2.6, let's try to be more explicit about what the behavior of this API is. Change the description to clarify what zone reclaim entails, and be explicit about the RECLAIM_ZONE bit, whose purpose has led to some confusion in the past already [1] [2]. While at it, also soften the warning about changing these bits. [joshua.hahnjy@gmail.com: remove the reference to the vm.zone_reclaim_mode sysctl as an ABI] Link: https://lkml.kernel.org/r/20250806134404.2000234-1-joshua.hahnjy@gmail.com Link: https://lkml.kernel.org/r/20250805205048.1518453-1-joshua.hahnjy@gmail.com Link: https://lore.kernel.org/linux-mm/1579005573-58923-1-git-send-email-alex.shi@linux.alibaba.com/ [1] Link: https://lore.kernel.org/linux-mm/20200626003459.D8E015CA@viggo.jf.intel.com/ [2] Signed-off-by: Joshua Hahn Acked-by: SeongJae Park Acked-by: David Hildenbrand Reviewed-by: Huang Ying Acked-by: Zi Yan Acked-by: Byungchul Park Cc: Alistair Popple Cc: Byungchul Park Cc: Gregory Price Cc: Mathew Brost Cc: Rakie Kim Signed-off-by: Andrew Morton --- include/uapi/linux/mempolicy.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 1f9bb10d1a47..8fbbe613611a 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -66,10 +66,16 @@ enum { #define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */ /* - * These bit locations are exposed in the vm.zone_reclaim_mode sysctl - * ABI. New bits are OK, but existing bits can never change. + * Enabling zone reclaim means the page allocator will attempt to fulfill + * the allocation request on the current node by triggering reclaim and + * trying to shrink the current node. + * Fallback allocations on the next candidates in the zonelist are considered + * when reclaim fails to free up enough memory in the current node/zone. + * + * These bit locations are exposed in the vm.zone_reclaim_mode sysctl. + * New bits are OK, but existing bits should not be changed. 
*/ -#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ +#define RECLAIM_ZONE (1<<0) /* Enable zone reclaim */ #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ #define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ -- cgit v1.2.3 From 9dc21bbd62edeae6f63e6f25e1edb7167452457b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 15 Aug 2025 14:54:53 +0100 Subject: prctl: extend PR_SET_THP_DISABLE to optionally exclude VM_HUGEPAGE Patch series "prctl: extend PR_SET_THP_DISABLE to only provide THPs when advised", v5. This will allow individual processes to opt out of THP = "always" into THP = "madvise", without affecting other workloads on the system. This has been extensively discussed on the mailing list and has been summarized very well by David in the first patch, which also includes the links to alternatives; please refer to the first patch's commit message for the motivation for this series. Patch 1 adds the PR_THP_DISABLE_EXCEPT_ADVISED flag to implement this, along with the MMF changes. Patch 2 is a cleanup patch for tva_flags that will allow the forced collapse case to be transmitted to vma_thp_disabled (which is done in patch 3). Patch 4 adds documentation for PR_SET_THP_DISABLE/PR_GET_THP_DISABLE. Patches 6-7 implement the selftests for PR_SET_THP_DISABLE, both for completely disabling THPs (old behaviour) and for enabling them only when advised (PR_THP_DISABLE_EXCEPT_ADVISED). This patch (of 7): People want to make use of more THPs, for example, moving from the "never" system policy to "madvise", or from "madvise" to "always". While this is great news for every THP desperately waiting to get allocated out there, apparently there are some workloads that require a bit of care during that transition: individual processes may need to opt out of this behavior for various reasons, and this should be permitted without needing to make all other workloads on the system similarly opt out. The following scenarios are imaginable: (1) Switch from "none" system policy to "madvise"/"always", but keep THPs disabled for selected workloads. (2) Stay at "none" system policy, but enable THPs for selected workloads, making only these workloads use the "madvise" or "always" policy. (3) Switch from "madvise" system policy to "always", but keep the "madvise" policy for selected workloads: allocate THPs only when advised. (4) Stay at "madvise" system policy, but enable THPs even when not advised for selected workloads -- "always" policy. One can emulate (2) through (1), by setting the system policy to "madvise"/"always" while disabling THPs for all processes that don't want THPs. It requires configuring all workloads, but that is a user-space problem to sort out. (4) can be emulated through (3) in a similar way. Back when (1) first became relevant, as people started enabling THPs, we added PR_SET_THP_DISABLE, so relevant workloads that were not ready yet (e.g., Redis) were able to just disable THPs completely. Redis still implements the option to use this interface to disable THPs completely. With PR_SET_THP_DISABLE, we added a way to force-disable THPs for a workload -- a process, including the fork+exec'ed process hierarchy. That essentially made us support (1): simply disable THPs for all workloads that are not ready for THPs yet, while still enabling THPs system-wide.
The quest for handling (3) and (4) started, but current approaches (a completely new prctl, options to set other policies per process, alternatives to prctl -- mctrl, cgroup handling) don't look particularly promising. Likely, the future will use bpf or something similar to implement better policies, in particular to also make better decisions about THP sizes to use, but this will certainly take a while as that work just started. Long story short: a simple enable/disable is not really suitable for the future, so we're not willing to add completely new toggles. While we could emulate (3)+(4) through (1)+(2) by simply disabling THPs completely for these processes, this is a step backwards, because these processes can no longer allocate THPs in regions where THPs were explicitly advised: regions flagged as VM_HUGEPAGE. Apparently, that poses a problem for relevant workloads, because "no THPs" is certainly worse than "THPs only when advised". Could we simply relax PR_SET_THP_DISABLE to "disable THPs unless explicitly advised by the app through MADV_HUGEPAGE"? *maybe*, but this would change the documented semantics quite a bit, and reduce its versatility for debugging purposes, so I am not 100% sure that is what we want -- although it would certainly be much easier. So instead, as an easy way forward for (3) and (4), add an option to make PR_SET_THP_DISABLE disable *fewer* THPs for a process. In essence, this patch: (A) Adds PR_THP_DISABLE_EXCEPT_ADVISED, to be used as a flag in arg3 of prctl(PR_SET_THP_DISABLE) when disabling THPs (arg2 != 0): prctl(PR_SET_THP_DISABLE, 1, PR_THP_DISABLE_EXCEPT_ADVISED). (B) Makes prctl(PR_GET_THP_DISABLE) return 3 if PR_THP_DISABLE_EXCEPT_ADVISED was set while disabling. Previously, it would return 1 if THPs were disabled completely. Now it returns the set flags as well: 3 if PR_THP_DISABLE_EXCEPT_ADVISED was set. (C) Renames MMF_DISABLE_THP to MMF_DISABLE_THP_COMPLETELY, to express the semantics clearly. Fortunately, there are only two instances outside of prctl() code. (D) Adds MMF_DISABLE_THP_EXCEPT_ADVISED to express "no THP except for VMAs with VM_HUGEPAGE" -- essentially "thp=madvise" behavior. Fortunately, we only have to extend vma_thp_disabled(). (E) Indicates "THP_enabled: 0" in /proc/pid/status only if THPs are disabled completely: we only indicate that THPs are disabled when they are really disabled completely, not only partially. For now, we don't add another interface to obtain whether THPs are disabled partially (PR_THP_DISABLE_EXCEPT_ADVISED was set). If ever required, we could add a new entry. The documented semantics in the man page for PR_SET_THP_DISABLE -- "is inherited by a child created via fork(2) and is preserved across execve(2)" -- are maintained. This behavior, for example, allows for disabling THPs for a workload through the launching process (e.g., systemd, where we fork() a helper process to then exec()). For now, MADV_COLLAPSE will *fail* in regions without VM_HUGEPAGE and VM_NOHUGEPAGE. As MADV_COLLAPSE is clear advice that user space thinks a THP is a good idea, we'll enable that separately next (requiring a bit of cleanup first). There is currently no way to prevent a process from issuing PR_SET_THP_DISABLE itself to re-enable THPs. There are no real known users for re-enabling it, and it's against the purpose of the original interface. So if ever required, we could investigate just forbidding re-enabling them, or make this somehow configurable.
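For example, a process that wants "THPs only when advised" could do the following (a minimal userspace sketch; the fallback define is only for building against older headers):

    #include <stdio.h>
    #include <sys/prctl.h>

    #ifndef PR_THP_DISABLE_EXCEPT_ADVISED
    #define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1)
    #endif

    int main(void)
    {
            /* Disable THPs, except in regions advised via MADV_HUGEPAGE. */
            if (prctl(PR_SET_THP_DISABLE, 1, PR_THP_DISABLE_EXCEPT_ADVISED, 0, 0))
                    perror("PR_SET_THP_DISABLE");

            /* Per (B) above, this now reports 3 (1 | PR_THP_DISABLE_EXCEPT_ADVISED). */
            printf("THP disable state: %d\n", prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0));
            return 0;
    }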
Link: https://lkml.kernel.org/r/20250815135549.130506-1-usamaarif642@gmail.com Link: https://lkml.kernel.org/r/20250815135549.130506-2-usamaarif642@gmail.com Acked-by: Zi Yan Acked-by: Usama Arif Tested-by: Usama Arif Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Signed-off-by: Usama Arif Cc: Arnd Bergmann Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yafang Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 5 ++-- fs/proc/array.c | 2 +- include/linux/huge_mm.h | 20 +++++++++---- include/linux/mm_types.h | 13 ++++----- include/uapi/linux/prctl.h | 10 +++++++ kernel/sys.c | 59 ++++++++++++++++++++++++++++++-------- mm/khugepaged.c | 2 +- 7 files changed, 82 insertions(+), 29 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 2971551b7235..915a3e44bc12 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -291,8 +291,9 @@ It's slow but very precise. HugetlbPages size of hugetlb memory portions CoreDumping process's memory is currently being dumped (killing the process may lead to a corrupted core) - THP_enabled process is allowed to use THP (returns 0 when - PR_SET_THP_DISABLE is set on the process + THP_enabled process is allowed to use THP (returns 0 when + PR_SET_THP_DISABLE is set on the process to disable + THP completely, not just partially) Threads number of threads SigQ number of signals queued/max. number for queue SigPnd bitmap of pending signals for the thread diff --git a/fs/proc/array.c b/fs/proc/array.c index c286dc12325e..d84b291dd1ed 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -422,7 +422,7 @@ static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm) bool thp_enabled = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE); if (thp_enabled) - thp_enabled = !mm_flags_test(MMF_DISABLE_THP, mm); + thp_enabled = !mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); seq_printf(m, "THP_enabled:\t%d\n", thp_enabled); } diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 84b7eebe0d68..22b8b067b295 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -318,16 +318,26 @@ struct thpsize { (transparent_hugepage_flags & \ (1<vm_mm)) + return true; /* - * Explicitly disabled through madvise or prctl, or some - * architectures may disable THP for some mappings, for - * example, s390 kvm. + * Are THPs disabled only for VMAs where we didn't get an explicit + * advise to use them? 
*/ - return (vm_flags & VM_NOHUGEPAGE) || - mm_flags_test(MMF_DISABLE_THP, vma->vm_mm); + if (vm_flags & VM_HUGEPAGE) + return false; + return mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, vma->vm_mm); } static inline bool thp_disabled_by_hw(void) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 05475b5fd516..d247da2fdb52 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1792,19 +1792,16 @@ enum { #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ #define MMF_VM_HUGEPAGE 17 /* set when mm is available for khugepaged */ -/* - * This one-shot flag is dropped due to necessity of changing exe once again - * on NFS restore - */ -//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ +#define MMF_HUGE_ZERO_FOLIO 18 /* mm has ever used the global huge zero folio */ #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ #define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ -#define MMF_HUGE_ZERO_FOLIO 23 /* mm has ever used the global huge zero folio */ -#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ -#define MMF_DISABLE_THP_MASK BIT(MMF_DISABLE_THP) +#define MMF_DISABLE_THP_EXCEPT_ADVISED 23 /* no THP except when advised (e.g., VM_HUGEPAGE) */ +#define MMF_DISABLE_THP_COMPLETELY 24 /* no THP for all VMAs */ +#define MMF_DISABLE_THP_MASK (BIT(MMF_DISABLE_THP_COMPLETELY) | \ + BIT(MMF_DISABLE_THP_EXCEPT_ADVISED)) #define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ #define MMF_MULTIPROCESS 26 /* mm is shared between processes */ /* diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index ed3aed264aeb..150b6deebfb1 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -177,7 +177,17 @@ struct prctl_mm_map { #define PR_GET_TID_ADDRESS 40 +/* + * Flags for PR_SET_THP_DISABLE are only applicable when disabling. Bit 0 + * is reserved, so PR_GET_THP_DISABLE can return "1 | flags", to effectively + * return "1" when no flags were specified for PR_SET_THP_DISABLE. + */ #define PR_SET_THP_DISABLE 41 +/* + * Don't disable THPs when explicitly advised (e.g., MADV_HUGEPAGE / + * VM_HUGEPAGE). + */ +# define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1) #define PR_GET_THP_DISABLE 42 /* diff --git a/kernel/sys.c b/kernel/sys.c index 605f7fe9a143..a46d9b75880b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2452,6 +2452,51 @@ static int prctl_get_auxv(void __user *addr, unsigned long len) return sizeof(mm->saved_auxv); } +static int prctl_get_thp_disable(unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5) +{ + struct mm_struct *mm = current->mm; + + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + + /* If disabled, we return "1 | flags", otherwise 0. */ + if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm)) + return 1; + else if (mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, mm)) + return 1 | PR_THP_DISABLE_EXCEPT_ADVISED; + return 0; +} + +static int prctl_set_thp_disable(bool thp_disable, unsigned long flags, + unsigned long arg4, unsigned long arg5) +{ + struct mm_struct *mm = current->mm; + + if (arg4 || arg5) + return -EINVAL; + + /* Flags are only allowed when disabling. 
*/ + if ((!thp_disable && flags) || (flags & ~PR_THP_DISABLE_EXCEPT_ADVISED)) + return -EINVAL; + if (mmap_write_lock_killable(current->mm)) + return -EINTR; + if (thp_disable) { + if (flags & PR_THP_DISABLE_EXCEPT_ADVISED) { + mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm); + mm_flags_set(MMF_DISABLE_THP_EXCEPT_ADVISED, mm); + } else { + mm_flags_set(MMF_DISABLE_THP_COMPLETELY, mm); + mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm); + } + } else { + mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm); + mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm); + } + mmap_write_unlock(current->mm); + return 0; +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2625,20 +2670,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; return task_no_new_privs(current) ? 1 : 0; case PR_GET_THP_DISABLE: - if (arg2 || arg3 || arg4 || arg5) - return -EINVAL; - error = !!mm_flags_test(MMF_DISABLE_THP, me->mm); + error = prctl_get_thp_disable(arg2, arg3, arg4, arg5); break; case PR_SET_THP_DISABLE: - if (arg3 || arg4 || arg5) - return -EINVAL; - if (mmap_write_lock_killable(me->mm)) - return -EINTR; - if (arg2) - mm_flags_set(MMF_DISABLE_THP, me->mm); - else - mm_flags_clear(MMF_DISABLE_THP, me->mm); - mmap_write_unlock(me->mm); + error = prctl_set_thp_disable(arg2, arg3, arg4, arg5); break; case PR_MPX_ENABLE_MANAGEMENT: case PR_MPX_DISABLE_MANAGEMENT: diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 550eb00116c5..1a416b865997 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -410,7 +410,7 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm) static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) { return hpage_collapse_test_exit(mm) || - mm_flags_test(MMF_DISABLE_THP, mm); + mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); } static bool hugepage_pmd_enabled(void) -- cgit v1.2.3 From 8cdc4d27019356b0304308eb799484c899b62a87 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 15 Aug 2025 14:54:55 +0100 Subject: mm/huge_memory: respect MADV_COLLAPSE with PR_THP_DISABLE_EXCEPT_ADVISED Let's allow for making MADV_COLLAPSE succeed on areas that neither have VM_HUGEPAGE nor VM_NOHUGEPAGE when we have THP disabled unless explicitly advised (PR_THP_DISABLE_EXCEPT_ADVISED). MADV_COLLAPSE is a clear advice that we want to collapse. Note that we still respect the VM_NOHUGEPAGE flag, just like MADV_COLLAPSE always does. So consequently, MADV_COLLAPSE is now only refused on VM_NOHUGEPAGE with PR_THP_DISABLE_EXCEPT_ADVISED, including for shmem. 
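To make the user-visible effect concrete, consider this hedged sketch (buf is assumed to be a populated, PMD-aligned anonymous mapping in a process that set PR_THP_DISABLE_EXCEPT_ADVISED; MADV_COLLAPSE requires suitably recent kernel headers):

    #include <stdio.h>
    #include <sys/mman.h>

    static void try_collapse(void *buf, size_t len)
    {
            /*
             * The collapse request itself now counts as advice, so this
             * succeeds even though the region has no VM_HUGEPAGE flag.
             * It still fails if the region is marked MADV_NOHUGEPAGE.
             */
            if (madvise(buf, len, MADV_COLLAPSE))
                    perror("MADV_COLLAPSE");
    }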
Link: https://lkml.kernel.org/r/20250815135549.130506-4-usamaarif642@gmail.com Co-developed-by: Usama Arif Signed-off-by: Usama Arif Signed-off-by: David Hildenbrand Reviewed-by: Baolin Wang Reviewed-by: Lorenzo Stoakes Reviewed-by: Zi Yan Cc: Arnd Bergmann Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yafang Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 8 +++++++- include/uapi/linux/prctl.h | 2 +- mm/huge_memory.c | 5 +++-- mm/memory.c | 6 ++++-- mm/shmem.c | 2 +- 5 files changed, 16 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 92ea0b9771fa..1ac0d06fb3c1 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -329,7 +329,7 @@ struct thpsize { * through madvise or prctl. */ static inline bool vma_thp_disabled(struct vm_area_struct *vma, - vm_flags_t vm_flags) + vm_flags_t vm_flags, bool forced_collapse) { /* Are THPs disabled for this VMA? */ if (vm_flags & VM_NOHUGEPAGE) @@ -343,6 +343,12 @@ static inline bool vma_thp_disabled(struct vm_area_struct *vma, */ if (vm_flags & VM_HUGEPAGE) return false; + /* + * Forcing a collapse (e.g., madv_collapse), is a clear advice to + * use THPs. + */ + if (forced_collapse) + return false; return mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, vma->vm_mm); } diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 150b6deebfb1..51c4e8c82b1e 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -185,7 +185,7 @@ struct prctl_mm_map { #define PR_SET_THP_DISABLE 41 /* * Don't disable THPs when explicitly advised (e.g., MADV_HUGEPAGE / - * VM_HUGEPAGE). + * VM_HUGEPAGE, MADV_COLLAPSE). */ # define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1) #define PR_GET_THP_DISABLE 42 diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 899d9ac86ecd..d89992b65acc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -104,7 +104,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, { const bool smaps = type == TVA_SMAPS; const bool in_pf = type == TVA_PAGEFAULT; - const bool enforce_sysfs = type != TVA_FORCED_COLLAPSE; + const bool forced_collapse = type == TVA_FORCED_COLLAPSE; + const bool enforce_sysfs = !forced_collapse; unsigned long supported_orders; /* Check the intersection of requested and supported orders. */ @@ -122,7 +123,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, if (!vma->vm_mm) /* vdso */ return 0; - if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags)) + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags, forced_collapse)) return 0; /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ diff --git a/mm/memory.c b/mm/memory.c index 7b1e8f137fa3..d9de6c056179 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5332,9 +5332,11 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa * It is too late to allocate a small folio, we already have a large * folio in the pagecache: especially s390 KVM cannot tolerate any * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any - * PMD mappings if THPs are disabled. + * PMD mappings if THPs are disabled. As we already have a THP, + * behave as if we are forcing a collapse. 
*/ - if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags, + /* forced_collapse=*/ true)) return ret; if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) diff --git a/mm/shmem.c b/mm/shmem.c index e2c76a30802b..d945de3a7f0e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1817,7 +1817,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, vm_flags_t vm_flags = vma ? vma->vm_flags : 0; unsigned int global_orders; - if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags))) + if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force))) return 0; global_orders = shmem_huge_global_enabled(inode, index, write_end, -- cgit v1.2.3 From f367474b5884edbc42661e7fecf784cb131dd25d Mon Sep 17 00:00:00 2001 From: Brian Mak Date: Tue, 5 Aug 2025 14:15:27 -0700 Subject: x86/kexec: carry forward the boot DTB on kexec Currently, the kexec_file_load syscall on x86 does not support passing a device tree blob to the new kernel. Some embedded x86 systems use device trees. On these systems, failing to pass a device tree to the new kernel causes a boot failure. To add support for this, we follow the behavior of ARM64 and PowerPC and copy the current boot's device tree blob for use in the new kernel. We do this on x86 by passing the device tree blob as a setup_data entry in accordance with the x86 boot protocol. This behavior is gated behind the KEXEC_FILE_FORCE_DTB flag. Link: https://lkml.kernel.org/r/20250805211527.122367-3-makb@juniper.net Signed-off-by: Brian Mak Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Petkov Cc: Dave Young Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Rob Herring Cc: Saravana Kannan Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/x86/kernel/kexec-bzimage64.c | 47 ++++++++++++++++++++++++++++++++++++--- include/linux/kexec.h | 5 ++++- include/uapi/linux/kexec.h | 4 ++++ kernel/kexec_file.c | 1 + 4 files changed, 53 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 24a41f0e0cf1..c3244ac680d1 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include @@ -212,6 +214,28 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr, } #endif /* CONFIG_EFI */ +#ifdef CONFIG_OF_FLATTREE +static void setup_dtb(struct boot_params *params, + unsigned long params_load_addr, + unsigned int dtb_setup_data_offset) +{ + struct setup_data *sd = (void *)params + dtb_setup_data_offset; + unsigned long setup_data_phys, dtb_len; + + dtb_len = fdt_totalsize(initial_boot_params); + sd->type = SETUP_DTB; + sd->len = dtb_len; + + /* Carry over current boot DTB with setup_data */ + memcpy(sd->data, initial_boot_params, dtb_len); + + /* Add setup data */ + setup_data_phys = params_load_addr + dtb_setup_data_offset; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = setup_data_phys; +} +#endif /* CONFIG_OF_FLATTREE */ + static void setup_ima_state(const struct kimage *image, struct boot_params *params, unsigned long params_load_addr, @@ -336,6 +360,17 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, sizeof(struct efi_setup_data); #endif +#ifdef CONFIG_OF_FLATTREE + if (image->force_dtb && initial_boot_params) { + setup_dtb(params, params_load_addr, setup_data_offset); + setup_data_offset += sizeof(struct setup_data) + +
fdt_totalsize(initial_boot_params); + } else { + pr_debug("Not carrying over DTB, force_dtb = %d\n", + image->force_dtb); + } +#endif + if (IS_ENABLED(CONFIG_IMA_KEXEC)) { /* Setup IMA log buffer state */ setup_ima_state(image, params, params_load_addr, @@ -529,6 +564,12 @@ static void *bzImage64_load(struct kimage *image, char *kernel, sizeof(struct setup_data) + RNG_SEED_LENGTH; +#ifdef CONFIG_OF_FLATTREE + if (image->force_dtb && initial_boot_params) + kbuf.bufsz += sizeof(struct setup_data) + + fdt_totalsize(initial_boot_params); +#endif + if (IS_ENABLED(CONFIG_IMA_KEXEC)) kbuf.bufsz += sizeof(struct setup_data) + sizeof(struct ima_setup_data); @@ -537,7 +578,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel, kbuf.bufsz += sizeof(struct setup_data) + sizeof(struct kho_data); - params = kzalloc(kbuf.bufsz, GFP_KERNEL); + params = kvzalloc(kbuf.bufsz, GFP_KERNEL); if (!params) return ERR_PTR(-ENOMEM); efi_map_offset = params_cmdline_sz; @@ -647,7 +688,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel, return ldata; out_free_params: - kfree(params); + kvfree(params); return ERR_PTR(ret); } @@ -659,7 +700,7 @@ static int bzImage64_cleanup(void *loader_data) if (!ldata) return 0; - kfree(ldata->bootparams_buf); + kvfree(ldata->bootparams_buf); ldata->bootparams_buf = NULL; return 0; diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 39fe3e6cd282..ff7e231b0485 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -395,6 +395,9 @@ struct kimage { /* Information for loading purgatory */ struct purgatory_info purgatory_info; + + /* Force carrying over the DTB from the current boot */ + bool force_dtb; #endif #ifdef CONFIG_CRASH_HOTPLUG @@ -461,7 +464,7 @@ bool kexec_load_permitted(int kexec_image_type); /* List of defined/legal kexec file flags */ #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG | \ - KEXEC_FILE_NO_CMA) + KEXEC_FILE_NO_CMA | KEXEC_FILE_FORCE_DTB) /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index 8958ebfcff94..55749cb0b81d 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -22,12 +22,16 @@ * KEXEC_FILE_ON_CRASH : Load/unload operation belongs to kdump image. * KEXEC_FILE_NO_INITRAMFS : No initramfs is being loaded. Ignore the initrd * fd field. + * KEXEC_FILE_FORCE_DTB : Force carrying over the current boot's DTB to the new + * kernel on x86. This is already the default behavior on + * some other architectures, like ARM64 and PowerPC. */ #define KEXEC_FILE_UNLOAD 0x00000001 #define KEXEC_FILE_ON_CRASH 0x00000002 #define KEXEC_FILE_NO_INITRAMFS 0x00000004 #define KEXEC_FILE_DEBUG 0x00000008 #define KEXEC_FILE_NO_CMA 0x00000010 +#define KEXEC_FILE_FORCE_DTB 0x00000020 /* These values match the ELF architecture values. * Unless there is a good reason that should continue to be the case. 
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 91d46502a817..eb62a9794242 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -255,6 +255,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, } image->no_cma = !!(flags & KEXEC_FILE_NO_CMA); + image->force_dtb = flags & KEXEC_FILE_FORCE_DTB; if (cmdline_len) { image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len); -- cgit v1.2.3 From 08a72a220e960e7f153a810fb633638afd0b7563 Mon Sep 17 00:00:00 2001 From: Angela Czubak Date: Mon, 18 Aug 2025 23:08:43 +0000 Subject: Input: add FF_HAPTIC effect type FF_HAPTIC effect type can be used to trigger haptic feedback with HID simple haptic usages. Signed-off-by: Angela Czubak Co-developed-by: Jonathan Denose Signed-off-by: Jonathan Denose Acked-by: Dmitry Torokhov Signed-off-by: Benjamin Tissoires --- include/uapi/linux/input.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h index 127119c287cf..6aa703fcfcfb 100644 --- a/include/uapi/linux/input.h +++ b/include/uapi/linux/input.h @@ -429,6 +429,24 @@ struct ff_rumble_effect { __u16 weak_magnitude; }; +/** + * struct ff_haptic_effect + * @hid_usage: hid_usage according to Haptics page (WAVEFORM_CLICK, etc.) + * @vendor_id: the waveform vendor ID if hid_usage is in the vendor-defined range + * @vendor_waveform_page: the vendor waveform page if hid_usage is in the vendor-defined range + * @intensity: strength of the effect as percentage + * @repeat_count: number of times to retrigger effect + * @retrigger_period: time before effect is retriggered (in ms) + */ +struct ff_haptic_effect { + __u16 hid_usage; + __u16 vendor_id; + __u8 vendor_waveform_page; + __u16 intensity; + __u16 repeat_count; + __u16 retrigger_period; +}; + /** * struct ff_effect - defines force feedback effect * @type: type of the effect (FF_CONSTANT, FF_PERIODIC, FF_RAMP, FF_SPRING, @@ -465,6 +483,7 @@ struct ff_effect { struct ff_periodic_effect periodic; struct ff_condition_effect condition[2]; /* One for each axis */ struct ff_rumble_effect rumble; + struct ff_haptic_effect haptic; } u; }; @@ -472,6 +491,7 @@ struct ff_effect { * Force feedback effect types */ +#define FF_HAPTIC 0x4f #define FF_RUMBLE 0x50 #define FF_PERIODIC 0x51 #define FF_CONSTANT 0x52 @@ -481,7 +501,7 @@ struct ff_effect { #define FF_INERTIA 0x56 #define FF_RAMP 0x57 -#define FF_EFFECT_MIN FF_RUMBLE +#define FF_EFFECT_MIN FF_HAPTIC #define FF_EFFECT_MAX FF_RAMP /* -- cgit v1.2.3 From 7075ae4ac9db93a3e762f3c2793ad57dbbf8a120 Mon Sep 17 00:00:00 2001 From: Angela Czubak Date: Mon, 18 Aug 2025 23:08:44 +0000 Subject: Input: add INPUT_PROP_HAPTIC_TOUCHPAD INPUT_PROP_HAPTIC_TOUCHPAD property is to be set for a device with simple haptic capabilities. Signed-off-by: Angela Czubak Co-developed-by: Jonathan Denose Signed-off-by: Jonathan Denose Acked-by: Dmitry Torokhov Reviewed-by: Randy Dunlap Signed-off-by: Benjamin Tissoires --- Documentation/input/event-codes.rst | 14 ++++++++++++++ include/uapi/linux/input-event-codes.h | 1 + 2 files changed, 15 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/input/event-codes.rst b/Documentation/input/event-codes.rst index b4557462edd7..1ead9bb8d9c6 100644 --- a/Documentation/input/event-codes.rst +++ b/Documentation/input/event-codes.rst @@ -400,6 +400,20 @@ can report through the rotational axes (absolute and/or relative rx, ry, rz). 
All other axes retain their meaning. A device must not mix regular directional axes and accelerometer axes on the same event node. +INPUT_PROP_HAPTIC_TOUCHPAD +-------------------------- + +The INPUT_PROP_HAPTIC_TOUCHPAD property indicates that the device: +- supports simple haptic auto and manual triggering +- can differentiate between at least 5 fingers +- uses correct resolution for the X/Y (units and value) +- reports correct force per touch, and correct units for them (newtons or grams) +- follows the MT protocol type B + +Summing up, such devices follow the MS spec for input devices in +Win8 and Win8.1, and in addition support the Simple haptic controller HID table, +and report correct units for the pressure. + Guidelines ========== diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index ca5851e97fac..4a9fbf42aa9f 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -27,6 +27,7 @@ #define INPUT_PROP_TOPBUTTONPAD 0x04 /* softbuttons at top of pad */ #define INPUT_PROP_POINTING_STICK 0x05 /* is a pointing stick */ #define INPUT_PROP_ACCELEROMETER 0x06 /* has accelerometer */ +#define INPUT_PROP_HAPTIC_TOUCHPAD 0x07 /* is a haptic touchpad */ #define INPUT_PROP_MAX 0x1f #define INPUT_PROP_CNT (INPUT_PROP_MAX + 1) -- cgit v1.2.3 From 54a53e95a908a4cc770f0530c49f04c89e7b18dc Mon Sep 17 00:00:00 2001 From: Amirreza Zarrabi Date: Thu, 11 Sep 2025 21:07:44 -0700 Subject: tee: add TEE_IOCTL_PARAM_ATTR_TYPE_UBUF For drivers that can transfer data to the TEE without using shared memory from the client, it is necessary to receive the user address directly, bypassing any processing by the TEE subsystem. Introduce TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT/OUTPUT/INOUT to represent userspace buffers.
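From userspace, a client would describe such a buffer roughly as follows (a sketch only; tee_fd handling and the surrounding invoke arguments are omitted, and buf is an arbitrary example buffer):

    #include <stdint.h>
    #include <linux/tee.h>

    char buf[256];

    struct tee_ioctl_param param = {
            .attr = TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT,
            .a = (uintptr_t)buf,    /* address of the user buffer */
            .b = sizeof(buf),       /* size of the user buffer */
            .c = 0,                 /* unused for ubuf parameters */
    };

For the OUTPUT and INOUT variants, the driver writes the updated size back into @b, mirroring the existing memref behavior.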
Reviewed-by: Sumit Garg Tested-by: Neil Armstrong Tested-by: Harshal Dev Signed-off-by: Amirreza Zarrabi Signed-off-by: Jens Wiklander --- drivers/tee/tee_core.c | 33 +++++++++++++++++++++++++++++++++ include/linux/tee_drv.h | 6 ++++++ include/uapi/linux/tee.h | 22 ++++++++++++++++------ 3 files changed, 55 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index 5a7fce5b6007..529738565ebd 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -494,6 +494,17 @@ static int params_from_user(struct tee_context *ctx, struct tee_param *params, params[n].u.value.b = ip.b; params[n].u.value.c = ip.c; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT: + params[n].u.ubuf.uaddr = u64_to_user_ptr(ip.a); + params[n].u.ubuf.size = ip.b; + + if (!access_ok(params[n].u.ubuf.uaddr, + params[n].u.ubuf.size)) + return -EFAULT; + + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: @@ -527,6 +538,11 @@ static int params_to_user(struct tee_ioctl_param __user *uparams, put_user(p->u.value.c, &up->c)) return -EFAULT; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT: + if (put_user((u64)p->u.ubuf.size, &up->b)) + return -EFAULT; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: if (put_user((u64)p->u.memref.size, &up->b)) @@ -727,6 +743,13 @@ static int params_to_supp(struct tee_context *ctx, ip.b = p->u.value.b; ip.c = p->u.value.c; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT: + ip.a = (__force unsigned long)p->u.ubuf.uaddr; + ip.b = p->u.ubuf.size; + ip.c = 0; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: @@ -829,6 +852,16 @@ static int params_from_supp(struct tee_param *params, size_t num_params, p->u.value.b = ip.b; p->u.value.c = ip.c; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT: + p->u.ubuf.uaddr = u64_to_user_ptr(ip.a); + p->u.ubuf.size = ip.b; + + if (!access_ok(params[n].u.ubuf.uaddr, + params[n].u.ubuf.size)) + return -EFAULT; + + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: /* diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index 824f1251de60..7915e8869cbd 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -82,6 +82,11 @@ struct tee_param_memref { struct tee_shm *shm; }; +struct tee_param_ubuf { + void __user *uaddr; + size_t size; +}; + struct tee_param_value { u64 a; u64 b; @@ -92,6 +97,7 @@ struct tee_param { u64 attr; union { struct tee_param_memref memref; + struct tee_param_ubuf ubuf; struct tee_param_value value; } u; }; diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index d843cf980d98..0e3b735dcfca 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -151,6 +151,13 @@ struct tee_ioctl_buf_data { #define TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT 6 #define TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT 7 /* input and output */ +/* + * These defines represent userspace buffer parameters.
+ */ +#define TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT 8 +#define TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT 9 +#define TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT 10 /* input and output */ + /* * Mask for the type part of the attribute, leaves room for more types */ @@ -186,14 +193,17 @@ struct tee_ioctl_buf_data { /** * struct tee_ioctl_param - parameter * @attr: attributes - * @a: if a memref, offset into the shared memory object, else a value parameter - * @b: if a memref, size of the buffer, else a value parameter + * @a: if a memref, offset into the shared memory object, + * else if a ubuf, address of the user buffer, + * else a value parameter + * @b: if a memref or ubuf, size of the buffer, else a value parameter * @c: if a memref, shared memory identifier, else a value parameter * - * @attr & TEE_PARAM_ATTR_TYPE_MASK indicates if memref or value is used in - * the union. TEE_PARAM_ATTR_TYPE_VALUE_* indicates value and - * TEE_PARAM_ATTR_TYPE_MEMREF_* indicates memref. TEE_PARAM_ATTR_TYPE_NONE - * indicates that none of the members are used. + * @attr & TEE_PARAM_ATTR_TYPE_MASK indicates if memref, ubuf, or value is + * used in the union. TEE_PARAM_ATTR_TYPE_VALUE_* indicates value, + * TEE_PARAM_ATTR_TYPE_MEMREF_* indicates memref, and TEE_PARAM_ATTR_TYPE_UBUF_* + * indicates ubuf. TEE_PARAM_ATTR_TYPE_NONE indicates that none of the members + * are used. * * Shared memory is allocated with TEE_IOC_SHM_ALLOC which returns an * identifier representing the shared memory object. A memref can reference -- cgit v1.2.3 From d5b8b0fa1775d8b59c3fc9e4aa2baa715d08f3ee Mon Sep 17 00:00:00 2001 From: Amirreza Zarrabi Date: Thu, 11 Sep 2025 21:07:45 -0700 Subject: tee: add TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF The TEE subsystem allows session-based access to trusted services, requiring a session to be established to receive a service. This is not suitable for an environment that represents services as objects. An object supports various operations that a client can invoke, potentially generating a result or a new object that can be invoked independently of the original object. Add TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT/OUTPUT/INOUT to represent an object. Objects may reside in either TEE or userspace. To invoke an object in TEE, introduce a new ioctl. Use the existing SUPPL_RECV and SUPPL_SEND to invoke an object in userspace. 
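A hedged userspace sketch of the new ioctl (tee_fd, object_id, and MY_OP are placeholders; the buffer layout matches the buf_len check in tee_ioctl_object_invoke() below):

    #include <err.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <linux/tee.h>

    size_t sz = sizeof(struct tee_ioctl_object_invoke_arg) +
                sizeof(struct tee_ioctl_param);
    struct tee_ioctl_object_invoke_arg *arg = calloc(1, sz);

    arg->id = object_id;    /* placeholder: id of the QTEE object */
    arg->op = MY_OP;        /* placeholder: operation, object specific */
    arg->num_params = 1;
    arg->params[0].attr = TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT;

    struct tee_ioctl_buf_data data = {
            .buf_ptr = (uintptr_t)arg,
            .buf_len = sz,
    };

    if (ioctl(tee_fd, TEE_IOC_OBJECT_INVOKE, &data))
            err(1, "TEE_IOC_OBJECT_INVOKE");
    /* On success, arg->params[0].a holds the id of any returned object. */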
Reviewed-by: Sumit Garg Tested-by: Neil Armstrong Tested-by: Harshal Dev Signed-off-by: Amirreza Zarrabi Signed-off-by: Jens Wiklander --- drivers/tee/tee_core.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/tee_core.h | 4 +++ include/linux/tee_drv.h | 6 ++++ include/uapi/linux/tee.h | 41 +++++++++++++++++++---- 4 files changed, 130 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index 529738565ebd..71b1bd69f067 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -487,6 +487,7 @@ static int params_from_user(struct tee_context *ctx, struct tee_param *params, switch (ip.attr & TEE_IOCTL_PARAM_ATTR_TYPE_MASK) { case TEE_IOCTL_PARAM_ATTR_TYPE_NONE: case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT: break; case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT: @@ -505,6 +506,11 @@ static int params_from_user(struct tee_context *ctx, struct tee_param *params, return -EFAULT; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INOUT: + params[n].u.objref.id = ip.a; + params[n].u.objref.flags = ip.b; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: @@ -543,6 +549,12 @@ static int params_to_user(struct tee_ioctl_param __user *uparams, if (put_user((u64)p->u.ubuf.size, &up->b)) return -EFAULT; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INOUT: + if (put_user(p->u.objref.id, &up->a) || + put_user(p->u.objref.flags, &up->b)) + return -EFAULT; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: if (put_user((u64)p->u.memref.size, &up->b)) @@ -695,6 +707,66 @@ out: return rc; } +static int tee_ioctl_object_invoke(struct tee_context *ctx, + struct tee_ioctl_buf_data __user *ubuf) +{ + int rc; + size_t n; + struct tee_ioctl_buf_data buf; + struct tee_ioctl_object_invoke_arg __user *uarg; + struct tee_ioctl_object_invoke_arg arg; + struct tee_ioctl_param __user *uparams = NULL; + struct tee_param *params = NULL; + + if (!ctx->teedev->desc->ops->object_invoke_func) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, sizeof(buf))) + return -EFAULT; + + if (buf.buf_len > TEE_MAX_ARG_SIZE || + buf.buf_len < sizeof(struct tee_ioctl_object_invoke_arg)) + return -EINVAL; + + uarg = u64_to_user_ptr(buf.buf_ptr); + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (sizeof(arg) + TEE_IOCTL_PARAM_SIZE(arg.num_params) != buf.buf_len) + return -EINVAL; + + if (arg.num_params) { + params = kcalloc(arg.num_params, sizeof(struct tee_param), + GFP_KERNEL); + if (!params) + return -ENOMEM; + uparams = uarg->params; + rc = params_from_user(ctx, params, arg.num_params, uparams); + if (rc) + goto out; + } + + rc = ctx->teedev->desc->ops->object_invoke_func(ctx, &arg, params); + if (rc) + goto out; + + if (put_user(arg.ret, &uarg->ret)) { + rc = -EFAULT; + goto out; + } + rc = params_to_user(uparams, arg.num_params, params); +out: + if (params) { + /* Decrease ref count for all valid shared memory pointers */ + for (n = 0; n < arg.num_params; n++) + if (tee_param_is_memref(params + n) && + params[n].u.memref.shm) + tee_shm_put(params[n].u.memref.shm); + kfree(params); + } + return rc; +} + static int tee_ioctl_cancel(struct tee_context *ctx, struct tee_ioctl_cancel_arg __user *uarg) { @@ 
-750,6 +822,12 @@ static int params_to_supp(struct tee_context *ctx, ip.b = p->u.ubuf.size; ip.c = 0; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INOUT: + ip.a = p->u.objref.id; + ip.b = p->u.objref.flags; + ip.c = 0; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: @@ -862,6 +940,11 @@ static int params_from_supp(struct tee_param *params, size_t num_params, return -EFAULT; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INOUT: + p->u.objref.id = ip.a; + p->u.objref.flags = ip.b; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: /* @@ -944,6 +1027,8 @@ static long tee_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return tee_ioctl_open_session(ctx, uarg); case TEE_IOC_INVOKE: return tee_ioctl_invoke(ctx, uarg); + case TEE_IOC_OBJECT_INVOKE: + return tee_ioctl_object_invoke(ctx, uarg); case TEE_IOC_CANCEL: return tee_ioctl_cancel(ctx, uarg); case TEE_IOC_CLOSE_SESSION: diff --git a/include/linux/tee_core.h b/include/linux/tee_core.h index 456a940d4710..1f3e5dad6d0d 100644 --- a/include/linux/tee_core.h +++ b/include/linux/tee_core.h @@ -83,6 +83,7 @@ struct tee_device { * @close_session: close a session * @system_session: declare session as a system session * @invoke_func: invoke a trusted function + * @object_invoke_func: invoke a TEE object * @cancel_req: request cancel of an ongoing invoke or open * @supp_recv: called for supplicant to get a command * @supp_send: called for supplicant to send a response @@ -108,6 +109,9 @@ struct tee_driver_ops { int (*invoke_func)(struct tee_context *ctx, struct tee_ioctl_invoke_arg *arg, struct tee_param *param); + int (*object_invoke_func)(struct tee_context *ctx, + struct tee_ioctl_object_invoke_arg *arg, + struct tee_param *param); int (*cancel_req)(struct tee_context *ctx, u32 cancel_id, u32 session); int (*supp_recv)(struct tee_context *ctx, u32 *func, u32 *num_params, struct tee_param *param); diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index 7915e8869cbd..88a6f9697c89 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -87,6 +87,11 @@ struct tee_param_ubuf { size_t size; }; +struct tee_param_objref { + u64 id; + u64 flags; +}; + struct tee_param_value { u64 a; u64 b; @@ -97,6 +102,7 @@ struct tee_param { u64 attr; union { struct tee_param_memref memref; + struct tee_param_objref objref; struct tee_param_ubuf ubuf; struct tee_param_value value; } u; }; diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index 0e3b735dcfca..9abb0f299549 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -48,8 +48,10 @@ #define TEE_GEN_CAP_PRIVILEGED (1 << 1)/* Privileged device (for supplicant) */ #define TEE_GEN_CAP_REG_MEM (1 << 2)/* Supports registering shared memory */ #define TEE_GEN_CAP_MEMREF_NULL (1 << 3)/* NULL MemRef support */ +#define TEE_GEN_CAP_OBJREF (1 << 4)/* Supports generic object reference */ -#define TEE_MEMREF_NULL (__u64)(-1) /* NULL MemRef Buffer */ +#define TEE_MEMREF_NULL ((__u64)(-1)) /* NULL MemRef Buffer */ +#define TEE_OBJREF_NULL ((__u64)(-1)) /* NULL ObjRef Object */ /* * TEE Implementation ID @@ -158,6 +160,13 @@ struct tee_ioctl_buf_data { #define TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT 9 #define TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT 10 /* input and output */ +/* + * These defines represent object reference parameters.
+ */ +#define TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT 11 +#define TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT 12 +#define TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INOUT 13 + /* * Mask for the type part of the attribute, leaves room for more types */ @@ -195,15 +204,16 @@ struct tee_ioctl_buf_data { * @attr: attributes * @a: if a memref, offset into the shared memory object, * else if a ubuf, address of the user buffer, - * else a value parameter - * @b: if a memref or ubuf, size of the buffer, else a value parameter + * else if an objref, object identifier, else a value parameter + * @b: if a memref or ubuf, size of the buffer, + * else if objref, flags for the object, else a value parameter * @c: if a memref, shared memory identifier, else a value parameter * * @attr & TEE_PARAM_ATTR_TYPE_MASK indicates if memref, ubuf, or value is * used in the union. TEE_PARAM_ATTR_TYPE_VALUE_* indicates value, - * TEE_PARAM_ATTR_TYPE_MEMREF_* indicates memref, and TEE_PARAM_ATTR_TYPE_UBUF_* - * indicates ubuf. TEE_PARAM_ATTR_TYPE_NONE indicates that none of the members - * are used. + * TEE_PARAM_ATTR_TYPE_MEMREF_* indicates memref, TEE_PARAM_ATTR_TYPE_UBUF_* + * indicates ubuf, and TEE_PARAM_ATTR_TYPE_OBJREF_* indicates objref. + * TEE_PARAM_ATTR_TYPE_NONE indicates that none of the members are used. * * Shared memory is allocated with TEE_IOC_SHM_ALLOC which returns an * identifier representing the shared memory object. A memref can reference @@ -442,4 +452,23 @@ struct tee_ioctl_shm_register_fd_data { * munmap(): unmaps previously shared memory */ +/** + * struct tee_ioctl_object_invoke_arg - Invokes an object in a Trusted Application + * @id: [in] Object id + * @op: [in] Object operation, specific to the object + * @ret: [out] return value + * @num_params: [in] number of parameters following this struct + */ +struct tee_ioctl_object_invoke_arg { + __u64 id; + __u32 op; + __u32 ret; + __u32 num_params; + /* num_params tells the actual number of elements in params */ + struct tee_ioctl_param params[]; +}; + +#define TEE_IOC_OBJECT_INVOKE _IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 10, \ + struct tee_ioctl_buf_data) + #endif /*__TEE_H*/ -- cgit v1.2.3 From bd5139306886a9626a7d794940376806eccb9547 Mon Sep 17 00:00:00 2001 From: Amirreza Zarrabi Date: Thu, 11 Sep 2025 21:07:46 -0700 Subject: tee: increase TEE_MAX_ARG_SIZE to 4096 Increase TEE_MAX_ARG_SIZE to accommodate worst-case scenarios where additional buffer space is required to pass all arguments to TEE. This change is necessary for upcoming support for Qualcomm TEE, which requires a larger buffer for argument marshaling. Reviewed-by: Sumit Garg Tested-by: Harshal Dev Signed-off-by: Amirreza Zarrabi Signed-off-by: Jens Wiklander --- include/uapi/linux/tee.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index 9abb0f299549..a5466b503bfe 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -42,7 +42,7 @@ #define TEE_IOC_MAGIC 0xa4 #define TEE_IOC_BASE 0 -#define TEE_MAX_ARG_SIZE 1024 +#define TEE_MAX_ARG_SIZE 4096 #define TEE_GEN_CAP_GP (1 << 0)/* GlobalPlatform compliant TEE */ #define TEE_GEN_CAP_PRIVILEGED (1 << 1)/* Privileged device (for supplicant) */ -- cgit v1.2.3 From d6e290837e50f73f88f31f19bd8a7213d92e6e46 Mon Sep 17 00:00:00 2001 From: Amirreza Zarrabi Date: Thu, 11 Sep 2025 21:07:47 -0700 Subject: tee: add Qualcomm TEE driver Introduce qcomtee_object, which represents an object in both QTEE and the kernel.
QTEE clients can invoke an instance of qcomtee_object to access QTEE
services. If this invocation produces a new object in QTEE, an instance
of qcomtee_object will be returned. Similarly, QTEE can request services
from the kernel by issuing a callback request, which invokes an instance
of qcomtee_object.

Implement initial support for exporting qcomtee_object to userspace and
QTEE, enabling the invocation of objects hosted in QTEE and userspace
through the TEE subsystem.

Tested-by: Neil Armstrong
Tested-by: Harshal Dev
Acked-by: Sumit Garg
Signed-off-by: Amirreza Zarrabi
Signed-off-by: Jens Wiklander
---
 MAINTAINERS                          |   6 +
 drivers/tee/Kconfig                  |   1 +
 drivers/tee/Makefile                 |   1 +
 drivers/tee/qcomtee/Kconfig          |  12 +
 drivers/tee/qcomtee/Makefile         |   7 +
 drivers/tee/qcomtee/async.c          | 182 +++++++
 drivers/tee/qcomtee/call.c           | 813 +++++++++++++++++++++++++++++++
 drivers/tee/qcomtee/core.c           | 906 +++++++++++++++++++++++++++++++++++
 drivers/tee/qcomtee/qcomtee.h        | 143 ++++++
 drivers/tee/qcomtee/qcomtee_msg.h    | 304 ++++++++++++
 drivers/tee/qcomtee/qcomtee_object.h | 316 ++++++++++++
 drivers/tee/qcomtee/shm.c            | 153 ++++++
 drivers/tee/qcomtee/user_obj.c       | 692 ++++++++++++++++++++++++++
 include/uapi/linux/tee.h             |   1 +
 14 files changed, 3537 insertions(+)
 create mode 100644 drivers/tee/qcomtee/Kconfig
 create mode 100644 drivers/tee/qcomtee/Makefile
 create mode 100644 drivers/tee/qcomtee/async.c
 create mode 100644 drivers/tee/qcomtee/call.c
 create mode 100644 drivers/tee/qcomtee/core.c
 create mode 100644 drivers/tee/qcomtee/qcomtee.h
 create mode 100644 drivers/tee/qcomtee/qcomtee_msg.h
 create mode 100644 drivers/tee/qcomtee/qcomtee_object.h
 create mode 100644 drivers/tee/qcomtee/shm.c
 create mode 100644 drivers/tee/qcomtee/user_obj.c

(limited to 'include/uapi/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index daf520a13bdf..bde449308736 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -20856,6 +20856,12 @@ F:	Documentation/networking/device_drivers/cellular/qualcomm/rmnet.rst
 F:	drivers/net/ethernet/qualcomm/rmnet/
 F:	include/linux/if_rmnet.h
 
+QUALCOMM TEE (QCOMTEE) DRIVER
+M:	Amirreza Zarrabi
+L:	linux-arm-msm@vger.kernel.org
+S:	Maintained
+F:	drivers/tee/qcomtee/
+
 QUALCOMM TRUST ZONE MEMORY ALLOCATOR
 M:	Bartosz Golaszewski
 L:	linux-arm-msm@vger.kernel.org
diff --git a/drivers/tee/Kconfig b/drivers/tee/Kconfig
index 90600607a9d8..35d207ef6910 100644
--- a/drivers/tee/Kconfig
+++ b/drivers/tee/Kconfig
@@ -21,5 +21,6 @@ config TEE_DMABUF_HEAPS
 source "drivers/tee/optee/Kconfig"
 source "drivers/tee/amdtee/Kconfig"
 source "drivers/tee/tstee/Kconfig"
+source "drivers/tee/qcomtee/Kconfig"
 
 endif
diff --git a/drivers/tee/Makefile b/drivers/tee/Makefile
index 949a6a79fb06..3239b91dee96 100644
--- a/drivers/tee/Makefile
+++ b/drivers/tee/Makefile
@@ -7,3 +7,4 @@ tee-objs += tee_shm_pool.o
 obj-$(CONFIG_OPTEE) += optee/
 obj-$(CONFIG_AMDTEE) += amdtee/
 obj-$(CONFIG_ARM_TSTEE) += tstee/
+obj-$(CONFIG_QCOMTEE) += qcomtee/
diff --git a/drivers/tee/qcomtee/Kconfig b/drivers/tee/qcomtee/Kconfig
new file mode 100644
index 000000000000..927686abceb1
--- /dev/null
+++ b/drivers/tee/qcomtee/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# Qualcomm Trusted Execution Environment Configuration
+config QCOMTEE
+	tristate "Qualcomm TEE Support"
+	depends on !CPU_BIG_ENDIAN
+	select QCOM_SCM
+	select QCOM_TZMEM_MODE_SHMBRIDGE
+	help
+	  This option enables the Qualcomm Trusted Execution Environment (QTEE)
+	  driver. It provides an API to access services offered by QTEE and
+	  its loaded Trusted Applications (TAs).
Additionally, it facilitates + the export of userspace services provided by supplicants to QTEE. diff --git a/drivers/tee/qcomtee/Makefile b/drivers/tee/qcomtee/Makefile new file mode 100644 index 000000000000..600af2b8f1c1 --- /dev/null +++ b/drivers/tee/qcomtee/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_QCOMTEE) += qcomtee.o +qcomtee-objs += async.o +qcomtee-objs += call.o +qcomtee-objs += core.o +qcomtee-objs += shm.o +qcomtee-objs += user_obj.o diff --git a/drivers/tee/qcomtee/async.c b/drivers/tee/qcomtee/async.c new file mode 100644 index 000000000000..31bff4309e67 --- /dev/null +++ b/drivers/tee/qcomtee/async.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "qcomtee.h" + +#define QCOMTEE_ASYNC_VERSION_1_0 0x00010000U /* Maj: 0x0001, Min: 0x0000. */ +#define QCOMTEE_ASYNC_VERSION_1_1 0x00010001U /* Maj: 0x0001, Min: 0x0001. */ +#define QCOMTEE_ASYNC_VERSION_1_2 0x00010002U /* Maj: 0x0001, Min: 0x0002. */ +#define QCOMTEE_ASYNC_VERSION_CURRENT QCOMTEE_ASYNC_VERSION_1_2 + +#define QCOMTEE_ASYNC_VERSION_MAJOR(n) upper_16_bits(n) +#define QCOMTEE_ASYNC_VERSION_MINOR(n) lower_16_bits(n) + +#define QCOMTEE_ASYNC_VERSION_CURRENT_MAJOR \ + QCOMTEE_ASYNC_VERSION_MAJOR(QCOMTEE_ASYNC_VERSION_CURRENT) +#define QCOMTEE_ASYNC_VERSION_CURRENT_MINOR \ + QCOMTEE_ASYNC_VERSION_MINOR(QCOMTEE_ASYNC_VERSION_CURRENT) + +/** + * struct qcomtee_async_msg_hdr - Asynchronous message header format. + * @version: current async protocol version of the remote endpoint. + * @op: async operation. + * + * @version specifies the endpoint's (QTEE or driver) supported async protocol. + * For example, if QTEE sets @version to %QCOMTEE_ASYNC_VERSION_1_1, QTEE + * handles operations supported in %QCOMTEE_ASYNC_VERSION_1_1 or + * %QCOMTEE_ASYNC_VERSION_1_0. @op determines the message format. + */ +struct qcomtee_async_msg_hdr { + u32 version; + u32 op; +}; + +/* Size of an empty async message. */ +#define QCOMTEE_ASYNC_MSG_ZERO sizeof(struct qcomtee_async_msg_hdr) + +/** + * struct qcomtee_async_release_msg - Release asynchronous message. + * @hdr: message header as &struct qcomtee_async_msg_hdr. + * @counts: number of objects in @object_ids. + * @object_ids: array of object IDs that should be released. + * + * Available in Maj = 0x0001, Min >= 0x0000. + */ +struct qcomtee_async_release_msg { + struct qcomtee_async_msg_hdr hdr; + u32 counts; + u32 object_ids[] __counted_by(counts); +}; + +/** + * qcomtee_get_async_buffer() - Get the start of the asynchronous message. + * @oic: context used for the current invocation. + * @async_buffer: return buffer to extract from or fill in async messages. + * + * If @oic is used for direct object invocation, the whole outbound buffer + * is available for the async message. If @oic is used for a callback request, + * the tail of the outbound buffer (after the callback request message) is + * available for the async message. + * + * The start of the async buffer is aligned, see qcomtee_msg_offset_align(). + */ +static void qcomtee_get_async_buffer(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_buffer *async_buffer) +{ + struct qcomtee_msg_callback *msg; + unsigned int offset; + int i; + + if (!(oic->flags & QCOMTEE_OIC_FLAG_BUSY)) { + /* The outbound buffer is empty. Using the whole buffer. 
+		 */
+		offset = 0;
+	} else {
+		msg = (struct qcomtee_msg_callback *)oic->out_msg.addr;
+
+		/* Start offset in a message for buffer arguments. */
+		offset = qcomtee_msg_buffer_args(struct qcomtee_msg_callback,
+						 qcomtee_msg_args(msg));
+
+		/* Add size of IB arguments. */
+		qcomtee_msg_for_each_input_buffer(i, msg)
+			offset += qcomtee_msg_offset_align(msg->args[i].b.size);
+
+		/* Add size of OB arguments. */
+		qcomtee_msg_for_each_output_buffer(i, msg)
+			offset += qcomtee_msg_offset_align(msg->args[i].b.size);
+	}
+
+	async_buffer->addr = oic->out_msg.addr + offset;
+	async_buffer->size = oic->out_msg.size - offset;
+}
+
+/**
+ * async_release() - Process QTEE async release requests.
+ * @oic: context used for the current invocation.
+ * @async_msg: async message for object release.
+ * @size: size of the async buffer available.
+ *
+ * Return: Size of the outbound buffer used when processing @async_msg.
+ */
+static size_t async_release(struct qcomtee_object_invoke_ctx *oic,
+			    struct qcomtee_async_msg_hdr *async_msg,
+			    size_t size)
+{
+	struct qcomtee_async_release_msg *msg;
+	struct qcomtee_object *object;
+	int i;
+
+	msg = (struct qcomtee_async_release_msg *)async_msg;
+
+	for (i = 0; i < msg->counts; i++) {
+		object = qcomtee_idx_erase(oic, msg->object_ids[i]);
+		qcomtee_object_put(object);
+	}
+
+	return struct_size(msg, object_ids, msg->counts);
+}
+
+/**
+ * qcomtee_fetch_async_reqs() - Fetch and process asynchronous messages.
+ * @oic: context used for the current invocation.
+ *
+ * Calls handlers to process the requested operations in the async message.
+ * Currently, only supports async release requests.
+ */
+void qcomtee_fetch_async_reqs(struct qcomtee_object_invoke_ctx *oic)
+{
+	struct qcomtee_async_msg_hdr *async_msg;
+	struct qcomtee_buffer async_buffer;
+	size_t consumed, used = 0;
+	u16 major_ver;
+
+	qcomtee_get_async_buffer(oic, &async_buffer);
+
+	while (async_buffer.size - used > QCOMTEE_ASYNC_MSG_ZERO) {
+		async_msg = (struct qcomtee_async_msg_hdr *)(async_buffer.addr +
+							     used);
+		/*
+		 * QTEE assumes that the unused space of the async buffer is
+		 * zeroed; so if version is zero, the buffer is unused.
+		 */
+		if (async_msg->version == 0)
+			goto out;
+
+		major_ver = QCOMTEE_ASYNC_VERSION_MAJOR(async_msg->version);
+		/* Major version mismatch is a compatibility break. */
+		if (major_ver != QCOMTEE_ASYNC_VERSION_CURRENT_MAJOR) {
+			pr_err("Async message version mismatch (%u != %u)\n",
+			       major_ver, QCOMTEE_ASYNC_VERSION_CURRENT_MAJOR);
+
+			goto out;
+		}
+
+		switch (async_msg->op) {
+		case QCOMTEE_MSG_OBJECT_OP_RELEASE:
+			consumed = async_release(oic, async_msg,
+						 async_buffer.size - used);
+			break;
+		default:
+			pr_err("Unsupported async message %u\n", async_msg->op);
+			goto out;
+		}
+
+		/* Supported operation but unable to parse the message. */
+		if (!consumed) {
+			pr_err("Unable to parse async message for op %u\n",
+			       async_msg->op);
+			goto out;
+		}
+
+		/* Next async message. */
+		used += qcomtee_msg_offset_align(consumed);
+	}
+
+out:
+	/* Reset the async buffer so async requests do not loop to QTEE. */
+	memzero_explicit(async_buffer.addr, async_buffer.size);
+}
diff --git a/drivers/tee/qcomtee/call.c b/drivers/tee/qcomtee/call.c
new file mode 100644
index 000000000000..33daa4d7033d
--- /dev/null
+++ b/drivers/tee/qcomtee/call.c
@@ -0,0 +1,813 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include + +#include "qcomtee.h" + +static int find_qtee_object(struct qcomtee_object **object, unsigned long id, + struct qcomtee_context_data *ctxdata) +{ + int err = 0; + + guard(rcu)(); + /* Object release is RCU protected. */ + *object = idr_find(&ctxdata->qtee_objects_idr, id); + if (!qcomtee_object_get(*object)) + err = -EINVAL; + + return err; +} + +static void del_qtee_object(unsigned long id, + struct qcomtee_context_data *ctxdata) +{ + struct qcomtee_object *object; + + scoped_guard(mutex, &ctxdata->qtee_lock) + object = idr_remove(&ctxdata->qtee_objects_idr, id); + + qcomtee_object_put(object); +} + +/** + * qcomtee_context_add_qtee_object() - Add a QTEE object to the context. + * @param: TEE parameter representing @object. + * @object: QTEE object. + * @ctx: context to add the object. + * + * It assumes @object is %QCOMTEE_OBJECT_TYPE_TEE and the caller has already + * issued qcomtee_object_get() for @object. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_context_add_qtee_object(struct tee_param *param, + struct qcomtee_object *object, + struct tee_context *ctx) +{ + int ret; + struct qcomtee_context_data *ctxdata = ctx->data; + + scoped_guard(mutex, &ctxdata->qtee_lock) + ret = idr_alloc(&ctxdata->qtee_objects_idr, object, 0, 0, + GFP_KERNEL); + if (ret < 0) + return ret; + + param->u.objref.id = ret; + /* QTEE Object: QCOMTEE_OBJREF_FLAG_TEE set. */ + param->u.objref.flags = QCOMTEE_OBJREF_FLAG_TEE; + + return 0; +} + +/* Retrieve the QTEE object added with qcomtee_context_add_qtee_object(). */ +int qcomtee_context_find_qtee_object(struct qcomtee_object **object, + struct tee_param *param, + struct tee_context *ctx) +{ + struct qcomtee_context_data *ctxdata = ctx->data; + + return find_qtee_object(object, param->u.objref.id, ctxdata); +} + +/** + * qcomtee_context_del_qtee_object() - Delete a QTEE object from the context. + * @param: TEE parameter representing @object. + * @ctx: context for deleting the object. + * + * The @param has been initialized by qcomtee_context_add_qtee_object(). + */ +void qcomtee_context_del_qtee_object(struct tee_param *param, + struct tee_context *ctx) +{ + struct qcomtee_context_data *ctxdata = ctx->data; + /* 'qtee_objects_idr' stores QTEE objects only. */ + if (param->u.objref.flags & QCOMTEE_OBJREF_FLAG_TEE) + del_qtee_object(param->u.objref.id, ctxdata); +} + +/** + * qcomtee_objref_to_arg() - Convert OBJREF parameter to QTEE argument. + * @arg: QTEE argument. + * @param: TEE parameter. + * @ctx: context in which the conversion should happen. + * + * It assumes @param is an OBJREF. + * It does not set @arg.type; the caller should initialize it to a correct + * &enum qcomtee_arg_type value. It gets the object's refcount in @arg; + * the caller should manage to put it afterward. + * + * Return: On success, returns 0; on failure, returns < 0. 
+ */ +int qcomtee_objref_to_arg(struct qcomtee_arg *arg, struct tee_param *param, + struct tee_context *ctx) +{ + int err = -EINVAL; + + arg->o = NULL_QCOMTEE_OBJECT; + /* param is a NULL object: */ + if (param->u.objref.id == TEE_OBJREF_NULL) + return 0; + + /* param is a callback object: */ + if (param->u.objref.flags & QCOMTEE_OBJREF_FLAG_USER) + err = qcomtee_user_param_to_object(&arg->o, param, ctx); + /* param is a QTEE object: */ + else if (param->u.objref.flags & QCOMTEE_OBJREF_FLAG_TEE) + err = qcomtee_context_find_qtee_object(&arg->o, param, ctx); + + /* + * For callback objects, call qcomtee_object_get() to keep a temporary + * copy for the driver, as these objects are released asynchronously + * and may disappear even before returning from QTEE. + * + * - For direct object invocations, the matching put is called in + * qcomtee_object_invoke() when parsing the QTEE response. + * - For callback responses, put is called in qcomtee_user_object_notify() + * after QTEE has received its copies. + */ + + if (!err && (typeof_qcomtee_object(arg->o) == QCOMTEE_OBJECT_TYPE_CB)) + qcomtee_object_get(arg->o); + + return err; +} + +/** + * qcomtee_objref_from_arg() - Convert QTEE argument to OBJREF param. + * @param: TEE parameter. + * @arg: QTEE argument. + * @ctx: context in which the conversion should happen. + * + * It assumes @arg is of %QCOMTEE_ARG_TYPE_IO or %QCOMTEE_ARG_TYPE_OO. + * It does not set @param.attr; the caller should initialize it to a + * correct type. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_objref_from_arg(struct tee_param *param, struct qcomtee_arg *arg, + struct tee_context *ctx) +{ + struct qcomtee_object *object = arg->o; + + switch (typeof_qcomtee_object(object)) { + case QCOMTEE_OBJECT_TYPE_NULL: + param->u.objref.id = TEE_OBJREF_NULL; + + return 0; + case QCOMTEE_OBJECT_TYPE_CB: + /* object is a callback object: */ + if (is_qcomtee_user_object(object)) + return qcomtee_user_param_from_object(param, object, + ctx); + + break; + case QCOMTEE_OBJECT_TYPE_TEE: + return qcomtee_context_add_qtee_object(param, object, ctx); + + case QCOMTEE_OBJECT_TYPE_ROOT: + default: + break; + } + + return -EINVAL; +} + +/** + * qcomtee_params_to_args() - Convert TEE parameters to QTEE arguments. + * @u: QTEE arguments. + * @params: TEE parameters. + * @num_params: number of elements in the parameter array. + * @ctx: context in which the conversion should happen. + * + * It assumes @u has at least @num_params + 1 entries and has been initialized + * with %QCOMTEE_ARG_TYPE_INV as &struct qcomtee_arg.type. + * + * Return: On success, returns 0; on failure, returns < 0. 
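+ *
+ * A minimal calling sketch (editor's illustration; it mirrors the use in
+ * qcomtee_object_invoke() below, where kcalloc() zero-fills the array so
+ * the extra slot ends it with %QCOMTEE_ARG_TYPE_INV)::
+ *
+ *	struct qcomtee_arg *u;
+ *
+ *	u = kcalloc(num_params + 1, sizeof(*u), GFP_KERNEL);
+ *	if (!u)
+ *		return -ENOMEM;
+ *	ret = qcomtee_params_to_args(u, params, num_params, ctx);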
+ */
+static int qcomtee_params_to_args(struct qcomtee_arg *u,
+				  struct tee_param *params, int num_params,
+				  struct tee_context *ctx)
+{
+	int i;
+
+	for (i = 0; i < num_params; i++) {
+		switch (params[i].attr) {
+		case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT:
+		case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT:
+			u[i].flags = QCOMTEE_ARG_FLAGS_UADDR;
+			u[i].b.uaddr = params[i].u.ubuf.uaddr;
+			u[i].b.size = params[i].u.ubuf.size;
+
+			if (params[i].attr ==
+			    TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT)
+				u[i].type = QCOMTEE_ARG_TYPE_IB;
+			else /* TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT */
+				u[i].type = QCOMTEE_ARG_TYPE_OB;
+
+			break;
+		case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT:
+			u[i].type = QCOMTEE_ARG_TYPE_IO;
+			if (qcomtee_objref_to_arg(&u[i], &params[i], ctx))
+				goto out_failed;
+
+			break;
+		case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT:
+			u[i].type = QCOMTEE_ARG_TYPE_OO;
+			u[i].o = NULL_QCOMTEE_OBJECT;
+			break;
+		default:
+			goto out_failed;
+		}
+	}
+
+	return 0;
+
+out_failed:
+	/* Undo qcomtee_objref_to_arg(). */
+	for (i--; i >= 0; i--) {
+		if (u[i].type != QCOMTEE_ARG_TYPE_IO)
+			continue;
+
+		qcomtee_user_object_set_notify(u[i].o, false);
+		/* See docs for qcomtee_objref_to_arg() for double put. */
+		if (typeof_qcomtee_object(u[i].o) == QCOMTEE_OBJECT_TYPE_CB)
+			qcomtee_object_put(u[i].o);
+
+		qcomtee_object_put(u[i].o);
+	}
+
+	return -EINVAL;
+}
+
+/**
+ * qcomtee_params_from_args() - Convert QTEE arguments to TEE parameters.
+ * @params: TEE parameters.
+ * @u: QTEE arguments.
+ * @num_params: number of elements in the parameter array.
+ * @ctx: context in which the conversion should happen.
+ *
+ * @u should have already been initialized by qcomtee_params_to_args().
+ * This also represents the end of a QTEE invocation that started with
+ * qcomtee_params_to_args() by releasing %QCOMTEE_ARG_TYPE_IO objects.
+ *
+ * Return: On success, returns 0; on failure, returns < 0.
+ */
+static int qcomtee_params_from_args(struct tee_param *params,
+				    struct qcomtee_arg *u, int num_params,
+				    struct tee_context *ctx)
+{
+	int i, np;
+
+	qcomtee_arg_for_each(np, u) {
+		switch (u[np].type) {
+		case QCOMTEE_ARG_TYPE_OB:
+			/* TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT */
+			params[np].u.ubuf.size = u[np].b.size;
+
+			break;
+		case QCOMTEE_ARG_TYPE_IO:
+			/* TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT */
+			qcomtee_object_put(u[np].o);
+
+			break;
+		case QCOMTEE_ARG_TYPE_OO:
+			/* TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT */
+			if (qcomtee_objref_from_arg(&params[np], &u[np], ctx))
+				goto out_failed;
+
+			break;
+		case QCOMTEE_ARG_TYPE_IB:
+		default:
+			break;
+		}
+	}
+
+	return 0;
+
+out_failed:
+	/* Undo qcomtee_objref_from_arg(). */
+	for (i = 0; i < np; i++) {
+		if (params[i].attr == TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT)
+			qcomtee_context_del_qtee_object(&params[i], ctx);
+	}
+
+	/* Release any IO and OO objects not processed. */
+	for (; u[i].type && i < num_params; i++) {
+		if (u[i].type == QCOMTEE_ARG_TYPE_OO ||
+		    u[i].type == QCOMTEE_ARG_TYPE_IO)
+			qcomtee_object_put(u[i].o);
+	}
+
+	return -EINVAL;
+}
+
+/* TEE Device Ops. */
+
+static int qcomtee_params_check(struct tee_param *params, int num_params)
+{
+	int io = 0, oo = 0, ib = 0, ob = 0;
+	int i;
+
+	/* QTEE can accept 64 arguments. */
+	if (num_params > QCOMTEE_ARGS_MAX)
+		return -EINVAL;
+
+	/* Supported parameter types.
+	 */
+	for (i = 0; i < num_params; i++) {
+		switch (params[i].attr) {
+		case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT:
+			ib++;
+			break;
+		case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT:
+			ob++;
+			break;
+		case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT:
+			io++;
+			break;
+		case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT:
+			oo++;
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	/* QTEE can accept 16 arguments of each supported type. */
+	if (io > QCOMTEE_ARGS_PER_TYPE || oo > QCOMTEE_ARGS_PER_TYPE ||
+	    ib > QCOMTEE_ARGS_PER_TYPE || ob > QCOMTEE_ARGS_PER_TYPE)
+		return -EINVAL;
+
+	return 0;
+}
+
+/* Check if an operation on ROOT_QCOMTEE_OBJECT from userspace is permitted. */
+static int qcomtee_root_object_check(u32 op, struct tee_param *params,
+				     int num_params)
+{
+	/* Some privileged operations recognized by QTEE. */
+	if (op == QCOMTEE_ROOT_OP_NOTIFY_DOMAIN_CHANGE ||
+	    op == QCOMTEE_ROOT_OP_ADCI_ACCEPT ||
+	    op == QCOMTEE_ROOT_OP_ADCI_SHUTDOWN)
+		return -EINVAL;
+
+	/*
+	 * QCOMTEE_ROOT_OP_REG_WITH_CREDENTIALS is to register with QTEE
+	 * by passing a credential object as input OBJREF. TEE_OBJREF_NULL as a
+	 * credential object represents a privileged client for QTEE and
+	 * is used by the kernel only.
+	 */
+	if (op == QCOMTEE_ROOT_OP_REG_WITH_CREDENTIALS && num_params == 2) {
+		if (params[0].attr == TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT &&
+		    params[1].attr == TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT) {
+			if (params[0].u.objref.id == TEE_OBJREF_NULL)
+				return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * qcomtee_object_invoke() - Invoke a QTEE object.
+ * @ctx: TEE context.
+ * @arg: ioctl arguments.
+ * @params: parameters for the object.
+ *
+ * Return: On success, returns 0; on failure, returns < 0.
+ */
+static int qcomtee_object_invoke(struct tee_context *ctx,
+				 struct tee_ioctl_object_invoke_arg *arg,
+				 struct tee_param *params)
+{
+	struct qcomtee_object_invoke_ctx *oic __free(kfree) = NULL;
+	struct qcomtee_context_data *ctxdata = ctx->data;
+	struct qcomtee_arg *u __free(kfree) = NULL;
+	struct qcomtee_object *object;
+	int i, ret, result;
+
+	if (qcomtee_params_check(params, arg->num_params))
+		return -EINVAL;
+
+	/* First, handle reserved operations: */
+	if (arg->op == QCOMTEE_MSG_OBJECT_OP_RELEASE) {
+		del_qtee_object(arg->id, ctxdata);
+
+		return 0;
+	}
+
+	/* Otherwise, invoke a QTEE object: */
+	oic = qcomtee_object_invoke_ctx_alloc(ctx);
+	if (!oic)
+		return -ENOMEM;
+
+	/* +1 for ending QCOMTEE_ARG_TYPE_INV. */
+	u = kcalloc(arg->num_params + 1, sizeof(*u), GFP_KERNEL);
+	if (!u)
+		return -ENOMEM;
+
+	/* Get an object to invoke. */
+	if (arg->id == TEE_OBJREF_NULL) {
+		/* Use ROOT if TEE_OBJREF_NULL is invoked. */
+		if (qcomtee_root_object_check(arg->op, params, arg->num_params))
+			return -EINVAL;
+
+		object = ROOT_QCOMTEE_OBJECT;
+	} else if (find_qtee_object(&object, arg->id, ctxdata)) {
+		return -EINVAL;
+	}
+
+	ret = qcomtee_params_to_args(u, params, arg->num_params, ctx);
+	if (ret)
+		goto out;
+
+	ret = qcomtee_object_do_invoke(oic, object, arg->op, u, &result);
+	if (ret) {
+		qcomtee_arg_for_each_input_object(i, u) {
+			qcomtee_user_object_set_notify(u[i].o, false);
+			qcomtee_object_put(u[i].o);
+		}
+
+		goto out;
+	}
+
+	/* Parse QTEE response and put driver's object copies: */
+
+	if (!result) {
+		/* Assume service is UNAVAIL if unable to process the result.
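+		 * (editor's note: i.e., report QCOMTEE_MSG_ERROR_UNAVAIL back
+		 * in arg->ret rather than failing the ioctl itself).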
+		 */
+		if (qcomtee_params_from_args(params, u, arg->num_params, ctx))
+			result = QCOMTEE_MSG_ERROR_UNAVAIL;
+	} else {
+		/*
+		 * qcomtee_params_to_args() gets a copy of IO for the driver to
+		 * make sure they do not get released while in the middle of
+		 * invocation. On success (!result), qcomtee_params_from_args()
+		 * puts them; otherwise, put them here.
+		 */
+		qcomtee_arg_for_each_input_object(i, u)
+			qcomtee_object_put(u[i].o);
+	}
+
+	arg->ret = result;
+out:
+	qcomtee_object_put(object);
+
+	return ret;
+}
+
+/**
+ * qcomtee_supp_recv() - Wait for a request for the supplicant.
+ * @ctx: TEE context.
+ * @op: requested operation on the object.
+ * @num_params: number of elements in the parameter array.
+ * @params: parameters for @op.
+ *
+ * The first parameter is a meta %TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT.
+ * On input, it provides a user buffer. This buffer is used for parameters of
+ * type %TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT in qcomtee_cb_params_from_args().
+ * On output, the object ID and request ID are stored in the meta parameter.
+ *
+ * @num_params is updated to the number of parameters that actually exist
+ * in @params on return.
+ *
+ * Return: On success, returns 0; on failure, returns < 0.
+ */
+static int qcomtee_supp_recv(struct tee_context *ctx, u32 *op, u32 *num_params,
+			     struct tee_param *params)
+{
+	struct qcomtee_user_object_request_data data;
+	void __user *uaddr;
+	size_t ubuf_size;
+	int i, ret;
+
+	if (!*num_params)
+		return -EINVAL;
+
+	/* First parameter should be an INOUT + meta parameter. */
+	if (params->attr !=
+	    (TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT | TEE_IOCTL_PARAM_ATTR_META))
+		return -EINVAL;
+
+	/* The other parameters must be none. */
+	for (i = 1; i < *num_params; i++)
+		if (params[i].attr)
+			return -EINVAL;
+
+	if (!IS_ALIGNED(params->u.value.a, 8))
+		return -EINVAL;
+
+	/* User buffer and size from meta parameter. */
+	uaddr = u64_to_user_ptr(params->u.value.a);
+	ubuf_size = params->u.value.b;
+	/* Process TEE parameters. +/-1 to ignore the meta parameter. */
+	ret = qcomtee_user_object_select(ctx, params + 1, *num_params - 1,
+					 uaddr, ubuf_size, &data);
+	if (ret)
+		return ret;
+
+	params->u.value.a = data.object_id;
+	params->u.value.b = data.id;
+	params->u.value.c = 0;
+	*op = data.op;
+	*num_params = data.np + 1;
+
+	return 0;
+}
+
+/**
+ * qcomtee_supp_send() - Submit a response for a request.
+ * @ctx: TEE context.
+ * @errno: return value for the request.
+ * @num_params: number of elements in the parameter array.
+ * @params: returned parameters.
+ *
+ * The first parameter is a meta %TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT.
+ * It specifies the request ID this response belongs to.
+ *
+ * Return: On success, returns 0; on failure, returns < 0.
+ */
+static int qcomtee_supp_send(struct tee_context *ctx, u32 errno, u32 num_params,
+			     struct tee_param *params)
+{
+	int req_id;
+
+	if (!num_params)
+		return -EINVAL;
+
+	/* First parameter should be an OUTPUT + meta parameter. */
+	if (params->attr != (TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT |
+			     TEE_IOCTL_PARAM_ATTR_META))
+		return -EINVAL;
+
+	req_id = params->u.value.a;
+	/* Process TEE parameters. +/-1 to ignore the meta parameter.
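+	 * (editor's note: value.a carries the request ID that this response
+	 * answers; qcomtee_supp_recv() above returned it in value.b.)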
+	 */
+	return qcomtee_user_object_submit(ctx, params + 1, num_params - 1,
+					  req_id, errno);
+}
+
+static int qcomtee_open(struct tee_context *ctx)
+{
+	struct qcomtee_context_data *ctxdata __free(kfree) = NULL;
+
+	ctxdata = kzalloc(sizeof(*ctxdata), GFP_KERNEL);
+	if (!ctxdata)
+		return -ENOMEM;
+
+	/*
+	 * In the QTEE driver, the same context is used to refcount resources
+	 * shared by QTEE. For example, teedev_ctx_get() is called for any
+	 * instance of callback objects (see qcomtee_user_param_to_object()).
+	 *
+	 * Maintain a copy of teedev for QTEE as it serves as a direct user of
+	 * this context. The teedev will be released in the context's release().
+	 *
+	 * tee_device_unregister() will remain blocked until all contexts
+	 * are released. This includes contexts owned by the user, which are
+	 * closed by teedev_close_context(), as well as those owned by QTEE
+	 * closed by teedev_ctx_put() in object's release().
+	 */
+	if (!tee_device_get(ctx->teedev))
+		return -EINVAL;
+
+	idr_init(&ctxdata->qtee_objects_idr);
+	mutex_init(&ctxdata->qtee_lock);
+	idr_init(&ctxdata->reqs_idr);
+	INIT_LIST_HEAD(&ctxdata->reqs_list);
+	mutex_init(&ctxdata->reqs_lock);
+	init_completion(&ctxdata->req_c);
+
+	ctx->data = no_free_ptr(ctxdata);
+
+	return 0;
+}
+
+/* Gets called when the user closes the device */
+static void qcomtee_close_context(struct tee_context *ctx)
+{
+	struct qcomtee_context_data *ctxdata = ctx->data;
+	struct qcomtee_object *object;
+	int id;
+
+	/* Process QUEUED or PROCESSING requests. */
+	qcomtee_requests_destroy(ctxdata);
+	/* Release QTEE objects. */
+	idr_for_each_entry(&ctxdata->qtee_objects_idr, object, id)
+		qcomtee_object_put(object);
+}
+
+/* Gets called when the final reference to the context goes away. */
+static void qcomtee_release(struct tee_context *ctx)
+{
+	struct qcomtee_context_data *ctxdata = ctx->data;
+
+	idr_destroy(&ctxdata->qtee_objects_idr);
+	idr_destroy(&ctxdata->reqs_idr);
+	kfree(ctxdata);
+
+	/* There is nothing shared in this context with QTEE. */
+	tee_device_put(ctx->teedev);
+}
+
+static void qcomtee_get_version(struct tee_device *teedev,
+				struct tee_ioctl_version_data *vers)
+{
+	struct tee_ioctl_version_data v = {
+		.impl_id = TEE_IMPL_ID_QTEE,
+		.gen_caps = TEE_GEN_CAP_OBJREF,
+	};
+
+	*vers = v;
+}
+
+/**
+ * qcomtee_get_qtee_feature_list() - Query QTEE feature versions.
+ * @ctx: TEE context.
+ * @id: ID of the feature to query.
+ * @version: version of the feature.
+ *
+ * Used to query the version of features supported by QTEE.
+ */
+static void qcomtee_get_qtee_feature_list(struct tee_context *ctx, u32 id,
+					  u32 *version)
+{
+	struct qcomtee_object_invoke_ctx *oic __free(kfree);
+	struct qcomtee_object *client_env, *service;
+	struct qcomtee_arg u[3] = { 0 };
+	int result;
+
+	oic = qcomtee_object_invoke_ctx_alloc(ctx);
+	if (!oic)
+		return;
+
+	client_env = qcomtee_object_get_client_env(oic);
+	if (client_env == NULL_QCOMTEE_OBJECT)
+		return;
+
+	/* Get ''FeatureVersions Service'' object. */
+	service = qcomtee_object_get_service(oic, client_env,
+					     QCOMTEE_FEATURE_VER_UID);
+	if (service == NULL_QCOMTEE_OBJECT)
+		goto out_failed;
+
+	/* IB: Feature to query. */
+	u[0].b.addr = &id;
+	u[0].b.size = sizeof(id);
+	u[0].type = QCOMTEE_ARG_TYPE_IB;
+
+	/* OB: Version returned.
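+	 * (editor's note: callers decode it with QTEE_VERSION_GET_MAJOR(),
+	 * QTEE_VERSION_GET_MINOR() and QTEE_VERSION_GET_PATCH(); see
+	 * qcomtee_probe() below.)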
+	 */
+	u[1].b.addr = version;
+	u[1].b.size = sizeof(*version);
+	u[1].type = QCOMTEE_ARG_TYPE_OB;
+
+	qcomtee_object_do_invoke(oic, service, QCOMTEE_FEATURE_VER_OP_GET, u,
+				 &result);
+
+out_failed:
+	qcomtee_object_put(service);
+	qcomtee_object_put(client_env);
+}
+
+static const struct tee_driver_ops qcomtee_ops = {
+	.get_version = qcomtee_get_version,
+	.open = qcomtee_open,
+	.close_context = qcomtee_close_context,
+	.release = qcomtee_release,
+	.object_invoke_func = qcomtee_object_invoke,
+	.supp_recv = qcomtee_supp_recv,
+	.supp_send = qcomtee_supp_send,
+};
+
+static const struct tee_desc qcomtee_desc = {
+	.name = "qcomtee",
+	.ops = &qcomtee_ops,
+	.owner = THIS_MODULE,
+};
+
+static int qcomtee_probe(struct platform_device *pdev)
+{
+	struct workqueue_struct *async_wq;
+	struct tee_device *teedev;
+	struct tee_shm_pool *pool;
+	struct tee_context *ctx;
+	struct qcomtee *qcomtee;
+	int err;
+
+	qcomtee = kzalloc(sizeof(*qcomtee), GFP_KERNEL);
+	if (!qcomtee)
+		return -ENOMEM;
+
+	pool = qcomtee_shm_pool_alloc();
+	if (IS_ERR(pool)) {
+		err = PTR_ERR(pool);
+
+		goto err_free_qcomtee;
+	}
+
+	teedev = tee_device_alloc(&qcomtee_desc, NULL, pool, qcomtee);
+	if (IS_ERR(teedev)) {
+		err = PTR_ERR(teedev);
+
+		goto err_pool_destroy;
+	}
+
+	qcomtee->teedev = teedev;
+	qcomtee->pool = pool;
+	err = tee_device_register(qcomtee->teedev);
+	if (err)
+		goto err_unreg_teedev;
+
+	platform_set_drvdata(pdev, qcomtee);
+	/* Start async wq. */
+	async_wq = alloc_ordered_workqueue("qcomtee_wq", 0);
+	if (!async_wq) {
+		err = -ENOMEM;
+
+		goto err_unreg_teedev;
+	}
+
+	qcomtee->wq = async_wq;
+	/* Driver context used for async operations of teedev. */
+	ctx = teedev_open(qcomtee->teedev);
+	if (IS_ERR(ctx)) {
+		err = PTR_ERR(ctx);
+
+		goto err_dest_wq;
+	}
+
+	qcomtee->ctx = ctx;
+	/* Init Object table. */
+	qcomtee->xa_last_id = 0;
+	xa_init_flags(&qcomtee->xa_local_objects, XA_FLAGS_ALLOC);
+	/* Get QTEE version. */
+	qcomtee_get_qtee_feature_list(qcomtee->ctx,
+				      QCOMTEE_FEATURE_VER_OP_GET_QTEE_ID,
+				      &qcomtee->qtee_version);
+
+	pr_info("QTEE version %u.%u.%u\n",
+		QTEE_VERSION_GET_MAJOR(qcomtee->qtee_version),
+		QTEE_VERSION_GET_MINOR(qcomtee->qtee_version),
+		QTEE_VERSION_GET_PATCH(qcomtee->qtee_version));
+
+	return 0;
+
+err_dest_wq:
+	destroy_workqueue(qcomtee->wq);
+err_unreg_teedev:
+	tee_device_unregister(qcomtee->teedev);
+err_pool_destroy:
+	tee_shm_pool_free(pool);
+err_free_qcomtee:
+	kfree(qcomtee);
+
+	return err;
+}
+
+/**
+ * qcomtee_remove() - Device Removal Routine.
+ * @pdev: platform device information struct.
+ *
+ * It is called by the platform subsystem to alert the driver that it should
+ * release the device.
+ *
+ * QTEE does not provide an API to inform it about a callback object going away.
+ * However, when releasing QTEE objects, any callback object sent to QTEE
+ * previously would be released by QTEE as part of the object release.
+ */
+static void qcomtee_remove(struct platform_device *pdev)
+{
+	struct qcomtee *qcomtee = platform_get_drvdata(pdev);
+
+	teedev_close_context(qcomtee->ctx);
+	/* Wait for RELEASE operations to be processed for QTEE objects.
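+	 * (editor's note: tee_device_unregister() blocks until every context
+	 * is released, including the driver context used by the async release
+	 * work; see the comment in qcomtee_open().)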
*/ + tee_device_unregister(qcomtee->teedev); + destroy_workqueue(qcomtee->wq); + tee_shm_pool_free(qcomtee->pool); + kfree(qcomtee); +} + +static const struct platform_device_id qcomtee_ids[] = { { "qcomtee", 0 }, {} }; +MODULE_DEVICE_TABLE(platform, qcomtee_ids); + +static struct platform_driver qcomtee_platform_driver = { + .probe = qcomtee_probe, + .remove = qcomtee_remove, + .driver = { + .name = "qcomtee", + }, + .id_table = qcomtee_ids, +}; + +module_platform_driver(qcomtee_platform_driver); + +MODULE_AUTHOR("Qualcomm"); +MODULE_DESCRIPTION("QTEE driver"); +MODULE_VERSION("1.0"); +MODULE_LICENSE("GPL"); diff --git a/drivers/tee/qcomtee/core.c b/drivers/tee/qcomtee/core.c new file mode 100644 index 000000000000..b6931ed6f200 --- /dev/null +++ b/drivers/tee/qcomtee/core.c @@ -0,0 +1,906 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +#include "qcomtee.h" + +/* QTEE root object. */ +struct qcomtee_object qcomtee_object_root = { + .name = "root", + .object_type = QCOMTEE_OBJECT_TYPE_ROOT, + .info.qtee_id = QCOMTEE_MSG_OBJECT_ROOT, +}; + +/* Next argument of type @type after index @i. */ +int qcomtee_next_arg_type(struct qcomtee_arg *u, int i, + enum qcomtee_arg_type type) +{ + while (u[i].type != QCOMTEE_ARG_TYPE_INV && u[i].type != type) + i++; + return i; +} + +/* + * QTEE expects IDs with the QCOMTEE_MSG_OBJECT_NS_BIT set for objects + * of the QCOMTEE_OBJECT_TYPE_CB type. + */ +#define QCOMTEE_OBJECT_ID_START (QCOMTEE_MSG_OBJECT_NS_BIT + 1) +#define QCOMTEE_OBJECT_ID_END (U32_MAX) + +#define QCOMTEE_OBJECT_SET(p, type, ...) \ + __QCOMTEE_OBJECT_SET(p, type, ##__VA_ARGS__, 0UL) +#define __QCOMTEE_OBJECT_SET(p, type, optr, ...) \ + do { \ + (p)->object_type = (type); \ + (p)->info.qtee_id = (unsigned long)(optr); \ + } while (0) + +static struct qcomtee_object * +qcomtee_qtee_object_alloc(struct qcomtee_object_invoke_ctx *oic, + unsigned int object_id) +{ + struct qcomtee *qcomtee = tee_get_drvdata(oic->ctx->teedev); + struct qcomtee_object *object; + + object = kzalloc(sizeof(*object), GFP_KERNEL); + if (!object) + return NULL_QCOMTEE_OBJECT; + + /* If failed, "no-name". */ + object->name = kasprintf(GFP_KERNEL, "qcomtee-%u", object_id); + QCOMTEE_OBJECT_SET(object, QCOMTEE_OBJECT_TYPE_TEE, object_id); + kref_init(&object->refcount); + /* A QTEE object requires a context for async operations. */ + object->info.qcomtee_async_ctx = qcomtee->ctx; + teedev_ctx_get(object->info.qcomtee_async_ctx); + + return object; +} + +static void qcomtee_qtee_object_free(struct qcomtee_object *object) +{ + /* See qcomtee_qtee_object_alloc(). */ + teedev_ctx_put(object->info.qcomtee_async_ctx); + + kfree(object->name); + kfree(object); +} + +static void qcomtee_do_release_qtee_object(struct work_struct *work) +{ + struct qcomtee_object *object; + struct qcomtee *qcomtee; + int ret, result; + + /* RELEASE does not require any argument. */ + struct qcomtee_arg args[] = { { .type = QCOMTEE_ARG_TYPE_INV } }; + + object = container_of(work, struct qcomtee_object, work); + qcomtee = tee_get_drvdata(object->info.qcomtee_async_ctx->teedev); + /* Get the TEE context used for asynchronous operations. */ + qcomtee->oic.ctx = object->info.qcomtee_async_ctx; + + ret = qcomtee_object_do_invoke_internal(&qcomtee->oic, object, + QCOMTEE_MSG_OBJECT_OP_RELEASE, + args, &result); + + /* Is it safe to retry the release? 
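+	 * (editor's note: -ENODEV means QTEE is unreachable and a retry can
+	 * never succeed; any other failure is treated as transient and the
+	 * work is requeued.)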
+	 */
+	if (ret && ret != -ENODEV) {
+		queue_work(qcomtee->wq, &object->work);
+	} else {
+		if (ret || result)
+			pr_err("%s release failed, ret = %d (%x)\n",
+			       qcomtee_object_name(object), ret, result);
+		qcomtee_qtee_object_free(object);
+	}
+}
+
+static void qcomtee_release_qtee_object(struct qcomtee_object *object)
+{
+	struct qcomtee *qcomtee =
+		tee_get_drvdata(object->info.qcomtee_async_ctx->teedev);
+
+	INIT_WORK(&object->work, qcomtee_do_release_qtee_object);
+	queue_work(qcomtee->wq, &object->work);
+}
+
+static void qcomtee_object_release(struct kref *refcount)
+{
+	struct qcomtee_object *object;
+	const char *name;
+
+	object = container_of(refcount, struct qcomtee_object, refcount);
+
+	/*
+	 * qcomtee_object_get() is called under an RCU read lock. Call
+	 * synchronize_rcu() to avoid releasing the object while it is being
+	 * accessed in qcomtee_object_get().
+	 */
+	synchronize_rcu();
+
+	switch (typeof_qcomtee_object(object)) {
+	case QCOMTEE_OBJECT_TYPE_TEE:
+		qcomtee_release_qtee_object(object);
+
+		break;
+	case QCOMTEE_OBJECT_TYPE_CB:
+		name = object->name;
+
+		if (object->ops->release)
+			object->ops->release(object);
+
+		kfree_const(name);
+
+		break;
+	case QCOMTEE_OBJECT_TYPE_ROOT:
+	case QCOMTEE_OBJECT_TYPE_NULL:
+	default:
+		break;
+	}
+}
+
+/**
+ * qcomtee_object_get() - Increase the object's reference count.
+ * @object: object to increase the reference count.
+ *
+ * Context: The caller should hold the RCU read lock.
+ */
+int qcomtee_object_get(struct qcomtee_object *object)
+{
+	if (object != NULL_QCOMTEE_OBJECT && object != ROOT_QCOMTEE_OBJECT)
+		return kref_get_unless_zero(&object->refcount);
+
+	return 0;
+}
+
+/**
+ * qcomtee_object_put() - Decrease the object's reference count.
+ * @object: object to decrease the reference count.
+ */
+void qcomtee_object_put(struct qcomtee_object *object)
+{
+	if (object != NULL_QCOMTEE_OBJECT && object != ROOT_QCOMTEE_OBJECT)
+		kref_put(&object->refcount, qcomtee_object_release);
+}
+
+static int qcomtee_idx_alloc(struct qcomtee_object_invoke_ctx *oic, u32 *idx,
+			     struct qcomtee_object *object)
+{
+	struct qcomtee *qcomtee = tee_get_drvdata(oic->ctx->teedev);
+
+	/* Every ID allocated here has QCOMTEE_MSG_OBJECT_NS_BIT set. */
+	return xa_alloc_cyclic(&qcomtee->xa_local_objects, idx, object,
+			       XA_LIMIT(QCOMTEE_OBJECT_ID_START,
+					QCOMTEE_OBJECT_ID_END),
+			       &qcomtee->xa_last_id, GFP_KERNEL);
+}
+
+struct qcomtee_object *qcomtee_idx_erase(struct qcomtee_object_invoke_ctx *oic,
+					 u32 idx)
+{
+	struct qcomtee *qcomtee = tee_get_drvdata(oic->ctx->teedev);
+
+	if (idx < QCOMTEE_OBJECT_ID_START || idx > QCOMTEE_OBJECT_ID_END)
+		return NULL_QCOMTEE_OBJECT;
+
+	return xa_erase(&qcomtee->xa_local_objects, idx);
+}
+
+/**
+ * qcomtee_object_id_get() - Get an ID for an object to send to QTEE.
+ * @oic: context to use for the invocation.
+ * @object: object to assign an ID.
+ * @object_id: object ID.
+ *
+ * Called on the path to QTEE to construct the message; see
+ * qcomtee_prepare_msg() and qcomtee_update_msg().
+ *
+ * Return: On success, returns 0; on failure, returns < 0.
+ */ +static int qcomtee_object_id_get(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *object, + unsigned int *object_id) +{ + u32 idx; + + switch (typeof_qcomtee_object(object)) { + case QCOMTEE_OBJECT_TYPE_CB: + if (qcomtee_idx_alloc(oic, &idx, object) < 0) + return -ENOSPC; + + *object_id = idx; + + break; + case QCOMTEE_OBJECT_TYPE_ROOT: + case QCOMTEE_OBJECT_TYPE_TEE: + *object_id = object->info.qtee_id; + + break; + case QCOMTEE_OBJECT_TYPE_NULL: + *object_id = QCOMTEE_MSG_OBJECT_NULL; + + break; + } + + return 0; +} + +/* Release object ID assigned in qcomtee_object_id_get. */ +static void qcomtee_object_id_put(struct qcomtee_object_invoke_ctx *oic, + unsigned int object_id) +{ + qcomtee_idx_erase(oic, object_id); +} + +/** + * qcomtee_local_object_get() - Get the object referenced by the ID. + * @oic: context to use for the invocation. + * @object_id: object ID. + * + * It is called on the path from QTEE. + * It is called on behalf of QTEE to obtain an instance of an object + * for a given ID. It increases the object's reference count on success. + * + * Return: On error, returns %NULL_QCOMTEE_OBJECT. + * On success, returns the object. + */ +static struct qcomtee_object * +qcomtee_local_object_get(struct qcomtee_object_invoke_ctx *oic, + unsigned int object_id) +{ + struct qcomtee *qcomtee = tee_get_drvdata(oic->ctx->teedev); + struct qcomtee_object *object; + + guard(rcu)(); + object = xa_load(&qcomtee->xa_local_objects, object_id); + /* It already checks for %NULL_QCOMTEE_OBJECT. */ + qcomtee_object_get(object); + + return object; +} + +/** + * qcomtee_object_user_init() - Initialize an object for the user. + * @object: object to initialize. + * @ot: type of object as &enum qcomtee_object_type. + * @ops: instance of callbacks. + * @fmt: name assigned to the object. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_object_user_init(struct qcomtee_object *object, + enum qcomtee_object_type ot, + struct qcomtee_object_operations *ops, + const char *fmt, ...) +{ + va_list ap; + int ret; + + kref_init(&object->refcount); + QCOMTEE_OBJECT_SET(object, QCOMTEE_OBJECT_TYPE_NULL); + + va_start(ap, fmt); + switch (ot) { + case QCOMTEE_OBJECT_TYPE_NULL: + ret = 0; + + break; + case QCOMTEE_OBJECT_TYPE_CB: + object->ops = ops; + if (!object->ops->dispatch) + return -EINVAL; + + /* If failed, "no-name". */ + object->name = kvasprintf_const(GFP_KERNEL, fmt, ap); + QCOMTEE_OBJECT_SET(object, QCOMTEE_OBJECT_TYPE_CB); + + ret = 0; + break; + case QCOMTEE_OBJECT_TYPE_ROOT: + case QCOMTEE_OBJECT_TYPE_TEE: + default: + ret = -EINVAL; + } + va_end(ap); + + return ret; +} + +/** + * qcomtee_object_type() - Returns the type of object represented by an ID. + * @object_id: object ID for the object. + * + * Similar to typeof_qcomtee_object(), but instead of receiving an object as + * an argument, it receives an object ID. It is used internally on the return + * path from QTEE. + * + * Return: Returns the type of object referenced by @object_id. + */ +static enum qcomtee_object_type qcomtee_object_type(unsigned int object_id) +{ + if (object_id == QCOMTEE_MSG_OBJECT_NULL) + return QCOMTEE_OBJECT_TYPE_NULL; + + if (object_id & QCOMTEE_MSG_OBJECT_NS_BIT) + return QCOMTEE_OBJECT_TYPE_CB; + + return QCOMTEE_OBJECT_TYPE_TEE; +} + +/** + * qcomtee_object_qtee_init() - Initialize an object for QTEE. + * @oic: context to use for the invocation. + * @object: object returned. + * @object_id: object ID received from QTEE. 
+ * + * Return: On failure, returns < 0 and sets @object to %NULL_QCOMTEE_OBJECT. + * On success, returns 0 + */ +static int qcomtee_object_qtee_init(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object **object, + unsigned int object_id) +{ + int ret = 0; + + switch (qcomtee_object_type(object_id)) { + case QCOMTEE_OBJECT_TYPE_NULL: + *object = NULL_QCOMTEE_OBJECT; + + break; + case QCOMTEE_OBJECT_TYPE_CB: + *object = qcomtee_local_object_get(oic, object_id); + if (*object == NULL_QCOMTEE_OBJECT) + ret = -EINVAL; + + break; + + default: /* QCOMTEE_OBJECT_TYPE_TEE */ + *object = qcomtee_qtee_object_alloc(oic, object_id); + if (*object == NULL_QCOMTEE_OBJECT) + ret = -ENOMEM; + + break; + } + + return ret; +} + +/* + * ''Marshaling API'' + * qcomtee_prepare_msg - Prepare the inbound buffer for sending to QTEE + * qcomtee_update_args - Parse the QTEE response in the inbound buffer + * qcomtee_prepare_args - Parse the QTEE request from the outbound buffer + * qcomtee_update_msg - Update the outbound buffer with the response for QTEE + */ + +static int qcomtee_prepare_msg(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *object, u32 op, + struct qcomtee_arg *u) +{ + struct qcomtee_msg_object_invoke *msg; + unsigned int object_id; + int i, ib, ob, io, oo; + size_t offset; + + /* Use the input message buffer in 'oic'. */ + msg = oic->in_msg.addr; + + /* Start offset in a message for buffer arguments. */ + offset = qcomtee_msg_buffer_args(struct qcomtee_msg_object_invoke, + qcomtee_args_len(u)); + + /* Get the ID of the object being invoked. */ + if (qcomtee_object_id_get(oic, object, &object_id)) + return -ENOSPC; + + ib = 0; + qcomtee_arg_for_each_input_buffer(i, u) { + void *msgptr; /* Address of buffer payload: */ + /* Overflow already checked in qcomtee_msg_buffers_alloc(). */ + msg->args[ib].b.offset = offset; + msg->args[ib].b.size = u[i].b.size; + + msgptr = qcomtee_msg_offset_to_ptr(msg, offset); + /* Userspace client or kernel client!? */ + if (!(u[i].flags & QCOMTEE_ARG_FLAGS_UADDR)) + memcpy(msgptr, u[i].b.addr, u[i].b.size); + else if (copy_from_user(msgptr, u[i].b.uaddr, u[i].b.size)) + return -EINVAL; + + offset += qcomtee_msg_offset_align(u[i].b.size); + ib++; + } + + ob = ib; + qcomtee_arg_for_each_output_buffer(i, u) { + /* Overflow already checked in qcomtee_msg_buffers_alloc(). */ + msg->args[ob].b.offset = offset; + msg->args[ob].b.size = u[i].b.size; + + offset += qcomtee_msg_offset_align(u[i].b.size); + ob++; + } + + io = ob; + qcomtee_arg_for_each_input_object(i, u) { + if (qcomtee_object_id_get(oic, u[i].o, &msg->args[io].o)) { + qcomtee_object_id_put(oic, object_id); + for (io--; io >= ob; io--) + qcomtee_object_id_put(oic, msg->args[io].o); + + return -ENOSPC; + } + + io++; + } + + oo = io; + qcomtee_arg_for_each_output_object(i, u) + oo++; + + /* Set object, operation, and argument counts. */ + qcomtee_msg_init(msg, object_id, op, ib, ob, io, oo); + + return 0; +} + +/** + * qcomtee_update_args() - Parse the QTEE response in the inbound buffer. + * @u: array of arguments for the invocation. + * @oic: context to use for the invocation. + * + * @u must be the same as the one used in qcomtee_prepare_msg() when + * initializing the inbound buffer. + * + * On failure, it continues processing the QTEE message. The caller should + * do the necessary cleanup, including calling qcomtee_object_put() + * on the output objects. + * + * Return: On success, returns 0; on failure, returns < 0. 
+ */ +static int qcomtee_update_args(struct qcomtee_arg *u, + struct qcomtee_object_invoke_ctx *oic) +{ + struct qcomtee_msg_object_invoke *msg; + int i, ib, ob, io, oo; + int ret = 0; + + /* Use the input message buffer in 'oic'. */ + msg = oic->in_msg.addr; + + ib = 0; + qcomtee_arg_for_each_input_buffer(i, u) + ib++; + + ob = ib; + qcomtee_arg_for_each_output_buffer(i, u) { + void *msgptr; /* Address of buffer payload: */ + /* QTEE can override the size to a smaller value. */ + u[i].b.size = msg->args[ob].b.size; + + msgptr = qcomtee_msg_offset_to_ptr(msg, msg->args[ob].b.offset); + /* Userspace client or kernel client!? */ + if (!(u[i].flags & QCOMTEE_ARG_FLAGS_UADDR)) + memcpy(u[i].b.addr, msgptr, u[i].b.size); + else if (copy_to_user(u[i].b.uaddr, msgptr, u[i].b.size)) + ret = -EINVAL; + + ob++; + } + + io = ob; + qcomtee_arg_for_each_input_object(i, u) + io++; + + oo = io; + qcomtee_arg_for_each_output_object(i, u) { + if (qcomtee_object_qtee_init(oic, &u[i].o, msg->args[oo].o)) + ret = -EINVAL; + + oo++; + } + + return ret; +} + +/** + * qcomtee_prepare_args() - Parse the QTEE request from the outbound buffer. + * @oic: context to use for the invocation. + * + * It initializes &qcomtee_object_invoke_ctx->u based on the QTEE request in + * the outbound buffer. It sets %QCOMTEE_ARG_TYPE_INV at the end of the array. + * + * On failure, it continues processing the QTEE message. The caller should + * do the necessary cleanup, including calling qcomtee_object_put() + * on the input objects. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +static int qcomtee_prepare_args(struct qcomtee_object_invoke_ctx *oic) +{ + struct qcomtee_msg_callback *msg; + int i, ret = 0; + + /* Use the output message buffer in 'oic'. */ + msg = oic->out_msg.addr; + + qcomtee_msg_for_each_input_buffer(i, msg) { + oic->u[i].b.addr = + qcomtee_msg_offset_to_ptr(msg, msg->args[i].b.offset); + oic->u[i].b.size = msg->args[i].b.size; + oic->u[i].type = QCOMTEE_ARG_TYPE_IB; + } + + qcomtee_msg_for_each_output_buffer(i, msg) { + oic->u[i].b.addr = + qcomtee_msg_offset_to_ptr(msg, msg->args[i].b.offset); + oic->u[i].b.size = msg->args[i].b.size; + oic->u[i].type = QCOMTEE_ARG_TYPE_OB; + } + + qcomtee_msg_for_each_input_object(i, msg) { + if (qcomtee_object_qtee_init(oic, &oic->u[i].o, msg->args[i].o)) + ret = -EINVAL; + + oic->u[i].type = QCOMTEE_ARG_TYPE_IO; + } + + qcomtee_msg_for_each_output_object(i, msg) + oic->u[i].type = QCOMTEE_ARG_TYPE_OO; + + /* End of Arguments. */ + oic->u[i].type = QCOMTEE_ARG_TYPE_INV; + + return ret; +} + +static int qcomtee_update_msg(struct qcomtee_object_invoke_ctx *oic) +{ + struct qcomtee_msg_callback *msg; + int i, ib, ob, io, oo; + + /* Use the output message buffer in 'oic'. */ + msg = oic->out_msg.addr; + + ib = 0; + qcomtee_arg_for_each_input_buffer(i, oic->u) + ib++; + + ob = ib; + qcomtee_arg_for_each_output_buffer(i, oic->u) { + /* Only reduce size; never increase it. */ + if (msg->args[ob].b.size < oic->u[i].b.size) + return -EINVAL; + + msg->args[ob].b.size = oic->u[i].b.size; + ob++; + } + + io = ob; + qcomtee_arg_for_each_input_object(i, oic->u) + io++; + + oo = io; + qcomtee_arg_for_each_output_object(i, oic->u) { + if (qcomtee_object_id_get(oic, oic->u[i].o, &msg->args[oo].o)) { + for (oo--; oo >= io; oo--) + qcomtee_object_id_put(oic, msg->args[oo].o); + + return -ENOSPC; + } + + oo++; + } + + return 0; +} + +/* Invoke a callback object. 
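+ * (Editor's note: this runs when QTEE returns with
+ * QCOMTEE_RESULT_INBOUND_REQ_NEEDED and the head of the outbound buffer
+ * holds QTEE's request for a kernel-hosted object.)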
+ */
+static void qcomtee_cb_object_invoke(struct qcomtee_object_invoke_ctx *oic,
+				     struct qcomtee_msg_callback *msg)
+{
+	int i, errno;
+	u32 op;
+
+	/* Get the object being invoked. */
+	unsigned int object_id = msg->cxt;
+	struct qcomtee_object *object;
+
+	/* QTEE cannot invoke a NULL object or objects it hosts. */
+	if (qcomtee_object_type(object_id) == QCOMTEE_OBJECT_TYPE_NULL ||
+	    qcomtee_object_type(object_id) == QCOMTEE_OBJECT_TYPE_TEE) {
+		errno = -EINVAL;
+		goto out;
+	}
+
+	object = qcomtee_local_object_get(oic, object_id);
+	if (object == NULL_QCOMTEE_OBJECT) {
+		errno = -EINVAL;
+		goto out;
+	}
+
+	oic->object = object;
+
+	/* Filter bits used by transport. */
+	op = msg->op & QCOMTEE_MSG_OBJECT_OP_MASK;
+
+	switch (op) {
+	case QCOMTEE_MSG_OBJECT_OP_RELEASE:
+		qcomtee_object_id_put(oic, object_id);
+		qcomtee_object_put(object);
+		errno = 0;
+
+		break;
+	case QCOMTEE_MSG_OBJECT_OP_RETAIN:
+		qcomtee_object_get(object);
+		errno = 0;
+
+		break;
+	default:
+		errno = qcomtee_prepare_args(oic);
+		if (errno) {
+			/* Release any object that arrived as input. */
+			qcomtee_arg_for_each_input_object(i, oic->u)
+				qcomtee_object_put(oic->u[i].o);
+
+			break;
+		}
+
+		errno = object->ops->dispatch(oic, object, op, oic->u);
+		if (!errno) {
+			/* On success, notify at the appropriate time. */
+			oic->flags |= QCOMTEE_OIC_FLAG_NOTIFY;
+		}
+	}
+
+out:
+
+	oic->errno = errno;
+}
+
+static int
+qcomtee_object_invoke_ctx_invoke(struct qcomtee_object_invoke_ctx *oic,
+				 int *result, u64 *res_type)
+{
+	phys_addr_t out_msg_paddr;
+	phys_addr_t in_msg_paddr;
+	int ret;
+	u64 res;
+
+	tee_shm_get_pa(oic->out_shm, 0, &out_msg_paddr);
+	tee_shm_get_pa(oic->in_shm, 0, &in_msg_paddr);
+	if (!(oic->flags & QCOMTEE_OIC_FLAG_BUSY))
+		ret = qcom_scm_qtee_invoke_smc(in_msg_paddr, oic->in_msg.size,
+					       out_msg_paddr, oic->out_msg.size,
+					       &res, res_type);
+	else
+		ret = qcom_scm_qtee_callback_response(out_msg_paddr,
+						      oic->out_msg.size,
+						      &res, res_type);
+
+	if (ret)
+		pr_err("QTEE returned with %d.\n", ret);
+	else
+		*result = (int)res;
+
+	return ret;
+}
+
+/**
+ * qcomtee_qtee_objects_put() - Put the callback objects in the argument array.
+ * @u: array of arguments.
+ *
+ * When qcomtee_object_do_invoke_internal() is successfully invoked,
+ * QTEE takes ownership of the callback objects. If the invocation fails,
+ * qcomtee_object_do_invoke_internal() calls qcomtee_qtee_objects_put()
+ * to mimic the release of callback objects by QTEE.
+ */
+static void qcomtee_qtee_objects_put(struct qcomtee_arg *u)
+{
+	int i;
+
+	qcomtee_arg_for_each_input_object(i, u) {
+		if (typeof_qcomtee_object(u[i].o) == QCOMTEE_OBJECT_TYPE_CB)
+			qcomtee_object_put(u[i].o);
+	}
+}
+
+/**
+ * qcomtee_object_do_invoke_internal() - Submit an invocation for an object.
+ * @oic: context to use for the current invocation.
+ * @object: object being invoked.
+ * @op: requested operation on the object.
+ * @u: array of arguments for the current invocation.
+ * @result: result returned from QTEE.
+ *
+ * The caller is responsible for keeping track of the refcount for each
+ * object, including @object. On return, the caller loses ownership of all
+ * input objects of type %QCOMTEE_OBJECT_TYPE_CB.
+ *
+ * Return: On success, returns 0; on failure, returns < 0.
+ */
+int qcomtee_object_do_invoke_internal(struct qcomtee_object_invoke_ctx *oic,
+				      struct qcomtee_object *object, u32 op,
+				      struct qcomtee_arg *u, int *result)
+{
+	struct qcomtee_msg_callback *cb_msg;
+	struct qcomtee_object *qto;
+	int i, ret, errno;
+	u64 res_type;
+
+	/* Allocate inbound and outbound buffers. */
+	ret = qcomtee_msg_buffers_alloc(oic, u);
+	if (ret) {
+		qcomtee_qtee_objects_put(u);
+
+		return ret;
+	}
+
+	ret = qcomtee_prepare_msg(oic, object, op, u);
+	if (ret) {
+		qcomtee_qtee_objects_put(u);
+
+		goto out;
+	}
+
+	/* Use the output message buffer in 'oic'. */
+	cb_msg = oic->out_msg.addr;
+
+	while (1) {
+		if (oic->flags & QCOMTEE_OIC_FLAG_BUSY) {
+			errno = oic->errno;
+			if (!errno)
+				errno = qcomtee_update_msg(oic);
+			qcomtee_msg_set_result(cb_msg, errno);
+		}
+
+		/* Invoke the remote object. */
+		ret = qcomtee_object_invoke_ctx_invoke(oic, result, &res_type);
+		/* Return from a callback object's result submission: */
+		if (oic->flags & QCOMTEE_OIC_FLAG_BUSY) {
+			qto = oic->object;
+			if (qto) {
+				if (oic->flags & QCOMTEE_OIC_FLAG_NOTIFY) {
+					if (qto->ops->notify)
+						qto->ops->notify(oic, qto,
+								 errno || ret);
+				}
+
+				/* Get is in qcomtee_cb_object_invoke(). */
+				qcomtee_object_put(qto);
+			}
+
+			oic->object = NULL_QCOMTEE_OBJECT;
+			oic->flags &= ~(QCOMTEE_OIC_FLAG_BUSY |
+					QCOMTEE_OIC_FLAG_NOTIFY);
+		}
+
+		if (ret) {
+			/*
+			 * Unable to finish the invocation.
+			 * If QCOMTEE_OIC_FLAG_SHARED is not set, put
+			 * QCOMTEE_OBJECT_TYPE_CB input objects.
+			 */
+			if (!(oic->flags & QCOMTEE_OIC_FLAG_SHARED))
+				qcomtee_qtee_objects_put(u);
+			else
+				ret = -ENODEV;
+
+			goto out;
+
+		} else {
+			/*
+			 * QTEE obtained ownership of QCOMTEE_OBJECT_TYPE_CB
+			 * input objects in 'u'. On further failure, QTEE is
+			 * responsible for releasing them.
+			 */
+			oic->flags |= QCOMTEE_OIC_FLAG_SHARED;
+		}
+
+		/* Is it a callback request? */
+		if (res_type != QCOMTEE_RESULT_INBOUND_REQ_NEEDED) {
+			/*
+			 * Parse results. If failed, assume the service
+			 * was unavailable (i.e. QCOMTEE_MSG_ERROR_UNAVAIL)
+			 * and put output objects to initiate cleanup.
+			 */
+			if (!*result && qcomtee_update_args(u, oic)) {
+				*result = QCOMTEE_MSG_ERROR_UNAVAIL;
+				qcomtee_arg_for_each_output_object(i, u)
+					qcomtee_object_put(u[i].o);
+			}
+
+			break;
+
+		} else {
+			oic->flags |= QCOMTEE_OIC_FLAG_BUSY;
+			qcomtee_fetch_async_reqs(oic);
+			qcomtee_cb_object_invoke(oic, cb_msg);
+		}
+	}
+
+	qcomtee_fetch_async_reqs(oic);
+out:
+	qcomtee_msg_buffers_free(oic);
+
+	return ret;
+}
+
+int qcomtee_object_do_invoke(struct qcomtee_object_invoke_ctx *oic,
+			     struct qcomtee_object *object, u32 op,
+			     struct qcomtee_arg *u, int *result)
+{
+	/* User cannot set bits used by transport. */
+	if (op & ~QCOMTEE_MSG_OBJECT_OP_MASK)
+		return -EINVAL;
+
+	/* User can only invoke QTEE hosted objects. */
+	if (typeof_qcomtee_object(object) != QCOMTEE_OBJECT_TYPE_TEE &&
+	    typeof_qcomtee_object(object) != QCOMTEE_OBJECT_TYPE_ROOT)
+		return -EINVAL;
+
+	/* User cannot directly issue these operations to QTEE. */
+	if (op == QCOMTEE_MSG_OBJECT_OP_RELEASE ||
+	    op == QCOMTEE_MSG_OBJECT_OP_RETAIN)
+		return -EINVAL;
+
+	return qcomtee_object_do_invoke_internal(oic, object, op, u, result);
+}
+
+/**
+ * qcomtee_object_get_client_env() - Get a privileged client env. object.
+ * @oic: context to use for the current invocation.
+ *
+ * The caller should call qcomtee_object_put() on the returned object
+ * to release it.
+ *
+ * Return: On error, returns %NULL_QCOMTEE_OBJECT.
+ * On success, returns the object.
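+ *
+ * A typical use, shown as an editorial sketch (it mirrors
+ * qcomtee_get_qtee_feature_list() in call.c)::
+ *
+ *	client_env = qcomtee_object_get_client_env(oic);
+ *	if (client_env == NULL_QCOMTEE_OBJECT)
+ *		return;
+ *	service = qcomtee_object_get_service(oic, client_env, uid);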
+ */ +struct qcomtee_object * +qcomtee_object_get_client_env(struct qcomtee_object_invoke_ctx *oic) +{ + struct qcomtee_arg u[3] = { 0 }; + int ret, result; + + u[0].o = NULL_QCOMTEE_OBJECT; + u[0].type = QCOMTEE_ARG_TYPE_IO; + u[1].type = QCOMTEE_ARG_TYPE_OO; + ret = qcomtee_object_do_invoke(oic, ROOT_QCOMTEE_OBJECT, + QCOMTEE_ROOT_OP_REG_WITH_CREDENTIALS, u, + &result); + if (ret || result) + return NULL_QCOMTEE_OBJECT; + + return u[1].o; +} + +struct qcomtee_object * +qcomtee_object_get_service(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *client_env, u32 uid) +{ + struct qcomtee_arg u[3] = { 0 }; + int ret, result; + + u[0].b.addr = &uid; + u[0].b.size = sizeof(uid); + u[0].type = QCOMTEE_ARG_TYPE_IB; + u[1].type = QCOMTEE_ARG_TYPE_OO; + ret = qcomtee_object_do_invoke(oic, client_env, QCOMTEE_CLIENT_ENV_OPEN, + u, &result); + + if (ret || result) + return NULL_QCOMTEE_OBJECT; + + return u[1].o; +} diff --git a/drivers/tee/qcomtee/qcomtee.h b/drivers/tee/qcomtee/qcomtee.h new file mode 100644 index 000000000000..f34be992e68b --- /dev/null +++ b/drivers/tee/qcomtee/qcomtee.h @@ -0,0 +1,143 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#ifndef QCOMTEE_H +#define QCOMTEE_H + +#include +#include + +#include "qcomtee_msg.h" +#include "qcomtee_object.h" + +/* Flags relating to object reference. */ +#define QCOMTEE_OBJREF_FLAG_TEE BIT(0) +#define QCOMTEE_OBJREF_FLAG_USER BIT(1) + +/** + * struct qcomtee - Main service struct. + * @teedev: client device. + * @pool: shared memory pool. + * @ctx: driver private context. + * @oic: context to use for the current driver invocation. + * @wq: workqueue for QTEE async operations. + * @xa_local_objects: array of objects exported to QTEE. + * @xa_last_id: next ID to allocate. + * @qtee_version: QTEE version. + */ +struct qcomtee { + struct tee_device *teedev; + struct tee_shm_pool *pool; + struct tee_context *ctx; + struct qcomtee_object_invoke_ctx oic; + struct workqueue_struct *wq; + struct xarray xa_local_objects; + u32 xa_last_id; + u32 qtee_version; +}; + +void qcomtee_fetch_async_reqs(struct qcomtee_object_invoke_ctx *oic); +struct qcomtee_object *qcomtee_idx_erase(struct qcomtee_object_invoke_ctx *oic, + u32 idx); + +struct tee_shm_pool *qcomtee_shm_pool_alloc(void); +void qcomtee_msg_buffers_free(struct qcomtee_object_invoke_ctx *oic); +int qcomtee_msg_buffers_alloc(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_arg *u); + +/** + * qcomtee_object_do_invoke_internal() - Submit an invocation for an object. + * @oic: context to use for the current invocation. + * @object: object being invoked. + * @op: requested operation on the object. + * @u: array of arguments for the current invocation. + * @result: result returned from QTEE. + * + * The caller is responsible for keeping track of the refcount for each + * object, including @object. On return, the caller loses ownership of all + * input objects of type %QCOMTEE_OBJECT_TYPE_CB. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_object_do_invoke_internal(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *object, u32 op, + struct qcomtee_arg *u, int *result); + +/** + * struct qcomtee_context_data - Clients' or supplicants' context. + * @qtee_objects_idr: QTEE objects in this context. + * @qtee_lock: mutex for @qtee_objects_idr. + * @reqs_idr: requests in this context that hold ID. + * @reqs_list: FIFO for requests in PROCESSING or QUEUED state. 
+ * @reqs_lock: mutex for @reqs_idr, @reqs_list and request states.
+ * @req_c: completion used when the supplicant is waiting for requests.
+ * @released: state of this context.
+ */
+struct qcomtee_context_data {
+	struct idr qtee_objects_idr;
+	/* Synchronize access to @qtee_objects_idr. */
+	struct mutex qtee_lock;
+
+	struct idr reqs_idr;
+	struct list_head reqs_list;
+	/* Synchronize access to @reqs_idr, @reqs_list and updating request states. */
+	struct mutex reqs_lock;
+
+	struct completion req_c;
+
+	bool released;
+};
+
+int qcomtee_context_add_qtee_object(struct tee_param *param,
+				    struct qcomtee_object *object,
+				    struct tee_context *ctx);
+int qcomtee_context_find_qtee_object(struct qcomtee_object **object,
+				     struct tee_param *param,
+				     struct tee_context *ctx);
+void qcomtee_context_del_qtee_object(struct tee_param *param,
+				     struct tee_context *ctx);
+
+int qcomtee_objref_to_arg(struct qcomtee_arg *arg, struct tee_param *param,
+			  struct tee_context *ctx);
+int qcomtee_objref_from_arg(struct tee_param *param, struct qcomtee_arg *arg,
+			    struct tee_context *ctx);
+
+/* OBJECTS: */
+
+/* (1) User Object API. */
+
+int is_qcomtee_user_object(struct qcomtee_object *object);
+void qcomtee_user_object_set_notify(struct qcomtee_object *object, bool notify);
+void qcomtee_requests_destroy(struct qcomtee_context_data *ctxdata);
+int qcomtee_user_param_to_object(struct qcomtee_object **object,
+				 struct tee_param *param,
+				 struct tee_context *ctx);
+int qcomtee_user_param_from_object(struct tee_param *param,
+				   struct qcomtee_object *object,
+				   struct tee_context *ctx);
+
+/**
+ * struct qcomtee_user_object_request_data - Data for user object request.
+ * @id: ID assigned to the request.
+ * @object_id: Object ID being invoked by QTEE.
+ * @op: Requested operation on object.
+ * @np: Number of parameters in the request.
+ */
+struct qcomtee_user_object_request_data {
+	int id;
+	u64 object_id;
+	u32 op;
+	int np;
+};
+
+int qcomtee_user_object_select(struct tee_context *ctx,
+			       struct tee_param *params, int num_params,
+			       void __user *uaddr, size_t size,
+			       struct qcomtee_user_object_request_data *data);
+int qcomtee_user_object_submit(struct tee_context *ctx,
+			       struct tee_param *params, int num_params,
+			       int req_id, int errno);
+
+#endif /* QCOMTEE_H */
diff --git a/drivers/tee/qcomtee/qcomtee_msg.h b/drivers/tee/qcomtee/qcomtee_msg.h
new file mode 100644
index 000000000000..878f70178a5b
--- /dev/null
+++ b/drivers/tee/qcomtee/qcomtee_msg.h
@@ -0,0 +1,304 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+ */
+
+#ifndef QCOMTEE_MSG_H
+#define QCOMTEE_MSG_H
+
+#include
+
+/**
+ * DOC: ''Qualcomm TEE'' (QTEE) Transport Message
+ *
+ * There are two buffers shared with QTEE: inbound and outbound buffers.
+ * The inbound buffer is used for direct object invocation, and the outbound
+ * buffer is used to make a request from QTEE to the kernel; i.e., a callback
+ * request.
+ *
+ * The unused tail of the outbound buffer is also used for sending and
+ * receiving asynchronous messages. An asynchronous message is independent of
+ * the current object invocation (i.e., contents of the inbound buffer) or
+ * callback request (i.e., the head of the outbound buffer); see
+ * qcomtee_get_async_buffer(). It is used by endpoints (QTEE or kernel) as an
+ * optimization to reduce the number of context switches between the secure and
+ * non-secure worlds.
+ *
+ * For instance, QTEE never sends an explicit callback request to release an
+ * object in the kernel. Instead, it sends asynchronous release messages in the
+ * outbound buffer when QTEE returns from the previous direct object invocation,
+ * or appends asynchronous release messages after the current callback request.
+ *
+ * QTEE supports two types of arguments in a message: buffer and object
+ * arguments. Depending on the direction of data flow, they could be input
+ * buffer (IB) to QTEE, output buffer (OB) from QTEE, input object (IO) to QTEE,
+ * or output object (OO) from QTEE. Object arguments hold object IDs. Buffer
+ * arguments hold (offset, size) pairs into the inbound or outbound buffers.
+ *
+ * QTEE holds an object table for objects it hosts and exposes to the kernel.
+ * An object ID is an index to the object table in QTEE.
+ *
+ * For the direct object invocation message format in the inbound buffer, see
+ * &struct qcomtee_msg_object_invoke. For the callback request message format
+ * in the outbound buffer, see &struct qcomtee_msg_callback. For the message
+ * format for asynchronous messages in the outbound buffer, see
+ * &struct qcomtee_async_msg_hdr.
+ */
+
+/**
+ * define QCOMTEE_MSG_OBJECT_NS_BIT - Non-secure bit
+ *
+ * Object ID is a globally unique 32-bit number. IDs referencing objects
+ * in the kernel should have %QCOMTEE_MSG_OBJECT_NS_BIT set.
+ */
+#define QCOMTEE_MSG_OBJECT_NS_BIT BIT(31)
+
+/* Static object IDs recognized by QTEE. */
+#define QCOMTEE_MSG_OBJECT_NULL (0U)
+#define QCOMTEE_MSG_OBJECT_ROOT (1U)
+
+/* Definitions from QTEE as part of the transport protocol. */
+
+/* qcomtee_msg_arg is an argument as recognized by QTEE. */
+union qcomtee_msg_arg {
+	struct {
+		u32 offset;
+		u32 size;
+	} b;
+	u32 o;
+};
+
+/* IB and OB payloads in QTEE messages should be at 64-bit boundaries. */
+#define qcomtee_msg_offset_align(o) ALIGN((o), sizeof(u64))
+
+/* Operations for objects are 32-bit. Transport uses the upper 16 bits. */
+#define QCOMTEE_MSG_OBJECT_OP_MASK GENMASK(15, 0)
+
+/* Reserved Operation IDs sent to QTEE: */
+/* QCOMTEE_MSG_OBJECT_OP_RELEASE - Reduces the refcount and releases the object.
+ * QCOMTEE_MSG_OBJECT_OP_RETAIN - Increases the refcount.
+ *
+ * These operation IDs are valid for all objects.
+ */
+
+#define QCOMTEE_MSG_OBJECT_OP_RELEASE (QCOMTEE_MSG_OBJECT_OP_MASK - 0)
+#define QCOMTEE_MSG_OBJECT_OP_RETAIN (QCOMTEE_MSG_OBJECT_OP_MASK - 1)
+
+/* Subset of operations supported by QTEE root object. */
+
+#define QCOMTEE_ROOT_OP_REG_WITH_CREDENTIALS 5
+#define QCOMTEE_ROOT_OP_NOTIFY_DOMAIN_CHANGE 4
+#define QCOMTEE_ROOT_OP_ADCI_ACCEPT 8
+#define QCOMTEE_ROOT_OP_ADCI_SHUTDOWN 9
+
+/* Subset of operations supported by client_env object. */
+
+#define QCOMTEE_CLIENT_ENV_OPEN 0
+
+/* List of available QTEE service UIDs and subset of operations. */
+
+#define QCOMTEE_FEATURE_VER_UID 2033
+#define QCOMTEE_FEATURE_VER_OP_GET 0
+/* Get QTEE version number. */
+#define QCOMTEE_FEATURE_VER_OP_GET_QTEE_ID 10
+#define QTEE_VERSION_GET_MAJOR(x) (((x) >> 22) & 0xffU)
+#define QTEE_VERSION_GET_MINOR(x) (((x) >> 12) & 0xffU)
+#define QTEE_VERSION_GET_PATCH(x) ((x) >> 0 & 0xfffU)
+
+/* Response types as returned from qcomtee_object_invoke_ctx_invoke(). */
+
+/* The message contains a callback request. */
+#define QCOMTEE_RESULT_INBOUND_REQ_NEEDED 3
+
+/**
+ * struct qcomtee_msg_object_invoke - Direct object invocation message.
+ * @cxt: object ID hosted in QTEE.
+ * @op: operation for the object.
+ * @counts: number of different types of arguments in @args. + * @args: array of arguments. + * + * @counts consists of 4 * 4-bit fields. Bits 0 - 3 represent the number of + * input buffers, bits 4 - 7 represent the number of output buffers, + * bits 8 - 11 represent the number of input objects, and bits 12 - 15 + * represent the number of output objects. The remaining bits should be zero. + * + * 15 12 11 8 7 4 3 0 + * +----------------+----------------+----------------+----------------+ + * | #OO objects | #IO objects | #OB buffers | #IB buffers | + * +----------------+----------------+----------------+----------------+ + * + * The maximum number of arguments of each type is defined by + * %QCOMTEE_ARGS_PER_TYPE. + */ +struct qcomtee_msg_object_invoke { + u32 cxt; + u32 op; + u32 counts; + union qcomtee_msg_arg args[]; +}; + +/* Bit masks for the four 4-bit nibbles holding the counts. */ +#define QCOMTEE_MASK_IB GENMASK(3, 0) +#define QCOMTEE_MASK_OB GENMASK(7, 4) +#define QCOMTEE_MASK_IO GENMASK(11, 8) +#define QCOMTEE_MASK_OO GENMASK(15, 12) + +/** + * struct qcomtee_msg_callback - Callback request message. + * @result: result of operation @op on the object referenced by @cxt. + * @cxt: object ID hosted in the kernel. + * @op: operation for the object. + * @counts: number of different types of arguments in @args. + * @args: array of arguments. + * + * For details of @counts, see &qcomtee_msg_object_invoke.counts. + */ +struct qcomtee_msg_callback { + u32 result; + u32 cxt; + u32 op; + u32 counts; + union qcomtee_msg_arg args[]; +}; + +/* Offset in the message for the beginning of the buffer argument's contents. */ +#define qcomtee_msg_buffer_args(t, n) \ + qcomtee_msg_offset_align(struct_size_t(t, args, n)) +/* Pointer to the beginning of a buffer argument's content at an offset. */ +#define qcomtee_msg_offset_to_ptr(m, off) ((void *)&((char *)(m))[(off)]) + +/* Some helpers to manage msg.counts. 
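+ * For example, counts = 0x1011 packs one IB, one OB, no IO, and one OO
+ * argument into the four nibbles described above.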
*/ + +static inline unsigned int qcomtee_msg_num_ib(u32 counts) +{ + return FIELD_GET(QCOMTEE_MASK_IB, counts); +} + +static inline unsigned int qcomtee_msg_num_ob(u32 counts) +{ + return FIELD_GET(QCOMTEE_MASK_OB, counts); +} + +static inline unsigned int qcomtee_msg_num_io(u32 counts) +{ + return FIELD_GET(QCOMTEE_MASK_IO, counts); +} + +static inline unsigned int qcomtee_msg_num_oo(u32 counts) +{ + return FIELD_GET(QCOMTEE_MASK_OO, counts); +} + +static inline unsigned int qcomtee_msg_idx_ib(u32 counts) +{ + return 0; +} + +static inline unsigned int qcomtee_msg_idx_ob(u32 counts) +{ + return qcomtee_msg_num_ib(counts); +} + +static inline unsigned int qcomtee_msg_idx_io(u32 counts) +{ + return qcomtee_msg_idx_ob(counts) + qcomtee_msg_num_ob(counts); +} + +static inline unsigned int qcomtee_msg_idx_oo(u32 counts) +{ + return qcomtee_msg_idx_io(counts) + qcomtee_msg_num_io(counts); +} + +#define qcomtee_msg_for_each(i, first, num) \ + for ((i) = (first); (i) < (first) + (num); (i)++) + +#define qcomtee_msg_for_each_input_buffer(i, m) \ + qcomtee_msg_for_each(i, qcomtee_msg_idx_ib((m)->counts), \ + qcomtee_msg_num_ib((m)->counts)) + +#define qcomtee_msg_for_each_output_buffer(i, m) \ + qcomtee_msg_for_each(i, qcomtee_msg_idx_ob((m)->counts), \ + qcomtee_msg_num_ob((m)->counts)) + +#define qcomtee_msg_for_each_input_object(i, m) \ + qcomtee_msg_for_each(i, qcomtee_msg_idx_io((m)->counts), \ + qcomtee_msg_num_io((m)->counts)) + +#define qcomtee_msg_for_each_output_object(i, m) \ + qcomtee_msg_for_each(i, qcomtee_msg_idx_oo((m)->counts), \ + qcomtee_msg_num_oo((m)->counts)) + +/* Sum of arguments in a message. */ +#define qcomtee_msg_args(m) \ + (qcomtee_msg_idx_oo((m)->counts) + qcomtee_msg_num_oo((m)->counts)) + +static inline void qcomtee_msg_init(struct qcomtee_msg_object_invoke *msg, + u32 cxt, u32 op, int in_buffer, + int out_buffer, int in_object, + int out_object) +{ + u32 counts = 0; + + counts |= (in_buffer & 0xfU); + counts |= ((out_buffer - in_buffer) & 0xfU) << 4; + counts |= ((in_object - out_buffer) & 0xfU) << 8; + counts |= ((out_object - in_object) & 0xfU) << 12; + + msg->cxt = cxt; + msg->op = op; + msg->counts = counts; +} + +/* Generic error codes. */ +#define QCOMTEE_MSG_OK 0 /* non-specific success code. */ +#define QCOMTEE_MSG_ERROR 1 /* non-specific error. */ +#define QCOMTEE_MSG_ERROR_INVALID 2 /* unsupported/unrecognized request. */ +#define QCOMTEE_MSG_ERROR_SIZE_IN 3 /* supplied buffer/string too large. */ +#define QCOMTEE_MSG_ERROR_SIZE_OUT 4 /* supplied output buffer too small. */ +#define QCOMTEE_MSG_ERROR_USERBASE 10 /* start of user-defined error range. */ + +/* Transport layer error codes. */ +#define QCOMTEE_MSG_ERROR_DEFUNCT -90 /* object no longer exists. */ +#define QCOMTEE_MSG_ERROR_ABORT -91 /* calling thread must exit. */ +#define QCOMTEE_MSG_ERROR_BADOBJ -92 /* invalid object context. */ +#define QCOMTEE_MSG_ERROR_NOSLOTS -93 /* caller's object table full. */ +#define QCOMTEE_MSG_ERROR_MAXARGS -94 /* too many args. */ +#define QCOMTEE_MSG_ERROR_MAXDATA -95 /* buffers too large. */ +#define QCOMTEE_MSG_ERROR_UNAVAIL -96 /* the request could not be processed. */ +#define QCOMTEE_MSG_ERROR_KMEM -97 /* kernel out of memory. */ +#define QCOMTEE_MSG_ERROR_REMOTE -98 /* local method sent to remote object. */ +#define QCOMTEE_MSG_ERROR_BUSY -99 /* Object is busy. */ +#define QCOMTEE_MSG_ERROR_TIMEOUT -103 /* Call Back Object invocation timed out. 
*/
+
+static inline void qcomtee_msg_set_result(struct qcomtee_msg_callback *cb_msg,
+					  int err)
+{
+	if (!err) {
+		cb_msg->result = QCOMTEE_MSG_OK;
+	} else if (err < 0) {
+		/* If err < 0, then it is a transport error. */
+		switch (err) {
+		case -ENOMEM:
+			cb_msg->result = QCOMTEE_MSG_ERROR_KMEM;
+			break;
+		case -ENODEV:
+			cb_msg->result = QCOMTEE_MSG_ERROR_DEFUNCT;
+			break;
+		case -ENOSPC:
+		case -EBUSY:
+			cb_msg->result = QCOMTEE_MSG_ERROR_BUSY;
+			break;
+		case -EBADF:
+		case -EINVAL:
+			cb_msg->result = QCOMTEE_MSG_ERROR_UNAVAIL;
+			break;
+		default:
+			cb_msg->result = QCOMTEE_MSG_ERROR;
+		}
+	} else {
+		/* If err > 0, then it is a user-defined error; pass it as is. */
+		cb_msg->result = err;
+	}
+}
+
+#endif /* QCOMTEE_MSG_H */
diff --git a/drivers/tee/qcomtee/qcomtee_object.h b/drivers/tee/qcomtee/qcomtee_object.h
new file mode 100644
index 000000000000..5221449be7db
--- /dev/null
+++ b/drivers/tee/qcomtee/qcomtee_object.h
@@ -0,0 +1,316 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+ */
+
+#ifndef QCOMTEE_OBJECT_H
+#define QCOMTEE_OBJECT_H
+
+#include
+#include
+#include
+#include
+
+struct qcomtee_object;
+
+/**
+ * DOC: Overview
+ *
+ * qcomtee_object provides object refcounting, ID allocation for objects hosted
+ * in the kernel, and necessary message marshaling for Qualcomm TEE (QTEE).
+ *
+ * To invoke an object in QTEE, the user calls qcomtee_object_do_invoke()
+ * while passing an instance of &struct qcomtee_object and the requested
+ * operation + arguments.
+ *
+ * After boot, QTEE provides a static object %ROOT_QCOMTEE_OBJECT (of type
+ * %QCOMTEE_OBJECT_TYPE_ROOT). The root object is invoked to pass the user's
+ * credentials and obtain other instances of &struct qcomtee_object (of type
+ * %QCOMTEE_OBJECT_TYPE_TEE) that represent services and TAs in QTEE;
+ * see &enum qcomtee_object_type.
+ *
+ * The objects received from QTEE are refcounted. So the owner of these objects
+ * can issue qcomtee_object_get() to increase the refcount and pass objects
+ * to other clients, or issue qcomtee_object_put() to decrease the refcount
+ * and release the resources in QTEE.
+ *
+ * The kernel can host services accessible to QTEE. A driver should embed
+ * an instance of &struct qcomtee_object in the struct it wants to export to
+ * QTEE (this is called a callback object). It issues qcomtee_object_user_init()
+ * to set the dispatch() operation for the callback object and set its type
+ * to %QCOMTEE_OBJECT_TYPE_CB.
+ *
+ * core.c holds an object table for callback objects. An object ID is assigned
+ * to each callback object, which is an index to the object table. QTEE uses
+ * these IDs to reference or invoke callback objects.
+ *
+ * If QTEE invokes a callback object in the kernel, the dispatch() operation is
+ * called in the context of the thread that originally called
+ * qcomtee_object_do_invoke().
+ */
+
+/**
+ * enum qcomtee_object_type - Object types.
+ * @QCOMTEE_OBJECT_TYPE_TEE: object hosted on QTEE.
+ * @QCOMTEE_OBJECT_TYPE_CB: object hosted on kernel.
+ * @QCOMTEE_OBJECT_TYPE_ROOT: 'primordial' object.
+ * @QCOMTEE_OBJECT_TYPE_NULL: NULL object.
+ *
+ * The primordial object is used for bootstrapping the IPC connection between
+ * the kernel and QTEE. It is invoked by the kernel when it wants to get a
+ * 'client env'.
+ */ +enum qcomtee_object_type { + QCOMTEE_OBJECT_TYPE_TEE, + QCOMTEE_OBJECT_TYPE_CB, + QCOMTEE_OBJECT_TYPE_ROOT, + QCOMTEE_OBJECT_TYPE_NULL, +}; + +/** + * enum qcomtee_arg_type - Type of QTEE argument. + * @QCOMTEE_ARG_TYPE_INV: invalid type. + * @QCOMTEE_ARG_TYPE_OB: output buffer (OB). + * @QCOMTEE_ARG_TYPE_OO: output object (OO). + * @QCOMTEE_ARG_TYPE_IB: input buffer (IB). + * @QCOMTEE_ARG_TYPE_IO: input object (IO). + * + * Use the invalid type to specify the end of the argument array. + */ +enum qcomtee_arg_type { + QCOMTEE_ARG_TYPE_INV = 0, + QCOMTEE_ARG_TYPE_OB, + QCOMTEE_ARG_TYPE_OO, + QCOMTEE_ARG_TYPE_IB, + QCOMTEE_ARG_TYPE_IO, + QCOMTEE_ARG_TYPE_NR, +}; + +/** + * define QCOMTEE_ARGS_PER_TYPE - Maximum arguments of a specific type. + * + * The QTEE transport protocol limits the maximum number of arguments of + * a specific type (i.e., IB, OB, IO, and OO). + */ +#define QCOMTEE_ARGS_PER_TYPE 16 + +/* Maximum arguments that can fit in a QTEE message, ignoring the type. */ +#define QCOMTEE_ARGS_MAX (QCOMTEE_ARGS_PER_TYPE * (QCOMTEE_ARG_TYPE_NR - 1)) + +struct qcomtee_buffer { + union { + void *addr; + void __user *uaddr; + }; + size_t size; +}; + +/** + * struct qcomtee_arg - Argument for QTEE object invocation. + * @type: type of argument as &enum qcomtee_arg_type. + * @flags: extra flags. + * @b: address and size if the type of argument is a buffer. + * @o: object instance if the type of argument is an object. + * + * &qcomtee_arg.flags only accepts %QCOMTEE_ARG_FLAGS_UADDR for now, which + * states that &qcomtee_arg.b contains a userspace address in uaddr. + */ +struct qcomtee_arg { + enum qcomtee_arg_type type; +/* 'b.uaddr' holds a __user address. */ +#define QCOMTEE_ARG_FLAGS_UADDR BIT(0) + unsigned int flags; + union { + struct qcomtee_buffer b; + struct qcomtee_object *o; + }; +}; + +static inline int qcomtee_args_len(struct qcomtee_arg *args) +{ + int i = 0; + + while (args[i].type != QCOMTEE_ARG_TYPE_INV) + i++; + return i; +} + +/* Context is busy (callback is in progress). */ +#define QCOMTEE_OIC_FLAG_BUSY BIT(1) +/* Context needs to notify the current object. */ +#define QCOMTEE_OIC_FLAG_NOTIFY BIT(2) +/* Context has shared state with QTEE. */ +#define QCOMTEE_OIC_FLAG_SHARED BIT(3) + +/** + * struct qcomtee_object_invoke_ctx - QTEE context for object invocation. + * @ctx: TEE context for this invocation. + * @flags: flags for the invocation context. + * @errno: error code for the invocation. + * @object: current object invoked in this callback context. + * @u: array of arguments for the current invocation (+1 for ending arg). + * @in_msg: inbound buffer shared with QTEE. + * @out_msg: outbound buffer shared with QTEE. + * @in_shm: TEE shm allocated for inbound buffer. + * @out_shm: TEE shm allocated for outbound buffer. + * @data: extra data attached to this context. + */ +struct qcomtee_object_invoke_ctx { + struct tee_context *ctx; + unsigned long flags; + int errno; + + struct qcomtee_object *object; + struct qcomtee_arg u[QCOMTEE_ARGS_MAX + 1]; + + struct qcomtee_buffer in_msg; + struct qcomtee_buffer out_msg; + struct tee_shm *in_shm; + struct tee_shm *out_shm; + + void *data; +}; + +static inline struct qcomtee_object_invoke_ctx * +qcomtee_object_invoke_ctx_alloc(struct tee_context *ctx) +{ + struct qcomtee_object_invoke_ctx *oic; + + oic = kzalloc(sizeof(*oic), GFP_KERNEL); + if (oic) + oic->ctx = ctx; + return oic; +} + +/** + * qcomtee_object_do_invoke() - Submit an invocation for an object. + * @oic: context to use for the current invocation. 
+ * @object: object being invoked. + * @op: requested operation on the object. + * @u: array of arguments for the current invocation. + * @result: result returned from QTEE. + * + * The caller is responsible for keeping track of the refcount for each object, + * including @object. On return, the caller loses ownership of all input + * objects of type %QCOMTEE_OBJECT_TYPE_CB. + * + * @object can be of %QCOMTEE_OBJECT_TYPE_ROOT or %QCOMTEE_OBJECT_TYPE_TEE. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_object_do_invoke(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *object, u32 op, + struct qcomtee_arg *u, int *result); + +/** + * struct qcomtee_object_operations - Callback object operations. + * @release: release the object if QTEE is not using it. + * @dispatch: dispatch the operation requested by QTEE. + * @notify: report the status of any pending response submitted by @dispatch. + */ +struct qcomtee_object_operations { + void (*release)(struct qcomtee_object *object); + int (*dispatch)(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *object, u32 op, + struct qcomtee_arg *args); + void (*notify)(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *object, int err); +}; + +/** + * struct qcomtee_object - QTEE or kernel object. + * @name: object name. + * @refcount: reference counter. + * @object_type: object type as &enum qcomtee_object_type. + * @info: extra information for the object. + * @ops: callback operations for objects of type %QCOMTEE_OBJECT_TYPE_CB. + * @work: work for async operations on the object. + * + * @work is used for releasing objects of %QCOMTEE_OBJECT_TYPE_TEE type. + */ +struct qcomtee_object { + const char *name; + struct kref refcount; + + enum qcomtee_object_type object_type; + struct object_info { + unsigned long qtee_id; + /* TEE context for QTEE object async requests. */ + struct tee_context *qcomtee_async_ctx; + } info; + + struct qcomtee_object_operations *ops; + struct work_struct work; +}; + +/* Static instances of qcomtee_object objects. */ +#define NULL_QCOMTEE_OBJECT ((struct qcomtee_object *)(0)) +extern struct qcomtee_object qcomtee_object_root; +#define ROOT_QCOMTEE_OBJECT (&qcomtee_object_root) + +static inline enum qcomtee_object_type +typeof_qcomtee_object(struct qcomtee_object *object) +{ + if (object == NULL_QCOMTEE_OBJECT) + return QCOMTEE_OBJECT_TYPE_NULL; + return object->object_type; +} + +static inline const char *qcomtee_object_name(struct qcomtee_object *object) +{ + if (object == NULL_QCOMTEE_OBJECT) + return "null"; + + if (!object->name) + return "no-name"; + return object->name; +} + +/** + * qcomtee_object_user_init() - Initialize an object for the user. + * @object: object to initialize. + * @ot: type of object as &enum qcomtee_object_type. + * @ops: instance of callbacks. + * @fmt: name assigned to the object. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_object_user_init(struct qcomtee_object *object, + enum qcomtee_object_type ot, + struct qcomtee_object_operations *ops, + const char *fmt, ...) __printf(4, 5); + +/* Object release is RCU protected. */ +int qcomtee_object_get(struct qcomtee_object *object); +void qcomtee_object_put(struct qcomtee_object *object); + +#define qcomtee_arg_for_each(i, args) \ + for (i = 0; args[i].type != QCOMTEE_ARG_TYPE_INV; i++) + +/* Next argument of type @type after index @i. 
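+ * It returns the index of the next argument of the given type, or the index
+ * of the terminating %QCOMTEE_ARG_TYPE_INV entry when no more remain (as
+ * relied upon by qcomtee_arg_for_each_type() below).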
+ */
+int qcomtee_next_arg_type(struct qcomtee_arg *u, int i,
+			  enum qcomtee_arg_type type);
+
+/* Iterate over argument of given type. */
+#define qcomtee_arg_for_each_type(i, args, at) \
+	for (i = qcomtee_next_arg_type(args, 0, at); \
+	     args[i].type != QCOMTEE_ARG_TYPE_INV; \
+	     i = qcomtee_next_arg_type(args, i + 1, at))
+
+#define qcomtee_arg_for_each_input_buffer(i, args) \
+	qcomtee_arg_for_each_type(i, args, QCOMTEE_ARG_TYPE_IB)
+#define qcomtee_arg_for_each_output_buffer(i, args) \
+	qcomtee_arg_for_each_type(i, args, QCOMTEE_ARG_TYPE_OB)
+#define qcomtee_arg_for_each_input_object(i, args) \
+	qcomtee_arg_for_each_type(i, args, QCOMTEE_ARG_TYPE_IO)
+#define qcomtee_arg_for_each_output_object(i, args) \
+	qcomtee_arg_for_each_type(i, args, QCOMTEE_ARG_TYPE_OO)
+
+struct qcomtee_object *
+qcomtee_object_get_client_env(struct qcomtee_object_invoke_ctx *oic);
+
+struct qcomtee_object *
+qcomtee_object_get_service(struct qcomtee_object_invoke_ctx *oic,
+			   struct qcomtee_object *client_env, u32 uid);
+
+#endif /* QCOMTEE_OBJECT_H */
diff --git a/drivers/tee/qcomtee/shm.c b/drivers/tee/qcomtee/shm.c
new file mode 100644
index 000000000000..2aea76487372
--- /dev/null
+++ b/drivers/tee/qcomtee/shm.c
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include
+#include
+
+#include "qcomtee.h"
+
+/**
+ * define MAX_OUTBOUND_BUFFER_SIZE - Maximum size of outbound buffers.
+ *
+ * The size of the outbound buffer depends on QTEE callback requests.
+ */
+#define MAX_OUTBOUND_BUFFER_SIZE SZ_4K
+
+/**
+ * define MAX_INBOUND_BUFFER_SIZE - Maximum size of the inbound buffer.
+ *
+ * The size of the inbound buffer depends on the user's requests,
+ * specifically the number of IB and OB arguments. If an invocation
+ * requires a size larger than %MAX_INBOUND_BUFFER_SIZE, the user should
+ * consider using another form of shared memory with QTEE.
+ */
+#define MAX_INBOUND_BUFFER_SIZE SZ_4M
+
+/**
+ * qcomtee_msg_buffers_alloc() - Allocate inbound and outbound buffers.
+ * @oic: context to use for the current invocation.
+ * @u: array of arguments for the current invocation.
+ *
+ * It calculates the size of inbound and outbound buffers based on the
+ * arguments in @u. It allocates the buffers from the teedev pool.
+ *
+ * Return: On success, returns 0. On error, returns < 0.
+ */
+int qcomtee_msg_buffers_alloc(struct qcomtee_object_invoke_ctx *oic,
+			      struct qcomtee_arg *u)
+{
+	struct tee_context *ctx = oic->ctx;
+	struct tee_shm *shm;
+	size_t size;
+	int i;
+
+	/* Start offset in a message for buffer arguments. */
+	size = qcomtee_msg_buffer_args(struct qcomtee_msg_object_invoke,
+				       qcomtee_args_len(u));
+	if (size > MAX_INBOUND_BUFFER_SIZE)
+		return -EINVAL;
+
+	/* Add size of IB arguments. */
+	qcomtee_arg_for_each_input_buffer(i, u) {
+		size = size_add(size, qcomtee_msg_offset_align(u[i].b.size));
+		if (size > MAX_INBOUND_BUFFER_SIZE)
+			return -EINVAL;
+	}
+
+	/* Add size of OB arguments. */
+	qcomtee_arg_for_each_output_buffer(i, u) {
+		size = size_add(size, qcomtee_msg_offset_align(u[i].b.size));
+		if (size > MAX_INBOUND_BUFFER_SIZE)
+			return -EINVAL;
+	}
+
+	shm = tee_shm_alloc_priv_buf(ctx, size);
+	if (IS_ERR(shm))
+		return PTR_ERR(shm);
+
+	/* Allocate inbound buffer. */
+	oic->in_shm = shm;
+	shm = tee_shm_alloc_priv_buf(ctx, MAX_OUTBOUND_BUFFER_SIZE);
+	if (IS_ERR(shm)) {
+		tee_shm_free(oic->in_shm);
+
+		return PTR_ERR(shm);
+	}
+	/* Allocate outbound buffer. */
+	oic->out_shm = shm;
+
+	oic->in_msg.addr = tee_shm_get_va(oic->in_shm, 0);
+	oic->in_msg.size = tee_shm_get_size(oic->in_shm);
+	oic->out_msg.addr = tee_shm_get_va(oic->out_shm, 0);
+	oic->out_msg.size = tee_shm_get_size(oic->out_shm);
+	/* QTEE assumes unused buffers are zeroed. */
+	memzero_explicit(oic->in_msg.addr, oic->in_msg.size);
+	memzero_explicit(oic->out_msg.addr, oic->out_msg.size);
+
+	return 0;
+}
+
+void qcomtee_msg_buffers_free(struct qcomtee_object_invoke_ctx *oic)
+{
+	tee_shm_free(oic->in_shm);
+	tee_shm_free(oic->out_shm);
+}
+
+/* Dynamic shared memory pool based on tee_dyn_shm_alloc_helper(). */
+
+static int qcomtee_shm_register(struct tee_context *ctx, struct tee_shm *shm,
+				struct page **pages, size_t num_pages,
+				unsigned long start)
+{
+	return qcom_tzmem_shm_bridge_create(shm->paddr, shm->size,
+					    &shm->sec_world_id);
+}
+
+static int qcomtee_shm_unregister(struct tee_context *ctx, struct tee_shm *shm)
+{
+	qcom_tzmem_shm_bridge_delete(shm->sec_world_id);
+
+	return 0;
+}
+
+static int pool_op_alloc(struct tee_shm_pool *pool, struct tee_shm *shm,
+			 size_t size, size_t align)
+{
+	if (!(shm->flags & TEE_SHM_PRIV))
+		return -ENOMEM;
+
+	return tee_dyn_shm_alloc_helper(shm, size, align, qcomtee_shm_register);
+}
+
+static void pool_op_free(struct tee_shm_pool *pool, struct tee_shm *shm)
+{
+	tee_dyn_shm_free_helper(shm, qcomtee_shm_unregister);
+}
+
+static void pool_op_destroy_pool(struct tee_shm_pool *pool)
+{
+	kfree(pool);
+}
+
+static const struct tee_shm_pool_ops pool_ops = {
+	.alloc = pool_op_alloc,
+	.free = pool_op_free,
+	.destroy_pool = pool_op_destroy_pool,
+};
+
+struct tee_shm_pool *qcomtee_shm_pool_alloc(void)
+{
+	struct tee_shm_pool *pool;
+
+	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+	if (!pool)
+		return ERR_PTR(-ENOMEM);
+
+	pool->ops = &pool_ops;
+
+	return pool;
+}
diff --git a/drivers/tee/qcomtee/user_obj.c b/drivers/tee/qcomtee/user_obj.c
new file mode 100644
index 000000000000..0139905f2684
--- /dev/null
+++ b/drivers/tee/qcomtee/user_obj.c
@@ -0,0 +1,692 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include
+#include
+
+#include "qcomtee.h"
+
+/**
+ * DOC: User Objects aka Supplicants
+ *
+ * Any userspace process with access to the TEE device file can behave as a
+ * supplicant by creating a user object. Any TEE parameter of type OBJREF with
+ * the %QCOMTEE_OBJREF_FLAG_USER flag set is considered a user object.
+ *
+ * A supplicant uses qcomtee_user_object_select() (i.e. TEE_IOC_SUPPL_RECV) to
+ * receive a QTEE user object request and qcomtee_user_object_submit()
+ * (i.e. TEE_IOC_SUPPL_SEND) to submit a response. QTEE expects to receive the
+ * response, including OB and OO in a specific order in the message; parameters
+ * submitted with qcomtee_user_object_submit() should maintain this order.
+ */
+
+/**
+ * struct qcomtee_user_object - User object.
+ * @object: &struct qcomtee_object representing the user object.
+ * @ctx: context for which the user object is defined.
+ * @object_id: object ID in @ctx.
+ * @notify: notify on release.
+ *
+ * Any object managed in userspace is represented by this struct.
+ * If @notify is set, a notification message is sent back to userspace
+ * upon release.
+ */
+struct qcomtee_user_object {
+	struct qcomtee_object object;
+	struct tee_context *ctx;
+	u64 object_id;
+	bool notify;
+};
+
+#define to_qcomtee_user_object(o) \
+	container_of((o), struct qcomtee_user_object, object)
+
+static struct qcomtee_object_operations qcomtee_user_object_ops;
+
+/* Is it a user object? */
+int is_qcomtee_user_object(struct qcomtee_object *object)
+{
+	return object != NULL_QCOMTEE_OBJECT &&
+	       typeof_qcomtee_object(object) == QCOMTEE_OBJECT_TYPE_CB &&
+	       object->ops == &qcomtee_user_object_ops;
+}
+
+/* Set the user object's 'notify on release' flag. */
+void qcomtee_user_object_set_notify(struct qcomtee_object *object, bool notify)
+{
+	if (is_qcomtee_user_object(object))
+		to_qcomtee_user_object(object)->notify = notify;
+}
+
+/* Supplicant Requests: */
+
+/**
+ * enum qcomtee_req_state - Current state of request.
+ * @QCOMTEE_REQ_QUEUED: Request is waiting for supplicant.
+ * @QCOMTEE_REQ_PROCESSING: Request has been picked by the supplicant.
+ * @QCOMTEE_REQ_PROCESSED: Response has been submitted for the request.
+ */
+enum qcomtee_req_state {
+	QCOMTEE_REQ_QUEUED = 1,
+	QCOMTEE_REQ_PROCESSING,
+	QCOMTEE_REQ_PROCESSED,
+};
+
+/* User requests sent to supplicants. */
+struct qcomtee_ureq {
+	enum qcomtee_req_state state;
+
+	/* User Request: */
+	int req_id;
+	u64 object_id;
+	u32 op;
+	struct qcomtee_arg *args;
+	int errno;
+
+	struct list_head node;
+	struct completion c; /* Completion for whoever waits. */
+};
+
+/*
+ * Placeholder for a PROCESSING request in qcomtee_context_data.reqs_idr.
+ *
+ * If the thread that calls qcomtee_object_do_invoke() dies and the supplicant
+ * is processing the request, replace the entry in
+ * qcomtee_context_data.reqs_idr with empty_ureq. This ensures that (1) the
+ * req_id remains busy and is not reused, and (2) the supplicant fails to
+ * submit the response and performs the necessary rollback.
+ */
+static struct qcomtee_ureq empty_ureq = { .state = QCOMTEE_REQ_PROCESSING };
+
+/* Enqueue a user request for a context and assign a request ID. */
+static int ureq_enqueue(struct qcomtee_context_data *ctxdata,
+			struct qcomtee_ureq *ureq)
+{
+	int ret;
+
+	guard(mutex)(&ctxdata->reqs_lock);
+	/* Supplicant is dying. */
+	if (ctxdata->released)
+		return -ENODEV;
+
+	/* Allocate an ID and queue the request. */
+	ret = idr_alloc(&ctxdata->reqs_idr, ureq, 0, 0, GFP_KERNEL);
+	if (ret < 0)
+		return ret;
+
+	ureq->req_id = ret;
+	ureq->state = QCOMTEE_REQ_QUEUED;
+	list_add_tail(&ureq->node, &ctxdata->reqs_list);
+
+	return 0;
+}
+
+/**
+ * ureq_dequeue() - Dequeue a user request from a context.
+ * @ctxdata: context data for a context to dequeue the request.
+ * @req_id: ID of the request to be dequeued.
+ *
+ * It dequeues a user request and releases its request ID.
+ *
+ * Context: The caller should hold &qcomtee_context_data->reqs_lock.
+ * Return: Returns the user request associated with this ID; otherwise, NULL.
+ */
+static struct qcomtee_ureq *ureq_dequeue(struct qcomtee_context_data *ctxdata,
+					 int req_id)
+{
+	struct qcomtee_ureq *ureq;
+
+	ureq = idr_remove(&ctxdata->reqs_idr, req_id);
+	if (ureq == &empty_ureq || !ureq)
+		return NULL;
+
+	list_del(&ureq->node);
+
+	return ureq;
+}
+
+/**
+ * ureq_select() - Select the next request in a context.
+ * @ctxdata: context data for a context to pop a request.
+ * @ubuf_size: size of the available buffer for UBUF parameters.
+ * @num_params: number of entries for the TEE parameter array.
+ *
+ * It checks if @num_params is large enough to fit the next request arguments.
+ * It checks if @ubuf_size is large enough to fit IB buffer arguments. + * + * Context: The caller should hold &qcomtee_context_data->reqs_lock. + * Return: On success, returns a request; + * on failure, returns NULL and ERR_PTR. + */ +static struct qcomtee_ureq *ureq_select(struct qcomtee_context_data *ctxdata, + size_t ubuf_size, int num_params) +{ + struct qcomtee_ureq *req, *ureq = NULL; + struct qcomtee_arg *u; + int i; + + /* Find the a queued request. */ + list_for_each_entry(req, &ctxdata->reqs_list, node) { + if (req->state == QCOMTEE_REQ_QUEUED) { + ureq = req; + break; + } + } + + if (!ureq) + return NULL; + + u = ureq->args; + /* (1) Is there enough TEE parameters? */ + if (num_params < qcomtee_args_len(u)) + return ERR_PTR(-EINVAL); + /* (2) Is there enough space to pass input buffers? */ + qcomtee_arg_for_each_input_buffer(i, u) { + ubuf_size = size_sub(ubuf_size, u[i].b.size); + if (ubuf_size == SIZE_MAX) + return ERR_PTR(-EINVAL); + + ubuf_size = round_down(ubuf_size, 8); + } + + return ureq; +} + +/* Gets called when the user closes the device. */ +void qcomtee_requests_destroy(struct qcomtee_context_data *ctxdata) +{ + struct qcomtee_ureq *req, *ureq; + + guard(mutex)(&ctxdata->reqs_lock); + /* So ureq_enqueue() refuses new requests from QTEE. */ + ctxdata->released = true; + /* ureqs in reqs_list are in QUEUED or PROCESSING (!= empty_ureq) state. */ + list_for_each_entry_safe(ureq, req, &ctxdata->reqs_list, node) { + ureq_dequeue(ctxdata, ureq->req_id); + + if (ureq->op != QCOMTEE_MSG_OBJECT_OP_RELEASE) { + ureq->state = QCOMTEE_REQ_PROCESSED; + ureq->errno = -ENODEV; + + complete(&ureq->c); + } else { + kfree(ureq); + } + } +} + +/* User Object API. */ + +/* User object dispatcher. */ +static int qcomtee_user_object_dispatch(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *object, u32 op, + struct qcomtee_arg *args) +{ + struct qcomtee_user_object *uo = to_qcomtee_user_object(object); + struct qcomtee_context_data *ctxdata = uo->ctx->data; + struct qcomtee_ureq *ureq __free(kfree) = NULL; + int errno; + + ureq = kzalloc(sizeof(*ureq), GFP_KERNEL); + if (!ureq) + return -ENOMEM; + + init_completion(&ureq->c); + ureq->object_id = uo->object_id; + ureq->op = op; + ureq->args = args; + + /* Queue the request. */ + if (ureq_enqueue(ctxdata, ureq)) + return -ENODEV; + /* Wakeup supplicant to process it. */ + complete(&ctxdata->req_c); + + /* + * Wait for the supplicant to process the request. Wait as KILLABLE + * in case the supplicant and invoke thread are both running from the + * same process, the supplicant crashes, or the shutdown sequence + * starts with supplicant dies first; otherwise, it stuck indefinitely. + * + * If the supplicant processes long-running requests, also use + * TASK_FREEZABLE to allow the device to safely suspend if needed. + */ + if (!wait_for_completion_state(&ureq->c, + TASK_KILLABLE | TASK_FREEZABLE)) { + errno = ureq->errno; + if (!errno) + oic->data = no_free_ptr(ureq); + } else { + enum qcomtee_req_state prev_state; + + errno = -ENODEV; + + scoped_guard(mutex, &ctxdata->reqs_lock) { + prev_state = ureq->state; + /* Replace with empty_ureq to keep req_id reserved. */ + if (prev_state == QCOMTEE_REQ_PROCESSING) { + list_del(&ureq->node); + idr_replace(&ctxdata->reqs_idr, + &empty_ureq, ureq->req_id); + + /* Remove as supplicant has never seen this request. */ + } else if (prev_state == QCOMTEE_REQ_QUEUED) { + ureq_dequeue(ctxdata, ureq->req_id); + } + } + + /* Supplicant did some work, do not discard it. 
*/ + if (prev_state == QCOMTEE_REQ_PROCESSED) { + errno = ureq->errno; + if (!errno) + oic->data = no_free_ptr(ureq); + } + } + + return errno; +} + +/* Gets called after submitting the dispatcher response. */ +static void qcomtee_user_object_notify(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *unused_object, + int err) +{ + struct qcomtee_ureq *ureq = oic->data; + struct qcomtee_arg *u = ureq->args; + int i; + + /* + * If err, there was a transport issue, and QTEE did not receive the + * response for the dispatcher. Release the callback object created for + * QTEE, in addition to the copies of objects kept for the drivers. + */ + qcomtee_arg_for_each_output_object(i, u) { + if (err && + (typeof_qcomtee_object(u[i].o) == QCOMTEE_OBJECT_TYPE_CB)) + qcomtee_object_put(u[i].o); + qcomtee_object_put(u[i].o); + } + + kfree(ureq); +} + +static void qcomtee_user_object_release(struct qcomtee_object *object) +{ + struct qcomtee_user_object *uo = to_qcomtee_user_object(object); + struct qcomtee_context_data *ctxdata = uo->ctx->data; + struct qcomtee_ureq *ureq; + + /* RELEASE does not require any argument. */ + static struct qcomtee_arg args[] = { { .type = QCOMTEE_ARG_TYPE_INV } }; + + if (!uo->notify) + goto out_no_notify; + + ureq = kzalloc(sizeof(*ureq), GFP_KERNEL); + if (!ureq) + goto out_no_notify; + + /* QUEUE a release request: */ + ureq->object_id = uo->object_id; + ureq->op = QCOMTEE_MSG_OBJECT_OP_RELEASE; + ureq->args = args; + if (ureq_enqueue(ctxdata, ureq)) { + kfree(ureq); + /* Ignore the notification if it cannot be queued. */ + goto out_no_notify; + } + + complete(&ctxdata->req_c); + +out_no_notify: + teedev_ctx_put(uo->ctx); + kfree(uo); +} + +static struct qcomtee_object_operations qcomtee_user_object_ops = { + .release = qcomtee_user_object_release, + .notify = qcomtee_user_object_notify, + .dispatch = qcomtee_user_object_dispatch, +}; + +/** + * qcomtee_user_param_to_object() - OBJREF parameter to &struct qcomtee_object. + * @object: object returned. + * @param: TEE parameter. + * @ctx: context in which the conversion should happen. + * + * @param is an OBJREF with %QCOMTEE_OBJREF_FLAG_USER flags. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_user_param_to_object(struct qcomtee_object **object, + struct tee_param *param, + struct tee_context *ctx) +{ + struct qcomtee_user_object *user_object __free(kfree) = NULL; + int err; + + user_object = kzalloc(sizeof(*user_object), GFP_KERNEL); + if (!user_object) + return -ENOMEM; + + user_object->ctx = ctx; + user_object->object_id = param->u.objref.id; + /* By default, always notify userspace upon release. */ + user_object->notify = true; + err = qcomtee_object_user_init(&user_object->object, + QCOMTEE_OBJECT_TYPE_CB, + &qcomtee_user_object_ops, "uo-%llu", + param->u.objref.id); + if (err) + return err; + /* Matching teedev_ctx_put() is in qcomtee_user_object_release(). */ + teedev_ctx_get(ctx); + + *object = &no_free_ptr(user_object)->object; + + return 0; +} + +/* Reverse what qcomtee_user_param_to_object() does. */ +int qcomtee_user_param_from_object(struct tee_param *param, + struct qcomtee_object *object, + struct tee_context *ctx) +{ + struct qcomtee_user_object *uo; + + uo = to_qcomtee_user_object(object); + /* Ensure the object is in the same context as the caller. */ + if (uo->ctx != ctx) + return -EINVAL; + + param->u.objref.id = uo->object_id; + param->u.objref.flags = QCOMTEE_OBJREF_FLAG_USER; + + /* User objects are valid in userspace; do not keep a copy. 
*/
+	qcomtee_object_put(object);
+
+	return 0;
+}
+
+/**
+ * qcomtee_cb_params_from_args() - Convert QTEE arguments to TEE parameters.
+ * @params: TEE parameters.
+ * @u: QTEE arguments.
+ * @num_params: number of elements in the parameter array.
+ * @ubuf_addr: user buffer for arguments of type %QCOMTEE_ARG_TYPE_IB.
+ * @ubuf_size: size of the user buffer.
+ * @ctx: context in which the conversion should happen.
+ *
+ * It expects @params to have enough entries for @u. Entries in @params are of
+ * %TEE_IOCTL_PARAM_ATTR_TYPE_NONE.
+ *
+ * Return: On success, returns the number of input parameters;
+ *         on failure, returns < 0.
+ */
+static int qcomtee_cb_params_from_args(struct tee_param *params,
+				       struct qcomtee_arg *u, int num_params,
+				       void __user *ubuf_addr, size_t ubuf_size,
+				       struct tee_context *ctx)
+{
+	int i, np;
+	void __user *uaddr;
+
+	qcomtee_arg_for_each(i, u) {
+		switch (u[i].type) {
+		case QCOMTEE_ARG_TYPE_IB:
+			params[i].attr = TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT;
+
+			/* Underflow already checked in ureq_select(). */
+			ubuf_size = round_down(ubuf_size - u[i].b.size, 8);
+			uaddr = (void __user *)(ubuf_addr + ubuf_size);
+
+			params[i].u.ubuf.uaddr = uaddr;
+			params[i].u.ubuf.size = u[i].b.size;
+			if (copy_to_user(params[i].u.ubuf.uaddr, u[i].b.addr,
+					 u[i].b.size))
+				goto out_failed;
+
+			break;
+		case QCOMTEE_ARG_TYPE_OB:
+			params[i].attr = TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT;
+			/* Let the user know the maximum size QTEE expects. */
+			params[i].u.ubuf.size = u[i].b.size;
+
+			break;
+		case QCOMTEE_ARG_TYPE_IO:
+			params[i].attr = TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT;
+			if (qcomtee_objref_from_arg(&params[i], &u[i], ctx))
+				goto out_failed;
+
+			break;
+		case QCOMTEE_ARG_TYPE_OO:
+			params[i].attr =
+				TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT;
+
+			break;
+		default: /* Never get here! */
+			goto out_failed;
+		}
+	}
+
+	return i;
+
+out_failed:
+	/* Undo qcomtee_objref_from_arg(). */
+	for (np = i; np >= 0; np--) {
+		if (params[np].attr == TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT)
+			qcomtee_context_del_qtee_object(&params[np], ctx);
+	}
+
+	/* Release any IO objects not processed. */
+	for (; u[i].type; i++) {
+		if (u[i].type == QCOMTEE_ARG_TYPE_IO)
+			qcomtee_object_put(u[i].o);
+	}
+
+	return -EINVAL;
+}
+
+/**
+ * qcomtee_cb_params_to_args() - Convert TEE parameters to QTEE arguments.
+ * @u: QTEE arguments.
+ * @params: TEE parameters.
+ * @num_params: number of elements in the parameter array.
+ * @ctx: context in which the conversion should happen.
+ *
+ * Return: On success, returns 0; on failure, returns < 0.
+ */
+static int qcomtee_cb_params_to_args(struct qcomtee_arg *u,
+				     struct tee_param *params, int num_params,
+				     struct tee_context *ctx)
+{
+	int i;
+
+	qcomtee_arg_for_each(i, u) {
+		switch (u[i].type) {
+		case QCOMTEE_ARG_TYPE_IB:
+			if (params[i].attr !=
+			    TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT)
+				goto out_failed;
+
+			break;
+		case QCOMTEE_ARG_TYPE_OB:
+			if (params[i].attr !=
+			    TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT)
+				goto out_failed;
+
+			/* The client cannot send more data than requested. */
+			if (params[i].u.ubuf.size > u[i].b.size)
+				goto out_failed;
+
+			if (copy_from_user(u[i].b.addr, params[i].u.ubuf.uaddr,
+					   params[i].u.ubuf.size))
+				goto out_failed;
+
+			u[i].b.size = params[i].u.ubuf.size;
+
+			break;
+		case QCOMTEE_ARG_TYPE_IO:
+			if (params[i].attr !=
+			    TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT)
+				goto out_failed;
+
+			break;
+		case QCOMTEE_ARG_TYPE_OO:
+			if (params[i].attr !=
+			    TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT)
+				goto out_failed;
+
+			if (qcomtee_objref_to_arg(&u[i], &params[i], ctx))
+				goto out_failed;
+
+			break;
+		default: /* Never get here! */
+			goto out_failed;
+		}
+	}
+
+	return 0;
+
+out_failed:
+	/* Undo qcomtee_objref_to_arg(). */
+	for (i--; i >= 0; i--) {
+		if (u[i].type != QCOMTEE_ARG_TYPE_OO)
+			continue;
+
+		qcomtee_user_object_set_notify(u[i].o, false);
+		if (typeof_qcomtee_object(u[i].o) == QCOMTEE_OBJECT_TYPE_CB)
+			qcomtee_object_put(u[i].o);
+
+		qcomtee_object_put(u[i].o);
+	}
+
+	return -EINVAL;
+}
+
+/**
+ * qcomtee_user_object_select() - Select a request for a user object.
+ * @ctx: context to look for a user object.
+ * @params: parameters for @op.
+ * @num_params: number of elements in the parameter array.
+ * @uaddr: user buffer for output UBUF parameters.
+ * @size: size of user buffer @uaddr.
+ * @data: information for the selected request.
+ *
+ * @params is filled along with @data for the selected request.
+ *
+ * Return: On success, returns 0; on failure, returns < 0.
+ */
+int qcomtee_user_object_select(struct tee_context *ctx,
+			       struct tee_param *params, int num_params,
+			       void __user *uaddr, size_t size,
+			       struct qcomtee_user_object_request_data *data)
+{
+	struct qcomtee_context_data *ctxdata = ctx->data;
+	struct qcomtee_ureq *ureq;
+	int ret;
+
+	/*
+	 * Hold the reqs_lock not only for ureq_select() and updating the ureq
+	 * state to PROCESSING but for the entire duration of ureq access.
+	 * This prevents qcomtee_user_object_dispatch() from freeing ureq
+	 * while it is still in use if the client dies.
+	 */
+
+	while (1) {
+		scoped_guard(mutex, &ctxdata->reqs_lock) {
+			ureq = ureq_select(ctxdata, size, num_params);
+			if (!ureq)
+				goto wait_for_request;
+
+			if (IS_ERR(ureq))
+				return PTR_ERR(ureq);
+
+			/* Processing the request 'QUEUED -> PROCESSING'. */
+			ureq->state = QCOMTEE_REQ_PROCESSING;
+			/* ''Prepare user request:'' */
+			data->id = ureq->req_id;
+			data->object_id = ureq->object_id;
+			data->op = ureq->op;
+			ret = qcomtee_cb_params_from_args(params, ureq->args,
+							  num_params, uaddr,
+							  size, ctx);
+			if (ret >= 0)
+				goto done_request;
+
+			/* Something is wrong with the request: */
+			ureq_dequeue(ctxdata, data->id);
+			/* Send error to QTEE. */
+			ureq->state = QCOMTEE_REQ_PROCESSED;
+			ureq->errno = ret;
+
+			complete(&ureq->c);
+		}
+
+		continue;
+wait_for_request:
+		/* Wait for a new QUEUED request. */
+		if (wait_for_completion_interruptible(&ctxdata->req_c))
+			return -ERESTARTSYS;
+	}
+
+done_request:
+	/* No one is waiting for the response. */
+	if (data->op == QCOMTEE_MSG_OBJECT_OP_RELEASE) {
+		scoped_guard(mutex, &ctxdata->reqs_lock)
+			ureq_dequeue(ctxdata, data->id);
+		kfree(ureq);
+	}
+
+	data->np = ret;
+
+	return 0;
+}
+
+/**
+ * qcomtee_user_object_submit() - Submit a response for a user object.
+ * @ctx: context to look for a user object.
+ * @params: returned parameters.
+ * @num_params: number of elements in the parameter array.
+ * @req_id: request ID for the response.
+ * @errno: result of user object invocation.
+ *
+ * Return: On success, returns 0; on failure, returns < 0.
+ */
+int qcomtee_user_object_submit(struct tee_context *ctx,
+			       struct tee_param *params, int num_params,
+			       int req_id, int errno)
+{
+	struct qcomtee_context_data *ctxdata = ctx->data;
+	struct qcomtee_ureq *ureq;
+
+	/* See comments for reqs_lock in qcomtee_user_object_select(). */
+	guard(mutex)(&ctxdata->reqs_lock);
+
+	ureq = ureq_dequeue(ctxdata, req_id);
+	if (!ureq)
+		return -EINVAL;
+
+	ureq->state = QCOMTEE_REQ_PROCESSED;
+
+	if (!errno)
+		ureq->errno = qcomtee_cb_params_to_args(ureq->args, params,
+							num_params, ctx);
+	else
+		ureq->errno = errno;
+	/* Return errno if qcomtee_cb_params_to_args() failed; otherwise 0. */
+	if (!errno && ureq->errno)
+		errno = ureq->errno;
+	else
+		errno = 0;
+
+	/* Send result to QTEE. */
+	complete(&ureq->c);
+
+	return errno;
+}
diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h
index a5466b503bfe..386ad36f1a0a 100644
--- a/include/uapi/linux/tee.h
+++ b/include/uapi/linux/tee.h
@@ -59,6 +59,7 @@
 #define TEE_IMPL_ID_OPTEE	1
 #define TEE_IMPL_ID_AMDTEE	2
 #define TEE_IMPL_ID_TSTEE	3
+#define TEE_IMPL_ID_QTEE	4
 
 /*
  * OP-TEE specific capabilities
--
cgit v1.2.3

From 2293c57484ae64c9a3c847c8807db8c26a3a4d41 Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)"
Date: Fri, 12 Sep 2025 14:52:21 +0200
Subject: mptcp: pm: nl: announce deny-join-id0 flag

During the connection establishment, a peer can tell the other one that
it cannot establish new subflows to the initial IP address and port by
setting the 'C' flag [1]. Doing so makes sense when the sender is behind
a strict NAT, operating behind a legacy Layer 4 load balancer, or using
an anycast IP address, for example.

When this 'C' flag is set, the path-managers must then not try to
establish new subflows to the other peer's initial IP address and port.
The in-kernel PM has access to this info, but the userspace PM didn't.

The RFC8684 [1] is strict about that:

  (...) therefore the receiver MUST NOT try to open any additional
  subflows toward this address and port.

So it is important to tell userspace about it, as userspace is
responsible for respecting this flag.

When a new connection is created and established, the Netlink events now
contain the existing but not currently used 'flags' attribute. When
MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 is set, it means no other subflows to the
initial IP address and port -- info that is also part of the event --
can be established.

Link: https://datatracker.ietf.org/doc/html/rfc8684#section-3.1-20.6 [1]
Fixes: 702c2f646d42 ("mptcp: netlink: allow userspace-driven subflow establishment")
Reported-by: Marek Majkowski
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/532
Reviewed-by: Mat Martineau
Signed-off-by: Matthieu Baerts (NGI0)
Link: https://patch.msgid.link/20250912-net-mptcp-pm-uspace-deny_join_id0-v1-2-40171884ade8@kernel.org
Signed-off-by: Jakub Kicinski
---
 Documentation/netlink/specs/mptcp_pm.yaml | 4 ++--
 include/uapi/linux/mptcp.h                | 2 ++
 include/uapi/linux/mptcp_pm.h             | 4 ++--
 net/mptcp/pm_netlink.c                    | 7 +++++++
 4 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml
index d15335684ec3..d1b4829b580a 100644
--- a/Documentation/netlink/specs/mptcp_pm.yaml
+++ b/Documentation/netlink/specs/mptcp_pm.yaml
@@ -28,13 +28,13 @@ definitions:
         traffic-patterns it can take a long time until the
         MPTCP_EVENT_ESTABLISHED is sent.
         Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport,
-        dport, server-side.
+ dport, server-side, [flags]. - name: established doc: >- A MPTCP connection is established (can start new subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, server-side. + dport, server-side, [flags]. - name: closed doc: >- diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 67d015df8893..5fd5b4cf75ca 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -31,6 +31,8 @@ #define MPTCP_INFO_FLAG_FALLBACK _BITUL(0) #define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED _BITUL(1) +#define MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 _BITUL(0) + #define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0) #define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) #define MPTCP_PM_ADDR_FLAG_BACKUP (1 << 2) diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index 6ac84b2f636c..7359d34da446 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -16,10 +16,10 @@ * good time to allocate memory and send ADD_ADDR if needed. Depending on the * traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED * is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side. + * sport, dport, server-side, [flags]. * @MPTCP_EVENT_ESTABLISHED: A MPTCP connection is established (can start new * subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side. + * sport, dport, server-side, [flags]. * @MPTCP_EVENT_CLOSED: A MPTCP connection has stopped. Attribute: token. * @MPTCP_EVENT_ANNOUNCED: A new address has been announced by the peer. * Attributes: token, rem_id, family, daddr4 | daddr6 [, dport]. diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 50aaf259959a..ce7d42d3bd00 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -408,6 +408,7 @@ static int mptcp_event_created(struct sk_buff *skb, const struct sock *ssk) { int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)); + u16 flags = 0; if (err) return err; @@ -415,6 +416,12 @@ static int mptcp_event_created(struct sk_buff *skb, if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side))) return -EMSGSIZE; + if (READ_ONCE(msk->pm.remote_deny_join_id0)) + flags |= MPTCP_PM_EV_FLAG_DENY_JOIN_ID0; + + if (flags && nla_put_u16(skb, MPTCP_ATTR_FLAGS, flags)) + return -EMSGSIZE; + return mptcp_event_add_subflow(skb, ssk); } -- cgit v1.2.3 From 705d2ac7b2044f1ca05ba6033183151a04dbff4d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 16 Sep 2025 15:28:02 +0100 Subject: io_uring/zcrx: allow synchronous buffer return Returning buffers via a ring is performant and convenient, but it becomes a problem when/if the user misconfigured the ring size and it becomes full. Add a synchronous way to return buffers back to the page pool via a new register opcode. It's supposed to be a reliable slow path for refilling. 
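As a hedged illustration (not part of the patch itself), userspace could
drive the new opcode roughly as follows. The helper name zcrx_sync_refill
is made up for this sketch; the opcode, the sync-refill struct, and the
zero-resv/nr_entries constraints come from the uapi and kernel changes
below:

  /*
   * Sketch only: synchronously hand nr_entries zcrx buffers back to the
   * kernel. Assumes <linux/io_uring.h> exposes IORING_REGISTER_ZCRX_REFILL
   * and struct io_uring_zcrx_sync_refill from this patch.
   */
  #include <stdint.h>
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>
  #include <linux/io_uring.h>

  static int zcrx_sync_refill(int ring_fd, uint32_t zcrx_id,
                              const struct io_uring_zcrx_rqe *rqes,
                              uint32_t nr_entries)
  {
          struct io_uring_zcrx_sync_refill zr;

          memset(&zr, 0, sizeof(zr));          /* __resv must be zero */
          zr.zcrx_id = zcrx_id;
          zr.nr_entries = nr_entries;          /* must be 1..65536 */
          zr.rqes = (uint64_t)(uintptr_t)rqes;

          /* nr_args must be 0; on success the kernel returns the number
           * of entries it processed. */
          return syscall(__NR_io_uring_register, ring_fd,
                         IORING_REGISTER_ZCRX_REFILL, &zr, 0);
  }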
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 12 ++++++++ io_uring/register.c | 3 ++ io_uring/zcrx.c | 68 +++++++++++++++++++++++++++++++++++++++++++ io_uring/zcrx.h | 7 +++++ 4 files changed, 90 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 1ce17c535944..a0cc1cc0dd01 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -689,6 +689,9 @@ enum io_uring_register_op { /* query various aspects of io_uring, see linux/io_uring/query.h */ IORING_REGISTER_QUERY = 35, + /* return zcrx buffers back into circulation */ + IORING_REGISTER_ZCRX_REFILL = 36, + /* this goes last */ IORING_REGISTER_LAST, @@ -1070,6 +1073,15 @@ struct io_uring_zcrx_ifq_reg { __u64 __resv[3]; }; +struct io_uring_zcrx_sync_refill { + __u32 zcrx_id; + /* the number of entries to return */ + __u32 nr_entries; + /* pointer to an array of struct io_uring_zcrx_rqe */ + __u64 rqes; + __u64 __resv[2]; +}; + #ifdef __cplusplus } #endif diff --git a/io_uring/register.c b/io_uring/register.c index 96e9cac12823..43f04c47522c 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -833,6 +833,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, case IORING_REGISTER_QUERY: ret = io_query(ctx, arg, nr_args); break; + case IORING_REGISTER_ZCRX_REFILL: + ret = io_zcrx_return_bufs(ctx, arg, nr_args); + break; default: ret = -EINVAL; break; diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 81d4aa75a69f..07a114f9a542 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -927,6 +927,74 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = { .uninstall = io_pp_uninstall, }; +#define IO_ZCRX_MAX_SYS_REFILL_BUFS (1 << 16) +#define IO_ZCRX_SYS_REFILL_BATCH 32 + +static void io_return_buffers(struct io_zcrx_ifq *ifq, + struct io_uring_zcrx_rqe *rqes, unsigned nr) +{ + int i; + + for (i = 0; i < nr; i++) { + struct net_iov *niov; + netmem_ref netmem; + + if (!io_parse_rqe(&rqes[i], ifq, &niov)) + continue; + + scoped_guard(spinlock_bh, &ifq->rq_lock) { + if (!io_zcrx_put_niov_uref(niov)) + continue; + } + + netmem = net_iov_to_netmem(niov); + if (!page_pool_unref_and_test(netmem)) + continue; + io_zcrx_return_niov(niov); + } +} + +int io_zcrx_return_bufs(struct io_ring_ctx *ctx, + void __user *arg, unsigned nr_arg) +{ + struct io_uring_zcrx_rqe rqes[IO_ZCRX_SYS_REFILL_BATCH]; + struct io_uring_zcrx_rqe __user *user_rqes; + struct io_uring_zcrx_sync_refill zr; + struct io_zcrx_ifq *ifq; + unsigned nr, i; + + if (nr_arg) + return -EINVAL; + if (copy_from_user(&zr, arg, sizeof(zr))) + return -EFAULT; + if (!zr.nr_entries || zr.nr_entries > IO_ZCRX_MAX_SYS_REFILL_BUFS) + return -EINVAL; + if (!mem_is_zero(&zr.__resv, sizeof(zr.__resv))) + return -EINVAL; + + ifq = xa_load(&ctx->zcrx_ctxs, zr.zcrx_id); + if (!ifq) + return -EINVAL; + nr = zr.nr_entries; + user_rqes = u64_to_user_ptr(zr.rqes); + + for (i = 0; i < nr;) { + unsigned batch = min(nr - i, IO_ZCRX_SYS_REFILL_BATCH); + size_t size = batch * sizeof(rqes[0]); + + if (copy_from_user(rqes, user_rqes + i, size)) + return i ? 
i : -EFAULT; + io_return_buffers(ifq, rqes, batch); + + i += batch; + + if (fatal_signal_pending(current)) + return i; + cond_resched(); + } + return nr; +} + static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov, struct io_zcrx_ifq *ifq, int off, int len) { diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index a48871b5adad..33ef61503092 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -63,6 +63,8 @@ struct io_zcrx_ifq { }; #if defined(CONFIG_IO_URING_ZCRX) +int io_zcrx_return_bufs(struct io_ring_ctx *ctx, + void __user *arg, unsigned nr_arg); int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg); void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); @@ -95,6 +97,11 @@ static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ct { return NULL; } +static inline int io_zcrx_return_bufs(struct io_ring_ctx *ctx, + void __user *arg, unsigned nr_arg) +{ + return -EOPNOTSUPP; +} #endif int io_recvzc(struct io_kiocb *req, unsigned int issue_flags); -- cgit v1.2.3 From 75d5546f60b36900051d75ee623fceccbeb6750c Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 12 Sep 2025 18:58:51 +0200 Subject: HID: hidraw: tighten ioctl command parsing The handling for variable-length ioctl commands in hidraw_ioctl() is rather complex and the check for the data direction is incomplete. Simplify this code by factoring out the various ioctls grouped by dir and size, and using a switch() statement with the size masked out, to ensure the rest of the command is correctly matched. Fixes: 9188e79ec3fd ("HID: add phys and name ioctls to hidraw") Reported-by: Arnd Bergmann Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hidraw.c | 224 ++++++++++++++++++++++++-------------------- include/uapi/linux/hidraw.h | 2 + 2 files changed, 124 insertions(+), 102 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index c887f48756f4..bbd6f23bce78 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -394,27 +394,15 @@ static int hidraw_revoke(struct hidraw_list *list) return 0; } -static long hidraw_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) +static long hidraw_fixed_size_ioctl(struct file *file, struct hidraw *dev, unsigned int cmd, + void __user *arg) { - struct inode *inode = file_inode(file); - unsigned int minor = iminor(inode); - long ret = 0; - struct hidraw *dev; - struct hidraw_list *list = file->private_data; - void __user *user_arg = (void __user*) arg; - - down_read(&minors_rwsem); - dev = hidraw_table[minor]; - if (!dev || !dev->exist || hidraw_is_revoked(list)) { - ret = -ENODEV; - goto out; - } + struct hid_device *hid = dev->hid; switch (cmd) { case HIDIOCGRDESCSIZE: - if (put_user(dev->hid->rsize, (int __user *)arg)) - ret = -EFAULT; + if (put_user(hid->rsize, (int __user *)arg)) + return -EFAULT; break; case HIDIOCGRDESC: @@ -422,113 +410,145 @@ static long hidraw_ioctl(struct file *file, unsigned int cmd, __u32 len; if (get_user(len, (int __user *)arg)) - ret = -EFAULT; - else if (len > HID_MAX_DESCRIPTOR_SIZE - 1) - ret = -EINVAL; - else if (copy_to_user(user_arg + offsetof( - struct hidraw_report_descriptor, - value[0]), - dev->hid->rdesc, - min(dev->hid->rsize, len))) - ret = -EFAULT; + return -EFAULT; + + if (len > HID_MAX_DESCRIPTOR_SIZE - 1) + return -EINVAL; + + if (copy_to_user(arg + offsetof( + struct hidraw_report_descriptor, + value[0]), + hid->rdesc, + min(hid->rsize, len))) + return -EFAULT; + break; } 
case HIDIOCGRAWINFO: { struct hidraw_devinfo dinfo; - dinfo.bustype = dev->hid->bus; - dinfo.vendor = dev->hid->vendor; - dinfo.product = dev->hid->product; - if (copy_to_user(user_arg, &dinfo, sizeof(dinfo))) - ret = -EFAULT; + dinfo.bustype = hid->bus; + dinfo.vendor = hid->vendor; + dinfo.product = hid->product; + if (copy_to_user(arg, &dinfo, sizeof(dinfo))) + return -EFAULT; break; } case HIDIOCREVOKE: { - if (user_arg) - ret = -EINVAL; - else - ret = hidraw_revoke(list); - break; + struct hidraw_list *list = file->private_data; + + if (arg) + return -EINVAL; + + return hidraw_revoke(list); } default: - { - struct hid_device *hid = dev->hid; - if (_IOC_TYPE(cmd) != 'H') { - ret = -EINVAL; - break; - } + /* + * None of the above ioctls can return -EAGAIN, so + * use it as a marker that we need to check variable + * length ioctls. + */ + return -EAGAIN; + } - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCSFEATURE(0))) { - int len = _IOC_SIZE(cmd); - ret = hidraw_send_report(file, user_arg, len, HID_FEATURE_REPORT); - break; - } - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGFEATURE(0))) { - int len = _IOC_SIZE(cmd); - ret = hidraw_get_report(file, user_arg, len, HID_FEATURE_REPORT); - break; - } + return 0; +} - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCSINPUT(0))) { - int len = _IOC_SIZE(cmd); - ret = hidraw_send_report(file, user_arg, len, HID_INPUT_REPORT); - break; - } - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGINPUT(0))) { - int len = _IOC_SIZE(cmd); - ret = hidraw_get_report(file, user_arg, len, HID_INPUT_REPORT); - break; - } +static long hidraw_rw_variable_size_ioctl(struct file *file, struct hidraw *dev, unsigned int cmd, + void __user *user_arg) +{ + int len = _IOC_SIZE(cmd); + + switch (cmd & ~IOCSIZE_MASK) { + case HIDIOCSFEATURE(0): + return hidraw_send_report(file, user_arg, len, HID_FEATURE_REPORT); + case HIDIOCGFEATURE(0): + return hidraw_get_report(file, user_arg, len, HID_FEATURE_REPORT); + case HIDIOCSINPUT(0): + return hidraw_send_report(file, user_arg, len, HID_INPUT_REPORT); + case HIDIOCGINPUT(0): + return hidraw_get_report(file, user_arg, len, HID_INPUT_REPORT); + case HIDIOCSOUTPUT(0): + return hidraw_send_report(file, user_arg, len, HID_OUTPUT_REPORT); + case HIDIOCGOUTPUT(0): + return hidraw_get_report(file, user_arg, len, HID_OUTPUT_REPORT); + } - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCSOUTPUT(0))) { - int len = _IOC_SIZE(cmd); - ret = hidraw_send_report(file, user_arg, len, HID_OUTPUT_REPORT); - break; - } - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGOUTPUT(0))) { - int len = _IOC_SIZE(cmd); - ret = hidraw_get_report(file, user_arg, len, HID_OUTPUT_REPORT); - break; - } + return -EINVAL; +} - /* Begin Read-only ioctls. */ - if (_IOC_DIR(cmd) != _IOC_READ) { - ret = -EINVAL; - break; - } +static long hidraw_ro_variable_size_ioctl(struct file *file, struct hidraw *dev, unsigned int cmd, + void __user *user_arg) +{ + struct hid_device *hid = dev->hid; + int len = _IOC_SIZE(cmd); + int field_len; + + switch (cmd & ~IOCSIZE_MASK) { + case HIDIOCGRAWNAME(0): + field_len = strlen(hid->name) + 1; + if (len > field_len) + len = field_len; + return copy_to_user(user_arg, hid->name, len) ? -EFAULT : len; + case HIDIOCGRAWPHYS(0): + field_len = strlen(hid->phys) + 1; + if (len > field_len) + len = field_len; + return copy_to_user(user_arg, hid->phys, len) ? -EFAULT : len; + case HIDIOCGRAWUNIQ(0): + field_len = strlen(hid->uniq) + 1; + if (len > field_len) + len = field_len; + return copy_to_user(user_arg, hid->uniq, len) ? 
-EFAULT : len; + } - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGRAWNAME(0))) { - int len = strlen(hid->name) + 1; - if (len > _IOC_SIZE(cmd)) - len = _IOC_SIZE(cmd); - ret = copy_to_user(user_arg, hid->name, len) ? - -EFAULT : len; - break; - } + return -EINVAL; +} - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGRAWPHYS(0))) { - int len = strlen(hid->phys) + 1; - if (len > _IOC_SIZE(cmd)) - len = _IOC_SIZE(cmd); - ret = copy_to_user(user_arg, hid->phys, len) ? - -EFAULT : len; - break; - } +static long hidraw_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(file); + unsigned int minor = iminor(inode); + struct hidraw *dev; + struct hidraw_list *list = file->private_data; + void __user *user_arg = (void __user *)arg; + int ret; - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGRAWUNIQ(0))) { - int len = strlen(hid->uniq) + 1; - if (len > _IOC_SIZE(cmd)) - len = _IOC_SIZE(cmd); - ret = copy_to_user(user_arg, hid->uniq, len) ? - -EFAULT : len; - break; - } - } + down_read(&minors_rwsem); + dev = hidraw_table[minor]; + if (!dev || !dev->exist || hidraw_is_revoked(list)) { + ret = -ENODEV; + goto out; + } + + if (_IOC_TYPE(cmd) != 'H') { + ret = -EINVAL; + goto out; + } + if (_IOC_NR(cmd) > HIDIOCTL_LAST || _IOC_NR(cmd) == 0) { ret = -ENOTTY; + goto out; } + + ret = hidraw_fixed_size_ioctl(file, dev, cmd, user_arg); + if (ret != -EAGAIN) + goto out; + + switch (_IOC_DIR(cmd)) { + case (_IOC_READ | _IOC_WRITE): + ret = hidraw_rw_variable_size_ioctl(file, dev, cmd, user_arg); + break; + case _IOC_READ: + ret = hidraw_ro_variable_size_ioctl(file, dev, cmd, user_arg); + break; + default: + /* Any other IOC_DIR is wrong */ + ret = -EINVAL; + } + out: up_read(&minors_rwsem); return ret; diff --git a/include/uapi/linux/hidraw.h b/include/uapi/linux/hidraw.h index d5ee269864e0..ebd701b3c18d 100644 --- a/include/uapi/linux/hidraw.h +++ b/include/uapi/linux/hidraw.h @@ -48,6 +48,8 @@ struct hidraw_devinfo { #define HIDIOCGOUTPUT(len) _IOC(_IOC_WRITE|_IOC_READ, 'H', 0x0C, len) #define HIDIOCREVOKE _IOW('H', 0x0D, int) /* Revoke device access */ +#define HIDIOCTL_LAST _IOC_NR(HIDIOCREVOKE) + #define HIDRAW_FIRST_MINOR 0 #define HIDRAW_MAX_DEVICES 64 /* number of reports to buffer */ -- cgit v1.2.3 From 648dbccc03a000cd64c2a9d86012d98053545e64 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Tue, 16 Sep 2025 21:29:49 +0000 Subject: crypto: ccp - Add AMD Seamless Firmware Servicing (SFS) driver AMD Seamless Firmware Servicing (SFS) is a secure method to allow non-persistent updates to running firmware and settings without requiring BIOS reflash and/or system reset. SFS does not address anything that runs on the x86 processors and it can be used to update ASP firmware, modules, register settings and update firmware for other microprocessors like TMPM, etc. SFS driver support adds ioctl support to communicate the SFS commands to the ASP/PSP by using the TEE mailbox interface. The Seamless Firmware Servicing (SFS) driver is added as a PSP sub-device. 
For detailed information, please look at the SFS specifications: https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/specifications/58604.pdf Signed-off-by: Ashish Kalra Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Acked-by: Herbert Xu Link: https://lore.kernel.org/cover.1758057691.git.ashish.kalra@amd.com --- drivers/crypto/ccp/Makefile | 3 +- drivers/crypto/ccp/psp-dev.c | 20 +++ drivers/crypto/ccp/psp-dev.h | 8 +- drivers/crypto/ccp/sfs.c | 311 ++++++++++++++++++++++++++++++++++++ drivers/crypto/ccp/sfs.h | 47 ++++++ include/linux/psp-platform-access.h | 2 + include/uapi/linux/psp-sfs.h | 87 ++++++++++ 7 files changed, 476 insertions(+), 2 deletions(-) create mode 100644 drivers/crypto/ccp/sfs.c create mode 100644 drivers/crypto/ccp/sfs.h create mode 100644 include/uapi/linux/psp-sfs.h (limited to 'include/uapi/linux') diff --git a/drivers/crypto/ccp/Makefile b/drivers/crypto/ccp/Makefile index 394484929dae..a9626b30044a 100644 --- a/drivers/crypto/ccp/Makefile +++ b/drivers/crypto/ccp/Makefile @@ -13,7 +13,8 @@ ccp-$(CONFIG_CRYPTO_DEV_SP_PSP) += psp-dev.o \ tee-dev.o \ platform-access.o \ dbc.o \ - hsti.o + hsti.o \ + sfs.o obj-$(CONFIG_CRYPTO_DEV_CCP_CRYPTO) += ccp-crypto.o ccp-crypto-objs := ccp-crypto-main.o \ diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 1c5a7189631e..9e21da0e298a 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -17,6 +17,7 @@ #include "psp-dev.h" #include "sev-dev.h" #include "tee-dev.h" +#include "sfs.h" #include "platform-access.h" #include "dbc.h" #include "hsti.h" @@ -182,6 +183,17 @@ static int psp_check_tee_support(struct psp_device *psp) return 0; } +static int psp_check_sfs_support(struct psp_device *psp) +{ + /* Check if device supports SFS feature */ + if (!psp->capability.sfs) { + dev_dbg(psp->dev, "psp does not support SFS\n"); + return -ENODEV; + } + + return 0; +} + static int psp_init(struct psp_device *psp) { int ret; @@ -198,6 +210,12 @@ static int psp_init(struct psp_device *psp) return ret; } + if (!psp_check_sfs_support(psp)) { + ret = sfs_dev_init(psp); + if (ret) + return ret; + } + if (psp->vdata->platform_access) { ret = platform_access_dev_init(psp); if (ret) @@ -302,6 +320,8 @@ void psp_dev_destroy(struct sp_device *sp) tee_dev_destroy(psp); + sfs_dev_destroy(psp); + dbc_dev_destroy(psp); platform_access_dev_destroy(psp); diff --git a/drivers/crypto/ccp/psp-dev.h b/drivers/crypto/ccp/psp-dev.h index e43ce87ede76..268c83f298cb 100644 --- a/drivers/crypto/ccp/psp-dev.h +++ b/drivers/crypto/ccp/psp-dev.h @@ -32,7 +32,8 @@ union psp_cap_register { unsigned int sev :1, tee :1, dbc_thru_ext :1, - rsvd1 :4, + sfs :1, + rsvd1 :3, security_reporting :1, fused_part :1, rsvd2 :1, @@ -68,6 +69,7 @@ struct psp_device { void *tee_data; void *platform_access_data; void *dbc_data; + void *sfs_data; union psp_cap_register capability; }; @@ -118,12 +120,16 @@ struct psp_ext_request { * @PSP_SUB_CMD_DBC_SET_UID: Set UID for DBC * @PSP_SUB_CMD_DBC_GET_PARAMETER: Get parameter from DBC * @PSP_SUB_CMD_DBC_SET_PARAMETER: Set parameter for DBC + * @PSP_SUB_CMD_SFS_GET_FW_VERS: Get firmware versions for ASP and other MP + * @PSP_SUB_CMD_SFS_UPDATE: Command to load, verify and execute SFS package */ enum psp_sub_cmd { PSP_SUB_CMD_DBC_GET_NONCE = PSP_DYNAMIC_BOOST_GET_NONCE, PSP_SUB_CMD_DBC_SET_UID = PSP_DYNAMIC_BOOST_SET_UID, PSP_SUB_CMD_DBC_GET_PARAMETER = PSP_DYNAMIC_BOOST_GET_PARAMETER, PSP_SUB_CMD_DBC_SET_PARAMETER = PSP_DYNAMIC_BOOST_SET_PARAMETER, + 
PSP_SUB_CMD_SFS_GET_FW_VERS = PSP_SFS_GET_FW_VERSIONS,
+	PSP_SUB_CMD_SFS_UPDATE = PSP_SFS_UPDATE,
 };
 
 int psp_extended_mailbox_cmd(struct psp_device *psp, unsigned int timeout_msecs,
diff --git a/drivers/crypto/ccp/sfs.c b/drivers/crypto/ccp/sfs.c
new file mode 100644
index 000000000000..2f4beaafe7ec
--- /dev/null
+++ b/drivers/crypto/ccp/sfs.c
@@ -0,0 +1,311 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Secure Processor Seamless Firmware Servicing support.
+ *
+ * Copyright (C) 2025 Advanced Micro Devices, Inc.
+ *
+ * Author: Ashish Kalra
+ */
+
+#include <linux/firmware.h>
+
+#include "sfs.h"
+#include "sev-dev.h"
+
+#define SFS_DEFAULT_TIMEOUT		(10 * MSEC_PER_SEC)
+#define SFS_MAX_PAYLOAD_SIZE		(2 * 1024 * 1024)
+#define SFS_NUM_2MB_PAGES_CMDBUF	(SFS_MAX_PAYLOAD_SIZE / PMD_SIZE)
+#define SFS_NUM_PAGES_CMDBUF		(SFS_MAX_PAYLOAD_SIZE / PAGE_SIZE)
+
+static DEFINE_MUTEX(sfs_ioctl_mutex);
+
+static struct sfs_misc_dev *misc_dev;
+
+static int send_sfs_cmd(struct sfs_device *sfs_dev, int msg)
+{
+	int ret;
+
+	sfs_dev->command_buf->hdr.status = 0;
+	sfs_dev->command_buf->hdr.sub_cmd_id = msg;
+
+	ret = psp_extended_mailbox_cmd(sfs_dev->psp,
+				       SFS_DEFAULT_TIMEOUT,
+				       (struct psp_ext_request *)sfs_dev->command_buf);
+	if (ret == -EIO) {
+		dev_dbg(sfs_dev->dev,
+			"msg 0x%x failed with PSP error: 0x%x, extended status: 0x%x\n",
+			msg, sfs_dev->command_buf->hdr.status,
+			*(u32 *)sfs_dev->command_buf->buf);
+	}
+
+	return ret;
+}
+
+static int send_sfs_get_fw_versions(struct sfs_device *sfs_dev)
+{
+	/*
+	 * SFS_GET_FW_VERSIONS command needs the output buffer to be
+	 * initialized to 0xC7 in every byte.
+	 */
+	memset(sfs_dev->command_buf->sfs_buffer, 0xc7, PAGE_SIZE);
+	sfs_dev->command_buf->hdr.payload_size = 2 * PAGE_SIZE;
+
+	return send_sfs_cmd(sfs_dev, PSP_SFS_GET_FW_VERSIONS);
+}
+
+static int send_sfs_update_package(struct sfs_device *sfs_dev, const char *payload_name)
+{
+	char payload_path[PAYLOAD_NAME_SIZE + sizeof("amd/")];
+	const struct firmware *firmware;
+	unsigned long package_size;
+	int ret;
+
+	/* Sanitize userspace provided payload name */
+	if (!strnchr(payload_name, PAYLOAD_NAME_SIZE, '\0'))
+		return -EINVAL;
+
+	snprintf(payload_path, sizeof(payload_path), "amd/%s", payload_name);
+
+	ret = firmware_request_nowarn(&firmware, payload_path, sfs_dev->dev);
+	if (ret < 0) {
+		dev_warn_ratelimited(sfs_dev->dev, "firmware request failed for %s (%d)\n",
+				     payload_path, ret);
+		return -ENOENT;
+	}
+
+	/*
+	 * SFS Update Package command's input buffer contains TEE_EXT_CMD_BUFFER
+	 * followed by the Update Package and it should be 64KB aligned.
+	 */
+	package_size = ALIGN(firmware->size + PAGE_SIZE, 0x10000U);
+
+	/*
+	 * SFS command buffer is a pre-allocated 2MB buffer, fail update package
+	 * if SFS payload is larger than the pre-allocated command buffer.
+	 */
+	if (package_size > SFS_MAX_PAYLOAD_SIZE) {
+		dev_warn_ratelimited(sfs_dev->dev,
+				     "SFS payload size %ld larger than maximum supported payload size of %u\n",
+				     package_size, SFS_MAX_PAYLOAD_SIZE);
+		release_firmware(firmware);
+		return -E2BIG;
+	}
+
+	/*
+	 * Copy firmware data to a HV_Fixed memory region.
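+	 * The 2MB command buffer was placed into the HV_Fixed state and
+	 * marked non-cacheable at init time (see sfs_dev_init() below),
+	 * so the payload is staged in it rather than handed to the PSP
+	 * directly.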
+ */ + memcpy(sfs_dev->command_buf->sfs_buffer, firmware->data, firmware->size); + sfs_dev->command_buf->hdr.payload_size = package_size; + + release_firmware(firmware); + + return send_sfs_cmd(sfs_dev, PSP_SFS_UPDATE); +} + +static long sfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct sfs_user_get_fw_versions __user *sfs_get_fw_versions; + struct sfs_user_update_package __user *sfs_update_package; + struct psp_device *psp_master = psp_get_master_device(); + char payload_name[PAYLOAD_NAME_SIZE]; + struct sfs_device *sfs_dev; + int ret = 0; + + if (!psp_master || !psp_master->sfs_data) + return -ENODEV; + + sfs_dev = psp_master->sfs_data; + + guard(mutex)(&sfs_ioctl_mutex); + + switch (cmd) { + case SFSIOCFWVERS: + dev_dbg(sfs_dev->dev, "in SFSIOCFWVERS\n"); + + sfs_get_fw_versions = (struct sfs_user_get_fw_versions __user *)arg; + + ret = send_sfs_get_fw_versions(sfs_dev); + if (ret && ret != -EIO) + return ret; + + /* + * Return SFS status and extended status back to userspace + * if PSP status indicated success or command error. + */ + if (copy_to_user(&sfs_get_fw_versions->blob, sfs_dev->command_buf->sfs_buffer, + PAGE_SIZE)) + return -EFAULT; + if (copy_to_user(&sfs_get_fw_versions->sfs_status, + &sfs_dev->command_buf->hdr.status, + sizeof(sfs_get_fw_versions->sfs_status))) + return -EFAULT; + if (copy_to_user(&sfs_get_fw_versions->sfs_extended_status, + &sfs_dev->command_buf->buf, + sizeof(sfs_get_fw_versions->sfs_extended_status))) + return -EFAULT; + break; + case SFSIOCUPDATEPKG: + dev_dbg(sfs_dev->dev, "in SFSIOCUPDATEPKG\n"); + + sfs_update_package = (struct sfs_user_update_package __user *)arg; + + if (copy_from_user(payload_name, sfs_update_package->payload_name, + PAYLOAD_NAME_SIZE)) + return -EFAULT; + + ret = send_sfs_update_package(sfs_dev, payload_name); + if (ret && ret != -EIO) + return ret; + + /* + * Return SFS status and extended status back to userspace + * if PSP status indicated success or command error. + */ + if (copy_to_user(&sfs_update_package->sfs_status, + &sfs_dev->command_buf->hdr.status, + sizeof(sfs_update_package->sfs_status))) + return -EFAULT; + if (copy_to_user(&sfs_update_package->sfs_extended_status, + &sfs_dev->command_buf->buf, + sizeof(sfs_update_package->sfs_extended_status))) + return -EFAULT; + break; + default: + ret = -EINVAL; + } + + return ret; +} + +static const struct file_operations sfs_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = sfs_ioctl, +}; + +static void sfs_exit(struct kref *ref) +{ + misc_deregister(&misc_dev->misc); + kfree(misc_dev); + misc_dev = NULL; +} + +void sfs_dev_destroy(struct psp_device *psp) +{ + struct sfs_device *sfs_dev = psp->sfs_data; + + if (!sfs_dev) + return; + + /* + * Change SFS command buffer back to the default "Write-Back" type. + */ + set_memory_wb((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + + snp_free_hv_fixed_pages(sfs_dev->page); + + if (sfs_dev->misc) + kref_put(&misc_dev->refcount, sfs_exit); + + psp->sfs_data = NULL; +} + +/* Based on sev_misc_init() */ +static int sfs_misc_init(struct sfs_device *sfs) +{ + struct device *dev = sfs->dev; + int ret; + + /* + * SFS feature support can be detected on multiple devices but the SFS + * FW commands must be issued on the master. During probe, we do not + * know the master hence we create /dev/sfs on the first device probe. 
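+	 * The shared node is then refcounted: the first probe kref_init()s
+	 * misc_dev, later probes take kref_get(), and the final kref_put()
+	 * from sfs_dev_destroy() runs sfs_exit() to deregister it.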
+ */ + if (!misc_dev) { + struct miscdevice *misc; + + misc_dev = kzalloc(sizeof(*misc_dev), GFP_KERNEL); + if (!misc_dev) + return -ENOMEM; + + misc = &misc_dev->misc; + misc->minor = MISC_DYNAMIC_MINOR; + misc->name = "sfs"; + misc->fops = &sfs_fops; + misc->mode = 0600; + + ret = misc_register(misc); + if (ret) + return ret; + + kref_init(&misc_dev->refcount); + } else { + kref_get(&misc_dev->refcount); + } + + sfs->misc = misc_dev; + dev_dbg(dev, "registered SFS device\n"); + + return 0; +} + +int sfs_dev_init(struct psp_device *psp) +{ + struct device *dev = psp->dev; + struct sfs_device *sfs_dev; + struct page *page; + int ret = -ENOMEM; + + sfs_dev = devm_kzalloc(dev, sizeof(*sfs_dev), GFP_KERNEL); + if (!sfs_dev) + return -ENOMEM; + + /* + * Pre-allocate 2MB command buffer for all SFS commands using + * SNP HV_Fixed page allocator which also transitions the + * SFS command buffer to HV_Fixed page state if SNP is enabled. + */ + page = snp_alloc_hv_fixed_pages(SFS_NUM_2MB_PAGES_CMDBUF); + if (!page) { + dev_dbg(dev, "Command Buffer HV-Fixed page allocation failed\n"); + goto cleanup_dev; + } + sfs_dev->page = page; + sfs_dev->command_buf = page_address(page); + + dev_dbg(dev, "Command buffer 0x%px to be marked as HV_Fixed\n", sfs_dev->command_buf); + + /* + * SFS command buffer must be mapped as non-cacheable. + */ + ret = set_memory_uc((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + if (ret) { + dev_dbg(dev, "Set memory uc failed\n"); + goto cleanup_cmd_buf; + } + + dev_dbg(dev, "Command buffer 0x%px marked uncacheable\n", sfs_dev->command_buf); + + psp->sfs_data = sfs_dev; + sfs_dev->dev = dev; + sfs_dev->psp = psp; + + ret = sfs_misc_init(sfs_dev); + if (ret) + goto cleanup_mem_attr; + + dev_notice(sfs_dev->dev, "SFS support is available\n"); + + return 0; + +cleanup_mem_attr: + set_memory_wb((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + +cleanup_cmd_buf: + snp_free_hv_fixed_pages(page); + +cleanup_dev: + psp->sfs_data = NULL; + devm_kfree(dev, sfs_dev); + + return ret; +} diff --git a/drivers/crypto/ccp/sfs.h b/drivers/crypto/ccp/sfs.h new file mode 100644 index 000000000000..97704c210efd --- /dev/null +++ b/drivers/crypto/ccp/sfs.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * AMD Platform Security Processor (PSP) Seamless Firmware (SFS) Support. + * + * Copyright (C) 2025 Advanced Micro Devices, Inc. 
+ *
+ * Author: Ashish Kalra
+ */
+
+#ifndef __SFS_H__
+#define __SFS_H__
+
+#include <uapi/linux/psp-sfs.h>
+
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/psp-platform-access.h>
+#include <linux/psp.h>
+#include <linux/set_memory.h>
+
+#include "psp-dev.h"
+
+struct sfs_misc_dev {
+	struct kref refcount;
+	struct miscdevice misc;
+};
+
+struct sfs_command {
+	struct psp_ext_req_buffer_hdr hdr;
+	u8 buf[PAGE_SIZE - sizeof(struct psp_ext_req_buffer_hdr)];
+	u8 sfs_buffer[];
+} __packed;
+
+struct sfs_device {
+	struct device *dev;
+	struct psp_device *psp;
+
+	struct page *page;
+	struct sfs_command *command_buf;
+
+	struct sfs_misc_dev *misc;
+};
+
+void sfs_dev_destroy(struct psp_device *psp);
+int sfs_dev_init(struct psp_device *psp);
+
+#endif /* __SFS_H__ */
diff --git a/include/linux/psp-platform-access.h b/include/linux/psp-platform-access.h
index 1504fb012c05..540abf7de048 100644
--- a/include/linux/psp-platform-access.h
+++ b/include/linux/psp-platform-access.h
@@ -7,6 +7,8 @@
 enum psp_platform_access_msg {
 	PSP_CMD_NONE = 0x0,
+	PSP_SFS_GET_FW_VERSIONS,
+	PSP_SFS_UPDATE,
 	PSP_CMD_HSTI_QUERY = 0x14,
 	PSP_I2C_REQ_BUS_CMD = 0x64,
 	PSP_DYNAMIC_BOOST_GET_NONCE,
diff --git a/include/uapi/linux/psp-sfs.h b/include/uapi/linux/psp-sfs.h
new file mode 100644
index 000000000000..94e51670383c
--- /dev/null
+++ b/include/uapi/linux/psp-sfs.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */
+/*
+ * Userspace interface for AMD Seamless Firmware Servicing (SFS)
+ *
+ * Copyright (C) 2025 Advanced Micro Devices, Inc.
+ *
+ * Author: Ashish Kalra
+ */
+
+#ifndef __PSP_SFS_USER_H__
+#define __PSP_SFS_USER_H__
+
+#include <linux/types.h>
+
+/**
+ * SFS: AMD Seamless Firmware Servicing (SFS) interface
+ */
+
+#define PAYLOAD_NAME_SIZE	64
+#define TEE_EXT_CMD_BUFFER_SIZE	4096
+
+/**
+ * struct sfs_user_get_fw_versions - get current level of base firmware (output).
+ * @blob:                current level of base firmware for ASP and patch levels (input/output).
+ * @sfs_status:          32-bit SFS status value (output).
+ * @sfs_extended_status: 32-bit SFS extended status value (output).
+ */
+struct sfs_user_get_fw_versions {
+	__u8	blob[TEE_EXT_CMD_BUFFER_SIZE];
+	__u32	sfs_status;
+	__u32	sfs_extended_status;
+} __packed;
+
+/**
+ * struct sfs_user_update_package - update SFS package (input).
+ * @payload_name:        name of SFS package to load, verify and execute (input).
+ * @sfs_status:          32-bit SFS status value (output).
+ * @sfs_extended_status: 32-bit SFS extended status value (output).
+ */
+struct sfs_user_update_package {
+	char	payload_name[PAYLOAD_NAME_SIZE];
+	__u32	sfs_status;
+	__u32	sfs_extended_status;
+} __packed;
+
+/**
+ * Seamless Firmware Servicing (SFS) IOC
+ *
+ * possible return codes for all SFS IOCTLs:
+ *  0:          success
+ *  -EINVAL:    invalid input
+ *  -E2BIG:     excess data passed
+ *  -EFAULT:    failed to copy to/from userspace
+ *  -EBUSY:     mailbox in recovery or in use
+ *  -ENODEV:    driver not bound with PSP device
+ *  -EACCES:    request isn't authorized
+ *  -EINVAL:    invalid parameter
+ *  -ETIMEDOUT: request timed out
+ *  -EAGAIN:    invalid request for state machine
+ *  -ENOENT:    not implemented
+ *  -ENFILE:    overflow
+ *  -EPERM:     invalid signature
+ *  -EIO:       PSP I/O error
+ */
+#define SFS_IOC_TYPE	'S'
+
+/**
+ * SFSIOCFWVERS - returns blob containing FW versions
+ *                ASP provides the current level of Base Firmware for the ASP
+ *                and the other microprocessors as well as current patch
+ *                level(s).
+ */
+#define SFSIOCFWVERS	_IOWR(SFS_IOC_TYPE, 0x1, struct sfs_user_get_fw_versions)
+
+/**
+ * SFSIOCUPDATEPKG - updates package/payload
+ *                   ASP loads, verifies and executes the SFS package.
+ *                   By default, the SFS package/payload is loaded from
+ *                   /lib/firmware/amd, but an alternative firmware loading
+ *                   path can be specified using the kernel parameter
+ *                   firmware_class.path, or the firmware loading path
+ *                   can be customized using the sysfs file:
+ *                   /sys/module/firmware_class/parameters/path.
+ */
+#define SFSIOCUPDATEPKG	_IOWR(SFS_IOC_TYPE, 0x2, struct sfs_user_update_package)
+
+#endif /* __PSP_SFS_USER_H__ */
--
cgit v1.2.3

From b5e74132dfbe60329b3ff0e5c485039f2e31605c Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen
Date: Tue, 16 Sep 2025 10:24:30 +0200
Subject: tcp: accecn: AccECN option
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Accurate ECN allows echoing back the sum of bytes for each IP ECN
field value in the received packets using the AccECN option. This change
implements AccECN option tx & rx side processing without the option send
control related features that are added by a later change.

Based on specification:
  https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt
(Some features of the spec will be added in later changes rather than
in this one.)

A full-length AccECN option is always attempted, but if it does not fit,
the minimum length is selected based on the counters that have changed
since the last update. The AccECN option (with 24-bit fields) often ends
in odd sizes, so the option write code tries to take advantage of some
NOPs used to pad the other TCP options.

The delivered_ecn_bytes pairs with received_ecn_bytes similar to how
delivered_ce pairs with received_ce. In contrast to the ACE field,
however, the option is not always available to update
delivered_ecn_bytes. For an ACK without the AccECN option, the delivered
bytes calculated based on the cumulative ACK+SACK information are
assigned to one of the counters using an estimation heuristic to select
the most likely ECN byte counter. Any estimation error is corrected when
the next AccECN option arrives. It may occur that the heuristic gets too
confused when there are enough different byte counter deltas between
ACKs with the AccECN option, in which case the heuristic just gives up
on updating the counters for a while.

The tcp_ecn_option sysctl can be used to select the option sending mode
for AccECN: TCP_ACCECN_OPTION_DISABLED, TCP_ACCECN_OPTION_MINIMUM, and
TCP_ACCECN_OPTION_FULL.

This patch increases the size of the tcp_info struct, as there are no
existing holes for the new u32 variables. Below are the pahole outcomes
before and after this patch:

[BEFORE THIS PATCH]
struct tcp_info {
	[...]
	__u32 tcpi_total_rto_time;	/* 244 4 */

	/* size: 248, cachelines: 4, members: 61 */
}

[AFTER THIS PATCH]
struct tcp_info {
	[...]
	__u32 tcpi_total_rto_time;	/* 244 4 */
	__u32 tcpi_received_ce;		/* 248 4 */
	__u32 tcpi_delivered_e1_bytes;	/* 252 4 */
	__u32 tcpi_delivered_e0_bytes;	/* 256 4 */
	__u32 tcpi_delivered_ce_bytes;	/* 260 4 */
	__u32 tcpi_received_e1_bytes;	/* 264 4 */
	__u32 tcpi_received_e0_bytes;	/* 268 4 */
	__u32 tcpi_received_ce_bytes;	/* 272 4 */

	/* size: 280, cachelines: 5, members: 68 */
}

This patch uses the existing 1-byte holes in the tcp_sock_write_txrx
group for the new u8 members, but adds a 4-byte hole in the
tcp_sock_write_rx group after the new u32 delivered_ecn_bytes[3] member.
Therefore, the group size of tcp_sock_write_rx is increased from 96 to
112. Below are the pahole outcomes before and after this patch:

[BEFORE THIS PATCH]
struct tcp_sock {
	[...]
	u8 received_ce_pending:4;	/* 2522: 0 1 */
	u8 unused2:4;			/* 2522: 4 1 */
	/* XXX 1 byte hole, try to pack */
	[...]
	u32 rcv_rtt_last_tsecr;		/* 2668 4 */
	[...]
	__cacheline_group_end__tcp_sock_write_rx[0]; /* 2728 0 */
	[...]
	/* size: 3200, cachelines: 50, members: 167 */
}

[AFTER THIS PATCH]
struct tcp_sock {
	[...]
	u8 received_ce_pending:4;	/* 2522: 0 1 */
	u8 unused2:4;			/* 2522: 4 1 */
	u8 accecn_minlen:2;		/* 2523: 0 1 */
	u8 est_ecnfield:2;		/* 2523: 2 1 */
	u8 unused3:4;			/* 2523: 4 1 */
	[...]
	u32 rcv_rtt_last_tsecr;		/* 2668 4 */
	u32 delivered_ecn_bytes[3];	/* 2672 12 */
	/* XXX 4 bytes hole, try to pack */
	[...]
	__cacheline_group_end__tcp_sock_write_rx[0]; /* 2744 0 */
	[...]
	/* size: 3200, cachelines: 50, members: 171 */
}

Signed-off-by: Ilpo Järvinen
Signed-off-by: Neal Cardwell
Co-developed-by: Chia-Yu Chang
Signed-off-by: Chia-Yu Chang
Reviewed-by: Eric Dumazet
Link: https://patch.msgid.link/20250916082434.100722-7-chia-yu.chang@nokia-bell-labs.com
Signed-off-by: Paolo Abeni
---
 Documentation/networking/ip-sysctl.rst             |  19 +++
 .../networking/net_cachelines/tcp_sock.rst         |   3 +
 include/linux/tcp.h                                |   9 +-
 include/net/netns/ipv4.h                           |   1 +
 include/net/tcp.h                                  |  13 ++
 include/net/tcp_ecn.h                              |  89 ++++++++++-
 include/uapi/linux/tcp.h                           |   7 +
 net/ipv4/sysctl_net_ipv4.c                         |   9 ++
 net/ipv4/tcp.c                                     |  15 +-
 net/ipv4/tcp_input.c                               |  94 +++++++++++-
 net/ipv4/tcp_ipv4.c                                |   1 +
 net/ipv4/tcp_output.c                              | 165 ++++++++++++++++++++-
 12 files changed, 412 insertions(+), 13 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 3d8eb54b84f9..1c206501b973 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -468,6 +468,25 @@
 	Default: 2
 
+tcp_ecn_option - INTEGER
+	Control Accurate ECN (AccECN) option sending when AccECN has been
+	successfully negotiated during handshake. Send logic inhibits
+	sending AccECN options regardless of this setting when no AccECN
+	option has been seen for the reverse direction.
+
+	Possible values are:
+
+	= ============================================================
+	0 Never send AccECN option. This also disables sending AccECN
+	  option in SYN/ACK during handshake.
+	1 Send AccECN option sparingly according to the minimum option
+	  rules outlined in draft-ietf-tcpm-accurate-ecn.
+	2 Send AccECN option on every packet whenever it fits into TCP
+	  option space.
+	= ============================================================
+
+	Default: 2
+
 tcp_ecn_fallback - BOOLEAN
 	If the kernel detects that ECN connection misbehaves, enable fall
 	back to non-ECN.
Currently, this knob implements the fallback diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 5a2b0af57364..b941151f8c0a 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -104,8 +104,11 @@ u32 delivered_ce read_mostly read_w u32 received_ce read_mostly read_write u32[3] received_ecn_bytes read_mostly read_write u8:4 received_ce_pending read_mostly read_write +u32[3] delivered_ecn_bytes read_write u8:2 syn_ect_snt write_mostly read_write u8:2 syn_ect_rcv read_mostly read_write +u8:2 accecn_minlen write_mostly read_write +u8:2 est_ecnfield read_write u8:4 accecn_fail_mode u32 lost read_mostly tcp_ack u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 012d01347b3c..73557656cb2d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -122,8 +122,9 @@ struct tcp_options_received { smc_ok : 1, /* SMC seen on SYN packet */ snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ - u8 saw_unknown:1, /* Received unknown option */ - unused:7; + u8 accecn:6, /* AccECN index in header, 0=no options */ + saw_unknown:1, /* Received unknown option */ + unused:1; u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ @@ -293,6 +294,9 @@ struct tcp_sock { rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */ unused2:4; + u8 accecn_minlen:2,/* Minimum length of AccECN option sent */ + est_ecnfield:2,/* ECN field for AccECN delivered estimates */ + unused3:4; __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ @@ -337,6 +341,7 @@ struct tcp_sock { u32 rate_delivered; /* saved rate sample: packets delivered */ u32 rate_interval_us; /* saved rate sample: time elapsed */ u32 rcv_rtt_last_tsecr; + u32 delivered_ecn_bytes[3]; u64 first_tx_mstamp; /* start of window send phase */ u64 delivered_mstamp; /* time we reached "delivered" */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 54a7d187f62a..acbb7dd497e1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -148,6 +148,7 @@ struct netns_ipv4 { struct local_ports ip_local_ports; u8 sysctl_tcp_ecn; + u8 sysctl_tcp_ecn_option; u8 sysctl_tcp_ecn_fallback; u8 sysctl_ip_default_ttl; diff --git a/include/net/tcp.h b/include/net/tcp.h index da8c6640ead3..6be29129465e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -213,6 +213,8 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCPOPT_AO 29 /* Authentication Option (RFC5925) */ #define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */ #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ +#define TCPOPT_ACCECN0 172 /* 0xAC: Accurate ECN Order 0 */ +#define TCPOPT_ACCECN1 174 /* 0xAE: Accurate ECN Order 1 */ #define TCPOPT_EXP 254 /* Experimental */ /* Magic number to be after the option value for sharing TCP * experimental options. 
See draft-ietf-tcpm-experimental-options-00.txt
@@ -230,6 +232,7 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
 #define TCPOLEN_TIMESTAMP	10
 #define TCPOLEN_MD5SIG		18
 #define TCPOLEN_FASTOPEN_BASE	2
+#define TCPOLEN_ACCECN_BASE	2
 #define TCPOLEN_EXP_FASTOPEN_BASE	4
 #define TCPOLEN_EXP_SMC_BASE	6
@@ -243,6 +246,13 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
 #define TCPOLEN_MD5SIG_ALIGNED		20
 #define TCPOLEN_MSS_ALIGNED		4
 #define TCPOLEN_EXP_SMC_BASE_ALIGNED	8
+#define TCPOLEN_ACCECN_PERFIELD		3
+
+/* Maximum number of byte counters in AccECN option + size */
+#define TCP_ACCECN_NUMFIELDS		3
+#define TCP_ACCECN_MAXSIZE		(TCPOLEN_ACCECN_BASE + \
+					 TCPOLEN_ACCECN_PERFIELD * \
+					 TCP_ACCECN_NUMFIELDS)
 
 /* Flags in tp->nonagle */
 #define TCP_NAGLE_OFF		1	/* Nagle's algo is disabled */
@@ -981,6 +991,9 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq)
  * See draft-ietf-tcpm-accurate-ecn for the latest values.
  */
 #define TCP_ACCECN_CEP_INIT_OFFSET 5
+#define TCP_ACCECN_E1B_INIT_OFFSET 1
+#define TCP_ACCECN_E0B_INIT_OFFSET 1
+#define TCP_ACCECN_CEB_INIT_OFFSET 0
 
 /* State flags for sacked in struct tcp_skb_cb */
 enum tcp_skb_cb_sacked_flags {
diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h
index 1a41a459aa07..08c7f4757e4e 100644
--- a/include/net/tcp_ecn.h
+++ b/include/net/tcp_ecn.h
@@ -24,6 +24,13 @@ enum tcp_ecn_mode {
 	TCP_ECN_IN_ACCECN_OUT_NOECN = 5,
 };
 
+/* AccECN option sending when AccECN has been successfully negotiated */
+enum tcp_accecn_option {
+	TCP_ACCECN_OPTION_DISABLED = 0,
+	TCP_ACCECN_OPTION_MINIMUM = 1,
+	TCP_ACCECN_OPTION_FULL = 2,
+};
+
 static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp)
 {
 	/* Do not set CWR if in AccECN mode! */
@@ -169,6 +176,79 @@ static inline void tcp_accecn_third_ack(struct sock *sk,
 	}
 }
 
+/* Maps IP ECN field ECT/CE code point to AccECN option field number, given
+ * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0).
+ */
+static inline u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield)
+{
+	switch (ecnfield & INET_ECN_MASK) {
+	case INET_ECN_NOT_ECT:
+		return 0;	/* AccECN does not send counts of NOT_ECT */
+	case INET_ECN_ECT_1:
+		return 1;
+	case INET_ECN_CE:
+		return 2;
+	case INET_ECN_ECT_0:
+		return 3;
+	}
+	return 0;
+}
+
+/* Maps IP ECN field ECT/CE code point to AccECN option field value offset.
+ * Some fields do not start from zero, to detect zeroing by middleboxes.
+ */
+static inline u32 tcp_accecn_field_init_offset(u8 ecnfield)
+{
+	switch (ecnfield & INET_ECN_MASK) {
+	case INET_ECN_NOT_ECT:
+		return 0;	/* AccECN does not send counts of NOT_ECT */
+	case INET_ECN_ECT_1:
+		return TCP_ACCECN_E1B_INIT_OFFSET;
+	case INET_ECN_CE:
+		return TCP_ACCECN_CEB_INIT_OFFSET;
+	case INET_ECN_ECT_0:
+		return TCP_ACCECN_E0B_INIT_OFFSET;
+	}
+	return 0;
+}
+
+/* Maps AccECN option field #nr to IP ECN field ECT/CE bits */
+static inline unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int option,
+							   bool order)
+{
+	/* Based on Table 5 of the AccECN spec to map (option, order) to
+	 * the corresponding ECN counters (ECT-1, ECT-0, or CE).
+	 */
+	static const u8 optfield_lookup[2][3] = {
+		/* order = 0: 1st field ECT-0, 2nd field CE, 3rd field ECT-1 */
+		{ INET_ECN_ECT_0, INET_ECN_CE, INET_ECN_ECT_1 },
+		/* order = 1: 1st field ECT-1, 2nd field CE, 3rd field ECT-0 */
+		{ INET_ECN_ECT_1, INET_ECN_CE, INET_ECN_ECT_0 }
+	};
+
+	return optfield_lookup[order][option % 3];
+}
+
+/* Handles AccECN option ECT and CE 24-bit byte counters update into
+ * the u32 value in tcp_sock.
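+ * For example, if *cnt holds 0xFFFFFE and the option now carries 3,
+ * the 24-bit delta (3 - 0xFFFFFE) & 0xFFFFFF is 5, so the counter
+ * advances to 0x1000003 instead of stepping backwards.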
As we're processing TCP options, it is + * safe to access from - 1. + */ +static inline s32 tcp_update_ecn_bytes(u32 *cnt, const char *from, + u32 init_offset) +{ + u32 truncated = (get_unaligned_be32(from - 1) - init_offset) & + 0xFFFFFFU; + u32 delta = (truncated - *cnt) & 0xFFFFFFU; + + /* If delta has the highest bit set (24th bit) indicating + * negative, sign extend to correct an estimation using + * sign_extend32(delta, 24 - 1) + */ + delta = sign_extend32(delta, 23); + *cnt += delta; + return (s32)delta; +} + /* Updates Accurate ECN received counters from the received IP ECN field */ static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb, u32 len) @@ -192,8 +272,12 @@ static inline void tcp_ecn_received_counters(struct sock *sk, tp->received_ce_pending = min(tp->received_ce_pending + pcount, 0xfU); - if (len > 0) + if (len > 0) { + u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield); tp->received_ecn_bytes[ecnfield - 1] += len; + tp->accecn_minlen = max_t(u8, tp->accecn_minlen, + minlen); + } } } @@ -263,6 +347,9 @@ static inline void tcp_accecn_init_counters(struct tcp_sock *tp) tp->received_ce = 0; tp->received_ce_pending = 0; __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); + __tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes); + tp->accecn_minlen = 0; + tp->est_ecnfield = 0; } /* Used for make_synack to form the ACE flags */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index bdac8c42fa82..53e0e85b52be 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -316,6 +316,13 @@ struct tcp_info { * in milliseconds, including any * unfinished recovery. */ + __u32 tcpi_received_ce; /* # of CE marks received */ + __u32 tcpi_delivered_e1_bytes; /* Accurate ECN byte counters */ + __u32 tcpi_delivered_e0_bytes; + __u32 tcpi_delivered_ce_bytes; + __u32 tcpi_received_e1_bytes; + __u32 tcpi_received_e0_bytes; + __u32 tcpi_received_ce_bytes; }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 268f8b86e8a7..4a697acb4e85 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -731,6 +731,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &tcp_ecn_mode_max, }, + { + .procname = "tcp_ecn_option", + .data = &init_net.ipv4.sysctl_tcp_ecn_option, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, { .procname = "tcp_ecn_fallback", .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a45a4184b603..8c4a4b8666fc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -270,6 +270,7 @@ #include #include +#include #include #include #include @@ -4155,6 +4156,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); + const u8 ect1_idx = INET_ECN_ECT_1 - 1; + const u8 ect0_idx = INET_ECN_ECT_0 - 1; + const u8 ce_idx = INET_ECN_CE - 1; unsigned long rate; u32 now; u64 rate64; @@ -4281,6 +4285,14 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp->rto_stamp) info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; + info->tcpi_received_ce = tp->received_ce; + info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx]; + info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx]; + 
info->tcpi_delivered_ce_bytes = tp->delivered_ecn_bytes[ce_idx]; + info->tcpi_received_e1_bytes = tp->received_ecn_bytes[ect1_idx]; + info->tcpi_received_e0_bytes = tp->received_ecn_bytes[ect0_idx]; + info->tcpi_received_ce_bytes = tp->received_ecn_bytes[ce_idx]; + unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -5162,12 +5174,13 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 96); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 112); } void __init tcp_init(void) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8b48a3c00945..e898a76c485e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include #include @@ -384,6 +385,73 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) } } +/* Returns true if the byte counters can be used */ +static bool tcp_accecn_process_option(struct tcp_sock *tp, + const struct sk_buff *skb, + u32 delivered_bytes, int flag) +{ + u8 estimate_ecnfield = tp->est_ecnfield; + bool ambiguous_ecn_bytes_incr = false; + bool first_changed = false; + unsigned int optlen; + bool order1, res; + unsigned int i; + u8 *ptr; + + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { + if (estimate_ecnfield) { + u8 ecnfield = estimate_ecnfield - 1; + + tp->delivered_ecn_bytes[ecnfield] += delivered_bytes; + return true; + } + return false; + } + + ptr = skb_transport_header(skb) + tp->rx_opt.accecn; + optlen = ptr[1] - 2; + if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1)) + return false; + order1 = (ptr[0] == TCPOPT_ACCECN1); + ptr += 2; + + res = !!estimate_ecnfield; + for (i = 0; i < 3; i++) { + u32 init_offset; + u8 ecnfield; + s32 delta; + u32 *cnt; + + if (optlen < TCPOLEN_ACCECN_PERFIELD) + break; + + ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1); + init_offset = tcp_accecn_field_init_offset(ecnfield); + cnt = &tp->delivered_ecn_bytes[ecnfield - 1]; + delta = tcp_update_ecn_bytes(cnt, ptr, init_offset); + if (delta && delta < 0) { + res = false; + ambiguous_ecn_bytes_incr = true; + } + if (delta && ecnfield != estimate_ecnfield) { + if (!first_changed) { + tp->est_ecnfield = ecnfield; + first_changed = true; + } else { + res = false; + ambiguous_ecn_bytes_incr = true; + } + } + + optlen -= TCPOLEN_ACCECN_PERFIELD; + ptr += TCPOLEN_ACCECN_PERFIELD; + } + if (ambiguous_ecn_bytes_incr) + tp->est_ecnfield = 0; + + return res; +} + static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) { tp->delivered_ce += ecn_count; @@ -400,7 +468,8 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, /* Returns the ECN CE delta */ static u32 __tcp_accecn_process(struct sock *sk, const 
struct sk_buff *skb, - u32 delivered_pkts, int flag) + u32 delivered_pkts, u32 delivered_bytes, + int flag) { const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); @@ -411,6 +480,8 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) return 0; + tcp_accecn_process_option(tp, skb, delivered_bytes, flag); + if (!(flag & FLAG_SLOWPATH)) { /* AccECN counter might overflow on large ACKs */ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) @@ -436,12 +507,14 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, } static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, - u32 delivered_pkts, int *flag) + u32 delivered_pkts, u32 delivered_bytes, + int *flag) { struct tcp_sock *tp = tcp_sk(sk); u32 delta; - delta = __tcp_accecn_process(sk, skb, delivered_pkts, *flag); + delta = __tcp_accecn_process(sk, skb, delivered_pkts, + delivered_bytes, *flag); if (delta > 0) { tcp_count_delivered_ce(tp, delta); *flag |= FLAG_ECE; @@ -3973,6 +4046,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tcp_ecn_mode_accecn(tp)) ecn_count = tcp_accecn_process(sk, skb, tp->delivered - delivered, + sack_state.delivered_bytes, &flag); tcp_in_ack_event(sk, flag); @@ -4012,6 +4086,7 @@ no_queue: if (tcp_ecn_mode_accecn(tp)) ecn_count = tcp_accecn_process(sk, skb, tp->delivered - delivered, + sack_state.delivered_bytes, &flag); tcp_in_ack_event(sk, flag); /* If data was DSACKed, see if we can undo a cwnd reduction. */ @@ -4139,6 +4214,7 @@ void tcp_parse_options(const struct net *net, ptr = (const unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; + opt_rx->accecn = 0; opt_rx->saw_unknown = 0; while (length > 0) { @@ -4230,6 +4306,12 @@ void tcp_parse_options(const struct net *net, ptr, th->syn, foc, false); break; + case TCPOPT_ACCECN0: + case TCPOPT_ACCECN1: + /* Save offset of AccECN option in TCP header */ + opt_rx->accecn = (ptr - 2) - (__u8 *)th; + break; + case TCPOPT_EXP: /* Fast Open option shares code 254 using a * 16 bits magic number. 
@@ -4290,11 +4372,14 @@ static bool tcp_fast_parse_options(const struct net *net, */ if (th->doff == (sizeof(*th) / 4)) { tp->rx_opt.saw_tstamp = 0; + tp->rx_opt.accecn = 0; return false; } else if (tp->rx_opt.tstamp_ok && th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { - if (tcp_parse_aligned_timestamp(tp, th)) + if (tcp_parse_aligned_timestamp(tp, th)) { + tp->rx_opt.accecn = 0; return true; + } } tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL); @@ -6119,6 +6204,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) */ tp->rx_opt.saw_tstamp = 0; + tp->rx_opt.accecn = 0; /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_prediction is to be made diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6162f8dbe9d2..aa8dbfe20924 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3561,6 +3561,7 @@ fallback: static int __net_init tcp_sk_init(struct net *net) { net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; + net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5be2b3eb73d3..34e5c83bbace 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -385,6 +385,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_SMC BIT(9) #define OPTION_MPTCP BIT(10) #define OPTION_AO BIT(11) +#define OPTION_ACCECN BIT(12) static void smc_options_write(__be32 *ptr, u16 *options) { @@ -406,6 +407,8 @@ struct tcp_out_options { u16 mss; /* 0 to disable */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ + u8 num_accecn_fields:7, /* number of AccECN fields needed */ + use_synack_ecn_bytes:1; /* Use synack_ecn_bytes or not */ u8 hash_size; /* bytes in hash_location */ u8 bpf_opt_len; /* length of BPF hdr option */ __u8 *hash_location; /* temporary pointer, overloaded */ @@ -603,6 +606,11 @@ static __be32 *process_tcp_ao_options(struct tcp_sock *tp, return ptr; } +/* Initial values for AccECN option, ordered is based on ECN field bits + * similar to received_ecn_bytes. Used for SYN/ACK AccECN option. + */ +static const u32 synack_ecn_bytes[3] = { 0, 0, 0 }; + /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of @@ -621,6 +629,8 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, struct tcp_out_options *opts, struct tcp_key *key) { + u8 leftover_highbyte = TCPOPT_NOP; /* replace 1st NOP if avail */ + u8 leftover_lowbyte = TCPOPT_NOP; /* replace 2nd NOP in succession */ __be32 *ptr = (__be32 *)(th + 1); u16 options = opts->options; /* mungable copy */ @@ -656,15 +666,71 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, *ptr++ = htonl(opts->tsecr); } + if (OPTION_ACCECN & options) { + const u32 *ecn_bytes = opts->use_synack_ecn_bytes ? 
+ synack_ecn_bytes : + tp->received_ecn_bytes; + const u8 ect0_idx = INET_ECN_ECT_0 - 1; + const u8 ect1_idx = INET_ECN_ECT_1 - 1; + const u8 ce_idx = INET_ECN_CE - 1; + u32 e0b; + u32 e1b; + u32 ceb; + u8 len; + + e0b = ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET; + e1b = ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET; + ceb = ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET; + len = TCPOLEN_ACCECN_BASE + + opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD; + + if (opts->num_accecn_fields == 2) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + *ptr++ = htonl(((e1b & 0xff) << 24) | + (ceb & 0xffffff)); + } else if (opts->num_accecn_fields == 1) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + leftover_highbyte = e1b & 0xff; + leftover_lowbyte = TCPOPT_NOP; + } else if (opts->num_accecn_fields == 0) { + leftover_highbyte = TCPOPT_ACCECN1; + leftover_lowbyte = len; + } else if (opts->num_accecn_fields == 3) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + *ptr++ = htonl(((e1b & 0xff) << 24) | + (ceb & 0xffffff)); + *ptr++ = htonl(((e0b & 0xffffff) << 8) | + TCPOPT_NOP); + } + if (tp) + tp->accecn_minlen = 0; + } + if (unlikely(OPTION_SACK_ADVERTISE & options)) { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; } if (unlikely(OPTION_WSCALE & options)) { - *ptr++ = htonl((TCPOPT_NOP << 24) | + u8 highbyte = TCPOPT_NOP; + + /* Do not split the leftover 2-byte to fit into a single + * NOP, i.e., replace this NOP only when 1 byte is leftover + * within leftover_highbyte. + */ + if (unlikely(leftover_highbyte != TCPOPT_NOP && + leftover_lowbyte == TCPOPT_NOP)) { + highbyte = leftover_highbyte; + leftover_highbyte = TCPOPT_NOP; + } + *ptr++ = htonl((highbyte << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | opts->ws); @@ -675,11 +741,13 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, tp->duplicate_sack : tp->selective_acks; int this_sack; - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | (TCPOPT_SACK << 8) | (TCPOLEN_SACK_BASE + (opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK))); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; for (this_sack = 0; this_sack < opts->num_sack_blocks; ++this_sack) { @@ -688,6 +756,14 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, } tp->rx_opt.dsack = 0; + } else if (unlikely(leftover_highbyte != TCPOPT_NOP || + leftover_lowbyte != TCPOPT_NOP)) { + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | + (TCPOPT_NOP << 8) | + TCPOPT_NOP); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; } if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { @@ -768,6 +844,61 @@ static void mptcp_set_option_cond(const struct request_sock *req, } } +static u32 tcp_synack_options_combine_saving(struct tcp_out_options *opts) +{ + /* How much there's room for combining with the alignment padding? */ + if ((opts->options & (OPTION_SACK_ADVERTISE | OPTION_TS)) == + OPTION_SACK_ADVERTISE) + return 2; + else if (opts->options & OPTION_WSCALE) + return 1; + return 0; +} + +/* Calculates how long AccECN option will fit to @remaining option space. 
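+ * (A full option is TCPOLEN_ACCECN_BASE + 3 * TCPOLEN_ACCECN_PERFIELD =
+ * 2 + 3 * 3 = 11 bytes; each field dropped below shaves 3 bytes, giving
+ * candidate sizes of 11, 8, 5 and 2 bytes before alignment.)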
+ *
+ * AccECN option can sometimes replace NOPs used for alignment of other
+ * TCP options (up to @max_combine_saving available).
+ *
+ * Only solutions with at least @required AccECN fields are accepted.
+ *
+ * Returns: The size of the AccECN option excluding space repurposed from
+ * the alignment of the other options.
+ */
+static int tcp_options_fit_accecn(struct tcp_out_options *opts, int required,
+				  int remaining)
+{
+	int size = TCP_ACCECN_MAXSIZE;
+	int max_combine_saving;
+	int align_size;
+
+	if (opts->use_synack_ecn_bytes)
+		max_combine_saving = tcp_synack_options_combine_saving(opts);
+	else
+		max_combine_saving = opts->num_sack_blocks > 0 ? 2 : 0;
+	opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS;
+	while (opts->num_accecn_fields >= required) {
+		/* Pad to dword if cannot combine */
+		if ((size & 0x3) > max_combine_saving)
+			align_size = ALIGN(size, 4);
+		else
+			align_size = ALIGN_DOWN(size, 4);
+
+		if (remaining >= align_size) {
+			size = align_size;
+			break;
+		}
+
+		opts->num_accecn_fields--;
+		size -= TCPOLEN_ACCECN_PERFIELD;
+	}
+	if (opts->num_accecn_fields < required)
+		return 0;
+
+	opts->options |= OPTION_ACCECN;
+	return size;
+}
+
 /* Compute TCP options for SYN packets. This is not the final
  * network wire format yet.
  */
@@ -850,6 +981,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 		}
 	}
 
+	/* Simultaneous open SYN/ACK needs AccECN option but not SYN */
+	if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) &&
+		     tcp_ecn_mode_accecn(tp) &&
+		     READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) &&
+		     remaining >= TCPOLEN_ACCECN_BASE)) {
+		opts->use_synack_ecn_bytes = 1;
+		remaining -= tcp_options_fit_accecn(opts, 0, remaining);
+	}
+
 	bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
 
 	return MAX_TCP_OPTION_SPACE - remaining;
@@ -867,6 +1007,7 @@ static unsigned int tcp_synack_options(const struct sock *sk,
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 	unsigned int remaining = MAX_TCP_OPTION_SPACE;
+	struct tcp_request_sock *treq = tcp_rsk(req);
 
 	if (tcp_key_is_md5(key)) {
 		opts->options |= OPTION_MD5;
@@ -929,6 +1070,13 @@ static unsigned int tcp_synack_options(const struct sock *sk,
 
 	smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
 
+	if (treq->accecn_ok &&
+	    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) &&
+	    remaining >= TCPOLEN_ACCECN_BASE) {
+		opts->use_synack_ecn_bytes = 1;
+		remaining -= tcp_options_fit_accecn(opts, 0, remaining);
+	}
+
 	bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
 			      synack_type, opts, &remaining);
 
@@ -1001,6 +1149,13 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 		opts->num_sack_blocks = 0;
 	}
 
+	if (tcp_ecn_mode_accecn(tp) &&
+	    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option)) {
+		opts->use_synack_ecn_bytes = 0;
+		size += tcp_options_fit_accecn(opts, tp->accecn_minlen,
+					       MAX_TCP_OPTION_SPACE - size);
+	}
+
 	if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
 		unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
--
cgit v1.2.3

From b40671b5ee588c8a61b2d0eacbad32ffc57e9a8f Mon Sep 17 00:00:00 2001
From: Chia-Yu Chang
Date: Tue, 16 Sep 2025 10:24:32 +0200
Subject: tcp: accecn: AccECN option failure handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

AccECN option may fail in various ways, handle these:
- Attempt to negotiate the use of AccECN on the 1st retransmitted SYN
- From the 2nd retransmitted SYN, stop AccECN negotiation
- Remove option from SYN/ACK rexmits to
handle blackholes
- If no option arrives in SYN/ACK, assume the option is not usable
- If an option arrives later, it is re-enabled
- If option is zeroed, disable AccECN option processing

This patch uses existing padding bits in tcp_request_sock and holes in
tcp_sock without increasing the size.

Signed-off-by: Ilpo Järvinen
Signed-off-by: Chia-Yu Chang
Reviewed-by: Eric Dumazet
Link: https://patch.msgid.link/20250916082434.100722-9-chia-yu.chang@nokia-bell-labs.com
Signed-off-by: Paolo Abeni
---
 include/linux/tcp.h      |  4 +++-
 include/net/tcp_ecn.h    | 51 +++++++++++++++++++++++++++++++++++++++++++++---
 include/uapi/linux/tcp.h |  2 ++
 net/ipv4/tcp.c           |  3 +++
 net/ipv4/tcp_input.c     | 35 +++++++++++++++++++++++++++++++--
 net/ipv4/tcp_minisocks.c | 14 +++++++++++++
 net/ipv4/tcp_output.c    | 11 ++++++++---
 7 files changed, 111 insertions(+), 9 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index f637b659b35a..3ca5ed02de6d 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -173,6 +173,7 @@ struct tcp_request_sock {
 	u8				syn_ect_snt: 2,
 					syn_ect_rcv: 2,
 					accecn_fail_mode:4;
+	u8				saw_accecn_opt :2;
 #ifdef CONFIG_TCP_AO
 	u8				ao_keyid;
 	u8				ao_rcv_next;
@@ -407,7 +408,8 @@ struct tcp_sock {
 		syn_fastopen_child:1; /* created TFO passive child socket */
 
 	u8	keepalive_probes; /* num of allowed keep alive probes */
-	u8	accecn_fail_mode:4; /* AccECN failure handling */
+	u8	accecn_fail_mode:4, /* AccECN failure handling */
+		saw_accecn_opt:2;   /* An AccECN option was seen */
 	u32	tcp_tx_delay;	/* delay (in usec) added to TX packets */
 
 /* RTT measurement */
diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h
index 133fb6b79500..f13e5cd2b1ac 100644
--- a/include/net/tcp_ecn.h
+++ b/include/net/tcp_ecn.h
@@ -91,6 +91,11 @@ static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode)
 	tp->accecn_fail_mode |= mode;
 }
 
+#define TCP_ACCECN_OPT_NOT_SEEN		0x0
+#define TCP_ACCECN_OPT_EMPTY_SEEN	0x1
+#define TCP_ACCECN_OPT_COUNTER_SEEN	0x2
+#define TCP_ACCECN_OPT_FAIL_SEEN	0x3
+
 static inline u8 tcp_accecn_ace(const struct tcphdr *th)
 {
 	return (th->ae << 2) | (th->cwr << 1) | th->ece;
@@ -146,6 +151,14 @@ static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace,
 	return true;
 }
 
+static inline void tcp_accecn_saw_opt_fail_recv(struct tcp_sock *tp,
+						u8 saw_opt)
+{
+	tp->saw_accecn_opt = saw_opt;
+	if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN)
+		tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV);
+}
+
 /* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */
 static inline void tcp_accecn_third_ack(struct sock *sk,
 					const struct sk_buff *skb, u8 sent_ect)
@@ -428,9 +441,35 @@ static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb,
 	}
 }
 
+static inline u8 tcp_accecn_option_init(const struct sk_buff *skb,
+					u8 opt_offset)
+{
+	u8 *ptr = skb_transport_header(skb) + opt_offset;
+	unsigned int optlen = ptr[1] - 2;
+
+	if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1))
+		return TCP_ACCECN_OPT_FAIL_SEEN;
+	ptr += 2;
+
+	/* Detect option zeroing: an AccECN connection "MAY check that the
+	 * initial value of the EE0B field or the EE1B field is non-zero"
+	 */
+	if (optlen < TCPOLEN_ACCECN_PERFIELD)
+		return TCP_ACCECN_OPT_EMPTY_SEEN;
+	if (get_unaligned_be24(ptr) == 0)
+		return TCP_ACCECN_OPT_FAIL_SEEN;
+	if (optlen < TCPOLEN_ACCECN_PERFIELD * 3)
+		return TCP_ACCECN_OPT_COUNTER_SEEN;
+	ptr += TCPOLEN_ACCECN_PERFIELD * 2;
+	if (get_unaligned_be24(ptr) == 0)
+		return
TCP_ACCECN_OPT_FAIL_SEEN; + + return TCP_ACCECN_OPT_COUNTER_SEEN; +} + /* See Table 2 of the AccECN draft */ -static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, - u8 ip_dsfield) +static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb, + const struct tcphdr *th, u8 ip_dsfield) { struct tcp_sock *tp = tcp_sk(sk); u8 ace = tcp_accecn_ace(th); @@ -469,7 +508,13 @@ static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, default: tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; - tp->accecn_opt_demand = 2; + if (tp->rx_opt.accecn && + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + tp->accecn_opt_demand = 2; + } if (INET_ECN_is_ce(ip_dsfield) && tcp_accecn_validate_syn_feedback(sk, ace, tp->syn_ect_snt)) { diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 53e0e85b52be..dce3113787a7 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -323,6 +323,8 @@ struct tcp_info { __u32 tcpi_received_e1_bytes; __u32 tcpi_received_e0_bytes; __u32 tcpi_received_ce_bytes; + __u16 tcpi_accecn_fail_mode; + __u16 tcpi_accecn_opt_seen; }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 090f9ac43d4c..5b5c655ded1d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3409,6 +3409,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->delivered = 0; tp->delivered_ce = 0; tp->accecn_fail_mode = 0; + tp->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; tcp_accecn_init_counters(tp); tp->prev_ecnfield = 0; tp->accecn_opt_tstamp = 0; @@ -4287,6 +4288,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp->rto_stamp) info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; + info->tcpi_accecn_fail_mode = tp->accecn_fail_mode; + info->tcpi_accecn_opt_seen = tp->saw_accecn_opt; info->tcpi_received_ce = tp->received_ce; info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx]; info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx]; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 87154fd86167..5732f2d4329c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -398,7 +398,22 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp, unsigned int i; u8 *ptr; + if (tcp_accecn_opt_fail_recv(tp)) + return false; + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { + if (!tp->saw_accecn_opt) { + /* Too late to enable after this point due to + * potential counter wraps + */ + if (tp->bytes_sent >= (1 << 23) - 1) { + u8 saw_opt = TCP_ACCECN_OPT_FAIL_SEEN; + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + } + return false; + } + if (estimate_ecnfield) { u8 ecnfield = estimate_ecnfield - 1; @@ -415,6 +430,13 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp, order1 = (ptr[0] == TCPOPT_ACCECN1); ptr += 2; + if (tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + tp->saw_accecn_opt = tcp_accecn_option_init(skb, + tp->rx_opt.accecn); + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV); + } + res = !!estimate_ecnfield; for (i = 0; i < 3; i++) { u32 init_offset; @@ -6123,7 +6145,13 @@ step1: if (th->syn) { if (tcp_ecn_mode_accecn(tp)) { accecn_reflector = true; - tcp_accecn_opt_demand_min(sk, 1); + if (tp->rx_opt.accecn && + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + 
u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + tcp_accecn_opt_demand_min(sk, 1); + } } if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && @@ -6606,7 +6634,8 @@ consume: */ if (tcp_ecn_mode_any(tp)) - tcp_ecn_rcv_synack(sk, th, TCP_SKB_CB(skb)->ip_dsfield); + tcp_ecn_rcv_synack(sk, skb, th, + TCP_SKB_CB(skb)->ip_dsfield); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_try_undo_spurious_syn(sk); @@ -7177,6 +7206,8 @@ static void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->snt_tsval_first = 0; tcp_rsk(req)->last_oow_ack_time = 0; tcp_rsk(req)->accecn_ok = 0; + tcp_rsk(req)->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; + tcp_rsk(req)->accecn_fail_mode = 0; tcp_rsk(req)->syn_ect_rcv = 0; tcp_rsk(req)->syn_ect_snt = 0; req->mss = rx_opt->mss_clamp; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 193343494558..327095ef95ef 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -463,6 +463,7 @@ static void tcp_ecn_openreq_child(struct sock *sk, tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); + tp->saw_accecn_opt = treq->saw_accecn_opt; tp->prev_ecnfield = treq->syn_ect_rcv; tp->accecn_opt_demand = 1; tcp_ecn_received_counters_payload(sk, skb); @@ -678,6 +679,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, bool own_req; tmp_opt.saw_tstamp = 0; + tmp_opt.accecn = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); @@ -855,6 +857,18 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!(flg & TCP_FLAG_ACK)) return NULL; + if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn && + tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tmp_opt.accecn); + + tcp_rsk(req)->saw_accecn_opt = saw_opt; + if (tcp_rsk(req)->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) { + u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV; + + tcp_rsk(req)->accecn_fail_mode |= fail_mode; + } + } + /* For Fast Open no more processing is needed (sk is the * child socket). */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f897c2594954..65b90f73daa0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -985,9 +985,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } - /* Simultaneous open SYN/ACK needs AccECN option but not SYN */ + /* Simultaneous open SYN/ACK needs AccECN option but not SYN. + * It is attempted to negotiate the use of AccECN also on the first + * retransmitted SYN, as mentioned in "3.1.4.1. Retransmitted SYNs" + * of AccECN draft. 
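+	 * From the second SYN retransmission onwards the option is left
+	 * out, so a path that blackholes SYNs carrying the AccECN option
+	 * cannot keep the connection from being established.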
+ */ if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) && tcp_ecn_mode_accecn(tp) && + inet_csk(sk)->icsk_retransmits < 2 && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && remaining >= TCPOLEN_ACCECN_BASE)) { opts->use_synack_ecn_bytes = 1; @@ -1076,7 +1081,7 @@ static unsigned int tcp_synack_options(const struct sock *sk, if (treq->accecn_ok && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && - remaining >= TCPOLEN_ACCECN_BASE) { + req->num_timeout < 1 && remaining >= TCPOLEN_ACCECN_BASE) { opts->use_synack_ecn_bytes = 1; remaining -= tcp_options_fit_accecn(opts, 0, remaining); } @@ -1156,7 +1161,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (tcp_ecn_mode_accecn(tp)) { int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option); - if (ecn_opt && + if (ecn_opt && tp->saw_accecn_opt && !tcp_accecn_opt_fail_send(tp) && (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand || tcp_accecn_option_beacon_check(sk))) { opts->use_synack_ecn_bytes = 0; -- cgit v1.2.3 From 00c94ca2b99e6610e483f92e531b319eeaed94aa Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:29 -0700 Subject: psp: base PSP device support Add a netlink family for PSP and allow drivers to register support. The "PSP device" is its own object. This allows us to perform more flexible reference counting / lifetime control than if PSP information was part of net_device. In the future we should also be able to "delegate" PSP access to software devices, such as *vlan, veth or netkit more easily. Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-3-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/psp.yaml | 96 +++++++++++++++ include/linux/netdevice.h | 4 + include/net/psp.h | 12 ++ include/net/psp/functions.h | 14 +++ include/net/psp/types.h | 100 ++++++++++++++++ include/uapi/linux/psp.h | 42 +++++++ net/Kconfig | 1 + net/Makefile | 1 + net/psp/Kconfig | 13 ++ net/psp/Makefile | 5 + net/psp/psp-nl-gen.c | 65 ++++++++++ net/psp/psp-nl-gen.h | 30 +++++ net/psp/psp.h | 31 +++++ net/psp/psp_main.c | 139 ++++++++++++++++++++++ net/psp/psp_nl.c | 223 +++++++++++++++++++++++++++++++++++ tools/net/ynl/Makefile.deps | 1 + 16 files changed, 777 insertions(+) create mode 100644 Documentation/netlink/specs/psp.yaml create mode 100644 include/net/psp.h create mode 100644 include/net/psp/functions.h create mode 100644 include/net/psp/types.h create mode 100644 include/uapi/linux/psp.h create mode 100644 net/psp/Kconfig create mode 100644 net/psp/Makefile create mode 100644 net/psp/psp-nl-gen.c create mode 100644 net/psp/psp-nl-gen.h create mode 100644 net/psp/psp.h create mode 100644 net/psp/psp_main.c create mode 100644 net/psp/psp_nl.c (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml new file mode 100644 index 000000000000..706f4baf8764 --- /dev/null +++ b/Documentation/netlink/specs/psp.yaml @@ -0,0 +1,96 @@ +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +--- +name: psp + +doc: + PSP Security Protocol Generic Netlink family. + +definitions: + - + type: enum + name: version + entries: [hdr0-aes-gcm-128, hdr0-aes-gcm-256, + hdr0-aes-gmac-128, hdr0-aes-gmac-256] + +attribute-sets: + - + name: dev + attributes: + - + name: id + doc: PSP device ID. 
+ type: u32 + checks: + min: 1 + - + name: ifindex + doc: ifindex of the main netdevice linked to the PSP device. + type: u32 + - + name: psp-versions-cap + doc: Bitmask of PSP versions supported by the device. + type: u32 + enum: version + enum-as-flags: true + - + name: psp-versions-ena + doc: Bitmask of currently enabled (accepted on Rx) PSP versions. + type: u32 + enum: version + enum-as-flags: true + +operations: + list: + - + name: dev-get + doc: Get / dump information about PSP capable devices on the system. + attribute-set: dev + do: + request: + attributes: + - id + reply: &dev-all + attributes: + - id + - ifindex + - psp-versions-cap + - psp-versions-ena + pre: psp-device-get-locked + post: psp-device-unlock + dump: + reply: *dev-all + - + name: dev-add-ntf + doc: Notification about device appearing. + notify: dev-get + mcgrp: mgmt + - + name: dev-del-ntf + doc: Notification about device disappearing. + notify: dev-get + mcgrp: mgmt + - + name: dev-set + doc: Set the configuration of a PSP device. + attribute-set: dev + do: + request: + attributes: + - id + - psp-versions-ena + reply: + attributes: [] + pre: psp-device-get-locked + post: psp-device-unlock + - + name: dev-change-ntf + doc: Notification about device configuration being changed. + notify: dev-get + mcgrp: mgmt + +mcast-groups: + list: + - + name: mgmt + +... diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f5a840c07cf1..1c54d44805fa 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1906,6 +1906,7 @@ enum netdev_reg_state { * device struct * @mpls_ptr: mpls_dev struct pointer * @mctp_ptr: MCTP specific data + * @psp_dev: PSP crypto device registered for this netdev * * @dev_addr: Hw address (before bcast, * because most packets are unicast) @@ -2310,6 +2311,9 @@ struct net_device { #if IS_ENABLED(CONFIG_MCTP) struct mctp_dev __rcu *mctp_ptr; #endif +#if IS_ENABLED(CONFIG_INET_PSP) + struct psp_dev __rcu *psp_dev; +#endif /* * Cache lines mostly used on receive path (including eth_type_trans()) diff --git a/include/net/psp.h b/include/net/psp.h new file mode 100644 index 000000000000..33bb4d1dc46e --- /dev/null +++ b/include/net/psp.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_ALL_H +#define __NET_PSP_ALL_H + +#include +#include +#include + +/* Do not add any code here. Put it in the sub-headers instead. 
*/ + +#endif /* __NET_PSP_ALL_H */ diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h new file mode 100644 index 000000000000..074f9df9afc3 --- /dev/null +++ b/include/net/psp/functions.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_HELPERS_H +#define __NET_PSP_HELPERS_H + +#include + +/* Driver-facing API */ +struct psp_dev * +psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, + struct psp_dev_caps *psd_caps, void *priv_ptr); +void psp_dev_unregister(struct psp_dev *psd); + +#endif /* __NET_PSP_HELPERS_H */ diff --git a/include/net/psp/types.h b/include/net/psp/types.h new file mode 100644 index 000000000000..d242b1ecee7d --- /dev/null +++ b/include/net/psp/types.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_H +#define __NET_PSP_H + +#include +#include + +struct netlink_ext_ack; + +#define PSP_DEFAULT_UDP_PORT 1000 + +struct psphdr { + u8 nexthdr; + u8 hdrlen; + u8 crypt_offset; + u8 verfl; + __be32 spi; + __be64 iv; + __be64 vc[]; /* optional */ +}; + +#define PSP_SPI_KEY_ID GENMASK(30, 0) +#define PSP_SPI_KEY_PHASE BIT(31) + +#define PSPHDR_CRYPT_OFFSET GENMASK(5, 0) + +#define PSPHDR_VERFL_SAMPLE BIT(7) +#define PSPHDR_VERFL_DROP BIT(6) +#define PSPHDR_VERFL_VERSION GENMASK(5, 2) +#define PSPHDR_VERFL_VIRT BIT(1) +#define PSPHDR_VERFL_ONE BIT(0) + +#define PSP_HDRLEN_NOOPT ((sizeof(struct psphdr) - 8) / 8) + +/** + * struct psp_dev_config - PSP device configuration + * @versions: PSP versions enabled on the device + */ +struct psp_dev_config { + u32 versions; +}; + +/** + * struct psp_dev - PSP device struct + * @main_netdev: original netdevice of this PSP device + * @ops: driver callbacks + * @caps: device capabilities + * @drv_priv: driver priv pointer + * @lock: instance lock, protects all fields + * @refcnt: reference count for the instance + * @id: instance id + * @config: current device configuration + * + * @rcu: RCU head for freeing the structure + */ +struct psp_dev { + struct net_device *main_netdev; + + struct psp_dev_ops *ops; + struct psp_dev_caps *caps; + void *drv_priv; + + struct mutex lock; + refcount_t refcnt; + + u32 id; + + struct psp_dev_config config; + + struct rcu_head rcu; +}; + +/** + * struct psp_dev_caps - PSP device capabilities + */ +struct psp_dev_caps { + /** + * @versions: mask of supported PSP versions + * Set this field to 0 to indicate PSP is not supported at all. + */ + u32 versions; +}; + +#define PSP_MAX_KEY 32 + +/** + * struct psp_dev_ops - netdev driver facing PSP callbacks + */ +struct psp_dev_ops { + /** + * @set_config: set configuration of a PSP device + * Driver can inspect @psd->config for the previous configuration. + * Core will update @psd->config with @config on success. 
+ */ + int (*set_config)(struct psp_dev *psd, struct psp_dev_config *conf, + struct netlink_ext_ack *extack); +}; + +#endif /* __NET_PSP_H */ diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h new file mode 100644 index 000000000000..4a404f085190 --- /dev/null +++ b/include/uapi/linux/psp.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_PSP_H +#define _UAPI_LINUX_PSP_H + +#define PSP_FAMILY_NAME "psp" +#define PSP_FAMILY_VERSION 1 + +enum psp_version { + PSP_VERSION_HDR0_AES_GCM_128, + PSP_VERSION_HDR0_AES_GCM_256, + PSP_VERSION_HDR0_AES_GMAC_128, + PSP_VERSION_HDR0_AES_GMAC_256, +}; + +enum { + PSP_A_DEV_ID = 1, + PSP_A_DEV_IFINDEX, + PSP_A_DEV_PSP_VERSIONS_CAP, + PSP_A_DEV_PSP_VERSIONS_ENA, + + __PSP_A_DEV_MAX, + PSP_A_DEV_MAX = (__PSP_A_DEV_MAX - 1) +}; + +enum { + PSP_CMD_DEV_GET = 1, + PSP_CMD_DEV_ADD_NTF, + PSP_CMD_DEV_DEL_NTF, + PSP_CMD_DEV_SET, + PSP_CMD_DEV_CHANGE_NTF, + + __PSP_CMD_MAX, + PSP_CMD_MAX = (__PSP_CMD_MAX - 1) +}; + +#define PSP_MCGRP_MGMT "mgmt" + +#endif /* _UAPI_LINUX_PSP_H */ diff --git a/net/Kconfig b/net/Kconfig index d5865cf19799..4b563aea4c23 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -82,6 +82,7 @@ config NET_CRC32C menu "Networking options" source "net/packet/Kconfig" +source "net/psp/Kconfig" source "net/unix/Kconfig" source "net/tls/Kconfig" source "net/xfrm/Kconfig" diff --git a/net/Makefile b/net/Makefile index aac960c41db6..90e3d72bf58b 100644 --- a/net/Makefile +++ b/net/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ +obj-$(CONFIG_INET_PSP) += psp/ obj-y += ipv6/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ diff --git a/net/psp/Kconfig b/net/psp/Kconfig new file mode 100644 index 000000000000..55f9dd87446b --- /dev/null +++ b/net/psp/Kconfig @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# PSP configuration +# +config INET_PSP + bool "PSP Security Protocol support" + depends on INET + help + Enable kernel support for the PSP protocol. + For more information see: + https://raw.githubusercontent.com/google/psp/main/doc/PSP_Arch_Spec.pdf + + If unsure, say N. 
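The wire-format constants above translate directly into a small userspace
model. The sketch below is illustration only (host byte order, no option
words, made-up field values), not kernel code; it shows how the SPI and
verfl fields from include/net/psp/types.h pack:

#include <stdint.h>
#include <stdio.h>

#define PSP_SPI_KEY_ID		0x7fffffffu	/* GENMASK(30, 0) */
#define PSP_SPI_KEY_PHASE	0x80000000u	/* BIT(31) */
#define PSPHDR_VERFL_VERSION(v)	(((v) & 0xfu) << 2)	/* GENMASK(5, 2) */
#define PSPHDR_VERFL_ONE	0x01u		/* BIT(0) */

struct psphdr_model {		/* the kernel struct uses __be32/__be64 */
	uint8_t  nexthdr;	/* inner protocol, e.g. 6 for TCP */
	uint8_t  hdrlen;	/* trailing header length in 8-byte units */
	uint8_t  crypt_offset;	/* cleartext words, PSPHDR_CRYPT_OFFSET */
	uint8_t  verfl;		/* version and flags */
	uint32_t spi;		/* key phase in bit 31, key ID below */
	uint64_t iv;
};

int main(void)
{
	struct psphdr_model h = {
		.nexthdr = 6,
		.hdrlen	 = (sizeof(h) - 8) / 8,	/* PSP_HDRLEN_NOOPT */
		.verfl	 = PSPHDR_VERFL_VERSION(0) | PSPHDR_VERFL_ONE,
		.spi	 = PSP_SPI_KEY_PHASE | 0x1234,
	};

	printf("phase %d, key id %#x, hdrlen %d\n",
	       !!(h.spi & PSP_SPI_KEY_PHASE),
	       h.spi & PSP_SPI_KEY_ID, h.hdrlen);
	return 0;
}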
diff --git a/net/psp/Makefile b/net/psp/Makefile new file mode 100644 index 000000000000..41b51d06e560 --- /dev/null +++ b/net/psp/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_INET_PSP) += psp.o + +psp-y := psp_main.o psp_nl.o psp-nl-gen.o diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c new file mode 100644 index 000000000000..859712e7c2c1 --- /dev/null +++ b/net/psp/psp-nl-gen.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN kernel source */ + +#include +#include + +#include "psp-nl-gen.h" + +#include + +/* PSP_CMD_DEV_GET - do */ +static const struct nla_policy psp_dev_get_nl_policy[PSP_A_DEV_ID + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +/* PSP_CMD_DEV_SET - do */ +static const struct nla_policy psp_dev_set_nl_policy[PSP_A_DEV_PSP_VERSIONS_ENA + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [PSP_A_DEV_PSP_VERSIONS_ENA] = NLA_POLICY_MASK(NLA_U32, 0xf), +}; + +/* Ops table for psp */ +static const struct genl_split_ops psp_nl_ops[] = { + { + .cmd = PSP_CMD_DEV_GET, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_dev_get_doit, + .post_doit = psp_device_unlock, + .policy = psp_dev_get_nl_policy, + .maxattr = PSP_A_DEV_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = PSP_CMD_DEV_GET, + .dumpit = psp_nl_dev_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = PSP_CMD_DEV_SET, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_dev_set_doit, + .post_doit = psp_device_unlock, + .policy = psp_dev_set_nl_policy, + .maxattr = PSP_A_DEV_PSP_VERSIONS_ENA, + .flags = GENL_CMD_CAP_DO, + }, +}; + +static const struct genl_multicast_group psp_nl_mcgrps[] = { + [PSP_NLGRP_MGMT] = { "mgmt", }, +}; + +struct genl_family psp_nl_family __ro_after_init = { + .name = PSP_FAMILY_NAME, + .version = PSP_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = psp_nl_ops, + .n_split_ops = ARRAY_SIZE(psp_nl_ops), + .mcgrps = psp_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(psp_nl_mcgrps), +}; diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h new file mode 100644 index 000000000000..a099686cab5d --- /dev/null +++ b/net/psp/psp-nl-gen.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_PSP_GEN_H +#define _LINUX_PSP_GEN_H + +#include +#include + +#include + +int psp_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); +void +psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); + +int psp_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info); + +enum { + PSP_NLGRP_MGMT, +}; + +extern struct genl_family psp_nl_family; + +#endif /* _LINUX_PSP_GEN_H */ diff --git a/net/psp/psp.h b/net/psp/psp.h new file mode 100644 index 000000000000..94d0cc31a61f --- /dev/null +++ b/net/psp/psp.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __PSP_PSP_H +#define __PSP_PSP_H + +#include +#include +#include +#include +#include + +extern struct xarray psp_devs; +extern struct mutex psp_devs_lock; + 
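+/* Called via psp_dev_put() when the last reference to the device is dropped. */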
+void psp_dev_destroy(struct psp_dev *psd);
+int psp_dev_check_access(struct psp_dev *psd, struct net *net);
+
+void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd);
+
+static inline void psp_dev_get(struct psp_dev *psd)
+{
+	refcount_inc(&psd->refcnt);
+}
+
+static inline void psp_dev_put(struct psp_dev *psd)
+{
+	if (refcount_dec_and_test(&psd->refcnt))
+		psp_dev_destroy(psd);
+}
+
+#endif /* __PSP_PSP_H */
diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c
new file mode 100644
index 000000000000..e09499b7b14a
--- /dev/null
+++ b/net/psp/psp_main.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include
+#include
+#include
+#include
+#include
+
+#include "psp.h"
+#include "psp-nl-gen.h"
+
+DEFINE_XARRAY_ALLOC1(psp_devs);
+struct mutex psp_devs_lock;
+
+/**
+ * DOC: PSP locking
+ *
+ * psp_devs_lock protects the psp_devs xarray.
+ * Ordering: take psp_devs_lock first, then the instance lock.
+ * Each instance is protected by RCU and has a refcount.
+ * When the driver unregisters, the instance gets flushed, but the
+ * struct sticks around.
+ */
+
+/**
+ * psp_dev_check_access() - check if user in a given net ns can access PSP dev
+ * @psd: PSP device structure user is trying to access
+ * @net: net namespace user is in
+ *
+ * Return: 0 if PSP device should be visible in @net, errno otherwise.
+ */
+int psp_dev_check_access(struct psp_dev *psd, struct net *net)
+{
+	if (dev_net(psd->main_netdev) == net)
+		return 0;
+	return -ENOENT;
+}
+
+/**
+ * psp_dev_create() - create and register PSP device
+ * @netdev: main netdevice
+ * @psd_ops: driver callbacks
+ * @psd_caps: device capabilities
+ * @priv_ptr: back-pointer to driver private data
+ *
+ * Return: pointer to allocated PSP device, or ERR_PTR.
+ */
+struct psp_dev *
+psp_dev_create(struct net_device *netdev,
+	       struct psp_dev_ops *psd_ops, struct psp_dev_caps *psd_caps,
+	       void *priv_ptr)
+{
+	struct psp_dev *psd;
+	static u32 last_id;
+	int err;
+
+	if (WARN_ON(!psd_caps->versions ||
+		    !psd_ops->set_config))
+		return ERR_PTR(-EINVAL);
+
+	psd = kzalloc(sizeof(*psd), GFP_KERNEL);
+	if (!psd)
+		return ERR_PTR(-ENOMEM);
+
+	psd->main_netdev = netdev;
+	psd->ops = psd_ops;
+	psd->caps = psd_caps;
+	psd->drv_priv = priv_ptr;
+
+	mutex_init(&psd->lock);
+	refcount_set(&psd->refcnt, 1);
+
+	mutex_lock(&psp_devs_lock);
+	err = xa_alloc_cyclic(&psp_devs, &psd->id, psd, xa_limit_16b,
+			      &last_id, GFP_KERNEL);
+	if (err) {
+		mutex_unlock(&psp_devs_lock);
+		kfree(psd);
+		return ERR_PTR(err);
+	}
+	mutex_lock(&psd->lock);
+	mutex_unlock(&psp_devs_lock);
+
+	psp_nl_notify_dev(psd, PSP_CMD_DEV_ADD_NTF);
+
+	rcu_assign_pointer(netdev->psp_dev, psd);
+
+	mutex_unlock(&psd->lock);
+
+	return psd;
+}
+EXPORT_SYMBOL(psp_dev_create);
+
+void psp_dev_destroy(struct psp_dev *psd)
+{
+	mutex_lock(&psp_devs_lock);
+	xa_erase(&psp_devs, psd->id);
+	mutex_unlock(&psp_devs_lock);
+
+	mutex_destroy(&psd->lock);
+	kfree_rcu(psd, rcu);
+}
+
+/**
+ * psp_dev_unregister() - unregister PSP device
+ * @psd: PSP device structure
+ */
+void psp_dev_unregister(struct psp_dev *psd)
+{
+	mutex_lock(&psp_devs_lock);
+	mutex_lock(&psd->lock);
+
+	psp_nl_notify_dev(psd, PSP_CMD_DEV_DEL_NTF);
+
+	/* Wait until psp_dev_destroy() to call xa_erase() to prevent a
+	 * different psd from being added to the xarray with this id, while
+	 * there are still references to this psd being held.
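+	 * Storing a NULL entry (rather than erasing) keeps the ID reserved
+	 * in the allocating xarray until the final psp_dev_put().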
+ */ + xa_store(&psp_devs, psd->id, NULL, GFP_KERNEL); + mutex_unlock(&psp_devs_lock); + + rcu_assign_pointer(psd->main_netdev->psp_dev, NULL); + + psd->ops = NULL; + psd->drv_priv = NULL; + + mutex_unlock(&psd->lock); + + psp_dev_put(psd); +} +EXPORT_SYMBOL(psp_dev_unregister); + +static int __init psp_init(void) +{ + mutex_init(&psp_devs_lock); + + return genl_register_family(&psp_nl_family); +} + +subsys_initcall(psp_init); diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c new file mode 100644 index 000000000000..fda5ce800f82 --- /dev/null +++ b/net/psp/psp_nl.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include + +#include "psp-nl-gen.h" +#include "psp.h" + +/* Netlink helpers */ + +static struct sk_buff *psp_nl_reply_new(struct genl_info *info) +{ + struct sk_buff *rsp; + void *hdr; + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return NULL; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + nlmsg_free(rsp); + return NULL; + } + + return rsp; +} + +static int psp_nl_reply_send(struct sk_buff *rsp, struct genl_info *info) +{ + /* Note that this *only* works with a single message per skb! */ + nlmsg_end(rsp, (struct nlmsghdr *)rsp->data); + + return genlmsg_reply(rsp, info); +} + +/* Device stuff */ + +static struct psp_dev * +psp_device_get_and_lock(struct net *net, struct nlattr *dev_id) +{ + struct psp_dev *psd; + int err; + + mutex_lock(&psp_devs_lock); + psd = xa_load(&psp_devs, nla_get_u32(dev_id)); + if (!psd) { + mutex_unlock(&psp_devs_lock); + return ERR_PTR(-ENODEV); + } + + mutex_lock(&psd->lock); + mutex_unlock(&psp_devs_lock); + + err = psp_dev_check_access(psd, net); + if (err) { + mutex_unlock(&psd->lock); + return ERR_PTR(err); + } + + return psd; +} + +int psp_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + if (GENL_REQ_ATTR_CHECK(info, PSP_A_DEV_ID)) + return -EINVAL; + + info->user_ptr[0] = psp_device_get_and_lock(genl_info_net(info), + info->attrs[PSP_A_DEV_ID]); + return PTR_ERR_OR_ZERO(info->user_ptr[0]); +} + +void +psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + + mutex_unlock(&psd->lock); +} + +static int +psp_nl_dev_fill(struct psp_dev *psd, struct sk_buff *rsp, + const struct genl_info *info) +{ + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(rsp, PSP_A_DEV_ID, psd->id) || + nla_put_u32(rsp, PSP_A_DEV_IFINDEX, psd->main_netdev->ifindex) || + nla_put_u32(rsp, PSP_A_DEV_PSP_VERSIONS_CAP, psd->caps->versions) || + nla_put_u32(rsp, PSP_A_DEV_PSP_VERSIONS_ENA, psd->config.versions)) + goto err_cancel_msg; + + genlmsg_end(rsp, hdr); + return 0; + +err_cancel_msg: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd) +{ + struct genl_info info; + struct sk_buff *ntf; + + if (!genl_has_listeners(&psp_nl_family, dev_net(psd->main_netdev), + PSP_NLGRP_MGMT)) + return; + + ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!ntf) + return; + + genl_info_init_ntf(&info, &psp_nl_family, cmd); + if (psp_nl_dev_fill(psd, ntf, &info)) { + nlmsg_free(ntf); + return; + } + + genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf, + 0, PSP_NLGRP_MGMT, GFP_KERNEL); +} + +int psp_nl_dev_get_doit(struct sk_buff *req, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct sk_buff *rsp; + int err; + 
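+	/* Look-up and locking of psd were already done by the
+	 * psp_device_get_locked() pre_doit hook; only the reply
+	 * needs to be filled in here.
+	 */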
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + err = psp_nl_dev_fill(psd, rsp, info); + if (err) + goto err_free_msg; + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +static int +psp_nl_dev_get_dumpit_one(struct sk_buff *rsp, struct netlink_callback *cb, + struct psp_dev *psd) +{ + if (psp_dev_check_access(psd, sock_net(rsp->sk))) + return 0; + + return psp_nl_dev_fill(psd, rsp, genl_info_dump(cb)); +} + +int psp_nl_dev_get_dumpit(struct sk_buff *rsp, struct netlink_callback *cb) +{ + struct psp_dev *psd; + int err = 0; + + mutex_lock(&psp_devs_lock); + xa_for_each_start(&psp_devs, cb->args[0], psd, cb->args[0]) { + mutex_lock(&psd->lock); + err = psp_nl_dev_get_dumpit_one(rsp, cb, psd); + mutex_unlock(&psd->lock); + if (err) + break; + } + mutex_unlock(&psp_devs_lock); + + return err; +} + +int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct psp_dev_config new_config; + struct sk_buff *rsp; + int err; + + memcpy(&new_config, &psd->config, sizeof(new_config)); + + if (info->attrs[PSP_A_DEV_PSP_VERSIONS_ENA]) { + new_config.versions = + nla_get_u32(info->attrs[PSP_A_DEV_PSP_VERSIONS_ENA]); + if (new_config.versions & ~psd->caps->versions) { + NL_SET_ERR_MSG(info->extack, "Requested PSP versions not supported by the device"); + return -EINVAL; + } + } else { + NL_SET_ERR_MSG(info->extack, "No settings present"); + return -EINVAL; + } + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + if (memcmp(&new_config, &psd->config, sizeof(new_config))) { + err = psd->ops->set_config(psd, &new_config, info->extack); + if (err) + goto err_free_rsp; + + memcpy(&psd->config, &new_config, sizeof(new_config)); + } + + psp_nl_notify_dev(psd, PSP_CMD_DEV_CHANGE_NTF); + + return psp_nl_reply_send(rsp, info); + +err_free_rsp: + nlmsg_free(rsp); + return err; +} diff --git a/tools/net/ynl/Makefile.deps b/tools/net/ynl/Makefile.deps index 90686e241157..865fd2e8519e 100644 --- a/tools/net/ynl/Makefile.deps +++ b/tools/net/ynl/Makefile.deps @@ -31,6 +31,7 @@ CFLAGS_ovpn:=$(call get_hdr_inc,_LINUX_OVPN_H,ovpn.h) CFLAGS_ovs_datapath:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_flow:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_vport:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) +CFLAGS_psp:=$(call get_hdr_inc,_LINUX_PSP_H,psp.h) CFLAGS_rt-addr:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) \ $(call get_hdr_inc,__LINUX_IF_ADDR_H,if_addr.h) CFLAGS_rt-link:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) \ -- cgit v1.2.3 From 117f02a49b7719b210d154a0d0e728001bf4af06 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:32 -0700 Subject: psp: add op for rotation of device key Rotating the device key is a key part of the PSP protocol design. Some external daemon needs to do it once a day, or so. Add a netlink op to perform this operation. Add a notification group for informing users that key has been rotated and they should rekey (next rotation will cut them off). 
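The "cut them off" rule follows from the SPI key-phase bit
(PSP_SPI_KEY_PHASE) defined earlier: assuming the usual two-phase scheme,
the device holds one master key per phase, and each rotation flips the
active phase and overwrites the master key of the slot being reused. A toy
userspace model of that bookkeeping (a sketch, not kernel or driver code):

#include <stdint.h>
#include <stdio.h>

/* One master key per value of the SPI phase bit (PSP_SPI_KEY_PHASE). */
struct device_keys {
	uint64_t master[2];
	unsigned int cur;	/* phase stamped into newly allocated SPIs */
};

/* Model of PSP_CMD_KEY_ROTATE: flip the active phase and overwrite the
 * master key of the slot that is being reused.
 */
static void key_rotate(struct device_keys *d, uint64_t fresh)
{
	d->cur ^= 1;
	d->master[d->cur] = fresh;
}

int main(void)
{
	struct device_keys dev = { .master = { 1111, 2222 }, .cur = 0 };
	unsigned int phase = dev.cur;	/* phase of "our" association */
	uint64_t derived_from = dev.master[phase];

	key_rotate(&dev, 3333);		/* "use" listeners should rekey now */
	printf("after one rotation: %s\n",
	       dev.master[phase] == derived_from ? "still usable" : "cut off");

	key_rotate(&dev, 4444);		/* second rotation reuses our phase */
	printf("after two rotations: %s\n",
	       dev.master[phase] == derived_from ? "still usable" : "cut off");
	return 0;
}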
Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-6-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/psp.yaml | 21 +++++++++++++++++++ include/net/psp/types.h | 5 +++++ include/uapi/linux/psp.h | 3 +++ net/psp/psp-nl-gen.c | 15 ++++++++++++++ net/psp/psp-nl-gen.h | 2 ++ net/psp/psp_main.c | 3 ++- net/psp/psp_nl.c | 40 ++++++++++++++++++++++++++++++++++++ 7 files changed, 88 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml index 706f4baf8764..054cc02b65ad 100644 --- a/Documentation/netlink/specs/psp.yaml +++ b/Documentation/netlink/specs/psp.yaml @@ -88,9 +88,30 @@ operations: notify: dev-get mcgrp: mgmt + - + name: key-rotate + doc: Rotate the device key. + attribute-set: dev + do: + request: + attributes: + - id + reply: + attributes: + - id + pre: psp-device-get-locked + post: psp-device-unlock + - + name: key-rotate-ntf + doc: Notification about device key getting rotated. + notify: key-rotate + mcgrp: use + mcast-groups: list: - name: mgmt + - + name: use ... diff --git a/include/net/psp/types.h b/include/net/psp/types.h index 4922fc8d42fd..66327fa80c92 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -102,6 +102,11 @@ struct psp_dev_ops { */ int (*set_config)(struct psp_dev *psd, struct psp_dev_config *conf, struct netlink_ext_ack *extack); + + /** + * @key_rotate: rotate the device key + */ + int (*key_rotate)(struct psp_dev *psd, struct netlink_ext_ack *extack); }; #endif /* __NET_PSP_H */ diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h index 4a404f085190..cbfbf3f0f364 100644 --- a/include/uapi/linux/psp.h +++ b/include/uapi/linux/psp.h @@ -32,11 +32,14 @@ enum { PSP_CMD_DEV_DEL_NTF, PSP_CMD_DEV_SET, PSP_CMD_DEV_CHANGE_NTF, + PSP_CMD_KEY_ROTATE, + PSP_CMD_KEY_ROTATE_NTF, __PSP_CMD_MAX, PSP_CMD_MAX = (__PSP_CMD_MAX - 1) }; #define PSP_MCGRP_MGMT "mgmt" +#define PSP_MCGRP_USE "use" #endif /* _UAPI_LINUX_PSP_H */ diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c index 859712e7c2c1..7f49577ac72f 100644 --- a/net/psp/psp-nl-gen.c +++ b/net/psp/psp-nl-gen.c @@ -21,6 +21,11 @@ static const struct nla_policy psp_dev_set_nl_policy[PSP_A_DEV_PSP_VERSIONS_ENA [PSP_A_DEV_PSP_VERSIONS_ENA] = NLA_POLICY_MASK(NLA_U32, 0xf), }; +/* PSP_CMD_KEY_ROTATE - do */ +static const struct nla_policy psp_key_rotate_nl_policy[PSP_A_DEV_ID + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), +}; + /* Ops table for psp */ static const struct genl_split_ops psp_nl_ops[] = { { @@ -46,10 +51,20 @@ static const struct genl_split_ops psp_nl_ops[] = { .maxattr = PSP_A_DEV_PSP_VERSIONS_ENA, .flags = GENL_CMD_CAP_DO, }, + { + .cmd = PSP_CMD_KEY_ROTATE, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_key_rotate_doit, + .post_doit = psp_device_unlock, + .policy = psp_key_rotate_nl_policy, + .maxattr = PSP_A_DEV_ID, + .flags = GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group psp_nl_mcgrps[] = { [PSP_NLGRP_MGMT] = { "mgmt", }, + [PSP_NLGRP_USE] = { "use", }, }; struct genl_family psp_nl_family __ro_after_init = { diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h index a099686cab5d..00a2d4ec59e4 100644 --- a/net/psp/psp-nl-gen.h +++ b/net/psp/psp-nl-gen.h @@ -20,9 +20,11 @@ psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, int psp_nl_dev_get_doit(struct sk_buff *skb, struct 
genl_info *info); int psp_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info); enum { PSP_NLGRP_MGMT, + PSP_NLGRP_USE, }; extern struct genl_family psp_nl_family; diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index e09499b7b14a..f60155493afc 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -54,7 +54,8 @@ psp_dev_create(struct net_device *netdev, int err; if (WARN_ON(!psd_caps->versions || - !psd_ops->set_config)) + !psd_ops->set_config || + !psd_ops->key_rotate)) return ERR_PTR(-EINVAL); psd = kzalloc(sizeof(*psd), GFP_KERNEL); diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c index fda5ce800f82..75f2702c1029 100644 --- a/net/psp/psp_nl.c +++ b/net/psp/psp_nl.c @@ -221,3 +221,43 @@ err_free_rsp: nlmsg_free(rsp); return err; } + +int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct genl_info ntf_info; + struct sk_buff *ntf, *rsp; + int err; + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + genl_info_init_ntf(&ntf_info, &psp_nl_family, PSP_CMD_KEY_ROTATE_NTF); + ntf = psp_nl_reply_new(&ntf_info); + if (!ntf) { + err = -ENOMEM; + goto err_free_rsp; + } + + if (nla_put_u32(rsp, PSP_A_DEV_ID, psd->id) || + nla_put_u32(ntf, PSP_A_DEV_ID, psd->id)) { + err = -EMSGSIZE; + goto err_free_ntf; + } + + err = psd->ops->key_rotate(psd, info->extack); + if (err) + goto err_free_ntf; + + nlmsg_end(ntf, (struct nlmsghdr *)ntf->data); + genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf, + 0, PSP_NLGRP_USE, GFP_KERNEL); + return psp_nl_reply_send(rsp, info); + +err_free_ntf: + nlmsg_free(ntf); +err_free_rsp: + nlmsg_free(rsp); + return err; +} -- cgit v1.2.3 From 6b46ca260e2290e3453d1355ab5b6d283d73d780 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:35 -0700 Subject: net: psp: add socket security association code Add the ability to install PSP Rx and Tx crypto keys on TCP connections. Netlink ops are provided for both operations. Rx side combines allocating a new Rx key and installing it on the socket. Theoretically these are separate actions, but in practice they will always be used one after the other. We can add distinct "alloc" and "install" ops later. 
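In userspace the two ops compose into a simple upgrade sequence: each peer
allocates its own Rx key, the peers swap SPI+key material, then each side
installs the peer's key for Tx. The sketch below compiles and dry-runs on
its own; psp_rx_assoc() and psp_tx_assoc() are hypothetical stand-ins for
issuing PSP_CMD_RX_ASSOC / PSP_CMD_TX_ASSOC, not an existing API:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct psp_key {
	uint32_t spi;
	uint8_t  key[32];	/* PSP_MAX_KEY; real size depends on version */
};

/* Hypothetical wrappers for the PSP_CMD_RX_ASSOC / PSP_CMD_TX_ASSOC
 * genetlink requests, stubbed out so the sequence can be compiled
 * and read on its own.
 */
static int psp_rx_assoc(int fd, uint32_t dev_id, uint32_t ver,
			struct psp_key *local)
{
	printf("RX_ASSOC dev %u ver %u sock %d: kernel allocates Rx SPI+key\n",
	       dev_id, ver, fd);
	local->spi = 0x80001234;	/* key phase bit set, key ID 0x1234 */
	memset(local->key, 0xaa, 16);	/* 16 bytes for hdr0-aes-gcm-128 */
	return 0;
}

static int psp_tx_assoc(int fd, uint32_t dev_id, uint32_t ver,
			const struct psp_key *peer)
{
	printf("TX_ASSOC dev %u ver %u sock %d: install peer SPI %#x for Tx\n",
	       dev_id, ver, fd, peer->spi);
	return 0;
}

int main(void)
{
	int tcp_fd = 3;			/* stand-in for a connected TCP socket */
	uint32_t dev_id = 1, ver = 0;	/* PSP_VERSION_HDR0_AES_GCM_128 */
	struct psp_key local, peer;

	/* 1. Each side binds a freshly allocated Rx key to its socket. */
	psp_rx_assoc(tcp_fd, dev_id, ver, &local);

	/* 2. Peers swap SPI+key over the still-cleartext TCP stream (or any
	 *    other channel); each side must transmit with the key the other
	 *    side is prepared to receive.
	 */
	peer = local;		/* pretend the peer sent us its Rx key */

	/* 3. Installing the peer's key as the Tx association switches the
	 *    socket over to sending PSP-encapsulated segments.
	 */
	psp_tx_assoc(tcp_fd, dev_id, ver, &peer);
	return 0;
}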
Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Co-developed-by: Daniel Zahka Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-9-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/psp.yaml | 70 +++++++++ include/net/psp/functions.h | 114 +++++++++++++-- include/net/psp/types.h | 57 ++++++++ include/uapi/linux/psp.h | 21 +++ net/psp/Kconfig | 1 + net/psp/Makefile | 2 +- net/psp/psp-nl-gen.c | 39 +++++ net/psp/psp-nl-gen.h | 7 + net/psp/psp.h | 22 +++ net/psp/psp_main.c | 26 +++- net/psp/psp_nl.c | 232 +++++++++++++++++++++++++++++ net/psp/psp_sock.c | 274 +++++++++++++++++++++++++++++++++++ 12 files changed, 854 insertions(+), 11 deletions(-) create mode 100644 net/psp/psp_sock.c (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml index 054cc02b65ad..944429e5c9a8 100644 --- a/Documentation/netlink/specs/psp.yaml +++ b/Documentation/netlink/specs/psp.yaml @@ -38,6 +38,44 @@ attribute-sets: type: u32 enum: version enum-as-flags: true + - + name: assoc + attributes: + - + name: dev-id + doc: PSP device ID. + type: u32 + checks: + min: 1 + - + name: version + doc: | + PSP versions (AEAD and protocol version) used by this association, + dictates the size of the key. + type: u32 + enum: version + - + name: rx-key + type: nest + nested-attributes: keys + - + name: tx-key + type: nest + nested-attributes: keys + - + name: sock-fd + doc: Sockets which should be bound to the association immediately. + type: u32 + - + name: keys + attributes: + - + name: key + type: binary + - + name: spi + doc: Security Parameters Index (SPI) of the association. + type: u32 operations: list: @@ -107,6 +145,38 @@ operations: notify: key-rotate mcgrp: use + - + name: rx-assoc + doc: Allocate a new Rx key + SPI pair, associate it with a socket. + attribute-set: assoc + do: + request: + attributes: + - dev-id + - version + - sock-fd + reply: + attributes: + - dev-id + - rx-key + pre: psp-assoc-device-get-locked + post: psp-device-unlock + - + name: tx-assoc + doc: Add a PSP Tx association. 
+ attribute-set: assoc + do: + request: + attributes: + - dev-id + - version + - tx-key + - sock-fd + reply: + attributes: [] + pre: psp-assoc-device-get-locked + post: psp-device-unlock + mcast-groups: list: - diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index 1ccc5fc238b8..0d7141230f47 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -4,7 +4,9 @@ #define __NET_PSP_HELPERS_H #include +#include #include +#include #include struct inet_timewait_sock; @@ -16,41 +18,130 @@ psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, void psp_dev_unregister(struct psp_dev *psd); /* Kernel-facing API */ +void psp_assoc_put(struct psp_assoc *pas); + +static inline void *psp_assoc_drv_data(struct psp_assoc *pas) +{ + return pas->drv_data; +} + #if IS_ENABLED(CONFIG_INET_PSP) -static inline void psp_sk_assoc_free(struct sock *sk) { } -static inline void -psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) { } -static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } -static inline void -psp_reply_set_decrypted(struct sk_buff *skb) { } +unsigned int psp_key_size(u32 version); +void psp_sk_assoc_free(struct sock *sk); +void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk); +void psp_twsk_assoc_free(struct inet_timewait_sock *tw); +void psp_reply_set_decrypted(struct sk_buff *skb); + +static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) +{ + return rcu_dereference_check(sk->psp_assoc, lockdep_sock_is_held(sk)); +} static inline void psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) { + struct psp_assoc *pas; + + pas = psp_sk_assoc(sk); + if (pas && pas->tx.spi) + skb->decrypted = 1; } static inline unsigned long __psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two, unsigned long diffs) { + struct psp_skb_ext *a, *b; + + a = skb_ext_find(one, SKB_EXT_PSP); + b = skb_ext_find(two, SKB_EXT_PSP); + + diffs |= (!!a) ^ (!!b); + if (!diffs && unlikely(a)) + diffs |= memcmp(a, b, sizeof(*a)); return diffs; } +static inline bool +psp_is_allowed_nondata(struct sk_buff *skb, struct psp_assoc *pas) +{ + bool fin = !!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN); + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + u32 seq = TCP_SKB_CB(skb)->seq; + bool pure_fin; + + pure_fin = fin && end_seq - seq == 1; + + return seq == end_seq || (pure_fin && seq == pas->upgrade_seq); +} + +static inline bool +psp_pse_matches_pas(struct psp_skb_ext *pse, struct psp_assoc *pas) +{ + return pse && pas->rx.spi == pse->spi && + pas->generation == pse->generation && + pas->version == pse->version && + pas->dev_id == pse->dev_id; +} + +static inline enum skb_drop_reason +__psp_sk_rx_policy_check(struct sk_buff *skb, struct psp_assoc *pas) +{ + struct psp_skb_ext *pse = skb_ext_find(skb, SKB_EXT_PSP); + + if (!pas) + return pse ? 
SKB_DROP_REASON_PSP_INPUT : 0; + + if (likely(psp_pse_matches_pas(pse, pas))) { + if (unlikely(!pas->peer_tx)) + pas->peer_tx = 1; + + return 0; + } + + if (!pse) { + if (!pas->tx.spi || + (!pas->peer_tx && psp_is_allowed_nondata(skb, pas))) + return 0; + } + + return SKB_DROP_REASON_PSP_INPUT; +} + static inline enum skb_drop_reason psp_sk_rx_policy_check(struct sock *sk, struct sk_buff *skb) { - return 0; + return __psp_sk_rx_policy_check(skb, psp_sk_assoc(sk)); } static inline enum skb_drop_reason psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) { - return 0; + return __psp_sk_rx_policy_check(skb, rcu_dereference(tw->psp_assoc)); +} + +static inline struct psp_assoc *psp_sk_get_assoc_rcu(struct sock *sk) +{ + struct inet_timewait_sock *tw; + struct psp_assoc *pas; + int state; + + state = 1 << READ_ONCE(sk->sk_state); + if (!sk_is_inet(sk) || state & TCPF_NEW_SYN_RECV) + return NULL; + + tw = inet_twsk(sk); + pas = state & TCPF_TIME_WAIT ? rcu_dereference(tw->psp_assoc) : + rcu_dereference(sk->psp_assoc); + return pas; } static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) { - return NULL; + if (!skb->decrypted || !skb->sk) + return NULL; + + return psp_sk_get_assoc_rcu(skb->sk); } #else static inline void psp_sk_assoc_free(struct sock *sk) { } @@ -60,6 +151,11 @@ static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } static inline void psp_reply_set_decrypted(struct sk_buff *skb) { } +static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) +{ + return NULL; +} + static inline void psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) { } diff --git a/include/net/psp/types.h b/include/net/psp/types.h index 66327fa80c92..b0e32e7165a3 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -51,6 +51,7 @@ struct psp_dev_config { * @refcnt: reference count for the instance * @id: instance id * @config: current device configuration + * @active_assocs: list of registered associations * * @rcu: RCU head for freeing the structure */ @@ -68,6 +69,8 @@ struct psp_dev { struct psp_dev_config config; + struct list_head active_assocs; + struct rcu_head rcu; }; @@ -80,6 +83,12 @@ struct psp_dev_caps { * Set this field to 0 to indicate PSP is not supported at all. */ u32 versions; + + /** + * @assoc_drv_spc: size of driver-specific state in Tx assoc + * Determines the size of struct psp_assoc::drv_spc + */ + u32 assoc_drv_spc; }; #define PSP_MAX_KEY 32 @@ -91,6 +100,32 @@ struct psp_skb_ext { u8 version; }; +struct psp_key_parsed { + __be32 spi; + u8 key[PSP_MAX_KEY]; +}; + +struct psp_assoc { + struct psp_dev *psd; + + u16 dev_id; + u8 generation; + u8 version; + u8 peer_tx; + + u32 upgrade_seq; + + struct psp_key_parsed tx; + struct psp_key_parsed rx; + + refcount_t refcnt; + struct rcu_head rcu; + struct work_struct work; + struct list_head assocs_list; + + u8 drv_data[] __aligned(8); +}; + /** * struct psp_dev_ops - netdev driver facing PSP callbacks */ @@ -107,6 +142,28 @@ struct psp_dev_ops { * @key_rotate: rotate the device key */ int (*key_rotate)(struct psp_dev *psd, struct netlink_ext_ack *extack); + + /** + * @rx_spi_alloc: allocate an Rx SPI+key pair + * Allocate an Rx SPI and resulting derived key. + * This key should remain valid until key rotation. + */ + int (*rx_spi_alloc)(struct psp_dev *psd, u32 version, + struct psp_key_parsed *assoc, + struct netlink_ext_ack *extack); + + /** + * @tx_key_add: add a Tx key to the device + * Install an association in the device. 
Core will allocate space + * for the driver to use at drv_data. + */ + int (*tx_key_add)(struct psp_dev *psd, struct psp_assoc *pas, + struct netlink_ext_ack *extack); + /** + * @tx_key_del: remove a Tx key from the device + * Remove an association from the device. + */ + void (*tx_key_del)(struct psp_dev *psd, struct psp_assoc *pas); }; #endif /* __NET_PSP_H */ diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h index cbfbf3f0f364..607c42c39ba5 100644 --- a/include/uapi/linux/psp.h +++ b/include/uapi/linux/psp.h @@ -26,6 +26,25 @@ enum { PSP_A_DEV_MAX = (__PSP_A_DEV_MAX - 1) }; +enum { + PSP_A_ASSOC_DEV_ID = 1, + PSP_A_ASSOC_VERSION, + PSP_A_ASSOC_RX_KEY, + PSP_A_ASSOC_TX_KEY, + PSP_A_ASSOC_SOCK_FD, + + __PSP_A_ASSOC_MAX, + PSP_A_ASSOC_MAX = (__PSP_A_ASSOC_MAX - 1) +}; + +enum { + PSP_A_KEYS_KEY = 1, + PSP_A_KEYS_SPI, + + __PSP_A_KEYS_MAX, + PSP_A_KEYS_MAX = (__PSP_A_KEYS_MAX - 1) +}; + enum { PSP_CMD_DEV_GET = 1, PSP_CMD_DEV_ADD_NTF, @@ -34,6 +53,8 @@ enum { PSP_CMD_DEV_CHANGE_NTF, PSP_CMD_KEY_ROTATE, PSP_CMD_KEY_ROTATE_NTF, + PSP_CMD_RX_ASSOC, + PSP_CMD_TX_ASSOC, __PSP_CMD_MAX, PSP_CMD_MAX = (__PSP_CMD_MAX - 1) diff --git a/net/psp/Kconfig b/net/psp/Kconfig index 5e3908a40945..a7d24691a7e1 100644 --- a/net/psp/Kconfig +++ b/net/psp/Kconfig @@ -6,6 +6,7 @@ config INET_PSP bool "PSP Security Protocol support" depends on INET select SKB_DECRYPTED + select SOCK_VALIDATE_XMIT help Enable kernel support for the PSP protocol. For more information see: diff --git a/net/psp/Makefile b/net/psp/Makefile index 41b51d06e560..eb5ff3c5bfb2 100644 --- a/net/psp/Makefile +++ b/net/psp/Makefile @@ -2,4 +2,4 @@ obj-$(CONFIG_INET_PSP) += psp.o -psp-y := psp_main.o psp_nl.o psp-nl-gen.o +psp-y := psp_main.o psp_nl.o psp_sock.o psp-nl-gen.o diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c index 7f49577ac72f..9fdd6f831803 100644 --- a/net/psp/psp-nl-gen.c +++ b/net/psp/psp-nl-gen.c @@ -10,6 +10,12 @@ #include +/* Common nested types */ +const struct nla_policy psp_keys_nl_policy[PSP_A_KEYS_SPI + 1] = { + [PSP_A_KEYS_KEY] = { .type = NLA_BINARY, }, + [PSP_A_KEYS_SPI] = { .type = NLA_U32, }, +}; + /* PSP_CMD_DEV_GET - do */ static const struct nla_policy psp_dev_get_nl_policy[PSP_A_DEV_ID + 1] = { [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), @@ -26,6 +32,21 @@ static const struct nla_policy psp_key_rotate_nl_policy[PSP_A_DEV_ID + 1] = { [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), }; +/* PSP_CMD_RX_ASSOC - do */ +static const struct nla_policy psp_rx_assoc_nl_policy[PSP_A_ASSOC_SOCK_FD + 1] = { + [PSP_A_ASSOC_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [PSP_A_ASSOC_VERSION] = NLA_POLICY_MAX(NLA_U32, 3), + [PSP_A_ASSOC_SOCK_FD] = { .type = NLA_U32, }, +}; + +/* PSP_CMD_TX_ASSOC - do */ +static const struct nla_policy psp_tx_assoc_nl_policy[PSP_A_ASSOC_SOCK_FD + 1] = { + [PSP_A_ASSOC_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [PSP_A_ASSOC_VERSION] = NLA_POLICY_MAX(NLA_U32, 3), + [PSP_A_ASSOC_TX_KEY] = NLA_POLICY_NESTED(psp_keys_nl_policy), + [PSP_A_ASSOC_SOCK_FD] = { .type = NLA_U32, }, +}; + /* Ops table for psp */ static const struct genl_split_ops psp_nl_ops[] = { { @@ -60,6 +81,24 @@ static const struct genl_split_ops psp_nl_ops[] = { .maxattr = PSP_A_DEV_ID, .flags = GENL_CMD_CAP_DO, }, + { + .cmd = PSP_CMD_RX_ASSOC, + .pre_doit = psp_assoc_device_get_locked, + .doit = psp_nl_rx_assoc_doit, + .post_doit = psp_device_unlock, + .policy = psp_rx_assoc_nl_policy, + .maxattr = PSP_A_ASSOC_SOCK_FD, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = PSP_CMD_TX_ASSOC, + .pre_doit = 
psp_assoc_device_get_locked, + .doit = psp_nl_tx_assoc_doit, + .post_doit = psp_device_unlock, + .policy = psp_tx_assoc_nl_policy, + .maxattr = PSP_A_ASSOC_SOCK_FD, + .flags = GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group psp_nl_mcgrps[] = { diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h index 00a2d4ec59e4..25268ed11fb5 100644 --- a/net/psp/psp-nl-gen.h +++ b/net/psp/psp-nl-gen.h @@ -11,8 +11,13 @@ #include +/* Common nested types */ +extern const struct nla_policy psp_keys_nl_policy[PSP_A_KEYS_SPI + 1]; + int psp_device_get_locked(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); +int psp_assoc_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); void psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); @@ -21,6 +26,8 @@ int psp_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); int psp_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info); int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_rx_assoc_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_tx_assoc_doit(struct sk_buff *skb, struct genl_info *info); enum { PSP_NLGRP_MGMT, diff --git a/net/psp/psp.h b/net/psp/psp.h index 94d0cc31a61f..defd3e3fd5e7 100644 --- a/net/psp/psp.h +++ b/net/psp/psp.h @@ -4,6 +4,7 @@ #define __PSP_PSP_H #include +#include #include #include #include @@ -17,15 +18,36 @@ int psp_dev_check_access(struct psp_dev *psd, struct net *net); void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd); +struct psp_assoc *psp_assoc_create(struct psp_dev *psd); +struct psp_dev *psp_dev_get_for_sock(struct sock *sk); +void psp_dev_tx_key_del(struct psp_dev *psd, struct psp_assoc *pas); +int psp_sock_assoc_set_rx(struct sock *sk, struct psp_assoc *pas, + struct psp_key_parsed *key, + struct netlink_ext_ack *extack); +int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, + u32 version, struct psp_key_parsed *key, + struct netlink_ext_ack *extack); + static inline void psp_dev_get(struct psp_dev *psd) { refcount_inc(&psd->refcnt); } +static inline bool psp_dev_tryget(struct psp_dev *psd) +{ + return refcount_inc_not_zero(&psd->refcnt); +} + static inline void psp_dev_put(struct psp_dev *psd) { if (refcount_dec_and_test(&psd->refcnt)) psp_dev_destroy(psd); } +static inline bool psp_dev_is_registered(struct psp_dev *psd) +{ + lockdep_assert_held(&psd->lock); + return !!psd->ops; +} + #endif /* __PSP_PSP_H */ diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index f60155493afc..a1ae3c8920c3 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -55,7 +55,10 @@ psp_dev_create(struct net_device *netdev, if (WARN_ON(!psd_caps->versions || !psd_ops->set_config || - !psd_ops->key_rotate)) + !psd_ops->key_rotate || + !psd_ops->rx_spi_alloc || + !psd_ops->tx_key_add || + !psd_ops->tx_key_del)) return ERR_PTR(-EINVAL); psd = kzalloc(sizeof(*psd), GFP_KERNEL); @@ -68,6 +71,7 @@ psp_dev_create(struct net_device *netdev, psd->drv_priv = priv_ptr; mutex_init(&psd->lock); + INIT_LIST_HEAD(&psd->active_assocs); refcount_set(&psd->refcnt, 1); mutex_lock(&psp_devs_lock); @@ -107,6 +111,8 @@ void psp_dev_destroy(struct psp_dev *psd) */ void psp_dev_unregister(struct psp_dev *psd) { + struct psp_assoc *pas, *next; + mutex_lock(&psp_devs_lock); mutex_lock(&psd->lock); @@ -119,6 +125,9 @@ void psp_dev_unregister(struct psp_dev *psd) 
xa_store(&psp_devs, psd->id, NULL, GFP_KERNEL); mutex_unlock(&psp_devs_lock); + list_for_each_entry_safe(pas, next, &psd->active_assocs, assocs_list) + psp_dev_tx_key_del(psd, pas); + rcu_assign_pointer(psd->main_netdev->psp_dev, NULL); psd->ops = NULL; @@ -130,6 +139,21 @@ void psp_dev_unregister(struct psp_dev *psd) } EXPORT_SYMBOL(psp_dev_unregister); +unsigned int psp_key_size(u32 version) +{ + switch (version) { + case PSP_VERSION_HDR0_AES_GCM_128: + case PSP_VERSION_HDR0_AES_GMAC_128: + return 16; + case PSP_VERSION_HDR0_AES_GCM_256: + case PSP_VERSION_HDR0_AES_GMAC_256: + return 32; + default: + return 0; + } +} +EXPORT_SYMBOL(psp_key_size); + static int __init psp_init(void) { mutex_init(&psp_devs_lock); diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c index 75f2702c1029..1b1d08fce637 100644 --- a/net/psp/psp_nl.c +++ b/net/psp/psp_nl.c @@ -79,9 +79,12 @@ void psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { + struct socket *socket = info->user_ptr[1]; struct psp_dev *psd = info->user_ptr[0]; mutex_unlock(&psd->lock); + if (socket) + sockfd_put(socket); } static int @@ -261,3 +264,232 @@ err_free_rsp: nlmsg_free(rsp); return err; } + +/* Key etc. */ + +int psp_assoc_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + struct socket *socket; + struct psp_dev *psd; + struct nlattr *id; + int fd, err; + + if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_SOCK_FD)) + return -EINVAL; + + fd = nla_get_u32(info->attrs[PSP_A_ASSOC_SOCK_FD]); + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + if (!sk_is_tcp(socket->sk)) { + NL_SET_ERR_MSG_ATTR(info->extack, + info->attrs[PSP_A_ASSOC_SOCK_FD], + "Unsupported socket family and type"); + err = -EOPNOTSUPP; + goto err_sock_put; + } + + psd = psp_dev_get_for_sock(socket->sk); + if (psd) { + err = psp_dev_check_access(psd, genl_info_net(info)); + if (err) { + psp_dev_put(psd); + psd = NULL; + } + } + + if (!psd && GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_DEV_ID)) { + err = -EINVAL; + goto err_sock_put; + } + + id = info->attrs[PSP_A_ASSOC_DEV_ID]; + if (psd) { + mutex_lock(&psd->lock); + if (id && psd->id != nla_get_u32(id)) { + mutex_unlock(&psd->lock); + NL_SET_ERR_MSG_ATTR(info->extack, id, + "Device id vs socket mismatch"); + err = -EINVAL; + goto err_psd_put; + } + + psp_dev_put(psd); + } else { + psd = psp_device_get_and_lock(genl_info_net(info), id); + if (IS_ERR(psd)) { + err = PTR_ERR(psd); + goto err_sock_put; + } + } + + info->user_ptr[0] = psd; + info->user_ptr[1] = socket; + + return 0; + +err_psd_put: + psp_dev_put(psd); +err_sock_put: + sockfd_put(socket); + return err; +} + +static int +psp_nl_parse_key(struct genl_info *info, u32 attr, struct psp_key_parsed *key, + unsigned int key_sz) +{ + struct nlattr *nest = info->attrs[attr]; + struct nlattr *tb[PSP_A_KEYS_SPI + 1]; + u32 spi; + int err; + + err = nla_parse_nested(tb, ARRAY_SIZE(tb) - 1, nest, + psp_keys_nl_policy, info->extack); + if (err) + return err; + + if (NL_REQ_ATTR_CHECK(info->extack, nest, tb, PSP_A_KEYS_KEY) || + NL_REQ_ATTR_CHECK(info->extack, nest, tb, PSP_A_KEYS_SPI)) + return -EINVAL; + + if (nla_len(tb[PSP_A_KEYS_KEY]) != key_sz) { + NL_SET_ERR_MSG_ATTR(info->extack, tb[PSP_A_KEYS_KEY], + "incorrect key length"); + return -EINVAL; + } + + spi = nla_get_u32(tb[PSP_A_KEYS_SPI]); + if (!(spi & PSP_SPI_KEY_ID)) { + NL_SET_ERR_MSG_ATTR(info->extack, tb[PSP_A_KEYS_KEY], + "invalid SPI: lower 31b must be non-zero"); + return -EINVAL; + } + + key->spi = 
cpu_to_be32(spi); + memcpy(key->key, nla_data(tb[PSP_A_KEYS_KEY]), key_sz); + + return 0; +} + +static int +psp_nl_put_key(struct sk_buff *skb, u32 attr, u32 version, + struct psp_key_parsed *key) +{ + int key_sz = psp_key_size(version); + void *nest; + + nest = nla_nest_start(skb, attr); + + if (nla_put_u32(skb, PSP_A_KEYS_SPI, be32_to_cpu(key->spi)) || + nla_put(skb, PSP_A_KEYS_KEY, key_sz, key->key)) { + nla_nest_cancel(skb, nest); + return -EMSGSIZE; + } + + nla_nest_end(skb, nest); + + return 0; +} + +int psp_nl_rx_assoc_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct socket *socket = info->user_ptr[1]; + struct psp_dev *psd = info->user_ptr[0]; + struct psp_key_parsed key; + struct psp_assoc *pas; + struct sk_buff *rsp; + u32 version; + int err; + + if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_VERSION)) + return -EINVAL; + + version = nla_get_u32(info->attrs[PSP_A_ASSOC_VERSION]); + if (!(psd->caps->versions & (1 << version))) { + NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_VERSION]); + return -EOPNOTSUPP; + } + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + pas = psp_assoc_create(psd); + if (!pas) { + err = -ENOMEM; + goto err_free_rsp; + } + pas->version = version; + + err = psd->ops->rx_spi_alloc(psd, version, &key, info->extack); + if (err) + goto err_free_pas; + + if (nla_put_u32(rsp, PSP_A_ASSOC_DEV_ID, psd->id) || + psp_nl_put_key(rsp, PSP_A_ASSOC_RX_KEY, version, &key)) { + err = -EMSGSIZE; + goto err_free_pas; + } + + err = psp_sock_assoc_set_rx(socket->sk, pas, &key, info->extack); + if (err) { + NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_SOCK_FD]); + goto err_free_pas; + } + psp_assoc_put(pas); + + return psp_nl_reply_send(rsp, info); + +err_free_pas: + psp_assoc_put(pas); +err_free_rsp: + nlmsg_free(rsp); + return err; +} + +int psp_nl_tx_assoc_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct socket *socket = info->user_ptr[1]; + struct psp_dev *psd = info->user_ptr[0]; + struct psp_key_parsed key; + struct sk_buff *rsp; + unsigned int key_sz; + u32 version; + int err; + + if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_VERSION) || + GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_TX_KEY)) + return -EINVAL; + + version = nla_get_u32(info->attrs[PSP_A_ASSOC_VERSION]); + if (!(psd->caps->versions & (1 << version))) { + NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_VERSION]); + return -EOPNOTSUPP; + } + + key_sz = psp_key_size(version); + if (!key_sz) + return -EINVAL; + + err = psp_nl_parse_key(info, PSP_A_ASSOC_TX_KEY, &key, key_sz); + if (err < 0) + return err; + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + err = psp_sock_assoc_set_tx(socket->sk, psd, version, &key, + info->extack); + if (err) + goto err_free_msg; + + return psp_nl_reply_send(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} diff --git a/net/psp/psp_sock.c b/net/psp/psp_sock.c new file mode 100644 index 000000000000..8ebccee94593 --- /dev/null +++ b/net/psp/psp_sock.c @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include + +#include +#include +#include "psp.h" + +struct psp_dev *psp_dev_get_for_sock(struct sock *sk) +{ + struct dst_entry *dst; + struct psp_dev *psd; + + dst = sk_dst_get(sk); + if (!dst) + return NULL; + + rcu_read_lock(); + psd = rcu_dereference(dst->dev->psp_dev); + if (psd && !psp_dev_tryget(psd)) + psd = NULL; + rcu_read_unlock(); + + dst_release(dst); + + return psd; +} + +static struct sk_buff * +psp_validate_xmit(struct sock *sk, struct 
net_device *dev, struct sk_buff *skb) +{ + struct psp_assoc *pas; + bool good; + + rcu_read_lock(); + pas = psp_skb_get_assoc_rcu(skb); + good = !pas || rcu_access_pointer(dev->psp_dev) == pas->psd; + rcu_read_unlock(); + if (!good) { + kfree_skb_reason(skb, SKB_DROP_REASON_PSP_OUTPUT); + return NULL; + } + + return skb; +} + +struct psp_assoc *psp_assoc_create(struct psp_dev *psd) +{ + struct psp_assoc *pas; + + lockdep_assert_held(&psd->lock); + + pas = kzalloc(struct_size(pas, drv_data, psd->caps->assoc_drv_spc), + GFP_KERNEL_ACCOUNT); + if (!pas) + return NULL; + + pas->psd = psd; + pas->dev_id = psd->id; + psp_dev_get(psd); + refcount_set(&pas->refcnt, 1); + + list_add_tail(&pas->assocs_list, &psd->active_assocs); + + return pas; +} + +static struct psp_assoc *psp_assoc_dummy(struct psp_assoc *pas) +{ + struct psp_dev *psd = pas->psd; + size_t sz; + + lockdep_assert_held(&psd->lock); + + sz = struct_size(pas, drv_data, psd->caps->assoc_drv_spc); + return kmemdup(pas, sz, GFP_KERNEL); +} + +static int psp_dev_tx_key_add(struct psp_dev *psd, struct psp_assoc *pas, + struct netlink_ext_ack *extack) +{ + return psd->ops->tx_key_add(psd, pas, extack); +} + +void psp_dev_tx_key_del(struct psp_dev *psd, struct psp_assoc *pas) +{ + if (pas->tx.spi) + psd->ops->tx_key_del(psd, pas); + list_del(&pas->assocs_list); +} + +static void psp_assoc_free(struct work_struct *work) +{ + struct psp_assoc *pas = container_of(work, struct psp_assoc, work); + struct psp_dev *psd = pas->psd; + + mutex_lock(&psd->lock); + if (psd->ops) + psp_dev_tx_key_del(psd, pas); + mutex_unlock(&psd->lock); + psp_dev_put(psd); + kfree(pas); +} + +static void psp_assoc_free_queue(struct rcu_head *head) +{ + struct psp_assoc *pas = container_of(head, struct psp_assoc, rcu); + + INIT_WORK(&pas->work, psp_assoc_free); + schedule_work(&pas->work); +} + +/** + * psp_assoc_put() - release a reference on a PSP association + * @pas: association to release + */ +void psp_assoc_put(struct psp_assoc *pas) +{ + if (pas && refcount_dec_and_test(&pas->refcnt)) + call_rcu(&pas->rcu, psp_assoc_free_queue); +} + +void psp_sk_assoc_free(struct sock *sk) +{ + struct psp_assoc *pas = rcu_dereference_protected(sk->psp_assoc, 1); + + rcu_assign_pointer(sk->psp_assoc, NULL); + psp_assoc_put(pas); +} + +int psp_sock_assoc_set_rx(struct sock *sk, struct psp_assoc *pas, + struct psp_key_parsed *key, + struct netlink_ext_ack *extack) +{ + int err; + + memcpy(&pas->rx, key, sizeof(*key)); + + lock_sock(sk); + + if (psp_sk_assoc(sk)) { + NL_SET_ERR_MSG(extack, "Socket already has PSP state"); + err = -EBUSY; + goto exit_unlock; + } + + refcount_inc(&pas->refcnt); + rcu_assign_pointer(sk->psp_assoc, pas); + err = 0; + +exit_unlock: + release_sock(sk); + + return err; +} + +static int psp_sock_recv_queue_check(struct sock *sk, struct psp_assoc *pas) +{ + struct psp_skb_ext *pse; + struct sk_buff *skb; + + skb_rbtree_walk(skb, &tcp_sk(sk)->out_of_order_queue) { + pse = skb_ext_find(skb, SKB_EXT_PSP); + if (!psp_pse_matches_pas(pse, pas)) + return -EBUSY; + } + + skb_queue_walk(&sk->sk_receive_queue, skb) { + pse = skb_ext_find(skb, SKB_EXT_PSP); + if (!psp_pse_matches_pas(pse, pas)) + return -EBUSY; + } + return 0; +} + +int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, + u32 version, struct psp_key_parsed *key, + struct netlink_ext_ack *extack) +{ + struct psp_assoc *pas, *dummy; + int err; + + lock_sock(sk); + + pas = psp_sk_assoc(sk); + if (!pas) { + NL_SET_ERR_MSG(extack, "Socket has no Rx key"); + err = -EINVAL; + goto exit_unlock; + } 
+ if (pas->psd != psd) { + NL_SET_ERR_MSG(extack, "Rx key from different device"); + err = -EINVAL; + goto exit_unlock; + } + if (pas->version != version) { + NL_SET_ERR_MSG(extack, + "PSP version mismatch with existing state"); + err = -EINVAL; + goto exit_unlock; + } + if (pas->tx.spi) { + NL_SET_ERR_MSG(extack, "Tx key already set"); + err = -EBUSY; + goto exit_unlock; + } + + err = psp_sock_recv_queue_check(sk, pas); + if (err) { + NL_SET_ERR_MSG(extack, "Socket has incompatible segments already in the recv queue"); + goto exit_unlock; + } + + /* Pass a fake association to drivers to make sure they don't + * try to store pointers to it. For re-keying we'll need to + * re-allocate the assoc structures. + */ + dummy = psp_assoc_dummy(pas); + if (!dummy) { + err = -ENOMEM; + goto exit_unlock; + } + + memcpy(&dummy->tx, key, sizeof(*key)); + err = psp_dev_tx_key_add(psd, dummy, extack); + if (err) + goto exit_free_dummy; + + memcpy(pas->drv_data, dummy->drv_data, psd->caps->assoc_drv_spc); + memcpy(&pas->tx, key, sizeof(*key)); + + WRITE_ONCE(sk->sk_validate_xmit_skb, psp_validate_xmit); + tcp_write_collapse_fence(sk); + pas->upgrade_seq = tcp_sk(sk)->rcv_nxt; + +exit_free_dummy: + kfree(dummy); +exit_unlock: + release_sock(sk); + return err; +} + +void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) +{ + struct psp_assoc *pas = psp_sk_assoc(sk); + + if (pas) + refcount_inc(&pas->refcnt); + rcu_assign_pointer(tw->psp_assoc, pas); + tw->tw_validate_xmit_skb = psp_validate_xmit; +} + +void psp_twsk_assoc_free(struct inet_timewait_sock *tw) +{ + struct psp_assoc *pas = rcu_dereference_protected(tw->psp_assoc, 1); + + rcu_assign_pointer(tw->psp_assoc, NULL); + psp_assoc_put(pas); +} + +void psp_reply_set_decrypted(struct sk_buff *skb) +{ + struct psp_assoc *pas; + + rcu_read_lock(); + pas = psp_sk_get_assoc_rcu(skb->sk); + if (pas && pas->tx.spi) + skb->decrypted = 1; + rcu_read_unlock(); +} +EXPORT_IPV6_MOD_GPL(psp_reply_set_decrypted); -- cgit v1.2.3 From baefdbdf6812e120c9fba9cfb101d3656f478026 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 14 Sep 2025 23:51:31 +0200 Subject: bpf: Implement exclusive map creation Exclusive maps allow a map to be accessed only by a program with a matching hash, which is specified in the excl_prog_hash attr. For the signing use-case, this allows the trusted loader program to load the map and verify its integrity. Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250914215141.15144-3-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 6 ++++++ kernel/bpf/syscall.c | 31 +++++++++++++++++++++++++++---- kernel/bpf/verifier.c | 6 ++++++ tools/include/uapi/linux/bpf.h | 6 ++++++ 5 files changed, 46 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d75902074bd1..c6a6ee1b2938 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -329,6 +329,7 @@ struct bpf_map { atomic64_t sleepable_refcnt; s64 __percpu *elem_count; u64 cookie; /* write-once */ + char *excl_prog_sha; }; static inline const char *btf_field_type_name(enum btf_field_type type) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 233de8677382..57687b2e1c47 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1522,6 +1522,12 @@ union bpf_attr { * If provided, map_flags should have BPF_F_TOKEN_FD flag set.
*/ __s32 map_token_fd; + + /* Hash of the program that has exclusive access to the map. + */ + __aligned_u64 excl_prog_hash; + /* Size of the passed excl_prog_hash. */ + __u32 excl_prog_hash_size; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM and BPF_MAP_FREEZE commands */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3f178a0f8eb1..c8ef91acfe98 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -860,6 +860,7 @@ static void bpf_map_free(struct bpf_map *map) * the free of values or special fields allocated from bpf memory * allocator. */ + kfree(map->excl_prog_sha); migrate_disable(); map->ops->map_free(map); migrate_enable(); @@ -1338,9 +1339,9 @@ static bool bpf_net_capable(void) return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); } -#define BPF_MAP_CREATE_LAST_FIELD map_token_fd +#define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size /* called via syscall */ -static int map_create(union bpf_attr *attr, bool kernel) +static int map_create(union bpf_attr *attr, bpfptr_t uattr) { const struct bpf_map_ops *ops; struct bpf_token *token = NULL; @@ -1534,7 +1535,29 @@ static int map_create(union bpf_attr *attr, bool kernel) attr->btf_vmlinux_value_type_id; } - err = security_bpf_map_create(map, attr, token, kernel); + if (attr->excl_prog_hash) { + bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel); + + if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) { + err = -EINVAL; + goto free_map; + } + + map->excl_prog_sha = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); + if (!map->excl_prog_sha) { + err = -ENOMEM; + goto free_map; + } + + if (copy_from_bpfptr(map->excl_prog_sha, uprog_hash, SHA256_DIGEST_SIZE)) { + err = -EFAULT; + goto free_map; + } + } else if (attr->excl_prog_hash_size) { + return -EINVAL; + } + + err = security_bpf_map_create(map, attr, token, uattr.is_kernel); if (err) goto free_map_sec; @@ -6008,7 +6031,7 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) switch (cmd) { case BPF_MAP_CREATE: - err = map_create(&attr, uattr.is_kernel); + err = map_create(&attr, uattr); break; case BPF_MAP_LOOKUP_ELEM: err = map_lookup_elem(&attr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6625570ac23d..aef6b266f08d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20407,6 +20407,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, { enum bpf_prog_type prog_type = resolve_prog_type(prog); + if (map->excl_prog_sha && + memcmp(map->excl_prog_sha, prog->digest, SHA256_DIGEST_SIZE)) { + verbose(env, "program's hash doesn't match map's excl_prog_hash\n"); + return -EACCES; + } + if (btf_record_has_field(map->record, BPF_LIST_HEAD) || btf_record_has_field(map->record, BPF_RB_ROOT)) { if (is_tracing_prog_type(prog_type)) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 233de8677382..57687b2e1c47 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1522,6 +1522,12 @@ union bpf_attr { * If provided, map_flags should have BPF_F_TOKEN_FD flag set. */ __s32 map_token_fd; + + /* Hash of the program that has exclusive access to the map. + */ + __aligned_u64 excl_prog_hash; + /* Size of the passed excl_prog_hash. 
*/ + __u32 excl_prog_hash_size; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM and BPF_MAP_FREEZE commands */ -- cgit v1.2.3 From ea2e6467ac36bf3d785defc89e58269b15d182f7 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 14 Sep 2025 23:51:35 +0200 Subject: bpf: Return hashes of maps in BPF_OBJ_GET_INFO_BY_FD Currently only array maps are supported, but the implementation can be extended for other maps and objects. The hash is memoized only for exclusive and frozen maps as their content is stable until the exclusive program modifies the map. This is required for BPF signing, enabling a trusted loader program to verify a map's integrity. The loader retrieves the map's runtime hash from the kernel and compares it against an expected hash computed at build time. Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250914215141.15144-7-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 2 ++ kernel/bpf/arraymap.c | 13 ++++++++++++ kernel/bpf/syscall.c | 23 ++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 2 ++ .../testing/selftests/bpf/progs/verifier_map_ptr.c | 7 +++++-- 6 files changed, 48 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c6a6ee1b2938..e0c2c78a5faa 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -110,6 +111,7 @@ struct bpf_map_ops { long (*map_pop_elem)(struct bpf_map *map, void *value); long (*map_peek_elem)(struct bpf_map *map, void *value); void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu); + int (*map_get_hash)(struct bpf_map *map, u32 hash_buf_size, void *hash_buf); /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, @@ -289,6 +291,7 @@ struct bpf_map_owner { }; struct bpf_map { + u8 sha[SHA256_DIGEST_SIZE]; const struct bpf_map_ops *ops; struct bpf_map *inner_map_meta; #ifdef CONFIG_SECURITY diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 57687b2e1c47..0987b52d5648 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6672,6 +6672,8 @@ struct bpf_map_info { __u32 btf_value_type_id; __u32 btf_vmlinux_id; __u64 map_extra; + __aligned_u64 hash; + __u32 hash_size; } __attribute__((aligned(8))); struct bpf_btf_info { diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 3d080916faf9..26d5dda989bc 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "map_in_map.h" @@ -174,6 +175,17 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + (u64)array->elem_size * (index & array->index_mask); } +static int array_map_get_hash(struct bpf_map *map, u32 hash_buf_size, + void *hash_buf) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + + sha256(array->value, (u64)array->elem_size * array->map.max_entries, + hash_buf); + memcpy(array->map.sha, hash_buf, sizeof(array->map.sha)); + return 0; +} + static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off) { @@ -800,6 +812,7 @@ const struct bpf_map_ops array_map_ops = { .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, + .map_get_hash = &array_map_get_hash, }; const struct bpf_map_ops percpu_array_map_ops = { diff --git a/kernel/bpf/syscall.c 
b/kernel/bpf/syscall.c index c8ef91acfe98..cf7173b1bb83 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ +#include #include #include #include @@ -5184,6 +5185,9 @@ static int bpf_map_get_info_by_fd(struct file *file, info_len = min_t(u32, sizeof(info), info_len); memset(&info, 0, sizeof(info)); + if (copy_from_user(&info, uinfo, info_len)) + return -EFAULT; + info.type = map->map_type; info.id = map->id; info.key_size = map->key_size; @@ -5208,6 +5212,25 @@ static int bpf_map_get_info_by_fd(struct file *file, return err; } + if (info.hash) { + char __user *uhash = u64_to_user_ptr(info.hash); + + if (!map->ops->map_get_hash) + return -EINVAL; + + if (info.hash_size != SHA256_DIGEST_SIZE) + return -EINVAL; + + err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha); + if (err != 0) + return err; + + if (copy_to_user(uhash, map->sha, SHA256_DIGEST_SIZE) != 0) + return -EFAULT; + } else if (info.hash_size) { + return -EINVAL; + } + if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 57687b2e1c47..0987b52d5648 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6672,6 +6672,8 @@ struct bpf_map_info { __u32 btf_value_type_id; __u32 btf_vmlinux_id; __u64 map_extra; + __aligned_u64 hash; + __u32 hash_size; } __attribute__((aligned(8))); struct bpf_btf_info { diff --git a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c index 11a079145966..e2767d27d8aa 100644 --- a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c +++ b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c @@ -70,10 +70,13 @@ __naked void bpf_map_ptr_write_rejected(void) : __clobber_all); } +/* The first element of struct bpf_map is a SHA256 hash of 32 bytes; accesses + * into this array are valid. The ops field is now at offset 32. + */ SEC("socket") __description("bpf_map_ptr: read non-existent field rejected") __failure -__msg("cannot access ptr member ops with moff 0 in struct bpf_map with off 1 size 4") +__msg("cannot access ptr member ops with moff 32 in struct bpf_map with off 33 size 4") __failure_unpriv __msg_unpriv("access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN") __flag(BPF_F_ANY_ALIGNMENT) @@ -82,7 +85,7 @@ __naked void read_non_existent_field_rejected(void) asm volatile (" \ r6 = 0; \ r1 = %[map_array_48b] ll; \ - r6 = *(u32*)(r1 + 1); \ + r6 = *(u32*)(r1 + 33); \ r0 = 1; \ exit; \ " : -- cgit v1.2.3 From eafedbc7c050c44744fbdf80bdf3315e860b7513 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Fri, 19 Sep 2025 06:42:07 +0000 Subject: rust_binder: add Rust Binder driver We're generally not proponents of rewrites (nasty uncomfortable things that make you late for dinner!). So why rewrite Binder?

Binder has been evolving over the past 15+ years to meet the evolving needs of Android. Its responsibilities, expectations, and complexity have grown considerably during that time. While we expect Binder to continue to evolve along with Android, there are a number of factors that currently constrain our ability to develop/maintain it. Briefly, those are:

1. Complexity: Binder is at the intersection of everything in Android and fulfills many responsibilities beyond IPC.
It has become many things to many people, and due to its many features and their interactions with each other, its complexity is quite high. In just 6kLOC it must deliver transactions to the right threads. It must correctly parse and translate the contents of transactions, which can contain several objects of different types (e.g., pointers, fds) that can interact with each other. It controls the size of thread pools in userspace, and ensures that transactions are assigned to threads in ways that avoid deadlocks where the threadpool has run out of threads. It must track refcounts of objects that are shared by several processes by forwarding refcount changes between the processes correctly. It must handle numerous error scenarios, and it combines/nests 13 different locks, 7 reference counters, and atomic variables. Finally, it must do all of this as fast and efficiently as possible. Minor performance regressions can cause a noticeably degraded user experience.

2. Things to improve: Thousand-line functions [1], error-prone error handling [2], and confusing structure can occur as a code base grows organically. After more than a decade of development, this codebase could use an overhaul.

[1]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/android/binder.c?h=v6.5#n2896
[2]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/android/binder.c?h=v6.5#n3658

3. Security critical: Binder is a critical part of Android's sandboxing strategy. Even Android's most de-privileged sandboxes (e.g. the Chrome renderer, or SW Codec) have direct access to Binder. More than just about any other component, it's important that Binder provide robust security, and itself be robust against security vulnerabilities.

It's #1 (high complexity) that has made continuing to evolve Binder and resolving #2 (tech debt) exceptionally difficult without causing #3 (security issues). For Binder to continue to meet Android's needs, we need better ways to manage (and reduce!) complexity without increasing the risk.

The biggest change is obviously the choice of programming language. We decided to use Rust because it directly addresses a number of the challenges within Binder that we have faced over the last several years. It prevents mistakes with ref counting, locking, and bounds checking, and it also does a lot to reduce the complexity of error handling.

Additionally, we've been able to use the more expressive type system to encode the ownership semantics of the various structs and pointers, which takes the complexity of managing object lifetimes out of the hands of the programmer, reducing the risk of use-after-frees and similar problems.

Rust has many different pointer types that it uses to encode ownership semantics into the type system, and this is probably one of the most important aspects of how it helps in Binder. The Binder driver has a lot of different objects with complex ownership semantics: some pointers own a refcount, some pointers have exclusive ownership, and some pointers just reference the object, which is kept alive in some other manner. With Rust, we can use a different pointer type for each kind of pointer, which enables the compiler to enforce that the ownership semantics are implemented correctly.

Another useful feature is Rust's error handling. Rust allows for simpler error handling with features such as destructors, and you get compilation failures if errors are not properly handled.
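As a rough userspace sketch of these two points (all names here are invented for illustration; this is not code from the driver): `Arc` encodes shared, refcounted ownership, `Box` encodes exclusive ownership, and a `Result` that is never examined draws a compiler warning.

    use std::sync::Arc;

    struct Node { debug_id: usize }

    // `Arc<Node>` owns a refcount on the node; `Box<Transaction>` has
    // exclusive ownership of the transaction.
    struct Transaction { target: Arc<Node> }

    fn deliver(t: Box<Transaction>) -> Result<(), &'static str> {
        if t.target.debug_id == 0 {
            return Err("dead node"); // callers are forced to consider this path
        }
        Ok(())
    } // `t` is freed here exactly once; no manual cleanup, no double free

    fn main() {
        let node = Arc::new(Node { debug_id: 1 });
        let txn = Box::new(Transaction { target: Arc::clone(&node) });
        match deliver(txn) { // `Result` is #[must_use]: silently dropping it warns
            Ok(()) => println!("delivered"),
            Err(e) => eprintln!("failed: {e}"),
        }
        // txn.target; // would not compile: `txn` was moved into `deliver`
    }

The same compile-time discipline, applied to the driver's refcounted nodes and exclusively-owned transactions, is what removes whole classes of leaks and use-after-frees.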
The result is that even though Rust requires you to spend more lines of code than C on things such as writing down invariants that are left implicit in C, the Rust driver is still slightly smaller than C binder: Rust is 5.5kLOC and C is 5.8kLOC. (These numbers exclude blank lines, comments, binderfs, and any debugging facilities in C that are not yet implemented in the Rust driver. They include abstractions in rust/kernel/ that are unlikely to be used by drivers other than Binder.)

Although this rewrite completely rethinks how the code is structured and how assumptions are enforced, we do not fundamentally change *how* the driver does the things it does. A lot of careful thought has gone into the existing design. Rather, the rewrite is aimed at improving code health, structure, readability, robustness, security, maintainability, and extensibility. We also include more inline documentation, and improve how assumptions in the code are enforced. Furthermore, all unsafe code is annotated with a SAFETY comment that explains why it is correct.

We have left the binderfs filesystem component in C. Rewriting it in Rust would be a large amount of work and would require a lot of bindings to the file system interfaces. Binderfs has not historically had the same challenges with security and complexity, so rewriting it seems to have lower value than the rest of Binder.

Correctness and feature parity
------------------------------

Rust binder passes all tests that validate the correctness of Binder in the Android Open Source Project. We can boot a device and run a variety of apps and functionality without issues. We have verified this both on the Cuttlefish Android emulator device and on a Pixel 6 Pro.

As for feature parity, Rust binder currently implements all features that C binder supports, with the exception of some debugging facilities. The missing debugging facilities will be added before we submit the Rust implementation upstream.

Tracepoints
-----------

I did not include all of the tracepoints, as I felt that the mechanism for making C access fields of Rust structs should be discussed on list separately. I also did not include support for building Rust Binder as a module, since that requires exporting a bunch of additional symbols on the C side.
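Unsafe code annotations
-----------------------

For readers unfamiliar with the convention mentioned above, here is a minimal, invented example of the pattern (the function is hypothetical, not taken from the driver): every `unsafe fn` documents its contract under `# Safety`, and every `unsafe` block carries a `SAFETY:` comment explaining why that contract is met at the call site.

    /// Reads the first byte of a buffer through a raw pointer.
    ///
    /// # Safety
    ///
    /// `ptr` must be non-null and point to at least one initialized,
    /// readable byte for the duration of the call.
    unsafe fn first_byte(ptr: *const u8) -> u8 {
        // SAFETY: The caller guarantees that `ptr` points to at least one
        // initialized readable byte, so the read is sound.
        unsafe { *ptr }
    }

    fn main() {
        let buf = [42u8, 7];
        // SAFETY: `buf` holds two initialized bytes, so `buf.as_ptr()` is
        // valid for a one-byte read.
        let b = unsafe { first_byte(buf.as_ptr()) };
        assert_eq!(b, 42);
    }

This makes every unsafe dereference locally auditable: a reviewer can check the justification against the function's stated contract without reading the rest of the file.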
Original RFC Link with old benchmark numbers: https://lore.kernel.org/r/20231101-rust-binder-v1-0-08ba9197f637@google.com Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Co-developed-by: Matt Gilbride Signed-off-by: Matt Gilbride Acked-by: Carlos Llamas Acked-by: Paul Moore Signed-off-by: Alice Ryhl Link: https://lore.kernel.org/r/20250919-rust-binder-v2-1-a384b09f28dd@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/Kconfig | 15 +- drivers/android/Makefile | 1 + drivers/android/binder/Makefile | 9 + drivers/android/binder/allocation.rs | 602 +++++++++ drivers/android/binder/context.rs | 180 +++ drivers/android/binder/deferred_close.rs | 204 +++ drivers/android/binder/defs.rs | 182 +++ drivers/android/binder/error.rs | 99 ++ drivers/android/binder/freeze.rs | 388 ++++++ drivers/android/binder/node.rs | 1131 +++++++++++++++++ drivers/android/binder/node/wrapper.rs | 78 ++ drivers/android/binder/page_range.rs | 734 +++++++++++ drivers/android/binder/page_range_helper.c | 24 + drivers/android/binder/page_range_helper.h | 15 + drivers/android/binder/process.rs | 1696 +++++++++++++++++++++++++ drivers/android/binder/range_alloc/array.rs | 251 ++++ drivers/android/binder/range_alloc/mod.rs | 329 +++++ drivers/android/binder/range_alloc/tree.rs | 488 +++++++ drivers/android/binder/rust_binder.h | 23 + drivers/android/binder/rust_binder_events.c | 59 + drivers/android/binder/rust_binder_events.h | 36 + drivers/android/binder/rust_binder_internal.h | 87 ++ drivers/android/binder/rust_binder_main.rs | 627 +++++++++ drivers/android/binder/rust_binderfs.c | 850 +++++++++++++ drivers/android/binder/stats.rs | 89 ++ drivers/android/binder/thread.rs | 1596 +++++++++++++++++++++++ drivers/android/binder/trace.rs | 16 + drivers/android/binder/transaction.rs | 456 +++++++ include/uapi/linux/android/binder.h | 2 +- rust/bindings/bindings_helper.h | 8 + rust/helpers/binder.c | 26 + rust/helpers/helpers.c | 1 + rust/helpers/page.c | 8 + rust/helpers/security.c | 24 + rust/kernel/cred.rs | 6 + rust/kernel/page.rs | 6 + rust/kernel/security.rs | 37 + rust/uapi/uapi_helper.h | 1 + 38 files changed, 10382 insertions(+), 2 deletions(-) create mode 100644 drivers/android/binder/Makefile create mode 100644 drivers/android/binder/allocation.rs create mode 100644 drivers/android/binder/context.rs create mode 100644 drivers/android/binder/deferred_close.rs create mode 100644 drivers/android/binder/defs.rs create mode 100644 drivers/android/binder/error.rs create mode 100644 drivers/android/binder/freeze.rs create mode 100644 drivers/android/binder/node.rs create mode 100644 drivers/android/binder/node/wrapper.rs create mode 100644 drivers/android/binder/page_range.rs create mode 100644 drivers/android/binder/page_range_helper.c create mode 100644 drivers/android/binder/page_range_helper.h create mode 100644 drivers/android/binder/process.rs create mode 100644 drivers/android/binder/range_alloc/array.rs create mode 100644 drivers/android/binder/range_alloc/mod.rs create mode 100644 drivers/android/binder/range_alloc/tree.rs create mode 100644 drivers/android/binder/rust_binder.h create mode 100644 drivers/android/binder/rust_binder_events.c create mode 100644 drivers/android/binder/rust_binder_events.h create mode 100644 drivers/android/binder/rust_binder_internal.h create mode 100644 drivers/android/binder/rust_binder_main.rs create mode 100644 drivers/android/binder/rust_binderfs.c create mode 100644 drivers/android/binder/stats.rs create mode 100644 
drivers/android/binder/thread.rs create mode 100644 drivers/android/binder/trace.rs create mode 100644 drivers/android/binder/transaction.rs create mode 100644 rust/helpers/binder.c (limited to 'include/uapi/linux') diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index 75af3cf472c8..e2e402c9d175 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -14,6 +14,19 @@ config ANDROID_BINDER_IPC Android process, using Binder to identify, invoke and pass arguments between said processes. +config ANDROID_BINDER_IPC_RUST + bool "Rust version of Android Binder IPC Driver" + depends on RUST && MMU && !ANDROID_BINDER_IPC + help + This enables the Rust implementation of the Binder driver. + + Binder is used in Android for both communication between processes, + and remote method invocation. + + This means one Android process can call a method/routine in another + Android process, using Binder to identify, invoke and pass arguments + between said processes. + config ANDROID_BINDERFS bool "Android Binderfs filesystem" depends on ANDROID_BINDER_IPC @@ -28,7 +41,7 @@ config ANDROID_BINDERFS config ANDROID_BINDER_DEVICES string "Android Binder devices" - depends on ANDROID_BINDER_IPC + depends on ANDROID_BINDER_IPC || ANDROID_BINDER_IPC_RUST default "binder,hwbinder,vndbinder" help Default value for the binder.devices parameter. diff --git a/drivers/android/Makefile b/drivers/android/Makefile index f422f91e026b..e0c650d3898e 100644 --- a/drivers/android/Makefile +++ b/drivers/android/Makefile @@ -4,3 +4,4 @@ ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_ANDROID_BINDERFS) += binderfs.o obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o binder_netlink.o obj-$(CONFIG_ANDROID_BINDER_ALLOC_KUNIT_TEST) += tests/ +obj-$(CONFIG_ANDROID_BINDER_IPC_RUST) += binder/ diff --git a/drivers/android/binder/Makefile b/drivers/android/binder/Makefile new file mode 100644 index 000000000000..09eabb527fa0 --- /dev/null +++ b/drivers/android/binder/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0-only +ccflags-y += -I$(src) # needed for trace events + +obj-$(CONFIG_ANDROID_BINDER_IPC_RUST) += rust_binder.o +rust_binder-y := \ + rust_binder_main.o \ + rust_binderfs.o \ + rust_binder_events.o \ + page_range_helper.o diff --git a/drivers/android/binder/allocation.rs b/drivers/android/binder/allocation.rs new file mode 100644 index 000000000000..7f65a9c3a0e5 --- /dev/null +++ b/drivers/android/binder/allocation.rs @@ -0,0 +1,602 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use core::mem::{size_of, size_of_val, MaybeUninit}; +use core::ops::Range; + +use kernel::{ + bindings, + fs::file::{File, FileDescriptorReservation}, + prelude::*, + sync::{aref::ARef, Arc}, + transmute::{AsBytes, FromBytes}, + uaccess::UserSliceReader, + uapi, +}; + +use crate::{ + deferred_close::DeferredFdCloser, + defs::*, + node::{Node, NodeRef}, + process::Process, + DArc, +}; + +#[derive(Default)] +pub(crate) struct AllocationInfo { + /// Range within the allocation where we can find the offsets to the object descriptors. + pub(crate) offsets: Option>, + /// The target node of the transaction this allocation is associated to. + /// Not set for replies. + pub(crate) target_node: Option, + /// When this allocation is dropped, call `pending_oneway_finished` on the node. + /// + /// This is used to serialize oneway transaction on the same node. 
Binder guarantees that + /// oneway transactions to the same node are delivered sequentially in the order they are sent. + pub(crate) oneway_node: Option>, + /// Zero the data in the buffer on free. + pub(crate) clear_on_free: bool, + /// List of files embedded in this transaction. + file_list: FileList, +} + +/// Represents an allocation that the kernel is currently using. +/// +/// When allocations are idle, the range allocator holds the data related to them. +/// +/// # Invariants +/// +/// This allocation corresponds to an allocation in the range allocator, so the relevant pages are +/// marked in use in the page range. +pub(crate) struct Allocation { + pub(crate) offset: usize, + size: usize, + pub(crate) ptr: usize, + pub(crate) process: Arc, + allocation_info: Option, + free_on_drop: bool, + pub(crate) oneway_spam_detected: bool, + #[allow(dead_code)] + pub(crate) debug_id: usize, +} + +impl Allocation { + pub(crate) fn new( + process: Arc, + debug_id: usize, + offset: usize, + size: usize, + ptr: usize, + oneway_spam_detected: bool, + ) -> Self { + Self { + process, + offset, + size, + ptr, + debug_id, + oneway_spam_detected, + allocation_info: None, + free_on_drop: true, + } + } + + fn size_check(&self, offset: usize, size: usize) -> Result { + let overflow_fail = offset.checked_add(size).is_none(); + let cmp_size_fail = offset.wrapping_add(size) > self.size; + if overflow_fail || cmp_size_fail { + return Err(EFAULT); + } + Ok(()) + } + + pub(crate) fn copy_into( + &self, + reader: &mut UserSliceReader, + offset: usize, + size: usize, + ) -> Result { + self.size_check(offset, size)?; + + // SAFETY: While this object exists, the range allocator will keep the range allocated, and + // in turn, the pages will be marked as in use. + unsafe { + self.process + .pages + .copy_from_user_slice(reader, self.offset + offset, size) + } + } + + pub(crate) fn read(&self, offset: usize) -> Result { + self.size_check(offset, size_of::())?; + + // SAFETY: While this object exists, the range allocator will keep the range allocated, and + // in turn, the pages will be marked as in use. + unsafe { self.process.pages.read(self.offset + offset) } + } + + pub(crate) fn write(&self, offset: usize, obj: &T) -> Result { + self.size_check(offset, size_of_val::(obj))?; + + // SAFETY: While this object exists, the range allocator will keep the range allocated, and + // in turn, the pages will be marked as in use. + unsafe { self.process.pages.write(self.offset + offset, obj) } + } + + pub(crate) fn fill_zero(&self) -> Result { + // SAFETY: While this object exists, the range allocator will keep the range allocated, and + // in turn, the pages will be marked as in use. 
+ unsafe { self.process.pages.fill_zero(self.offset, self.size) } + } + + pub(crate) fn keep_alive(mut self) { + self.process + .buffer_make_freeable(self.offset, self.allocation_info.take()); + self.free_on_drop = false; + } + + pub(crate) fn set_info(&mut self, info: AllocationInfo) { + self.allocation_info = Some(info); + } + + pub(crate) fn get_or_init_info(&mut self) -> &mut AllocationInfo { + self.allocation_info.get_or_insert_with(Default::default) + } + + pub(crate) fn set_info_offsets(&mut self, offsets: Range) { + self.get_or_init_info().offsets = Some(offsets); + } + + pub(crate) fn set_info_oneway_node(&mut self, oneway_node: DArc) { + self.get_or_init_info().oneway_node = Some(oneway_node); + } + + pub(crate) fn set_info_clear_on_drop(&mut self) { + self.get_or_init_info().clear_on_free = true; + } + + pub(crate) fn set_info_target_node(&mut self, target_node: NodeRef) { + self.get_or_init_info().target_node = Some(target_node); + } + + /// Reserve enough space to push at least `num_fds` fds. + pub(crate) fn info_add_fd_reserve(&mut self, num_fds: usize) -> Result { + self.get_or_init_info() + .file_list + .files_to_translate + .reserve(num_fds, GFP_KERNEL)?; + + Ok(()) + } + + pub(crate) fn info_add_fd( + &mut self, + file: ARef, + buffer_offset: usize, + close_on_free: bool, + ) -> Result { + self.get_or_init_info().file_list.files_to_translate.push( + FileEntry { + file, + buffer_offset, + close_on_free, + }, + GFP_KERNEL, + )?; + + Ok(()) + } + + pub(crate) fn set_info_close_on_free(&mut self, cof: FdsCloseOnFree) { + self.get_or_init_info().file_list.close_on_free = cof.0; + } + + pub(crate) fn translate_fds(&mut self) -> Result { + let file_list = match self.allocation_info.as_mut() { + Some(info) => &mut info.file_list, + None => return Ok(TranslatedFds::new()), + }; + + let files = core::mem::take(&mut file_list.files_to_translate); + + let num_close_on_free = files.iter().filter(|entry| entry.close_on_free).count(); + let mut close_on_free = KVec::with_capacity(num_close_on_free, GFP_KERNEL)?; + + let mut reservations = KVec::with_capacity(files.len(), GFP_KERNEL)?; + for file_info in files { + let res = FileDescriptorReservation::get_unused_fd_flags(bindings::O_CLOEXEC)?; + let fd = res.reserved_fd(); + self.write::(file_info.buffer_offset, &fd)?; + + reservations.push( + Reservation { + res, + file: file_info.file, + }, + GFP_KERNEL, + )?; + if file_info.close_on_free { + close_on_free.push(fd, GFP_KERNEL)?; + } + } + + Ok(TranslatedFds { + reservations, + close_on_free: FdsCloseOnFree(close_on_free), + }) + } + + /// Should the looper return to userspace when freeing this allocation? + pub(crate) fn looper_need_return_on_free(&self) -> bool { + // Closing fds involves pushing task_work for execution when we return to userspace. Hence, + // we should return to userspace asap if we are closing fds. 
+ match self.allocation_info { + Some(ref info) => !info.file_list.close_on_free.is_empty(), + None => false, + } + } +} + +impl Drop for Allocation { + fn drop(&mut self) { + if !self.free_on_drop { + return; + } + + if let Some(mut info) = self.allocation_info.take() { + if let Some(oneway_node) = info.oneway_node.as_ref() { + oneway_node.pending_oneway_finished(); + } + + info.target_node = None; + + if let Some(offsets) = info.offsets.clone() { + let view = AllocationView::new(self, offsets.start); + for i in offsets.step_by(size_of::()) { + if view.cleanup_object(i).is_err() { + pr_warn!("Error cleaning up object at offset {}\n", i) + } + } + } + + for &fd in &info.file_list.close_on_free { + let closer = match DeferredFdCloser::new(GFP_KERNEL) { + Ok(closer) => closer, + Err(kernel::alloc::AllocError) => { + // Ignore allocation failures. + break; + } + }; + + // Here, we ignore errors. The operation can fail if the fd is not valid, or if the + // method is called from a kthread. However, this is always called from a syscall, + // so the latter case cannot happen, and we don't care about the first case. + let _ = closer.close_fd(fd); + } + + if info.clear_on_free { + if let Err(e) = self.fill_zero() { + pr_warn!("Failed to clear data on free: {:?}", e); + } + } + } + + self.process.buffer_raw_free(self.ptr); + } +} + +/// A wrapper around `Allocation` that is being created. +/// +/// If the allocation is destroyed while wrapped in this wrapper, then the allocation will be +/// considered to be part of a failed transaction. Successful transactions avoid that by calling +/// `success`, which skips the destructor. +#[repr(transparent)] +pub(crate) struct NewAllocation(pub(crate) Allocation); + +impl NewAllocation { + pub(crate) fn success(self) -> Allocation { + // This skips the destructor. + // + // SAFETY: This type is `#[repr(transparent)]`, so the layout matches. + unsafe { core::mem::transmute(self) } + } +} + +impl core::ops::Deref for NewAllocation { + type Target = Allocation; + fn deref(&self) -> &Allocation { + &self.0 + } +} + +impl core::ops::DerefMut for NewAllocation { + fn deref_mut(&mut self) -> &mut Allocation { + &mut self.0 + } +} + +/// A view into the beginning of an allocation. +/// +/// All attempts to read or write outside of the view will fail. To intentionally access outside of +/// this view, use the `alloc` field of this struct directly. +pub(crate) struct AllocationView<'a> { + pub(crate) alloc: &'a mut Allocation, + limit: usize, +} + +impl<'a> AllocationView<'a> { + pub(crate) fn new(alloc: &'a mut Allocation, limit: usize) -> Self { + AllocationView { alloc, limit } + } + + pub(crate) fn read(&self, offset: usize) -> Result { + if offset.checked_add(size_of::()).ok_or(EINVAL)? > self.limit { + return Err(EINVAL); + } + self.alloc.read(offset) + } + + pub(crate) fn write(&self, offset: usize, obj: &T) -> Result { + if offset.checked_add(size_of::()).ok_or(EINVAL)? > self.limit { + return Err(EINVAL); + } + self.alloc.write(offset, obj) + } + + pub(crate) fn copy_into( + &self, + reader: &mut UserSliceReader, + offset: usize, + size: usize, + ) -> Result { + if offset.checked_add(size).ok_or(EINVAL)? 
> self.limit { + return Err(EINVAL); + } + self.alloc.copy_into(reader, offset, size) + } + + pub(crate) fn transfer_binder_object( + &self, + offset: usize, + obj: &uapi::flat_binder_object, + strong: bool, + node_ref: NodeRef, + ) -> Result { + let mut newobj = FlatBinderObject::default(); + let node = node_ref.node.clone(); + if Arc::ptr_eq(&node_ref.node.owner, &self.alloc.process) { + // The receiving process is the owner of the node, so send it a binder object (instead + // of a handle). + let (ptr, cookie) = node.get_id(); + newobj.hdr.type_ = if strong { + BINDER_TYPE_BINDER + } else { + BINDER_TYPE_WEAK_BINDER + }; + newobj.flags = obj.flags; + newobj.__bindgen_anon_1.binder = ptr as _; + newobj.cookie = cookie as _; + self.write(offset, &newobj)?; + // Increment the user ref count on the node. It will be decremented as part of the + // destruction of the buffer, when we see a binder or weak-binder object. + node.update_refcount(true, 1, strong); + } else { + // The receiving process is different from the owner, so we need to insert a handle to + // the binder object. + let handle = self + .alloc + .process + .as_arc_borrow() + .insert_or_update_handle(node_ref, false)?; + newobj.hdr.type_ = if strong { + BINDER_TYPE_HANDLE + } else { + BINDER_TYPE_WEAK_HANDLE + }; + newobj.flags = obj.flags; + newobj.__bindgen_anon_1.handle = handle; + if self.write(offset, &newobj).is_err() { + // Decrement ref count on the handle we just created. + let _ = self + .alloc + .process + .as_arc_borrow() + .update_ref(handle, false, strong); + return Err(EINVAL); + } + } + + Ok(()) + } + + fn cleanup_object(&self, index_offset: usize) -> Result { + let offset = self.alloc.read(index_offset)?; + let header = self.read::(offset)?; + match header.type_ { + BINDER_TYPE_WEAK_BINDER | BINDER_TYPE_BINDER => { + let obj = self.read::(offset)?; + let strong = header.type_ == BINDER_TYPE_BINDER; + // SAFETY: The type is `BINDER_TYPE_{WEAK_}BINDER`, so the `binder` field is + // populated. + let ptr = unsafe { obj.__bindgen_anon_1.binder }; + let cookie = obj.cookie; + self.alloc.process.update_node(ptr, cookie, strong); + Ok(()) + } + BINDER_TYPE_WEAK_HANDLE | BINDER_TYPE_HANDLE => { + let obj = self.read::(offset)?; + let strong = header.type_ == BINDER_TYPE_HANDLE; + // SAFETY: The type is `BINDER_TYPE_{WEAK_}HANDLE`, so the `handle` field is + // populated. + let handle = unsafe { obj.__bindgen_anon_1.handle }; + self.alloc + .process + .as_arc_borrow() + .update_ref(handle, false, strong) + } + _ => Ok(()), + } + } +} + +/// A binder object as it is serialized. +/// +/// # Invariants +/// +/// All bytes must be initialized, and the value of `self.hdr.type_` must be one of the allowed +/// types. +#[repr(C)] +pub(crate) union BinderObject { + hdr: uapi::binder_object_header, + fbo: uapi::flat_binder_object, + fdo: uapi::binder_fd_object, + bbo: uapi::binder_buffer_object, + fdao: uapi::binder_fd_array_object, +} + +/// A view into a `BinderObject` that can be used in a match statement. 
+pub(crate) enum BinderObjectRef<'a> { + Binder(&'a mut uapi::flat_binder_object), + Handle(&'a mut uapi::flat_binder_object), + Fd(&'a mut uapi::binder_fd_object), + Ptr(&'a mut uapi::binder_buffer_object), + Fda(&'a mut uapi::binder_fd_array_object), +} + +impl BinderObject { + pub(crate) fn read_from(reader: &mut UserSliceReader) -> Result { + let object = Self::read_from_inner(|slice| { + let read_len = usize::min(slice.len(), reader.len()); + reader.clone_reader().read_slice(&mut slice[..read_len])?; + Ok(()) + })?; + + // If we used a object type smaller than the largest object size, then we've read more + // bytes than we needed to. However, we used `.clone_reader()` to avoid advancing the + // original reader. Now, we call `skip` so that the caller's reader is advanced by the + // right amount. + // + // The `skip` call fails if the reader doesn't have `size` bytes available. This could + // happen if the type header corresponds to an object type that is larger than the rest of + // the reader. + // + // Any extra bytes beyond the size of the object are inaccessible after this call, so + // reading them again from the `reader` later does not result in TOCTOU bugs. + reader.skip(object.size())?; + + Ok(object) + } + + /// Use the provided reader closure to construct a `BinderObject`. + /// + /// The closure should write the bytes for the object into the provided slice. + pub(crate) fn read_from_inner(reader: R) -> Result + where + R: FnOnce(&mut [u8; size_of::()]) -> Result<()>, + { + let mut obj = MaybeUninit::::zeroed(); + + // SAFETY: The lengths of `BinderObject` and `[u8; size_of::()]` are equal, + // and the byte array has an alignment requirement of one, so the pointer cast is okay. + // Additionally, `obj` was initialized to zeros, so the byte array will not be + // uninitialized. + (reader)(unsafe { &mut *obj.as_mut_ptr().cast() })?; + + // SAFETY: The entire object is initialized, so accessing this field is safe. + let type_ = unsafe { obj.assume_init_ref().hdr.type_ }; + if Self::type_to_size(type_).is_none() { + // The value of `obj.hdr_type_` was invalid. + return Err(EINVAL); + } + + // SAFETY: All bytes are initialized (since we zeroed them at the start) and we checked + // that `self.hdr.type_` is one of the allowed types, so the type invariants are satisfied. + unsafe { Ok(obj.assume_init()) } + } + + pub(crate) fn as_ref(&mut self) -> BinderObjectRef<'_> { + use BinderObjectRef::*; + // SAFETY: The constructor ensures that all bytes of `self` are initialized, and all + // variants of this union accept all initialized bit patterns. + unsafe { + match self.hdr.type_ { + BINDER_TYPE_WEAK_BINDER | BINDER_TYPE_BINDER => Binder(&mut self.fbo), + BINDER_TYPE_WEAK_HANDLE | BINDER_TYPE_HANDLE => Handle(&mut self.fbo), + BINDER_TYPE_FD => Fd(&mut self.fdo), + BINDER_TYPE_PTR => Ptr(&mut self.bbo), + BINDER_TYPE_FDA => Fda(&mut self.fdao), + // SAFETY: By the type invariant, the value of `self.hdr.type_` cannot have any + // other value than the ones checked above. + _ => core::hint::unreachable_unchecked(), + } + } + } + + pub(crate) fn size(&self) -> usize { + // SAFETY: The entire object is initialized, so accessing this field is safe. + let type_ = unsafe { self.hdr.type_ }; + + // SAFETY: The type invariants guarantee that the type field is correct. 
+ unsafe { Self::type_to_size(type_).unwrap_unchecked() } + } + + fn type_to_size(type_: u32) -> Option { + match type_ { + BINDER_TYPE_WEAK_BINDER => Some(size_of::()), + BINDER_TYPE_BINDER => Some(size_of::()), + BINDER_TYPE_WEAK_HANDLE => Some(size_of::()), + BINDER_TYPE_HANDLE => Some(size_of::()), + BINDER_TYPE_FD => Some(size_of::()), + BINDER_TYPE_PTR => Some(size_of::()), + BINDER_TYPE_FDA => Some(size_of::()), + _ => None, + } + } +} + +#[derive(Default)] +struct FileList { + files_to_translate: KVec, + close_on_free: KVec, +} + +struct FileEntry { + /// The file for which a descriptor will be created in the recipient process. + file: ARef, + /// The offset in the buffer where the file descriptor is stored. + buffer_offset: usize, + /// Whether this fd should be closed when the allocation is freed. + close_on_free: bool, +} + +pub(crate) struct TranslatedFds { + reservations: KVec, + /// If commit is called, then these fds should be closed. (If commit is not called, then they + /// shouldn't be closed.) + close_on_free: FdsCloseOnFree, +} + +struct Reservation { + res: FileDescriptorReservation, + file: ARef, +} + +impl TranslatedFds { + pub(crate) fn new() -> Self { + Self { + reservations: KVec::new(), + close_on_free: FdsCloseOnFree(KVec::new()), + } + } + + pub(crate) fn commit(self) -> FdsCloseOnFree { + for entry in self.reservations { + entry.res.fd_install(entry.file); + } + + self.close_on_free + } +} + +pub(crate) struct FdsCloseOnFree(KVec); diff --git a/drivers/android/binder/context.rs b/drivers/android/binder/context.rs new file mode 100644 index 000000000000..3d135ec03ca7 --- /dev/null +++ b/drivers/android/binder/context.rs @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::{ + error::Error, + list::{List, ListArc, ListLinks}, + prelude::*, + security, + str::{CStr, CString}, + sync::{Arc, Mutex}, + task::Kuid, +}; + +use crate::{error::BinderError, node::NodeRef, process::Process}; + +kernel::sync::global_lock! { + // SAFETY: We call `init` in the module initializer, so it's initialized before first use. + pub(crate) unsafe(uninit) static CONTEXTS: Mutex = ContextList { + list: List::new(), + }; +} + +pub(crate) struct ContextList { + list: List, +} + +pub(crate) fn get_all_contexts() -> Result>> { + let lock = CONTEXTS.lock(); + + let count = lock.list.iter().count(); + + let mut ctxs = KVec::with_capacity(count, GFP_KERNEL)?; + for ctx in &lock.list { + ctxs.push(Arc::from(ctx), GFP_KERNEL)?; + } + Ok(ctxs) +} + +/// This struct keeps track of the processes using this context, and which process is the context +/// manager. +struct Manager { + node: Option, + uid: Option, + all_procs: List, +} + +/// There is one context per binder file (/dev/binder, /dev/hwbinder, etc) +#[pin_data] +pub(crate) struct Context { + #[pin] + manager: Mutex, + pub(crate) name: CString, + #[pin] + links: ListLinks, +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for Context { untracked; } +} +kernel::list::impl_list_item! 
{ + impl ListItem<0> for Context { + using ListLinks { self.links }; + } +} + +impl Context { + pub(crate) fn new(name: &CStr) -> Result> { + let name = CString::try_from(name)?; + let list_ctx = ListArc::pin_init::( + try_pin_init!(Context { + name, + links <- ListLinks::new(), + manager <- kernel::new_mutex!(Manager { + all_procs: List::new(), + node: None, + uid: None, + }, "Context::manager"), + }), + GFP_KERNEL, + )?; + + let ctx = list_ctx.clone_arc(); + CONTEXTS.lock().list.push_back(list_ctx); + + Ok(ctx) + } + + /// Called when the file for this context is unlinked. + /// + /// No-op if called twice. + pub(crate) fn deregister(&self) { + // SAFETY: We never add the context to any other linked list than this one, so it is either + // in this list, or not in any list. + unsafe { CONTEXTS.lock().list.remove(self) }; + } + + pub(crate) fn register_process(self: &Arc, proc: ListArc) { + if !Arc::ptr_eq(self, &proc.ctx) { + pr_err!("Context::register_process called on the wrong context."); + return; + } + self.manager.lock().all_procs.push_back(proc); + } + + pub(crate) fn deregister_process(self: &Arc, proc: &Process) { + if !Arc::ptr_eq(self, &proc.ctx) { + pr_err!("Context::deregister_process called on the wrong context."); + return; + } + // SAFETY: We just checked that this is the right list. + unsafe { self.manager.lock().all_procs.remove(proc) }; + } + + pub(crate) fn set_manager_node(&self, node_ref: NodeRef) -> Result { + let mut manager = self.manager.lock(); + if manager.node.is_some() { + pr_warn!("BINDER_SET_CONTEXT_MGR already set"); + return Err(EBUSY); + } + security::binder_set_context_mgr(&node_ref.node.owner.cred)?; + + // If the context manager has been set before, ensure that we use the same euid. + let caller_uid = Kuid::current_euid(); + if let Some(ref uid) = manager.uid { + if *uid != caller_uid { + return Err(EPERM); + } + } + + manager.node = Some(node_ref); + manager.uid = Some(caller_uid); + Ok(()) + } + + pub(crate) fn unset_manager_node(&self) { + let node_ref = self.manager.lock().node.take(); + drop(node_ref); + } + + pub(crate) fn get_manager_node(&self, strong: bool) -> Result { + self.manager + .lock() + .node + .as_ref() + .ok_or_else(BinderError::new_dead)? + .clone(strong) + .map_err(BinderError::from) + } + + pub(crate) fn for_each_proc(&self, mut func: F) + where + F: FnMut(&Process), + { + let lock = self.manager.lock(); + for proc in &lock.all_procs { + func(&proc); + } + } + + pub(crate) fn get_all_procs(&self) -> Result>> { + let lock = self.manager.lock(); + let count = lock.all_procs.iter().count(); + + let mut procs = KVec::with_capacity(count, GFP_KERNEL)?; + for proc in &lock.all_procs { + procs.push(Arc::from(proc), GFP_KERNEL)?; + } + Ok(procs) + } + + pub(crate) fn get_procs_with_pid(&self, pid: i32) -> Result>> { + let orig = self.get_all_procs()?; + let mut backing = KVec::with_capacity(orig.len(), GFP_KERNEL)?; + for proc in orig.into_iter().filter(|proc| proc.task.pid() == pid) { + backing.push(proc, GFP_KERNEL)?; + } + Ok(backing) + } +} diff --git a/drivers/android/binder/deferred_close.rs b/drivers/android/binder/deferred_close.rs new file mode 100644 index 000000000000..ac895c04d0cb --- /dev/null +++ b/drivers/android/binder/deferred_close.rs @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +//! Logic for closing files in a deferred manner. +//! +//! This file could make sense to have in `kernel::fs`, but it was rejected for being too +//! Binder-specific. 
+ +use core::mem::MaybeUninit; +use kernel::{ + alloc::{AllocError, Flags}, + bindings, + prelude::*, +}; + +/// Helper used for closing file descriptors in a way that is safe even if the file is currently +/// held using `fdget`. +/// +/// Additional motivation can be found in commit 80cd795630d6 ("binder: fix use-after-free due to +/// ksys_close() during fdget()") and in the comments on `binder_do_fd_close`. +pub(crate) struct DeferredFdCloser { + inner: KBox, +} + +/// SAFETY: This just holds an allocation with no real content, so there's no safety issue with +/// moving it across threads. +unsafe impl Send for DeferredFdCloser {} +/// SAFETY: This just holds an allocation with no real content, so there's no safety issue with +/// moving it across threads. +unsafe impl Sync for DeferredFdCloser {} + +/// # Invariants +/// +/// If the `file` pointer is non-null, then it points at a `struct file` and owns a refcount to +/// that file. +#[repr(C)] +struct DeferredFdCloserInner { + twork: MaybeUninit, + file: *mut bindings::file, +} + +impl DeferredFdCloser { + /// Create a new [`DeferredFdCloser`]. + pub(crate) fn new(flags: Flags) -> Result { + Ok(Self { + // INVARIANT: The `file` pointer is null, so the type invariant does not apply. + inner: KBox::new( + DeferredFdCloserInner { + twork: MaybeUninit::uninit(), + file: core::ptr::null_mut(), + }, + flags, + )?, + }) + } + + /// Schedule a task work that closes the file descriptor when this task returns to userspace. + /// + /// Fails if this is called from a context where we cannot run work when returning to + /// userspace. (E.g., from a kthread.) + pub(crate) fn close_fd(self, fd: u32) -> Result<(), DeferredFdCloseError> { + use bindings::task_work_notify_mode_TWA_RESUME as TWA_RESUME; + + // In this method, we schedule the task work before closing the file. This is because + // scheduling a task work is fallible, and we need to know whether it will fail before we + // attempt to close the file. + + // Task works are not available on kthreads. + let current = kernel::current!(); + + // Check if this is a kthread. + // SAFETY: Reading `flags` from a task is always okay. + if unsafe { ((*current.as_ptr()).flags & bindings::PF_KTHREAD) != 0 } { + return Err(DeferredFdCloseError::TaskWorkUnavailable); + } + + // Transfer ownership of the box's allocation to a raw pointer. This disables the + // destructor, so we must manually convert it back to a KBox to drop it. + // + // Until we convert it back to a `KBox`, there are no aliasing requirements on this + // pointer. + let inner = KBox::into_raw(self.inner); + + // The `callback_head` field is first in the struct, so this cast correctly gives us a + // pointer to the field. + let callback_head = inner.cast::(); + // SAFETY: This pointer offset operation does not go out-of-bounds. + let file_field = unsafe { core::ptr::addr_of_mut!((*inner).file) }; + + let current = current.as_ptr(); + + // SAFETY: This function currently has exclusive access to the `DeferredFdCloserInner`, so + // it is okay for us to perform unsynchronized writes to its `callback_head` field. + unsafe { bindings::init_task_work(callback_head, Some(Self::do_close_fd)) }; + + // SAFETY: This inserts the `DeferredFdCloserInner` into the task workqueue for the current + // task. If this operation is successful, then this transfers exclusive ownership of the + // `callback_head` field to the C side until it calls `do_close_fd`, and we don't touch or + // invalidate the field during that time. 
+ // + // When the C side calls `do_close_fd`, the safety requirements of that method are + // satisfied because when a task work is executed, the callback is given ownership of the + // pointer. + // + // The file pointer is currently null. If it is changed to be non-null before `do_close_fd` + // is called, then that change happens due to the write at the end of this function, and + // that write has a safety comment that explains why the refcount can be dropped when + // `do_close_fd` runs. + let res = unsafe { bindings::task_work_add(current, callback_head, TWA_RESUME) }; + + if res != 0 { + // SAFETY: Scheduling the task work failed, so we still have ownership of the box, so + // we may destroy it. + unsafe { drop(KBox::from_raw(inner)) }; + + return Err(DeferredFdCloseError::TaskWorkUnavailable); + } + + // This removes the fd from the fd table in `current`. The file is not fully closed until + // `filp_close` is called. We are given ownership of one refcount to the file. + // + // SAFETY: This is safe no matter what `fd` is. If the `fd` is valid (that is, if the + // pointer is non-null), then we call `filp_close` on the returned pointer as required by + // `file_close_fd`. + let file = unsafe { bindings::file_close_fd(fd) }; + if file.is_null() { + // We don't clean up the task work since that might be expensive if the task work queue + // is long. Just let it execute and let it clean up for itself. + return Err(DeferredFdCloseError::BadFd); + } + + // Acquire a second refcount to the file. + // + // SAFETY: The `file` pointer points at a file with a non-zero refcount. + unsafe { bindings::get_file(file) }; + + // This method closes the fd, consuming one of our two refcounts. There could be active + // light refcounts created from that fd, so we must ensure that the file has a positive + // refcount for the duration of those active light refcounts. We do that by holding on to + // the second refcount until the current task returns to userspace. + // + // SAFETY: The `file` pointer is valid. Passing `current->files` as the file table to close + // it in is correct, since we just got the `fd` from `file_close_fd` which also uses + // `current->files`. + // + // Note: fl_owner_t is currently a void pointer. + unsafe { bindings::filp_close(file, (*current).files as bindings::fl_owner_t) }; + + // We update the file pointer that the task work is supposed to fput. This transfers + // ownership of our last refcount. + // + // INVARIANT: This changes the `file` field of a `DeferredFdCloserInner` from null to + // non-null. This doesn't break the type invariant for `DeferredFdCloserInner` because we + // still own a refcount to the file, so we can pass ownership of that refcount to the + // `DeferredFdCloserInner`. + // + // When `do_close_fd` runs, it must be safe for it to `fput` the refcount. However, this is + // the case because all light refcounts that are associated with the fd we closed + // previously must be dropped when `do_close_fd`, since light refcounts must be dropped + // before returning to userspace. + // + // SAFETY: Task works are executed on the current thread right before we return to + // userspace, so this write is guaranteed to happen before `do_close_fd` is called, which + // means that a race is not possible here. + unsafe { *file_field = file }; + + Ok(()) + } + + /// # Safety + /// + /// The provided pointer must point at the `twork` field of a `DeferredFdCloserInner` stored in + /// a `KBox`, and the caller must pass exclusive ownership of that `KBox`. 
Furthermore, if the + /// file pointer is non-null, then it must be okay to release the refcount by calling `fput`. + unsafe extern "C" fn do_close_fd(inner: *mut bindings::callback_head) { + // SAFETY: The caller just passed us ownership of this box. + let inner = unsafe { KBox::from_raw(inner.cast::()) }; + if !inner.file.is_null() { + // SAFETY: By the type invariants, we own a refcount to this file, and the caller + // guarantees that dropping the refcount now is okay. + unsafe { bindings::fput(inner.file) }; + } + // The allocation is freed when `inner` goes out of scope. + } +} + +/// Represents a failure to close an fd in a deferred manner. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub(crate) enum DeferredFdCloseError { + /// Closing the fd failed because we were unable to schedule a task work. + TaskWorkUnavailable, + /// Closing the fd failed because the fd does not exist. + BadFd, +} + +impl From for Error { + fn from(err: DeferredFdCloseError) -> Error { + match err { + DeferredFdCloseError::TaskWorkUnavailable => ESRCH, + DeferredFdCloseError::BadFd => EBADF, + } + } +} diff --git a/drivers/android/binder/defs.rs b/drivers/android/binder/defs.rs new file mode 100644 index 000000000000..33f51b4139c7 --- /dev/null +++ b/drivers/android/binder/defs.rs @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use core::mem::MaybeUninit; +use core::ops::{Deref, DerefMut}; +use kernel::{ + transmute::{AsBytes, FromBytes}, + uapi::{self, *}, +}; + +macro_rules! pub_no_prefix { + ($prefix:ident, $($newname:ident),+ $(,)?) => { + $(pub(crate) const $newname: u32 = kernel::macros::concat_idents!($prefix, $newname);)+ + }; +} + +pub_no_prefix!( + binder_driver_return_protocol_, + BR_TRANSACTION, + BR_TRANSACTION_SEC_CTX, + BR_REPLY, + BR_DEAD_REPLY, + BR_FAILED_REPLY, + BR_FROZEN_REPLY, + BR_NOOP, + BR_SPAWN_LOOPER, + BR_TRANSACTION_COMPLETE, + BR_TRANSACTION_PENDING_FROZEN, + BR_ONEWAY_SPAM_SUSPECT, + BR_OK, + BR_ERROR, + BR_INCREFS, + BR_ACQUIRE, + BR_RELEASE, + BR_DECREFS, + BR_DEAD_BINDER, + BR_CLEAR_DEATH_NOTIFICATION_DONE, + BR_FROZEN_BINDER, + BR_CLEAR_FREEZE_NOTIFICATION_DONE, +); + +pub_no_prefix!( + binder_driver_command_protocol_, + BC_TRANSACTION, + BC_TRANSACTION_SG, + BC_REPLY, + BC_REPLY_SG, + BC_FREE_BUFFER, + BC_ENTER_LOOPER, + BC_EXIT_LOOPER, + BC_REGISTER_LOOPER, + BC_INCREFS, + BC_ACQUIRE, + BC_RELEASE, + BC_DECREFS, + BC_INCREFS_DONE, + BC_ACQUIRE_DONE, + BC_REQUEST_DEATH_NOTIFICATION, + BC_CLEAR_DEATH_NOTIFICATION, + BC_DEAD_BINDER_DONE, + BC_REQUEST_FREEZE_NOTIFICATION, + BC_CLEAR_FREEZE_NOTIFICATION, + BC_FREEZE_NOTIFICATION_DONE, +); + +pub_no_prefix!( + flat_binder_object_flags_, + FLAT_BINDER_FLAG_ACCEPTS_FDS, + FLAT_BINDER_FLAG_TXN_SECURITY_CTX +); + +pub_no_prefix!( + transaction_flags_, + TF_ONE_WAY, + TF_ACCEPT_FDS, + TF_CLEAR_BUF, + TF_UPDATE_TXN +); + +pub(crate) use uapi::{ + BINDER_TYPE_BINDER, BINDER_TYPE_FD, BINDER_TYPE_FDA, BINDER_TYPE_HANDLE, BINDER_TYPE_PTR, + BINDER_TYPE_WEAK_BINDER, BINDER_TYPE_WEAK_HANDLE, +}; + +macro_rules! decl_wrapper { + ($newname:ident, $wrapped:ty) => { + // Define a wrapper around the C type. Use `MaybeUninit` to enforce that the value of + // padding bytes must be preserved. + #[derive(Copy, Clone)] + #[repr(transparent)] + pub(crate) struct $newname(MaybeUninit<$wrapped>); + + // SAFETY: This macro is only used with types where this is ok. + unsafe impl FromBytes for $newname {} + // SAFETY: This macro is only used with types where this is ok. 
+ unsafe impl AsBytes for $newname {} + + impl Deref for $newname { + type Target = $wrapped; + fn deref(&self) -> &Self::Target { + // SAFETY: We use `MaybeUninit` only to preserve padding. The value must still + // always be valid. + unsafe { self.0.assume_init_ref() } + } + } + + impl DerefMut for $newname { + fn deref_mut(&mut self) -> &mut Self::Target { + // SAFETY: We use `MaybeUninit` only to preserve padding. The value must still + // always be valid. + unsafe { self.0.assume_init_mut() } + } + } + + impl Default for $newname { + fn default() -> Self { + // Create a new value of this type where all bytes (including padding) are zeroed. + Self(MaybeUninit::zeroed()) + } + } + }; +} + +decl_wrapper!(BinderNodeDebugInfo, uapi::binder_node_debug_info); +decl_wrapper!(BinderNodeInfoForRef, uapi::binder_node_info_for_ref); +decl_wrapper!(FlatBinderObject, uapi::flat_binder_object); +decl_wrapper!(BinderFdObject, uapi::binder_fd_object); +decl_wrapper!(BinderFdArrayObject, uapi::binder_fd_array_object); +decl_wrapper!(BinderObjectHeader, uapi::binder_object_header); +decl_wrapper!(BinderBufferObject, uapi::binder_buffer_object); +decl_wrapper!(BinderTransactionData, uapi::binder_transaction_data); +decl_wrapper!( + BinderTransactionDataSecctx, + uapi::binder_transaction_data_secctx +); +decl_wrapper!(BinderTransactionDataSg, uapi::binder_transaction_data_sg); +decl_wrapper!(BinderWriteRead, uapi::binder_write_read); +decl_wrapper!(BinderVersion, uapi::binder_version); +decl_wrapper!(BinderFrozenStatusInfo, uapi::binder_frozen_status_info); +decl_wrapper!(BinderFreezeInfo, uapi::binder_freeze_info); +decl_wrapper!(BinderFrozenStateInfo, uapi::binder_frozen_state_info); +decl_wrapper!(BinderHandleCookie, uapi::binder_handle_cookie); +decl_wrapper!(ExtendedError, uapi::binder_extended_error); + +impl BinderVersion { + pub(crate) fn current() -> Self { + Self(MaybeUninit::new(uapi::binder_version { + protocol_version: BINDER_CURRENT_PROTOCOL_VERSION as _, + })) + } +} + +impl BinderTransactionData { + pub(crate) fn with_buffers_size(self, buffers_size: u64) -> BinderTransactionDataSg { + BinderTransactionDataSg(MaybeUninit::new(uapi::binder_transaction_data_sg { + transaction_data: *self, + buffers_size, + })) + } +} + +impl BinderTransactionDataSecctx { + /// View the inner data as wrapped in `BinderTransactionData`. + pub(crate) fn tr_data(&mut self) -> &mut BinderTransactionData { + // SAFETY: Transparent wrapper is safe to transmute. + unsafe { + &mut *(&mut self.transaction_data as *mut uapi::binder_transaction_data + as *mut BinderTransactionData) + } + } +} + +impl ExtendedError { + pub(crate) fn new(id: u32, command: u32, param: i32) -> Self { + Self(MaybeUninit::new(uapi::binder_extended_error { + id, + command, + param, + })) + } +} diff --git a/drivers/android/binder/error.rs b/drivers/android/binder/error.rs new file mode 100644 index 000000000000..9921827267d0 --- /dev/null +++ b/drivers/android/binder/error.rs @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::prelude::*; + +use crate::defs::*; + +pub(crate) type BinderResult = core::result::Result; + +/// An error that will be returned to userspace via the `BINDER_WRITE_READ` ioctl rather than via +/// errno. 
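+///
+/// A minimal usage sketch (illustrative only; `EINVAL` here stands in for any errno):
+///
+/// ```ignore
+/// let err = BinderError::from(EINVAL);
+/// // A failed transaction is reported to userspace as BR_FAILED_REPLY, and the errno is
+/// // kept so it can be exposed via the extended-error mechanism.
+/// assert_eq!(err.as_errno(), EINVAL.to_errno());
+/// assert!(err.should_pr_warn());
+/// ```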
+pub(crate) struct BinderError { + pub(crate) reply: u32, + source: Option, +} + +impl BinderError { + pub(crate) fn new_dead() -> Self { + Self { + reply: BR_DEAD_REPLY, + source: None, + } + } + + pub(crate) fn new_frozen() -> Self { + Self { + reply: BR_FROZEN_REPLY, + source: None, + } + } + + pub(crate) fn new_frozen_oneway() -> Self { + Self { + reply: BR_TRANSACTION_PENDING_FROZEN, + source: None, + } + } + + pub(crate) fn is_dead(&self) -> bool { + self.reply == BR_DEAD_REPLY + } + + pub(crate) fn as_errno(&self) -> kernel::ffi::c_int { + self.source.unwrap_or(EINVAL).to_errno() + } + + pub(crate) fn should_pr_warn(&self) -> bool { + self.source.is_some() + } +} + +/// Convert an errno into a `BinderError` and store the errno used to construct it. The errno +/// should be stored as the thread's extended error when given to userspace. +impl From for BinderError { + fn from(source: Error) -> Self { + Self { + reply: BR_FAILED_REPLY, + source: Some(source), + } + } +} + +impl From for BinderError { + fn from(source: kernel::fs::file::BadFdError) -> Self { + BinderError::from(Error::from(source)) + } +} + +impl From for BinderError { + fn from(_: kernel::alloc::AllocError) -> Self { + Self { + reply: BR_FAILED_REPLY, + source: Some(ENOMEM), + } + } +} + +impl core::fmt::Debug for BinderError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self.reply { + BR_FAILED_REPLY => match self.source.as_ref() { + Some(source) => f + .debug_struct("BR_FAILED_REPLY") + .field("source", source) + .finish(), + None => f.pad("BR_FAILED_REPLY"), + }, + BR_DEAD_REPLY => f.pad("BR_DEAD_REPLY"), + BR_FROZEN_REPLY => f.pad("BR_FROZEN_REPLY"), + BR_TRANSACTION_PENDING_FROZEN => f.pad("BR_TRANSACTION_PENDING_FROZEN"), + BR_TRANSACTION_COMPLETE => f.pad("BR_TRANSACTION_COMPLETE"), + _ => f + .debug_struct("BinderError") + .field("reply", &self.reply) + .finish(), + } + } +} diff --git a/drivers/android/binder/freeze.rs b/drivers/android/binder/freeze.rs new file mode 100644 index 000000000000..e68c3c8bc55a --- /dev/null +++ b/drivers/android/binder/freeze.rs @@ -0,0 +1,388 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::{ + alloc::AllocError, + list::ListArc, + prelude::*, + rbtree::{self, RBTreeNodeReservation}, + seq_file::SeqFile, + seq_print, + sync::{Arc, UniqueArc}, + uaccess::UserSliceReader, +}; + +use crate::{ + defs::*, node::Node, process::Process, thread::Thread, BinderReturnWriter, DArc, DLArc, + DTRWrap, DeliverToRead, +}; + +#[derive(Clone, Copy, Eq, PartialEq, Ord, PartialOrd)] +pub(crate) struct FreezeCookie(u64); + +/// Represents a listener for changes to the frozen state of a process. +pub(crate) struct FreezeListener { + /// The node we are listening for. + pub(crate) node: DArc, + /// The cookie of this freeze listener. + cookie: FreezeCookie, + /// What value of `is_frozen` did we most recently tell userspace about? + last_is_frozen: Option, + /// We sent a `BR_FROZEN_BINDER` and we are waiting for `BC_FREEZE_NOTIFICATION_DONE` before + /// sending any other commands. + is_pending: bool, + /// Userspace sent `BC_CLEAR_FREEZE_NOTIFICATION` and we need to reply with + /// `BR_CLEAR_FREEZE_NOTIFICATION_DONE` as soon as possible. If `is_pending` is set, then we + /// must wait for it to be unset before we can reply. + is_clearing: bool, + /// Number of cleared duplicates that can't be deleted until userspace sends + /// `BC_FREEZE_NOTIFICATION_DONE`. 
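+    /// (A "duplicate" is a listener that was re-registered with the cookie of a cleared
+    /// listener on the same node; see `allow_duplicate` below.)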
+    num_pending_duplicates: u64,
+    /// Number of cleared duplicates that can be deleted.
+    num_cleared_duplicates: u64,
+}
+
+impl FreezeListener {
+    /// Is it okay to create a new listener with the same cookie as this one for the provided node?
+    ///
+    /// In some scenarios, userspace may delete a freeze listener and immediately recreate it
+    /// with the same cookie. This results in duplicate listeners. To avoid issues with ambiguity,
+    /// we allow this only if the new listener is for the same node, and we also require that the
+    /// old listener has already been cleared.
+    fn allow_duplicate(&self, node: &DArc<Node>) -> bool {
+        Arc::ptr_eq(&self.node, node) && self.is_clearing
+    }
+}
+
+type UninitFM = UniqueArc<core::mem::MaybeUninit<DTRWrap<FreezeMessage>>>;
+
+/// Represents a notification that the freeze state has changed.
+pub(crate) struct FreezeMessage {
+    cookie: FreezeCookie,
+}
+
+kernel::list::impl_list_arc_safe! {
+    impl ListArcSafe<0> for FreezeMessage {
+        untracked;
+    }
+}
+
+impl FreezeMessage {
+    fn new(flags: kernel::alloc::Flags) -> Result<UninitFM, AllocError> {
+        UniqueArc::new_uninit(flags)
+    }
+
+    fn init(ua: UninitFM, cookie: FreezeCookie) -> DLArc<FreezeMessage> {
+        match ua.pin_init_with(DTRWrap::new(FreezeMessage { cookie })) {
+            Ok(msg) => ListArc::from(msg),
+            Err(err) => match err {},
+        }
+    }
+}
+
+impl DeliverToRead for FreezeMessage {
+    fn do_work(
+        self: DArc<Self>,
+        thread: &Thread,
+        writer: &mut BinderReturnWriter<'_>,
+    ) -> Result<bool> {
+        let _removed_listener;
+        let mut node_refs = thread.process.node_refs.lock();
+        let Some(mut freeze_entry) = node_refs.freeze_listeners.find_mut(&self.cookie) else {
+            return Ok(true);
+        };
+        let freeze = freeze_entry.get_mut();
+
+        if freeze.num_cleared_duplicates > 0 {
+            freeze.num_cleared_duplicates -= 1;
+            drop(node_refs);
+            writer.write_code(BR_CLEAR_FREEZE_NOTIFICATION_DONE)?;
+            writer.write_payload(&self.cookie.0)?;
+            return Ok(true);
+        }
+
+        if freeze.is_pending {
+            return Ok(true);
+        }
+        if freeze.is_clearing {
+            _removed_listener = freeze_entry.remove_node();
+            drop(node_refs);
+            writer.write_code(BR_CLEAR_FREEZE_NOTIFICATION_DONE)?;
+            writer.write_payload(&self.cookie.0)?;
+            Ok(true)
+        } else {
+            let is_frozen = freeze.node.owner.inner.lock().is_frozen;
+            if freeze.last_is_frozen == Some(is_frozen) {
+                return Ok(true);
+            }
+
+            let mut state_info = BinderFrozenStateInfo::default();
+            state_info.is_frozen = is_frozen as u32;
+            state_info.cookie = freeze.cookie.0;
+            freeze.is_pending = true;
+            freeze.last_is_frozen = Some(is_frozen);
+            drop(node_refs);
+
+            writer.write_code(BR_FROZEN_BINDER)?;
+            writer.write_payload(&state_info)?;
+            // BR_FROZEN_BINDER notifications can cause transactions
+            Ok(false)
+        }
+    }
+
+    fn cancel(self: DArc<Self>) {}
+
+    fn should_sync_wakeup(&self) -> bool {
+        false
+    }
+
+    #[inline(never)]
+    fn debug_print(&self, m: &SeqFile, prefix: &str, _tprefix: &str) -> Result<()> {
+        seq_print!(m, "{}has frozen binder\n", prefix);
+        Ok(())
+    }
+}
+
+impl FreezeListener {
+    pub(crate) fn on_process_exit(&self, proc: &Arc<Process>) {
+        if !self.is_clearing {
+            self.node.remove_freeze_listener(proc);
+        }
+    }
+}
+
+impl Process {
+    pub(crate) fn request_freeze_notif(
+        self: &Arc<Process>,
+        reader: &mut UserSliceReader,
+    ) -> Result<()> {
+        let hc = reader.read::<BinderHandleCookie>()?;
+        let handle = hc.handle;
+        let cookie = FreezeCookie(hc.cookie);
+
+        let msg = FreezeMessage::new(GFP_KERNEL)?;
+        let alloc = RBTreeNodeReservation::new(GFP_KERNEL)?;
+
+        let mut node_refs_guard = self.node_refs.lock();
+        let node_refs = &mut *node_refs_guard;
+        let Some(info) = node_refs.by_handle.get_mut(&handle) else {
+            
pr_warn!("BC_REQUEST_FREEZE_NOTIFICATION invalid ref {}\n", handle); + return Err(EINVAL); + }; + if info.freeze().is_some() { + pr_warn!("BC_REQUEST_FREEZE_NOTIFICATION already set\n"); + return Err(EINVAL); + } + let node_ref = info.node_ref(); + let freeze_entry = node_refs.freeze_listeners.entry(cookie); + + if let rbtree::Entry::Occupied(ref dupe) = freeze_entry { + if !dupe.get().allow_duplicate(&node_ref.node) { + pr_warn!("BC_REQUEST_FREEZE_NOTIFICATION duplicate cookie\n"); + return Err(EINVAL); + } + } + + // All failure paths must come before this call, and all modifications must come after this + // call. + node_ref.node.add_freeze_listener(self, GFP_KERNEL)?; + + match freeze_entry { + rbtree::Entry::Vacant(entry) => { + entry.insert( + FreezeListener { + cookie, + node: node_ref.node.clone(), + last_is_frozen: None, + is_pending: false, + is_clearing: false, + num_pending_duplicates: 0, + num_cleared_duplicates: 0, + }, + alloc, + ); + } + rbtree::Entry::Occupied(mut dupe) => { + let dupe = dupe.get_mut(); + if dupe.is_pending { + dupe.num_pending_duplicates += 1; + } else { + dupe.num_cleared_duplicates += 1; + } + dupe.last_is_frozen = None; + dupe.is_pending = false; + dupe.is_clearing = false; + } + } + + *info.freeze() = Some(cookie); + let msg = FreezeMessage::init(msg, cookie); + drop(node_refs_guard); + let _ = self.push_work(msg); + Ok(()) + } + + pub(crate) fn freeze_notif_done(self: &Arc, reader: &mut UserSliceReader) -> Result<()> { + let cookie = FreezeCookie(reader.read()?); + let alloc = FreezeMessage::new(GFP_KERNEL)?; + let mut node_refs_guard = self.node_refs.lock(); + let node_refs = &mut *node_refs_guard; + let Some(freeze) = node_refs.freeze_listeners.get_mut(&cookie) else { + pr_warn!("BC_FREEZE_NOTIFICATION_DONE {:016x} not found\n", cookie.0); + return Err(EINVAL); + }; + let mut clear_msg = None; + if freeze.num_pending_duplicates > 0 { + clear_msg = Some(FreezeMessage::init(alloc, cookie)); + freeze.num_pending_duplicates -= 1; + freeze.num_cleared_duplicates += 1; + } else { + if !freeze.is_pending { + pr_warn!( + "BC_FREEZE_NOTIFICATION_DONE {:016x} not pending\n", + cookie.0 + ); + return Err(EINVAL); + } + if freeze.is_clearing { + // Immediately send another FreezeMessage for BR_CLEAR_FREEZE_NOTIFICATION_DONE. 
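+                // (The BR_FROZEN_BINDER that was pending has now been acknowledged, so the
+                // clear acknowledgement no longer has to wait for it.)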
+                clear_msg = Some(FreezeMessage::init(alloc, cookie));
+            }
+            freeze.is_pending = false;
+        }
+        drop(node_refs_guard);
+        if let Some(clear_msg) = clear_msg {
+            let _ = self.push_work(clear_msg);
+        }
+        Ok(())
+    }
+
+    pub(crate) fn clear_freeze_notif(self: &Arc<Process>, reader: &mut UserSliceReader) -> Result<()> {
+        let hc = reader.read::<BinderHandleCookie>()?;
+        let handle = hc.handle;
+        let cookie = FreezeCookie(hc.cookie);
+
+        let alloc = FreezeMessage::new(GFP_KERNEL)?;
+        let mut node_refs_guard = self.node_refs.lock();
+        let node_refs = &mut *node_refs_guard;
+        let Some(info) = node_refs.by_handle.get_mut(&handle) else {
+            pr_warn!("BC_CLEAR_FREEZE_NOTIFICATION invalid ref {}\n", handle);
+            return Err(EINVAL);
+        };
+        let Some(info_cookie) = info.freeze() else {
+            pr_warn!("BC_CLEAR_FREEZE_NOTIFICATION freeze notification not active\n");
+            return Err(EINVAL);
+        };
+        if *info_cookie != cookie {
+            pr_warn!("BC_CLEAR_FREEZE_NOTIFICATION freeze notification cookie mismatch\n");
+            return Err(EINVAL);
+        }
+        let Some(listener) = node_refs.freeze_listeners.get_mut(&cookie) else {
+            pr_warn!("BC_CLEAR_FREEZE_NOTIFICATION invalid cookie {}\n", handle);
+            return Err(EINVAL);
+        };
+        listener.is_clearing = true;
+        listener.node.remove_freeze_listener(self);
+        *info.freeze() = None;
+        let mut msg = None;
+        if !listener.is_pending {
+            msg = Some(FreezeMessage::init(alloc, cookie));
+        }
+        drop(node_refs_guard);
+
+        if let Some(msg) = msg {
+            let _ = self.push_work(msg);
+        }
+        Ok(())
+    }
+
+    fn get_freeze_cookie(&self, node: &DArc<Node>) -> Option<FreezeCookie> {
+        let node_refs = &mut *self.node_refs.lock();
+        let handle = node_refs.by_node.get(&node.global_id())?;
+        let node_ref = node_refs.by_handle.get_mut(handle)?;
+        *node_ref.freeze()
+    }
+
+    /// Creates a vector of every freeze listener on this process.
+    ///
+    /// Returns pairs of the remote process listening for notifications and the local node it is
+    /// listening on.
+    #[expect(clippy::type_complexity)]
+    fn find_freeze_recipients(&self) -> Result<KVVec<(DArc<Node>, Arc<Process>)>, AllocError> {
+        // Defined before `inner` to drop after releasing spinlock if `push_within_capacity` fails.
+        let mut node_proc_pair;
+
+        // We pre-allocate space for up to 8 recipients before we take the spinlock. However, if
+        // the allocation fails, use a vector with a capacity of zero instead of failing. After
+        // all, there might not be any freeze listeners, in which case this operation could still
+        // succeed.
+        let mut recipients =
+            KVVec::with_capacity(8, GFP_KERNEL).unwrap_or_else(|_err| KVVec::new());
+
+        let mut inner = self.lock_with_nodes();
+        let mut curr = inner.nodes.cursor_front();
+        while let Some(cursor) = curr {
+            let (key, node) = cursor.current();
+            let key = *key;
+            let list = node.freeze_list(&inner.inner);
+            let len = list.len();
+
+            if recipients.spare_capacity_mut().len() < len {
+                drop(inner);
+                recipients.reserve(len, GFP_KERNEL)?;
+                inner = self.lock_with_nodes();
+                // Find the node we were looking at and try again. If the set of nodes was changed,
+                // then just proceed to the next node. This is ok because we don't guarantee the
+                // inclusion of nodes that are added or removed in parallel with this operation.
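+                // (Illustrative consequence: a listener registered or removed while the lock
+                // was dropped may or may not appear in `recipients`; both outcomes are valid.)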
+                curr = inner.nodes.cursor_lower_bound(&key);
+                continue;
+            }
+
+            for proc in list {
+                node_proc_pair = (node.clone(), proc.clone());
+                recipients
+                    .push_within_capacity(node_proc_pair)
+                    .map_err(|_| {
+                        pr_err!(
+                            "push_within_capacity failed even though we checked the capacity\n"
+                        );
+                        AllocError
+                    })?;
+            }
+
+            curr = cursor.move_next();
+        }
+        Ok(recipients)
+    }
+
+    /// Prepare allocations for sending freeze messages.
+    pub(crate) fn prepare_freeze_messages(&self) -> Result<FreezeMessages> {
+        let recipients = self.find_freeze_recipients()?;
+        let mut batch = KVVec::with_capacity(recipients.len(), GFP_KERNEL)?;
+        for (node, proc) in recipients {
+            let Some(cookie) = proc.get_freeze_cookie(&node) else {
+                // If the freeze listener was removed in the meantime, just discard the
+                // notification.
+                continue;
+            };
+            let msg_alloc = FreezeMessage::new(GFP_KERNEL)?;
+            let msg = FreezeMessage::init(msg_alloc, cookie);
+            batch.push((proc, msg), GFP_KERNEL)?;
+        }
+
+        Ok(FreezeMessages { batch })
+    }
+}
+
+pub(crate) struct FreezeMessages {
+    batch: KVVec<(Arc<Process>, DLArc<FreezeMessage>)>,
+}
+
+impl FreezeMessages {
+    pub(crate) fn send_messages(self) {
+        for (proc, msg) in self.batch {
+            let _ = proc.push_work(msg);
+        }
+    }
+}
diff --git a/drivers/android/binder/node.rs b/drivers/android/binder/node.rs
new file mode 100644
index 000000000000..ade895ef791e
--- /dev/null
+++ b/drivers/android/binder/node.rs
@@ -0,0 +1,1131 @@
+// SPDX-License-Identifier: GPL-2.0
+
+// Copyright (C) 2025 Google LLC.
+
+use kernel::{
+    list::{AtomicTracker, List, ListArc, ListLinks, TryNewListArc},
+    prelude::*,
+    seq_file::SeqFile,
+    seq_print,
+    sync::lock::{spinlock::SpinLockBackend, Guard},
+    sync::{Arc, LockedBy, SpinLock},
+};
+
+use crate::{
+    defs::*,
+    error::BinderError,
+    process::{NodeRefInfo, Process, ProcessInner},
+    thread::Thread,
+    transaction::Transaction,
+    BinderReturnWriter, DArc, DLArc, DTRWrap, DeliverToRead,
+};
+
+use core::mem;
+
+mod wrapper;
+pub(crate) use self::wrapper::CritIncrWrapper;
+
+#[derive(Debug)]
+pub(crate) struct CouldNotDeliverCriticalIncrement;
+
+/// Keeps track of how this node is scheduled.
+///
+/// There are two ways to schedule a node to a work list. Just schedule the node itself, or
+/// allocate a wrapper that references the node and schedule the wrapper. These wrappers exist to
+/// make it possible to "move" a node from one list to another - when `do_work` is called directly
+/// on the `Node`, then it's a no-op if there's also a pending wrapper.
+///
+/// Wrappers are generally only needed for zero-to-one refcount increments, and there are two cases
+/// of this: weak increments and strong increments. We call such increments "critical" because it
+/// is critical that they are delivered to the thread doing the increment. Some examples:
+///
+/// * One thread makes a zero-to-one strong increment, and another thread makes a zero-to-one weak
+///   increment. Delivering the node to the thread doing the weak increment is wrong, since the
+///   thread doing the strong increment may have ended a long time ago when the command is actually
+///   processed by userspace.
+///
+/// * We have a weak reference and are about to drop it on one thread. But then another thread does
+///   a zero-to-one strong increment. If the strong increment gets sent to the thread that was
+///   about to drop the weak reference, then the strong increment could be processed after the
+///   other thread has already exited, which would be too late.
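+///
+/// (Reconstructed summary, not from the original: in both examples the `Node` is already
+/// queued for one thread when a second "critical" increment arrives, so the second increment
+/// is delivered through a separately allocated wrapper instead of reusing the queued `Node`.)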
+/// +/// Note that trying to create a `ListArc` to the node can succeed even if `has_normal_push` is +/// set. This is because another thread might just have popped the node from a todo list, but not +/// yet called `do_work`. However, if `has_normal_push` is false, then creating a `ListArc` should +/// always succeed. +/// +/// Like the other fields in `NodeInner`, the delivery state is protected by the process lock. +struct DeliveryState { + /// Is the `Node` currently scheduled? + has_pushed_node: bool, + + /// Is a wrapper currently scheduled? + /// + /// The wrapper is used only for strong zero2one increments. + has_pushed_wrapper: bool, + + /// Is the currently scheduled `Node` scheduled due to a weak zero2one increment? + /// + /// Weak zero2one operations are always scheduled using the `Node`. + has_weak_zero2one: bool, + + /// Is the currently scheduled wrapper/`Node` scheduled due to a strong zero2one increment? + /// + /// If `has_pushed_wrapper` is set, then the strong zero2one increment was scheduled using the + /// wrapper. Otherwise, `has_pushed_node` must be set and it was scheduled using the `Node`. + has_strong_zero2one: bool, +} + +impl DeliveryState { + fn should_normal_push(&self) -> bool { + !self.has_pushed_node && !self.has_pushed_wrapper + } + + fn did_normal_push(&mut self) { + assert!(self.should_normal_push()); + self.has_pushed_node = true; + } + + fn should_push_weak_zero2one(&self) -> bool { + !self.has_weak_zero2one && !self.has_strong_zero2one + } + + fn can_push_weak_zero2one_normally(&self) -> bool { + !self.has_pushed_node + } + + fn did_push_weak_zero2one(&mut self) { + assert!(self.should_push_weak_zero2one()); + assert!(self.can_push_weak_zero2one_normally()); + self.has_pushed_node = true; + self.has_weak_zero2one = true; + } + + fn should_push_strong_zero2one(&self) -> bool { + !self.has_strong_zero2one + } + + fn can_push_strong_zero2one_normally(&self) -> bool { + !self.has_pushed_node + } + + fn did_push_strong_zero2one(&mut self) { + assert!(self.should_push_strong_zero2one()); + assert!(self.can_push_strong_zero2one_normally()); + self.has_pushed_node = true; + self.has_strong_zero2one = true; + } + + fn did_push_strong_zero2one_wrapper(&mut self) { + assert!(self.should_push_strong_zero2one()); + assert!(!self.can_push_strong_zero2one_normally()); + self.has_pushed_wrapper = true; + self.has_strong_zero2one = true; + } +} + +struct CountState { + /// The reference count. + count: usize, + /// Whether the process that owns this node thinks that we hold a refcount on it. (Note that + /// even if count is greater than one, we only increment it once in the owning process.) + has_count: bool, +} + +impl CountState { + fn new() -> Self { + Self { + count: 0, + has_count: false, + } + } +} + +struct NodeInner { + /// Strong refcounts held on this node by `NodeRef` objects. + strong: CountState, + /// Weak refcounts held on this node by `NodeRef` objects. + weak: CountState, + delivery_state: DeliveryState, + /// The binder driver guarantees that oneway transactions sent to the same node are serialized, + /// that is, userspace will not be given the next one until it has finished processing the + /// previous oneway transaction. This is done to avoid the case where two oneway transactions + /// arrive in opposite order from the order in which they were sent. (E.g., they could be + /// delivered to two different threads, which could appear as-if they were sent in opposite + /// order.) 
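+    ///
+    /// (Concrete instance of the problem, paraphrased: userspace sends oneway A then oneway B
+    /// to one node; A and B land on two different threads, and B's thread runs first, so the
+    /// handler observes B before A.)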
+    ///
+    /// To fix that, we store pending oneway transactions in a separate list in the node, and don't
+    /// deliver the next oneway transaction until userspace signals that it has finished processing
+    /// the previous oneway transaction by calling the `BC_FREE_BUFFER` ioctl.
+    oneway_todo: List<DTRWrap<Transaction>>,
+    /// Keeps track of whether this node has a pending oneway transaction.
+    ///
+    /// When this is true, incoming oneway transactions are stored in `oneway_todo`, instead of
+    /// being delivered directly to the process.
+    has_oneway_transaction: bool,
+    /// List of processes to deliver a notification to when this node is destroyed (usually due to
+    /// the process dying).
+    death_list: List<DTRWrap<NodeDeath>, 1>,
+    /// List of processes to deliver freeze notifications to.
+    freeze_list: KVVec<Arc<Process>>,
+    /// The number of active BR_INCREFS or BR_ACQUIRE operations (should be at most two).
+    ///
+    /// If this is non-zero, then we postpone any BR_RELEASE or BR_DECREFS notifications until the
+    /// active operations have ended. This avoids the situation where an increment and a decrement
+    /// get reordered from userspace's perspective.
+    active_inc_refs: u8,
+    /// List of `NodeRefInfo` objects that reference this node.
+    refs: List<NodeRefInfo>,
+}
+
+#[pin_data]
+pub(crate) struct Node {
+    pub(crate) debug_id: usize,
+    ptr: u64,
+    pub(crate) cookie: u64,
+    pub(crate) flags: u32,
+    pub(crate) owner: Arc<Process>,
+    inner: LockedBy<NodeInner>,
+    #[pin]
+    links_track: AtomicTracker,
+}
+
+kernel::list::impl_list_arc_safe! {
+    impl ListArcSafe<0> for Node {
+        tracked_by links_track: AtomicTracker;
+    }
+}
+
+// Make `oneway_todo` work.
+kernel::list::impl_list_item! {
+    impl ListItem<0> for DTRWrap<Transaction> {
+        using ListLinks { self.links.inner };
+    }
+}
+
+impl Node {
+    pub(crate) fn new(
+        ptr: u64,
+        cookie: u64,
+        flags: u32,
+        owner: Arc<Process>,
+    ) -> impl PinInit<Self> {
+        pin_init!(Self {
+            inner: LockedBy::new(
+                &owner.inner,
+                NodeInner {
+                    strong: CountState::new(),
+                    weak: CountState::new(),
+                    delivery_state: DeliveryState {
+                        has_pushed_node: false,
+                        has_pushed_wrapper: false,
+                        has_weak_zero2one: false,
+                        has_strong_zero2one: false,
+                    },
+                    death_list: List::new(),
+                    oneway_todo: List::new(),
+                    freeze_list: KVVec::new(),
+                    has_oneway_transaction: false,
+                    active_inc_refs: 0,
+                    refs: List::new(),
+                },
+            ),
+            debug_id: super::next_debug_id(),
+            ptr,
+            cookie,
+            flags,
+            owner,
+            links_track <- AtomicTracker::new(),
+        })
+    }
+
+    pub(crate) fn has_oneway_transaction(&self, owner_inner: &mut ProcessInner) -> bool {
+        let inner = self.inner.access_mut(owner_inner);
+        inner.has_oneway_transaction
+    }
+
+    #[inline(never)]
+    pub(crate) fn full_debug_print(
+        &self,
+        m: &SeqFile,
+        owner_inner: &mut ProcessInner,
+    ) -> Result<()> {
+        let inner = self.inner.access_mut(owner_inner);
+        seq_print!(
+            m,
+            "  node {}: u{:016x} c{:016x} hs {} hw {} cs {} cw {}",
+            self.debug_id,
+            self.ptr,
+            self.cookie,
+            inner.strong.has_count,
+            inner.weak.has_count,
+            inner.strong.count,
+            inner.weak.count,
+        );
+        if !inner.refs.is_empty() {
+            seq_print!(m, " proc");
+            for node_ref in &inner.refs {
+                seq_print!(m, " {}", node_ref.process.task.pid());
+            }
+        }
+        seq_print!(m, "\n");
+        for t in &inner.oneway_todo {
+            t.debug_print_inner(m, "    pending async transaction ");
+        }
+        Ok(())
+    }
+
+    /// Insert the `NodeRef` into this `refs` list.
+    ///
+    /// # Safety
+    ///
+    /// It must be the case that `info.node_ref.node` is this node.
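+    /// (Otherwise this would insert `info` into the `refs` list of the wrong node, and the
+    /// matching `remove_node_info` call would later search the wrong list.)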
+ pub(crate) unsafe fn insert_node_info( + &self, + info: ListArc, + ) { + self.inner + .access_mut(&mut self.owner.inner.lock()) + .refs + .push_front(info); + } + + /// Insert the `NodeRef` into this `refs` list. + /// + /// # Safety + /// + /// It must be the case that `info.node_ref.node` is this node. + pub(crate) unsafe fn remove_node_info( + &self, + info: &NodeRefInfo, + ) -> Option> { + // SAFETY: We always insert `NodeRefInfo` objects into the `refs` list of the node that it + // references in `info.node_ref.node`. That is this node, so `info` cannot possibly be in + // the `refs` list of another node. + unsafe { + self.inner + .access_mut(&mut self.owner.inner.lock()) + .refs + .remove(info) + } + } + + /// An id that is unique across all binder nodes on the system. Used as the key in the + /// `by_node` map. + pub(crate) fn global_id(&self) -> usize { + self as *const Node as usize + } + + pub(crate) fn get_id(&self) -> (u64, u64) { + (self.ptr, self.cookie) + } + + pub(crate) fn add_death( + &self, + death: ListArc, 1>, + guard: &mut Guard<'_, ProcessInner, SpinLockBackend>, + ) { + self.inner.access_mut(guard).death_list.push_back(death); + } + + pub(crate) fn inc_ref_done_locked( + self: &DArc, + _strong: bool, + owner_inner: &mut ProcessInner, + ) -> Option> { + let inner = self.inner.access_mut(owner_inner); + if inner.active_inc_refs == 0 { + pr_err!("inc_ref_done called when no active inc_refs"); + return None; + } + + inner.active_inc_refs -= 1; + if inner.active_inc_refs == 0 { + // Having active inc_refs can inhibit dropping of ref-counts. Calculate whether we + // would send a refcount decrement, and if so, tell the caller to schedule us. + let strong = inner.strong.count > 0; + let has_strong = inner.strong.has_count; + let weak = strong || inner.weak.count > 0; + let has_weak = inner.weak.has_count; + + let should_drop_weak = !weak && has_weak; + let should_drop_strong = !strong && has_strong; + + // If we want to drop the ref-count again, tell the caller to schedule a work node for + // that. + let need_push = should_drop_weak || should_drop_strong; + + if need_push && inner.delivery_state.should_normal_push() { + let list_arc = ListArc::try_from_arc(self.clone()).ok().unwrap(); + inner.delivery_state.did_normal_push(); + Some(list_arc) + } else { + None + } + } else { + None + } + } + + pub(crate) fn update_refcount_locked( + self: &DArc, + inc: bool, + strong: bool, + count: usize, + owner_inner: &mut ProcessInner, + ) -> Option> { + let is_dead = owner_inner.is_dead; + let inner = self.inner.access_mut(owner_inner); + + // Get a reference to the state we'll update. + let state = if strong { + &mut inner.strong + } else { + &mut inner.weak + }; + + // Update the count and determine whether we need to push work. + let need_push = if inc { + state.count += count; + // TODO: This method shouldn't be used for zero-to-one increments. 
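+            // (Reading of the condition below: push work only if the owner is alive and
+            // userspace has not yet been told that we hold a count; `do_work_locked` then
+            // emits the matching BR_INCREFS/BR_ACQUIRE.)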
+ !is_dead && !state.has_count + } else { + if state.count < count { + pr_err!("Failure: refcount underflow!"); + return None; + } + state.count -= count; + !is_dead && state.count == 0 && state.has_count + }; + + if need_push && inner.delivery_state.should_normal_push() { + let list_arc = ListArc::try_from_arc(self.clone()).ok().unwrap(); + inner.delivery_state.did_normal_push(); + Some(list_arc) + } else { + None + } + } + + pub(crate) fn incr_refcount_allow_zero2one( + self: &DArc, + strong: bool, + owner_inner: &mut ProcessInner, + ) -> Result>, CouldNotDeliverCriticalIncrement> { + let is_dead = owner_inner.is_dead; + let inner = self.inner.access_mut(owner_inner); + + // Get a reference to the state we'll update. + let state = if strong { + &mut inner.strong + } else { + &mut inner.weak + }; + + // Update the count and determine whether we need to push work. + state.count += 1; + if is_dead || state.has_count { + return Ok(None); + } + + // Userspace needs to be notified of this. + if !strong && inner.delivery_state.should_push_weak_zero2one() { + assert!(inner.delivery_state.can_push_weak_zero2one_normally()); + let list_arc = ListArc::try_from_arc(self.clone()).ok().unwrap(); + inner.delivery_state.did_push_weak_zero2one(); + Ok(Some(list_arc)) + } else if strong && inner.delivery_state.should_push_strong_zero2one() { + if inner.delivery_state.can_push_strong_zero2one_normally() { + let list_arc = ListArc::try_from_arc(self.clone()).ok().unwrap(); + inner.delivery_state.did_push_strong_zero2one(); + Ok(Some(list_arc)) + } else { + state.count -= 1; + Err(CouldNotDeliverCriticalIncrement) + } + } else { + // Work is already pushed, and we don't need to push again. + Ok(None) + } + } + + pub(crate) fn incr_refcount_allow_zero2one_with_wrapper( + self: &DArc, + strong: bool, + wrapper: CritIncrWrapper, + owner_inner: &mut ProcessInner, + ) -> Option> { + match self.incr_refcount_allow_zero2one(strong, owner_inner) { + Ok(Some(node)) => Some(node as _), + Ok(None) => None, + Err(CouldNotDeliverCriticalIncrement) => { + assert!(strong); + let inner = self.inner.access_mut(owner_inner); + inner.strong.count += 1; + inner.delivery_state.did_push_strong_zero2one_wrapper(); + Some(wrapper.init(self.clone())) + } + } + } + + pub(crate) fn update_refcount(self: &DArc, inc: bool, count: usize, strong: bool) { + self.owner + .inner + .lock() + .update_node_refcount(self, inc, strong, count, None); + } + + pub(crate) fn populate_counts( + &self, + out: &mut BinderNodeInfoForRef, + guard: &Guard<'_, ProcessInner, SpinLockBackend>, + ) { + let inner = self.inner.access(guard); + out.strong_count = inner.strong.count as _; + out.weak_count = inner.weak.count as _; + } + + pub(crate) fn populate_debug_info( + &self, + out: &mut BinderNodeDebugInfo, + guard: &Guard<'_, ProcessInner, SpinLockBackend>, + ) { + out.ptr = self.ptr as _; + out.cookie = self.cookie as _; + let inner = self.inner.access(guard); + if inner.strong.has_count { + out.has_strong_ref = 1; + } + if inner.weak.has_count { + out.has_weak_ref = 1; + } + } + + pub(crate) fn force_has_count(&self, guard: &mut Guard<'_, ProcessInner, SpinLockBackend>) { + let inner = self.inner.access_mut(guard); + inner.strong.has_count = true; + inner.weak.has_count = true; + } + + fn write(&self, writer: &mut BinderReturnWriter<'_>, code: u32) -> Result { + writer.write_code(code)?; + writer.write_payload(&self.ptr)?; + writer.write_payload(&self.cookie)?; + Ok(()) + } + + pub(crate) fn submit_oneway( + &self, + transaction: DLArc, + guard: &mut 
Guard<'_, ProcessInner, SpinLockBackend>, + ) -> Result<(), (BinderError, DLArc)> { + if guard.is_dead { + return Err((BinderError::new_dead(), transaction)); + } + + let inner = self.inner.access_mut(guard); + if inner.has_oneway_transaction { + inner.oneway_todo.push_back(transaction); + } else { + inner.has_oneway_transaction = true; + guard.push_work(transaction)?; + } + Ok(()) + } + + pub(crate) fn release(&self) { + let mut guard = self.owner.inner.lock(); + while let Some(work) = self.inner.access_mut(&mut guard).oneway_todo.pop_front() { + drop(guard); + work.into_arc().cancel(); + guard = self.owner.inner.lock(); + } + + let death_list = core::mem::take(&mut self.inner.access_mut(&mut guard).death_list); + drop(guard); + for death in death_list { + death.into_arc().set_dead(); + } + } + + pub(crate) fn pending_oneway_finished(&self) { + let mut guard = self.owner.inner.lock(); + if guard.is_dead { + // Cleanup will happen in `Process::deferred_release`. + return; + } + + let inner = self.inner.access_mut(&mut guard); + + let transaction = inner.oneway_todo.pop_front(); + inner.has_oneway_transaction = transaction.is_some(); + if let Some(transaction) = transaction { + match guard.push_work(transaction) { + Ok(()) => {} + Err((_err, work)) => { + // Process is dead. + // This shouldn't happen due to the `is_dead` check, but if it does, just drop + // the transaction and return. + drop(guard); + drop(work); + } + } + } + } + + /// Finds an outdated transaction that the given transaction can replace. + /// + /// If one is found, it is removed from the list and returned. + pub(crate) fn take_outdated_transaction( + &self, + new: &Transaction, + guard: &mut Guard<'_, ProcessInner, SpinLockBackend>, + ) -> Option> { + let inner = self.inner.access_mut(guard); + let mut cursor = inner.oneway_todo.cursor_front(); + while let Some(next) = cursor.peek_next() { + if new.can_replace(&next) { + return Some(next.remove()); + } + cursor.move_next(); + } + None + } + + /// This is split into a separate function since it's called by both `Node::do_work` and + /// `NodeWrapper::do_work`. + fn do_work_locked( + &self, + writer: &mut BinderReturnWriter<'_>, + mut guard: Guard<'_, ProcessInner, SpinLockBackend>, + ) -> Result { + let inner = self.inner.access_mut(&mut guard); + let strong = inner.strong.count > 0; + let has_strong = inner.strong.has_count; + let weak = strong || inner.weak.count > 0; + let has_weak = inner.weak.has_count; + + if weak && !has_weak { + inner.weak.has_count = true; + inner.active_inc_refs += 1; + } + + if strong && !has_strong { + inner.strong.has_count = true; + inner.active_inc_refs += 1; + } + + let no_active_inc_refs = inner.active_inc_refs == 0; + let should_drop_weak = no_active_inc_refs && (!weak && has_weak); + let should_drop_strong = no_active_inc_refs && (!strong && has_strong); + if should_drop_weak { + inner.weak.has_count = false; + } + if should_drop_strong { + inner.strong.has_count = false; + } + if no_active_inc_refs && !weak { + // Remove the node if there are no references to it. 
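+            // (At this point there is no strong or weak count left and no BR_INCREFS/BR_ACQUIRE
+            // acknowledgement is outstanding, so the owner can drop the node entirely.)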
+            guard.remove_node(self.ptr);
+        }
+        drop(guard);
+
+        if weak && !has_weak {
+            self.write(writer, BR_INCREFS)?;
+        }
+        if strong && !has_strong {
+            self.write(writer, BR_ACQUIRE)?;
+        }
+        if should_drop_strong {
+            self.write(writer, BR_RELEASE)?;
+        }
+        if should_drop_weak {
+            self.write(writer, BR_DECREFS)?;
+        }
+
+        Ok(true)
+    }
+
+    pub(crate) fn add_freeze_listener(
+        &self,
+        process: &Arc<Process>,
+        flags: kernel::alloc::Flags,
+    ) -> Result {
+        let mut vec_alloc = KVVec::<Arc<Process>>::new();
+        loop {
+            let mut guard = self.owner.inner.lock();
+            // Do not check for `guard.dead`. The `dead` flag that matters here is the owner of the
+            // listener, not the target.
+            let inner = self.inner.access_mut(&mut guard);
+            let len = inner.freeze_list.len();
+            if len >= inner.freeze_list.capacity() {
+                if len >= vec_alloc.capacity() {
+                    drop(guard);
+                    vec_alloc = KVVec::with_capacity((1 + len).next_power_of_two(), flags)?;
+                    continue;
+                }
+                mem::swap(&mut inner.freeze_list, &mut vec_alloc);
+                for elem in vec_alloc.drain_all() {
+                    inner.freeze_list.push_within_capacity(elem)?;
+                }
+            }
+            inner.freeze_list.push_within_capacity(process.clone())?;
+            return Ok(());
+        }
+    }
+
+    pub(crate) fn remove_freeze_listener(&self, p: &Arc<Process>) {
+        let _unused_capacity;
+        let mut guard = self.owner.inner.lock();
+        let inner = self.inner.access_mut(&mut guard);
+        let len = inner.freeze_list.len();
+        inner.freeze_list.retain(|proc| !Arc::ptr_eq(proc, p));
+        if len == inner.freeze_list.len() {
+            pr_warn!(
+                "Could not remove freeze listener for {}\n",
+                p.pid_in_current_ns()
+            );
+        }
+        if inner.freeze_list.is_empty() {
+            _unused_capacity = mem::replace(&mut inner.freeze_list, KVVec::new());
+        }
+    }
+
+    pub(crate) fn freeze_list<'a>(&'a self, guard: &'a ProcessInner) -> &'a [Arc<Process>] {
+        &self.inner.access(guard).freeze_list
+    }
+}
+
+impl DeliverToRead for Node {
+    fn do_work(
+        self: DArc<Self>,
+        _thread: &Thread,
+        writer: &mut BinderReturnWriter<'_>,
+    ) -> Result<bool> {
+        let mut owner_inner = self.owner.inner.lock();
+        let inner = self.inner.access_mut(&mut owner_inner);
+
+        assert!(inner.delivery_state.has_pushed_node);
+        if inner.delivery_state.has_pushed_wrapper {
+            // If the wrapper is scheduled, then we are either a normal push or weak zero2one
+            // increment, and the wrapper is a strong zero2one increment, so the wrapper always
+            // takes precedence over us.
+            assert!(inner.delivery_state.has_strong_zero2one);
+            inner.delivery_state.has_pushed_node = false;
+            inner.delivery_state.has_weak_zero2one = false;
+            return Ok(true);
+        }
+
+        inner.delivery_state.has_pushed_node = false;
+        inner.delivery_state.has_weak_zero2one = false;
+        inner.delivery_state.has_strong_zero2one = false;
+
+        self.do_work_locked(writer, owner_inner)
+    }
+
+    fn cancel(self: DArc<Self>) {}
+
+    fn should_sync_wakeup(&self) -> bool {
+        false
+    }
+
+    #[inline(never)]
+    fn debug_print(&self, m: &SeqFile, prefix: &str, _tprefix: &str) -> Result<()> {
+        seq_print!(
+            m,
+            "{}node work {}: u{:016x} c{:016x}\n",
+            prefix,
+            self.debug_id,
+            self.ptr,
+            self.cookie,
+        );
+        Ok(())
+    }
+}
+
+/// Represents something that holds one or more ref-counts to a `Node`.
+///
+/// Whenever process A holds a refcount to a node owned by a different process B, then process A
+/// will store a `NodeRef` that refers to the `Node` in process B. When process A releases the
+/// refcount, we destroy the NodeRef, which decrements the ref-count on the `Node` in process B.
+///
+/// This type is also used for some other cases.
For example, a transaction allocation holds a +/// refcount on the target node, and this is implemented by storing a `NodeRef` in the allocation +/// so that the destructor of the allocation will drop a refcount of the `Node`. +pub(crate) struct NodeRef { + pub(crate) node: DArc, + /// How many times does this NodeRef hold a refcount on the Node? + strong_node_count: usize, + weak_node_count: usize, + /// How many times does userspace hold a refcount on this NodeRef? + strong_count: usize, + weak_count: usize, +} + +impl NodeRef { + pub(crate) fn new(node: DArc, strong_count: usize, weak_count: usize) -> Self { + Self { + node, + strong_node_count: strong_count, + weak_node_count: weak_count, + strong_count, + weak_count, + } + } + + pub(crate) fn absorb(&mut self, mut other: Self) { + assert!( + Arc::ptr_eq(&self.node, &other.node), + "absorb called with differing nodes" + ); + self.strong_node_count += other.strong_node_count; + self.weak_node_count += other.weak_node_count; + self.strong_count += other.strong_count; + self.weak_count += other.weak_count; + other.strong_count = 0; + other.weak_count = 0; + other.strong_node_count = 0; + other.weak_node_count = 0; + + if self.strong_node_count >= 2 || self.weak_node_count >= 2 { + let mut guard = self.node.owner.inner.lock(); + let inner = self.node.inner.access_mut(&mut guard); + + if self.strong_node_count >= 2 { + inner.strong.count -= self.strong_node_count - 1; + self.strong_node_count = 1; + assert_ne!(inner.strong.count, 0); + } + if self.weak_node_count >= 2 { + inner.weak.count -= self.weak_node_count - 1; + self.weak_node_count = 1; + assert_ne!(inner.weak.count, 0); + } + } + } + + pub(crate) fn get_count(&self) -> (usize, usize) { + (self.strong_count, self.weak_count) + } + + pub(crate) fn clone(&self, strong: bool) -> Result { + if strong && self.strong_count == 0 { + return Err(EINVAL); + } + Ok(self + .node + .owner + .inner + .lock() + .new_node_ref(self.node.clone(), strong, None)) + } + + /// Updates (increments or decrements) the number of references held against the node. If the + /// count being updated transitions from 0 to 1 or from 1 to 0, the node is notified by having + /// its `update_refcount` function called. + /// + /// Returns whether `self` should be removed (when both counts are zero). + pub(crate) fn update(&mut self, inc: bool, strong: bool) -> bool { + if strong && self.strong_count == 0 { + return false; + } + let (count, node_count, other_count) = if strong { + ( + &mut self.strong_count, + &mut self.strong_node_count, + self.weak_count, + ) + } else { + ( + &mut self.weak_count, + &mut self.weak_node_count, + self.strong_count, + ) + }; + if inc { + if *count == 0 { + *node_count = 1; + self.node.update_refcount(true, 1, strong); + } + *count += 1; + } else { + if *count == 0 { + pr_warn!( + "pid {} performed invalid decrement on ref\n", + kernel::current!().pid() + ); + return false; + } + *count -= 1; + if *count == 0 { + self.node.update_refcount(false, *node_count, strong); + *node_count = 0; + return other_count == 0; + } + } + false + } +} + +impl Drop for NodeRef { + // This destructor is called conditionally from `Allocation::drop`. That branch is often + // mispredicted. Inlining this method call reduces the cost of those branch mispredictions. 
+    #[inline(always)]
+    fn drop(&mut self) {
+        if self.strong_node_count > 0 {
+            self.node
+                .update_refcount(false, self.strong_node_count, true);
+        }
+        if self.weak_node_count > 0 {
+            self.node
+                .update_refcount(false, self.weak_node_count, false);
+        }
+    }
+}
+
+struct NodeDeathInner {
+    dead: bool,
+    cleared: bool,
+    notification_done: bool,
+    /// Indicates whether the normal flow was interrupted by removing the handle. In this case, we
+    /// need to behave as if the death notification didn't exist (i.e., we don't deliver anything
+    /// to the user).
+    aborted: bool,
+}
+
+/// Used to deliver notifications when a process dies.
+///
+/// A process can request to be notified when a process dies using `BC_REQUEST_DEATH_NOTIFICATION`.
+/// This will make the driver send a `BR_DEAD_BINDER` to userspace when the process dies (or
+/// immediately if it is already dead). Userspace is supposed to respond with `BC_DEAD_BINDER_DONE`
+/// once it has processed the notification.
+///
+/// Userspace can unregister from death notifications using the `BC_CLEAR_DEATH_NOTIFICATION`
+/// command. In this case, the kernel will respond with `BR_CLEAR_DEATH_NOTIFICATION_DONE` once the
+/// notification has been removed. Note that if the remote process dies before the kernel has
+/// responded with `BR_CLEAR_DEATH_NOTIFICATION_DONE`, then the kernel will still send a
+/// `BR_DEAD_BINDER`, which userspace must be able to process. In this case, the kernel will wait
+/// for the `BC_DEAD_BINDER_DONE` command before it sends `BR_CLEAR_DEATH_NOTIFICATION_DONE`.
+///
+/// Note that even if the kernel sends a `BR_DEAD_BINDER`, this does not remove the death
+/// notification. Userspace must still remove it manually using `BC_CLEAR_DEATH_NOTIFICATION`.
+///
+/// If a process uses `BC_RELEASE` to destroy its last refcount on a node that has an active death
+/// registration, then the death registration is immediately deleted (we implement this using the
+/// `aborted` field). However, userspace is not supposed to delete a `NodeRef` without first
+/// deregistering death notifications, so this codepath is not executed under normal circumstances.
+#[pin_data]
+pub(crate) struct NodeDeath {
+    node: DArc<Node>,
+    process: Arc<Process>,
+    pub(crate) cookie: u64,
+    #[pin]
+    links_track: AtomicTracker<0>,
+    /// Used by the owner `Node` to store a list of registered death notifications.
+    ///
+    /// # Invariants
+    ///
+    /// Only ever used with the `death_list` list of `self.node`.
+    #[pin]
+    death_links: ListLinks<1>,
+    /// Used by the process to keep track of the death notifications for which we have sent a
+    /// `BR_DEAD_BINDER` but not yet received a `BC_DEAD_BINDER_DONE`.
+    ///
+    /// # Invariants
+    ///
+    /// Only ever used with the `delivered_deaths` list of `self.process`.
+    #[pin]
+    delivered_links: ListLinks<2>,
+    #[pin]
+    delivered_links_track: AtomicTracker<2>,
+    #[pin]
+    inner: SpinLock<NodeDeathInner>,
+}
+
+impl NodeDeath {
+    /// Constructs a new node death notification object.
+    pub(crate) fn new(
+        node: DArc<Node>,
+        process: Arc<Process>,
+        cookie: u64,
+    ) -> impl PinInit<DTRWrap<Self>> {
+        DTRWrap::new(pin_init!(
+            Self {
+                node,
+                process,
+                cookie,
+                links_track <- AtomicTracker::new(),
+                death_links <- ListLinks::new(),
+                delivered_links <- ListLinks::new(),
+                delivered_links_track <- AtomicTracker::new(),
+                inner <- kernel::new_spinlock!(NodeDeathInner {
+                    dead: false,
+                    cleared: false,
+                    notification_done: false,
+                    aborted: false,
+                }, "NodeDeath::inner"),
+            }
+        ))
+    }
+
+    /// Sets the cleared flag to `true`.
+ /// + /// It removes `self` from the node's death notification list if needed. + /// + /// Returns whether it needs to be queued. + pub(crate) fn set_cleared(self: &DArc, abort: bool) -> bool { + let (needs_removal, needs_queueing) = { + // Update state and determine if we need to queue a work item. We only need to do it + // when the node is not dead or if the user already completed the death notification. + let mut inner = self.inner.lock(); + if abort { + inner.aborted = true; + } + if inner.cleared { + // Already cleared. + return false; + } + inner.cleared = true; + (!inner.dead, !inner.dead || inner.notification_done) + }; + + // Remove death notification from node. + if needs_removal { + let mut owner_inner = self.node.owner.inner.lock(); + let node_inner = self.node.inner.access_mut(&mut owner_inner); + // SAFETY: A `NodeDeath` is never inserted into the death list of any node other than + // its owner, so it is either in this death list or in no death list. + unsafe { node_inner.death_list.remove(self) }; + } + needs_queueing + } + + /// Sets the 'notification done' flag to `true`. + pub(crate) fn set_notification_done(self: DArc, thread: &Thread) { + let needs_queueing = { + let mut inner = self.inner.lock(); + inner.notification_done = true; + inner.cleared + }; + if needs_queueing { + if let Some(death) = ListArc::try_from_arc_or_drop(self) { + let _ = thread.push_work_if_looper(death); + } + } + } + + /// Sets the 'dead' flag to `true` and queues work item if needed. + pub(crate) fn set_dead(self: DArc) { + let needs_queueing = { + let mut inner = self.inner.lock(); + if inner.cleared { + false + } else { + inner.dead = true; + true + } + }; + if needs_queueing { + // Push the death notification to the target process. There is nothing else to do if + // it's already dead. + if let Some(death) = ListArc::try_from_arc_or_drop(self) { + let process = death.process.clone(); + let _ = process.push_work(death); + } + } + } +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for NodeDeath { + tracked_by links_track: AtomicTracker; + } +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<1> for DTRWrap { untracked; } +} +kernel::list::impl_list_item! { + impl ListItem<1> for DTRWrap { + using ListLinks { self.wrapped.death_links }; + } +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<2> for DTRWrap { + tracked_by wrapped: NodeDeath; + } +} +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<2> for NodeDeath { + tracked_by delivered_links_track: AtomicTracker<2>; + } +} +kernel::list::impl_list_item! { + impl ListItem<2> for DTRWrap { + using ListLinks { self.wrapped.delivered_links }; + } +} + +impl DeliverToRead for NodeDeath { + fn do_work( + self: DArc, + _thread: &Thread, + writer: &mut BinderReturnWriter<'_>, + ) -> Result { + let done = { + let inner = self.inner.lock(); + if inner.aborted { + return Ok(true); + } + inner.cleared && (!inner.dead || inner.notification_done) + }; + + let cookie = self.cookie; + let cmd = if done { + BR_CLEAR_DEATH_NOTIFICATION_DONE + } else { + let process = self.process.clone(); + let mut process_inner = process.inner.lock(); + let inner = self.inner.lock(); + if inner.aborted { + return Ok(true); + } + // We're still holding the inner lock, so it cannot be aborted while we insert it into + // the delivered list. 
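+            // (Queued here so that a later BC_DEAD_BINDER_DONE can find this notification and
+            // finish the handshake; see the `delivered_links` invariants above.)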
+ process_inner.death_delivered(self.clone()); + BR_DEAD_BINDER + }; + + writer.write_code(cmd)?; + writer.write_payload(&cookie)?; + // DEAD_BINDER notifications can cause transactions, so stop processing work items when we + // get to a death notification. + Ok(cmd != BR_DEAD_BINDER) + } + + fn cancel(self: DArc) {} + + fn should_sync_wakeup(&self) -> bool { + false + } + + #[inline(never)] + fn debug_print(&self, m: &SeqFile, prefix: &str, _tprefix: &str) -> Result<()> { + let inner = self.inner.lock(); + + let dead_binder = inner.dead && !inner.notification_done; + + if dead_binder { + if inner.cleared { + seq_print!(m, "{}has cleared dead binder\n", prefix); + } else { + seq_print!(m, "{}has dead binder\n", prefix); + } + } else { + seq_print!(m, "{}has cleared death notification\n", prefix); + } + + Ok(()) + } +} diff --git a/drivers/android/binder/node/wrapper.rs b/drivers/android/binder/node/wrapper.rs new file mode 100644 index 000000000000..43294c050502 --- /dev/null +++ b/drivers/android/binder/node/wrapper.rs @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::{list::ListArc, prelude::*, seq_file::SeqFile, seq_print, sync::UniqueArc}; + +use crate::{node::Node, thread::Thread, BinderReturnWriter, DArc, DLArc, DTRWrap, DeliverToRead}; + +use core::mem::MaybeUninit; + +pub(crate) struct CritIncrWrapper { + inner: UniqueArc>>, +} + +impl CritIncrWrapper { + pub(crate) fn new() -> Result { + Ok(CritIncrWrapper { + inner: UniqueArc::new_uninit(GFP_KERNEL)?, + }) + } + + pub(super) fn init(self, node: DArc) -> DLArc { + match self.inner.pin_init_with(DTRWrap::new(NodeWrapper { node })) { + Ok(initialized) => ListArc::from(initialized) as _, + Err(err) => match err {}, + } + } +} + +struct NodeWrapper { + node: DArc, +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for NodeWrapper { + untracked; + } +} + +impl DeliverToRead for NodeWrapper { + fn do_work( + self: DArc, + _thread: &Thread, + writer: &mut BinderReturnWriter<'_>, + ) -> Result { + let node = &self.node; + let mut owner_inner = node.owner.inner.lock(); + let inner = node.inner.access_mut(&mut owner_inner); + + let ds = &mut inner.delivery_state; + + assert!(ds.has_pushed_wrapper); + assert!(ds.has_strong_zero2one); + ds.has_pushed_wrapper = false; + ds.has_strong_zero2one = false; + + node.do_work_locked(writer, owner_inner) + } + + fn cancel(self: DArc) {} + + fn should_sync_wakeup(&self) -> bool { + false + } + + #[inline(never)] + fn debug_print(&self, m: &SeqFile, prefix: &str, _tprefix: &str) -> Result<()> { + seq_print!( + m, + "{}node work {}: u{:016x} c{:016x}\n", + prefix, + self.node.debug_id, + self.node.ptr, + self.node.cookie, + ); + Ok(()) + } +} diff --git a/drivers/android/binder/page_range.rs b/drivers/android/binder/page_range.rs new file mode 100644 index 000000000000..9379038f61f5 --- /dev/null +++ b/drivers/android/binder/page_range.rs @@ -0,0 +1,734 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +//! This module has utilities for managing a page range where unused pages may be reclaimed by a +//! vma shrinker. + +// To avoid deadlocks, locks are taken in the order: +// +// 1. mmap lock +// 2. spinlock +// 3. lru spinlock +// +// The shrinker will use trylock methods because it locks them in a different order. 
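+//
+// A sketch of what that means in practice (an illustration, not text from the original):
+//
+//   regular path:  mmap lock  ->  spinlock  ->  lru spinlock
+//   shrinker:      lru spinlock  ->  trylock(spinlock)  ->  trylock(mmap lock)
+//
+// Because the shrinker only *tries* the locks it acquires out of order, it can back off
+// instead of deadlocking against a thread on the regular path.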
+ +use core::{ + marker::PhantomPinned, + mem::{size_of, size_of_val, MaybeUninit}, + ptr, +}; + +use kernel::{ + bindings, + error::Result, + ffi::{c_ulong, c_void}, + mm::{virt, Mm, MmWithUser}, + new_mutex, new_spinlock, + page::{Page, PAGE_SHIFT, PAGE_SIZE}, + prelude::*, + str::CStr, + sync::{aref::ARef, Mutex, SpinLock}, + task::Pid, + transmute::FromBytes, + types::Opaque, + uaccess::UserSliceReader, +}; + +/// Represents a shrinker that can be registered with the kernel. +/// +/// Each shrinker can be used by many `ShrinkablePageRange` objects. +#[repr(C)] +pub(crate) struct Shrinker { + inner: Opaque<*mut bindings::shrinker>, + list_lru: Opaque, +} + +// SAFETY: The shrinker and list_lru are thread safe. +unsafe impl Send for Shrinker {} +// SAFETY: The shrinker and list_lru are thread safe. +unsafe impl Sync for Shrinker {} + +impl Shrinker { + /// Create a new shrinker. + /// + /// # Safety + /// + /// Before using this shrinker with a `ShrinkablePageRange`, the `register` method must have + /// been called exactly once, and it must not have returned an error. + pub(crate) const unsafe fn new() -> Self { + Self { + inner: Opaque::uninit(), + list_lru: Opaque::uninit(), + } + } + + /// Register this shrinker with the kernel. + pub(crate) fn register(&'static self, name: &CStr) -> Result<()> { + // SAFETY: These fields are not yet used, so it's okay to zero them. + unsafe { + self.inner.get().write(ptr::null_mut()); + self.list_lru.get().write_bytes(0, 1); + } + + // SAFETY: The field is not yet used, so we can initialize it. + let ret = unsafe { bindings::__list_lru_init(self.list_lru.get(), false, ptr::null_mut()) }; + if ret != 0 { + return Err(Error::from_errno(ret)); + } + + // SAFETY: The `name` points at a valid c string. + let shrinker = unsafe { bindings::shrinker_alloc(0, name.as_char_ptr()) }; + if shrinker.is_null() { + // SAFETY: We initialized it, so its okay to destroy it. + unsafe { bindings::list_lru_destroy(self.list_lru.get()) }; + return Err(Error::from_errno(ret)); + } + + // SAFETY: We're about to register the shrinker, and these are the fields we need to + // initialize. (All other fields are already zeroed.) + unsafe { + (&raw mut (*shrinker).count_objects).write(Some(rust_shrink_count)); + (&raw mut (*shrinker).scan_objects).write(Some(rust_shrink_scan)); + (&raw mut (*shrinker).private_data).write(self.list_lru.get().cast()); + } + + // SAFETY: The new shrinker has been fully initialized, so we can register it. + unsafe { bindings::shrinker_register(shrinker) }; + + // SAFETY: This initializes the pointer to the shrinker so that we can use it. + unsafe { self.inner.get().write(shrinker) }; + + Ok(()) + } +} + +/// A container that manages a page range in a vma. +/// +/// The pages can be thought of as an array of booleans of whether the pages are usable. The +/// methods `use_range` and `stop_using_range` set all booleans in a range to true or false +/// respectively. Initially, no pages are allocated. When a page is not used, it is not freed +/// immediately. Instead, it is made available to the memory shrinker to free it if the device is +/// under memory pressure. +/// +/// It's okay for `use_range` and `stop_using_range` to race with each other, although there's no +/// way to know whether an index ends up with true or false if a call to `use_range` races with +/// another call to `stop_using_range` on a given index. +/// +/// It's also okay for the two methods to race with themselves, e.g. 
if two threads call +/// `use_range` on the same index, then that's fine and neither call will return until the page is +/// allocated and mapped. +/// +/// The methods that read or write to a range require that the page is marked as in use. So it is +/// _not_ okay to call `stop_using_range` on a page that is in use by the methods that read or +/// write to the page. +#[pin_data(PinnedDrop)] +pub(crate) struct ShrinkablePageRange { + /// Shrinker object registered with the kernel. + shrinker: &'static Shrinker, + /// Pid using this page range. Only used as debugging information. + pid: Pid, + /// The mm for the relevant process. + mm: ARef, + /// Used to synchronize calls to `vm_insert_page` and `zap_page_range_single`. + #[pin] + mm_lock: Mutex<()>, + /// Spinlock protecting changes to pages. + #[pin] + lock: SpinLock, + + /// Must not move, since page info has pointers back. + #[pin] + _pin: PhantomPinned, +} + +struct Inner { + /// Array of pages. + /// + /// Since this is also accessed by the shrinker, we can't use a `Box`, which asserts exclusive + /// ownership. To deal with that, we manage it using raw pointers. + pages: *mut PageInfo, + /// Length of the `pages` array. + size: usize, + /// The address of the vma to insert the pages into. + vma_addr: usize, +} + +// SAFETY: proper locking is in place for `Inner` +unsafe impl Send for Inner {} + +type StableMmGuard = + kernel::sync::lock::Guard<'static, (), kernel::sync::lock::mutex::MutexBackend>; + +/// An array element that describes the current state of a page. +/// +/// There are three states: +/// +/// * Free. The page is None. The `lru` element is not queued. +/// * Available. The page is Some. The `lru` element is queued to the shrinker's lru. +/// * Used. The page is Some. The `lru` element is not queued. +/// +/// When an element is available, the shrinker is able to free the page. +#[repr(C)] +struct PageInfo { + lru: bindings::list_head, + page: Option, + range: *const ShrinkablePageRange, +} + +impl PageInfo { + /// # Safety + /// + /// The caller ensures that writing to `me.page` is ok, and that the page is not currently set. + unsafe fn set_page(me: *mut PageInfo, page: Page) { + // SAFETY: This pointer offset is in bounds. + let ptr = unsafe { &raw mut (*me).page }; + + // SAFETY: The pointer is valid for writing, so also valid for reading. + if unsafe { (*ptr).is_some() } { + pr_err!("set_page called when there is already a page"); + // SAFETY: We will initialize the page again below. + unsafe { ptr::drop_in_place(ptr) }; + } + + // SAFETY: The pointer is valid for writing. + unsafe { ptr::write(ptr, Some(page)) }; + } + + /// # Safety + /// + /// The caller ensures that reading from `me.page` is ok for the duration of 'a. + unsafe fn get_page<'a>(me: *const PageInfo) -> Option<&'a Page> { + // SAFETY: This pointer offset is in bounds. + let ptr = unsafe { &raw const (*me).page }; + + // SAFETY: The pointer is valid for reading. + unsafe { (*ptr).as_ref() } + } + + /// # Safety + /// + /// The caller ensures that writing to `me.page` is ok for the duration of 'a. + unsafe fn take_page(me: *mut PageInfo) -> Option { + // SAFETY: This pointer offset is in bounds. + let ptr = unsafe { &raw mut (*me).page }; + + // SAFETY: The pointer is valid for reading. + unsafe { (*ptr).take() } + } + + /// Add this page to the lru list, if not already in the list. + /// + /// # Safety + /// + /// The pointer must be valid, and it must be the right shrinker and nid. 
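The three states described above are never stored explicitly; they are encoded by whether `page` is `Some` and whether the `lru` element is queued. A standalone sketch of the intended transitions (illustrative enum only; the driver keeps no such field):

    enum PageState {
        Free,      // page: None,  lru not queued
        Available, // page: Some,  lru queued (shrinker may reclaim)
        Used,      // page: Some,  lru not queued
    }

    // `use_range`: Available -> Used via list_lru_del (fast path),
    // Free -> Used via allocate + vm_insert_page (use_page_slow).
    fn on_use(s: PageState) -> PageState {
        match s {
            PageState::Free | PageState::Available | PageState::Used => PageState::Used,
        }
    }

    // `stop_using_range`: Used -> Available via list_lru_add.
    fn on_stop_using(s: PageState) -> PageState {
        match s {
            PageState::Free => PageState::Free,
            _ => PageState::Available,
        }
    }

    // Shrinker callback: Available -> Free by taking the page and zapping it.
    fn on_shrink(s: PageState) -> PageState {
        match s {
            PageState::Available => PageState::Free,
            other => other,
        }
    }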
+ unsafe fn list_lru_add(me: *mut PageInfo, nid: i32, shrinker: &'static Shrinker) { + // SAFETY: This pointer offset is in bounds. + let lru_ptr = unsafe { &raw mut (*me).lru }; + // SAFETY: The lru pointer is valid, and we're not using it with any other lru list. + unsafe { bindings::list_lru_add(shrinker.list_lru.get(), lru_ptr, nid, ptr::null_mut()) }; + } + + /// Remove this page from the lru list, if it is in the list. + /// + /// # Safety + /// + /// The pointer must be valid, and it must be the right shrinker and nid. + unsafe fn list_lru_del(me: *mut PageInfo, nid: i32, shrinker: &'static Shrinker) { + // SAFETY: This pointer offset is in bounds. + let lru_ptr = unsafe { &raw mut (*me).lru }; + // SAFETY: The lru pointer is valid, and we're not using it with any other lru list. + unsafe { bindings::list_lru_del(shrinker.list_lru.get(), lru_ptr, nid, ptr::null_mut()) }; + } +} + +impl ShrinkablePageRange { + /// Create a new `ShrinkablePageRange` using the given shrinker. + pub(crate) fn new(shrinker: &'static Shrinker) -> impl PinInit { + try_pin_init!(Self { + shrinker, + pid: kernel::current!().pid(), + mm: ARef::from(&**kernel::current!().mm().ok_or(ESRCH)?), + mm_lock <- new_mutex!((), "ShrinkablePageRange::mm"), + lock <- new_spinlock!(Inner { + pages: ptr::null_mut(), + size: 0, + vma_addr: 0, + }, "ShrinkablePageRange"), + _pin: PhantomPinned, + }) + } + + pub(crate) fn stable_trylock_mm(&self) -> Option { + // SAFETY: This extends the duration of the reference. Since this call happens before + // `mm_lock` is taken in the destructor of `ShrinkablePageRange`, the destructor will block + // until the returned guard is dropped. This ensures that the guard is valid until dropped. + let mm_lock = unsafe { &*ptr::from_ref(&self.mm_lock) }; + + mm_lock.try_lock() + } + + /// Register a vma with this page range. Returns the size of the region. + pub(crate) fn register_with_vma(&self, vma: &virt::VmaNew) -> Result { + let num_bytes = usize::min(vma.end() - vma.start(), bindings::SZ_4M as usize); + let num_pages = num_bytes >> PAGE_SHIFT; + + if !ptr::eq::(&*self.mm, &**vma.mm()) { + pr_debug!("Failed to register with vma: invalid vma->vm_mm"); + return Err(EINVAL); + } + if num_pages == 0 { + pr_debug!("Failed to register with vma: size zero"); + return Err(EINVAL); + } + + let mut pages = KVVec::::with_capacity(num_pages, GFP_KERNEL)?; + + // SAFETY: This just initializes the pages array. + unsafe { + let self_ptr = self as *const ShrinkablePageRange; + for i in 0..num_pages { + let info = pages.as_mut_ptr().add(i); + (&raw mut (*info).range).write(self_ptr); + (&raw mut (*info).page).write(None); + let lru = &raw mut (*info).lru; + (&raw mut (*lru).next).write(lru); + (&raw mut (*lru).prev).write(lru); + } + } + + let mut inner = self.lock.lock(); + if inner.size > 0 { + pr_debug!("Failed to register with vma: already registered"); + drop(inner); + return Err(EBUSY); + } + + inner.pages = pages.into_raw_parts().0; + inner.size = num_pages; + inner.vma_addr = vma.start(); + + Ok(num_pages) + } + + /// Make sure that the given pages are allocated and mapped. + /// + /// Must not be called from an atomic context. + pub(crate) fn use_range(&self, start: usize, end: usize) -> Result<()> { + if start >= end { + return Ok(()); + } + let mut inner = self.lock.lock(); + assert!(end <= inner.size); + + for i in start..end { + // SAFETY: This pointer offset is in bounds. 
+ let page_info = unsafe { inner.pages.add(i) }; + + // SAFETY: The pointer is valid, and we hold the lock so reading from the page is okay. + if let Some(page) = unsafe { PageInfo::get_page(page_info) } { + // Since we're going to use the page, we should remove it from the lru list so that + // the shrinker will not free it. + // + // SAFETY: The pointer is valid, and this is the right shrinker. + // + // The shrinker can't free the page between the check and this call to + // `list_lru_del` because we hold the lock. + unsafe { PageInfo::list_lru_del(page_info, page.nid(), self.shrinker) }; + } else { + // We have to allocate a new page. Use the slow path. + drop(inner); + // SAFETY: `i < end <= inner.size` so `i` is in bounds. + match unsafe { self.use_page_slow(i) } { + Ok(()) => {} + Err(err) => { + pr_warn!("Error in use_page_slow: {:?}", err); + return Err(err); + } + } + inner = self.lock.lock(); + } + } + Ok(()) + } + + /// Mark the given page as in use, slow path. + /// + /// Must not be called from an atomic context. + /// + /// # Safety + /// + /// Assumes that `i` is in bounds. + #[cold] + unsafe fn use_page_slow(&self, i: usize) -> Result<()> { + let new_page = Page::alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO)?; + + let mm_mutex = self.mm_lock.lock(); + let inner = self.lock.lock(); + + // SAFETY: This pointer offset is in bounds. + let page_info = unsafe { inner.pages.add(i) }; + + // SAFETY: The pointer is valid, and we hold the lock so reading from the page is okay. + if let Some(page) = unsafe { PageInfo::get_page(page_info) } { + // The page was already there, or someone else added the page while we didn't hold the + // spinlock. + // + // SAFETY: The pointer is valid, and this is the right shrinker. + // + // The shrinker can't free the page between the check and this call to + // `list_lru_del` because we hold the lock. + unsafe { PageInfo::list_lru_del(page_info, page.nid(), self.shrinker) }; + return Ok(()); + } + + let vma_addr = inner.vma_addr; + // Release the spinlock while we insert the page into the vma. + drop(inner); + + // No overflow since we stay in bounds of the vma. + let user_page_addr = vma_addr + (i << PAGE_SHIFT); + + // We use `mmput_async` when dropping the `mm` because `use_page_slow` is usually used from + // a remote process. If the call to `mmput` races with the process shutting down, then the + // caller of `use_page_slow` becomes responsible for cleaning up the `mm`, which doesn't + // happen until it returns to userspace. However, the caller might instead go to sleep and + // wait for the owner of the `mm` to wake it up, which doesn't happen because it's in the + // middle of a shutdown process that won't complete until the `mm` is dropped. This can + // amount to a deadlock. + // + // Using `mmput_async` avoids this, because then the `mm` cleanup is instead queued to a + // workqueue. + MmWithUser::into_mmput_async(self.mm.mmget_not_zero().ok_or(ESRCH)?) + .mmap_read_lock() + .vma_lookup(vma_addr) + .ok_or(ESRCH)? + .as_mixedmap_vma() + .ok_or(ESRCH)? + .vm_insert_page(user_page_addr, &new_page) + .inspect_err(|err| { + pr_warn!( + "Failed to vm_insert_page({}): vma_addr:{} i:{} err:{:?}", + user_page_addr, + vma_addr, + i, + err + ) + })?; + + let inner = self.lock.lock(); + + // SAFETY: The `page_info` pointer is valid and currently does not have a page. The page + // can be written to since we hold the lock. 
+ // + // We released and reacquired the spinlock since we checked that the page is null, but we + // always hold the mm_lock mutex when setting the page to a non-null value, so it's not + // possible for someone else to have changed it since our check. + unsafe { PageInfo::set_page(page_info, new_page) }; + + drop(inner); + drop(mm_mutex); + + Ok(()) + } + + /// If the given page is in use, then mark it as available so that the shrinker can free it. + /// + /// May be called from an atomic context. + pub(crate) fn stop_using_range(&self, start: usize, end: usize) { + if start >= end { + return; + } + let inner = self.lock.lock(); + assert!(end <= inner.size); + + for i in (start..end).rev() { + // SAFETY: The pointer is in bounds. + let page_info = unsafe { inner.pages.add(i) }; + + // SAFETY: Okay for reading since we have the lock. + if let Some(page) = unsafe { PageInfo::get_page(page_info) } { + // SAFETY: The pointer is valid, and it's the right shrinker. + unsafe { PageInfo::list_lru_add(page_info, page.nid(), self.shrinker) }; + } + } + } + + /// Helper for reading or writing to a range of bytes that may overlap with several pages. + /// + /// # Safety + /// + /// All pages touched by this operation must be in use for the duration of this call. + unsafe fn iterate(&self, mut offset: usize, mut size: usize, mut cb: T) -> Result + where + T: FnMut(&Page, usize, usize) -> Result, + { + if size == 0 { + return Ok(()); + } + + let (pages, num_pages) = { + let inner = self.lock.lock(); + (inner.pages, inner.size) + }; + let num_bytes = num_pages << PAGE_SHIFT; + + // Check that the request is within the buffer. + if offset.checked_add(size).ok_or(EFAULT)? > num_bytes { + return Err(EFAULT); + } + + let mut page_index = offset >> PAGE_SHIFT; + offset &= PAGE_SIZE - 1; + while size > 0 { + let available = usize::min(size, PAGE_SIZE - offset); + // SAFETY: The pointer is in bounds. + let page_info = unsafe { pages.add(page_index) }; + // SAFETY: The caller guarantees that this page is in the "in use" state for the + // duration of this call to `iterate`, so nobody will change the page. + let page = unsafe { PageInfo::get_page(page_info) }; + if page.is_none() { + pr_warn!("Page is null!"); + } + let page = page.ok_or(EFAULT)?; + cb(page, offset, available)?; + size -= available; + page_index += 1; + offset = 0; + } + Ok(()) + } + + /// Copy from userspace into this page range. + /// + /// # Safety + /// + /// All pages touched by this operation must be in use for the duration of this call. + pub(crate) unsafe fn copy_from_user_slice( + &self, + reader: &mut UserSliceReader, + offset: usize, + size: usize, + ) -> Result { + // SAFETY: `self.iterate` has the same safety requirements as `copy_from_user_slice`. + unsafe { + self.iterate(offset, size, |page, offset, to_copy| { + page.copy_from_user_slice_raw(reader, offset, to_copy) + }) + } + } + + /// Copy from this page range into kernel space. + /// + /// # Safety + /// + /// All pages touched by this operation must be in use for the duration of this call. + pub(crate) unsafe fn read(&self, offset: usize) -> Result { + let mut out = MaybeUninit::::uninit(); + let mut out_offset = 0; + // SAFETY: `self.iterate` has the same safety requirements as `read`. + unsafe { + self.iterate(offset, size_of::(), |page, offset, to_copy| { + // SAFETY: The sum of `offset` and `to_copy` is bounded by the size of T. 
+ let obj_ptr = (out.as_mut_ptr() as *mut u8).add(out_offset); + // SAFETY: The pointer points is in-bounds of the `out` variable, so it is valid. + page.read_raw(obj_ptr, offset, to_copy)?; + out_offset += to_copy; + Ok(()) + })?; + } + // SAFETY: We just initialised the data. + Ok(unsafe { out.assume_init() }) + } + + /// Copy from kernel space into this page range. + /// + /// # Safety + /// + /// All pages touched by this operation must be in use for the duration of this call. + pub(crate) unsafe fn write(&self, offset: usize, obj: &T) -> Result { + let mut obj_offset = 0; + // SAFETY: `self.iterate` has the same safety requirements as `write`. + unsafe { + self.iterate(offset, size_of_val(obj), |page, offset, to_copy| { + // SAFETY: The sum of `offset` and `to_copy` is bounded by the size of T. + let obj_ptr = (obj as *const T as *const u8).add(obj_offset); + // SAFETY: We have a reference to the object, so the pointer is valid. + page.write_raw(obj_ptr, offset, to_copy)?; + obj_offset += to_copy; + Ok(()) + }) + } + } + + /// Write zeroes to the given range. + /// + /// # Safety + /// + /// All pages touched by this operation must be in use for the duration of this call. + pub(crate) unsafe fn fill_zero(&self, offset: usize, size: usize) -> Result { + // SAFETY: `self.iterate` has the same safety requirements as `copy_into`. + unsafe { + self.iterate(offset, size, |page, offset, len| { + page.fill_zero_raw(offset, len) + }) + } + } +} + +#[pinned_drop] +impl PinnedDrop for ShrinkablePageRange { + fn drop(self: Pin<&mut Self>) { + let (pages, size) = { + let lock = self.lock.lock(); + (lock.pages, lock.size) + }; + + if size == 0 { + return; + } + + // Note: This call is also necessary for the safety of `stable_trylock_mm`. + let mm_lock = self.mm_lock.lock(); + + // This is the destructor, so unlike the other methods, we only need to worry about races + // with the shrinker here. Since we hold the `mm_lock`, we also can't race with the + // shrinker, and after this loop, the shrinker will not access any of our pages since we + // removed them from the lru list. + for i in 0..size { + // SAFETY: Loop is in-bounds of the size. + let p_ptr = unsafe { pages.add(i) }; + // SAFETY: No other readers, so we can read. + if let Some(p) = unsafe { PageInfo::get_page(p_ptr) } { + // SAFETY: The pointer is valid and it's the right shrinker. + unsafe { PageInfo::list_lru_del(p_ptr, p.nid(), self.shrinker) }; + } + } + + drop(mm_lock); + + // SAFETY: `pages` was allocated as an `KVVec` with capacity `size`. Furthermore, + // all `size` elements are initialized. Also, the array is no longer shared with the + // shrinker due to the above loop. + drop(unsafe { KVVec::from_raw_parts(pages, size, size) }); + } +} + +/// # Safety +/// Called by the shrinker. +#[no_mangle] +unsafe extern "C" fn rust_shrink_count( + shrink: *mut bindings::shrinker, + _sc: *mut bindings::shrink_control, +) -> c_ulong { + // SAFETY: We can access our own private data. + let list_lru = unsafe { (*shrink).private_data.cast::() }; + // SAFETY: Accessing the lru list is okay. Just an FFI call. + unsafe { bindings::list_lru_count(list_lru) } +} + +/// # Safety +/// Called by the shrinker. +#[no_mangle] +unsafe extern "C" fn rust_shrink_scan( + shrink: *mut bindings::shrinker, + sc: *mut bindings::shrink_control, +) -> c_ulong { + // SAFETY: We can access our own private data. + let list_lru = unsafe { (*shrink).private_data.cast::() }; + // SAFETY: Caller guarantees that it is safe to read this field. 
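The page-by-page chunking that drives `iterate`, `read`, `write`, and `fill_zero` is plain index arithmetic and can be sanity-checked in isolation. A userspace sketch assuming 4 KiB pages (`chunks` is a hypothetical helper mirroring the loop in `iterate`):

    const PAGE_SIZE: usize = 4096;
    const PAGE_SHIFT: usize = 12;

    // Split a byte range into (page_index, in_page_offset, len) chunks, the
    // same way the `while size > 0` loop in `iterate` walks the pages array.
    fn chunks(mut offset: usize, mut size: usize) -> Vec<(usize, usize, usize)> {
        let mut out = Vec::new();
        let mut page_index = offset >> PAGE_SHIFT;
        offset &= PAGE_SIZE - 1;
        while size > 0 {
            let available = usize::min(size, PAGE_SIZE - offset);
            out.push((page_index, offset, available));
            size -= available;
            page_index += 1;
            offset = 0; // only the first chunk starts mid-page
        }
        out
    }

    // A 300-byte object at offset 4000 straddles two pages:
    // chunks(4000, 300) == [(0, 4000, 96), (1, 0, 204)]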
+ let nr_to_scan = unsafe { (*sc).nr_to_scan }; + // SAFETY: Accessing the lru list is okay. Just an FFI call. + unsafe { + bindings::list_lru_walk( + list_lru, + Some(bindings::rust_shrink_free_page_wrap), + ptr::null_mut(), + nr_to_scan, + ) + } +} + +const LRU_SKIP: bindings::lru_status = bindings::lru_status_LRU_SKIP; +const LRU_REMOVED_ENTRY: bindings::lru_status = bindings::lru_status_LRU_REMOVED_RETRY; + +/// # Safety +/// Called by the shrinker. +#[no_mangle] +unsafe extern "C" fn rust_shrink_free_page( + item: *mut bindings::list_head, + lru: *mut bindings::list_lru_one, + _cb_arg: *mut c_void, +) -> bindings::lru_status { + // Fields that should survive after unlocking the lru lock. + let page; + let page_index; + let mm; + let mmap_read; + let mm_mutex; + let vma_addr; + + { + // CAST: The `list_head` field is first in `PageInfo`. + let info = item as *mut PageInfo; + // SAFETY: The `range` field of `PageInfo` is immutable. + let range = unsafe { &*((*info).range) }; + + mm = match range.mm.mmget_not_zero() { + Some(mm) => MmWithUser::into_mmput_async(mm), + None => return LRU_SKIP, + }; + + mm_mutex = match range.stable_trylock_mm() { + Some(guard) => guard, + None => return LRU_SKIP, + }; + + mmap_read = match mm.mmap_read_trylock() { + Some(guard) => guard, + None => return LRU_SKIP, + }; + + // We can't lock it normally here, since we hold the lru lock. + let inner = match range.lock.try_lock() { + Some(inner) => inner, + None => return LRU_SKIP, + }; + + // SAFETY: The item is in this lru list, so it's okay to remove it. + unsafe { bindings::list_lru_isolate(lru, item) }; + + // SAFETY: Both pointers are in bounds of the same allocation. + page_index = unsafe { info.offset_from(inner.pages) } as usize; + + // SAFETY: We hold the spinlock, so we can take the page. + // + // This sets the page pointer to zero before we unmap it from the vma. However, we call + // `zap_page_range` before we release the mmap lock, so `use_page_slow` will not be able to + // insert a new page until after our call to `zap_page_range`. + page = unsafe { PageInfo::take_page(info) }; + vma_addr = inner.vma_addr; + + // From this point on, we don't access this PageInfo or ShrinkablePageRange again, because + // they can be freed at any point after we unlock `lru_lock`. This is with the exception of + // `mm_mutex` which is kept alive by holding the lock. + } + + // SAFETY: The lru lock is locked when this method is called. + unsafe { bindings::spin_unlock(&raw mut (*lru).lock) }; + + if let Some(vma) = mmap_read.vma_lookup(vma_addr) { + let user_page_addr = vma_addr + (page_index << PAGE_SHIFT); + vma.zap_page_range_single(user_page_addr, PAGE_SIZE); + } + + drop(mmap_read); + drop(mm_mutex); + drop(mm); + drop(page); + + // SAFETY: We just unlocked the lru lock, but it should be locked when we return. + unsafe { bindings::spin_lock(&raw mut (*lru).lock) }; + + LRU_REMOVED_ENTRY +} diff --git a/drivers/android/binder/page_range_helper.c b/drivers/android/binder/page_range_helper.c new file mode 100644 index 000000000000..496887723ee0 --- /dev/null +++ b/drivers/android/binder/page_range_helper.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* C helper for page_range.rs to work around a CFI violation. + * + * Bindgen currently pretends that `enum lru_status` is the same as an integer. + * This assumption is fine ABI-wise, but once you add CFI to the mix, it + * triggers a CFI violation because `enum lru_status` gets a different CFI tag. 
+ *
+ * This file contains a workaround until bindgen can be fixed.
+ *
+ * Copyright (C) 2025 Google LLC.
+ */
+#include "page_range_helper.h"
+
+unsigned int rust_shrink_free_page(struct list_head *item,
+                                   struct list_lru_one *list,
+                                   void *cb_arg);
+
+enum lru_status
+rust_shrink_free_page_wrap(struct list_head *item, struct list_lru_one *list,
+                           void *cb_arg)
+{
+        return rust_shrink_free_page(item, list, cb_arg);
+}
diff --git a/drivers/android/binder/page_range_helper.h b/drivers/android/binder/page_range_helper.h
new file mode 100644
index 000000000000..18dd2dd117b2
--- /dev/null
+++ b/drivers/android/binder/page_range_helper.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2025 Google, Inc.
+ */
+
+#ifndef _LINUX_PAGE_RANGE_HELPER_H
+#define _LINUX_PAGE_RANGE_HELPER_H
+
+#include <linux/list_lru.h>
+
+enum lru_status
+rust_shrink_free_page_wrap(struct list_head *item, struct list_lru_one *list,
+                           void *cb_arg);
+
+#endif /* _LINUX_PAGE_RANGE_HELPER_H */
diff --git a/drivers/android/binder/process.rs b/drivers/android/binder/process.rs
new file mode 100644
index 000000000000..f13a747e784c
--- /dev/null
+++ b/drivers/android/binder/process.rs
@@ -0,0 +1,1696 @@
+// SPDX-License-Identifier: GPL-2.0
+
+// Copyright (C) 2025 Google LLC.
+
+//! This module defines the `Process` type, which represents a process using a particular binder
+//! context.
+//!
+//! The `Process` object keeps track of all of the resources that this process owns in the binder
+//! context.
+//!
+//! There is one `Process` object for each binder fd that a process has opened, so processes using
+//! several binder contexts have several `Process` objects. This ensures that the contexts are
+//! fully separated.
+
+use core::mem::take;
+
+use kernel::{
+    bindings,
+    cred::Credential,
+    error::Error,
+    fs::file::{self, File},
+    list::{List, ListArc, ListArcField, ListLinks},
+    mm,
+    prelude::*,
+    rbtree::{self, RBTree, RBTreeNode, RBTreeNodeReservation},
+    seq_file::SeqFile,
+    seq_print,
+    sync::poll::PollTable,
+    sync::{
+        lock::{spinlock::SpinLockBackend, Guard},
+        Arc, ArcBorrow, CondVar, CondVarTimeoutResult, Mutex, SpinLock, UniqueArc,
+    },
+    task::Task,
+    types::ARef,
+    uaccess::{UserSlice, UserSliceReader},
+    uapi,
+    workqueue::{self, Work},
+};
+
+use crate::{
+    allocation::{Allocation, AllocationInfo, NewAllocation},
+    context::Context,
+    defs::*,
+    error::{BinderError, BinderResult},
+    node::{CouldNotDeliverCriticalIncrement, CritIncrWrapper, Node, NodeDeath, NodeRef},
+    page_range::ShrinkablePageRange,
+    range_alloc::{RangeAllocator, ReserveNew, ReserveNewArgs},
+    stats::BinderStats,
+    thread::{PushWorkRes, Thread},
+    BinderfsProcFile, DArc, DLArc, DTRWrap, DeliverToRead,
+};
+
+#[path = "freeze.rs"]
+mod freeze;
+use self::freeze::{FreezeCookie, FreezeListener};
+
+struct Mapping {
+    address: usize,
+    alloc: RangeAllocator<AllocationInfo>,
+}
+
+impl Mapping {
+    fn new(address: usize, size: usize) -> Self {
+        Self {
+            address,
+            alloc: RangeAllocator::new(size),
+        }
+    }
+}
+
+// bitflags for defer_work.
+const PROC_DEFER_FLUSH: u8 = 1;
+const PROC_DEFER_RELEASE: u8 = 2;
+
+/// The fields of `Process` protected by the spinlock.
+pub(crate) struct ProcessInner {
+    is_manager: bool,
+    pub(crate) is_dead: bool,
+    threads: RBTree<i32, Arc<Thread>>,
+    /// INVARIANT: Threads pushed to this list must be owned by this process.
+    ready_threads: List<Thread>,
+    nodes: RBTree<u64, DArc<Node>>,
+    mapping: Option<Mapping>,
+    work: List<DTRWrap<dyn DeliverToRead>>,
+    delivered_deaths: List<DTRWrap<NodeDeath>, 2>,
+
+    /// The number of requested threads that haven't registered yet.
+    requested_thread_count: u32,
+    /// The maximum number of threads used by the process thread pool.
+    max_threads: u32,
+    /// The number of threads that have started and registered with the thread pool.
+    started_thread_count: u32,
+
+    /// Bitmap of deferred work to do.
+    defer_work: u8,
+
+    /// Number of transactions to be transmitted before processes in freeze_wait
+    /// are woken up.
+    outstanding_txns: u32,
+    /// Process is frozen and unable to service binder transactions.
+    pub(crate) is_frozen: bool,
+    /// Process received sync transactions since last frozen.
+    pub(crate) sync_recv: bool,
+    /// Process received async transactions since last frozen.
+    pub(crate) async_recv: bool,
+    pub(crate) binderfs_file: Option<BinderfsProcFile>,
+    /// Check for oneway spam
+    oneway_spam_detection_enabled: bool,
+}
+
+impl ProcessInner {
+    fn new() -> Self {
+        Self {
+            is_manager: false,
+            is_dead: false,
+            threads: RBTree::new(),
+            ready_threads: List::new(),
+            mapping: None,
+            nodes: RBTree::new(),
+            work: List::new(),
+            delivered_deaths: List::new(),
+            requested_thread_count: 0,
+            max_threads: 0,
+            started_thread_count: 0,
+            defer_work: 0,
+            outstanding_txns: 0,
+            is_frozen: false,
+            sync_recv: false,
+            async_recv: false,
+            binderfs_file: None,
+            oneway_spam_detection_enabled: false,
+        }
+    }
+
+    /// Schedule the work item for execution on this process.
+    ///
+    /// If any threads are ready for work, then the work item is given directly to that thread and
+    /// it is woken up. Otherwise, it is pushed to the process work list.
+    ///
+    /// This call can fail only if the process is dead. In this case, the work item is returned to
+    /// the caller so that the caller can drop it after releasing the inner process lock. This is
+    /// necessary since the destructor of `Transaction` will take locks that can't necessarily be
+    /// taken while holding the inner process lock.
+    pub(crate) fn push_work(
+        &mut self,
+        work: DLArc<dyn DeliverToRead>,
+    ) -> Result<(), (BinderError, DLArc<dyn DeliverToRead>)> {
+        // Try to find a ready thread to which to push the work.
+        if let Some(thread) = self.ready_threads.pop_front() {
+            // Push to thread while holding state lock. This prevents the thread from giving up
+            // (for example, because of a signal) when we're about to deliver work.
+            match thread.push_work(work) {
+                PushWorkRes::Ok => Ok(()),
+                PushWorkRes::FailedDead(work) => Err((BinderError::new_dead(), work)),
+            }
+        } else if self.is_dead {
+            Err((BinderError::new_dead(), work))
+        } else {
+            let sync = work.should_sync_wakeup();
+
+            // Didn't find a thread waiting for proc work; this can happen
+            // in two scenarios:
+            // 1. All threads are busy handling transactions
+            //    In that case, one of those threads should call back into
+            //    the kernel driver soon and pick up this work.
+            // 2. Threads are using the (e)poll interface, in which case
+            //    they may be blocked on the waitqueue without having been
+            //    added to waiting_threads. For this case, we just iterate
+            //    over all threads not handling transaction work, and
+            //    wake them all up. We wake all because we don't know whether
+            //    a thread that called into (e)poll is handling non-binder
+            //    work currently.
+            self.work.push_back(work);
+
+            // Wake up polling threads, if any.
+            for thread in self.threads.values() {
+                thread.notify_if_poll_ready(sync);
+            }
+
+            Ok(())
+        }
+    }
+
+    pub(crate) fn remove_node(&mut self, ptr: u64) {
+        self.nodes.remove(&ptr);
+    }
+
+    /// Updates the reference count on the given node.
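Condensing the `push_work` policy above into a decision table (sketch with a hypothetical enum; the real function moves the work item itself, and delivery to a ready thread can still fail if that thread is dead):

    enum Dispatch {
        DirectToThread, // a ready thread exists: hand over and wake it
        ReturnToCaller, // process is dead: caller drops the item unlocked
        ProcessQueue,   // queue on the process and nudge (e)poll waiters
    }

    fn dispatch(has_ready_thread: bool, is_dead: bool) -> Dispatch {
        if has_ready_thread {
            Dispatch::DirectToThread
        } else if is_dead {
            Dispatch::ReturnToCaller
        } else {
            Dispatch::ProcessQueue
        }
    }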
+ pub(crate) fn update_node_refcount( + &mut self, + node: &DArc, + inc: bool, + strong: bool, + count: usize, + othread: Option<&Thread>, + ) { + let push = node.update_refcount_locked(inc, strong, count, self); + + // If we decided that we need to push work, push either to the process or to a thread if + // one is specified. + if let Some(node) = push { + if let Some(thread) = othread { + thread.push_work_deferred(node); + } else { + let _ = self.push_work(node); + // Nothing to do: `push_work` may fail if the process is dead, but that's ok as in + // that case, it doesn't care about the notification. + } + } + } + + pub(crate) fn new_node_ref( + &mut self, + node: DArc, + strong: bool, + thread: Option<&Thread>, + ) -> NodeRef { + self.update_node_refcount(&node, true, strong, 1, thread); + let strong_count = if strong { 1 } else { 0 }; + NodeRef::new(node, strong_count, 1 - strong_count) + } + + pub(crate) fn new_node_ref_with_thread( + &mut self, + node: DArc, + strong: bool, + thread: &Thread, + wrapper: Option, + ) -> Result { + let push = match wrapper { + None => node + .incr_refcount_allow_zero2one(strong, self)? + .map(|node| node as _), + Some(wrapper) => node.incr_refcount_allow_zero2one_with_wrapper(strong, wrapper, self), + }; + if let Some(node) = push { + thread.push_work_deferred(node); + } + let strong_count = if strong { 1 } else { 0 }; + Ok(NodeRef::new(node, strong_count, 1 - strong_count)) + } + + /// Returns an existing node with the given pointer and cookie, if one exists. + /// + /// Returns an error if a node with the given pointer but a different cookie exists. + fn get_existing_node(&self, ptr: u64, cookie: u64) -> Result>> { + match self.nodes.get(&ptr) { + None => Ok(None), + Some(node) => { + let (_, node_cookie) = node.get_id(); + if node_cookie == cookie { + Ok(Some(node.clone())) + } else { + Err(EINVAL) + } + } + } + } + + fn register_thread(&mut self) -> bool { + if self.requested_thread_count == 0 { + return false; + } + + self.requested_thread_count -= 1; + self.started_thread_count += 1; + true + } + + /// Finds a delivered death notification with the given cookie, removes it from the thread's + /// delivered list, and returns it. + fn pull_delivered_death(&mut self, cookie: u64) -> Option> { + let mut cursor = self.delivered_deaths.cursor_front(); + while let Some(next) = cursor.peek_next() { + if next.cookie == cookie { + return Some(next.remove().into_arc()); + } + cursor.move_next(); + } + None + } + + pub(crate) fn death_delivered(&mut self, death: DArc) { + if let Some(death) = ListArc::try_from_arc_or_drop(death) { + self.delivered_deaths.push_back(death); + } else { + pr_warn!("Notification added to `delivered_deaths` twice."); + } + } + + pub(crate) fn add_outstanding_txn(&mut self) { + self.outstanding_txns += 1; + } + + fn txns_pending_locked(&self) -> bool { + if self.outstanding_txns > 0 { + return true; + } + for thread in self.threads.values() { + if thread.has_current_transaction() { + return true; + } + } + false + } +} + +/// Used to keep track of a node that this process has a handle to. +#[pin_data] +pub(crate) struct NodeRefInfo { + debug_id: usize, + /// The refcount that this process owns to the node. + node_ref: ListArcField, + death: ListArcField>, { Self::LIST_PROC }>, + /// Cookie of the active freeze listener for this node. + freeze: ListArcField, { Self::LIST_PROC }>, + /// Used to store this `NodeRefInfo` in the node's `refs` list. + #[pin] + links: ListLinks<{ Self::LIST_NODE }>, + /// The handle for this `NodeRefInfo`. 
+ handle: u32, + /// The process that has a handle to the node. + pub(crate) process: Arc, +} + +impl NodeRefInfo { + /// The id used for the `Node::refs` list. + pub(crate) const LIST_NODE: u64 = 0x2da16350fb724a10; + /// The id used for the `ListArc` in `ProcessNodeRefs`. + const LIST_PROC: u64 = 0xd703a5263dcc8650; + + fn new(node_ref: NodeRef, handle: u32, process: Arc) -> impl PinInit { + pin_init!(Self { + debug_id: super::next_debug_id(), + node_ref: ListArcField::new(node_ref), + death: ListArcField::new(None), + freeze: ListArcField::new(None), + links <- ListLinks::new(), + handle, + process, + }) + } + + kernel::list::define_list_arc_field_getter! { + pub(crate) fn death(&mut self<{Self::LIST_PROC}>) -> &mut Option> { death } + pub(crate) fn freeze(&mut self<{Self::LIST_PROC}>) -> &mut Option { freeze } + pub(crate) fn node_ref(&mut self<{Self::LIST_PROC}>) -> &mut NodeRef { node_ref } + pub(crate) fn node_ref2(&self<{Self::LIST_PROC}>) -> &NodeRef { node_ref } + } +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<{Self::LIST_NODE}> for NodeRefInfo { untracked; } + impl ListArcSafe<{Self::LIST_PROC}> for NodeRefInfo { untracked; } +} +kernel::list::impl_list_item! { + impl ListItem<{Self::LIST_NODE}> for NodeRefInfo { + using ListLinks { self.links }; + } +} + +/// Keeps track of references this process has to nodes owned by other processes. +/// +/// TODO: Currently, the rbtree requires two allocations per node reference, and two tree +/// traversals to look up a node by `Node::global_id`. Once the rbtree is more powerful, these +/// extra costs should be eliminated. +struct ProcessNodeRefs { + /// Used to look up nodes using the 32-bit id that this process knows it by. + by_handle: RBTree>, + /// Used to look up nodes without knowing their local 32-bit id. The usize is the address of + /// the underlying `Node` struct as returned by `Node::global_id`. + by_node: RBTree, + /// Used to look up a `FreezeListener` by cookie. + /// + /// There might be multiple freeze listeners for the same node, but at most one of them is + /// active. + freeze_listeners: RBTree, +} + +impl ProcessNodeRefs { + fn new() -> Self { + Self { + by_handle: RBTree::new(), + by_node: RBTree::new(), + freeze_listeners: RBTree::new(), + } + } +} + +/// A process using binder. +/// +/// Strictly speaking, there can be multiple of these per process. There is one for each binder fd +/// that a process has opened, so processes using several binder contexts have several `Process` +/// objects. This ensures that the contexts are fully separated. +#[pin_data] +pub(crate) struct Process { + pub(crate) ctx: Arc, + + // The task leader (process). + pub(crate) task: ARef, + + // Credential associated with file when `Process` is created. + pub(crate) cred: ARef, + + #[pin] + pub(crate) inner: SpinLock, + + #[pin] + pub(crate) pages: ShrinkablePageRange, + + // Waitqueue of processes waiting for all outstanding transactions to be + // processed. + #[pin] + freeze_wait: CondVar, + + // Node references are in a different lock to avoid recursive acquisition when + // incrementing/decrementing a node in another process. + #[pin] + node_refs: Mutex, + + // Work node for deferred work item. + #[pin] + defer_work: Work, + + // Links for process list in Context. + #[pin] + links: ListLinks, + + pub(crate) stats: BinderStats, +} + +kernel::impl_has_work! { + impl HasWork for Process { self.defer_work } +} + +kernel::list::impl_list_arc_safe! 
{ + impl ListArcSafe<0> for Process { untracked; } +} +kernel::list::impl_list_item! { + impl ListItem<0> for Process { + using ListLinks { self.links }; + } +} + +impl workqueue::WorkItem for Process { + type Pointer = Arc; + + fn run(me: Arc) { + let defer; + { + let mut inner = me.inner.lock(); + defer = inner.defer_work; + inner.defer_work = 0; + } + + if defer & PROC_DEFER_FLUSH != 0 { + me.deferred_flush(); + } + if defer & PROC_DEFER_RELEASE != 0 { + me.deferred_release(); + } + } +} + +impl Process { + fn new(ctx: Arc, cred: ARef) -> Result> { + let current = kernel::current!(); + let list_process = ListArc::pin_init::( + try_pin_init!(Process { + ctx, + cred, + inner <- kernel::new_spinlock!(ProcessInner::new(), "Process::inner"), + pages <- ShrinkablePageRange::new(&super::BINDER_SHRINKER), + node_refs <- kernel::new_mutex!(ProcessNodeRefs::new(), "Process::node_refs"), + freeze_wait <- kernel::new_condvar!("Process::freeze_wait"), + task: current.group_leader().into(), + defer_work <- kernel::new_work!("Process::defer_work"), + links <- ListLinks::new(), + stats: BinderStats::new(), + }), + GFP_KERNEL, + )?; + + let process = list_process.clone_arc(); + process.ctx.register_process(list_process); + + Ok(process) + } + + pub(crate) fn pid_in_current_ns(&self) -> kernel::task::Pid { + self.task.tgid_nr_ns(None) + } + + #[inline(never)] + pub(crate) fn debug_print_stats(&self, m: &SeqFile, ctx: &Context) -> Result<()> { + seq_print!(m, "proc {}\n", self.pid_in_current_ns()); + seq_print!(m, "context {}\n", &*ctx.name); + + let inner = self.inner.lock(); + seq_print!(m, " threads: {}\n", inner.threads.iter().count()); + seq_print!( + m, + " requested threads: {}+{}/{}\n", + inner.requested_thread_count, + inner.started_thread_count, + inner.max_threads, + ); + if let Some(mapping) = &inner.mapping { + seq_print!( + m, + " free oneway space: {}\n", + mapping.alloc.free_oneway_space() + ); + seq_print!(m, " buffers: {}\n", mapping.alloc.count_buffers()); + } + seq_print!( + m, + " outstanding transactions: {}\n", + inner.outstanding_txns + ); + seq_print!(m, " nodes: {}\n", inner.nodes.iter().count()); + drop(inner); + + { + let mut refs = self.node_refs.lock(); + let (mut count, mut weak, mut strong) = (0, 0, 0); + for r in refs.by_handle.values_mut() { + let node_ref = r.node_ref(); + let (nstrong, nweak) = node_ref.get_count(); + count += 1; + weak += nweak; + strong += nstrong; + } + seq_print!(m, " refs: {count} s {strong} w {weak}\n"); + } + + self.stats.debug_print(" ", m); + + Ok(()) + } + + #[inline(never)] + pub(crate) fn debug_print(&self, m: &SeqFile, ctx: &Context, print_all: bool) -> Result<()> { + seq_print!(m, "proc {}\n", self.pid_in_current_ns()); + seq_print!(m, "context {}\n", &*ctx.name); + + let mut all_threads = KVec::new(); + let mut all_nodes = KVec::new(); + loop { + let inner = self.inner.lock(); + let num_threads = inner.threads.iter().count(); + let num_nodes = inner.nodes.iter().count(); + + if all_threads.capacity() < num_threads || all_nodes.capacity() < num_nodes { + drop(inner); + all_threads.reserve(num_threads, GFP_KERNEL)?; + all_nodes.reserve(num_nodes, GFP_KERNEL)?; + continue; + } + + for thread in inner.threads.values() { + assert!(all_threads.len() < all_threads.capacity()); + let _ = all_threads.push(thread.clone(), GFP_ATOMIC); + } + + for node in inner.nodes.values() { + assert!(all_nodes.len() < all_nodes.capacity()); + let _ = all_nodes.push(node.clone(), GFP_ATOMIC); + } + + break; + } + + for thread in all_threads { + 
thread.debug_print(m, print_all)?; + } + + let mut inner = self.inner.lock(); + for node in all_nodes { + if print_all || node.has_oneway_transaction(&mut inner) { + node.full_debug_print(m, &mut inner)?; + } + } + drop(inner); + + if print_all { + let mut refs = self.node_refs.lock(); + for r in refs.by_handle.values_mut() { + let node_ref = r.node_ref(); + let dead = node_ref.node.owner.inner.lock().is_dead; + let (strong, weak) = node_ref.get_count(); + let debug_id = node_ref.node.debug_id; + + seq_print!( + m, + " ref {}: desc {} {}node {debug_id} s {strong} w {weak}", + r.debug_id, + r.handle, + if dead { "dead " } else { "" }, + ); + } + } + + let inner = self.inner.lock(); + for work in &inner.work { + work.debug_print(m, " ", " pending transaction ")?; + } + for _death in &inner.delivered_deaths { + seq_print!(m, " has delivered dead binder\n"); + } + if let Some(mapping) = &inner.mapping { + mapping.alloc.debug_print(m)?; + } + drop(inner); + + Ok(()) + } + + /// Attempts to fetch a work item from the process queue. + pub(crate) fn get_work(&self) -> Option> { + self.inner.lock().work.pop_front() + } + + /// Attempts to fetch a work item from the process queue. If none is available, it registers the + /// given thread as ready to receive work directly. + /// + /// This must only be called when the thread is not participating in a transaction chain; when + /// it is, work will always be delivered directly to the thread (and not through the process + /// queue). + pub(crate) fn get_work_or_register<'a>( + &'a self, + thread: &'a Arc, + ) -> GetWorkOrRegister<'a> { + let mut inner = self.inner.lock(); + // Try to get work from the process queue. + if let Some(work) = inner.work.pop_front() { + return GetWorkOrRegister::Work(work); + } + + // Register the thread as ready. + GetWorkOrRegister::Register(Registration::new(thread, &mut inner)) + } + + fn get_current_thread(self: ArcBorrow<'_, Self>) -> Result> { + let id = { + let current = kernel::current!(); + if !core::ptr::eq(current.group_leader(), &*self.task) { + pr_err!("get_current_thread was called from the wrong process."); + return Err(EINVAL); + } + current.pid() + }; + + { + let inner = self.inner.lock(); + if let Some(thread) = inner.threads.get(&id) { + return Ok(thread.clone()); + } + } + + // Allocate a new `Thread` without holding any locks. + let reservation = RBTreeNodeReservation::new(GFP_KERNEL)?; + let ta: Arc = Thread::new(id, self.into())?; + + let mut inner = self.inner.lock(); + match inner.threads.entry(id) { + rbtree::Entry::Vacant(entry) => { + entry.insert(ta.clone(), reservation); + Ok(ta) + } + rbtree::Entry::Occupied(_entry) => { + pr_err!("Cannot create two threads with the same id."); + Err(EINVAL) + } + } + } + + pub(crate) fn push_work(&self, work: DLArc) -> BinderResult { + // If push_work fails, drop the work item outside the lock. + let res = self.inner.lock().push_work(work); + match res { + Ok(()) => Ok(()), + Err((err, work)) => { + drop(work); + Err(err) + } + } + } + + fn set_as_manager( + self: ArcBorrow<'_, Self>, + info: Option, + thread: &Thread, + ) -> Result { + let (ptr, cookie, flags) = if let Some(obj) = info { + ( + // SAFETY: The object type for this ioctl is implicitly `BINDER_TYPE_BINDER`, so it + // is safe to access the `binder` field. 
+ unsafe { obj.__bindgen_anon_1.binder }, + obj.cookie, + obj.flags, + ) + } else { + (0, 0, 0) + }; + let node_ref = self.get_node(ptr, cookie, flags as _, true, thread)?; + let node = node_ref.node.clone(); + self.ctx.set_manager_node(node_ref)?; + self.inner.lock().is_manager = true; + + // Force the state of the node to prevent the delivery of acquire/increfs. + let mut owner_inner = node.owner.inner.lock(); + node.force_has_count(&mut owner_inner); + Ok(()) + } + + fn get_node_inner( + self: ArcBorrow<'_, Self>, + ptr: u64, + cookie: u64, + flags: u32, + strong: bool, + thread: &Thread, + wrapper: Option, + ) -> Result> { + // Try to find an existing node. + { + let mut inner = self.inner.lock(); + if let Some(node) = inner.get_existing_node(ptr, cookie)? { + return Ok(inner.new_node_ref_with_thread(node, strong, thread, wrapper)); + } + } + + // Allocate the node before reacquiring the lock. + let node = DTRWrap::arc_pin_init(Node::new(ptr, cookie, flags, self.into()))?.into_arc(); + let rbnode = RBTreeNode::new(ptr, node.clone(), GFP_KERNEL)?; + let mut inner = self.inner.lock(); + if let Some(node) = inner.get_existing_node(ptr, cookie)? { + return Ok(inner.new_node_ref_with_thread(node, strong, thread, wrapper)); + } + + inner.nodes.insert(rbnode); + // This can only fail if someone has already pushed the node to a list, but we just created + // it and still hold the lock, so it can't fail right now. + let node_ref = inner + .new_node_ref_with_thread(node, strong, thread, wrapper) + .unwrap(); + + Ok(Ok(node_ref)) + } + + pub(crate) fn get_node( + self: ArcBorrow<'_, Self>, + ptr: u64, + cookie: u64, + flags: u32, + strong: bool, + thread: &Thread, + ) -> Result { + let mut wrapper = None; + for _ in 0..2 { + match self.get_node_inner(ptr, cookie, flags, strong, thread, wrapper) { + Err(err) => return Err(err), + Ok(Ok(node_ref)) => return Ok(node_ref), + Ok(Err(CouldNotDeliverCriticalIncrement)) => { + wrapper = Some(CritIncrWrapper::new()?); + } + } + } + // We only get a `CouldNotDeliverCriticalIncrement` error if `wrapper` is `None`, so the + // loop should run at most twice. + unreachable!() + } + + pub(crate) fn insert_or_update_handle( + self: ArcBorrow<'_, Process>, + node_ref: NodeRef, + is_mananger: bool, + ) -> Result { + { + let mut refs = self.node_refs.lock(); + + // Do a lookup before inserting. + if let Some(handle_ref) = refs.by_node.get(&node_ref.node.global_id()) { + let handle = *handle_ref; + let info = refs.by_handle.get_mut(&handle).unwrap(); + info.node_ref().absorb(node_ref); + return Ok(handle); + } + } + + // Reserve memory for tree nodes. + let reserve1 = RBTreeNodeReservation::new(GFP_KERNEL)?; + let reserve2 = RBTreeNodeReservation::new(GFP_KERNEL)?; + let info = UniqueArc::new_uninit(GFP_KERNEL)?; + + let mut refs = self.node_refs.lock(); + + // Do a lookup again as node may have been inserted before the lock was reacquired. + if let Some(handle_ref) = refs.by_node.get(&node_ref.node.global_id()) { + let handle = *handle_ref; + let info = refs.by_handle.get_mut(&handle).unwrap(); + info.node_ref().absorb(node_ref); + return Ok(handle); + } + + // Find id. 
+ let mut target: u32 = if is_mananger { 0 } else { 1 }; + for handle in refs.by_handle.keys() { + if *handle > target { + break; + } + if *handle == target { + target = target.checked_add(1).ok_or(ENOMEM)?; + } + } + + let gid = node_ref.node.global_id(); + let (info_proc, info_node) = { + let info_init = NodeRefInfo::new(node_ref, target, self.into()); + match info.pin_init_with(info_init) { + Ok(info) => ListArc::pair_from_pin_unique(info), + // error is infallible + Err(err) => match err {}, + } + }; + + // Ensure the process is still alive while we insert a new reference. + // + // This releases the lock before inserting the nodes, but since `is_dead` is set as the + // first thing in `deferred_release`, process cleanup will not miss the items inserted into + // `refs` below. + if self.inner.lock().is_dead { + return Err(ESRCH); + } + + // SAFETY: `info_proc` and `info_node` reference the same node, so we are inserting + // `info_node` into the right node's `refs` list. + unsafe { info_proc.node_ref2().node.insert_node_info(info_node) }; + + refs.by_node.insert(reserve1.into_node(gid, target)); + refs.by_handle.insert(reserve2.into_node(target, info_proc)); + Ok(target) + } + + pub(crate) fn get_transaction_node(&self, handle: u32) -> BinderResult { + // When handle is zero, try to get the context manager. + if handle == 0 { + Ok(self.ctx.get_manager_node(true)?) + } else { + Ok(self.get_node_from_handle(handle, true)?) + } + } + + pub(crate) fn get_node_from_handle(&self, handle: u32, strong: bool) -> Result { + self.node_refs + .lock() + .by_handle + .get_mut(&handle) + .ok_or(ENOENT)? + .node_ref() + .clone(strong) + } + + pub(crate) fn remove_from_delivered_deaths(&self, death: &DArc) { + let mut inner = self.inner.lock(); + // SAFETY: By the invariant on the `delivered_links` field, this is the right linked list. + let removed = unsafe { inner.delivered_deaths.remove(death) }; + drop(inner); + drop(removed); + } + + pub(crate) fn update_ref( + self: ArcBorrow<'_, Process>, + handle: u32, + inc: bool, + strong: bool, + ) -> Result { + if inc && handle == 0 { + if let Ok(node_ref) = self.ctx.get_manager_node(strong) { + if core::ptr::eq(&*self, &*node_ref.node.owner) { + return Err(EINVAL); + } + let _ = self.insert_or_update_handle(node_ref, true); + return Ok(()); + } + } + + // To preserve original binder behaviour, we only fail requests where the manager tries to + // increment references on itself. + let mut refs = self.node_refs.lock(); + if let Some(info) = refs.by_handle.get_mut(&handle) { + if info.node_ref().update(inc, strong) { + // Clean up death if there is one attached to this node reference. + if let Some(death) = info.death().take() { + death.set_cleared(true); + self.remove_from_delivered_deaths(&death); + } + + // Remove reference from process tables, and from the node's `refs` list. + + // SAFETY: We are removing the `NodeRefInfo` from the right node. + unsafe { info.node_ref2().node.remove_node_info(info) }; + + let id = info.node_ref().node.global_id(); + refs.by_handle.remove(&handle); + refs.by_node.remove(&id); + } + } else { + // All refs are cleared in process exit, so this warning is expected in that case. + if !self.inner.lock().is_dead { + pr_warn!("{}: no such ref {handle}\n", self.pid_in_current_ns()); + } + } + Ok(()) + } + + /// Decrements the refcount of the given node, if one exists. 
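Because rbtree iteration visits keys in ascending order, the `// Find id` loop above locates the smallest unused handle in a single pass, starting from 1 for ordinary refs (handle 0 is reserved for the context manager). The same logic over a sorted slice, as a standalone sketch:

    // Returns the smallest value >= the starting handle that is absent from
    // `existing`, which must be sorted ascending. `None` models ENOMEM when
    // the u32 space is exhausted.
    fn find_handle(existing: &[u32], is_manager: bool) -> Option<u32> {
        let mut target: u32 = if is_manager { 0 } else { 1 };
        for &handle in existing {
            if handle > target {
                break; // found a gap; `target` is unused
            }
            if handle == target {
                target = target.checked_add(1)?;
            }
        }
        Some(target)
    }

    // find_handle(&[0, 1, 2, 4], false) == Some(3)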
+ pub(crate) fn update_node(&self, ptr: u64, cookie: u64, strong: bool) { + let mut inner = self.inner.lock(); + if let Ok(Some(node)) = inner.get_existing_node(ptr, cookie) { + inner.update_node_refcount(&node, false, strong, 1, None); + } + } + + pub(crate) fn inc_ref_done(&self, reader: &mut UserSliceReader, strong: bool) -> Result { + let ptr = reader.read::()?; + let cookie = reader.read::()?; + let mut inner = self.inner.lock(); + if let Ok(Some(node)) = inner.get_existing_node(ptr, cookie) { + if let Some(node) = node.inc_ref_done_locked(strong, &mut inner) { + // This only fails if the process is dead. + let _ = inner.push_work(node); + } + } + Ok(()) + } + + pub(crate) fn buffer_alloc( + self: &Arc, + debug_id: usize, + size: usize, + is_oneway: bool, + from_pid: i32, + ) -> BinderResult { + use kernel::page::PAGE_SIZE; + + let mut reserve_new_args = ReserveNewArgs { + debug_id, + size, + is_oneway, + pid: from_pid, + ..ReserveNewArgs::default() + }; + + let (new_alloc, addr) = loop { + let mut inner = self.inner.lock(); + let mapping = inner.mapping.as_mut().ok_or_else(BinderError::new_dead)?; + let alloc_request = match mapping.alloc.reserve_new(reserve_new_args)? { + ReserveNew::Success(new_alloc) => break (new_alloc, mapping.address), + ReserveNew::NeedAlloc(request) => request, + }; + drop(inner); + // We need to allocate memory and then call `reserve_new` again. + reserve_new_args = alloc_request.make_alloc()?; + }; + + let res = Allocation::new( + self.clone(), + debug_id, + new_alloc.offset, + size, + addr + new_alloc.offset, + new_alloc.oneway_spam_detected, + ); + + // This allocation will be marked as in use until the `Allocation` is used to free it. + // + // This method can't be called while holding a lock, so we release the lock first. It's + // okay for several threads to use the method on the same index at the same time. In that + // case, one of the calls will allocate the given page (if missing), and the other call + // will wait for the other call to finish allocating the page. + // + // We will not call `stop_using_range` in parallel with this on the same page, because the + // allocation can only be removed via the destructor of the `Allocation` object that we + // currently own. + match self.pages.use_range( + new_alloc.offset / PAGE_SIZE, + (new_alloc.offset + size).div_ceil(PAGE_SIZE), + ) { + Ok(()) => {} + Err(err) => { + pr_warn!("use_range failure {:?}", err); + return Err(err.into()); + } + } + + Ok(NewAllocation(res)) + } + + pub(crate) fn buffer_get(self: &Arc, ptr: usize) -> Option { + let mut inner = self.inner.lock(); + let mapping = inner.mapping.as_mut()?; + let offset = ptr.checked_sub(mapping.address)?; + let (size, debug_id, odata) = mapping.alloc.reserve_existing(offset).ok()?; + let mut alloc = Allocation::new(self.clone(), debug_id, offset, size, ptr, false); + if let Some(data) = odata { + alloc.set_info(data); + } + Some(alloc) + } + + pub(crate) fn buffer_raw_free(&self, ptr: usize) { + let mut inner = self.inner.lock(); + if let Some(ref mut mapping) = &mut inner.mapping { + let offset = match ptr.checked_sub(mapping.address) { + Some(offset) => offset, + None => return, + }; + + let freed_range = match mapping.alloc.reservation_abort(offset) { + Ok(freed_range) => freed_range, + Err(_) => { + pr_warn!( + "Pointer {:x} failed to free, base = {:x}\n", + ptr, + mapping.address + ); + return; + } + }; + + // No more allocations in this range. Mark them as not in use. 
+ // + // Must be done before we release the lock so that `use_range` is not used on these + // indices until `stop_using_range` returns. + self.pages + .stop_using_range(freed_range.start_page_idx, freed_range.end_page_idx); + } + } + + pub(crate) fn buffer_make_freeable(&self, offset: usize, mut data: Option) { + let mut inner = self.inner.lock(); + if let Some(ref mut mapping) = &mut inner.mapping { + if mapping.alloc.reservation_commit(offset, &mut data).is_err() { + pr_warn!("Offset {} failed to be marked freeable\n", offset); + } + } + } + + fn create_mapping(&self, vma: &mm::virt::VmaNew) -> Result { + use kernel::page::PAGE_SIZE; + let size = usize::min(vma.end() - vma.start(), bindings::SZ_4M as usize); + let mapping = Mapping::new(vma.start(), size); + let page_count = self.pages.register_with_vma(vma)?; + if page_count * PAGE_SIZE != size { + return Err(EINVAL); + } + + // Save range allocator for later. + self.inner.lock().mapping = Some(mapping); + + Ok(()) + } + + fn version(&self, data: UserSlice) -> Result { + data.writer().write(&BinderVersion::current()) + } + + pub(crate) fn register_thread(&self) -> bool { + self.inner.lock().register_thread() + } + + fn remove_thread(&self, thread: Arc) { + self.inner.lock().threads.remove(&thread.id); + thread.release(); + } + + fn set_max_threads(&self, max: u32) { + self.inner.lock().max_threads = max; + } + + fn set_oneway_spam_detection_enabled(&self, enabled: u32) { + self.inner.lock().oneway_spam_detection_enabled = enabled != 0; + } + + pub(crate) fn is_oneway_spam_detection_enabled(&self) -> bool { + self.inner.lock().oneway_spam_detection_enabled + } + + fn get_node_debug_info(&self, data: UserSlice) -> Result { + let (mut reader, mut writer) = data.reader_writer(); + + // Read the starting point. + let ptr = reader.read::()?.ptr; + let mut out = BinderNodeDebugInfo::default(); + + { + let inner = self.inner.lock(); + for (node_ptr, node) in &inner.nodes { + if *node_ptr > ptr { + node.populate_debug_info(&mut out, &inner); + break; + } + } + } + + writer.write(&out) + } + + fn get_node_info_from_ref(&self, data: UserSlice) -> Result { + let (mut reader, mut writer) = data.reader_writer(); + let mut out = reader.read::()?; + + if out.strong_count != 0 + || out.weak_count != 0 + || out.reserved1 != 0 + || out.reserved2 != 0 + || out.reserved3 != 0 + { + return Err(EINVAL); + } + + // Only the context manager is allowed to use this ioctl. + if !self.inner.lock().is_manager { + return Err(EPERM); + } + + { + let mut node_refs = self.node_refs.lock(); + let node_info = node_refs.by_handle.get_mut(&out.handle).ok_or(ENOENT)?; + let node_ref = node_info.node_ref(); + let owner_inner = node_ref.node.owner.inner.lock(); + node_ref.node.populate_counts(&mut out, &owner_inner); + } + + // Write the result back. + writer.write(&out) + } + + pub(crate) fn needs_thread(&self) -> bool { + let mut inner = self.inner.lock(); + let ret = inner.requested_thread_count == 0 + && inner.ready_threads.is_empty() + && inner.started_thread_count < inner.max_threads; + if ret { + inner.requested_thread_count += 1 + } + ret + } + + pub(crate) fn request_death( + self: &Arc, + reader: &mut UserSliceReader, + thread: &Thread, + ) -> Result { + let handle: u32 = reader.read()?; + let cookie: u64 = reader.read()?; + + // Queue BR_ERROR if we can't allocate memory for the death notification. 
+ let death = UniqueArc::new_uninit(GFP_KERNEL).inspect_err(|_| { + thread.push_return_work(BR_ERROR); + })?; + let mut refs = self.node_refs.lock(); + let Some(info) = refs.by_handle.get_mut(&handle) else { + pr_warn!("BC_REQUEST_DEATH_NOTIFICATION invalid ref {handle}\n"); + return Ok(()); + }; + + // Nothing to do if there is already a death notification request for this handle. + if info.death().is_some() { + pr_warn!("BC_REQUEST_DEATH_NOTIFICATION death notification already set\n"); + return Ok(()); + } + + let death = { + let death_init = NodeDeath::new(info.node_ref().node.clone(), self.clone(), cookie); + match death.pin_init_with(death_init) { + Ok(death) => death, + // error is infallible + Err(err) => match err {}, + } + }; + + // Register the death notification. + { + let owner = info.node_ref2().node.owner.clone(); + let mut owner_inner = owner.inner.lock(); + if owner_inner.is_dead { + let death = Arc::from(death); + *info.death() = Some(death.clone()); + drop(owner_inner); + death.set_dead(); + } else { + let death = ListArc::from(death); + *info.death() = Some(death.clone_arc()); + info.node_ref().node.add_death(death, &mut owner_inner); + } + } + Ok(()) + } + + pub(crate) fn clear_death(&self, reader: &mut UserSliceReader, thread: &Thread) -> Result { + let handle: u32 = reader.read()?; + let cookie: u64 = reader.read()?; + + let mut refs = self.node_refs.lock(); + let Some(info) = refs.by_handle.get_mut(&handle) else { + pr_warn!("BC_CLEAR_DEATH_NOTIFICATION invalid ref {handle}\n"); + return Ok(()); + }; + + let Some(death) = info.death().take() else { + pr_warn!("BC_CLEAR_DEATH_NOTIFICATION death notification not active\n"); + return Ok(()); + }; + if death.cookie != cookie { + *info.death() = Some(death); + pr_warn!("BC_CLEAR_DEATH_NOTIFICATION death notification cookie mismatch\n"); + return Ok(()); + } + + // Update state and determine if we need to queue a work item. We only need to do it when + // the node is not dead or if the user already completed the death notification. + if death.set_cleared(false) { + if let Some(death) = ListArc::try_from_arc_or_drop(death) { + let _ = thread.push_work_if_looper(death); + } + } + + Ok(()) + } + + pub(crate) fn dead_binder_done(&self, cookie: u64, thread: &Thread) { + if let Some(death) = self.inner.lock().pull_delivered_death(cookie) { + death.set_notification_done(thread); + } + } + + /// Locks the spinlock and move the `nodes` rbtree out. + /// + /// This allows you to iterate through `nodes` while also allowing you to give other parts of + /// the codebase exclusive access to `ProcessInner`. + pub(crate) fn lock_with_nodes(&self) -> WithNodes<'_> { + let mut inner = self.inner.lock(); + WithNodes { + nodes: take(&mut inner.nodes), + inner, + } + } + + fn deferred_flush(&self) { + let inner = self.inner.lock(); + for thread in inner.threads.values() { + thread.exit_looper(); + } + } + + fn deferred_release(self: Arc) { + let is_manager = { + let mut inner = self.inner.lock(); + inner.is_dead = true; + inner.is_frozen = false; + inner.sync_recv = false; + inner.async_recv = false; + inner.is_manager + }; + + if is_manager { + self.ctx.unset_manager_node(); + } + + self.ctx.deregister_process(&self); + + let binderfs_file = self.inner.lock().binderfs_file.take(); + drop(binderfs_file); + + // Release threads. 
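Note the teardown idiom that `deferred_release` uses repeatedly below: `take` a container out of the locked state, drop the guard, and only then let destructors run, since they may acquire other locks (as the comment at the end of the function notes, thread drop can call `synchronize_rcu()`). A minimal sketch of the pattern with std types:

    use std::sync::Mutex;

    struct State {
        items: Vec<String>, // stand-in for threads, nodes, refs, ...
    }

    fn release(state: &Mutex<State>) {
        let items = {
            let mut guard = state.lock().unwrap();
            std::mem::take(&mut guard.items)
        }; // lock released here
        drop(items); // destructors run without the lock held
    }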
+ let threads = { + let mut inner = self.inner.lock(); + let threads = take(&mut inner.threads); + let ready = take(&mut inner.ready_threads); + drop(inner); + drop(ready); + + for thread in threads.values() { + thread.release(); + } + threads + }; + + // Release nodes. + { + while let Some(node) = { + let mut lock = self.inner.lock(); + lock.nodes.cursor_front().map(|c| c.remove_current().1) + } { + node.to_key_value().1.release(); + } + } + + // Clean up death listeners and remove nodes from external node info lists. + for info in self.node_refs.lock().by_handle.values_mut() { + // SAFETY: We are removing the `NodeRefInfo` from the right node. + unsafe { info.node_ref2().node.remove_node_info(info) }; + + // Remove all death notifications from the nodes (that belong to a different process). + let death = if let Some(existing) = info.death().take() { + existing + } else { + continue; + }; + death.set_cleared(false); + } + + // Clean up freeze listeners. + let freeze_listeners = take(&mut self.node_refs.lock().freeze_listeners); + for listener in freeze_listeners.values() { + listener.on_process_exit(&self); + } + drop(freeze_listeners); + + // Release refs on foreign nodes. + { + let mut refs = self.node_refs.lock(); + let by_handle = take(&mut refs.by_handle); + let by_node = take(&mut refs.by_node); + drop(refs); + drop(by_node); + drop(by_handle); + } + + // Cancel all pending work items. + while let Some(work) = self.get_work() { + work.into_arc().cancel(); + } + + let delivered_deaths = take(&mut self.inner.lock().delivered_deaths); + drop(delivered_deaths); + + // Free any resources kept alive by allocated buffers. + let omapping = self.inner.lock().mapping.take(); + if let Some(mut mapping) = omapping { + let address = mapping.address; + mapping + .alloc + .take_for_each(|offset, size, debug_id, odata| { + let ptr = offset + address; + pr_warn!( + "{}: removing orphan mapping {offset}:{size}\n", + self.pid_in_current_ns() + ); + let mut alloc = + Allocation::new(self.clone(), debug_id, offset, size, ptr, false); + if let Some(data) = odata { + alloc.set_info(data); + } + drop(alloc) + }); + } + + // calls to synchronize_rcu() in thread drop will happen here + drop(threads); + } + + pub(crate) fn drop_outstanding_txn(&self) { + let wake = { + let mut inner = self.inner.lock(); + if inner.outstanding_txns == 0 { + pr_err!("outstanding_txns underflow"); + return; + } + inner.outstanding_txns -= 1; + inner.is_frozen && inner.outstanding_txns == 0 + }; + + if wake { + self.freeze_wait.notify_all(); + } + } + + pub(crate) fn ioctl_freeze(&self, info: &BinderFreezeInfo) -> Result { + if info.enable == 0 { + let msgs = self.prepare_freeze_messages()?; + let mut inner = self.inner.lock(); + inner.sync_recv = false; + inner.async_recv = false; + inner.is_frozen = false; + drop(inner); + msgs.send_messages(); + return Ok(()); + } + + let mut inner = self.inner.lock(); + inner.sync_recv = false; + inner.async_recv = false; + inner.is_frozen = true; + + if info.timeout_ms > 0 { + let mut jiffies = kernel::time::msecs_to_jiffies(info.timeout_ms); + while jiffies > 0 { + if inner.outstanding_txns == 0 { + break; + } + + match self + .freeze_wait + .wait_interruptible_timeout(&mut inner, jiffies) + { + CondVarTimeoutResult::Signal { .. 
} => { + inner.is_frozen = false; + return Err(ERESTARTSYS); + } + CondVarTimeoutResult::Woken { jiffies: remaining } => { + jiffies = remaining; + } + CondVarTimeoutResult::Timeout => { + jiffies = 0; + } + } + } + } + + if inner.txns_pending_locked() { + inner.is_frozen = false; + Err(EAGAIN) + } else { + drop(inner); + match self.prepare_freeze_messages() { + Ok(batch) => { + batch.send_messages(); + Ok(()) + } + Err(kernel::alloc::AllocError) => { + self.inner.lock().is_frozen = false; + Err(ENOMEM) + } + } + } + } +} + +fn get_frozen_status(data: UserSlice) -> Result { + let (mut reader, mut writer) = data.reader_writer(); + + let mut info = reader.read::()?; + info.sync_recv = 0; + info.async_recv = 0; + let mut found = false; + + for ctx in crate::context::get_all_contexts()? { + ctx.for_each_proc(|proc| { + if proc.task.pid() == info.pid as _ { + found = true; + let inner = proc.inner.lock(); + let txns_pending = inner.txns_pending_locked(); + info.async_recv |= inner.async_recv as u32; + info.sync_recv |= inner.sync_recv as u32; + info.sync_recv |= (txns_pending as u32) << 1; + } + }); + } + + if found { + writer.write(&info)?; + Ok(()) + } else { + Err(EINVAL) + } +} + +fn ioctl_freeze(reader: &mut UserSliceReader) -> Result { + let info = reader.read::()?; + + // Very unlikely for there to be more than 3, since a process normally uses at most binder and + // hwbinder. + let mut procs = KVec::with_capacity(3, GFP_KERNEL)?; + + let ctxs = crate::context::get_all_contexts()?; + for ctx in ctxs { + for proc in ctx.get_procs_with_pid(info.pid as i32)? { + procs.push(proc, GFP_KERNEL)?; + } + } + + for proc in procs { + proc.ioctl_freeze(&info)?; + } + Ok(()) +} + +/// The ioctl handler. +impl Process { + /// Ioctls that are write-only from the perspective of userspace. + /// + /// The kernel will only read from the pointer that userspace provided to us. + fn ioctl_write_only( + this: ArcBorrow<'_, Process>, + _file: &File, + cmd: u32, + reader: &mut UserSliceReader, + ) -> Result { + let thread = this.get_current_thread()?; + match cmd { + uapi::BINDER_SET_MAX_THREADS => this.set_max_threads(reader.read()?), + uapi::BINDER_THREAD_EXIT => this.remove_thread(thread), + uapi::BINDER_SET_CONTEXT_MGR => this.set_as_manager(None, &thread)?, + uapi::BINDER_SET_CONTEXT_MGR_EXT => { + this.set_as_manager(Some(reader.read()?), &thread)? + } + uapi::BINDER_ENABLE_ONEWAY_SPAM_DETECTION => { + this.set_oneway_spam_detection_enabled(reader.read()?) + } + uapi::BINDER_FREEZE => ioctl_freeze(reader)?, + _ => return Err(EINVAL), + } + Ok(()) + } + + /// Ioctls that are read/write from the perspective of userspace. + /// + /// The kernel will both read from and write to the pointer that userspace provided to us. + fn ioctl_write_read( + this: ArcBorrow<'_, Process>, + file: &File, + cmd: u32, + data: UserSlice, + ) -> Result { + let thread = this.get_current_thread()?; + let blocking = (file.flags() & file::flags::O_NONBLOCK) == 0; + match cmd { + uapi::BINDER_WRITE_READ => thread.write_read(data, blocking)?, + uapi::BINDER_GET_NODE_DEBUG_INFO => this.get_node_debug_info(data)?, + uapi::BINDER_GET_NODE_INFO_FOR_REF => this.get_node_info_from_ref(data)?, + uapi::BINDER_VERSION => this.version(data)?, + uapi::BINDER_GET_FROZEN_INFO => get_frozen_status(data)?, + uapi::BINDER_GET_EXTENDED_ERROR => thread.get_extended_error(data)?, + _ => return Err(EINVAL), + } + Ok(()) + } +} + +/// The file operations supported by `Process`. 
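+//
+// Dispatch between the two handlers above happens in `Process::ioctl` below,
+// keyed purely on the ioctl direction bits. For example, in the binder UAPI
+// BINDER_WRITE_READ is declared as _IOWR('b', 1, struct binder_write_read),
+// so _IOC_DIR(cmd) is _IOC_READ | _IOC_WRITE and the command is routed to
+// `ioctl_write_read`, while a write-only command like BINDER_SET_MAX_THREADS
+// (_IOW) lands in `ioctl_write_only`.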
+impl Process { + pub(crate) fn open(ctx: ArcBorrow<'_, Context>, file: &File) -> Result> { + Self::new(ctx.into(), ARef::from(file.cred())) + } + + pub(crate) fn release(this: Arc, _file: &File) { + let binderfs_file; + let should_schedule; + { + let mut inner = this.inner.lock(); + should_schedule = inner.defer_work == 0; + inner.defer_work |= PROC_DEFER_RELEASE; + binderfs_file = inner.binderfs_file.take(); + } + + if should_schedule { + // Ignore failures to schedule to the workqueue. Those just mean that we're already + // scheduled for execution. + let _ = workqueue::system().enqueue(this); + } + + drop(binderfs_file); + } + + pub(crate) fn flush(this: ArcBorrow<'_, Process>) -> Result { + let should_schedule; + { + let mut inner = this.inner.lock(); + should_schedule = inner.defer_work == 0; + inner.defer_work |= PROC_DEFER_FLUSH; + } + + if should_schedule { + // Ignore failures to schedule to the workqueue. Those just mean that we're already + // scheduled for execution. + let _ = workqueue::system().enqueue(Arc::from(this)); + } + Ok(()) + } + + pub(crate) fn ioctl(this: ArcBorrow<'_, Process>, file: &File, cmd: u32, arg: usize) -> Result { + use kernel::ioctl::{_IOC_DIR, _IOC_SIZE}; + use kernel::uapi::{_IOC_READ, _IOC_WRITE}; + + crate::trace::trace_ioctl(cmd, arg); + + let user_slice = UserSlice::new(UserPtr::from_addr(arg), _IOC_SIZE(cmd)); + + const _IOC_READ_WRITE: u32 = _IOC_READ | _IOC_WRITE; + + match _IOC_DIR(cmd) { + _IOC_WRITE => Self::ioctl_write_only(this, file, cmd, &mut user_slice.reader()), + _IOC_READ_WRITE => Self::ioctl_write_read(this, file, cmd, user_slice), + _ => Err(EINVAL), + } + } + + pub(crate) fn compat_ioctl( + this: ArcBorrow<'_, Process>, + file: &File, + cmd: u32, + arg: usize, + ) -> Result { + Self::ioctl(this, file, cmd, arg) + } + + pub(crate) fn mmap( + this: ArcBorrow<'_, Process>, + _file: &File, + vma: &mm::virt::VmaNew, + ) -> Result { + // We don't allow mmap to be used in a different process. + if !core::ptr::eq(kernel::current!().group_leader(), &*this.task) { + return Err(EINVAL); + } + if vma.start() == 0 { + return Err(EINVAL); + } + + vma.try_clear_maywrite().map_err(|_| EPERM)?; + vma.set_dontcopy(); + vma.set_mixedmap(); + + // TODO: Set ops. We need to learn when the user unmaps so that we can stop using it. + this.create_mapping(vma) + } + + pub(crate) fn poll( + this: ArcBorrow<'_, Process>, + file: &File, + table: PollTable<'_>, + ) -> Result { + let thread = this.get_current_thread()?; + let (from_proc, mut mask) = thread.poll(file, table); + if mask == 0 && from_proc && !this.inner.lock().work.is_empty() { + mask |= bindings::POLLIN; + } + Ok(mask) + } +} + +/// Represents that a thread has registered with the `ready_threads` list of its process. +/// +/// The destructor of this type will unregister the thread from the list of ready threads. +pub(crate) struct Registration<'a> { + thread: &'a Arc, +} + +impl<'a> Registration<'a> { + fn new(thread: &'a Arc, guard: &mut Guard<'_, ProcessInner, SpinLockBackend>) -> Self { + assert!(core::ptr::eq(&thread.process.inner, guard.lock_ref())); + // INVARIANT: We are pushing this thread to the right `ready_threads` list. + if let Ok(list_arc) = ListArc::try_from_arc(thread.clone()) { + guard.ready_threads.push_front(list_arc); + } else { + // It is an error to hit this branch, and it should not be reachable. We try to do + // something reasonable when the failure path happens. Most likely, the thread in + // question will sleep forever. 
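+ // A `ListArc` can exist at most once per object, and the invariant
+ // (see `Drop` below) is that a thread is only ever inserted into its
+ // own process' `ready_threads` list, so `try_from_arc` failing here
+ // implies the thread was registered twice.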
+ pr_err!("Same thread registered with `ready_threads` twice."); + } + Self { thread } + } +} + +impl Drop for Registration<'_> { + fn drop(&mut self) { + let mut inner = self.thread.process.inner.lock(); + // SAFETY: The thread has the invariant that we never push it to any other linked list than + // the `ready_threads` list of its parent process. Therefore, the thread is either in that + // list, or in no list. + unsafe { inner.ready_threads.remove(self.thread) }; + } +} + +pub(crate) struct WithNodes<'a> { + pub(crate) inner: Guard<'a, ProcessInner, SpinLockBackend>, + pub(crate) nodes: RBTree>, +} + +impl Drop for WithNodes<'_> { + fn drop(&mut self) { + core::mem::swap(&mut self.nodes, &mut self.inner.nodes); + if self.nodes.iter().next().is_some() { + pr_err!("nodes array was modified while using lock_with_nodes\n"); + } + } +} + +pub(crate) enum GetWorkOrRegister<'a> { + Work(DLArc), + Register(Registration<'a>), +} diff --git a/drivers/android/binder/range_alloc/array.rs b/drivers/android/binder/range_alloc/array.rs new file mode 100644 index 000000000000..07e1dec2ce63 --- /dev/null +++ b/drivers/android/binder/range_alloc/array.rs @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::{ + page::{PAGE_MASK, PAGE_SIZE}, + prelude::*, + seq_file::SeqFile, + seq_print, + task::Pid, +}; + +use crate::range_alloc::{DescriptorState, FreedRange, Range}; + +/// Keeps track of allocations in a process' mmap. +/// +/// Each process has an mmap where the data for incoming transactions will be placed. This struct +/// keeps track of allocations made in the mmap. For each allocation, we store a descriptor that +/// has metadata related to the allocation. We also keep track of available free space. +pub(super) struct ArrayRangeAllocator { + /// This stores all ranges that are allocated. Unlike the tree based allocator, we do *not* + /// store the free ranges. + /// + /// Sorted by offset. + pub(super) ranges: KVec>, + size: usize, + free_oneway_space: usize, +} + +struct FindEmptyRes { + /// Which index in `ranges` should we insert the new range at? + /// + /// Inserting the new range at this index keeps `ranges` sorted. + insert_at_idx: usize, + /// Which offset should we insert the new range at? + insert_at_offset: usize, +} + +impl ArrayRangeAllocator { + pub(crate) fn new(size: usize, alloc: EmptyArrayAlloc) -> Self { + Self { + ranges: alloc.ranges, + size, + free_oneway_space: size / 2, + } + } + + pub(crate) fn free_oneway_space(&self) -> usize { + self.free_oneway_space + } + + pub(crate) fn count_buffers(&self) -> usize { + self.ranges.len() + } + + pub(crate) fn total_size(&self) -> usize { + self.size + } + + pub(crate) fn is_full(&self) -> bool { + self.ranges.len() == self.ranges.capacity() + } + + pub(crate) fn debug_print(&self, m: &SeqFile) -> Result<()> { + for range in &self.ranges { + seq_print!( + m, + " buffer {}: {} size {} pid {} oneway {}", + 0, + range.offset, + range.size, + range.state.pid(), + range.state.is_oneway(), + ); + if let DescriptorState::Reserved(_) = range.state { + seq_print!(m, " reserved\n"); + } else { + seq_print!(m, " allocated\n"); + } + } + Ok(()) + } + + /// Find somewhere to put a new range. + /// + /// Unlike the tree implementation, we do not bother to find the smallest gap. The idea is that + /// fragmentation isn't a big issue when we don't have many ranges. + /// + /// Returns the index that the new range should have in `self.ranges` after insertion. 
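+ ///
+ /// For example (hypothetical numbers): with `total_size` 1024 and existing
+ /// ranges `[0, 100)` and `[150, 300)`, a request of size 64 fits in the tail
+ /// gap `[300, 1024)`, so this returns `insert_at_idx = 2` and
+ /// `insert_at_offset = 300`; the interior gaps are only scanned, first-fit,
+ /// when the tail gap is too small.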
+ fn find_empty_range(&self, size: usize) -> Option { + let after_last_range = self.ranges.last().map(Range::endpoint).unwrap_or(0); + + if size <= self.total_size() - after_last_range { + // We can put the range at the end, so just do that. + Some(FindEmptyRes { + insert_at_idx: self.ranges.len(), + insert_at_offset: after_last_range, + }) + } else { + let mut end_of_prev = 0; + for (i, range) in self.ranges.iter().enumerate() { + // Does it fit before the i'th range? + if size <= range.offset - end_of_prev { + return Some(FindEmptyRes { + insert_at_idx: i, + insert_at_offset: end_of_prev, + }); + } + end_of_prev = range.endpoint(); + } + None + } + } + + pub(crate) fn reserve_new( + &mut self, + debug_id: usize, + size: usize, + is_oneway: bool, + pid: Pid, + ) -> Result { + // Compute new value of free_oneway_space, which is set only on success. + let new_oneway_space = if is_oneway { + match self.free_oneway_space.checked_sub(size) { + Some(new_oneway_space) => new_oneway_space, + None => return Err(ENOSPC), + } + } else { + self.free_oneway_space + }; + + let FindEmptyRes { + insert_at_idx, + insert_at_offset, + } = self.find_empty_range(size).ok_or(ENOSPC)?; + self.free_oneway_space = new_oneway_space; + + let new_range = Range { + offset: insert_at_offset, + size, + state: DescriptorState::new(is_oneway, debug_id, pid), + }; + // Insert the value at the given index to keep the array sorted. + self.ranges + .insert_within_capacity(insert_at_idx, new_range) + .ok() + .unwrap(); + + Ok(insert_at_offset) + } + + pub(crate) fn reservation_abort(&mut self, offset: usize) -> Result { + // This could use a binary search, but linear scans are usually faster for small arrays. + let i = self + .ranges + .iter() + .position(|range| range.offset == offset) + .ok_or(EINVAL)?; + let range = &self.ranges[i]; + + if let DescriptorState::Allocated(_) = range.state { + return Err(EPERM); + } + + let size = range.size; + let offset = range.offset; + + if range.state.is_oneway() { + self.free_oneway_space += size; + } + + // This computes the range of pages that are no longer used by *any* allocated range. The + // caller will mark them as unused, which means that they can be freed if the system comes + // under memory pressure. + let mut freed_range = FreedRange::interior_pages(offset, size); + #[expect(clippy::collapsible_if)] // reads better like this + if offset % PAGE_SIZE != 0 { + if i == 0 || self.ranges[i - 1].endpoint() <= (offset & PAGE_MASK) { + freed_range.start_page_idx -= 1; + } + } + if range.endpoint() % PAGE_SIZE != 0 { + let page_after = (range.endpoint() & PAGE_MASK) + PAGE_SIZE; + if i + 1 == self.ranges.len() || page_after <= self.ranges[i + 1].offset { + freed_range.end_page_idx += 1; + } + } + + self.ranges.remove(i)?; + Ok(freed_range) + } + + pub(crate) fn reservation_commit(&mut self, offset: usize, data: &mut Option) -> Result { + // This could use a binary search, but linear scans are usually faster for small arrays. + let range = self + .ranges + .iter_mut() + .find(|range| range.offset == offset) + .ok_or(ENOENT)?; + + let DescriptorState::Reserved(reservation) = &range.state else { + return Err(ENOENT); + }; + + range.state = DescriptorState::Allocated(reservation.clone().allocate(data.take())); + Ok(()) + } + + pub(crate) fn reserve_existing(&mut self, offset: usize) -> Result<(usize, usize, Option)> { + // This could use a binary search, but linear scans are usually faster for small arrays. 
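+ // (The array backend holds at most TREE_THRESHOLD = 8 ranges before being
+ // promoted to the tree, so the scan touches very few elements. The
+ // binary-search alternative would be something like
+ // `self.ranges.binary_search_by_key(&offset, |r| r.offset)`.)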
+ let range = self + .ranges + .iter_mut() + .find(|range| range.offset == offset) + .ok_or(ENOENT)?; + + let DescriptorState::Allocated(allocation) = &mut range.state else { + return Err(ENOENT); + }; + + let data = allocation.take(); + let debug_id = allocation.reservation.debug_id; + range.state = DescriptorState::Reserved(allocation.reservation.clone()); + Ok((range.size, debug_id, data)) + } + + pub(crate) fn take_for_each)>(&mut self, callback: F) { + for range in self.ranges.iter_mut() { + if let DescriptorState::Allocated(allocation) = &mut range.state { + callback( + range.offset, + range.size, + allocation.reservation.debug_id, + allocation.data.take(), + ); + } + } + } +} + +pub(crate) struct EmptyArrayAlloc { + ranges: KVec>, +} + +impl EmptyArrayAlloc { + pub(crate) fn try_new(capacity: usize) -> Result { + Ok(Self { + ranges: KVec::with_capacity(capacity, GFP_KERNEL)?, + }) + } +} diff --git a/drivers/android/binder/range_alloc/mod.rs b/drivers/android/binder/range_alloc/mod.rs new file mode 100644 index 000000000000..2301e2bc1a1f --- /dev/null +++ b/drivers/android/binder/range_alloc/mod.rs @@ -0,0 +1,329 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::{page::PAGE_SIZE, prelude::*, seq_file::SeqFile, task::Pid}; + +mod tree; +use self::tree::{FromArrayAllocs, ReserveNewTreeAlloc, TreeRangeAllocator}; + +mod array; +use self::array::{ArrayRangeAllocator, EmptyArrayAlloc}; + +enum DescriptorState { + Reserved(Reservation), + Allocated(Allocation), +} + +impl DescriptorState { + fn new(is_oneway: bool, debug_id: usize, pid: Pid) -> Self { + DescriptorState::Reserved(Reservation { + debug_id, + is_oneway, + pid, + }) + } + + fn pid(&self) -> Pid { + match self { + DescriptorState::Reserved(inner) => inner.pid, + DescriptorState::Allocated(inner) => inner.reservation.pid, + } + } + + fn is_oneway(&self) -> bool { + match self { + DescriptorState::Reserved(inner) => inner.is_oneway, + DescriptorState::Allocated(inner) => inner.reservation.is_oneway, + } + } +} + +#[derive(Clone)] +struct Reservation { + debug_id: usize, + is_oneway: bool, + pid: Pid, +} + +impl Reservation { + fn allocate(self, data: Option) -> Allocation { + Allocation { + data, + reservation: self, + } + } +} + +struct Allocation { + reservation: Reservation, + data: Option, +} + +impl Allocation { + fn deallocate(self) -> (Reservation, Option) { + (self.reservation, self.data) + } + + fn debug_id(&self) -> usize { + self.reservation.debug_id + } + + fn take(&mut self) -> Option { + self.data.take() + } +} + +/// The array implementation must switch to the tree if it wants to go beyond this number of +/// ranges. +const TREE_THRESHOLD: usize = 8; + +/// Represents a range of pages that have just become completely free. 
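+///
+/// The indices are conservative: `interior_pages` counts only pages fully
+/// covered by the freed bytes. E.g. with 4 KiB pages, freeing offset 8292
+/// with size 12288 gives `start_page_idx = 3` and `end_page_idx = 5`
+/// (exclusive), i.e. pages 3 and 4; callers widen the range afterwards when
+/// a neighbouring partial page turns out to be unused as well.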
+#[derive(Copy, Clone)] +pub(crate) struct FreedRange { + pub(crate) start_page_idx: usize, + pub(crate) end_page_idx: usize, +} + +impl FreedRange { + fn interior_pages(offset: usize, size: usize) -> FreedRange { + FreedRange { + // Divide round up + start_page_idx: offset.div_ceil(PAGE_SIZE), + // Divide round down + end_page_idx: (offset + size) / PAGE_SIZE, + } + } +} + +struct Range { + offset: usize, + size: usize, + state: DescriptorState, +} + +impl Range { + fn endpoint(&self) -> usize { + self.offset + self.size + } +} + +pub(crate) struct RangeAllocator { + inner: Impl, +} + +enum Impl { + Empty(usize), + Array(ArrayRangeAllocator), + Tree(TreeRangeAllocator), +} + +impl RangeAllocator { + pub(crate) fn new(size: usize) -> Self { + Self { + inner: Impl::Empty(size), + } + } + + pub(crate) fn free_oneway_space(&self) -> usize { + match &self.inner { + Impl::Empty(size) => size / 2, + Impl::Array(array) => array.free_oneway_space(), + Impl::Tree(tree) => tree.free_oneway_space(), + } + } + + pub(crate) fn count_buffers(&self) -> usize { + match &self.inner { + Impl::Empty(_size) => 0, + Impl::Array(array) => array.count_buffers(), + Impl::Tree(tree) => tree.count_buffers(), + } + } + + pub(crate) fn debug_print(&self, m: &SeqFile) -> Result<()> { + match &self.inner { + Impl::Empty(_size) => Ok(()), + Impl::Array(array) => array.debug_print(m), + Impl::Tree(tree) => tree.debug_print(m), + } + } + + /// Try to reserve a new buffer, using the provided allocation if necessary. + pub(crate) fn reserve_new(&mut self, mut args: ReserveNewArgs) -> Result> { + match &mut self.inner { + Impl::Empty(size) => { + let empty_array = match args.empty_array_alloc.take() { + Some(empty_array) => ArrayRangeAllocator::new(*size, empty_array), + None => { + return Ok(ReserveNew::NeedAlloc(ReserveNewNeedAlloc { + args, + need_empty_array_alloc: true, + need_new_tree_alloc: false, + need_tree_alloc: false, + })) + } + }; + + self.inner = Impl::Array(empty_array); + self.reserve_new(args) + } + Impl::Array(array) if array.is_full() => { + let allocs = match args.new_tree_alloc { + Some(ref mut allocs) => allocs, + None => { + return Ok(ReserveNew::NeedAlloc(ReserveNewNeedAlloc { + args, + need_empty_array_alloc: false, + need_new_tree_alloc: true, + need_tree_alloc: true, + })) + } + }; + + let new_tree = + TreeRangeAllocator::from_array(array.total_size(), &mut array.ranges, allocs); + + self.inner = Impl::Tree(new_tree); + self.reserve_new(args) + } + Impl::Array(array) => { + let offset = + array.reserve_new(args.debug_id, args.size, args.is_oneway, args.pid)?; + Ok(ReserveNew::Success(ReserveNewSuccess { + offset, + oneway_spam_detected: false, + _empty_array_alloc: args.empty_array_alloc, + _new_tree_alloc: args.new_tree_alloc, + _tree_alloc: args.tree_alloc, + })) + } + Impl::Tree(tree) => { + let alloc = match args.tree_alloc { + Some(alloc) => alloc, + None => { + return Ok(ReserveNew::NeedAlloc(ReserveNewNeedAlloc { + args, + need_empty_array_alloc: false, + need_new_tree_alloc: false, + need_tree_alloc: true, + })); + } + }; + let (offset, oneway_spam_detected) = + tree.reserve_new(args.debug_id, args.size, args.is_oneway, args.pid, alloc)?; + Ok(ReserveNew::Success(ReserveNewSuccess { + offset, + oneway_spam_detected, + _empty_array_alloc: args.empty_array_alloc, + _new_tree_alloc: args.new_tree_alloc, + _tree_alloc: None, + })) + } + } + } + + /// Deletes the allocations at `offset`. 
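+///
+/// This is also where the allocator shrinks back down: when the tree backend
+/// becomes empty, the implementation below resets `self.inner` to
+/// `Impl::Empty`, mirroring the Empty -> Array -> Tree promotion performed in
+/// `reserve_new`.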
+ pub(crate) fn reservation_abort(&mut self, offset: usize) -> Result {
+ match &mut self.inner {
+ Impl::Empty(_size) => Err(EINVAL),
+ Impl::Array(array) => array.reservation_abort(offset),
+ Impl::Tree(tree) => {
+ let freed_range = tree.reservation_abort(offset)?;
+ if tree.is_empty() {
+ self.inner = Impl::Empty(tree.total_size());
+ }
+ Ok(freed_range)
+ }
+ }
+ }
+
+ /// Called when an allocation is no longer in use by the kernel.
+ ///
+ /// The value in `data` will be stored, if any. A mutable reference is used to avoid dropping
+ /// the `T` when an error is returned.
+ pub(crate) fn reservation_commit(&mut self, offset: usize, data: &mut Option) -> Result {
+ match &mut self.inner {
+ Impl::Empty(_size) => Err(EINVAL),
+ Impl::Array(array) => array.reservation_commit(offset, data),
+ Impl::Tree(tree) => tree.reservation_commit(offset, data),
+ }
+ }
+
+ /// Called when the kernel starts using an allocation.
+ ///
+ /// Returns the size of the existing entry and the data associated with it.
+ pub(crate) fn reserve_existing(&mut self, offset: usize) -> Result<(usize, usize, Option)> {
+ match &mut self.inner {
+ Impl::Empty(_size) => Err(EINVAL),
+ Impl::Array(array) => array.reserve_existing(offset),
+ Impl::Tree(tree) => tree.reserve_existing(offset),
+ }
+ }
+
+ /// Call the provided callback at every allocated region.
+ ///
+ /// This destroys the range allocator. Used only during shutdown.
+ pub(crate) fn take_for_each)>(&mut self, callback: F) {
+ match &mut self.inner {
+ Impl::Empty(_size) => {}
+ Impl::Array(array) => array.take_for_each(callback),
+ Impl::Tree(tree) => tree.take_for_each(callback),
+ }
+ }
+}
+
+/// The arguments for `reserve_new`.
+#[derive(Default)]
+pub(crate) struct ReserveNewArgs {
+ pub(crate) size: usize,
+ pub(crate) is_oneway: bool,
+ pub(crate) debug_id: usize,
+ pub(crate) pid: Pid,
+ pub(crate) empty_array_alloc: Option>,
+ pub(crate) new_tree_alloc: Option>,
+ pub(crate) tree_alloc: Option>,
+}
+
+/// The return type of `reserve_new`.
+pub(crate) enum ReserveNew {
+ Success(ReserveNewSuccess),
+ NeedAlloc(ReserveNewNeedAlloc),
+}
+
+/// Returned by `reserve_new` when the reservation was successful.
+pub(crate) struct ReserveNewSuccess {
+ pub(crate) offset: usize,
+ pub(crate) oneway_spam_detected: bool,
+
+ // If the user supplied an allocation that we did not end up using, then we return it here.
+ // The caller will kfree it outside of the lock.
+ _empty_array_alloc: Option>,
+ _new_tree_alloc: Option>,
+ _tree_alloc: Option>,
+}
+
+/// Returned by `reserve_new` to request the caller to make an allocation before calling the method
+/// again.
+pub(crate) struct ReserveNewNeedAlloc {
+ args: ReserveNewArgs,
+ need_empty_array_alloc: bool,
+ need_new_tree_alloc: bool,
+ need_tree_alloc: bool,
+}
+
+impl ReserveNewNeedAlloc {
+ /// Make the necessary allocations for another call to `reserve_new`.
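+ ///
+ /// The intended calling pattern (a sketch, not verbatim driver code) is a
+ /// retry loop that performs the allocations without holding the lock that
+ /// protects the range allocator:
+ ///
+ ///     let mut args = ReserveNewArgs { size, is_oneway, debug_id, pid, ..Default::default() };
+ ///     let offset = loop {
+ ///         match range_alloc.reserve_new(args)? {
+ ///             ReserveNew::Success(success) => break success.offset,
+ ///             ReserveNew::NeedAlloc(need) => args = need.make_alloc()?,
+ ///         }
+ ///     };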
+ pub(crate) fn make_alloc(mut self) -> Result> { + if self.need_empty_array_alloc && self.args.empty_array_alloc.is_none() { + self.args.empty_array_alloc = Some(EmptyArrayAlloc::try_new(TREE_THRESHOLD)?); + } + if self.need_new_tree_alloc && self.args.new_tree_alloc.is_none() { + self.args.new_tree_alloc = Some(FromArrayAllocs::try_new(TREE_THRESHOLD)?); + } + if self.need_tree_alloc && self.args.tree_alloc.is_none() { + self.args.tree_alloc = Some(ReserveNewTreeAlloc::try_new()?); + } + Ok(self.args) + } +} diff --git a/drivers/android/binder/range_alloc/tree.rs b/drivers/android/binder/range_alloc/tree.rs new file mode 100644 index 000000000000..7b1a248fcb02 --- /dev/null +++ b/drivers/android/binder/range_alloc/tree.rs @@ -0,0 +1,488 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::{ + page::PAGE_SIZE, + prelude::*, + rbtree::{RBTree, RBTreeNode, RBTreeNodeReservation}, + seq_file::SeqFile, + seq_print, + task::Pid, +}; + +use crate::range_alloc::{DescriptorState, FreedRange, Range}; + +/// Keeps track of allocations in a process' mmap. +/// +/// Each process has an mmap where the data for incoming transactions will be placed. This struct +/// keeps track of allocations made in the mmap. For each allocation, we store a descriptor that +/// has metadata related to the allocation. We also keep track of available free space. +pub(super) struct TreeRangeAllocator { + /// This collection contains descriptors for *both* ranges containing an allocation, *and* free + /// ranges between allocations. The free ranges get merged, so there are never two free ranges + /// next to each other. + tree: RBTree>, + /// Contains an entry for every free range in `self.tree`. This tree sorts the ranges by size, + /// letting us look up the smallest range whose size is at least some lower bound. + free_tree: RBTree, + size: usize, + free_oneway_space: usize, +} + +impl TreeRangeAllocator { + pub(crate) fn from_array( + size: usize, + ranges: &mut KVec>, + alloc: &mut FromArrayAllocs, + ) -> Self { + let mut tree = TreeRangeAllocator { + tree: RBTree::new(), + free_tree: RBTree::new(), + size, + free_oneway_space: size / 2, + }; + + let mut free_offset = 0; + for range in ranges.drain_all() { + let free_size = range.offset - free_offset; + if free_size > 0 { + let free_node = alloc.free_tree.pop().unwrap(); + tree.free_tree + .insert(free_node.into_node((free_size, free_offset), ())); + let tree_node = alloc.tree.pop().unwrap(); + tree.tree.insert( + tree_node.into_node(free_offset, Descriptor::new(free_offset, free_size)), + ); + } + free_offset = range.endpoint(); + + if range.state.is_oneway() { + tree.free_oneway_space = tree.free_oneway_space.saturating_sub(range.size); + } + + let free_res = alloc.free_tree.pop().unwrap(); + let tree_node = alloc.tree.pop().unwrap(); + let mut desc = Descriptor::new(range.offset, range.size); + desc.state = Some((range.state, free_res)); + tree.tree.insert(tree_node.into_node(range.offset, desc)); + } + + // After the last range, we may need a free range. 
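+ //
+ // In the worst case, `len` allocated ranges convert into `len + 1`
+ // interleaved free descriptors (a gap before each range plus this
+ // trailing free range), which is why `FromArrayAllocs::try_new`
+ // preallocates `2 * len + 1` reservations for each tree.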
+ if free_offset < size {
+ let free_size = size - free_offset;
+ let free_node = alloc.free_tree.pop().unwrap();
+ tree.free_tree
+ .insert(free_node.into_node((free_size, free_offset), ()));
+ let tree_node = alloc.tree.pop().unwrap();
+ tree.tree
+ .insert(tree_node.into_node(free_offset, Descriptor::new(free_offset, free_size)));
+ }
+
+ tree
+ }
+
+ pub(crate) fn is_empty(&self) -> bool {
+ let mut tree_iter = self.tree.values();
+ // There's always at least one range, because index zero is either the start of a free or
+ // allocated range.
+ let first_value = tree_iter.next().unwrap();
+ if tree_iter.next().is_some() {
+ // There are never two free ranges next to each other, so if there is more than one
+ // descriptor, then at least one of them must hold an allocated range.
+ return false;
+ }
+ // There is only one descriptor. Return true if it is for a free range.
+ first_value.state.is_none()
+ }
+
+ pub(crate) fn total_size(&self) -> usize {
+ self.size
+ }
+
+ pub(crate) fn free_oneway_space(&self) -> usize {
+ self.free_oneway_space
+ }
+
+ pub(crate) fn count_buffers(&self) -> usize {
+ self.tree
+ .values()
+ .filter(|desc| desc.state.is_some())
+ .count()
+ }
+
+ pub(crate) fn debug_print(&self, m: &SeqFile) -> Result<()> {
+ for desc in self.tree.values() {
+ let state = match &desc.state {
+ Some(state) => &state.0,
+ None => continue,
+ };
+ seq_print!(
+ m,
+ " buffer: {} size {} pid {}",
+ desc.offset,
+ desc.size,
+ state.pid(),
+ );
+ if state.is_oneway() {
+ seq_print!(m, " oneway");
+ }
+ match state {
+ DescriptorState::Reserved(_res) => {
+ seq_print!(m, " reserved\n");
+ }
+ DescriptorState::Allocated(_alloc) => {
+ seq_print!(m, " allocated\n");
+ }
+ }
+ }
+ Ok(())
+ }
+
+ fn find_best_match(&mut self, size: usize) -> Option<&mut Descriptor> {
+ let free_cursor = self.free_tree.cursor_lower_bound(&(size, 0))?;
+ let ((_, offset), ()) = free_cursor.current();
+ self.tree.get_mut(offset)
+ }
+
+ /// Try to reserve a new buffer, using the provided allocation if necessary.
+ pub(crate) fn reserve_new(
+ &mut self,
+ debug_id: usize,
+ size: usize,
+ is_oneway: bool,
+ pid: Pid,
+ alloc: ReserveNewTreeAlloc,
+ ) -> Result<(usize, bool)> {
+ // Compute new value of free_oneway_space, which is set only on success.
+ let new_oneway_space = if is_oneway {
+ match self.free_oneway_space.checked_sub(size) {
+ Some(new_oneway_space) => new_oneway_space,
+ None => return Err(ENOSPC),
+ }
+ } else {
+ self.free_oneway_space
+ };
+
+ // Start detecting spammers once we have less than 20%
+ // of async space left (which is less than 10% of total
+ // buffer size).
+ //
+ // (This will short-circuit, so `low_oneway_space` is
+ // only called when necessary.)
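+ //
+ // Concretely, for a mapping of total size S: oneway space starts at
+ // S / 2, the check below fires once less than S / 10 of it remains,
+ // and `low_oneway_space` then flags the calling pid if it alone holds
+ // more than 50 oneway buffers or more than S / 4 bytes (half of the
+ // oneway space), matching the heuristics of the C binder driver.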
+ let oneway_spam_detected = + is_oneway && new_oneway_space < self.size / 10 && self.low_oneway_space(pid); + + let (found_size, found_off, tree_node, free_tree_node) = match self.find_best_match(size) { + None => { + pr_warn!("ENOSPC from range_alloc.reserve_new - size: {}", size); + return Err(ENOSPC); + } + Some(desc) => { + let found_size = desc.size; + let found_offset = desc.offset; + + // In case we need to break up the descriptor + let new_desc = Descriptor::new(found_offset + size, found_size - size); + let (tree_node, free_tree_node, desc_node_res) = alloc.initialize(new_desc); + + desc.state = Some(( + DescriptorState::new(is_oneway, debug_id, pid), + desc_node_res, + )); + desc.size = size; + + (found_size, found_offset, tree_node, free_tree_node) + } + }; + self.free_oneway_space = new_oneway_space; + self.free_tree.remove(&(found_size, found_off)); + + if found_size != size { + self.tree.insert(tree_node); + self.free_tree.insert(free_tree_node); + } + + Ok((found_off, oneway_spam_detected)) + } + + pub(crate) fn reservation_abort(&mut self, offset: usize) -> Result { + let mut cursor = self.tree.cursor_lower_bound(&offset).ok_or_else(|| { + pr_warn!( + "EINVAL from range_alloc.reservation_abort - offset: {}", + offset + ); + EINVAL + })?; + + let (_, desc) = cursor.current_mut(); + + if desc.offset != offset { + pr_warn!( + "EINVAL from range_alloc.reservation_abort - offset: {}", + offset + ); + return Err(EINVAL); + } + + let (reservation, free_node_res) = desc.try_change_state(|state| match state { + Some((DescriptorState::Reserved(reservation), free_node_res)) => { + (None, Ok((reservation, free_node_res))) + } + None => { + pr_warn!( + "EINVAL from range_alloc.reservation_abort - offset: {}", + offset + ); + (None, Err(EINVAL)) + } + allocated => { + pr_warn!( + "EPERM from range_alloc.reservation_abort - offset: {}", + offset + ); + (allocated, Err(EPERM)) + } + })?; + + let mut size = desc.size; + let mut offset = desc.offset; + let free_oneway_space_add = if reservation.is_oneway { size } else { 0 }; + + self.free_oneway_space += free_oneway_space_add; + + let mut freed_range = FreedRange::interior_pages(offset, size); + // Compute how large the next free region needs to be to include one more page in + // the newly freed range. + let add_next_page_needed = match (offset + size) % PAGE_SIZE { + 0 => usize::MAX, + unalign => PAGE_SIZE - unalign, + }; + // Compute how large the previous free region needs to be to include one more page + // in the newly freed range. 
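+ //
+ // Example (hypothetical numbers, 4 KiB pages): for offset 4608,
+ // `offset % PAGE_SIZE` is 512, so the preceding free region must be at
+ // least 512 bytes to reach back to the page boundary at 4096, at which
+ // point the partially covered first page also becomes freeable.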
+ let add_prev_page_needed = match offset % PAGE_SIZE { + 0 => usize::MAX, + unalign => unalign, + }; + + // Merge next into current if next is free + let remove_next = match cursor.peek_next() { + Some((_, next)) if next.state.is_none() => { + if next.size >= add_next_page_needed { + freed_range.end_page_idx += 1; + } + self.free_tree.remove(&(next.size, next.offset)); + size += next.size; + true + } + _ => false, + }; + + if remove_next { + let (_, desc) = cursor.current_mut(); + desc.size = size; + cursor.remove_next(); + } + + // Merge current into prev if prev is free + match cursor.peek_prev_mut() { + Some((_, prev)) if prev.state.is_none() => { + if prev.size >= add_prev_page_needed { + freed_range.start_page_idx -= 1; + } + // merge previous with current, remove current + self.free_tree.remove(&(prev.size, prev.offset)); + offset = prev.offset; + size += prev.size; + prev.size = size; + cursor.remove_current(); + } + _ => {} + }; + + self.free_tree + .insert(free_node_res.into_node((size, offset), ())); + + Ok(freed_range) + } + + pub(crate) fn reservation_commit(&mut self, offset: usize, data: &mut Option) -> Result { + let desc = self.tree.get_mut(&offset).ok_or(ENOENT)?; + + desc.try_change_state(|state| match state { + Some((DescriptorState::Reserved(reservation), free_node_res)) => ( + Some(( + DescriptorState::Allocated(reservation.allocate(data.take())), + free_node_res, + )), + Ok(()), + ), + other => (other, Err(ENOENT)), + }) + } + + /// Takes an entry at the given offset from [`DescriptorState::Allocated`] to + /// [`DescriptorState::Reserved`]. + /// + /// Returns the size of the existing entry and the data associated with it. + pub(crate) fn reserve_existing(&mut self, offset: usize) -> Result<(usize, usize, Option)> { + let desc = self.tree.get_mut(&offset).ok_or_else(|| { + pr_warn!( + "ENOENT from range_alloc.reserve_existing - offset: {}", + offset + ); + ENOENT + })?; + + let (debug_id, data) = desc.try_change_state(|state| match state { + Some((DescriptorState::Allocated(allocation), free_node_res)) => { + let (reservation, data) = allocation.deallocate(); + let debug_id = reservation.debug_id; + ( + Some((DescriptorState::Reserved(reservation), free_node_res)), + Ok((debug_id, data)), + ) + } + other => { + pr_warn!( + "ENOENT from range_alloc.reserve_existing - offset: {}", + offset + ); + (other, Err(ENOENT)) + } + })?; + + Ok((desc.size, debug_id, data)) + } + + /// Call the provided callback at every allocated region. + /// + /// This destroys the range allocator. Used only during shutdown. + pub(crate) fn take_for_each)>(&mut self, callback: F) { + for (_, desc) in self.tree.iter_mut() { + if let Some((DescriptorState::Allocated(allocation), _)) = &mut desc.state { + callback( + desc.offset, + desc.size, + allocation.debug_id(), + allocation.take(), + ); + } + } + } + + /// Find the amount and size of buffers allocated by the current caller. + /// + /// The idea is that once we cross the threshold, whoever is responsible + /// for the low async space is likely to try to send another async transaction, + /// and at some point we'll catch them in the act. This is more efficient + /// than keeping a map per pid. 
+ fn low_oneway_space(&self, calling_pid: Pid) -> bool { + let mut total_alloc_size = 0; + let mut num_buffers = 0; + for (_, desc) in self.tree.iter() { + if let Some((state, _)) = &desc.state { + if state.is_oneway() && state.pid() == calling_pid { + total_alloc_size += desc.size; + num_buffers += 1; + } + } + } + + // Warn if this pid has more than 50 transactions, or more than 50% of + // async space (which is 25% of total buffer size). Oneway spam is only + // detected when the threshold is exceeded. + num_buffers > 50 || total_alloc_size > self.size / 4 + } +} + +type TreeDescriptorState = (DescriptorState, FreeNodeRes); +struct Descriptor { + size: usize, + offset: usize, + state: Option>, +} + +impl Descriptor { + fn new(offset: usize, size: usize) -> Self { + Self { + size, + offset, + state: None, + } + } + + fn try_change_state(&mut self, f: F) -> Result + where + F: FnOnce(Option>) -> (Option>, Result), + { + let (new_state, result) = f(self.state.take()); + self.state = new_state; + result + } +} + +// (Descriptor.size, Descriptor.offset) +type FreeKey = (usize, usize); +type FreeNodeRes = RBTreeNodeReservation; + +/// An allocation for use by `reserve_new`. +pub(crate) struct ReserveNewTreeAlloc { + tree_node_res: RBTreeNodeReservation>, + free_tree_node_res: FreeNodeRes, + desc_node_res: FreeNodeRes, +} + +impl ReserveNewTreeAlloc { + pub(crate) fn try_new() -> Result { + let tree_node_res = RBTreeNodeReservation::new(GFP_KERNEL)?; + let free_tree_node_res = RBTreeNodeReservation::new(GFP_KERNEL)?; + let desc_node_res = RBTreeNodeReservation::new(GFP_KERNEL)?; + Ok(Self { + tree_node_res, + free_tree_node_res, + desc_node_res, + }) + } + + fn initialize( + self, + desc: Descriptor, + ) -> ( + RBTreeNode>, + RBTreeNode, + FreeNodeRes, + ) { + let size = desc.size; + let offset = desc.offset; + ( + self.tree_node_res.into_node(offset, desc), + self.free_tree_node_res.into_node((size, offset), ()), + self.desc_node_res, + ) + } +} + +/// An allocation for creating a tree from an `ArrayRangeAllocator`. +pub(crate) struct FromArrayAllocs { + tree: KVec>>, + free_tree: KVec>, +} + +impl FromArrayAllocs { + pub(crate) fn try_new(len: usize) -> Result { + let num_descriptors = 2 * len + 1; + + let mut tree = KVec::with_capacity(num_descriptors, GFP_KERNEL)?; + for _ in 0..num_descriptors { + tree.push(RBTreeNodeReservation::new(GFP_KERNEL)?, GFP_KERNEL)?; + } + + let mut free_tree = KVec::with_capacity(num_descriptors, GFP_KERNEL)?; + for _ in 0..num_descriptors { + free_tree.push(RBTreeNodeReservation::new(GFP_KERNEL)?, GFP_KERNEL)?; + } + + Ok(Self { tree, free_tree }) + } +} diff --git a/drivers/android/binder/rust_binder.h b/drivers/android/binder/rust_binder.h new file mode 100644 index 000000000000..31806890ed1a --- /dev/null +++ b/drivers/android/binder/rust_binder.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2025 Google, Inc. + */ + +#ifndef _LINUX_RUST_BINDER_H +#define _LINUX_RUST_BINDER_H + +#include +#include + +/* + * These symbols are exposed by `rust_binderfs.c` and exist here so that Rust + * Binder can call them. 
+ */ +int init_rust_binderfs(void); + +struct dentry; +struct inode; +struct dentry *rust_binderfs_create_proc_file(struct inode *nodp, int pid); +void rust_binderfs_remove_file(struct dentry *dentry); + +#endif diff --git a/drivers/android/binder/rust_binder_events.c b/drivers/android/binder/rust_binder_events.c new file mode 100644 index 000000000000..488b1470060c --- /dev/null +++ b/drivers/android/binder/rust_binder_events.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* rust_binder_events.c + * + * Rust Binder tracepoints. + * + * Copyright 2025 Google LLC + */ + +#include "rust_binder.h" + +const char * const binder_command_strings[] = { + "BC_TRANSACTION", + "BC_REPLY", + "BC_ACQUIRE_RESULT", + "BC_FREE_BUFFER", + "BC_INCREFS", + "BC_ACQUIRE", + "BC_RELEASE", + "BC_DECREFS", + "BC_INCREFS_DONE", + "BC_ACQUIRE_DONE", + "BC_ATTEMPT_ACQUIRE", + "BC_REGISTER_LOOPER", + "BC_ENTER_LOOPER", + "BC_EXIT_LOOPER", + "BC_REQUEST_DEATH_NOTIFICATION", + "BC_CLEAR_DEATH_NOTIFICATION", + "BC_DEAD_BINDER_DONE", + "BC_TRANSACTION_SG", + "BC_REPLY_SG", +}; + +const char * const binder_return_strings[] = { + "BR_ERROR", + "BR_OK", + "BR_TRANSACTION", + "BR_REPLY", + "BR_ACQUIRE_RESULT", + "BR_DEAD_REPLY", + "BR_TRANSACTION_COMPLETE", + "BR_INCREFS", + "BR_ACQUIRE", + "BR_RELEASE", + "BR_DECREFS", + "BR_ATTEMPT_ACQUIRE", + "BR_NOOP", + "BR_SPAWN_LOOPER", + "BR_FINISHED", + "BR_DEAD_BINDER", + "BR_CLEAR_DEATH_NOTIFICATION_DONE", + "BR_FAILED_REPLY", + "BR_FROZEN_REPLY", + "BR_ONEWAY_SPAM_SUSPECT", + "BR_TRANSACTION_PENDING_FROZEN" +}; + +#define CREATE_TRACE_POINTS +#define CREATE_RUST_TRACE_POINTS +#include "rust_binder_events.h" diff --git a/drivers/android/binder/rust_binder_events.h b/drivers/android/binder/rust_binder_events.h new file mode 100644 index 000000000000..2f3efbf9dba6 --- /dev/null +++ b/drivers/android/binder/rust_binder_events.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2025 Google, Inc. + */ + +#undef TRACE_SYSTEM +#undef TRACE_INCLUDE_FILE +#undef TRACE_INCLUDE_PATH +#define TRACE_SYSTEM rust_binder +#define TRACE_INCLUDE_FILE rust_binder_events +#define TRACE_INCLUDE_PATH ../drivers/android/binder + +#if !defined(_RUST_BINDER_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _RUST_BINDER_TRACE_H + +#include + +TRACE_EVENT(rust_binder_ioctl, + TP_PROTO(unsigned int cmd, unsigned long arg), + TP_ARGS(cmd, arg), + + TP_STRUCT__entry( + __field(unsigned int, cmd) + __field(unsigned long, arg) + ), + TP_fast_assign( + __entry->cmd = cmd; + __entry->arg = arg; + ), + TP_printk("cmd=0x%x arg=0x%lx", __entry->cmd, __entry->arg) +); + +#endif /* _RUST_BINDER_TRACE_H */ + +/* This part must be outside protection */ +#include diff --git a/drivers/android/binder/rust_binder_internal.h b/drivers/android/binder/rust_binder_internal.h new file mode 100644 index 000000000000..78288fe7964d --- /dev/null +++ b/drivers/android/binder/rust_binder_internal.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* rust_binder_internal.h + * + * This file contains internal data structures used by Rust Binder. Mostly, + * these are type definitions used only by binderfs or things that Rust Binder + * define and export to binderfs. + * + * It does not include things exported by binderfs to Rust Binder since this + * file is not included as input to bindgen. + * + * Copyright (C) 2025 Google LLC. 
+ */ + +#ifndef _LINUX_RUST_BINDER_INTERNAL_H +#define _LINUX_RUST_BINDER_INTERNAL_H + +#define RUST_BINDERFS_SUPER_MAGIC 0x6c6f6f71 + +#include +#include +#include + +/* + * The internal data types in the Rust Binder driver are opaque to C, so we use + * void pointer typedefs for these types. + */ +typedef void *rust_binder_context; + +/** + * struct binder_device - information about a binder device node + * @minor: the minor number used by this device + * @ctx: the Rust Context used by this device, or null for binder-control + * + * This is used as the private data for files directly in binderfs, but not + * files in the binder_logs subdirectory. This struct owns a refcount on `ctx` + * and the entry for `minor` in `binderfs_minors`. For binder-control `ctx` is + * null. + */ +struct binder_device { + int minor; + rust_binder_context ctx; +}; + +int rust_binder_stats_show(struct seq_file *m, void *unused); +int rust_binder_state_show(struct seq_file *m, void *unused); +int rust_binder_transactions_show(struct seq_file *m, void *unused); +int rust_binder_proc_show(struct seq_file *m, void *pid); + +extern const struct file_operations rust_binder_fops; +rust_binder_context rust_binder_new_context(char *name); +void rust_binder_remove_context(rust_binder_context device); + +/** + * binderfs_mount_opts - mount options for binderfs + * @max: maximum number of allocatable binderfs binder devices + * @stats_mode: enable binder stats in binderfs. + */ +struct binderfs_mount_opts { + int max; + int stats_mode; +}; + +/** + * binderfs_info - information about a binderfs mount + * @ipc_ns: The ipc namespace the binderfs mount belongs to. + * @control_dentry: This records the dentry of this binderfs mount + * binder-control device. + * @root_uid: uid that needs to be used when a new binder device is + * created. + * @root_gid: gid that needs to be used when a new binder device is + * created. + * @mount_opts: The mount options in use. + * @device_count: The current number of allocated binder devices. + * @proc_log_dir: Pointer to the directory dentry containing process-specific + * logs. + */ +struct binderfs_info { + struct ipc_namespace *ipc_ns; + struct dentry *control_dentry; + kuid_t root_uid; + kgid_t root_gid; + struct binderfs_mount_opts mount_opts; + int device_count; + struct dentry *proc_log_dir; +}; + +#endif /* _LINUX_RUST_BINDER_INTERNAL_H */ diff --git a/drivers/android/binder/rust_binder_main.rs b/drivers/android/binder/rust_binder_main.rs new file mode 100644 index 000000000000..6773b7c273ec --- /dev/null +++ b/drivers/android/binder/rust_binder_main.rs @@ -0,0 +1,627 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +//! Binder -- the Android IPC mechanism. 
+#![recursion_limit = "256"] +#![allow( + clippy::as_underscore, + clippy::ref_as_ptr, + clippy::ptr_as_ptr, + clippy::cast_lossless +)] + +use kernel::{ + bindings::{self, seq_file}, + fs::File, + list::{ListArc, ListArcSafe, ListLinksSelfPtr, TryNewListArc}, + prelude::*, + seq_file::SeqFile, + seq_print, + sync::poll::PollTable, + sync::Arc, + task::Pid, + transmute::AsBytes, + types::ForeignOwnable, + uaccess::UserSliceWriter, +}; + +use crate::{context::Context, page_range::Shrinker, process::Process, thread::Thread}; + +use core::{ + ptr::NonNull, + sync::atomic::{AtomicBool, AtomicUsize, Ordering}, +}; + +mod allocation; +mod context; +mod deferred_close; +mod defs; +mod error; +mod node; +mod page_range; +mod process; +mod range_alloc; +mod stats; +mod thread; +mod trace; +mod transaction; + +#[allow(warnings)] // generated bindgen code +mod binderfs { + use kernel::bindings::{dentry, inode}; + + extern "C" { + pub fn init_rust_binderfs() -> kernel::ffi::c_int; + } + extern "C" { + pub fn rust_binderfs_create_proc_file( + nodp: *mut inode, + pid: kernel::ffi::c_int, + ) -> *mut dentry; + } + extern "C" { + pub fn rust_binderfs_remove_file(dentry: *mut dentry); + } + pub type rust_binder_context = *mut kernel::ffi::c_void; + #[repr(C)] + #[derive(Copy, Clone)] + pub struct binder_device { + pub minor: kernel::ffi::c_int, + pub ctx: rust_binder_context, + } + impl Default for binder_device { + fn default() -> Self { + let mut s = ::core::mem::MaybeUninit::::uninit(); + unsafe { + ::core::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } + } + } +} + +module! { + type: BinderModule, + name: "rust_binder", + authors: ["Wedson Almeida Filho", "Alice Ryhl"], + description: "Android Binder", + license: "GPL", +} + +fn next_debug_id() -> usize { + static NEXT_DEBUG_ID: AtomicUsize = AtomicUsize::new(0); + + NEXT_DEBUG_ID.fetch_add(1, Ordering::Relaxed) +} + +/// Provides a single place to write Binder return values via the +/// supplied `UserSliceWriter`. +pub(crate) struct BinderReturnWriter<'a> { + writer: UserSliceWriter, + thread: &'a Thread, +} + +impl<'a> BinderReturnWriter<'a> { + fn new(writer: UserSliceWriter, thread: &'a Thread) -> Self { + BinderReturnWriter { writer, thread } + } + + /// Write a return code back to user space. + /// Should be a `BR_` constant from [`defs`] e.g. [`defs::BR_TRANSACTION_COMPLETE`]. + fn write_code(&mut self, code: u32) -> Result { + stats::GLOBAL_STATS.inc_br(code); + self.thread.process.stats.inc_br(code); + self.writer.write(&code) + } + + /// Write something *other than* a return code to user space. + fn write_payload(&mut self, payload: &T) -> Result { + self.writer.write(payload) + } + + fn len(&self) -> usize { + self.writer.len() + } +} + +/// Specifies how a type should be delivered to the read part of a BINDER_WRITE_READ ioctl. +/// +/// When a value is pushed to the todo list for a process or thread, it is stored as a trait object +/// with the type `Arc`. Trait objects are a Rust feature that lets you +/// implement dynamic dispatch over many different types. This lets us store many different types +/// in the todo list. +trait DeliverToRead: ListArcSafe + Send + Sync { + /// Performs work. Returns true if remaining work items in the queue should be processed + /// immediately, or false if it should return to caller before processing additional work + /// items. + fn do_work( + self: DArc, + thread: &Thread, + writer: &mut BinderReturnWriter<'_>, + ) -> Result; + + /// Cancels the given work item. 
This is called instead of [`DeliverToRead::do_work`] when work + /// won't be delivered. + fn cancel(self: DArc); + + /// Should we use `wake_up_interruptible_sync` or `wake_up_interruptible` when scheduling this + /// work item? + /// + /// Generally only set to true for non-oneway transactions. + fn should_sync_wakeup(&self) -> bool; + + fn debug_print(&self, m: &SeqFile, prefix: &str, transaction_prefix: &str) -> Result<()>; +} + +// Wrapper around a `DeliverToRead` with linked list links. +#[pin_data] +struct DTRWrap { + #[pin] + links: ListLinksSelfPtr>, + #[pin] + wrapped: T, +} +kernel::list::impl_list_arc_safe! { + impl{T: ListArcSafe + ?Sized} ListArcSafe<0> for DTRWrap { + tracked_by wrapped: T; + } +} +kernel::list::impl_list_item! { + impl ListItem<0> for DTRWrap { + using ListLinksSelfPtr { self.links }; + } +} + +impl core::ops::Deref for DTRWrap { + type Target = T; + fn deref(&self) -> &T { + &self.wrapped + } +} + +type DArc = kernel::sync::Arc>; +type DLArc = kernel::list::ListArc>; + +impl DTRWrap { + fn new(val: impl PinInit) -> impl PinInit { + pin_init!(Self { + links <- ListLinksSelfPtr::new(), + wrapped <- val, + }) + } + + fn arc_try_new(val: T) -> Result, kernel::alloc::AllocError> { + ListArc::pin_init( + try_pin_init!(Self { + links <- ListLinksSelfPtr::new(), + wrapped: val, + }), + GFP_KERNEL, + ) + .map_err(|_| kernel::alloc::AllocError) + } + + fn arc_pin_init(init: impl PinInit) -> Result, kernel::error::Error> { + ListArc::pin_init( + try_pin_init!(Self { + links <- ListLinksSelfPtr::new(), + wrapped <- init, + }), + GFP_KERNEL, + ) + } +} + +struct DeliverCode { + code: u32, + skip: AtomicBool, +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for DeliverCode { untracked; } +} + +impl DeliverCode { + fn new(code: u32) -> Self { + Self { + code, + skip: AtomicBool::new(false), + } + } + + /// Disable this DeliverCode and make it do nothing. + /// + /// This is used instead of removing it from the work list, since `LinkedList::remove` is + /// unsafe, whereas this method is not. + fn skip(&self) { + self.skip.store(true, Ordering::Relaxed); + } +} + +impl DeliverToRead for DeliverCode { + fn do_work( + self: DArc, + _thread: &Thread, + writer: &mut BinderReturnWriter<'_>, + ) -> Result { + if !self.skip.load(Ordering::Relaxed) { + writer.write_code(self.code)?; + } + Ok(true) + } + + fn cancel(self: DArc) {} + + fn should_sync_wakeup(&self) -> bool { + false + } + + fn debug_print(&self, m: &SeqFile, prefix: &str, _tprefix: &str) -> Result<()> { + seq_print!(m, "{}", prefix); + if self.skip.load(Ordering::Relaxed) { + seq_print!(m, "(skipped) "); + } + if self.code == defs::BR_TRANSACTION_COMPLETE { + seq_print!(m, "transaction complete\n"); + } else { + seq_print!(m, "transaction error: {}\n", self.code); + } + Ok(()) + } +} + +fn ptr_align(value: usize) -> Option { + let size = core::mem::size_of::() - 1; + Some(value.checked_add(size)? & !size) +} + +// SAFETY: We call register in `init`. +static BINDER_SHRINKER: Shrinker = unsafe { Shrinker::new() }; + +struct BinderModule {} + +impl kernel::Module for BinderModule { + fn init(_module: &'static kernel::ThisModule) -> Result { + // SAFETY: The module initializer never runs twice, so we only call this once. + unsafe { crate::context::CONTEXTS.init() }; + + pr_warn!("Loaded Rust Binder."); + + BINDER_SHRINKER.register(kernel::c_str!("android-binder"))?; + + // SAFETY: The module is being loaded, so we can initialize binderfs. 
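+ // A nonzero return (a negative errno from the C side) is converted by
+ // `kernel::error::to_result` into an `Err`, so a binderfs setup failure
+ // aborts module initialization here.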
+ unsafe { kernel::error::to_result(binderfs::init_rust_binderfs())? }; + + Ok(Self {}) + } +} + +/// Makes the inner type Sync. +#[repr(transparent)] +pub struct AssertSync(T); +// SAFETY: Used only to insert `file_operations` into a global, which is safe. +unsafe impl Sync for AssertSync {} + +/// File operations that rust_binderfs.c can use. +#[no_mangle] +#[used] +pub static rust_binder_fops: AssertSync = { + // SAFETY: All zeroes is safe for the `file_operations` type. + let zeroed_ops = unsafe { core::mem::MaybeUninit::zeroed().assume_init() }; + + let ops = kernel::bindings::file_operations { + owner: THIS_MODULE.as_ptr(), + poll: Some(rust_binder_poll), + unlocked_ioctl: Some(rust_binder_unlocked_ioctl), + compat_ioctl: Some(rust_binder_compat_ioctl), + mmap: Some(rust_binder_mmap), + open: Some(rust_binder_open), + release: Some(rust_binder_release), + flush: Some(rust_binder_flush), + ..zeroed_ops + }; + AssertSync(ops) +}; + +/// # Safety +/// Only called by binderfs. +#[no_mangle] +unsafe extern "C" fn rust_binder_new_context( + name: *const kernel::ffi::c_char, +) -> *mut kernel::ffi::c_void { + // SAFETY: The caller will always provide a valid c string here. + let name = unsafe { kernel::str::CStr::from_char_ptr(name) }; + match Context::new(name) { + Ok(ctx) => Arc::into_foreign(ctx), + Err(_err) => core::ptr::null_mut(), + } +} + +/// # Safety +/// Only called by binderfs. +#[no_mangle] +unsafe extern "C" fn rust_binder_remove_context(device: *mut kernel::ffi::c_void) { + if !device.is_null() { + // SAFETY: The caller ensures that the `device` pointer came from a previous call to + // `rust_binder_new_device`. + let ctx = unsafe { Arc::::from_foreign(device) }; + ctx.deregister(); + drop(ctx); + } +} + +/// # Safety +/// Only called by binderfs. +unsafe extern "C" fn rust_binder_open( + inode: *mut bindings::inode, + file_ptr: *mut bindings::file, +) -> kernel::ffi::c_int { + // SAFETY: The `rust_binderfs.c` file ensures that `i_private` is set to a + // `struct binder_device`. + let device = unsafe { (*inode).i_private } as *const binderfs::binder_device; + + assert!(!device.is_null()); + + // SAFETY: The `rust_binderfs.c` file ensures that `device->ctx` holds a binder context when + // using the rust binder fops. + let ctx = unsafe { Arc::::borrow((*device).ctx) }; + + // SAFETY: The caller provides a valid file pointer to a new `struct file`. + let file = unsafe { File::from_raw_file(file_ptr) }; + let process = match Process::open(ctx, file) { + Ok(process) => process, + Err(err) => return err.to_errno(), + }; + + // SAFETY: This is an `inode` for a newly created binder file. + match unsafe { BinderfsProcFile::new(inode, process.task.pid()) } { + Ok(Some(file)) => process.inner.lock().binderfs_file = Some(file), + Ok(None) => { /* pid already exists */ } + Err(err) => return err.to_errno(), + } + + // SAFETY: This file is associated with Rust binder, so we own the `private_data` field. + unsafe { (*file_ptr).private_data = process.into_foreign() }; + 0 +} + +/// # Safety +/// Only called by binderfs. +unsafe extern "C" fn rust_binder_release( + _inode: *mut bindings::inode, + file: *mut bindings::file, +) -> kernel::ffi::c_int { + // SAFETY: We previously set `private_data` in `rust_binder_open`. + let process = unsafe { Arc::::from_foreign((*file).private_data) }; + // SAFETY: The caller ensures that the file is valid. + let file = unsafe { File::from_raw_file(file) }; + Process::release(process, file); + 0 +} + +/// # Safety +/// Only called by binderfs. 
+unsafe extern "C" fn rust_binder_compat_ioctl( + file: *mut bindings::file, + cmd: kernel::ffi::c_uint, + arg: kernel::ffi::c_ulong, +) -> kernel::ffi::c_long { + // SAFETY: We previously set `private_data` in `rust_binder_open`. + let f = unsafe { Arc::::borrow((*file).private_data) }; + // SAFETY: The caller ensures that the file is valid. + match Process::compat_ioctl(f, unsafe { File::from_raw_file(file) }, cmd as _, arg as _) { + Ok(()) => 0, + Err(err) => err.to_errno() as isize, + } +} + +/// # Safety +/// Only called by binderfs. +unsafe extern "C" fn rust_binder_unlocked_ioctl( + file: *mut bindings::file, + cmd: kernel::ffi::c_uint, + arg: kernel::ffi::c_ulong, +) -> kernel::ffi::c_long { + // SAFETY: We previously set `private_data` in `rust_binder_open`. + let f = unsafe { Arc::::borrow((*file).private_data) }; + // SAFETY: The caller ensures that the file is valid. + match Process::ioctl(f, unsafe { File::from_raw_file(file) }, cmd as _, arg as _) { + Ok(()) => 0, + Err(err) => err.to_errno() as isize, + } +} + +/// # Safety +/// Only called by binderfs. +unsafe extern "C" fn rust_binder_mmap( + file: *mut bindings::file, + vma: *mut bindings::vm_area_struct, +) -> kernel::ffi::c_int { + // SAFETY: We previously set `private_data` in `rust_binder_open`. + let f = unsafe { Arc::::borrow((*file).private_data) }; + // SAFETY: The caller ensures that the vma is valid. + let area = unsafe { kernel::mm::virt::VmaNew::from_raw(vma) }; + // SAFETY: The caller ensures that the file is valid. + match Process::mmap(f, unsafe { File::from_raw_file(file) }, area) { + Ok(()) => 0, + Err(err) => err.to_errno(), + } +} + +/// # Safety +/// Only called by binderfs. +unsafe extern "C" fn rust_binder_poll( + file: *mut bindings::file, + wait: *mut bindings::poll_table_struct, +) -> bindings::__poll_t { + // SAFETY: We previously set `private_data` in `rust_binder_open`. + let f = unsafe { Arc::::borrow((*file).private_data) }; + // SAFETY: The caller ensures that the file is valid. + let fileref = unsafe { File::from_raw_file(file) }; + // SAFETY: The caller ensures that the `PollTable` is valid. + match Process::poll(f, fileref, unsafe { PollTable::from_raw(wait) }) { + Ok(v) => v, + Err(_) => bindings::POLLERR, + } +} + +/// # Safety +/// Only called by binderfs. +unsafe extern "C" fn rust_binder_flush( + file: *mut bindings::file, + _id: bindings::fl_owner_t, +) -> kernel::ffi::c_int { + // SAFETY: We previously set `private_data` in `rust_binder_open`. + let f = unsafe { Arc::::borrow((*file).private_data) }; + match Process::flush(f) { + Ok(()) => 0, + Err(err) => err.to_errno(), + } +} + +/// # Safety +/// Only called by binderfs. +#[no_mangle] +unsafe extern "C" fn rust_binder_stats_show( + ptr: *mut seq_file, + _: *mut kernel::ffi::c_void, +) -> kernel::ffi::c_int { + // SAFETY: The caller ensures that the pointer is valid and exclusive for the duration in which + // this method is called. + let m = unsafe { SeqFile::from_raw(ptr) }; + if let Err(err) = rust_binder_stats_show_impl(m) { + seq_print!(m, "failed to generate state: {:?}\n", err); + } + 0 +} + +/// # Safety +/// Only called by binderfs. +#[no_mangle] +unsafe extern "C" fn rust_binder_state_show( + ptr: *mut seq_file, + _: *mut kernel::ffi::c_void, +) -> kernel::ffi::c_int { + // SAFETY: The caller ensures that the pointer is valid and exclusive for the duration in which + // this method is called. 
+
+/// # Safety
+/// Only called by binderfs.
+#[no_mangle]
+unsafe extern "C" fn rust_binder_state_show(
+    ptr: *mut seq_file,
+    _: *mut kernel::ffi::c_void,
+) -> kernel::ffi::c_int {
+    // SAFETY: The caller ensures that the pointer is valid and exclusive for the duration in which
+    // this method is called.
+    let m = unsafe { SeqFile::from_raw(ptr) };
+    if let Err(err) = rust_binder_state_show_impl(m) {
+        seq_print!(m, "failed to generate state: {:?}\n", err);
+    }
+    0
+}
+
+/// # Safety
+/// Only called by binderfs.
+#[no_mangle]
+unsafe extern "C" fn rust_binder_proc_show(
+    ptr: *mut seq_file,
+    _: *mut kernel::ffi::c_void,
+) -> kernel::ffi::c_int {
+    // SAFETY: Accessing the private field of `seq_file` is okay.
+    let pid = (unsafe { (*ptr).private }) as usize as Pid;
+    // SAFETY: The caller ensures that the pointer is valid and exclusive for the duration in which
+    // this method is called.
+    let m = unsafe { SeqFile::from_raw(ptr) };
+    if let Err(err) = rust_binder_proc_show_impl(m, pid) {
+        seq_print!(m, "failed to generate state: {:?}\n", err);
+    }
+    0
+}
+
+/// # Safety
+/// Only called by binderfs.
+#[no_mangle]
+unsafe extern "C" fn rust_binder_transactions_show(
+    ptr: *mut seq_file,
+    _: *mut kernel::ffi::c_void,
+) -> kernel::ffi::c_int {
+    // SAFETY: The caller ensures that the pointer is valid and exclusive for the duration in which
+    // this method is called.
+    let m = unsafe { SeqFile::from_raw(ptr) };
+    if let Err(err) = rust_binder_transactions_show_impl(m) {
+        seq_print!(m, "failed to generate state: {:?}\n", err);
+    }
+    0
+}
+
+fn rust_binder_transactions_show_impl(m: &SeqFile) -> Result<()> {
+    seq_print!(m, "binder transactions:\n");
+    let contexts = context::get_all_contexts()?;
+    for ctx in contexts {
+        let procs = ctx.get_all_procs()?;
+        for proc in procs {
+            proc.debug_print(m, &ctx, false)?;
+            seq_print!(m, "\n");
+        }
+    }
+    Ok(())
+}
+
+fn rust_binder_stats_show_impl(m: &SeqFile) -> Result<()> {
+    seq_print!(m, "binder stats:\n");
+    stats::GLOBAL_STATS.debug_print("", m);
+    let contexts = context::get_all_contexts()?;
+    for ctx in contexts {
+        let procs = ctx.get_all_procs()?;
+        for proc in procs {
+            proc.debug_print_stats(m, &ctx)?;
+            seq_print!(m, "\n");
+        }
+    }
+    Ok(())
+}
+
+fn rust_binder_state_show_impl(m: &SeqFile) -> Result<()> {
+    seq_print!(m, "binder state:\n");
+    let contexts = context::get_all_contexts()?;
+    for ctx in contexts {
+        let procs = ctx.get_all_procs()?;
+        for proc in procs {
+            proc.debug_print(m, &ctx, true)?;
+            seq_print!(m, "\n");
+        }
+    }
+    Ok(())
+}
+
+fn rust_binder_proc_show_impl(m: &SeqFile, pid: Pid) -> Result<()> {
+    seq_print!(m, "binder proc state:\n");
+    let contexts = context::get_all_contexts()?;
+    for ctx in contexts {
+        let procs = ctx.get_procs_with_pid(pid)?;
+        for proc in procs {
+            proc.debug_print(m, &ctx, true)?;
+            seq_print!(m, "\n");
+        }
+    }
+    Ok(())
+}
+
+struct BinderfsProcFile(NonNull<bindings::dentry>);
+
+// SAFETY: Safe to drop from any thread.
+unsafe impl Send for BinderfsProcFile {}
+
+impl BinderfsProcFile {
+    /// # Safety
+    ///
+    /// Takes an inode from a newly created binder file.
+    unsafe fn new(nodp: *mut bindings::inode, pid: i32) -> Result<Option<Self>> {
+        // SAFETY: The caller passes an `inode` for a newly created binder file.
+        let dentry = unsafe { binderfs::rust_binderfs_create_proc_file(nodp, pid) };
+        match kernel::error::from_err_ptr(dentry) {
+            Ok(dentry) => Ok(NonNull::new(dentry).map(Self)),
+            Err(err) if err == EEXIST => Ok(None),
+            Err(err) => Err(err),
+        }
+    }
+}
+
+impl Drop for BinderfsProcFile {
+    fn drop(&mut self) {
+        // SAFETY: This is a dentry from `rust_binderfs_create_proc_file` that has not been deleted yet.
+ unsafe { binderfs::rust_binderfs_remove_file(self.0.as_ptr()) }; + } +} diff --git a/drivers/android/binder/rust_binderfs.c b/drivers/android/binder/rust_binderfs.c new file mode 100644 index 000000000000..6b497146b698 --- /dev/null +++ b/drivers/android/binder/rust_binderfs.c @@ -0,0 +1,850 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rust_binder.h" +#include "rust_binder_internal.h" + +#define FIRST_INODE 1 +#define SECOND_INODE 2 +#define INODE_OFFSET 3 +#define BINDERFS_MAX_MINOR (1U << MINORBITS) +/* Ensure that the initial ipc namespace always has devices available. */ +#define BINDERFS_MAX_MINOR_CAPPED (BINDERFS_MAX_MINOR - 4) + +DEFINE_SHOW_ATTRIBUTE(rust_binder_stats); +DEFINE_SHOW_ATTRIBUTE(rust_binder_state); +DEFINE_SHOW_ATTRIBUTE(rust_binder_transactions); +DEFINE_SHOW_ATTRIBUTE(rust_binder_proc); + +char *rust_binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES; +module_param_named(rust_devices, rust_binder_devices_param, charp, 0444); + +static dev_t binderfs_dev; +static DEFINE_MUTEX(binderfs_minors_mutex); +static DEFINE_IDA(binderfs_minors); + +enum binderfs_param { + Opt_max, + Opt_stats_mode, +}; + +enum binderfs_stats_mode { + binderfs_stats_mode_unset, + binderfs_stats_mode_global, +}; + +struct binder_features { + bool oneway_spam_detection; + bool extended_error; + bool freeze_notification; +}; + +static const struct constant_table binderfs_param_stats[] = { + { "global", binderfs_stats_mode_global }, + {} +}; + +static const struct fs_parameter_spec binderfs_fs_parameters[] = { + fsparam_u32("max", Opt_max), + fsparam_enum("stats", Opt_stats_mode, binderfs_param_stats), + {} +}; + +static struct binder_features binder_features = { + .oneway_spam_detection = true, + .extended_error = true, + .freeze_notification = true, +}; + +static inline struct binderfs_info *BINDERFS_SB(const struct super_block *sb) +{ + return sb->s_fs_info; +} + +/** + * binderfs_binder_device_create - allocate inode from super block of a + * binderfs mount + * @ref_inode: inode from wich the super block will be taken + * @userp: buffer to copy information about new device for userspace to + * @req: struct binderfs_device as copied from userspace + * + * This function allocates a new binder_device and reserves a new minor + * number for it. + * Minor numbers are limited and tracked globally in binderfs_minors. The + * function will stash a struct binder_device for the specific binder + * device in i_private of the inode. + * It will go on to allocate a new inode from the super block of the + * filesystem mount, stash a struct binder_device in its i_private field + * and attach a dentry to that inode. 
+ * + * Return: 0 on success, negative errno on failure + */ +static int binderfs_binder_device_create(struct inode *ref_inode, + struct binderfs_device __user *userp, + struct binderfs_device *req) +{ + int minor, ret; + struct dentry *dentry, *root; + struct binder_device *device = NULL; + rust_binder_context ctx = NULL; + struct inode *inode = NULL; + struct super_block *sb = ref_inode->i_sb; + struct binderfs_info *info = sb->s_fs_info; +#if defined(CONFIG_IPC_NS) + bool use_reserve = (info->ipc_ns == &init_ipc_ns); +#else + bool use_reserve = true; +#endif + + /* Reserve new minor number for the new device. */ + mutex_lock(&binderfs_minors_mutex); + if (++info->device_count <= info->mount_opts.max) + minor = ida_alloc_max(&binderfs_minors, + use_reserve ? BINDERFS_MAX_MINOR : + BINDERFS_MAX_MINOR_CAPPED, + GFP_KERNEL); + else + minor = -ENOSPC; + if (minor < 0) { + --info->device_count; + mutex_unlock(&binderfs_minors_mutex); + return minor; + } + mutex_unlock(&binderfs_minors_mutex); + + ret = -ENOMEM; + device = kzalloc(sizeof(*device), GFP_KERNEL); + if (!device) + goto err; + + req->name[BINDERFS_MAX_NAME] = '\0'; /* NUL-terminate */ + + ctx = rust_binder_new_context(req->name); + if (!ctx) + goto err; + + inode = new_inode(sb); + if (!inode) + goto err; + + inode->i_ino = minor + INODE_OFFSET; + simple_inode_init_ts(inode); + init_special_inode(inode, S_IFCHR | 0600, + MKDEV(MAJOR(binderfs_dev), minor)); + inode->i_fop = &rust_binder_fops; + inode->i_uid = info->root_uid; + inode->i_gid = info->root_gid; + + req->major = MAJOR(binderfs_dev); + req->minor = minor; + device->ctx = ctx; + device->minor = minor; + + if (userp && copy_to_user(userp, req, sizeof(*req))) { + ret = -EFAULT; + goto err; + } + + root = sb->s_root; + inode_lock(d_inode(root)); + + /* look it up */ + dentry = lookup_noperm(&QSTR(req->name), root); + if (IS_ERR(dentry)) { + inode_unlock(d_inode(root)); + ret = PTR_ERR(dentry); + goto err; + } + + if (d_really_is_positive(dentry)) { + /* already exists */ + dput(dentry); + inode_unlock(d_inode(root)); + ret = -EEXIST; + goto err; + } + + inode->i_private = device; + d_instantiate(dentry, inode); + fsnotify_create(root->d_inode, dentry); + inode_unlock(d_inode(root)); + + return 0; + +err: + kfree(device); + rust_binder_remove_context(ctx); + mutex_lock(&binderfs_minors_mutex); + --info->device_count; + ida_free(&binderfs_minors, minor); + mutex_unlock(&binderfs_minors_mutex); + iput(inode); + + return ret; +} + +/** + * binder_ctl_ioctl - handle binder device node allocation requests + * + * The request handler for the binder-control device. All requests operate on + * the binderfs mount the binder-control device resides in: + * - BINDER_CTL_ADD + * Allocate a new binder device. + * + * Return: %0 on success, negative errno on failure. 
+ */ +static long binder_ctl_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + int ret = -EINVAL; + struct inode *inode = file_inode(file); + struct binderfs_device __user *device = (struct binderfs_device __user *)arg; + struct binderfs_device device_req; + + switch (cmd) { + case BINDER_CTL_ADD: + ret = copy_from_user(&device_req, device, sizeof(device_req)); + if (ret) { + ret = -EFAULT; + break; + } + + ret = binderfs_binder_device_create(inode, device, &device_req); + break; + default: + break; + } + + return ret; +} + +static void binderfs_evict_inode(struct inode *inode) +{ + struct binder_device *device = inode->i_private; + struct binderfs_info *info = BINDERFS_SB(inode->i_sb); + + clear_inode(inode); + + if (!S_ISCHR(inode->i_mode) || !device) + return; + + mutex_lock(&binderfs_minors_mutex); + --info->device_count; + ida_free(&binderfs_minors, device->minor); + mutex_unlock(&binderfs_minors_mutex); + + /* ctx is null for binder-control, but this function ignores null pointers */ + rust_binder_remove_context(device->ctx); + + kfree(device); +} + +static int binderfs_fs_context_parse_param(struct fs_context *fc, + struct fs_parameter *param) +{ + int opt; + struct binderfs_mount_opts *ctx = fc->fs_private; + struct fs_parse_result result; + + opt = fs_parse(fc, binderfs_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_max: + if (result.uint_32 > BINDERFS_MAX_MINOR) + return invalfc(fc, "Bad value for '%s'", param->key); + + ctx->max = result.uint_32; + break; + case Opt_stats_mode: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ctx->stats_mode = result.uint_32; + break; + default: + return invalfc(fc, "Unsupported parameter '%s'", param->key); + } + + return 0; +} + +static int binderfs_fs_context_reconfigure(struct fs_context *fc) +{ + struct binderfs_mount_opts *ctx = fc->fs_private; + struct binderfs_info *info = BINDERFS_SB(fc->root->d_sb); + + if (info->mount_opts.stats_mode != ctx->stats_mode) + return invalfc(fc, "Binderfs stats mode cannot be changed during a remount"); + + info->mount_opts.stats_mode = ctx->stats_mode; + info->mount_opts.max = ctx->max; + return 0; +} + +static int binderfs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct binderfs_info *info = BINDERFS_SB(root->d_sb); + + if (info->mount_opts.max <= BINDERFS_MAX_MINOR) + seq_printf(seq, ",max=%d", info->mount_opts.max); + + switch (info->mount_opts.stats_mode) { + case binderfs_stats_mode_unset: + break; + case binderfs_stats_mode_global: + seq_puts(seq, ",stats=global"); + break; + } + + return 0; +} + +static const struct super_operations binderfs_super_ops = { + .evict_inode = binderfs_evict_inode, + .show_options = binderfs_show_options, + .statfs = simple_statfs, +}; + +static inline bool is_binderfs_control_device(const struct dentry *dentry) +{ + struct binderfs_info *info = dentry->d_sb->s_fs_info; + + return info->control_dentry == dentry; +} + +static int binderfs_rename(struct mnt_idmap *idmap, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + if (is_binderfs_control_device(old_dentry) || + is_binderfs_control_device(new_dentry)) + return -EPERM; + + return simple_rename(idmap, old_dir, old_dentry, new_dir, + new_dentry, flags); +} + +static int binderfs_unlink(struct inode *dir, struct dentry *dentry) +{ + if (is_binderfs_control_device(dentry)) + return -EPERM; + + return simple_unlink(dir, dentry); +} + +static const struct 
file_operations binder_ctl_fops = { + .owner = THIS_MODULE, + .open = nonseekable_open, + .unlocked_ioctl = binder_ctl_ioctl, + .compat_ioctl = binder_ctl_ioctl, + .llseek = noop_llseek, +}; + +/** + * binderfs_binder_ctl_create - create a new binder-control device + * @sb: super block of the binderfs mount + * + * This function creates a new binder-control device node in the binderfs mount + * referred to by @sb. + * + * Return: 0 on success, negative errno on failure + */ +static int binderfs_binder_ctl_create(struct super_block *sb) +{ + int minor, ret; + struct dentry *dentry; + struct binder_device *device; + struct inode *inode = NULL; + struct dentry *root = sb->s_root; + struct binderfs_info *info = sb->s_fs_info; +#if defined(CONFIG_IPC_NS) + bool use_reserve = (info->ipc_ns == &init_ipc_ns); +#else + bool use_reserve = true; +#endif + + device = kzalloc(sizeof(*device), GFP_KERNEL); + if (!device) + return -ENOMEM; + + /* If we have already created a binder-control node, return. */ + if (info->control_dentry) { + ret = 0; + goto out; + } + + ret = -ENOMEM; + inode = new_inode(sb); + if (!inode) + goto out; + + /* Reserve a new minor number for the new device. */ + mutex_lock(&binderfs_minors_mutex); + minor = ida_alloc_max(&binderfs_minors, + use_reserve ? BINDERFS_MAX_MINOR : + BINDERFS_MAX_MINOR_CAPPED, + GFP_KERNEL); + mutex_unlock(&binderfs_minors_mutex); + if (minor < 0) { + ret = minor; + goto out; + } + + inode->i_ino = SECOND_INODE; + simple_inode_init_ts(inode); + init_special_inode(inode, S_IFCHR | 0600, + MKDEV(MAJOR(binderfs_dev), minor)); + inode->i_fop = &binder_ctl_fops; + inode->i_uid = info->root_uid; + inode->i_gid = info->root_gid; + + device->minor = minor; + device->ctx = NULL; + + dentry = d_alloc_name(root, "binder-control"); + if (!dentry) + goto out; + + inode->i_private = device; + info->control_dentry = dentry; + d_add(dentry, inode); + + return 0; + +out: + kfree(device); + iput(inode); + + return ret; +} + +static const struct inode_operations binderfs_dir_inode_operations = { + .lookup = simple_lookup, + .rename = binderfs_rename, + .unlink = binderfs_unlink, +}; + +static struct inode *binderfs_make_inode(struct super_block *sb, int mode) +{ + struct inode *ret; + + ret = new_inode(sb); + if (ret) { + ret->i_ino = iunique(sb, BINDERFS_MAX_MINOR + INODE_OFFSET); + ret->i_mode = mode; + simple_inode_init_ts(ret); + } + return ret; +} + +static struct dentry *binderfs_create_dentry(struct dentry *parent, + const char *name) +{ + struct dentry *dentry; + + dentry = lookup_noperm(&QSTR(name), parent); + if (IS_ERR(dentry)) + return dentry; + + /* Return error if the file/dir already exists. 
*/ + if (d_really_is_positive(dentry)) { + dput(dentry); + return ERR_PTR(-EEXIST); + } + + return dentry; +} + +void rust_binderfs_remove_file(struct dentry *dentry) +{ + struct inode *parent_inode; + + parent_inode = d_inode(dentry->d_parent); + inode_lock(parent_inode); + if (simple_positive(dentry)) { + dget(dentry); + simple_unlink(parent_inode, dentry); + d_delete(dentry); + dput(dentry); + } + inode_unlock(parent_inode); +} + +static struct dentry *rust_binderfs_create_file(struct dentry *parent, const char *name, + const struct file_operations *fops, + void *data) +{ + struct dentry *dentry; + struct inode *new_inode, *parent_inode; + struct super_block *sb; + + parent_inode = d_inode(parent); + inode_lock(parent_inode); + + dentry = binderfs_create_dentry(parent, name); + if (IS_ERR(dentry)) + goto out; + + sb = parent_inode->i_sb; + new_inode = binderfs_make_inode(sb, S_IFREG | 0444); + if (!new_inode) { + dput(dentry); + dentry = ERR_PTR(-ENOMEM); + goto out; + } + + new_inode->i_fop = fops; + new_inode->i_private = data; + d_instantiate(dentry, new_inode); + fsnotify_create(parent_inode, dentry); + +out: + inode_unlock(parent_inode); + return dentry; +} + +struct dentry *rust_binderfs_create_proc_file(struct inode *nodp, int pid) +{ + struct binderfs_info *info = nodp->i_sb->s_fs_info; + struct dentry *dir = info->proc_log_dir; + char strbuf[20 + 1]; + void *data = (void *)(unsigned long) pid; + + if (!dir) + return NULL; + + snprintf(strbuf, sizeof(strbuf), "%u", pid); + return rust_binderfs_create_file(dir, strbuf, &rust_binder_proc_fops, data); +} + +static struct dentry *binderfs_create_dir(struct dentry *parent, + const char *name) +{ + struct dentry *dentry; + struct inode *new_inode, *parent_inode; + struct super_block *sb; + + parent_inode = d_inode(parent); + inode_lock(parent_inode); + + dentry = binderfs_create_dentry(parent, name); + if (IS_ERR(dentry)) + goto out; + + sb = parent_inode->i_sb; + new_inode = binderfs_make_inode(sb, S_IFDIR | 0755); + if (!new_inode) { + dput(dentry); + dentry = ERR_PTR(-ENOMEM); + goto out; + } + + new_inode->i_fop = &simple_dir_operations; + new_inode->i_op = &simple_dir_inode_operations; + + set_nlink(new_inode, 2); + d_instantiate(dentry, new_inode); + inc_nlink(parent_inode); + fsnotify_mkdir(parent_inode, dentry); + +out: + inode_unlock(parent_inode); + return dentry; +} + +static int binder_features_show(struct seq_file *m, void *unused) +{ + bool *feature = m->private; + + seq_printf(m, "%d\n", *feature); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(binder_features); + +static int init_binder_features(struct super_block *sb) +{ + struct dentry *dentry, *dir; + + dir = binderfs_create_dir(sb->s_root, "features"); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + dentry = rust_binderfs_create_file(dir, "oneway_spam_detection", + &binder_features_fops, + &binder_features.oneway_spam_detection); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + dentry = rust_binderfs_create_file(dir, "extended_error", + &binder_features_fops, + &binder_features.extended_error); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + dentry = rust_binderfs_create_file(dir, "freeze_notification", + &binder_features_fops, + &binder_features.freeze_notification); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + return 0; +} + +static int init_binder_logs(struct super_block *sb) +{ + struct dentry *binder_logs_root_dir, *dentry, *proc_log_dir; + struct binderfs_info *info; + int ret = 0; + + binder_logs_root_dir = binderfs_create_dir(sb->s_root, + 
"binder_logs"); + if (IS_ERR(binder_logs_root_dir)) { + ret = PTR_ERR(binder_logs_root_dir); + goto out; + } + + dentry = rust_binderfs_create_file(binder_logs_root_dir, "stats", + &rust_binder_stats_fops, NULL); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + goto out; + } + + dentry = rust_binderfs_create_file(binder_logs_root_dir, "state", + &rust_binder_state_fops, NULL); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + goto out; + } + + dentry = rust_binderfs_create_file(binder_logs_root_dir, "transactions", + &rust_binder_transactions_fops, NULL); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + goto out; + } + + proc_log_dir = binderfs_create_dir(binder_logs_root_dir, "proc"); + if (IS_ERR(proc_log_dir)) { + ret = PTR_ERR(proc_log_dir); + goto out; + } + info = sb->s_fs_info; + info->proc_log_dir = proc_log_dir; + +out: + return ret; +} + +static int binderfs_fill_super(struct super_block *sb, struct fs_context *fc) +{ + int ret; + struct binderfs_info *info; + struct binderfs_mount_opts *ctx = fc->fs_private; + struct inode *inode = NULL; + struct binderfs_device device_info = {}; + const char *name; + size_t len; + + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + + /* + * The binderfs filesystem can be mounted by userns root in a + * non-initial userns. By default such mounts have the SB_I_NODEV flag + * set in s_iflags to prevent security issues where userns root can + * just create random device nodes via mknod() since it owns the + * filesystem mount. But binderfs does not allow to create any files + * including devices nodes. The only way to create binder devices nodes + * is through the binder-control device which userns root is explicitly + * allowed to do. So removing the SB_I_NODEV flag from s_iflags is both + * necessary and safe. 
+ */ + sb->s_iflags &= ~SB_I_NODEV; + sb->s_iflags |= SB_I_NOEXEC; + sb->s_magic = RUST_BINDERFS_SUPER_MAGIC; + sb->s_op = &binderfs_super_ops; + sb->s_time_gran = 1; + + sb->s_fs_info = kzalloc(sizeof(struct binderfs_info), GFP_KERNEL); + if (!sb->s_fs_info) + return -ENOMEM; + info = sb->s_fs_info; + + info->ipc_ns = get_ipc_ns(current->nsproxy->ipc_ns); + + info->root_gid = make_kgid(sb->s_user_ns, 0); + if (!gid_valid(info->root_gid)) + info->root_gid = GLOBAL_ROOT_GID; + info->root_uid = make_kuid(sb->s_user_ns, 0); + if (!uid_valid(info->root_uid)) + info->root_uid = GLOBAL_ROOT_UID; + info->mount_opts.max = ctx->max; + info->mount_opts.stats_mode = ctx->stats_mode; + + inode = new_inode(sb); + if (!inode) + return -ENOMEM; + + inode->i_ino = FIRST_INODE; + inode->i_fop = &simple_dir_operations; + inode->i_mode = S_IFDIR | 0755; + simple_inode_init_ts(inode); + inode->i_op = &binderfs_dir_inode_operations; + set_nlink(inode, 2); + + sb->s_root = d_make_root(inode); + if (!sb->s_root) + return -ENOMEM; + + ret = binderfs_binder_ctl_create(sb); + if (ret) + return ret; + + name = rust_binder_devices_param; + for (len = strcspn(name, ","); len > 0; len = strcspn(name, ",")) { + strscpy(device_info.name, name, len + 1); + ret = binderfs_binder_device_create(inode, NULL, &device_info); + if (ret) + return ret; + name += len; + if (*name == ',') + name++; + } + + ret = init_binder_features(sb); + if (ret) + return ret; + + if (info->mount_opts.stats_mode == binderfs_stats_mode_global) + return init_binder_logs(sb); + + return 0; +} + +static int binderfs_fs_context_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, binderfs_fill_super); +} + +static void binderfs_fs_context_free(struct fs_context *fc) +{ + struct binderfs_mount_opts *ctx = fc->fs_private; + + kfree(ctx); +} + +static const struct fs_context_operations binderfs_fs_context_ops = { + .free = binderfs_fs_context_free, + .get_tree = binderfs_fs_context_get_tree, + .parse_param = binderfs_fs_context_parse_param, + .reconfigure = binderfs_fs_context_reconfigure, +}; + +static int binderfs_init_fs_context(struct fs_context *fc) +{ + struct binderfs_mount_opts *ctx; + + ctx = kzalloc(sizeof(struct binderfs_mount_opts), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->max = BINDERFS_MAX_MINOR; + ctx->stats_mode = binderfs_stats_mode_unset; + + fc->fs_private = ctx; + fc->ops = &binderfs_fs_context_ops; + + return 0; +} + +static void binderfs_kill_super(struct super_block *sb) +{ + struct binderfs_info *info = sb->s_fs_info; + + /* + * During inode eviction struct binderfs_info is needed. + * So first wipe the super_block then free struct binderfs_info. + */ + kill_litter_super(sb); + + if (info && info->ipc_ns) + put_ipc_ns(info->ipc_ns); + + kfree(info); +} + +static struct file_system_type binder_fs_type = { + .name = "binder", + .init_fs_context = binderfs_init_fs_context, + .parameters = binderfs_fs_parameters, + .kill_sb = binderfs_kill_super, + .fs_flags = FS_USERNS_MOUNT, +}; + +int init_rust_binderfs(void) +{ + int ret; + const char *name; + size_t len; + + /* Verify that the default binderfs device names are valid. */ + name = rust_binder_devices_param; + for (len = strcspn(name, ","); len > 0; len = strcspn(name, ",")) { + if (len > BINDERFS_MAX_NAME) + return -E2BIG; + name += len; + if (*name == ',') + name++; + } + + /* Allocate new major number for binderfs. 
+	 */
+	ret = alloc_chrdev_region(&binderfs_dev, 0, BINDERFS_MAX_MINOR,
+				  "rust_binder");
+	if (ret)
+		return ret;
+
+	ret = register_filesystem(&binder_fs_type);
+	if (ret) {
+		unregister_chrdev_region(binderfs_dev, BINDERFS_MAX_MINOR);
+		return ret;
+	}
+
+	return ret;
+}
diff --git a/drivers/android/binder/stats.rs b/drivers/android/binder/stats.rs
new file mode 100644
index 000000000000..a83ec111d2cb
--- /dev/null
+++ b/drivers/android/binder/stats.rs
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0
+
+// Copyright (C) 2025 Google LLC.
+
+//! Keep track of statistics for binder_logs.
+
+use crate::defs::*;
+use core::sync::atomic::{AtomicU32, Ordering::Relaxed};
+use kernel::{ioctl::_IOC_NR, seq_file::SeqFile, seq_print};
+
+const BC_COUNT: usize = _IOC_NR(BC_REPLY_SG) as usize + 1;
+const BR_COUNT: usize = _IOC_NR(BR_TRANSACTION_PENDING_FROZEN) as usize + 1;
+
+pub(crate) static GLOBAL_STATS: BinderStats = BinderStats::new();
+
+pub(crate) struct BinderStats {
+    bc: [AtomicU32; BC_COUNT],
+    br: [AtomicU32; BR_COUNT],
+}
+
+impl BinderStats {
+    pub(crate) const fn new() -> Self {
+        #[expect(clippy::declare_interior_mutable_const)]
+        const ZERO: AtomicU32 = AtomicU32::new(0);
+
+        Self {
+            bc: [ZERO; BC_COUNT],
+            br: [ZERO; BR_COUNT],
+        }
+    }
+
+    pub(crate) fn inc_bc(&self, bc: u32) {
+        let idx = _IOC_NR(bc) as usize;
+        if let Some(bc_ref) = self.bc.get(idx) {
+            bc_ref.fetch_add(1, Relaxed);
+        }
+    }
+
+    pub(crate) fn inc_br(&self, br: u32) {
+        let idx = _IOC_NR(br) as usize;
+        if let Some(br_ref) = self.br.get(idx) {
+            br_ref.fetch_add(1, Relaxed);
+        }
+    }
+
+    pub(crate) fn debug_print(&self, prefix: &str, m: &SeqFile) {
+        for (i, cnt) in self.bc.iter().enumerate() {
+            let cnt = cnt.load(Relaxed);
+            if cnt > 0 {
+                seq_print!(m, "{}{}: {}\n", prefix, command_string(i), cnt);
+            }
+        }
+        for (i, cnt) in self.br.iter().enumerate() {
+            let cnt = cnt.load(Relaxed);
+            if cnt > 0 {
+                seq_print!(m, "{}{}: {}\n", prefix, return_string(i), cnt);
+            }
+        }
+    }
+}
+
+mod strings {
+    use core::str::from_utf8_unchecked;
+    use kernel::str::CStr;
+
+    extern "C" {
+        static binder_command_strings: [*const u8; super::BC_COUNT];
+        static binder_return_strings: [*const u8; super::BR_COUNT];
+    }
+
+    pub(super) fn command_string(i: usize) -> &'static str {
+        // SAFETY: Accessing `binder_command_strings` is always safe.
+        let c_str_ptr = unsafe { binder_command_strings[i] };
+        // SAFETY: The `binder_command_strings` array only contains nul-terminated strings.
+        let bytes = unsafe { CStr::from_char_ptr(c_str_ptr) }.as_bytes();
+        // SAFETY: The `binder_command_strings` array only contains strings with ascii-chars.
+        unsafe { from_utf8_unchecked(bytes) }
+    }
+
+    pub(super) fn return_string(i: usize) -> &'static str {
+        // SAFETY: Accessing `binder_return_strings` is always safe.
+        let c_str_ptr = unsafe { binder_return_strings[i] };
+        // SAFETY: The `binder_return_strings` array only contains nul-terminated strings.
+        let bytes = unsafe { CStr::from_char_ptr(c_str_ptr) }.as_bytes();
+        // SAFETY: The `binder_return_strings` array only contains strings with ascii-chars.
+        unsafe { from_utf8_unchecked(bytes) }
+    }
+}
+use strings::{command_string, return_string};
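The counters are keyed by ioctl command number: `_IOC_NR` masks off the direction, size, and type bits of a `BC_*`/`BR_*` code, leaving a small index into the fixed-size arrays. A hedged usage sketch (assuming the `BC_TRANSACTION` and `BR_OK` constants from `defs`):

    fn stats_sketch() {
        // Bumps bc[_IOC_NR(BC_TRANSACTION)].
        GLOBAL_STATS.inc_bc(BC_TRANSACTION);
        // Bumps br[_IOC_NR(BR_OK)].
        GLOBAL_STATS.inc_br(BR_OK);
        // Codes whose number is out of range are silently ignored, because
        // `inc_bc`/`inc_br` use `slice::get` instead of indexing.
    }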
diff --git a/drivers/android/binder/thread.rs b/drivers/android/binder/thread.rs
new file mode 100644
index 000000000000..7e34ccd394f8
--- /dev/null
+++ b/drivers/android/binder/thread.rs
@@ -0,0 +1,1596 @@
+// SPDX-License-Identifier: GPL-2.0
+
+// Copyright (C) 2025 Google LLC.
+
+//! This module defines the `Thread` type, which represents a userspace thread that is using
+//! binder.
+//!
+//! The `Process` object stores all of the threads in an rb tree.
+
+use kernel::{
+    bindings,
+    fs::{File, LocalFile},
+    list::{AtomicTracker, List, ListArc, ListLinks, TryNewListArc},
+    prelude::*,
+    security,
+    seq_file::SeqFile,
+    seq_print,
+    sync::poll::{PollCondVar, PollTable},
+    sync::{Arc, SpinLock},
+    task::Task,
+    types::ARef,
+    uaccess::{UserPtr, UserSlice},
+    uapi,
+};
+
+use crate::{
+    allocation::{Allocation, AllocationView, BinderObject, BinderObjectRef, NewAllocation},
+    defs::*,
+    error::BinderResult,
+    process::{GetWorkOrRegister, Process},
+    ptr_align,
+    stats::GLOBAL_STATS,
+    transaction::Transaction,
+    BinderReturnWriter, DArc, DLArc, DTRWrap, DeliverCode, DeliverToRead,
+};
+
+use core::{
+    mem::size_of,
+    sync::atomic::{AtomicU32, Ordering},
+};
+
+/// Stores the layout of the scatter-gather entries. This is used during the `translate_objects`
+/// call and is discarded when it returns.
+struct ScatterGatherState {
+    /// A struct that tracks the amount of unused buffer space.
+    unused_buffer_space: UnusedBufferSpace,
+    /// Scatter-gather entries to copy.
+    sg_entries: KVec<ScatterGatherEntry>,
+    /// Indexes into `sg_entries` corresponding to the last binder_buffer_object that
+    /// was processed and all of its ancestors. The array is in sorted order.
+    ancestors: KVec<usize>,
+}
+
+/// This entry specifies an additional buffer that should be copied using the scatter-gather
+/// mechanism.
+struct ScatterGatherEntry {
+    /// The index in the offset array of the BINDER_TYPE_PTR that this entry originates from.
+    obj_index: usize,
+    /// Offset in target buffer.
+    offset: usize,
+    /// User address in source buffer.
+    sender_uaddr: usize,
+    /// Number of bytes to copy.
+    length: usize,
+    /// The minimum offset of the next fixup in this buffer.
+    fixup_min_offset: usize,
+    /// The offsets within this buffer that contain pointers which should be translated.
+    pointer_fixups: KVec<PointerFixupEntry>,
+}
+
+/// This entry specifies that a fixup should happen at `target_offset` of the
+/// buffer. If `skip` is nonzero, then the fixup is a `binder_fd_array_object`
+/// and is applied later. Otherwise if `skip` is zero, then the size of the
+/// fixup is `size_of::<u64>()` and `pointer_value` is written to the buffer.
+struct PointerFixupEntry {
+    /// The number of bytes to skip, or zero for a `binder_buffer_object` fixup.
+    skip: usize,
+    /// The translated pointer to write when `skip` is zero.
+    pointer_value: u64,
+    /// The offset at which the value should be written. The offset is relative
+    /// to the original buffer.
+    target_offset: usize,
+}
+
+/// Return type of `apply_and_validate_fixup_in_parent`.
+struct ParentFixupInfo {
+    /// The index of the parent buffer in `sg_entries`.
+    parent_sg_index: usize,
+    /// The number of ancestors of the buffer.
+    ///
+    /// The buffer is considered an ancestor of itself, so this is always at
+    /// least one.
+    num_ancestors: usize,
+    /// New value of `fixup_min_offset` if this fixup is applied.
+    new_min_offset: usize,
+    /// The offset of the fixup in the target buffer.
+    target_offset: usize,
+}
+
+impl ScatterGatherState {
+    /// Called when a `binder_buffer_object` or `binder_fd_array_object` tries
+    /// to access a region in its parent buffer. These accesses have various
+    /// restrictions, which this method verifies.
+    ///
+    /// The `parent_offset` and `length` arguments describe the offset and
+    /// length of the access in the parent buffer.
+    ///
+    /// # Detailed restrictions
+    ///
+    /// Obviously the fixup must be in-bounds for the parent buffer.
+    ///
+    /// For safety reasons, we only allow fixups inside a buffer to happen
+    /// at increasing offsets; additionally, we only allow fixup on the last
+    /// buffer object that was verified, or one of its parents.
+    ///
+    /// Example of what is allowed:
+    ///
+    /// A
+    ///   B (parent = A, offset = 0)
+    ///   C (parent = A, offset = 16)
+    ///     D (parent = C, offset = 0)
+    ///   E (parent = A, offset = 32) // min_offset is 16 (C.parent_offset)
+    ///
+    /// Examples of what is not allowed:
+    ///
+    /// Decreasing offsets within the same parent:
+    /// A
+    ///   C (parent = A, offset = 16)
+    ///   B (parent = A, offset = 0) // decreasing offset within A
+    ///
+    /// Referring to a parent that wasn't the last object or any of its parents:
+    /// A
+    ///   B (parent = A, offset = 0)
+    ///   C (parent = A, offset = 0)
+    ///   C (parent = A, offset = 16)
+    ///   D (parent = B, offset = 0) // B is not A or any of A's parents
+    fn validate_parent_fixup(
+        &self,
+        parent: usize,
+        parent_offset: usize,
+        length: usize,
+    ) -> Result<ParentFixupInfo> {
+        // Using `position` would also be correct, but `rposition` avoids
+        // quadratic running times.
+        let ancestors_i = self
+            .ancestors
+            .iter()
+            .copied()
+            .rposition(|sg_idx| self.sg_entries[sg_idx].obj_index == parent)
+            .ok_or(EINVAL)?;
+        let sg_idx = self.ancestors[ancestors_i];
+        let sg_entry = match self.sg_entries.get(sg_idx) {
+            Some(sg_entry) => sg_entry,
+            None => {
+                pr_err!(
+                    "self.ancestors[{}] is {}, but self.sg_entries.len() is {}",
+                    ancestors_i,
+                    sg_idx,
+                    self.sg_entries.len()
+                );
+                return Err(EINVAL);
+            }
+        };
+        if sg_entry.fixup_min_offset > parent_offset {
+            pr_warn!(
+                "validate_parent_fixup: fixup_min_offset={}, parent_offset={}",
+                sg_entry.fixup_min_offset,
+                parent_offset
+            );
+            return Err(EINVAL);
+        }
+        let new_min_offset = parent_offset.checked_add(length).ok_or(EINVAL)?;
+        if new_min_offset > sg_entry.length {
+            pr_warn!(
+                "validate_parent_fixup: new_min_offset={}, sg_entry.length={}",
+                new_min_offset,
+                sg_entry.length
+            );
+            return Err(EINVAL);
+        }
+        let target_offset = sg_entry.offset.checked_add(parent_offset).ok_or(EINVAL)?;
+        // The `ancestors_i + 1` operation can't overflow since the output of the addition is at
+        // most `self.ancestors.len()`, which also fits in a usize.
+        Ok(ParentFixupInfo {
+            parent_sg_index: sg_idx,
+            num_ancestors: ancestors_i + 1,
+            new_min_offset,
+            target_offset,
+        })
+    }
+}
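A hedged, self-contained model of the two checks made by `validate_parent_fixup` (illustrative Rust, not driver code; the real function works on `sg_entries` indices and also computes the target offset):

    struct Entry {
        fixup_min_offset: usize,
        length: usize,
    }

    fn fixup_allowed(
        entries: &mut [Entry],
        ancestors: &[usize],
        parent: usize,
        offset: usize,
        len: usize,
    ) -> bool {
        // The parent must be the last verified object or one of its ancestors.
        if !ancestors.contains(&parent) {
            return false;
        }
        let e = &mut entries[parent];
        // Within one parent, fixups may only move forward and must stay in bounds.
        match offset.checked_add(len) {
            Some(end) if offset >= e.fixup_min_offset && end <= e.length => {
                e.fixup_min_offset = end;
                true
            }
            _ => false,
        }
    }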
+
+/// Keeps track of how much unused buffer space is left. The initial amount is the number of bytes
+/// requested by the user using the `buffers_size` field of `binder_transaction_data_sg`. Each time
+/// we translate an object of type `BINDER_TYPE_PTR`, some of the unused buffer space is consumed.
+struct UnusedBufferSpace {
+    /// The start of the remaining space.
+    offset: usize,
+    /// The end of the remaining space.
+    limit: usize,
+}
+impl UnusedBufferSpace {
+    /// Claim the next `size` bytes from the unused buffer space. The offset for the claimed chunk
+    /// into the buffer is returned.
+    fn claim_next(&mut self, size: usize) -> Result<usize> {
+        // We require every chunk to be aligned.
+        let size = ptr_align(size).ok_or(EINVAL)?;
+        let new_offset = self.offset.checked_add(size).ok_or(EINVAL)?;
+
+        if new_offset <= self.limit {
+            let offset = self.offset;
+            self.offset = new_offset;
+            Ok(offset)
+        } else {
+            Err(EINVAL)
+        }
+    }
+}
+
+pub(crate) enum PushWorkRes {
+    Ok,
+    FailedDead(DLArc<dyn DeliverToRead>),
+}
+
+impl PushWorkRes {
+    fn is_ok(&self) -> bool {
+        match self {
+            PushWorkRes::Ok => true,
+            PushWorkRes::FailedDead(_) => false,
+        }
+    }
+}
+
+/// The fields of `Thread` protected by the spinlock.
+struct InnerThread {
+    /// Determines the looper state of the thread. It is a bit-wise combination of the constants
+    /// prefixed with `LOOPER_`.
+    looper_flags: u32,
+
+    /// Determines whether the looper should return.
+    looper_need_return: bool,
+
+    /// Determines if thread is dead.
+    is_dead: bool,
+
+    /// Work item used to deliver error codes to the thread that started a transaction. Stored here
+    /// so that it can be reused.
+    reply_work: DArc<ThreadError>,
+
+    /// Work item used to deliver error codes to the current thread. Stored here so that it can be
+    /// reused.
+    return_work: DArc<ThreadError>,
+
+    /// Determines whether the work list below should be processed. When set to false, `work_list`
+    /// is treated as if it were empty.
+    process_work_list: bool,
+    /// List of work items to deliver to userspace.
+    work_list: List<DTRWrap<dyn DeliverToRead>>,
+    current_transaction: Option<DArc<Transaction>>,
+
+    /// Extended error information for this thread.
+    extended_error: ExtendedError,
+}
+
+const LOOPER_REGISTERED: u32 = 0x01;
+const LOOPER_ENTERED: u32 = 0x02;
+const LOOPER_EXITED: u32 = 0x04;
+const LOOPER_INVALID: u32 = 0x08;
+const LOOPER_WAITING: u32 = 0x10;
+const LOOPER_WAITING_PROC: u32 = 0x20;
+const LOOPER_POLL: u32 = 0x40;
+
+impl InnerThread {
+    fn new() -> Result<Self> {
+        fn next_err_id() -> u32 {
+            static EE_ID: AtomicU32 = AtomicU32::new(0);
+            EE_ID.fetch_add(1, Ordering::Relaxed)
+        }
+
+        Ok(Self {
+            looper_flags: 0,
+            looper_need_return: false,
+            is_dead: false,
+            process_work_list: false,
+            reply_work: ThreadError::try_new()?,
+            return_work: ThreadError::try_new()?,
+            work_list: List::new(),
+            current_transaction: None,
+            extended_error: ExtendedError::new(next_err_id(), BR_OK, 0),
+        })
+    }
+
+    fn pop_work(&mut self) -> Option<DLArc<dyn DeliverToRead>> {
+        if !self.process_work_list {
+            return None;
+        }
+
+        let ret = self.work_list.pop_front();
+        self.process_work_list = !self.work_list.is_empty();
+        ret
+    }
+
+    fn push_work(&mut self, work: DLArc<dyn DeliverToRead>) -> PushWorkRes {
+        if self.is_dead {
+            PushWorkRes::FailedDead(work)
+        } else {
+            self.work_list.push_back(work);
+            self.process_work_list = true;
+            PushWorkRes::Ok
+        }
+    }
+
+    fn push_reply_work(&mut self, code: u32) {
+        if let Ok(work) = ListArc::try_from_arc(self.reply_work.clone()) {
+            work.set_error_code(code);
+            self.push_work(work);
+        } else {
+            pr_warn!("Thread reply work is already in use.");
+        }
+    }
+
+    fn push_return_work(&mut self, reply: u32) {
+        if let Ok(work) = ListArc::try_from_arc(self.return_work.clone()) {
+            work.set_error_code(reply);
+            self.push_work(work);
+        } else {
+            pr_warn!("Thread return work is already in use.");
+        }
+    }
+
+    /// Used to push work items that do not need to be processed immediately and can wait until the
+    /// thread gets another work item.
+    fn push_work_deferred(&mut self, work: DLArc<dyn DeliverToRead>) {
+        self.work_list.push_back(work);
+    }
+
+    /// Fetches the transaction this thread can reply to. If the thread has a pending transaction
+    /// (that it could respond to) but it has also issued a transaction, it must first wait for the
+    /// previously-issued transaction to complete.
+    ///
+    /// The `thread` parameter should be the thread containing this `InnerThread`.
+    fn pop_transaction_to_reply(&mut self, thread: &Thread) -> Result<DArc<Transaction>> {
+        let transaction = self.current_transaction.take().ok_or(EINVAL)?;
+        if core::ptr::eq(thread, transaction.from.as_ref()) {
+            self.current_transaction = Some(transaction);
+            return Err(EINVAL);
+        }
+        // Find a new current transaction for this thread.
+        self.current_transaction = transaction.find_from(thread).cloned();
+        Ok(transaction)
+    }
+
+    fn pop_transaction_replied(&mut self, transaction: &DArc<Transaction>) -> bool {
+        match self.current_transaction.take() {
+            None => false,
+            Some(old) => {
+                if !Arc::ptr_eq(transaction, &old) {
+                    self.current_transaction = Some(old);
+                    return false;
+                }
+                self.current_transaction = old.clone_next();
+                true
+            }
+        }
+    }
+
+    fn looper_enter(&mut self) {
+        self.looper_flags |= LOOPER_ENTERED;
+        if self.looper_flags & LOOPER_REGISTERED != 0 {
+            self.looper_flags |= LOOPER_INVALID;
+        }
+    }
+
+    fn looper_register(&mut self, valid: bool) {
+        self.looper_flags |= LOOPER_REGISTERED;
+        if !valid || self.looper_flags & LOOPER_ENTERED != 0 {
+            self.looper_flags |= LOOPER_INVALID;
+        }
+    }
+
+    fn looper_exit(&mut self) {
+        self.looper_flags |= LOOPER_EXITED;
+    }
+
+    /// Determines whether the thread is part of a pool, i.e., if it is a looper.
+    fn is_looper(&self) -> bool {
+        self.looper_flags & (LOOPER_ENTERED | LOOPER_REGISTERED) != 0
+    }
+
+    /// Determines whether the thread should attempt to fetch work items from the process queue.
+    /// This is generally the case when the thread is registered as a looper and not part of a
+    /// transaction stack. But if there is local work, we want to return to userspace before we
+    /// deliver any remote work.
+    fn should_use_process_work_queue(&self) -> bool {
+        self.current_transaction.is_none() && !self.process_work_list && self.is_looper()
+    }
+
+    fn poll(&mut self) -> u32 {
+        self.looper_flags |= LOOPER_POLL;
+        if self.process_work_list || self.looper_need_return {
+            bindings::POLLIN
+        } else {
+            0
+        }
+    }
+}
+
+/// This represents a thread that's used with binder.
+#[pin_data]
+pub(crate) struct Thread {
+    pub(crate) id: i32,
+    pub(crate) process: Arc<Process>,
+    pub(crate) task: ARef<Task>,
+    #[pin]
+    inner: SpinLock<InnerThread>,
+    #[pin]
+    work_condvar: PollCondVar,
+    /// Used to insert this thread into the process' `ready_threads` list.
+    ///
+    /// INVARIANT: May never be used for any other list than the `self.process.ready_threads`.
+    #[pin]
+    links: ListLinks,
+    #[pin]
+    links_track: AtomicTracker,
+}
+
+kernel::list::impl_list_arc_safe! {
+    impl ListArcSafe<0> for Thread {
+        tracked_by links_track: AtomicTracker;
+    }
+}
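`tracked_by links_track: AtomicTracker` is what lets a `Thread` be placed on the process' ready list safely: the tracker hands out at most one `ListArc` at a time, so a thread can be on at most one list. A hedged sketch of what that enables (hypothetical helper, mirroring how `reply_work`/`return_work` are re-armed above):

    fn mark_ready_sketch(thread: &Arc<Thread>, ready: &mut List<Thread>) {
        // Succeeds only while no other `ListArc` for this thread exists.
        if let Ok(list_arc) = ListArc::try_from_arc(thread.clone()) {
            ready.push_back(list_arc);
        }
        // Otherwise the thread is already queued somewhere; nothing to do.
    }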
+kernel::list::impl_list_item! {
+    impl ListItem<0> for Thread {
+        using ListLinks { self.links };
+    }
+}
+
+impl Thread {
+    pub(crate) fn new(id: i32, process: Arc<Process>) -> Result<Arc<Self>> {
+        let inner = InnerThread::new()?;
+
+        Arc::pin_init(
+            try_pin_init!(Thread {
+                id,
+                process,
+                task: ARef::from(&**kernel::current!()),
+                inner <- kernel::new_spinlock!(inner, "Thread::inner"),
+                work_condvar <- kernel::new_poll_condvar!("Thread::work_condvar"),
+                links <- ListLinks::new(),
+                links_track <- AtomicTracker::new(),
+            }),
+            GFP_KERNEL,
+        )
+    }
+
+    #[inline(never)]
+    pub(crate) fn debug_print(self: &Arc<Self>, m: &SeqFile, print_all: bool) -> Result<()> {
+        let inner = self.inner.lock();
+
+        if print_all || inner.current_transaction.is_some() || !inner.work_list.is_empty() {
+            seq_print!(
+                m,
+                "  thread {}: l {:02x} need_return {}\n",
+                self.id,
+                inner.looper_flags,
+                inner.looper_need_return,
+            );
+        }
+
+        let mut t_opt = inner.current_transaction.as_ref();
+        while let Some(t) = t_opt {
+            if Arc::ptr_eq(&t.from, self) {
+                t.debug_print_inner(m, "    outgoing transaction ");
+                t_opt = t.from_parent.as_ref();
+            } else if Arc::ptr_eq(&t.to, &self.process) {
+                t.debug_print_inner(m, "    incoming transaction ");
+                t_opt = t.find_from(self);
+            } else {
+                t.debug_print_inner(m, "    bad transaction ");
+                t_opt = None;
+            }
+        }
+
+        for work in &inner.work_list {
+            work.debug_print(m, "    ", "    pending transaction ")?;
+        }
+        Ok(())
+    }
+
+    pub(crate) fn get_extended_error(&self, data: UserSlice) -> Result {
+        let mut writer = data.writer();
+        let ee = self.inner.lock().extended_error;
+        writer.write(&ee)?;
+        Ok(())
+    }
+
+    pub(crate) fn set_current_transaction(&self, transaction: DArc<Transaction>) {
+        self.inner.lock().current_transaction = Some(transaction);
+    }
+
+    pub(crate) fn has_current_transaction(&self) -> bool {
+        self.inner.lock().current_transaction.is_some()
+    }
+
+    /// Attempts to fetch a work item from the thread-local queue. The behaviour if the queue is
+    /// empty depends on `wait`: if it is true, the function waits for some work to be queued (or a
+    /// signal); otherwise it returns indicating that none is available.
+    fn get_work_local(self: &Arc<Self>, wait: bool) -> Result<Option<DLArc<dyn DeliverToRead>>> {
+        {
+            let mut inner = self.inner.lock();
+            if inner.looper_need_return {
+                return Ok(inner.pop_work());
+            }
+        }
+
+        // Try once if the caller does not want to wait.
+        if !wait {
+            return self.inner.lock().pop_work().ok_or(EAGAIN).map(Some);
+        }
+
+        // Loop waiting only on the local queue (i.e., not registering with the process queue).
+        let mut inner = self.inner.lock();
+        loop {
+            if let Some(work) = inner.pop_work() {
+                return Ok(Some(work));
+            }
+
+            inner.looper_flags |= LOOPER_WAITING;
+            let signal_pending = self.work_condvar.wait_interruptible_freezable(&mut inner);
+            inner.looper_flags &= !LOOPER_WAITING;
+
+            if signal_pending {
+                return Err(EINTR);
+            }
+            if inner.looper_need_return {
+                return Ok(None);
+            }
+        }
+    }
+
+    /// Attempts to fetch a work item from the thread-local queue, falling back to the process-wide
+    /// queue if none is available locally.
+    ///
+    /// This must only be called when the thread is not participating in a transaction chain. If it
+    /// is, the local version (`get_work_local`) should be used instead.
+    fn get_work(self: &Arc<Self>, wait: bool) -> Result<Option<DLArc<dyn DeliverToRead>>> {
+        // Try to get work from the thread's work queue, using only a local lock.
+ { + let mut inner = self.inner.lock(); + if let Some(work) = inner.pop_work() { + return Ok(Some(work)); + } + if inner.looper_need_return { + drop(inner); + return Ok(self.process.get_work()); + } + } + + // If the caller doesn't want to wait, try to grab work from the process queue. + // + // We know nothing will have been queued directly to the thread queue because it is not in + // a transaction and it is not in the process' ready list. + if !wait { + return self.process.get_work().ok_or(EAGAIN).map(Some); + } + + // Get work from the process queue. If none is available, atomically register as ready. + let reg = match self.process.get_work_or_register(self) { + GetWorkOrRegister::Work(work) => return Ok(Some(work)), + GetWorkOrRegister::Register(reg) => reg, + }; + + let mut inner = self.inner.lock(); + loop { + if let Some(work) = inner.pop_work() { + return Ok(Some(work)); + } + + inner.looper_flags |= LOOPER_WAITING | LOOPER_WAITING_PROC; + let signal_pending = self.work_condvar.wait_interruptible_freezable(&mut inner); + inner.looper_flags &= !(LOOPER_WAITING | LOOPER_WAITING_PROC); + + if signal_pending || inner.looper_need_return { + // We need to return now. We need to pull the thread off the list of ready threads + // (by dropping `reg`), then check the state again after it's off the list to + // ensure that something was not queued in the meantime. If something has been + // queued, we just return it (instead of the error). + drop(inner); + drop(reg); + + let res = match self.inner.lock().pop_work() { + Some(work) => Ok(Some(work)), + None if signal_pending => Err(EINTR), + None => Ok(None), + }; + return res; + } + } + } + + /// Push the provided work item to be delivered to user space via this thread. + /// + /// Returns whether the item was successfully pushed. This can only fail if the thread is dead. + pub(crate) fn push_work(&self, work: DLArc) -> PushWorkRes { + let sync = work.should_sync_wakeup(); + + let res = self.inner.lock().push_work(work); + + if res.is_ok() { + if sync { + self.work_condvar.notify_sync(); + } else { + self.work_condvar.notify_one(); + } + } + + res + } + + /// Attempts to push to given work item to the thread if it's a looper thread (i.e., if it's + /// part of a thread pool) and is alive. Otherwise, push the work item to the process instead. + pub(crate) fn push_work_if_looper(&self, work: DLArc) -> BinderResult { + let mut inner = self.inner.lock(); + if inner.is_looper() && !inner.is_dead { + inner.push_work(work); + Ok(()) + } else { + drop(inner); + self.process.push_work(work) + } + } + + pub(crate) fn push_work_deferred(&self, work: DLArc) { + self.inner.lock().push_work_deferred(work); + } + + pub(crate) fn push_return_work(&self, reply: u32) { + self.inner.lock().push_return_work(reply); + } + + fn translate_object( + &self, + obj_index: usize, + offset: usize, + object: BinderObjectRef<'_>, + view: &mut AllocationView<'_>, + allow_fds: bool, + sg_state: &mut ScatterGatherState, + ) -> BinderResult { + match object { + BinderObjectRef::Binder(obj) => { + let strong = obj.hdr.type_ == BINDER_TYPE_BINDER; + // SAFETY: `binder` is a `binder_uintptr_t`; any bit pattern is a valid + // representation. 
+
+    fn translate_object(
+        &self,
+        obj_index: usize,
+        offset: usize,
+        object: BinderObjectRef<'_>,
+        view: &mut AllocationView<'_>,
+        allow_fds: bool,
+        sg_state: &mut ScatterGatherState,
+    ) -> BinderResult {
+        match object {
+            BinderObjectRef::Binder(obj) => {
+                let strong = obj.hdr.type_ == BINDER_TYPE_BINDER;
+                // SAFETY: `binder` is a `binder_uintptr_t`; any bit pattern is a valid
+                // representation.
+                let ptr = unsafe { obj.__bindgen_anon_1.binder } as _;
+                let cookie = obj.cookie as _;
+                let flags = obj.flags as _;
+                let node = self
+                    .process
+                    .as_arc_borrow()
+                    .get_node(ptr, cookie, flags, strong, self)?;
+                security::binder_transfer_binder(&self.process.cred, &view.alloc.process.cred)?;
+                view.transfer_binder_object(offset, obj, strong, node)?;
+            }
+            BinderObjectRef::Handle(obj) => {
+                let strong = obj.hdr.type_ == BINDER_TYPE_HANDLE;
+                // SAFETY: `handle` is a `u32`; any bit pattern is a valid representation.
+                let handle = unsafe { obj.__bindgen_anon_1.handle } as _;
+                let node = self.process.get_node_from_handle(handle, strong)?;
+                security::binder_transfer_binder(&self.process.cred, &view.alloc.process.cred)?;
+                view.transfer_binder_object(offset, obj, strong, node)?;
+            }
+            BinderObjectRef::Fd(obj) => {
+                if !allow_fds {
+                    return Err(EPERM.into());
+                }
+
+                // SAFETY: `fd` is a `u32`; any bit pattern is a valid representation.
+                let fd = unsafe { obj.__bindgen_anon_1.fd };
+                let file = LocalFile::fget(fd)?;
+                // SAFETY: The binder driver never calls `fdget_pos` and this code runs from an
+                // ioctl, so there are no active calls to `fdget_pos` on this thread.
+                let file = unsafe { LocalFile::assume_no_fdget_pos(file) };
+                security::binder_transfer_file(
+                    &self.process.cred,
+                    &view.alloc.process.cred,
+                    &file,
+                )?;
+
+                let mut obj_write = BinderFdObject::default();
+                obj_write.hdr.type_ = BINDER_TYPE_FD;
+                // This will be overwritten with the actual fd when the transaction is received.
+                obj_write.__bindgen_anon_1.fd = u32::MAX;
+                obj_write.cookie = obj.cookie;
+                view.write::<BinderFdObject>(offset, &obj_write)?;
+
+                const FD_FIELD_OFFSET: usize =
+                    core::mem::offset_of!(uapi::binder_fd_object, __bindgen_anon_1.fd);
+
+                let field_offset = offset + FD_FIELD_OFFSET;
+
+                view.alloc.info_add_fd(file, field_offset, false)?;
+            }
+            BinderObjectRef::Ptr(obj) => {
+                let obj_length = obj.length.try_into().map_err(|_| EINVAL)?;
+                let alloc_offset = match sg_state.unused_buffer_space.claim_next(obj_length) {
+                    Ok(alloc_offset) => alloc_offset,
+                    Err(err) => {
+                        pr_warn!(
+                            "Failed to claim space for a BINDER_TYPE_PTR. (offset: {}, limit: {}, size: {})",
+                            sg_state.unused_buffer_space.offset,
+                            sg_state.unused_buffer_space.limit,
+                            obj_length,
+                        );
+                        return Err(err.into());
+                    }
+                };
+
+                let sg_state_idx = sg_state.sg_entries.len();
+                sg_state.sg_entries.push(
+                    ScatterGatherEntry {
+                        obj_index,
+                        offset: alloc_offset,
+                        sender_uaddr: obj.buffer as _,
+                        length: obj_length,
+                        pointer_fixups: KVec::new(),
+                        fixup_min_offset: 0,
+                    },
+                    GFP_KERNEL,
+                )?;
+
+                let buffer_ptr_in_user_space = (view.alloc.ptr + alloc_offset) as u64;
+
+                if obj.flags & uapi::BINDER_BUFFER_FLAG_HAS_PARENT == 0 {
+                    sg_state.ancestors.clear();
+                    sg_state.ancestors.push(sg_state_idx, GFP_KERNEL)?;
+                } else {
+                    // Another buffer also has a pointer to this buffer, and we need to fixup that
+                    // pointer too.
+
+                    let parent_index = usize::try_from(obj.parent).map_err(|_| EINVAL)?;
+                    let parent_offset = usize::try_from(obj.parent_offset).map_err(|_| EINVAL)?;
+
+                    let info = sg_state.validate_parent_fixup(
+                        parent_index,
+                        parent_offset,
+                        size_of::<u64>(),
+                    )?;
+
+                    sg_state.ancestors.truncate(info.num_ancestors);
+                    sg_state.ancestors.push(sg_state_idx, GFP_KERNEL)?;
+
+                    let parent_entry = match sg_state.sg_entries.get_mut(info.parent_sg_index) {
+                        Some(parent_entry) => parent_entry,
+                        None => {
+                            pr_err!(
+                                "validate_parent_fixup returned index out of bounds for sg.entries"
+                            );
+                            return Err(EINVAL.into());
+                        }
+                    };
+
+                    parent_entry.fixup_min_offset = info.new_min_offset;
+                    parent_entry.pointer_fixups.push(
+                        PointerFixupEntry {
+                            skip: 0,
+                            pointer_value: buffer_ptr_in_user_space,
+                            target_offset: info.target_offset,
+                        },
+                        GFP_KERNEL,
+                    )?;
+                }
+
+                let mut obj_write = BinderBufferObject::default();
+                obj_write.hdr.type_ = BINDER_TYPE_PTR;
+                obj_write.flags = obj.flags;
+                obj_write.buffer = buffer_ptr_in_user_space;
+                obj_write.length = obj.length;
+                obj_write.parent = obj.parent;
+                obj_write.parent_offset = obj.parent_offset;
+                view.write::<BinderBufferObject>(offset, &obj_write)?;
+            }
+            BinderObjectRef::Fda(obj) => {
+                if !allow_fds {
+                    return Err(EPERM.into());
+                }
+                let parent_index = usize::try_from(obj.parent).map_err(|_| EINVAL)?;
+                let parent_offset = usize::try_from(obj.parent_offset).map_err(|_| EINVAL)?;
+                let num_fds = usize::try_from(obj.num_fds).map_err(|_| EINVAL)?;
+                let fds_len = num_fds.checked_mul(size_of::<u32>()).ok_or(EINVAL)?;
+
+                let info = sg_state.validate_parent_fixup(parent_index, parent_offset, fds_len)?;
+                view.alloc.info_add_fd_reserve(num_fds)?;
+
+                sg_state.ancestors.truncate(info.num_ancestors);
+                let parent_entry = match sg_state.sg_entries.get_mut(info.parent_sg_index) {
+                    Some(parent_entry) => parent_entry,
+                    None => {
+                        pr_err!(
+                            "validate_parent_fixup returned index out of bounds for sg.entries"
+                        );
+                        return Err(EINVAL.into());
+                    }
+                };
+
+                parent_entry.fixup_min_offset = info.new_min_offset;
+                parent_entry
+                    .pointer_fixups
+                    .push(
+                        PointerFixupEntry {
+                            skip: fds_len,
+                            pointer_value: 0,
+                            target_offset: info.target_offset,
+                        },
+                        GFP_KERNEL,
+                    )
+                    .map_err(|_| ENOMEM)?;
+
+                let fda_uaddr = parent_entry
+                    .sender_uaddr
+                    .checked_add(parent_offset)
+                    .ok_or(EINVAL)?;
+                let mut fda_bytes = KVec::new();
+                UserSlice::new(UserPtr::from_addr(fda_uaddr as _), fds_len)
+                    .read_all(&mut fda_bytes, GFP_KERNEL)?;
+
+                if fds_len != fda_bytes.len() {
+                    pr_err!("UserSlice::read_all returned wrong length in BINDER_TYPE_FDA");
+                    return Err(EINVAL.into());
+                }
+
+                for i in (0..fds_len).step_by(size_of::<u32>()) {
+                    let fd = {
+                        let mut fd_bytes = [0u8; size_of::<u32>()];
+                        fd_bytes.copy_from_slice(&fda_bytes[i..i + size_of::<u32>()]);
+                        u32::from_ne_bytes(fd_bytes)
+                    };
+
+                    let file = LocalFile::fget(fd)?;
+                    // SAFETY: The binder driver never calls `fdget_pos` and this code runs from an
+                    // ioctl, so there are no active calls to `fdget_pos` on this thread.
+                    let file = unsafe { LocalFile::assume_no_fdget_pos(file) };
+                    security::binder_transfer_file(
+                        &self.process.cred,
+                        &view.alloc.process.cred,
+                        &file,
+                    )?;
+
+                    // The `validate_parent_fixup` call ensures that this addition will not
+                    // overflow.
+                    view.alloc.info_add_fd(file, info.target_offset + i, true)?;
+                }
+                drop(fda_bytes);
+
+                let mut obj_write = BinderFdArrayObject::default();
+                obj_write.hdr.type_ = BINDER_TYPE_FDA;
+                obj_write.num_fds = obj.num_fds;
+                obj_write.parent = obj.parent;
+                obj_write.parent_offset = obj.parent_offset;
+                view.write::<BinderFdArrayObject>(offset, &obj_write)?;
+            }
+        }
+        Ok(())
+    }
+
+    fn apply_sg(&self, alloc: &mut Allocation, sg_state: &mut ScatterGatherState) -> BinderResult {
+        for sg_entry in &mut sg_state.sg_entries {
+            let mut end_of_previous_fixup = sg_entry.offset;
+            let offset_end = sg_entry.offset.checked_add(sg_entry.length).ok_or(EINVAL)?;
+
+            let mut reader =
+                UserSlice::new(UserPtr::from_addr(sg_entry.sender_uaddr), sg_entry.length).reader();
+            for fixup in &mut sg_entry.pointer_fixups {
+                let fixup_len = if fixup.skip == 0 {
+                    size_of::<u64>()
+                } else {
+                    fixup.skip
+                };
+
+                let target_offset_end = fixup.target_offset.checked_add(fixup_len).ok_or(EINVAL)?;
+                if fixup.target_offset < end_of_previous_fixup || offset_end < target_offset_end {
+                    pr_warn!(
+                        "Fixups oob {} {} {} {}",
+                        fixup.target_offset,
+                        end_of_previous_fixup,
+                        offset_end,
+                        target_offset_end
+                    );
+                    return Err(EINVAL.into());
+                }
+
+                let copy_off = end_of_previous_fixup;
+                let copy_len = fixup.target_offset - end_of_previous_fixup;
+                if let Err(err) = alloc.copy_into(&mut reader, copy_off, copy_len) {
+                    pr_warn!("Failed copying into alloc: {:?}", err);
+                    return Err(err.into());
+                }
+                if fixup.skip == 0 {
+                    let res = alloc.write::<u64>(fixup.target_offset, &fixup.pointer_value);
+                    if let Err(err) = res {
+                        pr_warn!("Failed copying ptr into alloc: {:?}", err);
+                        return Err(err.into());
+                    }
+                }
+                if let Err(err) = reader.skip(fixup_len) {
+                    pr_warn!("Failed skipping {} from reader: {:?}", fixup_len, err);
+                    return Err(err.into());
+                }
+                end_of_previous_fixup = target_offset_end;
+            }
+            let copy_off = end_of_previous_fixup;
+            let copy_len = offset_end - end_of_previous_fixup;
+            if let Err(err) = alloc.copy_into(&mut reader, copy_off, copy_len) {
+                pr_warn!("Failed copying remainder into alloc: {:?}", err);
+                return Err(err.into());
+            }
+        }
+        Ok(())
+    }
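`copy_transaction_data` below packs four aligned regions into a single target allocation, in this order: data, offsets, scatter-gather buffers, security context. A hedged sketch of the size arithmetic, reusing the `ptr_align` helper from this patch:

    fn layout_sketch(data: usize, offsets: usize, buffers: usize, secctx: usize) -> Option<(usize, usize)> {
        let len = ptr_align(data)?
            .checked_add(ptr_align(offsets)?)?
            .checked_add(ptr_align(buffers)?)?
            .checked_add(ptr_align(secctx)?)?;
        // The security context lands right after the first three regions.
        let secctx_off = ptr_align(data)? + ptr_align(offsets)? + ptr_align(buffers)?;
        // At least `size_of::<usize>()` bytes are always allocated.
        Some((len.max(core::mem::size_of::<usize>()), secctx_off))
    }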
+ pub(crate) fn copy_transaction_data( + &self, + to_process: Arc, + tr: &BinderTransactionDataSg, + debug_id: usize, + allow_fds: bool, + txn_security_ctx_offset: Option<&mut usize>, + ) -> BinderResult { + let trd = &tr.transaction_data; + let is_oneway = trd.flags & TF_ONE_WAY != 0; + let mut secctx = if let Some(offset) = txn_security_ctx_offset { + let secid = self.process.cred.get_secid(); + let ctx = match security::SecurityCtx::from_secid(secid) { + Ok(ctx) => ctx, + Err(err) => { + pr_warn!("Failed to get security ctx for id {}: {:?}", secid, err); + return Err(err.into()); + } + }; + Some((offset, ctx)) + } else { + None + }; + + let data_size = trd.data_size.try_into().map_err(|_| EINVAL)?; + let aligned_data_size = ptr_align(data_size).ok_or(EINVAL)?; + let offsets_size = trd.offsets_size.try_into().map_err(|_| EINVAL)?; + let aligned_offsets_size = ptr_align(offsets_size).ok_or(EINVAL)?; + let buffers_size = tr.buffers_size.try_into().map_err(|_| EINVAL)?; + let aligned_buffers_size = ptr_align(buffers_size).ok_or(EINVAL)?; + let aligned_secctx_size = match secctx.as_ref() { + Some((_offset, ctx)) => ptr_align(ctx.len()).ok_or(EINVAL)?, + None => 0, + }; + + // This guarantees that at least `sizeof(usize)` bytes will be allocated. + let len = usize::max( + aligned_data_size + .checked_add(aligned_offsets_size) + .and_then(|sum| sum.checked_add(aligned_buffers_size)) + .and_then(|sum| sum.checked_add(aligned_secctx_size)) + .ok_or(ENOMEM)?, + size_of::(), + ); + let secctx_off = aligned_data_size + aligned_offsets_size + aligned_buffers_size; + let mut alloc = + match to_process.buffer_alloc(debug_id, len, is_oneway, self.process.task.pid()) { + Ok(alloc) => alloc, + Err(err) => { + pr_warn!( + "Failed to allocate buffer. len:{}, is_oneway:{}", + len, + is_oneway + ); + return Err(err); + } + }; + + // SAFETY: This accesses a union field, but it's okay because the field's type is valid for + // all bit-patterns. + let trd_data_ptr = unsafe { &trd.data.ptr }; + let mut buffer_reader = + UserSlice::new(UserPtr::from_addr(trd_data_ptr.buffer as _), data_size).reader(); + let mut end_of_previous_object = 0; + let mut sg_state = None; + + // Copy offsets if there are any. + if offsets_size > 0 { + { + let mut reader = + UserSlice::new(UserPtr::from_addr(trd_data_ptr.offsets as _), offsets_size) + .reader(); + alloc.copy_into(&mut reader, aligned_data_size, offsets_size)?; + } + + let offsets_start = aligned_data_size; + let offsets_end = aligned_data_size + aligned_offsets_size; + + // This state is used for BINDER_TYPE_PTR objects. + let sg_state = sg_state.insert(ScatterGatherState { + unused_buffer_space: UnusedBufferSpace { + offset: offsets_end, + limit: len, + }, + sg_entries: KVec::new(), + ancestors: KVec::new(), + }); + + // Traverse the objects specified. + let mut view = AllocationView::new(&mut alloc, data_size); + for (index, index_offset) in (offsets_start..offsets_end) + .step_by(size_of::()) + .enumerate() + { + let offset = view.alloc.read(index_offset)?; + + if offset < end_of_previous_object { + pr_warn!("Got transaction with invalid offset."); + return Err(EINVAL.into()); + } + + // Copy data between two objects. 
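+                // The offsets array must be sorted in ascending order, so the gap
+                // between the previous object and this one is never negative here.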
+ if end_of_previous_object < offset { + view.copy_into( + &mut buffer_reader, + end_of_previous_object, + offset - end_of_previous_object, + )?; + } + + let mut object = BinderObject::read_from(&mut buffer_reader)?; + + match self.translate_object( + index, + offset, + object.as_ref(), + &mut view, + allow_fds, + sg_state, + ) { + Ok(()) => end_of_previous_object = offset + object.size(), + Err(err) => { + pr_warn!("Error while translating object."); + return Err(err); + } + } + + // Update the indexes containing objects to clean up. + let offset_after_object = index_offset + size_of::(); + view.alloc + .set_info_offsets(offsets_start..offset_after_object); + } + } + + // Copy remaining raw data. + alloc.copy_into( + &mut buffer_reader, + end_of_previous_object, + data_size - end_of_previous_object, + )?; + + if let Some(sg_state) = sg_state.as_mut() { + if let Err(err) = self.apply_sg(&mut alloc, sg_state) { + pr_warn!("Failure in apply_sg: {:?}", err); + return Err(err); + } + } + + if let Some((off_out, secctx)) = secctx.as_mut() { + if let Err(err) = alloc.write(secctx_off, secctx.as_bytes()) { + pr_warn!("Failed to write security context: {:?}", err); + return Err(err.into()); + } + **off_out = secctx_off; + } + Ok(alloc) + } + + fn unwind_transaction_stack(self: &Arc) { + let mut thread = self.clone(); + while let Ok(transaction) = { + let mut inner = thread.inner.lock(); + inner.pop_transaction_to_reply(thread.as_ref()) + } { + let reply = Err(BR_DEAD_REPLY); + if !transaction.from.deliver_single_reply(reply, &transaction) { + break; + } + + thread = transaction.from.clone(); + } + } + + pub(crate) fn deliver_reply( + &self, + reply: Result, u32>, + transaction: &DArc, + ) { + if self.deliver_single_reply(reply, transaction) { + transaction.from.unwind_transaction_stack(); + } + } + + /// Delivers a reply to the thread that started a transaction. The reply can either be a + /// reply-transaction or an error code to be delivered instead. + /// + /// Returns whether the thread is dead. If it is, the caller is expected to unwind the + /// transaction stack by completing transactions for threads that are dead. + fn deliver_single_reply( + &self, + reply: Result, u32>, + transaction: &DArc, + ) -> bool { + if let Ok(transaction) = &reply { + transaction.set_outstanding(&mut self.process.inner.lock()); + } + + { + let mut inner = self.inner.lock(); + if !inner.pop_transaction_replied(transaction) { + return false; + } + + if inner.is_dead { + return true; + } + + match reply { + Ok(work) => { + inner.push_work(work); + } + Err(code) => inner.push_reply_work(code), + } + } + + // Notify the thread now that we've released the inner lock. + self.work_condvar.notify_sync(); + false + } + + /// Determines if the given transaction is the current transaction for this thread. + fn is_current_transaction(&self, transaction: &DArc) -> bool { + let inner = self.inner.lock(); + match &inner.current_transaction { + None => false, + Some(current) => Arc::ptr_eq(current, transaction), + } + } + + /// Determines the current top of the transaction stack. It fails if the top is in another + /// thread (i.e., this thread belongs to a stack but it has called another thread). The top is + /// [`None`] if the thread is not currently participating in a transaction stack. 
+    fn top_of_transaction_stack(&self) -> Result<Option<DArc<Transaction>>> {
+        let inner = self.inner.lock();
+        if let Some(cur) = &inner.current_transaction {
+            if core::ptr::eq(self, cur.from.as_ref()) {
+                pr_warn!("got new transaction with bad transaction stack");
+                return Err(EINVAL);
+            }
+            Ok(Some(cur.clone()))
+        } else {
+            Ok(None)
+        }
+    }
+
+    fn transaction<T>(self: &Arc<Self>, tr: &BinderTransactionDataSg, inner: T)
+    where
+        T: FnOnce(&Arc<Self>, &BinderTransactionDataSg) -> BinderResult,
+    {
+        if let Err(err) = inner(self, tr) {
+            if err.should_pr_warn() {
+                let mut ee = self.inner.lock().extended_error;
+                ee.command = err.reply;
+                ee.param = err.as_errno();
+                pr_warn!(
+                    "Transaction failed: {:?} my_pid:{}",
+                    err,
+                    self.process.pid_in_current_ns()
+                );
+            }
+
+            self.push_return_work(err.reply);
+        }
+    }
+
+    fn transaction_inner(self: &Arc<Self>, tr: &BinderTransactionDataSg) -> BinderResult {
+        // SAFETY: Handle's type has no invalid bit patterns.
+        let handle = unsafe { tr.transaction_data.target.handle };
+        let node_ref = self.process.get_transaction_node(handle)?;
+        security::binder_transaction(&self.process.cred, &node_ref.node.owner.cred)?;
+        // TODO: We need to ensure that there isn't a pending transaction in the work queue. How
+        // could this happen?
+        let top = self.top_of_transaction_stack()?;
+        let list_completion = DTRWrap::arc_try_new(DeliverCode::new(BR_TRANSACTION_COMPLETE))?;
+        let completion = list_completion.clone_arc();
+        let transaction = Transaction::new(node_ref, top, self, tr)?;
+
+        // Check that the transaction stack hasn't changed while the lock was released, then update
+        // it with the new transaction.
+        {
+            let mut inner = self.inner.lock();
+            if !transaction.is_stacked_on(&inner.current_transaction) {
+                pr_warn!("Transaction stack changed during transaction!");
+                return Err(EINVAL.into());
+            }
+            inner.current_transaction = Some(transaction.clone_arc());
+            // We push the completion as a deferred work so that we wait for the reply before
+            // returning to userland.
+            inner.push_work_deferred(list_completion);
+        }
+
+        if let Err(e) = transaction.submit() {
+            completion.skip();
+            // Define `transaction` first to drop it after `inner`.
+            let transaction;
+            let mut inner = self.inner.lock();
+            transaction = inner.current_transaction.take().unwrap();
+            inner.current_transaction = transaction.clone_next();
+            Err(e)
+        } else {
+            Ok(())
+        }
+    }
+
+    fn reply_inner(self: &Arc<Self>, tr: &BinderTransactionDataSg) -> BinderResult {
+        let orig = self.inner.lock().pop_transaction_to_reply(self)?;
+        if !orig.from.is_current_transaction(&orig) {
+            return Err(EINVAL.into());
+        }
+
+        // We need to complete the transaction even if we cannot complete building the reply.
+        let out = (|| -> BinderResult<_> {
+            let completion = DTRWrap::arc_try_new(DeliverCode::new(BR_TRANSACTION_COMPLETE))?;
+            let process = orig.from.process.clone();
+            let allow_fds = orig.flags & TF_ACCEPT_FDS != 0;
+            let reply = Transaction::new_reply(self, process, tr, allow_fds)?;
+            self.inner.lock().push_work(completion);
+            orig.from.deliver_reply(Ok(reply), &orig);
+            Ok(())
+        })()
+        .map_err(|mut err| {
+            // At this point we only return `BR_TRANSACTION_COMPLETE` to the caller, and we must let
+            // the sender know that the transaction has completed (with an error in this case).
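+            // Overwriting `err.reply` below keeps the local caller's view intact:
+            // this thread still gets BR_TRANSACTION_COMPLETE for its BC_REPLY,
+            // while BR_FAILED_REPLY is delivered to the original sender.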
+ pr_warn!( + "Failure {:?} during reply - delivering BR_FAILED_REPLY to sender.", + err + ); + let reply = Err(BR_FAILED_REPLY); + orig.from.deliver_reply(reply, &orig); + err.reply = BR_TRANSACTION_COMPLETE; + err + }); + + out + } + + fn oneway_transaction_inner(self: &Arc, tr: &BinderTransactionDataSg) -> BinderResult { + // SAFETY: The `handle` field is valid for all possible byte values, so reading from the + // union is okay. + let handle = unsafe { tr.transaction_data.target.handle }; + let node_ref = self.process.get_transaction_node(handle)?; + security::binder_transaction(&self.process.cred, &node_ref.node.owner.cred)?; + let transaction = Transaction::new(node_ref, None, self, tr)?; + let code = if self.process.is_oneway_spam_detection_enabled() + && transaction.oneway_spam_detected + { + BR_ONEWAY_SPAM_SUSPECT + } else { + BR_TRANSACTION_COMPLETE + }; + let list_completion = DTRWrap::arc_try_new(DeliverCode::new(code))?; + let completion = list_completion.clone_arc(); + self.inner.lock().push_work(list_completion); + match transaction.submit() { + Ok(()) => Ok(()), + Err(err) => { + completion.skip(); + Err(err) + } + } + } + + fn write(self: &Arc, req: &mut BinderWriteRead) -> Result { + let write_start = req.write_buffer.wrapping_add(req.write_consumed); + let write_len = req.write_size.saturating_sub(req.write_consumed); + let mut reader = + UserSlice::new(UserPtr::from_addr(write_start as _), write_len as _).reader(); + + while reader.len() >= size_of::() && self.inner.lock().return_work.is_unused() { + let before = reader.len(); + let cmd = reader.read::()?; + GLOBAL_STATS.inc_bc(cmd); + self.process.stats.inc_bc(cmd); + match cmd { + BC_TRANSACTION => { + let tr = reader.read::()?.with_buffers_size(0); + if tr.transaction_data.flags & TF_ONE_WAY != 0 { + self.transaction(&tr, Self::oneway_transaction_inner); + } else { + self.transaction(&tr, Self::transaction_inner); + } + } + BC_TRANSACTION_SG => { + let tr = reader.read::()?; + if tr.transaction_data.flags & TF_ONE_WAY != 0 { + self.transaction(&tr, Self::oneway_transaction_inner); + } else { + self.transaction(&tr, Self::transaction_inner); + } + } + BC_REPLY => { + let tr = reader.read::()?.with_buffers_size(0); + self.transaction(&tr, Self::reply_inner) + } + BC_REPLY_SG => { + let tr = reader.read::()?; + self.transaction(&tr, Self::reply_inner) + } + BC_FREE_BUFFER => { + let buffer = self.process.buffer_get(reader.read()?); + if let Some(buffer) = &buffer { + if buffer.looper_need_return_on_free() { + self.inner.lock().looper_need_return = true; + } + } + drop(buffer); + } + BC_INCREFS => { + self.process + .as_arc_borrow() + .update_ref(reader.read()?, true, false)? + } + BC_ACQUIRE => { + self.process + .as_arc_borrow() + .update_ref(reader.read()?, true, true)? + } + BC_RELEASE => { + self.process + .as_arc_borrow() + .update_ref(reader.read()?, false, true)? + } + BC_DECREFS => { + self.process + .as_arc_borrow() + .update_ref(reader.read()?, false, false)? 
+ } + BC_INCREFS_DONE => self.process.inc_ref_done(&mut reader, false)?, + BC_ACQUIRE_DONE => self.process.inc_ref_done(&mut reader, true)?, + BC_REQUEST_DEATH_NOTIFICATION => self.process.request_death(&mut reader, self)?, + BC_CLEAR_DEATH_NOTIFICATION => self.process.clear_death(&mut reader, self)?, + BC_DEAD_BINDER_DONE => self.process.dead_binder_done(reader.read()?, self), + BC_REGISTER_LOOPER => { + let valid = self.process.register_thread(); + self.inner.lock().looper_register(valid); + } + BC_ENTER_LOOPER => self.inner.lock().looper_enter(), + BC_EXIT_LOOPER => self.inner.lock().looper_exit(), + BC_REQUEST_FREEZE_NOTIFICATION => self.process.request_freeze_notif(&mut reader)?, + BC_CLEAR_FREEZE_NOTIFICATION => self.process.clear_freeze_notif(&mut reader)?, + BC_FREEZE_NOTIFICATION_DONE => self.process.freeze_notif_done(&mut reader)?, + + // Fail if given an unknown error code. + // BC_ATTEMPT_ACQUIRE and BC_ACQUIRE_RESULT are no longer supported. + _ => return Err(EINVAL), + } + // Update the number of write bytes consumed. + req.write_consumed += (before - reader.len()) as u64; + } + + Ok(()) + } + + fn read(self: &Arc, req: &mut BinderWriteRead, wait: bool) -> Result { + let read_start = req.read_buffer.wrapping_add(req.read_consumed); + let read_len = req.read_size.saturating_sub(req.read_consumed); + let mut writer = BinderReturnWriter::new( + UserSlice::new(UserPtr::from_addr(read_start as _), read_len as _).writer(), + self, + ); + let (in_pool, use_proc_queue) = { + let inner = self.inner.lock(); + (inner.is_looper(), inner.should_use_process_work_queue()) + }; + + let getter = if use_proc_queue { + Self::get_work + } else { + Self::get_work_local + }; + + // Reserve some room at the beginning of the read buffer so that we can send a + // BR_SPAWN_LOOPER if we need to. + let mut has_noop_placeholder = false; + if req.read_consumed == 0 { + if let Err(err) = writer.write_code(BR_NOOP) { + pr_warn!("Failure when writing BR_NOOP at beginning of buffer."); + return Err(err); + } + has_noop_placeholder = true; + } + + // Loop doing work while there is room in the buffer. + let initial_len = writer.len(); + while writer.len() >= size_of::() + 4 { + match getter(self, wait && initial_len == writer.len()) { + Ok(Some(work)) => match work.into_arc().do_work(self, &mut writer) { + Ok(true) => {} + Ok(false) => break, + Err(err) => { + return Err(err); + } + }, + Ok(None) => { + break; + } + Err(err) => { + // Propagate the error if we haven't written anything else. + if err != EINTR && err != EAGAIN { + pr_warn!("Failure in work getter: {:?}", err); + } + if initial_len == writer.len() { + return Err(err); + } else { + break; + } + } + } + } + + req.read_consumed += read_len - writer.len() as u64; + + // Write BR_SPAWN_LOOPER if the process needs more threads for its pool. + if has_noop_placeholder && in_pool && self.process.needs_thread() { + let mut writer = + UserSlice::new(UserPtr::from_addr(req.read_buffer as _), req.read_size as _) + .writer(); + writer.write(&BR_SPAWN_LOOPER)?; + } + Ok(()) + } + + pub(crate) fn write_read(self: &Arc, data: UserSlice, wait: bool) -> Result { + let (mut reader, mut writer) = data.reader_writer(); + let mut req = reader.read::()?; + + // Go through the write buffer. 
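+        // The write and read buffers carry separate consumed counters, so one
+        // BINDER_WRITE_READ ioctl can resume a partially processed write buffer
+        // and then fill the read buffer with as many work items as will fit.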
+ let mut ret = Ok(()); + if req.write_size > 0 { + ret = self.write(&mut req); + if let Err(err) = ret { + pr_warn!( + "Write failure {:?} in pid:{}", + err, + self.process.pid_in_current_ns() + ); + req.read_consumed = 0; + writer.write(&req)?; + self.inner.lock().looper_need_return = false; + return ret; + } + } + + // Go through the work queue. + if req.read_size > 0 { + ret = self.read(&mut req, wait); + if ret.is_err() && ret != Err(EINTR) { + pr_warn!( + "Read failure {:?} in pid:{}", + ret, + self.process.pid_in_current_ns() + ); + } + } + + // Write the request back so that the consumed fields are visible to the caller. + writer.write(&req)?; + + self.inner.lock().looper_need_return = false; + + ret + } + + pub(crate) fn poll(&self, file: &File, table: PollTable<'_>) -> (bool, u32) { + table.register_wait(file, &self.work_condvar); + let mut inner = self.inner.lock(); + (inner.should_use_process_work_queue(), inner.poll()) + } + + /// Make the call to `get_work` or `get_work_local` return immediately, if any. + pub(crate) fn exit_looper(&self) { + let mut inner = self.inner.lock(); + let should_notify = inner.looper_flags & LOOPER_WAITING != 0; + if should_notify { + inner.looper_need_return = true; + } + drop(inner); + + if should_notify { + self.work_condvar.notify_one(); + } + } + + pub(crate) fn notify_if_poll_ready(&self, sync: bool) { + // Determine if we need to notify. This requires the lock. + let inner = self.inner.lock(); + let notify = inner.looper_flags & LOOPER_POLL != 0 && inner.should_use_process_work_queue(); + drop(inner); + + // Now that the lock is no longer held, notify the waiters if we have to. + if notify { + if sync { + self.work_condvar.notify_sync(); + } else { + self.work_condvar.notify_one(); + } + } + } + + pub(crate) fn release(self: &Arc) { + self.inner.lock().is_dead = true; + + //self.work_condvar.clear(); + self.unwind_transaction_stack(); + + // Cancel all pending work items. + while let Ok(Some(work)) = self.get_work_local(false) { + work.into_arc().cancel(); + } + } +} + +#[pin_data] +struct ThreadError { + error_code: AtomicU32, + #[pin] + links_track: AtomicTracker, +} + +impl ThreadError { + fn try_new() -> Result> { + DTRWrap::arc_pin_init(pin_init!(Self { + error_code: AtomicU32::new(BR_OK), + links_track <- AtomicTracker::new(), + })) + .map(ListArc::into_arc) + } + + fn set_error_code(&self, code: u32) { + self.error_code.store(code, Ordering::Relaxed); + } + + fn is_unused(&self) -> bool { + self.error_code.load(Ordering::Relaxed) == BR_OK + } +} + +impl DeliverToRead for ThreadError { + fn do_work( + self: DArc, + _thread: &Thread, + writer: &mut BinderReturnWriter<'_>, + ) -> Result { + let code = self.error_code.load(Ordering::Relaxed); + self.error_code.store(BR_OK, Ordering::Relaxed); + writer.write_code(code)?; + Ok(true) + } + + fn cancel(self: DArc) {} + + fn should_sync_wakeup(&self) -> bool { + false + } + + fn debug_print(&self, m: &SeqFile, prefix: &str, _tprefix: &str) -> Result<()> { + seq_print!( + m, + "{}transaction error: {}\n", + prefix, + self.error_code.load(Ordering::Relaxed) + ); + Ok(()) + } +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for ThreadError { + tracked_by links_track: AtomicTracker; + } +} diff --git a/drivers/android/binder/trace.rs b/drivers/android/binder/trace.rs new file mode 100644 index 000000000000..af0e4392805e --- /dev/null +++ b/drivers/android/binder/trace.rs @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. 
+ +use kernel::ffi::{c_uint, c_ulong}; +use kernel::tracepoint::declare_trace; + +declare_trace! { + unsafe fn rust_binder_ioctl(cmd: c_uint, arg: c_ulong); +} + +#[inline] +pub(crate) fn trace_ioctl(cmd: u32, arg: usize) { + // SAFETY: Always safe to call. + unsafe { rust_binder_ioctl(cmd, arg as c_ulong) } +} diff --git a/drivers/android/binder/transaction.rs b/drivers/android/binder/transaction.rs new file mode 100644 index 000000000000..02512175d622 --- /dev/null +++ b/drivers/android/binder/transaction.rs @@ -0,0 +1,456 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use core::sync::atomic::{AtomicBool, Ordering}; +use kernel::{ + prelude::*, + seq_file::SeqFile, + seq_print, + sync::{Arc, SpinLock}, + task::Kuid, + time::{Instant, Monotonic}, + types::ScopeGuard, +}; + +use crate::{ + allocation::{Allocation, TranslatedFds}, + defs::*, + error::{BinderError, BinderResult}, + node::{Node, NodeRef}, + process::{Process, ProcessInner}, + ptr_align, + thread::{PushWorkRes, Thread}, + BinderReturnWriter, DArc, DLArc, DTRWrap, DeliverToRead, +}; + +#[pin_data(PinnedDrop)] +pub(crate) struct Transaction { + pub(crate) debug_id: usize, + target_node: Option>, + pub(crate) from_parent: Option>, + pub(crate) from: Arc, + pub(crate) to: Arc, + #[pin] + allocation: SpinLock>, + is_outstanding: AtomicBool, + code: u32, + pub(crate) flags: u32, + data_size: usize, + offsets_size: usize, + data_address: usize, + sender_euid: Kuid, + txn_security_ctx_off: Option, + pub(crate) oneway_spam_detected: bool, + start_time: Instant, +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for Transaction { untracked; } +} + +impl Transaction { + pub(crate) fn new( + node_ref: NodeRef, + from_parent: Option>, + from: &Arc, + tr: &BinderTransactionDataSg, + ) -> BinderResult> { + let debug_id = super::next_debug_id(); + let trd = &tr.transaction_data; + let allow_fds = node_ref.node.flags & FLAT_BINDER_FLAG_ACCEPTS_FDS != 0; + let txn_security_ctx = node_ref.node.flags & FLAT_BINDER_FLAG_TXN_SECURITY_CTX != 0; + let mut txn_security_ctx_off = if txn_security_ctx { Some(0) } else { None }; + let to = node_ref.node.owner.clone(); + let mut alloc = match from.copy_transaction_data( + to.clone(), + tr, + debug_id, + allow_fds, + txn_security_ctx_off.as_mut(), + ) { + Ok(alloc) => alloc, + Err(err) => { + if !err.is_dead() { + pr_warn!("Failure in copy_transaction_data: {:?}", err); + } + return Err(err); + } + }; + let oneway_spam_detected = alloc.oneway_spam_detected; + if trd.flags & TF_ONE_WAY != 0 { + if from_parent.is_some() { + pr_warn!("Oneway transaction should not be in a transaction stack."); + return Err(EINVAL.into()); + } + alloc.set_info_oneway_node(node_ref.node.clone()); + } + if trd.flags & TF_CLEAR_BUF != 0 { + alloc.set_info_clear_on_drop(); + } + let target_node = node_ref.node.clone(); + alloc.set_info_target_node(node_ref); + let data_address = alloc.ptr; + + Ok(DTRWrap::arc_pin_init(pin_init!(Transaction { + debug_id, + target_node: Some(target_node), + from_parent, + sender_euid: from.process.task.euid(), + from: from.clone(), + to, + code: trd.code, + flags: trd.flags, + data_size: trd.data_size as _, + offsets_size: trd.offsets_size as _, + data_address, + allocation <- kernel::new_spinlock!(Some(alloc.success()), "Transaction::new"), + is_outstanding: AtomicBool::new(false), + txn_security_ctx_off, + oneway_spam_detected, + start_time: Instant::now(), + }))?) 
+ } + + pub(crate) fn new_reply( + from: &Arc, + to: Arc, + tr: &BinderTransactionDataSg, + allow_fds: bool, + ) -> BinderResult> { + let debug_id = super::next_debug_id(); + let trd = &tr.transaction_data; + let mut alloc = match from.copy_transaction_data(to.clone(), tr, debug_id, allow_fds, None) + { + Ok(alloc) => alloc, + Err(err) => { + pr_warn!("Failure in copy_transaction_data: {:?}", err); + return Err(err); + } + }; + let oneway_spam_detected = alloc.oneway_spam_detected; + if trd.flags & TF_CLEAR_BUF != 0 { + alloc.set_info_clear_on_drop(); + } + Ok(DTRWrap::arc_pin_init(pin_init!(Transaction { + debug_id, + target_node: None, + from_parent: None, + sender_euid: from.process.task.euid(), + from: from.clone(), + to, + code: trd.code, + flags: trd.flags, + data_size: trd.data_size as _, + offsets_size: trd.offsets_size as _, + data_address: alloc.ptr, + allocation <- kernel::new_spinlock!(Some(alloc.success()), "Transaction::new"), + is_outstanding: AtomicBool::new(false), + txn_security_ctx_off: None, + oneway_spam_detected, + start_time: Instant::now(), + }))?) + } + + #[inline(never)] + pub(crate) fn debug_print_inner(&self, m: &SeqFile, prefix: &str) { + seq_print!( + m, + "{}{}: from {}:{} to {} code {:x} flags {:x} elapsed {}ms", + prefix, + self.debug_id, + self.from.process.task.pid(), + self.from.id, + self.to.task.pid(), + self.code, + self.flags, + self.start_time.elapsed().as_millis(), + ); + if let Some(target_node) = &self.target_node { + seq_print!(m, " node {}", target_node.debug_id); + } + seq_print!(m, " size {}:{}\n", self.data_size, self.offsets_size); + } + + /// Determines if the transaction is stacked on top of the given transaction. + pub(crate) fn is_stacked_on(&self, onext: &Option>) -> bool { + match (&self.from_parent, onext) { + (None, None) => true, + (Some(from_parent), Some(next)) => Arc::ptr_eq(from_parent, next), + _ => false, + } + } + + /// Returns a pointer to the next transaction on the transaction stack, if there is one. + pub(crate) fn clone_next(&self) -> Option> { + Some(self.from_parent.as_ref()?.clone()) + } + + /// Searches in the transaction stack for a thread that belongs to the target process. This is + /// useful when finding a target for a new transaction: if the node belongs to a process that + /// is already part of the transaction stack, we reuse the thread. + fn find_target_thread(&self) -> Option> { + let mut it = &self.from_parent; + while let Some(transaction) = it { + if Arc::ptr_eq(&transaction.from.process, &self.to) { + return Some(transaction.from.clone()); + } + it = &transaction.from_parent; + } + None + } + + /// Searches in the transaction stack for a transaction originating at the given thread. + pub(crate) fn find_from(&self, thread: &Thread) -> Option<&DArc> { + let mut it = &self.from_parent; + while let Some(transaction) = it { + if core::ptr::eq(thread, transaction.from.as_ref()) { + return Some(transaction); + } + + it = &transaction.from_parent; + } + None + } + + pub(crate) fn set_outstanding(&self, to_process: &mut ProcessInner) { + // No race because this method is only called once. + if !self.is_outstanding.load(Ordering::Relaxed) { + self.is_outstanding.store(true, Ordering::Relaxed); + to_process.add_outstanding_txn(); + } + } + + /// Decrement `outstanding_txns` in `to` if it hasn't already been decremented. 
+ fn drop_outstanding_txn(&self) { + // No race because this is called at most twice, and one of the calls are in the + // destructor, which is guaranteed to not race with any other operations on the + // transaction. It also cannot race with `set_outstanding`, since submission happens + // before delivery. + if self.is_outstanding.load(Ordering::Relaxed) { + self.is_outstanding.store(false, Ordering::Relaxed); + self.to.drop_outstanding_txn(); + } + } + + /// Submits the transaction to a work queue. Uses a thread if there is one in the transaction + /// stack, otherwise uses the destination process. + /// + /// Not used for replies. + pub(crate) fn submit(self: DLArc) -> BinderResult { + // Defined before `process_inner` so that the destructor runs after releasing the lock. + let mut _t_outdated; + + let oneway = self.flags & TF_ONE_WAY != 0; + let process = self.to.clone(); + let mut process_inner = process.inner.lock(); + + self.set_outstanding(&mut process_inner); + + if oneway { + if let Some(target_node) = self.target_node.clone() { + if process_inner.is_frozen { + process_inner.async_recv = true; + if self.flags & TF_UPDATE_TXN != 0 { + if let Some(t_outdated) = + target_node.take_outdated_transaction(&self, &mut process_inner) + { + // Save the transaction to be dropped after locks are released. + _t_outdated = t_outdated; + } + } + } + match target_node.submit_oneway(self, &mut process_inner) { + Ok(()) => {} + Err((err, work)) => { + drop(process_inner); + // Drop work after releasing process lock. + drop(work); + return Err(err); + } + } + + if process_inner.is_frozen { + return Err(BinderError::new_frozen_oneway()); + } else { + return Ok(()); + } + } else { + pr_err!("Failed to submit oneway transaction to node."); + } + } + + if process_inner.is_frozen { + process_inner.sync_recv = true; + return Err(BinderError::new_frozen()); + } + + let res = if let Some(thread) = self.find_target_thread() { + match thread.push_work(self) { + PushWorkRes::Ok => Ok(()), + PushWorkRes::FailedDead(me) => Err((BinderError::new_dead(), me)), + } + } else { + process_inner.push_work(self) + }; + drop(process_inner); + + match res { + Ok(()) => Ok(()), + Err((err, work)) => { + // Drop work after releasing process lock. + drop(work); + Err(err) + } + } + } + + /// Check whether one oneway transaction can supersede another. + pub(crate) fn can_replace(&self, old: &Transaction) -> bool { + if self.from.process.task.pid() != old.from.process.task.pid() { + return false; + } + + if self.flags & old.flags & (TF_ONE_WAY | TF_UPDATE_TXN) != (TF_ONE_WAY | TF_UPDATE_TXN) { + return false; + } + + let target_node_match = match (self.target_node.as_ref(), old.target_node.as_ref()) { + (None, None) => true, + (Some(tn1), Some(tn2)) => Arc::ptr_eq(tn1, tn2), + _ => false, + }; + + self.code == old.code && self.flags == old.flags && target_node_match + } + + fn prepare_file_list(&self) -> Result { + let mut alloc = self.allocation.lock().take().ok_or(ESRCH)?; + + match alloc.translate_fds() { + Ok(translated) => { + *self.allocation.lock() = Some(alloc); + Ok(translated) + } + Err(err) => { + // Free the allocation eagerly. 
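+                // Dropping the allocation here frees the buffer immediately
+                // instead of waiting for the transaction object itself to be
+                // dropped.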
+ drop(alloc); + Err(err) + } + } + } +} + +impl DeliverToRead for Transaction { + fn do_work( + self: DArc, + thread: &Thread, + writer: &mut BinderReturnWriter<'_>, + ) -> Result { + let send_failed_reply = ScopeGuard::new(|| { + if self.target_node.is_some() && self.flags & TF_ONE_WAY == 0 { + let reply = Err(BR_FAILED_REPLY); + self.from.deliver_reply(reply, &self); + } + self.drop_outstanding_txn(); + }); + + let files = if let Ok(list) = self.prepare_file_list() { + list + } else { + // On failure to process the list, we send a reply back to the sender and ignore the + // transaction on the recipient. + return Ok(true); + }; + + let mut tr_sec = BinderTransactionDataSecctx::default(); + let tr = tr_sec.tr_data(); + if let Some(target_node) = &self.target_node { + let (ptr, cookie) = target_node.get_id(); + tr.target.ptr = ptr as _; + tr.cookie = cookie as _; + }; + tr.code = self.code; + tr.flags = self.flags; + tr.data_size = self.data_size as _; + tr.data.ptr.buffer = self.data_address as _; + tr.offsets_size = self.offsets_size as _; + if tr.offsets_size > 0 { + tr.data.ptr.offsets = (self.data_address + ptr_align(self.data_size).unwrap()) as _; + } + tr.sender_euid = self.sender_euid.into_uid_in_current_ns(); + tr.sender_pid = 0; + if self.target_node.is_some() && self.flags & TF_ONE_WAY == 0 { + // Not a reply and not one-way. + tr.sender_pid = self.from.process.pid_in_current_ns(); + } + let code = if self.target_node.is_none() { + BR_REPLY + } else if self.txn_security_ctx_off.is_some() { + BR_TRANSACTION_SEC_CTX + } else { + BR_TRANSACTION + }; + + // Write the transaction code and data to the user buffer. + writer.write_code(code)?; + if let Some(off) = self.txn_security_ctx_off { + tr_sec.secctx = (self.data_address + off) as u64; + writer.write_payload(&tr_sec)?; + } else { + writer.write_payload(&*tr)?; + } + + let mut alloc = self.allocation.lock().take().ok_or(ESRCH)?; + + // Dismiss the completion of transaction with a failure. No failure paths are allowed from + // here on out. + send_failed_reply.dismiss(); + + // Commit files, and set FDs in FDA to be closed on buffer free. + let close_on_free = files.commit(); + alloc.set_info_close_on_free(close_on_free); + + // It is now the user's responsibility to clear the allocation. + alloc.keep_alive(); + + self.drop_outstanding_txn(); + + // When this is not a reply and not a oneway transaction, update `current_transaction`. If + // it's a reply, `current_transaction` has already been updated appropriately. + if self.target_node.is_some() && tr_sec.transaction_data.flags & TF_ONE_WAY == 0 { + thread.set_current_transaction(self); + } + + Ok(false) + } + + fn cancel(self: DArc) { + let allocation = self.allocation.lock().take(); + drop(allocation); + + // If this is not a reply or oneway transaction, then send a dead reply. 
+ if self.target_node.is_some() && self.flags & TF_ONE_WAY == 0 { + let reply = Err(BR_DEAD_REPLY); + self.from.deliver_reply(reply, &self); + } + + self.drop_outstanding_txn(); + } + + fn should_sync_wakeup(&self) -> bool { + self.flags & TF_ONE_WAY == 0 + } + + fn debug_print(&self, m: &SeqFile, _prefix: &str, tprefix: &str) -> Result<()> { + self.debug_print_inner(m, tprefix); + Ok(()) + } +} + +#[pinned_drop] +impl PinnedDrop for Transaction { + fn drop(self: Pin<&mut Self>) { + self.drop_outstanding_txn(); + } +} diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 1fd92021a573..03ee4c7010d7 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -38,7 +38,7 @@ enum { BINDER_TYPE_PTR = B_PACK_CHARS('p', 't', '*', B_TYPE_LARGE), }; -enum { +enum flat_binder_object_flags { FLAT_BINDER_FLAG_PRIORITY_MASK = 0xff, FLAT_BINDER_FLAG_ACCEPTS_FDS = 0x100, diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index 84d60635e8a9..9b3a4ab95818 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -71,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -99,3 +101,9 @@ const xa_mark_t RUST_CONST_HELPER_XA_PRESENT = XA_PRESENT; const gfp_t RUST_CONST_HELPER_XA_FLAGS_ALLOC = XA_FLAGS_ALLOC; const gfp_t RUST_CONST_HELPER_XA_FLAGS_ALLOC1 = XA_FLAGS_ALLOC1; + +#if IS_ENABLED(CONFIG_ANDROID_BINDER_IPC_RUST) +#include "../../drivers/android/binder/rust_binder.h" +#include "../../drivers/android/binder/rust_binder_events.h" +#include "../../drivers/android/binder/page_range_helper.h" +#endif diff --git a/rust/helpers/binder.c b/rust/helpers/binder.c new file mode 100644 index 000000000000..224d38a92f1d --- /dev/null +++ b/rust/helpers/binder.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2025 Google LLC. 
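+ *
+ * Helpers for list_lru and task_work functions that are implemented as
+ * inline functions in the C headers and therefore cannot be called
+ * directly through the bindgen-generated Rust bindings.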
+ */ + +#include +#include + +unsigned long rust_helper_list_lru_count(struct list_lru *lru) +{ + return list_lru_count(lru); +} + +unsigned long rust_helper_list_lru_walk(struct list_lru *lru, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long nr_to_walk) +{ + return list_lru_walk(lru, isolate, cb_arg, nr_to_walk); +} + +void rust_helper_init_task_work(struct callback_head *twork, + task_work_func_t func) +{ + init_task_work(twork, func); +} diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c index 7cf7fe95e41d..8e8277bdddca 100644 --- a/rust/helpers/helpers.c +++ b/rust/helpers/helpers.c @@ -8,6 +8,7 @@ */ #include "auxiliary.c" +#include "binder.c" #include "blk.c" #include "bug.c" #include "build_assert.c" diff --git a/rust/helpers/page.c b/rust/helpers/page.c index b3f2b8fbf87f..7144de5a61db 100644 --- a/rust/helpers/page.c +++ b/rust/helpers/page.c @@ -2,6 +2,7 @@ #include #include +#include struct page *rust_helper_alloc_pages(gfp_t gfp_mask, unsigned int order) { @@ -17,3 +18,10 @@ void rust_helper_kunmap_local(const void *addr) { kunmap_local(addr); } + +#ifndef NODE_NOT_IN_PAGE_FLAGS +int rust_helper_page_to_nid(const struct page *page) +{ + return page_to_nid(page); +} +#endif diff --git a/rust/helpers/security.c b/rust/helpers/security.c index 0c4c2065df28..ca22da09548d 100644 --- a/rust/helpers/security.c +++ b/rust/helpers/security.c @@ -17,4 +17,28 @@ void rust_helper_security_release_secctx(struct lsm_context *cp) { security_release_secctx(cp); } + +int rust_helper_security_binder_set_context_mgr(const struct cred *mgr) +{ + return security_binder_set_context_mgr(mgr); +} + +int rust_helper_security_binder_transaction(const struct cred *from, + const struct cred *to) +{ + return security_binder_transaction(from, to); +} + +int rust_helper_security_binder_transfer_binder(const struct cred *from, + const struct cred *to) +{ + return security_binder_transfer_binder(from, to); +} + +int rust_helper_security_binder_transfer_file(const struct cred *from, + const struct cred *to, + const struct file *file) +{ + return security_binder_transfer_file(from, to, file); +} #endif diff --git a/rust/kernel/cred.rs b/rust/kernel/cred.rs index 2599f01e8b28..3aa2e4c4a50c 100644 --- a/rust/kernel/cred.rs +++ b/rust/kernel/cred.rs @@ -54,6 +54,12 @@ impl Credential { unsafe { &*ptr.cast() } } + /// Returns a raw pointer to the inner credential. + #[inline] + pub fn as_ptr(&self) -> *const bindings::cred { + self.0.get() + } + /// Get the id for this security context. #[inline] pub fn get_secid(&self) -> u32 { diff --git a/rust/kernel/page.rs b/rust/kernel/page.rs index 7c1b17246ed5..811fe30e8e6f 100644 --- a/rust/kernel/page.rs +++ b/rust/kernel/page.rs @@ -85,6 +85,12 @@ impl Page { self.page.as_ptr() } + /// Get the node id containing this page. + pub fn nid(&self) -> i32 { + // SAFETY: Always safe to call with a valid page. + unsafe { bindings::page_to_nid(self.as_ptr()) } + } + /// Runs a piece of code with this page mapped to an address. /// /// The page is unmapped when this call returns. diff --git a/rust/kernel/security.rs b/rust/kernel/security.rs index 0c63e9e7e564..9d271695265f 100644 --- a/rust/kernel/security.rs +++ b/rust/kernel/security.rs @@ -8,9 +8,46 @@ use crate::{ bindings, + cred::Credential, error::{to_result, Result}, + fs::File, }; +/// Calls the security modules to determine if the given task can become the manager of a binder +/// context. 
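+///
+/// This wraps the `security_binder_set_context_mgr()` LSM hook.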
+#[inline] +pub fn binder_set_context_mgr(mgr: &Credential) -> Result { + // SAFETY: `mrg.0` is valid because the shared reference guarantees a nonzero refcount. + to_result(unsafe { bindings::security_binder_set_context_mgr(mgr.as_ptr()) }) +} + +/// Calls the security modules to determine if binder transactions are allowed from task `from` to +/// task `to`. +#[inline] +pub fn binder_transaction(from: &Credential, to: &Credential) -> Result { + // SAFETY: `from` and `to` are valid because the shared references guarantee nonzero refcounts. + to_result(unsafe { bindings::security_binder_transaction(from.as_ptr(), to.as_ptr()) }) +} + +/// Calls the security modules to determine if task `from` is allowed to send binder objects +/// (owned by itself or other processes) to task `to` through a binder transaction. +#[inline] +pub fn binder_transfer_binder(from: &Credential, to: &Credential) -> Result { + // SAFETY: `from` and `to` are valid because the shared references guarantee nonzero refcounts. + to_result(unsafe { bindings::security_binder_transfer_binder(from.as_ptr(), to.as_ptr()) }) +} + +/// Calls the security modules to determine if task `from` is allowed to send the given file to +/// task `to` (which would get its own file descriptor) through a binder transaction. +#[inline] +pub fn binder_transfer_file(from: &Credential, to: &Credential, file: &File) -> Result { + // SAFETY: `from`, `to` and `file` are valid because the shared references guarantee nonzero + // refcounts. + to_result(unsafe { + bindings::security_binder_transfer_file(from.as_ptr(), to.as_ptr(), file.as_ptr()) + }) +} + /// A security context string. /// /// # Invariants diff --git a/rust/uapi/uapi_helper.h b/rust/uapi/uapi_helper.h index 1409441359f5..de3562b08d0c 100644 --- a/rust/uapi/uapi_helper.h +++ b/rust/uapi/uapi_helper.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 01b4a3061b1d4ded108e1a700b4414c00662954c Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Mon, 8 Sep 2025 14:12:55 +0300 Subject: wifi: nl80211: Add more configuration options for NAN commands Current NAN APIs have only basic configuration for master preference and operating bands. Add and parse additional parameters which provide more control over NAN synchronization. The newly added attributes allow to publish additional NAN attributes and vendor elements in NAN beacons, control scan and discovery beacons periodicity, enable/disable DW notifications etc. Signed-off-by: Andrei Otcheretianski tested: Miriam Rachel Korenblit Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.a4779492bf8e.I375feb919bd72358173766b9fe10010c40796b33@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 60 +++++++++ include/uapi/linux/nl80211.h | 110 +++++++++++++++- net/wireless/nl80211.c | 298 ++++++++++++++++++++++++++++++++++++++----- 3 files changed, 431 insertions(+), 37 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 4072a67c9cc9..e2f4ca500ea3 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3912,6 +3912,38 @@ struct cfg80211_qos_map { struct cfg80211_dscp_range up[8]; }; +/** + * struct cfg80211_nan_band_config - NAN band specific configuration + * + * @chan: Pointer to the IEEE 802.11 channel structure. The channel to be used + * for NAN operations on this band. For 2.4 GHz band, this is always + * channel 6. 
For 5 GHz band, the channel is either 44 or 149, according
+ *	to the regulatory constraints. If the chan pointer is NULL, the entire
+ *	band configuration entry is considered invalid and should not be used.
+ * @rssi_close: RSSI close threshold used for the NAN state transition
+ *	algorithm, as described in chapters 3.3.6 and 3.3.7 "NAN Device Role
+ *	and State Transition" of Wi-Fi Aware Specification v4.0. If not
+ *	specified (set to 0), the default device value is used. The value
+ *	should be greater than -60 dBm.
+ * @rssi_middle: RSSI middle threshold used for the NAN state transition
+ *	algorithm, as described in chapters 3.3.6 and 3.3.7 "NAN Device Role
+ *	and State Transition" of Wi-Fi Aware Specification v4.0. If not
+ *	specified (set to 0), the default device value is used. The value
+ *	should be greater than -75 dBm and less than rssi_close.
+ * @awake_dw_interval: Committed DW interval. Valid range: 0-5. The value 0
+ *	indicates no wakeup for the DW and cannot be used on the 2.4 GHz band;
+ *	otherwise the device wakes up for every 2^(n-1) DW.
+ * @disable_scan: If true, the device will not scan this band for cluster
+ *	merge. Disabling scan on the 2.4 GHz band is not allowed.
+ */
+struct cfg80211_nan_band_config {
+	struct ieee80211_channel *chan;
+	s8 rssi_close;
+	s8 rssi_middle;
+	u8 awake_dw_interval;
+	bool disable_scan;
+};
+
 /**
  * struct cfg80211_nan_conf - NAN configuration
  *
@@ -3921,10 +3953,31 @@ struct cfg80211_qos_map {
  * @bands: operating bands, a bitmap of &enum nl80211_band values.
  *	For instance, for NL80211_BAND_2GHZ, bit 0 would be set
  *	(i.e. BIT(NL80211_BAND_2GHZ)).
+ * @cluster_id: cluster ID used for NAN synchronization. This is a MAC address
+ *	that can take a value from 50-6F-9A-01-00-00 to 50-6F-9A-01-FF-FF.
+ *	If NULL, the device will pick a random Cluster ID.
+ * @scan_period: period (in seconds) between NAN scans.
+ * @scan_dwell_time: dwell time (in milliseconds) for NAN scans.
+ * @discovery_beacon_interval: interval (in TUs) for discovery beacons.
+ * @band_cfgs: array of band specific configurations, indexed by
+ *	&enum nl80211_band values.
+ * @extra_nan_attrs: pointer to additional NAN attributes.
+ * @extra_nan_attrs_len: length of the additional NAN attributes.
+ * @vendor_elems: pointer to vendor-specific elements.
+ * @vendor_elems_len: length of the vendor-specific elements.
  */
 struct cfg80211_nan_conf {
 	u8 master_pref;
 	u8 bands;
+	const u8 *cluster_id;
+	u16 scan_period;
+	u16 scan_dwell_time;
+	u8 discovery_beacon_interval;
+	struct cfg80211_nan_band_config band_cfgs[NUM_NL80211_BANDS];
+	const u8 *extra_nan_attrs;
+	u16 extra_nan_attrs_len;
+	const u8 *vendor_elems;
+	u16 vendor_elems_len;
 };
 
 /**
@@ -3933,10 +3986,17 @@ struct cfg80211_nan_conf {
  *
  * @CFG80211_NAN_CONF_CHANGED_PREF: master preference
  * @CFG80211_NAN_CONF_CHANGED_BANDS: operating bands
+ * @CFG80211_NAN_CONF_CHANGED_CONFIG: changed additional configuration.
+ *	When this flag is set, it indicates that some additional attribute(s)
+ *	(other than master_pref and bands) have been changed. In this case,
+ *	all the unchanged attributes will be properly configured to their
+ *	previous values. The driver doesn't need to store any
+ *	previous configuration besides master_pref and bands.
*/ enum cfg80211_nan_conf_changes { CFG80211_NAN_CONF_CHANGED_PREF = BIT(0), CFG80211_NAN_CONF_CHANGED_BANDS = BIT(1), + CFG80211_NAN_CONF_CHANGED_CONFIG = BIT(2), }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index aed0b4c5d5e8..20b8202a3d58 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1085,8 +1085,9 @@ * %NL80211_ATTR_NAN_MASTER_PREF attribute and optional * %NL80211_ATTR_BANDS attributes. If %NL80211_ATTR_BANDS is * omitted or set to 0, it means don't-care and the device will - * decide what to use. After this command NAN functions can be - * added. + * decide what to use. Additional cluster configuration may be + * optionally provided with %NL80211_ATTR_NAN_CONFIG. + * After this command NAN functions can be added. * @NL80211_CMD_STOP_NAN: Stop the NAN operation, identified by * its %NL80211_ATTR_WDEV interface. * @NL80211_CMD_ADD_NAN_FUNCTION: Add a NAN function. The function is defined @@ -1115,6 +1116,10 @@ * current configuration is not changed. If it is present but * set to zero, the configuration is changed to don't-care * (i.e. the device can decide what to do). + * Additional parameters may be provided with + * %NL80211_ATTR_NAN_CONFIG. User space should provide all previously + * configured nested attributes under %NL80211_ATTR_NAN_CONFIG, even if + * only a subset was changed. * @NL80211_CMD_NAN_MATCH: Notification sent when a match is reported. * This will contain a %NL80211_ATTR_NAN_MATCH nested attribute and * %NL80211_ATTR_COOKIE. @@ -2936,6 +2941,12 @@ enum nl80211_commands { * indicate that it wants strict checking on the BSS parameters to be * modified. * + * @NL80211_ATTR_NAN_CONFIG: Nested attribute for + * extended NAN cluster configuration. This is used with + * %NL80211_CMD_START_NAN and %NL80211_CMD_CHANGE_NAN_CONFIG. + * See &enum nl80211_nan_conf_attributes for details. + * This attribute is optional. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3498,6 +3509,7 @@ enum nl80211_attrs { NL80211_ATTR_S1G_LONG_BEACON_PERIOD, NL80211_ATTR_S1G_SHORT_BEACON, NL80211_ATTR_BSS_PARAM, + NL80211_ATTR_NAN_CONFIG, /* add attributes here, update the policy in nl80211.c */ @@ -7323,6 +7335,100 @@ enum nl80211_nan_match_attributes { NL80211_NAN_MATCH_ATTR_MAX = NUM_NL80211_NAN_MATCH_ATTR - 1 }; +/** + * enum nl80211_nan_band_conf_attributes - NAN band configuration attributes + * @__NL80211_NAN_BAND_CONF_INVALID: Invalid. + * @NL80211_NAN_BAND_CONF_BAND: Band for which the configuration is + * being set. The value is according to &enum nl80211_band (u8). + * @NL80211_NAN_BAND_CONF_FREQ: Discovery frequency. This attribute shall not + * be present on 2.4 GHZ band. On 5 GHz band its presence is optional. + * The allowed values are 5220 (channel 44) or 5745 (channel 149). + * If not present, channel 149 is used if allowed, otherwise channel 44 + * will be selected. The value is in MHz (u16). + * @NL80211_NAN_BAND_CONF_RSSI_CLOSE: RSSI close threshold used for NAN state + * transition algorithm as described in chapters 3.3.6 and 3.3.7 "NAN + * Device Role and State Transition" of Wi-Fi Aware (TM) Specification + * v4.0. If not specified, default device value is used. The value should + * be greater than -60 dBm (s8). 
+ * @NL80211_NAN_BAND_CONF_RSSI_MIDDLE: RSSI middle threshold used for NAN state + * transition algorithm as described in chapters 3.3.6 and 3.3.7 "NAN + * Device Role and State Transition" of Wi-Fi Aware (TM) Specification + * v4.0. If not present, default device value is used. The value should be + * greater than -75 dBm and less than %NL80211_NAN_BAND_CONF_RSSI_CLOSE + * (s8). + * @NL80211_NAN_BAND_CONF_WAKE_DW: Committed DW information (values 0-5). + * Value 0 means that the device will not wake up during the + * discovery window. Values 1-5 mean that the device will wake up + * during each 2^(n - 1) discovery window, where n is the value of + * this attribute. Setting this attribute to 0 is not allowed on + * 2.4 GHz band (u8). This is an optional parameter (default is 1). + * @NL80211_NAN_BAND_CONF_DISABLE_SCAN: Optional flag attribute to disable + * scanning (for cluster merge) on the band. If set, the device will not + * scan on this band anymore. Disabling scanning on 2.4 GHz band is not + * allowed. + * @NUM_NL80211_NAN_BAND_CONF_ATTR: Internal. + * @NL80211_NAN_BAND_CONF_ATTR_MAX: Highest NAN band configuration attribute. + * + * These attributes are used to configure NAN band-specific parameters. Note, + * that both RSSI attributes should be configured (or both left unset). + */ +enum nl80211_nan_band_conf_attributes { + __NL80211_NAN_BAND_CONF_INVALID, + NL80211_NAN_BAND_CONF_BAND, + NL80211_NAN_BAND_CONF_FREQ, + NL80211_NAN_BAND_CONF_RSSI_CLOSE, + NL80211_NAN_BAND_CONF_RSSI_MIDDLE, + NL80211_NAN_BAND_CONF_WAKE_DW, + NL80211_NAN_BAND_CONF_DISABLE_SCAN, + + /* keep last */ + NUM_NL80211_NAN_BAND_CONF_ATTR, + NL80211_NAN_BAND_CONF_ATTR_MAX = NUM_NL80211_NAN_BAND_CONF_ATTR - 1, +}; + +/** + * enum nl80211_nan_conf_attributes - NAN configuration attributes + * @__NL80211_NAN_CONF_INVALID: Invalid attribute, used for validation. + * @NL80211_NAN_CONF_CLUSTER_ID: ID for the NAN cluster. This is a MAC + * address that can take values from 50-6F-9A-01-00-00 to + * 50-6F-9A-01-FF-FF. This attribute is optional. If not present, + * a random Cluster ID will be chosen. + * @NL80211_NAN_CONF_EXTRA_ATTRS: Additional NAN attributes to be + * published in the beacons. This is an optional byte array. + * @NL80211_NAN_CONF_VENDOR_ELEMS: Vendor-specific elements that will + * be published in the beacons. This is an optional byte array. + * @NL80211_NAN_CONF_BAND_CONFIGS: This is a nested array attribute, + * containing multiple entries for each supported band. Each band + * configuration consists of &enum nl80211_nan_band_conf_attributes. + * @NL80211_NAN_CONF_SCAN_PERIOD: Scan period in seconds. If not configured, + * device default is used. Zero value will disable scanning. + * This is u16 (optional). + * @NL80211_NAN_CONF_SCAN_DWELL_TIME: Scan dwell time in TUs per channel. + * Only non-zero values are valid. If not configured the device default + * value is used. This is u16 (optional) + * @NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL: Discovery beacon interval + * in TUs. Valid range is 50-200 TUs. If not configured the device default + * value is used. This is u8 (optional) + * @NUM_NL80211_NAN_CONF_ATTR: Internal. + * @NL80211_NAN_CONF_ATTR_MAX: Highest NAN configuration attribute. + * + * These attributes are used to configure NAN-specific parameters. 
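+ *
+ * Band specific configuration is passed as a nested array attribute in
+ * %NL80211_NAN_CONF_BAND_CONFIGS, with one entry per band to be configured,
+ * each entry built from &enum nl80211_nan_band_conf_attributes.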
+ */ +enum nl80211_nan_conf_attributes { + __NL80211_NAN_CONF_INVALID, + NL80211_NAN_CONF_CLUSTER_ID, + NL80211_NAN_CONF_EXTRA_ATTRS, + NL80211_NAN_CONF_VENDOR_ELEMS, + NL80211_NAN_CONF_BAND_CONFIGS, + NL80211_NAN_CONF_SCAN_PERIOD, + NL80211_NAN_CONF_SCAN_DWELL_TIME, + NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL, + + /* keep last */ + NUM_NL80211_NAN_CONF_ATTR, + NL80211_NAN_CONF_ATTR_MAX = NUM_NL80211_NAN_CONF_ATTR - 1, +}; + /** * enum nl80211_external_auth_action - Action to perform with external * authentication request. Used by NL80211_ATTR_EXTERNAL_AUTH_ACTION. diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b7bc7e5e81dd..04679acc8135 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -312,6 +312,26 @@ static int validate_supported_selectors(const struct nlattr *attr, return 0; } +static int validate_nan_cluster_id(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + const u8 *data = nla_data(attr); + unsigned int len = nla_len(attr); + static const u8 cluster_id_prefix[4] = {0x50, 0x6f, 0x9a, 0x1}; + + if (len != ETH_ALEN) { + NL_SET_ERR_MSG_ATTR(extack, attr, "bad cluster id length"); + return -EINVAL; + } + + if (memcmp(data, cluster_id_prefix, sizeof(cluster_id_prefix))) { + NL_SET_ERR_MSG_ATTR(extack, attr, "invalid cluster id prefix"); + return -EINVAL; + } + + return 0; +} + /* policy for the attributes */ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR]; @@ -500,6 +520,35 @@ nl80211_s1g_short_beacon[NL80211_S1G_SHORT_BEACON_ATTR_MAX + 1] = { IEEE80211_MAX_DATA_LEN), }; +static const struct nla_policy +nl80211_nan_band_conf_policy[NL80211_NAN_BAND_CONF_ATTR_MAX + 1] = { + [NL80211_NAN_BAND_CONF_BAND] = NLA_POLICY_MAX(NLA_U8, + NUM_NL80211_BANDS - 1), + [NL80211_NAN_BAND_CONF_FREQ] = { .type = NLA_U16 }, + [NL80211_NAN_BAND_CONF_RSSI_CLOSE] = NLA_POLICY_MIN(NLA_S8, -59), + [NL80211_NAN_BAND_CONF_RSSI_MIDDLE] = NLA_POLICY_MIN(NLA_S8, -74), + [NL80211_NAN_BAND_CONF_WAKE_DW] = NLA_POLICY_MAX(NLA_U8, 5), + [NL80211_NAN_BAND_CONF_DISABLE_SCAN] = { .type = NLA_FLAG }, +}; + +static const struct nla_policy +nl80211_nan_conf_policy[NL80211_NAN_CONF_ATTR_MAX + 1] = { + [NL80211_NAN_CONF_CLUSTER_ID] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_nan_cluster_id, + ETH_ALEN), + [NL80211_NAN_CONF_EXTRA_ATTRS] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_DATA_LEN}, + [NL80211_NAN_CONF_VENDOR_ELEMS] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr, + IEEE80211_MAX_DATA_LEN), + [NL80211_NAN_CONF_BAND_CONFIGS] = + NLA_POLICY_NESTED_ARRAY(nl80211_nan_band_conf_policy), + [NL80211_NAN_CONF_SCAN_PERIOD] = { .type = NLA_U16 }, + [NL80211_NAN_CONF_SCAN_DWELL_TIME] = NLA_POLICY_RANGE(NLA_U16, 50, 512), + [NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL] = + NLA_POLICY_RANGE(NLA_U8, 50, 200), +}; + static const struct netlink_range_validation nl80211_punct_bitmap_range = { .min = 0, .max = 0xffff, @@ -769,6 +818,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_NAN_MASTER_PREF] = NLA_POLICY_MIN(NLA_U8, 1), [NL80211_ATTR_BANDS] = { .type = NLA_U32 }, + [NL80211_ATTR_NAN_CONFIG] = NLA_POLICY_NESTED(nl80211_nan_conf_policy), [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED }, [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY, .len = FILS_MAX_KEK_LEN }, @@ -15398,6 +15448,211 @@ static int nl80211_stop_p2p_device(struct sk_buff *skb, struct genl_info *info) return 0; } +static struct ieee80211_channel 
*nl80211_get_nan_channel(struct wiphy *wiphy, + int freq) +{ + struct ieee80211_channel *chan; + struct cfg80211_chan_def def; + + /* Check if the frequency is valid for NAN */ + if (freq != 5220 && freq != 5745 && freq != 2437) + return NULL; + + chan = ieee80211_get_channel(wiphy, freq); + if (!chan) + return NULL; + + cfg80211_chandef_create(&def, chan, NL80211_CHAN_NO_HT); + + /* Check if the channel is allowed */ + if (cfg80211_reg_can_beacon(wiphy, &def, NL80211_IFTYPE_NAN)) + return chan; + + return NULL; +} + +static int nl80211_parse_nan_band_config(struct wiphy *wiphy, + struct nlattr **tb, + struct cfg80211_nan_band_config *cfg, + enum nl80211_band band) +{ + if (BIT(band) & ~(u32)wiphy->nan_supported_bands) + return -EINVAL; + + if (tb[NL80211_NAN_BAND_CONF_FREQ]) { + u16 freq = nla_get_u16(tb[NL80211_NAN_BAND_CONF_FREQ]); + + if (band != NL80211_BAND_5GHZ) + return -EINVAL; + + cfg->chan = nl80211_get_nan_channel(wiphy, freq); + if (!cfg->chan) + return -EINVAL; + } + + if (tb[NL80211_NAN_BAND_CONF_RSSI_CLOSE]) { + cfg->rssi_close = + nla_get_s8(tb[NL80211_NAN_BAND_CONF_RSSI_CLOSE]); + if (!tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE]) + return -EINVAL; + } + + if (tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE]) { + cfg->rssi_middle = + nla_get_s8(tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE]); + if (!cfg->rssi_close || cfg->rssi_middle >= cfg->rssi_close) + return -EINVAL; + } + + if (tb[NL80211_NAN_BAND_CONF_WAKE_DW]) { + cfg->awake_dw_interval = + nla_get_u8(tb[NL80211_NAN_BAND_CONF_WAKE_DW]); + + if (band == NL80211_BAND_2GHZ && cfg->awake_dw_interval == 0) + return -EINVAL; + } + + cfg->disable_scan = + nla_get_flag(tb[NL80211_NAN_BAND_CONF_DISABLE_SCAN]); + return 0; +} + +static int nl80211_parse_nan_conf(struct wiphy *wiphy, + struct genl_info *info, + struct cfg80211_nan_conf *conf, + u32 *changed_flags) +{ + struct nlattr *attrs[NL80211_NAN_CONF_ATTR_MAX + 1]; + int err, rem; + u32 changed = 0; + struct nlattr *band_config; + + if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) { + conf->master_pref = + nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); + + changed |= CFG80211_NAN_CONF_CHANGED_PREF; + } + + if (info->attrs[NL80211_ATTR_BANDS]) { + u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); + + if (bands & ~(u32)wiphy->nan_supported_bands) + return -EOPNOTSUPP; + + if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) + return -EINVAL; + + conf->bands = bands; + changed |= CFG80211_NAN_CONF_CHANGED_BANDS; + } + + conf->band_cfgs[NL80211_BAND_2GHZ].awake_dw_interval = 1; + if (conf->bands & BIT(NL80211_BAND_5GHZ) || !conf->bands) + conf->band_cfgs[NL80211_BAND_5GHZ].awake_dw_interval = 1; + + /* On 2.4 GHz band use channel 6 */ + conf->band_cfgs[NL80211_BAND_2GHZ].chan = + nl80211_get_nan_channel(wiphy, 2437); + if (!conf->band_cfgs[NL80211_BAND_2GHZ].chan) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_NAN_CONFIG]) + goto out; + + err = nla_parse_nested(attrs, NL80211_NAN_CONF_ATTR_MAX, + info->attrs[NL80211_ATTR_NAN_CONFIG], NULL, + info->extack); + if (err) + return err; + + changed |= CFG80211_NAN_CONF_CHANGED_CONFIG; + if (attrs[NL80211_NAN_CONF_CLUSTER_ID]) + conf->cluster_id = + nla_data(attrs[NL80211_NAN_CONF_CLUSTER_ID]); + + if (attrs[NL80211_NAN_CONF_EXTRA_ATTRS]) { + conf->extra_nan_attrs = + nla_data(attrs[NL80211_NAN_CONF_EXTRA_ATTRS]); + conf->extra_nan_attrs_len = + nla_len(attrs[NL80211_NAN_CONF_EXTRA_ATTRS]); + } + + if (attrs[NL80211_NAN_CONF_VENDOR_ELEMS]) { + conf->vendor_elems = + nla_data(attrs[NL80211_NAN_CONF_VENDOR_ELEMS]); + conf->vendor_elems_len = + 
nla_len(attrs[NL80211_NAN_CONF_VENDOR_ELEMS]); + } + + if (attrs[NL80211_NAN_CONF_BAND_CONFIGS]) { + nla_for_each_nested(band_config, + attrs[NL80211_NAN_CONF_BAND_CONFIGS], + rem) { + enum nl80211_band band; + struct cfg80211_nan_band_config *cfg; + struct nlattr *tb[NL80211_NAN_BAND_CONF_ATTR_MAX + 1]; + + err = nla_parse_nested(tb, + NL80211_NAN_BAND_CONF_ATTR_MAX, + band_config, NULL, + info->extack); + if (err) + return err; + + if (!tb[NL80211_NAN_BAND_CONF_BAND]) + return -EINVAL; + + band = nla_get_u8(tb[NL80211_NAN_BAND_CONF_BAND]); + if (conf->bands && !(conf->bands & BIT(band))) + return -EINVAL; + + cfg = &conf->band_cfgs[band]; + + err = nl80211_parse_nan_band_config(wiphy, tb, cfg, + band); + if (err) + return err; + } + } + + if (attrs[NL80211_NAN_CONF_SCAN_PERIOD]) + conf->scan_period = + nla_get_u16(attrs[NL80211_NAN_CONF_SCAN_PERIOD]); + + if (attrs[NL80211_NAN_CONF_SCAN_DWELL_TIME]) + conf->scan_dwell_time = + nla_get_u16(attrs[NL80211_NAN_CONF_SCAN_DWELL_TIME]); + + if (attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]) + conf->discovery_beacon_interval = + nla_get_u8(attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]); +out: + if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan && + (!conf->bands || conf->bands & BIT(NL80211_BAND_5GHZ))) { + /* If no 5GHz channel is specified use default, if possible */ + conf->band_cfgs[NL80211_BAND_5GHZ].chan = + nl80211_get_nan_channel(wiphy, 5745); + if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan) + conf->band_cfgs[NL80211_BAND_5GHZ].chan = + nl80211_get_nan_channel(wiphy, 5220); + + /* Return error if user space asked explicitly for 5 GHz */ + if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan && + conf->bands & BIT(NL80211_BAND_5GHZ)) { + NL_SET_ERR_MSG_ATTR(info->extack, + info->attrs[NL80211_ATTR_BANDS], + "5 GHz band operation is not allowed"); + return -EINVAL; + } + } + + if (changed_flags) + *changed_flags = changed; + + return 0; +} + static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -15414,23 +15669,13 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) if (rfkill_blocked(rdev->wiphy.rfkill)) return -ERFKILL; + /* Master preference is mandatory for START_NAN */ if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) return -EINVAL; - conf.master_pref = - nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); - - if (info->attrs[NL80211_ATTR_BANDS]) { - u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); - - if (bands & ~(u32)wdev->wiphy->nan_supported_bands) - return -EOPNOTSUPP; - - if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) - return -EINVAL; - - conf.bands = bands; - } + err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, NULL); + if (err) + return err; err = rdev_start_nan(rdev, wdev, &conf); if (err) @@ -15786,6 +16031,7 @@ static int nl80211_nan_change_config(struct sk_buff *skb, struct wireless_dev *wdev = info->user_ptr[1]; struct cfg80211_nan_conf conf = {}; u32 changed = 0; + int err; if (wdev->iftype != NL80211_IFTYPE_NAN) return -EOPNOTSUPP; @@ -15793,27 +16039,9 @@ static int nl80211_nan_change_config(struct sk_buff *skb, if (!wdev_running(wdev)) return -ENOTCONN; - if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) { - conf.master_pref = - nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); - if (conf.master_pref <= 1 || conf.master_pref == 255) - return -EINVAL; - - changed |= CFG80211_NAN_CONF_CHANGED_PREF; - } - - if (info->attrs[NL80211_ATTR_BANDS]) { - u32 bands = 
nla_get_u32(info->attrs[NL80211_ATTR_BANDS]);
-
- if (bands & ~(u32)wdev->wiphy->nan_supported_bands)
- return -EOPNOTSUPP;
-
- if (bands && !(bands & BIT(NL80211_BAND_2GHZ)))
- return -EINVAL;
-
- conf.bands = bands;
- changed |= CFG80211_NAN_CONF_CHANGED_BANDS;
- }
+ err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, &changed);
+ if (err)
+ return err;

 if (!changed)
 return -EINVAL;
-- cgit v1.2.3

From ba9b2ceaa2558a38a5da59fd654b641610a8568e Mon Sep 17 00:00:00 2001
From: Andrei Otcheretianski
Date: Mon, 8 Sep 2025 14:12:56 +0300
Subject: wifi: nl80211: Add NAN Discovery Window (DW) notification

This notification will be used by the device to inform user space about
an upcoming DW. When received, user space will be able to prepare multicast
Service Discovery Frames (SDFs) to be transmitted during the next DW using
the %NL80211_CMD_FRAME command on the NAN management interface. The
device/driver will take care of transmitting the frames with the correct
timing. This allows a synchronized Discovery Engine (DE) to be implemented
in user space if the device doesn't support DE offload.

Note that this notification can be sent before the actual DW starts, as
long as the driver/device handles the actual timing of the SDF
transmission.

Signed-off-by: Andrei Otcheretianski
Signed-off-by: Miri Korenblit
Link: https://patch.msgid.link/20250908140015.0e1d15031bab.I5b1721e61b63910452b3c5cdcdc1e94cb094d4c9@changeid
Signed-off-by: Johannes Berg
---
 include/net/cfg80211.h | 12 ++++++++++++
 include/uapi/linux/nl80211.h | 16 ++++++++++++++++
 net/wireless/nl80211.c | 45 ++++++++++++++++++++++++++++++++++++++++++++
 net/wireless/trace.h | 16 ++++++++++++++++
 4 files changed, 89 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index e2f4ca500ea3..0c1311d254be 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -3959,6 +3959,8 @@ struct cfg80211_nan_band_config {
 * @scan_period: period (in seconds) between NAN scans.
 * @scan_dwell_time: dwell time (in milliseconds) for NAN scans.
 * @discovery_beacon_interval: interval (in TUs) for discovery beacons.
+ * @enable_dw_notification: flag to enable/disable discovery window
+ * notifications.
 * @band_cfgs: array of band specific configurations, indexed by
 * &enum nl80211_band values.
 * @extra_nan_attrs: pointer to additional NAN attributes.
@@ -3973,6 +3975,7 @@ struct cfg80211_nan_conf {
 u16 scan_period;
 u16 scan_dwell_time;
 u8 discovery_beacon_interval;
+ bool enable_dw_notification;
 struct cfg80211_nan_band_config band_cfgs[NUM_NL80211_BANDS];
 const u8 *extra_nan_attrs;
 u16 extra_nan_attrs_len;
@@ -10062,6 +10065,15 @@ void cfg80211_schedule_channels_check(struct wireless_dev *wdev);
 */
 void cfg80211_epcs_changed(struct net_device *netdev, bool enabled);

+/**
+ * cfg80211_next_nan_dw_notif - Notify about the next NAN Discovery Window (DW)
+ * @wdev: Pointer to the wireless device structure
+ * @chan: DW channel (6, 44 or 149)
+ * @gfp: Memory allocation flags
+ */
+void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev,
+ struct ieee80211_channel *chan, gfp_t gfp);
+
 #ifdef CONFIG_CFG80211_DEBUGFS
 /**
 * wiphy_locked_debugfs_read - do a locked read in debugfs
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 20b8202a3d58..d674608e2635 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1349,6 +1349,15 @@
 * control EPCS configuration. Used to notify userland on the current state
 * of EPCS.
* + * @NL80211_CMD_NAN_NEXT_DW_NOTIFICATION: This command is used to notify + * user space about the next NAN Discovery Window (DW). User space may use + * it to prepare frames to be sent in the next DW. + * %NL80211_ATTR_WIPHY_FREQ is used to indicate the frequency of the next + * DW. SDF transmission should be requested with %NL80211_CMD_FRAME and + * the device/driver shall take care of the actual transmission timing. + * This notification is only sent to the NAN interface owning socket + * (see %NL80211_ATTR_SOCKET_OWNER flag). + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1609,6 +1618,8 @@ enum nl80211_commands { NL80211_CMD_ASSOC_MLO_RECONF, NL80211_CMD_EPCS_CFG, + NL80211_CMD_NAN_NEXT_DW_NOTIFICATION, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -7409,6 +7420,10 @@ enum nl80211_nan_band_conf_attributes { * @NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL: Discovery beacon interval * in TUs. Valid range is 50-200 TUs. If not configured the device default * value is used. This is u8 (optional) + * @NL80211_NAN_CONF_NOTIFY_DW: If set, the driver will notify userspace about + * the upcoming discovery window with + * %NL80211_CMD_NAN_NEXT_DW_NOTIFICATION. + * This is a flag attribute. * @NUM_NL80211_NAN_CONF_ATTR: Internal. * @NL80211_NAN_CONF_ATTR_MAX: Highest NAN configuration attribute. * @@ -7423,6 +7438,7 @@ enum nl80211_nan_conf_attributes { NL80211_NAN_CONF_SCAN_PERIOD, NL80211_NAN_CONF_SCAN_DWELL_TIME, NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL, + NL80211_NAN_CONF_NOTIFY_DW, /* keep last */ NUM_NL80211_NAN_CONF_ATTR, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 04679acc8135..d64145746b65 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -547,6 +547,7 @@ nl80211_nan_conf_policy[NL80211_NAN_CONF_ATTR_MAX + 1] = { [NL80211_NAN_CONF_SCAN_DWELL_TIME] = NLA_POLICY_RANGE(NLA_U16, 50, 512), [NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL] = NLA_POLICY_RANGE(NLA_U8, 50, 200), + [NL80211_NAN_CONF_NOTIFY_DW] = { .type = NLA_FLAG }, }; static const struct netlink_range_validation nl80211_punct_bitmap_range = { @@ -15627,6 +15628,11 @@ static int nl80211_parse_nan_conf(struct wiphy *wiphy, if (attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]) conf->discovery_beacon_interval = nla_get_u8(attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]); + + if (attrs[NL80211_NAN_CONF_NOTIFY_DW]) + conf->enable_dw_notification = + nla_get_flag(attrs[NL80211_NAN_CONF_NOTIFY_DW]); + out: if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan && (!conf->bands || conf->bands & BIT(NL80211_BAND_5GHZ))) { @@ -21764,6 +21770,45 @@ void cfg80211_epcs_changed(struct net_device *netdev, bool enabled) } EXPORT_SYMBOL(cfg80211_epcs_changed); +void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev, + struct ieee80211_channel *chan, gfp_t gfp) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + trace_cfg80211_next_nan_dw_notif(wdev, chan); + + if (!wdev->owner_nlportid) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, + NL80211_CMD_NAN_NEXT_DW_NOTIFICATION); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, chan->center_freq)) + goto nla_put_failure; + + genlmsg_end(msg, 
hdr); + + genlmsg_unicast(wiphy_net(wiphy), msg, wdev->owner_nlportid); + + return; + + nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_next_nan_dw_notif); + /* initialisation/exit functions */ int __init nl80211_init(void) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 9b6074155d59..ff47e9bffd4f 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -4166,6 +4166,22 @@ TRACE_EVENT(cfg80211_epcs_changed, WDEV_PR_ARG, __entry->enabled) ); +TRACE_EVENT(cfg80211_next_nan_dw_notif, + TP_PROTO(struct wireless_dev *wdev, + struct ieee80211_channel *chan), + TP_ARGS(wdev, chan), + TP_STRUCT__entry( + WDEV_ENTRY + CHAN_ENTRY + ), + TP_fast_assign( + WDEV_ASSIGN; + CHAN_ASSIGN(chan); + ), + TP_printk(WDEV_PR_FMT " " CHAN_PR_FMT, + WDEV_PR_ARG, CHAN_PR_ARG) +); + #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From 1ccfd8db34fb3b1852284668094d7207499c2415 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Mon, 8 Sep 2025 14:12:57 +0300 Subject: wifi: cfg80211: Add cluster joined notification APIs The drivers should notify upper layers and user space when a NAN device joins a cluster. This is needed, for example, to set the correct addr3 in SDF frames. Add API to report cluster join event. Signed-off-by: Andrei Otcheretianski Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.ad27b7b6e4d9.I70b213a2a49f18d1ba2ad325e67e8eff51cc7a1f@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 14 ++++++++++++++ include/uapi/linux/nl80211.h | 8 ++++++++ net/wireless/nl80211.c | 41 +++++++++++++++++++++++++++++++++++++++++ net/wireless/trace.h | 19 +++++++++++++++++++ 4 files changed, 82 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 0c1311d254be..1b10bd31bdd6 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -10074,6 +10074,20 @@ void cfg80211_epcs_changed(struct net_device *netdev, bool enabled); void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev, struct ieee80211_channel *chan, gfp_t gfp); +/** + * cfg80211_nan_cluster_joined - Notify about NAN cluster join + * @wdev: Pointer to the wireless device structure + * @cluster_id: Cluster ID of the NAN cluster that was joined or started + * @new_cluster: Indicates if this is a new cluster or an existing one + * @gfp: Memory allocation flags + * + * This function is used to notify user space when a NAN cluster has been + * joined, providing the cluster ID and a flag whether it is a new cluster. + */ +void cfg80211_nan_cluster_joined(struct wireless_dev *wdev, + const u8 *cluster_id, bool new_cluster, + gfp_t gfp); + #ifdef CONFIG_CFG80211_DEBUGFS /** * wiphy_locked_debugfs_read - do a locked read in debugfs diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d674608e2635..c5a7658b7297 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1357,6 +1357,9 @@ * the device/driver shall take care of the actual transmission timing. * This notification is only sent to the NAN interface owning socket * (see %NL80211_ATTR_SOCKET_OWNER flag). + * @NL80211_CMD_NAN_CLUSTER_JOINED: This command is used to notify + * user space that the NAN new cluster has been joined. The cluster ID is + * indicated by %NL80211_ATTR_MAC. 
* * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use @@ -1619,6 +1622,7 @@ enum nl80211_commands { NL80211_CMD_EPCS_CFG, NL80211_CMD_NAN_NEXT_DW_NOTIFICATION, + NL80211_CMD_NAN_CLUSTER_JOINED, /* add new commands above here */ @@ -2957,6 +2961,9 @@ enum nl80211_commands { * %NL80211_CMD_START_NAN and %NL80211_CMD_CHANGE_NAN_CONFIG. * See &enum nl80211_nan_conf_attributes for details. * This attribute is optional. + * @NL80211_ATTR_NAN_NEW_CLUSTER: Flag attribute indicating that a new + * NAN cluster has been created. This is used with + * %NL80211_CMD_NAN_CLUSTER_JOINED * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -3521,6 +3528,7 @@ enum nl80211_attrs { NL80211_ATTR_S1G_SHORT_BEACON, NL80211_ATTR_BSS_PARAM, NL80211_ATTR_NAN_CONFIG, + NL80211_ATTR_NAN_NEW_CLUSTER, /* add attributes here, update the policy in nl80211.c */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d64145746b65..904a725a4f4a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -21809,6 +21809,47 @@ void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev, } EXPORT_SYMBOL(cfg80211_next_nan_dw_notif); +void cfg80211_nan_cluster_joined(struct wireless_dev *wdev, + const u8 *cluster_id, bool new_cluster, + gfp_t gfp) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + trace_cfg80211_nan_cluster_joined(wdev, cluster_id, new_cluster); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_NAN_CLUSTER_JOINED); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, cluster_id) || + (new_cluster && nla_put_flag(msg, NL80211_ATTR_NAN_NEW_CLUSTER))) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + if (!wdev->owner_nlportid) + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(wiphy), + msg, 0, NL80211_MCGRP_NAN, gfp); + else + genlmsg_unicast(wiphy_net(wiphy), msg, + wdev->owner_nlportid); + return; + + nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_nan_cluster_joined); + /* initialisation/exit functions */ int __init nl80211_init(void) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index ff47e9bffd4f..8a4c34112eb5 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -4182,6 +4182,25 @@ TRACE_EVENT(cfg80211_next_nan_dw_notif, WDEV_PR_ARG, CHAN_PR_ARG) ); +TRACE_EVENT(cfg80211_nan_cluster_joined, + TP_PROTO(struct wireless_dev *wdev, + const u8 *cluster_id, + bool new_cluster), + TP_ARGS(wdev, cluster_id, new_cluster), + TP_STRUCT__entry( + WDEV_ENTRY + MAC_ENTRY(cluster_id) + __field(bool, new_cluster) + ), + TP_fast_assign( + WDEV_ASSIGN; + MAC_ASSIGN(cluster_id, cluster_id); + __entry->new_cluster = new_cluster; + ), + TP_printk(WDEV_PR_FMT " cluster_id %pMF%s", + WDEV_PR_ARG, __entry->cluster_id, + __entry->new_cluster ? " [new]" : "") +); #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From 3cbadd84f5c4ea792c0df3506639a2cb57ba9b11 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Mon, 8 Sep 2025 14:12:58 +0300 Subject: wifi: nl80211: Add more NAN capabilities Add better break down for NAN capabilities, as NAN has multiple optional features. 
This makes it possible to better indicate which features are supported
or offloaded to the device.

Signed-off-by: Andrei Otcheretianski
Signed-off-by: Miri Korenblit
Link: https://patch.msgid.link/20250908140015.bb02cd8c1596.I01fb2e8dc3662b847f3c27117bc4e199fc96d0a3@changeid
Signed-off-by: Johannes Berg
---
 include/uapi/linux/nl80211.h | 55 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index c5a7658b7297..423e258cdbd2 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2964,6 +2964,10 @@ enum nl80211_commands {
 * @NL80211_ATTR_NAN_NEW_CLUSTER: Flag attribute indicating that a new
 * NAN cluster has been created. This is used with
 * %NL80211_CMD_NAN_CLUSTER_JOINED
+ * @NL80211_ATTR_NAN_CAPABILITIES: Nested attribute for NAN capabilities.
+ * This is used with %NL80211_CMD_GET_WIPHY to indicate the NAN
+ * capabilities supported by the driver. See &enum nl80211_nan_capabilities
+ * for details.
 *
 * @NUM_NL80211_ATTR: total number of nl80211_attrs available
 * @NL80211_ATTR_MAX: highest attribute number currently defined
@@ -3529,6 +3533,7 @@ enum nl80211_attrs {
 NL80211_ATTR_BSS_PARAM,
 NL80211_ATTR_NAN_CONFIG,
 NL80211_ATTR_NAN_NEW_CLUSTER,
+ NL80211_ATTR_NAN_CAPABILITIES,

 /* add attributes here, update the policy in nl80211.c */

@@ -8362,4 +8367,54 @@ enum nl80211_s1g_short_beacon_attrs {
 __NL80211_S1G_SHORT_BEACON_ATTR_LAST - 1
 };

+/**
+ * enum nl80211_nan_capabilities - NAN (Neighbor Aware Networking)
+ * capabilities.
+ *
+ * @__NL80211_NAN_CAPABILITIES_INVALID: Invalid.
+ * @NL80211_NAN_CAPA_CONFIGURABLE_SYNC: Flag attribute indicating that
+ * the device supports configurable synchronization. If set, the device
+ * should be able to handle the %NL80211_ATTR_NAN_CONFIG
+ * attribute in the %NL80211_CMD_START_NAN (and change) command.
+ * @NL80211_NAN_CAPA_USERSPACE_DE: Flag attribute indicating that the
+ * NAN Discovery Engine (DE) is not offloaded and the driver assumes a
+ * user space DE implementation. When set, the %NL80211_CMD_ADD_NAN_FUNCTION,
+ * %NL80211_CMD_DEL_NAN_FUNCTION and %NL80211_CMD_NAN_MATCH commands
+ * should not be used. In addition, the device/driver should support
+ * sending discovery window (DW) notifications using
+ * %NL80211_CMD_NAN_NEXT_DW_NOTIFICATION and handling transmission and
+ * reception of NAN SDF frames on the NAN device interface during DW windows.
+ * (%NL80211_CMD_FRAME is used to transmit SDFs)
+ * @NL80211_NAN_CAPA_OP_MODE: u8 attribute indicating the supported operation
+ * modes as defined in Wi-Fi Aware (TM) specification Table 81 (Operation
+ * Mode field format).
+ * @NL80211_NAN_CAPA_NUM_ANTENNAS: u8 attribute indicating the number of
+ * TX and RX antennas supported by the device. Lower nibble indicates
+ * the number of TX antennas and upper nibble indicates the number of RX
+ * antennas. Value 0 indicates the information is not available.
+ * See table 79 of Wi-Fi Aware (TM) specification (Number of
+ * Antennas field).
+ * @NL80211_NAN_CAPA_MAX_CHANNEL_SWITCH_TIME: u16 attribute indicating the
+ * maximum time in microseconds that the device requires to switch
+ * channels.
+ * @NL80211_NAN_CAPA_CAPABILITIES: u8 attribute containing the
+ * capabilities of the device as defined in Wi-Fi Aware (TM)
+ * specification Table 79 (Capabilities field).
+ * @__NL80211_NAN_CAPABILITIES_LAST: Internal
+ * @NL80211_NAN_CAPABILITIES_MAX: Highest NAN capability attribute.
+ */
+enum nl80211_nan_capabilities {
+ __NL80211_NAN_CAPABILITIES_INVALID,
+
+ NL80211_NAN_CAPA_CONFIGURABLE_SYNC,
+ NL80211_NAN_CAPA_USERSPACE_DE,
+ NL80211_NAN_CAPA_OP_MODE,
+ NL80211_NAN_CAPA_NUM_ANTENNAS,
+ NL80211_NAN_CAPA_MAX_CHANNEL_SWITCH_TIME,
+ NL80211_NAN_CAPA_CAPABILITIES,
+ /* keep last */
+ __NL80211_NAN_CAPABILITIES_LAST,
+ NL80211_NAN_CAPABILITIES_MAX = __NL80211_NAN_CAPABILITIES_LAST - 1,
+};
+
 #endif /* __LINUX_NL80211_H */
-- cgit v1.2.3

From d0688dc2b172d19e20fdb8be8c37930da12aaf88 Mon Sep 17 00:00:00 2001
From: Lachlan Hodges
Date: Thu, 18 Sep 2025 15:19:11 +1000
Subject: wifi: cfg80211: correctly implement and validate S1G chandef

Currently, the S1G channelisation implementation differs from that of
VHT, which is the PHY that S1G is based on. The major difference is that
the clock rate is 1/10th that of VHT; however, how their channelisation
is represented within cfg80211 and mac80211 vastly differs.

To rectify this, remove the use of the IEEE80211_CHAN_1/2/4.. flags that
were previously used to indicate the control channel width; instead, it
is implied that control channels are 1MHz in the case of S1G.
Additionally, introduce the inverse - IEEE80211_CHAN_NO_4/8/16MHz -
flags that imply the control channel may not be used for a certain
bandwidth. With these new flags, we can perform regulatory and chandef
validation just as we would for VHT.

To deal with the notion that S1G PHYs may contain a 2MHz primary
channel, introduce a new variable, s1g_primary_2mhz, which indicates
whether we are operating on a 2MHz primary channel. In this case,
chandef::chan points to the 1MHz primary channel pointed to by the
primary channel location. Alongside this, introduce some new helper
routines that can extract the sibling 1MHz channel. The sibling is the
alternate 1MHz primary subchannel within the 2MHz primary channel that
is not pointed to by chandef::chan.

Furthermore, due to unique restrictions imposed on S1G PHYs, introduce a
new flag, IEEE80211_CHAN_S1G_NO_PRIMARY, which states that the 1MHz
channel cannot be used as a primary channel. This is assumed to be set
by vendors as it is hardware and regdom specific. When we validate a
2MHz primary channel, we need to ensure both 1MHz subchannels do not
contain this flag. If one or both of the 1MHz subchannels contain this
flag, then the 2MHz primary is not permitted for use as a primary
channel.

Properly integrate S1G channel validation such that it is implemented in
line with other PHY types such as VHT. Additionally, implement a new
S1G-specific regulatory flag to allow cfg80211 to understand specific
vendor requirements for S1G PHYs.

Signed-off-by: Arien Judge
Signed-off-by: Andrew Pope
Signed-off-by: Lachlan Hodges
Link: https://patch.msgid.link/20250918051913.500781-2-lachlan.hodges@morsemicro.com
[remove redundant NL80211_ATTR_S1G_PRIMARY_2MHZ check]
Signed-off-by: Johannes Berg
---
 include/net/cfg80211.h | 95 +++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/nl80211.h | 15 +++++++
 net/wireless/chan.c | 103 +++++++++++++++++++++++++++++--------------
 net/wireless/nl80211.c | 37 +++++++++------
 net/wireless/reg.c | 76 ++++++++++---------------------
 5 files changed, 225 insertions(+), 101 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 26fd42e189ce..2d612c760dd1 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -129,6 +129,13 @@ struct wiphy;
 * with very low power (VLP), even if otherwise set to NO_IR.
* @IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY: Allow activity on a 20 MHz channel, * even if otherwise set to NO_IR. + * @IEEE80211_CHAN_S1G_NO_PRIMARY: Prevents the channel for use as an S1G + * primary channel. Does not prevent the wider operating channel + * described by the chandef from being used. In order for a 2MHz primary + * to be used, both 1MHz subchannels shall not contain this flag. + * @IEEE80211_CHAN_NO_4MHZ: 4 MHz bandwidth is not permitted on this channel. + * @IEEE80211_CHAN_NO_8MHZ: 8 MHz bandwidth is not permitted on this channel. + * @IEEE80211_CHAN_NO_16MHZ: 16 MHz bandwidth is not permitted on this channel. */ enum ieee80211_channel_flags { IEEE80211_CHAN_DISABLED = BIT(0), @@ -158,6 +165,10 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_CAN_MONITOR = BIT(24), IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP = BIT(25), IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY = BIT(26), + IEEE80211_CHAN_S1G_NO_PRIMARY = BIT(27), + IEEE80211_CHAN_NO_4MHZ = BIT(28), + IEEE80211_CHAN_NO_8MHZ = BIT(29), + IEEE80211_CHAN_NO_16MHZ = BIT(30), }; #define IEEE80211_CHAN_NO_HT40 \ @@ -821,6 +832,9 @@ struct key_params { * @punctured: mask of the punctured 20 MHz subchannels, with * bits turned on being disabled (punctured); numbered * from lower to higher frequency (like in the spec) + * @s1g_primary_2mhz: Indicates if the control channel pointed to + * by 'chan' exists as a 1MHz primary subchannel within an + * S1G 2MHz primary channel. */ struct cfg80211_chan_def { struct ieee80211_channel *chan; @@ -830,6 +844,7 @@ struct cfg80211_chan_def { struct ieee80211_edmg edmg; u16 freq1_offset; u16 punctured; + bool s1g_primary_2mhz; }; /* @@ -990,6 +1005,18 @@ cfg80211_chandef_is_edmg(const struct cfg80211_chan_def *chandef) return chandef->edmg.channels || chandef->edmg.bw_config; } +/** + * cfg80211_chandef_is_s1g - check if chandef represents an S1G channel + * @chandef: the channel definition + * + * Return: %true if S1G. + */ +static inline bool +cfg80211_chandef_is_s1g(const struct cfg80211_chan_def *chandef) +{ + return chandef->chan->band == NL80211_BAND_S1GHZ; +} + /** * cfg80211_chandef_compatible - check if two channel definitions are compatible * @chandef1: first channel definition @@ -10179,4 +10206,72 @@ ssize_t wiphy_locked_debugfs_write(struct wiphy *wiphy, struct file *file, void *data); #endif +/** + * cfg80211_s1g_get_start_freq_khz - get S1G chandef start frequency + * @chandef: the chandef to use + * + * Return: the chandefs starting frequency in KHz + */ +static inline u32 +cfg80211_s1g_get_start_freq_khz(const struct cfg80211_chan_def *chandef) +{ + u32 bw_mhz = cfg80211_chandef_get_width(chandef); + u32 center_khz = + MHZ_TO_KHZ(chandef->center_freq1) + chandef->freq1_offset; + return center_khz - bw_mhz * 500 + 500; +} + +/** + * cfg80211_s1g_get_end_freq_khz - get S1G chandef end frequency + * @chandef: the chandef to use + * + * Return: the chandefs ending frequency in KHz + */ +static inline u32 +cfg80211_s1g_get_end_freq_khz(const struct cfg80211_chan_def *chandef) +{ + u32 bw_mhz = cfg80211_chandef_get_width(chandef); + u32 center_khz = + MHZ_TO_KHZ(chandef->center_freq1) + chandef->freq1_offset; + return center_khz + bw_mhz * 500 - 500; +} + +/** + * cfg80211_s1g_get_primary_sibling - retrieve the sibling 1MHz subchannel + * for an S1G chandef using a 2MHz primary channel. + * @wiphy: wiphy the channel belongs to + * @chandef: the chandef to use + * + * When chandef::s1g_primary_2mhz is set to true, we are operating on a 2MHz + * primary channel. 
The 1MHz subchannel designated by the primary channel + * location exists within chandef::chan, whilst the 'sibling' is denoted as + * being the other 1MHz subchannel that make up the 2MHz primary channel. + * + * Returns: the sibling 1MHz &struct ieee80211_channel, or %NULL on failure. + */ +static inline struct ieee80211_channel * +cfg80211_s1g_get_primary_sibling(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef) +{ + int width_mhz = cfg80211_chandef_get_width(chandef); + u32 pri_1mhz_khz, sibling_1mhz_khz, op_low_1mhz_khz, pri_index; + + if (!chandef->s1g_primary_2mhz || width_mhz < 2) + return NULL; + + pri_1mhz_khz = ieee80211_channel_to_khz(chandef->chan); + op_low_1mhz_khz = cfg80211_s1g_get_start_freq_khz(chandef); + + /* + * Compute the index of the primary 1 MHz subchannel within the + * operating channel, relative to the lowest 1 MHz center frequency. + * Flip the least significant bit to select the even/odd sibling, + * then translate that index back into a channel frequency. + */ + pri_index = (pri_1mhz_khz - op_low_1mhz_khz) / 1000; + sibling_1mhz_khz = op_low_1mhz_khz + ((pri_index ^ 1) * 1000); + + return ieee80211_get_channel_khz(wiphy, sibling_1mhz_khz); +} + #endif /* __NET_CFG80211_H */ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 423e258cdbd2..8134f10e4e6c 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2969,6 +2969,10 @@ enum nl80211_commands { * capabilities supported by the driver. See &enum nl80211_nan_capabilities * for details. * + * @NL80211_ATTR_S1G_PRIMARY_2MHZ: flag attribute indicating that the S1G + * primary channel is 2 MHz wide, and the control channel designates + * the 1 MHz primary subchannel within that 2 MHz primary. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3535,6 +3539,8 @@ enum nl80211_attrs { NL80211_ATTR_NAN_NEW_CLUSTER, NL80211_ATTR_NAN_CAPABILITIES, + NL80211_ATTR_S1G_PRIMARY_2MHZ, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -4432,6 +4438,12 @@ enum nl80211_wmm_rule { * very low power (VLP) AP, despite being NO_IR. * @NL80211_FREQUENCY_ATTR_ALLOW_20MHZ_ACTIVITY: This channel can be active in * 20 MHz bandwidth, despite being NO_IR. + * @NL80211_FREQUENCY_ATTR_NO_4MHZ: 4 MHz operation is not allowed on this + * channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_NO_8MHZ: 8 MHz operation is not allowed on this + * channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_NO_16MHZ: 16 MHz operation is not allowed on this + * channel in current regulatory domain. 
* @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -4477,6 +4489,9 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_CAN_MONITOR, NL80211_FREQUENCY_ATTR_ALLOW_6GHZ_VLP_AP, NL80211_FREQUENCY_ATTR_ALLOW_20MHZ_ACTIVITY, + NL80211_FREQUENCY_ATTR_NO_4MHZ, + NL80211_FREQUENCY_ATTR_NO_8MHZ, + NL80211_FREQUENCY_ATTR_NO_16MHZ, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 193734b7f9dc..68221b1ab45e 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -100,6 +100,11 @@ static u32 cfg80211_get_end_freq(const struct cfg80211_chan_def *chandef, punctured = 0) : (punctured >>= 1))) \ if (!(punctured & 1)) +#define for_each_s1g_subchan(chandef, freq_khz) \ + for (freq_khz = cfg80211_s1g_get_start_freq_khz(chandef); \ + freq_khz <= cfg80211_s1g_get_end_freq_khz(chandef); \ + freq_khz += MHZ_TO_KHZ(1)) + struct cfg80211_per_bw_puncturing_values { u8 len; const u16 *valid_values; @@ -336,8 +341,7 @@ static bool cfg80211_valid_center_freq(u32 center, bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) { - u32 control_freq, oper_freq; - int oper_width, control_width; + u32 control_freq, control_freq_khz, start_khz, end_khz; if (!chandef->chan) return false; @@ -363,27 +367,16 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) case NL80211_CHAN_WIDTH_4: case NL80211_CHAN_WIDTH_8: case NL80211_CHAN_WIDTH_16: - if (chandef->chan->band != NL80211_BAND_S1GHZ) - return false; - - control_freq = ieee80211_channel_to_khz(chandef->chan); - oper_freq = ieee80211_chandef_to_khz(chandef); - control_width = nl80211_chan_width_to_mhz( - ieee80211_s1g_channel_width( - chandef->chan)); - oper_width = cfg80211_chandef_get_width(chandef); - - if (oper_width < 0 || control_width < 0) + if (!cfg80211_chandef_is_s1g(chandef)) return false; if (chandef->center_freq2) return false; - if (control_freq + MHZ_TO_KHZ(control_width) / 2 > - oper_freq + MHZ_TO_KHZ(oper_width) / 2) - return false; + control_freq_khz = ieee80211_channel_to_khz(chandef->chan); + start_khz = cfg80211_s1g_get_start_freq_khz(chandef); + end_khz = cfg80211_s1g_get_end_freq_khz(chandef); - if (control_freq - MHZ_TO_KHZ(control_width) / 2 < - oper_freq - MHZ_TO_KHZ(oper_width) / 2) + if (control_freq_khz < start_khz || control_freq_khz > end_khz) return false; break; case NL80211_CHAN_WIDTH_80P80: @@ -461,6 +454,9 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) !cfg80211_edmg_chandef_valid(chandef)) return false; + if (!cfg80211_chandef_is_s1g(chandef) && chandef->s1g_primary_2mhz) + return false; + return valid_puncturing_bitmap(chandef); } EXPORT_SYMBOL(cfg80211_chandef_valid); @@ -725,6 +721,10 @@ static int cfg80211_get_chans_dfs_required(struct wiphy *wiphy, { struct ieee80211_channel *c; + /* DFS is not required for S1G */ + if (cfg80211_chandef_is_s1g(chandef)) + return 0; + for_each_subchan(chandef, freq, cf) { c = ieee80211_get_channel_khz(wiphy, freq); if (!c) @@ -1130,6 +1130,55 @@ static bool cfg80211_edmg_usable(struct wiphy *wiphy, u8 edmg_channels, return true; } +static bool cfg80211_s1g_usable(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef) +{ + u32 freq_khz; + const struct ieee80211_channel *chan; + u32 pri_khz = ieee80211_channel_to_khz(chandef->chan); + u32 end_khz = cfg80211_s1g_get_end_freq_khz(chandef); + u32 start_khz = cfg80211_s1g_get_start_freq_khz(chandef); + int width_mhz = 
cfg80211_chandef_get_width(chandef); + u32 prohibited_flags = IEEE80211_CHAN_DISABLED; + + if (width_mhz >= 16) + prohibited_flags |= IEEE80211_CHAN_NO_16MHZ; + if (width_mhz >= 8) + prohibited_flags |= IEEE80211_CHAN_NO_8MHZ; + if (width_mhz >= 4) + prohibited_flags |= IEEE80211_CHAN_NO_4MHZ; + + if (chandef->chan->flags & IEEE80211_CHAN_S1G_NO_PRIMARY) + return false; + + if (pri_khz < start_khz || pri_khz > end_khz) + return false; + + for_each_s1g_subchan(chandef, freq_khz) { + chan = ieee80211_get_channel_khz(wiphy, freq_khz); + if (!chan || (chan->flags & prohibited_flags)) + return false; + } + + if (chandef->s1g_primary_2mhz) { + u32 sib_khz; + const struct ieee80211_channel *sibling; + + sibling = cfg80211_s1g_get_primary_sibling(wiphy, chandef); + if (!sibling) + return false; + + if (sibling->flags & IEEE80211_CHAN_S1G_NO_PRIMARY) + return false; + + sib_khz = ieee80211_channel_to_khz(sibling); + if (sib_khz < start_khz || sib_khz > end_khz) + return false; + } + + return true; +} + bool _cfg80211_chandef_usable(struct wiphy *wiphy, const struct cfg80211_chan_def *chandef, u32 prohibited_flags, @@ -1154,6 +1203,9 @@ bool _cfg80211_chandef_usable(struct wiphy *wiphy, ext_nss_cap = __le16_to_cpu(vht_cap->vht_mcs.tx_highest) & IEEE80211_VHT_EXT_NSS_BW_CAPABLE; + if (cfg80211_chandef_is_s1g(chandef)) + return cfg80211_s1g_usable(wiphy, chandef); + if (edmg_cap->channels && !cfg80211_edmg_usable(wiphy, chandef->edmg.channels, @@ -1165,21 +1217,6 @@ bool _cfg80211_chandef_usable(struct wiphy *wiphy, control_freq = chandef->chan->center_freq; switch (chandef->width) { - case NL80211_CHAN_WIDTH_1: - width = 1; - break; - case NL80211_CHAN_WIDTH_2: - width = 2; - break; - case NL80211_CHAN_WIDTH_4: - width = 4; - break; - case NL80211_CHAN_WIDTH_8: - width = 8; - break; - case NL80211_CHAN_WIDTH_16: - width = 16; - break; case NL80211_CHAN_WIDTH_5: width = 5; break; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 4e0d40865441..de34a1d14073 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -931,6 +931,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_S1G_SHORT_BEACON] = NLA_POLICY_NESTED(nl80211_s1g_short_beacon), [NL80211_ATTR_BSS_PARAM] = { .type = NLA_FLAG }, + [NL80211_ATTR_S1G_PRIMARY_2MHZ] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -1319,6 +1320,15 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, nla_put_flag(msg, NL80211_FREQUENCY_ATTR_ALLOW_20MHZ_ACTIVITY)) goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_4MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_4MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_8MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_8MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_16MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_16MHZ)) + goto nla_put_failure; } if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, @@ -3541,6 +3551,7 @@ static int _nl80211_parse_chandef(struct cfg80211_registered_device *rdev, chandef->center_freq1 = KHZ_TO_MHZ(control_freq); chandef->freq1_offset = control_freq % 1000; chandef->center_freq2 = 0; + chandef->s1g_primary_2mhz = false; if (!chandef->chan) { NL_SET_ERR_MSG_ATTR(extack, attrs[NL80211_ATTR_WIPHY_FREQ], @@ -3584,27 +3595,20 @@ static int _nl80211_parse_chandef(struct cfg80211_registered_device *rdev, return -EINVAL; } } else if (attrs[NL80211_ATTR_CHANNEL_WIDTH]) { - chandef->width = - 
nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]); - if (chandef->chan->band == NL80211_BAND_S1GHZ) { - /* User input error for channel width doesn't match channel */ - if (chandef->width != ieee80211_s1g_channel_width(chandef->chan)) { - NL_SET_ERR_MSG_ATTR(extack, - attrs[NL80211_ATTR_CHANNEL_WIDTH], - "bad channel width"); - return -EINVAL; - } - } + chandef->width = nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]); if (attrs[NL80211_ATTR_CENTER_FREQ1]) { chandef->center_freq1 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1]); - chandef->freq1_offset = - nla_get_u32_default(attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET], - 0); + chandef->freq1_offset = nla_get_u32_default( + attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET], 0); } + if (attrs[NL80211_ATTR_CENTER_FREQ2]) chandef->center_freq2 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2]); + + chandef->s1g_primary_2mhz = nla_get_flag( + attrs[NL80211_ATTR_S1G_PRIMARY_2MHZ]); } if (info->attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]) { @@ -10455,8 +10459,9 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) goto out_free; } - /* ignore disabled channels */ + /* Ignore disabled / no primary channels */ if (chan->flags & IEEE80211_CHAN_DISABLED || + chan->flags & IEEE80211_CHAN_S1G_NO_PRIMARY || !cfg80211_wdev_channel_allowed(wdev, chan)) continue; @@ -10478,6 +10483,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) chan = &wiphy->bands[band]->channels[j]; if (chan->flags & IEEE80211_CHAN_DISABLED || + chan->flags & + IEEE80211_CHAN_S1G_NO_PRIMARY || !cfg80211_wdev_channel_allowed(wdev, chan)) continue; diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 3b0ac3437f81..73cab51f6379 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1707,6 +1707,16 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd if (reg_rule->flags & NL80211_RRF_AUTO_BW) max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); + if (is_s1g) { + if (max_bandwidth_khz < MHZ_TO_KHZ(16)) + bw_flags |= IEEE80211_CHAN_NO_16MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(8)) + bw_flags |= IEEE80211_CHAN_NO_8MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(4)) + bw_flags |= IEEE80211_CHAN_NO_4MHZ; + return bw_flags; + } + /* If we get a reg_rule we can assume that at least 5Mhz fit */ if (!cfg80211_does_bw_fit_range(freq_range, center_freq_khz, @@ -1717,59 +1727,19 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd MHZ_TO_KHZ(20))) bw_flags |= IEEE80211_CHAN_NO_20MHZ; - if (is_s1g) { - /* S1G is strict about non overlapping channels. We can - * calculate which bandwidth is allowed per channel by finding - * the largest bandwidth which cleanly divides the freq_range. - */ - int edge_offset; - int ch_bw = max_bandwidth_khz; - - while (ch_bw) { - edge_offset = (center_freq_khz - ch_bw / 2) - - freq_range->start_freq_khz; - if (edge_offset % ch_bw == 0) { - switch (KHZ_TO_MHZ(ch_bw)) { - case 1: - bw_flags |= IEEE80211_CHAN_1MHZ; - break; - case 2: - bw_flags |= IEEE80211_CHAN_2MHZ; - break; - case 4: - bw_flags |= IEEE80211_CHAN_4MHZ; - break; - case 8: - bw_flags |= IEEE80211_CHAN_8MHZ; - break; - case 16: - bw_flags |= IEEE80211_CHAN_16MHZ; - break; - default: - /* If we got here, no bandwidths fit on - * this frequency, ie. band edge. 
- */ - bw_flags |= IEEE80211_CHAN_DISABLED; - break; - } - break; - } - ch_bw /= 2; - } - } else { - if (max_bandwidth_khz < MHZ_TO_KHZ(10)) - bw_flags |= IEEE80211_CHAN_NO_10MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(20)) - bw_flags |= IEEE80211_CHAN_NO_20MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(40)) - bw_flags |= IEEE80211_CHAN_NO_HT40; - if (max_bandwidth_khz < MHZ_TO_KHZ(80)) - bw_flags |= IEEE80211_CHAN_NO_80MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(160)) - bw_flags |= IEEE80211_CHAN_NO_160MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(320)) - bw_flags |= IEEE80211_CHAN_NO_320MHZ; - } + if (max_bandwidth_khz < MHZ_TO_KHZ(10)) + bw_flags |= IEEE80211_CHAN_NO_10MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(20)) + bw_flags |= IEEE80211_CHAN_NO_20MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(40)) + bw_flags |= IEEE80211_CHAN_NO_HT40; + if (max_bandwidth_khz < MHZ_TO_KHZ(80)) + bw_flags |= IEEE80211_CHAN_NO_80MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(160)) + bw_flags |= IEEE80211_CHAN_NO_160MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(320)) + bw_flags |= IEEE80211_CHAN_NO_320MHZ; + return bw_flags; } -- cgit v1.2.3 From 5222470b2fbb3740f931f189db33dd1367b1ae75 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:51 +0200 Subject: nsfs: support file handles A while ago we added support for file handles to pidfs so pidfds can be encoded and decoded as file handles. Userspace has adopted this quickly and it's proven very useful. Implement file handles for namespaces as well. A process is not always able to open /proc/self/ns/. That requires procfs to be mounted and for /proc/self/ or /proc/self/ns/ to not be overmounted. However, userspace can always derive a namespace fd from a pidfd. And that always works for a task's own namespace. There's no need to introduce unnecessary behavioral differences between /proc/self/ns/ fds, pidfd-derived namespace fds, and file-handle-derived namespace fds. So namespace file handles are always decodable if the caller is located in the namespace the file handle refers to. This also allows a task to e.g., store a set of file handles to its namespaces in a file on-disk so it can verify when it gets rexeced that they're still valid and so on. This is akin to the pidfd use-case. Or just plainly for namespace comparison reasons where a file handle to the task's own namespace can be easily compared against others. 
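To make the intended usage concrete, a minimal userspace sketch (not part
of this patch) is shown below. It assumes the standard
name_to_handle_at()/open_by_handle_at() syscalls, and that an fd on any
nsfs object is acceptable as the mount fd when decoding; both are
assumptions rather than guarantees spelled out in this message.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	struct file_handle *fh;
	int mount_id, nsfd, fd;

	/* MAX_HANDLE_SZ is the generic upper bound on handle size. */
	fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
	if (!fh)
		return 1;
	fh->handle_bytes = MAX_HANDLE_SZ;

	/* Encoding always works for the task's own namespaces. */
	if (name_to_handle_at(AT_FDCWD, "/proc/self/ns/net", fh, &mount_id, 0) < 0) {
		perror("name_to_handle_at");
		return 1;
	}

	/*
	 * Decoding works if the caller is located in the namespace the
	 * handle refers to; an open nsfs fd serves as the mount fd
	 * here (assumption).
	 */
	nsfd = open("/proc/self/ns/net", O_RDONLY);
	fd = open_by_handle_at(nsfd, fh, O_RDONLY);
	if (fd < 0)
		perror("open_by_handle_at");
	return fd < 0 ? 1 : 0;
}

With the follow-up patch below, the anchor fd becomes unnecessary
altogether.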
Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/nsfs.c | 158 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/exportfs.h | 6 ++ include/uapi/linux/nsfs.h | 9 +++ 3 files changed, 173 insertions(+) (limited to 'include/uapi/linux') diff --git a/fs/nsfs.c b/fs/nsfs.c index 80e631aeb3ce..926e2680414e 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -13,6 +13,12 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include "mount.h" #include "internal.h" @@ -417,12 +423,164 @@ static const struct stashed_operations nsfs_stashed_ops = { .put_data = nsfs_put_data, }; +#define NSFS_FID_SIZE_U32_VER0 (NSFS_FILE_HANDLE_SIZE_VER0 / sizeof(u32)) +#define NSFS_FID_SIZE_U32_LATEST (NSFS_FILE_HANDLE_SIZE_LATEST / sizeof(u32)) + +static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) +{ + struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh; + struct ns_common *ns = inode->i_private; + int len = *max_len; + + if (parent) + return FILEID_INVALID; + + if (len < NSFS_FID_SIZE_U32_VER0) { + *max_len = NSFS_FID_SIZE_U32_LATEST; + return FILEID_INVALID; + } else if (len > NSFS_FID_SIZE_U32_LATEST) { + *max_len = NSFS_FID_SIZE_U32_LATEST; + } + + fid->ns_id = ns->ns_id; + fid->ns_type = ns->ops->type; + fid->ns_inum = inode->i_ino; + return FILEID_NSFS; +} + +static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct path path __free(path_put) = {}; + struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh; + struct user_namespace *owning_ns = NULL; + struct ns_common *ns; + int ret; + + if (fh_len < NSFS_FID_SIZE_U32_VER0) + return NULL; + + /* Check that any trailing bytes are zero. */ + if ((fh_len > NSFS_FID_SIZE_U32_LATEST) && + memchr_inv((void *)fid + NSFS_FID_SIZE_U32_LATEST, 0, + fh_len - NSFS_FID_SIZE_U32_LATEST)) + return NULL; + + switch (fh_type) { + case FILEID_NSFS: + break; + default: + return NULL; + } + + scoped_guard(rcu) { + ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type); + if (!ns) + return NULL; + + VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id); + VFS_WARN_ON_ONCE(ns->ops->type != fid->ns_type); + VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum); + + if (!refcount_inc_not_zero(&ns->count)) + return NULL; + } + + switch (ns->ops->type) { +#ifdef CONFIG_CGROUPS + case CLONE_NEWCGROUP: + if (!current_in_namespace(to_cg_ns(ns))) + owning_ns = to_cg_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_IPC_NS + case CLONE_NEWIPC: + if (!current_in_namespace(to_ipc_ns(ns))) + owning_ns = to_ipc_ns(ns)->user_ns; + break; +#endif + case CLONE_NEWNS: + if (!current_in_namespace(to_mnt_ns(ns))) + owning_ns = to_mnt_ns(ns)->user_ns; + break; +#ifdef CONFIG_NET_NS + case CLONE_NEWNET: + if (!current_in_namespace(to_net_ns(ns))) + owning_ns = to_net_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_PID_NS + case CLONE_NEWPID: + if (!current_in_namespace(to_pid_ns(ns))) { + owning_ns = to_pid_ns(ns)->user_ns; + } else if (!READ_ONCE(to_pid_ns(ns)->child_reaper)) { + ns->ops->put(ns); + return ERR_PTR(-EPERM); + } + break; +#endif +#ifdef CONFIG_TIME_NS + case CLONE_NEWTIME: + if (!current_in_namespace(to_time_ns(ns))) + owning_ns = to_time_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_USER_NS + case CLONE_NEWUSER: + if (!current_in_namespace(to_user_ns(ns))) + owning_ns = to_user_ns(ns); + break; +#endif +#ifdef CONFIG_UTS_NS + case CLONE_NEWUTS: + if (!current_in_namespace(to_uts_ns(ns))) + owning_ns = to_uts_ns(ns)->user_ns; + break; 
+#endif + default: + return ERR_PTR(-EOPNOTSUPP); + } + + if (owning_ns && !ns_capable(owning_ns, CAP_SYS_ADMIN)) { + ns->ops->put(ns); + return ERR_PTR(-EPERM); + } + + /* path_from_stashed() unconditionally consumes the reference. */ + ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path); + if (ret) + return ERR_PTR(ret); + + return no_free_ptr(path.dentry); +} + +static int nsfs_export_permission(struct handle_to_path_ctx *ctx, + unsigned int oflags) +{ + /* nsfs_fh_to_dentry() performs all permission checks. */ + return 0; +} + +static struct file *nsfs_export_open(struct path *path, unsigned int oflags) +{ + return file_open_root(path, "", oflags, 0); +} + +static const struct export_operations nsfs_export_operations = { + .encode_fh = nsfs_encode_fh, + .fh_to_dentry = nsfs_fh_to_dentry, + .open = nsfs_export_open, + .permission = nsfs_export_permission, +}; + static int nsfs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &nsfs_ops; + ctx->eops = &nsfs_export_operations; ctx->dops = &ns_dentry_operations; fc->s_fs_info = (void *)&nsfs_stashed_ops; return 0; diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index cfb0dd1ea49c..3aac58a520c7 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -122,6 +122,12 @@ enum fid_type { FILEID_BCACHEFS_WITHOUT_PARENT = 0xb1, FILEID_BCACHEFS_WITH_PARENT = 0xb2, + /* + * + * 64 bit namespace identifier, 32 bit namespace type, 32 bit inode number. + */ + FILEID_NSFS = 0xf1, + /* * 64 bit unique kernfs id */ diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index 97d8d80d139f..fa86fe3c8bd3 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -53,4 +53,13 @@ enum init_ns_ino { MNT_NS_INIT_INO = 0xEFFFFFF8U, }; +struct nsfs_file_handle { + __u64 ns_id; + __u32 ns_type; + __u32 ns_inum; +}; + +#define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */ +#define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */ + #endif /* __LINUX_NSFS_H */ -- cgit v1.2.3 From e83f0b5d10dcf62833008327cb661c7d118bca85 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:52 +0200 Subject: nsfs: support exhaustive file handles Pidfd file handles are exhaustive meaning they don't require a handle on another pidfd to pass to open_by_handle_at() so it can derive the filesystem to decode in. Instead it can be derived from the file handle itself. The same is possible for namespace file handles. 
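A rough sketch of what this buys userspace, assuming the FD_NSFS_ROOT
sentinel added by this patch is visible to the compiler (a hypothetical
local fallback definition is included):

#define _GNU_SOURCE
#include <fcntl.h>

#ifndef FD_NSFS_ROOT
#define FD_NSFS_ROOT (-10003)	/* from this patch's uapi change */
#endif

/*
 * Re-open a namespace from a stored handle without holding any other
 * nsfs or pidfd anchor: the filesystem is derived from the handle
 * itself.
 */
static int reopen_ns(struct file_handle *fh)
{
	return open_by_handle_at(FD_NSFS_ROOT, fh, O_RDONLY);
}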
Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/fhandle.c | 6 ++++++ fs/internal.h | 1 + fs/nsfs.c | 10 ++++++++++ include/uapi/linux/fcntl.h | 1 + 4 files changed, 18 insertions(+) (limited to 'include/uapi/linux') diff --git a/fs/fhandle.c b/fs/fhandle.c index 68a7d2861c58..dd734d8828d0 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "internal.h" #include "mount.h" @@ -189,6 +190,11 @@ static int get_path_anchor(int fd, struct path *root) return 0; } + if (fd == FD_NSFS_ROOT) { + nsfs_get_root(root); + return 0; + } + return -EBADF; } diff --git a/fs/internal.h b/fs/internal.h index 38e8aab27bbd..a33d18ee5b74 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -355,3 +355,4 @@ int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path, int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); void pidfs_get_root(struct path *path); +void nsfs_get_root(struct path *path); diff --git a/fs/nsfs.c b/fs/nsfs.c index 926e2680414e..22765fcab18e 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -25,6 +25,14 @@ static struct vfsmount *nsfs_mnt; +static struct path nsfs_root_path = {}; + +void nsfs_get_root(struct path *path) +{ + *path = nsfs_root_path; + path_get(path); +} + static long ns_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); static const struct file_operations ns_file_operations = { @@ -598,4 +606,6 @@ void __init nsfs_init(void) if (IS_ERR(nsfs_mnt)) panic("can't set nsfs up\n"); nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER; + nsfs_root_path.mnt = nsfs_mnt; + nsfs_root_path.dentry = nsfs_mnt->mnt_root; } diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index f291ab4f94eb..3741ea1b73d8 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -111,6 +111,7 @@ #define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */ #define FD_PIDFS_ROOT -10002 /* Root of the pidfs filesystem */ +#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */ #define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */ /* Generic flags for the *at(2) family of syscalls. */ -- cgit v1.2.3 From f861225b9ee9cb2da1c7b2f5f921856cb8ca86bb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:53 +0200 Subject: nsfs: add missing id retrieval support The mount namespace has supported id retrieval for a while already. Add support for the other types as well. 
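A minimal sketch of the new ioctl from userspace, assuming installed
headers that carry the NS_GET_ID definition added below:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/nsfs.h>

int main(void)
{
	__u64 id;
	int fd = open("/proc/self/ns/uts", O_RDONLY);

	/* NS_GET_ID returns the 64-bit id of any namespace type. */
	if (fd < 0 || ioctl(fd, NS_GET_ID, &id) < 0) {
		perror("NS_GET_ID");
		return 1;
	}
	printf("uts namespace id: %llu\n", (unsigned long long)id);
	return 0;
}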
Signed-off-by: Christian Brauner
---
 fs/nsfs.c | 25 +++++++++++++------------
 include/uapi/linux/nsfs.h | 6 ++++--
 2 files changed, 17 insertions(+), 14 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 22765fcab18e..8484bc4dd3de 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -177,6 +177,7 @@ static bool nsfs_ioctl_valid(unsigned int cmd)
 case NS_GET_TGID_FROM_PIDNS:
 case NS_GET_PID_IN_PIDNS:
 case NS_GET_TGID_IN_PIDNS:
+ case NS_GET_ID:
 return true;
 }

@@ -226,18 +227,6 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
 argp = (uid_t __user *) arg;
 uid = from_kuid_munged(current_user_ns(), user_ns->owner);
 return put_user(uid, argp);
- case NS_GET_MNTNS_ID: {
- __u64 __user *idp;
- __u64 id;
-
- if (ns->ops->type != CLONE_NEWNS)
- return -EINVAL;
-
- mnt_ns = container_of(ns, struct mnt_namespace, ns);
- idp = (__u64 __user *)arg;
- id = mnt_ns->ns.ns_id;
- return put_user(id, idp);
- }
 case NS_GET_PID_FROM_PIDNS:
 fallthrough;
 case NS_GET_TGID_FROM_PIDNS:
@@ -283,6 +272,18 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
 ret = -ESRCH;
 return ret;
 }
+ case NS_GET_MNTNS_ID:
+ if (ns->ops->type != CLONE_NEWNS)
+ return -EINVAL;
+ fallthrough;
+ case NS_GET_ID: {
+ __u64 __user *idp;
+ __u64 id;
+
+ idp = (__u64 __user *)arg;
+ id = ns->ns_id;
+ return put_user(id, idp);
+ }
 }

 /* extensible ioctls */
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index fa86fe3c8bd3..5d5bf22464c9 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -16,8 +16,6 @@
 #define NS_GET_NSTYPE _IO(NSIO, 0x3)
 /* Get owner UID (in the caller's user namespace) for a user namespace */
 #define NS_GET_OWNER_UID _IO(NSIO, 0x4)
-/* Get the id for a mount namespace */
-#define NS_GET_MNTNS_ID _IOR(NSIO, 0x5, __u64)
 /* Translate pid from target pid namespace into the caller's pid namespace. */
 #define NS_GET_PID_FROM_PIDNS _IOR(NSIO, 0x6, int)
 /* Return thread-group leader id of pid in the callers pid namespace. */
@@ -42,6 +40,10 @@ struct mnt_ns_info {
 /* Get previous namespace. */
 #define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info)

+/* Retrieve namespace identifiers. */
+#define NS_GET_MNTNS_ID _IOR(NSIO, 5, __u64)
+#define NS_GET_ID _IOR(NSIO, 13, __u64)
+
 enum init_ns_ino {
 IPC_NS_INIT_INO = 0xEFFFFFFFU,
 UTS_NS_INIT_INO = 0xEFFFFFFEU,
-- cgit v1.2.3

From cc47f434271ba90c18c16e0bba360df38a8bc954 Mon Sep 17 00:00:00 2001
From: Christian Brauner
Date: Wed, 17 Sep 2025 12:28:04 +0200
Subject: nsfs: add inode number for anon namespace

Add an inode number for anonymous namespaces.

Reviewed-by: Jan Kara
Signed-off-by: Christian Brauner
---
 include/uapi/linux/nsfs.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index 5d5bf22464c9..e098759ec917 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -53,6 +53,9 @@ enum init_ns_ino {
 TIME_NS_INIT_INO = 0xEFFFFFFAU,
 NET_NS_INIT_INO = 0xEFFFFFF9U,
 MNT_NS_INIT_INO = 0xEFFFFFF8U,
+#ifdef __KERNEL__
+ MNT_NS_ANON_INO = 0xEFFFFFF7U,
+#endif
 };

 struct nsfs_file_handle {
-- cgit v1.2.3

From b73b8146d7ff68e245525adb944a4c998d423d59 Mon Sep 17 00:00:00 2001
From: Alasdair McWilliam
Date: Wed, 17 Sep 2025 10:55:42 +0100
Subject: rtnetlink: add needed_{head,tail}room attributes

Various network interface types make use of needed_{head,tail}room
values to efficiently reserve buffer space for additional encapsulation
headers, such as VXLAN, Geneve, IPSec, etc.
However, it is not currently possible to query these values in a generic way. Introduce ability to query the needed_{head,tail}room values of a network device via rtnetlink, such that applications that may wish to use these values can do so. For example, Cilium agent iterates over present devices based on user config (direct routing, vxlan, geneve, wireguard etc.) and in future will configure netkit in order to expose the needed_{head,tail}room into K8s pods. See b9ed315d3c4c ("netkit: Allow for configuring needed_{head,tail}room"). Suggested-by: Daniel Borkmann Signed-off-by: Alasdair McWilliam Reviewed-by: Daniel Borkmann Link: https://patch.msgid.link/20250917095543.14039-1-alasdair@mcwilliam.dev Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-link.yaml | 6 ++++++ include/uapi/linux/if_link.h | 2 ++ net/core/rtnetlink.c | 10 +++++++++- 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/rt-link.yaml b/Documentation/netlink/specs/rt-link.yaml index 6ab31f86854d..2a23e9699c0b 100644 --- a/Documentation/netlink/specs/rt-link.yaml +++ b/Documentation/netlink/specs/rt-link.yaml @@ -1057,6 +1057,12 @@ attribute-sets: - name: netns-immutable type: u8 + - + name: headroom + type: u16 + - + name: tailroom + type: u16 - name: prop-list-link-attrs subset-of: link-attrs diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 45f56c9f95d9..3b491d96e52e 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -379,6 +379,8 @@ enum { IFLA_DPLL_PIN, IFLA_MAX_PACING_OFFLOAD_HORIZON, IFLA_NETNS_IMMUTABLE, + IFLA_HEADROOM, + IFLA_TAILROOM, __IFLA_MAX }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 094b085cff20..d9e68ca84926 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1326,6 +1326,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_devlink_port_size(dev) + rtnl_dpll_pin_size(dev) + nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */ + + nla_total_size(2) /* IFLA_HEADROOM */ + + nla_total_size(2) /* IFLA_TAILROOM */ + 0; } @@ -2091,7 +2093,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, atomic_read(&dev->carrier_up_count)) || nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, - atomic_read(&dev->carrier_down_count))) + atomic_read(&dev->carrier_down_count)) || + nla_put_u16(skb, IFLA_HEADROOM, + READ_ONCE(dev->needed_headroom)) || + nla_put_u16(skb, IFLA_TAILROOM, + READ_ONCE(dev->needed_tailroom))) goto nla_put_failure; if (rtnl_fill_proto_down(skb, dev)) @@ -2243,6 +2249,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_GSO_IPV4_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1), [IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_NETNS_IMMUTABLE] = { .type = NLA_REJECT }, + [IFLA_HEADROOM] = { .type = NLA_REJECT }, + [IFLA_TAILROOM] = { .type = NLA_REJECT }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { -- cgit v1.2.3 From d6d673dd1e92b2bed0096e7e7e9fe5d7e7d2156c Mon Sep 17 00:00:00 2001 From: Ashwini Sahu Date: Mon, 8 Sep 2025 15:26:45 +0530 Subject: uapi: vduse: fix typo in comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a spelling mistake in vduse.h: "regsion" → "region" in the documentation for struct vduse_iova_info. No functional change. Signed-off-by: Ashwini Sahu Message-Id: <20250908095645.610336-1-ashwini@wisig.com> Signed-off-by: Michael S. 
Tsirkin --- include/uapi/linux/vduse.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h index 68a627d04afa..10ad71aa00d6 100644 --- a/include/uapi/linux/vduse.h +++ b/include/uapi/linux/vduse.h @@ -237,7 +237,7 @@ struct vduse_iova_umem { * struct vduse_iova_info - information of one IOVA region * @start: start of the IOVA region * @last: last of the IOVA region - * @capability: capability of the IOVA regsion + * @capability: capability of the IOVA region * @reserved: for future use, needs to be initialized to zero * * Structure used by VDUSE_IOTLB_GET_INFO ioctl to get information of -- cgit v1.2.3 From d9a2211dd3aee3ef29fc675f70a1941bc3f4f51f Mon Sep 17 00:00:00 2001 From: Haixu Cui Date: Mon, 8 Sep 2025 17:23:46 +0800 Subject: virtio: Add ID for virtio SPI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add VIRTIO_ID_SPI definition for virtio SPI. Signed-off-by: Haixu Cui Reviewed-by: Viresh Kumar Reviewed-by: Alex Bennée Link: https://patch.msgid.link/20250908092348.1283552-2-quic_haixcui@quicinc.com Signed-off-by: Mark Brown --- include/uapi/linux/virtio_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 7aa2eb766205..6c12db16faa3 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -68,6 +68,7 @@ #define VIRTIO_ID_AUDIO_POLICY 39 /* virtio audio policy */ #define VIRTIO_ID_BT 40 /* virtio bluetooth */ #define VIRTIO_ID_GPIO 41 /* virtio gpio */ +#define VIRTIO_ID_SPI 45 /* virtio spi */ /* * Virtio Transitional IDs -- cgit v1.2.3 From 6a1f3390fafeafe130b8128b3047452b92911a98 Mon Sep 17 00:00:00 2001 From: Haixu Cui Date: Mon, 8 Sep 2025 17:23:47 +0800 Subject: virtio-spi: Add virtio-spi.h Add virtio-spi.h header for virtio SPI. Signed-off-by: Haixu Cui Link: https://patch.msgid.link/20250908092348.1283552-3-quic_haixcui@quicinc.com Signed-off-by: Mark Brown --- MAINTAINERS | 6 ++ include/uapi/linux/virtio_spi.h | 181 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 187 insertions(+) create mode 100644 include/uapi/linux/virtio_spi.h (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 931e68699ab9..18cd254aa85b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -26803,6 +26803,12 @@ S: Maintained F: include/uapi/linux/virtio_snd.h F: sound/virtio/* +VIRTIO SPI DRIVER +M: Haixu Cui +L: virtualization@lists.linux.dev +S: Maintained +F: include/uapi/linux/virtio_spi.h + VIRTUAL BOX GUEST DEVICE DRIVER M: Hans de Goede M: Arnd Bergmann diff --git a/include/uapi/linux/virtio_spi.h b/include/uapi/linux/virtio_spi.h new file mode 100644 index 000000000000..8ab3c970cdd3 --- /dev/null +++ b/include/uapi/linux/virtio_spi.h @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* + * Copyright (C) 2023 OpenSynergy GmbH + * Copyright (C) 2025 Qualcomm Innovation Center, Inc. All rights reserved. 
+ */ +#ifndef _LINUX_VIRTIO_VIRTIO_SPI_H +#define _LINUX_VIRTIO_VIRTIO_SPI_H + +#include +#include +#include +#include + +/* Sample data on trailing clock edge */ +#define VIRTIO_SPI_CPHA _BITUL(0) +/* Clock is high when IDLE */ +#define VIRTIO_SPI_CPOL _BITUL(1) +/* Chip Select is active high */ +#define VIRTIO_SPI_CS_HIGH _BITUL(2) +/* Transmit LSB first */ +#define VIRTIO_SPI_MODE_LSB_FIRST _BITUL(3) +/* Loopback mode */ +#define VIRTIO_SPI_MODE_LOOP _BITUL(4) + +/** + * struct virtio_spi_config - All config fields are read-only for the + * Virtio SPI driver + * @cs_max_number: maximum number of chipselects the host SPI controller + * supports. + * @cs_change_supported: indicates if the host SPI controller supports toggling + * chipselect after each transfer in one message: + * 0: unsupported, chipselect will be kept in active state throughout the + * message transaction; + * 1: supported. + * Note: Message here contains a sequence of SPI transfers. + * @tx_nbits_supported: indicates the supported number of bits for writing: + * bit 0: DUAL (2-bit transfer), 1 for supported + * bit 1: QUAD (4-bit transfer), 1 for supported + * bit 2: OCTAL (8-bit transfer), 1 for supported + * other bits are reserved as 0, 1-bit transfer is always supported. + * @rx_nbits_supported: indicates the supported number of bits for reading: + * bit 0: DUAL (2-bit transfer), 1 for supported + * bit 1: QUAD (4-bit transfer), 1 for supported + * bit 2: OCTAL (8-bit transfer), 1 for supported + * other bits are reserved as 0, 1-bit transfer is always supported. + * @bits_per_word_mask: mask indicating which values of bits_per_word are + * supported. If not set, no limitation for bits_per_word. + * @mode_func_supported: indicates the following features are supported or not: + * bit 0-1: CPHA feature + * 0b00: invalid, should support at least one CPHA setting + * 0b01: supports CPHA=0 only + * 0b10: supports CPHA=1 only + * 0b11: supports CPHA=0 and CPHA=1. + * bit 2-3: CPOL feature + * 0b00: invalid, should support at least one CPOL setting + * 0b01: supports CPOL=0 only + * 0b10: supports CPOL=1 only + * 0b11: supports CPOL=0 and CPOL=1. + * bit 4: chipselect active high feature, 0 for unsupported and 1 for + * supported, chipselect active low is supported by default. + * bit 5: LSB first feature, 0 for unsupported and 1 for supported, + * MSB first is supported by default. + * bit 6: loopback mode feature, 0 for unsupported and 1 for supported, + * normal mode is supported by default. + * @max_freq_hz: the maximum clock rate supported in Hz unit, 0 means no + * limitation for transfer speed. + * @max_word_delay_ns: the maximum word delay supported, in nanoseconds. + * A value of 0 indicates that word delay is unsupported. + * Each transfer may consist of a sequence of words. + * @max_cs_setup_ns: the maximum delay supported after chipselect is asserted, + * in ns unit, 0 means no delay can be introduced after chipselect is + * asserted. + * @max_cs_hold_ns: the maximum delay supported before chipselect is deasserted, + * in ns unit, 0 means no delay can be introduced before chipselect + * is deasserted. + * @max_cs_inactive_ns: maximum delay supported after chipselect is deasserted, + * in ns unit, 0 means no delay can be introduced after chipselect is + * deasserted.
+ */ +struct virtio_spi_config { + __u8 cs_max_number; + __u8 cs_change_supported; +#define VIRTIO_SPI_RX_TX_SUPPORT_DUAL _BITUL(0) +#define VIRTIO_SPI_RX_TX_SUPPORT_QUAD _BITUL(1) +#define VIRTIO_SPI_RX_TX_SUPPORT_OCTAL _BITUL(2) + __u8 tx_nbits_supported; + __u8 rx_nbits_supported; + __le32 bits_per_word_mask; +#define VIRTIO_SPI_MF_SUPPORT_CPHA_0 _BITUL(0) +#define VIRTIO_SPI_MF_SUPPORT_CPHA_1 _BITUL(1) +#define VIRTIO_SPI_MF_SUPPORT_CPOL_0 _BITUL(2) +#define VIRTIO_SPI_MF_SUPPORT_CPOL_1 _BITUL(3) +#define VIRTIO_SPI_MF_SUPPORT_CS_HIGH _BITUL(4) +#define VIRTIO_SPI_MF_SUPPORT_LSB_FIRST _BITUL(5) +#define VIRTIO_SPI_MF_SUPPORT_LOOPBACK _BITUL(6) + __le32 mode_func_supported; + __le32 max_freq_hz; + __le32 max_word_delay_ns; + __le32 max_cs_setup_ns; + __le32 max_cs_hold_ns; + __le32 max_cs_inactive_ns; +}; + +/** + * struct spi_transfer_head - virtio SPI transfer descriptor + * @chip_select_id: chipselect index used by the SPI transfer. + * @bits_per_word: the number of bits in each SPI transfer word. + * @cs_change: whether to deselect the device after finishing this transfer + * before starting the next transfer, 0 means CS is kept asserted and + * 1 means CS is deasserted then asserted again. + * @tx_nbits: bus width for write transfer. + * 0,1: bus width is 1, also known as SINGLE + * 2 : bus width is 2, also known as DUAL + * 4 : bus width is 4, also known as QUAD + * 8 : bus width is 8, also known as OCTAL + * other values are invalid. + * @rx_nbits: bus width for read transfer. + * 0,1: bus width is 1, also known as SINGLE + * 2 : bus width is 2, also known as DUAL + * 4 : bus width is 4, also known as QUAD + * 8 : bus width is 8, also known as OCTAL + * other values are invalid. + * @reserved: for future use. + * @mode: SPI transfer mode. + * bit 0: CPHA, determines the timing (i.e. phase) of the data + * bits relative to the clock pulses. For CPHA=0, the + * "out" side changes the data on the trailing edge of the + * preceding clock cycle, while the "in" side captures the data + * on (or shortly after) the leading edge of the clock cycle. + * For CPHA=1, the "out" side changes the data on the leading + * edge of the current clock cycle, while the "in" side + * captures the data on (or shortly after) the trailing edge of + * the clock cycle. + * bit 1: CPOL, determines the polarity of the clock. CPOL=0 is a + * clock which idles at 0, and each cycle consists of a pulse + * of 1. CPOL=1 is a clock which idles at 1, and each cycle + * consists of a pulse of 0. + * bit 2: CS_HIGH, if 1, chip select active high, else active low. + * bit 3: LSB_FIRST, determines per-word bits-on-wire, if 0, MSB + * first, else LSB first. + * bit 4: LOOP, loopback mode. + * @freq: the transfer speed in Hz. + * @word_delay_ns: delay to be inserted between consecutive words of a + * transfer, in ns unit. + * @cs_setup_ns: delay to be introduced after CS is asserted, in ns + * unit. + * @cs_delay_hold_ns: delay to be introduced before CS is deasserted + * for each transfer, in ns unit. + * @cs_change_delay_inactive_ns: delay to be introduced after CS is + * deasserted and before next asserted, in ns unit. + */ +struct spi_transfer_head { + __u8 chip_select_id; + __u8 bits_per_word; + __u8 cs_change; + __u8 tx_nbits; + __u8 rx_nbits; + __u8 reserved[3]; + __le32 mode; + __le32 freq; + __le32 word_delay_ns; + __le32 cs_setup_ns; + __le32 cs_delay_hold_ns; + __le32 cs_change_delay_inactive_ns; +}; + +/** + * struct spi_transfer_result - virtio SPI transfer result + * @result: Transfer result code.
+ * VIRTIO_SPI_TRANS_OK: Transfer successful. + * VIRTIO_SPI_PARAM_ERR: Parameter error. + * VIRTIO_SPI_TRANS_ERR: Transfer error. + */ +struct spi_transfer_result { +#define VIRTIO_SPI_TRANS_OK 0 +#define VIRTIO_SPI_PARAM_ERR 1 +#define VIRTIO_SPI_TRANS_ERR 2 + __u8 result; +}; + +#endif /* #ifndef _LINUX_VIRTIO_VIRTIO_SPI_H */ -- cgit v1.2.3 From cd875625b475dc4e28ac302ccb3422cc9f678f89 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Thu, 18 Sep 2025 17:33:18 -0700 Subject: ptp: document behavior of PTP_STRICT_FLAGS Commit 6138e687c7b6 ("ptp: Introduce strict checking of external time stamp options.") added the PTP_STRICT_FLAGS to the set of flags supported for the external timestamp request ioctl. It is only supported by PTP_EXTTS_REQUEST2, as it was introduced alongside the new ioctls. Further, the kernel has always set this flag for PTP_EXTTS_REQUEST2 regardless of whether or not the user requested the behavior. This effectively means that the flag is not useful for userspace. If the user issues a PTP_EXTTS_REQUEST ioctl, the flag is ignored due to not being supported on the old ioctl. If the user issues a PTP_EXTTS_REQUEST2 ioctl, the flag will be set by the kernel regardless of whether the user set the flag in their structure. Add a comment documenting this behavior in the uAPI header file. Signed-off-by: Jacob Keller Reviewed-by: Vadim Fedorenko Acked-by: Richard Cochran Reviewed-by: Kory Maincent Tested-by: James Clark Link: https://patch.msgid.link/20250918-jk-fix-bcm-phy-supported-flags-v1-3-747b60407c9c@intel.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/ptp_clock.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index 18eefa6d93d6..2c3346e91dbe 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -37,6 +37,9 @@ /* * flag fields valid for the new PTP_EXTTS_REQUEST2 ioctl. + * + * Note: PTP_STRICT_FLAGS is always enabled by the kernel for + * PTP_EXTTS_REQUEST2 regardless of whether it is set by userspace. */ #define PTP_EXTTS_VALID_FLAGS (PTP_ENABLE_FEATURE | \ PTP_RISING_EDGE | \ -- cgit v1.2.3 From c9809f03c158f07eaa76c7dd3606fc0a184520f2 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 19 Sep 2025 14:08:58 +0200 Subject: mptcp: pm: netlink: only add server-side attr when true This attribute is a boolean. No need to add it to set it to 'false'. Indeed, the default value when this attribute is not set is naturally 'false'. A few bytes can then be saved by not adding this attribute if the connection is not on the server side. This prepares the future deprecation of this attribute, in favour of a new flag. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250919-net-next-mptcp-server-side-flag-v1-1-a97a5d561a8b@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 4 ++-- include/uapi/linux/mptcp_pm.h | 4 ++-- net/mptcp/pm_netlink.c | 4 +++- tools/testing/selftests/net/mptcp/userspace_pm.sh | 2 +- 4 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index d1b4829b580a..fc47a2931014 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -28,13 +28,13 @@ definitions: traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent.
Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, server-side, [flags]. + dport, [server-side], [flags]. - name: established doc: >- A MPTCP connection is established (can start new subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, server-side, [flags]. + dport, [server-side], [flags]. - name: closed doc: >- diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index 7359d34da446..bf44a5cf5b5a 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -16,10 +16,10 @@ * good time to allocate memory and send ADD_ADDR if needed. Depending on the * traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED * is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side, [flags]. + * sport, dport, [server-side], [flags]. * @MPTCP_EVENT_ESTABLISHED: A MPTCP connection is established (can start new * subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side, [flags]. + * sport, dport, [server-side], [flags]. * @MPTCP_EVENT_CLOSED: A MPTCP connection has stopped. Attribute: token. * @MPTCP_EVENT_ANNOUNCED: A new address has been announced by the peer. * Attributes: token, rem_id, family, daddr4 | daddr6 [, dport]. diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 483ddbb9ec40..33a6bf536c02 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -413,7 +413,9 @@ static int mptcp_event_created(struct sk_buff *skb, if (err) return err; - if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side))) + /* only set when it is the server side */ + if (READ_ONCE(msk->pm.server_side) && + nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, 1)) return -EMSGSIZE; if (READ_ONCE(msk->pm.remote_deny_join_id0)) diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 3d45991f24ed..87323942cb8a 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -241,7 +241,7 @@ make_connection() print_test "Established IP${is_v6} MPTCP Connection ns2 => ns1" if [ "${client_token}" != "" ] && [ "${server_token}" != "" ] && - [ "${client_serverside}" = 0 ] && [ "${server_serverside}" = 1 ] && + [ "${client_serverside:-0}" = 0 ] && [ "${server_serverside:-0}" = 1 ] && [ "${client_nojoin:-0}" = 0 ] && [ "${server_nojoin:-0}" = 1 ] then test_pass -- cgit v1.2.3 From 3d7ae91107b839ffeeb19730a2e2a46e0054bae8 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 19 Sep 2025 14:08:59 +0200 Subject: mptcp: pm: netlink: announce server-side flag Now that the 'flags' attribute is used, it seems interesting to add one flag for 'server-side', a boolean value. This is duplicating the info from the dedicated 'server-side' attribute, but it will be deprecated in the next commit, and removed in a few versions. 
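For illustration only (not part of the patch): a minimal userspace sketch of how a path-manager event listener might consume the new flag. It assumes the optional MPTCP_ATTR_FLAGS u16 attribute has already been extracted from an MPTCP_EVENT_CREATED or MPTCP_EVENT_ESTABLISHED notification; the netlink parsing boilerplate (e.g. via libmnl) is omitted.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <linux/mptcp.h>

/* have_flags: whether MPTCP_ATTR_FLAGS was present in the event */
static void report_side(bool have_flags, uint16_t flags)
{
	/* After this change, an absent attribute or a cleared bit simply
	 * means "client side"; only server-side connections set the bit.
	 */
	if (have_flags && (flags & MPTCP_PM_EV_FLAG_SERVER_SIDE))
		printf("connection created on the server side\n");
	else
		printf("connection created on the client side\n");
}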
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250919-net-next-mptcp-server-side-flag-v1-2-a97a5d561a8b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm_netlink.c | 11 +++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 5fd5b4cf75ca..95d621f6d598 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -32,6 +32,7 @@ #define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED _BITUL(1) #define MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 _BITUL(0) +#define MPTCP_PM_EV_FLAG_SERVER_SIDE _BITUL(1) #define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0) #define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 33a6bf536c02..aa0c73faaa6a 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -413,10 +413,13 @@ static int mptcp_event_created(struct sk_buff *skb, if (err) return err; - /* only set when it is the server side */ - if (READ_ONCE(msk->pm.server_side) && - nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, 1)) - return -EMSGSIZE; + if (READ_ONCE(msk->pm.server_side)) { + flags |= MPTCP_PM_EV_FLAG_SERVER_SIDE; + + /* only set when it is the server side */ + if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, 1)) + return -EMSGSIZE; + } if (READ_ONCE(msk->pm.remote_deny_join_id0)) flags |= MPTCP_PM_EV_FLAG_DENY_JOIN_ID0; -- cgit v1.2.3 From 5c967ebb551919661166305c0ff9422e41065c02 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 19 Sep 2025 14:09:02 +0200 Subject: mptcp: use _BITUL() instead of (1 << x) Simply use the proper way to declare bits, and align with all other flags declared in this file. No functional changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250919-net-next-mptcp-server-side-flag-v1-5-a97a5d561a8b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 95d621f6d598..15eef878690b 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -34,11 +34,11 @@ #define MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 _BITUL(0) #define MPTCP_PM_EV_FLAG_SERVER_SIDE _BITUL(1) -#define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0) -#define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) -#define MPTCP_PM_ADDR_FLAG_BACKUP (1 << 2) -#define MPTCP_PM_ADDR_FLAG_FULLMESH (1 << 3) -#define MPTCP_PM_ADDR_FLAG_IMPLICIT (1 << 4) +#define MPTCP_PM_ADDR_FLAG_SIGNAL _BITUL(0) +#define MPTCP_PM_ADDR_FLAG_SUBFLOW _BITUL(1) +#define MPTCP_PM_ADDR_FLAG_BACKUP _BITUL(2) +#define MPTCP_PM_ADDR_FLAG_FULLMESH _BITUL(3) +#define MPTCP_PM_ADDR_FLAG_IMPLICIT _BITUL(4) struct mptcp_info { __u8 mptcpi_subflows; -- cgit v1.2.3 From 349271568303695f0ac3563af153d2b4542f6986 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 21 Sep 2025 18:01:16 +0200 Subject: bpf: Implement signature verification for BPF programs This patch extends the BPF_PROG_LOAD command by adding three new fields to `union bpf_attr` in the user-space API: - signature: A pointer to the signature blob. - signature_size: The size of the signature blob. - keyring_id: The serial number of a loaded kernel keyring (e.g., the user or session keyring) containing the trusted public keys. When a BPF program is loaded with a signature, the kernel: 1.
Retrieves the trusted keyring using the provided `keyring_id`. 2. Verifies the supplied signature against the BPF program's instruction buffer. 3. If the signature is valid and was generated by a key in the trusted keyring, the program load proceeds. 4. If no signature is provided, the load proceeds as before, allowing for backward compatibility. LSMs can choose to restrict unsigned programs and implement a security policy. 5. If signature verification fails for any reason, the program is not loaded. Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250921160120.9711-2-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- crypto/asymmetric_keys/pkcs7_verify.c | 1 + include/linux/verification.h | 1 + include/uapi/linux/bpf.h | 10 ++++++++ kernel/bpf/helpers.c | 2 +- kernel/bpf/syscall.c | 45 ++++++++++++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 10 ++++++++ tools/lib/bpf/bpf.c | 2 +- 7 files changed, 68 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/crypto/asymmetric_keys/pkcs7_verify.c b/crypto/asymmetric_keys/pkcs7_verify.c index f0d4ff3c20a8..6d6475e3a9bf 100644 --- a/crypto/asymmetric_keys/pkcs7_verify.c +++ b/crypto/asymmetric_keys/pkcs7_verify.c @@ -429,6 +429,7 @@ int pkcs7_verify(struct pkcs7_message *pkcs7, /* Authattr presence checked in parser */ break; case VERIFYING_UNSPECIFIED_SIGNATURE: + case VERIFYING_BPF_SIGNATURE: if (pkcs7->data_type != OID_data) { pr_warn("Invalid unspecified sig (not pkcs7-data)\n"); return -EKEYREJECTED; diff --git a/include/linux/verification.h b/include/linux/verification.h index 4f3022d081c3..dec7f2beabfd 100644 --- a/include/linux/verification.h +++ b/include/linux/verification.h @@ -36,6 +36,7 @@ enum key_being_used_for { VERIFYING_KEY_SIGNATURE, VERIFYING_KEY_SELF_SIGNATURE, VERIFYING_UNSPECIFIED_SIGNATURE, + VERIFYING_BPF_SIGNATURE, NR__KEY_BEING_USED_FOR }; #ifdef CONFIG_SYSTEM_DATA_VERIFICATION diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0987b52d5648..f3b173e48b0f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1611,6 +1611,16 @@ union bpf_attr { * continuous. */ __u32 fd_array_cnt; + /* Pointer to a buffer containing the signature of the BPF + * program. + */ + __aligned_u64 signature; + /* Size of the signature buffer in bytes. */ + __u32 signature_size; + /* ID of the kernel keyring to be used for signature + * verification.
+ */ + __s32 keyring_id; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ef4ede8bb74f..969f63f8ca28 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3898,7 +3898,7 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, return verify_pkcs7_signature(data, data_len, sig, sig_len, trusted_keyring->key, - VERIFYING_UNSPECIFIED_SIGNATURE, NULL, + VERIFYING_BPF_SIGNATURE, NULL, NULL); #else return -EOPNOTSUPP; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cf7173b1bb83..8a3c3d26f6e2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -2785,8 +2786,44 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } } +static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr, + bool is_kernel) +{ + bpfptr_t usig = make_bpfptr(attr->signature, is_kernel); + struct bpf_dynptr_kern sig_ptr, insns_ptr; + struct bpf_key *key = NULL; + void *sig; + int err = 0; + + if (system_keyring_id_check(attr->keyring_id) == 0) + key = bpf_lookup_system_key(attr->keyring_id); + else + key = bpf_lookup_user_key(attr->keyring_id, 0); + + if (!key) + return -EINVAL; + + sig = kvmemdup_bpfptr(usig, attr->signature_size); + if (IS_ERR(sig)) { + bpf_key_put(key); + return -ENOMEM; + } + + bpf_dynptr_init(&sig_ptr, sig, BPF_DYNPTR_TYPE_LOCAL, 0, + attr->signature_size); + bpf_dynptr_init(&insns_ptr, prog->insnsi, BPF_DYNPTR_TYPE_LOCAL, 0, + prog->len * sizeof(struct bpf_insn)); + + err = bpf_verify_pkcs7_signature((struct bpf_dynptr *)&insns_ptr, + (struct bpf_dynptr *)&sig_ptr, key); + + bpf_key_put(key); + kvfree(sig); + return err; +} + /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD fd_array_cnt +#define BPF_PROG_LOAD_LAST_FIELD keyring_id static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { @@ -2950,6 +2987,12 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) /* eBPF programs must be GPL compatible to use GPL-ed functions */ prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0; + if (attr->signature) { + err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel); + if (err) + goto free_prog; + } + prog->orig_prog = NULL; prog->jited = 0; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0987b52d5648..f3b173e48b0f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1611,6 +1611,16 @@ union bpf_attr { * continuous. */ __u32 fd_array_cnt; + /* Pointer to a buffer containing the signature of the BPF + * program. + */ + __aligned_u64 signature; + /* Size of the signature buffer in bytes. */ + __u32 signature_size; + /* ID of the kernel keyring to be used for signature + * verification. 
+ */ + __s32 keyring_id; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 19ad7bcf0c2f..339b19797237 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -240,7 +240,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, size_t insn_cnt, struct bpf_prog_load_opts *opts) { - const size_t attr_sz = offsetofend(union bpf_attr, fd_array_cnt); + const size_t attr_sz = offsetofend(union bpf_attr, keyring_id); void *finfo = NULL, *linfo = NULL; const char *func_info, *line_info; __u32 log_size, log_level, attach_prog_fd, attach_btf_obj_fd; -- cgit v1.2.3 From 5c8fd7e2b5b0a527cf88740da122166695382a78 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 23 Sep 2025 12:24:00 +0100 Subject: bpf: bpf task work plumbing This patch adds necessary plumbing in verifier, syscall and maps to support handling new kfunc bpf_task_work_schedule and kernel structure bpf_task_work. The idea is similar to how we already handle bpf_wq and bpf_timer. verifier changes validate calls to bpf_task_work_schedule to make sure it is safe and expected invariants hold. btf part is required to detect bpf_task_work structure inside map value and store its offset, which will be used in the next patch to calculate key and value addresses. arraymap and hashtab changes are needed to handle freeing of the bpf_task_work: run code needed to deinitialize it, for example cancel task_work callback if possible. The use of bpf_task_work and proper implementation for kfuncs are introduced in the next patch. Signed-off-by: Mykyta Yatsenko Acked-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250923112404.668720-6-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 ++++ include/uapi/linux/bpf.h | 4 ++ kernel/bpf/arraymap.c | 8 +-- kernel/bpf/btf.c | 7 +++ kernel/bpf/hashtab.c | 19 ++++--- kernel/bpf/helpers.c | 40 ++++++++++++++ kernel/bpf/syscall.c | 16 +++++- kernel/bpf/verifier.c | 117 ++++++++++++++++++++++++++++++++++++++--- tools/include/uapi/linux/bpf.h | 4 ++ 9 files changed, 208 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dfc1a27b56d5..a2ab51fa8b0a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -209,6 +209,7 @@ enum btf_field_type { BPF_WORKQUEUE = (1 << 10), BPF_UPTR = (1 << 11), BPF_RES_SPIN_LOCK = (1 << 12), + BPF_TASK_WORK = (1 << 13), }; enum bpf_cgroup_storage_type { @@ -262,6 +263,7 @@ struct btf_record { int timer_off; int wq_off; int refcount_off; + int task_work_off; struct btf_field fields[]; }; @@ -363,6 +365,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) return "bpf_rb_node"; case BPF_REFCOUNT: return "bpf_refcount"; + case BPF_TASK_WORK: + return "bpf_task_work"; default: WARN_ON_ONCE(1); return "unknown"; @@ -401,6 +405,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) return sizeof(struct bpf_rb_node); case BPF_REFCOUNT: return sizeof(struct bpf_refcount); + case BPF_TASK_WORK: + return sizeof(struct bpf_task_work); default: WARN_ON_ONCE(1); return 0; @@ -433,6 +439,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) return __alignof__(struct bpf_rb_node); case BPF_REFCOUNT: return __alignof__(struct bpf_refcount); + case BPF_TASK_WORK: + return __alignof__(struct bpf_task_work); default: WARN_ON_ONCE(1); return 0; @@ -464,6 +472,7 @@ static inline void 
bpf_obj_init_field(const struct btf_field *field, void *addr) case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: + case BPF_TASK_WORK: break; default: WARN_ON_ONCE(1); @@ -600,6 +609,7 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); void bpf_wq_cancel_and_free(void *timer); +void bpf_task_work_cancel_and_free(void *timer); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock); void bpf_rb_root_free(const struct btf_field *field, void *rb_root, @@ -2426,6 +2436,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec); bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); void bpf_obj_free_timer(const struct btf_record *rec, void *obj); void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj); +void bpf_obj_free_task_work(const struct btf_record *rec, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f3b173e48b0f..ae83d8649ef1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -7436,6 +7436,10 @@ struct bpf_timer { __u64 __opaque[2]; } __attribute__((aligned(8))); +struct bpf_task_work { + __u64 __opaque; +} __attribute__((aligned(8))); + struct bpf_wq { __u64 __opaque[2]; } __attribute__((aligned(8))); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 26d5dda989bc..80b1765a3159 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -443,7 +443,7 @@ static void *array_map_vmalloc_addr(struct bpf_array *array) return (void *)round_down((unsigned long)array, PAGE_SIZE); } -static void array_map_free_timers_wq(struct bpf_map *map) +static void array_map_free_internal_structs(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; @@ -451,12 +451,14 @@ static void array_map_free_timers_wq(struct bpf_map *map) /* We don't reset or free fields other than timer and workqueue * on uref dropping to zero. 
*/ - if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) { + if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { for (i = 0; i < array->map.max_entries; i++) { if (btf_record_has_field(map->record, BPF_TIMER)) bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); if (btf_record_has_field(map->record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); + if (btf_record_has_field(map->record, BPF_TASK_WORK)) + bpf_obj_free_task_work(map->record, array_map_elem_ptr(array, i)); } } } @@ -795,7 +797,7 @@ const struct bpf_map_ops array_map_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, - .map_release_uref = array_map_free_timers_wq, + .map_release_uref = array_map_free_internal_structs, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c51e16bbf0c1..9f47a3aa7ff8 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3490,6 +3490,7 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_ { BPF_RES_SPIN_LOCK, "bpf_res_spin_lock", true }, { BPF_TIMER, "bpf_timer", true }, { BPF_WORKQUEUE, "bpf_wq", true }, + { BPF_TASK_WORK, "bpf_task_work", true }, { BPF_LIST_HEAD, "bpf_list_head", false }, { BPF_LIST_NODE, "bpf_list_node", false }, { BPF_RB_ROOT, "bpf_rb_root", false }, @@ -3675,6 +3676,7 @@ static int btf_find_field_one(const struct btf *btf, case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: + case BPF_TASK_WORK: ret = btf_find_struct(btf, var_type, off, sz, field_type, info_cnt ? &info[0] : &tmp); if (ret < 0) @@ -3967,6 +3969,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type rec->timer_off = -EINVAL; rec->wq_off = -EINVAL; rec->refcount_off = -EINVAL; + rec->task_work_off = -EINVAL; for (i = 0; i < cnt; i++) { field_type_size = btf_field_type_size(info_arr[i].type); if (info_arr[i].off + field_type_size > value_size) { @@ -4006,6 +4009,10 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* Cache offset for faster lookup at runtime */ rec->wq_off = rec->fields[i].offset; break; + case BPF_TASK_WORK: + WARN_ON_ONCE(rec->task_work_off >= 0); + rec->task_work_off = rec->fields[i].offset; + break; case BPF_REFCOUNT: WARN_ON_ONCE(rec->refcount_off >= 0); /* Cache offset for faster lookup at runtime */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 2319f8f8fa3e..c2fcd0cd51e5 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -223,9 +223,12 @@ static void htab_free_internal_structs(struct bpf_htab *htab, struct htab_elem * if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, htab_elem_value(elem, htab->map.key_size)); + if (btf_record_has_field(htab->map.record, BPF_TASK_WORK)) + bpf_obj_free_task_work(htab->map.record, + htab_elem_value(elem, htab->map.key_size)); } -static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) +static void htab_free_prealloced_internal_structs(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; int i; @@ -1495,7 +1498,7 @@ static void delete_all_elements(struct bpf_htab *htab) } } -static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) +static void htab_free_malloced_internal_structs(struct bpf_htab *htab) { int i; @@ -1514,16 +1517,16 @@ static void 
htab_free_malloced_timers_and_wq(struct bpf_htab *htab) rcu_read_unlock(); } -static void htab_map_free_timers_and_wq(struct bpf_map *map) +static void htab_map_free_internal_structs(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); /* We only free timer and workqueue on uref dropping to zero */ - if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) { + if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { if (!htab_is_prealloc(htab)) - htab_free_malloced_timers_and_wq(htab); + htab_free_malloced_internal_structs(htab); else - htab_free_prealloced_timers_and_wq(htab); + htab_free_prealloced_internal_structs(htab); } } @@ -2255,7 +2258,7 @@ const struct bpf_map_ops htab_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers_and_wq, + .map_release_uref = htab_map_free_internal_structs, .map_lookup_elem = htab_map_lookup_elem, .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, .map_update_elem = htab_map_update_elem, @@ -2276,7 +2279,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers_and_wq, + .map_release_uref = htab_map_free_internal_structs, .map_lookup_elem = htab_lru_map_lookup_elem, .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 969f63f8ca28..7f5e528df13b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3906,8 +3906,48 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, } #endif /* CONFIG_KEYS */ +typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value); + +/** + * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode + * @task: Task struct for which callback should be scheduled + * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping + * @map__map: bpf_map that embeds struct bpf_task_work in the values + * @callback: pointer to BPF subprogram to call + * @aux__prog: user should pass NULL + * + * Return: 0 if task work has been scheduled successfully, negative error code otherwise + */ +__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + void *aux__prog) +{ + return 0; +} + +/** + * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode + * @task: Task struct for which callback should be scheduled + * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping + * @map__map: bpf_map that embeds struct bpf_task_work in the values + * @callback: pointer to BPF subprogram to call + * @aux__prog: user should pass NULL + * + * Return: 0 if task work has been scheduled successfully, negative error code otherwise + */ +__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + void *aux__prog) +{ + return 0; +} + __bpf_kfunc_end_defs(); +void bpf_task_work_cancel_and_free(void *val) +{ +} + BTF_KFUNCS_START(generic_btf_ids) #ifdef CONFIG_CRASH_DUMP BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) diff --git a/kernel/bpf/syscall.c 
b/kernel/bpf/syscall.c index 8a3c3d26f6e2..adb05d235011 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -674,6 +674,7 @@ void btf_record_free(struct btf_record *rec) case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: + case BPF_TASK_WORK: /* Nothing to release */ break; default: @@ -727,6 +728,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: + case BPF_TASK_WORK: /* Nothing to acquire */ break; default: @@ -785,6 +787,13 @@ void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj) bpf_wq_cancel_and_free(obj + rec->wq_off); } +void bpf_obj_free_task_work(const struct btf_record *rec, void *obj) +{ + if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TASK_WORK))) + return; + bpf_task_work_cancel_and_free(obj + rec->task_work_off); +} + void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -809,6 +818,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) case BPF_WORKQUEUE: bpf_wq_cancel_and_free(field_ptr); break; + case BPF_TASK_WORK: + bpf_task_work_cancel_and_free(field_ptr); + break; case BPF_KPTR_UNREF: WRITE_ONCE(*(u64 *)field_ptr, 0); break; @@ -1240,7 +1252,8 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | - BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, + BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR | + BPF_TASK_WORK, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; @@ -1272,6 +1285,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, break; case BPF_TIMER: case BPF_WORKQUEUE: + case BPF_TASK_WORK: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 02b93a54a446..ceeb0ffe7d67 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2224,10 +2224,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) /* transfer reg's id which is unique for every map_lookup_elem * as UID of the inner map. 
*/ - if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER)) - reg->map_uid = reg->id; - if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE)) + if (btf_record_has_field(map->inner_map_meta->record, + BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { reg->map_uid = reg->id; + } } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || @@ -8461,6 +8461,9 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, case BPF_TIMER: field_off = map->record->timer_off; break; + case BPF_TASK_WORK: + field_off = map->record->task_work_off; + break; default: verifier_bug(env, "unsupported BTF field type: %s\n", struct_name); return -EINVAL; @@ -8514,6 +8517,26 @@ static int process_wq_func(struct bpf_verifier_env *env, int regno, return 0; } +static int process_task_work_func(struct bpf_verifier_env *env, int regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_map *map = reg->map_ptr; + int err; + + err = check_map_field_pointer(env, regno, BPF_TASK_WORK); + if (err) + return err; + + if (meta->map.ptr) { + verifier_bug(env, "Two map pointers in a bpf_task_work helper"); + return -EFAULT; + } + meta->map.uid = reg->map_uid; + meta->map.ptr = map; + return 0; +} + static int process_kptr_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { @@ -10343,6 +10366,8 @@ typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env, struct bpf_func_state *callee, int insn_idx); +static bool is_task_work_add_kfunc(u32 func_id); + static int set_callee_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx); @@ -10561,7 +10586,8 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins env->subprog_info[subprog].is_async_cb = true; async_cb = push_async_cb(env, env->subprog_info[subprog].start, insn_idx, subprog, - is_bpf_wq_set_callback_impl_kfunc(insn->imm)); + is_bpf_wq_set_callback_impl_kfunc(insn->imm) || + is_task_work_add_kfunc(insn->imm)); if (!async_cb) return -EFAULT; callee = async_cb->frame[0]; @@ -10876,6 +10902,36 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_task_work_schedule_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + struct bpf_map *map_ptr = caller->regs[BPF_REG_3].map_ptr; + + /* + * callback_fn(struct bpf_map *map, void *key, void *value); + */ + callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP; + __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + callee->regs[BPF_REG_1].map_ptr = map_ptr; + + callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY; + __mark_reg_known_zero(&callee->regs[BPF_REG_2]); + callee->regs[BPF_REG_2].map_ptr = map_ptr; + + callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE; + __mark_reg_known_zero(&callee->regs[BPF_REG_3]); + callee->regs[BPF_REG_3].map_ptr = map_ptr; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + callee->in_async_callback_fn = true; + callee->callback_ret_range = retval_range(S32_MIN, S32_MAX); + return 0; +} + static bool is_rbtree_lock_required_kfunc(u32 btf_id); /* Are we currently verifying the callback for a rbtree helper that must @@ -12000,6 +12056,7 @@ enum { KF_ARG_RB_NODE_ID, KF_ARG_WORKQUEUE_ID, KF_ARG_RES_SPIN_LOCK_ID, + 
KF_ARG_TASK_WORK_ID, }; BTF_ID_LIST(kf_arg_btf_ids) @@ -12010,6 +12067,7 @@ BTF_ID(struct, bpf_rb_root) BTF_ID(struct, bpf_rb_node) BTF_ID(struct, bpf_wq) BTF_ID(struct, bpf_res_spin_lock) +BTF_ID(struct, bpf_task_work) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -12058,6 +12116,11 @@ static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); } +static bool is_kfunc_arg_task_work(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TASK_WORK_ID); +} + static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg) { return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID); @@ -12145,6 +12208,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_WORKQUEUE, KF_ARG_PTR_TO_IRQ_FLAG, KF_ARG_PTR_TO_RES_SPIN_LOCK, + KF_ARG_PTR_TO_TASK_WORK, }; enum special_kfunc_type { @@ -12194,6 +12258,8 @@ enum special_kfunc_type { KF_bpf_res_spin_lock_irqsave, KF_bpf_res_spin_unlock_irqrestore, KF___bpf_trap, + KF_bpf_task_work_schedule_signal, + KF_bpf_task_work_schedule_resume, }; BTF_ID_LIST(special_kfunc_list) @@ -12262,6 +12328,14 @@ BTF_ID(func, bpf_res_spin_unlock) BTF_ID(func, bpf_res_spin_lock_irqsave) BTF_ID(func, bpf_res_spin_unlock_irqrestore) BTF_ID(func, __bpf_trap) +BTF_ID(func, bpf_task_work_schedule_signal) +BTF_ID(func, bpf_task_work_schedule_resume) + +static bool is_task_work_add_kfunc(u32 func_id) +{ + return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] || + func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume]; +} static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -12352,6 +12426,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_wq(meta->btf, &args[argno])) return KF_ARG_PTR_TO_WORKQUEUE; + if (is_kfunc_arg_task_work(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_TASK_WORK; + if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) return KF_ARG_PTR_TO_IRQ_FLAG; @@ -12695,7 +12772,8 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id) static bool is_async_callback_calling_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl] || + is_task_work_add_kfunc(btf_id); } static bool is_bpf_throw_kfunc(struct bpf_insn *insn) @@ -13076,7 +13154,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "pointer in R%d isn't map pointer\n", regno); return -EINVAL; } - if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) { + if (meta->map.ptr && (reg->map_ptr->record->wq_off >= 0 || + reg->map_ptr->record->task_work_off >= 0)) { /* Use map_uid (which is unique id of inner map) to reject: * inner_map1 = bpf_map_lookup_elem(outer_map, key1) * inner_map2 = bpf_map_lookup_elem(outer_map, key2) @@ -13091,6 +13170,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ */ if (meta->map.ptr != reg->map_ptr || meta->map.uid != reg->map_uid) { + if (reg->map_ptr->record->task_work_off >= 0) { + verbose(env, + "bpf_task_work pointer in R2 map_uid=%d doesn't match map pointer in R3 map_uid=%d\n", + meta->map.uid, reg->map_uid); + return -EINVAL; + } verbose(env, "workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n", meta->map.uid, reg->map_uid); @@ -13129,6 +13214,7 @@ static int check_kfunc_args(struct 
bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_REFCOUNTED_KPTR: case KF_ARG_PTR_TO_CONST_STR: case KF_ARG_PTR_TO_WORKQUEUE: + case KF_ARG_PTR_TO_TASK_WORK: case KF_ARG_PTR_TO_IRQ_FLAG: case KF_ARG_PTR_TO_RES_SPIN_LOCK: break; @@ -13422,6 +13508,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret < 0) return ret; break; + case KF_ARG_PTR_TO_TASK_WORK: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#%d doesn't point to a map value\n", i); + return -EINVAL; + } + ret = process_task_work_func(env, regno, meta); + if (ret < 0) + return ret; + break; case KF_ARG_PTR_TO_IRQ_FLAG: if (reg->type != PTR_TO_STACK) { verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i); @@ -13788,6 +13883,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (is_task_work_add_kfunc(meta.func_id)) { + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_task_work_schedule_callback_state); + if (err) { + verbose(env, "kfunc %s#%d failed callback verification\n", + func_name, meta.func_id); + return err; + } + } + rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f3b173e48b0f..ae83d8649ef1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -7436,6 +7436,10 @@ struct bpf_timer { __u64 __opaque[2]; } __attribute__((aligned(8))); +struct bpf_task_work { + __u64 __opaque; +} __attribute__((aligned(8))); + struct bpf_wq { __u64 __opaque[2]; } __attribute__((aligned(8))); -- cgit v1.2.3 From 1c1658058c99bcfd3b2347e587a556986037f80a Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Wed, 17 Sep 2025 20:10:35 +0200 Subject: hwmon: (dell-smm) Add support for automatic fan mode Many machines treat fan state 3 as some sort of automatic mode, which is superior to the separate SMM calls for switching to automatic fan mode for two reasons: - the fan control mode can be controlled for each fan separately - the current fan control mode can be retrieved from the BIOS On some machines however, this special fan state does not exist. Fan state 3 acts like a regular fan state on such machines or does not exist at all. Such machines usually use separate SMM calls for enabling/disabling automatic fan control. Add support for it. If the machine supports separate SMM calls for changing the fan control mode, then the other interface is ignored. Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20250917181036.10972-4-W_Armin@gmx.de Signed-off-by: Guenter Roeck --- Documentation/hwmon/dell-smm-hwmon.rst | 56 +++++++++++++++---------- drivers/hwmon/dell-smm-hwmon.c | 74 ++++++++++++++++++++++++++++------ include/uapi/linux/i8k.h | 2 + 3 files changed, 98 insertions(+), 34 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/hwmon/dell-smm-hwmon.rst b/Documentation/hwmon/dell-smm-hwmon.rst index 5a4edb6565cf..3e4e2d916ac5 100644 --- a/Documentation/hwmon/dell-smm-hwmon.rst +++ b/Documentation/hwmon/dell-smm-hwmon.rst @@ -38,7 +38,7 @@ fan[1-4]_min RO Minimal Fan speed in RPM fan[1-4]_max RO Maximal Fan speed in RPM fan[1-4]_target RO Expected Fan speed in RPM pwm[1-4] RW Control the fan PWM duty-cycle. -pwm1_enable WO Enable or disable automatic BIOS fan +pwm[1-4]_enable RW/WO Enable or disable automatic BIOS fan control (not supported on all laptops, see below for details). 
temp[1-10]_input RO Temperature reading in milli-degrees @@ -49,26 +49,40 @@ temp[1-10]_label RO Temperature sensor label. Due to the nature of the SMM interface, each pwmX attribute controls fan number X. -Disabling automatic BIOS fan control ------------------------------------- - -On some laptops the BIOS automatically sets fan speed every few -seconds. Therefore the fan speed set by mean of this driver is quickly -overwritten. - -There is experimental support for disabling automatic BIOS fan -control, at least on laptops where the corresponding SMM command is -known, by writing the value ``1`` in the attribute ``pwm1_enable`` -(writing ``2`` enables automatic BIOS control again). Even if you have -more than one fan, all of them are set to either enabled or disabled -automatic fan control at the same time and, notwithstanding the name, -``pwm1_enable`` sets automatic control for all fans. - -If ``pwm1_enable`` is not available, then it means that SMM codes for -enabling and disabling automatic BIOS fan control are not whitelisted -for your hardware. It is possible that codes that work for other -laptops actually work for yours as well, or that you have to discover -new codes. +Enabling/Disabling automatic BIOS fan control +--------------------------------------------- + +There exist two methods for enabling/disabling automatic BIOS fan control: + +1. Separate SMM commands to enable/disable automatic BIOS fan control for all fans. + +2. A special fan state that enables automatic BIOS fan control for an individual fan. + +The driver cannot reliably detect which method should be used on a given +device, so instead the following heuristic is used: + +- use fan state 3 for enabling BIOS fan control if the maximum fan state + settable by the user is smaller than 3 (default setting). + +- use separate SMM commands if the device is whitelisted to support them. + +When using the first method, each fan will have a standard ``pwmX_enable`` +sysfs attribute. Writing ``1`` into this attribute will disable automatic +BIOS fan control for the associated fan and set it to maximum speed. Enabling +BIOS fan control again can be achieved by writing ``2`` into this attribute. +Reading this sysfs attribute returns the current setting as reported by +the underlying hardware. + +When using the second method however, only the ``pwm1_enable`` sysfs attribute +will be available to enable/disable automatic BIOS fan control globally for all +fans available on a given device. Additionally, this sysfs attribute is write-only +as there exists no SMM command for reading the current fan control setting. + +If no ``pwmX_enable`` attributes are available, then it means that the driver +cannot use the first method and the SMM codes for enabling and disabling automatic +BIOS fan control are not whitelisted for your device. It is possible that codes +that work for other laptops actually work for yours as well, or that you have to +discover new codes. Check the list ``i8k_whitelist_fan_control`` in file ``drivers/hwmon/dell-smm-hwmon.c`` in the kernel tree: as a first diff --git a/drivers/hwmon/dell-smm-hwmon.c b/drivers/hwmon/dell-smm-hwmon.c index 36576db09706..79befa13b699 100644 --- a/drivers/hwmon/dell-smm-hwmon.c +++ b/drivers/hwmon/dell-smm-hwmon.c @@ -764,6 +764,13 @@ static int dell_smm_get_cur_state(struct thermal_cooling_device *dev, unsigned l if (ret < 0) return ret; + /* + * A fan state bigger than i8k_fan_max might indicate that + * the fan is currently in automatic mode.
+ */ + if (ret > cdata->data->i8k_fan_max) + return -ENODATA; + *state = ret; return 0; @@ -851,7 +858,14 @@ static umode_t dell_smm_is_visible(const void *drvdata, enum hwmon_sensor_types break; case hwmon_pwm_enable: - if (auto_fan) + if (auto_fan) { + /* + * The setting affects all fans, so only create a + * single attribute. + */ + if (channel != 1) + return 0; + /* * There is no command for retrieve the current status * from BIOS, and userspace/firmware itself can change @@ -859,6 +873,10 @@ static umode_t dell_smm_is_visible(const void *drvdata, enum hwmon_sensor_types * Thus we can only provide write-only access for now. */ return 0200; + } + + if (data->fan[channel] && data->i8k_fan_max < I8K_FAN_AUTO) + return 0644; break; default: @@ -928,14 +946,28 @@ static int dell_smm_read(struct device *dev, enum hwmon_sensor_types type, u32 a } break; case hwmon_pwm: + ret = i8k_get_fan_status(data, channel); + if (ret < 0) + return ret; + switch (attr) { case hwmon_pwm_input: - ret = i8k_get_fan_status(data, channel); - if (ret < 0) - return ret; + /* + * A fan state bigger than i8k_fan_max might indicate that + * the fan is currently in automatic mode. + */ + if (ret > data->i8k_fan_max) + return -ENODATA; *val = clamp_val(ret * data->i8k_pwm_mult, 0, 255); + return 0; + case hwmon_pwm_enable: + if (ret == I8K_FAN_AUTO) + *val = 2; + else + *val = 1; + return 0; default: break; @@ -1022,16 +1054,32 @@ static int dell_smm_write(struct device *dev, enum hwmon_sensor_types type, u32 return 0; case hwmon_pwm_enable: - if (!val) - return -EINVAL; - - if (val == 1) + switch (val) { + case 1: enable = false; - else + break; + case 2: enable = true; + break; + default: + return -EINVAL; + } mutex_lock(&data->i8k_mutex); - err = i8k_enable_fan_auto_mode(data, enable); + if (auto_fan) { + err = i8k_enable_fan_auto_mode(data, enable); + } else { + /* + * When putting the fan into manual control mode we have to ensure + * that the device does not overheat until the userspace fan control + * software takes over. Because of this we set the fan speed to + * i8k_fan_max when disabling automatic fan control. + */ + if (enable) + err = i8k_set_fan(data, channel, I8K_FAN_AUTO); + else + err = i8k_set_fan(data, channel, data->i8k_fan_max); + } mutex_unlock(&data->i8k_mutex); if (err < 0) @@ -1082,9 +1130,9 @@ static const struct hwmon_channel_info * const dell_smm_info[] = { ), HWMON_CHANNEL_INFO(pwm, HWMON_PWM_INPUT | HWMON_PWM_ENABLE, - HWMON_PWM_INPUT, - HWMON_PWM_INPUT, - HWMON_PWM_INPUT + HWMON_PWM_INPUT | HWMON_PWM_ENABLE, + HWMON_PWM_INPUT | HWMON_PWM_ENABLE, + HWMON_PWM_INPUT | HWMON_PWM_ENABLE ), NULL }; diff --git a/include/uapi/linux/i8k.h b/include/uapi/linux/i8k.h index 268e6268f6c8..a16e4049710f 100644 --- a/include/uapi/linux/i8k.h +++ b/include/uapi/linux/i8k.h @@ -36,6 +36,8 @@ #define I8K_FAN_LOW 1 #define I8K_FAN_HIGH 2 #define I8K_FAN_TURBO 3 +/* Many machines treat this mode as some sort of automatic mode */ +#define I8K_FAN_AUTO 3 #define I8K_FAN_MAX I8K_FAN_TURBO #define I8K_VOL_UP 1 -- cgit v1.2.3 From 94040a8f484576cb1b7df3b2e93118c3b3e3aff4 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 23 Sep 2025 15:58:28 +0900 Subject: can: netlink: document which symbols are FD specific The CAN XL netlink interface will also have data bitrate and TDC parameters. The current FD parameters do not have a prefix in their names to differentiate them. Because the netlink interface is part of the UAPI, it is unfortunately not feasible to rename the existing symbols to add an FD_ prefix. 
The best alternative is to add a comment for each of the symbols to inform the reader which parts are CAN FD specific. While at it, fix a typo: transiver -> transceiver. Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250923-canxl-netlink-prep-v4-3-e720d28f66fe@kernel.org Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/netlink.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h index 02ec32d69474..ef62f56eaaef 100644 --- a/include/uapi/linux/can/netlink.h +++ b/include/uapi/linux/can/netlink.h @@ -101,8 +101,8 @@ struct can_ctrlmode { #define CAN_CTRLMODE_PRESUME_ACK 0x40 /* Ignore missing CAN ACKs */ #define CAN_CTRLMODE_FD_NON_ISO 0x80 /* CAN FD in non-ISO mode */ #define CAN_CTRLMODE_CC_LEN8_DLC 0x100 /* Classic CAN DLC option */ -#define CAN_CTRLMODE_TDC_AUTO 0x200 /* CAN transiver automatically calculates TDCV */ -#define CAN_CTRLMODE_TDC_MANUAL 0x400 /* TDCV is manually set up by user */ +#define CAN_CTRLMODE_TDC_AUTO 0x200 /* FD transceiver automatically calculates TDCV */ +#define CAN_CTRLMODE_TDC_MANUAL 0x400 /* FD TDCV is manually set up by user */ /* * CAN device statistics @@ -129,14 +129,14 @@ enum { IFLA_CAN_RESTART_MS, IFLA_CAN_RESTART, IFLA_CAN_BERR_COUNTER, - IFLA_CAN_DATA_BITTIMING, - IFLA_CAN_DATA_BITTIMING_CONST, + IFLA_CAN_DATA_BITTIMING, /* FD */ + IFLA_CAN_DATA_BITTIMING_CONST, /* FD */ IFLA_CAN_TERMINATION, IFLA_CAN_TERMINATION_CONST, IFLA_CAN_BITRATE_CONST, - IFLA_CAN_DATA_BITRATE_CONST, + IFLA_CAN_DATA_BITRATE_CONST, /* FD */ IFLA_CAN_BITRATE_MAX, - IFLA_CAN_TDC, + IFLA_CAN_TDC, /* FD */ IFLA_CAN_CTRLMODE_EXT, /* add new constants above here */ @@ -145,7 +145,7 @@ enum { }; /* - * CAN FD Transmitter Delay Compensation (TDC) + * CAN FD/XL Transmitter Delay Compensation (TDC) * * Please refer to struct can_tdc_const and can_tdc in * include/linux/can/bittiming.h for further details. -- cgit v1.2.3 From 04a91570ac67760301e5458d65eaf1342ecca314 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 16 Sep 2025 23:22:49 -0400 Subject: ext4: implement new ioctls to set and get superblock parameters Implement the EXT4_IOC_GET_TUNE_SB_PARAM and EXT4_IOC_SET_TUNE_SB_PARAM ioctls, which allow certain superblock parameters to be set while the file system is mounted, without needing write access to the block device. Reviewed-by: Darrick J.
Wong Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o Message-ID: <20250916-tune2fs-v2-3-d594dc7486f0@mit.edu> Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 312 ++++++++++++++++++++++++++++++++++++++++++++-- include/uapi/linux/ext4.h | 53 ++++++++ 2 files changed, 358 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 84e3c73952d7..a93a7baae990 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -27,14 +27,16 @@ #include "fsmap.h" #include <trace/events/ext4.h> -typedef void ext4_update_sb_callback(struct ext4_super_block *es, - const void *arg); +typedef void ext4_update_sb_callback(struct ext4_sb_info *sbi, + struct ext4_super_block *es, + const void *arg); /* * Superblock modification callback function for changing file system * label */ -static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg) +static void ext4_sb_setlabel(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) { /* Sanity check, this should never happen */ BUILD_BUG_ON(sizeof(es->s_volume_name) < EXT4_LABEL_MAX); @@ -46,7 +48,8 @@ static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg) * Superblock modification callback function for changing file system * UUID. */ -static void ext4_sb_setuuid(struct ext4_super_block *es, const void *arg) +static void ext4_sb_setuuid(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) { memcpy(es->s_uuid, (__u8 *)arg, UUID_SIZE); } @@ -71,7 +74,7 @@ int ext4_update_primary_sb(struct super_block *sb, handle_t *handle, goto out_err; lock_buffer(bh); - func(es, arg); + func(sbi, es, arg); ext4_superblock_csum_set(sb); unlock_buffer(bh); @@ -149,7 +152,7 @@ static int ext4_update_backup_sb(struct super_block *sb, unlock_buffer(bh); goto out_bh; } - func(es, arg); + func(EXT4_SB(sb), es, arg); if (ext4_has_feature_metadata_csum(sb)) es->s_checksum = ext4_superblock_csum(es); set_buffer_uptodate(bh); @@ -1230,6 +1233,295 @@ static int ext4_ioctl_setuuid(struct file *filp, return ret; } + +#define TUNE_OPS_SUPPORTED (EXT4_TUNE_FL_ERRORS_BEHAVIOR | \ + EXT4_TUNE_FL_MNT_COUNT | EXT4_TUNE_FL_MAX_MNT_COUNT | \ + EXT4_TUNE_FL_CHECKINTRVAL | EXT4_TUNE_FL_LAST_CHECK_TIME | \ + EXT4_TUNE_FL_RESERVED_BLOCKS | EXT4_TUNE_FL_RESERVED_UID | \ + EXT4_TUNE_FL_RESERVED_GID | EXT4_TUNE_FL_DEFAULT_MNT_OPTS | \ + EXT4_TUNE_FL_DEF_HASH_ALG | EXT4_TUNE_FL_RAID_STRIDE | \ + EXT4_TUNE_FL_RAID_STRIPE_WIDTH | EXT4_TUNE_FL_MOUNT_OPTS | \ + EXT4_TUNE_FL_FEATURES | EXT4_TUNE_FL_EDIT_FEATURES | \ + EXT4_TUNE_FL_FORCE_FSCK | EXT4_TUNE_FL_ENCODING | \ + EXT4_TUNE_FL_ENCODING_FLAGS) + +#define EXT4_TUNE_SET_COMPAT_SUPP \ + (EXT4_FEATURE_COMPAT_DIR_INDEX | \ + EXT4_FEATURE_COMPAT_STABLE_INODES) +#define EXT4_TUNE_SET_INCOMPAT_SUPP \ + (EXT4_FEATURE_INCOMPAT_EXTENTS | \ + EXT4_FEATURE_INCOMPAT_EA_INODE | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ + EXT4_FEATURE_INCOMPAT_LARGEDIR | \ + EXT4_FEATURE_INCOMPAT_CASEFOLD) +#define EXT4_TUNE_SET_RO_COMPAT_SUPP \ + (EXT4_FEATURE_RO_COMPAT_LARGE_FILE | \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_PROJECT | \ + EXT4_FEATURE_RO_COMPAT_VERITY) + +#define EXT4_TUNE_CLEAR_COMPAT_SUPP (0) +#define EXT4_TUNE_CLEAR_INCOMPAT_SUPP (0) +#define EXT4_TUNE_CLEAR_RO_COMPAT_SUPP (0) + +#define SB_ENC_SUPP_MASK (SB_ENC_STRICT_MODE_FL | \ + SB_ENC_NO_COMPAT_FALLBACK_FL) + +static int ext4_ioctl_get_tune_sb(struct ext4_sb_info *sbi, + struct ext4_tune_sb_params __user *params) + { + 
struct ext4_tune_sb_params ret; + struct ext4_super_block *es = sbi->s_es; + + memset(&ret, 0, sizeof(ret)); + ret.set_flags = TUNE_OPS_SUPPORTED; + ret.errors_behavior = le16_to_cpu(es->s_errors); + ret.mnt_count = le16_to_cpu(es->s_mnt_count); + ret.max_mnt_count = le16_to_cpu(es->s_max_mnt_count); + ret.checkinterval = le32_to_cpu(es->s_checkinterval); + ret.last_check_time = le32_to_cpu(es->s_lastcheck); + ret.reserved_blocks = ext4_r_blocks_count(es); + ret.blocks_count = ext4_blocks_count(es); + ret.reserved_uid = ext4_get_resuid(es); + ret.reserved_gid = ext4_get_resgid(es); + ret.default_mnt_opts = le32_to_cpu(es->s_default_mount_opts); + ret.def_hash_alg = es->s_def_hash_version; + ret.raid_stride = le16_to_cpu(es->s_raid_stride); + ret.raid_stripe_width = le32_to_cpu(es->s_raid_stripe_width); + ret.encoding = le16_to_cpu(es->s_encoding); + ret.encoding_flags = le16_to_cpu(es->s_encoding_flags); + strscpy_pad(ret.mount_opts, es->s_mount_opts); + ret.feature_compat = le32_to_cpu(es->s_feature_compat); + ret.feature_incompat = le32_to_cpu(es->s_feature_incompat); + ret.feature_ro_compat = le32_to_cpu(es->s_feature_ro_compat); + ret.set_feature_compat_mask = EXT4_TUNE_SET_COMPAT_SUPP; + ret.set_feature_incompat_mask = EXT4_TUNE_SET_INCOMPAT_SUPP; + ret.set_feature_ro_compat_mask = EXT4_TUNE_SET_RO_COMPAT_SUPP; + ret.clear_feature_compat_mask = EXT4_TUNE_CLEAR_COMPAT_SUPP; + ret.clear_feature_incompat_mask = EXT4_TUNE_CLEAR_INCOMPAT_SUPP; + ret.clear_feature_ro_compat_mask = EXT4_TUNE_CLEAR_RO_COMPAT_SUPP; + if (copy_to_user(params, &ret, sizeof(ret))) + return -EFAULT; + return 0; +} + +static void ext4_sb_setparams(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) +{ + const struct ext4_tune_sb_params *params = arg; + + if (params->set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR) + es->s_errors = cpu_to_le16(params->errors_behavior); + if (params->set_flags & EXT4_TUNE_FL_MNT_COUNT) + es->s_mnt_count = cpu_to_le16(params->mnt_count); + if (params->set_flags & EXT4_TUNE_FL_MAX_MNT_COUNT) + es->s_max_mnt_count = cpu_to_le16(params->max_mnt_count); + if (params->set_flags & EXT4_TUNE_FL_CHECKINTRVAL) + es->s_checkinterval = cpu_to_le32(params->checkinterval); + if (params->set_flags & EXT4_TUNE_FL_LAST_CHECK_TIME) + es->s_lastcheck = cpu_to_le32(params->last_check_time); + if (params->set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) { + ext4_fsblk_t blk = params->reserved_blocks; + + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); + } + if (params->set_flags & EXT4_TUNE_FL_RESERVED_UID) { + int uid = params->reserved_uid; + + es->s_def_resuid = cpu_to_le16(uid & 0xFFFF); + es->s_def_resuid_hi = cpu_to_le16(uid >> 16); + } + if (params->set_flags & EXT4_TUNE_FL_RESERVED_GID) { + int gid = params->reserved_gid; + + es->s_def_resgid = cpu_to_le16(gid & 0xFFFF); + es->s_def_resgid_hi = cpu_to_le16(gid >> 16); + } + if (params->set_flags & EXT4_TUNE_FL_DEFAULT_MNT_OPTS) + es->s_default_mount_opts = cpu_to_le32(params->default_mnt_opts); + if (params->set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + es->s_def_hash_version = params->def_hash_alg; + if (params->set_flags & EXT4_TUNE_FL_RAID_STRIDE) + es->s_raid_stride = cpu_to_le16(params->raid_stride); + if (params->set_flags & EXT4_TUNE_FL_RAID_STRIPE_WIDTH) + es->s_raid_stripe_width = + cpu_to_le32(params->raid_stripe_width); + if (params->set_flags & EXT4_TUNE_FL_ENCODING) + es->s_encoding = cpu_to_le16(params->encoding); + if (params->set_flags & EXT4_TUNE_FL_ENCODING_FLAGS) + 
es->s_encoding_flags = cpu_to_le16(params->encoding_flags); + strscpy_pad(es->s_mount_opts, params->mount_opts); + if (params->set_flags & EXT4_TUNE_FL_EDIT_FEATURES) { + es->s_feature_compat |= + cpu_to_le32(params->set_feature_compat_mask); + es->s_feature_incompat |= + cpu_to_le32(params->set_feature_incompat_mask); + es->s_feature_ro_compat |= + cpu_to_le32(params->set_feature_ro_compat_mask); + es->s_feature_compat &= + ~cpu_to_le32(params->clear_feature_compat_mask); + es->s_feature_incompat &= + ~cpu_to_le32(params->clear_feature_incompat_mask); + es->s_feature_ro_compat &= + ~cpu_to_le32(params->clear_feature_ro_compat_mask); + if (params->set_feature_compat_mask & + EXT4_FEATURE_COMPAT_DIR_INDEX) + es->s_def_hash_version = sbi->s_def_hash_version; + if (params->set_feature_incompat_mask & + EXT4_FEATURE_INCOMPAT_CSUM_SEED) + es->s_checksum_seed = cpu_to_le32(sbi->s_csum_seed); + } + if (params->set_flags & EXT4_TUNE_FL_FORCE_FSCK) + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); +} + +static int ext4_ioctl_set_tune_sb(struct file *filp, + struct ext4_tune_sb_params __user *in) +{ + struct ext4_tune_sb_params params; + struct super_block *sb = file_inode(filp)->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int enabling_casefold = 0; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&params, in, sizeof(params))) + return -EFAULT; + + if ((params.set_flags & ~TUNE_OPS_SUPPORTED) != 0) + return -EOPNOTSUPP; + + if ((params.set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR) && + (params.errors_behavior > EXT4_ERRORS_PANIC)) + return -EINVAL; + + if ((params.set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) && + (params.reserved_blocks > ext4_blocks_count(sbi->s_es) / 2)) + return -EINVAL; + if ((params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) && + ((params.def_hash_alg > DX_HASH_LAST) || + (params.def_hash_alg == DX_HASH_SIPHASH))) + return -EINVAL; + if ((params.set_flags & EXT4_TUNE_FL_FEATURES) && + (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES)) + return -EINVAL; + + if (params.set_flags & EXT4_TUNE_FL_FEATURES) { + params.set_feature_compat_mask = + params.feature_compat & + ~le32_to_cpu(es->s_feature_compat); + params.set_feature_incompat_mask = + params.feature_incompat & + ~le32_to_cpu(es->s_feature_incompat); + params.set_feature_ro_compat_mask = + params.feature_ro_compat & + ~le32_to_cpu(es->s_feature_ro_compat); + params.clear_feature_compat_mask = + ~params.feature_compat & + le32_to_cpu(es->s_feature_compat); + params.clear_feature_incompat_mask = + ~params.feature_incompat & + le32_to_cpu(es->s_feature_incompat); + params.clear_feature_ro_compat_mask = + ~params.feature_ro_compat & + le32_to_cpu(es->s_feature_ro_compat); + params.set_flags |= EXT4_TUNE_FL_EDIT_FEATURES; + } + if (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES) { + if ((params.set_feature_compat_mask & + ~EXT4_TUNE_SET_COMPAT_SUPP) || + (params.set_feature_incompat_mask & + ~EXT4_TUNE_SET_INCOMPAT_SUPP) || + (params.set_feature_ro_compat_mask & + ~EXT4_TUNE_SET_RO_COMPAT_SUPP) || + (params.clear_feature_compat_mask & + ~EXT4_TUNE_CLEAR_COMPAT_SUPP) || + (params.clear_feature_incompat_mask & + ~EXT4_TUNE_CLEAR_INCOMPAT_SUPP) || + (params.clear_feature_ro_compat_mask & + ~EXT4_TUNE_CLEAR_RO_COMPAT_SUPP)) + return -EOPNOTSUPP; + + /* + * Filter out the features that are already set from + * the set_mask. 
+ */ + params.set_feature_compat_mask &= + ~le32_to_cpu(es->s_feature_compat); + params.set_feature_incompat_mask &= + ~le32_to_cpu(es->s_feature_incompat); + params.set_feature_ro_compat_mask &= + ~le32_to_cpu(es->s_feature_ro_compat); + if ((params.set_feature_incompat_mask & + EXT4_FEATURE_INCOMPAT_CASEFOLD)) { + enabling_casefold = 1; + if (!(params.set_flags & EXT4_TUNE_FL_ENCODING)) { + params.encoding = EXT4_ENC_UTF8_12_1; + params.set_flags |= EXT4_TUNE_FL_ENCODING; + } + if (!(params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS)) { + params.encoding_flags = 0; + params.set_flags |= EXT4_TUNE_FL_ENCODING_FLAGS; + } + } + if ((params.set_feature_compat_mask & + EXT4_FEATURE_COMPAT_DIR_INDEX)) { + uuid_t uu; + + memcpy(&uu, sbi->s_hash_seed, UUID_SIZE); + if (uuid_is_null(&uu)) + generate_random_uuid((char *) + &sbi->s_hash_seed); + if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + sbi->s_def_hash_version = params.def_hash_alg; + else if (sbi->s_def_hash_version == 0) + sbi->s_def_hash_version = DX_HASH_HALF_MD4; + if (!(es->s_flags & + cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH)) && + !(es->s_flags & + cpu_to_le32(EXT2_FLAGS_SIGNED_HASH))) { +#ifdef __CHAR_UNSIGNED__ + sbi->s_hash_unsigned = 3; +#else + sbi->s_hash_unsigned = 0; +#endif + } + } + } + if (params.set_flags & EXT4_TUNE_FL_ENCODING) { + if (!enabling_casefold) + return -EINVAL; + if (params.encoding == 0) + params.encoding = EXT4_ENC_UTF8_12_1; + else if (params.encoding != EXT4_ENC_UTF8_12_1) + return -EINVAL; + } + if (params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS) { + if (!enabling_casefold) + return -EINVAL; + if (params.encoding_flags & ~SB_ENC_SUPP_MASK) + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = ext4_update_superblocks_fn(sb, ext4_sb_setparams, &params); + mnt_drop_write_file(filp); + + if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + sbi->s_def_hash_version = params.def_hash_alg; + + return ret; +} + static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1616,6 +1908,11 @@ resizefs_out: return ext4_ioctl_getuuid(EXT4_SB(sb), (void __user *)arg); case EXT4_IOC_SETFSUUID: return ext4_ioctl_setuuid(filp, (const void __user *)arg); + case EXT4_IOC_GET_TUNE_SB_PARAM: + return ext4_ioctl_get_tune_sb(EXT4_SB(sb), + (void __user *)arg); + case EXT4_IOC_SET_TUNE_SB_PARAM: + return ext4_ioctl_set_tune_sb(filp, (void __user *)arg); default: return -ENOTTY; } @@ -1703,7 +2000,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } #endif -static void set_overhead(struct ext4_super_block *es, const void *arg) +static void set_overhead(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) { es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg)); } diff --git a/include/uapi/linux/ext4.h b/include/uapi/linux/ext4.h index 1c4c2dd29112..411dcc1e4a35 100644 --- a/include/uapi/linux/ext4.h +++ b/include/uapi/linux/ext4.h @@ -33,6 +33,8 @@ #define EXT4_IOC_CHECKPOINT _IOW('f', 43, __u32) #define EXT4_IOC_GETFSUUID _IOR('f', 44, struct fsuuid) #define EXT4_IOC_SETFSUUID _IOW('f', 44, struct fsuuid) +#define EXT4_IOC_GET_TUNE_SB_PARAM _IOR('f', 45, struct ext4_tune_sb_params) +#define EXT4_IOC_SET_TUNE_SB_PARAM _IOW('f', 46, struct ext4_tune_sb_params) #define EXT4_IOC_SHUTDOWN _IOR('X', 125, __u32) @@ -108,6 +110,57 @@ struct ext4_new_group_input { __u16 unused; }; +struct ext4_tune_sb_params { + __u32 set_flags; + __u32 checkinterval; + __u16 errors_behavior; + 
__u16 mnt_count; + __u16 max_mnt_count; + __u16 raid_stride; + __u64 last_check_time; + __u64 reserved_blocks; + __u64 blocks_count; + __u32 default_mnt_opts; + __u32 reserved_uid; + __u32 reserved_gid; + __u32 raid_stripe_width; + __u16 encoding; + __u16 encoding_flags; + __u8 def_hash_alg; + __u8 pad_1; + __u16 pad_2; + __u32 feature_compat; + __u32 feature_incompat; + __u32 feature_ro_compat; + __u32 set_feature_compat_mask; + __u32 set_feature_incompat_mask; + __u32 set_feature_ro_compat_mask; + __u32 clear_feature_compat_mask; + __u32 clear_feature_incompat_mask; + __u32 clear_feature_ro_compat_mask; + __u8 mount_opts[64]; + __u8 pad[64]; +}; + +#define EXT4_TUNE_FL_ERRORS_BEHAVIOR 0x00000001 +#define EXT4_TUNE_FL_MNT_COUNT 0x00000002 +#define EXT4_TUNE_FL_MAX_MNT_COUNT 0x00000004 +#define EXT4_TUNE_FL_CHECKINTRVAL 0x00000008 +#define EXT4_TUNE_FL_LAST_CHECK_TIME 0x00000010 +#define EXT4_TUNE_FL_RESERVED_BLOCKS 0x00000020 +#define EXT4_TUNE_FL_RESERVED_UID 0x00000040 +#define EXT4_TUNE_FL_RESERVED_GID 0x00000080 +#define EXT4_TUNE_FL_DEFAULT_MNT_OPTS 0x00000100 +#define EXT4_TUNE_FL_DEF_HASH_ALG 0x00000200 +#define EXT4_TUNE_FL_RAID_STRIDE 0x00000400 +#define EXT4_TUNE_FL_RAID_STRIPE_WIDTH 0x00000800 +#define EXT4_TUNE_FL_MOUNT_OPTS 0x00001000 +#define EXT4_TUNE_FL_FEATURES 0x00002000 +#define EXT4_TUNE_FL_EDIT_FEATURES 0x00004000 +#define EXT4_TUNE_FL_FORCE_FSCK 0x00008000 +#define EXT4_TUNE_FL_ENCODING 0x00010000 +#define EXT4_TUNE_FL_ENCODING_FLAGS 0x00020000 + /* * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag. * It indicates that the entry in extent status cache is for a hole. -- cgit v1.2.3 From cc2f08129925b437bf28f7f7822f20dac083a87c Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 24 Sep 2025 12:40:33 +0000 Subject: ethtool: add FEC bins histogram report IEEE 802.3ck-2022 defines counters for FEC bins and 802.3df-2024 clarifies them a bit further. Implement a reporting interface as an addition to the FEC stats available in ethtool. Drivers can leave the bin counter uninitialized if per-lane values are provided. In this case the core will recalculate the sum for the bin. 
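As a rough sketch of the driver-side contract this adds (mirroring the netdevsim example further down; the example_* names, bin ranges and counts are invented for illustration):

/* Sketch: a driver publishes its bin ranges and fills per-lane counters
 * where available; a bin whose ->sum is left uninitialized gets its sum
 * recalculated by the core from the per-lane values. */
static const struct ethtool_fec_hist_range example_ranges[] = {
	{ 0, 0 },	/* bin 0: exactly 0 errored bits */
	{ 1, 3 },	/* bin 1: 1 to 3 errored bits */
	{ 0, 0 },	/* end-of-list marker */
};

static void example_get_fec_stats(struct net_device *dev,
				  struct ethtool_fec_stats *fec_stats,
				  struct ethtool_fec_hist *hist)
{
	hist->ranges = example_ranges;
	/* per-lane values only: the core computes the bin sum */
	hist->values[0].per_lane[0] = 100;
	hist->values[0].per_lane[1] = 50;
	/* aggregate only: no per-lane breakdown reported */
	hist->values[1].sum = 7;
}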
Signed-off-by: Vadim Fedorenko Reviewed-by: Aleksandr Loktionov Link: https://patch.msgid.link/20250924124037.1508846-2-vadim.fedorenko@linux.dev Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 29 +++++++++ Documentation/networking/ethtool-netlink.rst | 5 ++ drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 3 +- .../net/ethernet/fungible/funeth/funeth_ethtool.c | 3 +- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 3 +- drivers/net/ethernet/intel/ice/ice_ethtool.c | 4 +- .../ethernet/marvell/octeontx2/nic/otx2_ethtool.c | 3 +- .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 3 +- drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c | 3 +- drivers/net/ethernet/sfc/ethtool.c | 3 +- drivers/net/ethernet/sfc/siena/ethtool.c | 3 +- drivers/net/netdevsim/ethtool.c | 25 +++++++- include/linux/ethtool.h | 25 +++++++- include/uapi/linux/ethtool_netlink_generated.h | 12 ++++ net/ethtool/fec.c | 75 +++++++++++++++++++++- 15 files changed, 186 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 7a7594713f1f..6a0fb1974513 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -1219,6 +1219,30 @@ attribute-sets: name: udp-ports type: nest nested-attributes: tunnel-udp + - + name: fec-hist + attr-cnt-name: __ethtool-a-fec-hist-cnt + attributes: + - + name: pad + type: pad + - + name: bin-low + type: u32 + doc: Low bound of FEC bin (inclusive) + - + name: bin-high + type: u32 + doc: High bound of FEC bin (inclusive) + - + name: bin-val + type: uint + doc: Error count in the bin (optional if per-lane values exist) + - + name: bin-val-per-lane + type: binary + sub-type: u64 + doc: An array of per-lane error counters in the bin (optional) - name: fec-stat attr-cnt-name: __ethtool-a-fec-stat-cnt @@ -1242,6 +1266,11 @@ attribute-sets: name: corr-bits type: binary sub-type: u64 + - + name: hist + type: nest + multi-attr: True + nested-attributes: fec-hist - name: fec attr-cnt-name: __ethtool-a-fec-cnt diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index ab20c644af24..b270886c5f5d 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1541,6 +1541,11 @@ Drivers fill in the statistics in the following structure: .. kernel-doc:: include/linux/ethtool.h :identifiers: ethtool_fec_stats +Statistics may include the FEC bins histogram attribute ``ETHTOOL_A_FEC_STAT_HIST`` +as defined in IEEE 802.3ck-2022 and 802.3df-2024. Nested attributes will have +the range of FEC errors in the bin (inclusive) and the number of error events +in the bin. 
+ FEC_SET ======= diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index be32ef8f5c96..41686a6f84b5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -3208,7 +3208,8 @@ static int bnxt_get_fecparam(struct net_device *dev, } static void bnxt_get_fec_stats(struct net_device *dev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct bnxt *bp = netdev_priv(dev); u64 *rx; diff --git a/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c b/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c index ba83dbf4ed22..1966dba512f8 100644 --- a/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c +++ b/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c @@ -930,7 +930,8 @@ static void fun_get_rmon_stats(struct net_device *netdev, } static void fun_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *stats) + struct ethtool_fec_stats *stats, + struct ethtool_fec_hist *hist) { const struct funeth_priv *fp = netdev_priv(netdev); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index a752d0e3db3a..a5eefa28454c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -1659,7 +1659,8 @@ static void hns3_set_msglevel(struct net_device *netdev, u32 msg_level) } static void hns3_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct hnae3_handle *handle = hns3_get_handle(netdev); struct hnae3_ae_dev *ae_dev = hns3_get_ae_dev(handle); diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 348acd46a0ef..dc131779d426 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -4624,10 +4624,12 @@ static int ice_get_port_fec_stats(struct ice_hw *hw, u16 pcs_quad, u16 pcs_port, * ice_get_fec_stats - returns FEC correctable, uncorrectable stats per netdev * @netdev: network interface device structure * @fec_stats: buffer to hold FEC statistics for given port + * @hist: buffer to put FEC histogram statistics for given port * */ static void ice_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_port_topology port_topology; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c index 998c734ff839..b90e23dc49de 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c @@ -1283,7 +1283,8 @@ end: } static void otx2_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct otx2_nic *pfvf = netdev_priv(netdev); struct cgx_fw_data *rsp; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index d507366d773e..bcc3bbb78cc9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -1927,7 +1927,8 @@ static int 
mlx5e_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) } static void mlx5e_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct mlx5e_priv *priv = netdev_priv(netdev); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c index fecb8c602024..d55d2ac1c3b9 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c @@ -1718,7 +1718,8 @@ fbnic_get_pause_stats(struct net_device *netdev, static void fbnic_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct fbnic_net *fbn = netdev_priv(netdev); struct fbnic_phy_stats *phy_stats; diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c index 23c6a7df78d0..18fe5850a978 100644 --- a/drivers/net/ethernet/sfc/ethtool.c +++ b/drivers/net/ethernet/sfc/ethtool.c @@ -217,7 +217,8 @@ static int efx_ethtool_set_wol(struct net_device *net_dev, } static void efx_ethtool_get_fec_stats(struct net_device *net_dev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct efx_nic *efx = efx_netdev_priv(net_dev); diff --git a/drivers/net/ethernet/sfc/siena/ethtool.c b/drivers/net/ethernet/sfc/siena/ethtool.c index 994909789bfe..8c3ebd0617fb 100644 --- a/drivers/net/ethernet/sfc/siena/ethtool.c +++ b/drivers/net/ethernet/sfc/siena/ethtool.c @@ -217,7 +217,8 @@ static int efx_ethtool_set_wol(struct net_device *net_dev, } static void efx_ethtool_get_fec_stats(struct net_device *net_dev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct efx_nic *efx = netdev_priv(net_dev); diff --git a/drivers/net/netdevsim/ethtool.c b/drivers/net/netdevsim/ethtool.c index f631d90c428a..36a201533aae 100644 --- a/drivers/net/netdevsim/ethtool.c +++ b/drivers/net/netdevsim/ethtool.c @@ -165,11 +165,34 @@ nsim_set_fecparam(struct net_device *dev, struct ethtool_fecparam *fecparam) return 0; } +static const struct ethtool_fec_hist_range netdevsim_fec_ranges[] = { + { 0, 0}, + { 1, 3}, + { 4, 7}, + { 0, 0} +}; + static void -nsim_get_fec_stats(struct net_device *dev, struct ethtool_fec_stats *fec_stats) +nsim_get_fec_stats(struct net_device *dev, struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { + struct ethtool_fec_hist_value *values = hist->values; + + hist->ranges = netdevsim_fec_ranges; + fec_stats->corrected_blocks.total = 123; fec_stats->uncorrectable_blocks.total = 4; + + values[0].per_lane[0] = 125; + values[0].per_lane[1] = 120; + values[0].per_lane[2] = 100; + values[0].per_lane[3] = 100; + values[1].sum = 12; + values[2].sum = 2; + values[2].per_lane[0] = 2; + values[2].per_lane[1] = 0; + values[2].per_lane[2] = 0; + values[2].per_lane[3] = 0; } static int nsim_get_ts_info(struct net_device *dev, diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index c869b7f8bce8..c2d8b4ec62eb 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -492,7 +492,29 @@ struct ethtool_pause_stats { }; #define ETHTOOL_MAX_LANES 8 +/** + * IEEE 802.3ck/df defines 16 bins for FEC histogram plus one more for + * the end-of-list marker, total 17 items + */ +#define ETHTOOL_FEC_HIST_MAX 17 +/** + * struct ethtool_fec_hist_range - error bits range for FEC 
histogram + * statistics + * @low: low bound of the bin (inclusive) + * @high: high bound of the bin (inclusive) + */ +struct ethtool_fec_hist_range { + u16 low; + u16 high; +}; +struct ethtool_fec_hist { + struct ethtool_fec_hist_value { + u64 sum; + u64 per_lane[ETHTOOL_MAX_LANES]; + } values[ETHTOOL_FEC_HIST_MAX]; + const struct ethtool_fec_hist_range *ranges; +}; /** * struct ethtool_fec_stats - statistics for IEEE 802.3 FEC * @corrected_blocks: number of received blocks corrected by FEC @@ -1214,7 +1236,8 @@ struct ethtool_ops { int (*set_link_ksettings)(struct net_device *, const struct ethtool_link_ksettings *); void (*get_fec_stats)(struct net_device *dev, - struct ethtool_fec_stats *fec_stats); + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist); int (*get_fecparam)(struct net_device *, struct ethtool_fecparam *); int (*set_fecparam)(struct net_device *, diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index e3b8813465d7..0e8ac0d974e2 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -561,12 +561,24 @@ enum { ETHTOOL_A_TUNNEL_INFO_MAX = (__ETHTOOL_A_TUNNEL_INFO_CNT - 1) }; +enum { + ETHTOOL_A_FEC_HIST_PAD = 1, + ETHTOOL_A_FEC_HIST_BIN_LOW, + ETHTOOL_A_FEC_HIST_BIN_HIGH, + ETHTOOL_A_FEC_HIST_BIN_VAL, + ETHTOOL_A_FEC_HIST_BIN_VAL_PER_LANE, + + __ETHTOOL_A_FEC_HIST_CNT, + ETHTOOL_A_FEC_HIST_MAX = (__ETHTOOL_A_FEC_HIST_CNT - 1) +}; + enum { ETHTOOL_A_FEC_STAT_UNSPEC, ETHTOOL_A_FEC_STAT_PAD, ETHTOOL_A_FEC_STAT_CORRECTED, ETHTOOL_A_FEC_STAT_UNCORR, ETHTOOL_A_FEC_STAT_CORR_BITS, + ETHTOOL_A_FEC_STAT_HIST, __ETHTOOL_A_FEC_STAT_CNT, ETHTOOL_A_FEC_STAT_MAX = (__ETHTOOL_A_FEC_STAT_CNT - 1) diff --git a/net/ethtool/fec.c b/net/ethtool/fec.c index e7d3f2c352a3..4669e74cbcaa 100644 --- a/net/ethtool/fec.c +++ b/net/ethtool/fec.c @@ -17,6 +17,7 @@ struct fec_reply_data { u64 stats[1 + ETHTOOL_MAX_LANES]; u8 cnt; } corr, uncorr, corr_bits; + struct ethtool_fec_hist fec_stat_hist; }; #define FEC_REPDATA(__reply_base) \ @@ -113,7 +114,10 @@ static int fec_prepare_data(const struct ethnl_req_info *req_base, struct ethtool_fec_stats stats; ethtool_stats_init((u64 *)&stats, sizeof(stats) / 8); - dev->ethtool_ops->get_fec_stats(dev, &stats); + ethtool_stats_init((u64 *)data->fec_stat_hist.values, + sizeof(data->fec_stat_hist.values) / 8); + dev->ethtool_ops->get_fec_stats(dev, &stats, + &data->fec_stat_hist); fec_stats_recalc(&data->corr, &stats.corrected_blocks); fec_stats_recalc(&data->uncorr, &stats.uncorrectable_blocks); @@ -157,13 +161,77 @@ static int fec_reply_size(const struct ethnl_req_info *req_base, len += nla_total_size(sizeof(u8)) + /* _FEC_AUTO */ nla_total_size(sizeof(u32)); /* _FEC_ACTIVE */ - if (req_base->flags & ETHTOOL_FLAG_STATS) + if (req_base->flags & ETHTOOL_FLAG_STATS) { len += 3 * nla_total_size_64bit(sizeof(u64) * (1 + ETHTOOL_MAX_LANES)); + /* add FEC bins information */ + len += (nla_total_size(0) + /* _A_FEC_HIST */ + nla_total_size(4) + /* _A_FEC_HIST_BIN_LOW */ + nla_total_size(4) + /* _A_FEC_HIST_BIN_HI */ + /* _A_FEC_HIST_BIN_VAL + per-lane values */ + nla_total_size_64bit(sizeof(u64)) + + nla_total_size_64bit(sizeof(u64) * ETHTOOL_MAX_LANES)) * + ETHTOOL_FEC_HIST_MAX; + } return len; } +static int fec_put_hist(struct sk_buff *skb, + const struct ethtool_fec_hist *hist) +{ + const struct ethtool_fec_hist_range *ranges = hist->ranges; + const struct ethtool_fec_hist_value *values = hist->values; + struct nlattr *nest; + int i, 
j; + u64 sum; + + if (!ranges) + return 0; + + for (i = 0; i < ETHTOOL_FEC_HIST_MAX; i++) { + if (i && !ranges[i].low && !ranges[i].high) + break; + + if (WARN_ON_ONCE(values[i].sum == ETHTOOL_STAT_NOT_SET && + values[i].per_lane[0] == ETHTOOL_STAT_NOT_SET)) + break; + + nest = nla_nest_start(skb, ETHTOOL_A_FEC_STAT_HIST); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_FEC_HIST_BIN_LOW, + ranges[i].low) || + nla_put_u32(skb, ETHTOOL_A_FEC_HIST_BIN_HIGH, + ranges[i].high)) + goto err_cancel_hist; + sum = 0; + for (j = 0; j < ETHTOOL_MAX_LANES; j++) { + if (values[i].per_lane[j] == ETHTOOL_STAT_NOT_SET) + break; + sum += values[i].per_lane[j]; + } + if (nla_put_uint(skb, ETHTOOL_A_FEC_HIST_BIN_VAL, + values[i].sum == ETHTOOL_STAT_NOT_SET ? + sum : values[i].sum)) + goto err_cancel_hist; + if (j && nla_put_64bit(skb, ETHTOOL_A_FEC_HIST_BIN_VAL_PER_LANE, + sizeof(u64) * j, + values[i].per_lane, + ETHTOOL_A_FEC_HIST_PAD)) + goto err_cancel_hist; + + nla_nest_end(skb, nest); + } + + return 0; + +err_cancel_hist: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data) { struct nlattr *nest; @@ -183,6 +251,9 @@ static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data) data->corr_bits.stats, ETHTOOL_A_FEC_STAT_PAD)) goto err_cancel; + if (fec_put_hist(skb, &data->fec_stat_hist)) + goto err_cancel; + nla_nest_end(skb, nest); return 0; -- cgit v1.2.3 From c5273f6ca166c4edfaa6a87570e111453a0576ad Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:40 +0200 Subject: mptcp: pm: rename 'subflows' to 'extra_subflows' A few variables linked to the Path-Managers are confusing, and it would help current and future developers to clarify them. One of them is 'subflows', which in fact represents the number of extra subflows: all the additional subflows created after the initial one, and not the total number of subflows. While at it, add an additional name for the corresponding variable in MPTCP INFO: mptcpi_extra_subflows. Not to break the current uAPI, the new name is added as a 'define' pointing to the former name. This will then also help userspace developers. No functional changes intended. 
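For illustration, a hedged userspace sketch (assuming a connected MPTCP socket fd; print_extra_subflows() is invented, and the SOL_MPTCP fallback define is only needed with older libc headers):

/* Sketch: the new name is a define for the old field, so both spellings
 * read the same byte of struct mptcp_info. */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/mptcp.h>

#ifndef SOL_MPTCP
#define SOL_MPTCP 284	/* from linux/socket.h; fallback for older headers */
#endif

static int print_extra_subflows(int fd)
{
	struct mptcp_info info;
	socklen_t len = sizeof(info);

	memset(&info, 0, sizeof(info));
	if (getsockopt(fd, SOL_MPTCP, MPTCP_INFO, &info, &len) < 0)
		return -1;

	/* identical to reading the legacy info.mptcpi_subflows */
	printf("extra subflows: %u\n", info.mptcpi_extra_subflows);
	return 0;
}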
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-5-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm.c | 13 ++++++------ net/mptcp/pm_kernel.c | 24 +++++++++++------------ net/mptcp/pm_userspace.c | 2 +- net/mptcp/protocol.h | 6 +++--- net/mptcp/sockopt.c | 4 ++-- tools/testing/selftests/bpf/progs/mptcp_subflow.c | 2 +- 7 files changed, 27 insertions(+), 25 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 15eef878690b..f807c8dba56e 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -42,6 +42,7 @@ struct mptcp_info { __u8 mptcpi_subflows; + #define mptcpi_extra_subflows mptcpi_subflows __u8 mptcpi_add_addr_signal; __u8 mptcpi_add_addr_accepted; __u8 mptcpi_subflows_max; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 584cab90aa6e..332e96bdadc0 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -489,7 +489,7 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) if (mptcp_pm_is_userspace(msk)) { if (mptcp_userspace_pm_active(msk)) { spin_lock_bh(&pm->lock); - pm->subflows++; + pm->extra_subflows++; spin_unlock_bh(&pm->lock); return true; } @@ -498,8 +498,9 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) subflows_max = mptcp_pm_get_subflows_max(msk); - pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, pm->subflows, - subflows_max, READ_ONCE(pm->accept_subflow)); + pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, + pm->extra_subflows, subflows_max, + READ_ONCE(pm->accept_subflow)); /* try to avoid acquiring the lock below */ if (!READ_ONCE(pm->accept_subflow)) @@ -507,8 +508,8 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) spin_lock_bh(&pm->lock); if (READ_ONCE(pm->accept_subflow)) { - ret = pm->subflows < subflows_max; - if (ret && ++pm->subflows == subflows_max) + ret = pm->extra_subflows < subflows_max; + if (ret && ++pm->extra_subflows == subflows_max) WRITE_ONCE(pm->accept_subflow, false); } spin_unlock_bh(&pm->lock); @@ -594,7 +595,7 @@ void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, if (mptcp_pm_is_userspace(msk)) { if (update_subflows) { spin_lock_bh(&pm->lock); - pm->subflows--; + pm->extra_subflows--; spin_unlock_bh(&pm->lock); } return; diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index a82c077b8a20..20bee6fc0625 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -175,7 +175,7 @@ fill_remote_addr(struct mptcp_sock *msk, struct mptcp_addr_info *local, if (!mptcp_pm_addr_families_match(sk, local, &remote)) return 0; - msk->pm.subflows++; + msk->pm.extra_subflows++; *addrs = remote; return 1; @@ -218,10 +218,10 @@ fill_remote_addresses_fullmesh(struct mptcp_sock *msk, /* forbid creating multiple address towards this id */ __set_bit(addrs[i].id, unavail_id); - msk->pm.subflows++; + msk->pm.extra_subflows++; i++; - if (msk->pm.subflows >= subflows_max) + if (msk->pm.extra_subflows >= subflows_max) break; } @@ -313,7 +313,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", msk->pm.local_addr_used, local_addr_max, msk->pm.add_addr_signaled, add_addr_signal_max, - msk->pm.subflows, subflows_max); + msk->pm.extra_subflows, subflows_max); /* check first for announce */ if (msk->pm.add_addr_signaled < add_addr_signal_max) { @@ -353,7 +353,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct 
mptcp_sock *msk) subflow: /* check if should create a new subflow */ while (msk->pm.local_addr_used < local_addr_max && - msk->pm.subflows < subflows_max) { + msk->pm.extra_subflows < subflows_max) { struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; bool fullmesh; int i, nr; @@ -441,10 +441,10 @@ fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk, if (is_id0) local->addr.id = 0; - msk->pm.subflows++; + msk->pm.extra_subflows++; i++; - if (msk->pm.subflows >= subflows_max) + if (msk->pm.extra_subflows >= subflows_max) break; } rcu_read_unlock(); @@ -483,10 +483,10 @@ fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, continue; msk->pm.local_addr_used++; - msk->pm.subflows++; + msk->pm.extra_subflows++; i++; - if (msk->pm.subflows >= subflows_max) + if (msk->pm.extra_subflows >= subflows_max) break; } @@ -510,7 +510,7 @@ fill_local_address_any(struct mptcp_sock *msk, struct mptcp_addr_info *remote, if (!mptcp_pm_addr_families_match(sk, &local->addr, remote)) return 0; - msk->pm.subflows++; + msk->pm.extra_subflows++; return 1; } @@ -586,7 +586,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) if (remote.id) msk->pm.add_addr_accepted++; if (msk->pm.add_addr_accepted >= add_addr_accept_max || - msk->pm.subflows >= subflows_max) + msk->pm.extra_subflows >= subflows_max) WRITE_ONCE(msk->pm.accept_addr, false); } } @@ -1427,7 +1427,7 @@ bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) || + if (msk->pm.extra_subflows == mptcp_pm_get_subflows_max(msk) || (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) { WRITE_ONCE(msk->pm.work_pending, false); diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index a715dcbe0146..8cbc1920afb4 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -419,7 +419,7 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) if (err) mptcp_userspace_pm_delete_local_addr(msk, &entry); else - msk->pm.subflows++; + msk->pm.extra_subflows++; spin_unlock_bh(&msk->pm.lock); create_err: diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index cbe54331e5c7..ca68f9a75801 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -235,7 +235,7 @@ struct mptcp_pm_data { u8 add_addr_accepted; u8 local_addr_used; u8 pm_type; - u8 subflows; + u8 extra_subflows; u8 status; ); @@ -1188,7 +1188,7 @@ unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); /* called under PM lock */ static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk) { - if (--msk->pm.subflows < mptcp_pm_get_subflows_max(msk)) + if (--msk->pm.extra_subflows < mptcp_pm_get_subflows_max(msk)) WRITE_ONCE(msk->pm.accept_subflow, true); } @@ -1204,7 +1204,7 @@ static inline bool mptcp_pm_add_addr_c_flag_case(struct mptcp_sock *msk) return READ_ONCE(msk->pm.remote_deny_join_id0) && msk->pm.local_addr_used == 0 && mptcp_pm_get_add_addr_accept_max(msk) == 0 && - msk->pm.subflows < mptcp_pm_get_subflows_max(msk); + msk->pm.extra_subflows < mptcp_pm_get_subflows_max(msk); } void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 2abe6f1e9940..17966da80239 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -962,7 +962,7 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) memset(info, 0, 
sizeof(*info)); - info->mptcpi_subflows = READ_ONCE(msk->pm.subflows); + info->mptcpi_extra_subflows = READ_ONCE(msk->pm.extra_subflows); info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used); @@ -996,7 +996,7 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) info->mptcpi_bytes_sent = msk->bytes_sent; info->mptcpi_bytes_received = msk->bytes_received; info->mptcpi_bytes_retrans = msk->bytes_retrans; - info->mptcpi_subflows_total = info->mptcpi_subflows + + info->mptcpi_subflows_total = info->mptcpi_extra_subflows + __mptcp_has_initial_subflow(msk); now = tcp_jiffies32; info->mptcpi_last_data_sent = jiffies_to_msecs(now - msk->last_data_sent); diff --git a/tools/testing/selftests/bpf/progs/mptcp_subflow.c b/tools/testing/selftests/bpf/progs/mptcp_subflow.c index 70302477e326..41389e579578 100644 --- a/tools/testing/selftests/bpf/progs/mptcp_subflow.c +++ b/tools/testing/selftests/bpf/progs/mptcp_subflow.c @@ -117,7 +117,7 @@ int _getsockopt_subflow(struct bpf_sockopt *ctx) return 1; msk = bpf_core_cast(sk, struct mptcp_sock); - if (msk->pm.subflows != 1) { + if (msk->pm.extra_subflows != 1) { ctx->retval = -1; return 1; } -- cgit v1.2.3 From 3eb3c9a9596a53880f7d7eff28ac5622f3e0ba37 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:41 +0200 Subject: mptcp: pm: in-kernel: rename 'subflows_max' to 'limit_extra_subflows' A few variables linked to the in-kernel Path-Manager are confusing, and it would help current and future developers to clarify them. One of them is 'subflows_max', which in fact represents the limit of extra subflows: the limit set via 'ip mptcp limit subflows X' for example. It is not linked to the maximum number of created / possible subflows. While at it, add an additional name for the corresponding variable in MPTCP INFO: mptcpi_limit_extra_subflows. Not to break the current uAPI, the new name is added as a 'define' pointing to the former name. This will then also help userspace developers. No functional changes intended. 
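A minimal sketch of the lockless access pattern around this limit, as visible in the diff below (the example_* wrappers are invented):

/* Sketch: the netlink handler updates the pernet limit under the pernet
 * lock with WRITE_ONCE(), while hot paths sample it locklessly with
 * READ_ONCE(), so both sides must use the _ONCE accessors. */
static unsigned int example_read_limit(const struct pm_nl_pernet *pernet)
{
	return READ_ONCE(pernet->limit_extra_subflows);
}

static void example_write_limit(struct pm_nl_pernet *pernet, unsigned int n)
{
	spin_lock_bh(&pernet->lock);
	WRITE_ONCE(pernet->limit_extra_subflows, n);
	spin_unlock_bh(&pernet->lock);
}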
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-6-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm.c | 12 ++++++------ net/mptcp/pm_kernel.c | 48 ++++++++++++++++++++++++---------------------- net/mptcp/protocol.h | 6 +++--- net/mptcp/sockopt.c | 4 ++-- 5 files changed, 37 insertions(+), 34 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index f807c8dba56e..314200c61f15 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -46,6 +46,7 @@ struct mptcp_info { __u8 mptcpi_add_addr_signal; __u8 mptcpi_add_addr_accepted; __u8 mptcpi_subflows_max; + #define mptcpi_limit_extra_subflows mptcpi_subflows_max __u8 mptcpi_add_addr_signal_max; __u8 mptcpi_add_addr_accepted_max; __u32 mptcpi_flags; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 332e96bdadc0..502f6c235e06 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -483,7 +483,7 @@ void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; - unsigned int subflows_max; + unsigned int limit_extra_subflows; int ret = 0; if (mptcp_pm_is_userspace(msk)) { @@ -496,10 +496,10 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) return false; } - subflows_max = mptcp_pm_get_subflows_max(msk); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, - pm->extra_subflows, subflows_max, + pm->extra_subflows, limit_extra_subflows, READ_ONCE(pm->accept_subflow)); /* try to avoid acquiring the lock below */ @@ -508,8 +508,8 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) spin_lock_bh(&pm->lock); if (READ_ONCE(pm->accept_subflow)) { - ret = pm->extra_subflows < subflows_max; - if (ret && ++pm->extra_subflows == subflows_max) + ret = pm->extra_subflows < limit_extra_subflows; + if (ret && ++pm->extra_subflows == limit_extra_subflows) WRITE_ONCE(pm->accept_subflow, false); } spin_unlock_bh(&pm->lock); @@ -1029,7 +1029,7 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk) WRITE_ONCE(pm->pm_type, pm_type); if (pm_type == MPTCP_PM_TYPE_KERNEL) { - bool subflows_allowed = !!mptcp_pm_get_subflows_max(msk); + bool subflows_allowed = !!mptcp_pm_get_limit_extra_subflows(msk); /* pm->work_pending must be only be set to 'true' when * pm->pm_type is set to MPTCP_PM_TYPE_KERNEL diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 20bee6fc0625..db0d254d0e6b 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -23,7 +23,7 @@ struct pm_nl_pernet { unsigned int add_addr_signal_max; unsigned int add_addr_accept_max; unsigned int local_addr_max; - unsigned int subflows_max; + unsigned int limit_extra_subflows; unsigned int next_id; DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); }; @@ -62,13 +62,13 @@ unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max); -unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk) +unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - return READ_ONCE(pernet->subflows_max); + return READ_ONCE(pernet->limit_extra_subflows); } -EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max); 
+EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_extra_subflows); unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk) { @@ -190,10 +190,10 @@ fill_remote_addresses_fullmesh(struct mptcp_sock *msk, DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); struct sock *sk = (struct sock *)msk, *ssk; struct mptcp_subflow_context *subflow; - unsigned int subflows_max; + unsigned int limit_extra_subflows; int i = 0; - subflows_max = mptcp_pm_get_subflows_max(msk); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); /* Forbid creation of new subflows matching existing ones, possibly * already created by incoming ADD_ADDR @@ -221,7 +221,7 @@ fill_remote_addresses_fullmesh(struct mptcp_sock *msk, msk->pm.extra_subflows++; i++; - if (msk->pm.extra_subflows >= subflows_max) + if (msk->pm.extra_subflows >= limit_extra_subflows) break; } @@ -274,18 +274,18 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; + unsigned int limit_extra_subflows; unsigned int add_addr_signal_max; bool signal_and_subflow = false; unsigned int local_addr_max; struct pm_nl_pernet *pernet; struct mptcp_pm_local local; - unsigned int subflows_max; pernet = pm_nl_get_pernet(sock_net(sk)); add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); local_addr_max = mptcp_pm_get_local_addr_max(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); /* do lazy endpoint usage accounting for the MPC subflows */ if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) { @@ -313,7 +313,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", msk->pm.local_addr_used, local_addr_max, msk->pm.add_addr_signaled, add_addr_signal_max, - msk->pm.extra_subflows, subflows_max); + msk->pm.extra_subflows, limit_extra_subflows); /* check first for announce */ if (msk->pm.add_addr_signaled < add_addr_signal_max) { @@ -353,7 +353,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) subflow: /* check if should create a new subflow */ while (msk->pm.local_addr_used < local_addr_max && - msk->pm.extra_subflows < subflows_max) { + msk->pm.extra_subflows < limit_extra_subflows) { struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; bool fullmesh; int i, nr; @@ -402,14 +402,15 @@ fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk, bool c_flag_case) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - unsigned int subflows_max = mptcp_pm_get_subflows_max(msk); struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *entry; + unsigned int limit_extra_subflows; struct mptcp_addr_info mpc_addr; struct mptcp_pm_local *local; int i = 0; mptcp_local_address((struct sock_common *)msk, &mpc_addr); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); rcu_read_lock(); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { @@ -444,7 +445,7 @@ fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk, msk->pm.extra_subflows++; i++; - if (msk->pm.extra_subflows >= subflows_max) + if (msk->pm.extra_subflows >= limit_extra_subflows) break; } rcu_read_unlock(); @@ -459,13 +460,14 @@ fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, { unsigned int local_addr_max = mptcp_pm_get_local_addr_max(msk); struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - 
unsigned int subflows_max = mptcp_pm_get_subflows_max(msk); struct sock *sk = (struct sock *)msk; + unsigned int limit_extra_subflows; struct mptcp_addr_info mpc_addr; struct mptcp_pm_local *local; int i = 0; mptcp_local_address((struct sock_common *)msk, &mpc_addr); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); while (msk->pm.local_addr_used < local_addr_max) { local = &locals[i]; @@ -486,7 +488,7 @@ fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, msk->pm.extra_subflows++; i++; - if (msk->pm.extra_subflows >= subflows_max) + if (msk->pm.extra_subflows >= limit_extra_subflows) break; } @@ -544,14 +546,14 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) { struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX]; struct sock *sk = (struct sock *)msk; + unsigned int limit_extra_subflows; unsigned int add_addr_accept_max; struct mptcp_addr_info remote; - unsigned int subflows_max; bool sf_created = false; int i, nr; add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); pr_debug("accepted %d:%d remote family %d\n", msk->pm.add_addr_accepted, add_addr_accept_max, @@ -586,7 +588,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) if (remote.id) msk->pm.add_addr_accepted++; if (msk->pm.add_addr_accepted >= add_addr_accept_max || - msk->pm.extra_subflows >= subflows_max) + msk->pm.extra_subflows >= limit_extra_subflows) WRITE_ONCE(msk->pm.accept_addr, false); } } @@ -1285,13 +1287,13 @@ int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info) if (ret) goto unlock; - subflows = pernet->subflows_max; + subflows = pernet->limit_extra_subflows; ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows); if (ret) goto unlock; WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs); - WRITE_ONCE(pernet->subflows_max, subflows); + WRITE_ONCE(pernet->limit_extra_subflows, subflows); unlock: spin_unlock_bh(&pernet->lock); @@ -1318,7 +1320,7 @@ int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info) goto fail; if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS, - READ_ONCE(pernet->subflows_max))) + READ_ONCE(pernet->limit_extra_subflows))) goto fail; genlmsg_end(msg, reply); @@ -1427,7 +1429,7 @@ bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - if (msk->pm.extra_subflows == mptcp_pm_get_subflows_max(msk) || + if (msk->pm.extra_subflows == mptcp_pm_get_limit_extra_subflows(msk) || (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) { WRITE_ONCE(msk->pm.work_pending, false); @@ -1462,7 +1464,7 @@ static int __net_init pm_nl_init_net(struct net *net) INIT_LIST_HEAD_RCU(&pernet->local_addr_list); /* Cit. 2 subflows ought to be enough for anybody. 
*/ - pernet->subflows_max = 2; + pernet->limit_extra_subflows = 2; pernet->next_id = 1; pernet->stale_loss_cnt = 4; spin_lock_init(&pernet->lock); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index ca68f9a75801..4c777f87b049 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1182,13 +1182,13 @@ void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); -unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk); +unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); /* called under PM lock */ static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk) { - if (--msk->pm.extra_subflows < mptcp_pm_get_subflows_max(msk)) + if (--msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk)) WRITE_ONCE(msk->pm.accept_subflow, true); } @@ -1204,7 +1204,7 @@ static inline bool mptcp_pm_add_addr_c_flag_case(struct mptcp_sock *msk) return READ_ONCE(msk->pm.remote_deny_join_id0) && msk->pm.local_addr_used == 0 && mptcp_pm_get_add_addr_accept_max(msk) == 0 && - msk->pm.extra_subflows < mptcp_pm_get_subflows_max(msk); + msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk); } void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 17966da80239..4e82bcfcd34e 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -972,8 +972,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) /* The following limits only make sense for the in-kernel PM */ if (mptcp_pm_is_kernel(msk)) { - info->mptcpi_subflows_max = - mptcp_pm_get_subflows_max(msk); + info->mptcpi_limit_extra_subflows = + mptcp_pm_get_limit_extra_subflows(msk); info->mptcpi_add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); info->mptcpi_add_addr_accepted_max = -- cgit v1.2.3 From 45cae570664d58c562e21a3c7409fc02147bba46 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:42 +0200 Subject: mptcp: pm: in-kernel: rename 'add_addr_signal_max' to 'endp_signal_max' A few variables linked to the in-kernel Path-Manager are confusing, and it would help current and future developers to clarify them. One of them is 'add_addr_signal_max', which in fact represents the maximum number of 'signal' endpoints that can be used to announce addresses, and not the number of ADD_ADDR that can be signalled. While at it, add an additional name for the corresponding variable in MPTCP INFO: mptcpi_endp_signal_max. Not to break the current uAPI, the new name is added as a 'define' pointing to the former name. This will then also help userspace developers. No functional changes intended. 
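A minimal sketch of what the renamed counter tracks (example_endp_added() is invented for illustration; the flag test mirrors the diff below):

/* Sketch: endp_signal_max counts configured local endpoints that carry
 * the SIGNAL flag; it is not a count of ADD_ADDRs actually announced on
 * any given connection. */
static void example_endp_added(struct pm_nl_pernet *pernet,
			       const struct mptcp_pm_addr_entry *entry)
{
	if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
		WRITE_ONCE(pernet->endp_signal_max,
			   READ_ONCE(pernet->endp_signal_max) + 1);
}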
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-7-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm.c | 2 +- net/mptcp/pm_kernel.c | 26 +++++++++++++------------- net/mptcp/protocol.h | 2 +- net/mptcp/sockopt.c | 4 ++-- 5 files changed, 18 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 314200c61f15..69fc20db1c2f 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -48,6 +48,7 @@ struct mptcp_info { __u8 mptcpi_subflows_max; #define mptcpi_limit_extra_subflows mptcpi_subflows_max __u8 mptcpi_add_addr_signal_max; + #define mptcpi_endp_signal_max mptcpi_add_addr_signal_max __u8 mptcpi_add_addr_accepted_max; __u32 mptcpi_flags; __u32 mptcpi_token; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 502f6c235e06..1100ba8b1ce8 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -1037,7 +1037,7 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk) WRITE_ONCE(pm->work_pending, (!!mptcp_pm_get_local_addr_max(msk) && subflows_allowed) || - !!mptcp_pm_get_add_addr_signal_max(msk)); + !!mptcp_pm_get_endp_signal_max(msk)); WRITE_ONCE(pm->accept_addr, !!mptcp_pm_get_add_addr_accept_max(msk) && subflows_allowed); diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index db0d254d0e6b..740f0b20b941 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -20,7 +20,7 @@ struct pm_nl_pernet { struct list_head local_addr_list; unsigned int addrs; unsigned int stale_loss_cnt; - unsigned int add_addr_signal_max; + unsigned int endp_signal_max; unsigned int add_addr_accept_max; unsigned int local_addr_max; unsigned int limit_extra_subflows; @@ -46,13 +46,13 @@ static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info) return pm_nl_get_pernet(genl_info_net(info)); } -unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk) +unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk) { const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - return READ_ONCE(pernet->add_addr_signal_max); + return READ_ONCE(pernet->endp_signal_max); } -EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max); +EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_signal_max); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) { @@ -275,15 +275,15 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; unsigned int limit_extra_subflows; - unsigned int add_addr_signal_max; bool signal_and_subflow = false; + unsigned int endp_signal_max; unsigned int local_addr_max; struct pm_nl_pernet *pernet; struct mptcp_pm_local local; pernet = pm_nl_get_pernet(sock_net(sk)); - add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); + endp_signal_max = mptcp_pm_get_endp_signal_max(msk); local_addr_max = mptcp_pm_get_local_addr_max(msk); limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); @@ -312,11 +312,11 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", msk->pm.local_addr_used, local_addr_max, - msk->pm.add_addr_signaled, add_addr_signal_max, + msk->pm.add_addr_signaled, endp_signal_max, msk->pm.extra_subflows, limit_extra_subflows); /* check first for announce */ - if (msk->pm.add_addr_signaled < add_addr_signal_max) { + if (msk->pm.add_addr_signaled < 
endp_signal_max) { /* due to racing events on both ends we can reach here while * previous add address is still running: if we invoke now * mptcp_pm_announce_addr(), that will fail and the @@ -699,8 +699,8 @@ find_next: pernet->next_id = entry->addr.id; if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { - addr_max = pernet->add_addr_signal_max; - WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1); + addr_max = pernet->endp_signal_max; + WRITE_ONCE(pernet->endp_signal_max, addr_max + 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { addr_max = pernet->local_addr_max; @@ -1098,8 +1098,8 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { - addr_max = pernet->add_addr_signal_max; - WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1); + addr_max = pernet->endp_signal_max; + WRITE_ONCE(pernet->endp_signal_max, addr_max - 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { addr_max = pernet->local_addr_max; @@ -1185,7 +1185,7 @@ static void __flush_addrs(struct list_head *list) static void __reset_counters(struct pm_nl_pernet *pernet) { - WRITE_ONCE(pernet->add_addr_signal_max, 0); + WRITE_ONCE(pernet->endp_signal_max, 0); WRITE_ONCE(pernet->local_addr_max, 0); pernet->addrs = 0; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 4c777f87b049..86c30cd6c1f2 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1180,7 +1180,7 @@ void __init mptcp_pm_userspace_register(void); void __init mptcp_pm_nl_init(void); void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); -unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk); +unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 4e82bcfcd34e..4688e0f25d15 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -974,8 +974,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) if (mptcp_pm_is_kernel(msk)) { info->mptcpi_limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); - info->mptcpi_add_addr_signal_max = - mptcp_pm_get_add_addr_signal_max(msk); + info->mptcpi_endp_signal_max = + mptcp_pm_get_endp_signal_max(msk); info->mptcpi_add_addr_accepted_max = mptcp_pm_get_add_addr_accept_max(msk); info->mptcpi_local_addr_max = -- cgit v1.2.3 From 37712d84dfc2e80d4d218ff9be490c86e604aa69 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:43 +0200 Subject: mptcp: pm: in-kernel: rename 'add_addr_accept_max' to 'limit_add_addr_accepted' A few variables linked to the in-kernel Path-Manager are confusing, and clarifying them would help current and future developers. One of them is 'add_addr_accept_max', which in fact represents the limit on the number of ADD_ADDR that can be accepted: the limit set via 'ip mptcp limit add_addr_accepted X' for example. It is a configured limit, not a counter of the ADD_ADDR that have actually been accepted. While at it, add an additional name for the corresponding variable in MPTCP INFO: mptcpi_limit_add_addr_accepted. To avoid breaking the current uAPI, the new name is added as a 'define' pointing to the former name. This will also help userspace developers. No functional changes intended.
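A note on the access pattern visible in these hunks: the limits are stored with WRITE_ONCE() by writers serialized on pernet->lock and fetched with READ_ONCE() by lockless readers. A generic kernel-style sketch of that pattern, with invented foo_* names rather than the patch's code:

#include <linux/spinlock.h>
#include <linux/compiler.h>

/* Invented example: one locked writer, many lockless readers. */
struct foo_limits {
	spinlock_t lock;	/* serializes writers only */
	unsigned int limit_add_addr_accepted;
};

static void foo_limits_set(struct foo_limits *l, unsigned int val)
{
	spin_lock_bh(&l->lock);
	/* single-copy-atomic store: readers never observe a torn value */
	WRITE_ONCE(l->limit_add_addr_accepted, val);
	spin_unlock_bh(&l->lock);
}

static unsigned int foo_limits_get(const struct foo_limits *l)
{
	/* lockless read; pairs with the WRITE_ONCE() above */
	return READ_ONCE(l->limit_add_addr_accepted);
}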
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-8-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm.c | 2 +- net/mptcp/pm_kernel.c | 27 +++++++++++++++------------ net/mptcp/protocol.h | 4 ++-- net/mptcp/sockopt.c | 4 ++-- 5 files changed, 21 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 69fc20db1c2f..1c275ce96b52 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -50,6 +50,7 @@ struct mptcp_info { __u8 mptcpi_add_addr_signal_max; #define mptcpi_endp_signal_max mptcpi_add_addr_signal_max __u8 mptcpi_add_addr_accepted_max; + #define mptcpi_limit_add_addr_accepted mptcpi_add_addr_accepted_max __u32 mptcpi_flags; __u32 mptcpi_token; __u64 mptcpi_write_seq; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 1100ba8b1ce8..e13bfec50ef8 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -1039,7 +1039,7 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk) subflows_allowed) || !!mptcp_pm_get_endp_signal_max(msk)); WRITE_ONCE(pm->accept_addr, - !!mptcp_pm_get_add_addr_accept_max(msk) && + !!mptcp_pm_get_limit_add_addr_accepted(msk) && subflows_allowed); WRITE_ONCE(pm->accept_subflow, subflows_allowed); diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 740f0b20b941..92f7419485a8 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -21,7 +21,7 @@ struct pm_nl_pernet { unsigned int addrs; unsigned int stale_loss_cnt; unsigned int endp_signal_max; - unsigned int add_addr_accept_max; + unsigned int limit_add_addr_accepted; unsigned int local_addr_max; unsigned int limit_extra_subflows; unsigned int next_id; @@ -54,13 +54,13 @@ unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_signal_max); -unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) +unsigned int mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - return READ_ONCE(pernet->add_addr_accept_max); + return READ_ONCE(pernet->limit_add_addr_accepted); } -EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max); +EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_add_addr_accepted); unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk) { @@ -547,16 +547,16 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX]; struct sock *sk = (struct sock *)msk; unsigned int limit_extra_subflows; - unsigned int add_addr_accept_max; + unsigned int limit_add_addr_accepted; struct mptcp_addr_info remote; bool sf_created = false; int i, nr; - add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); + limit_add_addr_accepted = mptcp_pm_get_limit_add_addr_accepted(msk); limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); pr_debug("accepted %d:%d remote family %d\n", - msk->pm.add_addr_accepted, add_addr_accept_max, + msk->pm.add_addr_accepted, limit_add_addr_accepted, msk->pm.remote.family); remote = msk->pm.remote; @@ -587,7 +587,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) /* add_addr_accepted is not decr for ID 0 */ if (remote.id) msk->pm.add_addr_accepted++; - if (msk->pm.add_addr_accepted >= add_addr_accept_max || + if (msk->pm.add_addr_accepted >= limit_add_addr_accepted || msk->pm.extra_subflows >= 
limit_extra_subflows) WRITE_ONCE(msk->pm.accept_addr, false); } @@ -596,10 +596,13 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id) { if (rm_id && WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { + unsigned int limit_add_addr_accepted = + mptcp_pm_get_limit_add_addr_accepted(msk); + /* Note: if the subflow has been closed before, this * add_addr_accepted counter will not be decremented. */ - if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk)) + if (--msk->pm.add_addr_accepted < limit_add_addr_accepted) WRITE_ONCE(msk->pm.accept_addr, true); } } @@ -1282,7 +1285,7 @@ int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info) int ret; spin_lock_bh(&pernet->lock); - rcv_addrs = pernet->add_addr_accept_max; + rcv_addrs = pernet->limit_add_addr_accepted; ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs); if (ret) goto unlock; @@ -1292,7 +1295,7 @@ if (ret) goto unlock; - WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs); + WRITE_ONCE(pernet->limit_add_addr_accepted, rcv_addrs); WRITE_ONCE(pernet->limit_extra_subflows, subflows); unlock: @@ -1316,7 +1319,7 @@ int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info) goto fail; if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS, - READ_ONCE(pernet->add_addr_accept_max))) + READ_ONCE(pernet->limit_add_addr_accepted))) goto fail; if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS, diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 86c30cd6c1f2..114995e1352d 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1181,7 +1181,7 @@ void __init mptcp_pm_nl_init(void); void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk); -unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); +unsigned int mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); @@ -1203,7 +1203,7 @@ static inline bool mptcp_pm_add_addr_c_flag_case(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.remote_deny_join_id0) && msk->pm.local_addr_used == 0 && - mptcp_pm_get_add_addr_accept_max(msk) == 0 && + mptcp_pm_get_limit_add_addr_accepted(msk) == 0 && msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk); } diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 4688e0f25d15..5ab9909dbe79 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -976,8 +976,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) mptcp_pm_get_limit_extra_subflows(msk); info->mptcpi_endp_signal_max = mptcp_pm_get_endp_signal_max(msk); - info->mptcpi_add_addr_accepted_max = - mptcp_pm_get_add_addr_accept_max(msk); + info->mptcpi_limit_add_addr_accepted = + mptcp_pm_get_limit_add_addr_accepted(msk); info->mptcpi_local_addr_max = mptcp_pm_get_local_addr_max(msk); } -- cgit v1.2.3 From e7757b6d3a623671705388be24851af7360b54ba Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:44 +0200 Subject: mptcp: pm: in-kernel: rename 'local_addr_max' to 'endp_subflow_max' A few variables linked to the in-kernel Path-Manager are confusing, and clarifying them would help current and future developers.
One of them is 'local_addr_max', which in fact represents the maximum number of 'subflow' endpoints that can be used to create new subflows, and not the number of local addresses that have been used to create subflows. While at it, add an additional name for the corresponding variable in MPTCP INFO: mptcpi_endp_subflow_max. To avoid breaking the current uAPI, the new name is added as a 'define' pointing to the former name. This will also help userspace developers. Also move the variable and function next to the other 'endp_X_max' ones. No functional changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-9-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm.c | 2 +- net/mptcp/pm_kernel.c | 40 ++++++++++++++++++++-------------------- net/mptcp/protocol.h | 2 +- net/mptcp/sockopt.c | 4 ++-- 5 files changed, 25 insertions(+), 24 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 1c275ce96b52..5ec996977b3f 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -58,6 +58,7 @@ struct mptcp_info { __u64 mptcpi_rcv_nxt; __u8 mptcpi_local_addr_used; __u8 mptcpi_local_addr_max; + #define mptcpi_endp_subflow_max mptcpi_local_addr_max __u8 mptcpi_csum_enabled; __u32 mptcpi_retransmits; __u64 mptcpi_bytes_retrans; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index e13bfec50ef8..2ff1b9499568 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -1035,7 +1035,7 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk) * pm->pm_type is set to MPTCP_PM_TYPE_KERNEL */ WRITE_ONCE(pm->work_pending, - (!!mptcp_pm_get_local_addr_max(msk) && + (!!mptcp_pm_get_endp_subflow_max(msk) && subflows_allowed) || !!mptcp_pm_get_endp_signal_max(msk)); WRITE_ONCE(pm->accept_addr, diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 92f7419485a8..e62e21eb9da1 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -21,8 +21,8 @@ struct pm_nl_pernet { unsigned int addrs; unsigned int stale_loss_cnt; unsigned int endp_signal_max; + unsigned int endp_subflow_max; unsigned int limit_add_addr_accepted; - unsigned int local_addr_max; unsigned int limit_extra_subflows; unsigned int next_id; DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); @@ -54,6 +54,14 @@ unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_signal_max); +unsigned int mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->endp_subflow_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_subflow_max); + unsigned int mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); @@ -70,14 +78,6 @@ unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_extra_subflows); -unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk) -{ - struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - - return READ_ONCE(pernet->local_addr_max); -} -EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max); - static bool lookup_subflow_by_daddr(const struct list_head *list, const struct mptcp_addr_info *daddr) { @@ -276,15 +276,15 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) struct sock *sk = (struct sock *)msk;
unsigned int limit_extra_subflows; bool signal_and_subflow = false; + unsigned int endp_subflow_max; unsigned int endp_signal_max; - unsigned int local_addr_max; struct pm_nl_pernet *pernet; struct mptcp_pm_local local; pernet = pm_nl_get_pernet(sock_net(sk)); endp_signal_max = mptcp_pm_get_endp_signal_max(msk); - local_addr_max = mptcp_pm_get_local_addr_max(msk); + endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk); limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); /* do lazy endpoint usage accounting for the MPC subflows */ @@ -311,7 +311,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) } pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", - msk->pm.local_addr_used, local_addr_max, + msk->pm.local_addr_used, endp_subflow_max, msk->pm.add_addr_signaled, endp_signal_max, msk->pm.extra_subflows, limit_extra_subflows); @@ -352,7 +352,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) subflow: /* check if should create a new subflow */ - while (msk->pm.local_addr_used < local_addr_max && + while (msk->pm.local_addr_used < endp_subflow_max && msk->pm.extra_subflows < limit_extra_subflows) { struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; bool fullmesh; @@ -458,7 +458,7 @@ fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, struct mptcp_addr_info *remote, struct mptcp_pm_local *locals) { - unsigned int local_addr_max = mptcp_pm_get_local_addr_max(msk); + unsigned int endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk); struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); struct sock *sk = (struct sock *)msk; unsigned int limit_extra_subflows; @@ -469,7 +469,7 @@ fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, mptcp_local_address((struct sock_common *)msk, &mpc_addr); limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); - while (msk->pm.local_addr_used < local_addr_max) { + while (msk->pm.local_addr_used < endp_subflow_max) { local = &locals[i]; if (!select_local_address(pernet, msk, local)) @@ -706,8 +706,8 @@ find_next: WRITE_ONCE(pernet->endp_signal_max, addr_max + 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - addr_max = pernet->local_addr_max; - WRITE_ONCE(pernet->local_addr_max, addr_max + 1); + addr_max = pernet->endp_subflow_max; + WRITE_ONCE(pernet->endp_subflow_max, addr_max + 1); } pernet->addrs++; @@ -1105,8 +1105,8 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) WRITE_ONCE(pernet->endp_signal_max, addr_max - 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - addr_max = pernet->local_addr_max; - WRITE_ONCE(pernet->local_addr_max, addr_max - 1); + addr_max = pernet->endp_subflow_max; + WRITE_ONCE(pernet->endp_subflow_max, addr_max - 1); } pernet->addrs--; @@ -1189,7 +1189,7 @@ static void __flush_addrs(struct list_head *list) static void __reset_counters(struct pm_nl_pernet *pernet) { WRITE_ONCE(pernet->endp_signal_max, 0); - WRITE_ONCE(pernet->local_addr_max, 0); + WRITE_ONCE(pernet->endp_subflow_max, 0); pernet->addrs = 0; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 114995e1352d..df8f977039d0 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1181,9 +1181,9 @@ void __init mptcp_pm_nl_init(void); void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk); +unsigned int mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk); unsigned int 
mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); -unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); /* called under PM lock */ static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 5ab9909dbe79..92a2a2742627 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -978,8 +978,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) mptcp_pm_get_endp_signal_max(msk); info->mptcpi_limit_add_addr_accepted = mptcp_pm_get_limit_add_addr_accepted(msk); - info->mptcpi_local_addr_max = - mptcp_pm_get_local_addr_max(msk); + info->mptcpi_endp_subflow_max = + mptcp_pm_get_endp_subflow_max(msk); } if (__mptcp_check_fallback(msk)) -- cgit v1.2.3 From 539f6b9de39ec5d827b16f6f5c8f3cfd58669e93 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:50 +0200 Subject: mptcp: pm: in-kernel: add laminar endpoints Currently, upon the reception of an ADD_ADDR (and when the fullmesh flag is not used), the in-kernel PM will create new subflows using the local address the routing configuration will pick. It would be easier to pick local addresses from a selected list of endpoints, and use each of them only once, than to rely on routing rules. Use case: both the client (C) and the server (S) have two addresses (a and b). The client establishes the connection between C(a) and S(a). Once established, the server announces its additional address S(b). Once received, the client connects to it using its second address C(b). Compared to a situation without the 'laminar' endpoint for C(b), the client didn't use this address C(b) to establish a subflow to the server's primary address S(a). So at the end, we have: C S C(a) --- S(a) C(b) --- S(b) In case of a 3rd address on each side (C(c) and S(c)), upon the reception of an ADD_ADDR with S(c), the client should not pick C(b) because it has already been used. C(c) should then be used. Note that this situation is currently possible if C doesn't add any endpoint, but configures the routing in order to pick C(b) for the route to S(b), and pick C(c) for the route to S(c). That doesn't sound very practical because it means knowing in advance the IP addresses that will be used and announced by the server. 'laminar', like the idea of laminar flows: the different subflows don't mix with each other on an endpoint, unlike the "turbulent" way traffic is mixed by 'fullmesh'. In the code, the new endpoint type is added. Similar to the other subflow types, an MPTCP_INFO counter is added. While at it, holes are now commented in struct mptcp_info, as a reminder that they can no longer be used.
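To make the selection rule concrete: the patch builds a bitmap of local IDs already used by live subflows, then walks the endpoint list and takes the first 'laminar' entry whose ID is still free. A hedged plain-C distillation with invented types (the kernel version below uses RCU lists and DECLARE_BITMAP instead):

#include <stdbool.h>

#define MAX_ADDR_ID 255

struct endp {
	unsigned int id;
	bool laminar;
};

/* Return the index of the first usable 'laminar' endpoint, or -1.
 * used_ids[] marks local IDs held by still-live subflows, mirroring
 * the unavail_id bitmap in fill_local_laminar_endp().
 */
static int pick_laminar(const struct endp *endps, int n_endps,
			const bool used_ids[MAX_ADDR_ID + 1])
{
	for (int i = 0; i < n_endps; i++) {
		if (!endps[i].laminar)
			continue;
		if (endps[i].id > MAX_ADDR_ID || used_ids[endps[i].id])
			continue;	/* this address was already used once */
		return i;		/* laminar: each endpoint used at most once */
	}
	return -1;
}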
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/503 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-15-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 6 +++- net/mptcp/pm_kernel.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++ net/mptcp/protocol.h | 1 + net/mptcp/sockopt.c | 2 ++ 4 files changed, 90 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 5ec996977b3f..87cfab874e24 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -39,6 +39,7 @@ #define MPTCP_PM_ADDR_FLAG_BACKUP _BITUL(2) #define MPTCP_PM_ADDR_FLAG_FULLMESH _BITUL(3) #define MPTCP_PM_ADDR_FLAG_IMPLICIT _BITUL(4) +#define MPTCP_PM_ADDR_FLAG_LAMINAR _BITUL(5) struct mptcp_info { __u8 mptcpi_subflows; @@ -51,6 +52,7 @@ struct mptcp_info { #define mptcpi_endp_signal_max mptcpi_add_addr_signal_max __u8 mptcpi_add_addr_accepted_max; #define mptcpi_limit_add_addr_accepted mptcpi_add_addr_accepted_max + /* 16-bit hole that can no longer be filled */ __u32 mptcpi_flags; __u32 mptcpi_token; __u64 mptcpi_write_seq; @@ -60,13 +62,15 @@ struct mptcp_info { __u8 mptcpi_local_addr_max; #define mptcpi_endp_subflow_max mptcpi_local_addr_max __u8 mptcpi_csum_enabled; + /* 8-bit hole that can no longer be filled */ __u32 mptcpi_retransmits; __u64 mptcpi_bytes_retrans; __u64 mptcpi_bytes_sent; __u64 mptcpi_bytes_received; __u64 mptcpi_bytes_acked; __u8 mptcpi_subflows_total; - __u8 reserved[3]; + __u8 mptcpi_endp_laminar_max; + __u8 reserved[2]; __u32 mptcpi_last_data_sent; __u32 mptcpi_last_data_recv; __u32 mptcpi_last_ack_recv; diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 55dbf89d19b8..e0f44dc232aa 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -21,6 +21,7 @@ struct pm_nl_pernet { u8 endpoints; u8 endp_signal_max; u8 endp_subflow_max; + u8 endp_laminar_max; u8 limit_add_addr_accepted; u8 limit_extra_subflows; u8 next_id; @@ -61,6 +62,14 @@ u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_subflow_max); +u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->endp_laminar_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_laminar_max); + u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); @@ -458,6 +467,66 @@ fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk, return i; } +static unsigned int +fill_local_laminar_endp(struct mptcp_sock *msk, struct mptcp_addr_info *remote, + struct mptcp_pm_local *locals) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + struct mptcp_pm_addr_entry *entry; + struct mptcp_pm_local *local; + int found = 0; + + /* Forbid creation of new subflows matching existing ones, possibly + * already created by 'subflow' endpoints + */ + bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if ((1 << inet_sk_state_load(ssk)) & + (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | + TCPF_CLOSE)) + continue; + + __set_bit(subflow_get_local_id(subflow), 
unavail_id); + } + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &pernet->endp_list, list) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR)) + continue; + + if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote)) + continue; + + if (test_bit(mptcp_endp_get_local_id(msk, &entry->addr), + unavail_id)) + continue; + + local = &locals[0]; + local->addr = entry->addr; + local->flags = entry->flags; + local->ifindex = entry->ifindex; + + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); + + if (local->addr.id != msk->mpc_endpoint_id) + msk->pm.local_addr_used++; + } + + msk->pm.extra_subflows++; + found = 1; + break; + } + rcu_read_unlock(); + + return found; +} + static unsigned int fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, struct mptcp_addr_info *remote, @@ -532,6 +601,10 @@ fill_local_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *remote, if (i) return i; + /* If there is at least one MPTCP endpoint with a laminar flag */ + if (mptcp_pm_get_endp_laminar_max(msk)) + return fill_local_laminar_endp(msk, remote, locals); + /* Special case: peer sets the C flag, accept one ADD_ADDR if default * limits are used -- accepting no ADD_ADDR -- and use subflow endpoints */ @@ -707,6 +780,10 @@ find_next: addr_max = pernet->endp_subflow_max; WRITE_ONCE(pernet->endp_subflow_max, addr_max + 1); } + if (entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR) { + addr_max = pernet->endp_laminar_max; + WRITE_ONCE(pernet->endp_laminar_max, addr_max + 1); + } pernet->endpoints++; if (!entry->addr.port) @@ -1100,6 +1177,10 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) addr_max = pernet->endp_subflow_max; WRITE_ONCE(pernet->endp_subflow_max, addr_max - 1); } + if (entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR) { + addr_max = pernet->endp_laminar_max; + WRITE_ONCE(pernet->endp_laminar_max, addr_max - 1); + } pernet->endpoints--; list_del_rcu(&entry->list); @@ -1182,6 +1263,7 @@ static void __reset_counters(struct pm_nl_pernet *pernet) { WRITE_ONCE(pernet->endp_signal_max, 0); WRITE_ONCE(pernet->endp_subflow_max, 0); + WRITE_ONCE(pernet->endp_laminar_max, 0); pernet->endpoints = 0; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 0cd3333cafaf..371084a3fc22 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1182,6 +1182,7 @@ void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk); u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk); +u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk); u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk); u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 92a2a2742627..a28a48385885 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -980,6 +980,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) mptcp_pm_get_limit_add_addr_accepted(msk); info->mptcpi_endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk); + info->mptcpi_endp_laminar_max = + mptcp_pm_get_endp_laminar_max(msk); } if (__mptcp_check_fallback(msk)) -- cgit v1.2.3 From a680581f6a131fd8c62d284ed4a24d4bc1cc553e Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Sat, 27 Sep 2025 10:49:10 +0200 Subject: dpll: add phase-offset-avg-factor device attribute to netlink spec Add dpll device level attribute DPLL_A_PHASE_OFFSET_AVG_FACTOR 
to allow control over the calculation of the reported phase offset value. The attribute is present if the driver provides this capability; otherwise, it shall not be present. Signed-off-by: Ivan Vecera Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20250927084912.2343597-2-ivecera@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/driver-api/dpll.rst | 18 +++++++++++++++++- Documentation/netlink/specs/dpll.yaml | 6 ++++++ drivers/dpll/dpll_nl.c | 5 +++-- include/uapi/linux/dpll.h | 1 + 4 files changed, 27 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/driver-api/dpll.rst b/Documentation/driver-api/dpll.rst index eca72d9b9ed8..be1fc643b645 100644 --- a/Documentation/driver-api/dpll.rst +++ b/Documentation/driver-api/dpll.rst @@ -179,7 +179,23 @@ Phase offset measurement and adjustment Device may provide ability to measure a phase difference between signals on a pin and its parent dpll device. If pin-dpll phase offset measurement is supported, it shall be provided with ``DPLL_A_PIN_PHASE_OFFSET`` -attribute for each parent dpll device. +attribute for each parent dpll device. The reported phase offset may be +computed as the average of prior values and the current measurement, using +the following formula: + +.. math:: + curr\_avg = prev\_avg * \frac{2^N-1}{2^N} + new\_val * \frac{1}{2^N} + +where `curr_avg` is the current reported phase offset, `prev_avg` is the +previously reported value, `new_val` is the current measurement, and `N` is +the averaging factor. The configured averaging factor value is provided with +the ``DPLL_A_PHASE_OFFSET_AVG_FACTOR`` attribute of a device, and a value change +can be requested with the same attribute via the ``DPLL_CMD_DEVICE_SET`` command. + + ================================== ====================================== + ``DPLL_A_PHASE_OFFSET_AVG_FACTOR`` attr configured value of phase offset + averaging factor + ================================== ====================================== Device may also provide ability to adjust a signal phase on a pin. If pin phase adjustment is supported, minimal and maximal values that pin diff --git a/Documentation/netlink/specs/dpll.yaml b/Documentation/netlink/specs/dpll.yaml index 5decee61a2c4..cafb4ec20447 100644 --- a/Documentation/netlink/specs/dpll.yaml +++ b/Documentation/netlink/specs/dpll.yaml @@ -315,6 +315,10 @@ attribute-sets: If enabled, dpll device shall monitor and notify all currently available inputs for changes of their phase offset against the dpll device. + - + name: phase-offset-avg-factor + type: u32 + doc: Averaging factor applied to the calculation of the reported phase offset.
- name: pin enum-name: dpll_a_pin @@ -523,6 +527,7 @@ operations: - clock-id - type - phase-offset-monitor + - phase-offset-avg-factor dump: reply: *dev-attrs @@ -540,6 +545,7 @@ operations: attributes: - id - phase-offset-monitor + - phase-offset-avg-factor - name: device-create-ntf doc: Notification about device appearing diff --git a/drivers/dpll/dpll_nl.c b/drivers/dpll/dpll_nl.c index 9f2efaf25268..3c6d570babf8 100644 --- a/drivers/dpll/dpll_nl.c +++ b/drivers/dpll/dpll_nl.c @@ -42,9 +42,10 @@ static const struct nla_policy dpll_device_get_nl_policy[DPLL_A_ID + 1] = { }; /* DPLL_CMD_DEVICE_SET - do */ -static const struct nla_policy dpll_device_set_nl_policy[DPLL_A_PHASE_OFFSET_MONITOR + 1] = { +static const struct nla_policy dpll_device_set_nl_policy[DPLL_A_PHASE_OFFSET_AVG_FACTOR + 1] = { [DPLL_A_ID] = { .type = NLA_U32, }, [DPLL_A_PHASE_OFFSET_MONITOR] = NLA_POLICY_MAX(NLA_U32, 1), + [DPLL_A_PHASE_OFFSET_AVG_FACTOR] = { .type = NLA_U32, }, }; /* DPLL_CMD_PIN_ID_GET - do */ @@ -112,7 +113,7 @@ static const struct genl_split_ops dpll_nl_ops[] = { .doit = dpll_nl_device_set_doit, .post_doit = dpll_post_doit, .policy = dpll_device_set_nl_policy, - .maxattr = DPLL_A_PHASE_OFFSET_MONITOR, + .maxattr = DPLL_A_PHASE_OFFSET_AVG_FACTOR, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h index 37b438ce8efc..ab1725a954d7 100644 --- a/include/uapi/linux/dpll.h +++ b/include/uapi/linux/dpll.h @@ -216,6 +216,7 @@ enum dpll_a { DPLL_A_LOCK_STATUS_ERROR, DPLL_A_CLOCK_QUALITY_LEVEL, DPLL_A_PHASE_OFFSET_MONITOR, + DPLL_A_PHASE_OFFSET_AVG_FACTOR, __DPLL_A_MAX, DPLL_A_MAX = (__DPLL_A_MAX - 1) -- cgit v1.2.3 From 7bd80ed89d72285515db673803b021469ba71ee8 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 24 Sep 2025 14:02:41 +0200 Subject: Documentation: net: add flow control guide and document ethtool API Introduce a new document, flow_control.rst, to provide a comprehensive guide on Ethernet Flow Control in Linux. The guide explains how flow control works, how autonegotiation resolves pause capabilities, and how to configure it using ethtool and Netlink. In parallel, document the pause and pause-stat attributes in the ethtool.yaml netlink spec. This enables the ynl tool to generate kernel-doc comments for the corresponding enums in the UAPI header, making the C interface self-documenting. Finally, replace the legacy flow control section in phy.rst with a reference to the new document and add pointers in the relevant C source files. Signed-off-by: Oleksij Rempel Link: https://patch.msgid.link/20250924120241.724850-1-o.rempel@pengutronix.de Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/ethtool.yaml | 27 ++ Documentation/networking/flow_control.rst | 373 +++++++++++++++++++++++++ Documentation/networking/index.rst | 1 + Documentation/networking/phy.rst | 12 +- include/linux/ethtool.h | 45 ++- include/uapi/linux/ethtool_netlink_generated.h | 4 +- net/dcb/dcbnl.c | 2 + net/ethtool/pause.c | 4 + 8 files changed, 453 insertions(+), 15 deletions(-) create mode 100644 Documentation/networking/flow_control.rst (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 6a0fb1974513..e4852505294f 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -864,7 +864,9 @@ attribute-sets: - name: pause-stat + doc: Statistics counters for link-wide PAUSE frames (IEEE 802.3 Annex 31B). 
attr-cnt-name: __ethtool-a-pause-stat-cnt + enum-name: ethtool-a-pause-stat attributes: - name: unspec @@ -875,13 +877,17 @@ attribute-sets: type: pad - name: tx-frames + doc: Number of PAUSE frames transmitted. type: u64 - name: rx-frames + doc: Number of PAUSE frames received. type: u64 - name: pause + doc: Parameters for link-wide PAUSE (IEEE 802.3 Annex 31B). attr-cnt-name: __ethtool-a-pause-cnt + enum-name: ethtool-a-pause attributes: - name: unspec @@ -893,19 +899,40 @@ attribute-sets: nested-attributes: header - name: autoneg + doc: | + Acts as a mode selector for the driver. + On GET: indicates the driver's behavior. If true, the driver will + respect the negotiated outcome; if false, the driver will use a + forced configuration. + On SET: if true, the driver configures the PHY's advertisement based + on the rx and tx attributes. If false, the driver forces the MAC + into the state defined by the rx and tx attributes. type: u8 - name: rx + doc: | + Enable receiving PAUSE frames (pausing local TX). + On GET: reflects the currently preferred configuration state. type: u8 - name: tx + doc: | + Enable transmitting PAUSE frames (pausing peer TX). + On GET: reflects the currently preferred configuration state. type: u8 - name: stats + doc: | + Contains the pause statistics counters. The source of these + statistics is determined by stats-src. type: nest nested-attributes: pause-stat - name: stats-src + doc: | + Selects the source of the MAC statistics, values from + enum ethtool_mac_stats_src. This allows requesting statistics + from the individual components of the MAC Merge layer. type: u32 - name: eee diff --git a/Documentation/networking/flow_control.rst b/Documentation/networking/flow_control.rst new file mode 100644 index 000000000000..48646d54513f --- /dev/null +++ b/Documentation/networking/flow_control.rst @@ -0,0 +1,373 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. _ethernet-flow-control: + +===================== +Ethernet Flow Control +===================== + +This document is a practical guide to Ethernet Flow Control in Linux, covering +what it is, how it works, and how to configure it. + +What is Flow Control? +===================== + +Flow control is a mechanism to prevent a fast sender from overwhelming a +slow receiver with data, which would cause buffer overruns and dropped packets. +The receiver can signal the sender to temporarily stop transmitting, giving it +time to process its backlog. + +Standards references +==================== + +Ethernet flow control mechanisms are specified across consolidated IEEE base +standards; some originated as amendments: + +- Collision-based flow control is part of CSMA/CD in **IEEE 802.3** + (half-duplex). +- Link-wide PAUSE is defined in **IEEE 802.3 Annex 31B** + (originally **802.3x**). +- Priority-based Flow Control (PFC) is defined in **IEEE 802.1Q Clause 36** + (originally **802.1Qbb**). + +In the remainder of this document, the consolidated clause numbers are used. + +How It Works: The Mechanisms +============================ + +The method used for flow control depends on the link's duplex mode. + +.. note:: + The user-visible ``ethtool`` pause API described in this document controls + **link-wide PAUSE** (IEEE 802.3 Annex 31B) only. It does not control the + collision-based behavior that exists on half-duplex links. + +1. Half-Duplex: Collision-Based Flow Control +-------------------------------------------- +On half-duplex links, a device cannot send and receive simultaneously, so PAUSE +frames are not used. 
Flow control is achieved by leveraging the CSMA/CD +(Carrier Sense Multiple Access with Collision Detection) protocol itself. + +* **How it works**: To inhibit incoming data, a receiving device can force a + collision on the line. When the sending station detects this collision, it + terminates its transmission, sends a "jam" signal, and then executes the + "Collision backoff and retransmission" procedure as defined in IEEE 802.3, + Section 4.2.3.2.5. This algorithm makes the sender wait for a random + period before attempting to retransmit. By repeatedly forcing collisions, + the receiver can effectively throttle the sender's transmission rate. + +.. note:: + While this mechanism is part of the IEEE standard, there is currently no + generic kernel API to configure or control it. Drivers should not enable + this feature until a standardized interface is available. + +.. warning:: + On shared-medium networks (e.g. 10BASE2, or twisted-pair networks using a + hub rather than a switch) forcing collisions inhibits traffic **across the + entire shared segment**, not just a single point-to-point link. Enabling + such behavior is generally undesirable. + +2. Full-Duplex: Link-wide PAUSE (IEEE 802.3 Annex 31B) +------------------------------------------------------ +On full-duplex links, devices can send and receive at the same time. Flow +control is achieved by sending a special **PAUSE frame**, defined by IEEE +802.3 Annex 31B. This mechanism pauses all traffic on the link and is therefore +called *link-wide PAUSE*. + +* **What it is**: A standard Ethernet frame with a globally reserved + destination MAC address (``01-80-C2-00-00-01``). This address is in a range + that standard IEEE 802.1D-compliant bridges do not forward. However, some + unmanaged or misconfigured bridges have been reported to forward these + frames, which can disrupt flow control across a network. + +* **How it works**: The frame contains a MAC Control opcode for PAUSE + (``0x0001``) and a ``pause_time`` value, telling the sender how long to + wait before sending more data frames. This time is specified in units of + "pause quantum", where one quantum is the time it takes to transmit 512 bits. + For example, one pause quantum is 51.2 microseconds on a 10 Mbit/s link, + and 512 nanoseconds on a 1 Gbit/s link. A ``pause_time`` of zero indicates + that the transmitter can resume transmission, even if a previous non-zero + pause time has not yet elapsed. + +* **Who uses it**: Any full-duplex link, from 10 Mbit/s to multi-gigabit speeds. + +3. Full-Duplex: Priority-based Flow Control (PFC) (IEEE 802.1Q Clause 36) +------------------------------------------------------------------------- +Priority-based Flow Control is an enhancement to the standard PAUSE mechanism +that allows flow control to be applied independently to different classes of +traffic, identified by their priority level. + +* **What it is**: PFC allows a receiver to pause traffic for one or more of the + 8 standard priority levels without stopping traffic for other priorities. + This is critical in data center environments for protocols that cannot + tolerate packet loss due to congestion (e.g., Fibre Channel over Ethernet + or RoCE). + +* **How it works**: PFC uses a specific PAUSE frame format. It shares the same + globally reserved destination MAC address (``01-80-C2-00-00-01``) as legacy + PAUSE frames but uses a unique opcode (``0x0101``). 
The frame payload + contains two key fields: + + - **``priority_enable_vector``**: An 8-bit mask where each bit corresponds to + one of the 8 priorities. If a bit is set to 1, it means the pause time + for that priority is active. + - **``time_vector``**: A list of eight 2-octet fields, one for each priority. + Each field specifies the ``pause_time`` for its corresponding priority, + measured in units of ``pause_quanta`` (the time to transmit 512 bits). + +.. note:: + When PFC is enabled for at least one priority on a port, the standard + **link-wide PAUSE** (IEEE 802.3 Annex 31B) must be disabled for that port. + The two mechanisms are mutually exclusive (IEEE 802.1Q Clause 36). + +Configuring Flow Control +======================== + +Link-wide PAUSE and Priority-based Flow Control are configured with different +tools. + +Configuring Link-wide PAUSE with ``ethtool`` (IEEE 802.3 Annex 31B) +------------------------------------------------------------------- +Use ``ethtool -a `` to view and ``ethtool -A `` to change +the link-wide PAUSE settings. + +.. code-block:: bash + + # View current link-wide PAUSE settings + ethtool -a eth0 + + # Enable RX and TX pause, with autonegotiation + ethtool -A eth0 autoneg on rx on tx on + +**Key Configuration Concepts**: + +* **Pause Autoneg vs Generic Autoneg**: ``ethtool -A ... autoneg {on,off}`` + controls **Pause Autoneg** (Annex 31B) only. It is independent from the + **Generic link autonegotiation** configured with ``ethtool -s``. A device can + have Generic autoneg **on** while Pause Autoneg is **off**, and vice versa. + +* **If Pause Autoneg is off** (``-A ... autoneg off``): the device will **not** + advertise pause in the PHY. The MAC PAUSE state is **forced** according to + ``rx``/``tx`` and does not depend on partner capabilities or resolution. + Ensure the peer is configured complementarily for PAUSE to be effective. + +* **If generic autoneg is off** but **Pause Autoneg is on**, the pause policy + is **remembered** by the kernel and applied later when Generic autoneg is + enabled again. + +* **Autonegotiation Mode**: The PHY will *advertise* the ``rx`` and ``tx`` + capabilities. The final active state is determined by what both sides of the + link agree on. See the "PHY (Physical Layer Transceiver)" section below, + especially the *Resolution* subsection, for details of the negotiation rules. + +* **Forced Mode**: This mode is necessary when autonegotiation is not used or + not possible. This includes links where one or both partners have + autonegotiation disabled, or in setups without a PHY (e.g., direct + MAC-to-MAC connections). The driver bypasses PHY advertisement and + directly forces the MAC into the specified ``rx``/``tx`` state. The + configuration on both sides of the link must be complementary. For + example, if one side is set to ``tx on`` ``rx off``, the link partner must be + set to ``tx off`` ``rx on`` for flow control to function correctly. + +Configuring PFC with ``dcb`` (IEEE 802.1Q Clause 36) +---------------------------------------------------- +PFC is part of the Data Center Bridging (DCB) subsystem and is managed with the +``dcb`` tool (iproute2). Some deployments use ``dcbtool`` (lldpad) instead; this +document shows ``dcb(8)`` examples. + +**Viewing PFC Settings**: + +.. code-block:: text + + $ dcb pfc show dev eth0 + pfc-cap 8 macsec-bypass off delay 4096 + prio-pfc 0:off 1:off 2:off 3:off 4:off 5:off 6:on 7:on + +This shows the PFC state (on/off) for each priority (0-7). + +**Changing PFC Settings**: + +.. 
code-block:: bash + + # Enable PFC on priorities 6 and 7, leaving others as they are + $ dcb pfc set dev eth0 prio-pfc 6:on 7:on + + # Disable PFC for all priorities except 6 and 7 + $ dcb pfc set dev eth0 prio-pfc all:off 6:on 7:on + +Monitoring Flow Control +======================= + +The standard way to check if flow control is actively being used is to view the +pause-related statistics. + +**Monitoring Link-wide PAUSE**: +Use ``ethtool --include-statistics -a ``. + +.. code-block:: text + + $ ethtool --include-statistics -a eth0 + Pause parameters for eth0: + ... + Statistics: + tx_pause_frames: 0 + rx_pause_frames: 0 + +**Monitoring PFC**: +PFC statistics (sent and received frames per priority) are available +through the ``dcb`` tool. + +.. code-block:: text + + $ dcb pfc show dev eth0 requests indications + requests 0:0 1:0 2:0 3:1024 4:2048 5:0 6:0 7:0 + indications 0:0 1:0 2:0 3:512 4:4096 5:0 6:0 7:0 + +The ``requests`` counters track transmitted PFC frames (TX), and the +``indications`` counters track received PFC frames (RX). + +Link-wide PAUSE Autonegotiation Details +======================================= + +The autonegotiation process for link-wide PAUSE is managed by the PHY and +involves advertising capabilities and resolving the outcome. + +* Terminology (link-wide PAUSE): + + - **Symmetric pause**: both directions are paused when requested (TX+RX + enabled). + - **Asymmetric pause**: only one direction is paused (e.g., RX-only or + TX-only). + + In IEEE 802.3 advertisement/resolution, symmetric/asymmetric are encoded + using two bits (Pause/Asym) and resolved per the standard truth tables + below. + +* **Advertisement**: The PHY advertises the MAC's flow control capabilities. + This is done using two bits in the advertisement register: "Symmetric + Pause" (Pause) and "Asymmetric Pause" (Asym). These bits should be + interpreted as a combined value, not as independent flags. The kernel + converts the user's ``rx`` and ``tx`` settings into this two-bit value as + follows: + + .. code-block:: text + + tx rx | Pause Asym + -------+------------- + 0 0 | 0 0 + 0 1 | 1 1 + 1 0 | 0 1 + 1 1 | 1 0 + +* **Resolution**: After negotiation, the PHY reports the link partner's + advertised Pause and Asym bits. The final flow control mode is determined + by the combination of the local and partner advertisements, according to + the IEEE 802.3 standard: + + .. code-block:: text + + Local Device | Link Partner | Result + Pause Asym | Pause Asym | + -------------------+--------------------+--------- + 0 X | 0 X | Disabled + 0 1 | 1 0 | Disabled + 0 1 | 1 1 | TX only + 1 0 | 0 X | Disabled + 1 X | 1 X | TX + RX + 1 1 | 0 1 | RX only + + It is important to note that the advertised bits reflect the *current + configuration* of the MAC, which may not represent its full hardware + capabilities. + +Kernel Policy: "Set and Trust" +============================== + +The ethtool pause API is defined as a **wish policy** for +IEEE 802.3 link-wide PAUSE only. A user request is always accepted +as the preferred configuration, but it may not be possible to apply +it in all link states. + +Key constraints: + +- Link-wide PAUSE is not valid on half-duplex links. +- Link-wide PAUSE cannot be used together with Priority-based Flow Control + (PFC, IEEE 802.1Q Clause 36). +- If autonegotiation is active and the link is currently down, the future + mode is not yet known. + +Because of these constraints, the kernel stores the requested setting +and applies it only when the link is in a compatible state. 
+ +Implications for userspace: + +1. Set once (the "wish"): the requested Rx/Tx PAUSE policy is + remembered even if it cannot be applied immediately. +2. Applied conditionally: when the link comes up, the kernel enables + PAUSE only if the active mode allows it. + +Component Roles in Flow Control +=============================== + +The configuration of flow control involves several components, each with a +distinct role. + +The MAC (Media Access Controller) +--------------------------------- +The MAC is the hardware component that actually sends and receives PAUSE +frames. Its capabilities define the upper limit of what the driver can support. +For link-wide PAUSE, MACs can vary in their support for symmetric (both +directions) or asymmetric (independent TX/RX) flow control. + +For PFC, the MAC must be capable of generating and interpreting the +priority-based PAUSE frames and managing separate pause states for each +traffic class. + +Many MACs also implement automatic PAUSE frame transmission based on the fill +level of their internal RX FIFO. This is typically configured with two +thresholds: + +* **FLOW_ON (High Water Mark)**: When the RX FIFO usage reaches this + threshold, the MAC automatically transmits a PAUSE frame to stop the sender. + +* **FLOW_OFF (Low Water Mark)**: When the RX FIFO usage drops below this + threshold, the MAC transmits a PAUSE frame with a quantum of zero to tell + the sender it can resume transmission. + +The PHY (Physical Layer Transceiver) +------------------------------------ +The PHY's role is distinct for each flow control mechanism: + +* **Link-wide PAUSE**: During the autonegotiation process, the PHY is + responsible for advertising the device's flow control capabilities. See the + "Link-wide PAUSE Autonegotiation Details" section for more information. + +* **Half-Duplex Collision-Based Flow Control**: The PHY is fundamental to the + CSMA/CD process. It performs carrier sensing (checking if the line is idle) + and collision detection, which is the mechanism leveraged to throttle the + sender. + +* **Priority-based Flow Control (PFC)**: The PHY is not directly involved in + negotiating PFC capabilities. Its role is to establish the physical link. + PFC negotiation happens at a higher layer via the Data Center Bridging + Capability Exchange Protocol (DCBX). + +User Space Interface +==================== +The primary user space tools are ``ethtool`` for link-wide PAUSE and ``dcb`` for +PFC. They communicate with the kernel to configure the network device driver +and underlying hardware. + +**Link-wide PAUSE Netlink Interface (``ethtool``)** + +See the ethtool Netlink spec (``Documentation/netlink/specs/ethtool.yaml``) +for the authoritative definition of the Pause control and Pause statistics +attributes. The generated UAPI is in +``include/uapi/linux/ethtool_netlink_generated.h``. + +**PFC Netlink Interface (``dcb``)** + +The authoritative definitions for DCB/PFC netlink attributes and commands are in +``include/uapi/linux/dcbnl.h``. See also the ``dcb(8)`` manual page and the DCB +subsystem documentation for userspace configuration details. 
+ diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index c775cababc8c..52aafdc85f6a 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -55,6 +55,7 @@ Contents: eql fib_trie filter + flow_control generic-hdlc generic_netlink ../netlink/specs/index diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst index b0f2ef83735d..40cc0a988d60 100644 --- a/Documentation/networking/phy.rst +++ b/Documentation/networking/phy.rst @@ -343,16 +343,8 @@ Some of the interface modes are described below: Pause frames / flow control =========================== -The PHY does not participate directly in flow control/pause frames except by -making sure that the SUPPORTED_Pause and SUPPORTED_AsymPause bits are set in -MII_ADVERTISE to indicate towards the link partner that the Ethernet MAC -controller supports such a thing. Since flow control/pause frames generation -involves the Ethernet MAC driver, it is recommended that this driver takes care -of properly indicating advertisement and support for such features by setting -the SUPPORTED_Pause and SUPPORTED_AsymPause bits accordingly. This can be done -either before or after phy_connect() and/or as a result of implementing the -ethtool::set_pauseparam feature. - +For detailed link-wide PAUSE and PFC behavior and configuration, see +flow_control.rst. Keeping Close Tabs on the PAL ============================= diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index c2d8b4ec62eb..eeed1ea50369 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -953,9 +953,48 @@ struct kernel_ethtool_ts_info { * @get_pause_stats: Report pause frame statistics. Drivers must not zero * statistics which they don't report. The stats structure is initialized * to ETHTOOL_STAT_NOT_SET indicating driver does not report statistics. - * @get_pauseparam: Report pause parameters - * @set_pauseparam: Set pause parameters. Returns a negative error code - * or zero. + * + * @get_pauseparam: Report the configured policy for link-wide PAUSE + * (IEEE 802.3 Annex 31B). Drivers must fill struct ethtool_pauseparam + * such that: + * @autoneg: + * This refers to **Pause Autoneg** (IEEE 802.3 Annex 31B) only + * and is independent of generic link autonegotiation configured + * via ethtool -s. + * true -> the device follows the negotiated result of pause + * autonegotiation (Pause/Asym); + * false -> the device uses a forced MAC state independent of + * negotiation. + * @rx_pause/@tx_pause: + * represent the desired policy (preferred configuration). + * In autoneg mode they describe what is to be advertised; + * in forced mode they describe the MAC state to apply. + * + * Drivers (and/or frameworks) should persist this policy across link + * changes and reapply appropriate MAC programming when link parameters + * change. + * + * @set_pauseparam: Apply a policy for link-wide PAUSE (IEEE 802.3 Annex 31B). + * If @autoneg is true: + * Arrange for pause advertisement (Pause/Asym) based on + * @rx_pause/@tx_pause and program the MAC to follow the + * negotiated result (which may be symmetric, asymmetric, or off + * depending on the link partner). + * If @autoneg is false: + * Do not rely on autonegotiation; force the MAC RX/TX pause + * state directly per @rx_pause/@tx_pause. 
+ * + * Implementations that integrate with PHYLIB/PHYLINK should cooperate + * with those frameworks for advertisement and resolution; MAC drivers are + * still responsible for applying the required MAC state. + * + * Return: 0 on success or a negative errno. Return -EOPNOTSUPP if + * link-wide PAUSE is unsupported. If only symmetric pause is supported, + * reject unsupported asymmetric requests with -EINVAL (or document any + * coercion policy). + * + * See also: Documentation/networking/flow_control.rst + * * @self_test: Run specified self-tests * @get_strings: Return a set of strings that describe the requested objects * @set_phys_id: Identify the physical devices, e.g. by flashing an LED diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 0e8ac0d974e2..3dd9d7cde86e 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -375,7 +375,7 @@ enum { ETHTOOL_A_COALESCE_MAX = (__ETHTOOL_A_COALESCE_CNT - 1) }; -enum { +enum ethtool_a_pause_stat { ETHTOOL_A_PAUSE_STAT_UNSPEC, ETHTOOL_A_PAUSE_STAT_PAD, ETHTOOL_A_PAUSE_STAT_TX_FRAMES, @@ -385,7 +385,7 @@ enum { ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) }; -enum { +enum ethtool_a_pause { ETHTOOL_A_PAUSE_UNSPEC, ETHTOOL_A_PAUSE_HEADER, ETHTOOL_A_PAUSE_AUTONEG, diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 03eb1d941fca..91ee22f53774 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -27,6 +27,8 @@ * * Priority-based Flow Control (PFC) - provides a flow control mechanism which * can work independently for each 802.1p priority. + * See Documentation/networking/flow_control.rst for a high level description + * of the user space interface for Priority-based Flow Control (PFC). * * Congestion Notification - provides a mechanism for end-to-end congestion * control for protocols which do not have built-in congestion management. diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index 0f9af1e66548..eacf6a4859bf 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -1,5 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-only +/* See Documentation/networking/flow_control.rst for a high level description of + * the userspace interface. + */ + #include "netlink.h" #include "common.h" -- cgit v1.2.3 From 1a98f5699bd57c9b3f66ec54cc38571d5e42ffb1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 30 Sep 2025 15:45:06 +0200 Subject: Revert "Documentation: net: add flow control guide and document ethtool API" This reverts commit 7bd80ed89d72285515db673803b021469ba71ee8. I should not have merged it to begin with due to pending review and changes to be addressed. 
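For context, the autoneg-versus-forced split described in the (now reverted) get/set_pauseparam kernel-doc above can be pictured with a driver-side sketch. This is a hedged illustration, not in-tree code: foo_priv and foo_mac_set_pause() are invented names, while phy_set_asym_pause() is the existing phylib helper:

#include <linux/ethtool.h>
#include <linux/netdevice.h>
#include <linux/phy.h>

struct foo_priv;						/* invented driver private data */
static void foo_mac_set_pause(struct foo_priv *p, bool rx, bool tx);	/* invented MAC hook */

static int foo_set_pauseparam(struct net_device *dev,
			      struct ethtool_pauseparam *pause)
{
	struct foo_priv *priv = netdev_priv(dev);

	if (pause->autoneg) {
		if (!dev->phydev)
			return -EOPNOTSUPP;
		/* advertise Pause/Asym per rx/tx; the resolved result is
		 * applied from the link-up handler once negotiation ends
		 */
		phy_set_asym_pause(dev->phydev, pause->rx_pause,
				   pause->tx_pause);
	} else {
		/* bypass negotiation: force the MAC pause state now */
		foo_mac_set_pause(priv, pause->rx_pause, pause->tx_pause);
	}
	return 0;
}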
Link: https://patch.msgid.link/c6f3af12df9b7998920a02027fc8893ce82afc4c.1759239721.git.pabeni@redhat.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/ethtool.yaml | 27 -- Documentation/networking/flow_control.rst | 373 ------------------------- Documentation/networking/index.rst | 1 - Documentation/networking/phy.rst | 12 +- include/linux/ethtool.h | 45 +-- include/uapi/linux/ethtool_netlink_generated.h | 4 +- net/dcb/dcbnl.c | 2 - net/ethtool/pause.c | 4 - 8 files changed, 15 insertions(+), 453 deletions(-) delete mode 100644 Documentation/networking/flow_control.rst (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index e4852505294f..6a0fb1974513 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -864,9 +864,7 @@ attribute-sets: - name: pause-stat - doc: Statistics counters for link-wide PAUSE frames (IEEE 802.3 Annex 31B). attr-cnt-name: __ethtool-a-pause-stat-cnt - enum-name: ethtool-a-pause-stat attributes: - name: unspec @@ -877,17 +875,13 @@ attribute-sets: type: pad - name: tx-frames - doc: Number of PAUSE frames transmitted. type: u64 - name: rx-frames - doc: Number of PAUSE frames received. type: u64 - name: pause - doc: Parameters for link-wide PAUSE (IEEE 802.3 Annex 31B). attr-cnt-name: __ethtool-a-pause-cnt - enum-name: ethtool-a-pause attributes: - name: unspec @@ -899,40 +893,19 @@ attribute-sets: nested-attributes: header - name: autoneg - doc: | - Acts as a mode selector for the driver. - On GET: indicates the driver's behavior. If true, the driver will - respect the negotiated outcome; if false, the driver will use a - forced configuration. - On SET: if true, the driver configures the PHY's advertisement based - on the rx and tx attributes. If false, the driver forces the MAC - into the state defined by the rx and tx attributes. type: u8 - name: rx - doc: | - Enable receiving PAUSE frames (pausing local TX). - On GET: reflects the currently preferred configuration state. type: u8 - name: tx - doc: | - Enable transmitting PAUSE frames (pausing peer TX). - On GET: reflects the currently preferred configuration state. type: u8 - name: stats - doc: | - Contains the pause statistics counters. The source of these - statistics is determined by stats-src. type: nest nested-attributes: pause-stat - name: stats-src - doc: | - Selects the source of the MAC statistics, values from - enum ethtool_mac_stats_src. This allows requesting statistics - from the individual components of the MAC Merge layer. type: u32 - name: eee diff --git a/Documentation/networking/flow_control.rst b/Documentation/networking/flow_control.rst deleted file mode 100644 index 48646d54513f..000000000000 --- a/Documentation/networking/flow_control.rst +++ /dev/null @@ -1,373 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -.. _ethernet-flow-control: - -===================== -Ethernet Flow Control -===================== - -This document is a practical guide to Ethernet Flow Control in Linux, covering -what it is, how it works, and how to configure it. - -What is Flow Control? -===================== - -Flow control is a mechanism to prevent a fast sender from overwhelming a -slow receiver with data, which would cause buffer overruns and dropped packets. -The receiver can signal the sender to temporarily stop transmitting, giving it -time to process its backlog. 
- -Standards references -==================== - -Ethernet flow control mechanisms are specified across consolidated IEEE base -standards; some originated as amendments: - -- Collision-based flow control is part of CSMA/CD in **IEEE 802.3** - (half-duplex). -- Link-wide PAUSE is defined in **IEEE 802.3 Annex 31B** - (originally **802.3x**). -- Priority-based Flow Control (PFC) is defined in **IEEE 802.1Q Clause 36** - (originally **802.1Qbb**). - -In the remainder of this document, the consolidated clause numbers are used. - -How It Works: The Mechanisms -============================ - -The method used for flow control depends on the link's duplex mode. - -.. note:: - The user-visible ``ethtool`` pause API described in this document controls - **link-wide PAUSE** (IEEE 802.3 Annex 31B) only. It does not control the - collision-based behavior that exists on half-duplex links. - -1. Half-Duplex: Collision-Based Flow Control --------------------------------------------- -On half-duplex links, a device cannot send and receive simultaneously, so PAUSE -frames are not used. Flow control is achieved by leveraging the CSMA/CD -(Carrier Sense Multiple Access with Collision Detection) protocol itself. - -* **How it works**: To inhibit incoming data, a receiving device can force a - collision on the line. When the sending station detects this collision, it - terminates its transmission, sends a "jam" signal, and then executes the - "Collision backoff and retransmission" procedure as defined in IEEE 802.3, - Section 4.2.3.2.5. This algorithm makes the sender wait for a random - period before attempting to retransmit. By repeatedly forcing collisions, - the receiver can effectively throttle the sender's transmission rate. - -.. note:: - While this mechanism is part of the IEEE standard, there is currently no - generic kernel API to configure or control it. Drivers should not enable - this feature until a standardized interface is available. - -.. warning:: - On shared-medium networks (e.g. 10BASE2, or twisted-pair networks using a - hub rather than a switch) forcing collisions inhibits traffic **across the - entire shared segment**, not just a single point-to-point link. Enabling - such behavior is generally undesirable. - -2. Full-Duplex: Link-wide PAUSE (IEEE 802.3 Annex 31B) ------------------------------------------------------- -On full-duplex links, devices can send and receive at the same time. Flow -control is achieved by sending a special **PAUSE frame**, defined by IEEE -802.3 Annex 31B. This mechanism pauses all traffic on the link and is therefore -called *link-wide PAUSE*. - -* **What it is**: A standard Ethernet frame with a globally reserved - destination MAC address (``01-80-C2-00-00-01``). This address is in a range - that standard IEEE 802.1D-compliant bridges do not forward. However, some - unmanaged or misconfigured bridges have been reported to forward these - frames, which can disrupt flow control across a network. - -* **How it works**: The frame contains a MAC Control opcode for PAUSE - (``0x0001``) and a ``pause_time`` value, telling the sender how long to - wait before sending more data frames. This time is specified in units of - "pause quantum", where one quantum is the time it takes to transmit 512 bits. - For example, one pause quantum is 51.2 microseconds on a 10 Mbit/s link, - and 512 nanoseconds on a 1 Gbit/s link. A ``pause_time`` of zero indicates - that the transmitter can resume transmission, even if a previous non-zero - pause time has not yet elapsed. 
- -* **Who uses it**: Any full-duplex link, from 10 Mbit/s to multi-gigabit speeds. - -3. Full-Duplex: Priority-based Flow Control (PFC) (IEEE 802.1Q Clause 36) -------------------------------------------------------------------------- -Priority-based Flow Control is an enhancement to the standard PAUSE mechanism -that allows flow control to be applied independently to different classes of -traffic, identified by their priority level. - -* **What it is**: PFC allows a receiver to pause traffic for one or more of the - 8 standard priority levels without stopping traffic for other priorities. - This is critical in data center environments for protocols that cannot - tolerate packet loss due to congestion (e.g., Fibre Channel over Ethernet - or RoCE). - -* **How it works**: PFC uses a specific PAUSE frame format. It shares the same - globally reserved destination MAC address (``01-80-C2-00-00-01``) as legacy - PAUSE frames but uses a unique opcode (``0x0101``). The frame payload - contains two key fields: - - - **``priority_enable_vector``**: An 8-bit mask where each bit corresponds to - one of the 8 priorities. If a bit is set to 1, it means the pause time - for that priority is active. - - **``time_vector``**: A list of eight 2-octet fields, one for each priority. - Each field specifies the ``pause_time`` for its corresponding priority, - measured in units of ``pause_quanta`` (the time to transmit 512 bits). - -.. note:: - When PFC is enabled for at least one priority on a port, the standard - **link-wide PAUSE** (IEEE 802.3 Annex 31B) must be disabled for that port. - The two mechanisms are mutually exclusive (IEEE 802.1Q Clause 36). - -Configuring Flow Control -======================== - -Link-wide PAUSE and Priority-based Flow Control are configured with different -tools. - -Configuring Link-wide PAUSE with ``ethtool`` (IEEE 802.3 Annex 31B) -------------------------------------------------------------------- -Use ``ethtool -a `` to view and ``ethtool -A `` to change -the link-wide PAUSE settings. - -.. code-block:: bash - - # View current link-wide PAUSE settings - ethtool -a eth0 - - # Enable RX and TX pause, with autonegotiation - ethtool -A eth0 autoneg on rx on tx on - -**Key Configuration Concepts**: - -* **Pause Autoneg vs Generic Autoneg**: ``ethtool -A ... autoneg {on,off}`` - controls **Pause Autoneg** (Annex 31B) only. It is independent from the - **Generic link autonegotiation** configured with ``ethtool -s``. A device can - have Generic autoneg **on** while Pause Autoneg is **off**, and vice versa. - -* **If Pause Autoneg is off** (``-A ... autoneg off``): the device will **not** - advertise pause in the PHY. The MAC PAUSE state is **forced** according to - ``rx``/``tx`` and does not depend on partner capabilities or resolution. - Ensure the peer is configured complementarily for PAUSE to be effective. - -* **If generic autoneg is off** but **Pause Autoneg is on**, the pause policy - is **remembered** by the kernel and applied later when Generic autoneg is - enabled again. - -* **Autonegotiation Mode**: The PHY will *advertise* the ``rx`` and ``tx`` - capabilities. The final active state is determined by what both sides of the - link agree on. See the "PHY (Physical Layer Transceiver)" section below, - especially the *Resolution* subsection, for details of the negotiation rules. - -* **Forced Mode**: This mode is necessary when autonegotiation is not used or - not possible. 
This includes links where one or both partners have - autonegotiation disabled, or in setups without a PHY (e.g., direct - MAC-to-MAC connections). The driver bypasses PHY advertisement and - directly forces the MAC into the specified ``rx``/``tx`` state. The - configuration on both sides of the link must be complementary. For - example, if one side is set to ``tx on`` ``rx off``, the link partner must be - set to ``tx off`` ``rx on`` for flow control to function correctly. - -Configuring PFC with ``dcb`` (IEEE 802.1Q Clause 36) ----------------------------------------------------- -PFC is part of the Data Center Bridging (DCB) subsystem and is managed with the -``dcb`` tool (iproute2). Some deployments use ``dcbtool`` (lldpad) instead; this -document shows ``dcb(8)`` examples. - -**Viewing PFC Settings**: - -.. code-block:: text - - $ dcb pfc show dev eth0 - pfc-cap 8 macsec-bypass off delay 4096 - prio-pfc 0:off 1:off 2:off 3:off 4:off 5:off 6:on 7:on - -This shows the PFC state (on/off) for each priority (0-7). - -**Changing PFC Settings**: - -.. code-block:: bash - - # Enable PFC on priorities 6 and 7, leaving others as they are - $ dcb pfc set dev eth0 prio-pfc 6:on 7:on - - # Disable PFC for all priorities except 6 and 7 - $ dcb pfc set dev eth0 prio-pfc all:off 6:on 7:on - -Monitoring Flow Control -======================= - -The standard way to check if flow control is actively being used is to view the -pause-related statistics. - -**Monitoring Link-wide PAUSE**: -Use ``ethtool --include-statistics -a ``. - -.. code-block:: text - - $ ethtool --include-statistics -a eth0 - Pause parameters for eth0: - ... - Statistics: - tx_pause_frames: 0 - rx_pause_frames: 0 - -**Monitoring PFC**: -PFC statistics (sent and received frames per priority) are available -through the ``dcb`` tool. - -.. code-block:: text - - $ dcb pfc show dev eth0 requests indications - requests 0:0 1:0 2:0 3:1024 4:2048 5:0 6:0 7:0 - indications 0:0 1:0 2:0 3:512 4:4096 5:0 6:0 7:0 - -The ``requests`` counters track transmitted PFC frames (TX), and the -``indications`` counters track received PFC frames (RX). - -Link-wide PAUSE Autonegotiation Details -======================================= - -The autonegotiation process for link-wide PAUSE is managed by the PHY and -involves advertising capabilities and resolving the outcome. - -* Terminology (link-wide PAUSE): - - - **Symmetric pause**: both directions are paused when requested (TX+RX - enabled). - - **Asymmetric pause**: only one direction is paused (e.g., RX-only or - TX-only). - - In IEEE 802.3 advertisement/resolution, symmetric/asymmetric are encoded - using two bits (Pause/Asym) and resolved per the standard truth tables - below. - -* **Advertisement**: The PHY advertises the MAC's flow control capabilities. - This is done using two bits in the advertisement register: "Symmetric - Pause" (Pause) and "Asymmetric Pause" (Asym). These bits should be - interpreted as a combined value, not as independent flags. The kernel - converts the user's ``rx`` and ``tx`` settings into this two-bit value as - follows: - - .. code-block:: text - - tx rx | Pause Asym - -------+------------- - 0 0 | 0 0 - 0 1 | 1 1 - 1 0 | 0 1 - 1 1 | 1 0 - -* **Resolution**: After negotiation, the PHY reports the link partner's - advertised Pause and Asym bits. The final flow control mode is determined - by the combination of the local and partner advertisements, according to - the IEEE 802.3 standard: - - .. 
code-block:: text - - Local Device | Link Partner | Result - Pause Asym | Pause Asym | - -------------------+--------------------+--------- - 0 X | 0 X | Disabled - 0 1 | 1 0 | Disabled - 0 1 | 1 1 | TX only - 1 0 | 0 X | Disabled - 1 X | 1 X | TX + RX - 1 1 | 0 1 | RX only - - It is important to note that the advertised bits reflect the *current - configuration* of the MAC, which may not represent its full hardware - capabilities. - -Kernel Policy: "Set and Trust" -============================== - -The ethtool pause API is defined as a **wish policy** for -IEEE 802.3 link-wide PAUSE only. A user request is always accepted -as the preferred configuration, but it may not be possible to apply -it in all link states. - -Key constraints: - -- Link-wide PAUSE is not valid on half-duplex links. -- Link-wide PAUSE cannot be used together with Priority-based Flow Control - (PFC, IEEE 802.1Q Clause 36). -- If autonegotiation is active and the link is currently down, the future - mode is not yet known. - -Because of these constraints, the kernel stores the requested setting -and applies it only when the link is in a compatible state. - -Implications for userspace: - -1. Set once (the "wish"): the requested Rx/Tx PAUSE policy is - remembered even if it cannot be applied immediately. -2. Applied conditionally: when the link comes up, the kernel enables - PAUSE only if the active mode allows it. - -Component Roles in Flow Control -=============================== - -The configuration of flow control involves several components, each with a -distinct role. - -The MAC (Media Access Controller) ---------------------------------- -The MAC is the hardware component that actually sends and receives PAUSE -frames. Its capabilities define the upper limit of what the driver can support. -For link-wide PAUSE, MACs can vary in their support for symmetric (both -directions) or asymmetric (independent TX/RX) flow control. - -For PFC, the MAC must be capable of generating and interpreting the -priority-based PAUSE frames and managing separate pause states for each -traffic class. - -Many MACs also implement automatic PAUSE frame transmission based on the fill -level of their internal RX FIFO. This is typically configured with two -thresholds: - -* **FLOW_ON (High Water Mark)**: When the RX FIFO usage reaches this - threshold, the MAC automatically transmits a PAUSE frame to stop the sender. - -* **FLOW_OFF (Low Water Mark)**: When the RX FIFO usage drops below this - threshold, the MAC transmits a PAUSE frame with a quantum of zero to tell - the sender it can resume transmission. - -The PHY (Physical Layer Transceiver) ------------------------------------- -The PHY's role is distinct for each flow control mechanism: - -* **Link-wide PAUSE**: During the autonegotiation process, the PHY is - responsible for advertising the device's flow control capabilities. See the - "Link-wide PAUSE Autonegotiation Details" section for more information. - -* **Half-Duplex Collision-Based Flow Control**: The PHY is fundamental to the - CSMA/CD process. It performs carrier sensing (checking if the line is idle) - and collision detection, which is the mechanism leveraged to throttle the - sender. - -* **Priority-based Flow Control (PFC)**: The PHY is not directly involved in - negotiating PFC capabilities. Its role is to establish the physical link. - PFC negotiation happens at a higher layer via the Data Center Bridging - Capability Exchange Protocol (DCBX). 
- -User Space Interface -==================== -The primary user space tools are ``ethtool`` for link-wide PAUSE and ``dcb`` for -PFC. They communicate with the kernel to configure the network device driver -and underlying hardware. - -**Link-wide PAUSE Netlink Interface (``ethtool``)** - -See the ethtool Netlink spec (``Documentation/netlink/specs/ethtool.yaml``) -for the authoritative definition of the Pause control and Pause statistics -attributes. The generated UAPI is in -``include/uapi/linux/ethtool_netlink_generated.h``. - -**PFC Netlink Interface (``dcb``)** - -The authoritative definitions for DCB/PFC netlink attributes and commands are in -``include/uapi/linux/dcbnl.h``. See also the ``dcb(8)`` manual page and the DCB -subsystem documentation for userspace configuration details. - diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 52aafdc85f6a..c775cababc8c 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -55,7 +55,6 @@ Contents: eql fib_trie filter - flow_control generic-hdlc generic_netlink ../netlink/specs/index diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst index 40cc0a988d60..b0f2ef83735d 100644 --- a/Documentation/networking/phy.rst +++ b/Documentation/networking/phy.rst @@ -343,8 +343,16 @@ Some of the interface modes are described below: Pause frames / flow control =========================== -For detailed link-wide PAUSE and PFC behavior and configuration, see -flow_control.rst. +The PHY does not participate directly in flow control/pause frames except by +making sure that the SUPPORTED_Pause and SUPPORTED_AsymPause bits are set in +MII_ADVERTISE to indicate towards the link partner that the Ethernet MAC +controller supports such a thing. Since flow control/pause frames generation +involves the Ethernet MAC driver, it is recommended that this driver takes care +of properly indicating advertisement and support for such features by setting +the SUPPORTED_Pause and SUPPORTED_AsymPause bits accordingly. This can be done +either before or after phy_connect() and/or as a result of implementing the +ethtool::set_pauseparam feature. + Keeping Close Tabs on the PAL ============================= diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index eeed1ea50369..c2d8b4ec62eb 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -953,48 +953,9 @@ struct kernel_ethtool_ts_info { * @get_pause_stats: Report pause frame statistics. Drivers must not zero * statistics which they don't report. The stats structure is initialized * to ETHTOOL_STAT_NOT_SET indicating driver does not report statistics. - * - * @get_pauseparam: Report the configured policy for link-wide PAUSE - * (IEEE 802.3 Annex 31B). Drivers must fill struct ethtool_pauseparam - * such that: - * @autoneg: - * This refers to **Pause Autoneg** (IEEE 802.3 Annex 31B) only - * and is independent of generic link autonegotiation configured - * via ethtool -s. - * true -> the device follows the negotiated result of pause - * autonegotiation (Pause/Asym); - * false -> the device uses a forced MAC state independent of - * negotiation. - * @rx_pause/@tx_pause: - * represent the desired policy (preferred configuration). - * In autoneg mode they describe what is to be advertised; - * in forced mode they describe the MAC state to apply. 
- * - * Drivers (and/or frameworks) should persist this policy across link - * changes and reapply appropriate MAC programming when link parameters - * change. - * - * @set_pauseparam: Apply a policy for link-wide PAUSE (IEEE 802.3 Annex 31B). - * If @autoneg is true: - * Arrange for pause advertisement (Pause/Asym) based on - * @rx_pause/@tx_pause and program the MAC to follow the - * negotiated result (which may be symmetric, asymmetric, or off - * depending on the link partner). - * If @autoneg is false: - * Do not rely on autonegotiation; force the MAC RX/TX pause - * state directly per @rx_pause/@tx_pause. - * - * Implementations that integrate with PHYLIB/PHYLINK should cooperate - * with those frameworks for advertisement and resolution; MAC drivers are - * still responsible for applying the required MAC state. - * - * Return: 0 on success or a negative errno. Return -EOPNOTSUPP if - * link-wide PAUSE is unsupported. If only symmetric pause is supported, - * reject unsupported asymmetric requests with -EINVAL (or document any - * coercion policy). - * - * See also: Documentation/networking/flow_control.rst - * + * @get_pauseparam: Report pause parameters + * @set_pauseparam: Set pause parameters. Returns a negative error code + * or zero. * @self_test: Run specified self-tests * @get_strings: Return a set of strings that describe the requested objects * @set_phys_id: Identify the physical devices, e.g. by flashing an LED diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 3dd9d7cde86e..0e8ac0d974e2 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -375,7 +375,7 @@ enum { ETHTOOL_A_COALESCE_MAX = (__ETHTOOL_A_COALESCE_CNT - 1) }; -enum ethtool_a_pause_stat { +enum { ETHTOOL_A_PAUSE_STAT_UNSPEC, ETHTOOL_A_PAUSE_STAT_PAD, ETHTOOL_A_PAUSE_STAT_TX_FRAMES, @@ -385,7 +385,7 @@ enum ethtool_a_pause_stat { ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) }; -enum ethtool_a_pause { +enum { ETHTOOL_A_PAUSE_UNSPEC, ETHTOOL_A_PAUSE_HEADER, ETHTOOL_A_PAUSE_AUTONEG, diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 91ee22f53774..03eb1d941fca 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -27,8 +27,6 @@ * * Priority-based Flow Control (PFC) - provides a flow control mechanism which * can work independently for each 802.1p priority. - * See Documentation/networking/flow_control.rst for a high level description - * of the user space interface for Priority-based Flow Control (PFC). * * Congestion Notification - provides a mechanism for end-to-end congestion * control for protocols which do not have built-in congestion management. diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index eacf6a4859bf..0f9af1e66548 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -1,9 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* See Documentation/networking/flow_control.rst for a high level description of - * the userspace interface. - */ - #include "netlink.h" #include "common.h" -- cgit v1.2.3 From de7342228b7343774d6a9981c2ddbfb5e201044b Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Sat, 4 Oct 2025 22:23:29 +0800 Subject: bpf: Finish constification of 1st parameter of bpf_d_path() The commit 1b8abbb12128 ("bpf...d_path(): constify path argument") constified the first parameter of the bpf_d_path(), but failed to update it in all places. Finish constification. 
Otherwise the selftests fail to build: .../selftests/bpf/bpf_experimental.h:222:12: error: conflicting types for 'bpf_path_d_path' 222 | extern int bpf_path_d_path(const struct path *path, char *buf, size_t buf__sz) __ksym; | ^ .../selftests/bpf/tools/include/vmlinux.h:153922:12: note: previous declaration is here 153922 | extern int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) __weak __ksym; Fixes: 1b8abbb12128 ("bpf...d_path(): constify path argument") Signed-off-by: Rong Tao Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 +- scripts/bpf_doc.py | 1 + tools/include/uapi/linux/bpf.h | 2 +- tools/testing/selftests/bpf/progs/verifier_vfs_accept.c | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ae83d8649ef1..6829936d33f5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4891,7 +4891,7 @@ union bpf_attr { * * **-ENOENT** if the bpf_local_storage cannot be found. * - * long bpf_d_path(struct path *path, char *buf, u32 sz) + * long bpf_d_path(const struct path *path, char *buf, u32 sz) * Description * Return full path for given **struct path** object, which * needs to be the kernel BTF *path* object. The path is diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index c77dc40f7689..15d113a1bc1d 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -788,6 +788,7 @@ class PrinterHelpersHeader(Printer): 'struct task_struct', 'struct cgroup', 'struct path', + 'const struct path', 'struct btf_ptr', 'struct inode', 'struct socket', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ae83d8649ef1..6829936d33f5 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4891,7 +4891,7 @@ union bpf_attr { * * **-ENOENT** if the bpf_local_storage cannot be found. * - * long bpf_d_path(struct path *path, char *buf, u32 sz) + * long bpf_d_path(const struct path *path, char *buf, u32 sz) * Description * Return full path for given **struct path** object, which * needs to be the kernel BTF *path* object. The path is diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c index 3e2d76ee8050..55398c04290a 100644 --- a/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c +++ b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c @@ -70,7 +70,7 @@ __success int BPF_PROG(path_d_path_from_file_argument, struct file *file) { int ret; - struct path *path; + const struct path *path; /* The f_path member is a path which is embedded directly within a * file. Therefore, a pointer to such embedded members are still -- cgit v1.2.3 From beb97995b97532e1f215e3295e6843e59862f94b Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Tue, 7 Oct 2025 14:18:18 +0800 Subject: io_uring: use tab indentation for IORING_SEND_VECTORIZED comment Be consistent with the tab style of "liburing/src/include/liburing/io_uring.h". Signed-off-by: Haiyue Wang Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index a0cc1cc0dd01..263bed13473e 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -404,7 +404,7 @@ enum io_uring_op { * will be contiguous from the starting buffer ID.
* * IORING_SEND_VECTORIZED If set, SEND[_ZC] will take a pointer to a io_vec - * to allow vectorized send operations. + * to allow vectorized send operations. */ #define IORING_RECVSEND_POLL_FIRST (1U << 0) #define IORING_RECV_MULTISHOT (1U << 1) -- cgit v1.2.3 From 1f086d2508ebe494d13fd587d1f5e2b908379efc Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Fri, 3 Oct 2025 16:05:46 -0400 Subject: drm/amdkfd: Fix two comments in kfd_ioctl.h Queue read and write pointers are "to KFD", not "from KFD". Suggested-by: Robert Liu Signed-off-by: Felix Kuehling Reviewed-by: Alex Deucher Reviewed-by: Robert Liu Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 04c7d283dc7d..5d1727a6d040 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -67,8 +67,8 @@ struct kfd_ioctl_get_version_args { struct kfd_ioctl_create_queue_args { __u64 ring_base_address; /* to KFD */ - __u64 write_pointer_address; /* from KFD */ - __u64 read_pointer_address; /* from KFD */ + __u64 write_pointer_address; /* to KFD */ + __u64 read_pointer_address; /* to KFD */ __u64 doorbell_offset; /* from KFD */ __u32 ring_size; /* to KFD */ -- cgit v1.2.3 From d2042d8f96ddefdeee823737f813efe3ab4b4e8d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Oct 2025 16:25:54 -0700 Subject: KVM: Rework KVM_CAP_GUEST_MEMFD_MMAP into KVM_CAP_GUEST_MEMFD_FLAGS Rework the not-yet-released KVM_CAP_GUEST_MEMFD_MMAP into a more generic KVM_CAP_GUEST_MEMFD_FLAGS capability so that adding new flags doesn't require a new capability, and so that developers aren't tempted to bundle multiple flags into a single capability. Note, kvm_vm_ioctl_check_extension_generic() can only return a 32-bit value, but that limitation can be easily circumvented by adding e.g. KVM_CAP_GUEST_MEMFD_FLAGS2 in the unlikely event guest_memfd supports more than 32 flags. Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20251003232606.4070510-2-seanjc@google.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 10 +++++++--- include/uapi/linux/kvm.h | 2 +- tools/testing/selftests/kvm/guest_memfd_test.c | 13 ++++++------- virt/kvm/kvm_main.c | 7 +++++-- 4 files changed, 19 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 6ae24c5ca559..7ba92f2ced38 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6432,9 +6432,13 @@ most one mapping per page, i.e. binding multiple memory regions to a single guest_memfd range is not allowed (any number of memory regions can be bound to a single guest_memfd file, but the bound ranges must not overlap). -When the capability KVM_CAP_GUEST_MEMFD_MMAP is supported, the 'flags' field -supports GUEST_MEMFD_FLAG_MMAP. Setting this flag on guest_memfd creation -enables mmap() and faulting of guest_memfd memory to host userspace. +The capability KVM_CAP_GUEST_MEMFD_FLAGS enumerates the `flags` that can be +specified via KVM_CREATE_GUEST_MEMFD. Currently defined flags: + + ============================ ================================================ + GUEST_MEMFD_FLAG_MMAP Enable using mmap() on the guest_memfd file + descriptor. 
+ ============================ ================================================ When the KVM MMU performs a PFN lookup to service a guest fault and the backing guest_memfd has the GUEST_MEMFD_FLAG_MMAP set, then the fault will always be diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 6efa98a57ec1..b1d52d0c56ec 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -962,7 +962,7 @@ struct kvm_enable_cap { #define KVM_CAP_ARM_EL2_E2H0 241 #define KVM_CAP_RISCV_MP_STATE_RESET 242 #define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243 -#define KVM_CAP_GUEST_MEMFD_MMAP 244 +#define KVM_CAP_GUEST_MEMFD_FLAGS 244 struct kvm_irq_routing_irqchip { __u32 irqchip; diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index b3ca6737f304..3e58bd496104 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -262,19 +262,17 @@ static void test_guest_memfd_flags(struct kvm_vm *vm, uint64_t valid_flags) static void test_guest_memfd(unsigned long vm_type) { - uint64_t flags = 0; struct kvm_vm *vm; size_t total_size; size_t page_size; + uint64_t flags; int fd; page_size = getpagesize(); total_size = page_size * 4; vm = vm_create_barebones_type(vm_type); - - if (vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_MMAP)) - flags |= GUEST_MEMFD_FLAG_MMAP; + flags = vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS); test_create_guest_memfd_multiple(vm); test_create_guest_memfd_invalid_sizes(vm, flags, page_size); @@ -328,13 +326,14 @@ static void test_guest_memfd_guest(void) size_t size; int fd, i; - if (!kvm_has_cap(KVM_CAP_GUEST_MEMFD_MMAP)) + if (!kvm_check_cap(KVM_CAP_GUEST_MEMFD_FLAGS)) return; vm = __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, &vcpu, 1, guest_code); - TEST_ASSERT(vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_MMAP), - "Default VM type should always support guest_memfd mmap()"); + TEST_ASSERT(vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS) & GUEST_MEMFD_FLAG_MMAP, + "Default VM type should support MMAP, supported flags = 0x%x", + vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS)); size = vm->page_size; fd = vm_create_guest_memfd(vm, size, GUEST_MEMFD_FLAG_MMAP); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 226faeaa8e56..e3a268757621 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4928,8 +4928,11 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #ifdef CONFIG_KVM_GUEST_MEMFD case KVM_CAP_GUEST_MEMFD: return 1; - case KVM_CAP_GUEST_MEMFD_MMAP: - return !kvm || kvm_arch_supports_gmem_mmap(kvm); + case KVM_CAP_GUEST_MEMFD_FLAGS: + if (!kvm || kvm_arch_supports_gmem_mmap(kvm)) + return GUEST_MEMFD_FLAG_MMAP; + + return 0; #endif default: break; -- cgit v1.2.3 From fe2bf6234e947bf5544db6d386af1df2a8db80f3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Oct 2025 16:25:55 -0700 Subject: KVM: guest_memfd: Add INIT_SHARED flag, reject user page faults if not set Add a guest_memfd flag to allow userspace to state that the underlying memory should be configured to be initialized as shared, and reject user page faults if the guest_memfd instance's memory isn't shared. Because KVM doesn't yet support in-place private<=>shared conversions, all guest_memfd memory effectively follows the initial state. Alternatively, KVM could deduce the initial state based on MMAP, which for all intents and purposes is what KVM currently does. 
However, implicitly deriving the default state based on MMAP will result in a messy ABI when support for in-place conversions is added. For x86 CoCo VMs, which don't yet support MMAP, memory is currently private by default (otherwise the memory would be unusable). If MMAP implies memory is shared by default, then the default state for CoCo VMs will vary based on MMAP, and from userspace's perspective, will change when in-place conversion support is added. I.e. to maintain guest<=>host ABI, userspace would need to immediately convert all memory from shared=>private, which is both ugly and inefficient. The inefficiency could be avoided by adding a flag to state that memory is _private_ by default, irrespective of MMAP, but that would lead to an equally messy and hard to document ABI. Bite the bullet and immediately add a flag to control the default state so that the effective behavior is explicit and straightforward. Fixes: 3d3a04fad25a ("KVM: Allow and advertise support for host mmap() on guest_memfd files") Cc: David Hildenbrand Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20251003232606.4070510-3-seanjc@google.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 5 +++++ include/uapi/linux/kvm.h | 3 ++- tools/testing/selftests/kvm/guest_memfd_test.c | 15 ++++++++++++--- virt/kvm/guest_memfd.c | 6 +++++- virt/kvm/kvm_main.c | 3 ++- 5 files changed, 26 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 7ba92f2ced38..754b662a453c 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6438,6 +6438,11 @@ specified via KVM_CREATE_GUEST_MEMFD. Currently defined flags: ============================ ================================================ GUEST_MEMFD_FLAG_MMAP Enable using mmap() on the guest_memfd file descriptor. + GUEST_MEMFD_FLAG_INIT_SHARED Make all memory in the file shared during + KVM_CREATE_GUEST_MEMFD (memory files created + without INIT_SHARED will be marked private). + Shared memory can be faulted into host userspace + page tables. Private memory cannot. 
============================ ================================================ When the KVM MMU performs a PFN lookup to service a guest fault and the backing diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index b1d52d0c56ec..52f6000ab020 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1599,7 +1599,8 @@ struct kvm_memory_attributes { #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3) #define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd) -#define GUEST_MEMFD_FLAG_MMAP (1ULL << 0) +#define GUEST_MEMFD_FLAG_MMAP (1ULL << 0) +#define GUEST_MEMFD_FLAG_INIT_SHARED (1ULL << 1) struct kvm_create_guest_memfd { __u64 size; diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index 3e58bd496104..0de56ce3c4e2 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -239,8 +239,9 @@ static void test_create_guest_memfd_multiple(struct kvm_vm *vm) close(fd1); } -static void test_guest_memfd_flags(struct kvm_vm *vm, uint64_t valid_flags) +static void test_guest_memfd_flags(struct kvm_vm *vm) { + uint64_t valid_flags = vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS); size_t page_size = getpagesize(); uint64_t flag; int fd; @@ -274,6 +275,10 @@ static void test_guest_memfd(unsigned long vm_type) vm = vm_create_barebones_type(vm_type); flags = vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS); + /* This test doesn't yet support testing mmap() on private memory. */ + if (!(flags & GUEST_MEMFD_FLAG_INIT_SHARED)) + flags &= ~GUEST_MEMFD_FLAG_MMAP; + test_create_guest_memfd_multiple(vm); test_create_guest_memfd_invalid_sizes(vm, flags, page_size); @@ -292,7 +297,7 @@ static void test_guest_memfd(unsigned long vm_type) test_fallocate(fd, page_size, total_size); test_invalid_punch_hole(fd, page_size, total_size); - test_guest_memfd_flags(vm, flags); + test_guest_memfd_flags(vm); close(fd); kvm_vm_free(vm); @@ -334,9 +339,13 @@ static void test_guest_memfd_guest(void) TEST_ASSERT(vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS) & GUEST_MEMFD_FLAG_MMAP, "Default VM type should support MMAP, supported flags = 0x%x", vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS)); + TEST_ASSERT(vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS) & GUEST_MEMFD_FLAG_INIT_SHARED, + "Default VM type should support INIT_SHARED, supported flags = 0x%x", + vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS)); size = vm->page_size; - fd = vm_create_guest_memfd(vm, size, GUEST_MEMFD_FLAG_MMAP); + fd = vm_create_guest_memfd(vm, size, GUEST_MEMFD_FLAG_MMAP | + GUEST_MEMFD_FLAG_INIT_SHARED); vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, size, NULL, fd, 0); mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 94bafd6c558c..cf3afba23a6b 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -328,6 +328,9 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) return VM_FAULT_SIGBUS; + if (!((u64)inode->i_private & GUEST_MEMFD_FLAG_INIT_SHARED)) + return VM_FAULT_SIGBUS; + folio = kvm_gmem_get_folio(inode, vmf->pgoff); if (IS_ERR(folio)) { int err = PTR_ERR(folio); @@ -525,7 +528,8 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args) u64 valid_flags = 0; if (kvm_arch_supports_gmem_mmap(kvm)) - valid_flags |= GUEST_MEMFD_FLAG_MMAP; + valid_flags |= GUEST_MEMFD_FLAG_MMAP | + 
GUEST_MEMFD_FLAG_INIT_SHARED; if (flags & ~valid_flags) return -EINVAL; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e3a268757621..5f644ca54af3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4930,7 +4930,8 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) return 1; case KVM_CAP_GUEST_MEMFD_FLAGS: if (!kvm || kvm_arch_supports_gmem_mmap(kvm)) - return GUEST_MEMFD_FLAG_MMAP; + return GUEST_MEMFD_FLAG_MMAP | + GUEST_MEMFD_FLAG_INIT_SHARED; return 0; #endif -- cgit v1.2.3
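
Editor's note: the last two patches together define the new userspace contract for guest_memfd. KVM_CHECK_EXTENSION(KVM_CAP_GUEST_MEMFD_FLAGS) returns the mask of supported GUEST_MEMFD_FLAG_* values, and host-side faults on a guest_memfd mapping succeed only if the file was created with GUEST_MEMFD_FLAG_INIT_SHARED. Below is a minimal, hypothetical C sketch of how a VMM might probe and use these flags; it assumes UAPI headers that already contain both patches, picks an arbitrary 4 MiB size, and abbreviates error handling.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);

	/* Returns the supported GUEST_MEMFD_FLAG_* mask; 0 means guest_memfd
	 * flags (and thus host mmap()) are unsupported for this VM type. */
	int supported = ioctl(vm, KVM_CHECK_EXTENSION, KVM_CAP_GUEST_MEMFD_FLAGS);

	struct kvm_create_guest_memfd gmem = {
		.size  = 4ULL << 20,
		.flags = GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED,
	};

	if (kvm < 0 || vm < 0 || (supported & gmem.flags) != gmem.flags) {
		fprintf(stderr, "guest_memfd MMAP/INIT_SHARED unsupported\n");
		return 1;
	}

	int fd = ioctl(vm, KVM_CREATE_GUEST_MEMFD, &gmem);

	/* Host-side faulting works only because INIT_SHARED was set at
	 * creation time; per the second patch, a fault on a file created
	 * without it gets VM_FAULT_SIGBUS. */
	char *mem = mmap(NULL, gmem.size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (fd < 0 || mem == MAP_FAILED)
		return 1;
	memset(mem, 0, gmem.size);

	munmap(mem, gmem.size);
	close(fd);
	close(vm);
	close(kvm);
	return 0;
}

This mirrors what the updated guest_memfd selftest does: query the flag mask via vm_check_cap(KVM_CAP_GUEST_MEMFD_FLAGS), then create and mmap() a file with both MMAP and INIT_SHARED set.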