From eefca7ec514262aef08d0ef261552f2f604bd851 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 May 2023 11:49:50 -0400 Subject: net/handshake: Enable the SNI extension to work properly Enable the upper layer protocol to specify the SNI peername. This avoids the need for tlshd to use a DNS lookup, which can return a hostname that doesn't match the incoming certificate's SubjectName. Fixes: 2fd5532044a8 ("net/handshake: Add a kernel API for requesting a TLSv1.3 handshake") Reviewed-by: Simon Horman Signed-off-by: Chuck Lever Signed-off-by: David S. Miller --- include/uapi/linux/handshake.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/handshake.h b/include/uapi/linux/handshake.h index 1de4d0b95325..3d7ea58778c9 100644 --- a/include/uapi/linux/handshake.h +++ b/include/uapi/linux/handshake.h @@ -44,6 +44,7 @@ enum { HANDSHAKE_A_ACCEPT_AUTH_MODE, HANDSHAKE_A_ACCEPT_PEER_IDENTITY, HANDSHAKE_A_ACCEPT_CERTIFICATE, + HANDSHAKE_A_ACCEPT_PEERNAME, __HANDSHAKE_A_ACCEPT_MAX, HANDSHAKE_A_ACCEPT_MAX = (__HANDSHAKE_A_ACCEPT_MAX - 1) -- cgit v1.2.3 From 69474a8a5837be63f13c6f60a7d622b98ed5c539 Mon Sep 17 00:00:00 2001 From: Vladimir Nikishkin Date: Fri, 12 May 2023 11:40:33 +0800 Subject: net: vxlan: Add nolocalbypass option to vxlan. If a packet needs to be encapsulated towards a local destination IP, the packet will undergo a "local bypass" and be injected into the Rx path as if it was received by the target VXLAN device without undergoing encapsulation. If such a device does not exist, the packet will be dropped. There are scenarios where we do not want to perform such a bypass, but instead want the packet to be encapsulated and locally received by a user space program for post-processing. To that end, add a new VXLAN device attribute that controls whether a "local bypass" is performed or not. Default to performing a bypass to maintain existing behavior. Signed-off-by: Vladimir Nikishkin Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/vxlan/vxlan_core.c | 21 +++++++++++++++++++-- include/net/vxlan.h | 4 +++- include/uapi/linux/if_link.h | 1 + 3 files changed, 23 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 561fe1b314f5..78744549c1b3 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2352,7 +2352,8 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, #endif /* Bypass encapsulation if the destination is local */ if (rt_flags & RTCF_LOCAL && - !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { + !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && + vxlan->cfg.flags & VXLAN_F_LOCALBYPASS) { struct vxlan_dev *dst_vxlan; dst_release(dst); @@ -3172,6 +3173,7 @@ static void vxlan_raw_setup(struct net_device *dev) } static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { + [IFLA_VXLAN_UNSPEC] = { .strict_start_type = IFLA_VXLAN_LOCALBYPASS }, [IFLA_VXLAN_ID] = { .type = NLA_U32 }, [IFLA_VXLAN_GROUP] = { .len = sizeof_field(struct iphdr, daddr) }, [IFLA_VXLAN_GROUP6] = { .len = sizeof(struct in6_addr) }, @@ -3202,6 +3204,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_TTL_INHERIT] = { .type = NLA_FLAG }, [IFLA_VXLAN_DF] = { .type = NLA_U8 }, [IFLA_VXLAN_VNIFILTER] = { .type = NLA_U8 }, + [IFLA_VXLAN_LOCALBYPASS] = NLA_POLICY_MAX(NLA_U8, 1), }; static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], @@ -4011,6 +4014,17 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX; } + if (data[IFLA_VXLAN_LOCALBYPASS]) { + err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LOCALBYPASS, + VXLAN_F_LOCALBYPASS, changelink, + true, extack); + if (err) + return err; + } else if (!changelink) { + /* default to local bypass on a new device */ + conf->flags |= VXLAN_F_LOCALBYPASS; + } + if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, VXLAN_F_UDP_ZERO_CSUM6_TX, changelink, @@ -4232,6 +4246,7 @@ static size_t vxlan_get_size(const struct net_device *dev) nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LOCALBYPASS */ 0; } @@ -4308,7 +4323,9 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX, !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) || nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX, - !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX))) + !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)) || + nla_put_u8(skb, IFLA_VXLAN_LOCALBYPASS, + !!(vxlan->cfg.flags & VXLAN_F_LOCALBYPASS))) goto nla_put_failure; if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports)) diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 20bd7d893e10..0be91ca78d3a 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -328,6 +328,7 @@ struct vxlan_dev { #define VXLAN_F_TTL_INHERIT 0x10000 #define VXLAN_F_VNIFILTER 0x20000 #define VXLAN_F_MDB 0x40000 +#define VXLAN_F_LOCALBYPASS 0x80000 /* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable @@ -348,7 +349,8 @@ struct vxlan_dev { VXLAN_F_UDP_ZERO_CSUM6_TX | \ VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_COLLECT_METADATA | \ - VXLAN_F_VNIFILTER) + VXLAN_F_VNIFILTER | \ + VXLAN_F_LOCALBYPASS) struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 4ac1000b0ef2..0f6a0fe09bdb 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -828,6 +828,7 @@ enum { IFLA_VXLAN_TTL_INHERIT, IFLA_VXLAN_DF, IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ + IFLA_VXLAN_LOCALBYPASS, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) -- cgit v1.2.3 From 1825788e2a96764ae9ad4641191b2aebec6d2b84 Mon Sep 17 00:00:00 2001 From: Athanasios Oikonomou Date: Wed, 11 Jan 2023 19:46:08 +0000 Subject: media: dvb: add missing DVB-S2X FEC parameter values This commit is adding the missing short FEC Missed on commit 6508a50fe84f9858e8b59b53dce3847aaeeab744 More info: https://dvb.org/wp-content/uploads/2021/02/A083-2r2_DVB-S2X_Draft-EN-302-307-2-v131_Feb_2021.pdf Table 1: S2X System configurations and application areas Please note that 128APSK, 256APSK and 256APSK-L and FEC 29/45, 31/45 are still missing from enums. Link: https://lore.kernel.org/linux-media/20230111194608.1853-1-athoik@gmail.com Cc: Robert Schlabbach Cc: Tom Richardson Signed-off-by: Athanasios Oikonomou Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/dvb/frontend.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/dvb/frontend.h b/include/uapi/linux/dvb/frontend.h index 326f6a53f1f2..7e0983b987c2 100644 --- a/include/uapi/linux/dvb/frontend.h +++ b/include/uapi/linux/dvb/frontend.h @@ -296,6 +296,10 @@ enum fe_spectral_inversion { * @FEC_28_45: Forward Error Correction Code 28/45 * @FEC_32_45: Forward Error Correction Code 32/45 * @FEC_77_90: Forward Error Correction Code 77/90 + * @FEC_11_45: Forward Error Correction Code 11/45 + * @FEC_4_15: Forward Error Correction Code 4/15 + * @FEC_14_45: Forward Error Correction Code 14/45 + * @FEC_7_15: Forward Error Correction Code 7/15 * * Please note that not all FEC types are supported by a given standard. */ @@ -329,6 +333,10 @@ enum fe_code_rate { FEC_28_45, FEC_32_45, FEC_77_90, + FEC_11_45, + FEC_4_15, + FEC_14_45, + FEC_7_15, }; /** -- cgit v1.2.3 From 2a3f9d4def27c5b30563d11c548ac606ab0066ac Mon Sep 17 00:00:00 2001 From: Athanasios Oikonomou Date: Tue, 10 Jan 2023 07:14:21 +0000 Subject: media: dvb: bump DVB API version Bump the DVB API version in order userspace to be aware of the changes recently implemented in enumerations for DVB-S2(X) and DVB-C2. Related: commit 6508a50fe84f ("media: dvb: add DVB-C2 and DVB-S2X parameter values") Link: https://lore.kernel.org/linux-media/20230110071421.31498-1-athoik@gmail.com Cc: Robert Schlabbach Signed-off-by: Athanasios Oikonomou Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/dvb/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/dvb/version.h b/include/uapi/linux/dvb/version.h index 1a8cd038aa0b..20bc874de321 100644 --- a/include/uapi/linux/dvb/version.h +++ b/include/uapi/linux/dvb/version.h @@ -10,6 +10,6 @@ #define _DVBVERSION_H_ #define DVB_API_VERSION 5 -#define DVB_API_VERSION_MINOR 11 +#define DVB_API_VERSION_MINOR 12 #endif /*_DVBVERSION_H_*/ -- cgit v1.2.3 From 03d89a2de25bbc5c77e61a0cf77663978c4b6ea7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 5 Nov 2021 17:20:54 -0600 Subject: io_uring: support for user allocated memory for rings/sqes Currently io_uring applications must call mmap(2) twice to map the rings themselves, and the sqes array. This works fine, but it does not support using huge pages to back the rings/sqes. Provide a way for the application to pass in pre-allocated memory for the rings/sqes, which can then suitably be allocated from shmfs or via mmap to get huge page support. Particularly for larger rings, this reduces the TLBs needed. If an application wishes to take advantage of that, it must pre-allocate the memory needed for the sq/cq ring, and the sqes. The former must be passed in via the io_uring_params->cq_off.user_data field, while the latter is passed in via the io_uring_params->sq_off.user_data field. Then it must set IORING_SETUP_NO_MMAP in the io_uring_params->flags field, and io_uring will then map the existing memory into the kernel for shared use. The application must not call mmap(2) to map rings as it otherwise would have, that will now fail with -EINVAL if this setup flag was used. The pages used for the rings and sqes must be contigious. The intent here is clearly that huge pages should be used, otherwise the normal setup procedure works fine as-is. The application may use one huge page for both the rings and sqes. Outside of those initialization changes, everything works like it did before. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 10 ++++ include/uapi/linux/io_uring.h | 9 +++- io_uring/io_uring.c | 106 +++++++++++++++++++++++++++++++++++++---- 3 files changed, 114 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 1b2a20a42413..f04ce513fadb 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -211,6 +211,16 @@ struct io_ring_ctx { unsigned int compat: 1; enum task_work_notify_mode notify_method; + + /* + * If IORING_SETUP_NO_MMAP is used, then the below holds + * the gup'ed pages for the two rings, and the sqes. + */ + unsigned short n_ring_pages; + unsigned short n_sqe_pages; + struct page **ring_pages; + struct page **sqe_pages; + struct io_rings *rings; struct task_struct *submitter_task; struct percpu_ref refs; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 0716cb17e436..2edba9a274de 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -173,6 +173,11 @@ enum { */ #define IORING_SETUP_DEFER_TASKRUN (1U << 13) +/* + * Application provides the memory for the rings + */ +#define IORING_SETUP_NO_MMAP (1U << 14) + enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, @@ -406,7 +411,7 @@ struct io_sqring_offsets { __u32 dropped; __u32 array; __u32 resv1; - __u64 resv2; + __u64 user_addr; }; /* @@ -425,7 +430,7 @@ struct io_cqring_offsets { __u32 cqes; __u32 flags; __u32 resv1; - __u64 resv2; + __u64 user_addr; }; /* diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 74433939a318..61379cf8e7f5 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2688,12 +2688,85 @@ static void io_mem_free(void *ptr) free_compound_page(page); } +static void io_pages_free(struct page ***pages, int npages) +{ + struct page **page_array; + int i; + + if (!pages) + return; + page_array = *pages; + for (i = 0; i < npages; i++) + unpin_user_page(page_array[i]); + kvfree(page_array); + *pages = NULL; +} + +static void *__io_uaddr_map(struct page ***pages, unsigned short *npages, + unsigned long uaddr, size_t size) +{ + struct page **page_array; + unsigned int nr_pages; + int ret; + + *npages = 0; + + if (uaddr & (PAGE_SIZE - 1) || !size) + return ERR_PTR(-EINVAL); + + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (nr_pages > USHRT_MAX) + return ERR_PTR(-EINVAL); + page_array = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!page_array) + return ERR_PTR(-ENOMEM); + + ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, + page_array); + if (ret != nr_pages) { +err: + io_pages_free(&page_array, ret > 0 ? ret : 0); + return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT); + } + /* + * Should be a single page. If the ring is small enough that we can + * use a normal page, that is fine. If we need multiple pages, then + * userspace should use a huge page. That's the only way to guarantee + * that we get contigious memory, outside of just being lucky or + * (currently) having low memory fragmentation. + */ + if (page_array[0] != page_array[ret - 1]) + goto err; + *pages = page_array; + *npages = nr_pages; + return page_to_virt(page_array[0]); +} + +static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, + size_t size) +{ + return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr, + size); +} + +static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr, + size_t size) +{ + return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr, + size); +} + static void io_rings_free(struct io_ring_ctx *ctx) { - io_mem_free(ctx->rings); - io_mem_free(ctx->sq_sqes); - ctx->rings = NULL; - ctx->sq_sqes = NULL; + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { + io_mem_free(ctx->rings); + io_mem_free(ctx->sq_sqes); + ctx->rings = NULL; + ctx->sq_sqes = NULL; + } else { + io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); + io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages); + } } static void *io_mem_alloc(size_t size) @@ -3338,6 +3411,10 @@ static void *io_uring_validate_mmap_request(struct file *file, struct page *page; void *ptr; + /* Don't allow mmap if the ring was setup without it */ + if (ctx->flags & IORING_SETUP_NO_MMAP) + return ERR_PTR(-EINVAL); + switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: @@ -3673,7 +3750,11 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, if (size == SIZE_MAX) return -EOVERFLOW; - rings = io_mem_alloc(size); + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) + rings = io_mem_alloc(size); + else + rings = io_rings_map(ctx, p->cq_off.user_addr, size); + if (IS_ERR(rings)) return PTR_ERR(rings); @@ -3693,7 +3774,11 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, return -EOVERFLOW; } - ptr = io_mem_alloc(size); + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) + ptr = io_mem_alloc(size); + else + ptr = io_sqes_map(ctx, p->sq_off.user_addr, size); + if (IS_ERR(ptr)) { io_rings_free(ctx); return PTR_ERR(ptr); @@ -3885,7 +3970,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; p->sq_off.resv1 = 0; - p->sq_off.resv2 = 0; + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) + p->sq_off.user_addr = 0; p->cq_off.head = offsetof(struct io_rings, cq.head); p->cq_off.tail = offsetof(struct io_rings, cq.tail); @@ -3895,7 +3981,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, p->cq_off.cqes = offsetof(struct io_rings, cqes); p->cq_off.flags = offsetof(struct io_rings, cq_flags); p->cq_off.resv1 = 0; - p->cq_off.resv2 = 0; + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) + p->cq_off.user_addr = 0; p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | @@ -3961,7 +4048,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | - IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN)) + IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | + IORING_SETUP_NO_MMAP)) return -EINVAL; return io_uring_create(entries, &p, params); -- cgit v1.2.3 From 6e76ac595855db27bbdaef337173294a6fd6eb2c Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sat, 29 Apr 2023 01:40:30 +0900 Subject: io_uring: Add io_uring_setup flag to pre-register ring fd and never install it With IORING_REGISTER_USE_REGISTERED_RING, an application can register the ring fd and use it via registered index rather than installed fd. This allows using a registered ring for everything *except* the initial mmap. With IORING_SETUP_NO_MMAP, io_uring_setup uses buffers allocated by the user, rather than requiring a subsequent mmap. The combination of the two allows a user to operate *entirely* via a registered ring fd, making it unnecessary to ever install the fd in the first place. So, add a flag IORING_SETUP_REGISTERED_FD_ONLY to make io_uring_setup register the fd and return a registered index, without installing the fd. This allows an application to avoid touching the fd table at all, and allows a library to never even momentarily install a file descriptor. This splits out an io_ring_add_registered_file helper from io_ring_add_registered_fd, for use by io_uring_setup. Signed-off-by: Josh Triplett Link: https://lore.kernel.org/r/bc8f431bada371c183b95a83399628b605e978a3.1682699803.git.josh@joshtriplett.org Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 7 +++++++ io_uring/io_uring.c | 37 ++++++++++++++++++++++--------------- io_uring/io_uring.h | 3 +++ io_uring/tctx.c | 31 ++++++++++++++++++++----------- 4 files changed, 52 insertions(+), 26 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 2edba9a274de..f222d263bc55 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -178,6 +178,13 @@ enum { */ #define IORING_SETUP_NO_MMAP (1U << 14) +/* + * Register the ring fd in itself for use with + * IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather + * than an fd. + */ +#define IORING_SETUP_REGISTERED_FD_ONLY (1U << 15) + enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 61379cf8e7f5..dab09f568294 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3788,19 +3788,13 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, return 0; } -static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) +static int io_uring_install_fd(struct file *file) { - int ret, fd; + int fd; fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); if (fd < 0) return fd; - - ret = __io_uring_add_tctx_node(ctx); - if (ret) { - put_unused_fd(fd); - return ret; - } fd_install(fd, file); return fd; } @@ -3840,6 +3834,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, struct io_uring_params __user *params) { struct io_ring_ctx *ctx; + struct io_uring_task *tctx; struct file *file; int ret; @@ -3851,6 +3846,10 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, entries = IORING_MAX_ENTRIES; } + if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY) + && !(p->flags & IORING_SETUP_NO_MMAP)) + return -EINVAL; + /* * Use twice as many entries for the CQ ring. It's possible for the * application to drive a higher depth than the size of the SQ ring, @@ -4007,22 +4006,30 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, goto err; } + ret = __io_uring_add_tctx_node(ctx); + if (ret) + goto err_fput; + tctx = current->io_uring; + /* * Install ring fd as the very last thing, so we don't risk someone * having closed it before we finish setup */ - ret = io_uring_install_fd(ctx, file); - if (ret < 0) { - /* fput will clean it up */ - fput(file); - return ret; - } + if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY) + ret = io_ring_add_registered_file(tctx, file, 0, IO_RINGFD_REG_MAX); + else + ret = io_uring_install_fd(file); + if (ret < 0) + goto err_fput; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: io_ring_ctx_wait_and_kill(ctx); return ret; +err_fput: + fput(file); + return ret; } /* @@ -4049,7 +4056,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | - IORING_SETUP_NO_MMAP)) + IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY)) return -EINVAL; return io_uring_create(entries, &p, params); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 259bf798a390..9b8dfb3bb2b4 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -75,6 +75,9 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); +int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, + int start, int end); + int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 3a8d1dd97e1b..c043fe93a3f2 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -208,31 +208,40 @@ void io_uring_unreg_ringfd(void) } } -static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, +int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int start, int end) { - struct file *file; int offset; - for (offset = start; offset < end; offset++) { offset = array_index_nospec(offset, IO_RINGFD_REG_MAX); if (tctx->registered_rings[offset]) continue; - file = fget(fd); - if (!file) { - return -EBADF; - } else if (!io_is_uring_fops(file)) { - fput(file); - return -EOPNOTSUPP; - } tctx->registered_rings[offset] = file; return offset; } - return -EBUSY; } +static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, + int start, int end) +{ + struct file *file; + int offset; + + file = fget(fd); + if (!file) { + return -EBADF; + } else if (!io_is_uring_fops(file)) { + fput(file); + return -EOPNOTSUPP; + } + offset = io_ring_add_registered_file(tctx, file, start, end); + if (offset < 0) + fput(file); + return offset; +} + /* * Register a ring fd to avoid fdget/fdput for each io_uring_enter() * invocation. User passes in an array of struct io_uring_rsrc_update -- cgit v1.2.3 From 2f440b72e852be428540579b5813ba2b8236578d Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Wed, 26 Apr 2023 17:23:23 +0000 Subject: KVM: arm64: Add KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE Add a capability for userspace to specify the eager split chunk size. The chunk size specifies how many pages to break at a time, using a single allocation. Bigger the chunk size, more pages need to be allocated ahead of time. Suggested-by: Oliver Upton Signed-off-by: Ricardo Koller Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/20230426172330.1439644-6-ricarkol@google.com Signed-off-by: Oliver Upton --- Documentation/virt/kvm/api.rst | 27 +++++++++++++++++++++++++++ arch/arm64/include/asm/kvm_host.h | 15 +++++++++++++++ arch/arm64/include/asm/kvm_pgtable.h | 18 ++++++++++++++++++ arch/arm64/kvm/arm.c | 28 ++++++++++++++++++++++++++++ arch/arm64/kvm/mmu.c | 4 ++++ include/uapi/linux/kvm.h | 2 ++ 6 files changed, 94 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index add067793b90..656bd293c8f4 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8445,6 +8445,33 @@ structure. When getting the Modified Change Topology Report value, the attr->addr must point to a byte where the value will be stored or retrieved from. +8.40 KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE +--------------------------------------- + +:Capability: KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE +:Architectures: arm64 +:Type: vm +:Parameters: arg[0] is the new split chunk size. +:Returns: 0 on success, -EINVAL if any memslot was already created. + +This capability sets the chunk size used in Eager Page Splitting. + +Eager Page Splitting improves the performance of dirty-logging (used +in live migrations) when guest memory is backed by huge-pages. It +avoids splitting huge-pages (into PAGE_SIZE pages) on fault, by doing +it eagerly when enabling dirty logging (with the +KVM_MEM_LOG_DIRTY_PAGES flag for a memory region), or when using +KVM_CLEAR_DIRTY_LOG. + +The chunk size specifies how many pages to break at a time, using a +single allocation for each chunk. Bigger the chunk size, more pages +need to be allocated ahead of time. + +The chunk size needs to be a valid block size. The list of acceptable +block sizes is exposed in KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES as a +64-bit bitmap (each bit describing a block size). The default value is +0, to disable the eager page splitting. + 9. Known KVM API problems ========================= diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 7e7e19ef6993..b743198450b3 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -159,6 +159,21 @@ struct kvm_s2_mmu { /* The last vcpu id that ran on each physical CPU */ int __percpu *last_vcpu_ran; +#define KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT 0 + /* + * Memory cache used to split + * KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE worth of huge pages. It + * is used to allocate stage2 page tables while splitting huge + * pages. The choice of KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE + * influences both the capacity of the split page cache, and + * how often KVM reschedules. Be wary of raising CHUNK_SIZE + * too high. + * + * Protected by kvm->slots_lock. + */ + struct kvm_mmu_memory_cache split_page_cache; + uint64_t split_page_chunk_size; + struct kvm_arch *arch; }; diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index c8e0e7d9303b..cbc6971e2cb4 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -92,6 +92,24 @@ static inline bool kvm_level_supports_block_mapping(u32 level) return level >= KVM_PGTABLE_MIN_BLOCK_LEVEL; } +static inline u32 kvm_supported_block_sizes(void) +{ + u32 level = KVM_PGTABLE_MIN_BLOCK_LEVEL; + u32 r = 0; + + for (; level < KVM_PGTABLE_MAX_LEVELS; level++) + r |= BIT(kvm_granule_shift(level)); + + return r; +} + +static inline bool kvm_is_block_size_supported(u64 size) +{ + bool is_power_of_two = IS_ALIGNED(size, size); + + return is_power_of_two && (size & kvm_supported_block_sizes()); +} + /** * struct kvm_pgtable_mm_ops - Memory management callbacks. * @zalloc_page: Allocate a single zeroed memory page. diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 14391826241c..c605626801c4 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -65,6 +65,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { int r; + u64 new_cap; if (cap->flags) return -EINVAL; @@ -89,6 +90,24 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, r = 0; set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags); break; + case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE: + new_cap = cap->args[0]; + + mutex_lock(&kvm->slots_lock); + /* + * To keep things simple, allow changing the chunk + * size only when no memory slots have been created. + */ + if (!kvm_are_all_memslots_empty(kvm)) { + r = -EINVAL; + } else if (new_cap && !kvm_is_block_size_supported(new_cap)) { + r = -EINVAL; + } else { + r = 0; + kvm->arch.mmu.split_page_chunk_size = new_cap; + } + mutex_unlock(&kvm->slots_lock); + break; default: r = -EINVAL; break; @@ -302,6 +321,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_PTRAUTH_GENERIC: r = system_has_full_ptr_auth(); break; + case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE: + if (kvm) + r = kvm->arch.mmu.split_page_chunk_size; + else + r = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; + break; + case KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES: + r = kvm_supported_block_sizes(); + break; default: r = 0; } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index a0d3c773af99..f2d30486f755 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -775,6 +775,10 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t for_each_possible_cpu(cpu) *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1; + /* The eager page splitting is disabled by default */ + mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; + mmu->split_page_cache.gfp_zero = __GFP_ZERO; + mmu->pgt = pgt; mmu->pgd_phys = __pa(pgt->pgd); return 0; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 737318b1c1d9..44edee0211fb 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1190,6 +1190,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225 #define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226 #define KVM_CAP_COUNTER_OFFSET 227 +#define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228 +#define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From b9f9a485fb0eb80b0e2b90410b28cbb9b0e85687 Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Tue, 9 May 2023 22:19:45 +0100 Subject: netfilter: nft_exthdr: add boolean DCCP option matching The xt_dccp iptables module supports the matching of DCCP packets based on the presence or absence of DCCP options. Extend nft_exthdr to add this functionality to nftables. Link: https://bugzilla.netfilter.org/show_bug.cgi?id=930 Signed-off-by: Jeremy Sowden Signed-off-by: Florian Westphal --- include/uapi/linux/netfilter/nf_tables.h | 2 + net/netfilter/nft_exthdr.c | 106 +++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index c4d4d8e42dc8..e059dc2644df 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -859,12 +859,14 @@ enum nft_exthdr_flags { * @NFT_EXTHDR_OP_TCP: match against tcp options * @NFT_EXTHDR_OP_IPV4: match against ipv4 options * @NFT_EXTHDR_OP_SCTP: match against sctp chunks + * @NFT_EXTHDR_OP_DCCP: match against dccp otions */ enum nft_exthdr_op { NFT_EXTHDR_OP_IPV6, NFT_EXTHDR_OP_TCPOPT, NFT_EXTHDR_OP_IPV4, NFT_EXTHDR_OP_SCTP, + NFT_EXTHDR_OP_DCCP, __NFT_EXTHDR_OP_MAX }; #define NFT_EXTHDR_OP_MAX (__NFT_EXTHDR_OP_MAX - 1) diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index a54a7f772cec..671474e59817 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -406,6 +407,82 @@ err: regs->verdict.code = NFT_BREAK; } +static void nft_exthdr_dccp_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + unsigned int thoff, dataoff, optoff, optlen, i; + u32 *dest = ®s->data[priv->dreg]; + const struct dccp_hdr *dh; + struct dccp_hdr _dh; + + if (pkt->tprot != IPPROTO_DCCP || pkt->fragoff) + goto err; + + thoff = nft_thoff(pkt); + + dh = skb_header_pointer(pkt->skb, thoff, sizeof(_dh), &_dh); + if (!dh) + goto err; + + dataoff = dh->dccph_doff * sizeof(u32); + optoff = __dccp_hdr_len(dh); + if (dataoff <= optoff) + goto err; + + optlen = dataoff - optoff; + + for (i = 0; i < optlen; ) { + /* Options 0 (DCCPO_PADDING) - 31 (DCCPO_MAX_RESERVED) are 1B in + * the length; the remaining options are at least 2B long. In + * all cases, the first byte contains the option type. In + * multi-byte options, the second byte contains the option + * length, which must be at least two: 1 for the type plus 1 for + * the length plus 0-253 for any following option data. We + * aren't interested in the option data, only the type and the + * length, so we don't need to read more than two bytes at a + * time. + */ + unsigned int buflen = optlen - i; + u8 buf[2], *bufp; + u8 type, len; + + if (buflen > sizeof(buf)) + buflen = sizeof(buf); + + bufp = skb_header_pointer(pkt->skb, thoff + optoff + i, buflen, + &buf); + if (!bufp) + goto err; + + type = bufp[0]; + + if (type == priv->type) { + *dest = 1; + return; + } + + if (type <= DCCPO_MAX_RESERVED) { + i++; + continue; + } + + if (buflen < 2) + goto err; + + len = bufp[1]; + + if (len < 2) + goto err; + + i += len; + } + +err: + *dest = 0; +} + static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, @@ -557,6 +634,22 @@ static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx, return 0; } +static int nft_exthdr_dccp_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + int err = nft_exthdr_init(ctx, expr, tb); + + if (err < 0) + return err; + + if (!(priv->flags & NFT_EXTHDR_F_PRESENT)) + return -EOPNOTSUPP; + + return 0; +} + static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv) { if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type)) @@ -686,6 +779,15 @@ static const struct nft_expr_ops nft_exthdr_sctp_ops = { .reduce = nft_exthdr_reduce, }; +static const struct nft_expr_ops nft_exthdr_dccp_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_dccp_eval, + .init = nft_exthdr_dccp_init, + .dump = nft_exthdr_dump, + .reduce = nft_exthdr_reduce, +}; + static const struct nft_expr_ops * nft_exthdr_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -720,6 +822,10 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_sctp_ops; break; + case NFT_EXTHDR_OP_DCCP: + if (tb[NFTA_EXTHDR_DREG]) + return &nft_exthdr_dccp_ops; + break; } return ERR_PTR(-EOPNOTSUPP); -- cgit v1.2.3 From 6ac392815628f317fcfdca1a39df00b9cc4ebc8b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 3 May 2023 13:18:42 +0200 Subject: fs: allow to mount beneath top mount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Various distributions are adding or are in the process of adding support for system extensions and in the future configuration extensions through various tools. A more detailed explanation on system and configuration extensions can be found on the manpage which is listed below at [1]. System extension images may – dynamically at runtime — extend the /usr/ and /opt/ directory hierarchies with additional files. This is particularly useful on immutable system images where a /usr/ and/or /opt/ hierarchy residing on a read-only file system shall be extended temporarily at runtime without making any persistent modifications. When one or more system extension images are activated, their /usr/ and /opt/ hierarchies are combined via overlayfs with the same hierarchies of the host OS, and the host /usr/ and /opt/ overmounted with it ("merging"). When they are deactivated, the mount point is disassembled — again revealing the unmodified original host version of the hierarchy ("unmerging"). Merging thus makes the extension's resources suddenly appear below the /usr/ and /opt/ hierarchies as if they were included in the base OS image itself. Unmerging makes them disappear again, leaving in place only the files that were shipped with the base OS image itself. System configuration images are similar but operate on directories containing system or service configuration. On nearly all modern distributions mount propagation plays a crucial role and the rootfs of the OS is a shared mount in a peer group (usually with peer group id 1): TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID / / ext4 shared:1 29 1 On such systems all services and containers run in a separate mount namespace and are pivot_root()ed into their rootfs. A separate mount namespace is almost always used as it is the minimal isolation mechanism services have. But usually they are even much more isolated up to the point where they almost become indistinguishable from containers. Mount propagation again plays a crucial role here. The rootfs of all these services is a slave mount to the peer group of the host rootfs. This is done so the service will receive mount propagation events from the host when certain files or directories are updated. In addition, the rootfs of each service, container, and sandbox is also a shared mount in its separate peer group: TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID / / ext4 shared:24 master:1 71 47 For people not too familiar with mount propagation, the master:1 means that this is a slave mount to peer group 1. Which as one can see is the host rootfs as indicated by shared:1 above. The shared:24 indicates that the service rootfs is a shared mount in a separate peer group with peer group id 24. A service may run other services. Such nested services will also have a rootfs mount that is a slave to the peer group of the outer service rootfs mount. For containers things are just slighly different. A container's rootfs isn't a slave to the service's or host rootfs' peer group. The rootfs mount of a container is simply a shared mount in its own peer group: TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID /home/ubuntu/debian-tree / ext4 shared:99 61 60 So whereas services are isolated OS components a container is treated like a separate world and mount propagation into it is restricted to a single well known mount that is a slave to the peer group of the shared mount /run on the host: TARGET SOURCE FSTYPE PROPAGATION MNT_ID PARENT_ID /propagate/debian-tree /run/host/incoming tmpfs master:5 71 68 Here, the master:5 indicates that this mount is a slave to the peer group with peer group id 5. This allows to propagate mounts into the container and served as a workaround for not being able to insert mounts into mount namespaces directly. But the new mount api does support inserting mounts directly. For the interested reader the blogpost in [2] might be worth reading where I explain the old and the new approach to inserting mounts into mount namespaces. Containers of course, can themselves be run as services. They often run full systems themselves which means they again run services and containers with the exact same propagation settings explained above. The whole system is designed so that it can be easily updated, including all services in various fine-grained ways without having to enter every single service's mount namespace which would be prohibitively expensive. The mount propagation layout has been carefully chosen so it is possible to propagate updates for system extensions and configurations from the host into all services. The simplest model to update the whole system is to mount on top of /usr, /opt, or /etc on the host. The new mount on /usr, /opt, or /etc will then propagate into every service. This works cleanly the first time. However, when the system is updated multiple times it becomes necessary to unmount the first update on /opt, /usr, /etc and then propagate the new update. But this means, there's an interval where the old base system is accessible. This has to be avoided to protect against downgrade attacks. The vfs already exposes a mechanism to userspace whereby mounts can be mounted beneath an existing mount. Such mounts are internally referred to as "tucked". The patch series exposes the ability to mount beneath a top mount through the new MOVE_MOUNT_BENEATH flag for the move_mount() system call. This allows userspace to seamlessly upgrade mounts. After this series the only thing that will have changed is that mounting beneath an existing mount can be done explicitly instead of just implicitly. Today, there are two scenarios where a mount can be mounted beneath an existing mount instead of on top of it: (1) When a service or container is started in a new mount namespace and pivot_root()s into its new rootfs. The way this is done is by mounting the new rootfs beneath the old rootfs: fd_newroot = open("/var/lib/machines/fedora", ...); fd_oldroot = open("/", ...); fchdir(fd_newroot); pivot_root(".", "."); After the pivot_root(".", ".") call the new rootfs is mounted beneath the old rootfs which can then be unmounted to reveal the underlying mount: fchdir(fd_oldroot); umount2(".", MNT_DETACH); Since pivot_root() moves the caller into a new rootfs no mounts must be propagated out of the new rootfs as a consequence of the pivot_root() call. Thus, the mounts cannot be shared. (2) When a mount is propagated to a mount that already has another mount mounted on the same dentry. The easiest example for this is to create a new mount namespace. The following commands will create a mount namespace where the rootfs mount / will be a slave to the peer group of the host rootfs / mount's peer group. IOW, it will receive propagation from the host: mount --make-shared / unshare --mount --propagation=slave Now a new mount on the /mnt dentry in that mount namespace is created. (As it can be confusing it should be spelled out that the tmpfs mount on the /mnt dentry that was just created doesn't propagate back to the host because the rootfs mount / of the mount namespace isn't a peer of the host rootfs.): mount -t tmpfs tmpfs /mnt TARGET SOURCE FSTYPE PROPAGATION └─/mnt tmpfs tmpfs Now another terminal in the host mount namespace can observe that the mount indeed hasn't propagated back to into the host mount namespace. A new mount can now be created on top of the /mnt dentry with the rootfs mount / as its parent: mount --bind /opt /mnt TARGET SOURCE FSTYPE PROPAGATION └─/mnt /dev/sda2[/opt] ext4 shared:1 The mount namespace that was created earlier can now observe that the bind mount created on the host has propagated into it: TARGET SOURCE FSTYPE PROPAGATION └─/mnt /dev/sda2[/opt] ext4 master:1 └─/mnt tmpfs tmpfs But instead of having been mounted on top of the tmpfs mount at the /mnt dentry the /opt mount has been mounted on top of the rootfs mount at the /mnt dentry. And the tmpfs mount has been remounted on top of the propagated /opt mount at the /opt dentry. So in other words, the propagated mount has been mounted beneath the preexisting mount in that mount namespace. Mount namespaces make this easy to illustrate but it's also easy to mount beneath an existing mount in the same mount namespace (The following example assumes a shared rootfs mount / with peer group id 1): mount --bind /opt /opt TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION └─/opt /dev/sda2[/opt] ext4 188 29 shared:1 If another mount is mounted on top of the /opt mount at the /opt dentry: mount --bind /tmp /opt The following clunky mount tree will result: TARGET SOURCE FSTYPE MNT_ID PARENT_ID PROPAGATION └─/opt /dev/sda2[/tmp] ext4 405 29 shared:1 └─/opt /dev/sda2[/opt] ext4 188 405 shared:1 └─/opt /dev/sda2[/tmp] ext4 404 188 shared:1 The /tmp mount is mounted beneath the /opt mount and another copy is mounted on top of the /opt mount. This happens because the rootfs / and the /opt mount are shared mounts in the same peer group. When the new /tmp mount is supposed to be mounted at the /opt dentry then the /tmp mount first propagates to the root mount at the /opt dentry. But there already is the /opt mount mounted at the /opt dentry. So the old /opt mount at the /opt dentry will be mounted on top of the new /tmp mount at the /tmp dentry, i.e. @opt->mnt_parent is @tmp and @opt->mnt_mountpoint is /tmp (Note that @opt->mnt_root is /opt which is what shows up as /opt under SOURCE). So again, a mount will be mounted beneath a preexisting mount. (Fwiw, a few iterations of mount --bind /opt /opt in a loop on a shared rootfs is a good example of what could be referred to as mount explosion.) The main point is that such mounts allows userspace to umount a top mount and reveal an underlying mount. So for example, umounting the tmpfs mount on /mnt that was created in example (1) using mount namespaces reveals the /opt mount which was mounted beneath it. In (2) where a mount was mounted beneath the top mount in the same mount namespace unmounting the top mount would unmount both the top mount and the mount beneath. In the process the original mount would be remounted on top of the rootfs mount / at the /opt dentry again. This again, is a result of mount propagation only this time it's umount propagation. However, this can be avoided by simply making the parent mount / of the @opt mount a private or slave mount. Then the top mount and the original mount can be unmounted to reveal the mount beneath. These two examples are fairly arcane and are merely added to make it clear how mount propagation has effects on current and future features. More common use-cases will just be things like: mount -t btrfs /dev/sdA /mnt mount -t xfs /dev/sdB --beneath /mnt umount /mnt after which we'll have updated from a btrfs filesystem to a xfs filesystem without ever revealing the underlying mountpoint. The crux is that the proposed mechanism already exists and that it is so powerful as to cover cases where mounts are supposed to be updated with new versions. Crucially, it offers an important flexibility. Namely that updates to a system may either be forced or can be delayed and the umount of the top mount be left to a service if it is a cooperative one. This adds a new flag to move_mount() that allows to explicitly move a beneath the top mount adhering to the following semantics: * Mounts cannot be mounted beneath the rootfs. This restriction encompasses the rootfs but also chroots via chroot() and pivot_root(). To mount a mount beneath the rootfs or a chroot, pivot_root() can be used as illustrated above. * The source mount must be a private mount to force the kernel to allocate a new, unused peer group id. This isn't a required restriction but a voluntary one. It avoids repeating a semantical quirk that already exists today. If bind mounts which already have a peer group id are inserted into mount trees that have the same peer group id this can cause a lot of mount propagation events to be generated (For example, consider running mount --bind /opt /opt in a loop where the parent mount is a shared mount.). * Avoid getting rid of the top mount in the kernel. Cooperative services need to be able to unmount the top mount themselves. This also avoids a good deal of additional complexity. The umount would have to be propagated which would be another rather expensive operation. So namespace_lock() and lock_mount_hash() would potentially have to be held for a long time for both a mount and umount propagation. That should be avoided. * The path to mount beneath must be mounted and attached. * The top mount and its parent must be in the caller's mount namespace and the caller must be able to mount in that mount namespace. * The caller must be able to unmount the top mount to prove that they could reveal the underlying mount. * The propagation tree is calculated based on the destination mount's parent mount and the destination mount's mountpoint on the parent mount. Of course, if the parent of the destination mount and the destination mount are shared mounts in the same peer group and the mountpoint of the new mount to be mounted is a subdir of their ->mnt_root then both will receive a mount of /opt. That's probably easier to understand with an example. Assuming a standard shared rootfs /: mount --bind /opt /opt mount --bind /tmp /opt will cause the same mount tree as: mount --bind /opt /opt mount --beneath /tmp /opt because both / and /opt are shared mounts/peers in the same peer group and the /opt dentry is a subdirectory of both the parent's and the child's ->mnt_root. If a mount tree like that is created it almost always is an accident or abuse of mount propagation. Realistically what most people probably mean in this scenarios is: mount --bind /opt /opt mount --make-private /opt mount --make-shared /opt This forces the allocation of a new separate peer group for the /opt mount. Aferwards a mount --bind or mount --beneath actually makes sense as the / and /opt mount belong to different peer groups. Before that it's likely just confusion about what the user wanted to achieve. * Refuse MOVE_MOUNT_BENEATH if: (1) the @mnt_from has been overmounted in between path resolution and acquiring @namespace_sem when locking @mnt_to. This avoids the proliferation of shadow mounts. (2) if @to_mnt is moved to a different mountpoint while acquiring @namespace_sem to lock @to_mnt. (3) if @to_mnt is unmounted while acquiring @namespace_sem to lock @to_mnt. (4) if the parent of the target mount propagates to the target mount at the same mountpoint. This would mean mounting @mnt_from on @mnt_to->mnt_parent and then propagating a copy @c of @mnt_from onto @mnt_to. This defeats the whole purpose of mounting @mnt_from beneath @mnt_to. (5) if the parent mount @mnt_to->mnt_parent propagates to @mnt_from at the same mountpoint. If @mnt_to->mnt_parent propagates to @mnt_from this would mean propagating a copy @c of @mnt_from on top of @mnt_from. Afterwards @mnt_from would be mounted on top of @mnt_to->mnt_parent and @mnt_to would be unmounted from @mnt->mnt_parent and remounted on @mnt_from. But since @c is already mounted on @mnt_from, @mnt_to would ultimately be remounted on top of @c. Afterwards, @mnt_from would be covered by a copy @c of @mnt_from and @c would be covered by @mnt_from itself. This defeats the whole purpose of mounting @mnt_from beneath @mnt_to. Cases (1) to (3) are required as they deal with races that would cause bugs or unexpected behavior for users. Cases (4) and (5) refuse semantical quirks that would not be a bug but would cause weird mount trees to be created. While they can already be created via other means (mount --bind /opt /opt x n) there's no reason to repeat past mistakes in new features. Link: https://man7.org/linux/man-pages/man8/systemd-sysext.8.html [1] Link: https://brauner.io/2023/02/28/mounting-into-mount-namespaces.html [2] Link: https://github.com/flatcar/sysext-bakery Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_1 Link: https://fedoraproject.org/wiki/Changes/Unified_Kernel_Support_Phase_2 Link: https://github.com/systemd/systemd/pull/26013 Reviewed-by: Seth Forshee (DigitalOcean) Message-Id: <20230202-fs-move-mount-replace-v4-4-98f3d80d7eaa@kernel.org> Signed-off-by: Christian Brauner --- fs/namespace.c | 352 ++++++++++++++++++++++++++++++++++++++------- fs/pnode.c | 42 +++++- fs/pnode.h | 3 + include/uapi/linux/mount.h | 3 +- 4 files changed, 350 insertions(+), 50 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/namespace.c b/fs/namespace.c index f91a60e6039d..382edc76aee7 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -926,6 +926,33 @@ void mnt_set_mountpoint(struct mount *mnt, hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); } +/** + * mnt_set_mountpoint_beneath - mount a mount beneath another one + * + * @new_parent: the source mount + * @top_mnt: the mount beneath which @new_parent is mounted + * @new_mp: the new mountpoint of @top_mnt on @new_parent + * + * Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and + * parent @top_mnt->mnt_parent and mount it on top of @new_parent at + * @new_mp. And mount @new_parent on the old parent and old + * mountpoint of @top_mnt. + * + * Context: This function expects namespace_lock() and lock_mount_hash() + * to have been acquired in that order. + */ +static void mnt_set_mountpoint_beneath(struct mount *new_parent, + struct mount *top_mnt, + struct mountpoint *new_mp) +{ + struct mount *old_top_parent = top_mnt->mnt_parent; + struct mountpoint *old_top_mp = top_mnt->mnt_mp; + + mnt_set_mountpoint(old_top_parent, old_top_mp, new_parent); + mnt_change_mountpoint(new_parent, new_mp, top_mnt); +} + + static void __attach_mnt(struct mount *mnt, struct mount *parent) { hlist_add_head_rcu(&mnt->mnt_hash, @@ -933,15 +960,42 @@ static void __attach_mnt(struct mount *mnt, struct mount *parent) list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); } -/* - * vfsmount lock must be held for write +/** + * attach_mnt - mount a mount, attach to @mount_hashtable and parent's + * list of child mounts + * @parent: the parent + * @mnt: the new mount + * @mp: the new mountpoint + * @beneath: whether to mount @mnt beneath or on top of @parent + * + * If @beneath is false, mount @mnt at @mp on @parent. Then attach @mnt + * to @parent's child mount list and to @mount_hashtable. + * + * If @beneath is true, remove @mnt from its current parent and + * mountpoint and mount it on @mp on @parent, and mount @parent on the + * old parent and old mountpoint of @mnt. Finally, attach @parent to + * @mnt_hashtable and @parent->mnt_parent->mnt_mounts. + * + * Note, when __attach_mnt() is called @mnt->mnt_parent already points + * to the correct parent. + * + * Context: This function expects namespace_lock() and lock_mount_hash() + * to have been acquired in that order. */ -static void attach_mnt(struct mount *mnt, - struct mount *parent, - struct mountpoint *mp) +static void attach_mnt(struct mount *mnt, struct mount *parent, + struct mountpoint *mp, bool beneath) { - mnt_set_mountpoint(parent, mp, mnt); - __attach_mnt(mnt, parent); + if (beneath) + mnt_set_mountpoint_beneath(mnt, parent, mp); + else + mnt_set_mountpoint(parent, mp, mnt); + /* + * Note, @mnt->mnt_parent has to be used. If @mnt was mounted + * beneath @parent then @mnt will need to be attached to + * @parent's old parent, not @parent. IOW, @mnt->mnt_parent + * isn't the same mount as @parent. + */ + __attach_mnt(mnt, mnt->mnt_parent); } void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) @@ -953,7 +1007,7 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m hlist_del_init(&mnt->mnt_mp_list); hlist_del_init_rcu(&mnt->mnt_hash); - attach_mnt(mnt, parent, mp); + attach_mnt(mnt, parent, mp, false); put_mountpoint(old_mp); mnt_add_count(old_parent, -1); @@ -1954,7 +2008,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, goto out; lock_mount_hash(); list_add_tail(&q->mnt_list, &res->mnt_list); - attach_mnt(q, parent, p->mnt_mp); + attach_mnt(q, parent, p->mnt_mp, false); unlock_mount_hash(); } } @@ -2163,12 +2217,17 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt) return 0; } -/* - * @source_mnt : mount tree to be attached - * @nd : place the mount tree @source_mnt is attached - * @parent_nd : if non-null, detach the source_mnt from its parent and - * store the parent mount and mountpoint dentry. - * (done when source_mnt is moved) +enum mnt_tree_flags_t { + MNT_TREE_MOVE = BIT(0), + MNT_TREE_BENEATH = BIT(1), +}; + +/** + * attach_recursive_mnt - attach a source mount tree + * @source_mnt: mount tree to be attached + * @top_mnt: mount that @source_mnt will be mounted on or mounted beneath + * @dest_mp: the mountpoint @source_mnt will be mounted at + * @flags: modify how @source_mnt is supposed to be attached * * NOTE: in the table below explains the semantics when a source mount * of a given type is attached to a destination mount of a given type. @@ -2225,22 +2284,28 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt) * applied to each mount in the tree. * Must be called without spinlocks held, since this function can sleep * in allocations. + * + * Context: The function expects namespace_lock() to be held. + * Return: If @source_mnt was successfully attached 0 is returned. + * Otherwise a negative error code is returned. */ static int attach_recursive_mnt(struct mount *source_mnt, - struct mount *dest_mnt, - struct mountpoint *dest_mp, - bool moving) + struct mount *top_mnt, + struct mountpoint *dest_mp, + enum mnt_tree_flags_t flags) { struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; HLIST_HEAD(tree_list); - struct mnt_namespace *ns = dest_mnt->mnt_ns; + struct mnt_namespace *ns = top_mnt->mnt_ns; struct mountpoint *smp; - struct mount *child, *p; + struct mount *child, *dest_mnt, *p; struct hlist_node *n; - int err; + int err = 0; + bool moving = flags & MNT_TREE_MOVE, beneath = flags & MNT_TREE_BENEATH; - /* Preallocate a mountpoint in case the new mounts need - * to be tucked under other mounts. + /* + * Preallocate a mountpoint in case the new mounts need to be + * mounted beneath mounts on the same mountpoint. */ smp = get_mountpoint(source_mnt->mnt.mnt_root); if (IS_ERR(smp)) @@ -2253,29 +2318,41 @@ static int attach_recursive_mnt(struct mount *source_mnt, goto out; } + if (beneath) + dest_mnt = top_mnt->mnt_parent; + else + dest_mnt = top_mnt; + if (IS_MNT_SHARED(dest_mnt)) { err = invent_group_ids(source_mnt, true); if (err) goto out; err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); - lock_mount_hash(); - if (err) - goto out_cleanup_ids; + } + lock_mount_hash(); + if (err) + goto out_cleanup_ids; + + if (IS_MNT_SHARED(dest_mnt)) { for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); - } else { - lock_mount_hash(); } + if (moving) { + if (beneath) + dest_mp = smp; unhash_mnt(source_mnt); - attach_mnt(source_mnt, dest_mnt, dest_mp); + attach_mnt(source_mnt, top_mnt, dest_mp, beneath); touch_mnt_namespace(source_mnt->mnt_ns); } else { if (source_mnt->mnt_ns) { /* move from anon - the caller will destroy */ list_del_init(&source_mnt->mnt_ns->list); } - mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); + if (beneath) + mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp); + else + mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); commit_tree(source_mnt); } @@ -2315,28 +2392,80 @@ static int attach_recursive_mnt(struct mount *source_mnt, return err; } -static struct mountpoint *lock_mount(struct path *path) +/** + * do_lock_mount - lock mount and mountpoint + * @path: target path + * @beneath: whether the intention is to mount beneath @path + * + * Follow the mount stack on @path until the top mount @mnt is found. If + * the initial @path->{mnt,dentry} is a mountpoint lookup the first + * mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root} + * until nothing is stacked on top of it anymore. + * + * Acquire the inode_lock() on the top mount's ->mnt_root to protect + * against concurrent removal of the new mountpoint from another mount + * namespace. + * + * If @beneath is requested, acquire inode_lock() on @mnt's mountpoint + * @mp on @mnt->mnt_parent must be acquired. This protects against a + * concurrent unlink of @mp->mnt_dentry from another mount namespace + * where @mnt doesn't have a child mount mounted @mp. A concurrent + * removal of @mnt->mnt_root doesn't matter as nothing will be mounted + * on top of it for @beneath. + * + * In addition, @beneath needs to make sure that @mnt hasn't been + * unmounted or moved from its current mountpoint in between dropping + * @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt + * being unmounted would be detected later by e.g., calling + * check_mnt(mnt) in the function it's called from. For the @beneath + * case however, it's useful to detect it directly in do_lock_mount(). + * If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points + * to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will + * point to @mnt->mnt_root and @mnt->mnt_mp will be NULL. + * + * Return: Either the target mountpoint on the top mount or the top + * mount's mountpoint. + */ +static struct mountpoint *do_lock_mount(struct path *path, bool beneath) { - struct vfsmount *mnt; + struct vfsmount *mnt = path->mnt; struct dentry *dentry; - struct mountpoint *mp; + struct mountpoint *mp = ERR_PTR(-ENOENT); for (;;) { - dentry = path->dentry; + struct mount *m; + + if (beneath) { + m = real_mount(mnt); + read_seqlock_excl(&mount_lock); + dentry = dget(m->mnt_mountpoint); + read_sequnlock_excl(&mount_lock); + } else { + dentry = path->dentry; + } + inode_lock(dentry->d_inode); if (unlikely(cant_mount(dentry))) { inode_unlock(dentry->d_inode); - return ERR_PTR(-ENOENT); + goto out; } namespace_lock(); + if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) { + namespace_unlock(); + inode_unlock(dentry->d_inode); + goto out; + } + mnt = lookup_mnt(path); if (likely(!mnt)) break; namespace_unlock(); inode_unlock(dentry->d_inode); + if (beneath) + dput(dentry); path_put(path); path->mnt = mnt; path->dentry = dget(mnt->mnt_root); @@ -2348,9 +2477,18 @@ static struct mountpoint *lock_mount(struct path *path) inode_unlock(dentry->d_inode); } +out: + if (beneath) + dput(dentry); + return mp; } +static inline struct mountpoint *lock_mount(struct path *path) +{ + return do_lock_mount(path, false); +} + static void unlock_mount(struct mountpoint *where) { struct dentry *dentry = where->m_dentry; @@ -2372,7 +2510,7 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) d_is_dir(mnt->mnt.mnt_root)) return -ENOTDIR; - return attach_recursive_mnt(mnt, p, mp, false); + return attach_recursive_mnt(mnt, p, mp, 0); } /* @@ -2854,7 +2992,110 @@ out: return err; } -static int do_move_mount(struct path *old_path, struct path *new_path) +/** + * path_overmounted - check if path is overmounted + * @path: path to check + * + * Check if path is overmounted, i.e., if there's a mount on top of + * @path->mnt with @path->dentry as mountpoint. + * + * Context: This function expects namespace_lock() to be held. + * Return: If path is overmounted true is returned, false if not. + */ +static inline bool path_overmounted(const struct path *path) +{ + rcu_read_lock(); + if (unlikely(__lookup_mnt(path->mnt, path->dentry))) { + rcu_read_unlock(); + return true; + } + rcu_read_unlock(); + return false; +} + +/** + * can_move_mount_beneath - check that we can mount beneath the top mount + * @from: mount to mount beneath + * @to: mount under which to mount + * + * - Make sure that @to->dentry is actually the root of a mount under + * which we can mount another mount. + * - Make sure that nothing can be mounted beneath the caller's current + * root or the rootfs of the namespace. + * - Make sure that the caller can unmount the topmost mount ensuring + * that the caller could reveal the underlying mountpoint. + * - Ensure that nothing has been mounted on top of @from before we + * grabbed @namespace_sem to avoid creating pointless shadow mounts. + * - Prevent mounting beneath a mount if the propagation relationship + * between the source mount, parent mount, and top mount would lead to + * nonsensical mount trees. + * + * Context: This function expects namespace_lock() to be held. + * Return: On success 0, and on error a negative error code is returned. + */ +static int can_move_mount_beneath(const struct path *from, + const struct path *to, + const struct mountpoint *mp) +{ + struct mount *mnt_from = real_mount(from->mnt), + *mnt_to = real_mount(to->mnt), + *parent_mnt_to = mnt_to->mnt_parent; + + if (!mnt_has_parent(mnt_to)) + return -EINVAL; + + if (!path_mounted(to)) + return -EINVAL; + + if (IS_MNT_LOCKED(mnt_to)) + return -EINVAL; + + /* Avoid creating shadow mounts during mount propagation. */ + if (path_overmounted(from)) + return -EINVAL; + + /* + * Mounting beneath the rootfs only makes sense when the + * semantics of pivot_root(".", ".") are used. + */ + if (&mnt_to->mnt == current->fs->root.mnt) + return -EINVAL; + if (parent_mnt_to == current->nsproxy->mnt_ns->root) + return -EINVAL; + + for (struct mount *p = mnt_from; mnt_has_parent(p); p = p->mnt_parent) + if (p == mnt_to) + return -EINVAL; + + /* + * If the parent mount propagates to the child mount this would + * mean mounting @mnt_from on @mnt_to->mnt_parent and then + * propagating a copy @c of @mnt_from on top of @mnt_to. This + * defeats the whole purpose of mounting beneath another mount. + */ + if (propagation_would_overmount(parent_mnt_to, mnt_to, mp)) + return -EINVAL; + + /* + * If @mnt_to->mnt_parent propagates to @mnt_from this would + * mean propagating a copy @c of @mnt_from on top of @mnt_from. + * Afterwards @mnt_from would be mounted on top of + * @mnt_to->mnt_parent and @mnt_to would be unmounted from + * @mnt->mnt_parent and remounted on @mnt_from. But since @c is + * already mounted on @mnt_from, @mnt_to would ultimately be + * remounted on top of @c. Afterwards, @mnt_from would be + * covered by a copy @c of @mnt_from and @c would be covered by + * @mnt_from itself. This defeats the whole purpose of mounting + * @mnt_from beneath @mnt_to. + */ + if (propagation_would_overmount(parent_mnt_to, mnt_from, mp)) + return -EINVAL; + + return 0; +} + +static int do_move_mount(struct path *old_path, struct path *new_path, + bool beneath) { struct mnt_namespace *ns; struct mount *p; @@ -2863,8 +3104,9 @@ static int do_move_mount(struct path *old_path, struct path *new_path) struct mountpoint *mp, *old_mp; int err; bool attached; + enum mnt_tree_flags_t flags = 0; - mp = lock_mount(new_path); + mp = do_lock_mount(new_path, beneath); if (IS_ERR(mp)) return PTR_ERR(mp); @@ -2872,6 +3114,8 @@ static int do_move_mount(struct path *old_path, struct path *new_path) p = real_mount(new_path->mnt); parent = old->mnt_parent; attached = mnt_has_parent(old); + if (attached) + flags |= MNT_TREE_MOVE; old_mp = old->mnt_mp; ns = old->mnt_ns; @@ -2902,6 +3146,17 @@ static int do_move_mount(struct path *old_path, struct path *new_path) */ if (attached && IS_MNT_SHARED(parent)) goto out; + + if (beneath) { + err = can_move_mount_beneath(old_path, new_path, mp); + if (err) + goto out; + + err = -EINVAL; + p = p->mnt_parent; + flags |= MNT_TREE_BENEATH; + } + /* * Don't move a mount tree containing unbindable mounts to a destination * mount which is shared. @@ -2915,8 +3170,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path) if (p == old) goto out; - err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, - attached); + err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, flags); if (err) goto out; @@ -2948,7 +3202,7 @@ static int do_move_mount_old(struct path *path, const char *old_name) if (err) return err; - err = do_move_mount(&old_path, path); + err = do_move_mount(&old_path, path, false); path_put(&old_path); return err; } @@ -3114,13 +3368,10 @@ int finish_automount(struct vfsmount *m, const struct path *path) err = -ENOENT; goto discard_locked; } - rcu_read_lock(); - if (unlikely(__lookup_mnt(path->mnt, dentry))) { - rcu_read_unlock(); + if (path_overmounted(path)) { err = 0; goto discard_locked; } - rcu_read_unlock(); mp = get_mountpoint(dentry); if (IS_ERR(mp)) { err = PTR_ERR(mp); @@ -3812,6 +4063,10 @@ SYSCALL_DEFINE5(move_mount, if (flags & ~MOVE_MOUNT__MASK) return -EINVAL; + if ((flags & (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) == + (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) + return -EINVAL; + /* If someone gives a pathname, they aren't permitted to move * from an fd that requires unmount as we can't get at the flag * to clear it afterwards. @@ -3841,7 +4096,8 @@ SYSCALL_DEFINE5(move_mount, if (flags & MOVE_MOUNT_SET_GROUP) ret = do_set_group(&from_path, &to_path); else - ret = do_move_mount(&from_path, &to_path); + ret = do_move_mount(&from_path, &to_path, + (flags & MOVE_MOUNT_BENEATH)); out_to: path_put(&to_path); @@ -3974,9 +4230,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; } /* mount old root on put_old */ - attach_mnt(root_mnt, old_mnt, old_mp); + attach_mnt(root_mnt, old_mnt, old_mp, false); /* mount new_root on / */ - attach_mnt(new_mnt, root_parent, root_mp); + attach_mnt(new_mnt, root_parent, root_mp, false); mnt_add_count(root_parent, -1); touch_mnt_namespace(current->nsproxy->mnt_ns); /* A moved mount should not expire automatically */ diff --git a/fs/pnode.c b/fs/pnode.c index 3cede8b18c8b..e4d0340393d5 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -216,7 +216,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin) static struct mount *last_dest, *first_source, *last_source, *dest_master; static struct hlist_head *list; -static inline bool peers(struct mount *m1, struct mount *m2) +static inline bool peers(const struct mount *m1, const struct mount *m2) { return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id; } @@ -354,6 +354,46 @@ static inline int do_refcount_check(struct mount *mnt, int count) return mnt_get_count(mnt) > count; } +/** + * propagation_would_overmount - check whether propagation from @from + * would overmount @to + * @from: shared mount + * @to: mount to check + * @mp: future mountpoint of @to on @from + * + * If @from propagates mounts to @to, @from and @to must either be peers + * or one of the masters in the hierarchy of masters of @to must be a + * peer of @from. + * + * If the root of the @to mount is equal to the future mountpoint @mp of + * the @to mount on @from then @to will be overmounted by whatever is + * propagated to it. + * + * Context: This function expects namespace_lock() to be held and that + * @mp is stable. + * Return: If @from overmounts @to, true is returned, false if not. + */ +bool propagation_would_overmount(const struct mount *from, + const struct mount *to, + const struct mountpoint *mp) +{ + if (!IS_MNT_SHARED(from)) + return false; + + if (IS_MNT_NEW(to)) + return false; + + if (to->mnt.mnt_root != mp->m_dentry) + return false; + + for (const struct mount *m = to; m; m = m->mnt_master) { + if (peers(from, m)) + return true; + } + + return false; +} + /* * check if the mount 'mnt' can be unmounted successfully. * @mnt: the mount to be checked for unmount diff --git a/fs/pnode.h b/fs/pnode.h index 988f1aa9b02a..0b02a6393891 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -53,4 +53,7 @@ struct mount *copy_tree(struct mount *, struct dentry *, int); bool is_path_reachable(struct mount *, struct dentry *, const struct path *root); int count_mounts(struct mnt_namespace *ns, struct mount *mnt); +bool propagation_would_overmount(const struct mount *from, + const struct mount *to, + const struct mountpoint *mp); #endif /* _LINUX_PNODE_H */ diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 4d93967f8aea..8eb0d7b758d2 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -74,7 +74,8 @@ #define MOVE_MOUNT_T_AUTOMOUNTS 0x00000020 /* Follow automounts on to path */ #define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */ #define MOVE_MOUNT_SET_GROUP 0x00000100 /* Set sharing group instead */ -#define MOVE_MOUNT__MASK 0x00000177 +#define MOVE_MOUNT_BENEATH 0x00000200 /* Mount beneath top mount */ +#define MOVE_MOUNT__MASK 0x00000377 /* * fsopen() flags. -- cgit v1.2.3 From 62fe99cef94a5900cac3bf15fd03ee8baad1a99c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 19 May 2023 14:50:29 +0800 Subject: ublk: add read()/write() support for ublk char device Support pread()/pwrite() on ublk char device for reading/writing request io buffer, so data copy between io request buffer and userspace buffer can be moved to ublk server from ublk driver. Then UBLK_F_NEED_GET_DATA becomes not necessary, so ublk server can allocate buffer without one extra round uring command communication for userspace to provide buffer. IO buffer can be located by iocb->ki_pos which encodes buffer offset, io tag and queue id info, and type of iocb->ki_pos is u64, so it is big enough for holding reasonable queue depth, nr_queues and max io buffer size. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230519065030.351216-7-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 151 ++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/ublk_cmd.h | 22 +++++- 2 files changed, 172 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 13523c37a165..ec40ac4f9af3 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -207,6 +207,23 @@ static unsigned int ublks_added; /* protected by ublk_ctl_mutex */ static struct miscdevice ublk_misc; +static inline unsigned ublk_pos_to_hwq(loff_t pos) +{ + return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) & + UBLK_QID_BITS_MASK; +} + +static inline unsigned ublk_pos_to_buf_off(loff_t pos) +{ + return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK; +} + +static inline unsigned ublk_pos_to_tag(loff_t pos) +{ + return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) & + UBLK_TAG_BITS_MASK; +} + static void ublk_dev_param_basic_apply(struct ublk_device *ub) { struct request_queue *q = ub->ub_disk->queue; @@ -1429,6 +1446,36 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, return -EIOCBQUEUED; } +static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, + struct ublk_queue *ubq, int tag, size_t offset) +{ + struct request *req; + + if (!ublk_need_req_ref(ubq)) + return NULL; + + req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); + if (!req) + return NULL; + + if (!ublk_get_req_ref(ubq, req)) + return NULL; + + if (unlikely(!blk_mq_request_started(req) || req->tag != tag)) + goto fail_put; + + if (!ublk_rq_has_data(req)) + goto fail_put; + + if (offset > blk_rq_bytes(req)) + goto fail_put; + + return req; +fail_put: + ublk_put_req_ref(ubq, req); + return NULL; +} + static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { /* @@ -1446,11 +1493,112 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd); } +static inline bool ublk_check_ubuf_dir(const struct request *req, + int ubuf_dir) +{ + /* copy ubuf to request pages */ + if (req_op(req) == REQ_OP_READ && ubuf_dir == ITER_SOURCE) + return true; + + /* copy request pages to ubuf */ + if (req_op(req) == REQ_OP_WRITE && ubuf_dir == ITER_DEST) + return true; + + return false; +} + +static struct request *ublk_check_and_get_req(struct kiocb *iocb, + struct iov_iter *iter, size_t *off, int dir) +{ + struct ublk_device *ub = iocb->ki_filp->private_data; + struct ublk_queue *ubq; + struct request *req; + size_t buf_off; + u16 tag, q_id; + + if (!ub) + return ERR_PTR(-EACCES); + + if (!user_backed_iter(iter)) + return ERR_PTR(-EACCES); + + if (ub->dev_info.state == UBLK_S_DEV_DEAD) + return ERR_PTR(-EACCES); + + tag = ublk_pos_to_tag(iocb->ki_pos); + q_id = ublk_pos_to_hwq(iocb->ki_pos); + buf_off = ublk_pos_to_buf_off(iocb->ki_pos); + + if (q_id >= ub->dev_info.nr_hw_queues) + return ERR_PTR(-EINVAL); + + ubq = ublk_get_queue(ub, q_id); + if (!ubq) + return ERR_PTR(-EINVAL); + + if (tag >= ubq->q_depth) + return ERR_PTR(-EINVAL); + + req = __ublk_check_and_get_req(ub, ubq, tag, buf_off); + if (!req) + return ERR_PTR(-EINVAL); + + if (!req->mq_hctx || !req->mq_hctx->driver_data) + goto fail; + + if (!ublk_check_ubuf_dir(req, dir)) + goto fail; + + *off = buf_off; + return req; +fail: + ublk_put_req_ref(ubq, req); + return ERR_PTR(-EACCES); +} + +static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct ublk_queue *ubq; + struct request *req; + size_t buf_off; + size_t ret; + + req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST); + if (IS_ERR(req)) + return PTR_ERR(req); + + ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST); + ubq = req->mq_hctx->driver_data; + ublk_put_req_ref(ubq, req); + + return ret; +} + +static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct ublk_queue *ubq; + struct request *req; + size_t buf_off; + size_t ret; + + req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE); + if (IS_ERR(req)) + return PTR_ERR(req); + + ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE); + ubq = req->mq_hctx->driver_data; + ublk_put_req_ref(ubq, req); + + return ret; +} + static const struct file_operations ublk_ch_fops = { .owner = THIS_MODULE, .open = ublk_ch_open, .release = ublk_ch_release, .llseek = no_llseek, + .read_iter = ublk_ch_read_iter, + .write_iter = ublk_ch_write_iter, .uring_cmd = ublk_ch_uring_cmd, .mmap = ublk_ch_mmap, }; @@ -2362,6 +2510,9 @@ static int __init ublk_init(void) { int ret; + BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET + + UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET); + init_waitqueue_head(&ublk_idr_wq); ret = misc_register(&ublk_misc); diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 640bf687b94a..c0c1632c671e 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -93,9 +93,29 @@ #define UBLKSRV_CMD_BUF_OFFSET 0 #define UBLKSRV_IO_BUF_OFFSET 0x80000000 -/* tag bit is 12bit, so at most 4096 IOs for each queue */ +/* tag bit is 16bit, so far limit at most 4096 IOs for each queue */ #define UBLK_MAX_QUEUE_DEPTH 4096 +/* single IO buffer max size is 32MB */ +#define UBLK_IO_BUF_OFF 0 +#define UBLK_IO_BUF_BITS 25 +#define UBLK_IO_BUF_BITS_MASK ((1ULL << UBLK_IO_BUF_BITS) - 1) + +/* so at most 64K IOs for each queue */ +#define UBLK_TAG_OFF UBLK_IO_BUF_BITS +#define UBLK_TAG_BITS 16 +#define UBLK_TAG_BITS_MASK ((1ULL << UBLK_TAG_BITS) - 1) + +/* max 4096 queues */ +#define UBLK_QID_OFF (UBLK_TAG_OFF + UBLK_TAG_BITS) +#define UBLK_QID_BITS 12 +#define UBLK_QID_BITS_MASK ((1ULL << UBLK_QID_BITS) - 1) + +#define UBLK_MAX_NR_QUEUES (1U << UBLK_QID_BITS) + +#define UBLKSRV_IO_BUF_TOTAL_BITS (UBLK_QID_OFF + UBLK_QID_BITS) +#define UBLKSRV_IO_BUF_TOTAL_SIZE (1ULL << UBLKSRV_IO_BUF_TOTAL_BITS) + /* * zero copy requires 4k block size, and can remap ublk driver's io * request into ublksrv's vm space -- cgit v1.2.3 From 1172d5b8beca6b899deb9f7f2850e7e47ec16198 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 19 May 2023 14:50:30 +0800 Subject: ublk: support user copy Currently copy between io request buffer(pages) and userspace buffer is done inside ublk_map_io() or ublk_unmap_io(). This way performs very well in case of pre-allocated userspace io buffer. For dynamically allocated or external userspace backend io buffer, UBLK_F_NEED_GET_DATA is added for ublk server to provide buffer by one extra command communication for WRITE request. For READ, userspace simply provides buffer, but can't know when the buffer is done[1]. Add UBLK_F_USER_COPY by moving io data copy out of kernel by providing read()/write() on /dev/ublkcN, and simply let ublk server do the io data copy. This way makes both side cleaner, the cost is that one extra syscall for copy io data between request and backend buffer. With UBLK_F_USER_COPY, it actually becomes possible to run per-io zero copy now, such as, only do zero copy for big size IO, so it can be thought as one prep patch for supporting zero copy. Meantime zero copy still needs to expose read()/write() buffer for some corner case, such as passthrough IO. [1] READ buffer in UBLK_F_NEED_GET_DATA https://lore.kernel.org/linux-block/116d8a56-0881-56d3-9bcc-78ff3e1dc4e5@linux.alibaba.com/T/#m23bd4b8634c0a054e6797063167b469949a247bb ublksrv loop usercopy code: https://github.com/ming1/ubdsrv/commits/usercopy Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230519065030.351216-8-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 58 +++++++++++++++++++++++++++++++++++-------- include/uapi/linux/ublk_cmd.h | 3 +++ 2 files changed, 50 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index ec40ac4f9af3..e00733b6fea8 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -55,7 +55,8 @@ | UBLK_F_USER_RECOVERY \ | UBLK_F_USER_RECOVERY_REISSUE \ | UBLK_F_UNPRIVILEGED_DEV \ - | UBLK_F_CMD_IOCTL_ENCODE) + | UBLK_F_CMD_IOCTL_ENCODE \ + | UBLK_F_USER_COPY) /* All UBLK_PARAM_TYPE_* should be included here */ #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \ @@ -312,9 +313,18 @@ static int ublk_apply_params(struct ublk_device *ub) return 0; } +static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_USER_COPY; +} + static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) { - return false; + /* + * read()/write() is involved in user copy, so request reference + * has to be grabbed + */ + return ublk_support_user_copy(ubq); } static inline void ublk_init_req_ref(const struct ublk_queue *ubq, @@ -591,6 +601,9 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, { const unsigned int rq_bytes = blk_rq_bytes(req); + if (ublk_support_user_copy(ubq)) + return rq_bytes; + /* * no zero copy, we delay copy WRITE request data into ublksrv * context and the big benefit is that pinning pages in current @@ -615,6 +628,9 @@ static int ublk_unmap_io(const struct ublk_queue *ubq, { const unsigned int rq_bytes = blk_rq_bytes(req); + if (ublk_support_user_copy(ubq)) + return rq_bytes; + if (ublk_need_unmap_req(req)) { struct iov_iter iter; struct iovec iov; @@ -1390,6 +1406,11 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA)) goto out; + if (ublk_support_user_copy(ubq) && ub_cmd->addr) { + ret = -EINVAL; + goto out; + } + ret = ublk_check_cmd_op(cmd_op); if (ret) goto out; @@ -1408,23 +1429,34 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, */ if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) goto out; - /* FETCH_RQ has to provide IO buffer if NEED GET DATA is not enabled */ - if (!ub_cmd->addr && !ublk_need_get_data(ubq)) - goto out; + + if (!ublk_support_user_copy(ubq)) { + /* + * FETCH_RQ has to provide IO buffer if NEED GET + * DATA is not enabled + */ + if (!ub_cmd->addr && !ublk_need_get_data(ubq)) + goto out; + } ublk_fill_io_cmd(io, cmd, ub_cmd->addr); ublk_mark_io_ready(ub, ubq); break; case UBLK_IO_COMMIT_AND_FETCH_REQ: req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag); - /* - * COMMIT_AND_FETCH_REQ has to provide IO buffer if NEED GET DATA is - * not enabled or it is Read IO. - */ - if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || req_op(req) == REQ_OP_READ)) - goto out; + if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) goto out; + + if (!ublk_support_user_copy(ubq)) { + /* + * COMMIT_AND_FETCH_REQ has to provide IO buffer if + * NEED GET DATA is not enabled or it is Read IO. + */ + if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || + req_op(req) == REQ_OP_READ)) + goto out; + } ublk_fill_io_cmd(io, cmd, ub_cmd->addr); ublk_commit_completion(ub, ub_cmd); break; @@ -1996,6 +2028,10 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE | UBLK_F_URING_CMD_COMP_IN_TASK; + /* GET_DATA isn't needed any more with USER_COPY */ + if (ub->dev_info.flags & UBLK_F_USER_COPY) + ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; + /* We are not ready to support zero copy */ ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index c0c1632c671e..54b5b0aeefca 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -165,6 +165,9 @@ /* use ioctl encoding for uring command */ #define UBLK_F_CMD_IOCTL_ENCODE (1UL << 6) +/* Copy between request and user buffer by pread()/pwrite() */ +#define UBLK_F_USER_COPY (1UL << 7) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 -- cgit v1.2.3 From eca2040972b411ec27483bf75dc8b84e730e88ff Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 11 May 2023 03:13:34 +0200 Subject: scsi: block: ioprio: Clean up interface definition The I/O priority user interface defines the 16-bits ioprio values as the combination of the upper 3-bits for an I/O priority class and the lower 13-bits as priority data. However, the kernel only uses the lower 3-bits of the priority data to define priority levels for the RT and BE priority classes. The data part of an ioprio value is completely ignored for the IDLE and NONE classes. This is enforced by checks done in ioprio_check_cap(), which is called for all paths that allow defining an I/O priority for I/Os: the per-context ioprio_set() system call, aio interface and io_uring interface. Clarify this fact in the uapi ioprio.h header file and introduce the IOPRIO_PRIO_LEVEL_MASK and IOPRIO_PRIO_LEVEL() macros for users to define and get priority levels in an ioprio value. The coarser macro IOPRIO_PRIO_DATA() is retained for backward compatibility with old applications already using it. There is no functional change introduced with this. In-kernel users of the IOPRIO_PRIO_DATA() macro which are explicitly handling I/O priority data as a priority level are modified to use the new IOPRIO_PRIO_LEVEL() macro without any functional change. Since f2fs is the only user of this macro not explicitly using that value as a priority level, it is left unchanged. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Niklas Cassel Link: https://lore.kernel.org/r/20230511011356.227789-2-nks@flawful.org Signed-off-by: Martin K. Petersen --- block/bfq-iosched.c | 8 ++++---- block/ioprio.c | 6 +++--- include/uapi/linux/ioprio.h | 19 ++++++++++++++----- 3 files changed, 21 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 3164e3177965..3067b75f3fd0 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5524,16 +5524,16 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) bfqq->new_ioprio_class = task_nice_ioclass(tsk); break; case IOPRIO_CLASS_RT: - bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->new_ioprio = IOPRIO_PRIO_LEVEL(bic->ioprio); bfqq->new_ioprio_class = IOPRIO_CLASS_RT; break; case IOPRIO_CLASS_BE: - bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->new_ioprio = IOPRIO_PRIO_LEVEL(bic->ioprio); bfqq->new_ioprio_class = IOPRIO_CLASS_BE; break; case IOPRIO_CLASS_IDLE: bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; - bfqq->new_ioprio = 7; + bfqq->new_ioprio = IOPRIO_NR_LEVELS - 1; break; } @@ -5830,7 +5830,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bfq_io_cq *bic, bool respawn) { - const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + const int ioprio = IOPRIO_PRIO_LEVEL(bic->ioprio); const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); struct bfq_queue **async_bfqq = NULL; struct bfq_queue *bfqq; diff --git a/block/ioprio.c b/block/ioprio.c index 32a456b45804..f0d9e818abc5 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -33,7 +33,7 @@ int ioprio_check_cap(int ioprio) { int class = IOPRIO_PRIO_CLASS(ioprio); - int data = IOPRIO_PRIO_DATA(ioprio); + int level = IOPRIO_PRIO_LEVEL(ioprio); switch (class) { case IOPRIO_CLASS_RT: @@ -49,13 +49,13 @@ int ioprio_check_cap(int ioprio) fallthrough; /* rt has prio field too */ case IOPRIO_CLASS_BE: - if (data >= IOPRIO_NR_LEVELS || data < 0) + if (level >= IOPRIO_NR_LEVELS) return -EINVAL; break; case IOPRIO_CLASS_IDLE: break; case IOPRIO_CLASS_NONE: - if (data) + if (level) return -EINVAL; break; default: diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h index f70f2596a6bf..4444b4e4fdad 100644 --- a/include/uapi/linux/ioprio.h +++ b/include/uapi/linux/ioprio.h @@ -17,7 +17,7 @@ ((data) & IOPRIO_PRIO_MASK)) /* - * These are the io priority groups as implemented by the BFQ and mq-deadline + * These are the io priority classes as implemented by the BFQ and mq-deadline * schedulers. RT is the realtime class, it always gets premium service. For * ATA disks supporting NCQ IO priority, RT class IOs will be processed using * high priority NCQ commands. BE is the best-effort scheduling class, the @@ -32,11 +32,20 @@ enum { }; /* - * The RT and BE priority classes both support up to 8 priority levels. + * The RT and BE priority classes both support up to 8 priority levels that + * can be specified using the lower 3-bits of the priority data. */ -#define IOPRIO_NR_LEVELS 8 -#define IOPRIO_BE_NR IOPRIO_NR_LEVELS +#define IOPRIO_LEVEL_NR_BITS 3 +#define IOPRIO_NR_LEVELS (1 << IOPRIO_LEVEL_NR_BITS) +#define IOPRIO_LEVEL_MASK (IOPRIO_NR_LEVELS - 1) +#define IOPRIO_PRIO_LEVEL(ioprio) ((ioprio) & IOPRIO_LEVEL_MASK) +#define IOPRIO_BE_NR IOPRIO_NR_LEVELS + +/* + * Possible values for the "which" argument of the ioprio_get() and + * ioprio_set() system calls (see "man ioprio_set"). + */ enum { IOPRIO_WHO_PROCESS = 1, IOPRIO_WHO_PGRP, @@ -44,7 +53,7 @@ enum { }; /* - * Fallback BE priority level. + * Fallback BE class priority level. */ #define IOPRIO_NORM 4 #define IOPRIO_BE_NORM IOPRIO_NORM -- cgit v1.2.3 From 6c913257226a25879bfd6226e0ee265e98904ce6 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 11 May 2023 03:13:35 +0200 Subject: scsi: block: Introduce ioprio hints I/O priorities currently only use 6-bits of the 16-bits ioprio value: the 3-upper bits are used to define up to 8 priority classes (4 of which are valid) and the 3 lower bits of the value are used to define a priority level for the real-time and best-effort class. The remaining 10-bits between the I/O priority class and level are unused, and in fact, cannot be used by the user as doing so would either result in the value being completely ignored, or in an error returned by ioprio_check_cap(). Use these 10-bits of an ioprio value to allow a user to specify I/O hints. An I/O hint is defined as a 10-bitsvalue, allowing up to 1023 different hints to be specified, with the value 0 being reserved as the "no hint" case. An I/O hint can apply to any I/O that specifies a valid priority class other than NONE, regardless of the I/O priority level specified. To do so, the macros IOPRIO_PRIO_HINT() and IOPRIO_PRIO_VALUE_HINT() are introduced in include/uapi/linux/ioprio.h to respectively allow a user to get and set a hint in an ioprio value. To support the ATA and SCSI command duration limits feature, 7 hints are defined: IOPRIO_HINT_DEV_DURATION_LIMIT_1 to IOPRIO_HINT_DEV_DURATION_LIMIT_7, allowing a user to specify which command duration limit descriptor should be applied to the commands serving an I/O. Specifying these hints has for now no effect whatsoever if the target block devices do not support the command duration limits feature. However, in the future, block I/O schedulers can be modified to optimize I/O issuing order based on these hints, even for devices that do not support the command duration limits feature. Given that the 7 duration limits hints defined have no effect on any block layer component, the actual definition of the duration limits implied by these hints remains at the device level. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Niklas Cassel Link: https://lore.kernel.org/r/20230511011356.227789-3-nks@flawful.org Signed-off-by: Martin K. Petersen --- include/uapi/linux/ioprio.h | 49 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h index 4444b4e4fdad..4c4806e8230b 100644 --- a/include/uapi/linux/ioprio.h +++ b/include/uapi/linux/ioprio.h @@ -58,4 +58,53 @@ enum { #define IOPRIO_NORM 4 #define IOPRIO_BE_NORM IOPRIO_NORM +/* + * The 10 bits between the priority class and the priority level are used to + * optionally define I/O hints for any combination of I/O priority class and + * level. Depending on the kernel configuration, I/O scheduler being used and + * the target I/O device being used, hints can influence how I/Os are processed + * without affecting the I/O scheduling ordering defined by the I/O priority + * class and level. + */ +#define IOPRIO_HINT_SHIFT IOPRIO_LEVEL_NR_BITS +#define IOPRIO_HINT_NR_BITS 10 +#define IOPRIO_NR_HINTS (1 << IOPRIO_HINT_NR_BITS) +#define IOPRIO_HINT_MASK (IOPRIO_NR_HINTS - 1) +#define IOPRIO_PRIO_HINT(ioprio) \ + (((ioprio) >> IOPRIO_HINT_SHIFT) & IOPRIO_HINT_MASK) + +/* + * Alternate macro for IOPRIO_PRIO_VALUE() to define an I/O priority with + * a class, level and hint. + */ +#define IOPRIO_PRIO_VALUE_HINT(class, level, hint) \ + ((((class) & IOPRIO_CLASS_MASK) << IOPRIO_CLASS_SHIFT) | \ + (((hint) & IOPRIO_HINT_MASK) << IOPRIO_HINT_SHIFT) | \ + ((level) & IOPRIO_LEVEL_MASK)) + +/* + * I/O hints. + */ +enum { + /* No hint */ + IOPRIO_HINT_NONE = 0, + + /* + * Device command duration limits: indicate to the device a desired + * duration limit for the commands that will be used to process an I/O. + * These will currently only be effective for SCSI and ATA devices that + * support the command duration limits feature. If this feature is + * enabled, then the commands issued to the device to process an I/O with + * one of these hints set will have the duration limit index (dld field) + * set to the value of the hint. + */ + IOPRIO_HINT_DEV_DURATION_LIMIT_1 = 1, + IOPRIO_HINT_DEV_DURATION_LIMIT_2 = 2, + IOPRIO_HINT_DEV_DURATION_LIMIT_3 = 3, + IOPRIO_HINT_DEV_DURATION_LIMIT_4 = 4, + IOPRIO_HINT_DEV_DURATION_LIMIT_5 = 5, + IOPRIO_HINT_DEV_DURATION_LIMIT_6 = 6, + IOPRIO_HINT_DEV_DURATION_LIMIT_7 = 7, +}; + #endif /* _UAPI_LINUX_IOPRIO_H */ -- cgit v1.2.3 From cb8edce28073a906401c9e421eca7c99f3396da1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 15 May 2023 16:48:06 -0700 Subject: bpf: Support O_PATH FDs in BPF_OBJ_PIN and BPF_OBJ_GET commands Current UAPI of BPF_OBJ_PIN and BPF_OBJ_GET commands of bpf() syscall forces users to specify pinning location as a string-based absolute or relative (to current working directory) path. This has various implications related to security (e.g., symlink-based attacks), forces BPF FS to be exposed in the file system, which can cause races with other applications. One of the feedbacks we got from folks working with containers heavily was that inability to use purely FD-based location specification was an unfortunate limitation and hindrance for BPF_OBJ_PIN and BPF_OBJ_GET commands. This patch closes this oversight, adding path_fd field to BPF_OBJ_PIN and BPF_OBJ_GET UAPI, following conventions established by *at() syscalls for dirfd + pathname combinations. This now allows interesting possibilities like working with detached BPF FS mount (e.g., to perform multiple pinnings without running a risk of someone interfering with them), and generally making pinning/getting more secure and not prone to any races and/or security attacks. This is demonstrated by a selftest added in subsequent patch that takes advantage of new mount APIs (fsopen, fsconfig, fsmount) to demonstrate creating detached BPF FS mount, pinning, and then getting BPF map out of it, all while never exposing this private instance of BPF FS to outside worlds. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Reviewed-by: Christian Brauner Link: https://lore.kernel.org/bpf/20230523170013.728457-4-andrii@kernel.org --- include/linux/bpf.h | 4 ++-- include/uapi/linux/bpf.h | 10 ++++++++++ kernel/bpf/inode.c | 16 ++++++++-------- kernel/bpf/syscall.c | 25 ++++++++++++++++++++----- tools/include/uapi/linux/bpf.h | 10 ++++++++++ 5 files changed, 50 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 36e4b2d8cca2..f58895830ada 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2077,8 +2077,8 @@ struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd); struct bpf_link *bpf_link_get_from_fd(u32 ufd); struct bpf_link *bpf_link_get_curr_or_next(u32 *id); -int bpf_obj_pin_user(u32 ufd, const char __user *pathname); -int bpf_obj_get_user(const char __user *pathname, int flags); +int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname); +int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags); #define BPF_ITER_FUNC_PREFIX "bpf_iter_" #define DEFINE_BPF_ITER_FUNC(target, args...) \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1bb11a6ee667..9273c654743c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1272,6 +1272,9 @@ enum { /* Create a map that will be registered/unregesitered by the backed bpf_link */ BPF_F_LINK = (1U << 13), + +/* Get path from provided FD in BPF_OBJ_PIN/BPF_OBJ_GET commands */ + BPF_F_PATH_FD = (1U << 14), }; /* Flags for BPF_PROG_QUERY. */ @@ -1420,6 +1423,13 @@ union bpf_attr { __aligned_u64 pathname; __u32 bpf_fd; __u32 file_flags; + /* Same as dirfd in openat() syscall; see openat(2) + * manpage for details of path FD and pathname semantics; + * path_fd should accompanied by BPF_F_PATH_FD flag set in + * file_flags field, otherwise it should be set to zero; + * if BPF_F_PATH_FD flag is not set, AT_FDCWD is assumed. + */ + __s32 path_fd; }; struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 329f27d5cacf..4174f76133df 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -435,7 +435,7 @@ static int bpf_iter_link_pin_kernel(struct dentry *parent, return ret; } -static int bpf_obj_do_pin(const char __user *pathname, void *raw, +static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, enum bpf_type type) { struct dentry *dentry; @@ -444,7 +444,7 @@ static int bpf_obj_do_pin(const char __user *pathname, void *raw, umode_t mode; int ret; - dentry = user_path_create(AT_FDCWD, pathname, &path, 0); + dentry = user_path_create(path_fd, pathname, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -477,7 +477,7 @@ out: return ret; } -int bpf_obj_pin_user(u32 ufd, const char __user *pathname) +int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname) { enum bpf_type type; void *raw; @@ -487,14 +487,14 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname) if (IS_ERR(raw)) return PTR_ERR(raw); - ret = bpf_obj_do_pin(pathname, raw, type); + ret = bpf_obj_do_pin(path_fd, pathname, raw, type); if (ret != 0) bpf_any_put(raw, type); return ret; } -static void *bpf_obj_do_get(const char __user *pathname, +static void *bpf_obj_do_get(int path_fd, const char __user *pathname, enum bpf_type *type, int flags) { struct inode *inode; @@ -502,7 +502,7 @@ static void *bpf_obj_do_get(const char __user *pathname, void *raw; int ret; - ret = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW, &path); + ret = user_path_at(path_fd, pathname, LOOKUP_FOLLOW, &path); if (ret) return ERR_PTR(ret); @@ -526,7 +526,7 @@ out: return ERR_PTR(ret); } -int bpf_obj_get_user(const char __user *pathname, int flags) +int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags) { enum bpf_type type = BPF_TYPE_UNSPEC; int f_flags; @@ -537,7 +537,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags) if (f_flags < 0) return f_flags; - raw = bpf_obj_do_get(pathname, &type, f_flags); + raw = bpf_obj_do_get(path_fd, pathname, &type, f_flags); if (IS_ERR(raw)) return PTR_ERR(raw); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b2621089904b..c7f6807215e6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2697,23 +2697,38 @@ free_prog: return err; } -#define BPF_OBJ_LAST_FIELD file_flags +#define BPF_OBJ_LAST_FIELD path_fd static int bpf_obj_pin(const union bpf_attr *attr) { - if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0) + int path_fd; + + if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_PATH_FD) + return -EINVAL; + + /* path_fd has to be accompanied by BPF_F_PATH_FD flag */ + if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd) return -EINVAL; - return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname)); + path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD; + return bpf_obj_pin_user(attr->bpf_fd, path_fd, + u64_to_user_ptr(attr->pathname)); } static int bpf_obj_get(const union bpf_attr *attr) { + int path_fd; + if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 || - attr->file_flags & ~BPF_OBJ_FLAG_MASK) + attr->file_flags & ~(BPF_OBJ_FLAG_MASK | BPF_F_PATH_FD)) + return -EINVAL; + + /* path_fd has to be accompanied by BPF_F_PATH_FD flag */ + if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd) return -EINVAL; - return bpf_obj_get_user(u64_to_user_ptr(attr->pathname), + path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD; + return bpf_obj_get_user(path_fd, u64_to_user_ptr(attr->pathname), attr->file_flags); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1bb11a6ee667..9273c654743c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1272,6 +1272,9 @@ enum { /* Create a map that will be registered/unregesitered by the backed bpf_link */ BPF_F_LINK = (1U << 13), + +/* Get path from provided FD in BPF_OBJ_PIN/BPF_OBJ_GET commands */ + BPF_F_PATH_FD = (1U << 14), }; /* Flags for BPF_PROG_QUERY. */ @@ -1420,6 +1423,13 @@ union bpf_attr { __aligned_u64 pathname; __u32 bpf_fd; __u32 file_flags; + /* Same as dirfd in openat() syscall; see openat(2) + * manpage for details of path FD and pathname semantics; + * path_fd should accompanied by BPF_F_PATH_FD flag set in + * file_flags field, otherwise it should be set to zero; + * if BPF_F_PATH_FD flag is not set, AT_FDCWD is assumed. + */ + __s32 path_fd; }; struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ -- cgit v1.2.3 From 6c8017c6a58d06c2fcce3b034944ad056ccf02ce Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Thu, 11 May 2023 08:44:38 -0700 Subject: vfio/pci: Clear VFIO_IRQ_INFO_NORESIZE for MSI-X Dynamic MSI-X is supported. Clear VFIO_IRQ_INFO_NORESIZE to provide guidance to user space. Signed-off-by: Reinette Chatre Reviewed-by: Kevin Tian Acked-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/fd1ef2bf6ae972da8e2805bc95d5155af5a8fb0a.1683740667.git.reinette.chatre@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 2 +- include/uapi/linux/vfio.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index a3635a8e54c8..ec7e662de033 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1114,7 +1114,7 @@ static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev, if (info.index == VFIO_PCI_INTX_IRQ_INDEX) info.flags |= (VFIO_IRQ_INFO_MASKABLE | VFIO_IRQ_INFO_AUTOMASKED); - else + else if (info.index != VFIO_PCI_MSIX_IRQ_INDEX || !vdev->has_dyn_msix) info.flags |= VFIO_IRQ_INFO_NORESIZE; return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 0552e8dcf0cb..1a36134cae5c 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -511,6 +511,9 @@ struct vfio_region_info_cap_nvlink2_lnkspd { * then add and unmask vectors, it's up to userspace to make the decision * whether to allocate the maximum supported number of vectors or tear * down setup and incrementally increase the vectors as each is enabled. + * Absence of the NORESIZE flag indicates that vectors can be enabled + * and disabled dynamically without impacting other vectors within the + * index. */ struct vfio_irq_info { __u32 argsz; -- cgit v1.2.3 From e9261467ae86a6544bb602a55a1eab52696e71e3 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 23 May 2023 11:15:48 +0100 Subject: net: mdio: add clause 73 to ethtool conversion helper Add a helper to convert a clause 73 advertisement to an ethtool bitmap. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Signed-off-by: Jakub Kicinski --- include/linux/mdio.h | 39 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/mdio.h | 24 ++++++++++++++++++++++++ 2 files changed, 63 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 27013d6bf24a..0670cc6e067c 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -486,6 +486,45 @@ static inline u32 linkmode_adv_to_mii_10base_t1_t(unsigned long *adv) return result; } +/** + * mii_c73_mod_linkmode - convert a Clause 73 advertisement to linkmodes + * @adv: linkmode advertisement setting + * @lpa: array of three u16s containing the advertisement + * + * Convert an IEEE 802.3 Clause 73 advertisement to ethtool link modes. + */ +static inline void mii_c73_mod_linkmode(unsigned long *adv, u16 *lpa) +{ + linkmode_mod_bit(ETHTOOL_LINK_MODE_Pause_BIT, + adv, lpa[0] & MDIO_AN_C73_0_PAUSE); + linkmode_mod_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, + adv, lpa[0] & MDIO_AN_C73_0_ASM_DIR); + linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT, + adv, lpa[1] & MDIO_AN_C73_1_1000BASE_KX); + linkmode_mod_bit(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT, + adv, lpa[1] & MDIO_AN_C73_1_10GBASE_KX4); + linkmode_mod_bit(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT, + adv, lpa[1] & MDIO_AN_C73_1_40GBASE_KR4); + linkmode_mod_bit(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT, + adv, lpa[1] & MDIO_AN_C73_1_40GBASE_CR4); + /* 100GBASE_CR10 and 100GBASE_KP4 not implemented */ + linkmode_mod_bit(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT, + adv, lpa[1] & MDIO_AN_C73_1_100GBASE_KR4); + linkmode_mod_bit(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT, + adv, lpa[1] & MDIO_AN_C73_1_100GBASE_CR4); + /* 25GBASE_R_S not implemented */ + /* The 25GBASE_R bit can be used for 25Gbase KR or CR modes */ + linkmode_mod_bit(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT, + adv, lpa[1] & MDIO_AN_C73_1_25GBASE_R); + linkmode_mod_bit(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT, + adv, lpa[1] & MDIO_AN_C73_1_25GBASE_R); + linkmode_mod_bit(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT, + adv, lpa[1] & MDIO_AN_C73_1_10GBASE_KR); + linkmode_mod_bit(ETHTOOL_LINK_MODE_2500baseX_Full_BIT, + adv, lpa[2] & MDIO_AN_C73_2_2500BASE_KX); + /* 5GBASE_KR not implemented */ +} + int __mdiobus_read(struct mii_bus *bus, int addr, u32 regnum); int __mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val); int __mdiobus_modify_changed(struct mii_bus *bus, int addr, u32 regnum, diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h index 256b463e47a6..b826598d1e94 100644 --- a/include/uapi/linux/mdio.h +++ b/include/uapi/linux/mdio.h @@ -231,6 +231,30 @@ #define MDIO_PMA_EXTABLE_BT1 0x0800 /* BASE-T1 ability */ #define MDIO_PMA_EXTABLE_NBT 0x4000 /* 2.5/5GBASE-T ability */ +/* AN Clause 73 linkword */ +#define MDIO_AN_C73_0_S_MASK GENMASK(4, 0) +#define MDIO_AN_C73_0_E_MASK GENMASK(9, 5) +#define MDIO_AN_C73_0_PAUSE BIT(10) +#define MDIO_AN_C73_0_ASM_DIR BIT(11) +#define MDIO_AN_C73_0_C2 BIT(12) +#define MDIO_AN_C73_0_RF BIT(13) +#define MDIO_AN_C73_0_ACK BIT(14) +#define MDIO_AN_C73_0_NP BIT(15) +#define MDIO_AN_C73_1_T_MASK GENMASK(4, 0) +#define MDIO_AN_C73_1_1000BASE_KX BIT(5) +#define MDIO_AN_C73_1_10GBASE_KX4 BIT(6) +#define MDIO_AN_C73_1_10GBASE_KR BIT(7) +#define MDIO_AN_C73_1_40GBASE_KR4 BIT(8) +#define MDIO_AN_C73_1_40GBASE_CR4 BIT(9) +#define MDIO_AN_C73_1_100GBASE_CR10 BIT(10) +#define MDIO_AN_C73_1_100GBASE_KP4 BIT(11) +#define MDIO_AN_C73_1_100GBASE_KR4 BIT(12) +#define MDIO_AN_C73_1_100GBASE_CR4 BIT(13) +#define MDIO_AN_C73_1_25GBASE_R_S BIT(14) +#define MDIO_AN_C73_1_25GBASE_R BIT(15) +#define MDIO_AN_C73_2_2500BASE_KX BIT(0) +#define MDIO_AN_C73_2_5GBASE_KR BIT(1) + /* PHY XGXS lane state register. */ #define MDIO_PHYXS_LNSTAT_SYNC0 0x0001 #define MDIO_PHYXS_LNSTAT_SYNC1 0x0002 -- cgit v1.2.3 From 96b2b072ee62be8ae68c8ecf14854c4d0505a8f8 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 2 May 2023 15:48:16 +0300 Subject: exportfs: allow exporting non-decodeable file handles to userspace Some userspace programs use st_ino as a unique object identifier, even though inode numbers may be recycable. This issue has been addressed for NFS export long ago using the exportfs file handle API and the unique file handle identifiers are also exported to userspace via name_to_handle_at(2). fanotify also uses file handles to identify objects in events, but only for filesystems that support NFS export. Relax the requirement for NFS export support and allow more filesystems to export a unique object identifier via name_to_handle_at(2) with the flag AT_HANDLE_FID. A file handle requested with the AT_HANDLE_FID flag, may or may not be usable as an argument to open_by_handle_at(2). To allow filesystems to opt-in to supporting AT_HANDLE_FID, a struct export_operations is required, but even an empty struct is sufficient for encoding FIDs. Acked-by: Jeff Layton Acked-by: Chuck Lever Signed-off-by: Amir Goldstein Acked-by: Christian Brauner Signed-off-by: Jan Kara Message-Id: <20230502124817.3070545-4-amir73il@gmail.com> --- fs/fhandle.c | 22 ++++++++++++++-------- include/uapi/linux/fcntl.h | 5 +++++ 2 files changed, 19 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/fhandle.c b/fs/fhandle.c index f2bc27d1975e..4a635cf787fc 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -16,7 +16,7 @@ static long do_sys_name_to_handle(const struct path *path, struct file_handle __user *ufh, - int __user *mnt_id) + int __user *mnt_id, int fh_flags) { long retval; struct file_handle f_handle; @@ -24,11 +24,14 @@ static long do_sys_name_to_handle(const struct path *path, struct file_handle *handle = NULL; /* - * We need to make sure whether the file system - * support decoding of the file handle + * We need to make sure whether the file system support decoding of + * the file handle if decodeable file handle was requested. + * Otherwise, even empty export_operations are sufficient to opt-in + * to encoding FIDs. */ if (!path->dentry->d_sb->s_export_op || - !path->dentry->d_sb->s_export_op->fh_to_dentry) + (!(fh_flags & EXPORT_FH_FID) && + !path->dentry->d_sb->s_export_op->fh_to_dentry)) return -EOPNOTSUPP; if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) @@ -45,10 +48,10 @@ static long do_sys_name_to_handle(const struct path *path, /* convert handle size to multiple of sizeof(u32) */ handle_dwords = f_handle.handle_bytes >> 2; - /* we ask for a non connected handle */ + /* we ask for a non connectable maybe decodeable file handle */ retval = exportfs_encode_fh(path->dentry, (struct fid *)handle->f_handle, - &handle_dwords, 0); + &handle_dwords, fh_flags); handle->handle_type = retval; /* convert handle size to bytes */ handle_bytes = handle_dwords * sizeof(u32); @@ -84,6 +87,7 @@ static long do_sys_name_to_handle(const struct path *path, * @handle: resulting file handle * @mnt_id: mount id of the file system containing the file * @flag: flag value to indicate whether to follow symlink or not + * and whether a decodable file handle is required. * * @handle->handle_size indicate the space available to store the * variable part of the file handle in bytes. If there is not @@ -96,17 +100,19 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, { struct path path; int lookup_flags; + int fh_flags; int err; - if ((flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) + if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID)) return -EINVAL; lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0; + fh_flags = (flag & AT_HANDLE_FID) ? EXPORT_FH_FID : 0; if (flag & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; err = user_path_at(dfd, name, lookup_flags, &path); if (!err) { - err = do_sys_name_to_handle(&path, handle, mnt_id); + err = do_sys_name_to_handle(&path, handle, mnt_id, fh_flags); path_put(&path); } return err; diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index e8c07da58c9f..6c80f96049bd 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -112,4 +112,9 @@ #define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ +/* Flags for name_to_handle_at(2). We reuse AT_ flag space to save bits... */ +#define AT_HANDLE_FID AT_REMOVEDIR /* file handle is needed to + compare object identity and may not + be usable to open_by_handle_at(2) */ + #endif /* _UAPI_LINUX_FCNTL_H */ -- cgit v1.2.3 From 3f6375a2d1956739c6c8ffa3a862c9278d346940 Mon Sep 17 00:00:00 2001 From: Daniel Lundberg Pedersen Date: Mon, 1 May 2023 16:57:06 +0200 Subject: media: videodev2.h: Fix p_s32 and p_s64 pointer types Use the intended pointer types for p_s32 and p_64 in the union of the struct v4l2_ext_control. Fixes: e77eb66342c7 ("videodev2.h: add p_s32 and p_s64 pointers") Signed-off-by: Daniel Lundberg Pedersen Signed-off-by: Hans Verkuil --- include/uapi/linux/videodev2.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index aee75eb9e686..9e7cf1d36945 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -1807,8 +1807,8 @@ struct v4l2_ext_control { __u8 __user *p_u8; __u16 __user *p_u16; __u32 __user *p_u32; - __u32 __user *p_s32; - __u32 __user *p_s64; + __s32 __user *p_s32; + __s64 __user *p_s64; struct v4l2_area __user *p_area; struct v4l2_ctrl_h264_sps __user *p_h264_sps; struct v4l2_ctrl_h264_pps *p_h264_pps; -- cgit v1.2.3 From 26ae58f65e64fa7ba61d64bae752e59e08380c6a Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Thu, 18 May 2023 15:36:49 +0200 Subject: media: videodev2.h: Fix struct v4l2_input tuner index comment VIDIOC_ENUMINPUT documentation describes the tuner field of struct v4l2_input as index: Documentation/userspace-api/media/v4l/vidioc-enuminput.rst " * - __u32 - ``tuner`` - Capture devices can have zero or more tuners (RF demodulators). When the ``type`` is set to ``V4L2_INPUT_TYPE_TUNER`` this is an RF connector and this field identifies the tuner. It corresponds to struct :c:type:`v4l2_tuner` field ``index``. For details on tuners see :ref:`tuner`. " Drivers I could find also use the 'tuner' field as an index, e.g.: drivers/media/pci/bt8xx/bttv-driver.c bttv_enum_input() drivers/media/usb/go7007/go7007-v4l2.c vidioc_enum_input() However, the UAPI comment claims this field is 'enum v4l2_tuner_type': include/uapi/linux/videodev2.h This field being 'enum v4l2_tuner_type' is unlikely as it seems to be never used that way in drivers, and documentation confirms it. It seem this comment got in accidentally in the commit which this patch fixes. Fix the UAPI comment to stop confusion. This was pointed out by Dmitry while reviewing VIDIOC_ENUMINPUT support for strace. Fixes: 6016af82eafc ("[media] v4l2: use __u32 rather than enums in ioctl() structs") Signed-off-by: Marek Vasut Signed-off-by: Hans Verkuil --- include/uapi/linux/videodev2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 9e7cf1d36945..5d8bd754c69f 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -1720,7 +1720,7 @@ struct v4l2_input { __u8 name[32]; /* Label */ __u32 type; /* Type of input */ __u32 audioset; /* Associated audios (bitfield) */ - __u32 tuner; /* enum v4l2_tuner_type */ + __u32 tuner; /* Tuner index */ v4l2_std_id std; __u32 status; __u32 capabilities; -- cgit v1.2.3 From ae440c5da33cdb90a109f2df2a0360c67b3fab7e Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Wed, 24 May 2023 16:07:38 +0800 Subject: media: uapi: HEVC: Add num_delta_pocs_of_ref_rps_idx field Some drivers firmwares parse by themselves slice header and need num_delta_pocs_of_ref_rps_idx value to parse slice header short_term_ref_pic_set(). Use one of the 4 reserved bytes to store this value without changing the v4l2_ctrl_hevc_decode_params structure size and padding. This value also exist in DXVA API. Signed-off-by: Benjamin Gaignard Signed-off-by: Yunfei Dong Reviewed-by: Nicolas Dufresne Signed-off-by: Hans Verkuil [hverkuil: fix typo in num_delta_pocs_of_ref_rps_idx doc] --- .../userspace-api/media/v4l/ext-ctrls-codec-stateless.rst | 7 +++++++ include/uapi/linux/v4l2-controls.h | 6 +++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst index 3d8411acd5b8..677bf0fbcc5a 100644 --- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst +++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst @@ -2923,6 +2923,13 @@ This structure contains all loop filter related parameters. See sections - ``poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]`` - PocLtCurr as described in section 8.3.2 "Decoding process for reference picture set": provides the index of the long term references in DPB array. + * - __u8 + - ``num_delta_pocs_of_ref_rps_idx`` + - When the short_term_ref_pic_set_sps_flag in the slice header is equal to 0, + it is the same as the derived value NumDeltaPocs[RefRpsIdx]. It can be used to parse + the RPS data in slice headers instead of skipping it with @short_term_ref_pic_set_size. + When the value of short_term_ref_pic_set_sps_flag in the slice header is + equal to 1, num_delta_pocs_of_ref_rps_idx shall be set to 0. * - struct :c:type:`v4l2_hevc_dpb_entry` - ``dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]`` - The decoded picture buffer, for meta-data about reference frames. diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 5e80daa4ffe0..7bf59a87a1bf 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -2385,6 +2385,9 @@ struct v4l2_ctrl_hevc_slice_params { * @poc_st_curr_after: provides the index of the short term after references * in DPB array * @poc_lt_curr: provides the index of the long term references in DPB array + * @num_delta_pocs_of_ref_rps_idx: same as the derived value NumDeltaPocs[RefRpsIdx], + * can be used to parse the RPS data in slice headers + * instead of skipping it with @short_term_ref_pic_set_size. * @reserved: padding field. Should be zeroed by applications. * @dpb: the decoded picture buffer, for meta-data about reference frames * @flags: see V4L2_HEVC_DECODE_PARAM_FLAG_{} @@ -2400,7 +2403,8 @@ struct v4l2_ctrl_hevc_decode_params { __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; - __u8 reserved[4]; + __u8 num_delta_pocs_of_ref_rps_idx; + __u8 reserved[3]; struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; __u64 flags; }; -- cgit v1.2.3 From 98b9564243805810b7d412368e512bd7b1c3837e Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 24 Apr 2023 12:52:12 +0300 Subject: media: uapi: Use unsigned int values for assigning bits in u32 fields Use unsigned int values annoted by "U" for u32 fields. While this is a good practice, there doesn't appear to be a bug that this patch would fix. The patch has been generated using the following command: perl -i -pe 's/\([0-9]+\K < Reviewed-by: Laurent Pinchart Signed-off-by: Hans Verkuil --- include/uapi/linux/media.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h index 3ddadaea849f..1c80b1d6bbaf 100644 --- a/include/uapi/linux/media.h +++ b/include/uapi/linux/media.h @@ -140,8 +140,8 @@ struct media_device_info { #define MEDIA_ENT_F_DV_ENCODER (MEDIA_ENT_F_BASE + 0x6002) /* Entity flags */ -#define MEDIA_ENT_FL_DEFAULT (1 << 0) -#define MEDIA_ENT_FL_CONNECTOR (1 << 1) +#define MEDIA_ENT_FL_DEFAULT (1U << 0) +#define MEDIA_ENT_FL_CONNECTOR (1U << 1) /* OR with the entity id value to find the next entity */ #define MEDIA_ENT_ID_FLAG_NEXT (1U << 31) @@ -205,9 +205,9 @@ struct media_entity_desc { }; }; -#define MEDIA_PAD_FL_SINK (1 << 0) -#define MEDIA_PAD_FL_SOURCE (1 << 1) -#define MEDIA_PAD_FL_MUST_CONNECT (1 << 2) +#define MEDIA_PAD_FL_SINK (1U << 0) +#define MEDIA_PAD_FL_SOURCE (1U << 1) +#define MEDIA_PAD_FL_MUST_CONNECT (1U << 2) struct media_pad_desc { __u32 entity; /* entity ID */ @@ -216,14 +216,14 @@ struct media_pad_desc { __u32 reserved[2]; }; -#define MEDIA_LNK_FL_ENABLED (1 << 0) -#define MEDIA_LNK_FL_IMMUTABLE (1 << 1) -#define MEDIA_LNK_FL_DYNAMIC (1 << 2) +#define MEDIA_LNK_FL_ENABLED (1U << 0) +#define MEDIA_LNK_FL_IMMUTABLE (1U << 1) +#define MEDIA_LNK_FL_DYNAMIC (1U << 2) #define MEDIA_LNK_FL_LINK_TYPE (0xf << 28) -# define MEDIA_LNK_FL_DATA_LINK (0 << 28) -# define MEDIA_LNK_FL_INTERFACE_LINK (1 << 28) -# define MEDIA_LNK_FL_ANCILLARY_LINK (2 << 28) +# define MEDIA_LNK_FL_DATA_LINK (0U << 28) +# define MEDIA_LNK_FL_INTERFACE_LINK (1U << 28) +# define MEDIA_LNK_FL_ANCILLARY_LINK (2U << 28) struct media_link_desc { struct media_pad_desc source; @@ -293,7 +293,7 @@ struct media_links_enum { * struct media_device_info. */ #define MEDIA_V2_ENTITY_HAS_FLAGS(media_version) \ - ((media_version) >= ((4 << 16) | (19 << 8) | 0)) + ((media_version) >= ((4U << 16) | (19U << 8) | 0U)) struct media_v2_entity { __u32 id; @@ -328,7 +328,7 @@ struct media_v2_interface { * struct media_device_info. */ #define MEDIA_V2_PAD_HAS_INDEX(media_version) \ - ((media_version) >= ((4 << 16) | (19 << 8) | 0)) + ((media_version) >= ((4U << 16) | (19U << 8) | 0U)) struct media_v2_pad { __u32 id; @@ -432,7 +432,7 @@ struct media_v2_topology { #define MEDIA_INTF_T_ALSA_TIMER (MEDIA_INTF_T_ALSA_BASE + 7) /* Obsolete symbol for media_version, no longer used in the kernel */ -#define MEDIA_API_VERSION ((0 << 16) | (1 << 8) | 0) +#define MEDIA_API_VERSION ((0U << 16) | (1U << 8) | 0U) #endif -- cgit v1.2.3 From 40ca06d71d60677a8424798610c97a46e4140a21 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 16 Feb 2022 13:53:06 -0600 Subject: uapi: wireless: Replace zero-length array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Zero-length and one-element arrays are deprecated, and we are moving towards adopting C99 flexible-array members, instead. Address the following warnings seen under GCC-13 and -fstrict-flex-arrays=3 enabled: drivers/staging/ks7010/ks_wlan_net.c:1597:50: warning: array subscript 0 is outside array bounds of ‘__u8[0]’ {aka ‘unsigned char[]’} [-Warray-bounds=] drivers/staging/ks7010/ks_wlan_net.c:1603:61: warning: array subscript 16 is outside array bounds of ‘__u8[0]’ {aka ‘unsigned char[]’} [-Warray-bounds=] drivers/staging/ks7010/ks_wlan_net.c:1604:61: warning: array subscript 24 is outside array bounds of ‘__u8[0]’ {aka ‘unsigned char[]’} [-Warray-bounds=] drivers/staging/ks7010/ks_wlan_net.c:1600:61: warning: array subscript 16 is outside array bounds of ‘__u8[0]’ {aka ‘unsigned char[]’} [-Warray-bounds=] drivers/staging/ks7010/ks_wlan_net.c:1586:50: warning: array subscript 0 is outside array bounds of ‘__u8[0]’ {aka ‘unsigned char[]’} [-Warray-bounds=] This helps with the ongoing efforts to tighten the FORTIFY_SOURCE routines on memcpy() and help us make progress towards globally enabling -fstrict-flex-arrays=3 [1]. This results in no differences in binary output. Link: https://github.com/KSPP/linux/issues/21 Link: https://github.com/KSPP/linux/issues/261 Link: https://gcc.gnu.org/pipermail/gcc-patches/2022-October/602902.html [1] Reviewed-by: Kees Cook Signed-off-by: Gustavo A. R. Silva --- include/uapi/linux/wireless.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/wireless.h b/include/uapi/linux/wireless.h index 08967b3f19c8..3c2ad5fae17f 100644 --- a/include/uapi/linux/wireless.h +++ b/include/uapi/linux/wireless.h @@ -835,7 +835,7 @@ struct iw_encode_ext { * individual keys */ __u16 alg; /* IW_ENCODE_ALG_* */ __u16 key_len; - __u8 key[0]; + __u8 key[]; }; /* SIOCSIWMLME data */ -- cgit v1.2.3 From 6add87e9764dd308006b078cbdbf36d5a611cc9b Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 30 May 2023 08:12:39 +0900 Subject: firewire: cdev: add new version of ABI to notify time stamp at request/response subaction of transaction This commit adds new version of ABI for future new events with time stamp for request/response subaction of asynchronous transaction to user space. Link: https://lore.kernel.org/r/20230529113406.986289-3-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-cdev.c | 1 + include/uapi/linux/firewire-cdev.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 2c16ee8fd842..88c8b5fac5e5 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -43,6 +43,7 @@ #define FW_CDEV_VERSION_EVENT_REQUEST2 4 #define FW_CDEV_VERSION_ALLOCATE_REGION_END 4 #define FW_CDEV_VERSION_AUTO_FLUSH_ISO_OVERFLOW 5 +#define FW_CDEV_VERSION_EVENT_ASYNC_TSTAMP 6 struct client { u32 version; diff --git a/include/uapi/linux/firewire-cdev.h b/include/uapi/linux/firewire-cdev.h index 92be3ea3c6e0..76441eb551e5 100644 --- a/include/uapi/linux/firewire-cdev.h +++ b/include/uapi/linux/firewire-cdev.h @@ -457,6 +457,7 @@ union fw_cdev_event { * 5 (3.4) - send %FW_CDEV_EVENT_ISO_INTERRUPT events when needed to * avoid dropping data * - added %FW_CDEV_IOC_FLUSH_ISO + * 6 (6.5) - added some event for subactions of asynchronous transaction with time stamp */ /** -- cgit v1.2.3 From 7c22d4a92bb26f2357b27446138c8be54f88caed Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 30 May 2023 08:12:39 +0900 Subject: firewire: cdev: add new event to notify request subaction with time stamp This commit adds new event to notify event of request subaction with time stamp field. Current compiler implementation of System V ABI selects one of structure members which has the maximum alignment size in the structure to decide the size of structure. In the case of fw_cdev_event_request3 structure, it is closure member which has 8 byte storage. The size of alignment for the type of 8 byte storage differs depending on architectures; 4 byte for i386 architecture and 8 byte for the others including x32 architecture. It is inconvenient to device driver developer to use structure layout which varies between architectures since the developer takes care of ioctl compat layer. This commit adds 32 bit member for padding to keep the size of structure as multiples of 8. Cc: kunit-dev@googlegroups.com Link: https://lore.kernel.org/r/20230529113406.986289-4-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/uapi-test.c | 20 ++++++++++++++ include/uapi/linux/firewire-cdev.h | 53 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 71 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/firewire/uapi-test.c b/drivers/firewire/uapi-test.c index 4dc633b91336..dd95899de316 100644 --- a/drivers/firewire/uapi-test.c +++ b/drivers/firewire/uapi-test.c @@ -26,8 +26,28 @@ static void structure_layout_event_response(struct kunit *test) KUNIT_EXPECT_EQ(test, 20, offsetof(struct fw_cdev_event_response, data)); } +// Added at v6.5. +static void structure_layout_event_request3(struct kunit *test) +{ + KUNIT_EXPECT_EQ(test, 56, sizeof(struct fw_cdev_event_request3)); + + KUNIT_EXPECT_EQ(test, 0, offsetof(struct fw_cdev_event_request3, closure)); + KUNIT_EXPECT_EQ(test, 8, offsetof(struct fw_cdev_event_request3, type)); + KUNIT_EXPECT_EQ(test, 12, offsetof(struct fw_cdev_event_request3, tcode)); + KUNIT_EXPECT_EQ(test, 16, offsetof(struct fw_cdev_event_request3, offset)); + KUNIT_EXPECT_EQ(test, 24, offsetof(struct fw_cdev_event_request3, source_node_id)); + KUNIT_EXPECT_EQ(test, 28, offsetof(struct fw_cdev_event_request3, destination_node_id)); + KUNIT_EXPECT_EQ(test, 32, offsetof(struct fw_cdev_event_request3, card)); + KUNIT_EXPECT_EQ(test, 36, offsetof(struct fw_cdev_event_request3, generation)); + KUNIT_EXPECT_EQ(test, 40, offsetof(struct fw_cdev_event_request3, handle)); + KUNIT_EXPECT_EQ(test, 44, offsetof(struct fw_cdev_event_request3, length)); + KUNIT_EXPECT_EQ(test, 48, offsetof(struct fw_cdev_event_request3, tstamp)); + KUNIT_EXPECT_EQ(test, 56, offsetof(struct fw_cdev_event_request3, data)); +} + static struct kunit_case structure_layout_test_cases[] = { KUNIT_CASE(structure_layout_event_response), + KUNIT_CASE(structure_layout_event_request3), {} }; diff --git a/include/uapi/linux/firewire-cdev.h b/include/uapi/linux/firewire-cdev.h index 76441eb551e5..7767cd53a013 100644 --- a/include/uapi/linux/firewire-cdev.h +++ b/include/uapi/linux/firewire-cdev.h @@ -46,6 +46,9 @@ #define FW_CDEV_EVENT_PHY_PACKET_RECEIVED 0x08 #define FW_CDEV_EVENT_ISO_INTERRUPT_MULTICHANNEL 0x09 +/* available since kernel version 6.5 */ +#define FW_CDEV_EVENT_REQUEST3 0x0a + /** * struct fw_cdev_event_common - Common part of all fw_cdev_event_* types * @closure: For arbitrary use by userspace @@ -159,6 +162,38 @@ struct fw_cdev_event_request { * @length: Data length, i.e. the request's payload size in bytes * @data: Incoming data, if any * + * This event is sent instead of &fw_cdev_event_request3 if the kernel or the client implements + * ABI version <= 5. It has the lack of time stamp field comparing to &fw_cdev_event_request3. + */ +struct fw_cdev_event_request2 { + __u64 closure; + __u32 type; + __u32 tcode; + __u64 offset; + __u32 source_node_id; + __u32 destination_node_id; + __u32 card; + __u32 generation; + __u32 handle; + __u32 length; + __u32 data[]; +}; + +/** + * struct fw_cdev_event_request3 - Sent on incoming request to an address region + * @closure: See &fw_cdev_event_common; set by %FW_CDEV_IOC_ALLOCATE ioctl + * @type: See &fw_cdev_event_common; always %FW_CDEV_EVENT_REQUEST2 + * @tcode: Transaction code of the incoming request + * @offset: The offset into the 48-bit per-node address space + * @source_node_id: Sender node ID + * @destination_node_id: Destination node ID + * @card: The index of the card from which the request came + * @generation: Bus generation in which the request is valid + * @handle: Reference to the kernel-side pending request + * @length: Data length, i.e. the request's payload size in bytes + * @tstamp: The time stamp of isochronous cycle at which the request arrived. + * @data: Incoming data, if any + * * This event is sent when the stack receives an incoming request to an address * region registered using the %FW_CDEV_IOC_ALLOCATE ioctl. The request is * guaranteed to be completely contained in the specified region. Userspace is @@ -191,10 +226,14 @@ struct fw_cdev_event_request { * sent. * * If the client subsequently needs to initiate requests to the sender node of - * an &fw_cdev_event_request2, it needs to use a device file with matching + * an &fw_cdev_event_request3, it needs to use a device file with matching * card index, node ID, and generation for outbound requests. + * + * @tstamp is isochronous cycle at which the request arrived. It is 16 bit integer value and the + * higher 3 bits expresses three low order bits of second field in the format of CYCLE_TIME + * register and the rest 13 bits expresses cycle field. */ -struct fw_cdev_event_request2 { +struct fw_cdev_event_request3 { __u64 closure; __u32 type; __u32 tcode; @@ -205,6 +244,12 @@ struct fw_cdev_event_request2 { __u32 generation; __u32 handle; __u32 length; + __u32 tstamp; + /* + * Padding to keep the size of structure as multiples of 8 in various architectures since + * 4 byte alignment is used for 8 byte of object type in System V ABI for i386 architecture. + */ + __u32 padding; __u32 data[]; }; @@ -375,6 +420,8 @@ struct fw_cdev_event_phy_packet { * %FW_CDEV_EVENT_PHY_PACKET_SENT or * %FW_CDEV_EVENT_PHY_PACKET_RECEIVED * + * @request3: Valid if @common.type == %FW_CDEV_EVENT_REQUEST3 + * * Convenience union for userspace use. Events could be read(2) into an * appropriately aligned char buffer and then cast to this union for further * processing. Note that for a request, response or iso_interrupt event, @@ -393,6 +440,7 @@ union fw_cdev_event { struct fw_cdev_event_iso_interrupt_mc iso_interrupt_mc; /* added in 2.6.36 */ struct fw_cdev_event_iso_resource iso_resource; /* added in 2.6.30 */ struct fw_cdev_event_phy_packet phy_packet; /* added in 2.6.36 */ + struct fw_cdev_event_request3 request3; /* added in 6.5 */ }; /* available since kernel version 2.6.22 */ @@ -458,6 +506,7 @@ union fw_cdev_event { * avoid dropping data * - added %FW_CDEV_IOC_FLUSH_ISO * 6 (6.5) - added some event for subactions of asynchronous transaction with time stamp + * - %FW_CDEV_EVENT_REQUEST3 */ /** -- cgit v1.2.3 From fc2b52cf2e0e48938b65e20fae7099e54ef74c76 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 30 May 2023 08:12:40 +0900 Subject: firewire: cdev: add new event to notify response subaction with time stamp This commit adds new event to notify event of response subaction with time stamp field. Current compiler implementation of System V ABI selects one of structure members which has the maximum alignment size in the structure to decide the size of structure. In the case of fw_cdev_event_request3 structure, it is closure member which has 8 byte storage. The size of alignment for the type of 8 byte storage differs depending on architectures; 4 byte for i386 architecture and 8 byte for the others including x32 architecture. It is inconvenient to device driver developer to use structure layout which varies between architectures since the developer takes care of ioctl compat layer. This commit adds 32 bit member for padding to keep the size of structure as multiples of 8. Cc: kunit-dev@googlegroups.com Link: https://lore.kernel.org/r/20230529113406.986289-9-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/uapi-test.c | 15 ++++++++++ include/uapi/linux/firewire-cdev.h | 59 +++++++++++++++++++++++++++++++------- 2 files changed, 64 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/firewire/uapi-test.c b/drivers/firewire/uapi-test.c index dd95899de316..640e5c05415a 100644 --- a/drivers/firewire/uapi-test.c +++ b/drivers/firewire/uapi-test.c @@ -45,9 +45,24 @@ static void structure_layout_event_request3(struct kunit *test) KUNIT_EXPECT_EQ(test, 56, offsetof(struct fw_cdev_event_request3, data)); } +// Added at v6.5. +static void structure_layout_event_response2(struct kunit *test) +{ + KUNIT_EXPECT_EQ(test, 32, sizeof(struct fw_cdev_event_response2)); + + KUNIT_EXPECT_EQ(test, 0, offsetof(struct fw_cdev_event_response2, closure)); + KUNIT_EXPECT_EQ(test, 8, offsetof(struct fw_cdev_event_response2, type)); + KUNIT_EXPECT_EQ(test, 12, offsetof(struct fw_cdev_event_response2, rcode)); + KUNIT_EXPECT_EQ(test, 16, offsetof(struct fw_cdev_event_response2, length)); + KUNIT_EXPECT_EQ(test, 20, offsetof(struct fw_cdev_event_response2, request_tstamp)); + KUNIT_EXPECT_EQ(test, 24, offsetof(struct fw_cdev_event_response2, response_tstamp)); + KUNIT_EXPECT_EQ(test, 32, offsetof(struct fw_cdev_event_response2, data)); +} + static struct kunit_case structure_layout_test_cases[] = { KUNIT_CASE(structure_layout_event_response), KUNIT_CASE(structure_layout_event_request3), + KUNIT_CASE(structure_layout_event_response2), {} }; diff --git a/include/uapi/linux/firewire-cdev.h b/include/uapi/linux/firewire-cdev.h index 7767cd53a013..6c04ed08e4a5 100644 --- a/include/uapi/linux/firewire-cdev.h +++ b/include/uapi/linux/firewire-cdev.h @@ -48,6 +48,7 @@ /* available since kernel version 6.5 */ #define FW_CDEV_EVENT_REQUEST3 0x0a +#define FW_CDEV_EVENT_RESPONSE2 0x0b /** * struct fw_cdev_event_common - Common part of all fw_cdev_event_* types @@ -106,6 +107,29 @@ struct fw_cdev_event_bus_reset { * @length: Data length, i.e. the response's payload size in bytes * @data: Payload data, if any * + * This event is sent instead of &fw_cdev_event_response if the kernel or the client implements + * ABI version <= 5. It has the lack of time stamp field comparing to &fw_cdev_event_response2. + */ +struct fw_cdev_event_response { + __u64 closure; + __u32 type; + __u32 rcode; + __u32 length; + __u32 data[]; +}; + +/** + * struct fw_cdev_event_response2 - Sent when a response packet was received + * @closure: See &fw_cdev_event_common; set by %FW_CDEV_IOC_SEND_REQUEST + * or %FW_CDEV_IOC_SEND_BROADCAST_REQUEST + * or %FW_CDEV_IOC_SEND_STREAM_PACKET ioctl + * @type: See &fw_cdev_event_common; always %FW_CDEV_EVENT_RESPONSE + * @rcode: Response code returned by the remote node + * @length: Data length, i.e. the response's payload size in bytes + * @request_tstamp: The time stamp of isochronous cycle at which the request was sent. + * @response_tstamp: The time stamp of isochronous cycle at which the response was sent. + * @data: Payload data, if any + * * This event is sent when the stack receives a response to an outgoing request * sent by %FW_CDEV_IOC_SEND_REQUEST ioctl. The payload data for responses * carrying data (read and lock responses) follows immediately and can be @@ -115,12 +139,25 @@ struct fw_cdev_event_bus_reset { * involve response packets. This includes unified write transactions, * broadcast write transactions, and transmission of asynchronous stream * packets. @rcode indicates success or failure of such transmissions. + * + * The value of @request_tstamp expresses the isochronous cycle at which the request was sent to + * initiate the transaction. The value of @response_tstamp expresses the isochronous cycle at which + * the response arrived to complete the transaction. Each value is unsigned 16 bit integer + * containing three low order bits of second field and all 13 bits of cycle field in format of + * CYCLE_TIMER register. */ -struct fw_cdev_event_response { +struct fw_cdev_event_response2 { __u64 closure; __u32 type; __u32 rcode; __u32 length; + __u32 request_tstamp; + __u32 response_tstamp; + /* + * Padding to keep the size of structure as multiples of 8 in various architectures since + * 4 byte alignment is used for 8 byte of object type in System V ABI for i386 architecture. + */ + __u32 padding; __u32 data[]; }; @@ -421,6 +458,7 @@ struct fw_cdev_event_phy_packet { * %FW_CDEV_EVENT_PHY_PACKET_RECEIVED * * @request3: Valid if @common.type == %FW_CDEV_EVENT_REQUEST3 + * @response2: Valid if @common.type == %FW_CDEV_EVENT_RESPONSE2 * * Convenience union for userspace use. Events could be read(2) into an * appropriately aligned char buffer and then cast to this union for further @@ -441,6 +479,7 @@ union fw_cdev_event { struct fw_cdev_event_iso_resource iso_resource; /* added in 2.6.30 */ struct fw_cdev_event_phy_packet phy_packet; /* added in 2.6.36 */ struct fw_cdev_event_request3 request3; /* added in 6.5 */ + struct fw_cdev_event_response2 response2; /* added in 6.5 */ }; /* available since kernel version 2.6.22 */ @@ -507,6 +546,7 @@ union fw_cdev_event { * - added %FW_CDEV_IOC_FLUSH_ISO * 6 (6.5) - added some event for subactions of asynchronous transaction with time stamp * - %FW_CDEV_EVENT_REQUEST3 + * - %FW_CDEV_EVENT_RESPONSE2 */ /** @@ -552,11 +592,11 @@ struct fw_cdev_get_info { * @data: Userspace pointer to payload * @generation: The bus generation where packet is valid * - * Send a request to the device. This ioctl implements all outgoing requests. - * Both quadlet and block request specify the payload as a pointer to the data - * in the @data field. Once the transaction completes, the kernel writes an - * &fw_cdev_event_response event back. The @closure field is passed back to - * user space in the response event. + * Send a request to the device. This ioctl implements all outgoing requests. Both quadlet and + * block request specify the payload as a pointer to the data in the @data field. Once the + * transaction completes, the kernel writes either &fw_cdev_event_response event or + * &fw_cdev_event_response event back. The @closure field is passed back to user space in the + * response event. */ struct fw_cdev_send_request { __u32 tcode; @@ -1039,10 +1079,9 @@ struct fw_cdev_allocate_iso_resource { * @generation: The bus generation where packet is valid * @speed: Speed to transmit at * - * The %FW_CDEV_IOC_SEND_STREAM_PACKET ioctl sends an asynchronous stream packet - * to every device which is listening to the specified channel. The kernel - * writes an &fw_cdev_event_response event which indicates success or failure of - * the transmission. + * The %FW_CDEV_IOC_SEND_STREAM_PACKET ioctl sends an asynchronous stream packet to every device + * which is listening to the specified channel. The kernel writes either &fw_cdev_event_response + * event or &fw_cdev_event_response2 event which indicates success or failure of the transmission. */ struct fw_cdev_send_stream_packet { __u32 length; -- cgit v1.2.3 From e27b3939128a1d99a4c84f35e4f3897dae73ccd0 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 30 May 2023 08:12:40 +0900 Subject: firewire: cdev: add new event to notify phy packet with time stamp This commit adds new event to notify event of phy packet with time stamp field. Unlike the fw_cdev_event_request3 and fw_cdev_event_response2, the size of new structure, fw_cdev_event_phy_packet2, is multiples of 8, thus padding is not required to keep the same size between System V ABI for different architectures. It is noticeable that for the case of ping request 1394 OHCI controller does not record the isochronous cycle at which the packet was sent for the request subaction. Instead, it records round-trip count measured by hardware at 42.195 MHz resolution. Cc: kunit-dev@googlegroups.com Link: https://lore.kernel.org/r/20230529113406.986289-12-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/uapi-test.c | 14 ++++++++ include/uapi/linux/firewire-cdev.h | 67 ++++++++++++++++++++++++++++++++------ 2 files changed, 71 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/firewire/uapi-test.c b/drivers/firewire/uapi-test.c index 640e5c05415a..c6ebf02e3d45 100644 --- a/drivers/firewire/uapi-test.c +++ b/drivers/firewire/uapi-test.c @@ -59,10 +59,24 @@ static void structure_layout_event_response2(struct kunit *test) KUNIT_EXPECT_EQ(test, 32, offsetof(struct fw_cdev_event_response2, data)); } +// Added at v6.5. +static void structure_layout_event_phy_packet2(struct kunit *test) +{ + KUNIT_EXPECT_EQ(test, 24, sizeof(struct fw_cdev_event_phy_packet2)); + + KUNIT_EXPECT_EQ(test, 0, offsetof(struct fw_cdev_event_phy_packet2, closure)); + KUNIT_EXPECT_EQ(test, 8, offsetof(struct fw_cdev_event_phy_packet2, type)); + KUNIT_EXPECT_EQ(test, 12, offsetof(struct fw_cdev_event_phy_packet2, rcode)); + KUNIT_EXPECT_EQ(test, 16, offsetof(struct fw_cdev_event_phy_packet2, length)); + KUNIT_EXPECT_EQ(test, 20, offsetof(struct fw_cdev_event_phy_packet2, tstamp)); + KUNIT_EXPECT_EQ(test, 24, offsetof(struct fw_cdev_event_phy_packet2, data)); +} + static struct kunit_case structure_layout_test_cases[] = { KUNIT_CASE(structure_layout_event_response), KUNIT_CASE(structure_layout_event_request3), KUNIT_CASE(structure_layout_event_response2), + KUNIT_CASE(structure_layout_event_phy_packet2), {} }; diff --git a/include/uapi/linux/firewire-cdev.h b/include/uapi/linux/firewire-cdev.h index 6c04ed08e4a5..99e823935427 100644 --- a/include/uapi/linux/firewire-cdev.h +++ b/include/uapi/linux/firewire-cdev.h @@ -49,6 +49,8 @@ /* available since kernel version 6.5 */ #define FW_CDEV_EVENT_REQUEST3 0x0a #define FW_CDEV_EVENT_RESPONSE2 0x0b +#define FW_CDEV_EVENT_PHY_PACKET_SENT2 0x0c +#define FW_CDEV_EVENT_PHY_PACKET_RECEIVED2 0x0d /** * struct fw_cdev_event_common - Common part of all fw_cdev_event_* types @@ -423,20 +425,59 @@ struct fw_cdev_event_iso_resource { * @type: %FW_CDEV_EVENT_PHY_PACKET_SENT or %..._RECEIVED * @rcode: %RCODE_..., indicates success or failure of transmission * @length: Data length in bytes + * @data: Incoming data for %FW_CDEV_IOC_RECEIVE_PHY_PACKETS. For %FW_CDEV_IOC_SEND_PHY_PACKET + * the field has the same data in the request, thus the length of 8 bytes. + * + * This event is sent instead of &fw_cdev_event_phy_packet2 if the kernel or + * the client implements ABI version <= 5. It has the lack of time stamp field comparing to + * &fw_cdev_event_phy_packet2. + */ +struct fw_cdev_event_phy_packet { + __u64 closure; + __u32 type; + __u32 rcode; + __u32 length; + __u32 data[]; +}; + +/** + * struct fw_cdev_event_phy_packet2 - A PHY packet was transmitted or received with time stamp. + * @closure: See &fw_cdev_event_common; set by %FW_CDEV_IOC_SEND_PHY_PACKET + * or %FW_CDEV_IOC_RECEIVE_PHY_PACKETS ioctl + * @type: %FW_CDEV_EVENT_PHY_PACKET_SENT2 or %FW_CDEV_EVENT_PHY_PACKET_RECEIVED2 + * @rcode: %RCODE_..., indicates success or failure of transmission + * @length: Data length in bytes + * @tstamp: For %FW_CDEV_EVENT_PHY_PACKET_RECEIVED2, the time stamp of isochronous cycle at + * which the packet arrived. For %FW_CDEV_EVENT_PHY_PACKET_SENT2 and non-ping packet, + * the time stamp of isochronous cycle at which the packet was sent. For ping packet, + * the tick count for round-trip time measured by 1394 OHCI controller. + * The time stamp of isochronous cycle at which either the response was sent for + * %FW_CDEV_EVENT_PHY_PACKET_SENT2 or the request arrived for + * %FW_CDEV_EVENT_PHY_PACKET_RECEIVED2. * @data: Incoming data * - * If @type is %FW_CDEV_EVENT_PHY_PACKET_SENT, @length is 0 and @data empty, - * except in case of a ping packet: Then, @length is 4, and @data[0] is the - * ping time in 49.152MHz clocks if @rcode is %RCODE_COMPLETE. + * If @type is %FW_CDEV_EVENT_PHY_PACKET_SENT2, @length is 8 and @data consists of the two PHY + * packet quadlets to be sent, in host byte order, * - * If @type is %FW_CDEV_EVENT_PHY_PACKET_RECEIVED, @length is 8 and @data - * consists of the two PHY packet quadlets, in host byte order. + * If @type is %FW_CDEV_EVENT_PHY_PACKET_RECEIVED2, @length is 8 and @data consists of the two PHY + * packet quadlets, in host byte order. + * + * For %FW_CDEV_EVENT_PHY_PACKET_RECEIVED2, the @tstamp is the isochronous cycle at which the + * packet arrived. It is 16 bit integer value and the higher 3 bits expresses three low order bits + * of second field and the rest 13 bits expresses cycle field in the format of CYCLE_TIME register. + * + * For %FW_CDEV_EVENT_PHY_PACKET_SENT2, the @tstamp has different meanings whether to sent the + * packet for ping or not. If it's not for ping, the @tstamp is the isochronous cycle at which the + * packet was sent, and use the same format as the case of %FW_CDEV_EVENT_PHY_PACKET_SENT2. If it's + * for ping, the @tstamp is for round-trip time measured by 1394 OHCI controller with 42.195 MHz + * resolution. */ -struct fw_cdev_event_phy_packet { +struct fw_cdev_event_phy_packet2 { __u64 closure; __u32 type; __u32 rcode; __u32 length; + __u32 tstamp; __u32 data[]; }; @@ -459,6 +500,8 @@ struct fw_cdev_event_phy_packet { * * @request3: Valid if @common.type == %FW_CDEV_EVENT_REQUEST3 * @response2: Valid if @common.type == %FW_CDEV_EVENT_RESPONSE2 + * @phy_packet2: Valid if @common.type == %FW_CDEV_EVENT_PHY_PACKET_SENT2 or + * %FW_CDEV_EVENT_PHY_PACKET_RECEIVED2 * * Convenience union for userspace use. Events could be read(2) into an * appropriately aligned char buffer and then cast to this union for further @@ -480,6 +523,7 @@ union fw_cdev_event { struct fw_cdev_event_phy_packet phy_packet; /* added in 2.6.36 */ struct fw_cdev_event_request3 request3; /* added in 6.5 */ struct fw_cdev_event_response2 response2; /* added in 6.5 */ + struct fw_cdev_event_phy_packet2 phy_packet2; /* added in 6.5 */ }; /* available since kernel version 2.6.22 */ @@ -547,6 +591,8 @@ union fw_cdev_event { * 6 (6.5) - added some event for subactions of asynchronous transaction with time stamp * - %FW_CDEV_EVENT_REQUEST3 * - %FW_CDEV_EVENT_RESPONSE2 + * - %FW_CDEV_EVENT_PHY_PACKET_SENT2 + * - %FW_CDEV_EVENT_PHY_PACKET_RECEIVED2 */ /** @@ -1100,8 +1146,8 @@ struct fw_cdev_send_stream_packet { * @data: First and second quadlet of the PHY packet * @generation: The bus generation where packet is valid * - * The %FW_CDEV_IOC_SEND_PHY_PACKET ioctl sends a PHY packet to all nodes - * on the same card as this device. After transmission, an + * The %FW_CDEV_IOC_SEND_PHY_PACKET ioctl sends a PHY packet to all nodes on the same card as this + * device. After transmission, either %FW_CDEV_EVENT_PHY_PACKET_SENT event or * %FW_CDEV_EVENT_PHY_PACKET_SENT event is generated. * * The payload @data\[\] shall be specified in host byte order. Usually, @@ -1120,8 +1166,9 @@ struct fw_cdev_send_phy_packet { * struct fw_cdev_receive_phy_packets - start reception of PHY packets * @closure: Passed back to userspace in phy packet events * - * This ioctl activates issuing of %FW_CDEV_EVENT_PHY_PACKET_RECEIVED due to - * incoming PHY packets from any node on the same bus as the device. + * This ioctl activates issuing of either %FW_CDEV_EVENT_PHY_PACKET_RECEIVED or + * %FW_CDEV_EVENT_PHY_PACKET_RECEIVED2 due to incoming PHY packets from any node on the same bus + * as the device. * * The ioctl is only permitted on device files which represent a local node. */ -- cgit v1.2.3 From a45baa079e2a6b7a5516354c6ff08702232384f5 Mon Sep 17 00:00:00 2001 From: Boerge Struempfel Date: Tue, 30 May 2023 16:16:37 +0200 Subject: spi: add SPI_MOSI_IDLE_LOW mode bit Some spi controller switch the mosi line to high, whenever they are idle. This may not be desired in all use cases. For example neopixel leds can get confused and flicker due to misinterpreting the idle state. Therefore, we introduce a new spi-mode bit, with which the idle behaviour can be overwritten on a per device basis. Signed-off-by: Boerge Struempfel Link: https://lore.kernel.org/r/20230530141641.1155691-2-boerge.struempfel@gmail.com Signed-off-by: Mark Brown --- include/uapi/linux/spi/spi.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/spi/spi.h b/include/uapi/linux/spi/spi.h index 9d5f58059703..ca56e477d161 100644 --- a/include/uapi/linux/spi/spi.h +++ b/include/uapi/linux/spi/spi.h @@ -28,6 +28,7 @@ #define SPI_RX_OCTAL _BITUL(14) /* receive with 8 wires */ #define SPI_3WIRE_HIZ _BITUL(15) /* high impedance turnaround */ #define SPI_RX_CPHA_FLIP _BITUL(16) /* flip CPHA on Rx only xfer */ +#define SPI_MOSI_IDLE_LOW _BITUL(17) /* leave mosi line low when idle */ /* * All the bits defined above should be covered by SPI_MODE_USER_MASK. @@ -37,6 +38,6 @@ * These bits must not overlap. A static assert check should make sure of that. * If adding extra bits, make sure to increase the bit index below as well. */ -#define SPI_MODE_USER_MASK (_BITUL(17) - 1) +#define SPI_MODE_USER_MASK (_BITUL(18) - 1) #endif /* _UAPI_SPI_H */ -- cgit v1.2.3 From e910c8e3aa02dc456e2f4c32cb479523c326b534 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 23 May 2023 10:19:35 +0200 Subject: autofs: use flexible array in ioctl structure Commit df8fc4e934c1 ("kbuild: Enable -fstrict-flex-arrays=3") introduced a warning for the autofs_dev_ioctl structure: In function 'check_name', inlined from 'validate_dev_ioctl' at fs/autofs/dev-ioctl.c:131:9, inlined from '_autofs_dev_ioctl' at fs/autofs/dev-ioctl.c:624:8: fs/autofs/dev-ioctl.c:33:14: error: 'strchr' reading 1 or more bytes from a region of size 0 [-Werror=stringop-overread] 33 | if (!strchr(name, '/')) | ^~~~~~~~~~~~~~~~~ In file included from include/linux/auto_dev-ioctl.h:10, from fs/autofs/autofs_i.h:10, from fs/autofs/dev-ioctl.c:14: include/uapi/linux/auto_dev-ioctl.h: In function '_autofs_dev_ioctl': include/uapi/linux/auto_dev-ioctl.h:112:14: note: source object 'path' of size 0 112 | char path[0]; | ^~~~ This is easily fixed by changing the gnu 0-length array into a c99 flexible array. Since this is a uapi structure, we have to be careful about possible regressions but this one should be fine as they are equivalent here. While it would break building with ancient gcc versions that predate c99, it helps building with --std=c99 and -Wpedantic builds in user space, as well as non-gnu compilers. This means we probably also want it fixed in stable kernels. Cc: stable@vger.kernel.org Cc: Kees Cook Cc: Gustavo A. R. Silva" Signed-off-by: Arnd Bergmann Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20230523081944.581710-1-arnd@kernel.org --- Documentation/filesystems/autofs-mount-control.rst | 2 +- Documentation/filesystems/autofs.rst | 2 +- include/uapi/linux/auto_dev-ioctl.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/filesystems/autofs-mount-control.rst b/Documentation/filesystems/autofs-mount-control.rst index bf4b511cdbe8..b5a379d25c40 100644 --- a/Documentation/filesystems/autofs-mount-control.rst +++ b/Documentation/filesystems/autofs-mount-control.rst @@ -196,7 +196,7 @@ information and return operation results:: struct args_ismountpoint ismountpoint; }; - char path[0]; + char path[]; }; The ioctlfd field is a mount point file descriptor of an autofs mount diff --git a/Documentation/filesystems/autofs.rst b/Documentation/filesystems/autofs.rst index 4f490278d22f..3b6e38e646cd 100644 --- a/Documentation/filesystems/autofs.rst +++ b/Documentation/filesystems/autofs.rst @@ -467,7 +467,7 @@ Each ioctl is passed a pointer to an `autofs_dev_ioctl` structure:: struct args_ismountpoint ismountpoint; }; - char path[0]; + char path[]; }; For the **OPEN_MOUNT** and **IS_MOUNTPOINT** commands, the target diff --git a/include/uapi/linux/auto_dev-ioctl.h b/include/uapi/linux/auto_dev-ioctl.h index 62e625356dc8..08be539605fc 100644 --- a/include/uapi/linux/auto_dev-ioctl.h +++ b/include/uapi/linux/auto_dev-ioctl.h @@ -109,7 +109,7 @@ struct autofs_dev_ioctl { struct args_ismountpoint ismountpoint; }; - char path[0]; + char path[]; }; static inline void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in) -- cgit v1.2.3 From 1a432018c0cdf51a77a2e134b19ba6cab4c29c89 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 29 May 2023 14:48:30 +0300 Subject: net/sched: flower: Allow matching on layer 2 miss Add the 'TCA_FLOWER_L2_MISS' netlink attribute that allows user space to match on packets that encountered a layer 2 miss. The miss indication is set as metadata in the tc skb extension by the bridge driver upon FDB or MDB lookup miss and dissected by the flow dissector to the 'FLOW_DISSECTOR_KEY_META' key. The use of this skb extension is guarded by the 'tc_skb_ext_tc' static key. As such, enable / disable this key when filters that match on layer 2 miss are added / deleted. Tested: # cat tc_skb_ext_tc.py #!/usr/bin/env -S drgn -s vmlinux refcount = prog["tc_skb_ext_tc"].key.enabled.counter.value_() print(f"tc_skb_ext_tc reference count is {refcount}") # ./tc_skb_ext_tc.py tc_skb_ext_tc reference count is 0 # tc filter add dev swp1 egress proto all handle 101 pref 1 flower src_mac 00:11:22:33:44:55 action drop # tc filter add dev swp1 egress proto all handle 102 pref 2 flower src_mac 00:11:22:33:44:55 l2_miss true action drop # tc filter add dev swp1 egress proto all handle 103 pref 3 flower src_mac 00:11:22:33:44:55 l2_miss false action drop # ./tc_skb_ext_tc.py tc_skb_ext_tc reference count is 2 # tc filter replace dev swp1 egress proto all handle 102 pref 2 flower src_mac 00:01:02:03:04:05 l2_miss false action drop # ./tc_skb_ext_tc.py tc_skb_ext_tc reference count is 2 # tc filter del dev swp1 egress proto all handle 103 pref 3 flower # tc filter del dev swp1 egress proto all handle 102 pref 2 flower # tc filter del dev swp1 egress proto all handle 101 pref 1 flower # ./tc_skb_ext_tc.py tc_skb_ext_tc reference count is 0 Signed-off-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_cls.h | 2 ++ net/sched/cls_flower.c | 30 ++++++++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 648a82f32666..00933dda7b10 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -594,6 +594,8 @@ enum { TCA_FLOWER_KEY_L2TPV3_SID, /* be32 */ + TCA_FLOWER_L2_MISS, /* u8 */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 9dbc43388e57..04adcde9eb81 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -120,6 +120,7 @@ struct cls_fl_filter { u32 handle; u32 flags; u32 in_hw_count; + u8 needs_tc_skb_ext:1; struct rcu_work rwork; struct net_device *hw_dev; /* Flower classifier is unlocked, which means that its reference counter @@ -415,6 +416,8 @@ static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp) static void __fl_destroy_filter(struct cls_fl_filter *f) { + if (f->needs_tc_skb_ext) + tc_skb_ext_tc_disable(); tcf_exts_destroy(&f->exts); tcf_exts_put_net(&f->exts); kfree(f); @@ -615,7 +618,8 @@ static void *fl_get(struct tcf_proto *tp, u32 handle) } static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { - [TCA_FLOWER_UNSPEC] = { .type = NLA_UNSPEC }, + [TCA_FLOWER_UNSPEC] = { .strict_start_type = + TCA_FLOWER_L2_MISS }, [TCA_FLOWER_CLASSID] = { .type = NLA_U32 }, [TCA_FLOWER_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ }, @@ -720,7 +724,7 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_PPPOE_SID] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_PPP_PROTO] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_L2TPV3_SID] = { .type = NLA_U32 }, - + [TCA_FLOWER_L2_MISS] = NLA_POLICY_MAX(NLA_U8, 1), }; static const struct nla_policy @@ -1668,6 +1672,10 @@ static int fl_set_key(struct net *net, struct nlattr **tb, mask->meta.ingress_ifindex = 0xffffffff; } + fl_set_key_val(tb, &key->meta.l2_miss, TCA_FLOWER_L2_MISS, + &mask->meta.l2_miss, TCA_FLOWER_UNSPEC, + sizeof(key->meta.l2_miss)); + fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, sizeof(key->eth.dst)); @@ -2085,6 +2093,11 @@ errout_cleanup: return ret; } +static bool fl_needs_tc_skb_ext(const struct fl_flow_key *mask) +{ + return mask->meta.l2_miss; +} + static int fl_set_parms(struct net *net, struct tcf_proto *tp, struct cls_fl_filter *f, struct fl_flow_mask *mask, unsigned long base, struct nlattr **tb, @@ -2121,6 +2134,14 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp, return -EINVAL; } + /* Enable tc skb extension if filter matches on data extracted from + * this extension. + */ + if (fl_needs_tc_skb_ext(&mask->key)) { + f->needs_tc_skb_ext = 1; + tc_skb_ext_tc_enable(); + } + return 0; } @@ -3074,6 +3095,11 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, goto nla_put_failure; } + if (fl_dump_key_val(skb, &key->meta.l2_miss, + TCA_FLOWER_L2_MISS, &mask->meta.l2_miss, + TCA_FLOWER_UNSPEC, sizeof(key->meta.l2_miss))) + goto nla_put_failure; + if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, sizeof(key->eth.dst)) || -- cgit v1.2.3 From 6c1adb650c8d85c6cb471dbc900c2468f462995a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 30 May 2023 12:19:46 +0300 Subject: net/sched: taprio: add netlink reporting for offload statistics counters Offloading drivers may report some additional statistics counters, some of them even suggested by 802.1Q, like TransmissionOverrun. In my opinion we don't have to limit ourselves to reporting counters only globally to the Qdisc/interface, especially if the device has more detailed reporting (per traffic class), since the more detailed info is valuable for debugging and can help identifying who is exceeding its time slot. But on the other hand, some devices may not be able to report both per TC and global stats. So we end up reporting both ways, and use the good old ethtool_put_stat() strategy to determine which statistics are supported by this NIC. Statistics which aren't set are simply not reported to netlink. For this reason, we need something dynamic (a nlattr nest) to be reported through TCA_STATS_APP, and not something daft like the fixed-size and inextensible struct tc_codel_xstats. A good model for xstats which are a nlattr nest rather than a fixed struct seems to be cake. # Global stats $ tc -s qdisc show dev eth0 root # Per-tc stats $ tc -s class show dev eth0 Signed-off-by: Vladimir Oltean Acked-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 47 ++++++++++++++++++++----- include/uapi/linux/pkt_sched.h | 10 ++++++ net/sched/sch_taprio.c | 78 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 126 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index f5fb11da357b..530d33adec88 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -188,6 +188,27 @@ struct tc_taprio_caps { enum tc_taprio_qopt_cmd { TAPRIO_CMD_REPLACE, TAPRIO_CMD_DESTROY, + TAPRIO_CMD_STATS, + TAPRIO_CMD_TC_STATS, +}; + +/** + * struct tc_taprio_qopt_stats - IEEE 802.1Qbv statistics + * @window_drops: Frames that were dropped because they were too large to be + * transmitted in any of the allotted time windows (open gates) for their + * traffic class. + * @tx_overruns: Frames still being transmitted by the MAC after the + * transmission gate associated with their traffic class has closed. + * Equivalent to `12.29.1.1.2 TransmissionOverrun` from 802.1Q-2018. + */ +struct tc_taprio_qopt_stats { + u64 window_drops; + u64 tx_overruns; +}; + +struct tc_taprio_qopt_tc_stats { + int tc; + struct tc_taprio_qopt_stats stats; }; struct tc_taprio_sched_entry { @@ -199,16 +220,26 @@ struct tc_taprio_sched_entry { }; struct tc_taprio_qopt_offload { - struct tc_mqprio_qopt_offload mqprio; - struct netlink_ext_ack *extack; enum tc_taprio_qopt_cmd cmd; - ktime_t base_time; - u64 cycle_time; - u64 cycle_time_extension; - u32 max_sdu[TC_MAX_QUEUE]; - size_t num_entries; - struct tc_taprio_sched_entry entries[]; + union { + /* TAPRIO_CMD_STATS */ + struct tc_taprio_qopt_stats stats; + /* TAPRIO_CMD_TC_STATS */ + struct tc_taprio_qopt_tc_stats tc_stats; + /* TAPRIO_CMD_REPLACE */ + struct { + struct tc_mqprio_qopt_offload mqprio; + struct netlink_ext_ack *extack; + ktime_t base_time; + u64 cycle_time; + u64 cycle_time_extension; + u32 max_sdu[TC_MAX_QUEUE]; + + size_t num_entries; + struct tc_taprio_sched_entry entries[]; + }; + }; }; #if IS_ENABLED(CONFIG_NET_SCH_TAPRIO) diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 51a7addc56c6..00f6ff0aff1f 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1259,6 +1259,16 @@ enum { TCA_TAPRIO_TC_ENTRY_MAX = (__TCA_TAPRIO_TC_ENTRY_CNT - 1) }; +enum { + TCA_TAPRIO_OFFLOAD_STATS_PAD = 1, /* u64 */ + TCA_TAPRIO_OFFLOAD_STATS_WINDOW_DROPS, /* u64 */ + TCA_TAPRIO_OFFLOAD_STATS_TX_OVERRUNS, /* u64 */ + + /* add new constants above here */ + __TCA_TAPRIO_OFFLOAD_STATS_CNT, + TCA_TAPRIO_OFFLOAD_STATS_MAX = (__TCA_TAPRIO_OFFLOAD_STATS_CNT - 1) +}; + enum { TCA_TAPRIO_ATTR_UNSPEC, TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */ diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 06bf4c6355a5..3c4c2c334878 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -27,6 +27,8 @@ #include #include +#define TAPRIO_STAT_NOT_SET (~0ULL) + #include "sch_mqprio_lib.h" static LIST_HEAD(taprio_list); @@ -2289,6 +2291,72 @@ nla_put_failure: return -EMSGSIZE; } +static int taprio_put_stat(struct sk_buff *skb, u64 val, u16 attrtype) +{ + if (val == TAPRIO_STAT_NOT_SET) + return 0; + if (nla_put_u64_64bit(skb, attrtype, val, TCA_TAPRIO_OFFLOAD_STATS_PAD)) + return -EMSGSIZE; + return 0; +} + +static int taprio_dump_xstats(struct Qdisc *sch, struct gnet_dump *d, + struct tc_taprio_qopt_offload *offload, + struct tc_taprio_qopt_stats *stats) +{ + struct net_device *dev = qdisc_dev(sch); + const struct net_device_ops *ops; + struct sk_buff *skb = d->skb; + struct nlattr *xstats; + int err; + + ops = qdisc_dev(sch)->netdev_ops; + + /* FIXME I could use qdisc_offload_dump_helper(), but that messes + * with sch->flags depending on whether the device reports taprio + * stats, and I'm not sure whether that's a good idea, considering + * that stats are optional to the offload itself + */ + if (!ops->ndo_setup_tc) + return 0; + + memset(stats, 0xff, sizeof(*stats)); + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); + if (err == -EOPNOTSUPP) + return 0; + if (err) + return err; + + xstats = nla_nest_start(skb, TCA_STATS_APP); + if (!xstats) + goto err; + + if (taprio_put_stat(skb, stats->window_drops, + TCA_TAPRIO_OFFLOAD_STATS_WINDOW_DROPS) || + taprio_put_stat(skb, stats->tx_overruns, + TCA_TAPRIO_OFFLOAD_STATS_TX_OVERRUNS)) + goto err_cancel; + + nla_nest_end(skb, xstats); + + return 0; + +err_cancel: + nla_nest_cancel(skb, xstats); +err: + return -EMSGSIZE; +} + +static int taprio_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct tc_taprio_qopt_offload offload = { + .cmd = TAPRIO_CMD_STATS, + }; + + return taprio_dump_xstats(sch, d, &offload, &offload.stats); +} + static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct taprio_sched *q = qdisc_priv(sch); @@ -2389,11 +2457,18 @@ static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, { struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); struct Qdisc *child = dev_queue->qdisc_sleeping; + struct tc_taprio_qopt_offload offload = { + .cmd = TAPRIO_CMD_TC_STATS, + .tc_stats = { + .tc = cl - 1, + }, + }; if (gnet_stats_copy_basic(d, NULL, &child->bstats, true) < 0 || qdisc_qstats_copy(d, child) < 0) return -1; - return 0; + + return taprio_dump_xstats(sch, d, &offload, &offload.tc_stats.stats); } static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) @@ -2440,6 +2515,7 @@ static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { .dequeue = taprio_dequeue, .enqueue = taprio_enqueue, .dump = taprio_dump, + .dump_stats = taprio_dump_stats, .owner = THIS_MODULE, }; -- cgit v1.2.3 From 0b3dee602abf4a102a7a506d4b1c765355b27685 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Wed, 31 May 2023 10:57:13 +0100 Subject: PCI: Add PCI_EXT_CAP_ID_PL_32GT define Add the define for PCI_EXT_CAP_ID_PL_32GT for drivers that will want this whilst doing Gen5/Gen6 accesses. Link: https://lore.kernel.org/r/20230531095713.293229-1-ben.dooks@codethink.co.uk Signed-off-by: Ben Dooks Signed-off-by: Ben Dooks Signed-off-by: Bjorn Helgaas --- include/uapi/linux/pci_regs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index dc2000e0fe3a..e5f558d96493 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -738,6 +738,7 @@ #define PCI_EXT_CAP_ID_DVSEC 0x23 /* Designated Vendor-Specific */ #define PCI_EXT_CAP_ID_DLF 0x25 /* Data Link Feature */ #define PCI_EXT_CAP_ID_PL_16GT 0x26 /* Physical Layer 16.0 GT/s */ +#define PCI_EXT_CAP_ID_PL_32GT 0x2A /* Physical Layer 32.0 GT/s */ #define PCI_EXT_CAP_ID_DOE 0x2E /* Data Object Exchange */ #define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_DOE -- cgit v1.2.3 From 8ad77e72caae22a1ddcfd0c03f2884929e93b7a4 Mon Sep 17 00:00:00 2001 From: Louis DeLosSantos Date: Wed, 31 May 2023 15:38:48 -0400 Subject: bpf: Add table ID to bpf_fib_lookup BPF helper Add ability to specify routing table ID to the `bpf_fib_lookup` BPF helper. A new field `tbid` is added to `struct bpf_fib_lookup` used as parameters to the `bpf_fib_lookup` BPF helper. When the helper is called with the `BPF_FIB_LOOKUP_DIRECT` and `BPF_FIB_LOOKUP_TBID` flags the `tbid` field in `struct bpf_fib_lookup` will be used as the table ID for the fib lookup. If the `tbid` does not exist the fib lookup will fail with `BPF_FIB_LKUP_RET_NOT_FWDED`. The `tbid` field becomes a union over the vlan related output fields in `struct bpf_fib_lookup` and will be zeroed immediately after usage. This functionality is useful in containerized environments. For instance, if a CNI wants to dictate the next-hop for traffic leaving a container it can create a container-specific routing table and perform a fib lookup against this table in a "host-net-namespace-side" TC program. This functionality also allows `ip rule` like functionality at the TC layer, allowing an eBPF program to pick a routing table based on some aspect of the sk_buff. As a concrete use case, this feature will be used in Cilium's SRv6 L3VPN datapath. When egress traffic leaves a Pod an eBPF program attached by Cilium will determine which VRF the egress traffic should target, and then perform a FIB lookup in a specific table representing this VRF's FIB. Signed-off-by: Louis DeLosSantos Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20230505-bpf-add-tbid-fib-lookup-v2-1-0a31c22c748c@gmail.com --- include/uapi/linux/bpf.h | 21 ++++++++++++++++++--- net/core/filter.c | 14 +++++++++++++- tools/include/uapi/linux/bpf.h | 21 ++++++++++++++++++--- 3 files changed, 49 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9273c654743c..a7b5e91dd768 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3177,6 +3177,10 @@ union bpf_attr { * **BPF_FIB_LOOKUP_DIRECT** * Do a direct table lookup vs full lookup using FIB * rules. + * **BPF_FIB_LOOKUP_TBID** + * Used with BPF_FIB_LOOKUP_DIRECT. + * Use the routing table ID present in *params*->tbid + * for the fib lookup. * **BPF_FIB_LOOKUP_OUTPUT** * Perform lookup from an egress perspective (default is * ingress). @@ -6831,6 +6835,7 @@ enum { BPF_FIB_LOOKUP_DIRECT = (1U << 0), BPF_FIB_LOOKUP_OUTPUT = (1U << 1), BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), + BPF_FIB_LOOKUP_TBID = (1U << 3), }; enum { @@ -6891,9 +6896,19 @@ struct bpf_fib_lookup { __u32 ipv6_dst[4]; /* in6_addr; network order */ }; - /* output */ - __be16 h_vlan_proto; - __be16 h_vlan_TCI; + union { + struct { + /* output */ + __be16 h_vlan_proto; + __be16 h_vlan_TCI; + }; + /* input: when accompanied with the + * 'BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_TBID` flags, a + * specific routing table to use for the fib lookup. + */ + __u32 tbid; + }; + __u8 smac[6]; /* ETH_ALEN */ __u8 dmac[6]; /* ETH_ALEN */ }; diff --git a/net/core/filter.c b/net/core/filter.c index 968139f4a1ac..d25d52854c21 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5803,6 +5803,12 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; struct fib_table *tb; + if (flags & BPF_FIB_LOOKUP_TBID) { + tbid = params->tbid; + /* zero out for vlan output */ + params->tbid = 0; + } + tb = fib_get_table(net, tbid); if (unlikely(!tb)) return BPF_FIB_LKUP_RET_NOT_FWDED; @@ -5936,6 +5942,12 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; struct fib6_table *tb; + if (flags & BPF_FIB_LOOKUP_TBID) { + tbid = params->tbid; + /* zero out for vlan output */ + params->tbid = 0; + } + tb = ipv6_stub->fib6_get_table(net, tbid); if (unlikely(!tb)) return BPF_FIB_LKUP_RET_NOT_FWDED; @@ -6008,7 +6020,7 @@ set_fwd_params: #endif #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \ - BPF_FIB_LOOKUP_SKIP_NEIGH) + BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID) BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, struct bpf_fib_lookup *, params, int, plen, u32, flags) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9273c654743c..a7b5e91dd768 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3177,6 +3177,10 @@ union bpf_attr { * **BPF_FIB_LOOKUP_DIRECT** * Do a direct table lookup vs full lookup using FIB * rules. + * **BPF_FIB_LOOKUP_TBID** + * Used with BPF_FIB_LOOKUP_DIRECT. + * Use the routing table ID present in *params*->tbid + * for the fib lookup. * **BPF_FIB_LOOKUP_OUTPUT** * Perform lookup from an egress perspective (default is * ingress). @@ -6831,6 +6835,7 @@ enum { BPF_FIB_LOOKUP_DIRECT = (1U << 0), BPF_FIB_LOOKUP_OUTPUT = (1U << 1), BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), + BPF_FIB_LOOKUP_TBID = (1U << 3), }; enum { @@ -6891,9 +6896,19 @@ struct bpf_fib_lookup { __u32 ipv6_dst[4]; /* in6_addr; network order */ }; - /* output */ - __be16 h_vlan_proto; - __be16 h_vlan_TCI; + union { + struct { + /* output */ + __be16 h_vlan_proto; + __be16 h_vlan_TCI; + }; + /* input: when accompanied with the + * 'BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_TBID` flags, a + * specific routing table to use for the fib lookup. + */ + __u32 tbid; + }; + __u8 smac[6]; /* ETH_ALEN */ __u8 dmac[6]; /* ETH_ALEN */ }; -- cgit v1.2.3 From b5bbc52fd01278642773818642288999a0236cb6 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 3 Jun 2023 12:06:01 +0800 Subject: ublk: add control command of UBLK_U_CMD_GET_FEATURES Add control command of UBLK_U_CMD_GET_FEATURES for returning driver's feature set or capability. This way can simplify userspace for maintaining compatibility because userspace doesn't need to send command to one device for querying driver feature set any more. Such as, with the queried feature set, userspace can choose to use: - UBLK_CMD_GET_DEV_INFO2 or UBLK_CMD_GET_DEV_INFO, - UBLK_U_CMD_* or UBLK_CMD_* Userspace code: https://github.com/ming1/ubdsrv/commits/features-cmd Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230603040601.775227-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 21 +++++++++++++++++++++ include/uapi/linux/ublk_cmd.h | 8 ++++++++ 2 files changed, 29 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 539eada32861..222a0341913f 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2343,6 +2343,21 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub, return ret; } +static int ublk_ctrl_get_features(struct io_uring_cmd *cmd) +{ + const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); + void __user *argp = (void __user *)(unsigned long)header->addr; + u64 features = UBLK_F_ALL & ~UBLK_F_SUPPORT_ZERO_COPY; + + if (header->len != UBLK_FEATURES_LEN || !header->addr) + return -EINVAL; + + if (copy_to_user(argp, &features, UBLK_FEATURES_LEN)) + return -EFAULT; + + return 0; +} + /* * All control commands are sent via /dev/ublk-control, so we have to check * the destination device's permission @@ -2423,6 +2438,7 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, case UBLK_CMD_GET_DEV_INFO2: case UBLK_CMD_GET_QUEUE_AFFINITY: case UBLK_CMD_GET_PARAMS: + case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)): mask = MAY_READ; break; case UBLK_CMD_START_DEV: @@ -2472,6 +2488,11 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, if (ret) goto out; + if (cmd_op == UBLK_U_CMD_GET_FEATURES) { + ret = ublk_ctrl_get_features(cmd); + goto out; + } + if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) { ret = -ENODEV; ub = ublk_get_device_from_id(header->dev_id); diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 54b5b0aeefca..4b8558db90e1 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -47,6 +47,14 @@ _IOWR('u', UBLK_CMD_END_USER_RECOVERY, struct ublksrv_ctrl_cmd) #define UBLK_U_CMD_GET_DEV_INFO2 \ _IOR('u', UBLK_CMD_GET_DEV_INFO2, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_GET_FEATURES \ + _IOR('u', 0x13, struct ublksrv_ctrl_cmd) + +/* + * 64bits are enough now, and it should be easy to extend in case of + * running out of feature flags + */ +#define UBLK_FEATURES_LEN 8 /* * IO commands, issued by ublk server, and handled by ublk driver. -- cgit v1.2.3 From 224d80c584d3016cb8d83d1c33914fdd3508aa8c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 31 May 2023 15:08:35 +0200 Subject: types: Introduce [us]128 Introduce [us]128 (when available). Unlike [us]64, ensure they are always naturally aligned. This also enables 128bit wide atomics (which require natural alignment) such as cmpxchg128(). Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Arnd Bergmann Reviewed-by: Mark Rutland Acked-by: Herbert Xu Tested-by: Mark Rutland Link: https://lore.kernel.org/r/20230531132323.385005581@infradead.org --- include/linux/types.h | 5 +++++ include/uapi/linux/types.h | 4 ++++ lib/crypto/curve25519-hacl64.c | 2 -- lib/crypto/poly1305-donna64.c | 2 -- 4 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/types.h b/include/linux/types.h index 688fb943556a..becb8cd5916f 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -10,6 +10,11 @@ #define DECLARE_BITMAP(name,bits) \ unsigned long name[BITS_TO_LONGS(bits)] +#ifdef __SIZEOF_INT128__ +typedef __s128 s128; +typedef __u128 u128; +#endif + typedef u32 __kernel_dev_t; typedef __kernel_fd_set fd_set; diff --git a/include/uapi/linux/types.h b/include/uapi/linux/types.h index 308433be33c2..6375a0684052 100644 --- a/include/uapi/linux/types.h +++ b/include/uapi/linux/types.h @@ -13,6 +13,10 @@ #include +#ifdef __SIZEOF_INT128__ +typedef __signed__ __int128 __s128 __attribute__((aligned(16))); +typedef unsigned __int128 __u128 __attribute__((aligned(16))); +#endif /* * Below are truly Linux-specific types that should never collide with diff --git a/lib/crypto/curve25519-hacl64.c b/lib/crypto/curve25519-hacl64.c index 771d82dc5f14..c40e5d913234 100644 --- a/lib/crypto/curve25519-hacl64.c +++ b/lib/crypto/curve25519-hacl64.c @@ -14,8 +14,6 @@ #include #include -typedef __uint128_t u128; - static __always_inline u64 u64_eq_mask(u64 a, u64 b) { u64 x = a ^ b; diff --git a/lib/crypto/poly1305-donna64.c b/lib/crypto/poly1305-donna64.c index d34cf4053668..988702c9b3b2 100644 --- a/lib/crypto/poly1305-donna64.c +++ b/lib/crypto/poly1305-donna64.c @@ -10,8 +10,6 @@ #include #include -typedef __uint128_t u128; - void poly1305_core_setkey(struct poly1305_core_key *key, const u8 raw_key[POLY1305_BLOCK_SIZE]) { -- cgit v1.2.3 From 22725266bdf95bddd01a23841097492489dfc9d9 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Thu, 18 May 2023 17:13:37 +0800 Subject: KVM: Fix comment for KVM_ENABLE_CAP Fix comment for vcpu ioctl version of KVM_ENABLE_CAP. KVM provides ioctl KVM_ENABLE_CAP to allow userspace to enable an extension which is not enabled by default. For vcpu ioctl version, it is available with the capability KVM_CAP_ENABLE_CAP. For vm ioctl version, it is available with the capability KVM_CAP_ENABLE_CAP_VM. Signed-off-by: Binbin Wu Link: https://lore.kernel.org/r/20230518091339.1102-2-binbin.wu@linux.intel.com Signed-off-by: Sean Christopherson --- include/uapi/linux/kvm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 737318b1c1d9..bddf2871db8f 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1613,7 +1613,7 @@ struct kvm_s390_ucas_mapping { #define KVM_GET_DEBUGREGS _IOR(KVMIO, 0xa1, struct kvm_debugregs) #define KVM_SET_DEBUGREGS _IOW(KVMIO, 0xa2, struct kvm_debugregs) /* - * vcpu version available with KVM_ENABLE_CAP + * vcpu version available with KVM_CAP_ENABLE_CAP * vm version available with KVM_CAP_ENABLE_CAP_VM */ #define KVM_ENABLE_CAP _IOW(KVMIO, 0xa3, struct kvm_enable_cap) -- cgit v1.2.3 From 0258d889a7ee28310dbe3f5d4d039b48a1d080ad Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Thu, 1 Jun 2023 23:49:37 +0900 Subject: firewire: fix warnings to generate UAPI documentation Any target to generate UAPI documentation reports warnings to missing annotation for padding member in structures added recently. This commit suppresses the warnings. Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/lkml/20230531135306.43613a59@canb.auug.org.au/ Fixes: 7c22d4a92bb2 ("firewire: cdev: add new event to notify request subaction with time stamp") Fixes: fc2b52cf2e0e ("firewire: cdev: add new event to notify response subaction with time stamp") Link: https://lore.kernel.org/r/20230601144937.121179-1-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- include/uapi/linux/firewire-cdev.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/firewire-cdev.h b/include/uapi/linux/firewire-cdev.h index 99e823935427..1f2c9469f921 100644 --- a/include/uapi/linux/firewire-cdev.h +++ b/include/uapi/linux/firewire-cdev.h @@ -130,6 +130,9 @@ struct fw_cdev_event_response { * @length: Data length, i.e. the response's payload size in bytes * @request_tstamp: The time stamp of isochronous cycle at which the request was sent. * @response_tstamp: The time stamp of isochronous cycle at which the response was sent. + * @padding: Padding to keep the size of structure as multiples of 8 in various architectures + * since 4 byte alignment is used for 8 byte of object type in System V ABI for i386 + * architecture. * @data: Payload data, if any * * This event is sent when the stack receives a response to an outgoing request @@ -155,10 +158,6 @@ struct fw_cdev_event_response2 { __u32 length; __u32 request_tstamp; __u32 response_tstamp; - /* - * Padding to keep the size of structure as multiples of 8 in various architectures since - * 4 byte alignment is used for 8 byte of object type in System V ABI for i386 architecture. - */ __u32 padding; __u32 data[]; }; @@ -231,6 +230,9 @@ struct fw_cdev_event_request2 { * @handle: Reference to the kernel-side pending request * @length: Data length, i.e. the request's payload size in bytes * @tstamp: The time stamp of isochronous cycle at which the request arrived. + * @padding: Padding to keep the size of structure as multiples of 8 in various architectures + * since 4 byte alignment is used for 8 byte of object type in System V ABI for i386 + * architecture. * @data: Incoming data, if any * * This event is sent when the stack receives an incoming request to an address @@ -284,10 +286,6 @@ struct fw_cdev_event_request3 { __u32 handle; __u32 length; __u32 tstamp; - /* - * Padding to keep the size of structure as multiples of 8 in various architectures since - * 4 byte alignment is used for 8 byte of object type in System V ABI for i386 architecture. - */ __u32 padding; __u32 data[]; }; -- cgit v1.2.3 From 6afc770048edc90405c444163de70b1cdfbb8b57 Mon Sep 17 00:00:00 2001 From: Tony Krowiak Date: Tue, 30 May 2023 18:35:36 -0400 Subject: s390/vfio-ap: realize the VFIO_DEVICE_GET_IRQ_INFO ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Realize the VFIO_DEVICE_GET_IRQ_INFO ioctl to retrieve the information for the VFIO device request IRQ. Signed-off-by: Tony Krowiak Acked-by: Alex Williamson Reviewed-by: Cédric Le Goater Reviewed-by: Matthew Rosato Tested-by: Matthew Rosato Link: https://lore.kernel.org/r/20230530223538.279198-2-akrowiak@linux.ibm.com Signed-off-by: Alexander Gordeev --- drivers/s390/crypto/vfio_ap_ops.c | 30 +++++++++++++++++++++++++++++- include/uapi/linux/vfio.h | 9 +++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index cfbcb864ab63..35cd90eee937 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -1750,7 +1750,32 @@ static int vfio_ap_mdev_get_device_info(unsigned long arg) info.flags = VFIO_DEVICE_FLAGS_AP | VFIO_DEVICE_FLAGS_RESET; info.num_regions = 0; - info.num_irqs = 0; + info.num_irqs = VFIO_AP_NUM_IRQS; + + return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; +} + +static ssize_t vfio_ap_get_irq_info(unsigned long arg) +{ + unsigned long minsz; + struct vfio_irq_info info; + + minsz = offsetofend(struct vfio_irq_info, count); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz || info.index >= VFIO_AP_NUM_IRQS) + return -EINVAL; + + switch (info.index) { + case VFIO_AP_REQ_IRQ_INDEX: + info.count = 1; + info.flags = VFIO_IRQ_INFO_EVENTFD; + break; + default: + return -EINVAL; + } return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; } @@ -1770,6 +1795,9 @@ static ssize_t vfio_ap_mdev_ioctl(struct vfio_device *vdev, case VFIO_DEVICE_RESET: ret = vfio_ap_mdev_reset_queues(&matrix_mdev->qtable); break; + case VFIO_DEVICE_GET_IRQ_INFO: + ret = vfio_ap_get_irq_info(arg); + break; default: ret = -EOPNOTSUPP; break; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 0552e8dcf0cb..b71276bd7f91 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -646,6 +646,15 @@ enum { VFIO_CCW_NUM_IRQS }; +/* + * The vfio-ap bus driver makes use of the following IRQ index mapping. + * Unimplemented IRQ types return a count of zero. + */ +enum { + VFIO_AP_REQ_IRQ_INDEX, + VFIO_AP_NUM_IRQS +}; + /** * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 12, * struct vfio_pci_hot_reset_info) -- cgit v1.2.3 From 55382134366e641e97cd83264c22c60c7dc10ccd Mon Sep 17 00:00:00 2001 From: "GONG, Ruiqi" Date: Fri, 2 Jun 2023 13:45:27 +0800 Subject: capability: erase checker warnings about struct __user_cap_data_struct Currently Sparse warns the following when compiling kernel/capability.c: kernel/capability.c:191:35: warning: incorrect type in argument 2 (different address spaces) kernel/capability.c:191:35: expected void const *from kernel/capability.c:191:35: got struct __user_cap_data_struct [noderef] __user * kernel/capability.c:168:14: warning: dereference of noderef expression ...... (multiple noderef warnings on different locations) kernel/capability.c:244:29: warning: incorrect type in argument 1 (different address spaces) kernel/capability.c:244:29: expected void *to kernel/capability.c:244:29: got struct __user_cap_data_struct [noderef] __user ( * )[2] kernel/capability.c:247:42: warning: dereference of noderef expression ...... (multiple noderef warnings on different locations) It seems that defining `struct __user_cap_data_struct` together with `cap_user_data_t` make Sparse believe that the struct is `noderef` as well. Separate their definitions to clarify their respective attributes. Signed-off-by: GONG, Ruiqi Acked-by: Serge Hallyn [PM: wrapped long lines in the description] Signed-off-by: Paul Moore --- include/uapi/linux/capability.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 3d61a0ae055d..5bb906098697 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -41,11 +41,12 @@ typedef struct __user_cap_header_struct { int pid; } __user *cap_user_header_t; -typedef struct __user_cap_data_struct { +struct __user_cap_data_struct { __u32 effective; __u32 permitted; __u32 inheritable; -} __user *cap_user_data_t; +}; +typedef struct __user_cap_data_struct __user *cap_user_data_t; #define VFS_CAP_REVISION_MASK 0xFF000000 -- cgit v1.2.3 From 5cadfbd5a11e5495cac217534c5f788168b1afd7 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 27 Mar 2023 16:14:49 +0200 Subject: fuse: add feature flag for expire-only Add an init flag idicating whether the FUSE_EXPIRE_ONLY flag of FUSE_NOTIFY_INVAL_ENTRY is effective. This is needed for backports of this feature, otherwise the server could just check the protocol version. Fixes: 4f8d37020e1f ("fuse: add "expire only" mode to FUSE_NOTIFY_INVAL_ENTRY") Cc: # v6.2 Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 3 ++- include/uapi/linux/fuse.h | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d66070af145d..660be31aaabc 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1254,7 +1254,8 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | - FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP; + FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | + FUSE_HAS_EXPIRE_ONLY; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 1b9d0dfae72d..b3fcab13fcd3 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -206,6 +206,7 @@ * - add extension header * - add FUSE_EXT_GROUPS * - add FUSE_CREATE_SUPP_GROUP + * - add FUSE_HAS_EXPIRE_ONLY */ #ifndef _LINUX_FUSE_H @@ -369,6 +370,7 @@ struct fuse_file_lock { * FUSE_HAS_INODE_DAX: use per inode DAX * FUSE_CREATE_SUPP_GROUP: add supplementary group info to create, mkdir, * symlink and mknod (single group that matches parent) + * FUSE_HAS_EXPIRE_ONLY: kernel supports expiry-only entry invalidation */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -406,6 +408,7 @@ struct fuse_file_lock { #define FUSE_SECURITY_CTX (1ULL << 32) #define FUSE_HAS_INODE_DAX (1ULL << 33) #define FUSE_CREATE_SUPP_GROUP (1ULL << 34) +#define FUSE_HAS_EXPIRE_ONLY (1ULL << 35) /** * CUSE INIT request/reply flags -- cgit v1.2.3 From 3a41db531e5124adaa3a9ab9ca0c724aee85b10c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 10 Mar 2023 18:45:41 +0200 Subject: pktcdvd: Get rid of custom printing macros We may use traditional dev_*() macros instead of custom ones provided by the driver. Signed-off-by: Andy Shevchenko Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20230310164549.22133-2-andriy.shevchenko@linux.intel.com Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 247 ++++++++++++++++++++++--------------------- include/linux/pktcdvd.h | 1 - include/uapi/linux/pktcdvd.h | 1 + 3 files changed, 130 insertions(+), 119 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 377f8b345352..a327cce67768 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -72,22 +72,6 @@ #define DRIVER_NAME "pktcdvd" -#define pkt_err(pd, fmt, ...) \ - pr_err("%s: " fmt, pd->name, ##__VA_ARGS__) -#define pkt_notice(pd, fmt, ...) \ - pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__) -#define pkt_info(pd, fmt, ...) \ - pr_info("%s: " fmt, pd->name, ##__VA_ARGS__) - -#define pkt_dbg(level, pd, fmt, ...) \ -do { \ - if (level == 2 && PACKET_DEBUG >= 2) \ - pr_notice("%s: %s():" fmt, \ - pd->name, __func__, ##__VA_ARGS__); \ - else if (level == 1 && PACKET_DEBUG >= 1) \ - pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__); \ -} while (0) - #define MAX_SPEED 0xffff static DEFINE_MUTEX(pktcdvd_mutex); @@ -319,7 +303,7 @@ static void pkt_sysfs_dev_new(struct pktcdvd_device *pd) if (class_is_registered(&class_pktcdvd)) { pd->dev = device_create_with_groups(&class_pktcdvd, NULL, MKDEV(0, 0), pd, pkt_groups, - "%s", pd->name); + "%s", pd->disk->disk_name); if (IS_ERR(pd->dev)) pd->dev = NULL; } @@ -350,7 +334,7 @@ static ssize_t device_map_show(const struct class *c, const struct class_attribu if (!pd) continue; n += sprintf(data+n, "%s %u:%u %u:%u\n", - pd->name, + pd->disk->disk_name, MAJOR(pd->pkt_dev), MINOR(pd->pkt_dev), MAJOR(pd->bdev->bd_dev), MINOR(pd->bdev->bd_dev)); @@ -450,7 +434,7 @@ static void pkt_debugfs_dev_new(struct pktcdvd_device *pd) { if (!pkt_debugfs_root) return; - pd->dfs_d_root = debugfs_create_dir(pd->name, pkt_debugfs_root); + pd->dfs_d_root = debugfs_create_dir(pd->disk->disk_name, pkt_debugfs_root); if (!pd->dfs_d_root) return; @@ -484,9 +468,11 @@ static void pkt_debugfs_cleanup(void) static void pkt_bio_finished(struct pktcdvd_device *pd) { + struct device *ddev = disk_to_dev(pd->disk); + BUG_ON(atomic_read(&pd->cdrw.pending_bios) <= 0); if (atomic_dec_and_test(&pd->cdrw.pending_bios)) { - pkt_dbg(2, pd, "queue empty\n"); + dev_dbg(ddev, "queue empty\n"); atomic_set(&pd->iosched.attention, 1); wake_up(&pd->wqueue); } @@ -717,15 +703,16 @@ static const char *sense_key_string(__u8 index) static void pkt_dump_sense(struct pktcdvd_device *pd, struct packet_command *cgc) { + struct device *ddev = disk_to_dev(pd->disk); struct scsi_sense_hdr *sshdr = cgc->sshdr; if (sshdr) - pkt_err(pd, "%*ph - sense %02x.%02x.%02x (%s)\n", + dev_err(ddev, "%*ph - sense %02x.%02x.%02x (%s)\n", CDROM_PACKET_SIZE, cgc->cmd, sshdr->sense_key, sshdr->asc, sshdr->ascq, sense_key_string(sshdr->sense_key)); else - pkt_err(pd, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd); + dev_err(ddev, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd); } /* @@ -809,6 +796,7 @@ static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio) */ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) { + struct device *ddev = disk_to_dev(pd->disk); if (atomic_read(&pd->iosched.attention) == 0) return; @@ -836,7 +824,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) need_write_seek = 0; if (need_write_seek && reads_queued) { if (atomic_read(&pd->cdrw.pending_bios) > 0) { - pkt_dbg(2, pd, "write, waiting\n"); + dev_dbg(ddev, "write, waiting\n"); break; } pkt_flush_cache(pd); @@ -845,7 +833,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) } else { if (!reads_queued && writes_queued) { if (atomic_read(&pd->cdrw.pending_bios) > 0) { - pkt_dbg(2, pd, "read, waiting\n"); + dev_dbg(ddev, "read, waiting\n"); break; } pd->iosched.writing = 1; @@ -892,6 +880,8 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) */ static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q) { + struct device *ddev = disk_to_dev(pd->disk); + if ((pd->settings.size << 9) / CD_FRAMESIZE <= queue_max_segments(q)) { /* @@ -908,7 +898,7 @@ static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_que set_bit(PACKET_MERGE_SEGS, &pd->flags); return 0; } else { - pkt_err(pd, "cdrom max_phys_segments too small\n"); + dev_err(ddev, "cdrom max_phys_segments too small\n"); return -EIO; } } @@ -919,7 +909,7 @@ static void pkt_end_io_read(struct bio *bio) struct pktcdvd_device *pd = pkt->pd; BUG_ON(!pd); - pkt_dbg(2, pd, "bio=%p sec0=%llx sec=%llx err=%d\n", + dev_dbg(disk_to_dev(pd->disk), "bio=%p sec0=%llx sec=%llx err=%d\n", bio, (unsigned long long)pkt->sector, (unsigned long long)bio->bi_iter.bi_sector, bio->bi_status); @@ -939,7 +929,7 @@ static void pkt_end_io_packet_write(struct bio *bio) struct pktcdvd_device *pd = pkt->pd; BUG_ON(!pd); - pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, bio->bi_status); + dev_dbg(disk_to_dev(pd->disk), "id=%d, err=%d\n", pkt->id, bio->bi_status); pd->stats.pkt_ended++; @@ -955,6 +945,7 @@ static void pkt_end_io_packet_write(struct bio *bio) */ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) { + struct device *ddev = disk_to_dev(pd->disk); int frames_read = 0; struct bio *bio; int f; @@ -983,8 +974,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) spin_unlock(&pkt->lock); if (pkt->cache_valid) { - pkt_dbg(2, pd, "zone %llx cached\n", - (unsigned long long)pkt->sector); + dev_dbg(ddev, "zone %llx cached\n", (unsigned long long)pkt->sector); goto out_account; } @@ -1005,8 +995,8 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) p = (f * CD_FRAMESIZE) / PAGE_SIZE; offset = (f * CD_FRAMESIZE) % PAGE_SIZE; - pkt_dbg(2, pd, "Adding frame %d, page:%p offs:%d\n", - f, pkt->pages[p], offset); + dev_dbg(ddev, "Adding frame %d, page:%p offs:%d\n", f, + pkt->pages[p], offset); if (!bio_add_page(bio, pkt->pages[p], CD_FRAMESIZE, offset)) BUG(); @@ -1016,8 +1006,8 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) } out_account: - pkt_dbg(2, pd, "need %d frames for zone %llx\n", - frames_read, (unsigned long long)pkt->sector); + dev_dbg(ddev, "need %d frames for zone %llx\n", frames_read, + (unsigned long long)pkt->sector); pd->stats.pkt_started++; pd->stats.secs_rg += frames_read * (CD_FRAMESIZE >> 9); } @@ -1051,17 +1041,18 @@ static void pkt_put_packet_data(struct pktcdvd_device *pd, struct packet_data *p } } -static inline void pkt_set_state(struct packet_data *pkt, enum packet_data_state state) +static inline void pkt_set_state(struct device *ddev, struct packet_data *pkt, + enum packet_data_state state) { -#if PACKET_DEBUG > 1 static const char *state_name[] = { "IDLE", "WAITING", "READ_WAIT", "WRITE_WAIT", "RECOVERY", "FINISHED" }; enum packet_data_state old_state = pkt->state; - pkt_dbg(2, pd, "pkt %2d : s=%6llx %s -> %s\n", + + dev_dbg(ddev, "pkt %2d : s=%6llx %s -> %s\n", pkt->id, (unsigned long long)pkt->sector, state_name[old_state], state_name[state]); -#endif + pkt->state = state; } @@ -1071,6 +1062,7 @@ static inline void pkt_set_state(struct packet_data *pkt, enum packet_data_state */ static int pkt_handle_queue(struct pktcdvd_device *pd) { + struct device *ddev = disk_to_dev(pd->disk); struct packet_data *pkt, *p; struct bio *bio = NULL; sector_t zone = 0; /* Suppress gcc warning */ @@ -1080,7 +1072,7 @@ static int pkt_handle_queue(struct pktcdvd_device *pd) atomic_set(&pd->scan_queue, 0); if (list_empty(&pd->cdrw.pkt_free_list)) { - pkt_dbg(2, pd, "no pkt\n"); + dev_dbg(ddev, "no pkt\n"); return 0; } @@ -1117,7 +1109,7 @@ try_next_bio: } spin_unlock(&pd->lock); if (!bio) { - pkt_dbg(2, pd, "no bio\n"); + dev_dbg(ddev, "no bio\n"); return 0; } @@ -1133,12 +1125,13 @@ try_next_bio: * to this packet. */ spin_lock(&pd->lock); - pkt_dbg(2, pd, "looking for zone %llx\n", (unsigned long long)zone); + dev_dbg(ddev, "looking for zone %llx\n", (unsigned long long)zone); while ((node = pkt_rbtree_find(pd, zone)) != NULL) { + sector_t tmp = get_zone(node->bio->bi_iter.bi_sector, pd); + bio = node->bio; - pkt_dbg(2, pd, "found zone=%llx\n", (unsigned long long) - get_zone(bio->bi_iter.bi_sector, pd)); - if (get_zone(bio->bi_iter.bi_sector, pd) != zone) + dev_dbg(ddev, "found zone=%llx\n", (unsigned long long)tmp); + if (tmp != zone) break; pkt_rbtree_erase(pd, node); spin_lock(&pkt->lock); @@ -1157,7 +1150,7 @@ try_next_bio: spin_unlock(&pd->lock); pkt->sleep_time = max(PACKET_WAIT_TIME, 1); - pkt_set_state(pkt, PACKET_WAITING_STATE); + pkt_set_state(ddev, pkt, PACKET_WAITING_STATE); atomic_set(&pkt->run_sm, 1); spin_lock(&pd->cdrw.active_list_lock); @@ -1209,6 +1202,7 @@ static void bio_list_copy_data(struct bio *dst, struct bio *src) */ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) { + struct device *ddev = disk_to_dev(pd->disk); int f; bio_init(pkt->w_bio, pd->bdev, pkt->w_bio->bi_inline_vecs, pkt->frames, @@ -1225,7 +1219,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) if (!bio_add_page(pkt->w_bio, page, CD_FRAMESIZE, offset)) BUG(); } - pkt_dbg(2, pd, "vcnt=%d\n", pkt->w_bio->bi_vcnt); + dev_dbg(ddev, "vcnt=%d\n", pkt->w_bio->bi_vcnt); /* * Fill-in bvec with data from orig_bios. @@ -1233,11 +1227,11 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) spin_lock(&pkt->lock); bio_list_copy_data(pkt->w_bio, pkt->orig_bios.head); - pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE); + pkt_set_state(ddev, pkt, PACKET_WRITE_WAIT_STATE); spin_unlock(&pkt->lock); - pkt_dbg(2, pd, "Writing %d frames for zone %llx\n", - pkt->write_size, (unsigned long long)pkt->sector); + dev_dbg(ddev, "Writing %d frames for zone %llx\n", pkt->write_size, + (unsigned long long)pkt->sector); if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) pkt->cache_valid = 1; @@ -1265,7 +1259,9 @@ static void pkt_finish_packet(struct packet_data *pkt, blk_status_t status) static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data *pkt) { - pkt_dbg(2, pd, "pkt %d\n", pkt->id); + struct device *ddev = disk_to_dev(pd->disk); + + dev_dbg(ddev, "pkt %d\n", pkt->id); for (;;) { switch (pkt->state) { @@ -1275,7 +1271,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data pkt->sleep_time = 0; pkt_gather_data(pd, pkt); - pkt_set_state(pkt, PACKET_READ_WAIT_STATE); + pkt_set_state(ddev, pkt, PACKET_READ_WAIT_STATE); break; case PACKET_READ_WAIT_STATE: @@ -1283,7 +1279,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data return; if (atomic_read(&pkt->io_errors) > 0) { - pkt_set_state(pkt, PACKET_RECOVERY_STATE); + pkt_set_state(ddev, pkt, PACKET_RECOVERY_STATE); } else { pkt_start_write(pd, pkt); } @@ -1294,15 +1290,15 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data return; if (!pkt->w_bio->bi_status) { - pkt_set_state(pkt, PACKET_FINISHED_STATE); + pkt_set_state(ddev, pkt, PACKET_FINISHED_STATE); } else { - pkt_set_state(pkt, PACKET_RECOVERY_STATE); + pkt_set_state(ddev, pkt, PACKET_RECOVERY_STATE); } break; case PACKET_RECOVERY_STATE: - pkt_dbg(2, pd, "No recovery possible\n"); - pkt_set_state(pkt, PACKET_FINISHED_STATE); + dev_dbg(ddev, "No recovery possible\n"); + pkt_set_state(ddev, pkt, PACKET_FINISHED_STATE); break; case PACKET_FINISHED_STATE: @@ -1318,6 +1314,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data static void pkt_handle_packets(struct pktcdvd_device *pd) { + struct device *ddev = disk_to_dev(pd->disk); struct packet_data *pkt, *next; /* @@ -1338,7 +1335,7 @@ static void pkt_handle_packets(struct pktcdvd_device *pd) if (pkt->state == PACKET_FINISHED_STATE) { list_del(&pkt->list); pkt_put_packet_data(pd, pkt); - pkt_set_state(pkt, PACKET_IDLE_STATE); + pkt_set_state(ddev, pkt, PACKET_IDLE_STATE); atomic_set(&pd->scan_queue, 1); } } @@ -1367,7 +1364,9 @@ static void pkt_count_states(struct pktcdvd_device *pd, int *states) static int kcdrwd(void *foobar) { struct pktcdvd_device *pd = foobar; + struct device *ddev = disk_to_dev(pd->disk); struct packet_data *pkt; + int states[PACKET_NUM_STATES]; long min_sleep_time, residue; set_user_nice(current, MIN_NICE); @@ -1398,13 +1397,9 @@ static int kcdrwd(void *foobar) goto work_to_do; /* Otherwise, go to sleep */ - if (PACKET_DEBUG > 1) { - int states[PACKET_NUM_STATES]; - pkt_count_states(pd, states); - pkt_dbg(2, pd, "i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", - states[0], states[1], states[2], - states[3], states[4], states[5]); - } + pkt_count_states(pd, states); + dev_dbg(ddev, "i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", + states[0], states[1], states[2], states[3], states[4], states[5]); min_sleep_time = MAX_SCHEDULE_TIMEOUT; list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { @@ -1412,9 +1407,9 @@ static int kcdrwd(void *foobar) min_sleep_time = pkt->sleep_time; } - pkt_dbg(2, pd, "sleeping\n"); + dev_dbg(ddev, "sleeping\n"); residue = schedule_timeout(min_sleep_time); - pkt_dbg(2, pd, "wake up\n"); + dev_dbg(ddev, "wake up\n"); /* make swsusp happy with our thread */ try_to_freeze(); @@ -1462,7 +1457,7 @@ work_to_do: static void pkt_print_settings(struct pktcdvd_device *pd) { - pkt_info(pd, "%s packets, %u blocks, Mode-%c disc\n", + dev_info(disk_to_dev(pd->disk), "%s packets, %u blocks, Mode-%c disc\n", pd->settings.fp ? "Fixed" : "Variable", pd->settings.size >> 2, pd->settings.block_mode == 8 ? '1' : '2'); @@ -1590,6 +1585,7 @@ static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd, */ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) { + struct device *ddev = disk_to_dev(pd->disk); struct packet_command cgc; struct scsi_sense_hdr sshdr; write_param_page *wp; @@ -1656,7 +1652,7 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) /* * paranoia */ - pkt_err(pd, "write mode wrong %d\n", wp->data_block_type); + dev_err(ddev, "write mode wrong %d\n", wp->data_block_type); return 1; } wp->packet_size = cpu_to_be32(pd->settings.size >> 2); @@ -1677,6 +1673,8 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) */ static int pkt_writable_track(struct pktcdvd_device *pd, track_information *ti) { + struct device *ddev = disk_to_dev(pd->disk); + switch (pd->mmc3_profile) { case 0x1a: /* DVD+RW */ case 0x12: /* DVD-RAM */ @@ -1701,7 +1699,7 @@ static int pkt_writable_track(struct pktcdvd_device *pd, track_information *ti) if (ti->rt == 1 && ti->blank == 0) return 1; - pkt_err(pd, "bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet); + dev_err(ddev, "bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet); return 0; } @@ -1710,6 +1708,8 @@ static int pkt_writable_track(struct pktcdvd_device *pd, track_information *ti) */ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) { + struct device *ddev = disk_to_dev(pd->disk); + switch (pd->mmc3_profile) { case 0x0a: /* CD-RW */ case 0xffff: /* MMC3 not supported */ @@ -1719,8 +1719,7 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) case 0x12: /* DVD-RAM */ return 1; default: - pkt_dbg(2, pd, "Wrong disc profile (%x)\n", - pd->mmc3_profile); + dev_dbg(ddev, "Wrong disc profile (%x)\n", pd->mmc3_profile); return 0; } @@ -1729,22 +1728,22 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) * but i'm not sure, should we leave this to user apps? probably. */ if (di->disc_type == 0xff) { - pkt_notice(pd, "unknown disc - no track?\n"); + dev_notice(ddev, "unknown disc - no track?\n"); return 0; } if (di->disc_type != 0x20 && di->disc_type != 0) { - pkt_err(pd, "wrong disc type (%x)\n", di->disc_type); + dev_err(ddev, "wrong disc type (%x)\n", di->disc_type); return 0; } if (di->erasable == 0) { - pkt_notice(pd, "disc not erasable\n"); + dev_err(ddev, "disc not erasable\n"); return 0; } if (di->border_status == PACKET_SESSION_RESERVED) { - pkt_err(pd, "can't write to last track (reserved)\n"); + dev_err(ddev, "can't write to last track (reserved)\n"); return 0; } @@ -1753,6 +1752,7 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) { + struct device *ddev = disk_to_dev(pd->disk); struct packet_command cgc; unsigned char buf[12]; disc_information di; @@ -1770,7 +1770,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) ret = pkt_get_disc_info(pd, &di); if (ret) { - pkt_err(pd, "failed get_disc\n"); + dev_err(ddev, "failed get_disc\n"); return ret; } @@ -1782,12 +1782,12 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */ ret = pkt_get_track_info(pd, track, 1, &ti); if (ret) { - pkt_err(pd, "failed get_track\n"); + dev_err(ddev, "failed get_track\n"); return ret; } if (!pkt_writable_track(pd, &ti)) { - pkt_err(pd, "can't write to this track\n"); + dev_err(ddev, "can't write to this track\n"); return -EROFS; } @@ -1797,11 +1797,11 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) */ pd->settings.size = be32_to_cpu(ti.fixed_packet_size) << 2; if (pd->settings.size == 0) { - pkt_notice(pd, "detected zero packet size!\n"); + dev_notice(ddev, "detected zero packet size!\n"); return -ENXIO; } if (pd->settings.size > PACKET_MAX_SECTORS) { - pkt_err(pd, "packet size is too big\n"); + dev_err(ddev, "packet size is too big\n"); return -EROFS; } pd->settings.fp = ti.fp; @@ -1843,7 +1843,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) pd->settings.block_mode = PACKET_BLOCK_MODE2; break; default: - pkt_err(pd, "unknown data mode\n"); + dev_err(ddev, "unknown data mode\n"); return -EROFS; } return 0; @@ -1854,6 +1854,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) */ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd) { + struct device *ddev = disk_to_dev(pd->disk); struct packet_command cgc; struct scsi_sense_hdr sshdr; unsigned char buf[64]; @@ -1883,10 +1884,10 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd) cgc.buflen = cgc.cmd[8] = 2 + ((buf[0] << 8) | (buf[1] & 0xff)); ret = pkt_mode_select(pd, &cgc); if (ret) { - pkt_err(pd, "write caching control failed\n"); + dev_err(ddev, "write caching control failed\n"); pkt_dump_sense(pd, &cgc); } else if (!ret && set) - pkt_notice(pd, "enabled write caching\n"); + dev_notice(ddev, "enabled write caching\n"); return ret; } @@ -1967,6 +1968,7 @@ static char us_clv_to_speed[16] = { static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, unsigned *speed) { + struct device *ddev = disk_to_dev(pd->disk); struct packet_command cgc; struct scsi_sense_hdr sshdr; unsigned char buf[64]; @@ -2001,11 +2003,11 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, } if (!(buf[6] & 0x40)) { - pkt_notice(pd, "disc type is not CD-RW\n"); + dev_notice(ddev, "disc type is not CD-RW\n"); return 1; } if (!(buf[6] & 0x4)) { - pkt_notice(pd, "A1 values on media are not valid, maybe not CDRW?\n"); + dev_notice(ddev, "A1 values on media are not valid, maybe not CDRW?\n"); return 1; } @@ -2025,25 +2027,26 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, *speed = us_clv_to_speed[sp]; break; default: - pkt_notice(pd, "unknown disc sub-type %d\n", st); + dev_notice(ddev, "unknown disc sub-type %d\n", st); return 1; } if (*speed) { - pkt_info(pd, "maximum media speed: %d\n", *speed); + dev_info(ddev, "maximum media speed: %d\n", *speed); return 0; } else { - pkt_notice(pd, "unknown speed %d for sub-type %d\n", sp, st); + dev_notice(ddev, "unknown speed %d for sub-type %d\n", sp, st); return 1; } } static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd) { + struct device *ddev = disk_to_dev(pd->disk); struct packet_command cgc; struct scsi_sense_hdr sshdr; int ret; - pkt_dbg(2, pd, "Performing OPC\n"); + dev_dbg(ddev, "Performing OPC\n"); init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); cgc.sshdr = &sshdr; @@ -2058,18 +2061,19 @@ static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd) static int pkt_open_write(struct pktcdvd_device *pd) { + struct device *ddev = disk_to_dev(pd->disk); int ret; unsigned int write_speed, media_write_speed, read_speed; ret = pkt_probe_settings(pd); if (ret) { - pkt_dbg(2, pd, "failed probe\n"); + dev_dbg(ddev, "failed probe\n"); return ret; } ret = pkt_set_write_settings(pd); if (ret) { - pkt_dbg(1, pd, "failed saving write settings\n"); + dev_notice(ddev, "failed saving write settings\n"); return -EIO; } @@ -2082,30 +2086,29 @@ static int pkt_open_write(struct pktcdvd_device *pd) case 0x13: /* DVD-RW */ case 0x1a: /* DVD+RW */ case 0x12: /* DVD-RAM */ - pkt_dbg(1, pd, "write speed %ukB/s\n", write_speed); + dev_notice(ddev, "write speed %ukB/s\n", write_speed); break; default: ret = pkt_media_speed(pd, &media_write_speed); if (ret) media_write_speed = 16; write_speed = min(write_speed, media_write_speed * 177); - pkt_dbg(1, pd, "write speed %ux\n", write_speed / 176); + dev_notice(ddev, "write speed %ux\n", write_speed / 176); break; } read_speed = write_speed; ret = pkt_set_speed(pd, write_speed, read_speed); if (ret) { - pkt_dbg(1, pd, "couldn't set write speed\n"); + dev_notice(ddev, "couldn't set write speed\n"); return -EIO; } pd->write_speed = write_speed; pd->read_speed = read_speed; ret = pkt_perform_opc(pd); - if (ret) { - pkt_dbg(1, pd, "Optimum Power Calibration failed\n"); - } + if (ret) + dev_notice(ddev, "Optimum Power Calibration failed\n"); return 0; } @@ -2115,6 +2118,7 @@ static int pkt_open_write(struct pktcdvd_device *pd) */ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) { + struct device *ddev = disk_to_dev(pd->disk); int ret; long lba; struct request_queue *q; @@ -2134,7 +2138,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) ret = pkt_get_last_written(pd, &lba); if (ret) { - pkt_err(pd, "pkt_get_last_written failed\n"); + dev_err(ddev, "pkt_get_last_written failed\n"); goto out_putdev; } @@ -2163,11 +2167,11 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) if (write) { if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) { - pkt_err(pd, "not enough memory for buffers\n"); + dev_err(ddev, "not enough memory for buffers\n"); ret = -ENOMEM; goto out_putdev; } - pkt_info(pd, "%lukB available on disc\n", lba << 1); + dev_info(ddev, "%lukB available on disc\n", lba << 1); } return 0; @@ -2184,8 +2188,10 @@ out: */ static void pkt_release_dev(struct pktcdvd_device *pd, int flush) { + struct device *ddev = disk_to_dev(pd->disk); + if (flush && pkt_flush_cache(pd)) - pkt_dbg(1, pd, "not flushing cache\n"); + dev_notice(ddev, "not flushing cache\n"); pkt_lock_door(pd, 0); @@ -2386,13 +2392,14 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio) static void pkt_submit_bio(struct bio *bio) { struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->queue->queuedata; + struct device *ddev = disk_to_dev(pd->disk); struct bio *split; bio = bio_split_to_limits(bio); if (!bio) return; - pkt_dbg(2, pd, "start = %6llx stop = %6llx\n", + dev_dbg(ddev, "start = %6llx stop = %6llx\n", (unsigned long long)bio->bi_iter.bi_sector, (unsigned long long)bio_end_sector(bio)); @@ -2405,13 +2412,13 @@ static void pkt_submit_bio(struct bio *bio) } if (!test_bit(PACKET_WRITABLE, &pd->flags)) { - pkt_notice(pd, "WRITE for ro device (%llu)\n", + dev_notice(ddev, "WRITE for ro device (%llu)\n", (unsigned long long)bio->bi_iter.bi_sector); goto end_io; } if (!bio->bi_iter.bi_size || (bio->bi_iter.bi_size % CD_FRAMESIZE)) { - pkt_err(pd, "wrong bio size\n"); + dev_err(ddev, "wrong bio size\n"); goto end_io; } @@ -2453,7 +2460,7 @@ static int pkt_seq_show(struct seq_file *m, void *p) char *msg; int states[PACKET_NUM_STATES]; - seq_printf(m, "Writer %s mapped to %pg:\n", pd->name, pd->bdev); + seq_printf(m, "Writer %s mapped to %pg:\n", pd->disk->disk_name, pd->bdev); seq_printf(m, "\nSettings:\n"); seq_printf(m, "\tpacket size:\t\t%dkB\n", pd->settings.size / 2); @@ -2509,12 +2516,13 @@ static int pkt_seq_show(struct seq_file *m, void *p) static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) { + struct device *ddev = disk_to_dev(pd->disk); int i; struct block_device *bdev; struct scsi_device *sdev; if (pd->pkt_dev == dev) { - pkt_err(pd, "recursive setup not allowed\n"); + dev_err(ddev, "recursive setup not allowed\n"); return -EBUSY; } for (i = 0; i < MAX_WRITERS; i++) { @@ -2522,11 +2530,11 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) if (!pd2) continue; if (pd2->bdev->bd_dev == dev) { - pkt_err(pd, "%pg already setup\n", pd2->bdev); + dev_err(ddev, "%pg already setup\n", pd2->bdev); return -EBUSY; } if (pd2->pkt_dev == dev) { - pkt_err(pd, "can't chain pktcdvd devices\n"); + dev_err(ddev, "can't chain pktcdvd devices\n"); return -EBUSY; } } @@ -2550,14 +2558,14 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) pkt_init_queue(pd); atomic_set(&pd->cdrw.pending_bios, 0); - pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->name); + pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->disk->disk_name); if (IS_ERR(pd->cdrw.thread)) { - pkt_err(pd, "can't start kernel thread\n"); + dev_err(ddev, "can't start kernel thread\n"); goto out_mem; } - proc_create_single_data(pd->name, 0, pkt_proc, pkt_seq_show, pd); - pkt_dbg(1, pd, "writer mapped to %pg\n", bdev); + proc_create_single_data(pd->disk->disk_name, 0, pkt_proc, pkt_seq_show, pd); + dev_notice(ddev, "writer mapped to %pg\n", bdev); return 0; out_mem: @@ -2570,10 +2578,10 @@ out_mem: static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct pktcdvd_device *pd = bdev->bd_disk->private_data; + struct device *ddev = disk_to_dev(pd->disk); int ret; - pkt_dbg(2, pd, "cmd %x, dev %d:%d\n", - cmd, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); + dev_dbg(ddev, "cmd %x, dev %d:%d\n", cmd, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); mutex_lock(&pktcdvd_mutex); switch (cmd) { @@ -2599,7 +2607,7 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, ret = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); break; default: - pkt_dbg(2, pd, "Unknown ioctl (%x)\n", cmd); + dev_dbg(ddev, "Unknown ioctl (%x)\n", cmd); ret = -ENOTTY; } mutex_unlock(&pktcdvd_mutex); @@ -2677,7 +2685,6 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) spin_lock_init(&pd->iosched.lock); bio_list_init(&pd->iosched.read_queue); bio_list_init(&pd->iosched.write_queue); - sprintf(pd->name, DRIVER_NAME"%d", idx); init_waitqueue_head(&pd->wqueue); pd->bio_queue = RB_ROOT; @@ -2694,7 +2701,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) disk->minors = 1; disk->fops = &pktcdvd_ops; disk->flags = GENHD_FL_REMOVABLE | GENHD_FL_NO_PART; - strcpy(disk->disk_name, pd->name); + snprintf(disk->disk_name, sizeof(disk->disk_name), DRIVER_NAME"%d", idx); disk->private_data = pd; pd->pkt_dev = MKDEV(pktdev_major, idx); @@ -2736,6 +2743,7 @@ out_mutex: static int pkt_remove_dev(dev_t pkt_dev) { struct pktcdvd_device *pd; + struct device *ddev; int idx; int ret = 0; @@ -2756,6 +2764,9 @@ static int pkt_remove_dev(dev_t pkt_dev) ret = -EBUSY; goto out; } + + ddev = disk_to_dev(pd->disk); + if (!IS_ERR(pd->cdrw.thread)) kthread_stop(pd->cdrw.thread); @@ -2766,8 +2777,8 @@ static int pkt_remove_dev(dev_t pkt_dev) blkdev_put(pd->bdev, FMODE_READ | FMODE_NDELAY); - remove_proc_entry(pd->name, pkt_proc); - pkt_dbg(1, pd, "writer unmapped\n"); + remove_proc_entry(pd->disk->disk_name, pkt_proc); + dev_notice(ddev, "writer unmapped\n"); del_gendisk(pd->disk); put_disk(pd->disk); diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h index f9c5ac80d59b..80cb00db42a4 100644 --- a/include/linux/pktcdvd.h +++ b/include/linux/pktcdvd.h @@ -156,7 +156,6 @@ struct pktcdvd_device { struct block_device *bdev; /* dev attached */ dev_t pkt_dev; /* our dev */ - char name[20]; struct packet_settings settings; struct packet_stats stats; int refcnt; /* Open count */ diff --git a/include/uapi/linux/pktcdvd.h b/include/uapi/linux/pktcdvd.h index 6a5552dfd6af..987a3022dc5f 100644 --- a/include/uapi/linux/pktcdvd.h +++ b/include/uapi/linux/pktcdvd.h @@ -16,6 +16,7 @@ #include /* + * UNUSED: * 1 for normal debug messages, 2 is very verbose. 0 to turn it off. */ #define PACKET_DEBUG 1 -- cgit v1.2.3 From d428487471ba6640ee8bcdabaf830aec08b85400 Mon Sep 17 00:00:00 2001 From: William Breathitt Gray Date: Sun, 16 Apr 2023 13:36:53 -0400 Subject: counter: i8254: Introduce the Intel 8254 interface library module Exposes consumer library functions providing support for interfaces compatible with the venerable Intel 8254 Programmable Interval Timer (PIT). The Intel 8254 PIT first appeared in the early 1980s and was used initially in IBM PC compatibles. The popularity of the original Intel 825x family of chips led to many subsequent variants and clones of the interface in various chips and integrated circuits. Although still popular, interfaces compatible with the Intel 8254 PIT are nowdays typically found embedded in larger VLSI processing chips and FPGA components rather than as discrete ICs. A CONFIG_I8254 Kconfig option is introduced by this patch. Modules wanting access to these i8254 library functions should select this Kconfig option, and import the I8254 symbol namespace. Link: https://lore.kernel.org/r/f6fe32c2db9525d816ab1a01f45abad56c081652.1681665189.git.william.gray@linaro.org/ Signed-off-by: William Breathitt Gray --- Documentation/ABI/testing/sysfs-bus-counter | 54 ++++ MAINTAINERS | 7 + drivers/counter/Kconfig | 15 + drivers/counter/Makefile | 1 + drivers/counter/counter-sysfs.c | 8 +- drivers/counter/i8254.c | 447 ++++++++++++++++++++++++++++ include/linux/i8254.h | 21 ++ include/uapi/linux/counter.h | 6 + 8 files changed, 558 insertions(+), 1 deletion(-) create mode 100644 drivers/counter/i8254.c create mode 100644 include/linux/i8254.h (limited to 'include/uapi/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-counter b/Documentation/ABI/testing/sysfs-bus-counter index 1417c4272c6c..dc3b3a5c876b 100644 --- a/Documentation/ABI/testing/sysfs-bus-counter +++ b/Documentation/ABI/testing/sysfs-bus-counter @@ -90,6 +90,60 @@ Description: counter does not freeze at the boundary points, but counts continuously throughout. + interrupt on terminal count: + The output signal is initially low, and will remain low + until the counter reaches zero. The output signal then + goes high and remains high until a new preset value is + set. + + hardware retriggerable one-shot: + The output signal is initially high. The output signal + will go low by a trigger input signal, and will remain + low until the counter reaches zero. The output will then + go high and remain high until the next trigger. A + trigger results in loading the counter to the preset + value and setting the output signal low, thus starting + the one-shot pulse. + + rate generator: + The output signal is initially high. When the counter + has decremented to 1, the output signal goes low for one + clock pulse. The output signal then goes high again, the + counter is reloaded to the preset value, and the process + repeats in a periodic manner as such. + + square wave mode: + The output signal is initially high. + + If the initial count is even, the counter is decremented + by two on succeeding clock pulses. When the count + expires, the output signal changes value and the + counter is reloaded to the preset value. The process + repeats in periodic manner as such. + + If the initial count is odd, the initial count minus one + (an even number) is loaded and then is decremented by + two on succeeding clock pulses. One clock pulse after + the count expires, the output signal goes low and the + counter is reloaded to the preset value minus one. + Succeeding clock pulses decrement the count by two. When + the count expires, the output goes high again and the + counter is reloaded to the preset value minus one. The + process repeats in a periodic manner as such. + + software triggered strobe: + The output signal is initially high. When the count + expires, the output will go low for one clock pulse and + then go high again. The counting sequence is "triggered" + by setting the preset value. + + hardware triggered strobe: + The output signal is initially high. Counting is started + by a trigger input signal. When the count expires, the + output signal will go low for one clock pulse and then + go high again. A trigger results in loading the counter + to the preset value. + What: /sys/bus/counter/devices/counterX/countY/count_mode_available What: /sys/bus/counter/devices/counterX/countY/error_noise_available What: /sys/bus/counter/devices/counterX/countY/function_available diff --git a/MAINTAINERS b/MAINTAINERS index 7e0b87d5aa2e..8d8b519f2597 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10257,6 +10257,13 @@ L: linux-fbdev@vger.kernel.org S: Maintained F: drivers/video/fbdev/i810/ +INTEL 8254 COUNTER DRIVER +M: William Breathitt Gray +L: linux-iio@vger.kernel.org +S: Maintained +F: drivers/counter/i8254.c +F: include/linux/i8254.h + INTEL 8255 GPIO DRIVER M: William Breathitt Gray L: linux-gpio@vger.kernel.org diff --git a/drivers/counter/Kconfig b/drivers/counter/Kconfig index f8b2a6508bb3..a61a4b9b8ec6 100644 --- a/drivers/counter/Kconfig +++ b/drivers/counter/Kconfig @@ -10,6 +10,21 @@ menuconfig COUNTER interface. You only need to enable this, if you also want to enable one or more of the counter device drivers below. +config I8254 + tristate + select COUNTER + select REGMAP + help + Enables support for the i8254 interface library functions. The i8254 + interface library provides functions to facilitate communication with + interfaces compatible with the venerable Intel 8254 Programmable + Interval Timer (PIT). The Intel 825x family of chips was first + released in the early 1980s but compatible interfaces are nowadays + typically found embedded in larger VLSI processing chips and FPGA + components. + + If built as a module its name will be i8254. + if COUNTER config 104_QUAD_8 diff --git a/drivers/counter/Makefile b/drivers/counter/Makefile index 933fdd50b3e4..fa3c1d08f706 100644 --- a/drivers/counter/Makefile +++ b/drivers/counter/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_COUNTER) += counter.o counter-y := counter-core.o counter-sysfs.o counter-chrdev.o +obj-$(CONFIG_I8254) += i8254.o obj-$(CONFIG_104_QUAD_8) += 104-quad-8.o obj-$(CONFIG_INTERRUPT_CNT) += interrupt-cnt.o obj-$(CONFIG_RZ_MTU3_CNT) += rz-mtu3-cnt.o diff --git a/drivers/counter/counter-sysfs.c b/drivers/counter/counter-sysfs.c index b9efe66f9f8d..42c523343d32 100644 --- a/drivers/counter/counter-sysfs.c +++ b/drivers/counter/counter-sysfs.c @@ -88,7 +88,13 @@ static const char *const counter_count_mode_str[] = { [COUNTER_COUNT_MODE_NORMAL] = "normal", [COUNTER_COUNT_MODE_RANGE_LIMIT] = "range limit", [COUNTER_COUNT_MODE_NON_RECYCLE] = "non-recycle", - [COUNTER_COUNT_MODE_MODULO_N] = "modulo-n" + [COUNTER_COUNT_MODE_MODULO_N] = "modulo-n", + [COUNTER_COUNT_MODE_INTERRUPT_ON_TERMINAL_COUNT] = "interrupt on terminal count", + [COUNTER_COUNT_MODE_HARDWARE_RETRIGGERABLE_ONESHOT] = "hardware retriggerable one-shot", + [COUNTER_COUNT_MODE_RATE_GENERATOR] = "rate generator", + [COUNTER_COUNT_MODE_SQUARE_WAVE_MODE] = "square wave mode", + [COUNTER_COUNT_MODE_SOFTWARE_TRIGGERED_STROBE] = "software triggered strobe", + [COUNTER_COUNT_MODE_HARDWARE_TRIGGERED_STROBE] = "hardware triggered strobe", }; static const char *const counter_signal_polarity_str[] = { diff --git a/drivers/counter/i8254.c b/drivers/counter/i8254.c new file mode 100644 index 000000000000..c41e4fdc9601 --- /dev/null +++ b/drivers/counter/i8254.c @@ -0,0 +1,447 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel 8254 Programmable Interval Timer + * Copyright (C) William Breathitt Gray + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define I8254_COUNTER_REG(_counter) (_counter) +#define I8254_CONTROL_REG 0x3 + +#define I8254_SC GENMASK(7, 6) +#define I8254_RW GENMASK(5, 4) +#define I8254_M GENMASK(3, 1) +#define I8254_CONTROL(_sc, _rw, _m) \ + (u8_encode_bits(_sc, I8254_SC) | u8_encode_bits(_rw, I8254_RW) | \ + u8_encode_bits(_m, I8254_M)) + +#define I8254_RW_TWO_BYTE 0x3 +#define I8254_MODE_INTERRUPT_ON_TERMINAL_COUNT 0 +#define I8254_MODE_HARDWARE_RETRIGGERABLE_ONESHOT 1 +#define I8254_MODE_RATE_GENERATOR 2 +#define I8254_MODE_SQUARE_WAVE_MODE 3 +#define I8254_MODE_SOFTWARE_TRIGGERED_STROBE 4 +#define I8254_MODE_HARDWARE_TRIGGERED_STROBE 5 + +#define I8254_COUNTER_LATCH(_counter) I8254_CONTROL(_counter, 0x0, 0x0) +#define I8254_PROGRAM_COUNTER(_counter, _mode) I8254_CONTROL(_counter, I8254_RW_TWO_BYTE, _mode) + +#define I8254_NUM_COUNTERS 3 + +/** + * struct i8254 - I8254 device private data structure + * @lock: synchronization lock to prevent I/O race conditions + * @preset: array of Counter Register states + * @out_mode: array of mode configuration states + * @map: Regmap for the device + */ +struct i8254 { + struct mutex lock; + u16 preset[I8254_NUM_COUNTERS]; + u8 out_mode[I8254_NUM_COUNTERS]; + struct regmap *map; +}; + +static int i8254_count_read(struct counter_device *const counter, struct counter_count *const count, + u64 *const val) +{ + struct i8254 *const priv = counter_priv(counter); + int ret; + u8 value[2]; + + mutex_lock(&priv->lock); + + ret = regmap_write(priv->map, I8254_CONTROL_REG, I8254_COUNTER_LATCH(count->id)); + if (ret) { + mutex_unlock(&priv->lock); + return ret; + } + ret = regmap_noinc_read(priv->map, I8254_COUNTER_REG(count->id), value, sizeof(value)); + if (ret) { + mutex_unlock(&priv->lock); + return ret; + } + + mutex_unlock(&priv->lock); + + *val = get_unaligned_le16(value); + + return ret; +} + +static int i8254_function_read(struct counter_device *const counter, + struct counter_count *const count, + enum counter_function *const function) +{ + *function = COUNTER_FUNCTION_DECREASE; + return 0; +} + +#define I8254_SYNAPSES_PER_COUNT 2 +#define I8254_SIGNAL_ID_CLK 0 +#define I8254_SIGNAL_ID_GATE 1 + +static int i8254_action_read(struct counter_device *const counter, + struct counter_count *const count, + struct counter_synapse *const synapse, + enum counter_synapse_action *const action) +{ + struct i8254 *const priv = counter_priv(counter); + + switch (synapse->signal->id % I8254_SYNAPSES_PER_COUNT) { + case I8254_SIGNAL_ID_CLK: + *action = COUNTER_SYNAPSE_ACTION_FALLING_EDGE; + return 0; + case I8254_SIGNAL_ID_GATE: + switch (priv->out_mode[count->id]) { + case I8254_MODE_HARDWARE_RETRIGGERABLE_ONESHOT: + case I8254_MODE_RATE_GENERATOR: + case I8254_MODE_SQUARE_WAVE_MODE: + case I8254_MODE_HARDWARE_TRIGGERED_STROBE: + *action = COUNTER_SYNAPSE_ACTION_RISING_EDGE; + return 0; + default: + *action = COUNTER_SYNAPSE_ACTION_NONE; + return 0; + } + default: + /* should never reach this path */ + return -EINVAL; + } +} + +static int i8254_count_ceiling_read(struct counter_device *const counter, + struct counter_count *const count, u64 *const ceiling) +{ + struct i8254 *const priv = counter_priv(counter); + + mutex_lock(&priv->lock); + + switch (priv->out_mode[count->id]) { + case I8254_MODE_RATE_GENERATOR: + /* Rate Generator decrements 0 by one and the counter "wraps around" */ + *ceiling = (priv->preset[count->id] == 0) ? U16_MAX : priv->preset[count->id]; + break; + case I8254_MODE_SQUARE_WAVE_MODE: + if (priv->preset[count->id] % 2) + *ceiling = priv->preset[count->id] - 1; + else if (priv->preset[count->id] == 0) + /* Square Wave Mode decrements 0 by two and the counter "wraps around" */ + *ceiling = U16_MAX - 1; + else + *ceiling = priv->preset[count->id]; + break; + default: + *ceiling = U16_MAX; + break; + } + + mutex_unlock(&priv->lock); + + return 0; +} + +static int i8254_count_mode_read(struct counter_device *const counter, + struct counter_count *const count, + enum counter_count_mode *const count_mode) +{ + const struct i8254 *const priv = counter_priv(counter); + + switch (priv->out_mode[count->id]) { + case I8254_MODE_INTERRUPT_ON_TERMINAL_COUNT: + *count_mode = COUNTER_COUNT_MODE_INTERRUPT_ON_TERMINAL_COUNT; + return 0; + case I8254_MODE_HARDWARE_RETRIGGERABLE_ONESHOT: + *count_mode = COUNTER_COUNT_MODE_HARDWARE_RETRIGGERABLE_ONESHOT; + return 0; + case I8254_MODE_RATE_GENERATOR: + *count_mode = COUNTER_COUNT_MODE_RATE_GENERATOR; + return 0; + case I8254_MODE_SQUARE_WAVE_MODE: + *count_mode = COUNTER_COUNT_MODE_SQUARE_WAVE_MODE; + return 0; + case I8254_MODE_SOFTWARE_TRIGGERED_STROBE: + *count_mode = COUNTER_COUNT_MODE_SOFTWARE_TRIGGERED_STROBE; + return 0; + case I8254_MODE_HARDWARE_TRIGGERED_STROBE: + *count_mode = COUNTER_COUNT_MODE_HARDWARE_TRIGGERED_STROBE; + return 0; + default: + /* should never reach this path */ + return -EINVAL; + } +} + +static int i8254_count_mode_write(struct counter_device *const counter, + struct counter_count *const count, + const enum counter_count_mode count_mode) +{ + struct i8254 *const priv = counter_priv(counter); + u8 out_mode; + int ret; + + switch (count_mode) { + case COUNTER_COUNT_MODE_INTERRUPT_ON_TERMINAL_COUNT: + out_mode = I8254_MODE_INTERRUPT_ON_TERMINAL_COUNT; + break; + case COUNTER_COUNT_MODE_HARDWARE_RETRIGGERABLE_ONESHOT: + out_mode = I8254_MODE_HARDWARE_RETRIGGERABLE_ONESHOT; + break; + case COUNTER_COUNT_MODE_RATE_GENERATOR: + out_mode = I8254_MODE_RATE_GENERATOR; + break; + case COUNTER_COUNT_MODE_SQUARE_WAVE_MODE: + out_mode = I8254_MODE_SQUARE_WAVE_MODE; + break; + case COUNTER_COUNT_MODE_SOFTWARE_TRIGGERED_STROBE: + out_mode = I8254_MODE_SOFTWARE_TRIGGERED_STROBE; + break; + case COUNTER_COUNT_MODE_HARDWARE_TRIGGERED_STROBE: + out_mode = I8254_MODE_HARDWARE_TRIGGERED_STROBE; + break; + default: + /* should never reach this path */ + return -EINVAL; + } + + mutex_lock(&priv->lock); + + /* Counter Register is cleared when the counter is programmed */ + priv->preset[count->id] = 0; + priv->out_mode[count->id] = out_mode; + ret = regmap_write(priv->map, I8254_CONTROL_REG, + I8254_PROGRAM_COUNTER(count->id, out_mode)); + + mutex_unlock(&priv->lock); + + return ret; +} + +static int i8254_count_floor_read(struct counter_device *const counter, + struct counter_count *const count, u64 *const floor) +{ + struct i8254 *const priv = counter_priv(counter); + + mutex_lock(&priv->lock); + + switch (priv->out_mode[count->id]) { + case I8254_MODE_RATE_GENERATOR: + /* counter is always reloaded after 1, but 0 is a possible reload value */ + *floor = (priv->preset[count->id] == 0) ? 0 : 1; + break; + case I8254_MODE_SQUARE_WAVE_MODE: + /* counter is always reloaded after 2 for even preset values */ + *floor = (priv->preset[count->id] % 2 || priv->preset[count->id] == 0) ? 0 : 2; + break; + default: + *floor = 0; + break; + } + + mutex_unlock(&priv->lock); + + return 0; +} + +static int i8254_count_preset_read(struct counter_device *const counter, + struct counter_count *const count, u64 *const preset) +{ + const struct i8254 *const priv = counter_priv(counter); + + *preset = priv->preset[count->id]; + + return 0; +} + +static int i8254_count_preset_write(struct counter_device *const counter, + struct counter_count *const count, const u64 preset) +{ + struct i8254 *const priv = counter_priv(counter); + int ret; + u8 value[2]; + + if (preset > U16_MAX) + return -ERANGE; + + mutex_lock(&priv->lock); + + if (priv->out_mode[count->id] == I8254_MODE_RATE_GENERATOR || + priv->out_mode[count->id] == I8254_MODE_SQUARE_WAVE_MODE) { + if (preset == 1) { + mutex_unlock(&priv->lock); + return -EINVAL; + } + } + + priv->preset[count->id] = preset; + + put_unaligned_le16(preset, value); + ret = regmap_noinc_write(priv->map, I8254_COUNTER_REG(count->id), value, 2); + + mutex_unlock(&priv->lock); + + return ret; +} + +static int i8254_init_hw(struct regmap *const map) +{ + unsigned long i; + int ret; + + for (i = 0; i < I8254_NUM_COUNTERS; i++) { + /* Initialize each counter to Mode 0 */ + ret = regmap_write(map, I8254_CONTROL_REG, + I8254_PROGRAM_COUNTER(i, I8254_MODE_INTERRUPT_ON_TERMINAL_COUNT)); + if (ret) + return ret; + } + + return 0; +} + +static const struct counter_ops i8254_ops = { + .count_read = i8254_count_read, + .function_read = i8254_function_read, + .action_read = i8254_action_read, +}; + +#define I8254_SIGNAL(_id, _name) { \ + .id = (_id), \ + .name = (_name), \ +} + +static struct counter_signal i8254_signals[] = { + I8254_SIGNAL(0, "CLK 0"), I8254_SIGNAL(1, "GATE 0"), + I8254_SIGNAL(2, "CLK 1"), I8254_SIGNAL(3, "GATE 1"), + I8254_SIGNAL(4, "CLK 2"), I8254_SIGNAL(5, "GATE 2"), +}; + +static const enum counter_synapse_action i8254_clk_actions[] = { + COUNTER_SYNAPSE_ACTION_FALLING_EDGE, +}; +static const enum counter_synapse_action i8254_gate_actions[] = { + COUNTER_SYNAPSE_ACTION_NONE, + COUNTER_SYNAPSE_ACTION_RISING_EDGE, +}; + +#define I8254_SYNAPSES_BASE(_id) ((_id) * I8254_SYNAPSES_PER_COUNT) +#define I8254_SYNAPSE_CLK(_id) { \ + .actions_list = i8254_clk_actions, \ + .num_actions = ARRAY_SIZE(i8254_clk_actions), \ + .signal = &i8254_signals[I8254_SYNAPSES_BASE(_id) + 0], \ +} +#define I8254_SYNAPSE_GATE(_id) { \ + .actions_list = i8254_gate_actions, \ + .num_actions = ARRAY_SIZE(i8254_gate_actions), \ + .signal = &i8254_signals[I8254_SYNAPSES_BASE(_id) + 1], \ +} + +static struct counter_synapse i8254_synapses[] = { + I8254_SYNAPSE_CLK(0), I8254_SYNAPSE_GATE(0), + I8254_SYNAPSE_CLK(1), I8254_SYNAPSE_GATE(1), + I8254_SYNAPSE_CLK(2), I8254_SYNAPSE_GATE(2), +}; + +static const enum counter_function i8254_functions_list[] = { + COUNTER_FUNCTION_DECREASE, +}; + +static const enum counter_count_mode i8254_count_modes[] = { + COUNTER_COUNT_MODE_INTERRUPT_ON_TERMINAL_COUNT, + COUNTER_COUNT_MODE_HARDWARE_RETRIGGERABLE_ONESHOT, + COUNTER_COUNT_MODE_RATE_GENERATOR, + COUNTER_COUNT_MODE_SQUARE_WAVE_MODE, + COUNTER_COUNT_MODE_SOFTWARE_TRIGGERED_STROBE, + COUNTER_COUNT_MODE_HARDWARE_TRIGGERED_STROBE, +}; + +static DEFINE_COUNTER_AVAILABLE(i8254_count_modes_available, i8254_count_modes); + +static struct counter_comp i8254_count_ext[] = { + COUNTER_COMP_CEILING(i8254_count_ceiling_read, NULL), + COUNTER_COMP_COUNT_MODE(i8254_count_mode_read, i8254_count_mode_write, + i8254_count_modes_available), + COUNTER_COMP_FLOOR(i8254_count_floor_read, NULL), + COUNTER_COMP_PRESET(i8254_count_preset_read, i8254_count_preset_write), +}; + +#define I8254_COUNT(_id, _name) { \ + .id = (_id), \ + .name = (_name), \ + .functions_list = i8254_functions_list, \ + .num_functions = ARRAY_SIZE(i8254_functions_list), \ + .synapses = &i8254_synapses[I8254_SYNAPSES_BASE(_id)], \ + .num_synapses = I8254_SYNAPSES_PER_COUNT, \ + .ext = i8254_count_ext, \ + .num_ext = ARRAY_SIZE(i8254_count_ext) \ +} + +static struct counter_count i8254_counts[I8254_NUM_COUNTERS] = { + I8254_COUNT(0, "Counter 0"), I8254_COUNT(1, "Counter 1"), I8254_COUNT(2, "Counter 2"), +}; + +/** + * devm_i8254_regmap_register - Register an i8254 Counter device + * @dev: device that is registering this i8254 Counter device + * @config: configuration for i8254_regmap_config + * + * Registers an Intel 8254 Programmable Interval Timer Counter device. Returns 0 on success and + * negative error number on failure. + */ +int devm_i8254_regmap_register(struct device *const dev, + const struct i8254_regmap_config *const config) +{ + struct counter_device *counter; + struct i8254 *priv; + int err; + + if (!config->parent) + return -EINVAL; + + if (!config->map) + return -EINVAL; + + counter = devm_counter_alloc(dev, sizeof(*priv)); + if (!counter) + return -ENOMEM; + priv = counter_priv(counter); + priv->map = config->map; + + counter->name = dev_name(config->parent); + counter->parent = config->parent; + counter->ops = &i8254_ops; + counter->counts = i8254_counts; + counter->num_counts = ARRAY_SIZE(i8254_counts); + counter->signals = i8254_signals; + counter->num_signals = ARRAY_SIZE(i8254_signals); + + mutex_init(&priv->lock); + + err = i8254_init_hw(priv->map); + if (err) + return err; + + err = devm_counter_add(dev, counter); + if (err < 0) + return dev_err_probe(dev, err, "Failed to add counter\n"); + + return 0; +} +EXPORT_SYMBOL_NS_GPL(devm_i8254_regmap_register, I8254); + +MODULE_AUTHOR("William Breathitt Gray"); +MODULE_DESCRIPTION("Intel 8254 Programmable Interval Timer"); +MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(COUNTER); diff --git a/include/linux/i8254.h b/include/linux/i8254.h new file mode 100644 index 000000000000..a675c309232b --- /dev/null +++ b/include/linux/i8254.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) William Breathitt Gray */ +#ifndef _I8254_H_ +#define _I8254_H_ + +struct device; +struct regmap; + +/** + * struct i8254_regmap_config - Configuration for the register map of an i8254 + * @parent: parent device + * @map: regmap for the i8254 + */ +struct i8254_regmap_config { + struct device *parent; + struct regmap *map; +}; + +int devm_i8254_regmap_register(struct device *dev, const struct i8254_regmap_config *config); + +#endif /* _I8254_H_ */ diff --git a/include/uapi/linux/counter.h b/include/uapi/linux/counter.h index 8ab12d731e3b..fc248ef00e86 100644 --- a/include/uapi/linux/counter.h +++ b/include/uapi/linux/counter.h @@ -127,6 +127,12 @@ enum counter_count_mode { COUNTER_COUNT_MODE_RANGE_LIMIT, COUNTER_COUNT_MODE_NON_RECYCLE, COUNTER_COUNT_MODE_MODULO_N, + COUNTER_COUNT_MODE_INTERRUPT_ON_TERMINAL_COUNT, + COUNTER_COUNT_MODE_HARDWARE_RETRIGGERABLE_ONESHOT, + COUNTER_COUNT_MODE_RATE_GENERATOR, + COUNTER_COUNT_MODE_SQUARE_WAVE_MODE, + COUNTER_COUNT_MODE_SOFTWARE_TRIGGERED_STROBE, + COUNTER_COUNT_MODE_HARDWARE_TRIGGERED_STROBE, }; /* Count function values */ -- cgit v1.2.3 From 0c59922c769a1361d4699ef6694b59031767a74e Mon Sep 17 00:00:00 2001 From: Greentime Hu Date: Mon, 5 Jun 2023 11:07:09 +0000 Subject: riscv: Add ptrace vector support This patch adds ptrace support for riscv vector. The vector registers will be saved in datap pointer of __riscv_v_ext_state. This pointer will be set right after the __riscv_v_ext_state data structure then it will be put in ubuf for ptrace system call to get or set. It will check if the datap got from ubuf is set to the correct address or not when the ptrace system call is trying to set the vector registers. Co-developed-by: Vincent Chen Signed-off-by: Vincent Chen Signed-off-by: Greentime Hu Signed-off-by: Andy Chiu Reviewed-by: Conor Dooley Reviewed-by: Palmer Dabbelt Link: https://lore.kernel.org/r/20230605110724.21391-13-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/uapi/asm/ptrace.h | 7 ++++ arch/riscv/kernel/ptrace.c | 70 ++++++++++++++++++++++++++++++++++++ include/uapi/linux/elf.h | 1 + 3 files changed, 78 insertions(+) (limited to 'include/uapi/linux') diff --git a/arch/riscv/include/uapi/asm/ptrace.h b/arch/riscv/include/uapi/asm/ptrace.h index 586786d023c4..e8d127ec5cf7 100644 --- a/arch/riscv/include/uapi/asm/ptrace.h +++ b/arch/riscv/include/uapi/asm/ptrace.h @@ -94,6 +94,13 @@ struct __riscv_v_ext_state { */ }; +/* + * According to spec: The number of bits in a single vector register, + * VLEN >= ELEN, which must be a power of 2, and must be no greater than + * 2^16 = 65536bits = 8192bytes + */ +#define RISCV_MAX_VLENB (8192) + #endif /* __ASSEMBLY__ */ #endif /* _UAPI_ASM_RISCV_PTRACE_H */ diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c index 23c48b14a0e7..1d572cf3140f 100644 --- a/arch/riscv/kernel/ptrace.c +++ b/arch/riscv/kernel/ptrace.c @@ -7,6 +7,7 @@ * Copied from arch/tile/kernel/ptrace.c */ +#include #include #include #include @@ -24,6 +25,9 @@ enum riscv_regset { #ifdef CONFIG_FPU REGSET_F, #endif +#ifdef CONFIG_RISCV_ISA_V + REGSET_V, +#endif }; static int riscv_gpr_get(struct task_struct *target, @@ -80,6 +84,61 @@ static int riscv_fpr_set(struct task_struct *target, } #endif +#ifdef CONFIG_RISCV_ISA_V +static int riscv_vr_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct __riscv_v_ext_state *vstate = &target->thread.vstate; + + if (!riscv_v_vstate_query(task_pt_regs(target))) + return -EINVAL; + + /* + * Ensure the vector registers have been saved to the memory before + * copying them to membuf. + */ + if (target == current) + riscv_v_vstate_save(current, task_pt_regs(current)); + + /* Copy vector header from vstate. */ + membuf_write(&to, vstate, offsetof(struct __riscv_v_ext_state, datap)); + membuf_zero(&to, sizeof(vstate->datap)); + + /* Copy all the vector registers from vstate. */ + return membuf_write(&to, vstate->datap, riscv_v_vsize); +} + +static int riscv_vr_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret, size; + struct __riscv_v_ext_state *vstate = &target->thread.vstate; + + if (!riscv_v_vstate_query(task_pt_regs(target))) + return -EINVAL; + + /* Copy rest of the vstate except datap */ + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, vstate, 0, + offsetof(struct __riscv_v_ext_state, datap)); + if (unlikely(ret)) + return ret; + + /* Skip copy datap. */ + size = sizeof(vstate->datap); + count -= size; + ubuf += size; + + /* Copy all the vector registers. */ + pos = 0; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, vstate->datap, + 0, riscv_v_vsize); + return ret; +} +#endif + static const struct user_regset riscv_user_regset[] = { [REGSET_X] = { .core_note_type = NT_PRSTATUS, @@ -99,6 +158,17 @@ static const struct user_regset riscv_user_regset[] = { .set = riscv_fpr_set, }, #endif +#ifdef CONFIG_RISCV_ISA_V + [REGSET_V] = { + .core_note_type = NT_RISCV_VECTOR, + .align = 16, + .n = ((32 * RISCV_MAX_VLENB) + + sizeof(struct __riscv_v_ext_state)) / sizeof(__u32), + .size = sizeof(__u32), + .regset_get = riscv_vr_get, + .set = riscv_vr_set, + }, +#endif }; static const struct user_regset_view riscv_user_native_view = { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index ac3da855fb19..7d8d9ae36615 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -440,6 +440,7 @@ typedef struct elf64_shdr { #define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ #define NT_MIPS_FP_MODE 0x801 /* MIPS floating-point mode */ #define NT_MIPS_MSA 0x802 /* MIPS SIMD registers */ +#define NT_RISCV_VECTOR 0x900 /* RISC-V vector registers */ #define NT_LOONGARCH_CPUCFG 0xa00 /* LoongArch CPU config registers */ #define NT_LOONGARCH_CSR 0xa01 /* LoongArch control and status registers */ #define NT_LOONGARCH_LSX 0xa02 /* LoongArch Loongson SIMD Extension registers */ -- cgit v1.2.3 From 1fd96a3e9d5d4febe1a8486590ad52c048d1be77 Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Mon, 5 Jun 2023 11:07:18 +0000 Subject: riscv: Add prctl controls for userspace vector management This patch add two riscv-specific prctls, to allow usespace control the use of vector unit: * PR_RISCV_V_SET_CONTROL: control the permission to use Vector at next, or all following execve for a thread. Turning off a thread's Vector live is not possible since libraries may have registered ifunc that may execute Vector instructions. * PR_RISCV_V_GET_CONTROL: get the same permission setting for the current thread, and the setting for following execve(s). Signed-off-by: Andy Chiu Reviewed-by: Greentime Hu Reviewed-by: Vincent Chen Link: https://lore.kernel.org/r/20230605110724.21391-22-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/processor.h | 10 ++++ arch/riscv/include/asm/vector.h | 4 ++ arch/riscv/kernel/cpufeature.c | 9 ++- arch/riscv/kernel/process.c | 1 + arch/riscv/kernel/vector.c | 114 +++++++++++++++++++++++++++++++++++++ arch/riscv/kvm/vcpu.c | 2 + include/uapi/linux/prctl.h | 11 ++++ kernel/sys.c | 12 ++++ 8 files changed, 162 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 38ded8c5f207..e82af1097e26 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -40,6 +40,7 @@ struct thread_struct { unsigned long s[12]; /* s[0]: frame pointer */ struct __riscv_d_ext_state fstate; unsigned long bad_cause; + unsigned long vstate_ctrl; struct __riscv_v_ext_state vstate; }; @@ -83,6 +84,15 @@ extern void riscv_fill_hwcap(void); extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); extern unsigned long signal_minsigstksz __ro_after_init; + +#ifdef CONFIG_RISCV_ISA_V +/* Userspace interface for PR_RISCV_V_{SET,GET}_VS prctl()s: */ +#define RISCV_V_SET_CONTROL(arg) riscv_v_vstate_ctrl_set_current(arg) +#define RISCV_V_GET_CONTROL() riscv_v_vstate_ctrl_get_current() +extern long riscv_v_vstate_ctrl_set_current(unsigned long arg); +extern long riscv_v_vstate_ctrl_get_current(void); +#endif /* CONFIG_RISCV_ISA_V */ + #endif /* __ASSEMBLY__ */ #endif /* _ASM_RISCV_PROCESSOR_H */ diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h index 8e56da67b5cf..04c0b07bf6cd 100644 --- a/arch/riscv/include/asm/vector.h +++ b/arch/riscv/include/asm/vector.h @@ -160,6 +160,9 @@ static inline void __switch_to_vector(struct task_struct *prev, riscv_v_vstate_restore(next, task_pt_regs(next)); } +void riscv_v_vstate_ctrl_init(struct task_struct *tsk); +bool riscv_v_vstate_ctrl_user_allowed(void); + #else /* ! CONFIG_RISCV_ISA_V */ struct pt_regs; @@ -168,6 +171,7 @@ static inline int riscv_v_setup_vsize(void) { return -EOPNOTSUPP; } static __always_inline bool has_vector(void) { return false; } static inline bool riscv_v_first_use_handler(struct pt_regs *regs) { return false; } static inline bool riscv_v_vstate_query(struct pt_regs *regs) { return false; } +static inline bool riscv_v_vstate_ctrl_user_allowed(void) { return false; } #define riscv_v_vsize (0) #define riscv_v_vstate_save(task, regs) do {} while (0) #define riscv_v_vstate_restore(task, regs) do {} while (0) diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 29c0680652a0..8ae43e40fffc 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -295,7 +295,14 @@ void __init riscv_fill_hwcap(void) unsigned long riscv_get_elf_hwcap(void) { - return (elf_hwcap & ((1UL << RISCV_ISA_EXT_BASE) - 1)); + unsigned long hwcap; + + hwcap = (elf_hwcap & ((1UL << RISCV_ISA_EXT_BASE) - 1)); + + if (!riscv_v_vstate_ctrl_user_allowed()) + hwcap &= ~COMPAT_HWCAP_ISA_V; + + return hwcap; } #ifdef CONFIG_RISCV_ALTERNATIVE diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 78eb5ac45888..e32d737e039f 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -149,6 +149,7 @@ void flush_thread(void) #endif #ifdef CONFIG_RISCV_ISA_V /* Reset vector state */ + riscv_v_vstate_ctrl_init(current); riscv_v_vstate_off(task_pt_regs(current)); kfree(current->thread.vstate.datap); memset(¤t->thread.vstate, 0, sizeof(struct __riscv_v_ext_state)); diff --git a/arch/riscv/kernel/vector.c b/arch/riscv/kernel/vector.c index 9d81d1b2a7f3..a7dec9230164 100644 --- a/arch/riscv/kernel/vector.c +++ b/arch/riscv/kernel/vector.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -19,6 +20,8 @@ #include #include +static bool riscv_v_implicit_uacc = IS_ENABLED(CONFIG_RISCV_ISA_V_DEFAULT_ENABLE); + unsigned long riscv_v_vsize __read_mostly; EXPORT_SYMBOL_GPL(riscv_v_vsize); @@ -91,6 +94,43 @@ static int riscv_v_thread_zalloc(void) return 0; } +#define VSTATE_CTRL_GET_CUR(x) ((x) & PR_RISCV_V_VSTATE_CTRL_CUR_MASK) +#define VSTATE_CTRL_GET_NEXT(x) (((x) & PR_RISCV_V_VSTATE_CTRL_NEXT_MASK) >> 2) +#define VSTATE_CTRL_MAKE_NEXT(x) (((x) << 2) & PR_RISCV_V_VSTATE_CTRL_NEXT_MASK) +#define VSTATE_CTRL_GET_INHERIT(x) (!!((x) & PR_RISCV_V_VSTATE_CTRL_INHERIT)) +static inline int riscv_v_ctrl_get_cur(struct task_struct *tsk) +{ + return VSTATE_CTRL_GET_CUR(tsk->thread.vstate_ctrl); +} + +static inline int riscv_v_ctrl_get_next(struct task_struct *tsk) +{ + return VSTATE_CTRL_GET_NEXT(tsk->thread.vstate_ctrl); +} + +static inline bool riscv_v_ctrl_test_inherit(struct task_struct *tsk) +{ + return VSTATE_CTRL_GET_INHERIT(tsk->thread.vstate_ctrl); +} + +static inline void riscv_v_ctrl_set(struct task_struct *tsk, int cur, int nxt, + bool inherit) +{ + unsigned long ctrl; + + ctrl = cur & PR_RISCV_V_VSTATE_CTRL_CUR_MASK; + ctrl |= VSTATE_CTRL_MAKE_NEXT(nxt); + if (inherit) + ctrl |= PR_RISCV_V_VSTATE_CTRL_INHERIT; + tsk->thread.vstate_ctrl = ctrl; +} + +bool riscv_v_vstate_ctrl_user_allowed(void) +{ + return riscv_v_ctrl_get_cur(current) == PR_RISCV_V_VSTATE_CTRL_ON; +} +EXPORT_SYMBOL_GPL(riscv_v_vstate_ctrl_user_allowed); + bool riscv_v_first_use_handler(struct pt_regs *regs) { u32 __user *epc = (u32 __user *)regs->epc; @@ -129,3 +169,77 @@ bool riscv_v_first_use_handler(struct pt_regs *regs) riscv_v_vstate_on(regs); return true; } + +void riscv_v_vstate_ctrl_init(struct task_struct *tsk) +{ + bool inherit; + int cur, next; + + if (!has_vector()) + return; + + next = riscv_v_ctrl_get_next(tsk); + if (!next) { + if (riscv_v_implicit_uacc) + cur = PR_RISCV_V_VSTATE_CTRL_ON; + else + cur = PR_RISCV_V_VSTATE_CTRL_OFF; + } else { + cur = next; + } + /* Clear next mask if inherit-bit is not set */ + inherit = riscv_v_ctrl_test_inherit(tsk); + if (!inherit) + next = PR_RISCV_V_VSTATE_CTRL_DEFAULT; + + riscv_v_ctrl_set(tsk, cur, next, inherit); +} + +long riscv_v_vstate_ctrl_get_current(void) +{ + if (!has_vector()) + return -EINVAL; + + return current->thread.vstate_ctrl & PR_RISCV_V_VSTATE_CTRL_MASK; +} + +long riscv_v_vstate_ctrl_set_current(unsigned long arg) +{ + bool inherit; + int cur, next; + + if (!has_vector()) + return -EINVAL; + + if (arg & ~PR_RISCV_V_VSTATE_CTRL_MASK) + return -EINVAL; + + cur = VSTATE_CTRL_GET_CUR(arg); + switch (cur) { + case PR_RISCV_V_VSTATE_CTRL_OFF: + /* Do not allow user to turn off V if current is not off */ + if (riscv_v_ctrl_get_cur(current) != PR_RISCV_V_VSTATE_CTRL_OFF) + return -EPERM; + + break; + case PR_RISCV_V_VSTATE_CTRL_ON: + break; + case PR_RISCV_V_VSTATE_CTRL_DEFAULT: + cur = riscv_v_ctrl_get_cur(current); + break; + default: + return -EINVAL; + } + + next = VSTATE_CTRL_GET_NEXT(arg); + inherit = VSTATE_CTRL_GET_INHERIT(arg); + switch (next) { + case PR_RISCV_V_VSTATE_CTRL_DEFAULT: + case PR_RISCV_V_VSTATE_CTRL_OFF: + case PR_RISCV_V_VSTATE_CTRL_ON: + riscv_v_ctrl_set(current, cur, next, inherit); + return 0; + } + + return -EINVAL; +} diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index e5e045852e6a..de24127e7e93 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -88,6 +88,8 @@ static bool kvm_riscv_vcpu_isa_enable_allowed(unsigned long ext) switch (ext) { case KVM_RISCV_ISA_EXT_H: return false; + case KVM_RISCV_ISA_EXT_V: + return riscv_v_vstate_ctrl_user_allowed(); default: break; } diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index f23d9a16507f..3c36aeade991 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -294,4 +294,15 @@ struct prctl_mm_map { #define PR_SET_MEMORY_MERGE 67 #define PR_GET_MEMORY_MERGE 68 + +#define PR_RISCV_V_SET_CONTROL 69 +#define PR_RISCV_V_GET_CONTROL 70 +# define PR_RISCV_V_VSTATE_CTRL_DEFAULT 0 +# define PR_RISCV_V_VSTATE_CTRL_OFF 1 +# define PR_RISCV_V_VSTATE_CTRL_ON 2 +# define PR_RISCV_V_VSTATE_CTRL_INHERIT (1 << 4) +# define PR_RISCV_V_VSTATE_CTRL_CUR_MASK 0x3 +# define PR_RISCV_V_VSTATE_CTRL_NEXT_MASK 0xc +# define PR_RISCV_V_VSTATE_CTRL_MASK 0x1f + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 339fee3eff6a..05f838929e72 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -140,6 +140,12 @@ #ifndef GET_TAGGED_ADDR_CTRL # define GET_TAGGED_ADDR_CTRL() (-EINVAL) #endif +#ifndef RISCV_V_SET_CONTROL +# define RISCV_V_SET_CONTROL(a) (-EINVAL) +#endif +#ifndef RISCV_V_GET_CONTROL +# define RISCV_V_GET_CONTROL() (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -2708,6 +2714,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags); break; #endif + case PR_RISCV_V_SET_CONTROL: + error = RISCV_V_SET_CONTROL(arg2); + break; + case PR_RISCV_V_GET_CONTROL: + error = RISCV_V_GET_CONTROL(); + break; default: error = -EINVAL; break; -- cgit v1.2.3 From ef75a6ef37235e211bbdb17c25e5f79c55df1750 Mon Sep 17 00:00:00 2001 From: Rajneesh Bhardwaj Date: Thu, 3 Mar 2022 10:56:05 -0500 Subject: drm/amdkfd: Update coherence settings for svm ranges Recently introduced commit "drm/amdgpu: Set cache coherency for GC 9.4.3" did not update the settings applicable for svm ranges. Add the coherence settings for svm ranges for GFX IP 9.4.3. Reviewed-by: Amber Lin Signed-off-by: Rajneesh Bhardwaj Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 17 +++++++++++++++++ include/uapi/linux/kfd_ioctl.h | 2 ++ 2 files changed, 19 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 96ccff79902c..4b4f3bf8b823 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1159,6 +1159,7 @@ svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange, uint64_t pte_flags; bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN); bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT; + bool uncached = flags & KFD_IOCTL_SVM_FLAG_UNCACHED; if (domain == SVM_RANGE_VRAM_DOMAIN) bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev); @@ -1198,6 +1199,22 @@ svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange, AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; } break; + case IP_VERSION(9, 4, 3): + //TODO: Need more work for handling multiple memory partitions + //e.g. NPS4. Current approch is only applicable without memory + //partitions. + snoop = true; + if (uncached) + mapping_flags |= AMDGPU_VM_MTYPE_UC; + /* local HBM region close to partition*/ + else if (bo_adev == adev) + mapping_flags |= AMDGPU_VM_MTYPE_RW; + /* local HBM region far from partition or remote XGMI GPU or + * system memory + */ + else + mapping_flags |= AMDGPU_VM_MTYPE_NC; + break; default: mapping_flags |= coherent ? AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 2da5c3ad71bd..2a9671e1ddb5 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -623,6 +623,8 @@ enum kfd_mmio_remap { #define KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY 0x00000020 /* Keep GPU memory mapping always valid as if XNACK is disable */ #define KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED 0x00000040 +/* Uncached access to memory */ +#define KFD_IOCTL_SVM_FLAG_UNCACHED 0x00000080 /** * kfd_ioctl_svm_op - SVM ioctl operations -- cgit v1.2.3 From 9de30f579980b498606a9c2440b73ae3b670771b Mon Sep 17 00:00:00 2001 From: Daniel Almeida Date: Mon, 6 Mar 2023 16:18:50 +0000 Subject: media: Add AV1 uAPI This patch adds the AOMedia Video 1 (AV1) kernel uAPI. This design is based on currently available AV1 API implementations and aims to support the development of AV1 stateless video codecs on Linux. Signed-off-by: Daniel Almeida Co-developed-by: Nicolas Dufresne Signed-off-by: Nicolas Dufresne Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/userspace-api/media/v4l/biblio.rst | 9 + .../media/v4l/ext-ctrls-codec-stateless.rst | 1209 +++++++++++++++++++- .../userspace-api/media/v4l/pixfmt-compressed.rst | 16 + .../userspace-api/media/v4l/vidioc-g-ext-ctrls.rst | 16 + .../userspace-api/media/v4l/vidioc-queryctrl.rst | 24 + .../userspace-api/media/videodev2.h.rst.exceptions | 4 + drivers/media/v4l2-core/v4l2-ctrls-core.c | 258 +++++ drivers/media/v4l2-core/v4l2-ctrls-defs.c | 61 + drivers/media/v4l2-core/v4l2-ioctl.c | 1 + include/media/v4l2-ctrls.h | 8 + include/uapi/linux/v4l2-controls.h | 721 ++++++++++++ include/uapi/linux/videodev2.h | 10 + 12 files changed, 2335 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/biblio.rst b/Documentation/userspace-api/media/v4l/biblio.rst index 9cd18c153d19..72aef1759b60 100644 --- a/Documentation/userspace-api/media/v4l/biblio.rst +++ b/Documentation/userspace-api/media/v4l/biblio.rst @@ -427,3 +427,12 @@ VP9 :title: VP9 Bitstream & Decoding Process Specification :author: Adrian Grange (Google), Peter de Rivaz (Argon Design), Jonathan Hunt (Argon Design) + +.. _av1: + +AV1 +=== + +:title: AV1 Bitstream & Decoding Process Specification + +:author: Peter de Rivaz, Argon Design Ltd, Jack Haughton, Argon Design Ltd diff --git a/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst b/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst index 677bf0fbcc5a..81e60f4002c8 100644 --- a/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst +++ b/Documentation/userspace-api/media/v4l/ext-ctrls-codec-stateless.rst @@ -1890,11 +1890,11 @@ params syntax' of the :ref:`vp9` specification for more details. * - __u8 - ``tree_probs[7]`` - Specifies the probability values to be used when decoding a Segment-ID. - See '5.15. Segmentation map' section of :ref:`vp9` for more details. + See '5.15 Segmentation map' section of :ref:`vp9` for more details. * - __u8 - ``pred_probs[3]`` - Specifies the probability values to be used when decoding a - Predicted-Segment-ID. See '6.4.14. Get segment id syntax' + Predicted-Segment-ID. See '6.4.14 Get segment id syntax' section of :ref:`vp9` for more details. * - __u8 - ``flags`` @@ -2957,3 +2957,1208 @@ This structure contains all loop filter related parameters. See sections * - ``V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR`` - 0x00000004 - + +.. _v4l2-codec-stateless-av1: + +``V4L2_CID_STATELESS_AV1_SEQUENCE (struct)`` + Represents an AV1 Sequence OBU (Open Bitstream Unit). See section 5.5 + "Sequence header OBU syntax" in :ref:`av1` for more details. + +.. c:type:: v4l2_ctrl_av1_sequence + +.. cssclass:: longtable + +.. tabularcolumns:: |p{5.8cm}|p{4.8cm}|p{6.6cm}| + +.. flat-table:: struct v4l2_ctrl_av1_sequence + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u32 + - ``flags`` + - See :ref:`AV1 Sequence Flags `. + * - __u8 + - ``seq_profile`` + - Specifies the features that can be used in the coded video sequence. + * - __u8 + - ``order_hint_bits`` + - Specifies the number of bits used for the order_hint field at each frame. + * - __u8 + - ``bit_depth`` + - the bit depth to use for the sequence as described in section 5.5.2 + "Color config syntax" in :ref:`av1` for more details. + * - __u8 + - ``reserved`` + - Applications and drivers must set this to zero. + * - __u16 + - ``max_frame_width_minus_1`` + - specifies the maximum frame width minus 1 for the frames represented by + this sequence header. + +.. _av1_sequence_flags: + +``AV1 Sequence Flags`` + +.. cssclass:: longtable + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_AV1_SEQUENCE_FLAG_STILL_PICTURE`` + - 0x00000001 + - If set, specifies that the coded video sequence contains only one coded + frame. If not set, specifies that the coded video sequence contains one + or more coded frames. + * - ``V4L2_AV1_SEQUENCE_FLAG_USE_128X128_SUPERBLOCK`` + - 0x00000002 + - If set, indicates that superblocks contain 128x128 luma samples. + When equal to 0, it indicates that superblocks contain 64x64 luma + samples. The number of contained chroma samples depends on + subsampling_x and subsampling_y. + * - ``V4L2_AV1_SEQUENCE_FLAG_ENABLE_FILTER_INTRA`` + - 0x00000004 + - If set, specifies that the use_filter_intra syntax element may be + present. If not set, specifies that the use_filter_intra syntax element + will not be present. + * - ``V4L2_AV1_SEQUENCE_FLAG_ENABLE_INTRA_EDGE_FILTER`` + - 0x00000008 + - Specifies whether the intra edge filtering process should be enabled. + * - ``V4L2_AV1_SEQUENCE_FLAG_ENABLE_INTERINTRA_COMPOUND`` + - 0x00000010 + - If set, specifies that the mode info for inter blocks may contain the + syntax element interintra. If not set, specifies that the syntax element + interintra will not be present. + * - ``V4L2_AV1_SEQUENCE_FLAG_ENABLE_MASKED_COMPOUND`` + - 0x00000020 + - If set, specifies that the mode info for inter blocks may contain the + syntax element compound_type. If not set, specifies that the syntax + element compound_type will not be present. + * - ``V4L2_AV1_SEQUENCE_FLAG_ENABLE_WARPED_MOTION`` + - 0x00000040 + - If set, indicates that the allow_warped_motion syntax element may be + present. If not set, indicates that the allow_warped_motion syntax + element will not be present. + * - ``V4L2_AV1_SEQUENCE_FLAG_ENABLE_DUAL_FILTER`` + - 0x00000080 + - If set, indicates that the inter prediction filter type may be specified + independently in the horizontal and vertical directions. If the flag is + equal to 0, only one filter type may be specified, which is then used in + both directions. + * - ``V4L2_AV1_SEQUENCE_FLAG_ENABLE_ORDER_HINT`` + - 0x00000100 + - If set, indicates that tools based on the values of order hints may be + used. If not set, indicates that tools based on order hints are + disabled. + * - ``V4L2_AV1_SEQUENCE_FLAG_ENABLE_JNT_COMP`` + - 0x00000200 + - If set, indicates that the distance weights process may be used for + inter prediction. + * - ``V4L2_AV1_SEQUENCE_FLAG_ENABLE_REF_FRAME_MVS`` + - 0x00000400 + - If set, indicates that the use_ref_frame_mvs syntax element may be + present. If not set, indicates that the use_ref_frame_mvs syntax element + will not be present. + * - ``V4L2_AV1_SEQUENCE_FLAG_ENABLE_SUPERRES`` + - 0x00000800 + - If set, specifies that the use_superres syntax element will be present + in the uncompressed header. If not set, specifies that the use_superres + syntax element will not be present (instead use_superres will be set to + 0 in the uncompressed header without being read). + * - ``V4L2_AV1_SEQUENCE_FLAG_ENABLE_CDEF`` + - 0x00001000 + - If set, specifies that cdef filtering may be enabled. If not set, + specifies that cdef filtering is disabled. + * - ``V4L2_AV1_SEQUENCE_FLAG_ENABLE_RESTORATION`` + - 0x00002000 + - If set, specifies that loop restoration filtering may be enabled. If not + set, specifies that loop restoration filtering is disabled. + * - ``V4L2_AV1_SEQUENCE_FLAG_MONO_CHROME`` + - 0x00004000 + - If set, indicates that the video does not contain U and V color planes. + If not set, indicates that the video contains Y, U, and V color planes. + * - ``V4L2_AV1_SEQUENCE_FLAG_COLOR_RANGE`` + - 0x00008000 + - If set, signals full swing representation, i.e. "Full Range + Quantization". If not set, signals studio swing representation, i.e. + "Limited Range Quantization". + * - ``V4L2_AV1_SEQUENCE_FLAG_SUBSAMPLING_X`` + - 0x00010000 + - Specify the chroma subsampling format. + * - ``V4L2_AV1_SEQUENCE_FLAG_SUBSAMPLING_Y`` + - 0x00020000 + - Specify the chroma subsampling format. + * - ``V4L2_AV1_SEQUENCE_FLAG_FILM_GRAIN_PARAMS_PRESENT`` + - 0x00040000 + - Specifies whether film grain parameters are present in the coded video + sequence. + * - ``V4L2_AV1_SEQUENCE_FLAG_SEPARATE_UV_DELTA_Q`` + - 0x00080000 + - If set, indicates that the U and V planes may have separate delta + quantizer values. If not set, indicates that the U and V planes will share + the same delta quantizer value. + +``V4L2_CID_STATELESS_AV1_TILE_GROUP_ENTRY (struct)`` + Represents a single AV1 tile inside an AV1 Tile Group. Note that MiRowStart, + MiRowEnd, MiColStart and MiColEnd can be retrieved from struct + v4l2_av1_tile_info in struct v4l2_ctrl_av1_frame using tile_row and + tile_col. See section 6.10.1 "General tile group OBU semantics" in + :ref:`av1` for more details. + +.. c:type:: v4l2_ctrl_av1_tile_group_entry + +.. cssclass:: longtable + +.. tabularcolumns:: |p{5.8cm}|p{4.8cm}|p{6.6cm}| + +.. flat-table:: struct v4l2_ctrl_av1_tile_group_entry + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u32 + - ``tile_offset`` + - Offset from the OBU data, i.e. where the coded tile data actually starts. + * - __u32 + - ``tile_size`` + - Specifies the size in bytes of the coded tile. Equivalent to "TileSize" + in :ref:`av1`. + * - __u32 + - ``tile_row`` + - Specifies the row of the current tile. Equivalent to "TileRow" in + :ref:`av1`. + * - __u32 + - ``tile_col`` + - Specifies the column of the current tile. Equivalent to "TileColumn" in + :ref:`av1`. + +.. c:type:: v4l2_av1_warp_model + + AV1 Warp Model as described in section 3 "Symbols and abbreviated terms" of + :ref:`av1`. + +.. raw:: latex + + \scriptsize + +.. tabularcolumns:: |p{7.4cm}|p{0.3cm}|p{9.6cm}| + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_AV1_WARP_MODEL_IDENTITY`` + - 0 + - Warp model is just an identity transform. + * - ``V4L2_AV1_WARP_MODEL_TRANSLATION`` + - 1 + - Warp model is a pure translation. + * - ``V4L2_AV1_WARP_MODEL_ROTZOOM`` + - 2 + - Warp model is a rotation + symmetric zoom + translation. + * - ``V4L2_AV1_WARP_MODEL_AFFINE`` + - 3 + - Warp model is a general affine transform. + +.. c:type:: v4l2_av1_reference_frame + +AV1 Reference Frames as described in section 6.10.24 "Ref frames semantics" +of :ref:`av1`. + +.. raw:: latex + + \scriptsize + +.. tabularcolumns:: |p{7.4cm}|p{0.3cm}|p{9.6cm}| + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_AV1_REF_INTRA_FRAME`` + - 0 + - Intra Frame Reference. + * - ``V4L2_AV1_REF_LAST_FRAME`` + - 1 + - Last Frame Reference. + * - ``V4L2_AV1_REF_LAST2_FRAME`` + - 2 + - Last2 Frame Reference. + * - ``V4L2_AV1_REF_LAST3_FRAME`` + - 3 + - Last3 Frame Reference. + * - ``V4L2_AV1_REF_GOLDEN_FRAME`` + - 4 + - Golden Frame Reference. + * - ``V4L2_AV1_REF_BWDREF_FRAME`` + - 5 + - BWD Frame Reference. + * - ``V4L2_AV1_REF_ALTREF2_FRAME`` + - 6 + - ALTREF2 Frame Reference. + * - ``V4L2_AV1_REF_ALTREF_FRAME`` + - 7 + - ALTREF Frame Reference. + +.. c:type:: v4l2_av1_global_motion + +AV1 Global Motion parameters as described in section 6.8.17 +"Global motion params semantics" of :ref:`av1`. + +.. cssclass:: longtable + +.. tabularcolumns:: |p{1.5cm}|p{5.8cm}|p{10.0cm}| + +.. flat-table:: struct v4l2_av1_global_motion + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u8 + - ``flags[V4L2_AV1_TOTAL_REFS_PER_FRAME]`` + - A bitfield containing the flags per reference frame. See + :ref:`AV1 Global Motion Flags ` for more + details. + * - enum :c:type:`v4l2_av1_warp_model` + - ``type[V4L2_AV1_TOTAL_REFS_PER_FRAME]`` + - The type of global motion transform used. + * - __s32 + - ``params[V4L2_AV1_TOTAL_REFS_PER_FRAME][6]`` + - This field has the same meaning as "gm_params" in :ref:`av1`. + * - __u8 + - ``invalid`` + - Bitfield indicating whether the global motion params are invalid for a + given reference frame. See section 7.11.3.6 Setup shear process and the + variable "warpValid". Use V4L2_AV1_GLOBAL_MOTION_IS_INVALID(ref) to + create a suitable mask. + * - __u8 + - ``reserved[3]`` + - Applications and drivers must set this to zero. + +.. _av1_global_motion_flags: + +``AV1 Global Motion Flags`` + +.. cssclass:: longtable + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_AV1_GLOBAL_MOTION_FLAG_IS_GLOBAL`` + - 0x00000001 + - Specifies whether global motion parameters are present for a particular + reference frame. + * - ``V4L2_AV1_GLOBAL_MOTION_FLAG_IS_ROT_ZOOM`` + - 0x00000002 + - Specifies whether a particular reference frame uses rotation and zoom + global motion. + * - ``V4L2_AV1_GLOBAL_MOTION_FLAG_IS_TRANSLATION`` + - 0x00000004 + - Specifies whether a particular reference frame uses translation global + motion + +.. c:type:: v4l2_av1_frame_restoration_type + +AV1 Frame Restoration Type. + +.. raw:: latex + + \scriptsize + +.. tabularcolumns:: |p{7.4cm}|p{0.3cm}|p{9.6cm}| + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_AV1_FRAME_RESTORE_NONE`` + - 0 + - No filtering is applied. + * - ``V4L2_AV1_FRAME_RESTORE_WIENER`` + - 1 + - Wiener filter process is invoked. + * - ``V4L2_AV1_FRAME_RESTORE_SGRPROJ`` + - 2 + - Self guided filter process is invoked. + * - ``V4L2_AV1_FRAME_RESTORE_SWITCHABLE`` + - 3 + - Restoration filter is swichtable. + +.. c:type:: v4l2_av1_loop_restoration + +AV1 Loop Restauration as described in section 6.10.15 "Loop restoration params +semantics" of :ref:`av1`. + +.. cssclass:: longtable + +.. tabularcolumns:: |p{1.5cm}|p{5.8cm}|p{10.0cm}| + +.. flat-table:: struct v4l2_av1_loop_restoration + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u8 + - ``flags`` + - See :ref:`AV1 Loop Restoration Flags `. + * - __u8 + - ``lr_unit_shift`` + - Specifies if the luma restoration size should be halved. + * - __u8 + - ``lr_uv_shift`` + - Specifies if the chroma size should be half the luma size. + * - __u8 + - ``reserved`` + - Applications and drivers must set this to zero. + * - :c:type:`v4l2_av1_frame_restoration_type` + - ``frame_restoration_type[V4L2_AV1_NUM_PLANES_MAX]`` + - Specifies the type of restoration used for each plane. + * - __u8 + - ``loop_restoration_size[V4L2_AV1_MAX_NUM_PLANES]`` + - Specifies the size of loop restoration units in units of samples in the + current plane. + +.. _av1_loop_restoration_flags: + +``AV1 Loop Restoration Flags`` + +.. cssclass:: longtable + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_AV1_LOOP_RESTORATION_FLAG_USES_LR`` + - 0x00000001 + - Retains the same meaning as UsesLr in :ref:`av1`. + * - ``V4L2_AV1_LOOP_RESTORATION_FLAG_USES_CHROMA_LR`` + - 0x00000002 + - Retains the same meaning as UsesChromaLr in :ref:`av1`. + +.. c:type:: v4l2_av1_cdef + +AV1 CDEF params semantics as described in section 6.10.14 "CDEF params +semantics" of :ref:`av1`. + +.. cssclass:: longtable + +.. tabularcolumns:: |p{1.5cm}|p{5.8cm}|p{10.0cm}| + +.. flat-table:: struct v4l2_av1_cdef + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u8 + - ``damping_minus_3`` + - Controls the amount of damping in the deringing filter. + * - __u8 + - ``bits`` + - Specifies the number of bits needed to specify which CDEF filter to + apply. + * - __u8 + - ``y_pri_strength[V4L2_AV1_CDEF_MAX]`` + - Specifies the strength of the primary filter. + * - __u8 + - ``y_sec_strength[V4L2_AV1_CDEF_MAX]`` + - Specifies the strength of the secondary filter. + * - __u8 + - ``uv_pri_strength[V4L2_AV1_CDEF_MAX]`` + - Specifies the strength of the primary filter. + * - __u8 + - ``uv_secondary_strength[V4L2_AV1_CDEF_MAX]`` + - Specifies the strength of the secondary filter. + +.. c:type:: v4l2_av1_segment_feature + +AV1 segment features as described in section 3 "Symbols and abbreviated terms" +of :ref:`av1`. + +.. raw:: latex + + \scriptsize + +.. tabularcolumns:: |p{7.4cm}|p{0.3cm}|p{9.6cm}| + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_AV1_SEG_LVL_ALT_Q`` + - 0 + - Index for quantizer segment feature. + * - ``V4L2_AV1_SEG_LVL_ALT_LF_Y_V`` + - 1 + - Index for vertical luma loop filter segment feature. + * - ``V4L2_AV1_SEG_LVL_REF_FRAME`` + - 5 + - Index for reference frame segment feature. + * - ``V4L2_AV1_SEG_LVL_REF_SKIP`` + - 6 + - Index for skip segment feature. + * - ``V4L2_AV1_SEG_LVL_REF_GLOBALMV`` + - 7 + - Index for global mv feature. + * - ``V4L2_AV1_SEG_LVL_MAX`` + - 8 + - Number of segment features. + +.. c:type:: v4l2_av1_segmentation + +AV1 Segmentation params as defined in section 6.8.13 "Segmentation params +semantics" of :ref:`av1`. + +.. cssclass:: longtable + +.. tabularcolumns:: |p{1.5cm}|p{5.8cm}|p{10.0cm}| + +.. flat-table:: struct v4l2_av1_segmentation + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u8 + - ``flags`` + - See :ref:`AV1 Segmentation Flags ` + * - __u8 + - ``last_active_seg_id`` + - Indicates the highest numbered segment id that has some + enabled feature. This is used when decoding the segment id to only decode + choices corresponding to used segments. + * - __u8 + - ``feature_enabled[V4L2_AV1_MAX_SEGMENTS]`` + - Bitmask defining which features are enabled in each segment. Use + V4L2_AV1_SEGMENT_FEATURE_ENABLED to build a suitable mask. + * - __u16 + - `feature_data[V4L2_AV1_MAX_SEGMENTS][V4L2_AV1_SEG_LVL_MAX]`` + - Data attached to each feature. Data entry is only valid if the feature + is enabled. + +.. _av1_segmentation_flags: + +``AV1 Segmentation Flags`` + +.. cssclass:: longtable + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_AV1_SEGMENTATION_FLAG_ENABLED`` + - 0x00000001 + - If set, indicates that this frame makes use of the segmentation tool. If + not set, indicates that the frame does not use segmentation. + * - ``V4L2_AV1_SEGMENTATION_FLAG_UPDATE_MAP`` + - 0x00000002 + - If set, indicates that the segmentation map are updated during the + decoding of this frame. If not set, indicates that the segmentation map + from the previous frame is used. + * - ``V4L2_AV1_SEGMENTATION_FLAG_TEMPORAL_UPDATE`` + - 0x00000004 + - If set, indicates that the updates to the segmentation map are coded + relative to the existing segmentation map. If not set, indicates that + the new segmentation map is coded without reference to the existing + segmentation map. + * - ``V4L2_AV1_SEGMENTATION_FLAG_UPDATE_DATA`` + - 0x00000008 + - If set, indicates that the updates to the segmentation map are coded + relative to the existing segmentation map. If not set, indicates that + the new segmentation map is coded without reference to the existing + segmentation map. + * - ``V4L2_AV1_SEGMENTATION_FLAG_SEG_ID_PRE_SKIP`` + - 0x00000010 + - If set, indicates that the segment id will be read before the skip + syntax element. If not set, indicates that the skip syntax element will + be read first. + +.. c:type:: v4l2_av1_loop_filter + +AV1 Loop filter params as defined in section 6.8.10 "Loop filter semantics" of +:ref:`av1`. + +.. cssclass:: longtable + +.. tabularcolumns:: |p{1.5cm}|p{5.8cm}|p{10.0cm}| + +.. flat-table:: struct v4l2_av1_global_motion + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u8 + - ``flags`` + - See + :ref:`AV1 Loop Filter flags ` for more details. + * - __u8 + - ``level[4]`` + - An array containing loop filter strength values. Different loop + filter strength values from the array are used depending on the image + plane being filtered, and the edge direction (vertical or horizontal) + being filtered. + * - __u8 + - ``sharpness`` + - indicates the sharpness level. The loop_filter_level and + loop_filter_sharpness together determine when a block edge is filtered, + and by how much the filtering can change the sample values. The loop + filter process is described in section 7.14 of :ref:`av1`. + * - __u8 + - ``ref_deltas[V4L2_AV1_TOTAL_REFS_PER_FRAME]`` + - contains the adjustment needed for the filter level based on the + chosen reference frame. If this syntax element is not present, it + maintains its previous value. + * - __u8 + - ``mode_deltas[2]`` + - contains the adjustment needed for the filter level based on + the chosen mode. If this syntax element is not present, it maintains its + previous value. + * - __u8 + - ``delta_lf_res`` + - specifies the left shift which should be applied to decoded loop filter + delta values. + +.. _av1_loop_filter_flags: + +``AV1 Loop Filter Flags`` + +.. cssclass:: longtable + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_AV1_LOOP_FILTER_FLAG_DELTA_ENABLED`` + - 0x00000001 + - If set, means that the filter level depends on the mode and reference + frame used to predict a block. If not set, means that the filter level + does not depend on the mode and reference frame. + * - ``V4L2_AV1_LOOP_FILTER_FLAG_DELTA_UPDATE`` + - 0x00000002 + - If set, means that additional syntax elements are present that specify + which mode and reference frame deltas are to be updated. If not set, + means that these syntax elements are not present. + * - ``V4L2_AV1_LOOP_FILTER_FLAG_DELTA_LF_PRESENT`` + - 0x00000004 + - Specifies whether loop filter delta values are present + * - ``V4L2_AV1_LOOP_FILTER_FLAG_DELTA_LF_MULTI`` + - 0x00000008 + - A value equal to 1 specifies that separate loop filter + deltas are sent for horizontal luma edges, vertical luma edges, + the U edges, and the V edges. A value of delta_lf_multi equal to 0 + specifies that the same loop filter delta is used for all edges. + +.. c:type:: v4l2_av1_quantization + +AV1 Quantization params as defined in section 6.8.11 "Quantization params +semantics" of :ref:`av1`. + +.. cssclass:: longtable + +.. tabularcolumns:: |p{1.5cm}|p{5.8cm}|p{10.0cm}| + +.. flat-table:: struct v4l2_av1_quantization + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u8 + - ``flags`` + - See + :ref:`AV1 Loop Filter flags ` for more details. + * - __u8 + - ``base_q_idx`` + - Indicates the base frame qindex. This is used for Y AC coefficients and + as the base value for the other quantizers. + * - __u8 + - ``delta_q_y_dc`` + - Indicates the Y DC quantizer relative to base_q_idx. + * - __u8 + - ``delta_q_u_dc`` + - Indicates the U DC quantizer relative to base_q_idx. + * - __u8 + - ``delta_q_u_ac`` + - Indicates the U AC quantizer relative to base_q_idx. + * - __u8 + - ``delta_q_v_dc`` + - Indicates the V DC quantizer relative to base_q_idx. + * - __u8 + - ``delta_q_v_ac`` + - Indicates the V AC quantizer relative to base_q_idx. + * - __u8 + - ``qm_y`` + - Specifies the level in the quantizer matrix that should be used for + luma plane decoding. + * - __u8 + - ``qm_u`` + - Specifies the level in the quantizer matrix that should be used for + chroma U plane decoding. + * - __u8 + - ``qm_v`` + - Specifies the level in the quantizer matrix that should be used for + chroma V plane decoding. + * - __u8 + - ``delta_q_res`` + - Specifies the left shift which should be applied to decoded quantizer + index delta values. + +.. _av1_quantization_flags: + +``AV1 Quantization Flags`` + +.. cssclass:: longtable + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_AV1_QUANTIZATION_FLAG_DIFF_UV_DELTA`` + - 0x00000001 + - If set, indicates that the U and V delta quantizer values are coded + separately. If not set, indicates that the U and V delta quantizer + values share a common value. + * - ``V4L2_AV1_QUANTIZATION_FLAG_USING_QMATRIX`` + - 0x00000002 + - If set, specifies that the quantizer matrix will be used to compute + quantizers. + * - ``V4L2_AV1_QUANTIZATION_FLAG_DELTA_Q_PRESENT`` + - 0x00000004 + - Specifies whether quantizer index delta values are present. + +.. c:type:: v4l2_av1_tile_info + +AV1 Tile info as defined in section 6.8.14 "Tile info semantics" of ref:`av1`. + +.. cssclass:: longtable + +.. tabularcolumns:: |p{1.5cm}|p{5.8cm}|p{10.0cm}| + +.. flat-table:: struct v4l2_av1_tile_info + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u8 + - ``flags`` + - See + :ref:`AV1 Tile Info flags ` for more details. + * - __u8 + - ``context_update_tile_id`` + - Specifies which tile to use for the CDF update. + * - __u8 + - ``tile_cols`` + - Specifies the number of tiles across the frame. + * - __u8 + - ``tile_rows`` + - Specifies the number of tiles down the frame. + * - __u32 + - ``mi_col_starts[V4L2_AV1_MAX_TILE_COLS + 1]`` + - An array specifying the start column (in units of 4x4 luma + samples) for each tile across the image. + * - __u32 + - ``mi_row_starts[V4L2_AV1_MAX_TILE_ROWS + 1]`` + - An array specifying the start row (in units of 4x4 luma + samples) for each tile across the image. + * - __u32 + - ``width_in_sbs_minus_1[V4L2_AV1_MAX_TILE_COLS]`` + - Specifies the width of a tile minus 1 in units of superblocks. + * - __u32 + - ``height_in_sbs_minus_1[V4L2_AV1_MAX_TILE_ROWS]`` + - Specifies the height of a tile minus 1 in units of superblocks. + * - __u8 + - ``tile_size_bytes`` + - Specifies the number of bytes needed to code each tile size. + * - __u8 + - ``reserved[3]`` + - Applications and drivers must set this to zero. + +.. _av1_tile_info_flags: + +``AV1 Tile Info Flags`` + +.. cssclass:: longtable + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_AV1_TILE_INFO_FLAG_UNIFORM_TILE_SPACING`` + - 0x00000001 + - If set, means that the tiles are uniformly spaced across the frame. (In + other words, all tiles are the same size except for the ones at the + right and bottom edge which can be smaller). If not set means that the + tile sizes are coded. + +.. c:type:: v4l2_av1_frame_type + +AV1 Frame Type + +.. raw:: latex + + \scriptsize + +.. tabularcolumns:: |p{7.4cm}|p{0.3cm}|p{9.6cm}| + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_AV1_KEY_FRAME`` + - 0 + - Key frame. + * - ``V4L2_AV1_INTER_FRAME`` + - 1 + - Inter frame. + * - ``V4L2_AV1_INTRA_ONLY_FRAME`` + - 2 + - Intra-only frame. + * - ``V4L2_AV1_SWITCH_FRAME`` + - 3 + - Switch frame. + +.. c:type:: v4l2_av1_interpolation_filter + +AV1 Interpolation Filter + +.. raw:: latex + + \scriptsize + +.. tabularcolumns:: |p{7.4cm}|p{0.3cm}|p{9.6cm}| + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_AV1_INTERPOLATION_FILTER_EIGHTTAP`` + - 0 + - Eight tap filter. + * - ``V4L2_AV1_INTERPOLATION_FILTER_EIGHTTAP_SMOOTH`` + - 1 + - Eight tap smooth filter. + * - ``V4L2_AV1_INTERPOLATION_FILTER_EIGHTTAP_SHARP`` + - 2 + - Eight tap sharp filter. + * - ``V4L2_AV1_INTERPOLATION_FILTER_BILINEAR`` + - 3 + - Bilinear filter. + * - ``V4L2_AV1_INTERPOLATION_FILTER_SWITCHABLE`` + - 4 + - Filter selection is signaled at the block level. + +.. c:type:: v4l2_av1_tx_mode + +AV1 Tx mode as described in section 6.8.21 "TX mode semantics" of :ref:`av1`. + +.. raw:: latex + + \scriptsize + +.. tabularcolumns:: |p{7.4cm}|p{0.3cm}|p{9.6cm}| + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_AV1_TX_MODE_ONLY_4X4`` + - 0 + - The inverse transform will use only 4x4 transforms. + * - ``V4L2_AV1_TX_MODE_LARGEST`` + - 1 + - The inverse transform will use the largest transform size that fits + inside the block. + * - ``V4L2_AV1_TX_MODE_SELECT`` + - 2 + - The choice of transform size is specified explicitly for each block. + +``V4L2_CID_STATELESS_AV1_FRAME (struct)`` + Represents a Frame Header OBU. See 6.8 "Frame Header OBU semantics" of + :ref:`av1` for more details. + +.. c:type:: v4l2_ctrl_av1_frame + +.. cssclass:: longtable + +.. tabularcolumns:: |p{5.8cm}|p{4.8cm}|p{6.6cm}| + +.. flat-table:: struct v4l2_ctrl_av1_frame + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - struct :c:type:`v4l2_av1_tile_info` + - ``tile_info`` + - Tile info + * - struct :c:type:`v4l2_av1_quantization` + - ``quantization`` + - Quantization parameters. + * - struct :c:type:`v4l2_av1_segmentation` + - ``segmentation`` + - Segmentation parameters. + * - __u8 + - ``superres_denom`` + - The denominator for the upscaling ratio. + * - struct :c:type:`v4l2_av1_loop_filter` + - ``loop_filter`` + - Loop filter params + * - struct :c:type:`v4l2_av1_cdef` + - ``cdef`` + - CDEF params + * - __u8 + - ``skip_mode_frame[2]`` + - Specifies the frames to use for compound prediction when skip_mode is + equal to 1. + * - __u8 + - ``primary_ref_frame`` + - Specifies which reference frame contains the CDF values and other state + that should be loaded at the start of the frame. + * - struct :c:type:`v4l2_av1_loop_restoration` + - ``loop_restoration`` + - Loop restoration parameters. + * - struct :c:type:`v4l2_av1_loop_global_motion` + - ``global_motion`` + - Global motion parameters. + * - __u32 + - ``flags`` + - See + :ref:`AV1 Frame flags ` for more details. + * - enum :c:type:`v4l2_av1_frame_type` + - ``frame_type`` + - Specifies the AV1 frame type + * - __u32 + - ``order_hint`` + - Specifies OrderHintBits least significant bits of the expected output + order for this frame. + * - __u32 + - ``upscaled_width`` + - The upscaled width. + * - enum :c:type:`v4l2_av1_interpolation_filter` + - ``interpolation_filter`` + - Specifies the filter selection used for performing inter prediction. + * - enum :c:type:`v4l2_av1_tx_mode` + - ``tx_mode`` + - Specifies how the transform size is determined. + * - __u32 + - ``frame_width_minus_1`` + - Add 1 to get the frame's width. + * - __u32 + - ``frame_height_minus_1`` + - Add 1 to get the frame's height. + * - __u16 + - ``render_width_minus_1`` + - Add 1 to get the render width of the frame in luma samples. + * - __u16 + - ``render_height_minus_1`` + - Add 1 to get the render height of the frame in luma samples. + * - __u32 + - ``current_frame_id`` + - Specifies the frame id number for the current frame. Frame + id numbers are additional information that do not affect the decoding + process, but provide decoders with a way of detecting missing reference + frames so that appropriate action can be taken. + * - __u8 + - ``buffer_removal_time[V4L2_AV1_MAX_OPERATING_POINTS]`` + - Specifies the frame removal time in units of DecCT clock ticks counted + from the removal time of the last random access point for operating point + opNum. + * - __u8 + - ``reserved[4]`` + - Applications and drivers must set this to zero. + * - __u32 + - ``order_hints[V4L2_AV1_TOTAL_REFS_PER_FRAME]`` + - Specifies the expected output order hint for each reference frame. + This field corresponds to the OrderHints variable from the specification + (section 5.9.2 "Uncompressed header syntax"). As such, this is only + used for non-intra frames and ignored otherwise. order_hints[0] is + always ignored. + * - __u64 + - ``reference_frame_ts[V4L2_AV1_TOTAL_REFS_PER_FRAME]`` + - The V4L2 timestamp for each of the reference frames enumerated in + enum :c:type:`v4l2_av1_reference_frame` starting at + ``V4L2_AV1_REF_LAST_FRAME``. This represents the state of reference + slot as described in the spec and updated by userland through the + "Reference frame update process" in section 7.20 The timestamp refers + to the ``timestamp`` field in struct :c:type:`v4l2_buffer`. Use the + :c:func:`v4l2_timeval_to_ns()` function to convert the struct + :c:type:`timeval` in struct :c:type:`v4l2_buffer` to a __u64. + * - __s8 + - ``ref_frame_idx[V4L2_AV1_REFS_PER_FRAME]`` + - An index into ``reference_frame_ts`` representing the ordered list of + references used by inter-frame. Matches the bitstream syntax + element of the same name. + * - __u8 + - ``refresh_frame_flags`` + - Contains a bitmask that specifies which reference frame slots will be + updated with the current frame after it is decoded. + +.. _av1_frame_flags: + +``AV1 Frame Flags`` + +.. cssclass:: longtable + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_AV1_FRAME_FLAG_SHOW_FRAME`` + - 0x00000001 + - If set, specifies that this frame should be immediately output once + decoded. If not set, specifies that this frame should not be immediately + output; it may be output later if a later uncompressed header uses + show_existing_frame equal to 1. + * - ``V4L2_AV1_FRAME_FLAG_SHOWABLE_FRAME`` + - 0x00000002 + - If set, specifies that the frame may be output using the + show_existing_frame mechanism. If not set, specifies that this frame + will not be output using the show_existing_frame mechanism. + * - ``V4L2_AV1_FRAME_FLAG_ERROR_RESILIENT_MODE`` + - 0x00000004 + - Specifies whether error resilient mode is enabled. + * - ``V4L2_AV1_FRAME_FLAG_DISABLE_CDF_UPDATE`` + - 0x00000008 + - Specifies whether the CDF update in the symbol decoding process should + be disabled. + * - ``V4L2_AV1_FRAME_FLAG_ALLOW_SCREEN_CONTENT_TOOLS`` + - 0x00000010 + - If set, indicates that intra blocks may use palette encoding. If not + set, indicates that palette encoding is never used. + * - ``V4L2_AV1_FRAME_FLAG_FORCE_INTEGER_MV`` + - 0x00000020 + - If set, specifies that motion vectors will always be integers. If not + set, specifies that motion vectors can contain fractional bits. + * - ``V4L2_AV1_FRAME_FLAG_ALLOW_INTRABC`` + - 0x00000040 + - If set, indicates that intra block copy may be used in this frame. If + not set, indicates that intra block copy is not allowed in this frame. + * - ``V4L2_AV1_FRAME_FLAG_USE_SUPERRES`` + - 0x00000080 + - If set, indicates that upscaling is needed. + * - ``V4L2_AV1_FRAME_FLAG_ALLOW_HIGH_PRECISION_MV`` + - 0x00000100 + - If set, specifies that motion vectors are specified to eighth pel + precision. If not set, specifies that motion vectors are specified to + quarter pel precision; + * - ``V4L2_AV1_FRAME_FLAG_IS_MOTION_MODE_SWITCHABLE`` + - 0x00000200 + - If not set, specifies that only the SIMPLE motion mode will be used. + * - ``V4L2_AV1_FRAME_FLAG_USE_REF_FRAME_MVS`` + - 0x00000400 + - If set specifies that motion vector information from a previous frame + can be used when decoding the current frame. If not set, specifies that + this information will not be used. + * - ``V4L2_AV1_FRAME_FLAG_DISABLE_FRAME_END_UPDATE_CDF`` + - 0x00000800 + - If set indicates that the end of frame CDF update is disabled. If not + set, indicates that the end of frame CDF update is enabled + * - ``V4L2_AV1_FRAME_FLAG_ALLOW_WARPED_MOTION`` + - 0x00001000 + - If set, indicates that the syntax element motion_mode may be present, if + not set, indicates that the syntax element motion_mode will not be + present. + * - ``V4L2_AV1_FRAME_FLAG_REFERENCE_SELECT`` + - 0x00002000 + - If set, specifies that the mode info for inter blocks contains the + syntax element comp_mode that indicates whether to use single or + compound reference prediction. If not set, specifies that all inter + blocks will use single prediction. + * - ``V4L2_AV1_FRAME_FLAG_REDUCED_TX_SET`` + - 0x00004000 + - If set, specifies that the frame is restricted to a reduced subset of + the full set of transform types. + * - ``V4L2_AV1_FRAME_FLAG_SKIP_MODE_ALLOWED`` + - 0x00008000 + - This flag retains the same meaning as SkipModeAllowed in :ref:`av1`. + * - ``V4L2_AV1_FRAME_FLAG_SKIP_MODE_PRESENT`` + - 0x00010000 + - If set, specifies that the syntax element skip_mode will be present, if + not set, specifies that skip_mode will not be used for this frame. + * - ``V4L2_AV1_FRAME_FLAG_FRAME_SIZE_OVERRIDE`` + - 0x00020000 + - If set, specifies that the frame size will either be specified as the + size of one of the reference frames, or computed from the + frame_width_minus_1 and frame_height_minus_1 syntax elements. If not + set, specifies that the frame size is equal to the size in the sequence + header. + * - ``V4L2_AV1_FRAME_FLAG_BUFFER_REMOVAL_TIME_PRESENT`` + - 0x00040000 + - If set, specifies that buffer_removal_time is present. If not set, + specifies that buffer_removal_time is not present. + * - ``V4L2_AV1_FRAME_FLAG_FRAME_REFS_SHORT_SIGNALING`` + - 0x00080000 + - If set, indicates that only two reference frames are explicitly + signaled. If not set, indicates that all reference frames are explicitly + signaled. + +``V4L2_CID_STATELESS_AV1_FILM_GRAIN (struct)`` + Represents the optional film grain parameters. See section + 6.8.20 "Film grain params semantics" of :ref:`av1` for more details. + +.. c:type:: v4l2_ctrl_av1_film_grain + +.. cssclass:: longtable + +.. tabularcolumns:: |p{1.5cm}|p{5.8cm}|p{10.0cm}| + +.. flat-table:: struct v4l2_ctrl_av1_film_grain + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - __u8 + - ``flags`` + - See :ref:`AV1 Film Grain Flags `. + * - __u8 + - ``cr_mult`` + - Represents a multiplier for the cr component used in derivation of the + input index to the cr component scaling function. + * - __u16 + - ``grain_seed`` + - Specifies the starting value for the pseudo-random numbers used during + film grain synthesis. + * - __u8 + - ``film_grain_params_ref_idx`` + - Indicates which reference frame contains the film grain parameters to be + used for this frame. + * - __u8 + - ``num_y_points`` + - Specifies the number of points for the piece-wise linear scaling + function of the luma component. + * - __u8 + - ``point_y_value[V4L2_AV1_MAX_NUM_Y_POINTS]`` + - Represents the x (luma value) coordinate for the i-th point + of the piecewise linear scaling function for luma component. The values + are signaled on the scale of 0..255. In case of 10 bit video, these + values correspond to luma values divided by 4. In case of 12 bit video, + these values correspond to luma values divided by 16. + * - __u8 + - ``point_y_scaling[V4L2_AV1_MAX_NUM_Y_POINTS]`` + - Represents the scaling (output) value for the i-th point + of the piecewise linear scaling function for luma component. + * - __u8 + - ``num_cb_points`` + - Specifies the number of points for the piece-wise linear scaling + function of the cb component. + * - __u8 + - ``point_cb_value[V4L2_AV1_MAX_NUM_CB_POINTS]`` + - Represents the x coordinate for the i-th point of the + piece-wise linear scaling function for cb component. The values are + signaled on the scale of 0..255. + * - __u8 + - ``point_cb_scaling[V4L2_AV1_MAX_NUM_CB_POINTS]`` + - Represents the scaling (output) value for the i-th point of the + piecewise linear scaling function for cb component. + * - __u8 + - ``num_cr_points`` + - Represents the number of points for the piece-wise + linear scaling function of the cr component. + * - __u8 + - ``point_cr_value[V4L2_AV1_MAX_NUM_CR_POINTS]`` + - Represents the x coordinate for the i-th point of the + piece-wise linear scaling function for cr component. The values are + signaled on the scale of 0..255. + * - __u8 + - ``point_cr_scaling[V4L2_AV1_MAX_NUM_CR_POINTS]`` + - Represents the scaling (output) value for the i-th point of the + piecewise linear scaling function for cr component. + * - __u8 + - ``grain_scaling_minus_8`` + - Represents the shift - 8 applied to the values of the chroma component. + The grain_scaling_minus_8 can take values of 0..3 and determines the + range and quantization step of the standard deviation of film grain. + * - __u8 + - ``ar_coeff_lag`` + - Specifies the number of auto-regressive coefficients for luma and + chroma. + * - __u8 + - ``ar_coeffs_y_plus_128[V4L2_AV1_AR_COEFFS_SIZE]`` + - Specifies auto-regressive coefficients used for the Y plane. + * - __u8 + - ``ar_coeffs_cb_plus_128[V4L2_AV1_AR_COEFFS_SIZE]`` + - Specifies auto-regressive coefficients used for the U plane. + * - __u8 + - ``ar_coeffs_cr_plus_128[V4L2_AV1_AR_COEFFS_SIZE]`` + - Specifies auto-regressive coefficients used for the V plane. + * - __u8 + - ``ar_coeff_shift_minus_6`` + - Specifies the range of the auto-regressive coefficients. Values of 0, + 1, 2, and 3 correspond to the ranges for auto-regressive coefficients of + [-2, 2), [-1, 1), [-0.5, 0.5) and [-0.25, 0.25) respectively. + * - __u8 + - ``grain_scale_shift`` + - Specifies how much the Gaussian random numbers should be scaled down + during the grain synthesis process. + * - __u8 + - ``cb_mult`` + - Represents a multiplier for the cb component used in derivation of the + input index to the cb component scaling function. + * - __u8 + - ``cb_luma_mult`` + - Represents a multiplier for the average luma component used in + derivation of the input index to the cb component scaling function.. + * - __u8 + - ``cr_luma_mult`` + - Represents a multiplier for the average luma component used in + derivation of the input index to the cr component scaling function. + * - __u16 + - ``cb_offset`` + - Represents an offset used in derivation of the input index to the + cb component scaling function. + * - __u16 + - ``cr_offset`` + - Represents an offset used in derivation of the input index to the + cr component scaling function. + * - __u8 + - ``reserved[4]`` + - Applications and drivers must set this to zero. + +.. _av1_film_grain_flags: + +``AV1 Film Grain Flags`` + +.. cssclass:: longtable + +.. flat-table:: + :header-rows: 0 + :stub-columns: 0 + :widths: 1 1 2 + + * - ``V4L2_AV1_FILM_GRAIN_FLAG_APPLY_GRAIN`` + - 0x00000001 + - If set, specifies that film grain should be added to this frame. If not + set, specifies that film grain should not be added. + * - ``V4L2_AV1_FILM_GRAIN_FLAG_UPDATE_GRAIN`` + - 0x00000002 + - If set, means that a new set of parameters should be sent. If not set, + specifies that the previous set of parameters should be used. + * - ``V4L2_AV1_FILM_GRAIN_FLAG_CHROMA_SCALING_FROM_LUMA`` + - 0x00000004 + - If set, specifies that the chroma scaling is inferred from the luma + scaling. + * - ``V4L2_AV1_FILM_GRAIN_FLAG_OVERLAP`` + - 0x00000008 + - If set, indicates that the overlap between film grain blocks shall be + applied. If not set, indicates that the overlap between film grain blocks + shall not be applied. + * - ``V4L2_AV1_FILM_GRAIN_FLAG_CLIP_TO_RESTRICTED_RANGE`` + - 0x00000010 + - If set, indicates that clipping to the restricted (studio, i.e. limited) + range shall be applied to the sample values after adding the film grain + (see the semantics for color_range for an explanation of studio swing). + If not set, indicates that clipping to the full range shall be applied + to the sample values after adding the film grain. diff --git a/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst b/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst index 06b78e5589d2..806ed73ac474 100644 --- a/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst +++ b/Documentation/userspace-api/media/v4l/pixfmt-compressed.rst @@ -258,6 +258,22 @@ Compressed Formats RV9 players - the format and decoder did not change, only the encoder did. As a result, it uses the same FourCC. + * .. _V4L2-PIX-FMT-AV1-FRAME: + + - ``V4L2_PIX_FMT_AV1_FRAME`` + - 'AV1F' + - AV1 parsed frame, including the frame header, as extracted from the container. + This format is adapted for stateless video decoders that implement a AV1 + pipeline with the :ref:`stateless_decoder`. Metadata associated with the + frame to decode is required to be passed through the + ``V4L2_CID_STATELESS_AV1_SEQUENCE``, ``V4L2_CID_STATELESS_AV1_FRAME``, + and ``V4L2_CID_STATELESS_AV1_TILE_GROUP_ENTRY`` controls. + See the :ref:`associated Codec Control IDs `. + Exactly one output and one capture buffer must be provided for use with + this pixel format. The output buffer must contain the appropriate number + of macroblocks to decode a full corresponding frame to the matching + capture buffer. + .. raw:: latex \normalsize diff --git a/Documentation/userspace-api/media/v4l/vidioc-g-ext-ctrls.rst b/Documentation/userspace-api/media/v4l/vidioc-g-ext-ctrls.rst index 6d85ec6a19b4..f9f73530a6be 100644 --- a/Documentation/userspace-api/media/v4l/vidioc-g-ext-ctrls.rst +++ b/Documentation/userspace-api/media/v4l/vidioc-g-ext-ctrls.rst @@ -279,6 +279,22 @@ still cause this situation. - ``p_hevc_decode_params`` - A pointer to a struct :c:type:`v4l2_ctrl_hevc_decode_params`. Valid if this control is of type ``V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS``. + * - struct :c:type:`v4l2_ctrl_av1_sequence` * + - ``p_av1_sequence`` + - A pointer to a struct :c:type:`v4l2_ctrl_av1_sequence`. Valid if this control is + of type ``V4L2_CTRL_TYPE_AV1_SEQUENCE``. + * - struct :c:type:`v4l2_ctrl_av1_tile_group_entry` * + - ``p_av1_tile_group_entry`` + - A pointer to a struct :c:type:`v4l2_ctrl_av1_tile_group_entry`. Valid if this control is + of type ``V4L2_CTRL_TYPE_AV1_TILE_GROUP_ENTRY``. + * - struct :c:type:`v4l2_ctrl_av1_frame` * + - ``p_av1_frame`` + - A pointer to a struct :c:type:`v4l2_ctrl_av1_frame`. Valid if this control is + of type ``V4L2_CTRL_TYPE_AV1_FRAME``. + * - struct :c:type:`v4l2_ctrl_av1_film_grain` * + - ``p_av1_film_grain`` + - A pointer to a struct :c:type:`v4l2_ctrl_av1_film_grain`. Valid if this control is + of type ``V4L2_CTRL_TYPE_AV1_FILM_GRAIN``. * - void * - ``ptr`` - A pointer to a compound type which can be an N-dimensional array diff --git a/Documentation/userspace-api/media/v4l/vidioc-queryctrl.rst b/Documentation/userspace-api/media/v4l/vidioc-queryctrl.rst index a20dfa2a933b..4d38acafe8e1 100644 --- a/Documentation/userspace-api/media/v4l/vidioc-queryctrl.rst +++ b/Documentation/userspace-api/media/v4l/vidioc-queryctrl.rst @@ -525,6 +525,30 @@ See also the examples in :ref:`control`. - n/a - A struct :c:type:`v4l2_ctrl_vp9_frame`, containing VP9 frame decode parameters for stateless video decoders. + * - ``V4L2_CTRL_TYPE_AV1_SEQUENCE`` + - n/a + - n/a + - n/a + - A struct :c:type:`v4l2_ctrl_av1_sequence`, containing AV1 Sequence OBU + decoding parameters for stateless video decoders. + * - ``V4L2_CTRL_TYPE_AV1_TILE_GROUP_ENTRY`` + - n/a + - n/a + - n/a + - A struct :c:type:`v4l2_ctrl_av1_tile_group_entry`, containing AV1 Tile Group + OBU decoding parameters for stateless video decoders. + * - ``V4L2_CTRL_TYPE_AV1_FRAME`` + - n/a + - n/a + - n/a + - A struct :c:type:`v4l2_ctrl_av1_frame`, containing AV1 Frame/Frame + Header OBU decoding parameters for stateless video decoders. + * - ``V4L2_CTRL_TYPE_AV1_FILM_GRAIN`` + - n/a + - n/a + - n/a + - A struct :c:type:`v4l2_ctrl_av1_film_grain`, containing AV1 Film Grain + parameters for stateless video decoders. .. raw:: latex diff --git a/Documentation/userspace-api/media/videodev2.h.rst.exceptions b/Documentation/userspace-api/media/videodev2.h.rst.exceptions index 2a589d34b80e..3e58aac4ef0b 100644 --- a/Documentation/userspace-api/media/videodev2.h.rst.exceptions +++ b/Documentation/userspace-api/media/videodev2.h.rst.exceptions @@ -161,6 +161,10 @@ replace symbol V4L2_CTRL_TYPE_HEVC_PPS :c:type:`v4l2_ctrl_type` replace symbol V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS :c:type:`v4l2_ctrl_type` replace symbol V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX :c:type:`v4l2_ctrl_type` replace symbol V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS :c:type:`v4l2_ctrl_type` +replace symbol V4L2_CTRL_TYPE_AV1_SEQUENCE :c:type:`v4l2_ctrl_type` +replace symbol V4L2_CTRL_TYPE_AV1_TILE_GROUP_ENTRY :c:type:`v4l2_ctrl_type` +replace symbol V4L2_CTRL_TYPE_AV1_FRAME :c:type:`v4l2_ctrl_type` +replace symbol V4L2_CTRL_TYPE_AV1_FILM_GRAIN :c:type:`v4l2_ctrl_type` # V4L2 capability defines replace define V4L2_CAP_VIDEO_CAPTURE device-capabilities diff --git a/drivers/media/v4l2-core/v4l2-ctrls-core.c b/drivers/media/v4l2-core/v4l2-ctrls-core.c index 29169170880a..9fd37e94db17 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls-core.c +++ b/drivers/media/v4l2-core/v4l2-ctrls-core.c @@ -350,6 +350,19 @@ void v4l2_ctrl_type_op_log(const struct v4l2_ctrl *ctrl) case V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS: pr_cont("HEVC_DECODE_PARAMS"); break; + case V4L2_CTRL_TYPE_AV1_SEQUENCE: + pr_cont("AV1_SEQUENCE"); + break; + case V4L2_CTRL_TYPE_AV1_TILE_GROUP_ENTRY: + pr_cont("AV1_TILE_GROUP_ENTRY"); + break; + case V4L2_CTRL_TYPE_AV1_FRAME: + pr_cont("AV1_FRAME"); + break; + case V4L2_CTRL_TYPE_AV1_FILM_GRAIN: + pr_cont("AV1_FILM_GRAIN"); + break; + default: pr_cont("unknown type %d", ctrl->type); break; @@ -547,6 +560,231 @@ validate_vp9_frame(struct v4l2_ctrl_vp9_frame *frame) return 0; } +static int validate_av1_quantization(struct v4l2_av1_quantization *q) +{ + if (q->flags > GENMASK(2, 0)) + return -EINVAL; + + if (q->delta_q_y_dc < -64 || q->delta_q_y_dc > 63 || + q->delta_q_u_dc < -64 || q->delta_q_u_dc > 63 || + q->delta_q_v_dc < -64 || q->delta_q_v_dc > 63 || + q->delta_q_u_ac < -64 || q->delta_q_u_ac > 63 || + q->delta_q_v_ac < -64 || q->delta_q_v_ac > 63 || + q->delta_q_res > GENMASK(1, 0)) + return -EINVAL; + + if (q->qm_y > GENMASK(3, 0) || + q->qm_u > GENMASK(3, 0) || + q->qm_v > GENMASK(3, 0)) + return -EINVAL; + + return 0; +} + +static int validate_av1_segmentation(struct v4l2_av1_segmentation *s) +{ + u32 i; + u32 j; + + if (s->flags > GENMASK(4, 0)) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(s->feature_data); i++) { + static const int segmentation_feature_signed[] = { 1, 1, 1, 1, 1, 0, 0, 0 }; + static const int segmentation_feature_max[] = { 255, 63, 63, 63, 63, 7, 0, 0}; + + for (j = 0; j < ARRAY_SIZE(s->feature_data[j]); j++) { + s32 limit = segmentation_feature_max[j]; + + if (segmentation_feature_signed[j]) { + if (s->feature_data[i][j] < -limit || + s->feature_data[i][j] > limit) + return -EINVAL; + } else { + if (s->feature_data[i][j] < 0 || s->feature_data[i][j] > limit) + return -EINVAL; + } + } + } + + return 0; +} + +static int validate_av1_loop_filter(struct v4l2_av1_loop_filter *lf) +{ + u32 i; + + if (lf->flags > GENMASK(3, 0)) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(lf->level); i++) { + if (lf->level[i] > GENMASK(5, 0)) + return -EINVAL; + } + + if (lf->sharpness > GENMASK(2, 0)) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(lf->ref_deltas); i++) { + if (lf->ref_deltas[i] < -64 || lf->ref_deltas[i] > 63) + return -EINVAL; + } + + for (i = 0; i < ARRAY_SIZE(lf->mode_deltas); i++) { + if (lf->mode_deltas[i] < -64 || lf->mode_deltas[i] > 63) + return -EINVAL; + } + + return 0; +} + +static int validate_av1_cdef(struct v4l2_av1_cdef *cdef) +{ + u32 i; + + if (cdef->damping_minus_3 > GENMASK(1, 0) || + cdef->bits > GENMASK(1, 0)) + return -EINVAL; + + for (i = 0; i < 1 << cdef->bits; i++) { + if (cdef->y_pri_strength[i] > GENMASK(3, 0) || + cdef->y_sec_strength[i] > 4 || + cdef->uv_pri_strength[i] > GENMASK(3, 0) || + cdef->uv_sec_strength[i] > 4) + return -EINVAL; + } + + return 0; +} + +static int validate_av1_loop_restauration(struct v4l2_av1_loop_restoration *lr) +{ + if (lr->lr_unit_shift > 3 || lr->lr_uv_shift > 1) + return -EINVAL; + + return 0; +} + +static int validate_av1_film_grain(struct v4l2_ctrl_av1_film_grain *fg) +{ + u32 i; + + if (fg->flags > GENMASK(4, 0)) + return -EINVAL; + + if (fg->film_grain_params_ref_idx > GENMASK(2, 0) || + fg->num_y_points > 14 || + fg->num_cb_points > 10 || + fg->num_cr_points > GENMASK(3, 0) || + fg->grain_scaling_minus_8 > GENMASK(1, 0) || + fg->ar_coeff_lag > GENMASK(1, 0) || + fg->ar_coeff_shift_minus_6 > GENMASK(1, 0) || + fg->grain_scale_shift > GENMASK(1, 0)) + return -EINVAL; + + if (!(fg->flags & V4L2_AV1_FILM_GRAIN_FLAG_APPLY_GRAIN)) + return 0; + + for (i = 1; i < fg->num_y_points; i++) + if (fg->point_y_value[i] <= fg->point_y_value[i - 1]) + return -EINVAL; + + for (i = 1; i < fg->num_cb_points; i++) + if (fg->point_cb_value[i] <= fg->point_cb_value[i - 1]) + return -EINVAL; + + for (i = 1; i < fg->num_cr_points; i++) + if (fg->point_cr_value[i] <= fg->point_cr_value[i - 1]) + return -EINVAL; + + return 0; +} + +static int validate_av1_frame(struct v4l2_ctrl_av1_frame *f) +{ + int ret = 0; + + ret = validate_av1_quantization(&f->quantization); + if (ret) + return ret; + ret = validate_av1_segmentation(&f->segmentation); + if (ret) + return ret; + ret = validate_av1_loop_filter(&f->loop_filter); + if (ret) + return ret; + ret = validate_av1_cdef(&f->cdef); + if (ret) + return ret; + ret = validate_av1_loop_restauration(&f->loop_restoration); + if (ret) + return ret; + + if (f->flags & + ~(V4L2_AV1_FRAME_FLAG_SHOW_FRAME | + V4L2_AV1_FRAME_FLAG_SHOWABLE_FRAME | + V4L2_AV1_FRAME_FLAG_ERROR_RESILIENT_MODE | + V4L2_AV1_FRAME_FLAG_DISABLE_CDF_UPDATE | + V4L2_AV1_FRAME_FLAG_ALLOW_SCREEN_CONTENT_TOOLS | + V4L2_AV1_FRAME_FLAG_FORCE_INTEGER_MV | + V4L2_AV1_FRAME_FLAG_ALLOW_INTRABC | + V4L2_AV1_FRAME_FLAG_USE_SUPERRES | + V4L2_AV1_FRAME_FLAG_ALLOW_HIGH_PRECISION_MV | + V4L2_AV1_FRAME_FLAG_IS_MOTION_MODE_SWITCHABLE | + V4L2_AV1_FRAME_FLAG_USE_REF_FRAME_MVS | + V4L2_AV1_FRAME_FLAG_DISABLE_FRAME_END_UPDATE_CDF | + V4L2_AV1_FRAME_FLAG_ALLOW_WARPED_MOTION | + V4L2_AV1_FRAME_FLAG_REFERENCE_SELECT | + V4L2_AV1_FRAME_FLAG_REDUCED_TX_SET | + V4L2_AV1_FRAME_FLAG_SKIP_MODE_ALLOWED | + V4L2_AV1_FRAME_FLAG_SKIP_MODE_PRESENT | + V4L2_AV1_FRAME_FLAG_FRAME_SIZE_OVERRIDE | + V4L2_AV1_FRAME_FLAG_BUFFER_REMOVAL_TIME_PRESENT | + V4L2_AV1_FRAME_FLAG_FRAME_REFS_SHORT_SIGNALING)) + return -EINVAL; + + if (f->superres_denom > GENMASK(2, 0) + 9) + return -EINVAL; + + return 0; +} + +static int validate_av1_sequence(struct v4l2_ctrl_av1_sequence *s) +{ + if (s->flags & + ~(V4L2_AV1_SEQUENCE_FLAG_STILL_PICTURE | + V4L2_AV1_SEQUENCE_FLAG_USE_128X128_SUPERBLOCK | + V4L2_AV1_SEQUENCE_FLAG_ENABLE_FILTER_INTRA | + V4L2_AV1_SEQUENCE_FLAG_ENABLE_INTRA_EDGE_FILTER | + V4L2_AV1_SEQUENCE_FLAG_ENABLE_INTERINTRA_COMPOUND | + V4L2_AV1_SEQUENCE_FLAG_ENABLE_MASKED_COMPOUND | + V4L2_AV1_SEQUENCE_FLAG_ENABLE_WARPED_MOTION | + V4L2_AV1_SEQUENCE_FLAG_ENABLE_DUAL_FILTER | + V4L2_AV1_SEQUENCE_FLAG_ENABLE_ORDER_HINT | + V4L2_AV1_SEQUENCE_FLAG_ENABLE_JNT_COMP | + V4L2_AV1_SEQUENCE_FLAG_ENABLE_REF_FRAME_MVS | + V4L2_AV1_SEQUENCE_FLAG_ENABLE_SUPERRES | + V4L2_AV1_SEQUENCE_FLAG_ENABLE_CDEF | + V4L2_AV1_SEQUENCE_FLAG_ENABLE_RESTORATION | + V4L2_AV1_SEQUENCE_FLAG_MONO_CHROME | + V4L2_AV1_SEQUENCE_FLAG_COLOR_RANGE | + V4L2_AV1_SEQUENCE_FLAG_SUBSAMPLING_X | + V4L2_AV1_SEQUENCE_FLAG_SUBSAMPLING_Y | + V4L2_AV1_SEQUENCE_FLAG_FILM_GRAIN_PARAMS_PRESENT | + V4L2_AV1_SEQUENCE_FLAG_SEPARATE_UV_DELTA_Q)) + return -EINVAL; + + if (s->seq_profile == 1 && s->flags & V4L2_AV1_SEQUENCE_FLAG_MONO_CHROME) + return -EINVAL; + + /* reserved */ + if (s->seq_profile > 2) + return -EINVAL; + + /* TODO: PROFILES */ + return 0; +} + /* * Compound controls validation requires setting unused fields/flags to zero * in order to properly detect unchanged controls with v4l2_ctrl_type_op_equal's @@ -911,6 +1149,14 @@ static int std_validate_compound(const struct v4l2_ctrl *ctrl, u32 idx, case V4L2_CTRL_TYPE_VP9_FRAME: return validate_vp9_frame(p); + case V4L2_CTRL_TYPE_AV1_FRAME: + return validate_av1_frame(p); + case V4L2_CTRL_TYPE_AV1_SEQUENCE: + return validate_av1_sequence(p); + case V4L2_CTRL_TYPE_AV1_TILE_GROUP_ENTRY: + break; + case V4L2_CTRL_TYPE_AV1_FILM_GRAIN: + return validate_av1_film_grain(p); case V4L2_CTRL_TYPE_AREA: area = p; @@ -1602,6 +1848,18 @@ static struct v4l2_ctrl *v4l2_ctrl_new(struct v4l2_ctrl_handler *hdl, case V4L2_CTRL_TYPE_VP9_FRAME: elem_size = sizeof(struct v4l2_ctrl_vp9_frame); break; + case V4L2_CTRL_TYPE_AV1_SEQUENCE: + elem_size = sizeof(struct v4l2_ctrl_av1_sequence); + break; + case V4L2_CTRL_TYPE_AV1_TILE_GROUP_ENTRY: + elem_size = sizeof(struct v4l2_ctrl_av1_tile_group_entry); + break; + case V4L2_CTRL_TYPE_AV1_FRAME: + elem_size = sizeof(struct v4l2_ctrl_av1_frame); + break; + case V4L2_CTRL_TYPE_AV1_FILM_GRAIN: + elem_size = sizeof(struct v4l2_ctrl_av1_film_grain); + break; case V4L2_CTRL_TYPE_AREA: elem_size = sizeof(struct v4l2_area); break; diff --git a/drivers/media/v4l2-core/v4l2-ctrls-defs.c b/drivers/media/v4l2-core/v4l2-ctrls-defs.c index 564fedee2c88..8696eb1cdd61 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls-defs.c +++ b/drivers/media/v4l2-core/v4l2-ctrls-defs.c @@ -499,6 +499,40 @@ const char * const *v4l2_ctrl_get_menu(u32 id) NULL, }; + static const char * const av1_profile[] = { + "Main", + "High", + "Professional", + NULL, + }; + static const char * const av1_level[] = { + "2.0", + "2.1", + "2.2", + "2.3", + "3.0", + "3.1", + "3.2", + "3.3", + "4.0", + "4.1", + "4.2", + "4.3", + "5.0", + "5.1", + "5.2", + "5.3", + "6.0", + "6.1", + "6.2", + "6.3", + "7.0", + "7.1", + "7.2", + "7.3", + NULL, + }; + static const char * const hevc_profile[] = { "Main", "Main Still Picture", @@ -704,6 +738,10 @@ const char * const *v4l2_ctrl_get_menu(u32 id) return hevc_tier; case V4L2_CID_MPEG_VIDEO_HEVC_LOOP_FILTER_MODE: return hevc_loop_filter_mode; + case V4L2_CID_MPEG_VIDEO_AV1_PROFILE: + return av1_profile; + case V4L2_CID_MPEG_VIDEO_AV1_LEVEL: + return av1_level; case V4L2_CID_STATELESS_HEVC_DECODE_MODE: return hevc_decode_mode; case V4L2_CID_STATELESS_HEVC_START_CODE: @@ -1004,6 +1042,10 @@ const char *v4l2_ctrl_get_name(u32 id) case V4L2_CID_MPEG_VIDEO_REF_NUMBER_FOR_PFRAMES: return "Reference Frames for a P-Frame"; case V4L2_CID_MPEG_VIDEO_PREPEND_SPSPPS_TO_IDR: return "Prepend SPS and PPS to IDR"; + /* AV1 controls */ + case V4L2_CID_MPEG_VIDEO_AV1_PROFILE: return "AV1 Profile"; + case V4L2_CID_MPEG_VIDEO_AV1_LEVEL: return "AV1 Level"; + /* CAMERA controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ case V4L2_CID_CAMERA_CLASS: return "Camera Controls"; @@ -1190,6 +1232,10 @@ const char *v4l2_ctrl_get_name(u32 id) case V4L2_CID_STATELESS_HEVC_DECODE_MODE: return "HEVC Decode Mode"; case V4L2_CID_STATELESS_HEVC_START_CODE: return "HEVC Start Code"; case V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS: return "HEVC Entry Point Offsets"; + case V4L2_CID_STATELESS_AV1_SEQUENCE: return "AV1 Sequence Parameters"; + case V4L2_CID_STATELESS_AV1_TILE_GROUP_ENTRY: return "AV1 Tile Group Entry"; + case V4L2_CID_STATELESS_AV1_FRAME: return "AV1 Frame Parameters"; + case V4L2_CID_STATELESS_AV1_FILM_GRAIN: return "AV1 Film Grain"; /* Colorimetry controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ @@ -1365,6 +1411,8 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type, case V4L2_CID_MPEG_VIDEO_HEVC_SIZE_OF_LENGTH_FIELD: case V4L2_CID_MPEG_VIDEO_HEVC_TIER: case V4L2_CID_MPEG_VIDEO_HEVC_LOOP_FILTER_MODE: + case V4L2_CID_MPEG_VIDEO_AV1_PROFILE: + case V4L2_CID_MPEG_VIDEO_AV1_LEVEL: case V4L2_CID_STATELESS_HEVC_DECODE_MODE: case V4L2_CID_STATELESS_HEVC_START_CODE: case V4L2_CID_STATELESS_H264_DECODE_MODE: @@ -1531,6 +1579,19 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type, case V4L2_CID_STATELESS_VP9_FRAME: *type = V4L2_CTRL_TYPE_VP9_FRAME; break; + case V4L2_CID_STATELESS_AV1_SEQUENCE: + *type = V4L2_CTRL_TYPE_AV1_SEQUENCE; + break; + case V4L2_CID_STATELESS_AV1_TILE_GROUP_ENTRY: + *type = V4L2_CTRL_TYPE_AV1_TILE_GROUP_ENTRY; + *flags |= V4L2_CTRL_FLAG_DYNAMIC_ARRAY; + break; + case V4L2_CID_STATELESS_AV1_FRAME: + *type = V4L2_CTRL_TYPE_AV1_FRAME; + break; + case V4L2_CID_STATELESS_AV1_FILM_GRAIN: + *type = V4L2_CTRL_TYPE_AV1_FILM_GRAIN; + break; case V4L2_CID_UNIT_CELL_SIZE: *type = V4L2_CTRL_TYPE_AREA; *flags |= V4L2_CTRL_FLAG_READ_ONLY; diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index a858acea6547..41d7a8e3712a 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -1506,6 +1506,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt) case V4L2_PIX_FMT_QC08C: descr = "QCOM Compressed 8-bit Format"; break; case V4L2_PIX_FMT_QC10C: descr = "QCOM Compressed 10-bit Format"; break; case V4L2_PIX_FMT_AJPG: descr = "Aspeed JPEG"; break; + case V4L2_PIX_FMT_AV1_FRAME: descr = "AV1 Frame"; break; default: if (fmt->description[0]) return; diff --git a/include/media/v4l2-ctrls.h b/include/media/v4l2-ctrls.h index 7788eeb3e2bb..59679a42b3e7 100644 --- a/include/media/v4l2-ctrls.h +++ b/include/media/v4l2-ctrls.h @@ -52,6 +52,10 @@ struct video_device; * @p_hdr10_cll: Pointer to an HDR10 Content Light Level structure. * @p_hdr10_mastering: Pointer to an HDR10 Mastering Display structure. * @p_area: Pointer to an area. + * @p_av1_sequence: Pointer to an AV1 sequence structure. + * @p_av1_tile_group_entry: Pointer to an AV1 tile group entry structure. + * @p_av1_frame: Pointer to an AV1 frame structure. + * @p_av1_film_grain: Pointer to an AV1 film grain structure. * @p: Pointer to a compound value. * @p_const: Pointer to a constant compound value. */ @@ -81,6 +85,10 @@ union v4l2_ctrl_ptr { struct v4l2_ctrl_hdr10_cll_info *p_hdr10_cll; struct v4l2_ctrl_hdr10_mastering_display *p_hdr10_mastering; struct v4l2_area *p_area; + struct v4l2_ctrl_av1_sequence *p_av1_sequence; + struct v4l2_ctrl_av1_tile_group_entry *p_av1_tile_group_entry; + struct v4l2_ctrl_av1_frame *p_av1_frame; + struct v4l2_ctrl_av1_film_grain *p_av1_film_grain; void *p; const void *p_const; }; diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 7bf59a87a1bf..c3604a0a3e30 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -804,6 +804,88 @@ enum v4l2_mpeg_video_frame_skip_mode { #define V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY (V4L2_CID_CODEC_BASE + 653) #define V4L2_CID_MPEG_VIDEO_DEC_DISPLAY_DELAY_ENABLE (V4L2_CID_CODEC_BASE + 654) +#define V4L2_CID_MPEG_VIDEO_AV1_PROFILE (V4L2_CID_CODEC_BASE + 655) +/** + * enum v4l2_mpeg_video_av1_profile - AV1 profiles + * + * @V4L2_MPEG_VIDEO_AV1_PROFILE_MAIN: compliant decoders must be able to decode + * streams with seq_profile equal to 0. + * @V4L2_MPEG_VIDEO_AV1_PROFILE_HIGH: compliant decoders must be able to decode + * streams with seq_profile equal less than or equal to 1. + * @V4L2_MPEG_VIDEO_AV1_PROFILE_PROFESSIONAL: compliant decoders must be able to + * decode streams with seq_profile less than or equal to 2. + * + * Conveys the highest profile a decoder can work with. + */ +enum v4l2_mpeg_video_av1_profile { + V4L2_MPEG_VIDEO_AV1_PROFILE_MAIN = 0, + V4L2_MPEG_VIDEO_AV1_PROFILE_HIGH = 1, + V4L2_MPEG_VIDEO_AV1_PROFILE_PROFESSIONAL = 2, +}; + +#define V4L2_CID_MPEG_VIDEO_AV1_LEVEL (V4L2_CID_CODEC_BASE + 656) +/** + * enum v4l2_mpeg_video_av1_level - AV1 levels + * + * @V4L2_MPEG_VIDEO_AV1_LEVEL_2_0: Level 2.0. + * @V4L2_MPEG_VIDEO_AV1_LEVEL_2_1: Level 2.1. + * @V4L2_MPEG_VIDEO_AV1_LEVEL_2_2: Level 2.2. + * @V4L2_MPEG_VIDEO_AV1_LEVEL_2_3: Level 2.3. + * @V4L2_MPEG_VIDEO_AV1_LEVEL_3_0: Level 3.0. + * @V4L2_MPEG_VIDEO_AV1_LEVEL_3_1: Level 3.1. + * @V4L2_MPEG_VIDEO_AV1_LEVEL_3_2: Level 3.2. + * @V4L2_MPEG_VIDEO_AV1_LEVEL_3_3: Level 3.3. + * @V4L2_MPEG_VIDEO_AV1_LEVEL_4_0: Level 4.0. + * @V4L2_MPEG_VIDEO_AV1_LEVEL_4_1: Level 4.1. + * @V4L2_MPEG_VIDEO_AV1_LEVEL_4_2: Level 4.2. + * @V4L2_MPEG_VIDEO_AV1_LEVEL_4_3: Level 4.3. + * @V4L2_MPEG_VIDEO_AV1_LEVEL_5_0: Level 5.0. + * @V4L2_MPEG_VIDEO_AV1_LEVEL_5_1: Level 5.1. + * @V4L2_MPEG_VIDEO_AV1_LEVEL_5_2: Level 5.2. + * @V4L2_MPEG_VIDEO_AV1_LEVEL_5_3: Level 5.3. + * @V4L2_MPEG_VIDEO_AV1_LEVEL_6_0: Level 6.0. + * @V4L2_MPEG_VIDEO_AV1_LEVEL_6_1: Level 6.1. + * @V4L2_MPEG_VIDEO_AV1_LEVEL_6_2: Level 6.2. + * @V4L2_MPEG_VIDEO_AV1_LEVEL_6_3: Level 6.3. + * @V4L2_MPEG_VIDEO_AV1_LEVEL_7_0: Level 7.0. + * @V4L2_MPEG_VIDEO_AV1_LEVEL_7_1: Level 7.1. + * @V4L2_MPEG_VIDEO_AV1_LEVEL_7_2: Level 7.2. + * @V4L2_MPEG_VIDEO_AV1_LEVEL_7_3: Level 7.3. + * + * Conveys the highest level a decoder can work with. + */ +enum v4l2_mpeg_video_av1_level { + V4L2_MPEG_VIDEO_AV1_LEVEL_2_0 = 0, + V4L2_MPEG_VIDEO_AV1_LEVEL_2_1 = 1, + V4L2_MPEG_VIDEO_AV1_LEVEL_2_2 = 2, + V4L2_MPEG_VIDEO_AV1_LEVEL_2_3 = 3, + + V4L2_MPEG_VIDEO_AV1_LEVEL_3_0 = 4, + V4L2_MPEG_VIDEO_AV1_LEVEL_3_1 = 5, + V4L2_MPEG_VIDEO_AV1_LEVEL_3_2 = 6, + V4L2_MPEG_VIDEO_AV1_LEVEL_3_3 = 7, + + V4L2_MPEG_VIDEO_AV1_LEVEL_4_0 = 8, + V4L2_MPEG_VIDEO_AV1_LEVEL_4_1 = 9, + V4L2_MPEG_VIDEO_AV1_LEVEL_4_2 = 10, + V4L2_MPEG_VIDEO_AV1_LEVEL_4_3 = 11, + + V4L2_MPEG_VIDEO_AV1_LEVEL_5_0 = 12, + V4L2_MPEG_VIDEO_AV1_LEVEL_5_1 = 13, + V4L2_MPEG_VIDEO_AV1_LEVEL_5_2 = 14, + V4L2_MPEG_VIDEO_AV1_LEVEL_5_3 = 15, + + V4L2_MPEG_VIDEO_AV1_LEVEL_6_0 = 16, + V4L2_MPEG_VIDEO_AV1_LEVEL_6_1 = 17, + V4L2_MPEG_VIDEO_AV1_LEVEL_6_2 = 18, + V4L2_MPEG_VIDEO_AV1_LEVEL_6_3 = 19, + + V4L2_MPEG_VIDEO_AV1_LEVEL_7_0 = 20, + V4L2_MPEG_VIDEO_AV1_LEVEL_7_1 = 21, + V4L2_MPEG_VIDEO_AV1_LEVEL_7_2 = 22, + V4L2_MPEG_VIDEO_AV1_LEVEL_7_3 = 23 +}; + /* MPEG-class control IDs specific to the CX2341x driver as defined by V4L2 */ #define V4L2_CID_CODEC_CX2341X_BASE (V4L2_CTRL_CLASS_CODEC | 0x1000) #define V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE (V4L2_CID_CODEC_CX2341X_BASE+0) @@ -2758,6 +2840,645 @@ struct v4l2_ctrl_vp9_compressed_hdr { struct v4l2_vp9_mv_probs mv; }; +/* Stateless AV1 controls */ + +#define V4L2_AV1_TOTAL_REFS_PER_FRAME 8 +#define V4L2_AV1_CDEF_MAX 8 +#define V4L2_AV1_NUM_PLANES_MAX 3 /* 1 if monochrome, 3 otherwise */ +#define V4L2_AV1_MAX_SEGMENTS 8 +#define V4L2_AV1_MAX_OPERATING_POINTS (1 << 5) /* 5 bits to encode */ +#define V4L2_AV1_REFS_PER_FRAME 7 +#define V4L2_AV1_MAX_NUM_Y_POINTS (1 << 4) /* 4 bits to encode */ +#define V4L2_AV1_MAX_NUM_CB_POINTS (1 << 4) /* 4 bits to encode */ +#define V4L2_AV1_MAX_NUM_CR_POINTS (1 << 4) /* 4 bits to encode */ +#define V4L2_AV1_AR_COEFFS_SIZE 25 /* (2 * 3 * (3 + 1)) + 1 */ +#define V4L2_AV1_MAX_NUM_PLANES 3 +#define V4L2_AV1_MAX_TILE_COLS 64 +#define V4L2_AV1_MAX_TILE_ROWS 64 +#define V4L2_AV1_MAX_TILE_COUNT 512 + +#define V4L2_AV1_SEQUENCE_FLAG_STILL_PICTURE 0x00000001 +#define V4L2_AV1_SEQUENCE_FLAG_USE_128X128_SUPERBLOCK 0x00000002 +#define V4L2_AV1_SEQUENCE_FLAG_ENABLE_FILTER_INTRA 0x00000004 +#define V4L2_AV1_SEQUENCE_FLAG_ENABLE_INTRA_EDGE_FILTER 0x00000008 +#define V4L2_AV1_SEQUENCE_FLAG_ENABLE_INTERINTRA_COMPOUND 0x00000010 +#define V4L2_AV1_SEQUENCE_FLAG_ENABLE_MASKED_COMPOUND 0x00000020 +#define V4L2_AV1_SEQUENCE_FLAG_ENABLE_WARPED_MOTION 0x00000040 +#define V4L2_AV1_SEQUENCE_FLAG_ENABLE_DUAL_FILTER 0x00000080 +#define V4L2_AV1_SEQUENCE_FLAG_ENABLE_ORDER_HINT 0x00000100 +#define V4L2_AV1_SEQUENCE_FLAG_ENABLE_JNT_COMP 0x00000200 +#define V4L2_AV1_SEQUENCE_FLAG_ENABLE_REF_FRAME_MVS 0x00000400 +#define V4L2_AV1_SEQUENCE_FLAG_ENABLE_SUPERRES 0x00000800 +#define V4L2_AV1_SEQUENCE_FLAG_ENABLE_CDEF 0x00001000 +#define V4L2_AV1_SEQUENCE_FLAG_ENABLE_RESTORATION 0x00002000 +#define V4L2_AV1_SEQUENCE_FLAG_MONO_CHROME 0x00004000 +#define V4L2_AV1_SEQUENCE_FLAG_COLOR_RANGE 0x00008000 +#define V4L2_AV1_SEQUENCE_FLAG_SUBSAMPLING_X 0x00010000 +#define V4L2_AV1_SEQUENCE_FLAG_SUBSAMPLING_Y 0x00020000 +#define V4L2_AV1_SEQUENCE_FLAG_FILM_GRAIN_PARAMS_PRESENT 0x00040000 +#define V4L2_AV1_SEQUENCE_FLAG_SEPARATE_UV_DELTA_Q 0x00080000 + +#define V4L2_CID_STATELESS_AV1_SEQUENCE (V4L2_CID_CODEC_STATELESS_BASE + 500) +/** + * struct v4l2_ctrl_av1_sequence - AV1 Sequence + * + * Represents an AV1 Sequence OBU. See section 5.5 "Sequence header OBU syntax" + * for more details. + * + * @flags: See V4L2_AV1_SEQUENCE_FLAG_{}. + * @seq_profile: specifies the features that can be used in the coded video + * sequence. + * @order_hint_bits: specifies the number of bits used for the order_hint field + * at each frame. + * @bit_depth: the bitdepth to use for the sequence as described in section + * 5.5.2 "Color config syntax". + * @reserved: padding field. Should be zeroed by applications. + * @max_frame_width_minus_1: specifies the maximum frame width minus 1 for the + * frames represented by this sequence header. + * @max_frame_height_minus_1: specifies the maximum frame height minus 1 for the + * frames represented by this sequence header. + */ +struct v4l2_ctrl_av1_sequence { + __u32 flags; + __u8 seq_profile; + __u8 order_hint_bits; + __u8 bit_depth; + __u8 reserved; + __u16 max_frame_width_minus_1; + __u16 max_frame_height_minus_1; +}; + +#define V4L2_CID_STATELESS_AV1_TILE_GROUP_ENTRY (V4L2_CID_CODEC_STATELESS_BASE + 501) +/** + * struct v4l2_ctrl_av1_tile_group_entry - AV1 Tile Group entry + * + * Represents a single AV1 tile inside an AV1 Tile Group. Note that MiRowStart, + * MiRowEnd, MiColStart and MiColEnd can be retrieved from struct + * v4l2_av1_tile_info in struct v4l2_ctrl_av1_frame using tile_row and + * tile_col. See section 6.10.1 "General tile group OBU semantics" for more + * details. + * + * @tile_offset: offset from the OBU data, i.e. where the coded tile data + * actually starts. + * @tile_size: specifies the size in bytes of the coded tile. Equivalent to + * "TileSize" in the AV1 Specification. + * @tile_row: specifies the row of the current tile. Equivalent to "TileRow" in + * the AV1 Specification. + * @tile_col: specifies the col of the current tile. Equivalent to "TileCol" in + * the AV1 Specification. + */ +struct v4l2_ctrl_av1_tile_group_entry { + __u32 tile_offset; + __u32 tile_size; + __u32 tile_row; + __u32 tile_col; +}; + +/** + * enum v4l2_av1_warp_model - AV1 Warp Model as described in section 3 + * "Symbols and abbreviated terms" of the AV1 Specification. + * + * @V4L2_AV1_WARP_MODEL_IDENTITY: Warp model is just an identity transform. + * @V4L2_AV1_WARP_MODEL_TRANSLATION: Warp model is a pure translation. + * @V4L2_AV1_WARP_MODEL_ROTZOOM: Warp model is a rotation + symmetric zoom + + * translation. + * @V4L2_AV1_WARP_MODEL_AFFINE: Warp model is a general affine transform. + */ +enum v4l2_av1_warp_model { + V4L2_AV1_WARP_MODEL_IDENTITY = 0, + V4L2_AV1_WARP_MODEL_TRANSLATION = 1, + V4L2_AV1_WARP_MODEL_ROTZOOM = 2, + V4L2_AV1_WARP_MODEL_AFFINE = 3, +}; + +/** + * enum v4l2_av1_reference_frame - AV1 reference frames + * + * @V4L2_AV1_REF_INTRA_FRAME: Intra Frame Reference + * @V4L2_AV1_REF_LAST_FRAME: Last Reference Frame + * @V4L2_AV1_REF_LAST2_FRAME: Last2 Reference Frame + * @V4L2_AV1_REF_LAST3_FRAME: Last3 Reference Frame + * @V4L2_AV1_REF_GOLDEN_FRAME: Golden Reference Frame + * @V4L2_AV1_REF_BWDREF_FRAME: BWD Reference Frame + * @V4L2_AV1_REF_ALTREF2_FRAME: Alternative2 Reference Frame + * @V4L2_AV1_REF_ALTREF_FRAME: Alternative Reference Frame + */ +enum v4l2_av1_reference_frame { + V4L2_AV1_REF_INTRA_FRAME = 0, + V4L2_AV1_REF_LAST_FRAME = 1, + V4L2_AV1_REF_LAST2_FRAME = 2, + V4L2_AV1_REF_LAST3_FRAME = 3, + V4L2_AV1_REF_GOLDEN_FRAME = 4, + V4L2_AV1_REF_BWDREF_FRAME = 5, + V4L2_AV1_REF_ALTREF2_FRAME = 6, + V4L2_AV1_REF_ALTREF_FRAME = 7, +}; + +#define V4L2_AV1_GLOBAL_MOTION_IS_INVALID(ref) (1 << (ref)) + +#define V4L2_AV1_GLOBAL_MOTION_FLAG_IS_GLOBAL 0x1 +#define V4L2_AV1_GLOBAL_MOTION_FLAG_IS_ROT_ZOOM 0x2 +#define V4L2_AV1_GLOBAL_MOTION_FLAG_IS_TRANSLATION 0x4 +/** + * struct v4l2_av1_global_motion - AV1 Global Motion parameters as described in + * section 6.8.17 "Global motion params semantics" of the AV1 specification. + * + * @flags: A bitfield containing the flags per reference frame. See + * V4L2_AV1_GLOBAL_MOTION_FLAG_{} + * @type: The type of global motion transform used. + * @params: this field has the same meaning as "gm_params" in the AV1 + * specification. + * @invalid: bitfield indicating whether the global motion params are invalid + * for a given reference frame. See section 7.11.3.6 Setup shear process and + * the variable "warpValid". Use V4L2_AV1_GLOBAL_MOTION_IS_INVALID(ref) to + * create a suitable mask. + * @reserved: padding field. Should be zeroed by applications. + */ + +struct v4l2_av1_global_motion { + __u8 flags[V4L2_AV1_TOTAL_REFS_PER_FRAME]; + enum v4l2_av1_warp_model type[V4L2_AV1_TOTAL_REFS_PER_FRAME]; + __s32 params[V4L2_AV1_TOTAL_REFS_PER_FRAME][6]; + __u8 invalid; + __u8 reserved[3]; +}; + +/** + * enum v4l2_av1_frame_restoration_type - AV1 Frame Restoration Type + * @V4L2_AV1_FRAME_RESTORE_NONE: no filtering is applied. + * @V4L2_AV1_FRAME_RESTORE_WIENER: Wiener filter process is invoked. + * @V4L2_AV1_FRAME_RESTORE_SGRPROJ: self guided filter process is invoked. + * @V4L2_AV1_FRAME_RESTORE_SWITCHABLE: restoration filter is swichtable. + */ +enum v4l2_av1_frame_restoration_type { + V4L2_AV1_FRAME_RESTORE_NONE = 0, + V4L2_AV1_FRAME_RESTORE_WIENER = 1, + V4L2_AV1_FRAME_RESTORE_SGRPROJ = 2, + V4L2_AV1_FRAME_RESTORE_SWITCHABLE = 3, +}; + +#define V4L2_AV1_LOOP_RESTORATION_FLAG_USES_LR 0x1 +#define V4L2_AV1_LOOP_RESTORATION_FLAG_USES_CHROMA_LR 0x2 + +/** + * struct v4l2_av1_loop_restoration - AV1 Loop Restauration as described in + * section 6.10.15 "Loop restoration params semantics" of the AV1 specification. + * + * @flags: See V4L2_AV1_LOOP_RESTORATION_FLAG_{}. + * @lr_unit_shift: specifies if the luma restoration size should be halved. + * @lr_uv_shift: specifies if the chroma size should be half the luma size. + * @reserved: padding field. Should be zeroed by applications. + * @frame_restoration_type: specifies the type of restoration used for each + * plane. See enum v4l2_av1_frame_restoration_type. + * @loop_restoration_size: specifies the size of loop restoration units in units + * of samples in the current plane. + */ +struct v4l2_av1_loop_restoration { + __u8 flags; + __u8 lr_unit_shift; + __u8 lr_uv_shift; + __u8 reserved; + enum v4l2_av1_frame_restoration_type frame_restoration_type[V4L2_AV1_NUM_PLANES_MAX]; + __u32 loop_restoration_size[V4L2_AV1_MAX_NUM_PLANES]; +}; + +/** + * struct v4l2_av1_cdef - AV1 CDEF params semantics as described in section + * 6.10.14 "CDEF params semantics" of the AV1 specification + * + * @damping_minus_3: controls the amount of damping in the deringing filter. + * @bits: specifies the number of bits needed to specify which CDEF filter to + * apply. + * @y_pri_strength: specifies the strength of the primary filter. + * @y_sec_strength: specifies the strength of the secondary filter. + * @uv_pri_strength: specifies the strength of the primary filter. + * @uv_sec_strength: specifies the strength of the secondary filter. + */ +struct v4l2_av1_cdef { + __u8 damping_minus_3; + __u8 bits; + __u8 y_pri_strength[V4L2_AV1_CDEF_MAX]; + __u8 y_sec_strength[V4L2_AV1_CDEF_MAX]; + __u8 uv_pri_strength[V4L2_AV1_CDEF_MAX]; + __u8 uv_sec_strength[V4L2_AV1_CDEF_MAX]; +}; + +#define V4L2_AV1_SEGMENTATION_FLAG_ENABLED 0x1 +#define V4L2_AV1_SEGMENTATION_FLAG_UPDATE_MAP 0x2 +#define V4L2_AV1_SEGMENTATION_FLAG_TEMPORAL_UPDATE 0x4 +#define V4L2_AV1_SEGMENTATION_FLAG_UPDATE_DATA 0x8 +#define V4L2_AV1_SEGMENTATION_FLAG_SEG_ID_PRE_SKIP 0x10 + +/** + * enum v4l2_av1_segment_feature - AV1 segment features as described in section + * 3 "Symbols and abbreviated terms" of the AV1 specification. + * + * @V4L2_AV1_SEG_LVL_ALT_Q: Index for quantizer segment feature. + * @V4L2_AV1_SEG_LVL_ALT_LF_Y_V: Index for vertical luma loop filter segment + * feature. + * @V4L2_AV1_SEG_LVL_REF_FRAME: Index for reference frame segment feature. + * @V4L2_AV1_SEG_LVL_REF_SKIP: Index for skip segment feature. + * @V4L2_AV1_SEG_LVL_REF_GLOBALMV: Index for global mv feature. + * @V4L2_AV1_SEG_LVL_MAX: Number of segment features. + */ +enum v4l2_av1_segment_feature { + V4L2_AV1_SEG_LVL_ALT_Q = 0, + V4L2_AV1_SEG_LVL_ALT_LF_Y_V = 1, + V4L2_AV1_SEG_LVL_REF_FRAME = 5, + V4L2_AV1_SEG_LVL_REF_SKIP = 6, + V4L2_AV1_SEG_LVL_REF_GLOBALMV = 7, + V4L2_AV1_SEG_LVL_MAX = 8 +}; + +#define V4L2_AV1_SEGMENT_FEATURE_ENABLED(id) (1 << (id)) + +/** + * struct v4l2_av1_segmentation - AV1 Segmentation params as defined in section + * 6.8.13 "Segmentation params semantics" of the AV1 specification. + * + * @flags: see V4L2_AV1_SEGMENTATION_FLAG_{}. + * @last_active_seg_id: indicates the highest numbered segment id that has some + * enabled feature. This is used when decoding the segment id to only decode + * choices corresponding to used segments. + * @feature_enabled: bitmask defining which features are enabled in each + * segment. Use V4L2_AV1_SEGMENT_FEATURE_ENABLED to build a suitable mask. + * @feature_data: data attached to each feature. Data entry is only valid if the + * feature is enabled + */ +struct v4l2_av1_segmentation { + __u8 flags; + __u8 last_active_seg_id; + __u8 feature_enabled[V4L2_AV1_MAX_SEGMENTS]; + __s16 feature_data[V4L2_AV1_MAX_SEGMENTS][V4L2_AV1_SEG_LVL_MAX]; +}; + +#define V4L2_AV1_LOOP_FILTER_FLAG_DELTA_ENABLED 0x1 +#define V4L2_AV1_LOOP_FILTER_FLAG_DELTA_UPDATE 0x2 +#define V4L2_AV1_LOOP_FILTER_FLAG_DELTA_LF_PRESENT 0x4 +#define V4L2_AV1_LOOP_FILTER_FLAG_DELTA_LF_MULTI 0x8 + +/** + * struct v4l2_av1_loop_filter - AV1 Loop filter params as defined in section + * 6.8.10 "Loop filter semantics" and 6.8.16 "Loop filter delta parameters + * semantics" of the AV1 specification. + * + * @flags: see V4L2_AV1_LOOP_FILTER_FLAG_{} + * @level: an array containing loop filter strength values. Different loop + * filter strength values from the array are used depending on the image plane + * being filtered, and the edge direction (vertical or horizontal) being + * filtered. + * @sharpness: indicates the sharpness level. The loop_filter_level and + * loop_filter_sharpness together determine when a block edge is filtered, and + * by how much the filtering can change the sample values. The loop filter + * process is described in section 7.14 of the AV1 specification. + * @ref_deltas: contains the adjustment needed for the filter level based on the + * chosen reference frame. If this syntax element is not present, it maintains + * its previous value. + * @mode_deltas: contains the adjustment needed for the filter level based on + * the chosen mode. If this syntax element is not present, it maintains its + * previous value. + * @delta_lf_res: specifies the left shift which should be applied to decoded + * loop filter delta values. + */ +struct v4l2_av1_loop_filter { + __u8 flags; + __u8 level[4]; + __u8 sharpness; + __s8 ref_deltas[V4L2_AV1_TOTAL_REFS_PER_FRAME]; + __s8 mode_deltas[2]; + __u8 delta_lf_res; +}; + +#define V4L2_AV1_QUANTIZATION_FLAG_DIFF_UV_DELTA 0x1 +#define V4L2_AV1_QUANTIZATION_FLAG_USING_QMATRIX 0x2 +#define V4L2_AV1_QUANTIZATION_FLAG_DELTA_Q_PRESENT 0x4 + +/** + * struct v4l2_av1_quantization - AV1 Quantization params as defined in section + * 6.8.11 "Quantization params semantics" of the AV1 specification. + * + * @flags: see V4L2_AV1_QUANTIZATION_FLAG_{} + * @base_q_idx: indicates the base frame qindex. This is used for Y AC + * coefficients and as the base value for the other quantizers. + * @delta_q_y_dc: indicates the Y DC quantizer relative to base_q_idx. + * @delta_q_u_dc: indicates the U DC quantizer relative to base_q_idx. + * @delta_q_u_ac: indicates the U AC quantizer relative to base_q_idx. + * @delta_q_v_dc: indicates the V DC quantizer relative to base_q_idx. + * @delta_q_v_ac: indicates the V AC quantizer relative to base_q_idx. + * @qm_y: specifies the level in the quantizer matrix that should be used for + * luma plane decoding. + * @qm_u: specifies the level in the quantizer matrix that should be used for + * chroma U plane decoding. + * @qm_v: specifies the level in the quantizer matrix that should be used for + * chroma V plane decoding. + * @delta_q_res: specifies the left shift which should be applied to decoded + * quantizer index delta values. + */ +struct v4l2_av1_quantization { + __u8 flags; + __u8 base_q_idx; + __s8 delta_q_y_dc; + __s8 delta_q_u_dc; + __s8 delta_q_u_ac; + __s8 delta_q_v_dc; + __s8 delta_q_v_ac; + __u8 qm_y; + __u8 qm_u; + __u8 qm_v; + __u8 delta_q_res; +}; + +#define V4L2_AV1_TILE_INFO_FLAG_UNIFORM_TILE_SPACING 0x1 + +/** + * struct v4l2_av1_tile_info - AV1 Tile info as defined in section 6.8.14 "Tile + * info semantics" of the AV1 specification. + * + * @flags: see V4L2_AV1_TILE_INFO_FLAG_{} + * @context_update_tile_id: specifies which tile to use for the CDF update. + * @tile_rows: specifies the number of tiles down the frame. + * @tile_cols: specifies the number of tiles across the frame. + * @mi_col_starts: an array specifying the start column (in units of 4x4 luma + * samples) for each tile across the image. + * @mi_row_starts: an array specifying the start row (in units of 4x4 luma + * samples) for each tile down the image. + * @width_in_sbs_minus_1: specifies the width of a tile minus 1 in units of + * superblocks. + * @height_in_sbs_minus_1: specifies the height of a tile minus 1 in units of + * superblocks. + * @tile_size_bytes: specifies the number of bytes needed to code each tile + * size. + * @reserved: padding field. Should be zeroed by applications. + */ +struct v4l2_av1_tile_info { + __u8 flags; + __u8 context_update_tile_id; + __u8 tile_cols; + __u8 tile_rows; + __u32 mi_col_starts[V4L2_AV1_MAX_TILE_COLS + 1]; + __u32 mi_row_starts[V4L2_AV1_MAX_TILE_ROWS + 1]; + __u32 width_in_sbs_minus_1[V4L2_AV1_MAX_TILE_COLS]; + __u32 height_in_sbs_minus_1[V4L2_AV1_MAX_TILE_ROWS]; + __u8 tile_size_bytes; + __u8 reserved[3]; +}; + +/** + * enum v4l2_av1_frame_type - AV1 Frame Type + * + * @V4L2_AV1_KEY_FRAME: Key frame + * @V4L2_AV1_INTER_FRAME: Inter frame + * @V4L2_AV1_INTRA_ONLY_FRAME: Intra-only frame + * @V4L2_AV1_SWITCH_FRAME: Switch frame + */ +enum v4l2_av1_frame_type { + V4L2_AV1_KEY_FRAME = 0, + V4L2_AV1_INTER_FRAME = 1, + V4L2_AV1_INTRA_ONLY_FRAME = 2, + V4L2_AV1_SWITCH_FRAME = 3 +}; + +/** + * enum v4l2_av1_interpolation_filter - AV1 interpolation filter types + * + * @V4L2_AV1_INTERPOLATION_FILTER_EIGHTTAP: eight tap filter + * @V4L2_AV1_INTERPOLATION_FILTER_EIGHTTAP_SMOOTH: eight tap smooth filter + * @V4L2_AV1_INTERPOLATION_FILTER_EIGHTTAP_SHARP: eight tap sharp filter + * @V4L2_AV1_INTERPOLATION_FILTER_BILINEAR: bilinear filter + * @V4L2_AV1_INTERPOLATION_FILTER_SWITCHABLE: filter selection is signaled at + * the block level + * + * See section 6.8.9 "Interpolation filter semantics" of the AV1 specification + * for more details. + */ +enum v4l2_av1_interpolation_filter { + V4L2_AV1_INTERPOLATION_FILTER_EIGHTTAP = 0, + V4L2_AV1_INTERPOLATION_FILTER_EIGHTTAP_SMOOTH = 1, + V4L2_AV1_INTERPOLATION_FILTER_EIGHTTAP_SHARP = 2, + V4L2_AV1_INTERPOLATION_FILTER_BILINEAR = 3, + V4L2_AV1_INTERPOLATION_FILTER_SWITCHABLE = 4, +}; + +/** + * enum v4l2_av1_tx_mode - AV1 Tx mode as described in section 6.8.21 "TX mode + * semantics" of the AV1 specification. + * @V4L2_AV1_TX_MODE_ONLY_4X4: the inverse transform will use only 4x4 + * transforms + * @V4L2_AV1_TX_MODE_LARGEST: the inverse transform will use the largest + * transform size that fits inside the block + * @V4L2_AV1_TX_MODE_SELECT: the choice of transform size is specified + * explicitly for each block. + */ +enum v4l2_av1_tx_mode { + V4L2_AV1_TX_MODE_ONLY_4X4 = 0, + V4L2_AV1_TX_MODE_LARGEST = 1, + V4L2_AV1_TX_MODE_SELECT = 2 +}; + +#define V4L2_AV1_FRAME_FLAG_SHOW_FRAME 0x00000001 +#define V4L2_AV1_FRAME_FLAG_SHOWABLE_FRAME 0x00000002 +#define V4L2_AV1_FRAME_FLAG_ERROR_RESILIENT_MODE 0x00000004 +#define V4L2_AV1_FRAME_FLAG_DISABLE_CDF_UPDATE 0x00000008 +#define V4L2_AV1_FRAME_FLAG_ALLOW_SCREEN_CONTENT_TOOLS 0x00000010 +#define V4L2_AV1_FRAME_FLAG_FORCE_INTEGER_MV 0x00000020 +#define V4L2_AV1_FRAME_FLAG_ALLOW_INTRABC 0x00000040 +#define V4L2_AV1_FRAME_FLAG_USE_SUPERRES 0x00000080 +#define V4L2_AV1_FRAME_FLAG_ALLOW_HIGH_PRECISION_MV 0x00000100 +#define V4L2_AV1_FRAME_FLAG_IS_MOTION_MODE_SWITCHABLE 0x00000200 +#define V4L2_AV1_FRAME_FLAG_USE_REF_FRAME_MVS 0x00000400 +#define V4L2_AV1_FRAME_FLAG_DISABLE_FRAME_END_UPDATE_CDF 0x00000800 +#define V4L2_AV1_FRAME_FLAG_ALLOW_WARPED_MOTION 0x00001000 +#define V4L2_AV1_FRAME_FLAG_REFERENCE_SELECT 0x00002000 +#define V4L2_AV1_FRAME_FLAG_REDUCED_TX_SET 0x00004000 +#define V4L2_AV1_FRAME_FLAG_SKIP_MODE_ALLOWED 0x00008000 +#define V4L2_AV1_FRAME_FLAG_SKIP_MODE_PRESENT 0x00010000 +#define V4L2_AV1_FRAME_FLAG_FRAME_SIZE_OVERRIDE 0x00020000 +#define V4L2_AV1_FRAME_FLAG_BUFFER_REMOVAL_TIME_PRESENT 0x00040000 +#define V4L2_AV1_FRAME_FLAG_FRAME_REFS_SHORT_SIGNALING 0x00080000 + +#define V4L2_CID_STATELESS_AV1_FRAME (V4L2_CID_CODEC_STATELESS_BASE + 502) +/** + * struct v4l2_ctrl_av1_frame - Represents an AV1 Frame Header OBU. + * + * @tile_info: tile info + * @quantization: quantization params + * @segmentation: segmentation params + * @superres_denom: the denominator for the upscaling ratio. + * @loop_filter: loop filter params + * @cdef: cdef params + * @skip_mode_frame: specifies the frames to use for compound prediction when + * skip_mode is equal to 1. + * @primary_ref_frame: specifies which reference frame contains the CDF values + * and other state that should be loaded at the start of the frame. + * @loop_restoration: loop restoration params + * @global_motion: global motion params + * @flags: see V4L2_AV1_FRAME_FLAG_{} + * @frame_type: specifies the AV1 frame type + * @order_hint: specifies OrderHintBits least significant bits of the expected + * output order for this frame. + * @upscaled_width: the upscaled width. + * @interpolation_filter: specifies the filter selection used for performing + * inter prediction. + * @tx_mode: specifies how the transform size is determined. + * @frame_width_minus_1: add 1 to get the frame's width. + * @frame_height_minus_1: add 1 to get the frame's height + * @render_width_minus_1: add 1 to get the render width of the frame in luma + * samples. + * @render_height_minus_1: add 1 to get the render height of the frame in luma + * samples. + * @current_frame_id: specifies the frame id number for the current frame. Frame + * id numbers are additional information that do not affect the decoding + * process, but provide decoders with a way of detecting missing reference + * frames so that appropriate action can be taken. + * @buffer_removal_time: specifies the frame removal time in units of DecCT clock + * ticks counted from the removal time of the last random access point for + * operating point opNum. + * @reserved: padding field. Should be zeroed by applications. + * @order_hints: specifies the expected output order hint for each reference + * frame. This field corresponds to the OrderHints variable from the + * specification (section 5.9.2 "Uncompressed header syntax"). As such, this is + * only used for non-intra frames and ignored otherwise. order_hints[0] is + * always ignored. + * @reference_frame_ts: the V4L2 timestamp of the reference frame slots. + * @ref_frame_idx: used to index into @reference_frame_ts when decoding + * inter-frames. The meaning of this array is the same as in the specification. + * The timestamp refers to the timestamp field in struct v4l2_buffer. Use + * v4l2_timeval_to_ns() to convert the struct timeval to a __u64. + * @refresh_frame_flags: contains a bitmask that specifies which reference frame + * slots will be updated with the current frame after it is decoded. + */ +struct v4l2_ctrl_av1_frame { + struct v4l2_av1_tile_info tile_info; + struct v4l2_av1_quantization quantization; + __u8 superres_denom; + struct v4l2_av1_segmentation segmentation; + struct v4l2_av1_loop_filter loop_filter; + struct v4l2_av1_cdef cdef; + __u8 skip_mode_frame[2]; + __u8 primary_ref_frame; + struct v4l2_av1_loop_restoration loop_restoration; + struct v4l2_av1_global_motion global_motion; + __u32 flags; + enum v4l2_av1_frame_type frame_type; + __u32 order_hint; + __u32 upscaled_width; + enum v4l2_av1_interpolation_filter interpolation_filter; + enum v4l2_av1_tx_mode tx_mode; + __u32 frame_width_minus_1; + __u32 frame_height_minus_1; + __u16 render_width_minus_1; + __u16 render_height_minus_1; + + __u32 current_frame_id; + __u32 buffer_removal_time[V4L2_AV1_MAX_OPERATING_POINTS]; + __u8 reserved[4]; + __u32 order_hints[V4L2_AV1_TOTAL_REFS_PER_FRAME]; + __u64 reference_frame_ts[V4L2_AV1_TOTAL_REFS_PER_FRAME]; + __s8 ref_frame_idx[V4L2_AV1_REFS_PER_FRAME]; + __u8 refresh_frame_flags; +}; + +#define V4L2_AV1_FILM_GRAIN_FLAG_APPLY_GRAIN 0x1 +#define V4L2_AV1_FILM_GRAIN_FLAG_UPDATE_GRAIN 0x2 +#define V4L2_AV1_FILM_GRAIN_FLAG_CHROMA_SCALING_FROM_LUMA 0x4 +#define V4L2_AV1_FILM_GRAIN_FLAG_OVERLAP 0x8 +#define V4L2_AV1_FILM_GRAIN_FLAG_CLIP_TO_RESTRICTED_RANGE 0x10 + +#define V4L2_CID_STATELESS_AV1_FILM_GRAIN (V4L2_CID_CODEC_STATELESS_BASE + 505) +/** + * struct v4l2_ctrl_av1_film_grain - AV1 Film Grain parameters. + * + * Film grain parameters as specified by section 6.8.20 of the AV1 Specification. + * + * @flags: see V4L2_AV1_FILM_GRAIN_{}. + * @cr_mult: represents a multiplier for the cr component used in derivation of + * the input index to the cr component scaling function. + * @grain_seed: specifies the starting value for the pseudo-random numbers used + * during film grain synthesis. + * @film_grain_params_ref_idx: indicates which reference frame contains the + * film grain parameters to be used for this frame. + * @num_y_points: specifies the number of points for the piece-wise linear + * scaling function of the luma component. + * @point_y_value: represents the x (luma value) coordinate for the i-th point + * of the piecewise linear scaling function for luma component. The values are + * signaled on the scale of 0..255. In case of 10 bit video, these values + * correspond to luma values divided by 4. In case of 12 bit video, these values + * correspond to luma values divided by 16. + * @point_y_scaling: represents the scaling (output) value for the i-th point + * of the piecewise linear scaling function for luma component. + * @num_cb_points: specifies the number of points for the piece-wise linear + * scaling function of the cb component. + * @point_cb_value: represents the x coordinate for the i-th point of the + * piece-wise linear scaling function for cb component. The values are signaled + * on the scale of 0..255. + * @point_cb_scaling: represents the scaling (output) value for the i-th point + * of the piecewise linear scaling function for cb component. + * @num_cr_points: specifies represents the number of points for the piece-wise + * linear scaling function of the cr component. + * @point_cr_value: represents the x coordinate for the i-th point of the + * piece-wise linear scaling function for cr component. The values are signaled + * on the scale of 0..255. + * @point_cr_scaling: represents the scaling (output) value for the i-th point + * of the piecewise linear scaling function for cr component. + * @grain_scaling_minus_8: represents the shift – 8 applied to the values of the + * chroma component. The grain_scaling_minus_8 can take values of 0..3 and + * determines the range and quantization step of the standard deviation of film + * grain. + * @ar_coeff_lag: specifies the number of auto-regressive coefficients for luma + * and chroma. + * @ar_coeffs_y_plus_128: specifies auto-regressive coefficients used for the Y + * plane. + * @ar_coeffs_cb_plus_128: specifies auto-regressive coefficients used for the U + * plane. + * @ar_coeffs_cr_plus_128: specifies auto-regressive coefficients used for the V + * plane. + * @ar_coeff_shift_minus_6: specifies the range of the auto-regressive + * coefficients. Values of 0, 1, 2, and 3 correspond to the ranges for + * auto-regressive coefficients of [-2, 2), [-1, 1), [-0.5, 0.5) and [-0.25, + * 0.25) respectively. + * @grain_scale_shift: specifies how much the Gaussian random numbers should be + * scaled down during the grain synthesis process. + * @cb_mult: represents a multiplier for the cb component used in derivation of + * the input index to the cb component scaling function. + * @cb_luma_mult: represents a multiplier for the average luma component used in + * derivation of the input index to the cb component scaling function. + * @cr_luma_mult: represents a multiplier for the average luma component used in + * derivation of the input index to the cr component scaling function. + * @cb_offset: represents an offset used in derivation of the input index to the + * cb component scaling function. + * @cr_offset: represents an offset used in derivation of the input index to the + * cr component scaling function. + * @reserved: padding field. Should be zeroed by applications. + */ +struct v4l2_ctrl_av1_film_grain { + __u8 flags; + __u8 cr_mult; + __u16 grain_seed; + __u8 film_grain_params_ref_idx; + __u8 num_y_points; + __u8 point_y_value[V4L2_AV1_MAX_NUM_Y_POINTS]; + __u8 point_y_scaling[V4L2_AV1_MAX_NUM_Y_POINTS]; + __u8 num_cb_points; + __u8 point_cb_value[V4L2_AV1_MAX_NUM_CB_POINTS]; + __u8 point_cb_scaling[V4L2_AV1_MAX_NUM_CB_POINTS]; + __u8 num_cr_points; + __u8 point_cr_value[V4L2_AV1_MAX_NUM_CR_POINTS]; + __u8 point_cr_scaling[V4L2_AV1_MAX_NUM_CR_POINTS]; + __u8 grain_scaling_minus_8; + __u8 ar_coeff_lag; + __u8 ar_coeffs_y_plus_128[V4L2_AV1_AR_COEFFS_SIZE]; + __u8 ar_coeffs_cb_plus_128[V4L2_AV1_AR_COEFFS_SIZE]; + __u8 ar_coeffs_cr_plus_128[V4L2_AV1_AR_COEFFS_SIZE]; + __u8 ar_coeff_shift_minus_6; + __u8 grain_scale_shift; + __u8 cb_mult; + __u8 cb_luma_mult; + __u8 cr_luma_mult; + __u16 cb_offset; + __u16 cr_offset; + __u8 reserved[4]; +}; + /* MPEG-compression definitions kept for backwards compatibility */ #ifndef __KERNEL__ #define V4L2_CTRL_CLASS_MPEG V4L2_CTRL_CLASS_CODEC diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 5d8bd754c69f..585109d8a643 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -758,6 +758,7 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_FWHT_STATELESS v4l2_fourcc('S', 'F', 'W', 'H') /* Stateless FWHT (vicodec) */ #define V4L2_PIX_FMT_H264_SLICE v4l2_fourcc('S', '2', '6', '4') /* H264 parsed slices */ #define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ +#define V4L2_PIX_FMT_AV1_FRAME v4l2_fourcc('A', 'V', '1', 'F') /* AV1 parsed frame */ #define V4L2_PIX_FMT_SPK v4l2_fourcc('S', 'P', 'K', '0') /* Sorenson Spark */ #define V4L2_PIX_FMT_RV30 v4l2_fourcc('R', 'V', '3', '0') /* RealVideo 8 */ #define V4L2_PIX_FMT_RV40 v4l2_fourcc('R', 'V', '4', '0') /* RealVideo 9 & 10 */ @@ -1828,6 +1829,10 @@ struct v4l2_ext_control { struct v4l2_ctrl_hevc_slice_params __user *p_hevc_slice_params; struct v4l2_ctrl_hevc_scaling_matrix __user *p_hevc_scaling_matrix; struct v4l2_ctrl_hevc_decode_params __user *p_hevc_decode_params; + struct v4l2_ctrl_av1_sequence __user *p_av1_sequence; + struct v4l2_ctrl_av1_tile_group_entry __user *p_av1_tile_group_entry; + struct v4l2_ctrl_av1_frame __user *p_av1_frame; + struct v4l2_ctrl_av1_film_grain __user *p_av1_film_grain; void __user *ptr; }; } __attribute__ ((packed)); @@ -1901,6 +1906,11 @@ enum v4l2_ctrl_type { V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS = 0x0272, V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX = 0x0273, V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS = 0x0274, + + V4L2_CTRL_TYPE_AV1_SEQUENCE = 0x280, + V4L2_CTRL_TYPE_AV1_TILE_GROUP_ENTRY = 0x281, + V4L2_CTRL_TYPE_AV1_FRAME = 0x282, + V4L2_CTRL_TYPE_AV1_FILM_GRAIN = 0x283, }; /* Used in the VIDIOC_QUERYCTRL ioctl for querying controls */ -- cgit v1.2.3 From fc91af075512386facb5f1a84fe85dcf28f68767 Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Wed, 3 May 2023 09:34:29 +0100 Subject: media: Add NV15_4L4 pixel format NV15_4L4 is the 10-bits per component 4x4 tiled format. Signed-off-by: Benjamin Gaignard Signed-off-by: Nicolas Dufresne Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../userspace-api/media/v4l/pixfmt-yuv-planar.rst | 16 ++++++++++++++++ drivers/media/v4l2-core/v4l2-common.c | 2 ++ drivers/media/v4l2-core/v4l2-ioctl.c | 1 + include/uapi/linux/videodev2.h | 1 + 4 files changed, 20 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst b/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst index 72324274f20c..1840224faa41 100644 --- a/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst +++ b/Documentation/userspace-api/media/v4l/pixfmt-yuv-planar.rst @@ -137,6 +137,13 @@ All components are stored with the same number of bits per component. - Cb, Cr - No - Linear + * - V4L2_PIX_FMT_NV15_4L4 + - 'VT15' + - 15 + - 4:2:0 + - Cb, Cr + - Yes + - 4x4 tiles * - V4L2_PIX_FMT_NV16 - 'NV16' - 8 @@ -378,6 +385,15 @@ two non-contiguous planes. Example V4L2_PIX_FMT_NV12MT memory layout of tiles +.. _V4L2-PIX-FMT-NV15-4L4: + +Tiled NV15 +---------- + +Semi-planar 10-bit YUV 4:2:0 formats, using 4x4 tiling. +All components are packed without any padding between each other. +As a side-effect, each group of 4 components are stored over 5 bytes +(YYYY or UVUV = 4 * 10 bits = 40 bits = 5 bytes). .. _V4L2-PIX-FMT-NV16: .. _V4L2-PIX-FMT-NV61: diff --git a/drivers/media/v4l2-core/v4l2-common.c b/drivers/media/v4l2-core/v4l2-common.c index 4b3faec14f69..bee1535b04d3 100644 --- a/drivers/media/v4l2-core/v4l2-common.c +++ b/drivers/media/v4l2-core/v4l2-common.c @@ -283,6 +283,8 @@ const struct v4l2_format_info *v4l2_format_info(u32 format) /* Tiled YUV formats */ { .format = V4L2_PIX_FMT_NV12_4L4, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .bpp_div = { 1, 1, 1, 1 }, .hdiv = 2, .vdiv = 2 }, + { .format = V4L2_PIX_FMT_NV15_4L4, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 5, 10, 0, 0 }, .bpp_div = { 4, 4, 1, 1 }, .hdiv = 2, .vdiv = 2, + .block_w = { 4, 2, 0, 0 }, .block_h = { 1, 1, 0, 0 }}, { .format = V4L2_PIX_FMT_P010_4L4, .pixel_enc = V4L2_PIXEL_ENC_YUV, .mem_planes = 1, .comp_planes = 2, .bpp = { 2, 4, 0, 0 }, .bpp_div = { 1, 1, 1, 1 }, .hdiv = 2, .vdiv = 2 }, /* YUV planar formats, non contiguous variant */ diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index 41d7a8e3712a..01ba27f2ef87 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -1356,6 +1356,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt) case V4L2_PIX_FMT_NV12_4L4: descr = "Y/UV 4:2:0 (4x4 Linear)"; break; case V4L2_PIX_FMT_NV12_16L16: descr = "Y/UV 4:2:0 (16x16 Linear)"; break; case V4L2_PIX_FMT_NV12_32L32: descr = "Y/UV 4:2:0 (32x32 Linear)"; break; + case V4L2_PIX_FMT_NV15_4L4: descr = "10-bit Y/UV 4:2:0 (4x4 Linear)"; break; case V4L2_PIX_FMT_P010_4L4: descr = "10-bit Y/UV 4:2:0 (4x4 Linear)"; break; case V4L2_PIX_FMT_NV12M: descr = "Y/UV 4:2:0 (N-C)"; break; case V4L2_PIX_FMT_NV21M: descr = "Y/VU 4:2:0 (N-C)"; break; diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 585109d8a643..3af6a82d0cad 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -672,6 +672,7 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_NV12_4L4 v4l2_fourcc('V', 'T', '1', '2') /* 12 Y/CbCr 4:2:0 4x4 tiles */ #define V4L2_PIX_FMT_NV12_16L16 v4l2_fourcc('H', 'M', '1', '2') /* 12 Y/CbCr 4:2:0 16x16 tiles */ #define V4L2_PIX_FMT_NV12_32L32 v4l2_fourcc('S', 'T', '1', '2') /* 12 Y/CbCr 4:2:0 32x32 tiles */ +#define V4L2_PIX_FMT_NV15_4L4 v4l2_fourcc('V', 'T', '1', '5') /* 15 Y/CbCr 4:2:0 10-bit 4x4 tiles */ #define V4L2_PIX_FMT_P010_4L4 v4l2_fourcc('T', '0', '1', '0') /* 12 Y/CbCr 4:2:0 10-bit 4x4 macroblocks */ #define V4L2_PIX_FMT_NV12_8L128 v4l2_fourcc('A', 'T', '1', '2') /* Y/CbCr 4:2:0 8x128 tiles */ #define V4L2_PIX_FMT_NV12_10BE_8L128 v4l2_fourcc_be('A', 'X', '1', '2') /* Y/CbCr 4:2:0 10-bit 8x128 tiles */ -- cgit v1.2.3 From ba3c87fffb79311f54464288c66421d19c2c1234 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 2 Jun 2023 12:58:05 -0400 Subject: amd/amdkfd: drop unused KFD_IOCTL_SVM_FLAG_UNCACHED flag Was leftover from GC 9.4.3 bring up and is currently unused. Drop it for now. Cc: Philip.Yang@amd.com Cc: rajneesh.bhardwaj@amd.com Cc: Felix.Kuehling@amd.com Reviewed-by: Rajneesh Bhardwaj Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 2 +- include/uapi/linux/kfd_ioctl.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 615eab3f78c9..5ff1a5a89d96 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1154,7 +1154,7 @@ svm_range_get_pte_flags(struct kfd_node *node, uint64_t pte_flags; bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN); bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT; - bool uncached = flags & KFD_IOCTL_SVM_FLAG_UNCACHED; + bool uncached = false; /*flags & KFD_IOCTL_SVM_FLAG_UNCACHED;*/ unsigned int mtype_local; if (domain == SVM_RANGE_VRAM_DOMAIN) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 2a9671e1ddb5..2da5c3ad71bd 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -623,8 +623,6 @@ enum kfd_mmio_remap { #define KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY 0x00000020 /* Keep GPU memory mapping always valid as if XNACK is disable */ #define KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED 0x00000040 -/* Uncached access to memory */ -#define KFD_IOCTL_SVM_FLAG_UNCACHED 0x00000080 /** * kfd_ioctl_svm_op - SVM ioctl operations -- cgit v1.2.3 From 4f98cf2baf9faee5b6f2f7889dad7c0f7686a787 Mon Sep 17 00:00:00 2001 From: Jonathan Kim Date: Wed, 2 Mar 2022 14:30:12 -0500 Subject: drm/amdkfd: add debug and runtime enable interface Introduce the GPU debug operations interface. For ROCm-GDB to extend the GNU Debugger's ability to inspect the AMD GPU instruction set, provide the necessary interface to allow the debugger to HW debug-mode set and query exceptions per HSA queue, process or device. The runtime_enable interface coordinates exception handling with the HSA runtime. Usage is available in the kern docs at uapi/linux/kfd_ioctl.h. Signed-off-by: Jonathan Kim Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 48 +++ include/uapi/linux/kfd_ioctl.h | 668 ++++++++++++++++++++++++++++++- 2 files changed, 715 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index a9efff94390b..00e34125987c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -2729,6 +2729,48 @@ static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data) return ret; } +static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, void *data) +{ + return 0; +} + +static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, void *data) +{ + struct kfd_ioctl_dbg_trap_args *args = data; + int r = 0; + + if (sched_policy == KFD_SCHED_POLICY_NO_HWS) { + pr_err("Debugging does not support sched_policy %i", sched_policy); + return -EINVAL; + } + + switch (args->op) { + case KFD_IOC_DBG_TRAP_ENABLE: + case KFD_IOC_DBG_TRAP_DISABLE: + case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT: + case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED: + case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE: + case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE: + case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES: + case KFD_IOC_DBG_TRAP_RESUME_QUEUES: + case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH: + case KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH: + case KFD_IOC_DBG_TRAP_SET_FLAGS: + case KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT: + case KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO: + case KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT: + case KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT: + pr_warn("Debugging not supported yet\n"); + r = -EACCES; + break; + default: + pr_err("Invalid option: %i\n", args->op); + r = -EINVAL; + } + + return r; +} + #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \ .cmd_drv = 0, .name = #ioctl} @@ -2841,6 +2883,12 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_EXPORT_DMABUF, kfd_ioctl_export_dmabuf, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_RUNTIME_ENABLE, + kfd_ioctl_runtime_enable, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_TRAP, + kfd_ioctl_set_debug_trap, 0), }; #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 2da5c3ad71bd..32913d674d38 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -110,6 +110,32 @@ struct kfd_ioctl_get_available_memory_args { __u32 pad; }; +struct kfd_dbg_device_info_entry { + __u64 exception_status; + __u64 lds_base; + __u64 lds_limit; + __u64 scratch_base; + __u64 scratch_limit; + __u64 gpuvm_base; + __u64 gpuvm_limit; + __u32 gpu_id; + __u32 location_id; + __u32 vendor_id; + __u32 device_id; + __u32 revision_id; + __u32 subsystem_vendor_id; + __u32 subsystem_device_id; + __u32 fw_version; + __u32 gfx_target_version; + __u32 simd_count; + __u32 max_waves_per_simd; + __u32 array_count; + __u32 simd_arrays_per_engine; + __u32 num_xcc; + __u32 capability; + __u32 debug_prop; +}; + /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */ #define KFD_IOC_CACHE_POLICY_COHERENT 0 #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1 @@ -773,6 +799,640 @@ struct kfd_ioctl_set_xnack_mode_args { __s32 xnack_enabled; }; +/* Wave launch override modes */ +enum kfd_dbg_trap_override_mode { + KFD_DBG_TRAP_OVERRIDE_OR = 0, + KFD_DBG_TRAP_OVERRIDE_REPLACE = 1 +}; + +/* Wave launch overrides */ +enum kfd_dbg_trap_mask { + KFD_DBG_TRAP_MASK_FP_INVALID = 1, + KFD_DBG_TRAP_MASK_FP_INPUT_DENORMAL = 2, + KFD_DBG_TRAP_MASK_FP_DIVIDE_BY_ZERO = 4, + KFD_DBG_TRAP_MASK_FP_OVERFLOW = 8, + KFD_DBG_TRAP_MASK_FP_UNDERFLOW = 16, + KFD_DBG_TRAP_MASK_FP_INEXACT = 32, + KFD_DBG_TRAP_MASK_INT_DIVIDE_BY_ZERO = 64, + KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH = 128, + KFD_DBG_TRAP_MASK_DBG_MEMORY_VIOLATION = 256, + KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_START = (1 << 30), + KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_END = (1 << 31) +}; + +/* Wave launch modes */ +enum kfd_dbg_trap_wave_launch_mode { + KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL = 0, + KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT = 1, + KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG = 3 +}; + +/* Address watch modes */ +enum kfd_dbg_trap_address_watch_mode { + KFD_DBG_TRAP_ADDRESS_WATCH_MODE_READ = 0, + KFD_DBG_TRAP_ADDRESS_WATCH_MODE_NONREAD = 1, + KFD_DBG_TRAP_ADDRESS_WATCH_MODE_ATOMIC = 2, + KFD_DBG_TRAP_ADDRESS_WATCH_MODE_ALL = 3 +}; + +/* Additional wave settings */ +enum kfd_dbg_trap_flags { + KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP = 1, +}; + +/* Trap exceptions */ +enum kfd_dbg_trap_exception_code { + EC_NONE = 0, + /* per queue */ + EC_QUEUE_WAVE_ABORT = 1, + EC_QUEUE_WAVE_TRAP = 2, + EC_QUEUE_WAVE_MATH_ERROR = 3, + EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION = 4, + EC_QUEUE_WAVE_MEMORY_VIOLATION = 5, + EC_QUEUE_WAVE_APERTURE_VIOLATION = 6, + EC_QUEUE_PACKET_DISPATCH_DIM_INVALID = 16, + EC_QUEUE_PACKET_DISPATCH_GROUP_SEGMENT_SIZE_INVALID = 17, + EC_QUEUE_PACKET_DISPATCH_CODE_INVALID = 18, + EC_QUEUE_PACKET_RESERVED = 19, + EC_QUEUE_PACKET_UNSUPPORTED = 20, + EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID = 21, + EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID = 22, + EC_QUEUE_PACKET_VENDOR_UNSUPPORTED = 23, + EC_QUEUE_PREEMPTION_ERROR = 30, + EC_QUEUE_NEW = 31, + /* per device */ + EC_DEVICE_QUEUE_DELETE = 32, + EC_DEVICE_MEMORY_VIOLATION = 33, + EC_DEVICE_RAS_ERROR = 34, + EC_DEVICE_FATAL_HALT = 35, + EC_DEVICE_NEW = 36, + /* per process */ + EC_PROCESS_RUNTIME = 48, + EC_PROCESS_DEVICE_REMOVE = 49, + EC_MAX +}; + +/* Mask generated by ecode in kfd_dbg_trap_exception_code */ +#define KFD_EC_MASK(ecode) (1ULL << (ecode - 1)) + +/* Masks for exception code type checks below */ +#define KFD_EC_MASK_QUEUE (KFD_EC_MASK(EC_QUEUE_WAVE_ABORT) | \ + KFD_EC_MASK(EC_QUEUE_WAVE_TRAP) | \ + KFD_EC_MASK(EC_QUEUE_WAVE_MATH_ERROR) | \ + KFD_EC_MASK(EC_QUEUE_WAVE_ILLEGAL_INSTRUCTION) | \ + KFD_EC_MASK(EC_QUEUE_WAVE_MEMORY_VIOLATION) | \ + KFD_EC_MASK(EC_QUEUE_WAVE_APERTURE_VIOLATION) | \ + KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_DIM_INVALID) | \ + KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_GROUP_SEGMENT_SIZE_INVALID) | \ + KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_CODE_INVALID) | \ + KFD_EC_MASK(EC_QUEUE_PACKET_RESERVED) | \ + KFD_EC_MASK(EC_QUEUE_PACKET_UNSUPPORTED) | \ + KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_WORK_GROUP_SIZE_INVALID) | \ + KFD_EC_MASK(EC_QUEUE_PACKET_DISPATCH_REGISTER_INVALID) | \ + KFD_EC_MASK(EC_QUEUE_PACKET_VENDOR_UNSUPPORTED) | \ + KFD_EC_MASK(EC_QUEUE_PREEMPTION_ERROR) | \ + KFD_EC_MASK(EC_QUEUE_NEW)) +#define KFD_EC_MASK_DEVICE (KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE) | \ + KFD_EC_MASK(EC_DEVICE_RAS_ERROR) | \ + KFD_EC_MASK(EC_DEVICE_FATAL_HALT) | \ + KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION) | \ + KFD_EC_MASK(EC_DEVICE_NEW)) +#define KFD_EC_MASK_PROCESS (KFD_EC_MASK(EC_PROCESS_RUNTIME) | \ + KFD_EC_MASK(EC_PROCESS_DEVICE_REMOVE)) + +/* Checks for exception code types for KFD search */ +#define KFD_DBG_EC_TYPE_IS_QUEUE(ecode) \ + (!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_QUEUE)) +#define KFD_DBG_EC_TYPE_IS_DEVICE(ecode) \ + (!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_DEVICE)) +#define KFD_DBG_EC_TYPE_IS_PROCESS(ecode) \ + (!!(KFD_EC_MASK(ecode) & KFD_EC_MASK_PROCESS)) + + +/* Runtime enable states */ +enum kfd_dbg_runtime_state { + DEBUG_RUNTIME_STATE_DISABLED = 0, + DEBUG_RUNTIME_STATE_ENABLED = 1, + DEBUG_RUNTIME_STATE_ENABLED_BUSY = 2, + DEBUG_RUNTIME_STATE_ENABLED_ERROR = 3 +}; + +/* Runtime enable status */ +struct kfd_runtime_info { + __u64 r_debug; + __u32 runtime_state; + __u32 ttmp_setup; +}; + +/* Enable modes for runtime enable */ +#define KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK 1 +#define KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK 2 + +/** + * kfd_ioctl_runtime_enable_args - Arguments for runtime enable + * + * Coordinates debug exception signalling and debug device enablement with runtime. + * + * @r_debug - pointer to user struct for sharing information between ROCr and the debuggger + * @mode_mask - mask to set mode + * KFD_RUNTIME_ENABLE_MODE_ENABLE_MASK - enable runtime for debugging, otherwise disable + * KFD_RUNTIME_ENABLE_MODE_TTMP_SAVE_MASK - enable trap temporary setup (ignore on disable) + * @capabilities_mask - mask to notify runtime on what KFD supports + * + * Return - 0 on SUCCESS. + * - EBUSY if runtime enable call already pending. + * - EEXIST if user queues already active prior to call. + * If process is debug enabled, runtime enable will enable debug devices and + * wait for debugger process to send runtime exception EC_PROCESS_RUNTIME + * to unblock - see kfd_ioctl_dbg_trap_args. + * + */ +struct kfd_ioctl_runtime_enable_args { + __u64 r_debug; + __u32 mode_mask; + __u32 capabilities_mask; +}; + +/* Queue information */ +struct kfd_queue_snapshot_entry { + __u64 exception_status; + __u64 ring_base_address; + __u64 write_pointer_address; + __u64 read_pointer_address; + __u64 ctx_save_restore_address; + __u32 queue_id; + __u32 gpu_id; + __u32 ring_size; + __u32 queue_type; + __u32 ctx_save_restore_area_size; + __u32 reserved; +}; + +/* Queue status return for suspend/resume */ +#define KFD_DBG_QUEUE_ERROR_BIT 30 +#define KFD_DBG_QUEUE_INVALID_BIT 31 +#define KFD_DBG_QUEUE_ERROR_MASK (1 << KFD_DBG_QUEUE_ERROR_BIT) +#define KFD_DBG_QUEUE_INVALID_MASK (1 << KFD_DBG_QUEUE_INVALID_BIT) + +/* Context save area header information */ +struct kfd_context_save_area_header { + struct { + __u32 control_stack_offset; + __u32 control_stack_size; + __u32 wave_state_offset; + __u32 wave_state_size; + } wave_state; + __u32 debug_offset; + __u32 debug_size; + __u64 err_payload_addr; + __u32 err_event_id; + __u32 reserved1; +}; + +/* + * Debug operations + * + * For specifics on usage and return values, see documentation per operation + * below. Otherwise, generic error returns apply: + * - ESRCH if the process to debug does not exist. + * + * - EINVAL (with KFD_IOC_DBG_TRAP_ENABLE exempt) if operation + * KFD_IOC_DBG_TRAP_ENABLE has not succeeded prior. + * Also returns this error if GPU hardware scheduling is not supported. + * + * - EPERM (with KFD_IOC_DBG_TRAP_DISABLE exempt) if target process is not + * PTRACE_ATTACHED. KFD_IOC_DBG_TRAP_DISABLE is exempt to allow + * clean up of debug mode as long as process is debug enabled. + * + * - EACCES if any DBG_HW_OP (debug hardware operation) is requested when + * AMDKFD_IOC_RUNTIME_ENABLE has not succeeded prior. + * + * - ENODEV if any GPU does not support debugging on a DBG_HW_OP call. + * + * - Other errors may be returned when a DBG_HW_OP occurs while the GPU + * is in a fatal state. + * + */ +enum kfd_dbg_trap_operations { + KFD_IOC_DBG_TRAP_ENABLE = 0, + KFD_IOC_DBG_TRAP_DISABLE = 1, + KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT = 2, + KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED = 3, + KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE = 4, /* DBG_HW_OP */ + KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE = 5, /* DBG_HW_OP */ + KFD_IOC_DBG_TRAP_SUSPEND_QUEUES = 6, /* DBG_HW_OP */ + KFD_IOC_DBG_TRAP_RESUME_QUEUES = 7, /* DBG_HW_OP */ + KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH = 8, /* DBG_HW_OP */ + KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH = 9, /* DBG_HW_OP */ + KFD_IOC_DBG_TRAP_SET_FLAGS = 10, + KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT = 11, + KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO = 12, + KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT = 13, + KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT = 14 +}; + +/** + * kfd_ioctl_dbg_trap_enable_args + * + * Arguments for KFD_IOC_DBG_TRAP_ENABLE. + * + * Enables debug session for target process. Call @op KFD_IOC_DBG_TRAP_DISABLE in + * kfd_ioctl_dbg_trap_args to disable debug session. + * + * @exception_mask (IN) - exceptions to raise to the debugger + * @rinfo_ptr (IN) - pointer to runtime info buffer (see kfd_runtime_info) + * @rinfo_size (IN/OUT) - size of runtime info buffer in bytes + * @dbg_fd (IN) - fd the KFD will nofify the debugger with of raised + * exceptions set in exception_mask. + * + * Generic errors apply (see kfd_dbg_trap_operations). + * Return - 0 on SUCCESS. + * Copies KFD saved kfd_runtime_info to @rinfo_ptr on enable. + * Size of kfd_runtime saved by the KFD returned to @rinfo_size. + * - EBADF if KFD cannot get a reference to dbg_fd. + * - EFAULT if KFD cannot copy runtime info to rinfo_ptr. + * - EINVAL if target process is already debug enabled. + * + */ +struct kfd_ioctl_dbg_trap_enable_args { + __u64 exception_mask; + __u64 rinfo_ptr; + __u32 rinfo_size; + __u32 dbg_fd; +}; + +/** + * kfd_ioctl_dbg_trap_send_runtime_event_args + * + * + * Arguments for KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT. + * Raises exceptions to runtime. + * + * @exception_mask (IN) - exceptions to raise to runtime + * @gpu_id (IN) - target device id + * @queue_id (IN) - target queue id + * + * Generic errors apply (see kfd_dbg_trap_operations). + * Return - 0 on SUCCESS. + * - ENODEV if gpu_id not found. + * If exception_mask contains EC_PROCESS_RUNTIME, unblocks pending + * AMDKFD_IOC_RUNTIME_ENABLE call - see kfd_ioctl_runtime_enable_args. + * All other exceptions are raised to runtime through err_payload_addr. + * See kfd_context_save_area_header. + */ +struct kfd_ioctl_dbg_trap_send_runtime_event_args { + __u64 exception_mask; + __u32 gpu_id; + __u32 queue_id; +}; + +/** + * kfd_ioctl_dbg_trap_set_exceptions_enabled_args + * + * Arguments for KFD_IOC_SET_EXCEPTIONS_ENABLED + * Set new exceptions to be raised to the debugger. + * + * @exception_mask (IN) - new exceptions to raise the debugger + * + * Generic errors apply (see kfd_dbg_trap_operations). + * Return - 0 on SUCCESS. + */ +struct kfd_ioctl_dbg_trap_set_exceptions_enabled_args { + __u64 exception_mask; +}; + +/** + * kfd_ioctl_dbg_trap_set_wave_launch_override_args + * + * Arguments for KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE + * Enable HW exceptions to raise trap. + * + * @override_mode (IN) - see kfd_dbg_trap_override_mode + * @enable_mask (IN/OUT) - reference kfd_dbg_trap_mask. + * IN is the override modes requested to be enabled. + * OUT is referenced in Return below. + * @support_request_mask (IN/OUT) - reference kfd_dbg_trap_mask. + * IN is the override modes requested for support check. + * OUT is referenced in Return below. + * + * Generic errors apply (see kfd_dbg_trap_operations). + * Return - 0 on SUCCESS. + * Previous enablement is returned in @enable_mask. + * Actual override support is returned in @support_request_mask. + * - EINVAL if override mode is not supported. + * - EACCES if trap support requested is not actually supported. + * i.e. enable_mask (IN) is not a subset of support_request_mask (OUT). + * Otherwise it is considered a generic error (see kfd_dbg_trap_operations). + */ +struct kfd_ioctl_dbg_trap_set_wave_launch_override_args { + __u32 override_mode; + __u32 enable_mask; + __u32 support_request_mask; + __u32 pad; +}; + +/** + * kfd_ioctl_dbg_trap_set_wave_launch_mode_args + * + * Arguments for KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE + * Set wave launch mode. + * + * @mode (IN) - see kfd_dbg_trap_wave_launch_mode + * + * Generic errors apply (see kfd_dbg_trap_operations). + * Return - 0 on SUCCESS. + */ +struct kfd_ioctl_dbg_trap_set_wave_launch_mode_args { + __u32 launch_mode; + __u32 pad; +}; + +/** + * kfd_ioctl_dbg_trap_suspend_queues_ags + * + * Arguments for KFD_IOC_DBG_TRAP_SUSPEND_QUEUES + * Suspend queues. + * + * @exception_mask (IN) - raised exceptions to clear + * @queue_array_ptr (IN) - pointer to array of queue ids (u32 per queue id) + * to suspend + * @num_queues (IN) - number of queues to suspend in @queue_array_ptr + * @grace_period (IN) - wave time allowance before preemption + * per 1K GPU clock cycle unit + * + * Generic errors apply (see kfd_dbg_trap_operations). + * Destruction of a suspended queue is blocked until the queue is + * resumed. This allows the debugger to access queue information and + * the its context save area without running into a race condition on + * queue destruction. + * Automatically copies per queue context save area header information + * into the save area base + * (see kfd_queue_snapshot_entry and kfd_context_save_area_header). + * + * Return - Number of queues suspended on SUCCESS. + * . KFD_DBG_QUEUE_ERROR_MASK and KFD_DBG_QUEUE_INVALID_MASK masked + * for each queue id in @queue_array_ptr array reports unsuccessful + * suspend reason. + * KFD_DBG_QUEUE_ERROR_MASK = HW failure. + * KFD_DBG_QUEUE_INVALID_MASK = queue does not exist, is new or + * is being destroyed. + */ +struct kfd_ioctl_dbg_trap_suspend_queues_args { + __u64 exception_mask; + __u64 queue_array_ptr; + __u32 num_queues; + __u32 grace_period; +}; + +/** + * kfd_ioctl_dbg_trap_resume_queues_args + * + * Arguments for KFD_IOC_DBG_TRAP_RESUME_QUEUES + * Resume queues. + * + * @queue_array_ptr (IN) - pointer to array of queue ids (u32 per queue id) + * to resume + * @num_queues (IN) - number of queues to resume in @queue_array_ptr + * + * Generic errors apply (see kfd_dbg_trap_operations). + * Return - Number of queues resumed on SUCCESS. + * KFD_DBG_QUEUE_ERROR_MASK and KFD_DBG_QUEUE_INVALID_MASK mask + * for each queue id in @queue_array_ptr array reports unsuccessful + * resume reason. + * KFD_DBG_QUEUE_ERROR_MASK = HW failure. + * KFD_DBG_QUEUE_INVALID_MASK = queue does not exist. + */ +struct kfd_ioctl_dbg_trap_resume_queues_args { + __u64 queue_array_ptr; + __u32 num_queues; + __u32 pad; +}; + +/** + * kfd_ioctl_dbg_trap_set_node_address_watch_args + * + * Arguments for KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH + * Sets address watch for device. + * + * @address (IN) - watch address to set + * @mode (IN) - see kfd_dbg_trap_address_watch_mode + * @mask (IN) - watch address mask + * @gpu_id (IN) - target gpu to set watch point + * @id (OUT) - watch id allocated + * + * Generic errors apply (see kfd_dbg_trap_operations). + * Return - 0 on SUCCESS. + * Allocated watch ID returned to @id. + * - ENODEV if gpu_id not found. + * - ENOMEM if watch IDs can be allocated + */ +struct kfd_ioctl_dbg_trap_set_node_address_watch_args { + __u64 address; + __u32 mode; + __u32 mask; + __u32 gpu_id; + __u32 id; +}; + +/** + * kfd_ioctl_dbg_trap_clear_node_address_watch_args + * + * Arguments for KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH + * Clear address watch for device. + * + * @gpu_id (IN) - target device to clear watch point + * @id (IN) - allocated watch id to clear + * + * Generic errors apply (see kfd_dbg_trap_operations). + * Return - 0 on SUCCESS. + * - ENODEV if gpu_id not found. + * - EINVAL if watch ID has not been allocated. + */ +struct kfd_ioctl_dbg_trap_clear_node_address_watch_args { + __u32 gpu_id; + __u32 id; +}; + +/** + * kfd_ioctl_dbg_trap_set_flags_args + * + * Arguments for KFD_IOC_DBG_TRAP_SET_FLAGS + * Sets flags for wave behaviour. + * + * @flags (IN/OUT) - IN = flags to enable, OUT = flags previously enabled + * + * Generic errors apply (see kfd_dbg_trap_operations). + * Return - 0 on SUCCESS. + * - EACCESS if any debug device does not allow flag options. + */ +struct kfd_ioctl_dbg_trap_set_flags_args { + __u32 flags; + __u32 pad; +}; + +/** + * kfd_ioctl_dbg_trap_query_debug_event_args + * + * Arguments for KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT + * + * Find one or more raised exceptions. This function can return multiple + * exceptions from a single queue or a single device with one call. To find + * all raised exceptions, this function must be called repeatedly until it + * returns -EAGAIN. Returned exceptions can optionally be cleared by + * setting the corresponding bit in the @exception_mask input parameter. + * However, clearing an exception prevents retrieving further information + * about it with KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO. + * + * @exception_mask (IN/OUT) - exception to clear (IN) and raised (OUT) + * @gpu_id (OUT) - gpu id of exceptions raised + * @queue_id (OUT) - queue id of exceptions raised + * + * Generic errors apply (see kfd_dbg_trap_operations). + * Return - 0 on raised exception found + * Raised exceptions found are returned in @exception mask + * with reported source id returned in @gpu_id or @queue_id. + * - EAGAIN if no raised exception has been found + */ +struct kfd_ioctl_dbg_trap_query_debug_event_args { + __u64 exception_mask; + __u32 gpu_id; + __u32 queue_id; +}; + +/** + * kfd_ioctl_dbg_trap_query_exception_info_args + * + * Arguments KFD_IOC_DBG_TRAP_QUERY_EXCEPTION_INFO + * Get additional info on raised exception. + * + * @info_ptr (IN) - pointer to exception info buffer to copy to + * @info_size (IN/OUT) - exception info buffer size (bytes) + * @source_id (IN) - target gpu or queue id + * @exception_code (IN) - target exception + * @clear_exception (IN) - clear raised @exception_code exception + * (0 = false, 1 = true) + * + * Generic errors apply (see kfd_dbg_trap_operations). + * Return - 0 on SUCCESS. + * If @exception_code is EC_DEVICE_MEMORY_VIOLATION, copy @info_size(OUT) + * bytes of memory exception data to @info_ptr. + * If @exception_code is EC_PROCESS_RUNTIME, copy saved + * kfd_runtime_info to @info_ptr. + * Actual required @info_ptr size (bytes) is returned in @info_size. + */ +struct kfd_ioctl_dbg_trap_query_exception_info_args { + __u64 info_ptr; + __u32 info_size; + __u32 source_id; + __u32 exception_code; + __u32 clear_exception; +}; + +/** + * kfd_ioctl_dbg_trap_get_queue_snapshot_args + * + * Arguments KFD_IOC_DBG_TRAP_GET_QUEUE_SNAPSHOT + * Get queue information. + * + * @exception_mask (IN) - exceptions raised to clear + * @snapshot_buf_ptr (IN) - queue snapshot entry buffer (see kfd_queue_snapshot_entry) + * @num_queues (IN/OUT) - number of queue snapshot entries + * The debugger specifies the size of the array allocated in @num_queues. + * KFD returns the number of queues that actually existed. If this is + * larger than the size specified by the debugger, KFD will not overflow + * the array allocated by the debugger. + * + * @entry_size (IN/OUT) - size per entry in bytes + * The debugger specifies sizeof(struct kfd_queue_snapshot_entry) in + * @entry_size. KFD returns the number of bytes actually populated per + * entry. The debugger should use the KFD_IOCTL_MINOR_VERSION to determine, + * which fields in struct kfd_queue_snapshot_entry are valid. This allows + * growing the ABI in a backwards compatible manner. + * Note that entry_size(IN) should still be used to stride the snapshot buffer in the + * event that it's larger than actual kfd_queue_snapshot_entry. + * + * Generic errors apply (see kfd_dbg_trap_operations). + * Return - 0 on SUCCESS. + * Copies @num_queues(IN) queue snapshot entries of size @entry_size(IN) + * into @snapshot_buf_ptr if @num_queues(IN) > 0. + * Otherwise return @num_queues(OUT) queue snapshot entries that exist. + */ +struct kfd_ioctl_dbg_trap_queue_snapshot_args { + __u64 exception_mask; + __u64 snapshot_buf_ptr; + __u32 num_queues; + __u32 entry_size; +}; + +/** + * kfd_ioctl_dbg_trap_get_device_snapshot_args + * + * Arguments for KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT + * Get device information. + * + * @exception_mask (IN) - exceptions raised to clear + * @snapshot_buf_ptr (IN) - pointer to snapshot buffer (see kfd_dbg_device_info_entry) + * @num_devices (IN/OUT) - number of debug devices to snapshot + * The debugger specifies the size of the array allocated in @num_devices. + * KFD returns the number of devices that actually existed. If this is + * larger than the size specified by the debugger, KFD will not overflow + * the array allocated by the debugger. + * + * @entry_size (IN/OUT) - size per entry in bytes + * The debugger specifies sizeof(struct kfd_dbg_device_info_entry) in + * @entry_size. KFD returns the number of bytes actually populated. The + * debugger should use KFD_IOCTL_MINOR_VERSION to determine, which fields + * in struct kfd_dbg_device_info_entry are valid. This allows growing the + * ABI in a backwards compatible manner. + * Note that entry_size(IN) should still be used to stride the snapshot buffer in the + * event that it's larger than actual kfd_dbg_device_info_entry. + * + * Generic errors apply (see kfd_dbg_trap_operations). + * Return - 0 on SUCCESS. + * Copies @num_devices(IN) device snapshot entries of size @entry_size(IN) + * into @snapshot_buf_ptr if @num_devices(IN) > 0. + * Otherwise return @num_devices(OUT) queue snapshot entries that exist. + */ +struct kfd_ioctl_dbg_trap_device_snapshot_args { + __u64 exception_mask; + __u64 snapshot_buf_ptr; + __u32 num_devices; + __u32 entry_size; +}; + +/** + * kfd_ioctl_dbg_trap_args + * + * Arguments to debug target process. + * + * @pid - target process to debug + * @op - debug operation (see kfd_dbg_trap_operations) + * + * @op determines which union struct args to use. + * Refer to kern docs for each kfd_ioctl_dbg_trap_*_args struct. + */ +struct kfd_ioctl_dbg_trap_args { + __u32 pid; + __u32 op; + + union { + struct kfd_ioctl_dbg_trap_enable_args enable; + struct kfd_ioctl_dbg_trap_send_runtime_event_args send_runtime_event; + struct kfd_ioctl_dbg_trap_set_exceptions_enabled_args set_exceptions_enabled; + struct kfd_ioctl_dbg_trap_set_wave_launch_override_args launch_override; + struct kfd_ioctl_dbg_trap_set_wave_launch_mode_args launch_mode; + struct kfd_ioctl_dbg_trap_suspend_queues_args suspend_queues; + struct kfd_ioctl_dbg_trap_resume_queues_args resume_queues; + struct kfd_ioctl_dbg_trap_set_node_address_watch_args set_node_address_watch; + struct kfd_ioctl_dbg_trap_clear_node_address_watch_args clear_node_address_watch; + struct kfd_ioctl_dbg_trap_set_flags_args set_flags; + struct kfd_ioctl_dbg_trap_query_debug_event_args query_debug_event; + struct kfd_ioctl_dbg_trap_query_exception_info_args query_exception_info; + struct kfd_ioctl_dbg_trap_queue_snapshot_args queue_snapshot; + struct kfd_ioctl_dbg_trap_device_snapshot_args device_snapshot; + }; +}; + #define AMDKFD_IOCTL_BASE 'K' #define AMDKFD_IO(nr) _IO(AMDKFD_IOCTL_BASE, nr) #define AMDKFD_IOR(nr, type) _IOR(AMDKFD_IOCTL_BASE, nr, type) @@ -887,7 +1547,13 @@ struct kfd_ioctl_set_xnack_mode_args { #define AMDKFD_IOC_EXPORT_DMABUF \ AMDKFD_IOWR(0x24, struct kfd_ioctl_export_dmabuf_args) +#define AMDKFD_IOC_RUNTIME_ENABLE \ + AMDKFD_IOWR(0x25, struct kfd_ioctl_runtime_enable_args) + +#define AMDKFD_IOC_DBG_TRAP \ + AMDKFD_IOWR(0x26, struct kfd_ioctl_dbg_trap_args) + #define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x25 +#define AMDKFD_COMMAND_END 0x27 #endif -- cgit v1.2.3 From d230f1bfe7a1977565ce1e2804ddb7b7a3d911ff Mon Sep 17 00:00:00 2001 From: Jonathan Kim Date: Fri, 25 Mar 2022 12:39:06 -0400 Subject: drm/amdkfd: display debug capabilities Expose debug capabilities in the KFD topology node's HSA capabilities and debug properties flags. Ensure correct capabilities are exposed based on firmware support. Flag definitions can be referenced in uapi/linux/kfd_sysfs.h. Signed-off-by: Jonathan Kim Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 101 ++++++++++++++++++++++++++++-- drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 6 ++ include/uapi/linux/kfd_sysfs.h | 15 +++++ 3 files changed, 117 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index 8302d8967158..3def25b2bdbb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -535,6 +535,8 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, dev->gpu->kfd->mec_fw_version); sysfs_show_32bit_prop(buffer, offs, "capability", dev->node_props.capability); + sysfs_show_64bit_prop(buffer, offs, "debug_prop", + dev->node_props.debug_prop); sysfs_show_32bit_prop(buffer, offs, "sdma_fw_version", dev->gpu->kfd->sdma_fw_version); sysfs_show_64bit_prop(buffer, offs, "unique_id", @@ -1857,6 +1859,97 @@ err: return res; } +static void kfd_topology_set_dbg_firmware_support(struct kfd_topology_device *dev) +{ + bool firmware_supported = true; + + if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(11, 0, 0) && + KFD_GC_VERSION(dev->gpu) < IP_VERSION(12, 0, 0)) { + firmware_supported = + (dev->gpu->adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 9; + goto out; + } + + /* + * Note: Any unlisted devices here are assumed to support exception handling. + * Add additional checks here as needed. + */ + switch (KFD_GC_VERSION(dev->gpu)) { + case IP_VERSION(9, 0, 1): + firmware_supported = dev->gpu->kfd->mec_fw_version >= 459 + 32768; + break; + case IP_VERSION(9, 1, 0): + case IP_VERSION(9, 2, 1): + case IP_VERSION(9, 2, 2): + case IP_VERSION(9, 3, 0): + case IP_VERSION(9, 4, 0): + firmware_supported = dev->gpu->kfd->mec_fw_version >= 459; + break; + case IP_VERSION(9, 4, 1): + firmware_supported = dev->gpu->kfd->mec_fw_version >= 60; + break; + case IP_VERSION(9, 4, 2): + firmware_supported = dev->gpu->kfd->mec_fw_version >= 51; + break; + case IP_VERSION(10, 1, 10): + case IP_VERSION(10, 1, 2): + case IP_VERSION(10, 1, 1): + firmware_supported = dev->gpu->kfd->mec_fw_version >= 144; + break; + case IP_VERSION(10, 3, 0): + case IP_VERSION(10, 3, 2): + case IP_VERSION(10, 3, 1): + case IP_VERSION(10, 3, 4): + case IP_VERSION(10, 3, 5): + firmware_supported = dev->gpu->kfd->mec_fw_version >= 89; + break; + case IP_VERSION(10, 1, 3): + case IP_VERSION(10, 3, 3): + firmware_supported = false; + break; + default: + break; + } + +out: + if (firmware_supported) + dev->node_props.capability |= HSA_CAP_TRAP_DEBUG_FIRMWARE_SUPPORTED; +} + +static void kfd_topology_set_capabilities(struct kfd_topology_device *dev) +{ + dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 << + HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & + HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); + + dev->node_props.capability |= HSA_CAP_TRAP_DEBUG_SUPPORT | + HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_TRAP_OVERRIDE_SUPPORTED | + HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_MODE_SUPPORTED; + + if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(10, 0, 0)) { + dev->node_props.debug_prop |= HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX9 | + HSA_DBG_WATCH_ADDR_MASK_HI_BIT; + + if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(9, 4, 2)) + dev->node_props.debug_prop |= + HSA_DBG_DISPATCH_INFO_ALWAYS_VALID; + else + dev->node_props.capability |= + HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED; + } else { + dev->node_props.debug_prop |= HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX10 | + HSA_DBG_WATCH_ADDR_MASK_HI_BIT; + + if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(11, 0, 0)) + dev->node_props.debug_prop |= HSA_DBG_DISPATCH_INFO_ALWAYS_VALID; + else + dev->node_props.capability |= + HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED; + } + + kfd_topology_set_dbg_firmware_support(dev); +} + int kfd_topology_add_device(struct kfd_node *gpu) { uint32_t gpu_id; @@ -1967,13 +2060,11 @@ int kfd_topology_add_device(struct kfd_node *gpu) HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); break; default: - if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(9, 0, 1)) - dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 << - HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & - HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); - else + if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(9, 0, 1)) WARN(1, "Unexpected ASIC family %u", dev->gpu->adev->asic_type); + else + kfd_topology_set_capabilities(dev); } /* diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h index 3b8afb6aba79..cba2cd5ed9d1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h @@ -31,6 +31,11 @@ #define KFD_TOPOLOGY_PUBLIC_NAME_SIZE 32 +#define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX9 6 +#define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX10 7 +#define HSA_DBG_WATCH_ADDR_MASK_HI_BIT \ + (29 << HSA_DBG_WATCH_ADDR_MASK_HI_BIT_SHIFT) + struct kfd_node_properties { uint64_t hive_id; uint32_t cpu_cores_count; @@ -42,6 +47,7 @@ struct kfd_node_properties { uint32_t cpu_core_id_base; uint32_t simd_id_base; uint32_t capability; + uint64_t debug_prop; uint32_t max_waves_per_simd; uint32_t lds_size_in_kb; uint32_t gds_size_in_kb; diff --git a/include/uapi/linux/kfd_sysfs.h b/include/uapi/linux/kfd_sysfs.h index 3e330f368917..a51b7331e0b4 100644 --- a/include/uapi/linux/kfd_sysfs.h +++ b/include/uapi/linux/kfd_sysfs.h @@ -43,6 +43,11 @@ #define HSA_CAP_DOORBELL_TYPE_2_0 0x2 #define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000 +#define HSA_CAP_TRAP_DEBUG_SUPPORT 0x00008000 +#define HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_TRAP_OVERRIDE_SUPPORTED 0x00010000 +#define HSA_CAP_TRAP_DEBUG_WAVE_LAUNCH_MODE_SUPPORTED 0x00020000 +#define HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED 0x00040000 + /* Old buggy user mode depends on this being 0 */ #define HSA_CAP_RESERVED_WAS_SRAM_EDCSUPPORTED 0x00080000 @@ -53,8 +58,18 @@ #define HSA_CAP_SRAM_EDCSUPPORTED 0x04000000 #define HSA_CAP_SVMAPI_SUPPORTED 0x08000000 #define HSA_CAP_FLAGS_COHERENTHOSTACCESS 0x10000000 +#define HSA_CAP_TRAP_DEBUG_FIRMWARE_SUPPORTED 0x20000000 #define HSA_CAP_RESERVED 0xe00f8000 +/* debug_prop bits in node properties */ +#define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_MASK 0x0000000f +#define HSA_DBG_WATCH_ADDR_MASK_LO_BIT_SHIFT 0 +#define HSA_DBG_WATCH_ADDR_MASK_HI_BIT_MASK 0x000003f0 +#define HSA_DBG_WATCH_ADDR_MASK_HI_BIT_SHIFT 4 +#define HSA_DBG_DISPATCH_INFO_ALWAYS_VALID 0x00000400 +#define HSA_DBG_WATCHPOINTS_EXCLUSIVE 0x00000800 +#define HSA_DBG_RESERVED 0xfffffffffffff000ull + /* Heap types in memory properties */ #define HSA_MEM_HEAP_TYPE_SYSTEM 0 #define HSA_MEM_HEAP_TYPE_FB_PUBLIC 1 -- cgit v1.2.3 From a159afdad2f6b97e4d18549cff2b53d17e68a412 Mon Sep 17 00:00:00 2001 From: Jonathan Kim Date: Tue, 10 May 2022 12:51:26 -0400 Subject: drm/amdkfd: bump kfd ioctl minor version for debug api availability Bump the minor version to declare debugging capability is now available. Signed-off-by: Jonathan Kim Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1 - include/uapi/linux/kfd_ioctl.h | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f54ff5c3387d..cce2abe12e1b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -2984,7 +2984,6 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v if (!r) target->exception_enable_mask = args->enable.exception_mask; - pr_warn("Debug functions limited\n"); break; case KFD_IOC_DBG_TRAP_DISABLE: r = kfd_dbg_trap_disable(target); diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 32913d674d38..1781e7669982 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -38,9 +38,10 @@ * - 1.10 - Add SMI profiler event log * - 1.11 - Add unified memory for ctx save/restore area * - 1.12 - Add DMA buf export ioctl + * - 1.13 - Add debugger API */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 12 +#define KFD_IOCTL_MINOR_VERSION 13 struct kfd_ioctl_get_version_args { __u32 major_version; /* from KFD */ -- cgit v1.2.3 From cf264e1329fb0307e044f7675849f9f38b44c11a Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Tue, 2 May 2023 18:36:07 -0700 Subject: cachestat: implement cachestat syscall There is currently no good way to query the page cache state of large file sets and directory trees. There is mincore(), but it scales poorly: the kernel writes out a lot of bitmap data that userspace has to aggregate, when the user really doesn not care about per-page information in that case. The user also needs to mmap and unmap each file as it goes along, which can be quite slow as well. Some use cases where this information could come in handy: * Allowing database to decide whether to perform an index scan or direct table queries based on the in-memory cache state of the index. * Visibility into the writeback algorithm, for performance issues diagnostic. * Workload-aware writeback pacing: estimating IO fulfilled by page cache (and IO to be done) within a range of a file, allowing for more frequent syncing when and where there is IO capacity, and batching when there is not. * Computing memory usage of large files/directory trees, analogous to the du tool for disk usage. More information about these use cases could be found in the following thread: https://lore.kernel.org/lkml/20230315170934.GA97793@cmpxchg.org/ This patch implements a new syscall that queries cache state of a file and summarizes the number of cached pages, number of dirty pages, number of pages marked for writeback, number of (recently) evicted pages, etc. in a given range. Currently, the syscall is only wired in for x86 architecture. NAME cachestat - query the page cache statistics of a file. SYNOPSIS #include struct cachestat_range { __u64 off; __u64 len; }; struct cachestat { __u64 nr_cache; __u64 nr_dirty; __u64 nr_writeback; __u64 nr_evicted; __u64 nr_recently_evicted; }; int cachestat(unsigned int fd, struct cachestat_range *cstat_range, struct cachestat *cstat, unsigned int flags); DESCRIPTION cachestat() queries the number of cached pages, number of dirty pages, number of pages marked for writeback, number of evicted pages, number of recently evicted pages, in the bytes range given by `off` and `len`. An evicted page is a page that is previously in the page cache but has been evicted since. A page is recently evicted if its last eviction was recent enough that its reentry to the cache would indicate that it is actively being used by the system, and that there is memory pressure on the system. These values are returned in a cachestat struct, whose address is given by the `cstat` argument. The `off` and `len` arguments must be non-negative integers. If `len` > 0, the queried range is [`off`, `off` + `len`]. If `len` == 0, we will query in the range from `off` to the end of the file. The `flags` argument is unused for now, but is included for future extensibility. User should pass 0 (i.e no flag specified). Currently, hugetlbfs is not supported. Because the status of a page can change after cachestat() checks it but before it returns to the application, the returned values may contain stale information. RETURN VALUE On success, cachestat returns 0. On error, -1 is returned, and errno is set to indicate the error. ERRORS EFAULT cstat or cstat_args points to an invalid address. EINVAL invalid flags. EBADF invalid file descriptor. EOPNOTSUPP file descriptor is of a hugetlbfs file [nphamcs@gmail.com: replace rounddown logic with the existing helper] Link: https://lkml.kernel.org/r/20230504022044.3675469-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20230503013608.2431726-3-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Johannes Weiner Cc: Brian Foster Cc: Matthew Wilcox (Oracle) Cc: Michael Kerrisk Signed-off-by: Andrew Morton --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 5 + include/uapi/asm-generic/unistd.h | 5 +- include/uapi/linux/mman.h | 14 +++ init/Kconfig | 10 ++ kernel/sys_ni.c | 1 + mm/filemap.c | 171 +++++++++++++++++++++++++++++++++ 8 files changed, 207 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 320480a8db4f..bc0a3c941b35 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -455,3 +455,4 @@ 448 i386 process_mrelease sys_process_mrelease 449 i386 futex_waitv sys_futex_waitv 450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node +451 i386 cachestat sys_cachestat diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index c84d12608cd2..227538b0ce80 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -372,6 +372,7 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat # # Due to a historical design error, certain syscalls are numbered differently diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 33a0ee3bcb2e..6648c07c4381 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -72,6 +72,8 @@ struct open_how; struct mount_attr; struct landlock_ruleset_attr; enum landlock_rule_type; +struct cachestat_range; +struct cachestat; #include #include @@ -1058,6 +1060,9 @@ asmlinkage long sys_memfd_secret(unsigned int flags); asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len, unsigned long home_node, unsigned long flags); +asmlinkage long sys_cachestat(unsigned int fd, + struct cachestat_range __user *cstat_range, + struct cachestat __user *cstat, unsigned int flags); /* * Architecture-specific system calls diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 45fa180cc56a..cd639fae9086 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -886,8 +886,11 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv) #define __NR_set_mempolicy_home_node 450 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) +#define __NR_cachestat 451 +__SYSCALL(__NR_cachestat, sys_cachestat) + #undef __NR_syscalls -#define __NR_syscalls 451 +#define __NR_syscalls 452 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h index f55bc680b5b0..a246e11988d5 100644 --- a/include/uapi/linux/mman.h +++ b/include/uapi/linux/mman.h @@ -4,6 +4,7 @@ #include #include +#include #define MREMAP_MAYMOVE 1 #define MREMAP_FIXED 2 @@ -41,4 +42,17 @@ #define MAP_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB #define MAP_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB +struct cachestat_range { + __u64 off; + __u64 len; +}; + +struct cachestat { + __u64 nr_cache; + __u64 nr_dirty; + __u64 nr_writeback; + __u64 nr_evicted; + __u64 nr_recently_evicted; +}; + #endif /* _UAPI_LINUX_MMAN_H */ diff --git a/init/Kconfig b/init/Kconfig index 32c24950c4ce..f7f65af4ee12 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1771,6 +1771,16 @@ config RSEQ If unsure, say Y. +config CACHESTAT_SYSCALL + bool "Enable cachestat() system call" if EXPERT + default y + help + Enable the cachestat system call, which queries the page cache + statistics of a file (number of cached pages, dirty pages, + pages marked for writeback, (recently) evicted pages). + + If unsure say Y here. + config DEBUG_RSEQ default n bool "Enabled debugging of rseq() system call" if EXPERT diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 860b2dcf3ac4..04bfb1e4d377 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -299,6 +299,7 @@ COND_SYSCALL(set_mempolicy); COND_SYSCALL(migrate_pages); COND_SYSCALL(move_pages); COND_SYSCALL(set_mempolicy_home_node); +COND_SYSCALL(cachestat); COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); diff --git a/mm/filemap.c b/mm/filemap.c index b4c9bd368b7e..2d3d70c64dfd 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,8 @@ #include +#include "swap.h" + /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. @@ -4119,3 +4122,171 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp) return try_to_free_buffers(folio); } EXPORT_SYMBOL(filemap_release_folio); + +#ifdef CONFIG_CACHESTAT_SYSCALL +/** + * filemap_cachestat() - compute the page cache statistics of a mapping + * @mapping: The mapping to compute the statistics for. + * @first_index: The starting page cache index. + * @last_index: The final page index (inclusive). + * @cs: the cachestat struct to write the result to. + * + * This will query the page cache statistics of a mapping in the + * page range of [first_index, last_index] (inclusive). The statistics + * queried include: number of dirty pages, number of pages marked for + * writeback, and the number of (recently) evicted pages. + */ +static void filemap_cachestat(struct address_space *mapping, + pgoff_t first_index, pgoff_t last_index, struct cachestat *cs) +{ + XA_STATE(xas, &mapping->i_pages, first_index); + struct folio *folio; + + rcu_read_lock(); + xas_for_each(&xas, folio, last_index) { + unsigned long nr_pages; + pgoff_t folio_first_index, folio_last_index; + + if (xas_retry(&xas, folio)) + continue; + + if (xa_is_value(folio)) { + /* page is evicted */ + void *shadow = (void *)folio; + bool workingset; /* not used */ + int order = xa_get_order(xas.xa, xas.xa_index); + + nr_pages = 1 << order; + folio_first_index = round_down(xas.xa_index, 1 << order); + folio_last_index = folio_first_index + nr_pages - 1; + + /* Folios might straddle the range boundaries, only count covered pages */ + if (folio_first_index < first_index) + nr_pages -= first_index - folio_first_index; + + if (folio_last_index > last_index) + nr_pages -= folio_last_index - last_index; + + cs->nr_evicted += nr_pages; + +#ifdef CONFIG_SWAP /* implies CONFIG_MMU */ + if (shmem_mapping(mapping)) { + /* shmem file - in swap cache */ + swp_entry_t swp = radix_to_swp_entry(folio); + + shadow = get_shadow_from_swap_cache(swp); + } +#endif + if (workingset_test_recent(shadow, true, &workingset)) + cs->nr_recently_evicted += nr_pages; + + goto resched; + } + + nr_pages = folio_nr_pages(folio); + folio_first_index = folio_pgoff(folio); + folio_last_index = folio_first_index + nr_pages - 1; + + /* Folios might straddle the range boundaries, only count covered pages */ + if (folio_first_index < first_index) + nr_pages -= first_index - folio_first_index; + + if (folio_last_index > last_index) + nr_pages -= folio_last_index - last_index; + + /* page is in cache */ + cs->nr_cache += nr_pages; + + if (folio_test_dirty(folio)) + cs->nr_dirty += nr_pages; + + if (folio_test_writeback(folio)) + cs->nr_writeback += nr_pages; + +resched: + if (need_resched()) { + xas_pause(&xas); + cond_resched_rcu(); + } + } + rcu_read_unlock(); +} + +/* + * The cachestat(2) system call. + * + * cachestat() returns the page cache statistics of a file in the + * bytes range specified by `off` and `len`: number of cached pages, + * number of dirty pages, number of pages marked for writeback, + * number of evicted pages, and number of recently evicted pages. + * + * An evicted page is a page that is previously in the page cache + * but has been evicted since. A page is recently evicted if its last + * eviction was recent enough that its reentry to the cache would + * indicate that it is actively being used by the system, and that + * there is memory pressure on the system. + * + * `off` and `len` must be non-negative integers. If `len` > 0, + * the queried range is [`off`, `off` + `len`]. If `len` == 0, + * we will query in the range from `off` to the end of the file. + * + * The `flags` argument is unused for now, but is included for future + * extensibility. User should pass 0 (i.e no flag specified). + * + * Currently, hugetlbfs is not supported. + * + * Because the status of a page can change after cachestat() checks it + * but before it returns to the application, the returned values may + * contain stale information. + * + * return values: + * zero - success + * -EFAULT - cstat or cstat_range points to an illegal address + * -EINVAL - invalid flags + * -EBADF - invalid file descriptor + * -EOPNOTSUPP - file descriptor is of a hugetlbfs file + */ +SYSCALL_DEFINE4(cachestat, unsigned int, fd, + struct cachestat_range __user *, cstat_range, + struct cachestat __user *, cstat, unsigned int, flags) +{ + struct fd f = fdget(fd); + struct address_space *mapping; + struct cachestat_range csr; + struct cachestat cs; + pgoff_t first_index, last_index; + + if (!f.file) + return -EBADF; + + if (copy_from_user(&csr, cstat_range, + sizeof(struct cachestat_range))) { + fdput(f); + return -EFAULT; + } + + /* hugetlbfs is not supported */ + if (is_file_hugepages(f.file)) { + fdput(f); + return -EOPNOTSUPP; + } + + if (flags != 0) { + fdput(f); + return -EINVAL; + } + + first_index = csr.off >> PAGE_SHIFT; + last_index = + csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT; + memset(&cs, 0, sizeof(struct cachestat)); + mapping = f.file->f_mapping; + filemap_cachestat(mapping, first_index, last_index, &cs); + fdput(f); + + if (copy_to_user(cstat, &cs, sizeof(struct cachestat))) + return -EFAULT; + + return 0; +} +#endif /* CONFIG_CACHESTAT_SYSCALL */ -- cgit v1.2.3 From e069ba07e6c7af69e119316bc87ff44869095f49 Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Fri, 9 Jun 2023 09:59:55 -0400 Subject: net: openvswitch: add support for l4 symmetric hashing Since its introduction, the ovs module execute_hash action allowed hash algorithms other than the skb->l4_hash to be used. However, additional hash algorithms were not implemented. This means flows requiring different hash distributions weren't able to use the kernel datapath. Now, introduce support for symmetric hashing algorithm as an alternative hash supported by the ovs module using the flow dissector. Output of flow using l4_sym hash: recirc_id(0),in_port(3),eth(),eth_type(0x0800), ipv4(dst=64.0.0.0/192.0.0.0,proto=6,frag=no), packets:30473425, bytes:45902883702, used:0.000s, flags:SP., actions:hash(sym_l4(0)),recirc(0xd) Some performance testing with no GRO/GSO, two veths, single flow: hash(l4(0)): 4.35 GBits/s hash(l4_sym(0)): 4.24 GBits/s Signed-off-by: Aaron Conole Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 1 + net/openvswitch/actions.c | 12 ++++++++++-- net/openvswitch/flow_netlink.c | 2 ++ 3 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index c5d62ee82567..e94870e77ee9 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -765,6 +765,7 @@ struct ovs_action_push_vlan { */ enum ovs_hash_alg { OVS_HASH_ALG_L4, + OVS_HASH_ALG_SYM_L4, }; /* diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 8074ea00d577..cab1e02b63e0 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1073,8 +1073,16 @@ static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key, struct ovs_action_hash *hash_act = nla_data(attr); u32 hash = 0; - /* OVS_HASH_ALG_L4 is the only possible hash algorithm. */ - hash = skb_get_hash(skb); + if (hash_act->hash_alg == OVS_HASH_ALG_L4) { + /* OVS_HASH_ALG_L4 hasing type. */ + hash = skb_get_hash(skb); + } else if (hash_act->hash_alg == OVS_HASH_ALG_SYM_L4) { + /* OVS_HASH_ALG_SYM_L4 hashing type. NOTE: this doesn't + * extend past an encapsulated header. + */ + hash = __skb_get_hash_symmetric(skb); + } + hash = jhash_1word(hash, hash_act->hash_basis); if (!hash) hash = 0x1; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index ead5418c126e..41116361433d 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -3221,6 +3221,8 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, switch (act_hash->hash_alg) { case OVS_HASH_ALG_L4: + fallthrough; + case OVS_HASH_ALG_SYM_L4: break; default: return -EINVAL; -- cgit v1.2.3 From 7cfffd5fed3e385010583840402f0bf66c4ed147 Mon Sep 17 00:00:00 2001 From: Zahari Doychev Date: Thu, 8 Jun 2023 12:56:47 +0200 Subject: net: flower: add support for matching cfm fields Add support to the tc flower classifier to match based on fields in CFM information elements like level and opcode. tc filter add dev ens6 ingress protocol 802.1q \ flower vlan_id 698 vlan_ethtype 0x8902 cfm mdl 5 op 46 \ action drop Signed-off-by: Zahari Doychev Reviewed-by: Simon Horman Reviewed-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_cls.h | 9 ++++ net/sched/cls_flower.c | 102 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 00933dda7b10..7865f5a9885b 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -596,6 +596,8 @@ enum { TCA_FLOWER_L2_MISS, /* u8 */ + TCA_FLOWER_KEY_CFM, /* nested */ + __TCA_FLOWER_MAX, }; @@ -704,6 +706,13 @@ enum { TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), }; +enum { + TCA_FLOWER_KEY_CFM_OPT_UNSPEC, + TCA_FLOWER_KEY_CFM_MD_LEVEL, + TCA_FLOWER_KEY_CFM_OPCODE, + TCA_FLOWER_KEY_CFM_OPT_MAX, +}; + #define TCA_FLOWER_MASK_FLAGS_RANGE (1 << 0) /* Range-based match */ /* Match-all classifier */ diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index e02ecabbb75c..56065cc5a661 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -71,6 +72,7 @@ struct fl_flow_key { struct flow_dissector_key_num_of_vlans num_of_vlans; struct flow_dissector_key_pppoe pppoe; struct flow_dissector_key_l2tpv3 l2tpv3; + struct flow_dissector_key_cfm cfm; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -725,6 +727,7 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_PPP_PROTO] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_L2TPV3_SID] = { .type = NLA_U32 }, [TCA_FLOWER_L2_MISS] = NLA_POLICY_MAX(NLA_U8, 1), + [TCA_FLOWER_KEY_CFM] = { .type = NLA_NESTED }, }; static const struct nla_policy @@ -773,6 +776,12 @@ mpls_stack_entry_policy[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1] = { [TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL] = { .type = NLA_U32 }, }; +static const struct nla_policy cfm_opt_policy[TCA_FLOWER_KEY_CFM_OPT_MAX] = { + [TCA_FLOWER_KEY_CFM_MD_LEVEL] = NLA_POLICY_MAX(NLA_U8, + FLOW_DIS_CFM_MDL_MAX), + [TCA_FLOWER_KEY_CFM_OPCODE] = { .type = NLA_U8 }, +}; + static void fl_set_key_val(struct nlattr **tb, void *val, int val_type, void *mask, int mask_type, int len) @@ -1660,6 +1669,53 @@ static bool is_vlan_key(struct nlattr *tb, __be16 *ethertype, return false; } +static void fl_set_key_cfm_md_level(struct nlattr **tb, + struct fl_flow_key *key, + struct fl_flow_key *mask, + struct netlink_ext_ack *extack) +{ + u8 level; + + if (!tb[TCA_FLOWER_KEY_CFM_MD_LEVEL]) + return; + + level = nla_get_u8(tb[TCA_FLOWER_KEY_CFM_MD_LEVEL]); + key->cfm.mdl_ver = FIELD_PREP(FLOW_DIS_CFM_MDL_MASK, level); + mask->cfm.mdl_ver = FLOW_DIS_CFM_MDL_MASK; +} + +static void fl_set_key_cfm_opcode(struct nlattr **tb, + struct fl_flow_key *key, + struct fl_flow_key *mask, + struct netlink_ext_ack *extack) +{ + fl_set_key_val(tb, &key->cfm.opcode, TCA_FLOWER_KEY_CFM_OPCODE, + &mask->cfm.opcode, TCA_FLOWER_UNSPEC, + sizeof(key->cfm.opcode)); +} + +static int fl_set_key_cfm(struct nlattr **tb, + struct fl_flow_key *key, + struct fl_flow_key *mask, + struct netlink_ext_ack *extack) +{ + struct nlattr *nla_cfm_opt[TCA_FLOWER_KEY_CFM_OPT_MAX]; + int err; + + if (!tb[TCA_FLOWER_KEY_CFM]) + return 0; + + err = nla_parse_nested(nla_cfm_opt, TCA_FLOWER_KEY_CFM_OPT_MAX, + tb[TCA_FLOWER_KEY_CFM], cfm_opt_policy, extack); + if (err < 0) + return err; + + fl_set_key_cfm_opcode(nla_cfm_opt, key, mask, extack); + fl_set_key_cfm_md_level(nla_cfm_opt, key, mask, extack); + + return 0; +} + static int fl_set_key(struct net *net, struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask, struct netlink_ext_ack *extack) @@ -1814,6 +1870,10 @@ static int fl_set_key(struct net *net, struct nlattr **tb, TCA_FLOWER_KEY_L2TPV3_SID, &mask->l2tpv3.session_id, TCA_FLOWER_UNSPEC, sizeof(key->l2tpv3.session_id)); + } else if (key->basic.n_proto == htons(ETH_P_CFM)) { + ret = fl_set_key_cfm(tb, key, mask, extack); + if (ret) + return ret; } if (key->basic.ip_proto == IPPROTO_TCP || @@ -1996,6 +2056,8 @@ static void fl_init_dissector(struct flow_dissector *dissector, FLOW_DISSECTOR_KEY_PPPOE, pppoe); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_L2TPV3, l2tpv3); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_CFM, cfm); skb_flow_dissector_init(dissector, keys, cnt); } @@ -3029,6 +3091,43 @@ nla_put_failure: return -EMSGSIZE; } +static int fl_dump_key_cfm(struct sk_buff *skb, + struct flow_dissector_key_cfm *key, + struct flow_dissector_key_cfm *mask) +{ + struct nlattr *opts; + int err; + u8 mdl; + + if (!memchr_inv(mask, 0, sizeof(*mask))) + return 0; + + opts = nla_nest_start(skb, TCA_FLOWER_KEY_CFM); + if (!opts) + return -EMSGSIZE; + + if (FIELD_GET(FLOW_DIS_CFM_MDL_MASK, mask->mdl_ver)) { + mdl = FIELD_GET(FLOW_DIS_CFM_MDL_MASK, key->mdl_ver); + err = nla_put_u8(skb, TCA_FLOWER_KEY_CFM_MD_LEVEL, mdl); + if (err) + goto err_cfm_opts; + } + + if (mask->opcode) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_CFM_OPCODE, key->opcode); + if (err) + goto err_cfm_opts; + } + + nla_nest_end(skb, opts); + + return 0; + +err_cfm_opts: + nla_nest_cancel(skb, opts); + return err; +} + static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type, struct flow_dissector_key_enc_opts *enc_opts) { @@ -3316,6 +3415,9 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, sizeof(key->hash.hash))) goto nla_put_failure; + if (fl_dump_key_cfm(skb, &key->cfm, &mask->cfm)) + goto nla_put_failure; + return 0; nla_put_failure: -- cgit v1.2.3 From 2ad66fcb2fded5359a676f7146cf442641d28307 Mon Sep 17 00:00:00 2001 From: Gilad Itzkovitch Date: Thu, 18 May 2023 12:07:23 +1200 Subject: wifi: cfg80211: S1G rate information and calculations Increase the size of S1G rate_info flags to support S1G and add flags for new S1G MCS and the supported bandwidths. Also, include S1G rate information to netlink STA rate message. Lastly, add rate calculation function for S1G MCS. Signed-off-by: Gilad Itzkovitch Link: https://lore.kernel.org/r/20230518000723.991912-1-gilad.itzkovitch@morsemicro.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 18 +++++-- include/uapi/linux/nl80211.h | 14 ++++++ net/wireless/nl80211.c | 23 +++++++++ net/wireless/util.c | 110 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 162 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 1b8619685bf6..5d04e7eed43d 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1702,6 +1702,7 @@ int cfg80211_check_station_change(struct wiphy *wiphy, * @RATE_INFO_FLAGS_EDMG: 60GHz MCS in EDMG mode * @RATE_INFO_FLAGS_EXTENDED_SC_DMG: 60GHz extended SC MCS * @RATE_INFO_FLAGS_EHT_MCS: EHT MCS information + * @RATE_INFO_FLAGS_S1G_MCS: MCS field filled with S1G MCS */ enum rate_info_flags { RATE_INFO_FLAGS_MCS = BIT(0), @@ -1712,6 +1713,7 @@ enum rate_info_flags { RATE_INFO_FLAGS_EDMG = BIT(5), RATE_INFO_FLAGS_EXTENDED_SC_DMG = BIT(6), RATE_INFO_FLAGS_EHT_MCS = BIT(7), + RATE_INFO_FLAGS_S1G_MCS = BIT(8), }; /** @@ -1728,6 +1730,11 @@ enum rate_info_flags { * @RATE_INFO_BW_HE_RU: bandwidth determined by HE RU allocation * @RATE_INFO_BW_320: 320 MHz bandwidth * @RATE_INFO_BW_EHT_RU: bandwidth determined by EHT RU allocation + * @RATE_INFO_BW_1: 1 MHz bandwidth + * @RATE_INFO_BW_2: 2 MHz bandwidth + * @RATE_INFO_BW_4: 4 MHz bandwidth + * @RATE_INFO_BW_8: 8 MHz bandwidth + * @RATE_INFO_BW_16: 16 MHz bandwidth */ enum rate_info_bw { RATE_INFO_BW_20 = 0, @@ -1739,6 +1746,11 @@ enum rate_info_bw { RATE_INFO_BW_HE_RU, RATE_INFO_BW_320, RATE_INFO_BW_EHT_RU, + RATE_INFO_BW_1, + RATE_INFO_BW_2, + RATE_INFO_BW_4, + RATE_INFO_BW_8, + RATE_INFO_BW_16, }; /** @@ -1747,8 +1759,8 @@ enum rate_info_bw { * Information about a receiving or transmitting bitrate * * @flags: bitflag of flags from &enum rate_info_flags - * @mcs: mcs index if struct describes an HT/VHT/HE rate * @legacy: bitrate in 100kbit/s for 802.11abg + * @mcs: mcs index if struct describes an HT/VHT/HE/EHT/S1G rate * @nss: number of streams (VHT & HE only) * @bw: bandwidth (from &enum rate_info_bw) * @he_gi: HE guard interval (from &enum nl80211_he_gi) @@ -1761,9 +1773,9 @@ enum rate_info_bw { * only valid if bw is %RATE_INFO_BW_EHT_RU) */ struct rate_info { - u8 flags; - u8 mcs; + u16 flags; u16 legacy; + u8 mcs; u8 nss; u8 bw; u8 he_gi; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c59fec406da5..435c4ac5d9bf 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3667,6 +3667,13 @@ enum nl80211_eht_ru_alloc { * (u8, see &enum nl80211_eht_gi) * @NL80211_RATE_INFO_EHT_RU_ALLOC: EHT RU allocation, if not present then * non-OFDMA was used (u8, see &enum nl80211_eht_ru_alloc) + * @NL80211_RATE_INFO_S1G_MCS: S1G MCS index (u8, 0-10) + * @NL80211_RATE_INFO_S1G_NSS: S1G NSS value (u8, 1-4) + * @NL80211_RATE_INFO_1_MHZ_WIDTH: 1 MHz S1G rate + * @NL80211_RATE_INFO_2_MHZ_WIDTH: 2 MHz S1G rate + * @NL80211_RATE_INFO_4_MHZ_WIDTH: 4 MHz S1G rate + * @NL80211_RATE_INFO_8_MHZ_WIDTH: 8 MHz S1G rate + * @NL80211_RATE_INFO_16_MHZ_WIDTH: 16 MHz S1G rate * @__NL80211_RATE_INFO_AFTER_LAST: internal use */ enum nl80211_rate_info { @@ -3693,6 +3700,13 @@ enum nl80211_rate_info { NL80211_RATE_INFO_EHT_NSS, NL80211_RATE_INFO_EHT_GI, NL80211_RATE_INFO_EHT_RU_ALLOC, + NL80211_RATE_INFO_S1G_MCS, + NL80211_RATE_INFO_S1G_NSS, + NL80211_RATE_INFO_1_MHZ_WIDTH, + NL80211_RATE_INFO_2_MHZ_WIDTH, + NL80211_RATE_INFO_4_MHZ_WIDTH, + NL80211_RATE_INFO_8_MHZ_WIDTH, + NL80211_RATE_INFO_16_MHZ_WIDTH, /* keep last */ __NL80211_RATE_INFO_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 772671b9bc42..f962765f7e0f 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6368,12 +6368,27 @@ bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, int attr) return false; switch (info->bw) { + case RATE_INFO_BW_1: + rate_flg = NL80211_RATE_INFO_1_MHZ_WIDTH; + break; + case RATE_INFO_BW_2: + rate_flg = NL80211_RATE_INFO_2_MHZ_WIDTH; + break; + case RATE_INFO_BW_4: + rate_flg = NL80211_RATE_INFO_4_MHZ_WIDTH; + break; case RATE_INFO_BW_5: rate_flg = NL80211_RATE_INFO_5_MHZ_WIDTH; break; + case RATE_INFO_BW_8: + rate_flg = NL80211_RATE_INFO_8_MHZ_WIDTH; + break; case RATE_INFO_BW_10: rate_flg = NL80211_RATE_INFO_10_MHZ_WIDTH; break; + case RATE_INFO_BW_16: + rate_flg = NL80211_RATE_INFO_16_MHZ_WIDTH; + break; default: WARN_ON(1); fallthrough; @@ -6432,6 +6447,14 @@ bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, int attr) nla_put_u8(msg, NL80211_RATE_INFO_HE_RU_ALLOC, info->he_ru_alloc)) return false; + } else if (info->flags & RATE_INFO_FLAGS_S1G_MCS) { + if (nla_put_u8(msg, NL80211_RATE_INFO_S1G_MCS, info->mcs)) + return false; + if (nla_put_u8(msg, NL80211_RATE_INFO_S1G_NSS, info->nss)) + return false; + if (info->flags & RATE_INFO_FLAGS_SHORT_GI && + nla_put_flag(msg, NL80211_RATE_INFO_SHORT_GI)) + return false; } else if (info->flags & RATE_INFO_FLAGS_EHT_MCS) { if (nla_put_u8(msg, NL80211_RATE_INFO_EHT_MCS, info->mcs)) return false; diff --git a/net/wireless/util.c b/net/wireless/util.c index 3bc0c3072e78..610a867c14f7 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -1646,6 +1646,114 @@ static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate) return result / 10000; } +static u32 cfg80211_calculate_bitrate_s1g(struct rate_info *rate) +{ + /* For 1, 2, 4, 8 and 16 MHz channels */ + static const u32 base[5][11] = { + { 300000, + 600000, + 900000, + 1200000, + 1800000, + 2400000, + 2700000, + 3000000, + 3600000, + 4000000, + /* MCS 10 supported in 1 MHz only */ + 150000, + }, + { 650000, + 1300000, + 1950000, + 2600000, + 3900000, + 5200000, + 5850000, + 6500000, + 7800000, + /* MCS 9 not valid */ + }, + { 1350000, + 2700000, + 4050000, + 5400000, + 8100000, + 10800000, + 12150000, + 13500000, + 16200000, + 18000000, + }, + { 2925000, + 5850000, + 8775000, + 11700000, + 17550000, + 23400000, + 26325000, + 29250000, + 35100000, + 39000000, + }, + { 8580000, + 11700000, + 17550000, + 23400000, + 35100000, + 46800000, + 52650000, + 58500000, + 70200000, + 78000000, + }, + }; + u32 bitrate; + /* default is 1 MHz index */ + int idx = 0; + + if (rate->mcs >= 11) + goto warn; + + switch (rate->bw) { + case RATE_INFO_BW_16: + idx = 4; + break; + case RATE_INFO_BW_8: + idx = 3; + break; + case RATE_INFO_BW_4: + idx = 2; + break; + case RATE_INFO_BW_2: + idx = 1; + break; + case RATE_INFO_BW_1: + idx = 0; + break; + case RATE_INFO_BW_5: + case RATE_INFO_BW_10: + case RATE_INFO_BW_20: + case RATE_INFO_BW_40: + case RATE_INFO_BW_80: + case RATE_INFO_BW_160: + default: + goto warn; + } + + bitrate = base[idx][rate->mcs]; + bitrate *= rate->nss; + + if (rate->flags & RATE_INFO_FLAGS_SHORT_GI) + bitrate = (bitrate / 9) * 10; + /* do NOT round down here */ + return (bitrate + 50000) / 100000; +warn: + WARN_ONCE(1, "invalid rate bw=%d, mcs=%d, nss=%d\n", + rate->bw, rate->mcs, rate->nss); + return 0; +} + u32 cfg80211_calculate_bitrate(struct rate_info *rate) { if (rate->flags & RATE_INFO_FLAGS_MCS) @@ -1662,6 +1770,8 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate) return cfg80211_calculate_bitrate_he(rate); if (rate->flags & RATE_INFO_FLAGS_EHT_MCS) return cfg80211_calculate_bitrate_eht(rate); + if (rate->flags & RATE_INFO_FLAGS_S1G_MCS) + return cfg80211_calculate_bitrate_s1g(rate); return rate->legacy; } -- cgit v1.2.3 From 6cf963edbbd3b279185e28e4864c9698ffaa23c3 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 8 Jun 2023 16:36:10 +0300 Subject: wifi: cfg80211: Support association to AP MLD with disabled links An AP part of an AP MLD might be temporarily disabled, and might be enabled later. Such a link should be included in the association exchange, but should not be used until enabled. Extend the NL80211_CMD_ASSOCIATE to also indicate disabled links. Signed-off-by: Ilan Peer Signed-off-by: Gregory Greenman Link: https://lore.kernel.org/r/20230608163202.c4c61ee4c4a5.I784ef4a0d619fc9120514b5615458fbef3b3684a@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 5 ++++- include/uapi/linux/nl80211.h | 7 ++++++- net/wireless/nl80211.c | 13 ++++++++++++- 3 files changed, 22 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5d04e7eed43d..388f2a3851a2 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -7,7 +7,7 @@ * Copyright 2006-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2021 Intel Corporation + * Copyright (C) 2018-2021, 2023 Intel Corporation */ #include @@ -2882,11 +2882,14 @@ struct cfg80211_auth_request { * if this is %NULL for a link, that link is not requested * @elems: extra elements for the per-STA profile for this link * @elems_len: length of the elements + * @disabled: If set this link should be included during association etc. but it + * should not be used until enabled by the AP MLD. */ struct cfg80211_assoc_link { struct cfg80211_bss *bss; const u8 *elems; size_t elems_len; + bool disabled; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 435c4ac5d9bf..03939bdb0e48 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -11,7 +11,7 @@ * Copyright 2008 Jouni Malinen * Copyright 2008 Colin McCabe * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2023 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -2805,6 +2805,9 @@ enum nl80211_commands { * index. If the userspace includes more RNR elements than number of * MBSSID elements then these will be added in every EMA beacon. * + * @NL80211_ATTR_MLO_LINK_DISABLED: Flag attribute indicating that the link is + * disabled. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3341,6 +3344,8 @@ enum nl80211_attrs { NL80211_ATTR_EMA_RNR_ELEMS, + NL80211_ATTR_MLO_LINK_DISABLED, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f962765f7e0f..ec7d467cb096 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5,7 +5,7 @@ * Copyright 2006-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2023 Intel Corporation */ #include @@ -816,6 +816,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MAX_HW_TIMESTAMP_PEERS] = { .type = NLA_U16 }, [NL80211_ATTR_HW_TIMESTAMP_ENABLED] = { .type = NLA_FLAG }, [NL80211_ATTR_EMA_RNR_ELEMS] = { .type = NLA_NESTED }, + [NL80211_ATTR_MLO_LINK_DISABLED] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -11138,6 +11139,9 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) goto free; } } + + req.links[link_id].disabled = + nla_get_flag(attrs[NL80211_ATTR_MLO_LINK_DISABLED]); } if (!req.links[req.link_id].bss) { @@ -11152,6 +11156,13 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) goto free; } + if (req.links[req.link_id].disabled) { + GENL_SET_ERR_MSG(info, + "cannot have assoc link disabled"); + err = -EINVAL; + goto free; + } + kfree(attrs); attrs = NULL; } else { -- cgit v1.2.3 From 09b69dd4378b91b3ac3fbac387fb992dc21c0f88 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 14 Jun 2023 11:13:11 -0700 Subject: usb: ch9: Replace 1-element array with flexible array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit df8fc4e934c1 ("kbuild: Enable -fstrict-flex-arrays=3"), UBSAN_BOUNDS no longer pretends 1-element arrays are unbounded. Walking wData will trigger a warning, so make it a proper flexible array. Add a union to keep the struct size identical for userspace in case anything was depending on the old size. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202306102333.8f5a7443-oliver.sang@intel.com Cc: Greg Kroah-Hartman Cc: kernel test robot Cc: "Gustavo A. R. Silva" Cc: Dan Williams Cc: "Jó Ágila Bitsch" Signed-off-by: Kees Cook Message-ID: <20230614181307.gonna.256-kees@kernel.org> Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/usb/ch9.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index b17e3a21b15f..82ec6af71a1d 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -376,7 +376,10 @@ struct usb_string_descriptor { __u8 bLength; __u8 bDescriptorType; - __le16 wData[1]; /* UTF-16LE encoded */ + union { + __le16 legacy_padding; + __DECLARE_FLEX_ARRAY(__le16, wData); /* UTF-16LE encoded */ + }; } __attribute__ ((packed)); /* note that "string" zero is special, it holds language codes that -- cgit v1.2.3 From a0df3ef087f8aaebbdf205b1b2e126ec9ef6b113 Mon Sep 17 00:00:00 2001 From: Julien Panis Date: Thu, 11 May 2023 11:51:24 +0200 Subject: misc: tps6594-pfsm: Add driver for TI TPS6594 PFSM This PFSM controls the operational modes of the PMIC: - STANDBY and LP_STANDBY, - ACTIVE state, - MCU_ONLY state, - RETENTION state, with or without DDR and/or GPIO retention. Depending on the current operational mode, some voltage domains remain energized while others can be off. This PFSM is also used to trigger a firmware update, and provides R/W access to device registers. See Documentation/misc-devices/tps6594-pfsm.rst for more information. Signed-off-by: Julien Panis Message-ID: <20230511095126.105104-5-jpanis@baylibre.com> Signed-off-by: Greg Kroah-Hartman --- drivers/misc/Kconfig | 12 ++ drivers/misc/Makefile | 1 + drivers/misc/tps6594-pfsm.c | 306 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/tps6594_pfsm.h | 37 +++++ 4 files changed, 356 insertions(+) create mode 100644 drivers/misc/tps6594-pfsm.c create mode 100644 include/uapi/linux/tps6594_pfsm.h (limited to 'include/uapi/linux') diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index c73c02801330..75e427f124b2 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -549,6 +549,18 @@ config TPS6594_ESM This driver can also be built as a module. If so, the module will be called tps6594-esm. +config TPS6594_PFSM + tristate "TI TPS6594 Pre-configurable Finite State Machine support" + depends on MFD_TPS6594 + default MFD_TPS6594 + help + Support PFSM (Pre-configurable Finite State Machine) on TPS6594 PMIC devices. + These devices integrate a finite state machine engine, which manages the state + of the device during operating state transition. + + This driver can also be built as a module. If so, the module + will be called tps6594-pfsm. + source "drivers/misc/c2port/Kconfig" source "drivers/misc/eeprom/Kconfig" source "drivers/misc/cb710/Kconfig" diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 3dc69ec69912..f2a4d1ff65d4 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -66,3 +66,4 @@ obj-$(CONFIG_VCPU_STALL_DETECTOR) += vcpu_stall_detector.o obj-$(CONFIG_TMR_MANAGER) += xilinx_tmr_manager.o obj-$(CONFIG_TMR_INJECT) += xilinx_tmr_inject.o obj-$(CONFIG_TPS6594_ESM) += tps6594-esm.o +obj-$(CONFIG_TPS6594_PFSM) += tps6594-pfsm.o diff --git a/drivers/misc/tps6594-pfsm.c b/drivers/misc/tps6594-pfsm.c new file mode 100644 index 000000000000..5223d1580807 --- /dev/null +++ b/drivers/misc/tps6594-pfsm.c @@ -0,0 +1,306 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PFSM (Pre-configurable Finite State Machine) driver for TI TPS6594/TPS6593/LP8764 PMICs + * + * Copyright (C) 2023 BayLibre Incorporated - https://www.baylibre.com/ + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#define TPS6594_STARTUP_DEST_MCU_ONLY_VAL 2 +#define TPS6594_STARTUP_DEST_ACTIVE_VAL 3 +#define TPS6594_STARTUP_DEST_SHIFT 5 +#define TPS6594_STARTUP_DEST_MCU_ONLY (TPS6594_STARTUP_DEST_MCU_ONLY_VAL \ + << TPS6594_STARTUP_DEST_SHIFT) +#define TPS6594_STARTUP_DEST_ACTIVE (TPS6594_STARTUP_DEST_ACTIVE_VAL \ + << TPS6594_STARTUP_DEST_SHIFT) + +/* + * To update the PMIC firmware, the user must be able to access + * page 0 (user registers) and page 1 (NVM control and configuration). + */ +#define TPS6594_PMIC_MAX_POS 0x200 + +#define TPS6594_FILE_TO_PFSM(f) container_of((f)->private_data, struct tps6594_pfsm, miscdev) + +/** + * struct tps6594_pfsm - device private data structure + * + * @miscdev: misc device infos + * @regmap: regmap for accessing the device registers + */ +struct tps6594_pfsm { + struct miscdevice miscdev; + struct regmap *regmap; +}; + +static ssize_t tps6594_pfsm_read(struct file *f, char __user *buf, + size_t count, loff_t *ppos) +{ + struct tps6594_pfsm *pfsm = TPS6594_FILE_TO_PFSM(f); + loff_t pos = *ppos; + unsigned int val; + int ret; + int i; + + if (pos < 0) + return -EINVAL; + if (pos >= TPS6594_PMIC_MAX_POS) + return 0; + if (count > TPS6594_PMIC_MAX_POS - pos) + count = TPS6594_PMIC_MAX_POS - pos; + + for (i = 0 ; i < count ; i++) { + ret = regmap_read(pfsm->regmap, pos + i, &val); + if (ret) + return ret; + + if (put_user(val, buf + i)) + return -EFAULT; + } + + *ppos = pos + count; + + return count; +} + +static ssize_t tps6594_pfsm_write(struct file *f, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct tps6594_pfsm *pfsm = TPS6594_FILE_TO_PFSM(f); + loff_t pos = *ppos; + char val; + int ret; + int i; + + if (pos < 0) + return -EINVAL; + if (pos >= TPS6594_PMIC_MAX_POS || !count) + return 0; + if (count > TPS6594_PMIC_MAX_POS - pos) + count = TPS6594_PMIC_MAX_POS - pos; + + for (i = 0 ; i < count ; i++) { + if (get_user(val, buf + i)) + return -EFAULT; + + ret = regmap_write(pfsm->regmap, pos + i, val); + if (ret) + return ret; + } + + *ppos = pos + count; + + return count; +} + +static int tps6594_pfsm_configure_ret_trig(struct regmap *regmap, u8 gpio_ret, u8 ddr_ret) +{ + int ret; + + if (gpio_ret) + ret = regmap_set_bits(regmap, TPS6594_REG_FSM_I2C_TRIGGERS, + TPS6594_BIT_TRIGGER_I2C(5) | TPS6594_BIT_TRIGGER_I2C(6)); + else + ret = regmap_clear_bits(regmap, TPS6594_REG_FSM_I2C_TRIGGERS, + TPS6594_BIT_TRIGGER_I2C(5) | TPS6594_BIT_TRIGGER_I2C(6)); + if (ret) + return ret; + + if (ddr_ret) + ret = regmap_set_bits(regmap, TPS6594_REG_FSM_I2C_TRIGGERS, + TPS6594_BIT_TRIGGER_I2C(7)); + else + ret = regmap_clear_bits(regmap, TPS6594_REG_FSM_I2C_TRIGGERS, + TPS6594_BIT_TRIGGER_I2C(7)); + + return ret; +} + +static long tps6594_pfsm_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ + struct tps6594_pfsm *pfsm = TPS6594_FILE_TO_PFSM(f); + struct pmic_state_opt state_opt; + void __user *argp = (void __user *)arg; + int ret = -ENOIOCTLCMD; + + switch (cmd) { + case PMIC_GOTO_STANDBY: + /* Disable LP mode */ + ret = regmap_clear_bits(pfsm->regmap, TPS6594_REG_RTC_CTRL_2, + TPS6594_BIT_LP_STANDBY_SEL); + if (ret) + return ret; + + /* Force trigger */ + ret = regmap_write_bits(pfsm->regmap, TPS6594_REG_FSM_I2C_TRIGGERS, + TPS6594_BIT_TRIGGER_I2C(0), TPS6594_BIT_TRIGGER_I2C(0)); + break; + case PMIC_GOTO_LP_STANDBY: + /* Enable LP mode */ + ret = regmap_set_bits(pfsm->regmap, TPS6594_REG_RTC_CTRL_2, + TPS6594_BIT_LP_STANDBY_SEL); + if (ret) + return ret; + + /* Force trigger */ + ret = regmap_write_bits(pfsm->regmap, TPS6594_REG_FSM_I2C_TRIGGERS, + TPS6594_BIT_TRIGGER_I2C(0), TPS6594_BIT_TRIGGER_I2C(0)); + break; + case PMIC_UPDATE_PGM: + /* Force trigger */ + ret = regmap_write_bits(pfsm->regmap, TPS6594_REG_FSM_I2C_TRIGGERS, + TPS6594_BIT_TRIGGER_I2C(3), TPS6594_BIT_TRIGGER_I2C(3)); + break; + case PMIC_SET_ACTIVE_STATE: + /* Modify NSLEEP1-2 bits */ + ret = regmap_set_bits(pfsm->regmap, TPS6594_REG_FSM_NSLEEP_TRIGGERS, + TPS6594_BIT_NSLEEP1B | TPS6594_BIT_NSLEEP2B); + break; + case PMIC_SET_MCU_ONLY_STATE: + if (copy_from_user(&state_opt, argp, sizeof(state_opt))) + return -EFAULT; + + /* Configure retention triggers */ + ret = tps6594_pfsm_configure_ret_trig(pfsm->regmap, state_opt.gpio_retention, + state_opt.ddr_retention); + if (ret) + return ret; + + /* Modify NSLEEP1-2 bits */ + ret = regmap_clear_bits(pfsm->regmap, TPS6594_REG_FSM_NSLEEP_TRIGGERS, + TPS6594_BIT_NSLEEP1B); + if (ret) + return ret; + + ret = regmap_set_bits(pfsm->regmap, TPS6594_REG_FSM_NSLEEP_TRIGGERS, + TPS6594_BIT_NSLEEP2B); + break; + case PMIC_SET_RETENTION_STATE: + if (copy_from_user(&state_opt, argp, sizeof(state_opt))) + return -EFAULT; + + /* Configure wake-up destination */ + if (state_opt.mcu_only_startup_dest) + ret = regmap_write_bits(pfsm->regmap, TPS6594_REG_RTC_CTRL_2, + TPS6594_MASK_STARTUP_DEST, + TPS6594_STARTUP_DEST_MCU_ONLY); + else + ret = regmap_write_bits(pfsm->regmap, TPS6594_REG_RTC_CTRL_2, + TPS6594_MASK_STARTUP_DEST, + TPS6594_STARTUP_DEST_ACTIVE); + if (ret) + return ret; + + /* Configure retention triggers */ + ret = tps6594_pfsm_configure_ret_trig(pfsm->regmap, state_opt.gpio_retention, + state_opt.ddr_retention); + if (ret) + return ret; + + /* Modify NSLEEP1-2 bits */ + ret = regmap_clear_bits(pfsm->regmap, TPS6594_REG_FSM_NSLEEP_TRIGGERS, + TPS6594_BIT_NSLEEP2B); + break; + } + + return ret; +} + +static const struct file_operations tps6594_pfsm_fops = { + .owner = THIS_MODULE, + .llseek = generic_file_llseek, + .read = tps6594_pfsm_read, + .write = tps6594_pfsm_write, + .unlocked_ioctl = tps6594_pfsm_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; + +static irqreturn_t tps6594_pfsm_isr(int irq, void *dev_id) +{ + struct platform_device *pdev = dev_id; + int i; + + for (i = 0 ; i < pdev->num_resources ; i++) { + if (irq == platform_get_irq_byname(pdev, pdev->resource[i].name)) { + dev_err(pdev->dev.parent, "%s event detected\n", pdev->resource[i].name); + return IRQ_HANDLED; + } + } + + return IRQ_NONE; +} + +static int tps6594_pfsm_probe(struct platform_device *pdev) +{ + struct tps6594_pfsm *pfsm; + struct tps6594 *tps = dev_get_drvdata(pdev->dev.parent); + struct device *dev = &pdev->dev; + int irq; + int ret; + int i; + + pfsm = devm_kzalloc(dev, sizeof(struct tps6594_pfsm), GFP_KERNEL); + if (!pfsm) + return -ENOMEM; + + pfsm->regmap = tps->regmap; + + pfsm->miscdev.minor = MISC_DYNAMIC_MINOR; + pfsm->miscdev.name = devm_kasprintf(dev, GFP_KERNEL, "pfsm-%ld-0x%02x", + tps->chip_id, tps->reg); + pfsm->miscdev.fops = &tps6594_pfsm_fops; + pfsm->miscdev.parent = dev->parent; + + for (i = 0 ; i < pdev->num_resources ; i++) { + irq = platform_get_irq_byname(pdev, pdev->resource[i].name); + if (irq < 0) + return dev_err_probe(dev, irq, "Failed to get %s irq\n", + pdev->resource[i].name); + + ret = devm_request_threaded_irq(dev, irq, NULL, + tps6594_pfsm_isr, IRQF_ONESHOT, + pdev->resource[i].name, pdev); + if (ret) + return dev_err_probe(dev, ret, "Failed to request irq\n"); + } + + platform_set_drvdata(pdev, pfsm); + + return misc_register(&pfsm->miscdev); +} + +static int tps6594_pfsm_remove(struct platform_device *pdev) +{ + struct tps6594_pfsm *pfsm = platform_get_drvdata(pdev); + + misc_deregister(&pfsm->miscdev); + + return 0; +} + +static struct platform_driver tps6594_pfsm_driver = { + .driver = { + .name = "tps6594-pfsm", + }, + .probe = tps6594_pfsm_probe, + .remove = tps6594_pfsm_remove, +}; + +module_platform_driver(tps6594_pfsm_driver); + +MODULE_ALIAS("platform:tps6594-pfsm"); +MODULE_AUTHOR("Julien Panis "); +MODULE_DESCRIPTION("TPS6594 Pre-configurable Finite State Machine Driver"); +MODULE_LICENSE("GPL"); diff --git a/include/uapi/linux/tps6594_pfsm.h b/include/uapi/linux/tps6594_pfsm.h new file mode 100644 index 000000000000..c69569e0a7a2 --- /dev/null +++ b/include/uapi/linux/tps6594_pfsm.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Userspace ABI for TPS6594 PMIC Pre-configurable Finite State Machine + * + * Copyright (C) 2023 BayLibre Incorporated - https://www.baylibre.com/ + */ + +#ifndef __TPS6594_PFSM_H +#define __TPS6594_PFSM_H + +#include +#include +#include + +/** + * struct pmic_state_opt - PMIC state options + * @gpio_retention: if enabled, power rails associated with GPIO retention remain active + * @ddr_retention: if enabled, power rails associated with DDR retention remain active + * @mcu_only_startup_dest: if enabled, startup destination state is MCU_ONLY + */ +struct pmic_state_opt { + __u8 gpio_retention; + __u8 ddr_retention; + __u8 mcu_only_startup_dest; +}; + +/* Commands */ +#define PMIC_BASE 'P' + +#define PMIC_GOTO_STANDBY _IO(PMIC_BASE, 0) +#define PMIC_GOTO_LP_STANDBY _IO(PMIC_BASE, 1) +#define PMIC_UPDATE_PGM _IO(PMIC_BASE, 2) +#define PMIC_SET_ACTIVE_STATE _IO(PMIC_BASE, 3) +#define PMIC_SET_MCU_ONLY_STATE _IOW(PMIC_BASE, 4, struct pmic_state_opt) +#define PMIC_SET_RETENTION_STATE _IOW(PMIC_BASE, 5, struct pmic_state_opt) + +#endif /* __TPS6594_PFSM_H */ -- cgit v1.2.3 From 2d8c9dcf7158060fcec9f891c0292ffdb4397523 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Thu, 15 Jun 2023 02:40:28 +0800 Subject: eventfd: add a uapi header for eventfd userspace APIs Create a uapi header include/uapi/linux/eventfd.h, move the associated flags to the uapi header, and include it from linux/eventfd.h. Suggested-by: Christian Brauner Signed-off-by: Wen Yang Reviewed-by: Matthew Wilcox (Oracle) Cc: Alexander Viro Cc: Jens Axboe Cc: Christian Brauner Cc: Christoph Hellwig Cc: Dylan Yudaken Cc: David Woodhouse Cc: Matthew Wilcox Cc: Eric Biggers Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Message-Id: Signed-off-by: Christian Brauner --- include/linux/eventfd.h | 6 +----- include/uapi/linux/eventfd.h | 11 +++++++++++ 2 files changed, 12 insertions(+), 5 deletions(-) create mode 100644 include/uapi/linux/eventfd.h (limited to 'include/uapi/linux') diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index 98d31cdaca40..b9d83652c097 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -9,12 +9,12 @@ #ifndef _LINUX_EVENTFD_H #define _LINUX_EVENTFD_H -#include #include #include #include #include #include +#include /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining @@ -23,10 +23,6 @@ * from eventfd, in order to leave a free define-space for * shared O_* flags. */ -#define EFD_SEMAPHORE (1 << 0) -#define EFD_CLOEXEC O_CLOEXEC -#define EFD_NONBLOCK O_NONBLOCK - #define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) #define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE) diff --git a/include/uapi/linux/eventfd.h b/include/uapi/linux/eventfd.h new file mode 100644 index 000000000000..2eb9ab6c32f3 --- /dev/null +++ b/include/uapi/linux/eventfd.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_EVENTFD_H +#define _UAPI_LINUX_EVENTFD_H + +#include + +#define EFD_SEMAPHORE (1 << 0) +#define EFD_CLOEXEC O_CLOEXEC +#define EFD_NONBLOCK O_NONBLOCK + +#endif /* _UAPI_LINUX_EVENTFD_H */ -- cgit v1.2.3 From 6f582513ad15de729ee5c91dfef946f3c266a207 Mon Sep 17 00:00:00 2001 From: James Zhu Date: Wed, 17 May 2023 16:19:51 -0400 Subject: drm/amdkfd: add event age tracking Add event age tracking Signed-off-by: James Zhu Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 1781e7669982..93f1c0bc5caf 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -320,12 +320,20 @@ struct kfd_hsa_hw_exception_data { __u32 gpu_id; }; +/* hsa signal event data */ +struct kfd_hsa_signal_event_data { + __u64 last_event_age; /* to and from KFD */ +}; + /* Event data */ struct kfd_event_data { union { + /* From KFD */ struct kfd_hsa_memory_exception_data memory_exception_data; struct kfd_hsa_hw_exception_data hw_exception_data; - }; /* From KFD */ + /* To and From KFD */ + struct kfd_hsa_signal_event_data signal_event_data; + }; __u64 kfd_event_data_ext; /* pointer to an extension structure for future exception types */ __u32 event_id; /* to KFD */ -- cgit v1.2.3 From d297eedf83f5af96751c0da1e4355c19244a55a2 Mon Sep 17 00:00:00 2001 From: James Zhu Date: Wed, 7 Jun 2023 13:27:40 -0400 Subject: drm/amdkfd: bump kfd ioctl minor version for event age availability Bump the minor version to declare event age tracking feature is now available. In kernel amdgpu driver, kfd_wait_on_events is used to support user space signal event wait function. For multiple threads waiting on same event scenery, race condition could occur since some threads after checking signal condition, before calling kfd_wait_on_events, the event interrupt could be fired and wake up other thread which are sleeping on this event. Then those threads could fall into sleep without waking up again. Adding event age tracking in both kernel and user mode, will help avoiding this race condition. Proposed ROCT-Thunk-Interface: https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/commit/efdbf6cfbc026bd68ac3c35d00dacf84370eb81e https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/commit/1820ae0a2db85b6f584611dc0cde1a00e7c22915 Proposed ROCR-Runtime: https://github.com/RadeonOpenCompute/ROCR-Runtime/compare/master...zhums:ROCR-Runtime:new_event_wait_review https://github.com/RadeonOpenCompute/ROCR-Runtime/commit/e1f5bdb88eb882ac798aeca2c00ea3fbb2dba459 https://github.com/RadeonOpenCompute/ROCR-Runtime/commit/7d26afd14107b5c2a754c1a3f415d89f3aabb503 Signed-off-by: James Zhu Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 93f1c0bc5caf..eeb2fdcbdcb7 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -39,9 +39,10 @@ * - 1.11 - Add unified memory for ctx save/restore area * - 1.12 - Add DMA buf export ioctl * - 1.13 - Add debugger API + * - 1.14 - Update kfd_event_data */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 13 +#define KFD_IOCTL_MINOR_VERSION 14 struct kfd_ioctl_get_version_args { __u32 major_version; /* from KFD */ -- cgit v1.2.3 From 01584c1e233740519d0e11aa20daa323d26bf598 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 8 Jun 2023 18:55:56 +0900 Subject: scsi: block: Improve ioprio value validity checks The introduction of the macro IOPRIO_PRIO_LEVEL() in commit eca2040972b4 ("scsi: block: ioprio: Clean up interface definition") results in an iopriority level to always be masked using the macro IOPRIO_LEVEL_MASK, and thus to the kernel always seeing an acceptable value for an I/O priority level when checked in ioprio_check_cap(). Before this patch, this function would return an error for some (but not all) invalid values for a level valid range of [0..7]. Restore and improve the detection of invalid priority levels by introducing the inline function ioprio_value() to check an ioprio class, level and hint value before combining these fields into a single value to be used with ioprio_set() or AIOs. If an invalid value for the class, level or hint of an ioprio is detected, ioprio_value() returns an ioprio using the class IOPRIO_CLASS_INVALID, indicating an invalid value and causing ioprio_check_cap() to return -EINVAL. Fixes: 6c913257226a ("scsi: block: Introduce ioprio hints") Fixes: eca2040972b4 ("scsi: block: ioprio: Clean up interface definition") Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20230608095556.124001-1-dlemoal@kernel.org Reviewed-by: Niklas Cassel Reviewed-by: Linus Walleij Signed-off-by: Martin K. Petersen --- block/ioprio.c | 1 + include/uapi/linux/ioprio.h | 50 ++++++++++++++++++++++++++++++--------------- 2 files changed, 34 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/block/ioprio.c b/block/ioprio.c index f0d9e818abc5..b5a942519a79 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -58,6 +58,7 @@ int ioprio_check_cap(int ioprio) if (level) return -EINVAL; break; + case IOPRIO_CLASS_INVALID: default: return -EINVAL; } diff --git a/include/uapi/linux/ioprio.h b/include/uapi/linux/ioprio.h index 4c4806e8230b..99440b2e8c35 100644 --- a/include/uapi/linux/ioprio.h +++ b/include/uapi/linux/ioprio.h @@ -2,19 +2,20 @@ #ifndef _UAPI_LINUX_IOPRIO_H #define _UAPI_LINUX_IOPRIO_H +#include +#include + /* * Gives us 8 prio classes with 13-bits of data for each class */ #define IOPRIO_CLASS_SHIFT 13 -#define IOPRIO_CLASS_MASK 0x07 +#define IOPRIO_NR_CLASSES 8 +#define IOPRIO_CLASS_MASK (IOPRIO_NR_CLASSES - 1) #define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) #define IOPRIO_PRIO_CLASS(ioprio) \ (((ioprio) >> IOPRIO_CLASS_SHIFT) & IOPRIO_CLASS_MASK) #define IOPRIO_PRIO_DATA(ioprio) ((ioprio) & IOPRIO_PRIO_MASK) -#define IOPRIO_PRIO_VALUE(class, data) \ - ((((class) & IOPRIO_CLASS_MASK) << IOPRIO_CLASS_SHIFT) | \ - ((data) & IOPRIO_PRIO_MASK)) /* * These are the io priority classes as implemented by the BFQ and mq-deadline @@ -25,10 +26,13 @@ * served when no one else is using the disk. */ enum { - IOPRIO_CLASS_NONE, - IOPRIO_CLASS_RT, - IOPRIO_CLASS_BE, - IOPRIO_CLASS_IDLE, + IOPRIO_CLASS_NONE = 0, + IOPRIO_CLASS_RT = 1, + IOPRIO_CLASS_BE = 2, + IOPRIO_CLASS_IDLE = 3, + + /* Special class to indicate an invalid ioprio value */ + IOPRIO_CLASS_INVALID = 7, }; /* @@ -73,15 +77,6 @@ enum { #define IOPRIO_PRIO_HINT(ioprio) \ (((ioprio) >> IOPRIO_HINT_SHIFT) & IOPRIO_HINT_MASK) -/* - * Alternate macro for IOPRIO_PRIO_VALUE() to define an I/O priority with - * a class, level and hint. - */ -#define IOPRIO_PRIO_VALUE_HINT(class, level, hint) \ - ((((class) & IOPRIO_CLASS_MASK) << IOPRIO_CLASS_SHIFT) | \ - (((hint) & IOPRIO_HINT_MASK) << IOPRIO_HINT_SHIFT) | \ - ((level) & IOPRIO_LEVEL_MASK)) - /* * I/O hints. */ @@ -107,4 +102,25 @@ enum { IOPRIO_HINT_DEV_DURATION_LIMIT_7 = 7, }; +#define IOPRIO_BAD_VALUE(val, max) ((val) < 0 || (val) >= (max)) + +/* + * Return an I/O priority value based on a class, a level and a hint. + */ +static __always_inline __u16 ioprio_value(int class, int level, int hint) +{ + if (IOPRIO_BAD_VALUE(class, IOPRIO_NR_CLASSES) || + IOPRIO_BAD_VALUE(level, IOPRIO_NR_LEVELS) || + IOPRIO_BAD_VALUE(hint, IOPRIO_NR_HINTS)) + return IOPRIO_CLASS_INVALID << IOPRIO_CLASS_SHIFT; + + return (class << IOPRIO_CLASS_SHIFT) | + (hint << IOPRIO_HINT_SHIFT) | level; +} + +#define IOPRIO_PRIO_VALUE(class, level) \ + ioprio_value(class, level, IOPRIO_HINT_NONE) +#define IOPRIO_PRIO_VALUE_HINT(class, level, hint) \ + ioprio_value(class, level, hint) + #endif /* _UAPI_LINUX_IOPRIO_H */ -- cgit v1.2.3 From a5bfe22db2a4a1ae467f31cfa1d72043eb9f1877 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 19 May 2023 15:47:48 -0600 Subject: vfio/pci-core: Add capability for AtomicOp completer support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test and enable PCIe AtomicOp completer support of various widths and report via device-info capability to userspace. Reviewed-by: Cédric Le Goater Reviewed-by: Robin Voetter Tested-by: Robin Voetter Link: https://lore.kernel.org/r/20230519214748.402003-1-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 38 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/vfio.h | 14 ++++++++++++++ 2 files changed, 52 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index ec7e662de033..20d7b69ea6ff 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -885,6 +885,37 @@ int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev, } EXPORT_SYMBOL_GPL(vfio_pci_core_register_dev_region); +static int vfio_pci_info_atomic_cap(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps) +{ + struct vfio_device_info_cap_pci_atomic_comp cap = { + .header.id = VFIO_DEVICE_INFO_CAP_PCI_ATOMIC_COMP, + .header.version = 1 + }; + struct pci_dev *pdev = pci_physfn(vdev->pdev); + u32 devcap2; + + pcie_capability_read_dword(pdev, PCI_EXP_DEVCAP2, &devcap2); + + if ((devcap2 & PCI_EXP_DEVCAP2_ATOMIC_COMP32) && + !pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP32)) + cap.flags |= VFIO_PCI_ATOMIC_COMP32; + + if ((devcap2 & PCI_EXP_DEVCAP2_ATOMIC_COMP64) && + !pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP64)) + cap.flags |= VFIO_PCI_ATOMIC_COMP64; + + if ((devcap2 & PCI_EXP_DEVCAP2_ATOMIC_COMP128) && + !pci_enable_atomic_ops_to_root(pdev, + PCI_EXP_DEVCAP2_ATOMIC_COMP128)) + cap.flags |= VFIO_PCI_ATOMIC_COMP128; + + if (!cap.flags) + return -ENODEV; + + return vfio_info_add_capability(caps, &cap.header, sizeof(cap)); +} + static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, struct vfio_device_info __user *arg) { @@ -923,6 +954,13 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, return ret; } + ret = vfio_pci_info_atomic_cap(vdev, &caps); + if (ret && ret != -ENODEV) { + pci_warn(vdev->pdev, + "Failed to setup AtomicOps info capability\n"); + return ret; + } + if (caps.size) { info.flags |= VFIO_DEVICE_FLAGS_CAPS; if (info.argsz < sizeof(info) + caps.size) { diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 1a36134cae5c..4f48bad09a37 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -240,6 +240,20 @@ struct vfio_device_info { #define VFIO_DEVICE_INFO_CAP_ZPCI_UTIL 3 #define VFIO_DEVICE_INFO_CAP_ZPCI_PFIP 4 +/* + * The following VFIO_DEVICE_INFO capability reports support for PCIe AtomicOp + * completion to the root bus with supported widths provided via flags. + */ +#define VFIO_DEVICE_INFO_CAP_PCI_ATOMIC_COMP 5 +struct vfio_device_info_cap_pci_atomic_comp { + struct vfio_info_cap_header header; + __u32 flags; +#define VFIO_PCI_ATOMIC_COMP32 (1 << 0) +#define VFIO_PCI_ATOMIC_COMP64 (1 << 1) +#define VFIO_PCI_ATOMIC_COMP128 (1 << 2) + __u32 reserved; +}; + /** * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8, * struct vfio_region_info) -- cgit v1.2.3 From 234489ac561300ceed33e64c3bf3a810b9e2051d Mon Sep 17 00:00:00 2001 From: Nipun Gupta Date: Wed, 31 May 2023 18:15:57 +0530 Subject: vfio/cdx: add support for CDX bus vfio-cdx driver enables IOCTLs for user space to query MMIO regions for CDX devices and mmap them. This change also adds support for reset of CDX devices. With VFIO enabled on CDX devices, user-space applications can also exercise DMA securely via IOMMU on these devices. This change adds the VFIO CDX driver and enables the following ioctls for CDX devices: - VFIO_DEVICE_GET_INFO: - VFIO_DEVICE_GET_REGION_INFO - VFIO_DEVICE_RESET Signed-off-by: Nipun Gupta Reviewed-by: Pieter Jansen van Vuuren Tested-by: Nikhil Agarwal Link: https://lore.kernel.org/r/20230531124557.11009-1-nipun.gupta@amd.com Signed-off-by: Alex Williamson --- MAINTAINERS | 7 ++ drivers/vfio/Kconfig | 1 + drivers/vfio/Makefile | 1 + drivers/vfio/cdx/Kconfig | 17 +++ drivers/vfio/cdx/Makefile | 8 ++ drivers/vfio/cdx/main.c | 234 ++++++++++++++++++++++++++++++++++++++ drivers/vfio/cdx/private.h | 28 +++++ include/linux/cdx/cdx_bus.h | 1 - include/linux/mod_devicetable.h | 6 + include/uapi/linux/vfio.h | 1 + scripts/mod/devicetable-offsets.c | 1 + scripts/mod/file2alias.c | 17 ++- 12 files changed, 320 insertions(+), 2 deletions(-) create mode 100644 drivers/vfio/cdx/Kconfig create mode 100644 drivers/vfio/cdx/Makefile create mode 100644 drivers/vfio/cdx/main.c create mode 100644 drivers/vfio/cdx/private.h (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 27ef11624748..ce6ac552d8f6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22085,6 +22085,13 @@ F: Documentation/filesystems/vfat.rst F: fs/fat/ F: tools/testing/selftests/filesystems/fat/ +VFIO CDX DRIVER +M: Nipun Gupta +M: Nikhil Agarwal +L: kvm@vger.kernel.org +S: Maintained +F: drivers/vfio/cdx/* + VFIO DRIVER M: Alex Williamson L: kvm@vger.kernel.org diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 89e06c981e43..aba36f5be4ec 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -57,6 +57,7 @@ source "drivers/vfio/pci/Kconfig" source "drivers/vfio/platform/Kconfig" source "drivers/vfio/mdev/Kconfig" source "drivers/vfio/fsl-mc/Kconfig" +source "drivers/vfio/cdx/Kconfig" endif source "virt/lib/Kconfig" diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index 8da44aa1ea16..66f418aef5a9 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -14,3 +14,4 @@ obj-$(CONFIG_VFIO_PCI_CORE) += pci/ obj-$(CONFIG_VFIO_PLATFORM_BASE) += platform/ obj-$(CONFIG_VFIO_MDEV) += mdev/ obj-$(CONFIG_VFIO_FSL_MC) += fsl-mc/ +obj-$(CONFIG_VFIO_CDX) += cdx/ diff --git a/drivers/vfio/cdx/Kconfig b/drivers/vfio/cdx/Kconfig new file mode 100644 index 000000000000..e6de0a0caa32 --- /dev/null +++ b/drivers/vfio/cdx/Kconfig @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# VFIO CDX configuration +# +# Copyright (C) 2022-2023, Advanced Micro Devices, Inc. +# + +config VFIO_CDX + tristate "VFIO support for CDX bus devices" + depends on CDX_BUS + select EVENTFD + help + Driver to enable VFIO support for the devices on CDX bus. + This is required to make use of CDX devices present in + the system using the VFIO framework. + + If you don't know what to do here, say N. diff --git a/drivers/vfio/cdx/Makefile b/drivers/vfio/cdx/Makefile new file mode 100644 index 000000000000..cd4a2e6fe609 --- /dev/null +++ b/drivers/vfio/cdx/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2022-2023, Advanced Micro Devices, Inc. +# + +obj-$(CONFIG_VFIO_CDX) += vfio-cdx.o + +vfio-cdx-objs := main.o diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c new file mode 100644 index 000000000000..c376a69d2db2 --- /dev/null +++ b/drivers/vfio/cdx/main.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + */ + +#include +#include + +#include "private.h" + +static int vfio_cdx_open_device(struct vfio_device *core_vdev) +{ + struct vfio_cdx_device *vdev = + container_of(core_vdev, struct vfio_cdx_device, vdev); + struct cdx_device *cdx_dev = to_cdx_device(core_vdev->dev); + int count = cdx_dev->res_count; + int i; + + vdev->regions = kcalloc(count, sizeof(struct vfio_cdx_region), + GFP_KERNEL_ACCOUNT); + if (!vdev->regions) + return -ENOMEM; + + for (i = 0; i < count; i++) { + struct resource *res = &cdx_dev->res[i]; + + vdev->regions[i].addr = res->start; + vdev->regions[i].size = resource_size(res); + vdev->regions[i].type = res->flags; + /* + * Only regions addressed with PAGE granularity may be + * MMAP'ed securely. + */ + if (!(vdev->regions[i].addr & ~PAGE_MASK) && + !(vdev->regions[i].size & ~PAGE_MASK)) + vdev->regions[i].flags |= + VFIO_REGION_INFO_FLAG_MMAP; + vdev->regions[i].flags |= VFIO_REGION_INFO_FLAG_READ; + if (!(cdx_dev->res[i].flags & IORESOURCE_READONLY)) + vdev->regions[i].flags |= VFIO_REGION_INFO_FLAG_WRITE; + } + + return 0; +} + +static void vfio_cdx_close_device(struct vfio_device *core_vdev) +{ + struct vfio_cdx_device *vdev = + container_of(core_vdev, struct vfio_cdx_device, vdev); + + kfree(vdev->regions); + cdx_dev_reset(core_vdev->dev); +} + +static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev, + struct vfio_device_info __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_device_info, num_irqs); + struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); + struct vfio_device_info info; + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + info.flags = VFIO_DEVICE_FLAGS_CDX; + info.flags |= VFIO_DEVICE_FLAGS_RESET; + + info.num_regions = cdx_dev->res_count; + info.num_irqs = 0; + + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; +} + +static int vfio_cdx_ioctl_get_region_info(struct vfio_cdx_device *vdev, + struct vfio_region_info __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); + struct vfio_region_info info; + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + if (info.index >= cdx_dev->res_count) + return -EINVAL; + + /* map offset to the physical address */ + info.offset = vfio_cdx_index_to_offset(info.index); + info.size = vdev->regions[info.index].size; + info.flags = vdev->regions[info.index].flags; + + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; +} + +static long vfio_cdx_ioctl(struct vfio_device *core_vdev, + unsigned int cmd, unsigned long arg) +{ + struct vfio_cdx_device *vdev = + container_of(core_vdev, struct vfio_cdx_device, vdev); + void __user *uarg = (void __user *)arg; + + switch (cmd) { + case VFIO_DEVICE_GET_INFO: + return vfio_cdx_ioctl_get_info(vdev, uarg); + case VFIO_DEVICE_GET_REGION_INFO: + return vfio_cdx_ioctl_get_region_info(vdev, uarg); + case VFIO_DEVICE_RESET: + return cdx_dev_reset(core_vdev->dev); + default: + return -ENOTTY; + } +} + +static int vfio_cdx_mmap_mmio(struct vfio_cdx_region region, + struct vm_area_struct *vma) +{ + u64 size = vma->vm_end - vma->vm_start; + u64 pgoff, base; + + pgoff = vma->vm_pgoff & + ((1U << (VFIO_CDX_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + base = pgoff << PAGE_SHIFT; + + if (base + size > region.size) + return -EINVAL; + + vma->vm_pgoff = (region.addr >> PAGE_SHIFT) + pgoff; + vma->vm_page_prot = pgprot_device(vma->vm_page_prot); + + return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + size, vma->vm_page_prot); +} + +static int vfio_cdx_mmap(struct vfio_device *core_vdev, + struct vm_area_struct *vma) +{ + struct vfio_cdx_device *vdev = + container_of(core_vdev, struct vfio_cdx_device, vdev); + struct cdx_device *cdx_dev = to_cdx_device(core_vdev->dev); + unsigned int index; + + index = vma->vm_pgoff >> (VFIO_CDX_OFFSET_SHIFT - PAGE_SHIFT); + + if (index >= cdx_dev->res_count) + return -EINVAL; + + if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_MMAP)) + return -EINVAL; + + if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_READ) && + (vma->vm_flags & VM_READ)) + return -EPERM; + + if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_WRITE) && + (vma->vm_flags & VM_WRITE)) + return -EPERM; + + return vfio_cdx_mmap_mmio(vdev->regions[index], vma); +} + +static const struct vfio_device_ops vfio_cdx_ops = { + .name = "vfio-cdx", + .open_device = vfio_cdx_open_device, + .close_device = vfio_cdx_close_device, + .ioctl = vfio_cdx_ioctl, + .mmap = vfio_cdx_mmap, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, +}; + +static int vfio_cdx_probe(struct cdx_device *cdx_dev) +{ + struct vfio_cdx_device *vdev; + struct device *dev = &cdx_dev->dev; + int ret; + + vdev = vfio_alloc_device(vfio_cdx_device, vdev, dev, + &vfio_cdx_ops); + if (IS_ERR(vdev)) + return PTR_ERR(vdev); + + ret = vfio_register_group_dev(&vdev->vdev); + if (ret) + goto out_uninit; + + dev_set_drvdata(dev, vdev); + return 0; + +out_uninit: + vfio_put_device(&vdev->vdev); + return ret; +} + +static int vfio_cdx_remove(struct cdx_device *cdx_dev) +{ + struct device *dev = &cdx_dev->dev; + struct vfio_cdx_device *vdev = dev_get_drvdata(dev); + + vfio_unregister_group_dev(&vdev->vdev); + vfio_put_device(&vdev->vdev); + + return 0; +} + +static const struct cdx_device_id vfio_cdx_table[] = { + { CDX_DEVICE_DRIVER_OVERRIDE(CDX_ANY_ID, CDX_ANY_ID, + CDX_ID_F_VFIO_DRIVER_OVERRIDE) }, /* match all by default */ + {} +}; + +MODULE_DEVICE_TABLE(cdx, vfio_cdx_table); + +static struct cdx_driver vfio_cdx_driver = { + .probe = vfio_cdx_probe, + .remove = vfio_cdx_remove, + .match_id_table = vfio_cdx_table, + .driver = { + .name = "vfio-cdx", + .owner = THIS_MODULE, + }, + .driver_managed_dma = true, +}; + +module_driver(vfio_cdx_driver, cdx_driver_register, cdx_driver_unregister); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("VFIO for CDX devices - User Level meta-driver"); diff --git a/drivers/vfio/cdx/private.h b/drivers/vfio/cdx/private.h new file mode 100644 index 000000000000..8bdc117ea88e --- /dev/null +++ b/drivers/vfio/cdx/private.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + */ + +#ifndef VFIO_CDX_PRIVATE_H +#define VFIO_CDX_PRIVATE_H + +#define VFIO_CDX_OFFSET_SHIFT 40 + +static inline u64 vfio_cdx_index_to_offset(u32 index) +{ + return ((u64)(index) << VFIO_CDX_OFFSET_SHIFT); +} + +struct vfio_cdx_region { + u32 flags; + u32 type; + u64 addr; + resource_size_t size; +}; + +struct vfio_cdx_device { + struct vfio_device vdev; + struct vfio_cdx_region *regions; +}; + +#endif /* VFIO_CDX_PRIVATE_H */ diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h index 35ef41d8a61a..bead71b7bc73 100644 --- a/include/linux/cdx/cdx_bus.h +++ b/include/linux/cdx/cdx_bus.h @@ -14,7 +14,6 @@ #include #define MAX_CDX_DEV_RESOURCES 4 -#define CDX_ANY_ID (0xFFFF) #define CDX_CONTROLLER_ID_SHIFT 4 #define CDX_BUS_NUM_MASK 0xF diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index ccaaeda792c0..ccf017353bb6 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -912,6 +912,12 @@ struct ishtp_device_id { kernel_ulong_t driver_data; }; +#define CDX_ANY_ID (0xFFFF) + +enum { + CDX_ID_F_VFIO_DRIVER_OVERRIDE = 1, +}; + /** * struct cdx_device_id - CDX device identifier * @vendor: Vendor ID diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 4f48bad09a37..9ab864c6f1ff 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -213,6 +213,7 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_AP (1 << 5) /* vfio-ap device */ #define VFIO_DEVICE_FLAGS_FSL_MC (1 << 6) /* vfio-fsl-mc device */ #define VFIO_DEVICE_FLAGS_CAPS (1 << 7) /* Info supports caps */ +#define VFIO_DEVICE_FLAGS_CDX (1 << 8) /* vfio-cdx device */ __u32 num_regions; /* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ __u32 cap_offset; /* Offset within info struct of first cap */ diff --git a/scripts/mod/devicetable-offsets.c b/scripts/mod/devicetable-offsets.c index 62dc988df84d..abe65f8968dd 100644 --- a/scripts/mod/devicetable-offsets.c +++ b/scripts/mod/devicetable-offsets.c @@ -265,6 +265,7 @@ int main(void) DEVID(cdx_device_id); DEVID_FIELD(cdx_device_id, vendor); DEVID_FIELD(cdx_device_id, device); + DEVID_FIELD(cdx_device_id, override_only); return 0; } diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 28da34ba4359..38120f932b0d 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -1458,8 +1458,23 @@ static int do_cdx_entry(const char *filename, void *symval, { DEF_FIELD(symval, cdx_device_id, vendor); DEF_FIELD(symval, cdx_device_id, device); + DEF_FIELD(symval, cdx_device_id, override_only); - sprintf(alias, "cdx:v%08Xd%08Xd", vendor, device); + switch (override_only) { + case 0: + strcpy(alias, "cdx:"); + break; + case CDX_ID_F_VFIO_DRIVER_OVERRIDE: + strcpy(alias, "vfio_cdx:"); + break; + default: + warn("Unknown CDX driver_override alias %08X\n", + override_only); + return 0; + } + + ADD(alias, "v", vendor != CDX_ANY_ID, vendor); + ADD(alias, "d", device != CDX_ANY_ID, device); return 1; } -- cgit v1.2.3 From 89d01306e34d6ace24e9708cb443df0e53c06ce0 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 15 Jun 2023 13:03:49 +0530 Subject: RISC-V: KVM: Implement device interface for AIA irqchip We implement KVM device interface for in-kernel AIA irqchip so that user-space can use KVM device ioctls to create, configure, and destroy in-kernel AIA irqchip. Signed-off-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_aia.h | 132 +++++--- arch/riscv/include/uapi/asm/kvm.h | 45 +++ arch/riscv/kvm/Makefile | 1 + arch/riscv/kvm/aia.c | 11 + arch/riscv/kvm/aia_device.c | 623 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 2 + 6 files changed, 772 insertions(+), 42 deletions(-) create mode 100644 arch/riscv/kvm/aia_device.c (limited to 'include/uapi/linux') diff --git a/arch/riscv/include/asm/kvm_aia.h b/arch/riscv/include/asm/kvm_aia.h index 3bc0a0e47a15..a1281ebc9b92 100644 --- a/arch/riscv/include/asm/kvm_aia.h +++ b/arch/riscv/include/asm/kvm_aia.h @@ -20,6 +20,33 @@ struct kvm_aia { /* In-kernel irqchip initialized */ bool initialized; + + /* Virtualization mode (Emulation, HW Accelerated, or Auto) */ + u32 mode; + + /* Number of MSIs */ + u32 nr_ids; + + /* Number of wired IRQs */ + u32 nr_sources; + + /* Number of group bits in IMSIC address */ + u32 nr_group_bits; + + /* Position of group bits in IMSIC address */ + u32 nr_group_shift; + + /* Number of hart bits in IMSIC address */ + u32 nr_hart_bits; + + /* Number of guest bits in IMSIC address */ + u32 nr_guest_bits; + + /* Guest physical address of APLIC */ + gpa_t aplic_addr; + + /* Internal state of APLIC */ + void *aplic_state; }; struct kvm_vcpu_aia_csr { @@ -38,8 +65,19 @@ struct kvm_vcpu_aia { /* CPU AIA CSR context upon Guest VCPU reset */ struct kvm_vcpu_aia_csr guest_reset_csr; + + /* Guest physical address of IMSIC for this VCPU */ + gpa_t imsic_addr; + + /* HART index of IMSIC extacted from guest physical address */ + u32 hart_index; + + /* Internal state of IMSIC for this VCPU */ + void *imsic_state; }; +#define KVM_RISCV_AIA_UNDEF_ADDR (-1) + #define kvm_riscv_aia_initialized(k) ((k)->arch.aia.initialized) #define irqchip_in_kernel(k) ((k)->arch.aia.in_kernel) @@ -50,10 +88,17 @@ DECLARE_STATIC_KEY_FALSE(kvm_riscv_aia_available); #define kvm_riscv_aia_available() \ static_branch_unlikely(&kvm_riscv_aia_available) +extern struct kvm_device_ops kvm_riscv_aia_device_ops; + static inline void kvm_riscv_vcpu_aia_imsic_release(struct kvm_vcpu *vcpu) { } +static inline int kvm_riscv_vcpu_aia_imsic_update(struct kvm_vcpu *vcpu) +{ + return 1; +} + #define KVM_RISCV_AIA_IMSIC_TOPEI (ISELECT_MASK + 1) static inline int kvm_riscv_vcpu_aia_imsic_rmw(struct kvm_vcpu *vcpu, unsigned long isel, @@ -64,6 +109,41 @@ static inline int kvm_riscv_vcpu_aia_imsic_rmw(struct kvm_vcpu *vcpu, return 0; } +static inline void kvm_riscv_vcpu_aia_imsic_reset(struct kvm_vcpu *vcpu) +{ +} + +static inline int kvm_riscv_vcpu_aia_imsic_inject(struct kvm_vcpu *vcpu, + u32 guest_index, u32 offset, + u32 iid) +{ + return 0; +} + +static inline int kvm_riscv_vcpu_aia_imsic_init(struct kvm_vcpu *vcpu) +{ + return 0; +} + +static inline void kvm_riscv_vcpu_aia_imsic_cleanup(struct kvm_vcpu *vcpu) +{ +} + +static inline int kvm_riscv_aia_aplic_inject(struct kvm *kvm, + u32 source, bool level) +{ + return 0; +} + +static inline int kvm_riscv_aia_aplic_init(struct kvm *kvm) +{ + return 0; +} + +static inline void kvm_riscv_aia_aplic_cleanup(struct kvm *kvm) +{ +} + #ifdef CONFIG_32BIT void kvm_riscv_vcpu_aia_flush_interrupts(struct kvm_vcpu *vcpu); void kvm_riscv_vcpu_aia_sync_interrupts(struct kvm_vcpu *vcpu); @@ -99,50 +179,18 @@ int kvm_riscv_vcpu_aia_rmw_ireg(struct kvm_vcpu *vcpu, unsigned int csr_num, { .base = CSR_SIREG, .count = 1, .func = kvm_riscv_vcpu_aia_rmw_ireg }, \ { .base = CSR_STOPEI, .count = 1, .func = kvm_riscv_vcpu_aia_rmw_topei }, -static inline int kvm_riscv_vcpu_aia_update(struct kvm_vcpu *vcpu) -{ - return 1; -} - -static inline void kvm_riscv_vcpu_aia_reset(struct kvm_vcpu *vcpu) -{ -} - -static inline int kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu) -{ - return 0; -} - -static inline void kvm_riscv_vcpu_aia_deinit(struct kvm_vcpu *vcpu) -{ -} - -static inline int kvm_riscv_aia_inject_msi_by_id(struct kvm *kvm, - u32 hart_index, - u32 guest_index, u32 iid) -{ - return 0; -} - -static inline int kvm_riscv_aia_inject_msi(struct kvm *kvm, - struct kvm_msi *msi) -{ - return 0; -} +int kvm_riscv_vcpu_aia_update(struct kvm_vcpu *vcpu); +void kvm_riscv_vcpu_aia_reset(struct kvm_vcpu *vcpu); +int kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu); +void kvm_riscv_vcpu_aia_deinit(struct kvm_vcpu *vcpu); -static inline int kvm_riscv_aia_inject_irq(struct kvm *kvm, - unsigned int irq, bool level) -{ - return 0; -} +int kvm_riscv_aia_inject_msi_by_id(struct kvm *kvm, u32 hart_index, + u32 guest_index, u32 iid); +int kvm_riscv_aia_inject_msi(struct kvm *kvm, struct kvm_msi *msi); +int kvm_riscv_aia_inject_irq(struct kvm *kvm, unsigned int irq, bool level); -static inline void kvm_riscv_aia_init_vm(struct kvm *kvm) -{ -} - -static inline void kvm_riscv_aia_destroy_vm(struct kvm *kvm) -{ -} +void kvm_riscv_aia_init_vm(struct kvm *kvm); +void kvm_riscv_aia_destroy_vm(struct kvm *kvm); int kvm_riscv_aia_alloc_hgei(int cpu, struct kvm_vcpu *owner, void __iomem **hgei_va, phys_addr_t *hgei_pa); diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index 332d4a274891..047c8fc5bd71 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -204,6 +204,51 @@ enum KVM_RISCV_SBI_EXT_ID { #define KVM_REG_RISCV_SBI_MULTI_REG_LAST \ KVM_REG_RISCV_SBI_MULTI_REG(KVM_RISCV_SBI_EXT_MAX - 1) +/* Device Control API: RISC-V AIA */ +#define KVM_DEV_RISCV_APLIC_ALIGN 0x1000 +#define KVM_DEV_RISCV_APLIC_SIZE 0x4000 +#define KVM_DEV_RISCV_APLIC_MAX_HARTS 0x4000 +#define KVM_DEV_RISCV_IMSIC_ALIGN 0x1000 +#define KVM_DEV_RISCV_IMSIC_SIZE 0x1000 + +#define KVM_DEV_RISCV_AIA_GRP_CONFIG 0 +#define KVM_DEV_RISCV_AIA_CONFIG_MODE 0 +#define KVM_DEV_RISCV_AIA_CONFIG_IDS 1 +#define KVM_DEV_RISCV_AIA_CONFIG_SRCS 2 +#define KVM_DEV_RISCV_AIA_CONFIG_GROUP_BITS 3 +#define KVM_DEV_RISCV_AIA_CONFIG_GROUP_SHIFT 4 +#define KVM_DEV_RISCV_AIA_CONFIG_HART_BITS 5 +#define KVM_DEV_RISCV_AIA_CONFIG_GUEST_BITS 6 + +/* + * Modes of RISC-V AIA device: + * 1) EMUL (aka Emulation): Trap-n-emulate IMSIC + * 2) HWACCEL (aka HW Acceleration): Virtualize IMSIC using IMSIC guest files + * 3) AUTO (aka Automatic): Virtualize IMSIC using IMSIC guest files whenever + * available otherwise fallback to trap-n-emulation + */ +#define KVM_DEV_RISCV_AIA_MODE_EMUL 0 +#define KVM_DEV_RISCV_AIA_MODE_HWACCEL 1 +#define KVM_DEV_RISCV_AIA_MODE_AUTO 2 + +#define KVM_DEV_RISCV_AIA_IDS_MIN 63 +#define KVM_DEV_RISCV_AIA_IDS_MAX 2048 +#define KVM_DEV_RISCV_AIA_SRCS_MAX 1024 +#define KVM_DEV_RISCV_AIA_GROUP_BITS_MAX 8 +#define KVM_DEV_RISCV_AIA_GROUP_SHIFT_MIN 24 +#define KVM_DEV_RISCV_AIA_GROUP_SHIFT_MAX 56 +#define KVM_DEV_RISCV_AIA_HART_BITS_MAX 16 +#define KVM_DEV_RISCV_AIA_GUEST_BITS_MAX 8 + +#define KVM_DEV_RISCV_AIA_GRP_ADDR 1 +#define KVM_DEV_RISCV_AIA_ADDR_APLIC 0 +#define KVM_DEV_RISCV_AIA_ADDR_IMSIC(__vcpu) (1 + (__vcpu)) +#define KVM_DEV_RISCV_AIA_ADDR_MAX \ + (1 + KVM_DEV_RISCV_APLIC_MAX_HARTS) + +#define KVM_DEV_RISCV_AIA_GRP_CTRL 2 +#define KVM_DEV_RISCV_AIA_CTRL_INIT 0 + /* One single KVM irqchip, ie. the AIA */ #define KVM_NR_IRQCHIPS 1 diff --git a/arch/riscv/kvm/Makefile b/arch/riscv/kvm/Makefile index 8031b8912a0d..dd69ebe098bd 100644 --- a/arch/riscv/kvm/Makefile +++ b/arch/riscv/kvm/Makefile @@ -27,3 +27,4 @@ kvm-y += vcpu_sbi_hsm.o kvm-y += vcpu_timer.o kvm-$(CONFIG_RISCV_PMU_SBI) += vcpu_pmu.o vcpu_sbi_pmu.o kvm-y += aia.o +kvm-y += aia_device.o diff --git a/arch/riscv/kvm/aia.c b/arch/riscv/kvm/aia.c index 18c442c15ff2..585a3b42c52c 100644 --- a/arch/riscv/kvm/aia.c +++ b/arch/riscv/kvm/aia.c @@ -631,6 +631,14 @@ int kvm_riscv_aia_init(void) if (rc) return rc; + /* Register device operations */ + rc = kvm_register_device_ops(&kvm_riscv_aia_device_ops, + KVM_DEV_TYPE_RISCV_AIA); + if (rc) { + aia_hgei_exit(); + return rc; + } + /* Enable KVM AIA support */ static_branch_enable(&kvm_riscv_aia_available); @@ -642,6 +650,9 @@ void kvm_riscv_aia_exit(void) if (!kvm_riscv_aia_available()) return; + /* Unregister device operations */ + kvm_unregister_device_ops(KVM_DEV_TYPE_RISCV_AIA); + /* Cleanup the HGEI state */ aia_hgei_exit(); } diff --git a/arch/riscv/kvm/aia_device.c b/arch/riscv/kvm/aia_device.c new file mode 100644 index 000000000000..7ab555121872 --- /dev/null +++ b/arch/riscv/kvm/aia_device.c @@ -0,0 +1,623 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Western Digital Corporation or its affiliates. + * Copyright (C) 2022 Ventana Micro Systems Inc. + * + * Authors: + * Anup Patel + */ + +#include +#include +#include +#include + +static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx) +{ + struct kvm_vcpu *tmp_vcpu; + + for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { + tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx); + mutex_unlock(&tmp_vcpu->mutex); + } +} + +static void unlock_all_vcpus(struct kvm *kvm) +{ + unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1); +} + +static bool lock_all_vcpus(struct kvm *kvm) +{ + struct kvm_vcpu *tmp_vcpu; + unsigned long c; + + kvm_for_each_vcpu(c, tmp_vcpu, kvm) { + if (!mutex_trylock(&tmp_vcpu->mutex)) { + unlock_vcpus(kvm, c - 1); + return false; + } + } + + return true; +} + +static int aia_create(struct kvm_device *dev, u32 type) +{ + int ret; + unsigned long i; + struct kvm *kvm = dev->kvm; + struct kvm_vcpu *vcpu; + + if (irqchip_in_kernel(kvm)) + return -EEXIST; + + ret = -EBUSY; + if (!lock_all_vcpus(kvm)) + return ret; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->arch.ran_atleast_once) + goto out_unlock; + } + ret = 0; + + kvm->arch.aia.in_kernel = true; + +out_unlock: + unlock_all_vcpus(kvm); + return ret; +} + +static void aia_destroy(struct kvm_device *dev) +{ + kfree(dev); +} + +static int aia_config(struct kvm *kvm, unsigned long type, + u32 *nr, bool write) +{ + struct kvm_aia *aia = &kvm->arch.aia; + + /* Writes can only be done before irqchip is initialized */ + if (write && kvm_riscv_aia_initialized(kvm)) + return -EBUSY; + + switch (type) { + case KVM_DEV_RISCV_AIA_CONFIG_MODE: + if (write) { + switch (*nr) { + case KVM_DEV_RISCV_AIA_MODE_EMUL: + break; + case KVM_DEV_RISCV_AIA_MODE_HWACCEL: + case KVM_DEV_RISCV_AIA_MODE_AUTO: + /* + * HW Acceleration and Auto modes only + * supported on host with non-zero guest + * external interrupts (i.e. non-zero + * VS-level IMSIC pages). + */ + if (!kvm_riscv_aia_nr_hgei) + return -EINVAL; + break; + default: + return -EINVAL; + }; + aia->mode = *nr; + } else + *nr = aia->mode; + break; + case KVM_DEV_RISCV_AIA_CONFIG_IDS: + if (write) { + if ((*nr < KVM_DEV_RISCV_AIA_IDS_MIN) || + (*nr >= KVM_DEV_RISCV_AIA_IDS_MAX) || + ((*nr & KVM_DEV_RISCV_AIA_IDS_MIN) != + KVM_DEV_RISCV_AIA_IDS_MIN) || + (kvm_riscv_aia_max_ids <= *nr)) + return -EINVAL; + aia->nr_ids = *nr; + } else + *nr = aia->nr_ids; + break; + case KVM_DEV_RISCV_AIA_CONFIG_SRCS: + if (write) { + if ((*nr >= KVM_DEV_RISCV_AIA_SRCS_MAX) || + (*nr >= kvm_riscv_aia_max_ids)) + return -EINVAL; + aia->nr_sources = *nr; + } else + *nr = aia->nr_sources; + break; + case KVM_DEV_RISCV_AIA_CONFIG_GROUP_BITS: + if (write) { + if (*nr >= KVM_DEV_RISCV_AIA_GROUP_BITS_MAX) + return -EINVAL; + aia->nr_group_bits = *nr; + } else + *nr = aia->nr_group_bits; + break; + case KVM_DEV_RISCV_AIA_CONFIG_GROUP_SHIFT: + if (write) { + if ((*nr < KVM_DEV_RISCV_AIA_GROUP_SHIFT_MIN) || + (*nr >= KVM_DEV_RISCV_AIA_GROUP_SHIFT_MAX)) + return -EINVAL; + aia->nr_group_shift = *nr; + } else + *nr = aia->nr_group_shift; + break; + case KVM_DEV_RISCV_AIA_CONFIG_HART_BITS: + if (write) { + if (*nr >= KVM_DEV_RISCV_AIA_HART_BITS_MAX) + return -EINVAL; + aia->nr_hart_bits = *nr; + } else + *nr = aia->nr_hart_bits; + break; + case KVM_DEV_RISCV_AIA_CONFIG_GUEST_BITS: + if (write) { + if (*nr >= KVM_DEV_RISCV_AIA_GUEST_BITS_MAX) + return -EINVAL; + aia->nr_guest_bits = *nr; + } else + *nr = aia->nr_guest_bits; + break; + default: + return -ENXIO; + }; + + return 0; +} + +static int aia_aplic_addr(struct kvm *kvm, u64 *addr, bool write) +{ + struct kvm_aia *aia = &kvm->arch.aia; + + if (write) { + /* Writes can only be done before irqchip is initialized */ + if (kvm_riscv_aia_initialized(kvm)) + return -EBUSY; + + if (*addr & (KVM_DEV_RISCV_APLIC_ALIGN - 1)) + return -EINVAL; + + aia->aplic_addr = *addr; + } else + *addr = aia->aplic_addr; + + return 0; +} + +static int aia_imsic_addr(struct kvm *kvm, u64 *addr, + unsigned long vcpu_idx, bool write) +{ + struct kvm_vcpu *vcpu; + struct kvm_vcpu_aia *vcpu_aia; + + vcpu = kvm_get_vcpu(kvm, vcpu_idx); + if (!vcpu) + return -EINVAL; + vcpu_aia = &vcpu->arch.aia_context; + + if (write) { + /* Writes can only be done before irqchip is initialized */ + if (kvm_riscv_aia_initialized(kvm)) + return -EBUSY; + + if (*addr & (KVM_DEV_RISCV_IMSIC_ALIGN - 1)) + return -EINVAL; + } + + mutex_lock(&vcpu->mutex); + if (write) + vcpu_aia->imsic_addr = *addr; + else + *addr = vcpu_aia->imsic_addr; + mutex_unlock(&vcpu->mutex); + + return 0; +} + +static gpa_t aia_imsic_ppn(struct kvm_aia *aia, gpa_t addr) +{ + u32 h, l; + gpa_t mask = 0; + + h = aia->nr_hart_bits + aia->nr_guest_bits + + IMSIC_MMIO_PAGE_SHIFT - 1; + mask = GENMASK_ULL(h, 0); + + if (aia->nr_group_bits) { + h = aia->nr_group_bits + aia->nr_group_shift - 1; + l = aia->nr_group_shift; + mask |= GENMASK_ULL(h, l); + } + + return (addr & ~mask) >> IMSIC_MMIO_PAGE_SHIFT; +} + +static u32 aia_imsic_hart_index(struct kvm_aia *aia, gpa_t addr) +{ + u32 hart, group = 0; + + hart = (addr >> (aia->nr_guest_bits + IMSIC_MMIO_PAGE_SHIFT)) & + GENMASK_ULL(aia->nr_hart_bits - 1, 0); + if (aia->nr_group_bits) + group = (addr >> aia->nr_group_shift) & + GENMASK_ULL(aia->nr_group_bits - 1, 0); + + return (group << aia->nr_hart_bits) | hart; +} + +static int aia_init(struct kvm *kvm) +{ + int ret, i; + unsigned long idx; + struct kvm_vcpu *vcpu; + struct kvm_vcpu_aia *vaia; + struct kvm_aia *aia = &kvm->arch.aia; + gpa_t base_ppn = KVM_RISCV_AIA_UNDEF_ADDR; + + /* Irqchip can be initialized only once */ + if (kvm_riscv_aia_initialized(kvm)) + return -EBUSY; + + /* We might be in the middle of creating a VCPU? */ + if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) + return -EBUSY; + + /* Number of sources should be less than or equals number of IDs */ + if (aia->nr_ids < aia->nr_sources) + return -EINVAL; + + /* APLIC base is required for non-zero number of sources */ + if (aia->nr_sources && aia->aplic_addr == KVM_RISCV_AIA_UNDEF_ADDR) + return -EINVAL; + + /* Initialize APLIC */ + ret = kvm_riscv_aia_aplic_init(kvm); + if (ret) + return ret; + + /* Iterate over each VCPU */ + kvm_for_each_vcpu(idx, vcpu, kvm) { + vaia = &vcpu->arch.aia_context; + + /* IMSIC base is required */ + if (vaia->imsic_addr == KVM_RISCV_AIA_UNDEF_ADDR) { + ret = -EINVAL; + goto fail_cleanup_imsics; + } + + /* All IMSICs should have matching base PPN */ + if (base_ppn == KVM_RISCV_AIA_UNDEF_ADDR) + base_ppn = aia_imsic_ppn(aia, vaia->imsic_addr); + if (base_ppn != aia_imsic_ppn(aia, vaia->imsic_addr)) { + ret = -EINVAL; + goto fail_cleanup_imsics; + } + + /* Update HART index of the IMSIC based on IMSIC base */ + vaia->hart_index = aia_imsic_hart_index(aia, + vaia->imsic_addr); + + /* Initialize IMSIC for this VCPU */ + ret = kvm_riscv_vcpu_aia_imsic_init(vcpu); + if (ret) + goto fail_cleanup_imsics; + } + + /* Set the initialized flag */ + kvm->arch.aia.initialized = true; + + return 0; + +fail_cleanup_imsics: + for (i = idx - 1; i >= 0; i--) { + vcpu = kvm_get_vcpu(kvm, i); + if (!vcpu) + continue; + kvm_riscv_vcpu_aia_imsic_cleanup(vcpu); + } + kvm_riscv_aia_aplic_cleanup(kvm); + return ret; +} + +static int aia_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + u32 nr; + u64 addr; + int nr_vcpus, r = -ENXIO; + unsigned long type = (unsigned long)attr->attr; + void __user *uaddr = (void __user *)(long)attr->addr; + + switch (attr->group) { + case KVM_DEV_RISCV_AIA_GRP_CONFIG: + if (copy_from_user(&nr, uaddr, sizeof(nr))) + return -EFAULT; + + mutex_lock(&dev->kvm->lock); + r = aia_config(dev->kvm, type, &nr, true); + mutex_unlock(&dev->kvm->lock); + + break; + + case KVM_DEV_RISCV_AIA_GRP_ADDR: + if (copy_from_user(&addr, uaddr, sizeof(addr))) + return -EFAULT; + + nr_vcpus = atomic_read(&dev->kvm->online_vcpus); + mutex_lock(&dev->kvm->lock); + if (type == KVM_DEV_RISCV_AIA_ADDR_APLIC) + r = aia_aplic_addr(dev->kvm, &addr, true); + else if (type < KVM_DEV_RISCV_AIA_ADDR_IMSIC(nr_vcpus)) + r = aia_imsic_addr(dev->kvm, &addr, + type - KVM_DEV_RISCV_AIA_ADDR_IMSIC(0), true); + mutex_unlock(&dev->kvm->lock); + + break; + + case KVM_DEV_RISCV_AIA_GRP_CTRL: + switch (type) { + case KVM_DEV_RISCV_AIA_CTRL_INIT: + mutex_lock(&dev->kvm->lock); + r = aia_init(dev->kvm); + mutex_unlock(&dev->kvm->lock); + break; + } + + break; + } + + return r; +} + +static int aia_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + u32 nr; + u64 addr; + int nr_vcpus, r = -ENXIO; + void __user *uaddr = (void __user *)(long)attr->addr; + unsigned long type = (unsigned long)attr->attr; + + switch (attr->group) { + case KVM_DEV_RISCV_AIA_GRP_CONFIG: + if (copy_from_user(&nr, uaddr, sizeof(nr))) + return -EFAULT; + + mutex_lock(&dev->kvm->lock); + r = aia_config(dev->kvm, type, &nr, false); + mutex_unlock(&dev->kvm->lock); + if (r) + return r; + + if (copy_to_user(uaddr, &nr, sizeof(nr))) + return -EFAULT; + + break; + case KVM_DEV_RISCV_AIA_GRP_ADDR: + if (copy_from_user(&addr, uaddr, sizeof(addr))) + return -EFAULT; + + nr_vcpus = atomic_read(&dev->kvm->online_vcpus); + mutex_lock(&dev->kvm->lock); + if (type == KVM_DEV_RISCV_AIA_ADDR_APLIC) + r = aia_aplic_addr(dev->kvm, &addr, false); + else if (type < KVM_DEV_RISCV_AIA_ADDR_IMSIC(nr_vcpus)) + r = aia_imsic_addr(dev->kvm, &addr, + type - KVM_DEV_RISCV_AIA_ADDR_IMSIC(0), false); + mutex_unlock(&dev->kvm->lock); + if (r) + return r; + + if (copy_to_user(uaddr, &addr, sizeof(addr))) + return -EFAULT; + + break; + } + + return r; +} + +static int aia_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + int nr_vcpus; + + switch (attr->group) { + case KVM_DEV_RISCV_AIA_GRP_CONFIG: + switch (attr->attr) { + case KVM_DEV_RISCV_AIA_CONFIG_MODE: + case KVM_DEV_RISCV_AIA_CONFIG_IDS: + case KVM_DEV_RISCV_AIA_CONFIG_SRCS: + case KVM_DEV_RISCV_AIA_CONFIG_GROUP_BITS: + case KVM_DEV_RISCV_AIA_CONFIG_GROUP_SHIFT: + case KVM_DEV_RISCV_AIA_CONFIG_HART_BITS: + case KVM_DEV_RISCV_AIA_CONFIG_GUEST_BITS: + return 0; + } + break; + case KVM_DEV_RISCV_AIA_GRP_ADDR: + nr_vcpus = atomic_read(&dev->kvm->online_vcpus); + if (attr->attr == KVM_DEV_RISCV_AIA_ADDR_APLIC) + return 0; + else if (attr->attr < KVM_DEV_RISCV_AIA_ADDR_IMSIC(nr_vcpus)) + return 0; + break; + case KVM_DEV_RISCV_AIA_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_RISCV_AIA_CTRL_INIT: + return 0; + } + break; + } + + return -ENXIO; +} + +struct kvm_device_ops kvm_riscv_aia_device_ops = { + .name = "kvm-riscv-aia", + .create = aia_create, + .destroy = aia_destroy, + .set_attr = aia_set_attr, + .get_attr = aia_get_attr, + .has_attr = aia_has_attr, +}; + +int kvm_riscv_vcpu_aia_update(struct kvm_vcpu *vcpu) +{ + /* Proceed only if AIA was initialized successfully */ + if (!kvm_riscv_aia_initialized(vcpu->kvm)) + return 1; + + /* Update the IMSIC HW state before entering guest mode */ + return kvm_riscv_vcpu_aia_imsic_update(vcpu); +} + +void kvm_riscv_vcpu_aia_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_aia_csr *csr = &vcpu->arch.aia_context.guest_csr; + struct kvm_vcpu_aia_csr *reset_csr = + &vcpu->arch.aia_context.guest_reset_csr; + + if (!kvm_riscv_aia_available()) + return; + memcpy(csr, reset_csr, sizeof(*csr)); + + /* Proceed only if AIA was initialized successfully */ + if (!kvm_riscv_aia_initialized(vcpu->kvm)) + return; + + /* Reset the IMSIC context */ + kvm_riscv_vcpu_aia_imsic_reset(vcpu); +} + +int kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_aia *vaia = &vcpu->arch.aia_context; + + if (!kvm_riscv_aia_available()) + return 0; + + /* + * We don't do any memory allocations over here because these + * will be done after AIA device is initialized by the user-space. + * + * Refer, aia_init() implementation for more details. + */ + + /* Initialize default values in AIA vcpu context */ + vaia->imsic_addr = KVM_RISCV_AIA_UNDEF_ADDR; + vaia->hart_index = vcpu->vcpu_idx; + + return 0; +} + +void kvm_riscv_vcpu_aia_deinit(struct kvm_vcpu *vcpu) +{ + /* Proceed only if AIA was initialized successfully */ + if (!kvm_riscv_aia_initialized(vcpu->kvm)) + return; + + /* Cleanup IMSIC context */ + kvm_riscv_vcpu_aia_imsic_cleanup(vcpu); +} + +int kvm_riscv_aia_inject_msi_by_id(struct kvm *kvm, u32 hart_index, + u32 guest_index, u32 iid) +{ + unsigned long idx; + struct kvm_vcpu *vcpu; + + /* Proceed only if AIA was initialized successfully */ + if (!kvm_riscv_aia_initialized(kvm)) + return -EBUSY; + + /* Inject MSI to matching VCPU */ + kvm_for_each_vcpu(idx, vcpu, kvm) { + if (vcpu->arch.aia_context.hart_index == hart_index) + return kvm_riscv_vcpu_aia_imsic_inject(vcpu, + guest_index, + 0, iid); + } + + return 0; +} + +int kvm_riscv_aia_inject_msi(struct kvm *kvm, struct kvm_msi *msi) +{ + gpa_t tppn, ippn; + unsigned long idx; + struct kvm_vcpu *vcpu; + u32 g, toff, iid = msi->data; + struct kvm_aia *aia = &kvm->arch.aia; + gpa_t target = (((gpa_t)msi->address_hi) << 32) | msi->address_lo; + + /* Proceed only if AIA was initialized successfully */ + if (!kvm_riscv_aia_initialized(kvm)) + return -EBUSY; + + /* Convert target address to target PPN */ + tppn = target >> IMSIC_MMIO_PAGE_SHIFT; + + /* Extract and clear Guest ID from target PPN */ + g = tppn & (BIT(aia->nr_guest_bits) - 1); + tppn &= ~((gpa_t)(BIT(aia->nr_guest_bits) - 1)); + + /* Inject MSI to matching VCPU */ + kvm_for_each_vcpu(idx, vcpu, kvm) { + ippn = vcpu->arch.aia_context.imsic_addr >> + IMSIC_MMIO_PAGE_SHIFT; + if (ippn == tppn) { + toff = target & (IMSIC_MMIO_PAGE_SZ - 1); + return kvm_riscv_vcpu_aia_imsic_inject(vcpu, g, + toff, iid); + } + } + + return 0; +} + +int kvm_riscv_aia_inject_irq(struct kvm *kvm, unsigned int irq, bool level) +{ + /* Proceed only if AIA was initialized successfully */ + if (!kvm_riscv_aia_initialized(kvm)) + return -EBUSY; + + /* Inject interrupt level change in APLIC */ + return kvm_riscv_aia_aplic_inject(kvm, irq, level); +} + +void kvm_riscv_aia_init_vm(struct kvm *kvm) +{ + struct kvm_aia *aia = &kvm->arch.aia; + + if (!kvm_riscv_aia_available()) + return; + + /* + * We don't do any memory allocations over here because these + * will be done after AIA device is initialized by the user-space. + * + * Refer, aia_init() implementation for more details. + */ + + /* Initialize default values in AIA global context */ + aia->mode = (kvm_riscv_aia_nr_hgei) ? + KVM_DEV_RISCV_AIA_MODE_AUTO : KVM_DEV_RISCV_AIA_MODE_EMUL; + aia->nr_ids = kvm_riscv_aia_max_ids - 1; + aia->nr_sources = 0; + aia->nr_group_bits = 0; + aia->nr_group_shift = KVM_DEV_RISCV_AIA_GROUP_SHIFT_MIN; + aia->nr_hart_bits = 0; + aia->nr_guest_bits = 0; + aia->aplic_addr = KVM_RISCV_AIA_UNDEF_ADDR; +} + +void kvm_riscv_aia_destroy_vm(struct kvm *kvm) +{ + /* Proceed only if AIA was initialized successfully */ + if (!kvm_riscv_aia_initialized(kvm)) + return; + + /* Cleanup APLIC context */ + kvm_riscv_aia_aplic_cleanup(kvm); +} diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 737318b1c1d9..27ccd07898e1 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1442,6 +1442,8 @@ enum kvm_device_type { #define KVM_DEV_TYPE_XIVE KVM_DEV_TYPE_XIVE KVM_DEV_TYPE_ARM_PV_TIME, #define KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_ARM_PV_TIME + KVM_DEV_TYPE_RISCV_AIA, +#define KVM_DEV_TYPE_RISCV_AIA KVM_DEV_TYPE_RISCV_AIA KVM_DEV_TYPE_MAX, }; -- cgit v1.2.3 From 884ad5c52da253e5d38f947cd8d1d9412a47429c Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Mon, 19 Jun 2023 17:36:26 +1000 Subject: powerpc/ptrace: Expose DEXCR and HDEXCR registers to ptrace The DEXCR register is of interest when ptracing processes. Currently it is static, but eventually will be dynamically controllable by a process. If a process can control its own, then it is useful for it to be ptrace-able to (e.g., for checkpoint-restore functionality). It is also relevant to core dumps (the NPHIE aspect in particular), which use the ptrace mechanism (or is it the other way around?) to decide what to dump. The HDEXCR is useful here too, as the NPHIE aspect may be set in the HDEXCR without being set in the DEXCR. Although the HDEXCR is per-cpu and we don't track it in the task struct (it's useless in normal operation), it would be difficult to imagine why a hypervisor would set it to different values within a guest. A hypervisor cannot safely set NPHIE differently at least, as that would break programs. Expose a read-only view of the userspace DEXCR and HDEXCR to ptrace. The HDEXCR is always readonly, and is useful for diagnosing the core dumps (as the HDEXCR may set NPHIE without the DEXCR setting it). Signed-off-by: Benjamin Gray Reviewed-by: Russell Currey [mpe: Use lower_32_bits() rather than open coding] Signed-off-by: Michael Ellerman Link: https://msgid.link/20230616034846.311705-7-bgray@linux.ibm.com --- arch/powerpc/include/uapi/asm/elf.h | 1 + arch/powerpc/kernel/ptrace/ptrace-decl.h | 1 + arch/powerpc/kernel/ptrace/ptrace-view.c | 36 +++++++++++++++++++++++++++++++- include/uapi/linux/elf.h | 1 + 4 files changed, 38 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/arch/powerpc/include/uapi/asm/elf.h b/arch/powerpc/include/uapi/asm/elf.h index dbc4a5b8d02d..e0d323c808dd 100644 --- a/arch/powerpc/include/uapi/asm/elf.h +++ b/arch/powerpc/include/uapi/asm/elf.h @@ -98,6 +98,7 @@ #define ELF_NEBB 3 /* includes ebbrr, ebbhr, bescr */ #define ELF_NPMU 5 /* includes siar, sdar, sier, mmcr2, mmcr0 */ #define ELF_NPKEY 3 /* includes amr, iamr, uamor */ +#define ELF_NDEXCR 2 /* includes dexcr, hdexcr */ typedef unsigned long elf_greg_t64; typedef elf_greg_t64 elf_gregset_t64[ELF_NGREG]; diff --git a/arch/powerpc/kernel/ptrace/ptrace-decl.h b/arch/powerpc/kernel/ptrace/ptrace-decl.h index 463a63eb8cc7..998a84f64804 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-decl.h +++ b/arch/powerpc/kernel/ptrace/ptrace-decl.h @@ -57,6 +57,7 @@ enum powerpc_regset { REGSET_TAR, /* TAR register */ REGSET_EBB, /* EBB registers */ REGSET_PMR, /* Performance Monitor Registers */ + REGSET_DEXCR, /* DEXCR registers */ #endif #ifdef CONFIG_PPC_MEM_KEYS REGSET_PKEY, /* AMR register */ diff --git a/arch/powerpc/kernel/ptrace/ptrace-view.c b/arch/powerpc/kernel/ptrace/ptrace-view.c index 5fff0d04b23f..f1032fe626f4 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-view.c +++ b/arch/powerpc/kernel/ptrace/ptrace-view.c @@ -454,7 +454,36 @@ static int pmu_set(struct task_struct *target, const struct user_regset *regset, 5 * sizeof(unsigned long)); return ret; } -#endif + +static int dexcr_active(struct task_struct *target, const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + return -ENODEV; + + return regset->n; +} + +static int dexcr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + return -ENODEV; + + /* + * The DEXCR is currently static across all CPUs, so we don't + * store the target's value anywhere, but the static value + * will also be correct. + */ + membuf_store(&to, (u64)lower_32_bits(DEXCR_INIT)); + + /* + * Technically the HDEXCR is per-cpu, but a hypervisor can't reasonably + * change it between CPUs of the same guest. + */ + return membuf_store(&to, (u64)lower_32_bits(mfspr(SPRN_HDEXCR_RO))); +} + +#endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC_MEM_KEYS static int pkey_active(struct task_struct *target, const struct user_regset *regset) @@ -615,6 +644,11 @@ static const struct user_regset native_regsets[] = { .size = sizeof(u64), .align = sizeof(u64), .active = pmu_active, .regset_get = pmu_get, .set = pmu_set }, + [REGSET_DEXCR] = { + .core_note_type = NT_PPC_DEXCR, .n = ELF_NDEXCR, + .size = sizeof(u64), .align = sizeof(u64), + .active = dexcr_active, .regset_get = dexcr_get + }, #endif #ifdef CONFIG_PPC_MEM_KEYS [REGSET_PKEY] = { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index ac3da855fb19..cfa31f1eb5d7 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -403,6 +403,7 @@ typedef struct elf64_shdr { #define NT_PPC_TM_CPPR 0x10e /* TM checkpointed Program Priority Register */ #define NT_PPC_TM_CDSCR 0x10f /* TM checkpointed Data Stream Control Register */ #define NT_PPC_PKEY 0x110 /* Memory Protection Keys registers */ +#define NT_PPC_DEXCR 0x111 /* PowerPC DEXCR registers */ #define NT_386_TLS 0x200 /* i386 TLS slots (struct user_desc) */ #define NT_386_IOPERM 0x201 /* x86 io permission bitmap (1=deny) */ #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ -- cgit v1.2.3 From 97228ca375c78bfd960767dcd4919c981add306f Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Mon, 19 Jun 2023 17:36:26 +1000 Subject: powerpc/ptrace: Expose HASHKEYR register to ptrace The HASHKEYR register contains a secret per-process key to enable unique hashes per process. In general it should not be exposed to userspace at all and a regular process has no need to know its key. However, checkpoint restore in userspace (CRIU) functionality requires that a process be able to set the HASHKEYR of another process, otherwise existing hashes on the stack would be invalidated by a new random key. Exposing HASHKEYR in this way also makes it appear in core dumps, which is a security concern. Multiple threads may share a key, for example just after a fork() call, where the kernel cannot know if the child is going to return back along the parent's stack. If such a thread is coerced into making a core dump, then the HASHKEYR value will be readable and able to be used against all other threads sharing that key, effectively undoing any protection offered by hashst/hashchk. Therefore we expose HASHKEYR to ptrace when CONFIG_CHECKPOINT_RESTORE is enabled, providing a choice of increased security or migratable ROP protected processes. This is similar to how ARM exposes its PAC keys. Signed-off-by: Benjamin Gray Reviewed-by: Russell Currey Signed-off-by: Michael Ellerman Link: https://msgid.link/20230616034846.311705-8-bgray@linux.ibm.com --- arch/powerpc/include/uapi/asm/elf.h | 1 + arch/powerpc/kernel/ptrace/ptrace-decl.h | 3 +++ arch/powerpc/kernel/ptrace/ptrace-view.c | 36 ++++++++++++++++++++++++++++++++ include/uapi/linux/elf.h | 1 + 4 files changed, 41 insertions(+) (limited to 'include/uapi/linux') diff --git a/arch/powerpc/include/uapi/asm/elf.h b/arch/powerpc/include/uapi/asm/elf.h index e0d323c808dd..a5377f494fa3 100644 --- a/arch/powerpc/include/uapi/asm/elf.h +++ b/arch/powerpc/include/uapi/asm/elf.h @@ -99,6 +99,7 @@ #define ELF_NPMU 5 /* includes siar, sdar, sier, mmcr2, mmcr0 */ #define ELF_NPKEY 3 /* includes amr, iamr, uamor */ #define ELF_NDEXCR 2 /* includes dexcr, hdexcr */ +#define ELF_NHASHKEYR 1 /* includes hashkeyr */ typedef unsigned long elf_greg_t64; typedef elf_greg_t64 elf_gregset_t64[ELF_NGREG]; diff --git a/arch/powerpc/kernel/ptrace/ptrace-decl.h b/arch/powerpc/kernel/ptrace/ptrace-decl.h index 998a84f64804..4171a5727197 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-decl.h +++ b/arch/powerpc/kernel/ptrace/ptrace-decl.h @@ -58,6 +58,9 @@ enum powerpc_regset { REGSET_EBB, /* EBB registers */ REGSET_PMR, /* Performance Monitor Registers */ REGSET_DEXCR, /* DEXCR registers */ +#ifdef CONFIG_CHECKPOINT_RESTORE + REGSET_HASHKEYR, /* HASHKEYR register */ +#endif #endif #ifdef CONFIG_PPC_MEM_KEYS REGSET_PKEY, /* AMR register */ diff --git a/arch/powerpc/kernel/ptrace/ptrace-view.c b/arch/powerpc/kernel/ptrace/ptrace-view.c index f1032fe626f4..3910cd7bb2d9 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-view.c +++ b/arch/powerpc/kernel/ptrace/ptrace-view.c @@ -483,6 +483,35 @@ static int dexcr_get(struct task_struct *target, const struct user_regset *regse return membuf_store(&to, (u64)lower_32_bits(mfspr(SPRN_HDEXCR_RO))); } +#ifdef CONFIG_CHECKPOINT_RESTORE +static int hashkeyr_active(struct task_struct *target, const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + return -ENODEV; + + return regset->n; +} + +static int hashkeyr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + return -ENODEV; + + return membuf_store(&to, target->thread.hashkeyr); +} + +static int hashkeyr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, const void *kbuf, + const void __user *ubuf) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + return -ENODEV; + + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.hashkeyr, + 0, sizeof(unsigned long)); +} +#endif /* CONFIG_CHECKPOINT_RESTORE */ #endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC_MEM_KEYS @@ -649,6 +678,13 @@ static const struct user_regset native_regsets[] = { .size = sizeof(u64), .align = sizeof(u64), .active = dexcr_active, .regset_get = dexcr_get }, +#ifdef CONFIG_CHECKPOINT_RESTORE + [REGSET_HASHKEYR] = { + .core_note_type = NT_PPC_HASHKEYR, .n = ELF_NHASHKEYR, + .size = sizeof(u64), .align = sizeof(u64), + .active = hashkeyr_active, .regset_get = hashkeyr_get, .set = hashkeyr_set + }, +#endif #endif #ifdef CONFIG_PPC_MEM_KEYS [REGSET_PKEY] = { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index cfa31f1eb5d7..b705b301d88f 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -404,6 +404,7 @@ typedef struct elf64_shdr { #define NT_PPC_TM_CDSCR 0x10f /* TM checkpointed Data Stream Control Register */ #define NT_PPC_PKEY 0x110 /* Memory Protection Keys registers */ #define NT_PPC_DEXCR 0x111 /* PowerPC DEXCR registers */ +#define NT_PPC_HASHKEYR 0x112 /* PowerPC HASHKEYR register */ #define NT_386_TLS 0x200 /* i386 TLS slots (struct user_desc) */ #define NT_386_IOPERM 0x201 /* x86 io permission bitmap (1=deny) */ #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ -- cgit v1.2.3 From 065563b20a664a6575dc158688dfb0e121c25b38 Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Fri, 17 Mar 2023 19:51:53 +0530 Subject: wifi: cfg80211/nl80211: Add support to indicate STA MLD setup links removal STA MLD setup links may get removed if AP MLD remove the corresponding affiliated APs with Multi-Link reconfiguration as described in P802.11be_D3.0, section 35.3.6.2.2 Removing affiliated APs. Currently, there is no support to notify such operation to cfg80211 and userspace. Add support for the drivers to indicate STA MLD setup links removal to cfg80211 and notify the same to userspace. Upon receiving such indication from the driver, clear the MLO links information of the removed links in the WDEV. Signed-off-by: Veerendranath Jakkam Link: https://lore.kernel.org/r/20230317142153.237900-1-quic_vjakkam@quicinc.com [rename function and attribute, fix kernel-doc] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 13 ++++++++ include/uapi/linux/nl80211.h | 7 +++++ net/wireless/core.h | 1 + net/wireless/nl80211.c | 70 ++++++++++++++++++++++++++++++++++++++++++++ net/wireless/sme.c | 15 ++++++++++ net/wireless/trace.h | 15 ++++++++++ 6 files changed, 121 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 9972de114d73..3a736f9286b0 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -9205,4 +9205,17 @@ static inline int cfg80211_color_change_notify(struct net_device *dev) bool cfg80211_valid_disable_subchannel_bitmap(u16 *bitmap, const struct cfg80211_chan_def *chandef); +/** + * cfg80211_links_removed - Notify about removed STA MLD setup links. + * @dev: network device. + * @link_mask: BIT mask of removed STA MLD setup link IDs. + * + * Inform cfg80211 and the userspace about removed STA MLD setup links due to + * AP MLD removing the corresponding affiliated APs with Multi-Link + * reconfiguration. Note that it's not valid to remove all links, in this + * case disconnect instead. + * Also note that the wdev mutex must be held. + */ +void cfg80211_links_removed(struct net_device *dev, u16 link_mask); + #endif /* __NET_CFG80211_H */ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 03939bdb0e48..3190d34269ef 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1309,6 +1309,11 @@ * The number of peers that HW timestamping can be enabled for concurrently * is indicated by %NL80211_ATTR_MAX_HW_TIMESTAMP_PEERS. * + * @NL80211_CMD_LINKS_REMOVED: Notify userspace about the removal of STA MLD + * setup links due to AP MLD removing the corresponding affiliated APs with + * Multi-Link reconfiguration. %NL80211_ATTR_MLO_LINKS is used to provide + * information about the removed STA MLD setup links. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1562,6 +1567,8 @@ enum nl80211_commands { NL80211_CMD_SET_HW_TIMESTAMP, + NL80211_CMD_LINKS_REMOVED, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ diff --git a/net/wireless/core.h b/net/wireless/core.h index 291c6d83d56f..8a807b609ef7 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -576,5 +576,6 @@ void cfg80211_remove_link(struct wireless_dev *wdev, unsigned int link_id); void cfg80211_remove_links(struct wireless_dev *wdev); int cfg80211_remove_virtual_intf(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); +void cfg80211_wdev_release_link_bsses(struct wireless_dev *wdev, u16 link_mask); #endif /* __NET_WIRELESS_CORE_H */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7b547aeb52f1..0da2e6a2a7ea 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -18288,6 +18288,76 @@ void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, nlmsg_free(msg); } +void cfg80211_links_removed(struct net_device *dev, u16 link_mask) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + struct nlattr *links; + void *hdr; + + ASSERT_WDEV_LOCK(wdev); + trace_cfg80211_links_removed(dev, link_mask); + + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && + wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) + return; + + if (WARN_ON(!wdev->valid_links || !link_mask || + (wdev->valid_links & link_mask) != link_mask || + wdev->valid_links == link_mask)) + return; + + cfg80211_wdev_release_link_bsses(wdev, link_mask); + wdev->valid_links &= ~link_mask; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_LINKS_REMOVED); + if (!hdr) { + nlmsg_free(msg); + return; + } + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex)) + goto nla_put_failure; + + links = nla_nest_start(msg, NL80211_ATTR_MLO_LINKS); + if (!links) + goto nla_put_failure; + + while (link_mask) { + struct nlattr *link; + int link_id = __ffs(link_mask); + + link = nla_nest_start(msg, link_id + 1); + if (!link) + goto nla_put_failure; + + if (nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link_id)) + goto nla_put_failure; + + nla_nest_end(msg, link); + link_mask &= ~(1 << link_id); + } + + nla_nest_end(msg, links); + + genlmsg_end(msg, hdr); + + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, + NL80211_MCGRP_MLME, GFP_KERNEL); + return; + + nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_links_removed); + void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *bssid, gfp_t gfp) diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 247369004aaa..9bba233b5a6e 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -491,6 +491,21 @@ static void cfg80211_wdev_release_bsses(struct wireless_dev *wdev) } } +void cfg80211_wdev_release_link_bsses(struct wireless_dev *wdev, u16 link_mask) +{ + unsigned int link; + + for_each_valid_link(wdev, link) { + if (!wdev->links[link].client.current_bss || + !(link_mask & BIT(link))) + continue; + cfg80211_unhold_bss(wdev->links[link].client.current_bss); + cfg80211_put_bss(wdev->wiphy, + &wdev->links[link].client.current_bss->pub); + wdev->links[link].client.current_bss = NULL; + } +} + static int cfg80211_sme_get_conn_ies(struct wireless_dev *wdev, const u8 *ies, size_t ies_len, const u8 **out_ies, size_t *out_ies_len) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index e63990b81249..617c0d0dfa96 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -3966,6 +3966,21 @@ TRACE_EVENT(rdev_set_hw_timestamp, __entry->enable) ); +TRACE_EVENT(cfg80211_links_removed, + TP_PROTO(struct net_device *netdev, u16 link_mask), + TP_ARGS(netdev, link_mask), + TP_STRUCT__entry( + NETDEV_ENTRY + __field(u16, link_mask) + ), + TP_fast_assign( + NETDEV_ASSIGN; + __entry->link_mask = link_mask; + ), + TP_printk(NETDEV_PR_FMT ", link_mask:%u", NETDEV_PR_ARG, + __entry->link_mask) +); + #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From c3b60ab7a4dff6e6e608e685b70ddc3d6b2aca81 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Mon, 12 Jun 2023 14:14:56 -0700 Subject: ptp: Add .getmaxphase callback to ptp_clock_info Enables advertisement of the maximum offset supported by the phase control functionality of PHCs. The callback is used to return an error if an offset not supported by the PHC is used in ADJ_OFFSET. The ioctls PTP_CLOCK_GETCAPS and PTP_CLOCK_GETCAPS2 now advertise the maximum offset a PHC's phase control functionality is capable of supporting. Introduce new sysfs node, max_phase_adjustment. Cc: Jakub Kicinski Cc: Shuah Khan Cc: Richard Cochran Cc: Maciek Machnikowski Signed-off-by: Rahul Rameshbabu Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/ptp/ptp_chardev.c | 5 ++++- drivers/ptp/ptp_clock.c | 4 ++++ drivers/ptp/ptp_sysfs.c | 12 ++++++++++++ include/linux/ptp_clock_kernel.h | 5 +++++ include/uapi/linux/ptp_clock.h | 3 ++- tools/testing/selftests/ptp/testptp.c | 6 ++++-- 6 files changed, 31 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index af3bc65c4595..362bf756e6b7 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -136,7 +136,10 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) caps.pps = ptp->info->pps; caps.n_pins = ptp->info->n_pins; caps.cross_timestamping = ptp->info->getcrosststamp != NULL; - caps.adjust_phase = ptp->info->adjphase != NULL; + caps.adjust_phase = ptp->info->adjphase != NULL && + ptp->info->getmaxphase != NULL; + if (caps.adjust_phase) + caps.max_phase_adj = ptp->info->getmaxphase(ptp->info); if (copy_to_user((void __user *)arg, &caps, sizeof(caps))) err = -EFAULT; break; diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index 790f9250b381..80f74e38c2da 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -135,11 +135,15 @@ static int ptp_clock_adjtime(struct posix_clock *pc, struct __kernel_timex *tx) ptp->dialed_frequency = tx->freq; } else if (tx->modes & ADJ_OFFSET) { if (ops->adjphase) { + s32 max_phase_adj = ops->getmaxphase(ops); s32 offset = tx->offset; if (!(tx->modes & ADJ_NANO)) offset *= NSEC_PER_USEC; + if (offset > max_phase_adj || offset < -max_phase_adj) + return -ERANGE; + err = ops->adjphase(ops, offset); } } else if (tx->modes == 0) { diff --git a/drivers/ptp/ptp_sysfs.c b/drivers/ptp/ptp_sysfs.c index f30b0a439470..77219cdcd683 100644 --- a/drivers/ptp/ptp_sysfs.c +++ b/drivers/ptp/ptp_sysfs.c @@ -18,6 +18,17 @@ static ssize_t clock_name_show(struct device *dev, } static DEVICE_ATTR_RO(clock_name); +static ssize_t max_phase_adjustment_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct ptp_clock *ptp = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE - 1, "%d\n", + ptp->info->getmaxphase(ptp->info)); +} +static DEVICE_ATTR_RO(max_phase_adjustment); + #define PTP_SHOW_INT(name, var) \ static ssize_t var##_show(struct device *dev, \ struct device_attribute *attr, char *page) \ @@ -309,6 +320,7 @@ static struct attribute *ptp_attrs[] = { &dev_attr_clock_name.attr, &dev_attr_max_adjustment.attr, + &dev_attr_max_phase_adjustment.attr, &dev_attr_n_alarms.attr, &dev_attr_n_external_timestamps.attr, &dev_attr_n_periodic_outputs.attr, diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index f8e8443a8b35..1ef4e0f9bd2a 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -82,6 +82,10 @@ struct ptp_system_timestamp { * parameter delta: PHC servo phase adjustment target * in nanoseconds. * + * @getmaxphase: Advertises maximum offset that can be provided + * to the hardware clock's phase control functionality + * through adjphase. + * * @adjtime: Shifts the time of the hardware clock. * parameter delta: Desired change in nanoseconds. * @@ -171,6 +175,7 @@ struct ptp_clock_info { struct ptp_pin_desc *pin_config; int (*adjfine)(struct ptp_clock_info *ptp, long scaled_ppm); int (*adjphase)(struct ptp_clock_info *ptp, s32 phase); + s32 (*getmaxphase)(struct ptp_clock_info *ptp); int (*adjtime)(struct ptp_clock_info *ptp, s64 delta); int (*gettime64)(struct ptp_clock_info *ptp, struct timespec64 *ts); int (*gettimex64)(struct ptp_clock_info *ptp, struct timespec64 *ts, diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index 1d108d597f66..05cc35fc94ac 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -95,7 +95,8 @@ struct ptp_clock_caps { int cross_timestamping; /* Whether the clock supports adjust phase */ int adjust_phase; - int rsv[12]; /* Reserved for future use. */ + int max_phase_adj; /* Maximum phase adjustment in nanoseconds. */ + int rsv[11]; /* Reserved for future use. */ }; struct ptp_extts_request { diff --git a/tools/testing/selftests/ptp/testptp.c b/tools/testing/selftests/ptp/testptp.c index cc535f76db99..e9438a1862ad 100644 --- a/tools/testing/selftests/ptp/testptp.c +++ b/tools/testing/selftests/ptp/testptp.c @@ -292,7 +292,8 @@ int main(int argc, char *argv[]) " %d pulse per second\n" " %d programmable pins\n" " %d cross timestamping\n" - " %d adjust_phase\n", + " %d adjust_phase\n" + " %d maximum phase adjustment (ns)\n", caps.max_adj, caps.n_alarm, caps.n_ext_ts, @@ -300,7 +301,8 @@ int main(int argc, char *argv[]) caps.pps, caps.n_pins, caps.cross_timestamping, - caps.adjust_phase); + caps.adjust_phase, + caps.max_phase_adj); } } -- cgit v1.2.3 From 95a55437dc49fb3342c82e61f5472a71c63d9ed0 Mon Sep 17 00:00:00 2001 From: Michael Schmitz Date: Wed, 21 Jun 2023 08:17:24 +1200 Subject: block: change all __u32 annotations to __be32 in affs_hardblocks.h The Amiga partition parser module uses signed int for partition sector address and count, which will overflow for disks larger than 1 TB. Use u64 as type for sector address and size to allow using disks up to 2 TB without LBD support, and disks larger than 2 TB with LBD. The RBD format allows to specify disk sizes up to 2^128 bytes (though native OS limitations reduce this somewhat, to max 2^68 bytes), so check for u64 overflow carefully to protect against overflowing sector_t. This bug was reported originally in 2012, and the fix was created by the RDB author, Joanne Dow . A patch had been discussed and reviewed on linux-m68k at that time but never officially submitted (now resubmitted as patch 1 of this series). Patch 3 (this series) adds additional error checking and warning messages. One of the error checks now makes use of the previously unused rdb_CylBlocks field, which causes a 'sparse' warning (cast to restricted __be32). Annotate all 32 bit fields in affs_hardblocks.h as __be32, as the on-disk format of RDB and partition blocks is always big endian. Reported-by: Martin Steigerwald Closes: https://bugzilla.kernel.org/show_bug.cgi?id=43511 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Message-ID: <201206192146.09327.Martin@lichtvoll.de> Cc: # 5.2 Signed-off-by: Michael Schmitz Reviewed-by: Christoph Hellwig Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20230620201725.7020-3-schmitzmic@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/affs_hardblocks.h | 68 ++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 34 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/affs_hardblocks.h b/include/uapi/linux/affs_hardblocks.h index 5e2fb8481252..a5aff2eb5f70 100644 --- a/include/uapi/linux/affs_hardblocks.h +++ b/include/uapi/linux/affs_hardblocks.h @@ -7,42 +7,42 @@ /* Just the needed definitions for the RDB of an Amiga HD. */ struct RigidDiskBlock { - __u32 rdb_ID; + __be32 rdb_ID; __be32 rdb_SummedLongs; - __s32 rdb_ChkSum; - __u32 rdb_HostID; + __be32 rdb_ChkSum; + __be32 rdb_HostID; __be32 rdb_BlockBytes; - __u32 rdb_Flags; - __u32 rdb_BadBlockList; + __be32 rdb_Flags; + __be32 rdb_BadBlockList; __be32 rdb_PartitionList; - __u32 rdb_FileSysHeaderList; - __u32 rdb_DriveInit; - __u32 rdb_Reserved1[6]; - __u32 rdb_Cylinders; - __u32 rdb_Sectors; - __u32 rdb_Heads; - __u32 rdb_Interleave; - __u32 rdb_Park; - __u32 rdb_Reserved2[3]; - __u32 rdb_WritePreComp; - __u32 rdb_ReducedWrite; - __u32 rdb_StepRate; - __u32 rdb_Reserved3[5]; - __u32 rdb_RDBBlocksLo; - __u32 rdb_RDBBlocksHi; - __u32 rdb_LoCylinder; - __u32 rdb_HiCylinder; - __u32 rdb_CylBlocks; - __u32 rdb_AutoParkSeconds; - __u32 rdb_HighRDSKBlock; - __u32 rdb_Reserved4; + __be32 rdb_FileSysHeaderList; + __be32 rdb_DriveInit; + __be32 rdb_Reserved1[6]; + __be32 rdb_Cylinders; + __be32 rdb_Sectors; + __be32 rdb_Heads; + __be32 rdb_Interleave; + __be32 rdb_Park; + __be32 rdb_Reserved2[3]; + __be32 rdb_WritePreComp; + __be32 rdb_ReducedWrite; + __be32 rdb_StepRate; + __be32 rdb_Reserved3[5]; + __be32 rdb_RDBBlocksLo; + __be32 rdb_RDBBlocksHi; + __be32 rdb_LoCylinder; + __be32 rdb_HiCylinder; + __be32 rdb_CylBlocks; + __be32 rdb_AutoParkSeconds; + __be32 rdb_HighRDSKBlock; + __be32 rdb_Reserved4; char rdb_DiskVendor[8]; char rdb_DiskProduct[16]; char rdb_DiskRevision[4]; char rdb_ControllerVendor[8]; char rdb_ControllerProduct[16]; char rdb_ControllerRevision[4]; - __u32 rdb_Reserved5[10]; + __be32 rdb_Reserved5[10]; }; #define IDNAME_RIGIDDISK 0x5244534B /* "RDSK" */ @@ -50,16 +50,16 @@ struct RigidDiskBlock { struct PartitionBlock { __be32 pb_ID; __be32 pb_SummedLongs; - __s32 pb_ChkSum; - __u32 pb_HostID; + __be32 pb_ChkSum; + __be32 pb_HostID; __be32 pb_Next; - __u32 pb_Flags; - __u32 pb_Reserved1[2]; - __u32 pb_DevFlags; + __be32 pb_Flags; + __be32 pb_Reserved1[2]; + __be32 pb_DevFlags; __u8 pb_DriveName[32]; - __u32 pb_Reserved2[15]; + __be32 pb_Reserved2[15]; __be32 pb_Environment[17]; - __u32 pb_EReserved[15]; + __be32 pb_EReserved[15]; }; #define IDNAME_PARTITION 0x50415254 /* "PART" */ -- cgit v1.2.3 From 6c5b9a3296e146cc74b1d006c6a546ea92534ade Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 19 Jun 2023 16:26:53 +0300 Subject: wifi: nl80211/reg: add no-EHT regulatory flag This just propagates to the channel flags, like no-HE and similar other flags before it. Signed-off-by: Johannes Berg Signed-off-by: Gregory Greenman Link: https://lore.kernel.org/r/20230619161906.74ce2983aed8.Ifa343ba89c11760491daad5aee5a81209d5735a7@changeid Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ net/wireless/reg.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 3190d34269ef..88eb85c63029 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4450,6 +4450,7 @@ enum nl80211_sched_scan_match_attr { * @NL80211_RRF_NO_160MHZ: 160MHz operation not allowed * @NL80211_RRF_NO_HE: HE operation not allowed * @NL80211_RRF_NO_320MHZ: 320MHz operation not allowed + * @NL80211_RRF_NO_EHT: EHT operation not allowed */ enum nl80211_reg_rule_flags { NL80211_RRF_NO_OFDM = 1<<0, @@ -4469,6 +4470,7 @@ enum nl80211_reg_rule_flags { NL80211_RRF_NO_160MHZ = 1<<16, NL80211_RRF_NO_HE = 1<<17, NL80211_RRF_NO_320MHZ = 1<<18, + NL80211_RRF_NO_EHT = 1<<19, }; #define NL80211_RRF_PASSIVE_SCAN NL80211_RRF_NO_IR diff --git a/net/wireless/reg.c b/net/wireless/reg.c index f9e03850d71b..0317cf9da307 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -5,7 +5,7 @@ * Copyright 2008-2011 Luis R. Rodriguez * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2022 Intel Corporation + * Copyright (C) 2018 - 2023 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -1587,6 +1587,8 @@ static u32 map_regdom_flags(u32 rd_flags) channel_flags |= IEEE80211_CHAN_NO_HE; if (rd_flags & NL80211_RRF_NO_320MHZ) channel_flags |= IEEE80211_CHAN_NO_320MHZ; + if (rd_flags & NL80211_RRF_NO_EHT) + channel_flags |= IEEE80211_CHAN_NO_EHT; return channel_flags; } -- cgit v1.2.3 From 38967f424b5be79c4c676712e5640d846efd07e3 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Jun 2023 18:30:15 +0200 Subject: mptcp: track some aggregate data counters Currently there are no data transfer counters accounting for all the subflows used by a given MPTCP socket. The user-space can compute such figures aggregating the subflow info, but that is inaccurate if any subflow is closed before the MPTCP socket itself. Add the new counters in the MPTCP socket itself and expose them via the existing diag and sockopt. While touching mptcp_diag_fill_info(), acquire the relevant locks before fetching the msk data, to ensure better data consistency Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/385 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 5 +++++ net/mptcp/options.c | 10 ++++++++-- net/mptcp/protocol.c | 11 ++++++++++- net/mptcp/protocol.h | 4 ++++ net/mptcp/sockopt.c | 25 ++++++++++++++++++++----- 5 files changed, 47 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 32af2d278cb4..a124be6ebbba 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -123,6 +123,11 @@ struct mptcp_info { __u8 mptcpi_local_addr_used; __u8 mptcpi_local_addr_max; __u8 mptcpi_csum_enabled; + __u32 mptcpi_retransmits; + __u64 mptcpi_bytes_retrans; + __u64 mptcpi_bytes_sent; + __u64 mptcpi_bytes_received; + __u64 mptcpi_bytes_acked; }; /* diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 4bdcd2b326bd..c254accb14de 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1026,6 +1026,12 @@ u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq) return cur_seq; } +static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una) +{ + msk->bytes_acked += new_snd_una - msk->snd_una; + msk->snd_una = new_snd_una; +} + static void ack_update_msk(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_options_received *mp_opt) @@ -1057,7 +1063,7 @@ static void ack_update_msk(struct mptcp_sock *msk, __mptcp_check_push(sk, ssk); if (after64(new_snd_una, old_snd_una)) { - msk->snd_una = new_snd_una; + __mptcp_snd_una_update(msk, new_snd_una); __mptcp_data_acked(sk); } mptcp_data_unlock(sk); @@ -1123,7 +1129,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) /* on fallback we just need to ignore the msk-level snd_una, as * this is really plain TCP */ - msk->snd_una = READ_ONCE(msk->snd_nxt); + __mptcp_snd_una_update(msk, READ_ONCE(msk->snd_nxt)); __mptcp_data_acked(subflow->conn); mptcp_data_unlock(subflow->conn); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 9c756d675d4d..d5b8e488bce1 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -377,6 +377,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { /* in sequence */ + msk->bytes_received += copy_len; WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len); tail = skb_peek_tail(&sk->sk_receive_queue); if (tail && mptcp_try_coalesce(sk, tail, skb)) @@ -760,6 +761,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk) MPTCP_SKB_CB(skb)->map_seq += delta; __skb_queue_tail(&sk->sk_receive_queue, skb); } + msk->bytes_received += end_seq - msk->ack_seq; msk->ack_seq = end_seq; moved = true; } @@ -1531,8 +1533,10 @@ static void mptcp_update_post_push(struct mptcp_sock *msk, * that has been handed to the subflow for transmission * and skip update in case it was old dfrag. */ - if (likely(after64(snd_nxt_new, msk->snd_nxt))) + if (likely(after64(snd_nxt_new, msk->snd_nxt))) { + msk->bytes_sent += snd_nxt_new - msk->snd_nxt; msk->snd_nxt = snd_nxt_new; + } } void mptcp_check_and_set_pending(struct sock *sk) @@ -2590,6 +2594,7 @@ static void __mptcp_retrans(struct sock *sk) } if (copied) { dfrag->already_sent = max(dfrag->already_sent, info.sent); + msk->bytes_retrans += copied; tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); WRITE_ONCE(msk->allow_infinite_fallback, false); @@ -3102,6 +3107,10 @@ static int mptcp_disconnect(struct sock *sk, int flags) WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); mptcp_pm_data_reset(msk); mptcp_ca_reset(sk); + msk->bytes_acked = 0; + msk->bytes_received = 0; + msk->bytes_sent = 0; + msk->bytes_retrans = 0; WRITE_ONCE(sk->sk_shutdown, 0); sk_error_report(sk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 47b46602870e..27adfcc5aaa2 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -262,10 +262,13 @@ struct mptcp_sock { u64 local_key; u64 remote_key; u64 write_seq; + u64 bytes_sent; u64 snd_nxt; + u64 bytes_received; u64 ack_seq; atomic64_t rcv_wnd_sent; u64 rcv_data_fin_seq; + u64 bytes_retrans; int rmem_fwd_alloc; struct sock *last_snd; int snd_burst; @@ -274,6 +277,7 @@ struct mptcp_sock { * recovery related fields are under data_lock * protection */ + u64 bytes_acked; u64 snd_una; u64 wnd_end; unsigned long timer_ival; diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index e172a5848b0d..fa5055d5b029 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -889,7 +889,9 @@ out: void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) { + struct sock *sk = (struct sock *)msk; u32 flags = 0; + bool slow; memset(info, 0, sizeof(*info)); @@ -898,6 +900,9 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used); + if (inet_sk_state_load(sk) == TCP_LISTEN) + return; + /* The following limits only make sense for the in-kernel PM */ if (mptcp_pm_is_kernel(msk)) { info->mptcpi_subflows_max = @@ -915,11 +920,21 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) if (READ_ONCE(msk->can_ack)) flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED; info->mptcpi_flags = flags; - info->mptcpi_token = READ_ONCE(msk->token); - info->mptcpi_write_seq = READ_ONCE(msk->write_seq); - info->mptcpi_snd_una = READ_ONCE(msk->snd_una); - info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); - info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled); + mptcp_data_lock(sk); + info->mptcpi_snd_una = msk->snd_una; + info->mptcpi_rcv_nxt = msk->ack_seq; + info->mptcpi_bytes_acked = msk->bytes_acked; + mptcp_data_unlock(sk); + + slow = lock_sock_fast(sk); + info->mptcpi_csum_enabled = msk->csum_enabled; + info->mptcpi_token = msk->token; + info->mptcpi_write_seq = msk->write_seq; + info->mptcpi_retransmits = inet_csk(sk)->icsk_retransmits; + info->mptcpi_bytes_sent = msk->bytes_sent; + info->mptcpi_bytes_received = msk->bytes_received; + info->mptcpi_bytes_retrans = msk->bytes_retrans; + unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(mptcp_diag_fill_info); -- cgit v1.2.3 From 492432074e4fce4f8880213bf009b47adbf94a3a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 20 Jun 2023 18:30:18 +0200 Subject: mptcp: introduce MPTCP_FULL_INFO getsockopt Some user-space applications want to monitor the subflows utilization. Dumping the per subflow tcp_info is not enough, as the PM could close and re-create the subflows under-the-hood, fooling the accounting. Even checking the src/dst addresses used by each subflow could not be enough, because new subflows could re-use the same address/port of the just closed one. This patch introduces a new socket option, allow dumping all the relevant information all-at-once (everything, everywhere...), in a consistent manner. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/388 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 24 +++++++++ net/mptcp/sockopt.c | 127 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 149 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index a124be6ebbba..ee9c49f949a2 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -249,9 +249,33 @@ struct mptcp_subflow_addrs { }; }; +struct mptcp_subflow_info { + __u32 id; + struct mptcp_subflow_addrs addrs; +}; + +struct mptcp_full_info { + __u32 size_tcpinfo_kernel; /* must be 0, set by kernel */ + __u32 size_tcpinfo_user; + __u32 size_sfinfo_kernel; /* must be 0, set by kernel */ + __u32 size_sfinfo_user; + __u32 num_subflows; /* must be 0, set by kernel (real subflow count) */ + __u32 size_arrays_user; /* max subflows that userspace is interested in; + * the buffers at subflow_info/tcp_info + * are respectively at least: + * size_arrays * size_sfinfo_user + * size_arrays * size_tcpinfo_user + * bytes wide + */ + __aligned_u64 subflow_info; + __aligned_u64 tcp_info; + struct mptcp_info mptcp_info; +}; + /* MPTCP socket options */ #define MPTCP_INFO 1 #define MPTCP_TCPINFO 2 #define MPTCP_SUBFLOW_ADDRS 3 +#define MPTCP_FULL_INFO 4 #endif /* _UAPI_MPTCP_H */ diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index fa5055d5b029..63f7a09335c5 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -14,7 +14,8 @@ #include #include "protocol.h" -#define MIN_INFO_OPTLEN_SIZE 16 +#define MIN_INFO_OPTLEN_SIZE 16 +#define MIN_FULL_INFO_OPTLEN_SIZE 40 static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) { @@ -981,7 +982,8 @@ static int mptcp_put_subflow_data(struct mptcp_subflow_data *sfd, } static int mptcp_get_subflow_data(struct mptcp_subflow_data *sfd, - char __user *optval, int __user *optlen) + char __user *optval, + int __user *optlen) { int len, copylen; @@ -1162,6 +1164,125 @@ static int mptcp_getsockopt_subflow_addrs(struct mptcp_sock *msk, char __user *o return 0; } +static int mptcp_get_full_info(struct mptcp_full_info *mfi, + char __user *optval, + int __user *optlen) +{ + int len; + + BUILD_BUG_ON(offsetof(struct mptcp_full_info, mptcp_info) != + MIN_FULL_INFO_OPTLEN_SIZE); + + if (get_user(len, optlen)) + return -EFAULT; + + if (len < MIN_FULL_INFO_OPTLEN_SIZE) + return -EINVAL; + + memset(mfi, 0, sizeof(*mfi)); + if (copy_from_user(mfi, optval, MIN_FULL_INFO_OPTLEN_SIZE)) + return -EFAULT; + + if (mfi->size_tcpinfo_kernel || + mfi->size_sfinfo_kernel || + mfi->num_subflows) + return -EINVAL; + + if (mfi->size_sfinfo_user > INT_MAX || + mfi->size_tcpinfo_user > INT_MAX) + return -EINVAL; + + return len - MIN_FULL_INFO_OPTLEN_SIZE; +} + +static int mptcp_put_full_info(struct mptcp_full_info *mfi, + char __user *optval, + u32 copylen, + int __user *optlen) +{ + copylen += MIN_FULL_INFO_OPTLEN_SIZE; + if (put_user(copylen, optlen)) + return -EFAULT; + + if (copy_to_user(optval, mfi, copylen)) + return -EFAULT; + return 0; +} + +static int mptcp_getsockopt_full_info(struct mptcp_sock *msk, char __user *optval, + int __user *optlen) +{ + unsigned int sfcount = 0, copylen = 0; + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + void __user *tcpinfoptr, *sfinfoptr; + struct mptcp_full_info mfi; + int len; + + len = mptcp_get_full_info(&mfi, optval, optlen); + if (len < 0) + return len; + + /* don't bother filling the mptcp info if there is not enough + * user-space-provided storage + */ + if (len > 0) { + mptcp_diag_fill_info(msk, &mfi.mptcp_info); + copylen += min_t(unsigned int, len, sizeof(struct mptcp_info)); + } + + mfi.size_tcpinfo_kernel = sizeof(struct tcp_info); + mfi.size_tcpinfo_user = min_t(unsigned int, mfi.size_tcpinfo_user, + sizeof(struct tcp_info)); + sfinfoptr = u64_to_user_ptr(mfi.subflow_info); + mfi.size_sfinfo_kernel = sizeof(struct mptcp_subflow_info); + mfi.size_sfinfo_user = min_t(unsigned int, mfi.size_sfinfo_user, + sizeof(struct mptcp_subflow_info)); + tcpinfoptr = u64_to_user_ptr(mfi.tcp_info); + + lock_sock(sk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + struct mptcp_subflow_info sfinfo; + struct tcp_info tcp_info; + + if (sfcount++ >= mfi.size_arrays_user) + continue; + + /* fetch addr/tcp_info only if the user space buffers + * are wide enough + */ + memset(&sfinfo, 0, sizeof(sfinfo)); + sfinfo.id = subflow->subflow_id; + if (mfi.size_sfinfo_user > + offsetof(struct mptcp_subflow_info, addrs)) + mptcp_get_sub_addrs(ssk, &sfinfo.addrs); + if (copy_to_user(sfinfoptr, &sfinfo, mfi.size_sfinfo_user)) + goto fail_release; + + if (mfi.size_tcpinfo_user) { + tcp_get_info(ssk, &tcp_info); + if (copy_to_user(tcpinfoptr, &tcp_info, + mfi.size_tcpinfo_user)) + goto fail_release; + } + + tcpinfoptr += mfi.size_tcpinfo_user; + sfinfoptr += mfi.size_sfinfo_user; + } + release_sock(sk); + + mfi.num_subflows = sfcount; + if (mptcp_put_full_info(&mfi, optval, copylen, optlen)) + return -EFAULT; + + return 0; + +fail_release: + release_sock(sk); + return -EFAULT; +} + static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval, int __user *optlen, int val) { @@ -1235,6 +1356,8 @@ static int mptcp_getsockopt_sol_mptcp(struct mptcp_sock *msk, int optname, switch (optname) { case MPTCP_INFO: return mptcp_getsockopt_info(msk, optval, optlen); + case MPTCP_FULL_INFO: + return mptcp_getsockopt_full_info(msk, optval, optlen); case MPTCP_TCPINFO: return mptcp_getsockopt_tcpinfo(msk, optval, optlen); case MPTCP_SUBFLOW_ADDRS: -- cgit v1.2.3 From 735d86a8aaf660e2a5fd5d711ee05fa817e8d567 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Fri, 9 Jun 2023 14:10:51 +0200 Subject: can: uapi: move CAN_RAW_FILTER_MAX definition to raw.h CAN_RAW_FILTER_MAX is only relevant for CAN_RAW sockets and used in linux/can/raw.c or in userspace applications that include the raw.h file anyway. Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20230609121051.9631-1-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can.h | 1 - include/uapi/linux/can/raw.h | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/can.h b/include/uapi/linux/can.h index dd645ea72306..939db2388208 100644 --- a/include/uapi/linux/can.h +++ b/include/uapi/linux/can.h @@ -285,6 +285,5 @@ struct can_filter { }; #define CAN_INV_FILTER 0x20000000U /* to be set in can_filter.can_id */ -#define CAN_RAW_FILTER_MAX 512 /* maximum number of can_filter set via setsockopt() */ #endif /* !_UAPI_CAN_H */ diff --git a/include/uapi/linux/can/raw.h b/include/uapi/linux/can/raw.h index ff12f525c37c..31622c9b7988 100644 --- a/include/uapi/linux/can/raw.h +++ b/include/uapi/linux/can/raw.h @@ -49,6 +49,8 @@ #include #define SOL_CAN_RAW (SOL_CAN_BASE + CAN_RAW) +#define CAN_RAW_FILTER_MAX 512 /* maximum number of can_filter set via setsockopt() */ + enum { SCM_CAN_RAW_ERRQUEUE = 1, }; -- cgit v1.2.3 From 0b3d412798a4763aaf39213a279e453e1fddc81d Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Fri, 23 Jun 2023 08:50:11 +0300 Subject: elf: correct note name comment NT_PRFPREG note is named "CORE". Correct the comment accordingly. Fixes: 00e19ceec80b ("ELF: Add ELF program property parsing support") Signed-off-by: Baruch Siach Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/455b22b986de4d3bc6d9bfd522378e442943de5f.1687499411.git.baruch@tkos.co.il --- include/uapi/linux/elf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index ac3da855fb19..4d1c8d46e7f0 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -372,7 +372,8 @@ typedef struct elf64_shdr { * Notes used in ET_CORE. Architectures export some of the arch register sets * using the corresponding note types via the PTRACE_GETREGSET and * PTRACE_SETREGSET requests. - * The note name for all these is "LINUX". + * The note name for these types is "LINUX", except NT_PRFPREG that is named + * "CORE". */ #define NT_PRSTATUS 1 #define NT_PRFPREG 2 -- cgit v1.2.3 From 079cd633219d7298d087cd115c17682264244c18 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 15 Jun 2023 16:31:40 +0200 Subject: netfilter: nf_tables: Introduce NFT_MSG_GETSETELEM_RESET Analogous to NFT_MSG_GETOBJ_RESET, but for set elements with a timeout or attached stateful expressions like counters or quotas - reset them all at once. Respect a per element timeout value if present to reset the 'expires' value to. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 + net/netfilter/nf_tables_api.c | 68 ++++++++++++++++++++++---------- 2 files changed, 50 insertions(+), 20 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index e059dc2644df..8466c2a9938f 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -105,6 +105,7 @@ enum nft_verdicts { * @NFT_MSG_DESTROYSETELEM: destroy a set element (enum nft_set_elem_attributes) * @NFT_MSG_DESTROYOBJ: destroy a stateful object (enum nft_object_attributes) * @NFT_MSG_DESTROYFLOWTABLE: destroy flow table (enum nft_flowtable_attributes) + * @NFT_MSG_GETSETELEM_RESET: get set elements and reset attached stateful expressions (enum nft_set_elem_attributes) */ enum nf_tables_msg_types { NFT_MSG_NEWTABLE, @@ -140,6 +141,7 @@ enum nf_tables_msg_types { NFT_MSG_DESTROYSETELEM, NFT_MSG_DESTROYOBJ, NFT_MSG_DESTROYFLOWTABLE, + NFT_MSG_GETSETELEM_RESET, NFT_MSG_MAX, }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index dfd441ff1e3e..30fd62224df9 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5229,7 +5229,8 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + static int nft_set_elem_expr_dump(struct sk_buff *skb, const struct nft_set *set, - const struct nft_set_ext *ext) + const struct nft_set_ext *ext, + bool reset) { struct nft_set_elem_expr *elem_expr; u32 size, num_exprs = 0; @@ -5242,7 +5243,7 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb, if (num_exprs == 1) { expr = nft_setelem_expr_at(elem_expr, 0); - if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, false) < 0) + if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, reset) < 0) return -1; return 0; @@ -5253,7 +5254,7 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb, nft_setelem_expr_foreach(expr, elem_expr, size) { expr = nft_setelem_expr_at(elem_expr, size); - if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, false) < 0) + if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -5266,11 +5267,13 @@ nla_put_failure: static int nf_tables_fill_setelem(struct sk_buff *skb, const struct nft_set *set, - const struct nft_set_elem *elem) + const struct nft_set_elem *elem, + bool reset) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; + u64 timeout = 0; nest = nla_nest_start_noflag(skb, NFTA_LIST_ELEM); if (nest == NULL) @@ -5293,7 +5296,7 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS) && - nft_set_elem_expr_dump(skb, set, ext)) + nft_set_elem_expr_dump(skb, set, ext, reset)) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) && @@ -5306,11 +5309,15 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, htonl(*nft_set_ext_flags(ext)))) goto nla_put_failure; - if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && - nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, - nf_jiffies64_to_msecs(*nft_set_ext_timeout(ext)), - NFTA_SET_ELEM_PAD)) - goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) { + timeout = *nft_set_ext_timeout(ext); + if (nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, + nf_jiffies64_to_msecs(timeout), + NFTA_SET_ELEM_PAD)) + goto nla_put_failure; + } else if (set->flags & NFT_SET_TIMEOUT) { + timeout = READ_ONCE(set->timeout); + } if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { u64 expires, now = get_jiffies_64(); @@ -5325,6 +5332,9 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, nf_jiffies64_to_msecs(expires), NFTA_SET_ELEM_PAD)) goto nla_put_failure; + + if (reset) + *nft_set_ext_expiration(ext) = now + timeout; } if (nft_set_ext_exists(ext, NFT_SET_EXT_USERDATA)) { @@ -5348,6 +5358,7 @@ struct nft_set_dump_args { const struct netlink_callback *cb; struct nft_set_iter iter; struct sk_buff *skb; + bool reset; }; static int nf_tables_dump_setelem(const struct nft_ctx *ctx, @@ -5358,7 +5369,7 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, struct nft_set_dump_args *args; args = container_of(iter, struct nft_set_dump_args, iter); - return nf_tables_fill_setelem(args->skb, set, elem); + return nf_tables_fill_setelem(args->skb, set, elem, args->reset); } struct nft_set_dump_ctx { @@ -5367,7 +5378,7 @@ struct nft_set_dump_ctx { }; static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb, - const struct nft_set *set) + const struct nft_set *set, bool reset) { struct nft_set_elem_catchall *catchall; u8 genmask = nft_genmask_cur(net); @@ -5382,7 +5393,7 @@ static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb, continue; elem.priv = catchall->elem; - ret = nf_tables_fill_setelem(skb, set, &elem); + ret = nf_tables_fill_setelem(skb, set, &elem, reset); break; } @@ -5400,6 +5411,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) bool set_found = false; struct nlmsghdr *nlh; struct nlattr *nest; + bool reset = false; u32 portid, seq; int event; @@ -5447,8 +5459,12 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (nest == NULL) goto nla_put_failure; + if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETSETELEM_RESET) + reset = true; + args.cb = cb; args.skb = skb; + args.reset = reset; args.iter.genmask = nft_genmask_cur(net); args.iter.skip = cb->args[0]; args.iter.count = 0; @@ -5457,7 +5473,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) set->ops->walk(&dump_ctx->ctx, set, &args.iter); if (!args.iter.err && args.iter.count == cb->args[0]) - args.iter.err = nft_set_catchall_dump(net, skb, set); + args.iter.err = nft_set_catchall_dump(net, skb, set, reset); rcu_read_unlock(); nla_nest_end(skb, nest); @@ -5495,7 +5511,8 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, const struct nft_ctx *ctx, u32 seq, u32 portid, int event, u16 flags, const struct nft_set *set, - const struct nft_set_elem *elem) + const struct nft_set_elem *elem, + bool reset) { struct nlmsghdr *nlh; struct nlattr *nest; @@ -5516,7 +5533,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, if (nest == NULL) goto nla_put_failure; - err = nf_tables_fill_setelem(skb, set, elem); + err = nf_tables_fill_setelem(skb, set, elem, reset); if (err < 0) goto nla_put_failure; @@ -5622,7 +5639,7 @@ static int nft_setelem_get(struct nft_ctx *ctx, struct nft_set *set, } static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, - const struct nlattr *attr) + const struct nlattr *attr, bool reset) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; struct nft_set_elem elem; @@ -5666,7 +5683,8 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; err = nf_tables_fill_setelem_info(skb, ctx, ctx->seq, ctx->portid, - NFT_MSG_NEWSETELEM, 0, set, &elem); + NFT_MSG_NEWSETELEM, 0, set, &elem, + reset); if (err < 0) goto err_fill_setelem; @@ -5690,6 +5708,7 @@ static int nf_tables_getsetelem(struct sk_buff *skb, struct nft_set *set; struct nlattr *attr; struct nft_ctx ctx; + bool reset = false; int rem, err = 0; table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, @@ -5724,8 +5743,11 @@ static int nf_tables_getsetelem(struct sk_buff *skb, if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) return -EINVAL; + if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETSETELEM_RESET) + reset = true; + nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { - err = nft_get_set_elem(&ctx, set, attr); + err = nft_get_set_elem(&ctx, set, attr, reset); if (err < 0) { NL_SET_BAD_ATTR(extack, attr); break; @@ -5758,7 +5780,7 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx, flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags, - set, elem); + set, elem, false); if (err < 0) { kfree_skb(skb); goto err; @@ -8715,6 +8737,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, + [NFT_MSG_GETSETELEM_RESET] = { + .call = nf_tables_getsetelem, + .type = NFNL_CB_RCU, + .attr_count = NFTA_SET_ELEM_LIST_MAX, + .policy = nft_set_elem_list_policy, + }, [NFT_MSG_DELSETELEM] = { .call = nf_tables_delsetelem, .type = NFNL_CB_BATCH, -- cgit v1.2.3 From 9408d8a37e6cce8803681ab816383450a056c3a9 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 12 Jun 2023 12:03:43 -0700 Subject: nvme: improved uring polling Drivers can poll requests directly, so use that. We just need to ensure the driver's request was allocated from a polled hctx, so a special driver flag is added to struct io_uring_cmd. The allows unshared and multipath namespaces to use the same polling callback, and multipath is guaranteed to get the same queue as the command was submitted on. Previously multipath polling might check a different path and poll the wrong info. The other bonus is we don't need a bio payload in order to poll, allowing commands like 'flush' and 'write zeroes' to be submitted on the same high priority queue as read and write commands. Finally, using the request based polling skips the unnecessary bio overhead. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230612190343.2087040-3-kbusch@meta.com Signed-off-by: Jens Axboe --- drivers/nvme/host/ioctl.c | 70 ++++++++++++------------------------------- drivers/nvme/host/multipath.c | 2 +- drivers/nvme/host/nvme.h | 2 -- include/uapi/linux/io_uring.h | 2 ++ 4 files changed, 22 insertions(+), 54 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 2130ad65b58c..5c3250f36ce7 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -505,7 +505,6 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, { struct io_uring_cmd *ioucmd = req->end_io_data; struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); - void *cookie = READ_ONCE(ioucmd->cookie); req->bio = pdu->bio; if (nvme_req(req)->flags & NVME_REQ_CANCELLED) @@ -518,10 +517,12 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, * For iopoll, complete it directly. * Otherwise, move the completion to task work. */ - if (cookie != NULL && blk_rq_is_poll(req)) + if (blk_rq_is_poll(req)) { + WRITE_ONCE(ioucmd->cookie, NULL); nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED); - else + } else { io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); + } return RQ_END_IO_FREE; } @@ -531,7 +532,6 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req, { struct io_uring_cmd *ioucmd = req->end_io_data; struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); - void *cookie = READ_ONCE(ioucmd->cookie); req->bio = pdu->bio; pdu->req = req; @@ -540,10 +540,12 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req, * For iopoll, complete it directly. * Otherwise, move the completion to task work. */ - if (cookie != NULL && blk_rq_is_poll(req)) + if (blk_rq_is_poll(req)) { + WRITE_ONCE(ioucmd->cookie, NULL); nvme_uring_task_meta_cb(ioucmd, IO_URING_F_UNLOCKED); - else + } else { io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_meta_cb); + } return RQ_END_IO_NONE; } @@ -599,7 +601,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, if (issue_flags & IO_URING_F_IOPOLL) rq_flags |= REQ_POLLED; -retry: req = nvme_alloc_user_request(q, &c, rq_flags, blk_flags); if (IS_ERR(req)) return PTR_ERR(req); @@ -613,17 +614,11 @@ retry: return ret; } - if (issue_flags & IO_URING_F_IOPOLL && rq_flags & REQ_POLLED) { - if (unlikely(!req->bio)) { - /* we can't poll this, so alloc regular req instead */ - blk_mq_free_request(req); - rq_flags &= ~REQ_POLLED; - goto retry; - } else { - WRITE_ONCE(ioucmd->cookie, req->bio); - req->bio->bi_opf |= REQ_POLLED; - } + if (blk_rq_is_poll(req)) { + ioucmd->flags |= IORING_URING_CMD_POLLED; + WRITE_ONCE(ioucmd->cookie, req); } + /* to free bio on completion, as req->bio will be null at that time */ pdu->bio = req->bio; pdu->meta_len = d.metadata_len; @@ -785,18 +780,16 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, struct io_comp_batch *iob, unsigned int poll_flags) { - struct bio *bio; + struct request *req; int ret = 0; - struct nvme_ns *ns; - struct request_queue *q; + + if (!(ioucmd->flags & IORING_URING_CMD_POLLED)) + return 0; rcu_read_lock(); - bio = READ_ONCE(ioucmd->cookie); - ns = container_of(file_inode(ioucmd->file)->i_cdev, - struct nvme_ns, cdev); - q = ns->queue; - if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev) - ret = bio_poll(bio, iob, poll_flags); + req = READ_ONCE(ioucmd->cookie); + if (req && blk_rq_is_poll(req)) + ret = blk_rq_poll(req, iob, poll_flags); rcu_read_unlock(); return ret; } @@ -890,31 +883,6 @@ int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, srcu_read_unlock(&head->srcu, srcu_idx); return ret; } - -int nvme_ns_head_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, - struct io_comp_batch *iob, - unsigned int poll_flags) -{ - struct cdev *cdev = file_inode(ioucmd->file)->i_cdev; - struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev); - int srcu_idx = srcu_read_lock(&head->srcu); - struct nvme_ns *ns = nvme_find_path(head); - struct bio *bio; - int ret = 0; - struct request_queue *q; - - if (ns) { - rcu_read_lock(); - bio = READ_ONCE(ioucmd->cookie); - q = ns->queue; - if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio - && bio->bi_bdev) - ret = bio_poll(bio, iob, poll_flags); - rcu_read_unlock(); - } - srcu_read_unlock(&head->srcu, srcu_idx); - return ret; -} #endif /* CONFIG_NVME_MULTIPATH */ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 98001eebd275..5aa1592849af 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -470,7 +470,7 @@ static const struct file_operations nvme_ns_head_chr_fops = { .unlocked_ioctl = nvme_ns_head_chr_ioctl, .compat_ioctl = compat_ptr_ioctl, .uring_cmd = nvme_ns_head_chr_uring_cmd, - .uring_cmd_iopoll = nvme_ns_head_chr_uring_cmd_iopoll, + .uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll, }; static int nvme_add_ns_head_cdev(struct nvme_ns_head *head) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9a98c14c552a..791cafd9910a 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -854,8 +854,6 @@ long nvme_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, struct io_comp_batch *iob, unsigned int poll_flags); -int nvme_ns_head_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, - struct io_comp_batch *iob, unsigned int poll_flags); int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags); int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index f222d263bc55..08720c7bd92f 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -244,8 +244,10 @@ enum io_uring_op { * sqe->uring_cmd_flags * IORING_URING_CMD_FIXED use registered buffer; pass this flag * along with setting sqe->buf_index. + * IORING_URING_CMD_POLLED driver use only */ #define IORING_URING_CMD_FIXED (1U << 0) +#define IORING_URING_CMD_POLLED (1U << 31) /* -- cgit v1.2.3 From c1ecd8e9500797748ae4f79657971955d452d69d Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Mon, 26 Jun 2023 18:23:05 -0500 Subject: vhost: allow userspace to create workers For vhost-scsi with 3 vqs or more and a workload that tries to use them in parallel like: fio --filename=/dev/sdb --direct=1 --rw=randrw --bs=4k \ --ioengine=libaio --iodepth=128 --numjobs=3 the single vhost worker thread will become a bottlneck and we are stuck at around 500K IOPs no matter how many jobs, virtqueues, and CPUs are used. To better utilize virtqueues and available CPUs, this patch allows userspace to create workers and bind them to vqs. You can have N workers per dev and also share N workers with M vqs on that dev. This patch adds the interface related code and the next patch will hook vhost-scsi into it. The patches do not try to hook net and vsock into the interface because: 1. multiple workers don't seem to help vsock. The problem is that with only 2 virtqueues we never fully use the existing worker when doing bidirectional tests. This seems to match vhost-scsi where we don't see the worker as a bottleneck until 3 virtqueues are used. 2. net already has a way to use multiple workers. Signed-off-by: Mike Christie Message-Id: <20230626232307.97930-16-michael.christie@oracle.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vhost.c | 142 ++++++++++++++++++++++++++++++++++++++- drivers/vhost/vhost.h | 3 + include/uapi/linux/vhost.h | 33 +++++++++ include/uapi/linux/vhost_types.h | 16 +++++ 4 files changed, 193 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index ffbaf7d32e2c..bfba91ecbd0a 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -626,6 +626,76 @@ free_worker: return NULL; } +/* Caller must have device mutex */ +static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq, + struct vhost_worker *worker) +{ + if (vq->worker) + vq->worker->attachment_cnt--; + worker->attachment_cnt++; + vq->worker = worker; +} + + /* Caller must have device and virtqueue mutex */ +static int vhost_vq_attach_worker(struct vhost_virtqueue *vq, + struct vhost_vring_worker *info) +{ + unsigned long index = info->worker_id; + struct vhost_dev *dev = vq->dev; + struct vhost_worker *worker; + + if (!dev->use_worker) + return -EINVAL; + + /* + * We only allow userspace to set a virtqueue's worker if it's not + * active and polling is not enabled. We also assume drivers + * supporting this will not be internally queueing works directly or + * via calls like vhost_dev_flush at this time. + */ + if (vhost_vq_get_backend(vq) || vq->kick) + return -EBUSY; + + worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT); + if (!worker || worker->id != info->worker_id) + return -ENODEV; + + __vhost_vq_attach_worker(vq, worker); + return 0; +} + +/* Caller must have device mutex */ +static int vhost_new_worker(struct vhost_dev *dev, + struct vhost_worker_state *info) +{ + struct vhost_worker *worker; + + worker = vhost_worker_create(dev); + if (!worker) + return -ENOMEM; + + info->worker_id = worker->id; + return 0; +} + +/* Caller must have device mutex */ +static int vhost_free_worker(struct vhost_dev *dev, + struct vhost_worker_state *info) +{ + unsigned long index = info->worker_id; + struct vhost_worker *worker; + + worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT); + if (!worker || worker->id != info->worker_id) + return -ENODEV; + + if (worker->attachment_cnt) + return -EBUSY; + + vhost_worker_destroy(dev, worker); + return 0; +} + static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp, struct vhost_virtqueue **vq, u32 *id) { @@ -647,6 +717,76 @@ static int vhost_get_vq_from_user(struct vhost_dev *dev, void __user *argp, return 0; } +/* Caller must have device mutex */ +long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl, + void __user *argp) +{ + struct vhost_vring_worker ring_worker; + struct vhost_worker_state state; + struct vhost_virtqueue *vq; + long ret; + u32 idx; + + if (!dev->use_worker) + return -EINVAL; + + if (!vhost_dev_has_owner(dev)) + return -EINVAL; + + ret = vhost_dev_check_owner(dev); + if (ret) + return ret; + + switch (ioctl) { + /* dev worker ioctls */ + case VHOST_NEW_WORKER: + ret = vhost_new_worker(dev, &state); + if (!ret && copy_to_user(argp, &state, sizeof(state))) + ret = -EFAULT; + return ret; + case VHOST_FREE_WORKER: + if (copy_from_user(&state, argp, sizeof(state))) + return -EFAULT; + return vhost_free_worker(dev, &state); + /* vring worker ioctls */ + case VHOST_ATTACH_VRING_WORKER: + case VHOST_GET_VRING_WORKER: + break; + default: + return -ENOIOCTLCMD; + } + + ret = vhost_get_vq_from_user(dev, argp, &vq, &idx); + if (ret) + return ret; + + mutex_lock(&vq->mutex); + switch (ioctl) { + case VHOST_ATTACH_VRING_WORKER: + if (copy_from_user(&ring_worker, argp, sizeof(ring_worker))) { + ret = -EFAULT; + break; + } + + ret = vhost_vq_attach_worker(vq, &ring_worker); + break; + case VHOST_GET_VRING_WORKER: + ring_worker.index = idx; + ring_worker.worker_id = vq->worker->id; + + if (copy_to_user(argp, &ring_worker, sizeof(ring_worker))) + ret = -EFAULT; + break; + default: + ret = -ENOIOCTLCMD; + break; + } + + mutex_unlock(&vq->mutex); + return ret; +} +EXPORT_SYMBOL_GPL(vhost_worker_ioctl); + /* Caller should have device mutex */ long vhost_dev_set_owner(struct vhost_dev *dev) { @@ -684,7 +824,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev) smp_wmb(); for (i = 0; i < dev->nvqs; i++) - dev->vqs[i]->worker = worker; + __vhost_vq_attach_worker(dev->vqs[i], worker); } return 0; diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 31937e98c01b..4920ca63b8de 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -31,6 +31,7 @@ struct vhost_worker { struct llist_head work_list; u64 kcov_handle; u32 id; + int attachment_cnt; }; /* Poll a file (eventfd or socket) */ @@ -190,6 +191,8 @@ void vhost_dev_cleanup(struct vhost_dev *); void vhost_dev_stop(struct vhost_dev *); long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp); long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp); +long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl, + void __user *argp); bool vhost_vq_access_ok(struct vhost_virtqueue *vq); bool vhost_log_access_ok(struct vhost_dev *); void vhost_clear_msg(struct vhost_dev *dev); diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 92e1b700b51c..96dc146c2d15 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -45,6 +45,25 @@ #define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64) /* Specify an eventfd file descriptor to signal on log write. */ #define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int) +/* By default, a device gets one vhost_worker that its virtqueues share. This + * command allows the owner of the device to create an additional vhost_worker + * for the device. It can later be bound to 1 or more of its virtqueues using + * the VHOST_ATTACH_VRING_WORKER command. + * + * This must be called after VHOST_SET_OWNER and the caller must be the owner + * of the device. The new thread will inherit caller's cgroups and namespaces, + * and will share the caller's memory space. The new thread will also be + * counted against the caller's RLIMIT_NPROC value. + * + * The worker's ID used in other commands will be returned in + * vhost_worker_state. + */ +#define VHOST_NEW_WORKER _IOR(VHOST_VIRTIO, 0x8, struct vhost_worker_state) +/* Free a worker created with VHOST_NEW_WORKER if it's not attached to any + * virtqueue. If userspace is not able to call this for workers its created, + * the kernel will free all the device's workers when the device is closed. + */ +#define VHOST_FREE_WORKER _IOW(VHOST_VIRTIO, 0x9, struct vhost_worker_state) /* Ring setup. */ /* Set number of descriptors in ring. This parameter can not @@ -70,6 +89,20 @@ #define VHOST_VRING_BIG_ENDIAN 1 #define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_state) #define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state) +/* Attach a vhost_worker created with VHOST_NEW_WORKER to one of the device's + * virtqueues. This must be done before VHOST_SET_VRING_KICK and the driver + * specific ioctl to activate the virtqueue (VHOST_SCSI_SET_ENDPOINT, + * VHOST_NET_SET_BACKEND, VHOST_VSOCK_SET_RUNNING) has been run. + * + * This will replace the virtqueue's existing worker. If the replaced worker + * is no longer attached to any virtqueues, it can be freed with + * VHOST_FREE_WORKER. + */ +#define VHOST_ATTACH_VRING_WORKER _IOW(VHOST_VIRTIO, 0x15, \ + struct vhost_vring_worker) +/* Return the vring worker's ID */ +#define VHOST_GET_VRING_WORKER _IOWR(VHOST_VIRTIO, 0x16, \ + struct vhost_vring_worker) /* The following ioctls use eventfd file descriptors to signal and poll * for events. */ diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h index c5690a8992d8..d3aad12ad1fa 100644 --- a/include/uapi/linux/vhost_types.h +++ b/include/uapi/linux/vhost_types.h @@ -47,6 +47,22 @@ struct vhost_vring_addr { __u64 log_guest_addr; }; +struct vhost_worker_state { + /* + * For VHOST_NEW_WORKER the kernel will return the new vhost_worker id. + * For VHOST_FREE_WORKER this must be set to the id of the vhost_worker + * to free. + */ + unsigned int worker_id; +}; + +struct vhost_vring_worker { + /* vring index */ + unsigned int index; + /* The id of the vhost_worker returned from VHOST_NEW_WORKER */ + unsigned int worker_id; +}; + /* no alignment requirement */ struct vhost_iotlb_msg { __u64 iova; -- cgit v1.2.3 From 228a27cf78afc63a18f744a56740d26570ecaec0 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Mon, 26 Jun 2023 18:23:07 -0500 Subject: vhost: Allow worker switching while work is queueing This patch drops the requirement that we can only switch workers if work has not been queued by using RCU for the vq based queueing paths and a mutex for the device wide flush. We can also use this to support SIGKILL properly in the future where we should exit almost immediately after getting that signal. With this patch, when get_signal returns true, we can set the vq->worker to NULL and do a synchronize_rcu to prevent new work from being queued to the vhost_task that has been killed. Signed-off-by: Mike Christie Message-Id: <20230626232307.97930-18-michael.christie@oracle.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vhost.c | 153 ++++++++++++++++++++++++++++++++------------- drivers/vhost/vhost.h | 4 +- include/uapi/linux/vhost.h | 4 +- 3 files changed, 115 insertions(+), 46 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index bfba91ecbd0a..c71d573f1c94 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -233,16 +233,9 @@ void vhost_poll_stop(struct vhost_poll *poll) } EXPORT_SYMBOL_GPL(vhost_poll_stop); -static bool vhost_worker_queue(struct vhost_worker *worker, +static void vhost_worker_queue(struct vhost_worker *worker, struct vhost_work *work) { - if (!worker) - return false; - /* - * vsock can queue while we do a VHOST_SET_OWNER, so we have a smp_wmb - * when setting up the worker. We don't have a smp_rmb here because - * test_and_set_bit gives us a mb already. - */ if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) { /* We can only add the work to the list after we're * sure it was not in the list. @@ -251,47 +244,85 @@ static bool vhost_worker_queue(struct vhost_worker *worker, llist_add(&work->node, &worker->work_list); vhost_task_wake(worker->vtsk); } - - return true; } bool vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work) { - return vhost_worker_queue(vq->worker, work); + struct vhost_worker *worker; + bool queued = false; + + rcu_read_lock(); + worker = rcu_dereference(vq->worker); + if (worker) { + queued = true; + vhost_worker_queue(worker, work); + } + rcu_read_unlock(); + + return queued; } EXPORT_SYMBOL_GPL(vhost_vq_work_queue); -static void vhost_worker_flush(struct vhost_worker *worker) +void vhost_vq_flush(struct vhost_virtqueue *vq) { struct vhost_flush_struct flush; init_completion(&flush.wait_event); vhost_work_init(&flush.work, vhost_flush_work); - if (vhost_worker_queue(worker, &flush.work)) + if (vhost_vq_work_queue(vq, &flush.work)) wait_for_completion(&flush.wait_event); } +EXPORT_SYMBOL_GPL(vhost_vq_flush); -void vhost_vq_flush(struct vhost_virtqueue *vq) +/** + * vhost_worker_flush - flush a worker + * @worker: worker to flush + * + * This does not use RCU to protect the worker, so the device or worker + * mutex must be held. + */ +static void vhost_worker_flush(struct vhost_worker *worker) { - vhost_worker_flush(vq->worker); + struct vhost_flush_struct flush; + + init_completion(&flush.wait_event); + vhost_work_init(&flush.work, vhost_flush_work); + + vhost_worker_queue(worker, &flush.work); + wait_for_completion(&flush.wait_event); } -EXPORT_SYMBOL_GPL(vhost_vq_flush); void vhost_dev_flush(struct vhost_dev *dev) { struct vhost_worker *worker; unsigned long i; - xa_for_each(&dev->worker_xa, i, worker) + xa_for_each(&dev->worker_xa, i, worker) { + mutex_lock(&worker->mutex); + if (!worker->attachment_cnt) { + mutex_unlock(&worker->mutex); + continue; + } vhost_worker_flush(worker); + mutex_unlock(&worker->mutex); + } } EXPORT_SYMBOL_GPL(vhost_dev_flush); /* A lockless hint for busy polling code to exit the loop */ bool vhost_vq_has_work(struct vhost_virtqueue *vq) { - return !llist_empty(&vq->worker->work_list); + struct vhost_worker *worker; + bool has_work = false; + + rcu_read_lock(); + worker = rcu_dereference(vq->worker); + if (worker && !llist_empty(&worker->work_list)) + has_work = true; + rcu_read_unlock(); + + return has_work; } EXPORT_SYMBOL_GPL(vhost_vq_has_work); @@ -356,7 +387,7 @@ static void vhost_vq_reset(struct vhost_dev *dev, vq->busyloop_timeout = 0; vq->umem = NULL; vq->iotlb = NULL; - vq->worker = NULL; + rcu_assign_pointer(vq->worker, NULL); vhost_vring_call_reset(&vq->call_ctx); __vhost_vq_meta_reset(vq); } @@ -578,7 +609,7 @@ static void vhost_workers_free(struct vhost_dev *dev) return; for (i = 0; i < dev->nvqs; i++) - dev->vqs[i]->worker = NULL; + rcu_assign_pointer(dev->vqs[i]->worker, NULL); /* * Free the default worker we created and cleanup workers userspace * created but couldn't clean up (it forgot or crashed). @@ -606,6 +637,7 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev) if (!vtsk) goto free_worker; + mutex_init(&worker->mutex); init_llist_head(&worker->work_list); worker->kcov_handle = kcov_common_handle(); worker->vtsk = vtsk; @@ -630,13 +662,54 @@ free_worker: static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq, struct vhost_worker *worker) { - if (vq->worker) - vq->worker->attachment_cnt--; + struct vhost_worker *old_worker; + + old_worker = rcu_dereference_check(vq->worker, + lockdep_is_held(&vq->dev->mutex)); + + mutex_lock(&worker->mutex); worker->attachment_cnt++; - vq->worker = worker; + mutex_unlock(&worker->mutex); + rcu_assign_pointer(vq->worker, worker); + + if (!old_worker) + return; + /* + * Take the worker mutex to make sure we see the work queued from + * device wide flushes which doesn't use RCU for execution. + */ + mutex_lock(&old_worker->mutex); + old_worker->attachment_cnt--; + /* + * We don't want to call synchronize_rcu for every vq during setup + * because it will slow down VM startup. If we haven't done + * VHOST_SET_VRING_KICK and not done the driver specific + * SET_ENDPOINT/RUNNUNG then we can skip the sync since there will + * not be any works queued for scsi and net. + */ + mutex_lock(&vq->mutex); + if (!vhost_vq_get_backend(vq) && !vq->kick) { + mutex_unlock(&vq->mutex); + mutex_unlock(&old_worker->mutex); + /* + * vsock can queue anytime after VHOST_VSOCK_SET_GUEST_CID. + * Warn if it adds support for multiple workers but forgets to + * handle the early queueing case. + */ + WARN_ON(!old_worker->attachment_cnt && + !llist_empty(&old_worker->work_list)); + return; + } + mutex_unlock(&vq->mutex); + + /* Make sure new vq queue/flush/poll calls see the new worker */ + synchronize_rcu(); + /* Make sure whatever was queued gets run */ + vhost_worker_flush(old_worker); + mutex_unlock(&old_worker->mutex); } - /* Caller must have device and virtqueue mutex */ + /* Caller must have device mutex */ static int vhost_vq_attach_worker(struct vhost_virtqueue *vq, struct vhost_vring_worker *info) { @@ -647,15 +720,6 @@ static int vhost_vq_attach_worker(struct vhost_virtqueue *vq, if (!dev->use_worker) return -EINVAL; - /* - * We only allow userspace to set a virtqueue's worker if it's not - * active and polling is not enabled. We also assume drivers - * supporting this will not be internally queueing works directly or - * via calls like vhost_dev_flush at this time. - */ - if (vhost_vq_get_backend(vq) || vq->kick) - return -EBUSY; - worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT); if (!worker || worker->id != info->worker_id) return -ENODEV; @@ -689,8 +753,12 @@ static int vhost_free_worker(struct vhost_dev *dev, if (!worker || worker->id != info->worker_id) return -ENODEV; - if (worker->attachment_cnt) + mutex_lock(&worker->mutex); + if (worker->attachment_cnt) { + mutex_unlock(&worker->mutex); return -EBUSY; + } + mutex_unlock(&worker->mutex); vhost_worker_destroy(dev, worker); return 0; @@ -723,6 +791,7 @@ long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl, { struct vhost_vring_worker ring_worker; struct vhost_worker_state state; + struct vhost_worker *worker; struct vhost_virtqueue *vq; long ret; u32 idx; @@ -760,7 +829,6 @@ long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl, if (ret) return ret; - mutex_lock(&vq->mutex); switch (ioctl) { case VHOST_ATTACH_VRING_WORKER: if (copy_from_user(&ring_worker, argp, sizeof(ring_worker))) { @@ -771,8 +839,15 @@ long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl, ret = vhost_vq_attach_worker(vq, &ring_worker); break; case VHOST_GET_VRING_WORKER: + worker = rcu_dereference_check(vq->worker, + lockdep_is_held(&dev->mutex)); + if (!worker) { + ret = -EINVAL; + break; + } + ring_worker.index = idx; - ring_worker.worker_id = vq->worker->id; + ring_worker.worker_id = worker->id; if (copy_to_user(argp, &ring_worker, sizeof(ring_worker))) ret = -EFAULT; @@ -782,7 +857,6 @@ long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl, break; } - mutex_unlock(&vq->mutex); return ret; } EXPORT_SYMBOL_GPL(vhost_worker_ioctl); @@ -817,11 +891,6 @@ long vhost_dev_set_owner(struct vhost_dev *dev) err = -ENOMEM; goto err_worker; } - /* - * vsock can already try to queue so make sure the worker - * is setup before vhost_vq_work_queue sees vq->worker is set. - */ - smp_wmb(); for (i = 0; i < dev->nvqs; i++) __vhost_vq_attach_worker(dev->vqs[i], worker); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 4920ca63b8de..f1e7d4d13219 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -28,6 +28,8 @@ struct vhost_work { struct vhost_worker { struct vhost_task *vtsk; + /* Used to serialize device wide flushing with worker swapping. */ + struct mutex mutex; struct llist_head work_list; u64 kcov_handle; u32 id; @@ -76,7 +78,7 @@ struct vhost_vring_call { /* The virtqueue structure describes a queue attached to a device. */ struct vhost_virtqueue { struct vhost_dev *dev; - struct vhost_worker *worker; + struct vhost_worker __rcu *worker; /* The actual ring of buffers. */ struct mutex mutex; diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 96dc146c2d15..f5c48b61ab62 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -90,9 +90,7 @@ #define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_state) #define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state) /* Attach a vhost_worker created with VHOST_NEW_WORKER to one of the device's - * virtqueues. This must be done before VHOST_SET_VRING_KICK and the driver - * specific ioctl to activate the virtqueue (VHOST_SCSI_SET_ENDPOINT, - * VHOST_NET_SET_BACKEND, VHOST_VSOCK_SET_RUNNING) has been run. + * virtqueues. * * This will replace the virtqueue's existing worker. If the replaced worker * is no longer attached to any virtqueues, it can be freed with -- cgit v1.2.3 From 01f23c5f1526f5b6ff744887aa511b9e69d4401b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 29 Jun 2023 12:09:00 -0700 Subject: usb: ch9: Replace bmSublinkSpeedAttr 1-element array with flexible array Since commit 2d47c6956ab3 ("ubsan: Tighten UBSAN_BOUNDS on GCC"), UBSAN_BOUNDS no longer pretends 1-element arrays are unbounded. Walking bmSublinkSpeedAttr will trigger a warning, so make it a proper flexible array. Add a union to keep the struct size identical for userspace in case anything was depending on the old size. False positive warning was: UBSAN: array-index-out-of-bounds in drivers/usb/host/xhci-hub.c:231:31 index 1 is out of range for type '__le32 [1]' for this line of code: ssp_cap->bmSublinkSpeedAttr[offset++] = cpu_to_le32(attr); Reported-by: Borislav Petkov Closes: https://lore.kernel.org/lkml/2023062945-fencing-pebble-0411@gregkh/ Reported-by: Mirsad Todorovac Closes: https://lore.kernel.org/lkml/9a8e34ad-8a8b-3830-4878-3c2c82e69dd9@alu.unizg.hr/ Cc: Greg Kroah-Hartman Cc: "Gustavo A. R. Silva" Tested-by: "Borislav Petkov (AMD)" Tested-by: Mirsad Todorovac Reviewed-by: "Gustavo A. R. Silva" Link: https://lore.kernel.org/r/20230629190900.never.787-kees@kernel.org Signed-off-by: Kees Cook --- include/uapi/linux/usb/ch9.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index b17e3a21b15f..3ff98c7ba7e3 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -981,7 +981,11 @@ struct usb_ssp_cap_descriptor { #define USB_SSP_MIN_RX_LANE_COUNT (0xf << 8) #define USB_SSP_MIN_TX_LANE_COUNT (0xf << 12) __le16 wReserved; - __le32 bmSublinkSpeedAttr[1]; /* list of sublink speed attrib entries */ + union { + __le32 legacy_padding; + /* list of sublink speed attrib entries */ + __DECLARE_FLEX_ARRAY(__le32, bmSublinkSpeedAttr); + }; #define USB_SSP_SUBLINK_SPEED_SSID (0xf) /* sublink speed ID */ #define USB_SSP_SUBLINK_SPEED_LSE (0x3 << 4) /* Lanespeed exponent */ #define USB_SSP_SUBLINK_SPEED_LSE_BPS 0 -- cgit v1.2.3 From e0933b526fbfd937c4a8f4e35fcdd49f0e22d411 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 6 Jul 2023 13:14:12 -0700 Subject: block: Fix a source code comment in include/uapi/linux/blkzoned.h Fix the symbolic names for zone conditions in the blkzoned.h header file. Cc: Hannes Reinecke Cc: Damien Le Moal Fixes: 6a0cb1bc106f ("block: Implement support for zoned block devices") Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20230706201422.3987341-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- include/uapi/linux/blkzoned.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index b80fcc9ea525..f85743ef6e7d 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -51,13 +51,13 @@ enum blk_zone_type { * * The Zone Condition state machine in the ZBC/ZAC standards maps the above * deinitions as: - * - ZC1: Empty | BLK_ZONE_EMPTY + * - ZC1: Empty | BLK_ZONE_COND_EMPTY * - ZC2: Implicit Open | BLK_ZONE_COND_IMP_OPEN * - ZC3: Explicit Open | BLK_ZONE_COND_EXP_OPEN - * - ZC4: Closed | BLK_ZONE_CLOSED - * - ZC5: Full | BLK_ZONE_FULL - * - ZC6: Read Only | BLK_ZONE_READONLY - * - ZC7: Offline | BLK_ZONE_OFFLINE + * - ZC4: Closed | BLK_ZONE_COND_CLOSED + * - ZC5: Full | BLK_ZONE_COND_FULL + * - ZC6: Read Only | BLK_ZONE_COND_READONLY + * - ZC7: Offline | BLK_ZONE_COND_OFFLINE * * Conditions 0x5 to 0xC are reserved by the current ZBC/ZAC spec and should * be considered invalid. -- cgit v1.2.3 From a0ade8404c3bc2bf2631cb0f20d372eed22d9d96 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 24 Jul 2023 14:34:25 -0700 Subject: af_packet: Fix warning of fortified memcpy() in packet_getname(). syzkaller found a warning in packet_getname() [0], where we try to copy 16 bytes to sockaddr_ll.sll_addr[8]. Some devices (ip6gre, vti6, ip6tnl) have 16 bytes address expressed by struct in6_addr. Also, Infiniband has 32 bytes as MAX_ADDR_LEN. The write seems to overflow, but actually not since we use struct sockaddr_storage defined in __sys_getsockname() and its size is 128 (_K_SS_MAXSIZE) bytes. Thus, we have sufficient room after sll_addr[] as __data[]. To avoid the warning, let's add a flex array member union-ed with sll_addr. Another option would be to use strncpy() and limit the copied length to sizeof(sll_addr), but it will return the partial address and break an application that passes sockaddr_storage to getsockname(). [0]: memcpy: detected field-spanning write (size 16) of single field "sll->sll_addr" at net/packet/af_packet.c:3604 (size 8) WARNING: CPU: 0 PID: 255 at net/packet/af_packet.c:3604 packet_getname+0x25c/0x3a0 net/packet/af_packet.c:3604 Modules linked in: CPU: 0 PID: 255 Comm: syz-executor750 Not tainted 6.5.0-rc1-00330-g60cc1f7d0605 #4 Hardware name: linux,dummy-virt (DT) pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : packet_getname+0x25c/0x3a0 net/packet/af_packet.c:3604 lr : packet_getname+0x25c/0x3a0 net/packet/af_packet.c:3604 sp : ffff800089887bc0 x29: ffff800089887bc0 x28: ffff000010f80f80 x27: 0000000000000003 x26: dfff800000000000 x25: ffff700011310f80 x24: ffff800087d55000 x23: dfff800000000000 x22: ffff800089887c2c x21: 0000000000000010 x20: ffff00000de08310 x19: ffff800089887c20 x18: ffff800086ab1630 x17: 20646c6569662065 x16: 6c676e697320666f x15: 0000000000000001 x14: 1fffe0000d56d7ca x13: 0000000000000000 x12: 0000000000000000 x11: 0000000000000000 x10: 0000000000000000 x9 : 3e60944c3da92b00 x8 : 3e60944c3da92b00 x7 : 0000000000000001 x6 : 0000000000000001 x5 : ffff8000898874f8 x4 : ffff800086ac99e0 x3 : ffff8000803f8808 x2 : 0000000000000001 x1 : 0000000100000000 x0 : 0000000000000000 Call trace: packet_getname+0x25c/0x3a0 net/packet/af_packet.c:3604 __sys_getsockname+0x168/0x24c net/socket.c:2042 __do_sys_getsockname net/socket.c:2057 [inline] __se_sys_getsockname net/socket.c:2054 [inline] __arm64_sys_getsockname+0x7c/0x94 net/socket.c:2054 __invoke_syscall arch/arm64/kernel/syscall.c:38 [inline] invoke_syscall+0x98/0x2c0 arch/arm64/kernel/syscall.c:52 el0_svc_common+0x134/0x240 arch/arm64/kernel/syscall.c:139 do_el0_svc+0x64/0x198 arch/arm64/kernel/syscall.c:188 el0_svc+0x2c/0x7c arch/arm64/kernel/entry-common.c:647 el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:665 el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:591 Fixes: df8fc4e934c1 ("kbuild: Enable -fstrict-flex-arrays=3") Reported-by: syzkaller Suggested-by: Kees Cook Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230724213425.22920-3-kuniyu@amazon.com Reviewed-by: Simon Horman Reviewed-by: Kees Cook Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_packet.h | 6 +++++- net/packet/af_packet.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index 9efc42382fdb..4d0ad22f83b5 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -18,7 +18,11 @@ struct sockaddr_ll { unsigned short sll_hatype; unsigned char sll_pkttype; unsigned char sll_halen; - unsigned char sll_addr[8]; + union { + unsigned char sll_addr[8]; + /* Actual length is in sll_halen. */ + __DECLARE_FLEX_ARRAY(unsigned char, sll_addr_flex); + }; }; /* Packet types */ diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 85ff90a03b0c..8e3ddec4c3d5 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3601,7 +3601,7 @@ static int packet_getname(struct socket *sock, struct sockaddr *uaddr, if (dev) { sll->sll_hatype = dev->type; sll->sll_halen = dev->addr_len; - memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len); + memcpy(sll->sll_addr_flex, dev->dev_addr, dev->addr_len); } else { sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */ sll->sll_halen = 0; -- cgit v1.2.3 From 4d50e50045aa46d9f3e578ed2edea9bd0a123d24 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 26 Jul 2023 14:58:15 +0000 Subject: net: flower: fix stack-out-of-bounds in fl_set_key_cfm() Typical misuse of nla_parse_nested(array, XXX_MAX, ...); array must be declared as struct nlattr *array[XXX_MAX + 1]; v2: Based on feedbacks from Ido Schimmel and Zahari Doychev, I also changed TCA_FLOWER_KEY_CFM_OPT_MAX and cfm_opt_policy definitions. syzbot reported: BUG: KASAN: stack-out-of-bounds in __nla_validate_parse+0x136/0x2bd0 lib/nlattr.c:588 Write of size 32 at addr ffffc90003a0ee20 by task syz-executor296/5014 CPU: 0 PID: 5014 Comm: syz-executor296 Not tainted 6.5.0-rc2-syzkaller-00307-gd192f5382581 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/12/2023 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1e7/0x2d0 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:364 [inline] print_report+0x163/0x540 mm/kasan/report.c:475 kasan_report+0x175/0x1b0 mm/kasan/report.c:588 kasan_check_range+0x27e/0x290 mm/kasan/generic.c:187 __asan_memset+0x23/0x40 mm/kasan/shadow.c:84 __nla_validate_parse+0x136/0x2bd0 lib/nlattr.c:588 __nla_parse+0x40/0x50 lib/nlattr.c:700 nla_parse_nested include/net/netlink.h:1262 [inline] fl_set_key_cfm+0x1e3/0x440 net/sched/cls_flower.c:1718 fl_set_key+0x2168/0x6620 net/sched/cls_flower.c:1884 fl_tmplt_create+0x1fe/0x510 net/sched/cls_flower.c:2666 tc_chain_tmplt_add net/sched/cls_api.c:2959 [inline] tc_ctl_chain+0x131d/0x1ac0 net/sched/cls_api.c:3068 rtnetlink_rcv_msg+0x82b/0xf50 net/core/rtnetlink.c:6424 netlink_rcv_skb+0x1df/0x430 net/netlink/af_netlink.c:2549 netlink_unicast_kernel net/netlink/af_netlink.c:1339 [inline] netlink_unicast+0x7c3/0x990 net/netlink/af_netlink.c:1365 netlink_sendmsg+0xa2a/0xd60 net/netlink/af_netlink.c:1914 sock_sendmsg_nosec net/socket.c:725 [inline] sock_sendmsg net/socket.c:748 [inline] ____sys_sendmsg+0x592/0x890 net/socket.c:2494 ___sys_sendmsg net/socket.c:2548 [inline] __sys_sendmsg+0x2b0/0x3a0 net/socket.c:2577 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f54c6150759 Code: 48 83 c4 28 c3 e8 d7 19 00 00 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffe06c30578 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f54c619902d RCX: 00007f54c6150759 RDX: 0000000000000000 RSI: 0000000020000280 RDI: 0000000000000003 RBP: 00007ffe06c30590 R08: 0000000000000000 R09: 00007ffe06c305f0 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f54c61c35f0 R13: 00007ffe06c30778 R14: 0000000000000001 R15: 0000000000000001 The buggy address belongs to stack of task syz-executor296/5014 and is located at offset 32 in frame: fl_set_key_cfm+0x0/0x440 net/sched/cls_flower.c:374 This frame has 1 object: [32, 56) 'nla_cfm_opt' The buggy address belongs to the virtual mapping at [ffffc90003a08000, ffffc90003a11000) created by: copy_process+0x5c8/0x4290 kernel/fork.c:2330 Fixes: 7cfffd5fed3e ("net: flower: add support for matching cfm fields") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Simon Horman Reviewed-by: Ido Schimmel Reviewed-by: Zahari Doychev Link: https://lore.kernel.org/r/20230726145815.943910-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_cls.h | 4 +++- net/sched/cls_flower.c | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 7865f5a9885b..4f3932bb712d 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -710,9 +710,11 @@ enum { TCA_FLOWER_KEY_CFM_OPT_UNSPEC, TCA_FLOWER_KEY_CFM_MD_LEVEL, TCA_FLOWER_KEY_CFM_OPCODE, - TCA_FLOWER_KEY_CFM_OPT_MAX, + __TCA_FLOWER_KEY_CFM_OPT_MAX, }; +#define TCA_FLOWER_KEY_CFM_OPT_MAX (__TCA_FLOWER_KEY_CFM_OPT_MAX - 1) + #define TCA_FLOWER_MASK_FLAGS_RANGE (1 << 0) /* Range-based match */ /* Match-all classifier */ diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 8da9d039d964..9f0711da9c95 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -776,7 +776,8 @@ mpls_stack_entry_policy[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1] = { [TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL] = { .type = NLA_U32 }, }; -static const struct nla_policy cfm_opt_policy[TCA_FLOWER_KEY_CFM_OPT_MAX] = { +static const struct nla_policy +cfm_opt_policy[TCA_FLOWER_KEY_CFM_OPT_MAX + 1] = { [TCA_FLOWER_KEY_CFM_MD_LEVEL] = NLA_POLICY_MAX(NLA_U8, FLOW_DIS_CFM_MDL_MAX), [TCA_FLOWER_KEY_CFM_OPCODE] = { .type = NLA_U8 }, @@ -1709,7 +1710,7 @@ static int fl_set_key_cfm(struct nlattr **tb, struct fl_flow_key *mask, struct netlink_ext_ack *extack) { - struct nlattr *nla_cfm_opt[TCA_FLOWER_KEY_CFM_OPT_MAX]; + struct nlattr *nla_cfm_opt[TCA_FLOWER_KEY_CFM_OPT_MAX + 1]; int err; if (!tb[TCA_FLOWER_KEY_CFM]) -- cgit v1.2.3 From e3f9324b231aba1dc707572bfe80be210c2d4cbd Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Wed, 16 Aug 2023 15:54:48 +0000 Subject: RISC-V: Remove ptrace support for vectors We've found two bugs here: NT_RISCV_VECTOR steps on NT_RISCV_CSR (which is only for embedded), and we don't have vlenb in the core dumps. Given that we've have a pair of bugs croup up as part of the GDB review we've probably got other issues, so let's just cut this for 6.5 and get it right. Fixes: 0c59922c769a ("riscv: Add ptrace vector support") Reviewed-by: Maciej W. Rozycki Signed-off-by: Andy Chiu Link: https://lore.kernel.org/r/20230816155450.26200-2-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/ptrace.c | 69 ---------------------------------------------- include/uapi/linux/elf.h | 1 - 2 files changed, 70 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c index 1d572cf3140f..487303e3ef22 100644 --- a/arch/riscv/kernel/ptrace.c +++ b/arch/riscv/kernel/ptrace.c @@ -25,9 +25,6 @@ enum riscv_regset { #ifdef CONFIG_FPU REGSET_F, #endif -#ifdef CONFIG_RISCV_ISA_V - REGSET_V, -#endif }; static int riscv_gpr_get(struct task_struct *target, @@ -84,61 +81,6 @@ static int riscv_fpr_set(struct task_struct *target, } #endif -#ifdef CONFIG_RISCV_ISA_V -static int riscv_vr_get(struct task_struct *target, - const struct user_regset *regset, - struct membuf to) -{ - struct __riscv_v_ext_state *vstate = &target->thread.vstate; - - if (!riscv_v_vstate_query(task_pt_regs(target))) - return -EINVAL; - - /* - * Ensure the vector registers have been saved to the memory before - * copying them to membuf. - */ - if (target == current) - riscv_v_vstate_save(current, task_pt_regs(current)); - - /* Copy vector header from vstate. */ - membuf_write(&to, vstate, offsetof(struct __riscv_v_ext_state, datap)); - membuf_zero(&to, sizeof(vstate->datap)); - - /* Copy all the vector registers from vstate. */ - return membuf_write(&to, vstate->datap, riscv_v_vsize); -} - -static int riscv_vr_set(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) -{ - int ret, size; - struct __riscv_v_ext_state *vstate = &target->thread.vstate; - - if (!riscv_v_vstate_query(task_pt_regs(target))) - return -EINVAL; - - /* Copy rest of the vstate except datap */ - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, vstate, 0, - offsetof(struct __riscv_v_ext_state, datap)); - if (unlikely(ret)) - return ret; - - /* Skip copy datap. */ - size = sizeof(vstate->datap); - count -= size; - ubuf += size; - - /* Copy all the vector registers. */ - pos = 0; - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, vstate->datap, - 0, riscv_v_vsize); - return ret; -} -#endif - static const struct user_regset riscv_user_regset[] = { [REGSET_X] = { .core_note_type = NT_PRSTATUS, @@ -158,17 +100,6 @@ static const struct user_regset riscv_user_regset[] = { .set = riscv_fpr_set, }, #endif -#ifdef CONFIG_RISCV_ISA_V - [REGSET_V] = { - .core_note_type = NT_RISCV_VECTOR, - .align = 16, - .n = ((32 * RISCV_MAX_VLENB) + - sizeof(struct __riscv_v_ext_state)) / sizeof(__u32), - .size = sizeof(__u32), - .regset_get = riscv_vr_get, - .set = riscv_vr_set, - }, -#endif }; static const struct user_regset_view riscv_user_native_view = { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 0c8cf359ea5b..e0e159138331 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -443,7 +443,6 @@ typedef struct elf64_shdr { #define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ #define NT_MIPS_FP_MODE 0x801 /* MIPS floating-point mode */ #define NT_MIPS_MSA 0x802 /* MIPS SIMD registers */ -#define NT_RISCV_VECTOR 0x900 /* RISC-V vector registers */ #define NT_LOONGARCH_CPUCFG 0xa00 /* LoongArch CPU config registers */ #define NT_LOONGARCH_CSR 0xa01 /* LoongArch control and status registers */ #define NT_LOONGARCH_LSX 0xa02 /* LoongArch Loongson SIMD Extension registers */ -- cgit v1.2.3