From 73fa7547c70b32cc69685f79be31135797734eb6 Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Mon, 31 Aug 2020 11:32:08 -0400 Subject: vfs: add RWF_NOAPPEND flag for pwritev2 The pwrite function, originally defined by POSIX (thus the "p"), is defined to ignore O_APPEND and write at the offset passed as its argument. However, historically Linux honored O_APPEND if set and ignored the offset. This cannot be changed due to stability policy, but is documented in the man page as a bug. Now that there's a pwritev2 syscall providing a superset of the pwrite functionality that has a flags argument, the conforming behavior can be offered to userspace via a new flag. Since pwritev2 checks flag validity (in kiocb_set_rw_flags) and reports unknown ones with EOPNOTSUPP, callers will not get wrong behavior on old kernels that don't support the new flag; the error is reported and the caller can decide how to handle it. Signed-off-by: Rich Felker Link: https://lore.kernel.org/r/20200831153207.GO3265@brightrain.aerifal.cx Reviewed-by: Jann Horn Signed-off-by: Christian Brauner --- include/uapi/linux/fs.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 48ad69f7722e..2203d3194b91 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -301,9 +301,12 @@ typedef int __bitwise __kernel_rwf_t; /* per-IO O_APPEND */ #define RWF_APPEND ((__force __kernel_rwf_t)0x00000010) +/* per-IO negation of O_APPEND */ +#define RWF_NOAPPEND ((__force __kernel_rwf_t)0x00000020) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND) + RWF_APPEND | RWF_NOAPPEND) /* Pagemap ioctl */ #define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) -- cgit v1.2.3 From 64bef697d33b75fc06c5789b3f8108680271529f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 31 Jan 2024 14:26:02 +0100 Subject: pidfd: implement PIDFD_THREAD flag for pidfd_open() With this flag: - pidfd_open() doesn't require that the target task must be a thread-group leader - pidfd_poll() succeeds when the task exits and becomes a zombie (iow, passes exit_notify()), even if it is a leader and thread-group is not empty. This means that the behaviour of pidfd_poll(PIDFD_THREAD, pid-of-group-leader) is not well defined if it races with exec() from its sub-thread; pidfd_poll() can succeed or not depending on whether pidfd_task_exited() is called before or after exchange_tids(). Perhaps we can improve this behaviour later, pidfd_poll() can probably take sig->group_exec_task into account. But this doesn't really differ from the case when the leader exits before other threads (so pidfd_poll() succeeds) and then another thread execs and pidfd_poll() will block again. thread_group_exited() is no longer used, perhaps it can die. Co-developed-by: Tycho Andersen Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240131132602.GA23641@redhat.com Tested-by: Tycho Andersen Reviewed-by: Tycho Andersen Signed-off-by: Christian Brauner --- fs/exec.c | 6 +++++- include/linux/pid.h | 3 ++- include/uapi/linux/pidfd.h | 3 ++- kernel/exit.c | 7 +++++++ kernel/fork.c | 38 +++++++++++++++++++++++++++++++------- kernel/pid.c | 14 +++----------- kernel/signal.c | 6 ++++-- 7 files changed, 54 insertions(+), 23 deletions(-) (limited to 'include/uapi') diff --git a/fs/exec.c b/fs/exec.c index 8cdd5b2dd09c..b68f61bbcaa8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1143,7 +1143,11 @@ static int de_thread(struct task_struct *tsk) BUG_ON(leader->exit_state != EXIT_ZOMBIE); leader->exit_state = EXIT_DEAD; - + /* + * leader and tsk exhanged their pids, the old pid dies, + * wake up the PIDFD_THREAD waiters. + */ + do_notify_pidfd(leader); /* * We are going to release_task()->ptrace_unlink() silently, * the tracer can sleep in do_wait(). EXIT_DEAD guarantees diff --git a/include/linux/pid.h b/include/linux/pid.h index e6a041cb8bac..8124d57752b9 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -70,10 +70,11 @@ extern const struct file_operations pidfd_fops; struct file; -extern struct pid *pidfd_pid(const struct file *file); +struct pid *pidfd_pid(const struct file *file); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags); struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags); int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret); +void do_notify_pidfd(struct task_struct *task); static inline struct pid *get_pid(struct pid *pid) { diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index 5406fbc13074..2e6461459877 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -7,6 +7,7 @@ #include /* Flags for pidfd_open(). */ -#define PIDFD_NONBLOCK O_NONBLOCK +#define PIDFD_NONBLOCK O_NONBLOCK +#define PIDFD_THREAD O_EXCL #endif /* _UAPI_LINUX_PIDFD_H */ diff --git a/kernel/exit.c b/kernel/exit.c index 3988a02efaef..c038d10dfb38 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -739,6 +739,13 @@ static void exit_notify(struct task_struct *tsk, int group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); tsk->exit_state = EXIT_ZOMBIE; + /* + * sub-thread or delay_group_leader(), wake up the + * PIDFD_THREAD waiters. + */ + if (!thread_group_empty(tsk)) + do_notify_pidfd(tsk); + if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && diff --git a/kernel/fork.c b/kernel/fork.c index 726a92043531..1a9b91055916 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -101,6 +101,7 @@ #include #include #include +#include #include #include @@ -2050,6 +2051,8 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) seq_put_decimal_ll(m, "Pid:\t", nr); + /* TODO: report PIDFD_THREAD */ + #ifdef CONFIG_PID_NS seq_put_decimal_ll(m, "\nNSpid:\t", nr); if (nr > 0) { @@ -2068,22 +2071,35 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) } #endif +static bool pidfd_task_exited(struct pid *pid, bool thread) +{ + struct task_struct *task; + bool exited; + + rcu_read_lock(); + task = pid_task(pid, PIDTYPE_PID); + exited = !task || + (READ_ONCE(task->exit_state) && (thread || thread_group_empty(task))); + rcu_read_unlock(); + + return exited; +} + /* * Poll support for process exit notification. */ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) { struct pid *pid = file->private_data; + bool thread = file->f_flags & PIDFD_THREAD; __poll_t poll_flags = 0; poll_wait(file, &pid->wait_pidfd, pts); - /* - * Inform pollers only when the whole thread group exits. - * If the thread group leader exits before all other threads in the - * group, then poll(2) should block, similar to the wait(2) family. + * Depending on PIDFD_THREAD, inform pollers when the thread + * or the whole thread-group exits. */ - if (thread_group_exited(pid)) + if (pidfd_task_exited(pid, thread)) poll_flags = EPOLLIN | EPOLLRDNORM; return poll_flags; @@ -2141,6 +2157,11 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re return PTR_ERR(pidfd_file); } get_pid(pid); /* held by pidfd_file now */ + /* + * anon_inode_getfile() ignores everything outside of the + * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually. + */ + pidfd_file->f_flags |= (flags & PIDFD_THREAD); *ret = pidfd_file; return pidfd; } @@ -2154,7 +2175,8 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re * Allocate a new file that stashes @pid and reserve a new pidfd number in the * caller's file descriptor table. The pidfd is reserved but not installed yet. * - * The helper verifies that @pid is used as a thread group leader. + * The helper verifies that @pid is still in use, without PIDFD_THREAD the + * task identified by @pid must be a thread-group leader. * * If this function returns successfully the caller is responsible to either * call fd_install() passing the returned pidfd and pidfd file as arguments in @@ -2173,7 +2195,9 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re */ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) { - if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) + bool thread = flags & PIDFD_THREAD; + + if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID)) return -EINVAL; return __pidfd_prepare(pid, flags, ret); diff --git a/kernel/pid.c b/kernel/pid.c index c7a3e359f8f5..e11144466828 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -552,11 +552,6 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) * Return the task associated with @pidfd. The function takes a reference on * the returned task. The caller is responsible for releasing that reference. * - * Currently, the process identified by @pidfd is always a thread-group leader. - * This restriction currently exists for all aspects of pidfds including pidfd - * creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling - * (only supports thread group leaders). - * * Return: On success, the task_struct associated with the pidfd. * On error, a negative errno number will be returned. */ @@ -615,11 +610,8 @@ static int pidfd_create(struct pid *pid, unsigned int flags) * @flags: flags to pass * * This creates a new pid file descriptor with the O_CLOEXEC flag set for - * the process identified by @pid. Currently, the process identified by - * @pid must be a thread-group leader. This restriction currently exists - * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot - * be used with CLONE_THREAD) and pidfd polling (only supports thread group - * leaders). + * the task identified by @pid. Without PIDFD_THREAD flag the target task + * must be a thread-group leader. * * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. @@ -629,7 +621,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) int fd; struct pid *p; - if (flags & ~PIDFD_NONBLOCK) + if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD)) return -EINVAL; if (pid <= 0) diff --git a/kernel/signal.c b/kernel/signal.c index 9561a3962ca6..9b40109f0c56 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2019,7 +2019,7 @@ ret: return ret; } -static void do_notify_pidfd(struct task_struct *task) +void do_notify_pidfd(struct task_struct *task) { struct pid *pid; @@ -2051,7 +2051,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig) WARN_ON_ONCE(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); /* - * tsk is a group leader and has no threads, wake up the pidfd waiters. + * tsk is a group leader and has no threads, wake up the + * non-PIDFD_THREAD waiters. */ if (thread_group_empty(tsk)) do_notify_pidfd(tsk); @@ -3926,6 +3927,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, prepare_kill_siginfo(sig, &kinfo); } + /* TODO: respect PIDFD_THREAD */ ret = kill_pid_info(sig, &kinfo, pid); err: -- cgit v1.2.3 From 41bcbe59c3b3fa7171dd2e3a365e6d5154198f30 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Feb 2024 21:56:17 -0500 Subject: fs: FS_IOC_GETUUID Add a new generic ioctls for querying the filesystem UUID. These are lifted versions of the ext4 ioctls, with one change: we're not using a flexible array member, because UUIDs will never be more than 16 bytes. This patch adds a generic implementation of FS_IOC_GETFSUUID, which reads from super_block->s_uuid. We're not lifting SETFSUUID from ext4 - that can be done on offline filesystems by the people who need it, trying to do it online is just asking for too much trouble. Cc: Christian Brauner Cc: Jan Kara Cc: Dave Chinner Cc: Darrick J. Wong Cc: Theodore Ts'o Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Kent Overstreet Link: https://lore.kernel.org/r/20240207025624.1019754-4-kent.overstreet@linux.dev Signed-off-by: Christian Brauner --- Documentation/userspace-api/ioctl/ioctl-number.rst | 3 ++- fs/ioctl.c | 16 ++++++++++++++++ include/uapi/linux/fs.h | 15 +++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 457e16f06e04..3731ecf1e437 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -82,8 +82,9 @@ Code Seq# Include File Comments 0x10 00-0F drivers/char/s390/vmcp.h 0x10 10-1F arch/s390/include/uapi/sclp_ctl.h 0x10 20-2F arch/s390/include/uapi/asm/hypfs.h -0x12 all linux/fs.h +0x12 all linux/fs.h BLK* ioctls linux/blkpg.h +0x15 all linux/fs.h FS_IOC_* ioctls 0x1b all InfiniBand Subsystem 0x20 all drivers/cdrom/cm206.h diff --git a/fs/ioctl.c b/fs/ioctl.c index 76cf22ac97d7..74eab9549383 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -763,6 +763,19 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp) return err; } +static int ioctl_getfsuuid(struct file *file, void __user *argp) +{ + struct super_block *sb = file_inode(file)->i_sb; + struct fsuuid2 u = { .len = sb->s_uuid_len, }; + + if (!sb->s_uuid_len) + return -ENOIOCTLCMD; + + memcpy(&u.uuid[0], &sb->s_uuid, sb->s_uuid_len); + + return copy_to_user(argp, &u, sizeof(u)) ? -EFAULT : 0; +} + /* * do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d. * It's just a simple helper for sys_ioctl and compat_sys_ioctl. @@ -845,6 +858,9 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, case FS_IOC_FSSETXATTR: return ioctl_fssetxattr(filp, argp); + case FS_IOC_GETFSUUID: + return ioctl_getfsuuid(filp, argp); + default: if (S_ISREG(inode->i_mode)) return file_ioctl(filp, cmd, argp); diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 48ad69f7722e..40880ce99d02 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -64,6 +64,19 @@ struct fstrim_range { __u64 minlen; }; +/* + * We include a length field because some filesystems (vfat) have an identifier + * that we do want to expose as a UUID, but doesn't have the standard length. + * + * We use a fixed size buffer beacuse this interface will, by fiat, never + * support "UUIDs" longer than 16 bytes; we don't want to force all downstream + * users to have to deal with that. + */ +struct fsuuid2 { + __u8 len; + __u8 uuid[16]; +}; + /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */ #define FILE_DEDUPE_RANGE_SAME 0 #define FILE_DEDUPE_RANGE_DIFFERS 1 @@ -215,6 +228,8 @@ struct fsxattr { #define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) #define FS_IOC_GETFSLABEL _IOR(0x94, 49, char[FSLABEL_MAX]) #define FS_IOC_SETFSLABEL _IOW(0x94, 50, char[FSLABEL_MAX]) +/* Returns the external filesystem UUID, the same one blkid returns */ +#define FS_IOC_GETFSUUID _IOR(0x15, 0, struct fsuuid2) /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) -- cgit v1.2.3 From b4bb1900c12e6a0fe11ff51e1aa6eea19a4ad635 Mon Sep 17 00:00:00 2001 From: Tony Solomonik Date: Fri, 2 Feb 2024 14:17:24 +0200 Subject: io_uring: add support for ftruncate Adds support for doing truncate through io_uring, eliminating the need for applications to roll their own thread pool or offload mechanism to be able to do non-blocking truncates. Signed-off-by: Tony Solomonik Link: https://lore.kernel.org/r/20240202121724.17461-3-tony.solomonik@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + io_uring/Makefile | 2 +- io_uring/opdef.c | 10 +++++++++ io_uring/truncate.c | 48 +++++++++++++++++++++++++++++++++++++++++++ io_uring/truncate.h | 4 ++++ 5 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 io_uring/truncate.c create mode 100644 io_uring/truncate.h (limited to 'include/uapi') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 7a673b52827b..fb812f1b6bb5 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -255,6 +255,7 @@ enum io_uring_op { IORING_OP_FUTEX_WAKE, IORING_OP_FUTEX_WAITV, IORING_OP_FIXED_FD_INSTALL, + IORING_OP_FTRUNCATE, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/io_uring/Makefile b/io_uring/Makefile index 2cdc51825405..42afe4a1ff31 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -8,6 +8,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ statx.o net.o msg_ring.o timeout.o \ sqpoll.o fdinfo.o tctx.o poll.o \ cancel.o kbuf.o rsrc.o rw.o opdef.o \ - notif.o waitid.o register.o + notif.o waitid.o register.o truncate.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o diff --git a/io_uring/opdef.c b/io_uring/opdef.c index b1ee3a9c3807..9c080aadc5a6 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -35,6 +35,7 @@ #include "rw.h" #include "waitid.h" #include "futex.h" +#include "truncate.h" static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) { @@ -474,6 +475,12 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_install_fixed_fd_prep, .issue = io_install_fixed_fd, }, + [IORING_OP_FTRUNCATE] = { + .needs_file = 1, + .hash_reg_file = 1, + .prep = io_ftruncate_prep, + .issue = io_ftruncate, + }, }; const struct io_cold_def io_cold_defs[] = { @@ -712,6 +719,9 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_FIXED_FD_INSTALL] = { .name = "FIXED_FD_INSTALL", }, + [IORING_OP_FTRUNCATE] = { + .name = "FTRUNCATE", + }, }; const char *io_uring_get_opcode(u8 opcode) diff --git a/io_uring/truncate.c b/io_uring/truncate.c new file mode 100644 index 000000000000..62ee73d34d72 --- /dev/null +++ b/io_uring/truncate.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../fs/internal.h" + +#include "io_uring.h" +#include "truncate.h" + +struct io_ftrunc { + struct file *file; + loff_t len; +}; + +int io_ftruncate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_ftrunc *ft = io_kiocb_to_cmd(req, struct io_ftrunc); + + if (sqe->rw_flags || sqe->addr || sqe->len || sqe->buf_index || + sqe->splice_fd_in || sqe->addr3) + return -EINVAL; + + ft->len = READ_ONCE(sqe->off); + + req->flags |= REQ_F_FORCE_ASYNC; + return 0; +} + +int io_ftruncate(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_ftrunc *ft = io_kiocb_to_cmd(req, struct io_ftrunc); + int ret; + + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); + + ret = do_ftruncate(req->file, ft->len, 1); + + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/truncate.h b/io_uring/truncate.h new file mode 100644 index 000000000000..ec088293a478 --- /dev/null +++ b/io_uring/truncate.h @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_ftruncate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_ftruncate(struct io_kiocb *req, unsigned int issue_flags); -- cgit v1.2.3 From ef1186c1a875bfa8a8cbfc2a9670b14b082187a9 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Thu, 8 Jun 2023 09:38:38 -0700 Subject: io_uring: add register/unregister napi function This adds an api to register and unregister the napi for io-uring. If the arg value is specified when unregistering, the current napi setting for the busy poll timeout is copied into the user structure. If this is not required, NULL can be passed as the arg value. Signed-off-by: Stefan Roesch Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230608163839.2891748-7-shr@devkernel.io Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 12 ++++++++++ io_uring/napi.c | 52 +++++++++++++++++++++++++++++++++++++++++++ io_uring/napi.h | 11 +++++++++ io_uring/register.c | 13 +++++++++++ 4 files changed, 88 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index fb812f1b6bb5..7bd10201a02b 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -571,6 +571,10 @@ enum { /* return status information for a buffer group */ IORING_REGISTER_PBUF_STATUS = 26, + /* set/clear busy poll settings */ + IORING_REGISTER_NAPI = 27, + IORING_UNREGISTER_NAPI = 28, + /* this goes last */ IORING_REGISTER_LAST, @@ -704,6 +708,14 @@ struct io_uring_buf_status { __u32 resv[8]; }; +/* argument for IORING_(UN)REGISTER_NAPI */ +struct io_uring_napi { + __u32 busy_poll_to; + __u8 prefer_busy_poll; + __u8 pad[3]; + __u64 resv; +}; + /* * io_uring_restriction->opcode values */ diff --git a/io_uring/napi.c b/io_uring/napi.c index 3e578df36cc5..8ec016899539 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -207,6 +207,58 @@ void io_napi_free(struct io_ring_ctx *ctx) spin_unlock(&ctx->napi_lock); } +/* + * io_napi_register() - Register napi with io-uring + * @ctx: pointer to io-uring context structure + * @arg: pointer to io_uring_napi structure + * + * Register napi in the io-uring context. + */ +int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) +{ + const struct io_uring_napi curr = { + .busy_poll_to = ctx->napi_busy_poll_to, + .prefer_busy_poll = ctx->napi_prefer_busy_poll + }; + struct io_uring_napi napi; + + if (copy_from_user(&napi, arg, sizeof(napi))) + return -EFAULT; + if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv) + return -EINVAL; + + WRITE_ONCE(ctx->napi_busy_poll_to, napi.busy_poll_to); + WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll); + + if (copy_to_user(arg, &curr, sizeof(curr))) + return -EFAULT; + + return 0; +} + +/* + * io_napi_unregister() - Unregister napi with io-uring + * @ctx: pointer to io-uring context structure + * @arg: pointer to io_uring_napi structure + * + * Unregister napi. If arg has been specified copy the busy poll timeout and + * prefer busy poll setting to the passed in structure. + */ +int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) +{ + const struct io_uring_napi curr = { + .busy_poll_to = ctx->napi_busy_poll_to, + .prefer_busy_poll = ctx->napi_prefer_busy_poll + }; + + if (arg && copy_to_user(arg, &curr, sizeof(curr))) + return -EFAULT; + + WRITE_ONCE(ctx->napi_busy_poll_to, 0); + WRITE_ONCE(ctx->napi_prefer_busy_poll, false); + return 0; +} + /* * __io_napi_adjust_timeout() - Add napi id to the busy poll list * @ctx: pointer to io-uring context structure diff --git a/io_uring/napi.h b/io_uring/napi.h index b6d6243fc7fe..6fc0393d0dbe 100644 --- a/io_uring/napi.h +++ b/io_uring/napi.h @@ -12,6 +12,9 @@ void io_napi_init(struct io_ring_ctx *ctx); void io_napi_free(struct io_ring_ctx *ctx); +int io_register_napi(struct io_ring_ctx *ctx, void __user *arg); +int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg); + void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock); void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, @@ -68,6 +71,14 @@ static inline void io_napi_init(struct io_ring_ctx *ctx) static inline void io_napi_free(struct io_ring_ctx *ctx) { } +static inline int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) +{ + return -EOPNOTSUPP; +} +static inline int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) +{ + return -EOPNOTSUPP; +} static inline bool io_napi(struct io_ring_ctx *ctx) { return false; diff --git a/io_uring/register.c b/io_uring/register.c index 5e62c1208996..99c37775f974 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -26,6 +26,7 @@ #include "register.h" #include "cancel.h" #include "kbuf.h" +#include "napi.h" #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) @@ -550,6 +551,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_pbuf_status(ctx, arg); break; + case IORING_REGISTER_NAPI: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_napi(ctx, arg); + break; + case IORING_UNREGISTER_NAPI: + ret = -EINVAL; + if (nr_args != 1) + break; + ret = io_unregister_napi(ctx, arg); + break; default: ret = -EINVAL; break; -- cgit v1.2.3 From ae8c511757304e0c393661b5ed2ad7073e2a351d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Feb 2024 21:56:19 -0500 Subject: fs: add FS_IOC_GETFSSYSFSPATH Add a new ioctl for getting the sysfs name of a filesystem - the path under /sys/fs. This is going to let us standardize exporting data from sysfs across filesystems, e.g. time stats. The returned path will always be of the form "$FSTYP/$SYSFS_IDENTIFIER", where the sysfs identifier may be a UUID (for bcachefs) or a device name (xfs). Cc: Christian Brauner Cc: Jan Kara Cc: Dave Chinner Cc: Darrick J. Wong Cc: Theodore Ts'o Cc: Josef Bacik Signed-off-by: Kent Overstreet Link: https://lore.kernel.org/r/20240207025624.1019754-6-kent.overstreet@linux.dev Signed-off-by: Christian Brauner --- fs/ioctl.c | 17 +++++++++++++++++ include/linux/fs.h | 43 +++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/fs.h | 10 ++++++++++ 3 files changed, 70 insertions(+) (limited to 'include/uapi') diff --git a/fs/ioctl.c b/fs/ioctl.c index 74eab9549383..1d5abfdf0f22 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -776,6 +776,20 @@ static int ioctl_getfsuuid(struct file *file, void __user *argp) return copy_to_user(argp, &u, sizeof(u)) ? -EFAULT : 0; } +static int ioctl_get_fs_sysfs_path(struct file *file, void __user *argp) +{ + struct super_block *sb = file_inode(file)->i_sb; + + if (!strlen(sb->s_sysfs_name)) + return -ENOIOCTLCMD; + + struct fs_sysfs_path u = {}; + + u.len = scnprintf(u.name, sizeof(u.name), "%s/%s", sb->s_type->name, sb->s_sysfs_name); + + return copy_to_user(argp, &u, sizeof(u)) ? -EFAULT : 0; +} + /* * do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d. * It's just a simple helper for sys_ioctl and compat_sys_ioctl. @@ -861,6 +875,9 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, case FS_IOC_GETFSUUID: return ioctl_getfsuuid(filp, argp); + case FS_IOC_GETFSSYSFSPATH: + return ioctl_get_fs_sysfs_path(filp, argp); + default: if (S_ISREG(inode->i_mode)) return file_ioctl(filp, cmd, argp); diff --git a/include/linux/fs.h b/include/linux/fs.h index acdc56987cb1..c6d9e1b7032c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1255,10 +1255,23 @@ struct super_block { struct fsnotify_mark_connector __rcu *s_fsnotify_marks; #endif + /* + * q: why are s_id and s_sysfs_name not the same? both are human + * readable strings that identify the filesystem + * a: s_id is allowed to change at runtime; it's used in log messages, + * and we want to when a device starts out as single device (s_id is dev + * name) but then a device is hot added and we have to switch to + * identifying it by UUID + * but s_sysfs_name is a handle for programmatic access, and can't + * change at runtime + */ char s_id[32]; /* Informational name */ uuid_t s_uuid; /* UUID */ u8 s_uuid_len; /* Default 16, possibly smaller for weird filesystems */ + /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */ + char s_sysfs_name[UUID_STRING_LEN + 1]; + unsigned int s_max_links; /* @@ -2541,6 +2554,36 @@ static inline void super_set_uuid(struct super_block *sb, const u8 *uuid, unsign memcpy(&sb->s_uuid, uuid, len); } +/* set sb sysfs name based on sb->s_bdev */ +static inline void super_set_sysfs_name_bdev(struct super_block *sb) +{ + snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pg", sb->s_bdev); +} + +/* set sb sysfs name based on sb->s_uuid */ +static inline void super_set_sysfs_name_uuid(struct super_block *sb) +{ + WARN_ON(sb->s_uuid_len != sizeof(sb->s_uuid)); + snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pU", sb->s_uuid.b); +} + +/* set sb sysfs name based on sb->s_id */ +static inline void super_set_sysfs_name_id(struct super_block *sb) +{ + strscpy(sb->s_sysfs_name, sb->s_id, sizeof(sb->s_sysfs_name)); +} + +/* try to use something standard before you use this */ +__printf(2, 3) +static inline void super_set_sysfs_name_generic(struct super_block *sb, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vsnprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), fmt, args); + va_end(args); +} + extern int current_umask(void); extern void ihold(struct inode * inode); diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 40880ce99d02..dcd6c8c7d2c7 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -77,6 +77,11 @@ struct fsuuid2 { __u8 uuid[16]; }; +struct fs_sysfs_path { + __u8 len; + __u8 name[128]; +}; + /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */ #define FILE_DEDUPE_RANGE_SAME 0 #define FILE_DEDUPE_RANGE_DIFFERS 1 @@ -230,6 +235,11 @@ struct fsxattr { #define FS_IOC_SETFSLABEL _IOW(0x94, 50, char[FSLABEL_MAX]) /* Returns the external filesystem UUID, the same one blkid returns */ #define FS_IOC_GETFSUUID _IOR(0x15, 0, struct fsuuid2) +/* + * Returns the path component under /sys/fs/ that refers to this filesystem; + * also /sys/kernel/debug/ for filesystems with debugfs exports + */ +#define FS_IOC_GETFSSYSFSPATH _IOR(0x15, 1, struct fs_sysfs_path) /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) -- cgit v1.2.3 From e1fb1dc08e73466830612bcf2f9f72180965c9ba Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 9 Feb 2024 15:49:45 +0100 Subject: pidfd: allow to override signal scope in pidfd_send_signal() Right now we determine the scope of the signal based on the type of pidfd. There are use-cases where it's useful to override the scope of the signal. For example in [1]. Add flags to determine the scope of the signal: (1) PIDFD_SIGNAL_THREAD: send signal to specific thread reference by @pidfd (2) PIDFD_SIGNAL_THREAD_GROUP: send signal to thread-group of @pidfd (2) PIDFD_SIGNAL_PROCESS_GROUP: send signal to process-group of @pidfd Since we now allow specifying PIDFD_SEND_PROCESS_GROUP for pidfd_send_signal() to send signals to process groups we need to adjust the check restricting si_code emulation by userspace to account for PIDTYPE_PGID. Reviewed-by: Oleg Nesterov Link: https://github.com/systemd/systemd/issues/31093 [1] Link: https://lore.kernel.org/r/20240210-chihuahua-hinzog-3945b6abd44a@brauner Link: https://lore.kernel.org/r/20240214123655.GB16265@redhat.com Signed-off-by: Christian Brauner --- include/uapi/linux/pidfd.h | 5 +++++ kernel/signal.c | 46 +++++++++++++++++++++++++++++++++++++--------- 2 files changed, 42 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index 2e6461459877..72ec000a97cd 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -10,4 +10,9 @@ #define PIDFD_NONBLOCK O_NONBLOCK #define PIDFD_THREAD O_EXCL +/* Flags for pidfd_send_signal(). */ +#define PIDFD_SIGNAL_THREAD (1UL << 0) +#define PIDFD_SIGNAL_THREAD_GROUP (1UL << 1) +#define PIDFD_SIGNAL_PROCESS_GROUP (1UL << 2) + #endif /* _UAPI_LINUX_PIDFD_H */ diff --git a/kernel/signal.c b/kernel/signal.c index 8b8169623850..bdca529f0f7b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1905,16 +1905,19 @@ int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno, return send_sig_info(info.si_signo, &info, t); } -int kill_pgrp(struct pid *pid, int sig, int priv) +static int kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) { int ret; - read_lock(&tasklist_lock); - ret = __kill_pgrp_info(sig, __si_special(priv), pid); + ret = __kill_pgrp_info(sig, info, pgrp); read_unlock(&tasklist_lock); - return ret; } + +int kill_pgrp(struct pid *pid, int sig, int priv) +{ + return kill_pgrp_info(sig, __si_special(priv), pid); +} EXPORT_SYMBOL(kill_pgrp); int kill_pid(struct pid *pid, int sig, int priv) @@ -3873,6 +3876,10 @@ static struct pid *pidfd_to_pid(const struct file *file) return tgid_pidfd_to_pid(file); } +#define PIDFD_SEND_SIGNAL_FLAGS \ + (PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \ + PIDFD_SIGNAL_PROCESS_GROUP) + /** * sys_pidfd_send_signal - Signal a process through a pidfd * @pidfd: file descriptor of the process @@ -3897,7 +3904,11 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, enum pid_type type; /* Enforce flags be set to 0 until we add an extension. */ - if (flags) + if (flags & ~PIDFD_SEND_SIGNAL_FLAGS) + return -EINVAL; + + /* Ensure that only a single signal scope determining flag is set. */ + if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1) return -EINVAL; f = fdget(pidfd); @@ -3915,10 +3926,24 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, if (!access_pidfd_pidns(pid)) goto err; - if (f.file->f_flags & PIDFD_THREAD) + switch (flags) { + case 0: + /* Infer scope from the type of pidfd. */ + if (f.file->f_flags & PIDFD_THREAD) + type = PIDTYPE_PID; + else + type = PIDTYPE_TGID; + break; + case PIDFD_SIGNAL_THREAD: type = PIDTYPE_PID; - else + break; + case PIDFD_SIGNAL_THREAD_GROUP: type = PIDTYPE_TGID; + break; + case PIDFD_SIGNAL_PROCESS_GROUP: + type = PIDTYPE_PGID; + break; + } if (info) { ret = copy_siginfo_from_user_any(&kinfo, info); @@ -3931,14 +3956,17 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, /* Only allow sending arbitrary signals to yourself. */ ret = -EPERM; - if ((task_pid(current) != pid) && + if ((task_pid(current) != pid || type > PIDTYPE_TGID) && (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) goto err; } else { prepare_kill_siginfo(sig, &kinfo, type); } - ret = kill_pid_info_type(sig, &kinfo, pid, type); + if (type == PIDTYPE_PGID) + ret = kill_pgrp_info(sig, &kinfo, pid); + else + ret = kill_pid_info_type(sig, &kinfo, pid, type); err: fdput(f); return ret; -- cgit v1.2.3 From 85df6b5a6658c788d7e27d52ea334aa83abcbf56 Mon Sep 17 00:00:00 2001 From: Jaroslav Kysela Date: Thu, 22 Feb 2024 18:36:49 +0100 Subject: ALSA: pcm: clarify and fix default msbits value for all formats Return used most significant bits from sample bit-width rather than the whole physical sample word size. The starting bit offset is defined in the format itself. The behaviour is not changed for 32-bit formats like S32_LE. But with this change - msbits value 24 instead 32 is returned for 24-bit formats like S24_LE etc. Also, commit 2112aa034907 ("ALSA: pcm: Introduce MSBITS subformat interface") compares sample bit-width not physical sample bit-width to reset MSBITS_MAX bit from the subformat bitmask. Probably no applications are using msbits value for other than S32_LE/U32_LE formats, because no drivers are reducing msbits value for other formats (with the msb offset) at the moment. For sanity, increase PCM protocol version, letting the user space to detect the changed behaviour. Signed-off-by: Jaroslav Kysela Link: https://lore.kernel.org/r/20240222173649.1447549-1-perex@perex.cz Signed-off-by: Takashi Iwai --- include/uapi/sound/asound.h | 4 ++-- sound/core/pcm_native.c | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h index d5b9cfbd9cea..628d46a0da92 100644 --- a/include/uapi/sound/asound.h +++ b/include/uapi/sound/asound.h @@ -142,7 +142,7 @@ struct snd_hwdep_dsp_image { * * *****************************************************************************/ -#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 16) +#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 17) typedef unsigned long snd_pcm_uframes_t; typedef signed long snd_pcm_sframes_t; @@ -416,7 +416,7 @@ struct snd_pcm_hw_params { unsigned int rmask; /* W: requested masks */ unsigned int cmask; /* R: changed masks */ unsigned int info; /* R: Info flags for returned setup */ - unsigned int msbits; /* R: used most significant bits */ + unsigned int msbits; /* R: used most significant bits (in sample bit-width) */ unsigned int rate_num; /* R: rate numerator */ unsigned int rate_den; /* R: rate denominator */ snd_pcm_uframes_t fifo_size; /* R: chip FIFO size in frames */ diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index f5ff00f99788..21baf6bf7e25 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -486,6 +486,11 @@ static int fixup_unreferenced_params(struct snd_pcm_substream *substream, i = hw_param_interval_c(params, SNDRV_PCM_HW_PARAM_SAMPLE_BITS); if (snd_interval_single(i)) params->msbits = snd_interval_value(i); + m = hw_param_mask_c(params, SNDRV_PCM_HW_PARAM_FORMAT); + if (snd_mask_single(m)) { + snd_pcm_format_t format = (__force snd_pcm_format_t)snd_mask_min(m); + params->msbits = snd_pcm_format_width(format); + } } if (params->msbits) { -- cgit v1.2.3 From 6a2008641920a9c6fe1abbeb9acbec463215d505 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Mon, 26 Feb 2024 13:49:21 +0100 Subject: uapi: in6: replace temporary label with rfc9486 Not really a fix per se, but IPV6_TLV_IOAM is still tagged as "TEMPORARY IANA allocation for IOAM", while RFC 9486 is available for some time now. Just update the reference. Fixes: 9ee11f0fff20 ("ipv6: ioam: Data plane support for Pre-allocated Trace") Signed-off-by: Justin Iurman Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240226124921.9097-1-justin.iurman@uliege.be Signed-off-by: Jakub Kicinski --- include/uapi/linux/in6.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index c4c53a9ab959..ff8d21f9e95b 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -145,7 +145,7 @@ struct in6_flowlabel_req { #define IPV6_TLV_PADN 1 #define IPV6_TLV_ROUTERALERT 5 #define IPV6_TLV_CALIPSO 7 /* RFC 5570 */ -#define IPV6_TLV_IOAM 49 /* TEMPORARY IANA allocation for IOAM */ +#define IPV6_TLV_IOAM 49 /* RFC 9486 */ #define IPV6_TLV_JUMBO 194 #define IPV6_TLV_HAO 201 /* home address option */ -- cgit v1.2.3 From 13fe8e6825e44129b6cbeee41d3012554bf8d687 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 23 Feb 2024 15:55:39 +0800 Subject: ublk: add UBLK_CMD_DEL_DEV_ASYNC The current command UBLK_CMD_DEL_DEV won't return until the device is released, this way looks more reliable, but makes userspace more difficult to implement, especially about orders: unmap command buffer(which holds one ublkc reference), ublkc close, io_uring_file_unregister, ublkb close. Add UBLK_CMD_DEL_DEV_ASYNC so that device deletion won't wait release, then userspace needn't worry about the above order. Actually both loop and nbd is deleted in this async way. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20240223075539.89945-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 9 ++++++--- include/uapi/linux/ublk_cmd.h | 2 ++ 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 06d88d2008ba..bea3d5cf8a83 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2468,7 +2468,7 @@ static inline bool ublk_idr_freed(int id) return ptr == NULL; } -static int ublk_ctrl_del_dev(struct ublk_device **p_ub) +static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait) { struct ublk_device *ub = *p_ub; int idx = ub->ub_number; @@ -2502,7 +2502,7 @@ static int ublk_ctrl_del_dev(struct ublk_device **p_ub) * - the device number is freed already, we will not find this * device via ublk_get_device_from_id() */ - if (wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx))) + if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx))) return -EINTR; return 0; } @@ -2901,7 +2901,10 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, ret = ublk_ctrl_add_dev(cmd); break; case UBLK_CMD_DEL_DEV: - ret = ublk_ctrl_del_dev(&ub); + ret = ublk_ctrl_del_dev(&ub, true); + break; + case UBLK_U_CMD_DEL_DEV_ASYNC: + ret = ublk_ctrl_del_dev(&ub, false); break; case UBLK_CMD_GET_QUEUE_AFFINITY: ret = ublk_ctrl_get_queue_affinity(ub, cmd); diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index b9cfc5c96268..c8dc5f8ea699 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -49,6 +49,8 @@ _IOR('u', UBLK_CMD_GET_DEV_INFO2, struct ublksrv_ctrl_cmd) #define UBLK_U_CMD_GET_FEATURES \ _IOR('u', 0x13, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_DEL_DEV_ASYNC \ + _IOR('u', 0x14, struct ublksrv_ctrl_cmd) /* * 64bits are enough now, and it should be easy to extend in case of -- cgit v1.2.3 From 7e10d87e63f7f9c324d533bb4369e35bb19ab9a9 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Wed, 21 Feb 2024 14:30:18 +0100 Subject: drm/xe: Add uapi for dumpable bos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the flag XE_VM_BIND_FLAG_DUMPABLE to notify devcoredump that this mapping should be dumped. This is not hooked up, but the uapi should be ready before merging. It's likely easier to dump the contents of the bo's at devcoredump readout time, so it's better if the bos will stay unmodified after a hang. The NEEDS_CPU_MAPPING flag is removed as requirement. Signed-off-by: Maarten Lankhorst Reviewed-by: José Roberto de Souza Link: https://patchwork.freedesktop.org/patch/msgid/20240221133024.898315-3-maarten.lankhorst@linux.intel.com (cherry picked from commit 76a86b58d2b3de31e88acb487ebfa0c3cc7c41d2) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_vm.c | 3 ++- include/uapi/drm/xe_drm.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 921ca28d49dd..945c89b5e4b5 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -2722,7 +2722,8 @@ static int vm_bind_ioctl_ops_execute(struct xe_vm *vm, #define SUPPORTED_FLAGS \ (DRM_XE_VM_BIND_FLAG_READONLY | \ - DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL) + DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | \ + DRM_XE_VM_BIND_FLAG_DUMPABLE) #define XE_64K_PAGE_MASK 0xffffull #define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP) diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h index 6d11ee9e571a..4f010949f812 100644 --- a/include/uapi/drm/xe_drm.h +++ b/include/uapi/drm/xe_drm.h @@ -931,6 +931,7 @@ struct drm_xe_vm_bind_op { #define DRM_XE_VM_BIND_FLAG_READONLY (1 << 0) #define DRM_XE_VM_BIND_FLAG_IMMEDIATE (1 << 1) #define DRM_XE_VM_BIND_FLAG_NULL (1 << 2) +#define DRM_XE_VM_BIND_FLAG_DUMPABLE (1 << 3) /** @flags: Bind flags */ __u32 flags; -- cgit v1.2.3 From b6f4fb397db09024c189834d638abbd21bf00769 Mon Sep 17 00:00:00 2001 From: José Roberto de Souza Date: Tue, 26 Dec 2023 08:11:14 -0800 Subject: drm/xe/uapi: Remove DRM_XE_VM_BIND_FLAG_ASYNC comment left over MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a comment left over of commit d3d767396a02 ("drm/xe/uapi: Remove sync binds"). Fixes: d3d767396a02 ("drm/xe/uapi: Remove sync binds") Reviewed-by: Rodrigo Vivi Cc: Matthew Brost Signed-off-by: José Roberto de Souza Link: https://patchwork.freedesktop.org/patch/msgid/20231226172321.61518-1-jose.souza@intel.com (cherry picked from commit f031c3a7af8ea06790dd0a71872c4f0175084baa) Signed-off-by: Thomas Hellström --- include/uapi/drm/xe_drm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h index 4f010949f812..a7274a99d456 100644 --- a/include/uapi/drm/xe_drm.h +++ b/include/uapi/drm/xe_drm.h @@ -832,7 +832,6 @@ struct drm_xe_vm_destroy { * * and the @flags can be: * - %DRM_XE_VM_BIND_FLAG_READONLY - * - %DRM_XE_VM_BIND_FLAG_ASYNC * - %DRM_XE_VM_BIND_FLAG_IMMEDIATE - Valid on a faulting VM only, do the * MAP operation immediately rather than deferring the MAP to the page * fault handler. -- cgit v1.2.3 From eaa367a0317ea4cbc7aa60f25829c89c0e12717b Mon Sep 17 00:00:00 2001 From: Francois Dugast Date: Thu, 22 Feb 2024 18:23:56 -0500 Subject: drm/xe/uapi: Remove unused flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Those cases missed in previous uAPI cleanups were mostly accidentally brought in from i915 or created to exercise the possibilities of gpuvm but they are not used by userspace yet, so let's remove them. They can still be brought back later if needed. v2: - Fix XE_VM_FLAG_FAULT_MODE support in xe_lrc.c (Brian Welty) - Leave DRM_XE_VM_BIND_OP_UNMAP_ALL (José Roberto de Souza) - Ensure invalid flag values are rejected (Rodrigo Vivi) v3: Rebase after removal of persistent exec_queues (Francois Dugast) v4: Rodrigo: Rebase after the new dumpable flag. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: Thomas Hellström Cc: Rodrigo Vivi Signed-off-by: Francois Dugast Reviewed-by: Rodrigo Vivi Signed-off-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20240222232356.175431-1-rodrigo.vivi@intel.com (cherry picked from commit 84a1ed5e67565b09b8fd22a26754d2897de55ce0) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_exec_queue.c | 88 ++------------------------------ drivers/gpu/drm/xe/xe_exec_queue_types.h | 10 ---- drivers/gpu/drm/xe/xe_lrc.c | 10 +--- drivers/gpu/drm/xe/xe_vm.c | 12 +---- drivers/gpu/drm/xe/xe_vm_types.h | 4 -- include/uapi/drm/xe_drm.h | 19 ------- 6 files changed, 6 insertions(+), 137 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c index 3acfd4f07666..49223026c89f 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue.c +++ b/drivers/gpu/drm/xe/xe_exec_queue.c @@ -309,85 +309,6 @@ static int exec_queue_set_timeslice(struct xe_device *xe, struct xe_exec_queue * return q->ops->set_timeslice(q, value); } -static int exec_queue_set_preemption_timeout(struct xe_device *xe, - struct xe_exec_queue *q, u64 value, - bool create) -{ - u32 min = 0, max = 0; - - xe_exec_queue_get_prop_minmax(q->hwe->eclass, - XE_EXEC_QUEUE_PREEMPT_TIMEOUT, &min, &max); - - if (xe_exec_queue_enforce_schedule_limit() && - !xe_hw_engine_timeout_in_range(value, min, max)) - return -EINVAL; - - return q->ops->set_preempt_timeout(q, value); -} - -static int exec_queue_set_job_timeout(struct xe_device *xe, struct xe_exec_queue *q, - u64 value, bool create) -{ - u32 min = 0, max = 0; - - if (XE_IOCTL_DBG(xe, !create)) - return -EINVAL; - - xe_exec_queue_get_prop_minmax(q->hwe->eclass, - XE_EXEC_QUEUE_JOB_TIMEOUT, &min, &max); - - if (xe_exec_queue_enforce_schedule_limit() && - !xe_hw_engine_timeout_in_range(value, min, max)) - return -EINVAL; - - return q->ops->set_job_timeout(q, value); -} - -static int exec_queue_set_acc_trigger(struct xe_device *xe, struct xe_exec_queue *q, - u64 value, bool create) -{ - if (XE_IOCTL_DBG(xe, !create)) - return -EINVAL; - - if (XE_IOCTL_DBG(xe, !xe->info.has_usm)) - return -EINVAL; - - q->usm.acc_trigger = value; - - return 0; -} - -static int exec_queue_set_acc_notify(struct xe_device *xe, struct xe_exec_queue *q, - u64 value, bool create) -{ - if (XE_IOCTL_DBG(xe, !create)) - return -EINVAL; - - if (XE_IOCTL_DBG(xe, !xe->info.has_usm)) - return -EINVAL; - - q->usm.acc_notify = value; - - return 0; -} - -static int exec_queue_set_acc_granularity(struct xe_device *xe, struct xe_exec_queue *q, - u64 value, bool create) -{ - if (XE_IOCTL_DBG(xe, !create)) - return -EINVAL; - - if (XE_IOCTL_DBG(xe, !xe->info.has_usm)) - return -EINVAL; - - if (value > DRM_XE_ACC_GRANULARITY_64M) - return -EINVAL; - - q->usm.acc_granularity = value; - - return 0; -} - typedef int (*xe_exec_queue_set_property_fn)(struct xe_device *xe, struct xe_exec_queue *q, u64 value, bool create); @@ -395,11 +316,6 @@ typedef int (*xe_exec_queue_set_property_fn)(struct xe_device *xe, static const xe_exec_queue_set_property_fn exec_queue_set_property_funcs[] = { [DRM_XE_EXEC_QUEUE_SET_PROPERTY_PRIORITY] = exec_queue_set_priority, [DRM_XE_EXEC_QUEUE_SET_PROPERTY_TIMESLICE] = exec_queue_set_timeslice, - [DRM_XE_EXEC_QUEUE_SET_PROPERTY_PREEMPTION_TIMEOUT] = exec_queue_set_preemption_timeout, - [DRM_XE_EXEC_QUEUE_SET_PROPERTY_JOB_TIMEOUT] = exec_queue_set_job_timeout, - [DRM_XE_EXEC_QUEUE_SET_PROPERTY_ACC_TRIGGER] = exec_queue_set_acc_trigger, - [DRM_XE_EXEC_QUEUE_SET_PROPERTY_ACC_NOTIFY] = exec_queue_set_acc_notify, - [DRM_XE_EXEC_QUEUE_SET_PROPERTY_ACC_GRANULARITY] = exec_queue_set_acc_granularity, }; static int exec_queue_user_ext_set_property(struct xe_device *xe, @@ -418,7 +334,9 @@ static int exec_queue_user_ext_set_property(struct xe_device *xe, if (XE_IOCTL_DBG(xe, ext.property >= ARRAY_SIZE(exec_queue_set_property_funcs)) || - XE_IOCTL_DBG(xe, ext.pad)) + XE_IOCTL_DBG(xe, ext.pad) || + XE_IOCTL_DBG(xe, ext.property != DRM_XE_EXEC_QUEUE_SET_PROPERTY_PRIORITY && + ext.property != DRM_XE_EXEC_QUEUE_SET_PROPERTY_TIMESLICE)) return -EINVAL; idx = array_index_nospec(ext.property, ARRAY_SIZE(exec_queue_set_property_funcs)); diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h index 947bbc4b285d..36f4901d8d7e 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h @@ -150,16 +150,6 @@ struct xe_exec_queue { spinlock_t lock; } compute; - /** @usm: unified shared memory state */ - struct { - /** @acc_trigger: access counter trigger */ - u32 acc_trigger; - /** @acc_notify: access counter notify */ - u32 acc_notify; - /** @acc_granularity: access counter granularity */ - u32 acc_granularity; - } usm; - /** @ops: submission backend exec queue operations */ const struct xe_exec_queue_ops *ops; diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c index 0ec5ad2539f1..b38319d2801e 100644 --- a/drivers/gpu/drm/xe/xe_lrc.c +++ b/drivers/gpu/drm/xe/xe_lrc.c @@ -682,8 +682,6 @@ static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm) #define PVC_CTX_ASID (0x2e + 1) #define PVC_CTX_ACC_CTR_THOLD (0x2a + 1) -#define ACC_GRANULARITY_S 20 -#define ACC_NOTIFY_S 16 int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, struct xe_exec_queue *q, struct xe_vm *vm, u32 ring_size) @@ -754,13 +752,7 @@ int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe, xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL, RING_CTL_SIZE(lrc->ring.size) | RING_VALID); if (xe->info.has_asid && vm) - xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, - (q->usm.acc_granularity << - ACC_GRANULARITY_S) | vm->usm.asid); - if (xe->info.has_usm && vm) - xe_lrc_write_ctx_reg(lrc, PVC_CTX_ACC_CTR_THOLD, - (q->usm.acc_notify << ACC_NOTIFY_S) | - q->usm.acc_trigger); + xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, vm->usm.asid); lrc->desc = LRC_VALID; lrc->desc |= LRC_LEGACY_64B_CONTEXT << LRC_ADDRESSING_MODE_SHIFT; diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 945c89b5e4b5..1d82616aa935 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -2117,10 +2117,6 @@ vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo, struct xe_vma_op *op = gpuva_op_to_vma_op(__op); if (__op->op == DRM_GPUVA_OP_MAP) { - op->map.immediate = - flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE; - op->map.read_only = - flags & DRM_XE_VM_BIND_FLAG_READONLY; op->map.is_null = flags & DRM_XE_VM_BIND_FLAG_NULL; op->map.pat_index = pat_index; } else if (__op->op == DRM_GPUVA_OP_PREFETCH) { @@ -2313,8 +2309,6 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q, switch (op->base.op) { case DRM_GPUVA_OP_MAP: { - flags |= op->map.read_only ? - VMA_CREATE_FLAG_READ_ONLY : 0; flags |= op->map.is_null ? VMA_CREATE_FLAG_IS_NULL : 0; @@ -2445,7 +2439,7 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm, case DRM_GPUVA_OP_MAP: err = xe_vm_bind(vm, vma, op->q, xe_vma_bo(vma), op->syncs, op->num_syncs, - op->map.immediate || !xe_vm_in_fault_mode(vm), + !xe_vm_in_fault_mode(vm), op->flags & XE_VMA_OP_FIRST, op->flags & XE_VMA_OP_LAST); break; @@ -2720,9 +2714,7 @@ static int vm_bind_ioctl_ops_execute(struct xe_vm *vm, return 0; } -#define SUPPORTED_FLAGS \ - (DRM_XE_VM_BIND_FLAG_READONLY | \ - DRM_XE_VM_BIND_FLAG_IMMEDIATE | DRM_XE_VM_BIND_FLAG_NULL | \ +#define SUPPORTED_FLAGS (DRM_XE_VM_BIND_FLAG_NULL | \ DRM_XE_VM_BIND_FLAG_DUMPABLE) #define XE_64K_PAGE_MASK 0xffffull #define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP) diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h index a603cc2eb56b..0f220b5d2e7b 100644 --- a/drivers/gpu/drm/xe/xe_vm_types.h +++ b/drivers/gpu/drm/xe/xe_vm_types.h @@ -288,10 +288,6 @@ struct xe_vm { struct xe_vma_op_map { /** @vma: VMA to map */ struct xe_vma *vma; - /** @immediate: Immediate bind */ - bool immediate; - /** @read_only: Read only */ - bool read_only; /** @is_null: is NULL binding */ bool is_null; /** @pat_index: The pat index to use for this operation. */ diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h index a7274a99d456..bb0c8a994116 100644 --- a/include/uapi/drm/xe_drm.h +++ b/include/uapi/drm/xe_drm.h @@ -831,10 +831,6 @@ struct drm_xe_vm_destroy { * - %DRM_XE_VM_BIND_OP_PREFETCH * * and the @flags can be: - * - %DRM_XE_VM_BIND_FLAG_READONLY - * - %DRM_XE_VM_BIND_FLAG_IMMEDIATE - Valid on a faulting VM only, do the - * MAP operation immediately rather than deferring the MAP to the page - * fault handler. * - %DRM_XE_VM_BIND_FLAG_NULL - When the NULL flag is set, the page * tables are setup with a special bit which indicates writes are * dropped and all reads return zero. In the future, the NULL flags @@ -927,8 +923,6 @@ struct drm_xe_vm_bind_op { /** @op: Bind operation to perform */ __u32 op; -#define DRM_XE_VM_BIND_FLAG_READONLY (1 << 0) -#define DRM_XE_VM_BIND_FLAG_IMMEDIATE (1 << 1) #define DRM_XE_VM_BIND_FLAG_NULL (1 << 2) #define DRM_XE_VM_BIND_FLAG_DUMPABLE (1 << 3) /** @flags: Bind flags */ @@ -1045,19 +1039,6 @@ struct drm_xe_exec_queue_create { #define DRM_XE_EXEC_QUEUE_EXTENSION_SET_PROPERTY 0 #define DRM_XE_EXEC_QUEUE_SET_PROPERTY_PRIORITY 0 #define DRM_XE_EXEC_QUEUE_SET_PROPERTY_TIMESLICE 1 -#define DRM_XE_EXEC_QUEUE_SET_PROPERTY_PREEMPTION_TIMEOUT 2 -#define DRM_XE_EXEC_QUEUE_SET_PROPERTY_JOB_TIMEOUT 4 -#define DRM_XE_EXEC_QUEUE_SET_PROPERTY_ACC_TRIGGER 5 -#define DRM_XE_EXEC_QUEUE_SET_PROPERTY_ACC_NOTIFY 6 -#define DRM_XE_EXEC_QUEUE_SET_PROPERTY_ACC_GRANULARITY 7 -/* Monitor 128KB contiguous region with 4K sub-granularity */ -#define DRM_XE_ACC_GRANULARITY_128K 0 -/* Monitor 2MB contiguous region with 64KB sub-granularity */ -#define DRM_XE_ACC_GRANULARITY_2M 1 -/* Monitor 16MB contiguous region with 512KB sub-granularity */ -#define DRM_XE_ACC_GRANULARITY_16M 2 -/* Monitor 64MB contiguous region with 2M sub-granularity */ -#define DRM_XE_ACC_GRANULARITY_64M 3 /** @extensions: Pointer to the first extension struct, if any */ __u64 extensions; -- cgit v1.2.3 From cb12fd8e0dabb9a1c8aef55a6a41e2c255fcdf4b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 12 Feb 2024 16:32:38 +0100 Subject: pidfd: add pidfs This moves pidfds from the anonymous inode infrastructure to a tiny pseudo filesystem. This has been on my todo for quite a while as it will unblock further work that we weren't able to do simply because of the very justified limitations of anonymous inodes. Moving pidfds to a tiny pseudo filesystem allows: * statx() on pidfds becomes useful for the first time. * pidfds can be compared simply via statx() and then comparing inode numbers. * pidfds have unique inode numbers for the system lifetime. * struct pid is now stashed in inode->i_private instead of file->private_data. This means it is now possible to introduce concepts that operate on a process once all file descriptors have been closed. A concrete example is kill-on-last-close. * file->private_data is freed up for per-file options for pidfds. * Each struct pid will refer to a different inode but the same struct pid will refer to the same inode if it's opened multiple times. In contrast to now where each struct pid refers to the same inode. Even if we were to move to anon_inode_create_getfile() which creates new inodes we'd still be associating the same struct pid with multiple different inodes. The tiny pseudo filesystem is not visible anywhere in userspace exactly like e.g., pipefs and sockfs. There's no lookup, there's no complex inode operations, nothing. Dentries and inodes are always deleted when the last pidfd is closed. We allocate a new inode for each struct pid and we reuse that inode for all pidfds. We use iget_locked() to find that inode again based on the inode number which isn't recycled. We allocate a new dentry for each pidfd that uses the same inode. That is similar to anonymous inodes which reuse the same inode for thousands of dentries. For pidfds we're talking way less than that. There usually won't be a lot of concurrent openers of the same struct pid. They can probably often be counted on two hands. I know that systemd does use separate pidfd for the same struct pid for various complex process tracking issues. So I think with that things actually become way simpler. Especially because we don't have to care about lookup. Dentries and inodes continue to be always deleted. The code is entirely optional and fairly small. If it's not selected we fallback to anonymous inodes. Heavily inspired by nsfs which uses a similar stashing mechanism just for namespaces. Link: https://lore.kernel.org/r/20240213-vfs-pidfd_fs-v1-2-f863f58cfce1@kernel.org Signed-off-by: Christian Brauner --- fs/Kconfig | 7 ++ fs/pidfs.c | 156 ++++++++++++++++++++++++++++++++++++++++++++- include/linux/pid.h | 5 +- include/linux/pidfs.h | 8 +++ include/uapi/linux/magic.h | 1 + init/main.c | 2 + kernel/fork.c | 13 +--- kernel/nsproxy.c | 2 +- kernel/pid.c | 11 ++++ 9 files changed, 188 insertions(+), 17 deletions(-) create mode 100644 include/linux/pidfs.h (limited to 'include/uapi') diff --git a/fs/Kconfig b/fs/Kconfig index 89fdbefd1075..f3dbd84a0e40 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -174,6 +174,13 @@ source "fs/proc/Kconfig" source "fs/kernfs/Kconfig" source "fs/sysfs/Kconfig" +config FS_PID + bool "Pseudo filesystem for process file descriptors" + depends on 64BIT + default y + help + Pidfs implements advanced features for process file descriptors. + config TMPFS bool "Tmpfs virtual memory file system support (former shm fs)" depends on SHMEM diff --git a/fs/pidfs.c b/fs/pidfs.c index eccb291862a0..6c3f010074af 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -1,9 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include #include #include +#include #include #include #include @@ -14,10 +16,12 @@ static int pidfd_release(struct inode *inode, struct file *file) { +#ifndef CONFIG_FS_PID struct pid *pid = file->private_data; file->private_data = NULL; put_pid(pid); +#endif return 0; } @@ -59,7 +63,7 @@ static int pidfd_release(struct inode *inode, struct file *file) */ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) { - struct pid *pid = f->private_data; + struct pid *pid = pidfd_pid(f); struct pid_namespace *ns; pid_t nr = -1; @@ -93,7 +97,7 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) */ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) { - struct pid *pid = file->private_data; + struct pid *pid = pidfd_pid(file); bool thread = file->f_flags & PIDFD_THREAD; struct task_struct *task; __poll_t poll_flags = 0; @@ -113,10 +117,156 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) return poll_flags; } -const struct file_operations pidfd_fops = { +static const struct file_operations pidfs_file_operations = { .release = pidfd_release, .poll = pidfd_poll, #ifdef CONFIG_PROC_FS .show_fdinfo = pidfd_show_fdinfo, #endif }; + +struct pid *pidfd_pid(const struct file *file) +{ + if (file->f_op != &pidfs_file_operations) + return ERR_PTR(-EBADF); +#ifdef CONFIG_FS_PID + return file_inode(file)->i_private; +#else + return file->private_data; +#endif +} + +#ifdef CONFIG_FS_PID +static struct vfsmount *pidfs_mnt __ro_after_init; +static struct super_block *pidfs_sb __ro_after_init; + +/* + * The vfs falls back to simple_setattr() if i_op->setattr() isn't + * implemented. Let's reject it completely until we have a clean + * permission concept for pidfds. + */ +static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) +{ + return -EOPNOTSUPP; +} + +static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); + return 0; +} + +static const struct inode_operations pidfs_inode_operations = { + .getattr = pidfs_getattr, + .setattr = pidfs_setattr, +}; + +static void pidfs_evict_inode(struct inode *inode) +{ + struct pid *pid = inode->i_private; + + clear_inode(inode); + put_pid(pid); +} + +static const struct super_operations pidfs_sops = { + .drop_inode = generic_delete_inode, + .evict_inode = pidfs_evict_inode, + .statfs = simple_statfs, +}; + +static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) +{ + return dynamic_dname(buffer, buflen, "pidfd:[%lu]", + d_inode(dentry)->i_ino); +} + +static const struct dentry_operations pidfs_dentry_operations = { + .d_delete = always_delete_dentry, + .d_dname = pidfs_dname, +}; + +static int pidfs_init_fs_context(struct fs_context *fc) +{ + struct pseudo_fs_context *ctx; + + ctx = init_pseudo(fc, PID_FS_MAGIC); + if (!ctx) + return -ENOMEM; + + ctx->ops = &pidfs_sops; + ctx->dops = &pidfs_dentry_operations; + return 0; +} + +static struct file_system_type pidfs_type = { + .name = "pidfs", + .init_fs_context = pidfs_init_fs_context, + .kill_sb = kill_anon_super, +}; + +struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) +{ + + struct inode *inode; + struct file *pidfd_file; + + inode = iget_locked(pidfs_sb, pid->ino); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + /* + * Inode numbering for pidfs start at RESERVED_PIDS + 1. + * This avoids collisions with the root inode which is 1 + * for pseudo filesystems. + */ + inode->i_ino = pid->ino; + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_op = &pidfs_inode_operations; + inode->i_fop = &pidfs_file_operations; + inode->i_flags |= S_IMMUTABLE; + inode->i_private = get_pid(pid); + simple_inode_init_ts(inode); + unlock_new_inode(inode); + } + + pidfd_file = alloc_file_pseudo(inode, pidfs_mnt, "", flags, + &pidfs_file_operations); + if (IS_ERR(pidfd_file)) + iput(inode); + + return pidfd_file; +} + +void __init pidfs_init(void) +{ + pidfs_mnt = kern_mount(&pidfs_type); + if (IS_ERR(pidfs_mnt)) + panic("Failed to mount pidfs pseudo filesystem"); + + pidfs_sb = pidfs_mnt->mnt_sb; +} + +#else /* !CONFIG_FS_PID */ + +struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) +{ + struct file *pidfd_file; + + pidfd_file = anon_inode_getfile("[pidfd]", &pidfs_file_operations, pid, + flags | O_RDWR); + if (IS_ERR(pidfd_file)) + return pidfd_file; + + get_pid(pid); + return pidfd_file; +} + +void __init pidfs_init(void) { } +#endif diff --git a/include/linux/pid.h b/include/linux/pid.h index 8124d57752b9..956481128e8d 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -55,6 +55,9 @@ struct pid refcount_t count; unsigned int level; spinlock_t lock; +#ifdef CONFIG_FS_PID + unsigned long ino; +#endif /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; struct hlist_head inodes; @@ -66,8 +69,6 @@ struct pid extern struct pid init_struct_pid; -extern const struct file_operations pidfd_fops; - struct file; struct pid *pidfd_pid(const struct file *file); diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h new file mode 100644 index 000000000000..75bdf9807802 --- /dev/null +++ b/include/linux/pidfs.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PID_FS_H +#define _LINUX_PID_FS_H + +struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); +void __init pidfs_init(void); + +#endif /* _LINUX_PID_FS_H */ diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 6325d1d0e90f..1b40a968ba91 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -101,5 +101,6 @@ #define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */ #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */ #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ +#define PID_FS_MAGIC 0x50494446 /* "PIDF" */ #endif /* __LINUX_MAGIC_H__ */ diff --git a/init/main.c b/init/main.c index e24b0780fdff..2fbf6a3114d5 100644 --- a/init/main.c +++ b/init/main.c @@ -99,6 +99,7 @@ #include #include #include +#include #include #include @@ -1059,6 +1060,7 @@ void start_kernel(void) seq_file_init(); proc_root_init(); nsfs_init(); + pidfs_init(); cpuset_init(); cgroup_init(); taskstats_init_early(); diff --git a/kernel/fork.c b/kernel/fork.c index 662a61f340ce..2f839c290dcf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -102,6 +102,7 @@ #include #include #include +#include #include #include @@ -1985,14 +1986,6 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } -struct pid *pidfd_pid(const struct file *file) -{ - if (file->f_op == &pidfd_fops) - return file->private_data; - - return ERR_PTR(-EBADF); -} - /** * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd * @pid: the struct pid for which to create a pidfd @@ -2030,13 +2023,11 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re if (pidfd < 0) return pidfd; - pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, - flags | O_RDWR); + pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR); if (IS_ERR(pidfd_file)) { put_unused_fd(pidfd); return PTR_ERR(pidfd_file); } - get_pid(pid); /* held by pidfd_file now */ /* * anon_inode_getfile() ignores everything outside of the * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually. diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 15781acaac1c..6ec3deec68c2 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -573,7 +573,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) if (proc_ns_file(f.file)) err = validate_ns(&nsset, ns); else - err = validate_nsset(&nsset, f.file->private_data); + err = validate_nsset(&nsset, pidfd_pid(f.file)); if (!err) { commit_nsset(&nsset); perf_event_namespaces(current); diff --git a/kernel/pid.c b/kernel/pid.c index c1d940fbd314..581cc34341fd 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -65,6 +66,13 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; +#ifdef CONFIG_FS_PID +/* + * Pseudo filesystems start inode numbering after one. We use Reserved + * PIDs as a natural offset. + */ +static u64 pidfs_ino = RESERVED_PIDS; +#endif /* * PID-map pages start out as NULL, they get allocated upon @@ -272,6 +280,9 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; +#ifdef CONFIG_FS_PID + pid->ino = ++pidfs_ino; +#endif for ( ; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); -- cgit v1.2.3