diff options
Diffstat (limited to 'kernel')
312 files changed, 21936 insertions, 11781 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 38ef6d06888e..ce1435cb08b1 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -30,7 +30,7 @@ choice 250 Hz is a good compromise choice allowing server performance while also showing good interactive responsiveness even on SMP and NUMA systems. If you are going to be using NTSC video - or multimedia, selected 300Hz instead. + or multimedia, select 300Hz instead. config HZ_300 bool "300 HZ" diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 4d111f871951..2ee603a98813 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -38,8 +38,7 @@ config KEXEC config KEXEC_FILE bool "Enable kexec file based system call" depends on ARCH_SUPPORTS_KEXEC_FILE - select CRYPTO - select CRYPTO_SHA256 + select CRYPTO_LIB_SHA256 select KEXEC_CORE help This is new version of kexec system call. This system call is @@ -95,6 +94,20 @@ config KEXEC_JUMP Jump between original kernel and kexeced kernel and invoke code in physical address mode via KEXEC +config KEXEC_HANDOVER + bool "kexec handover" + depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE + select MEMBLOCK_KHO_SCRATCH + select KEXEC_FILE + select DEBUG_FS + select LIBFDT + select CMA + help + Allow kexec to hand over state across kernels by generating and + passing additional metadata to the target kernel. This is useful + to keep data or state alive across the kexec. For this to work, + both source and target kernels need to have this option enabled. + config CRASH_DUMP bool "kernel crash dumps" default ARCH_DEFAULT_CRASH_DUMP @@ -116,6 +129,24 @@ config CRASH_DUMP For s390, this option also enables zfcpdump. See also <file:Documentation/arch/s390/zfcpdump.rst> +config CRASH_DM_CRYPT + bool "Support saving crash dump to dm-crypt encrypted volume" + depends on KEXEC_FILE + depends on CRASH_DUMP + depends on DM_CRYPT + depends on KEYS + help + With this option enabled, user space can intereact with + /sys/kernel/config/crash_dm_crypt_keys to make the dm crypt keys + persistent for the dump-capture kernel. + +config CRASH_DM_CRYPT_CONFIGS + def_tristate CRASH_DM_CRYPT + select CONFIGFS_FS + help + CRASH_DM_CRYPT cannot directly select CONFIGFS_FS, because that + is required to be built-in. + config CRASH_HOTPLUG bool "Update the crash elfcorehdr on system configuration changes" default y diff --git a/kernel/Makefile b/kernel/Makefile index 87866b037fbe..32e80dd626af 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,6 +21,11 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE) endif +# Branch profiling isn't noinstr-safe +ifdef CONFIG_TRACE_BRANCH_PROFILING +CFLAGS_context_tracking.o += -DDISABLE_BRANCH_PROFILING +endif + # Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip() # in coverage traces. KCOV_INSTRUMENT_softirq.o := n @@ -72,9 +77,11 @@ obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o elfcorehdr.o obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_CRASH_DUMP) += crash_core.o +obj-$(CONFIG_CRASH_DM_CRYPT) += crash_dump_dm_crypt.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o +obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup/ diff --git a/kernel/acct.c b/kernel/acct.c index 31222e8cd534..6520baa13669 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -103,48 +103,50 @@ struct bsd_acct_struct { atomic_long_t count; struct rcu_head rcu; struct mutex lock; - int active; + bool active; + bool check_space; unsigned long needcheck; struct file *file; struct pid_namespace *ns; struct work_struct work; struct completion done; + acct_t ac; }; -static void do_acct_process(struct bsd_acct_struct *acct); +static void fill_ac(struct bsd_acct_struct *acct); +static void acct_write_process(struct bsd_acct_struct *acct); /* * Check the amount of free space and suspend/resume accordingly. */ -static int check_free_space(struct bsd_acct_struct *acct) +static bool check_free_space(struct bsd_acct_struct *acct) { struct kstatfs sbuf; - if (time_is_after_jiffies(acct->needcheck)) - goto out; + if (!acct->check_space) + return acct->active; /* May block */ if (vfs_statfs(&acct->file->f_path, &sbuf)) - goto out; + return acct->active; if (acct->active) { u64 suspend = sbuf.f_blocks * SUSPEND; do_div(suspend, 100); if (sbuf.f_bavail <= suspend) { - acct->active = 0; + acct->active = false; pr_info("Process accounting paused\n"); } } else { u64 resume = sbuf.f_blocks * RESUME; do_div(resume, 100); if (sbuf.f_bavail >= resume) { - acct->active = 1; + acct->active = true; pr_info("Process accounting resumed\n"); } } acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; -out: return acct->active; } @@ -189,7 +191,11 @@ static void acct_pin_kill(struct fs_pin *pin) { struct bsd_acct_struct *acct = to_acct(pin); mutex_lock(&acct->lock); - do_acct_process(acct); + /* + * Fill the accounting struct with the exiting task's info + * before punting to the workqueue. + */ + fill_ac(acct); schedule_work(&acct->work); wait_for_completion(&acct->done); cmpxchg(&acct->ns->bacct, pin, NULL); @@ -202,6 +208,9 @@ static void close_work(struct work_struct *work) { struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); struct file *file = acct->file; + + /* We were fired by acct_pin_kill() which holds acct->lock. */ + acct_write_process(acct); if (file->f_op->flush) file->f_op->flush(file, NULL); __fput_sync(file); @@ -234,6 +243,20 @@ static int acct_on(struct filename *pathname) return -EACCES; } + /* Exclude kernel kernel internal filesystems. */ + if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) { + kfree(acct); + filp_close(file, NULL); + return -EINVAL; + } + + /* Exclude procfs and sysfs. */ + if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) { + kfree(acct); + filp_close(file, NULL); + return -EINVAL; + } + if (!(file->f_mode & FMODE_CAN_WRITE)) { kfree(acct); filp_close(file, NULL); @@ -430,13 +453,27 @@ static u32 encode_float(u64 value) * do_exit() or when switching to a different output file. */ -static void fill_ac(acct_t *ac) +static void fill_ac(struct bsd_acct_struct *acct) { struct pacct_struct *pacct = ¤t->signal->pacct; + struct file *file = acct->file; + acct_t *ac = &acct->ac; u64 elapsed, run_time; time64_t btime; struct tty_struct *tty; + lockdep_assert_held(&acct->lock); + + if (time_is_after_jiffies(acct->needcheck)) { + acct->check_space = false; + + /* Don't fill in @ac if nothing will be written. */ + if (!acct->active) + return; + } else { + acct->check_space = true; + } + /* * Fill the accounting struct with the needed info as recorded * by the different kernel functions. @@ -484,64 +521,61 @@ static void fill_ac(acct_t *ac) ac->ac_majflt = encode_comp_t(pacct->ac_majflt); ac->ac_exitcode = pacct->ac_exitcode; spin_unlock_irq(¤t->sighand->siglock); -} -/* - * do_acct_process does all actual work. Caller holds the reference to file. - */ -static void do_acct_process(struct bsd_acct_struct *acct) -{ - acct_t ac; - unsigned long flim; - const struct cred *orig_cred; - struct file *file = acct->file; - /* - * Accounting records are not subject to resource limits. - */ - flim = rlimit(RLIMIT_FSIZE); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - /* Perform file operations on behalf of whoever enabled accounting */ - orig_cred = override_creds(file->f_cred); - - /* - * First check to see if there is enough free_space to continue - * the process accounting system. - */ - if (!check_free_space(acct)) - goto out; - - fill_ac(&ac); /* we really need to bite the bullet and change layout */ - ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); - ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); + ac->ac_uid = from_kuid_munged(file->f_cred->user_ns, current_uid()); + ac->ac_gid = from_kgid_munged(file->f_cred->user_ns, current_gid()); #if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* backward-compatible 16 bit fields */ - ac.ac_uid16 = ac.ac_uid; - ac.ac_gid16 = ac.ac_gid; + ac->ac_uid16 = ac->ac_uid; + ac->ac_gid16 = ac->ac_gid; #elif ACCT_VERSION == 3 { struct pid_namespace *ns = acct->ns; - ac.ac_pid = task_tgid_nr_ns(current, ns); + ac->ac_pid = task_tgid_nr_ns(current, ns); rcu_read_lock(); - ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), - ns); + ac->ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); rcu_read_unlock(); } #endif +} + +static void acct_write_process(struct bsd_acct_struct *acct) +{ + struct file *file = acct->file; + const struct cred *cred; + acct_t *ac = &acct->ac; + + /* Perform file operations on behalf of whoever enabled accounting */ + cred = override_creds(file->f_cred); + /* - * Get freeze protection. If the fs is frozen, just skip the write - * as we could deadlock the system otherwise. + * First check to see if there is enough free_space to continue + * the process accounting system. Then get freeze protection. If + * the fs is frozen, just skip the write as we could deadlock + * the system otherwise. */ - if (file_start_write_trylock(file)) { + if (check_free_space(acct) && file_start_write_trylock(file)) { /* it's been opened O_APPEND, so position is irrelevant */ loff_t pos = 0; - __kernel_write(file, &ac, sizeof(acct_t), &pos); + __kernel_write(file, ac, sizeof(acct_t), &pos); file_end_write(file); } -out: + + revert_creds(cred); +} + +static void do_acct_process(struct bsd_acct_struct *acct) +{ + unsigned long flim; + + /* Accounting records are not subject to resource limits. */ + flim = rlimit(RLIMIT_FSIZE); + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + fill_ac(acct); + acct_write_process(acct); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; - revert_creds(orig_cred); } /** diff --git a/kernel/audit.c b/kernel/audit.c index 5f5bf85bcc90..61b5744d0bb6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1956,8 +1956,8 @@ static inline int audit_expand(struct audit_buffer *ab, int extra) * will be called a second time. Currently, we assume that a printk * can't format message larger than 1024 bytes, so we don't either. */ -static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, - va_list args) +static __printf(2, 0) +void audit_log_vformat(struct audit_buffer *ab, const char *fmt, va_list args) { int len, avail; struct sk_buff *skb; @@ -2285,7 +2285,7 @@ void audit_log_path_denied(int type, const char *operation) { struct audit_buffer *ab; - if (!audit_enabled || audit_dummy_context()) + if (!audit_enabled) return; /* Generate log with subject, operation, outcome. */ diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index f2f38903b2fe..b0eae2a3c895 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -668,12 +668,6 @@ int audit_remove_tree_rule(struct audit_krule *rule) return 0; } -static int compare_root(struct vfsmount *mnt, void *arg) -{ - return inode_to_key(d_backing_inode(mnt->mnt_root)) == - (unsigned long)arg; -} - void audit_trim_trees(void) { struct list_head cursor; @@ -683,8 +677,9 @@ void audit_trim_trees(void) while (cursor.next != &tree_list) { struct audit_tree *tree; struct path path; - struct vfsmount *root_mnt; struct audit_node *node; + struct path *paths; + struct path array[16]; int err; tree = container_of(cursor.next, struct audit_tree, list); @@ -696,9 +691,9 @@ void audit_trim_trees(void) if (err) goto skip_it; - root_mnt = collect_mounts(&path); + paths = collect_paths(&path, array, 16); path_put(&path); - if (IS_ERR(root_mnt)) + if (IS_ERR(paths)) goto skip_it; spin_lock(&hash_lock); @@ -706,14 +701,17 @@ void audit_trim_trees(void) struct audit_chunk *chunk = find_chunk(node); /* this could be NULL if the watch is dying else where... */ node->index |= 1U<<31; - if (iterate_mounts(compare_root, - (void *)(chunk->key), - root_mnt)) - node->index &= ~(1U<<31); + for (struct path *p = paths; p->dentry; p++) { + struct inode *inode = p->dentry->d_inode; + if (inode_to_key(inode) == chunk->key) { + node->index &= ~(1U<<31); + break; + } + } } spin_unlock(&hash_lock); trim_marked(tree); - drop_collected_mounts(root_mnt); + drop_collected_paths(paths, array); skip_it: put_tree(tree); mutex_lock(&audit_filter_mutex); @@ -742,9 +740,14 @@ void audit_put_tree(struct audit_tree *tree) put_tree(tree); } -static int tag_mount(struct vfsmount *mnt, void *arg) +static int tag_mounts(struct path *paths, struct audit_tree *tree) { - return tag_chunk(d_backing_inode(mnt->mnt_root), arg); + for (struct path *p = paths; p->dentry; p++) { + int err = tag_chunk(p->dentry->d_inode, tree); + if (err) + return err; + } + return 0; } /* @@ -801,7 +804,8 @@ int audit_add_tree_rule(struct audit_krule *rule) { struct audit_tree *seed = rule->tree, *tree; struct path path; - struct vfsmount *mnt; + struct path array[16]; + struct path *paths; int err; rule->tree = NULL; @@ -828,16 +832,16 @@ int audit_add_tree_rule(struct audit_krule *rule) err = kern_path(tree->pathname, 0, &path); if (err) goto Err; - mnt = collect_mounts(&path); + paths = collect_paths(&path, array, 16); path_put(&path); - if (IS_ERR(mnt)) { - err = PTR_ERR(mnt); + if (IS_ERR(paths)) { + err = PTR_ERR(paths); goto Err; } get_tree(tree); - err = iterate_mounts(tag_mount, tree, mnt); - drop_collected_mounts(mnt); + err = tag_mounts(paths, tree); + drop_collected_paths(paths, array); if (!err) { struct audit_node *node; @@ -872,20 +876,21 @@ int audit_tag_tree(char *old, char *new) struct list_head cursor, barrier; int failed = 0; struct path path1, path2; - struct vfsmount *tagged; + struct path array[16]; + struct path *paths; int err; err = kern_path(new, 0, &path2); if (err) return err; - tagged = collect_mounts(&path2); + paths = collect_paths(&path2, array, 16); path_put(&path2); - if (IS_ERR(tagged)) - return PTR_ERR(tagged); + if (IS_ERR(paths)) + return PTR_ERR(paths); err = kern_path(old, 0, &path1); if (err) { - drop_collected_mounts(tagged); + drop_collected_paths(paths, array); return err; } @@ -914,7 +919,7 @@ int audit_tag_tree(char *old, char *new) continue; } - failed = iterate_mounts(tag_mount, tree, tagged); + failed = tag_mounts(paths, tree); if (failed) { put_tree(tree); mutex_lock(&audit_filter_mutex); @@ -955,7 +960,7 @@ int audit_tag_tree(char *old, char *new) list_del(&cursor); mutex_unlock(&audit_filter_mutex); path_put(&path1); - drop_collected_mounts(tagged); + drop_collected_paths(paths, array); return failed; } diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 7f358740e958..0ebbbe37a60f 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -347,14 +347,18 @@ static void audit_remove_parent_watches(struct audit_parent *parent) /* Get path information necessary for adding watches. */ static int audit_get_nd(struct audit_watch *watch, struct path *parent) { - struct dentry *d = kern_path_locked(watch->path, parent); + struct dentry *d; + + d = kern_path_locked_negative(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); + if (d_is_positive(d)) { /* update watch filter fields */ watch->dev = d->d_sb->s_dev; watch->ino = d_backing_inode(d)->i_ino; } + inode_unlock(d_backing_inode(parent->dentry)); dput(d); return 0; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9c853cde9abe..78fd876a5473 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2207,10 +2207,8 @@ __audit_reusename(const __user char *uptr) list_for_each_entry(n, &context->names_list, list) { if (!n->name) continue; - if (n->name->uptr == uptr) { - atomic_inc(&n->name->refcnt); - return n->name; - } + if (n->name->uptr == uptr) + return refname(n->name); } return NULL; } @@ -2237,7 +2235,7 @@ void __audit_getname(struct filename *name) n->name = name; n->name_len = AUDIT_NAME_FULL; name->aname = n; - atomic_inc(&name->refcnt); + refname(name); } static inline int audit_copy_fcaps(struct audit_names *name, @@ -2369,7 +2367,7 @@ out_alloc: return; if (name) { n->name = name; - atomic_inc(&name->refcnt); + refname(name); } out: @@ -2496,7 +2494,7 @@ void __audit_inode_child(struct inode *parent, if (found_parent) { found_child->name = found_parent->name; found_child->name_len = AUDIT_NAME_FULL; - atomic_inc(&found_child->name->refcnt); + refname(found_child->name); } } diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 410028633621..3a335c50e6e3 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o obj-$(CONFIG_BPF_JIT) += trampoline.o -obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o +obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy) obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o endif @@ -53,6 +53,9 @@ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o +ifeq ($(CONFIG_DMA_SHARED_BUFFER),y) +obj-$(CONFIG_BPF_SYSCALL) += dmabuf_iter.o +endif CFLAGS_REMOVE_percpu_freelist.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 870aeb51d70a..0d56cea71602 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -39,7 +39,7 @@ */ /* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */ -#define GUARD_SZ (1ull << sizeof_field(struct bpf_insn, off) * 8) +#define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1) #define KERN_VM_SZ (SZ_4G + GUARD_SZ) struct bpf_arena { @@ -287,7 +287,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) return VM_FAULT_SIGSEGV; /* Account into memcg of the process that created bpf_arena */ - ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page); + ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); return VM_FAULT_SIGSEGV; @@ -465,8 +465,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt if (ret) goto out_free_pages; - ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO, - node_id, page_cnt, pages); + ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages); if (ret) goto out; @@ -577,8 +576,8 @@ __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt __bpf_kfunc_end_defs(); BTF_KFUNCS_START(arena_kfuncs) -BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_RET | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2) BTF_KFUNCS_END(arena_kfuncs) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index d5dc65bb1755..148da8f7ff36 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -153,7 +153,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) static void cgroup_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &cgroup_cache, NULL); + bpf_local_storage_map_free(map, &cgroup_cache, &bpf_cgrp_storage_busy); } /* *gfp_flags* is a hidden argument provided by the verifier */ @@ -161,6 +161,7 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; + bool nobusy; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) @@ -169,21 +170,21 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, if (!cgroup) return (unsigned long)NULL; - if (!bpf_cgrp_storage_trylock()) - return (unsigned long)NULL; + nobusy = bpf_cgrp_storage_trylock(); - sdata = cgroup_storage_lookup(cgroup, map, true); + sdata = cgroup_storage_lookup(cgroup, map, nobusy); if (sdata) goto unlock; /* only allocate new storage, when the cgroup is refcounted */ if (!percpu_ref_is_dying(&cgroup->self.refcnt) && - (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST, false, gfp_flags); unlock: - bpf_cgrp_storage_unlock(); + if (nobusy) + bpf_cgrp_storage_unlock(); return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; } diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 106735145948..380e9a7cac75 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -335,7 +335,7 @@ static void cache_btf_id(struct bpf_iter_target_info *tinfo, tinfo->btf_id = prog->aux->attach_btf_id; } -bool bpf_iter_prog_supported(struct bpf_prog *prog) +int bpf_iter_prog_supported(struct bpf_prog *prog) { const char *attach_fname = prog->aux->attach_func_name; struct bpf_iter_target_info *tinfo = NULL, *iter; @@ -344,7 +344,7 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) int prefix_len = strlen(prefix); if (strncmp(attach_fname, prefix, prefix_len)) - return false; + return -EINVAL; mutex_lock(&targets_mutex); list_for_each_entry(iter, &targets, list) { @@ -360,12 +360,11 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) } mutex_unlock(&targets_mutex); - if (tinfo) { - prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size; - prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info; - } + if (!tinfo) + return -EINVAL; - return tinfo != NULL; + return bpf_prog_ctx_arg_info_init(prog, tinfo->reg_info->ctx_arg_info, + tinfo->reg_info->ctx_arg_info_size); } const struct bpf_func_proto * diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 3dabdd137d10..2d6e1c98d8ad 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -337,12 +337,12 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, list) { __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); - if (++nfree == LOCAL_FREE_TARGET) + if (++nfree == lru->target_free) break; } - if (nfree < LOCAL_FREE_TARGET) - __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree, + if (nfree < lru->target_free) + __bpf_lru_list_shrink(lru, l, lru->target_free - nfree, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); @@ -577,6 +577,9 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); buf += elem_size; } + + lru->target_free = clamp((nr_elems / num_possible_cpus()) / 2, + 1, LOCAL_FREE_TARGET); } static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index cbd8d3720c2b..fe2661a58ea9 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -58,6 +58,7 @@ struct bpf_lru { del_from_htab_func del_from_htab; void *del_arg; unsigned int hash_offset; + unsigned int target_free; unsigned int nr_scans; bool percpu; }; diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 967492b65185..0a59df1c550a 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -316,7 +316,9 @@ BTF_ID(func, bpf_lsm_inode_getxattr) BTF_ID(func, bpf_lsm_inode_mknod) BTF_ID(func, bpf_lsm_inode_need_killpriv) BTF_ID(func, bpf_lsm_inode_post_setxattr) +BTF_ID(func, bpf_lsm_inode_post_removexattr) BTF_ID(func, bpf_lsm_inode_readlink) +BTF_ID(func, bpf_lsm_inode_removexattr) BTF_ID(func, bpf_lsm_inode_rename) BTF_ID(func, bpf_lsm_inode_rmdir) BTF_ID(func, bpf_lsm_inode_setattr) diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 040fb1cd840b..96113633e391 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -146,39 +146,7 @@ void bpf_struct_ops_image_free(void *image) } #define MAYBE_NULL_SUFFIX "__nullable" -#define MAX_STUB_NAME 128 - -/* Return the type info of a stub function, if it exists. - * - * The name of a stub function is made up of the name of the struct_ops and - * the name of the function pointer member, separated by "__". For example, - * if the struct_ops type is named "foo_ops" and the function pointer - * member is named "bar", the stub function name would be "foo_ops__bar". - */ -static const struct btf_type * -find_stub_func_proto(const struct btf *btf, const char *st_op_name, - const char *member_name) -{ - char stub_func_name[MAX_STUB_NAME]; - const struct btf_type *func_type; - s32 btf_id; - int cp; - - cp = snprintf(stub_func_name, MAX_STUB_NAME, "%s__%s", - st_op_name, member_name); - if (cp >= MAX_STUB_NAME) { - pr_warn("Stub function name too long\n"); - return NULL; - } - btf_id = btf_find_by_name_kind(btf, stub_func_name, BTF_KIND_FUNC); - if (btf_id < 0) - return NULL; - func_type = btf_type_by_id(btf, btf_id); - if (!func_type) - return NULL; - - return btf_type_by_id(btf, func_type->type); /* FUNC_PROTO */ -} +#define REFCOUNTED_SUFFIX "__ref" /* Prepare argument info for every nullable argument of a member of a * struct_ops type. @@ -203,27 +171,44 @@ find_stub_func_proto(const struct btf *btf, const char *st_op_name, static int prepare_arg_info(struct btf *btf, const char *st_ops_name, const char *member_name, - const struct btf_type *func_proto, + const struct btf_type *func_proto, void *stub_func_addr, struct bpf_struct_ops_arg_info *arg_info) { const struct btf_type *stub_func_proto, *pointed_type; + bool is_nullable = false, is_refcounted = false; const struct btf_param *stub_args, *args; struct bpf_ctx_arg_aux *info, *info_buf; u32 nargs, arg_no, info_cnt = 0; + char ksym[KSYM_SYMBOL_LEN]; + const char *stub_fname; + const char *suffix; + s32 stub_func_id; u32 arg_btf_id; int offset; - stub_func_proto = find_stub_func_proto(btf, st_ops_name, member_name); - if (!stub_func_proto) - return 0; + stub_fname = kallsyms_lookup((unsigned long)stub_func_addr, NULL, NULL, NULL, ksym); + if (!stub_fname) { + pr_warn("Cannot find the stub function name for the %s in struct %s\n", + member_name, st_ops_name); + return -ENOENT; + } + + stub_func_id = btf_find_by_name_kind(btf, stub_fname, BTF_KIND_FUNC); + if (stub_func_id < 0) { + pr_warn("Cannot find the stub function %s in btf\n", stub_fname); + return -ENOENT; + } + + stub_func_proto = btf_type_by_id(btf, stub_func_id); + stub_func_proto = btf_type_by_id(btf, stub_func_proto->type); /* Check if the number of arguments of the stub function is the same * as the number of arguments of the function pointer. */ nargs = btf_type_vlen(func_proto); if (nargs != btf_type_vlen(stub_func_proto)) { - pr_warn("the number of arguments of the stub function %s__%s does not match the number of arguments of the member %s of struct %s\n", - st_ops_name, member_name, member_name, st_ops_name); + pr_warn("the number of arguments of the stub function %s does not match the number of arguments of the member %s of struct %s\n", + stub_fname, member_name, st_ops_name); return -EINVAL; } @@ -241,10 +226,18 @@ static int prepare_arg_info(struct btf *btf, info = info_buf; for (arg_no = 0; arg_no < nargs; arg_no++) { /* Skip arguments that is not suffixed with - * "__nullable". + * "__nullable or __ref". */ - if (!btf_param_match_suffix(btf, &stub_args[arg_no], - MAYBE_NULL_SUFFIX)) + is_nullable = btf_param_match_suffix(btf, &stub_args[arg_no], + MAYBE_NULL_SUFFIX); + is_refcounted = btf_param_match_suffix(btf, &stub_args[arg_no], + REFCOUNTED_SUFFIX); + + if (is_nullable) + suffix = MAYBE_NULL_SUFFIX; + else if (is_refcounted) + suffix = REFCOUNTED_SUFFIX; + else continue; /* Should be a pointer to struct */ @@ -253,30 +246,34 @@ static int prepare_arg_info(struct btf *btf, &arg_btf_id); if (!pointed_type || !btf_type_is_struct(pointed_type)) { - pr_warn("stub function %s__%s has %s tagging to an unsupported type\n", - st_ops_name, member_name, MAYBE_NULL_SUFFIX); + pr_warn("stub function %s has %s tagging to an unsupported type\n", + stub_fname, suffix); goto err_out; } offset = btf_ctx_arg_offset(btf, func_proto, arg_no); if (offset < 0) { - pr_warn("stub function %s__%s has an invalid trampoline ctx offset for arg#%u\n", - st_ops_name, member_name, arg_no); + pr_warn("stub function %s has an invalid trampoline ctx offset for arg#%u\n", + stub_fname, arg_no); goto err_out; } if (args[arg_no].type != stub_args[arg_no].type) { - pr_warn("arg#%u type in stub function %s__%s does not match with its original func_proto\n", - arg_no, st_ops_name, member_name); + pr_warn("arg#%u type in stub function %s does not match with its original func_proto\n", + arg_no, stub_fname); goto err_out; } /* Fill the information of the new argument */ - info->reg_type = - PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL; info->btf_id = arg_btf_id; info->btf = btf; info->offset = offset; + if (is_nullable) { + info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL; + } else if (is_refcounted) { + info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID; + info->refcounted = true; + } info++; info_cnt++; @@ -324,6 +321,13 @@ static bool is_module_member(const struct btf *btf, u32 id) return !strcmp(btf_name_by_offset(btf, t->name_off), "module"); } +int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff) +{ + void *func_ptr = *(void **)(st_ops->cfi_stubs + moff); + + return func_ptr ? 0 : -ENOTSUPP; +} + int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, struct btf *btf, struct bpf_verifier_log *log) @@ -386,8 +390,11 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, st_ops_desc->value_type = btf_type_by_id(btf, value_id); for_each_member(i, t, member) { - const struct btf_type *func_proto; + const struct btf_type *func_proto, *ret_type; + void **stub_func_addr; + u32 moff; + moff = __btf_member_bit_offset(t, member) / 8; mname = btf_name_by_offset(btf, member->name_off); if (!*mname) { pr_warn("anon member in struct %s is not supported\n", @@ -413,9 +420,23 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, func_proto = btf_type_resolve_func_ptr(btf, member->type, NULL); - if (!func_proto) + + /* The member is not a function pointer or + * the function pointer is not supported. + */ + if (!func_proto || bpf_struct_ops_supported(st_ops, moff)) continue; + if (func_proto->type) { + ret_type = btf_type_resolve_ptr(btf, func_proto->type, NULL); + if (ret_type && !__btf_type_is_struct(ret_type)) { + pr_warn("func ptr %s in struct %s returns non-struct pointer, which is not supported\n", + mname, st_ops->name); + err = -EOPNOTSUPP; + goto errout; + } + } + if (btf_distill_func_proto(log, btf, func_proto, mname, &st_ops->func_models[i])) { @@ -425,8 +446,9 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, goto errout; } + stub_func_addr = *(void **)(st_ops->cfi_stubs + moff); err = prepare_arg_info(btf, st_ops->name, mname, - func_proto, + func_proto, stub_func_addr, arg_info + i); if (err) goto errout; @@ -579,7 +601,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, if (model->ret_size > 0) flags |= BPF_TRAMP_F_RET_FENTRY_RET; - size = arch_bpf_trampoline_size(model, flags, tlinks, NULL); + size = arch_bpf_trampoline_size(model, flags, tlinks, stub_func); if (size <= 0) return size ? : -EFAULT; @@ -1152,13 +1174,6 @@ void bpf_struct_ops_put(const void *kdata) bpf_map_put(&st_map->map); } -int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff) -{ - void *func_ptr = *(void **)(st_ops->cfi_stubs + moff); - - return func_ptr ? 0 : -ENOTSUPP; -} - static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9de6acddd479..1d2cf898e21e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -26,6 +26,7 @@ #include <linux/bsearch.h> #include <linux/kobject.h> #include <linux/sysfs.h> +#include <linux/overflow.h> #include <net/netfilter/nf_bpf_link.h> @@ -606,6 +607,7 @@ s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p) spin_unlock_bh(&btf_idr_lock); return ret; } +EXPORT_SYMBOL_GPL(bpf_find_btf_id); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, u32 id, u32 *res_id) @@ -2575,7 +2577,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (btf_type_kflag(t)) { + if (btf_type_kflag(t) && !btf_type_is_type_tag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } @@ -3332,6 +3334,8 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, u32 off, int sz, struct btf_field_info *info, u32 field_mask) { enum btf_field_type type; + const char *tag_value; + bool is_type_tag; u32 res_id; /* Permit modifiers on the pointer itself */ @@ -3341,19 +3345,20 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, if (!btf_type_is_ptr(t)) return BTF_FIELD_IGNORE; t = btf_type_by_id(btf, t->type); - - if (!btf_type_is_type_tag(t)) + is_type_tag = btf_type_is_type_tag(t) && !btf_type_kflag(t); + if (!is_type_tag) return BTF_FIELD_IGNORE; /* Reject extra tags */ if (btf_type_is_type_tag(btf_type_by_id(btf, t->type))) return -EINVAL; - if (!strcmp("kptr_untrusted", __btf_name_by_offset(btf, t->name_off))) + tag_value = __btf_name_by_offset(btf, t->name_off); + if (!strcmp("kptr_untrusted", tag_value)) type = BPF_KPTR_UNREF; - else if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off))) + else if (!strcmp("kptr", tag_value)) type = BPF_KPTR_REF; - else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off))) + else if (!strcmp("percpu_kptr", tag_value)) type = BPF_KPTR_PERCPU; - else if (!strcmp("uptr", __btf_name_by_offset(btf, t->name_off))) + else if (!strcmp("uptr", tag_value)) type = BPF_UPTR; else return -EINVAL; @@ -3477,6 +3482,15 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_ goto end; } } + if (field_mask & BPF_RES_SPIN_LOCK) { + if (!strcmp(name, "bpf_res_spin_lock")) { + if (*seen_mask & BPF_RES_SPIN_LOCK) + return -E2BIG; + *seen_mask |= BPF_RES_SPIN_LOCK; + type = BPF_RES_SPIN_LOCK; + goto end; + } + } if (field_mask & BPF_TIMER) { if (!strcmp(name, "bpf_timer")) { if (*seen_mask & BPF_TIMER) @@ -3655,6 +3669,7 @@ static int btf_find_field_one(const struct btf *btf, switch (field_type) { case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: case BPF_TIMER: case BPF_WORKQUEUE: case BPF_LIST_NODE: @@ -3943,11 +3958,12 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* This needs to be kzalloc to zero out padding and unused fields, see * comment in btf_record_equal. */ - rec = kzalloc(offsetof(struct btf_record, fields[cnt]), GFP_KERNEL | __GFP_NOWARN); + rec = kzalloc(struct_size(rec, fields, cnt), GFP_KERNEL | __GFP_NOWARN); if (!rec) return ERR_PTR(-ENOMEM); rec->spin_lock_off = -EINVAL; + rec->res_spin_lock_off = -EINVAL; rec->timer_off = -EINVAL; rec->wq_off = -EINVAL; rec->refcount_off = -EINVAL; @@ -3975,6 +3991,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* Cache offset for faster lookup at runtime */ rec->spin_lock_off = rec->fields[i].offset; break; + case BPF_RES_SPIN_LOCK: + WARN_ON_ONCE(rec->spin_lock_off >= 0); + /* Cache offset for faster lookup at runtime */ + rec->res_spin_lock_off = rec->fields[i].offset; + break; case BPF_TIMER: WARN_ON_ONCE(rec->timer_off >= 0); /* Cache offset for faster lookup at runtime */ @@ -4018,9 +4039,15 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type rec->cnt++; } + if (rec->spin_lock_off >= 0 && rec->res_spin_lock_off >= 0) { + ret = -EINVAL; + goto end; + } + /* bpf_{list_head, rb_node} require bpf_spin_lock */ if ((btf_record_has_field(rec, BPF_LIST_HEAD) || - btf_record_has_field(rec, BPF_RB_ROOT)) && rec->spin_lock_off < 0) { + btf_record_has_field(rec, BPF_RB_ROOT)) && + (rec->spin_lock_off < 0 && rec->res_spin_lock_off < 0)) { ret = -EINVAL; goto end; } @@ -4944,11 +4971,6 @@ static s32 btf_decl_tag_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (btf_type_kflag(t)) { - btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); - return -EINVAL; - } - component_idx = btf_type_decl_tag(t)->component_idx; if (component_idx < -1) { btf_verifier_log_type(env, t, "Invalid component_idx"); @@ -5562,7 +5584,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) if (id < 0) continue; - new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]), + new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1), GFP_KERNEL | __GFP_NOWARN); if (!new_aof) { ret = -ENOMEM; @@ -5589,7 +5611,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) if (ret != BTF_FIELD_FOUND) continue; - new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]), + new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1), GFP_KERNEL | __GFP_NOWARN); if (!new_aof) { ret = -ENOMEM; @@ -5626,7 +5648,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) continue; parse: tab_cnt = tab ? tab->cnt : 0; - new_tab = krealloc(tab, offsetof(struct btf_struct_metas, types[tab_cnt + 1]), + new_tab = krealloc(tab, struct_size(new_tab, types, tab_cnt + 1), GFP_KERNEL | __GFP_NOWARN); if (!new_tab) { ret = -ENOMEM; @@ -5638,7 +5660,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) type = &tab->types[tab->cnt]; type->btf_id = i; - record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | + record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT | BPF_KPTR, t->size); /* The record cannot be unset, treat it as an error if so */ @@ -6362,16 +6384,15 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) return prog->aux->attach_btf; } -static bool is_int_ptr(struct btf *btf, const struct btf_type *t) +static bool is_void_or_int_ptr(struct btf *btf, const struct btf_type *t) { /* skip modifiers */ t = btf_type_skip_modifiers(btf, t->type, NULL); - - return btf_type_is_int(t); + return btf_type_is_void(t) || btf_type_is_int(t); } -static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, - int off) +u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, + int off) { const struct btf_param *args; const struct btf_type *t; @@ -6507,6 +6528,10 @@ static const struct bpf_raw_tp_null_args raw_tp_null_args[] = { /* rxrpc */ { "rxrpc_recvdata", 0x1 }, { "rxrpc_resend", 0x10 }, + { "rxrpc_tq", 0x10 }, + { "rxrpc_client", 0x1 }, + /* skb */ + {"kfree_skb", 0x1000}, /* sunrpc */ { "xs_stream_read_data", 0x1 }, /* ... from xprt_cong_event event class */ @@ -6516,6 +6541,7 @@ static const struct bpf_raw_tp_null_args raw_tp_null_args[] = { { "xprt_put_cong", 0x10 }, /* tcp */ { "tcp_send_reset", 0x11 }, + { "tcp_sendmsg_locked", 0x100 }, /* tegra_apb_dma */ { "tegra_dma_tx_status", 0x100 }, /* timer_migration */ @@ -6527,6 +6553,103 @@ static const struct bpf_raw_tp_null_args raw_tp_null_args[] = { { "mr_integ_alloc", 0x2000 }, /* bpf_testmod */ { "bpf_testmod_test_read", 0x0 }, + /* amdgpu */ + { "amdgpu_vm_bo_map", 0x1 }, + { "amdgpu_vm_bo_unmap", 0x1 }, + /* netfs */ + { "netfs_folioq", 0x1 }, + /* xfs from xfs_defer_pending_class */ + { "xfs_defer_create_intent", 0x1 }, + { "xfs_defer_cancel_list", 0x1 }, + { "xfs_defer_pending_finish", 0x1 }, + { "xfs_defer_pending_abort", 0x1 }, + { "xfs_defer_relog_intent", 0x1 }, + { "xfs_defer_isolate_paused", 0x1 }, + { "xfs_defer_item_pause", 0x1 }, + { "xfs_defer_item_unpause", 0x1 }, + /* xfs from xfs_defer_pending_item_class */ + { "xfs_defer_add_item", 0x1 }, + { "xfs_defer_cancel_item", 0x1 }, + { "xfs_defer_finish_item", 0x1 }, + /* xfs from xfs_icwalk_class */ + { "xfs_ioc_free_eofblocks", 0x10 }, + { "xfs_blockgc_free_space", 0x10 }, + /* xfs from xfs_btree_cur_class */ + { "xfs_btree_updkeys", 0x100 }, + { "xfs_btree_overlapped_query_range", 0x100 }, + /* xfs from xfs_imap_class*/ + { "xfs_map_blocks_found", 0x10000 }, + { "xfs_map_blocks_alloc", 0x10000 }, + { "xfs_iomap_alloc", 0x1000 }, + { "xfs_iomap_found", 0x1000 }, + /* xfs from xfs_fs_class */ + { "xfs_inodegc_flush", 0x1 }, + { "xfs_inodegc_push", 0x1 }, + { "xfs_inodegc_start", 0x1 }, + { "xfs_inodegc_stop", 0x1 }, + { "xfs_inodegc_queue", 0x1 }, + { "xfs_inodegc_throttle", 0x1 }, + { "xfs_fs_sync_fs", 0x1 }, + { "xfs_blockgc_start", 0x1 }, + { "xfs_blockgc_stop", 0x1 }, + { "xfs_blockgc_worker", 0x1 }, + { "xfs_blockgc_flush_all", 0x1 }, + /* xfs_scrub */ + { "xchk_nlinks_live_update", 0x10 }, + /* xfs_scrub from xchk_metapath_class */ + { "xchk_metapath_lookup", 0x100 }, + /* nfsd */ + { "nfsd_dirent", 0x1 }, + { "nfsd_file_acquire", 0x1001 }, + { "nfsd_file_insert_err", 0x1 }, + { "nfsd_file_cons_err", 0x1 }, + /* nfs4 */ + { "nfs4_setup_sequence", 0x1 }, + { "pnfs_update_layout", 0x10000 }, + { "nfs4_inode_callback_event", 0x200 }, + { "nfs4_inode_stateid_callback_event", 0x200 }, + /* nfs from pnfs_layout_event */ + { "pnfs_mds_fallback_pg_init_read", 0x10000 }, + { "pnfs_mds_fallback_pg_init_write", 0x10000 }, + { "pnfs_mds_fallback_pg_get_mirror_count", 0x10000 }, + { "pnfs_mds_fallback_read_done", 0x10000 }, + { "pnfs_mds_fallback_write_done", 0x10000 }, + { "pnfs_mds_fallback_read_pagelist", 0x10000 }, + { "pnfs_mds_fallback_write_pagelist", 0x10000 }, + /* coda */ + { "coda_dec_pic_run", 0x10 }, + { "coda_dec_pic_done", 0x10 }, + /* cfg80211 */ + { "cfg80211_scan_done", 0x11 }, + { "rdev_set_coalesce", 0x10 }, + { "cfg80211_report_wowlan_wakeup", 0x100 }, + { "cfg80211_inform_bss_frame", 0x100 }, + { "cfg80211_michael_mic_failure", 0x10000 }, + /* cfg80211 from wiphy_work_event */ + { "wiphy_work_queue", 0x10 }, + { "wiphy_work_run", 0x10 }, + { "wiphy_work_cancel", 0x10 }, + { "wiphy_work_flush", 0x10 }, + /* hugetlbfs */ + { "hugetlbfs_alloc_inode", 0x10 }, + /* spufs */ + { "spufs_context", 0x10 }, + /* kvm_hv */ + { "kvm_page_fault_enter", 0x100 }, + /* dpu */ + { "dpu_crtc_setup_mixer", 0x100 }, + /* binder */ + { "binder_transaction", 0x100 }, + /* bcachefs */ + { "btree_path_free", 0x100 }, + /* hfi1_tx */ + { "hfi1_sdma_progress", 0x1000 }, + /* iptfs */ + { "iptfs_ingress_postq_event", 0x1000 }, + /* neigh */ + { "neigh_update", 0x10 }, + /* snd_firewire_lib */ + { "amdtp_packet", 0x100 }, }; bool btf_ctx_access(int off, int size, enum bpf_access_type type, @@ -6549,7 +6672,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, tname, off); return false; } - arg = get_ctx_arg_idx(btf, t, off); + arg = btf_ctx_arg_idx(btf, t, off); args = (const struct btf_param *)(t + 1); /* if (t == NULL) Fall back to default BPF prog with * MAX_BPF_FUNC_REG_ARGS u64 arguments. @@ -6654,14 +6777,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } } - if (t->type == 0) - /* This is a pointer to void. - * It is the same as scalar from the verifier safety pov. - * No further pointer walking is allowed. - */ - return true; - - if (is_int_ptr(btf, t)) + /* + * If it's a pointer to void, it's the same as scalar from the verifier + * safety POV. Either way, no futher pointer walking is allowed. + */ + if (is_void_or_int_ptr(btf, t)) return true; /* this is a pointer to another type */ @@ -6677,6 +6797,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->reg_type = ctx_arg_info->reg_type; info->btf = ctx_arg_info->btf ? : btf_vmlinux; info->btf_id = ctx_arg_info->btf_id; + info->ref_obj_id = ctx_arg_info->ref_obj_id; return true; } } @@ -6706,10 +6827,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* Is this a func with potential NULL args? */ if (strcmp(tname, raw_tp_null_args[i].func)) continue; - if (raw_tp_null_args[i].mask & (0x1 << (arg * 4))) + if (raw_tp_null_args[i].mask & (0x1ULL << (arg * 4))) info->reg_type |= PTR_MAYBE_NULL; /* Is the current arg IS_ERR? */ - if (raw_tp_null_args[i].mask & (0x2 << (arg * 4))) + if (raw_tp_null_args[i].mask & (0x2ULL << (arg * 4))) ptr_err_raw_tp = true; break; } @@ -6743,7 +6864,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->btf_id = t->type; t = btf_type_by_id(btf, t->type); - if (btf_type_is_type_tag(t)) { + if (btf_type_is_type_tag(t) && !btf_type_kflag(t)) { tag_value = __btf_name_by_offset(btf, t->name_off); if (strcmp(tag_value, "user") == 0) info->reg_type |= MEM_USER; @@ -7002,7 +7123,7 @@ error: /* check type tag */ t = btf_type_by_id(btf, mtype->type); - if (btf_type_is_type_tag(t)) { + if (btf_type_is_type_tag(t) && !btf_type_kflag(t)) { tag_value = __btf_name_by_offset(btf, t->name_off); /* check __user tag */ if (strcmp(tag_value, "user") == 0) @@ -7539,7 +7660,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) return 0; if (!prog->aux->func_info) { - bpf_log(log, "Verifier bug\n"); + verifier_bug(env, "func_info undefined"); return -EFAULT; } @@ -7563,7 +7684,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) tname = btf_name_by_offset(btf, fn_t->name_off); if (prog->aux->func_info_aux[subprog].unreliable) { - bpf_log(log, "Verifier bug in function %s()\n", tname); + verifier_bug(env, "unreliable BTF for function %s()", tname); return -EFAULT; } if (prog_type == BPF_PROG_TYPE_EXT) @@ -8440,7 +8561,7 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, /* Grow set */ set = krealloc(tab->sets[hook], - offsetof(struct btf_id_set8, pairs[set_cnt + add_set->cnt]), + struct_size(set, pairs, set_cnt + add_set->cnt), GFP_KERNEL | __GFP_NOWARN); if (!set) { ret = -ENOMEM; @@ -8524,6 +8645,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: return BTF_KFUNC_HOOK_CGROUP; case BPF_PROG_TYPE_SCHED_ACT: return BTF_KFUNC_HOOK_SCHED_ACT; @@ -8725,7 +8847,7 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c } tab = krealloc(btf->dtor_kfunc_tab, - offsetof(struct btf_id_dtor_kfunc_tab, dtors[tab_cnt + add_cnt]), + struct_size(tab, dtors, tab_cnt + add_cnt), GFP_KERNEL | __GFP_NOWARN); if (!tab) { ret = -ENOMEM; @@ -9283,8 +9405,7 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops, tab = btf->struct_ops_tab; if (!tab) { - tab = kzalloc(offsetof(struct btf_struct_ops_tab, ops[4]), - GFP_KERNEL); + tab = kzalloc(struct_size(tab, ops, 4), GFP_KERNEL); if (!tab) return -ENOMEM; tab->capacity = 4; @@ -9297,8 +9418,7 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops, if (tab->cnt == tab->capacity) { new_tab = krealloc(tab, - offsetof(struct btf_struct_ops_tab, - ops[tab->capacity * 2]), + struct_size(tab, ops, tab->capacity * 2), GFP_KERNEL); if (!new_tab) return -ENOMEM; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 46e5db65dbc8..f4885514f007 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -41,6 +41,19 @@ static int __init cgroup_bpf_wq_init(void) } core_initcall(cgroup_bpf_wq_init); +static int cgroup_bpf_lifetime_notify(struct notifier_block *nb, + unsigned long action, void *data); + +static struct notifier_block cgroup_bpf_lifetime_nb = { + .notifier_call = cgroup_bpf_lifetime_notify, +}; + +void __init cgroup_bpf_lifetime_notifier_init(void) +{ + BUG_ON(blocking_notifier_chain_register(&cgroup_lifetime_notifier, + &cgroup_bpf_lifetime_nb)); +} + /* __always_inline is necessary to prevent indirect call through run_prog * function pointer. */ @@ -206,7 +219,7 @@ bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) } #endif /* CONFIG_BPF_LSM */ -void cgroup_bpf_offline(struct cgroup *cgrp) +static void cgroup_bpf_offline(struct cgroup *cgrp) { cgroup_get(cgrp); percpu_ref_kill(&cgrp->bpf.refcnt); @@ -369,7 +382,7 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl) /* count number of elements in the list. * it's slow but the list cannot be long */ -static u32 prog_list_length(struct hlist_head *head) +static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt) { struct bpf_prog_list *pl; u32 cnt = 0; @@ -377,6 +390,8 @@ static u32 prog_list_length(struct hlist_head *head) hlist_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; + if (preorder_cnt && (pl->flags & BPF_F_PREORDER)) + (*preorder_cnt)++; cnt++; } return cnt; @@ -400,7 +415,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, if (flags & BPF_F_ALLOW_MULTI) return true; - cnt = prog_list_length(&p->bpf.progs[atype]); + cnt = prog_list_length(&p->bpf.progs[atype], NULL); WARN_ON_ONCE(cnt > 1); if (cnt == 1) return !!(flags & BPF_F_ALLOW_OVERRIDE); @@ -423,12 +438,12 @@ static int compute_effective_progs(struct cgroup *cgrp, struct bpf_prog_array *progs; struct bpf_prog_list *pl; struct cgroup *p = cgrp; - int cnt = 0; + int i, j, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart; /* count number of effective programs by walking parents */ do { if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) - cnt += prog_list_length(&p->bpf.progs[atype]); + cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt); p = cgroup_parent(p); } while (p); @@ -439,20 +454,34 @@ static int compute_effective_progs(struct cgroup *cgrp, /* populate the array with effective progs */ cnt = 0; p = cgrp; + fstart = preorder_cnt; + bstart = preorder_cnt - 1; do { if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; + init_bstart = bstart; hlist_for_each_entry(pl, &p->bpf.progs[atype], node) { if (!prog_list_prog(pl)) continue; - item = &progs->items[cnt]; + if (pl->flags & BPF_F_PREORDER) { + item = &progs->items[bstart]; + bstart--; + } else { + item = &progs->items[fstart]; + fstart++; + } item->prog = prog_list_prog(pl); bpf_cgroup_storages_assign(item->cgroup_storage, pl->storage); cnt++; } + + /* reverse pre-ordering progs at this cgroup level */ + for (i = bstart + 1, j = init_bstart; i < j; i++, j--) + swap(progs->items[i], progs->items[j]); + } while ((p = cgroup_parent(p))); *array = progs; @@ -475,7 +504,7 @@ static void activate_effective_progs(struct cgroup *cgrp, * cgroup_bpf_inherit() - inherit effective programs from parent * @cgrp: the cgroup to modify */ -int cgroup_bpf_inherit(struct cgroup *cgrp) +static int cgroup_bpf_inherit(struct cgroup *cgrp) { /* has to use marco instead of const int, since compiler thinks * that array below is variable length @@ -518,6 +547,27 @@ cleanup: return -ENOMEM; } +static int cgroup_bpf_lifetime_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct cgroup *cgrp = data; + int ret = 0; + + if (cgrp->root != &cgrp_dfl_root) + return NOTIFY_OK; + + switch (action) { + case CGROUP_LIFETIME_ONLINE: + ret = cgroup_bpf_inherit(cgrp); + break; + case CGROUP_LIFETIME_OFFLINE: + cgroup_bpf_offline(cgrp); + break; + } + + return notifier_from_errno(ret); +} + static int update_effective_progs(struct cgroup *cgrp, enum cgroup_bpf_attach_type atype) { @@ -663,7 +713,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, */ return -EPERM; - if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) + if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; pl = find_attach_entry(progs, prog, link, replace_prog, @@ -698,6 +748,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, pl->prog = prog; pl->link = link; + pl->flags = flags; bpf_cgroup_storages_assign(pl->storage, storage); cgrp->bpf.flags[atype] = saved_flags; @@ -1073,7 +1124,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, lockdep_is_held(&cgroup_mutex)); total_cnt += bpf_prog_array_length(effective); } else { - total_cnt += prog_list_length(&cgrp->bpf.progs[atype]); + total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL); } } @@ -1105,7 +1156,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, u32 id; progs = &cgrp->bpf.progs[atype]; - cnt = min_t(int, prog_list_length(progs), total_cnt); + cnt = min_t(int, prog_list_length(progs, NULL), total_cnt); i = 0; hlist_for_each_entry(pl, progs, node) { prog = prog_list_prog(pl); @@ -1636,10 +1687,6 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; @@ -2087,7 +2134,7 @@ static const struct bpf_func_proto bpf_sysctl_get_name_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -2187,10 +2234,6 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { case BPF_FUNC_sysctl_get_name: return &bpf_sysctl_get_name_proto; @@ -2334,10 +2377,6 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { #ifdef CONFIG_NET case BPF_FUNC_get_netns_cookie: @@ -2584,23 +2623,3 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return NULL; } } - -/* Common helpers for cgroup hooks with valid process context. */ -const struct bpf_func_proto * -cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - switch (func_id) { - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; -#ifdef CONFIG_CGROUP_NET_CLASSID - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_curr_proto; -#endif - case BPF_FUNC_current_task_under_cgroup: - return &bpf_current_task_under_cgroup_proto; - default: - return NULL; - } -} diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index da729cbbaeb9..c20babbf998f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1663,14 +1663,17 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(JMP, JSET, K), \ INSN_2(JMP, JA), \ INSN_2(JMP32, JA), \ + /* Atomic operations. */ \ + INSN_3(STX, ATOMIC, B), \ + INSN_3(STX, ATOMIC, H), \ + INSN_3(STX, ATOMIC, W), \ + INSN_3(STX, ATOMIC, DW), \ /* Store instructions. */ \ /* Register based. */ \ INSN_3(STX, MEM, B), \ INSN_3(STX, MEM, H), \ INSN_3(STX, MEM, W), \ INSN_3(STX, MEM, DW), \ - INSN_3(STX, ATOMIC, W), \ - INSN_3(STX, ATOMIC, DW), \ /* Immediate based. */ \ INSN_3(ST, MEM, B), \ INSN_3(ST, MEM, H), \ @@ -2152,24 +2155,33 @@ out: if (BPF_SIZE(insn->code) == BPF_W) \ atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \ (DST + insn->off)); \ - else \ + else if (BPF_SIZE(insn->code) == BPF_DW) \ atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \ (DST + insn->off)); \ + else \ + goto default_label; \ break; \ case BOP | BPF_FETCH: \ if (BPF_SIZE(insn->code) == BPF_W) \ SRC = (u32) atomic_fetch_##KOP( \ (u32) SRC, \ (atomic_t *)(unsigned long) (DST + insn->off)); \ - else \ + else if (BPF_SIZE(insn->code) == BPF_DW) \ SRC = (u64) atomic64_fetch_##KOP( \ (u64) SRC, \ (atomic64_t *)(unsigned long) (DST + insn->off)); \ + else \ + goto default_label; \ break; STX_ATOMIC_DW: STX_ATOMIC_W: + STX_ATOMIC_H: + STX_ATOMIC_B: switch (IMM) { + /* Atomic read-modify-write instructions support only W and DW + * size modifiers. + */ ATOMIC_ALU_OP(BPF_ADD, add) ATOMIC_ALU_OP(BPF_AND, and) ATOMIC_ALU_OP(BPF_OR, or) @@ -2181,20 +2193,63 @@ out: SRC = (u32) atomic_xchg( (atomic_t *)(unsigned long) (DST + insn->off), (u32) SRC); - else + else if (BPF_SIZE(insn->code) == BPF_DW) SRC = (u64) atomic64_xchg( (atomic64_t *)(unsigned long) (DST + insn->off), (u64) SRC); + else + goto default_label; break; case BPF_CMPXCHG: if (BPF_SIZE(insn->code) == BPF_W) BPF_R0 = (u32) atomic_cmpxchg( (atomic_t *)(unsigned long) (DST + insn->off), (u32) BPF_R0, (u32) SRC); - else + else if (BPF_SIZE(insn->code) == BPF_DW) BPF_R0 = (u64) atomic64_cmpxchg( (atomic64_t *)(unsigned long) (DST + insn->off), (u64) BPF_R0, (u64) SRC); + else + goto default_label; + break; + /* Atomic load and store instructions support all size + * modifiers. + */ + case BPF_LOAD_ACQ: + switch (BPF_SIZE(insn->code)) { +#define LOAD_ACQUIRE(SIZEOP, SIZE) \ + case BPF_##SIZEOP: \ + DST = (SIZE)smp_load_acquire( \ + (SIZE *)(unsigned long)(SRC + insn->off)); \ + break; + LOAD_ACQUIRE(B, u8) + LOAD_ACQUIRE(H, u16) + LOAD_ACQUIRE(W, u32) +#ifdef CONFIG_64BIT + LOAD_ACQUIRE(DW, u64) +#endif +#undef LOAD_ACQUIRE + default: + goto default_label; + } + break; + case BPF_STORE_REL: + switch (BPF_SIZE(insn->code)) { +#define STORE_RELEASE(SIZEOP, SIZE) \ + case BPF_##SIZEOP: \ + smp_store_release( \ + (SIZE *)(unsigned long)(DST + insn->off), (SIZE)SRC); \ + break; + STORE_RELEASE(B, u8) + STORE_RELEASE(H, u16) + STORE_RELEASE(W, u32) +#ifdef CONFIG_64BIT + STORE_RELEASE(DW, u64) +#endif +#undef STORE_RELEASE + default: + goto default_label; + } break; default: @@ -2290,20 +2345,21 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) insn->code = BPF_JMP | BPF_CALL_ARGS; } #endif -#else +#endif + static unsigned int __bpf_prog_ret0_warn(const void *ctx, const struct bpf_insn *insn) { /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON - * is not working properly, so warn about it! + * is not working properly, or interpreter is being used when + * prog->jit_requested is not 0, so warn about it! */ WARN_ON_ONCE(1); return 0; } -#endif -bool bpf_prog_map_compatible(struct bpf_map *map, - const struct bpf_prog *fp) +static bool __bpf_prog_map_compatible(struct bpf_map *map, + const struct bpf_prog *fp) { enum bpf_prog_type prog_type = resolve_prog_type(fp); bool ret; @@ -2312,14 +2368,6 @@ bool bpf_prog_map_compatible(struct bpf_map *map, if (fp->kprobe_override) return false; - /* XDP programs inserted into maps are not guaranteed to run on - * a particular netdev (and can run outside driver context entirely - * in the case of devmap and cpumap). Until device checks - * are implemented, prohibit adding dev-bound programs to program maps. - */ - if (bpf_prog_is_dev_bound(aux)) - return false; - spin_lock(&map->owner.lock); if (!map->owner.type) { /* There's no owner yet where we could check for @@ -2353,6 +2401,19 @@ bool bpf_prog_map_compatible(struct bpf_map *map, return ret; } +bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp) +{ + /* XDP programs inserted into maps are not guaranteed to run on + * a particular netdev (and can run outside driver context entirely + * in the case of devmap and cpumap). Until device checks + * are implemented, prohibit adding dev-bound programs to program maps. + */ + if (bpf_prog_is_dev_bound(fp->aux)) + return false; + + return __bpf_prog_map_compatible(map, fp); +} + static int bpf_check_tail_call(const struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; @@ -2365,7 +2426,7 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) if (!map_type_contains_progs(map)) continue; - if (!bpf_prog_map_compatible(map, fp)) { + if (!__bpf_prog_map_compatible(map, fp)) { ret = -EINVAL; goto out; } @@ -2380,8 +2441,18 @@ static void bpf_prog_select_func(struct bpf_prog *fp) { #ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); + u32 idx = (round_up(stack_depth, 32) / 32) - 1; - fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; + /* may_goto may cause stack size > 512, leading to idx out-of-bounds. + * But for non-JITed programs, we don't need bpf_func, so no bounds + * check needed. + */ + if (!fp->jit_requested && + !WARN_ON_ONCE(idx >= ARRAY_SIZE(interpreters))) { + fp->bpf_func = interpreters[idx]; + } else { + fp->bpf_func = __bpf_prog_ret0_warn; + } #else fp->bpf_func = __bpf_prog_ret0_warn; #endif @@ -2403,7 +2474,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) /* In case of BPF to BPF calls, verifier did all the prep * work with regards to JITing, etc. */ - bool jit_needed = false; + bool jit_needed = fp->jit_requested; if (fp->bpf_func) goto finalize; @@ -2906,6 +2977,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void) return NULL; } +const struct bpf_func_proto * __weak bpf_get_perf_event_read_value_proto(void) +{ + return NULL; +} + u64 __weak bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) @@ -3058,6 +3134,32 @@ void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, { } +bool __weak bpf_jit_supports_timed_may_goto(void) +{ + return false; +} + +u64 __weak arch_bpf_timed_may_goto(void) +{ + return 0; +} + +u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p) +{ + u64 time = ktime_get_mono_fast_ns(); + + /* Populate the timestamp for this stack frame, and refresh count. */ + if (!p->timestamp) { + p->timestamp = time; + return BPF_MAX_TIMED_LOOPS; + } + /* Check if we've exhausted our time slice, and zero count. */ + if (time - p->timestamp >= (NSEC_PER_SEC / 4)) + return 0; + /* Refresh the count for the stack frame. */ + return BPF_MAX_TIMED_LOOPS; +} + /* for configs without MMU or 32-bit */ __weak const struct bpf_map_ops arena_map_ops; __weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 774accbd4a22..67e8a2fc1a99 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -33,8 +33,8 @@ #include <trace/events/xdp.h> #include <linux/btf_ids.h> -#include <linux/netdevice.h> /* netif_receive_skb_list */ -#include <linux/etherdevice.h> /* eth_type_trans */ +#include <linux/netdevice.h> +#include <net/gro.h> /* General idea: XDP packets getting XDP redirected to another CPU, * will maximum be stored/queued for one driver ->poll() call. It is @@ -68,6 +68,7 @@ struct bpf_cpu_map_entry { struct bpf_cpumap_val value; struct bpf_prog *prog; + struct gro_node gro; struct completion kthread_running; struct rcu_work free_work; @@ -133,22 +134,23 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) } } -static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, - struct list_head *listp, - struct xdp_cpumap_stats *stats) +static u32 cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, + void **skbs, u32 skb_n, + struct xdp_cpumap_stats *stats) { - struct sk_buff *skb, *tmp; struct xdp_buff xdp; - u32 act; + u32 act, pass = 0; int err; - list_for_each_entry_safe(skb, tmp, listp, list) { + for (u32 i = 0; i < skb_n; i++) { + struct sk_buff *skb = skbs[i]; + act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog); switch (act) { case XDP_PASS: + skbs[pass++] = skb; break; case XDP_REDIRECT: - skb_list_del_init(skb); err = xdp_do_generic_redirect(skb->dev, skb, &xdp, rcpu->prog); if (unlikely(err)) { @@ -157,7 +159,7 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, } else { stats->redirect++; } - return; + break; default: bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act); fallthrough; @@ -165,12 +167,15 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, trace_xdp_exception(skb->dev, rcpu->prog, act); fallthrough; case XDP_DROP: - skb_list_del_init(skb); - kfree_skb(skb); + napi_consume_skb(skb, true); stats->drop++; - return; + break; } } + + stats->pass += pass; + + return pass; } static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, @@ -204,7 +209,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, stats->drop++; } else { frames[nframes++] = xdpf; - stats->pass++; } break; case XDP_REDIRECT: @@ -228,43 +232,65 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, } xdp_clear_return_frame_no_direct(); + stats->pass += nframes; return nframes; } #define CPUMAP_BATCH 8 -static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, - int xdp_n, struct xdp_cpumap_stats *stats, - struct list_head *list) +struct cpu_map_ret { + u32 xdp_n; + u32 skb_n; +}; + +static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, + void **skbs, struct cpu_map_ret *ret, + struct xdp_cpumap_stats *stats) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; - int nframes; if (!rcpu->prog) - return xdp_n; + goto out; - rcu_read_lock_bh(); + rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); - nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats); + ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats); + if (unlikely(ret->skb_n)) + ret->skb_n = cpu_map_bpf_prog_run_skb(rcpu, skbs, ret->skb_n, + stats); if (stats->redirect) xdp_do_flush(); - if (unlikely(!list_empty(list))) - cpu_map_bpf_prog_run_skb(rcpu, list, stats); - bpf_net_ctx_clear(bpf_net_ctx); - rcu_read_unlock_bh(); /* resched point, may call do_softirq() */ + rcu_read_unlock(); - return nframes; +out: + if (unlikely(ret->skb_n) && ret->xdp_n) + memmove(&skbs[ret->xdp_n], skbs, ret->skb_n * sizeof(*skbs)); +} + +static void cpu_map_gro_flush(struct bpf_cpu_map_entry *rcpu, bool empty) +{ + /* + * If the ring is not empty, there'll be a new iteration soon, and we + * only need to do a full flush if a tick is long (> 1 ms). + * If the ring is empty, to not hold GRO packets in the stack for too + * long, do a full flush. + * This is equivalent to how NAPI decides whether to perform a full + * flush. + */ + gro_flush(&rcpu->gro, !empty && HZ >= 1000); + gro_normal_list(&rcpu->gro); } static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; unsigned long last_qs = jiffies; + u32 packets = 0; complete(&rcpu->kthread_running); set_current_state(TASK_INTERRUPTIBLE); @@ -277,11 +303,11 @@ static int cpu_map_kthread_run(void *data) while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { struct xdp_cpumap_stats stats = {}; /* zero stats */ unsigned int kmem_alloc_drops = 0, sched = 0; - gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; - int i, n, m, nframes, xdp_n; + struct cpu_map_ret ret = { }; void *frames[CPUMAP_BATCH]; void *skbs[CPUMAP_BATCH]; - LIST_HEAD(list); + u32 i, n, m; + bool empty; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -306,7 +332,7 @@ static int cpu_map_kthread_run(void *data) */ n = __ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); - for (i = 0, xdp_n = 0; i < n; i++) { + for (i = 0; i < n; i++) { void *f = frames[i]; struct page *page; @@ -314,11 +340,11 @@ static int cpu_map_kthread_run(void *data) struct sk_buff *skb = f; __ptr_clear_bit(0, &skb); - list_add_tail(&skb->list, &list); + skbs[ret.skb_n++] = skb; continue; } - frames[xdp_n++] = f; + frames[ret.xdp_n++] = f; page = virt_to_page(f); /* Bring struct page memory area to curr CPU. Read by @@ -328,40 +354,51 @@ static int cpu_map_kthread_run(void *data) prefetchw(page); } + local_bh_disable(); + /* Support running another XDP prog on this CPU */ - nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list); - if (nframes) { - m = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, - gfp, nframes, skbs); - if (unlikely(m == 0)) { - for (i = 0; i < nframes; i++) - skbs[i] = NULL; /* effect: xdp_return_frame */ - kmem_alloc_drops += nframes; - } + cpu_map_bpf_prog_run(rcpu, frames, skbs, &ret, &stats); + if (!ret.xdp_n) + goto stats; + + m = napi_skb_cache_get_bulk(skbs, ret.xdp_n); + if (unlikely(m < ret.xdp_n)) { + for (i = m; i < ret.xdp_n; i++) + xdp_return_frame(frames[i]); + + if (ret.skb_n) + memmove(&skbs[m], &skbs[ret.xdp_n], + ret.skb_n * sizeof(*skbs)); + + kmem_alloc_drops += ret.xdp_n - m; + ret.xdp_n = m; } - local_bh_disable(); - for (i = 0; i < nframes; i++) { + for (i = 0; i < ret.xdp_n; i++) { struct xdp_frame *xdpf = frames[i]; - struct sk_buff *skb = skbs[i]; - skb = __xdp_build_skb_from_frame(xdpf, skb, - xdpf->dev_rx); - if (!skb) { - xdp_return_frame(xdpf); - continue; - } - - list_add_tail(&skb->list, &list); + /* Can fail only when !skb -- already handled above */ + __xdp_build_skb_from_frame(xdpf, skbs[i], xdpf->dev_rx); } +stats: /* Feedback loop via tracepoint. * NB: keep before recv to allow measuring enqueue/dequeue latency. */ trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops, sched, &stats); - netif_receive_skb_list(&list); + for (i = 0; i < ret.xdp_n + ret.skb_n; i++) + gro_receive_skb(&rcpu->gro, skbs[i]); + + /* Flush either every 64 packets or in case of empty ring */ + packets += n; + empty = __ptr_ring_empty(rcpu->queue); + if (packets >= NAPI_POLL_WEIGHT || empty) { + cpu_map_gro_flush(rcpu, empty); + packets = 0; + } + local_bh_enable(); /* resched point, may call do_softirq() */ } __set_current_state(TASK_RUNNING); @@ -430,6 +467,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, rcpu->cpu = cpu; rcpu->map_id = map->id; rcpu->value.qsize = value->qsize; + gro_init(&rcpu->gro); if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd)) goto free_ptr_ring; @@ -458,6 +496,7 @@ free_prog: if (rcpu->prog) bpf_prog_put(rcpu->prog); free_ptr_ring: + gro_cleanup(&rcpu->gro); ptr_ring_cleanup(rcpu->queue, NULL); free_queue: kfree(rcpu->queue); @@ -487,6 +526,7 @@ static void __cpu_map_entry_free(struct work_struct *work) if (rcpu->prog) bpf_prog_put(rcpu->prog); + gro_cleanup(&rcpu->gro); /* The queue should be empty at this point */ __cpu_map_ring_cleanup(rcpu->queue); ptr_ring_cleanup(rcpu->queue, NULL); diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index cfa1c18e3a48..9876c5fe6c2a 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -45,6 +45,10 @@ __bpf_kfunc_start_defs(); * * bpf_cpumask_create() allocates memory using the BPF memory allocator, and * will not block. It may return NULL if no memory is available. + * + * Return: + * * A pointer to a new struct bpf_cpumask instance on success. + * * NULL if the BPF memory allocator is out of memory. */ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void) { @@ -71,6 +75,10 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void) * Acquires a reference to a BPF cpumask. The cpumask returned by this function * must either be embedded in a map as a kptr, or freed with * bpf_cpumask_release(). + * + * Return: + * * The struct bpf_cpumask pointer passed to the function. + * */ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) { @@ -106,6 +114,9 @@ CFI_NOSEAL(bpf_cpumask_release_dtor); * * Find the index of the first nonzero bit of the cpumask. A struct bpf_cpumask * pointer may be safely passed to this function. + * + * Return: + * * The index of the first nonzero bit in the struct cpumask. */ __bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask) { @@ -119,6 +130,9 @@ __bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask) * * Find the index of the first unset bit of the cpumask. A struct bpf_cpumask * pointer may be safely passed to this function. + * + * Return: + * * The index of the first zero bit in the struct cpumask. */ __bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) { @@ -133,6 +147,9 @@ __bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) * * Find the index of the first nonzero bit of the AND of two cpumasks. * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + * + * Return: + * * The index of the first bit that is nonzero in both cpumask instances. */ __bpf_kfunc u32 bpf_cpumask_first_and(const struct cpumask *src1, const struct cpumask *src2) @@ -414,12 +431,47 @@ __bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, * @cpumask: The cpumask being queried. * * Count the number of set bits in the given cpumask. + * + * Return: + * * The number of bits set in the mask. */ __bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask) { return cpumask_weight(cpumask); } +/** + * bpf_cpumask_populate() - Populate the CPU mask from the contents of + * a BPF memory region. + * + * @cpumask: The cpumask being populated. + * @src: The BPF memory holding the bit pattern. + * @src__sz: Length of the BPF memory region in bytes. + * + * Return: + * * 0 if the struct cpumask * instance was populated successfully. + * * -EACCES if the memory region is too small to populate the cpumask. + * * -EINVAL if the memory region is not aligned to the size of a long + * and the architecture does not support efficient unaligned accesses. + */ +__bpf_kfunc int bpf_cpumask_populate(struct cpumask *cpumask, void *src, size_t src__sz) +{ + unsigned long source = (unsigned long)src; + + /* The memory region must be large enough to populate the entire CPU mask. */ + if (src__sz < bitmap_size(nr_cpu_ids)) + return -EACCES; + + /* If avoiding unaligned accesses, the input region must be aligned to the nearest long. */ + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && + !IS_ALIGNED(source, sizeof(long))) + return -EINVAL; + + bitmap_copy(cpumask_bits(cpumask), src, nr_cpu_ids); + + return 0; +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(cpumask_kfunc_btf_ids) @@ -448,6 +500,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_populate, KF_RCU) BTF_KFUNCS_END(cpumask_kfunc_btf_ids) static const struct btf_kfunc_id_set cpumask_kfunc_set = { diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 309c4aa1b026..20883c6b1546 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -202,7 +202,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, insn->dst_reg, class == BPF_ALU ? 'w' : 'r', insn->dst_reg); } else if (is_addr_space_cast(insn)) { - verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n", + verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %u, %u)\n", insn->code, insn->dst_reg, insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm); } else if (is_mov_percpu_addr(insn)) { @@ -267,6 +267,18 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, BPF_SIZE(insn->code) == BPF_DW ? "64" : "", bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_LOAD_ACQ) { + verbose(cbs->private_data, "(%02x) r%d = load_acquire((%s *)(r%d %+d))\n", + insn->code, insn->dst_reg, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->off); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_STORE_REL) { + verbose(cbs->private_data, "(%02x) store_release((%s *)(r%d %+d), r%d)\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, insn->src_reg); } else { verbose(cbs->private_data, "BUG_%02x\n", insn->code); } @@ -369,7 +381,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, insn->code, class == BPF_JMP32 ? 'w' : 'r', insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], - insn->imm, insn->off); + (u32)insn->imm, insn->off); } } else { verbose(cbs->private_data, "(%02x) %s\n", diff --git a/kernel/bpf/dmabuf_iter.c b/kernel/bpf/dmabuf_iter.c new file mode 100644 index 000000000000..4dd7ef7c145c --- /dev/null +++ b/kernel/bpf/dmabuf_iter.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Google LLC */ +#include <linux/bpf.h> +#include <linux/btf_ids.h> +#include <linux/dma-buf.h> +#include <linux/kernel.h> +#include <linux/seq_file.h> + +static void *dmabuf_iter_seq_start(struct seq_file *seq, loff_t *pos) +{ + if (*pos) + return NULL; + + return dma_buf_iter_begin(); +} + +static void *dmabuf_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct dma_buf *dmabuf = v; + + ++*pos; + + return dma_buf_iter_next(dmabuf); +} + +struct bpf_iter__dmabuf { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct dma_buf *, dmabuf); +}; + +static int __dmabuf_seq_show(struct seq_file *seq, void *v, bool in_stop) +{ + struct bpf_iter_meta meta = { + .seq = seq, + }; + struct bpf_iter__dmabuf ctx = { + .meta = &meta, + .dmabuf = v, + }; + struct bpf_prog *prog = bpf_iter_get_info(&meta, in_stop); + + if (prog) + return bpf_iter_run_prog(prog, &ctx); + + return 0; +} + +static int dmabuf_iter_seq_show(struct seq_file *seq, void *v) +{ + return __dmabuf_seq_show(seq, v, false); +} + +static void dmabuf_iter_seq_stop(struct seq_file *seq, void *v) +{ + struct dma_buf *dmabuf = v; + + if (dmabuf) + dma_buf_put(dmabuf); +} + +static const struct seq_operations dmabuf_iter_seq_ops = { + .start = dmabuf_iter_seq_start, + .next = dmabuf_iter_seq_next, + .stop = dmabuf_iter_seq_stop, + .show = dmabuf_iter_seq_show, +}; + +static void bpf_iter_dmabuf_show_fdinfo(const struct bpf_iter_aux_info *aux, + struct seq_file *seq) +{ + seq_puts(seq, "dmabuf iter\n"); +} + +static const struct bpf_iter_seq_info dmabuf_iter_seq_info = { + .seq_ops = &dmabuf_iter_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = 0, +}; + +static struct bpf_iter_reg bpf_dmabuf_reg_info = { + .target = "dmabuf", + .feature = BPF_ITER_RESCHED, + .show_fdinfo = bpf_iter_dmabuf_show_fdinfo, + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__dmabuf, dmabuf), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &dmabuf_iter_seq_info, +}; + +DEFINE_BPF_ITER_FUNC(dmabuf, struct bpf_iter_meta *meta, struct dma_buf *dmabuf) +BTF_ID_LIST_SINGLE(bpf_dmabuf_btf_id, struct, dma_buf) + +static int __init dmabuf_iter_init(void) +{ + bpf_dmabuf_reg_info.ctx_arg_info[0].btf_id = bpf_dmabuf_btf_id[0]; + return bpf_iter_reg_target(&bpf_dmabuf_reg_info); +} + +late_initcall(dmabuf_iter_init); + +struct bpf_iter_dmabuf { + /* + * opaque iterator state; having __u64 here allows to preserve correct + * alignment requirements in vmlinux.h, generated from BTF + */ + __u64 __opaque[1]; +} __aligned(8); + +/* Non-opaque version of bpf_iter_dmabuf */ +struct bpf_iter_dmabuf_kern { + struct dma_buf *dmabuf; +} __aligned(8); + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_iter_dmabuf_new(struct bpf_iter_dmabuf *it) +{ + struct bpf_iter_dmabuf_kern *kit = (void *)it; + + BUILD_BUG_ON(sizeof(*kit) > sizeof(*it)); + BUILD_BUG_ON(__alignof__(*kit) != __alignof__(*it)); + + kit->dmabuf = NULL; + return 0; +} + +__bpf_kfunc struct dma_buf *bpf_iter_dmabuf_next(struct bpf_iter_dmabuf *it) +{ + struct bpf_iter_dmabuf_kern *kit = (void *)it; + + if (kit->dmabuf) + kit->dmabuf = dma_buf_iter_next(kit->dmabuf); + else + kit->dmabuf = dma_buf_iter_begin(); + + return kit->dmabuf; +} + +__bpf_kfunc void bpf_iter_dmabuf_destroy(struct bpf_iter_dmabuf *it) +{ + struct bpf_iter_dmabuf_kern *kit = (void *)it; + + if (kit->dmabuf) + dma_buf_put(kit->dmabuf); +} + +__bpf_kfunc_end_defs(); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4a9eeb7aef85..71f9931ac64c 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -16,6 +16,7 @@ #include "bpf_lru_list.h" #include "map_in_map.h" #include <linux/bpf_mem_alloc.h> +#include <asm/rqspinlock.h> #define HTAB_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ @@ -78,7 +79,7 @@ */ struct bucket { struct hlist_nulls_head head; - raw_spinlock_t raw_lock; + rqspinlock_t raw_lock; }; #define HASHTAB_MAP_LOCK_COUNT 8 @@ -104,8 +105,6 @@ struct bpf_htab { u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ u32 hashrnd; - struct lock_class_key lockdep_key; - int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT]; }; /* each htab element is struct htab_elem + key + value */ @@ -140,45 +139,26 @@ static void htab_init_buckets(struct bpf_htab *htab) for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); - raw_spin_lock_init(&htab->buckets[i].raw_lock); - lockdep_set_class(&htab->buckets[i].raw_lock, - &htab->lockdep_key); + raw_res_spin_lock_init(&htab->buckets[i].raw_lock); cond_resched(); } } -static inline int htab_lock_bucket(const struct bpf_htab *htab, - struct bucket *b, u32 hash, - unsigned long *pflags) +static inline int htab_lock_bucket(struct bucket *b, unsigned long *pflags) { unsigned long flags; + int ret; - hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); - - preempt_disable(); - local_irq_save(flags); - if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { - __this_cpu_dec(*(htab->map_locked[hash])); - local_irq_restore(flags); - preempt_enable(); - return -EBUSY; - } - - raw_spin_lock(&b->raw_lock); + ret = raw_res_spin_lock_irqsave(&b->raw_lock, flags); + if (ret) + return ret; *pflags = flags; - return 0; } -static inline void htab_unlock_bucket(const struct bpf_htab *htab, - struct bucket *b, u32 hash, - unsigned long flags) +static inline void htab_unlock_bucket(struct bucket *b, unsigned long flags) { - hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); - raw_spin_unlock(&b->raw_lock); - __this_cpu_dec(*(htab->map_locked[hash])); - local_irq_restore(flags); - preempt_enable(); + raw_res_spin_unlock_irqrestore(&b->raw_lock, flags); } static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); @@ -195,20 +175,30 @@ static bool htab_is_percpu(const struct bpf_htab *htab) htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } +static inline bool is_fd_htab(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS; +} + +static inline void *htab_elem_value(struct htab_elem *l, u32 key_size) +{ + return l->key + round_up(key_size, 8); +} + static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { - *(void __percpu **)(l->key + key_size) = pptr; + *(void __percpu **)htab_elem_value(l, key_size) = pptr; } static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) { - return *(void __percpu **)(l->key + key_size); + return *(void __percpu **)htab_elem_value(l, key_size); } static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) { - return *(void **)(l->key + roundup(map->key_size, 8)); + return *(void **)htab_elem_value(l, map->key_size); } static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) @@ -216,9 +206,13 @@ static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size); } +/* Both percpu and fd htab support in-place update, so no need for + * extra elem. LRU itself can remove the least used element, so + * there is no need for an extra elem during map_update. + */ static bool htab_has_extra_elems(struct bpf_htab *htab) { - return !htab_is_percpu(htab) && !htab_is_lru(htab); + return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab); } static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) @@ -235,10 +229,10 @@ static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) elem = get_htab_elem(htab, i); if (btf_record_has_field(htab->map.record, BPF_TIMER)) bpf_obj_free_timer(htab->map.record, - elem->key + round_up(htab->map.key_size, 8)); + htab_elem_value(elem, htab->map.key_size)); if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, - elem->key + round_up(htab->map.key_size, 8)); + htab_elem_value(elem, htab->map.key_size)); cond_resched(); } } @@ -265,7 +259,8 @@ static void htab_free_prealloced_fields(struct bpf_htab *htab) cond_resched(); } } else { - bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); + bpf_obj_free_fields(htab->map.record, + htab_elem_value(elem, htab->map.key_size)); cond_resched(); } cond_resched(); @@ -473,8 +468,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); - bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || - attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); /* percpu_lru means each cpu has its own LRU list. * it is different from BPF_MAP_TYPE_PERCPU_HASH where * the map's value itself is percpu. percpu_lru has @@ -483,14 +476,12 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; - int err, i; + int err; htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); if (!htab) return ERR_PTR(-ENOMEM); - lockdep_register_key(&htab->lockdep_key); - bpf_map_init_from_attr(&htab->map, attr); if (percpu_lru) { @@ -536,15 +527,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (!htab->buckets) goto free_elem_count; - for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) { - htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map, - sizeof(int), - sizeof(int), - GFP_USER); - if (!htab->map_locked[i]) - goto free_map_locked; - } - if (htab->map.map_flags & BPF_F_ZERO_SEED) htab->hashrnd = 0; else @@ -580,10 +562,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (err) goto free_map_locked; - if (!percpu && !lru) { - /* lru itself can remove the least used element, so - * there is no need for an extra elem during map_update. - */ + if (htab_has_extra_elems(htab)) { err = alloc_extra_elems(htab); if (err) goto free_prealloc; @@ -607,15 +586,12 @@ free_prealloc: free_map_locked: if (htab->use_percpu_counter) percpu_counter_destroy(&htab->pcount); - for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) - free_percpu(htab->map_locked[i]); bpf_map_area_free(htab->buckets); bpf_mem_alloc_destroy(&htab->pcpu_ma); bpf_mem_alloc_destroy(&htab->ma); free_elem_count: bpf_map_free_elem_count(&htab->map); free_htab: - lockdep_unregister_key(&htab->lockdep_key); bpf_map_area_free(htab); return ERR_PTR(err); } @@ -704,7 +680,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) struct htab_elem *l = __htab_map_lookup_elem(map, key); if (l) - return l->key + round_up(map->key_size, 8); + return htab_elem_value(l, map->key_size); return NULL; } @@ -743,7 +719,7 @@ static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map, if (l) { if (mark) bpf_lru_node_set_ref(&l->lru_node); - return l->key + round_up(map->key_size, 8); + return htab_elem_value(l, map->key_size); } return NULL; @@ -787,6 +763,9 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map, static void check_and_free_fields(struct bpf_htab *htab, struct htab_elem *elem) { + if (IS_ERR_OR_NULL(htab->map.record)) + return; + if (htab_is_percpu(htab)) { void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size); int cpu; @@ -794,7 +773,7 @@ static void check_and_free_fields(struct bpf_htab *htab, for_each_possible_cpu(cpu) bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); } else { - void *map_value = elem->key + round_up(htab->map.key_size, 8); + void *map_value = htab_elem_value(elem, htab->map.key_size); bpf_obj_free_fields(htab->map.record, map_value); } @@ -817,7 +796,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) b = __select_bucket(htab, tgt_l->hash); head = &b->head; - ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) return false; @@ -828,7 +807,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) break; } - htab_unlock_bucket(htab, b, tgt_l->hash, flags); + htab_unlock_bucket(b, flags); if (l == tgt_l) check_and_free_fields(htab, l); @@ -999,8 +978,7 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) { - return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && - BITS_PER_LONG == 64; + return is_fd_htab(htab) && BITS_PER_LONG == 64; } static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, @@ -1070,11 +1048,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, htab_elem_set_ptr(l_new, key_size, pptr); } else if (fd_htab_map_needs_adjust(htab)) { size = round_up(size, 8); - memcpy(l_new->key + round_up(key_size, 8), value, size); + memcpy(htab_elem_value(l_new, key_size), value, size); } else { - copy_map_value(&htab->map, - l_new->key + round_up(key_size, 8), - value); + copy_map_value(&htab->map, htab_elem_value(l_new, key_size), value); } l_new->hash = hash; @@ -1103,10 +1079,9 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l_new = NULL, *l_old; + struct htab_elem *l_new, *l_old; struct hlist_nulls_head *head; unsigned long flags; - void *old_map_ptr; struct bucket *b; u32 key_size, hash; int ret; @@ -1137,7 +1112,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (l_old) { /* grab the element lock and update value in place */ copy_map_value_locked(map, - l_old->key + round_up(key_size, 8), + htab_elem_value(l_old, key_size), value, false); return 0; } @@ -1147,7 +1122,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, */ } - ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) return ret; @@ -1165,7 +1140,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, * and update element in place */ copy_map_value_locked(map, - l_old->key + round_up(key_size, 8), + htab_elem_value(l_old, key_size), value, false); ret = 0; goto err; @@ -1187,27 +1162,17 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, hlist_nulls_del_rcu(&l_old->hash_node); /* l_old has already been stashed in htab->extra_elems, free - * its special fields before it is available for reuse. Also - * save the old map pointer in htab of maps before unlock - * and release it after unlock. + * its special fields before it is available for reuse. */ - old_map_ptr = NULL; - if (htab_is_prealloc(htab)) { - if (map->ops->map_fd_put_ptr) - old_map_ptr = fd_htab_map_get_ptr(map, l_old); + if (htab_is_prealloc(htab)) check_and_free_fields(htab, l_old); - } - } - htab_unlock_bucket(htab, b, hash, flags); - if (l_old) { - if (old_map_ptr) - map->ops->map_fd_put_ptr(map, old_map_ptr, true); - if (!htab_is_prealloc(htab)) - free_htab_elem(htab, l_old); } + htab_unlock_bucket(b, flags); + if (l_old && !htab_is_prealloc(htab)) + free_htab_elem(htab, l_old); return 0; err: - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); return ret; } @@ -1251,10 +1216,9 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value l_new = prealloc_lru_pop(htab, key, hash); if (!l_new) return -ENOMEM; - copy_map_value(&htab->map, - l_new->key + round_up(map->key_size, 8), value); + copy_map_value(&htab->map, htab_elem_value(l_new, map->key_size), value); - ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) goto err_lock_bucket; @@ -1275,7 +1239,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value ret = 0; err: - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); err_lock_bucket: if (ret) @@ -1286,13 +1250,14 @@ err_lock_bucket: return ret; } -static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, +static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, void *value, u64 map_flags, - bool onallcpus) + bool percpu, bool onallcpus) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l_new = NULL, *l_old; + struct htab_elem *l_new, *l_old; struct hlist_nulls_head *head; + void *old_map_ptr = NULL; unsigned long flags; struct bucket *b; u32 key_size, hash; @@ -1312,7 +1277,7 @@ static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, b = __select_bucket(htab, hash); head = &b->head; - ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) return ret; @@ -1323,21 +1288,29 @@ static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, goto err; if (l_old) { - /* per-cpu hash map can update value in-place */ - pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), - value, onallcpus); + /* Update value in-place */ + if (percpu) { + pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), + value, onallcpus); + } else { + void **inner_map_pptr = htab_elem_value(l_old, key_size); + + old_map_ptr = *inner_map_pptr; + WRITE_ONCE(*inner_map_pptr, *(void **)value); + } } else { l_new = alloc_htab_elem(htab, key, value, key_size, - hash, true, onallcpus, NULL); + hash, percpu, onallcpus, NULL); if (IS_ERR(l_new)) { ret = PTR_ERR(l_new); goto err; } hlist_nulls_add_head_rcu(&l_new->hash_node, head); } - ret = 0; err: - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); + if (old_map_ptr) + map->ops->map_fd_put_ptr(map, old_map_ptr, true); return ret; } @@ -1378,7 +1351,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, return -ENOMEM; } - ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) goto err_lock_bucket; @@ -1402,7 +1375,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, } ret = 0; err: - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); err_lock_bucket: if (l_new) { bpf_map_dec_elem_count(&htab->map); @@ -1414,7 +1387,7 @@ err_lock_bucket: static long htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { - return __htab_percpu_map_update_elem(map, key, value, map_flags, false); + return htab_map_update_elem_in_place(map, key, value, map_flags, true, false); } static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, @@ -1444,7 +1417,7 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) return ret; @@ -1454,7 +1427,7 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key) else ret = -ENOENT; - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); if (l) free_htab_elem(htab, l); @@ -1480,7 +1453,7 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) return ret; @@ -1491,7 +1464,7 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) else ret = -ENOENT; - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); if (l) htab_lru_push_free(htab, l); return ret; @@ -1531,10 +1504,10 @@ static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) /* We only free timer on uref dropping to zero */ if (btf_record_has_field(htab->map.record, BPF_TIMER)) bpf_obj_free_timer(htab->map.record, - l->key + round_up(htab->map.key_size, 8)); + htab_elem_value(l, htab->map.key_size)); if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, - l->key + round_up(htab->map.key_size, 8)); + htab_elem_value(l, htab->map.key_size)); } cond_resched_rcu(); } @@ -1558,7 +1531,6 @@ static void htab_map_free_timers_and_wq(struct bpf_map *map) static void htab_map_free(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - int i; /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. * bpf_free_used_maps() is called after bpf prog is no longer executing. @@ -1583,9 +1555,6 @@ static void htab_map_free(struct bpf_map *map) bpf_mem_alloc_destroy(&htab->ma); if (htab->use_percpu_counter) percpu_counter_destroy(&htab->pcount); - for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) - free_percpu(htab->map_locked[i]); - lockdep_unregister_key(&htab->lockdep_key); bpf_map_area_free(htab); } @@ -1628,7 +1597,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, b = __select_bucket(htab, hash); head = &b->head; - ret = htab_lock_bucket(htab, b, hash, &bflags); + ret = htab_lock_bucket(b, &bflags); if (ret) return ret; @@ -1650,22 +1619,19 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, off += roundup_value_size; } } else { - u32 roundup_key_size = round_up(map->key_size, 8); + void *src = htab_elem_value(l, map->key_size); if (flags & BPF_F_LOCK) - copy_map_value_locked(map, value, l->key + - roundup_key_size, - true); + copy_map_value_locked(map, value, src, true); else - copy_map_value(map, value, l->key + - roundup_key_size); + copy_map_value(map, value, src); /* Zeroing special fields in the temp buffer */ check_and_init_map_value(map, value); } hlist_nulls_del_rcu(&l->hash_node); out_unlock: - htab_unlock_bucket(htab, b, hash, bflags); + htab_unlock_bucket(b, bflags); if (l) { if (is_lru_map) @@ -1715,12 +1681,12 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, bool is_percpu) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - u32 bucket_cnt, total, key_size, value_size, roundup_key_size; void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; void __user *uvalues = u64_to_user_ptr(attr->batch.values); void __user *ukeys = u64_to_user_ptr(attr->batch.keys); void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); u32 batch, max_count, size, bucket_size, map_id; + u32 bucket_cnt, total, key_size, value_size; struct htab_elem *node_to_free = NULL; u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; @@ -1755,7 +1721,6 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, return -ENOENT; key_size = htab->map.key_size; - roundup_key_size = round_up(htab->map.key_size, 8); value_size = htab->map.value_size; size = round_up(value_size, 8); if (is_percpu) @@ -1787,7 +1752,7 @@ again_nocopy: head = &b->head; /* do not grab the lock unless need it (bucket_cnt > 0). */ if (locked) { - ret = htab_lock_bucket(htab, b, batch, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) { rcu_read_unlock(); bpf_enable_instrumentation(); @@ -1810,7 +1775,7 @@ again_nocopy: /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ - htab_unlock_bucket(htab, b, batch, flags); + htab_unlock_bucket(b, flags); rcu_read_unlock(); bpf_enable_instrumentation(); goto after_loop; @@ -1821,7 +1786,7 @@ again_nocopy: /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ - htab_unlock_bucket(htab, b, batch, flags); + htab_unlock_bucket(b, flags); rcu_read_unlock(); bpf_enable_instrumentation(); kvfree(keys); @@ -1847,8 +1812,8 @@ again_nocopy: off += size; } } else { - value = l->key + roundup_key_size; - if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + value = htab_elem_value(l, key_size); + if (is_fd_htab(htab)) { struct bpf_map **inner_map = value; /* Actual value is the id of the inner map */ @@ -1884,7 +1849,7 @@ again_nocopy: dst_val += value_size; } - htab_unlock_bucket(htab, b, batch, flags); + htab_unlock_bucket(b, flags); locked = false; while (node_to_free) { @@ -2098,11 +2063,11 @@ static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) { struct bpf_iter_seq_hash_map_info *info = seq->private; - u32 roundup_key_size, roundup_value_size; struct bpf_iter__bpf_map_elem ctx = {}; struct bpf_map *map = info->map; struct bpf_iter_meta meta; int ret = 0, off = 0, cpu; + u32 roundup_value_size; struct bpf_prog *prog; void __percpu *pptr; @@ -2112,10 +2077,9 @@ static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) ctx.meta = &meta; ctx.map = info->map; if (elem) { - roundup_key_size = round_up(map->key_size, 8); ctx.key = elem->key; if (!info->percpu_value_buf) { - ctx.value = elem->key + roundup_key_size; + ctx.value = htab_elem_value(elem, map->key_size); } else { roundup_value_size = round_up(map->value_size, 8); pptr = htab_elem_get_ptr(elem, map->key_size); @@ -2200,7 +2164,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ struct hlist_nulls_head *head; struct hlist_nulls_node *n; struct htab_elem *elem; - u32 roundup_key_size; int i, num_elems = 0; void __percpu *pptr; struct bucket *b; @@ -2215,7 +2178,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ is_percpu = htab_is_percpu(htab); - roundup_key_size = round_up(map->key_size, 8); /* migration has been disabled, so percpu value prepared here will be * the same as the one seen by the bpf program with * bpf_map_lookup_elem(). @@ -2224,14 +2186,14 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ b = &htab->buckets[i]; rcu_read_lock(); head = &b->head; - hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { + hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) { key = elem->key; if (is_percpu) { /* current cpu value for percpu map */ pptr = htab_elem_get_ptr(elem, map->key_size); val = this_cpu_ptr(pptr); } else { - val = elem->key + roundup_key_size; + val = htab_elem_value(elem, map->key_size); } num_elems++; ret = callback_fn((u64)(long)map, (u64)(long)key, @@ -2354,7 +2316,7 @@ static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3); *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, - offsetof(struct htab_elem, key) + map->key_size); + offsetof(struct htab_elem, key) + roundup(map->key_size, 8)); *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); @@ -2446,8 +2408,8 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, ret = __htab_lru_percpu_map_update_elem(map, key, value, map_flags, true); else - ret = __htab_percpu_map_update_elem(map, key, value, map_flags, - true); + ret = htab_map_update_elem_in_place(map, key, value, map_flags, + true, true); rcu_read_unlock(); return ret; @@ -2571,24 +2533,23 @@ int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) return ret; } -/* only called from syscall */ +/* Only called from syscall */ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags) { void *ptr; int ret; - u32 ufd = *(u32 *)value; - ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); + ptr = map->ops->map_fd_get_ptr(map, map_file, *(int *)value); if (IS_ERR(ptr)) return PTR_ERR(ptr); /* The htab bucket lock is always held during update operations in fd * htab map, and the following rcu_read_lock() is only used to avoid - * the WARN_ON_ONCE in htab_map_update_elem(). + * the WARN_ON_ONCE in htab_map_update_elem_in_place(). */ rcu_read_lock(); - ret = htab_map_update_elem(map, key, &ptr, map_flags); + ret = htab_map_update_elem_in_place(map, key, &ptr, map_flags, false, false); rcu_read_unlock(); if (ret) map->ops->map_fd_put_ptr(map, ptr, false); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f27ce162427a..b71e428ad936 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -23,6 +23,7 @@ #include <linux/btf_ids.h> #include <linux/bpf_mem_alloc.h> #include <linux/kasan.h> +#include <linux/bpf_verifier.h> #include "../../lib/kstrtox.h" @@ -129,7 +130,8 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = { BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu); } @@ -1284,8 +1286,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u atomic_set(&t->cancelling, 0); INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work); - hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); - t->timer.function = bpf_timer_cb; + hrtimer_setup(&t->timer, bpf_timer_cb, clockid, HRTIMER_MODE_REL_SOFT); cb->value = (void *)async - map->record->timer_off; break; case BPF_ASYNC_TYPE_WQ: @@ -1714,16 +1715,6 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) memset(ptr, 0, sizeof(*ptr)); } -static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) -{ - u32 size = __bpf_dynptr_size(ptr); - - if (len > size || offset > size - len) - return -E2BIG; - - return 0; -} - BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr) { int err; @@ -1759,8 +1750,8 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE, }; -BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, - u32, offset, u64, flags) +static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *src, + u32 offset, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1793,6 +1784,12 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern } } +BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, + u32, offset, u64, flags) +{ + return __bpf_dynptr_read(dst, len, src, offset, flags); +} + static const struct bpf_func_proto bpf_dynptr_read_proto = { .func = bpf_dynptr_read, .gpl_only = false, @@ -1804,8 +1801,8 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .arg5_type = ARG_ANYTHING, }; -BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, - u32, len, u64, flags) +int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, + u32 len, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1843,6 +1840,12 @@ BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, v } } +BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, + u32, len, u64, flags) +{ + return __bpf_dynptr_write(dst, offset, src, len, flags); +} + static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, @@ -1901,6 +1904,12 @@ const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_proto __weak; const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak; const struct bpf_func_proto bpf_task_pt_regs_proto __weak; +const struct bpf_func_proto bpf_perf_event_read_proto __weak; +const struct bpf_func_proto bpf_send_signal_proto __weak; +const struct bpf_func_proto bpf_send_signal_thread_proto __weak; +const struct bpf_func_proto bpf_get_task_stack_sleepable_proto __weak; +const struct bpf_func_proto bpf_get_task_stack_proto __weak; +const struct bpf_func_proto bpf_get_branch_snapshot_proto __weak; const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) @@ -1954,6 +1963,8 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_ns_current_pid_tgid: return &bpf_get_ns_current_pid_tgid_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; default: break; } @@ -2011,7 +2022,21 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_get_current_ancestor_cgroup_id: return &bpf_get_current_ancestor_cgroup_id_proto; + case BPF_FUNC_current_task_under_cgroup: + return &bpf_current_task_under_cgroup_proto; #endif +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_curr_proto; +#endif + case BPF_FUNC_task_storage_get: + if (bpf_prog_check_recur(prog)) + return &bpf_task_storage_get_recur_proto; + return &bpf_task_storage_get_proto; + case BPF_FUNC_task_storage_delete: + if (bpf_prog_check_recur(prog)) + return &bpf_task_storage_delete_recur_proto; + return &bpf_task_storage_delete_proto; default: break; } @@ -2026,6 +2051,8 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_task_proto; case BPF_FUNC_get_current_task_btf: return &bpf_get_current_task_btf_proto; + case BPF_FUNC_get_current_comm: + return &bpf_get_current_comm_proto; case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: @@ -2036,6 +2063,10 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_probe_read_kernel_str: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_str_proto; + case BPF_FUNC_copy_from_user: + return &bpf_copy_from_user_proto; + case BPF_FUNC_copy_from_user_task: + return &bpf_copy_from_user_task_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; case BPF_FUNC_snprintf: @@ -2044,6 +2075,21 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_task_pt_regs_proto; case BPF_FUNC_trace_vprintk: return bpf_get_trace_vprintk_proto(); + case BPF_FUNC_perf_event_read_value: + return bpf_get_perf_event_read_value_proto(); + case BPF_FUNC_perf_event_read: + return &bpf_perf_event_read_proto; + case BPF_FUNC_send_signal: + return &bpf_send_signal_proto; + case BPF_FUNC_send_signal_thread: + return &bpf_send_signal_thread_proto; + case BPF_FUNC_get_task_stack: + return prog->sleepable ? &bpf_get_task_stack_sleepable_proto + : &bpf_get_task_stack_proto; + case BPF_FUNC_get_branch_snapshot: + return &bpf_get_branch_snapshot_proto; + case BPF_FUNC_find_vma: + return &bpf_find_vma_proto; default: return NULL; } @@ -2280,6 +2326,26 @@ __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) return __bpf_list_del(head, true); } +__bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head) +{ + struct list_head *h = (struct list_head *)head; + + if (list_empty(h) || unlikely(!h->next)) + return NULL; + + return (struct bpf_list_node *)h->next; +} + +__bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head) +{ + struct list_head *h = (struct list_head *)head; + + if (list_empty(h) || unlikely(!h->next)) + return NULL; + + return (struct bpf_list_node *)h->prev; +} + __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, struct bpf_rb_node *node) { @@ -2353,6 +2419,33 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) return (struct bpf_rb_node *)rb_first_cached(r); } +__bpf_kfunc struct bpf_rb_node *bpf_rbtree_root(struct bpf_rb_root *root) +{ + struct rb_root_cached *r = (struct rb_root_cached *)root; + + return (struct bpf_rb_node *)r->rb_root.rb_node; +} + +__bpf_kfunc struct bpf_rb_node *bpf_rbtree_left(struct bpf_rb_root *root, struct bpf_rb_node *node) +{ + struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; + + if (READ_ONCE(node_internal->owner) != root) + return NULL; + + return (struct bpf_rb_node *)node_internal->rb_node.rb_left; +} + +__bpf_kfunc struct bpf_rb_node *bpf_rbtree_right(struct bpf_rb_root *root, struct bpf_rb_node *node) +{ + struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; + + if (READ_ONCE(node_internal->owner) != root) + return NULL; + + return (struct bpf_rb_node *)node_internal->rb_node.rb_right; +} + /** * bpf_task_acquire - Acquire a reference to a task. A task acquired by this * kfunc which is not stored in a map as a kptr, must be released by calling @@ -2758,6 +2851,61 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, return 0; } +/** + * bpf_dynptr_copy() - Copy data from one dynptr to another. + * @dst_ptr: Destination dynptr - where data should be copied to + * @dst_off: Offset into the destination dynptr + * @src_ptr: Source dynptr - where data should be copied from + * @src_off: Offset into the source dynptr + * @size: Length of the data to copy from source to destination + * + * Copies data from source dynptr to destination dynptr. + * Returns 0 on success; negative error, otherwise. + */ +__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, + struct bpf_dynptr *src_ptr, u32 src_off, u32 size) +{ + struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; + struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; + void *src_slice, *dst_slice; + char buf[256]; + u32 off; + + src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size); + dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size); + + if (src_slice && dst_slice) { + memmove(dst_slice, src_slice, size); + return 0; + } + + if (src_slice) + return __bpf_dynptr_write(dst, dst_off, src_slice, size, 0); + + if (dst_slice) + return __bpf_dynptr_read(dst_slice, size, src, src_off, 0); + + if (bpf_dynptr_check_off_len(dst, dst_off, size) || + bpf_dynptr_check_off_len(src, src_off, size)) + return -E2BIG; + + off = 0; + while (off < size) { + u32 chunk_sz = min_t(u32, sizeof(buf), size - off); + int err; + + err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0); + if (err) + return err; + err = __bpf_dynptr_write(dst, dst_off + off, buf, chunk_sz, 0); + if (err) + return err; + + off += chunk_sz; + } + return 0; +} + __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) { return obj; @@ -2855,9 +3003,9 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, int (callback_fn)(void *map, int *key, void *value), unsigned int flags, - void *aux__ign) + void *aux__prog) { - struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__ign; + struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__prog; struct bpf_async_kern *async = (struct bpf_async_kern *)wq; if (flags) @@ -3067,6 +3215,50 @@ __bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user return ret + 1; } +/** + * bpf_copy_from_user_task_str() - Copy a string from an task's address space + * @dst: Destination address, in kernel space. This buffer must be + * at least @dst__sz bytes long. + * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL. + * @unsafe_ptr__ign: Source address in the task's address space. + * @tsk: The task whose address space will be used + * @flags: The only supported flag is BPF_F_PAD_ZEROS + * + * Copies a NUL terminated string from a task's address space to @dst__sz + * buffer. If user string is too long this will still ensure zero termination + * in the @dst__sz buffer unless buffer size is 0. + * + * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst__sz to 0 on success + * and memset all of @dst__sz on failure. + * + * Return: The number of copied bytes on success including the NUL terminator. + * A negative error code on failure. + */ +__bpf_kfunc int bpf_copy_from_user_task_str(void *dst, u32 dst__sz, + const void __user *unsafe_ptr__ign, + struct task_struct *tsk, u64 flags) +{ + int ret; + + if (unlikely(flags & ~BPF_F_PAD_ZEROS)) + return -EINVAL; + + if (unlikely(dst__sz == 0)) + return 0; + + ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_ptr__ign, dst, dst__sz, 0); + if (ret < 0) { + if (flags & BPF_F_PAD_ZEROS) + memset(dst, 0, dst__sz); + return ret; + } + + if (flags & BPF_F_PAD_ZEROS) + memset(dst + ret, 0, dst__sz - ret); + + return ret + 1; +} + /* Keep unsinged long in prototype so that kfunc is usable when emitted to * vmlinux.h in BPF programs directly, but note that while in BPF prog, the * unsigned long always points to 8-byte region on stack, the kernel may only @@ -3082,6 +3274,10 @@ __bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag) local_irq_restore(*flags__irq_flag); } +__bpf_kfunc void __bpf_trap(void) +{ +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) @@ -3097,11 +3293,16 @@ BTF_ID_FLAGS(func, bpf_list_push_front_impl) BTF_ID_FLAGS(func, bpf_list_push_back_impl) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_add_impl) BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_rbtree_left, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_rbtree_right, KF_RET_NULL) #ifdef CONFIG_CGROUPS BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) @@ -3162,6 +3363,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) +BTF_ID_FLAGS(func, bpf_dynptr_copy) #ifdef CONFIG_NET BTF_ID_FLAGS(func, bpf_modify_return_test_tp) #endif @@ -3174,12 +3376,27 @@ BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_str, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_get_kmem_cache) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_local_irq_save) BTF_ID_FLAGS(func, bpf_local_irq_restore) +BTF_ID_FLAGS(func, bpf_probe_read_user_dynptr) +BTF_ID_FLAGS(func, bpf_probe_read_kernel_dynptr) +BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr) +BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr) +BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +#ifdef CONFIG_DMA_SHARED_BUFFER +BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) +#endif +BTF_ID_FLAGS(func, __bpf_trap) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 9aaf5124648b..5c2e96b19392 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -150,14 +150,14 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } -static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); if (IS_ERR(inode)) - return PTR_ERR(inode); + return ERR_CAST(inode); inode->i_op = &bpf_dir_iops; inode->i_fop = &simple_dir_operations; @@ -166,7 +166,7 @@ static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, inc_nlink(dir); bpf_dentry_finalize(dentry, inode, dir); - return 0; + return NULL; } struct map_iter { @@ -421,7 +421,7 @@ static int bpf_iter_link_pin_kernel(struct dentry *parent, int ret; inode_lock(parent->d_inode); - dentry = lookup_one_len(name, parent, strlen(name)); + dentry = lookup_noperm(&QSTR(name), parent); if (IS_ERR(dentry)) { inode_unlock(parent->d_inode); return PTR_ERR(dentry); diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index e8a772e64324..be66d7e520e0 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -15,6 +15,7 @@ #include <net/ipv6.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> +#include <asm/rqspinlock.h> #include <linux/bpf_mem_alloc.h> /* Intermediate node */ @@ -36,7 +37,7 @@ struct lpm_trie { size_t n_entries; size_t max_prefixlen; size_t data_size; - raw_spinlock_t lock; + rqspinlock_t lock; }; /* This trie implements a longest prefix match algorithm that can be used to @@ -342,7 +343,9 @@ static long trie_update_elem(struct bpf_map *map, if (!new_node) return -ENOMEM; - raw_spin_lock_irqsave(&trie->lock, irq_flags); + ret = raw_res_spin_lock_irqsave(&trie->lock, irq_flags); + if (ret) + goto out_free; new_node->prefixlen = key->prefixlen; RCU_INIT_POINTER(new_node->child[0], NULL); @@ -356,8 +359,7 @@ static long trie_update_elem(struct bpf_map *map, */ slot = &trie->root; - while ((node = rcu_dereference_protected(*slot, - lockdep_is_held(&trie->lock)))) { + while ((node = rcu_dereference(*slot))) { matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || @@ -442,8 +444,8 @@ static long trie_update_elem(struct bpf_map *map, rcu_assign_pointer(*slot, im_node); out: - raw_spin_unlock_irqrestore(&trie->lock, irq_flags); - + raw_res_spin_unlock_irqrestore(&trie->lock, irq_flags); +out_free: if (ret) bpf_mem_cache_free(&trie->ma, new_node); bpf_mem_cache_free_rcu(&trie->ma, free_node); @@ -467,7 +469,9 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) if (key->prefixlen > trie->max_prefixlen) return -EINVAL; - raw_spin_lock_irqsave(&trie->lock, irq_flags); + ret = raw_res_spin_lock_irqsave(&trie->lock, irq_flags); + if (ret) + return ret; /* Walk the tree looking for an exact key/length match and keeping * track of the path we traverse. We will need to know the node @@ -478,8 +482,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) trim = &trie->root; trim2 = trim; parent = NULL; - while ((node = rcu_dereference_protected( - *trim, lockdep_is_held(&trie->lock)))) { + while ((node = rcu_dereference(*trim))) { matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || @@ -543,7 +546,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) free_node = node; out: - raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + raw_res_spin_unlock_irqrestore(&trie->lock, irq_flags); bpf_mem_cache_free_rcu(&trie->ma, free_parent); bpf_mem_cache_free_rcu(&trie->ma, free_node); @@ -592,7 +595,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) offsetof(struct bpf_lpm_trie_key_u8, data); trie->max_prefixlen = trie->data_size * 8; - raw_spin_lock_init(&trie->lock); + raw_res_spin_lock_init(&trie->lock); /* Allocate intermediate and leaf nodes from the same allocator */ leaf_size = sizeof(struct lpm_trie_node) + trie->data_size + diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 1a4fec330eaa..42ae8d595c2c 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -25,6 +25,7 @@ #include <linux/rhashtable.h> #include <linux/rtnetlink.h> #include <linux/rwsem.h> +#include <net/netdev_lock.h> #include <net/xdp.h> /* Protects offdevs, members of bpf_offload_netdev and offload members @@ -528,13 +529,14 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&offmap->map, attr); - rtnl_lock(); - down_write(&bpf_devs_lock); offmap->netdev = __dev_get_by_index(net, attr->map_ifindex); err = bpf_dev_offload_check(offmap->netdev); if (err) - goto err_unlock; + goto err_unlock_rtnl; + + netdev_lock_ops(offmap->netdev); + down_write(&bpf_devs_lock); ondev = bpf_offload_find_netdev(offmap->netdev); if (!ondev) { @@ -548,12 +550,15 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) list_add_tail(&offmap->offloads, &ondev->maps); up_write(&bpf_devs_lock); + netdev_unlock_ops(offmap->netdev); rtnl_unlock(); return &offmap->map; err_unlock: up_write(&bpf_devs_lock); + netdev_unlock_ops(offmap->netdev); +err_unlock_rtnl: rtnl_unlock(); bpf_map_area_free(offmap); return ERR_PTR(err); diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 034cf87b54e9..632762b57299 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -14,11 +14,9 @@ int pcpu_freelist_init(struct pcpu_freelist *s) for_each_possible_cpu(cpu) { struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu); - raw_spin_lock_init(&head->lock); + raw_res_spin_lock_init(&head->lock); head->first = NULL; } - raw_spin_lock_init(&s->extralist.lock); - s->extralist.first = NULL; return 0; } @@ -34,58 +32,39 @@ static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head, WRITE_ONCE(head->first, node); } -static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, +static inline bool ___pcpu_freelist_push(struct pcpu_freelist_head *head, struct pcpu_freelist_node *node) { - raw_spin_lock(&head->lock); - pcpu_freelist_push_node(head, node); - raw_spin_unlock(&head->lock); -} - -static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s, - struct pcpu_freelist_node *node) -{ - if (!raw_spin_trylock(&s->extralist.lock)) + if (raw_res_spin_lock(&head->lock)) return false; - - pcpu_freelist_push_node(&s->extralist, node); - raw_spin_unlock(&s->extralist.lock); + pcpu_freelist_push_node(head, node); + raw_res_spin_unlock(&head->lock); return true; } -static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s, - struct pcpu_freelist_node *node) +void __pcpu_freelist_push(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) { - int cpu, orig_cpu; + struct pcpu_freelist_head *head; + int cpu; - orig_cpu = raw_smp_processor_id(); - while (1) { - for_each_cpu_wrap(cpu, cpu_possible_mask, orig_cpu) { - struct pcpu_freelist_head *head; + if (___pcpu_freelist_push(this_cpu_ptr(s->freelist), node)) + return; + while (true) { + for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { + if (cpu == raw_smp_processor_id()) + continue; head = per_cpu_ptr(s->freelist, cpu); - if (raw_spin_trylock(&head->lock)) { - pcpu_freelist_push_node(head, node); - raw_spin_unlock(&head->lock); - return; - } - } - - /* cannot lock any per cpu lock, try extralist */ - if (pcpu_freelist_try_push_extra(s, node)) + if (raw_res_spin_lock(&head->lock)) + continue; + pcpu_freelist_push_node(head, node); + raw_res_spin_unlock(&head->lock); return; + } } } -void __pcpu_freelist_push(struct pcpu_freelist *s, - struct pcpu_freelist_node *node) -{ - if (in_nmi()) - ___pcpu_freelist_push_nmi(s, node); - else - ___pcpu_freelist_push(this_cpu_ptr(s->freelist), node); -} - void pcpu_freelist_push(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { @@ -120,71 +99,29 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) { + struct pcpu_freelist_node *node = NULL; struct pcpu_freelist_head *head; - struct pcpu_freelist_node *node; int cpu; for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { head = per_cpu_ptr(s->freelist, cpu); if (!READ_ONCE(head->first)) continue; - raw_spin_lock(&head->lock); + if (raw_res_spin_lock(&head->lock)) + continue; node = head->first; if (node) { WRITE_ONCE(head->first, node->next); - raw_spin_unlock(&head->lock); + raw_res_spin_unlock(&head->lock); return node; } - raw_spin_unlock(&head->lock); + raw_res_spin_unlock(&head->lock); } - - /* per cpu lists are all empty, try extralist */ - if (!READ_ONCE(s->extralist.first)) - return NULL; - raw_spin_lock(&s->extralist.lock); - node = s->extralist.first; - if (node) - WRITE_ONCE(s->extralist.first, node->next); - raw_spin_unlock(&s->extralist.lock); - return node; -} - -static struct pcpu_freelist_node * -___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) -{ - struct pcpu_freelist_head *head; - struct pcpu_freelist_node *node; - int cpu; - - for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { - head = per_cpu_ptr(s->freelist, cpu); - if (!READ_ONCE(head->first)) - continue; - if (raw_spin_trylock(&head->lock)) { - node = head->first; - if (node) { - WRITE_ONCE(head->first, node->next); - raw_spin_unlock(&head->lock); - return node; - } - raw_spin_unlock(&head->lock); - } - } - - /* cannot pop from per cpu lists, try extralist */ - if (!READ_ONCE(s->extralist.first) || !raw_spin_trylock(&s->extralist.lock)) - return NULL; - node = s->extralist.first; - if (node) - WRITE_ONCE(s->extralist.first, node->next); - raw_spin_unlock(&s->extralist.lock); return node; } struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) { - if (in_nmi()) - return ___pcpu_freelist_pop_nmi(s); return ___pcpu_freelist_pop(s); } diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h index 3c76553cfe57..914798b74967 100644 --- a/kernel/bpf/percpu_freelist.h +++ b/kernel/bpf/percpu_freelist.h @@ -5,15 +5,15 @@ #define __PERCPU_FREELIST_H__ #include <linux/spinlock.h> #include <linux/percpu.h> +#include <asm/rqspinlock.h> struct pcpu_freelist_head { struct pcpu_freelist_node *first; - raw_spinlock_t lock; + rqspinlock_t lock; }; struct pcpu_freelist { struct pcpu_freelist_head __percpu *freelist; - struct pcpu_freelist_head extralist; }; struct pcpu_freelist_node { diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c index 0c63bc2cd895..774e5a538811 100644 --- a/kernel/bpf/preload/bpf_preload_kern.c +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -89,4 +89,6 @@ static void __exit fini(void) } late_initcall(load); module_exit(fini); +MODULE_IMPORT_NS("BPF_INTERNAL"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Embedded BPF programs for introspection in bpffs"); diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index d869f51ea93a..9a5f94371e50 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -9,13 +9,14 @@ #include <linux/slab.h> #include <linux/btf_ids.h> #include "percpu_freelist.h" +#include <asm/rqspinlock.h> #define QUEUE_STACK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) struct bpf_queue_stack { struct bpf_map map; - raw_spinlock_t lock; + rqspinlock_t lock; u32 head, tail; u32 size; /* max_entries + 1 */ @@ -78,7 +79,7 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) qs->size = size; - raw_spin_lock_init(&qs->lock); + raw_res_spin_lock_init(&qs->lock); return &qs->map; } @@ -98,12 +99,8 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete) int err = 0; void *ptr; - if (in_nmi()) { - if (!raw_spin_trylock_irqsave(&qs->lock, flags)) - return -EBUSY; - } else { - raw_spin_lock_irqsave(&qs->lock, flags); - } + if (raw_res_spin_lock_irqsave(&qs->lock, flags)) + return -EBUSY; if (queue_stack_map_is_empty(qs)) { memset(value, 0, qs->map.value_size); @@ -120,7 +117,7 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete) } out: - raw_spin_unlock_irqrestore(&qs->lock, flags); + raw_res_spin_unlock_irqrestore(&qs->lock, flags); return err; } @@ -133,12 +130,8 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete) void *ptr; u32 index; - if (in_nmi()) { - if (!raw_spin_trylock_irqsave(&qs->lock, flags)) - return -EBUSY; - } else { - raw_spin_lock_irqsave(&qs->lock, flags); - } + if (raw_res_spin_lock_irqsave(&qs->lock, flags)) + return -EBUSY; if (queue_stack_map_is_empty(qs)) { memset(value, 0, qs->map.value_size); @@ -157,7 +150,7 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete) qs->head = index; out: - raw_spin_unlock_irqrestore(&qs->lock, flags); + raw_res_spin_unlock_irqrestore(&qs->lock, flags); return err; } @@ -203,12 +196,8 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value, if (flags & BPF_NOEXIST || flags > BPF_EXIST) return -EINVAL; - if (in_nmi()) { - if (!raw_spin_trylock_irqsave(&qs->lock, irq_flags)) - return -EBUSY; - } else { - raw_spin_lock_irqsave(&qs->lock, irq_flags); - } + if (raw_res_spin_lock_irqsave(&qs->lock, irq_flags)) + return -EBUSY; if (queue_stack_map_is_full(qs)) { if (!replace) { @@ -227,7 +216,7 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value, qs->head = 0; out: - raw_spin_unlock_irqrestore(&qs->lock, irq_flags); + raw_res_spin_unlock_irqrestore(&qs->lock, irq_flags); return err; } diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index e1cfe890e0be..719d73299397 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -11,6 +11,7 @@ #include <linux/kmemleak.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> +#include <asm/rqspinlock.h> #define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) @@ -29,7 +30,7 @@ struct bpf_ringbuf { u64 mask; struct page **pages; int nr_pages; - raw_spinlock_t spinlock ____cacheline_aligned_in_smp; + rqspinlock_t spinlock ____cacheline_aligned_in_smp; /* For user-space producer ring buffers, an atomic_t busy bit is used * to synchronize access to the ring buffers in the kernel, rather than * the spinlock that is used for kernel-producer ring buffers. This is @@ -173,7 +174,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) if (!rb) return NULL; - raw_spin_lock_init(&rb->spinlock); + raw_res_spin_lock_init(&rb->spinlock); atomic_set(&rb->busy, 0); init_waitqueue_head(&rb->waitq); init_irq_work(&rb->work, bpf_ringbuf_notify); @@ -268,8 +269,6 @@ static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma /* allow writable mapping for the consumer_pos only */ if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE) return -EPERM; - } else { - vm_flags_clear(vma, VM_MAYWRITE); } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, @@ -289,8 +288,6 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma * position, and the ring buffer data itself. */ return -EPERM; - } else { - vm_flags_clear(vma, VM_MAYWRITE); } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); @@ -420,12 +417,8 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) cons_pos = smp_load_acquire(&rb->consumer_pos); - if (in_nmi()) { - if (!raw_spin_trylock_irqsave(&rb->spinlock, flags)) - return NULL; - } else { - raw_spin_lock_irqsave(&rb->spinlock, flags); - } + if (raw_res_spin_lock_irqsave(&rb->spinlock, flags)) + return NULL; pend_pos = rb->pending_pos; prod_pos = rb->producer_pos; @@ -450,7 +443,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) */ if (new_prod_pos - cons_pos > rb->mask || new_prod_pos - pend_pos > rb->mask) { - raw_spin_unlock_irqrestore(&rb->spinlock, flags); + raw_res_spin_unlock_irqrestore(&rb->spinlock, flags); return NULL; } @@ -462,7 +455,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) /* pairs with consumer's smp_load_acquire() */ smp_store_release(&rb->producer_pos, new_prod_pos); - raw_spin_unlock_irqrestore(&rb->spinlock, flags); + raw_res_spin_unlock_irqrestore(&rb->spinlock, flags); return (void *)hdr + BPF_RINGBUF_HDR_SZ; } diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c new file mode 100644 index 000000000000..338305c8852c --- /dev/null +++ b/kernel/bpf/rqspinlock.c @@ -0,0 +1,737 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Resilient Queued Spin Lock + * + * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. + * (C) Copyright 2013-2014,2018 Red Hat, Inc. + * (C) Copyright 2015 Intel Corp. + * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP + * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. + * + * Authors: Waiman Long <longman@redhat.com> + * Peter Zijlstra <peterz@infradead.org> + * Kumar Kartikeya Dwivedi <memxor@gmail.com> + */ + +#include <linux/smp.h> +#include <linux/bug.h> +#include <linux/bpf.h> +#include <linux/err.h> +#include <linux/cpumask.h> +#include <linux/percpu.h> +#include <linux/hardirq.h> +#include <linux/mutex.h> +#include <linux/prefetch.h> +#include <asm/byteorder.h> +#ifdef CONFIG_QUEUED_SPINLOCKS +#include <asm/qspinlock.h> +#endif +#include <trace/events/lock.h> +#include <asm/rqspinlock.h> +#include <linux/timekeeping.h> + +/* + * Include queued spinlock definitions and statistics code + */ +#ifdef CONFIG_QUEUED_SPINLOCKS +#include "../locking/qspinlock.h" +#include "../locking/lock_events.h" +#include "rqspinlock.h" +#include "../locking/mcs_spinlock.h" +#endif + +/* + * The basic principle of a queue-based spinlock can best be understood + * by studying a classic queue-based spinlock implementation called the + * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable + * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and + * Scott") is available at + * + * https://bugzilla.kernel.org/show_bug.cgi?id=206115 + * + * This queued spinlock implementation is based on the MCS lock, however to + * make it fit the 4 bytes we assume spinlock_t to be, and preserve its + * existing API, we must modify it somehow. + * + * In particular; where the traditional MCS lock consists of a tail pointer + * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to + * unlock the next pending (next->locked), we compress both these: {tail, + * next->locked} into a single u32 value. + * + * Since a spinlock disables recursion of its own context and there is a limit + * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there + * are at most 4 nesting levels, it can be encoded by a 2-bit number. Now + * we can encode the tail by combining the 2-bit nesting level with the cpu + * number. With one byte for the lock value and 3 bytes for the tail, only a + * 32-bit word is now needed. Even though we only need 1 bit for the lock, + * we extend it to a full byte to achieve better performance for architectures + * that support atomic byte write. + * + * We also change the first spinner to spin on the lock bit instead of its + * node; whereby avoiding the need to carry a node from lock to unlock, and + * preserving existing lock API. This also makes the unlock code simpler and + * faster. + * + * N.B. The current implementation only supports architectures that allow + * atomic operations on smaller 8-bit and 16-bit data types. + * + */ + +struct rqspinlock_timeout { + u64 timeout_end; + u64 duration; + u64 cur; + u16 spin; +}; + +#define RES_TIMEOUT_VAL 2 + +DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks); +EXPORT_SYMBOL_GPL(rqspinlock_held_locks); + +static bool is_lock_released(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts) +{ + if (!(atomic_read_acquire(&lock->val) & (mask))) + return true; + return false; +} + +static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask, + struct rqspinlock_timeout *ts) +{ + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); + int cnt = min(RES_NR_HELD, rqh->cnt); + + /* + * Return an error if we hold the lock we are attempting to acquire. + * We'll iterate over max 32 locks; no need to do is_lock_released. + */ + for (int i = 0; i < cnt - 1; i++) { + if (rqh->locks[i] == lock) + return -EDEADLK; + } + return 0; +} + +/* + * This focuses on the most common case of ABBA deadlocks (or ABBA involving + * more locks, which reduce to ABBA). This is not exhaustive, and we rely on + * timeouts as the final line of defense. + */ +static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask, + struct rqspinlock_timeout *ts) +{ + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); + int rqh_cnt = min(RES_NR_HELD, rqh->cnt); + void *remote_lock; + int cpu; + + /* + * Find the CPU holding the lock that we want to acquire. If there is a + * deadlock scenario, we will read a stable set on the remote CPU and + * find the target. This would be a constant time operation instead of + * O(NR_CPUS) if we could determine the owning CPU from a lock value, but + * that requires increasing the size of the lock word. + */ + for_each_possible_cpu(cpu) { + struct rqspinlock_held *rqh_cpu = per_cpu_ptr(&rqspinlock_held_locks, cpu); + int real_cnt = READ_ONCE(rqh_cpu->cnt); + int cnt = min(RES_NR_HELD, real_cnt); + + /* + * Let's ensure to break out of this loop if the lock is available for + * us to potentially acquire. + */ + if (is_lock_released(lock, mask, ts)) + return 0; + + /* + * Skip ourselves, and CPUs whose count is less than 2, as they need at + * least one held lock and one acquisition attempt (reflected as top + * most entry) to participate in an ABBA deadlock. + * + * If cnt is more than RES_NR_HELD, it means the current lock being + * acquired won't appear in the table, and other locks in the table are + * already held, so we can't determine ABBA. + */ + if (cpu == smp_processor_id() || real_cnt < 2 || real_cnt > RES_NR_HELD) + continue; + + /* + * Obtain the entry at the top, this corresponds to the lock the + * remote CPU is attempting to acquire in a deadlock situation, + * and would be one of the locks we hold on the current CPU. + */ + remote_lock = READ_ONCE(rqh_cpu->locks[cnt - 1]); + /* + * If it is NULL, we've raced and cannot determine a deadlock + * conclusively, skip this CPU. + */ + if (!remote_lock) + continue; + /* + * Find if the lock we're attempting to acquire is held by this CPU. + * Don't consider the topmost entry, as that must be the latest lock + * being held or acquired. For a deadlock, the target CPU must also + * attempt to acquire a lock we hold, so for this search only 'cnt - 1' + * entries are important. + */ + for (int i = 0; i < cnt - 1; i++) { + if (READ_ONCE(rqh_cpu->locks[i]) != lock) + continue; + /* + * We found our lock as held on the remote CPU. Is the + * acquisition attempt on the remote CPU for a lock held + * by us? If so, we have a deadlock situation, and need + * to recover. + */ + for (int i = 0; i < rqh_cnt - 1; i++) { + if (rqh->locks[i] == remote_lock) + return -EDEADLK; + } + /* + * Inconclusive; retry again later. + */ + return 0; + } + } + return 0; +} + +static noinline int check_deadlock(rqspinlock_t *lock, u32 mask, + struct rqspinlock_timeout *ts) +{ + int ret; + + ret = check_deadlock_AA(lock, mask, ts); + if (ret) + return ret; + ret = check_deadlock_ABBA(lock, mask, ts); + if (ret) + return ret; + + return 0; +} + +static noinline int check_timeout(rqspinlock_t *lock, u32 mask, + struct rqspinlock_timeout *ts) +{ + u64 time = ktime_get_mono_fast_ns(); + u64 prev = ts->cur; + + if (!ts->timeout_end) { + ts->cur = time; + ts->timeout_end = time + ts->duration; + return 0; + } + + if (time > ts->timeout_end) + return -ETIMEDOUT; + + /* + * A millisecond interval passed from last time? Trigger deadlock + * checks. + */ + if (prev + NSEC_PER_MSEC < time) { + ts->cur = time; + return check_deadlock(lock, mask, ts); + } + + return 0; +} + +/* + * Do not amortize with spins when res_smp_cond_load_acquire is defined, + * as the macro does internal amortization for us. + */ +#ifndef res_smp_cond_load_acquire +#define RES_CHECK_TIMEOUT(ts, ret, mask) \ + ({ \ + if (!(ts).spin++) \ + (ret) = check_timeout((lock), (mask), &(ts)); \ + (ret); \ + }) +#else +#define RES_CHECK_TIMEOUT(ts, ret, mask) \ + ({ (ret) = check_timeout((lock), (mask), &(ts)); }) +#endif + +/* + * Initialize the 'spin' member. + * Set spin member to 0 to trigger AA/ABBA checks immediately. + */ +#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; }) + +/* + * We only need to reset 'timeout_end', 'spin' will just wrap around as necessary. + * Duration is defined for each spin attempt, so set it here. + */ +#define RES_RESET_TIMEOUT(ts, _duration) ({ (ts).timeout_end = 0; (ts).duration = _duration; }) + +/* + * Provide a test-and-set fallback for cases when queued spin lock support is + * absent from the architecture. + */ +int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock) +{ + struct rqspinlock_timeout ts; + int val, ret = 0; + + RES_INIT_TIMEOUT(ts); + grab_held_lock_entry(lock); + + /* + * Since the waiting loop's time is dependent on the amount of + * contention, a short timeout unlike rqspinlock waiting loops + * isn't enough. Choose a second as the timeout value. + */ + RES_RESET_TIMEOUT(ts, NSEC_PER_SEC); +retry: + val = atomic_read(&lock->val); + + if (val || !atomic_try_cmpxchg(&lock->val, &val, 1)) { + if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) + goto out; + cpu_relax(); + goto retry; + } + + return 0; +out: + release_held_lock_entry(); + return ret; +} +EXPORT_SYMBOL_GPL(resilient_tas_spin_lock); + +#ifdef CONFIG_QUEUED_SPINLOCKS + +/* + * Per-CPU queue node structures; we can never have more than 4 nested + * contexts: task, softirq, hardirq, nmi. + * + * Exactly fits one 64-byte cacheline on a 64-bit architecture. + */ +static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]); + +#ifndef res_smp_cond_load_acquire +#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c) +#endif + +#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c)) + +/** + * resilient_queued_spin_lock_slowpath - acquire the queued spinlock + * @lock: Pointer to queued spinlock structure + * @val: Current value of the queued spinlock 32-bit word + * + * Return: + * * 0 - Lock was acquired successfully. + * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock. + * * -ETIMEDOUT - Lock acquisition failed because of timeout. + * + * (queue tail, pending bit, lock value) + * + * fast : slow : unlock + * : : + * uncontended (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0) + * : | ^--------.------. / : + * : v \ \ | : + * pending : (0,1,1) +--> (0,1,0) \ | : + * : | ^--' | | : + * : v | | : + * uncontended : (n,x,y) +--> (n,0,0) --' | : + * queue : | ^--' | : + * : v | : + * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' : + * queue : ^--' : + */ +int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val) +{ + struct mcs_spinlock *prev, *next, *node; + struct rqspinlock_timeout ts; + int idx, ret = 0; + u32 old, tail; + + BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); + + if (resilient_virt_spin_lock_enabled()) + return resilient_virt_spin_lock(lock); + + RES_INIT_TIMEOUT(ts); + + /* + * Wait for in-progress pending->locked hand-overs with a bounded + * number of spins so that we guarantee forward progress. + * + * 0,1,0 -> 0,0,1 + */ + if (val == _Q_PENDING_VAL) { + int cnt = _Q_PENDING_LOOPS; + val = atomic_cond_read_relaxed(&lock->val, + (VAL != _Q_PENDING_VAL) || !cnt--); + } + + /* + * If we observe any contention; queue. + */ + if (val & ~_Q_LOCKED_MASK) + goto queue; + + /* + * trylock || pending + * + * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock + */ + val = queued_fetch_set_pending_acquire(lock); + + /* + * If we observe contention, there is a concurrent locker. + * + * Undo and queue; our setting of PENDING might have made the + * n,0,0 -> 0,0,0 transition fail and it will now be waiting + * on @next to become !NULL. + */ + if (unlikely(val & ~_Q_LOCKED_MASK)) { + + /* Undo PENDING if we set it. */ + if (!(val & _Q_PENDING_MASK)) + clear_pending(lock); + + goto queue; + } + + /* + * Grab an entry in the held locks array, to enable deadlock detection. + */ + grab_held_lock_entry(lock); + + /* + * We're pending, wait for the owner to go away. + * + * 0,1,1 -> *,1,0 + * + * this wait loop must be a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because not all + * clear_pending_set_locked() implementations imply full + * barriers. + */ + if (val & _Q_LOCKED_MASK) { + RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); + res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK)); + } + + if (ret) { + /* + * We waited for the locked bit to go back to 0, as the pending + * waiter, but timed out. We need to clear the pending bit since + * we own it. Once a stuck owner has been recovered, the lock + * must be restored to a valid state, hence removing the pending + * bit is necessary. + * + * *,1,* -> *,0,* + */ + clear_pending(lock); + lockevent_inc(rqspinlock_lock_timeout); + goto err_release_entry; + } + + /* + * take ownership and clear the pending bit. + * + * 0,1,0 -> 0,0,1 + */ + clear_pending_set_locked(lock); + lockevent_inc(lock_pending); + return 0; + + /* + * End of pending bit optimistic spinning and beginning of MCS + * queuing. + */ +queue: + lockevent_inc(lock_slowpath); + /* + * Grab deadlock detection entry for the queue path. + */ + grab_held_lock_entry(lock); + + node = this_cpu_ptr(&rqnodes[0].mcs); + idx = node->count++; + tail = encode_tail(smp_processor_id(), idx); + + trace_contention_begin(lock, LCB_F_SPIN); + + /* + * 4 nodes are allocated based on the assumption that there will + * not be nested NMIs taking spinlocks. That may not be true in + * some architectures even though the chance of needing more than + * 4 nodes will still be extremely unlikely. When that happens, + * we fall back to spinning on the lock directly without using + * any MCS node. This is not the most elegant solution, but is + * simple enough. + */ + if (unlikely(idx >= _Q_MAX_NODES)) { + lockevent_inc(lock_no_node); + RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); + while (!queued_spin_trylock(lock)) { + if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) { + lockevent_inc(rqspinlock_lock_timeout); + goto err_release_node; + } + cpu_relax(); + } + goto release; + } + + node = grab_mcs_node(node, idx); + + /* + * Keep counts of non-zero index values: + */ + lockevent_cond_inc(lock_use_node2 + idx - 1, idx); + + /* + * Ensure that we increment the head node->count before initialising + * the actual node. If the compiler is kind enough to reorder these + * stores, then an IRQ could overwrite our assignments. + */ + barrier(); + + node->locked = 0; + node->next = NULL; + + /* + * We touched a (possibly) cold cacheline in the per-cpu queue node; + * attempt the trylock once more in the hope someone let go while we + * weren't watching. + */ + if (queued_spin_trylock(lock)) + goto release; + + /* + * Ensure that the initialisation of @node is complete before we + * publish the updated tail via xchg_tail() and potentially link + * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. + */ + smp_wmb(); + + /* + * Publish the updated tail. + * We have already touched the queueing cacheline; don't bother with + * pending stuff. + * + * p,*,* -> n,*,* + */ + old = xchg_tail(lock, tail); + next = NULL; + + /* + * if there was a previous node; link it and wait until reaching the + * head of the waitqueue. + */ + if (old & _Q_TAIL_MASK) { + int val; + + prev = decode_tail(old, rqnodes); + + /* Link @node into the waitqueue. */ + WRITE_ONCE(prev->next, node); + + val = arch_mcs_spin_lock_contended(&node->locked); + if (val == RES_TIMEOUT_VAL) { + ret = -EDEADLK; + goto waitq_timeout; + } + + /* + * While waiting for the MCS lock, the next pointer may have + * been set by another lock waiter. We optimistically load + * the next pointer & prefetch the cacheline for writing + * to reduce latency in the upcoming MCS unlock operation. + */ + next = READ_ONCE(node->next); + if (next) + prefetchw(next); + } + + /* + * we're at the head of the waitqueue, wait for the owner & pending to + * go away. + * + * *,x,y -> *,0,0 + * + * this wait loop must use a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because the set_locked() function below + * does not imply a full barrier. + * + * We use RES_DEF_TIMEOUT * 2 as the duration, as RES_DEF_TIMEOUT is + * meant to span maximum allowed time per critical section, and we may + * have both the owner of the lock and the pending bit waiter ahead of + * us. + */ + RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2); + val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) || + RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK)); + +waitq_timeout: + if (ret) { + /* + * If the tail is still pointing to us, then we are the final waiter, + * and are responsible for resetting the tail back to 0. Otherwise, if + * the cmpxchg operation fails, we signal the next waiter to take exit + * and try the same. For a waiter with tail node 'n': + * + * n,*,* -> 0,*,* + * + * When performing cmpxchg for the whole word (NR_CPUS > 16k), it is + * possible locked/pending bits keep changing and we see failures even + * when we remain the head of wait queue. However, eventually, + * pending bit owner will unset the pending bit, and new waiters + * will queue behind us. This will leave the lock owner in + * charge, and it will eventually either set locked bit to 0, or + * leave it as 1, allowing us to make progress. + * + * We terminate the whole wait queue for two reasons. Firstly, + * we eschew per-waiter timeouts with one applied at the head of + * the wait queue. This allows everyone to break out faster + * once we've seen the owner / pending waiter not responding for + * the timeout duration from the head. Secondly, it avoids + * complicated synchronization, because when not leaving in FIFO + * order, prev's next pointer needs to be fixed up etc. + */ + if (!try_cmpxchg_tail(lock, tail, 0)) { + next = smp_cond_load_relaxed(&node->next, VAL); + WRITE_ONCE(next->locked, RES_TIMEOUT_VAL); + } + lockevent_inc(rqspinlock_lock_timeout); + goto err_release_node; + } + + /* + * claim the lock: + * + * n,0,0 -> 0,0,1 : lock, uncontended + * *,*,0 -> *,*,1 : lock, contended + * + * If the queue head is the only one in the queue (lock value == tail) + * and nobody is pending, clear the tail code and grab the lock. + * Otherwise, we only need to grab the lock. + */ + + /* + * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the + * above wait condition, therefore any concurrent setting of + * PENDING will make the uncontended transition fail. + */ + if ((val & _Q_TAIL_MASK) == tail) { + if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) + goto release; /* No contention */ + } + + /* + * Either somebody is queued behind us or _Q_PENDING_VAL got set + * which will then detect the remaining tail and queue behind us + * ensuring we'll see a @next. + */ + set_locked(lock); + + /* + * contended path; wait for next if not observed yet, release. + */ + if (!next) + next = smp_cond_load_relaxed(&node->next, (VAL)); + + arch_mcs_spin_unlock_contended(&next->locked); + +release: + trace_contention_end(lock, 0); + + /* + * release the node + */ + __this_cpu_dec(rqnodes[0].mcs.count); + return ret; +err_release_node: + trace_contention_end(lock, ret); + __this_cpu_dec(rqnodes[0].mcs.count); +err_release_entry: + release_held_lock_entry(); + return ret; +} +EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath); + +#endif /* CONFIG_QUEUED_SPINLOCKS */ + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) +{ + int ret; + + BUILD_BUG_ON(sizeof(rqspinlock_t) != sizeof(struct bpf_res_spin_lock)); + BUILD_BUG_ON(__alignof__(rqspinlock_t) != __alignof__(struct bpf_res_spin_lock)); + + preempt_disable(); + ret = res_spin_lock((rqspinlock_t *)lock); + if (unlikely(ret)) { + preempt_enable(); + return ret; + } + return 0; +} + +__bpf_kfunc void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) +{ + res_spin_unlock((rqspinlock_t *)lock); + preempt_enable(); +} + +__bpf_kfunc int bpf_res_spin_lock_irqsave(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag) +{ + u64 *ptr = (u64 *)flags__irq_flag; + unsigned long flags; + int ret; + + preempt_disable(); + local_irq_save(flags); + ret = res_spin_lock((rqspinlock_t *)lock); + if (unlikely(ret)) { + local_irq_restore(flags); + preempt_enable(); + return ret; + } + *ptr = flags; + return 0; +} + +__bpf_kfunc void bpf_res_spin_unlock_irqrestore(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag) +{ + u64 *ptr = (u64 *)flags__irq_flag; + unsigned long flags = *ptr; + + res_spin_unlock((rqspinlock_t *)lock); + local_irq_restore(flags); + preempt_enable(); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(rqspinlock_kfunc_ids) +BTF_ID_FLAGS(func, bpf_res_spin_lock, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_res_spin_unlock) +BTF_ID_FLAGS(func, bpf_res_spin_lock_irqsave, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_res_spin_unlock_irqrestore) +BTF_KFUNCS_END(rqspinlock_kfunc_ids) + +static const struct btf_kfunc_id_set rqspinlock_kfunc_set = { + .owner = THIS_MODULE, + .set = &rqspinlock_kfunc_ids, +}; + +static __init int rqspinlock_register_kfuncs(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &rqspinlock_kfunc_set); +} +late_initcall(rqspinlock_register_kfuncs); diff --git a/kernel/bpf/rqspinlock.h b/kernel/bpf/rqspinlock.h new file mode 100644 index 000000000000..5d8cb1b1aab4 --- /dev/null +++ b/kernel/bpf/rqspinlock.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Resilient Queued Spin Lock defines + * + * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. + * + * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com> + */ +#ifndef __LINUX_RQSPINLOCK_H +#define __LINUX_RQSPINLOCK_H + +#include "../locking/qspinlock.h" + +/* + * try_cmpxchg_tail - Return result of cmpxchg of tail word with a new value + * @lock: Pointer to queued spinlock structure + * @tail: The tail to compare against + * @new_tail: The new queue tail code word + * Return: Bool to indicate whether the cmpxchg operation succeeded + * + * This is used by the head of the wait queue to clean up the queue. + * Provides relaxed ordering, since observers only rely on initialized + * state of the node which was made visible through the xchg_tail operation, + * i.e. through the smp_wmb preceding xchg_tail. + * + * We avoid using 16-bit cmpxchg, which is not available on all architectures. + */ +static __always_inline bool try_cmpxchg_tail(struct qspinlock *lock, u32 tail, u32 new_tail) +{ + u32 old, new; + + old = atomic_read(&lock->val); + do { + /* + * Is the tail part we compare to already stale? Fail. + */ + if ((old & _Q_TAIL_MASK) != tail) + return false; + /* + * Encode latest locked/pending state for new tail. + */ + new = (old & _Q_LOCKED_PENDING_MASK) | new_tail; + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); + + return true; +} + +#endif /* __LINUX_RQSPINLOCK_H */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c420edbfb7c8..dd5304c6ac3c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -36,6 +36,7 @@ #include <linux/memcontrol.h> #include <linux/trace_events.h> #include <linux/tracepoint.h> +#include <linux/overflow.h> #include <net/netfilter/nf_bpf_link.h> #include <net/netkit.h> @@ -569,7 +570,24 @@ static void bpf_map_release_memcg(struct bpf_map *map) } #endif -int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, +static bool can_alloc_pages(void) +{ + return preempt_count() == 0 && !irqs_disabled() && + !IS_ENABLED(CONFIG_PREEMPT_RT); +} + +static struct page *__bpf_alloc_page(int nid) +{ + if (!can_alloc_pages()) + return alloc_pages_nolock(nid, 0); + + return alloc_pages_node(nid, + GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT + | __GFP_NOWARN, + 0); +} + +int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long nr_pages, struct page **pages) { unsigned long i, j; @@ -582,14 +600,14 @@ int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, old_memcg = set_active_memcg(memcg); #endif for (i = 0; i < nr_pages; i++) { - pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0); + pg = __bpf_alloc_page(nid); if (pg) { pages[i] = pg; continue; } for (j = 0; j < i; j++) - __free_page(pages[j]); + free_pages_nolock(pages[j], 0); ret = -ENOMEM; break; } @@ -648,6 +666,7 @@ void btf_record_free(struct btf_record *rec) case BPF_RB_ROOT: case BPF_RB_NODE: case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: @@ -675,7 +694,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) if (IS_ERR_OR_NULL(rec)) return NULL; - size = offsetof(struct btf_record, fields[rec->cnt]); + size = struct_size(rec, fields, rec->cnt); new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN); if (!new_rec) return ERR_PTR(-ENOMEM); @@ -700,6 +719,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_RB_ROOT: case BPF_RB_NODE: case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: @@ -729,7 +749,7 @@ bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *r return false; if (rec_a->cnt != rec_b->cnt) return false; - size = offsetof(struct btf_record, fields[rec_a->cnt]); + size = struct_size(rec_a, fields, rec_a->cnt); /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused * members are zeroed out. So memcmp is safe to do without worrying * about padding/unused fields. @@ -777,6 +797,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) switch (fields[i].type) { case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: break; case BPF_TIMER: bpf_timer_cancel_and_free(field_ptr); @@ -1035,7 +1056,7 @@ static const struct vm_operations_struct bpf_map_default_vmops = { static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) { struct bpf_map *map = filp->private_data; - int err; + int err = 0; if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record)) return -ENOTSUPP; @@ -1059,24 +1080,33 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) err = -EACCES; goto out; } + bpf_map_write_active_inc(map); } +out: + mutex_unlock(&map->freeze_mutex); + if (err) + return err; /* set default open/close callbacks */ vma->vm_ops = &bpf_map_default_vmops; vma->vm_private_data = map; vm_flags_clear(vma, VM_MAYEXEC); + /* If mapping is read-only, then disallow potentially re-mapping with + * PROT_WRITE by dropping VM_MAYWRITE flag. This VM_MAYWRITE clearing + * means that as far as BPF map's memory-mapped VMAs are concerned, + * VM_WRITE and VM_MAYWRITE and equivalent, if one of them is set, + * both should be set, so we can forget about VM_MAYWRITE and always + * check just VM_WRITE + */ if (!(vma->vm_flags & VM_WRITE)) - /* disallow re-mapping with PROT_WRITE */ vm_flags_clear(vma, VM_MAYWRITE); err = map->ops->map_mmap(map, vma); - if (err) - goto out; + if (err) { + if (vma->vm_flags & VM_WRITE) + bpf_map_write_active_dec(map); + } - if (vma->vm_flags & VM_MAYWRITE) - bpf_map_write_active_inc(map); -out: - mutex_unlock(&map->freeze_mutex); return err; } @@ -1203,7 +1233,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, return -EINVAL; map->record = btf_parse_fields(btf, value_type, - BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | + BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { @@ -1222,6 +1252,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, case 0: continue; case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && @@ -1306,7 +1337,7 @@ static bool bpf_net_capable(void) #define BPF_MAP_CREATE_LAST_FIELD map_token_fd /* called via syscall */ -static int map_create(union bpf_attr *attr) +static int map_create(union bpf_attr *attr, bool kernel) { const struct bpf_map_ops *ops; struct bpf_token *token = NULL; @@ -1496,7 +1527,7 @@ static int map_create(union bpf_attr *attr) attr->btf_vmlinux_value_type_id; } - err = security_bpf_map_create(map, attr, token); + err = security_bpf_map_create(map, attr, token, kernel); if (err) goto free_map_sec; @@ -1553,7 +1584,7 @@ struct bpf_map *bpf_map_get(u32 ufd) return map; } -EXPORT_SYMBOL(bpf_map_get); +EXPORT_SYMBOL_NS(bpf_map_get, "BPF_INTERNAL"); struct bpf_map *bpf_map_get_with_uref(u32 ufd) { @@ -1584,11 +1615,8 @@ struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) { - spin_lock_bh(&map_idr_lock); - map = __bpf_map_inc_not_zero(map, false); - spin_unlock_bh(&map_idr_lock); - - return map; + lockdep_assert(rcu_read_lock_held()); + return __bpf_map_inc_not_zero(map, false); } EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); @@ -1968,8 +1996,6 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file, return err; } -#define MAP_LOOKUP_RETRIES 3 - int generic_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -1979,8 +2005,8 @@ int generic_map_lookup_batch(struct bpf_map *map, void __user *values = u64_to_user_ptr(attr->batch.values); void __user *keys = u64_to_user_ptr(attr->batch.keys); void *buf, *buf_prevkey, *prev_key, *key, *value; - int err, retry = MAP_LOOKUP_RETRIES; u32 value_size, cp, max_count; + int err; if (attr->batch.elem_flags & ~BPF_F_LOCK) return -EINVAL; @@ -2026,14 +2052,8 @@ int generic_map_lookup_batch(struct bpf_map *map, err = bpf_map_copy_value(map, key, value, attr->batch.elem_flags); - if (err == -ENOENT) { - if (retry) { - retry--; - continue; - } - err = -EINTR; - break; - } + if (err == -ENOENT) + goto next_key; if (err) goto free_buf; @@ -2048,12 +2068,12 @@ int generic_map_lookup_batch(struct bpf_map *map, goto free_buf; } + cp++; +next_key: if (!prev_key) prev_key = buf_prevkey; swap(prev_key, key); - retry = MAP_LOOKUP_RETRIES; - cp++; cond_resched(); } @@ -2313,6 +2333,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) kvfree(prog->aux->jited_linfo); kvfree(prog->aux->linfo); kfree(prog->aux->kfunc_tab); + kfree(prog->aux->ctx_arg_info); if (prog->aux->attach_btf) btf_put(prog->aux->attach_btf); @@ -2943,7 +2964,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (err < 0) goto free_prog; - err = security_bpf_prog_load(prog, attr, token); + err = security_bpf_prog_load(prog, attr, token, uattr.is_kernel); if (err) goto free_prog_sec; @@ -3344,7 +3365,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) bpf_link_inc(link); return link; } -EXPORT_SYMBOL(bpf_link_get_from_fd); +EXPORT_SYMBOL_NS(bpf_link_get_from_fd, "BPF_INTERNAL"); static void bpf_tracing_link_release(struct bpf_link *link) { @@ -3779,14 +3800,14 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, static int bpf_perf_link_fill_uprobe(const struct perf_event *event, struct bpf_link_info *info) { + u64 ref_ctr_offset, offset; char __user *uname; - u64 addr, offset; u32 ulen, type; int err; uname = u64_to_user_ptr(info->perf_event.uprobe.file_name); ulen = info->perf_event.uprobe.name_len; - err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr, + err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &ref_ctr_offset, &type, NULL); if (err) return err; @@ -3798,6 +3819,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event, info->perf_event.uprobe.name_len = ulen; info->perf_event.uprobe.offset = offset; info->perf_event.uprobe.cookie = event->bpf_cookie; + info->perf_event.uprobe.ref_ctr_offset = ref_ctr_offset; return 0; } #endif @@ -4168,7 +4190,8 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, #define BPF_F_ATTACH_MASK_BASE \ (BPF_F_ALLOW_OVERRIDE | \ BPF_F_ALLOW_MULTI | \ - BPF_F_REPLACE) + BPF_F_REPLACE | \ + BPF_F_PREORDER) #define BPF_F_ATTACH_MASK_MPROG \ (BPF_F_REPLACE | \ @@ -4732,6 +4755,8 @@ static int bpf_prog_get_info_by_fd(struct file *file, info.recursion_misses = stats.misses; info.verified_insns = prog->aux->verified_insns; + if (prog->aux->btf) + info.btf_id = btf_obj_id(prog->aux->btf); if (!bpf_capable()) { info.jited_prog_len = 0; @@ -4878,8 +4903,6 @@ static int bpf_prog_get_info_by_fd(struct file *file, } } - if (prog->aux->btf) - info.btf_id = btf_obj_id(prog->aux->btf); info.attach_btf_id = prog->aux->attach_btf_id; if (attach_btf) info.attach_btf_obj_id = btf_obj_id(attach_btf); @@ -5120,15 +5143,34 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_ return btf_new_fd(attr, uattr, uattr_size); } -#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id +#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD fd_by_id_token_fd static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) { + struct bpf_token *token = NULL; + if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (attr->open_flags & ~BPF_F_TOKEN_FD) + return -EINVAL; + + if (attr->open_flags & BPF_F_TOKEN_FD) { + token = bpf_token_get_from_fd(attr->fd_by_id_token_fd); + if (IS_ERR(token)) + return PTR_ERR(token); + if (!bpf_token_allow_cmd(token, BPF_BTF_GET_FD_BY_ID)) { + bpf_token_put(token); + token = NULL; + } + } + + if (!bpf_token_capable(token, CAP_SYS_ADMIN)) { + bpf_token_put(token); return -EPERM; + } + + bpf_token_put(token); return btf_get_fd_by_id(attr->btf_id); } @@ -5767,13 +5809,13 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) if (copy_from_bpfptr(&attr, uattr, size) != 0) return -EFAULT; - err = security_bpf(cmd, &attr, size); + err = security_bpf(cmd, &attr, size, uattr.is_kernel); if (err < 0) return err; switch (cmd) { case BPF_MAP_CREATE: - err = map_create(&attr); + err = map_create(&attr, uattr.is_kernel); break; case BPF_MAP_LOOKUP_ELEM: err = map_lookup_elem(&attr); @@ -5980,7 +6022,7 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) return ____bpf_sys_bpf(cmd, attr, size); } } -EXPORT_SYMBOL(kern_sys_bpf); +EXPORT_SYMBOL_NS(kern_sys_bpf, "BPF_INTERNAL"); static const struct bpf_func_proto bpf_sys_bpf_proto = { .func = bpf_sys_bpf, diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 81d6cf90584a..941d0d2427e3 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -7,14 +7,46 @@ #include <linux/kobject.h> #include <linux/init.h> #include <linux/sysfs.h> +#include <linux/mm.h> +#include <linux/io.h> +#include <linux/btf.h> /* See scripts/link-vmlinux.sh, gen_btf() func for details */ extern char __start_BTF[]; extern char __stop_BTF[]; +static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj, + const struct bin_attribute *attr, + struct vm_area_struct *vma) +{ + unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT; + size_t vm_size = vma->vm_end - vma->vm_start; + phys_addr_t addr = virt_to_phys(__start_BTF); + unsigned long pfn = addr >> PAGE_SHIFT; + + if (attr->private != __start_BTF || !PAGE_ALIGNED(addr)) + return -EINVAL; + + if (vma->vm_pgoff) + return -EINVAL; + + if (vma->vm_flags & (VM_WRITE | VM_EXEC | VM_MAYSHARE)) + return -EACCES; + + if (pfn + pages < pfn) + return -EINVAL; + + if ((vm_size >> PAGE_SHIFT) > pages) + return -EINVAL; + + vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC | VM_MAYWRITE); + return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, vma->vm_page_prot); +} + static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = { .attr = { .name = "vmlinux", .mode = 0444, }, .read_new = sysfs_bin_attr_simple_read, + .mmap = btf_sysfs_vmlinux_mmap, }; struct kobject *btf_kobj; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9971c03adfd5..169845710c7e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -322,6 +322,7 @@ struct bpf_kfunc_call_arg_meta { struct btf *arg_btf; u32 arg_btf_id; bool arg_owning_ref; + bool arg_prog; struct { struct btf_field *field; @@ -456,7 +457,7 @@ static bool subprog_is_exc_cb(struct bpf_verifier_env *env, int subprog) static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { - return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK); + return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK); } static bool type_is_rdonly_mem(u32 type) @@ -579,6 +580,13 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn) insn->imm == BPF_CMPXCHG; } +static bool is_atomic_load_insn(const struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_STX && + BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_LOAD_ACQ; +} + static int __get_spi(s32 off) { return (-off - 1) / BPF_REG_SIZE; @@ -1148,7 +1156,8 @@ static int release_irq_state(struct bpf_verifier_state *state, int id); static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, - struct bpf_reg_state *reg, int insn_idx) + struct bpf_reg_state *reg, int insn_idx, + int kfunc_class) { struct bpf_func_state *state = func(env, reg); struct bpf_stack_state *slot; @@ -1170,6 +1179,7 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ st->live |= REG_LIVE_WRITTEN; st->ref_obj_id = id; + st->irq.kfunc_class = kfunc_class; for (i = 0; i < BPF_REG_SIZE; i++) slot->slot_type[i] = STACK_IRQ_FLAG; @@ -1178,7 +1188,8 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, return 0; } -static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + int kfunc_class) { struct bpf_func_state *state = func(env, reg); struct bpf_stack_state *slot; @@ -1192,6 +1203,15 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r slot = &state->stack[spi]; st = &slot->spilled_ptr; + if (st->irq.kfunc_class != kfunc_class) { + const char *flag_kfunc = st->irq.kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock"; + const char *used_kfunc = kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock"; + + verbose(env, "irq flag acquired by %s kfuncs cannot be restored with %s kfuncs\n", + flag_kfunc, used_kfunc); + return -EINVAL; + } + err = release_irq_state(env->cur_state, st->ref_obj_id); WARN_ON_ONCE(err && err != -EACCES); if (err) { @@ -1409,6 +1429,8 @@ static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf dst->active_preempt_locks = src->active_preempt_locks; dst->active_rcu_lock = src->active_rcu_lock; dst->active_irq_id = src->active_irq_id; + dst->active_lock_id = src->active_lock_id; + dst->active_lock_ptr = src->active_lock_ptr; return 0; } @@ -1501,11 +1523,15 @@ static int acquire_lock_state(struct bpf_verifier_env *env, int insn_idx, enum r struct bpf_reference_state *s; s = acquire_reference_state(env, insn_idx); + if (!s) + return -ENOMEM; s->type = type; s->id = id; s->ptr = ptr; state->active_locks++; + state->active_lock_id = id; + state->active_lock_ptr = ptr; return 0; } @@ -1543,18 +1569,37 @@ static void release_reference_state(struct bpf_verifier_state *state, int idx) return; } +static bool find_reference_state(struct bpf_verifier_state *state, int ptr_id) +{ + int i; + + for (i = 0; i < state->acquired_refs; i++) + if (state->refs[i].id == ptr_id) + return true; + + return false; +} + static int release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr) { + void *prev_ptr = NULL; + u32 prev_id = 0; int i; for (i = 0; i < state->acquired_refs; i++) { - if (state->refs[i].type != type) - continue; - if (state->refs[i].id == id && state->refs[i].ptr == ptr) { + if (state->refs[i].type == type && state->refs[i].id == id && + state->refs[i].ptr == ptr) { release_reference_state(state, i); state->active_locks--; + /* Reassign active lock (id, ptr). */ + state->active_lock_id = prev_id; + state->active_lock_ptr = prev_ptr; return 0; } + if (state->refs[i].type & REF_TYPE_LOCK_MASK) { + prev_id = state->refs[i].id; + prev_ptr = state->refs[i].ptr; + } } return -EINVAL; } @@ -1589,7 +1634,7 @@ static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *st for (i = 0; i < state->acquired_refs; i++) { struct bpf_reference_state *s = &state->refs[i]; - if (s->type != type) + if (!(s->type & type)) continue; if (s->id == id && s->ptr == ptr) @@ -1598,6 +1643,14 @@ static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *st return NULL; } +static void update_peak_states(struct bpf_verifier_env *env) +{ + u32 cur_states; + + cur_states = env->explored_states_size + env->free_list_size; + env->peak_states = max(env->peak_states, cur_states); +} + static void free_func_state(struct bpf_func_state *state) { if (!state) @@ -1620,6 +1673,50 @@ static void free_verifier_state(struct bpf_verifier_state *state, kfree(state); } +/* struct bpf_verifier_state->{parent,loop_entry} refer to states + * that are in either of env->{expored_states,free_list}. + * In both cases the state is contained in struct bpf_verifier_state_list. + */ +static struct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_state *st) +{ + if (st->parent) + return container_of(st->parent, struct bpf_verifier_state_list, state); + return NULL; +} + +static struct bpf_verifier_state_list *state_loop_entry_as_list(struct bpf_verifier_state *st) +{ + if (st->loop_entry) + return container_of(st->loop_entry, struct bpf_verifier_state_list, state); + return NULL; +} + +/* A state can be freed if it is no longer referenced: + * - is in the env->free_list; + * - has no children states; + * - is not used as loop_entry. + * + * Freeing a state can make it's loop_entry free-able. + */ +static void maybe_free_verifier_state(struct bpf_verifier_env *env, + struct bpf_verifier_state_list *sl) +{ + struct bpf_verifier_state_list *loop_entry_sl; + + while (sl && sl->in_free_list && + sl->state.branches == 0 && + sl->state.used_as_loop_entry == 0) { + loop_entry_sl = state_loop_entry_as_list(&sl->state); + if (loop_entry_sl) + loop_entry_sl->state.used_as_loop_entry--; + list_del(&sl->node); + free_verifier_state(&sl->state, false); + kfree(sl); + env->free_list_size--; + sl = loop_entry_sl; + } +} + /* copy verifier state from src to dst growing dst stack space * when necessary to accommodate larger src stack */ @@ -1659,6 +1756,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->callback_unroll_depth = src->callback_unroll_depth; dst_state->used_as_loop_entry = src->used_as_loop_entry; dst_state->may_goto_depth = src->may_goto_depth; + dst_state->loop_entry = src->loop_entry; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -1679,7 +1777,7 @@ static u32 state_htab_size(struct bpf_verifier_env *env) return env->prog->len; } -static struct bpf_verifier_state_list **explored_state(struct bpf_verifier_env *env, int idx) +static struct list_head *explored_state(struct bpf_verifier_env *env, int idx) { struct bpf_verifier_state *cur = env->cur_state; struct bpf_func_state *state = cur->frame[cur->curframe]; @@ -1787,16 +1885,13 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta * # Find outermost loop entry known for n * def get_loop_entry(n): * h = entries.get(n, None) - * while h in entries and entries[h] != h: + * while h in entries: * h = entries[h] * return h * - * # Update n's loop entry if h's outermost entry comes - * # before n's outermost entry in current DFS path. + * # Update n's loop entry if h comes before n in current DFS path. * def update_loop_entry(n, h): - * n1 = get_loop_entry(n) or n - * h1 = get_loop_entry(h) or h - * if h1 in path and depths[h1] <= depths[n1]: + * if h in path and depths[entries.get(n, n)] < depths[n]: * entries[n] = h1 * * def dfs(n, depth): @@ -1808,7 +1903,7 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta * # Case A: explore succ and update cur's loop entry * # only if succ's entry is in current DFS path. * dfs(succ, depth + 1) - * h = get_loop_entry(succ) + * h = entries.get(succ, None) * update_loop_entry(n, h) * else: * # Case B or C depending on `h1 in path` check in update_loop_entry(). @@ -1820,46 +1915,46 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta * and cur's loop entry has to be updated (case A), handle this in * update_branch_counts(); * - use st->branch > 0 as a signal that st is in the current DFS path; - * - handle cases B and C in is_state_visited(); - * - update topmost loop entry for intermediate states in get_loop_entry(). + * - handle cases B and C in is_state_visited(). */ -static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_state *st) +static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) { - struct bpf_verifier_state *topmost = st->loop_entry, *old; + struct bpf_verifier_state *topmost = st->loop_entry; + u32 steps = 0; - while (topmost && topmost->loop_entry && topmost != topmost->loop_entry) + while (topmost && topmost->loop_entry) { + if (verifier_bug_if(steps++ > st->dfs_depth, env, "infinite loop")) + return ERR_PTR(-EFAULT); topmost = topmost->loop_entry; - /* Update loop entries for intermediate states to avoid this - * traversal in future get_loop_entry() calls. - */ - while (st && st->loop_entry != topmost) { - old = st->loop_entry; - st->loop_entry = topmost; - st = old; } return topmost; } -static void update_loop_entry(struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr) +static void update_loop_entry(struct bpf_verifier_env *env, + struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr) { - struct bpf_verifier_state *cur1, *hdr1; - - cur1 = get_loop_entry(cur) ?: cur; - hdr1 = get_loop_entry(hdr) ?: hdr; - /* The head1->branches check decides between cases B and C in - * comment for get_loop_entry(). If hdr1->branches == 0 then + /* The hdr->branches check decides between cases B and C in + * comment for get_loop_entry(). If hdr->branches == 0 then * head's topmost loop entry is not in current DFS path, * hence 'cur' and 'hdr' are not in the same loop and there is * no need to update cur->loop_entry. */ - if (hdr1->branches && hdr1->dfs_depth <= cur1->dfs_depth) { + if (hdr->branches && hdr->dfs_depth < (cur->loop_entry ?: cur)->dfs_depth) { + if (cur->loop_entry) { + cur->loop_entry->used_as_loop_entry--; + maybe_free_verifier_state(env, state_loop_entry_as_list(cur)); + } cur->loop_entry = hdr; - hdr->used_as_loop_entry = true; + hdr->used_as_loop_entry++; } } static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { + struct bpf_verifier_state_list *sl = NULL, *parent_sl; + struct bpf_verifier_state *parent; + while (st) { u32 br = --st->branches; @@ -1869,7 +1964,7 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi * This is a part of 'case A' in get_loop_entry() comment. */ if (br == 0 && st->parent && st->loop_entry) - update_loop_entry(st->parent, st->loop_entry); + update_loop_entry(env, st->parent, st->loop_entry); /* WARN_ON(br > 1) technically makes sense here, * but see comment in push_stack(), hence: @@ -1879,7 +1974,12 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi br); if (br) break; - st = st->parent; + parent = st->parent; + parent_sl = state_parent_as_list(st); + if (sl) + maybe_free_verifier_state(env, sl); + st = parent; + sl = parent_sl; } } @@ -3204,6 +3304,21 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog, return res ? &res->func_model : NULL; } +static int add_kfunc_in_insns(struct bpf_verifier_env *env, + struct bpf_insn *insn, int cnt) +{ + int i, ret; + + for (i = 0; i < cnt; i++, insn++) { + if (bpf_pseudo_kfunc_call(insn)) { + ret = add_kfunc_call(env, insn->imm, insn->off); + if (ret < 0) + return ret; + } + } + return 0; +} + static int add_subprog_and_kfunc(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprog = env->subprog_info; @@ -3267,6 +3382,15 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env) return 0; } +static int jmp_offset(struct bpf_insn *insn) +{ + u8 code = insn->code; + + if (code == (BPF_JMP32 | BPF_JA)) + return insn->imm; + return insn->off; +} + static int check_subprogs(struct bpf_verifier_env *env) { int i, subprog_start, subprog_end, off, cur_subprog = 0; @@ -3293,10 +3417,7 @@ static int check_subprogs(struct bpf_verifier_env *env) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) goto next; - if (code == (BPF_JMP32 | BPF_JA)) - off = i + insn[i].imm + 1; - else - off = i + insn[i].off + 1; + off = i + jmp_offset(&insn[i]) + 1; if (off < subprog_start || off >= subprog_end) { verbose(env, "jump out of range from insn %d to %d\n", i, off); return -EINVAL; @@ -3336,12 +3457,11 @@ static int mark_reg_read(struct bpf_verifier_env *env, /* if read wasn't screened by an earlier write ... */ if (writes && state->live & REG_LIVE_WRITTEN) break; - if (parent->live & REG_LIVE_DONE) { - verbose(env, "verifier BUG type %s var_off %lld off %d\n", - reg_type_str(env, parent->type), - parent->var_off.value, parent->off); + if (verifier_bug_if(parent->live & REG_LIVE_DONE, env, + "type %s var_off %lld off %d", + reg_type_str(env, parent->type), + parent->var_off.value, parent->off)) return -EFAULT; - } /* The first condition is more likely to be true than the * second, checked it first. */ @@ -3481,7 +3601,7 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, } if (class == BPF_STX) { - /* BPF_STX (including atomic variants) has multiple source + /* BPF_STX (including atomic variants) has one or more source * operands, one of which is a ptr. Check whether the caller is * asking about it. */ @@ -3526,16 +3646,16 @@ static int insn_def_regno(const struct bpf_insn *insn) case BPF_ST: return -1; case BPF_STX: - if ((BPF_MODE(insn->code) == BPF_ATOMIC || - BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) && - (insn->imm & BPF_FETCH)) { + if (BPF_MODE(insn->code) == BPF_ATOMIC || + BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) { if (insn->imm == BPF_CMPXCHG) return BPF_REG_0; - else + else if (insn->imm == BPF_LOAD_ACQ) + return insn->dst_reg; + else if (insn->imm & BPF_FETCH) return insn->src_reg; - } else { - return -1; } + return -1; default: return insn->dst_reg; } @@ -3734,14 +3854,14 @@ static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_s /* atomic instructions push insn_flags twice, for READ and * WRITE sides, but they should agree on stack slot */ - WARN_ONCE((env->cur_hist_ent->flags & insn_flags) && - (env->cur_hist_ent->flags & insn_flags) != insn_flags, - "verifier insn history bug: insn_idx %d cur flags %x new flags %x\n", - env->insn_idx, env->cur_hist_ent->flags, insn_flags); + verifier_bug_if((env->cur_hist_ent->flags & insn_flags) && + (env->cur_hist_ent->flags & insn_flags) != insn_flags, + env, "insn history: insn_idx %d cur flags %x new flags %x", + env->insn_idx, env->cur_hist_ent->flags, insn_flags); env->cur_hist_ent->flags |= insn_flags; - WARN_ONCE(env->cur_hist_ent->linked_regs != 0, - "verifier insn history bug: insn_idx %d linked_regs != 0: %#llx\n", - env->insn_idx, env->cur_hist_ent->linked_regs); + verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env, + "insn history: insn_idx %d linked_regs: %#llx", + env->insn_idx, env->cur_hist_ent->linked_regs); env->cur_hist_ent->linked_regs = linked_regs; return 0; } @@ -3826,6 +3946,17 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) return btf_name_by_offset(desc_btf, func->name_off); } +static void verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + const struct bpf_insn_cbs cbs = { + .cb_call = disasm_kfunc_name, + .cb_print = verbose, + .private_data = env, + }; + + print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); +} + static inline void bt_init(struct backtrack_state *bt, u32 frame) { bt->frame = frame; @@ -3853,8 +3984,7 @@ static inline u32 bt_empty(struct backtrack_state *bt) static inline int bt_subprog_enter(struct backtrack_state *bt) { if (bt->frame == MAX_CALL_FRAMES - 1) { - verbose(bt->env, "BUG subprog enter from frame %d\n", bt->frame); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(bt->env, "subprog enter from frame %d", bt->frame); return -EFAULT; } bt->frame++; @@ -3864,8 +3994,7 @@ static inline int bt_subprog_enter(struct backtrack_state *bt) static inline int bt_subprog_exit(struct backtrack_state *bt) { if (bt->frame == 0) { - verbose(bt->env, "BUG subprog exit from frame 0\n"); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(bt->env, "subprog exit from frame 0"); return -EFAULT; } bt->frame--; @@ -4026,11 +4155,6 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, struct bpf_insn_hist_entry *hist, struct backtrack_state *bt) { - const struct bpf_insn_cbs cbs = { - .cb_call = disasm_kfunc_name, - .cb_print = verbose, - .private_data = env, - }; struct bpf_insn *insn = env->prog->insnsi + idx; u8 class = BPF_CLASS(insn->code); u8 opcode = BPF_OP(insn->code); @@ -4048,7 +4172,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt)); verbose(env, "stack=%s before ", env->tmp_str_buf); verbose(env, "%d: ", idx); - print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); + verbose_insn(env, insn); } /* If there is a history record that some registers gained range at this insn, @@ -4095,7 +4219,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * dreg still needs precision before this insn */ } - } else if (class == BPF_LDX) { + } else if (class == BPF_LDX || is_atomic_load_insn(insn)) { if (!bt_is_reg_set(bt, dreg)) return 0; bt_clear_reg(bt, dreg); @@ -4148,14 +4272,15 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * should be literally next instruction in * caller program */ - WARN_ONCE(idx + 1 != subseq_idx, "verifier backtracking bug"); + verifier_bug_if(idx + 1 != subseq_idx, env, + "extra insn from subprog"); /* r1-r5 are invalidated after subprog call, * so for global func call it shouldn't be set * anymore */ if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "global subprog unexpected regs %x", + bt_reg_mask(bt)); return -EFAULT; } /* global subprog always sets R0 */ @@ -4169,16 +4294,17 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * the current frame should be zero by now */ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "static subprog unexpected regs %x", + bt_reg_mask(bt)); return -EFAULT; } /* we are now tracking register spills correctly, * so any instance of leftover slots is a bug */ if (bt_stack_mask(bt) != 0) { - verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug (subprog leftover stack slots)"); + verifier_bug(env, + "static subprog leftover stack slots %llx", + bt_stack_mask(bt)); return -EFAULT; } /* propagate r1-r5 to the caller */ @@ -4201,13 +4327,13 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * not actually arguments passed directly to callback subprogs */ if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "callback unexpected regs %x", + bt_reg_mask(bt)); return -EFAULT; } if (bt_stack_mask(bt) != 0) { - verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug (callback leftover stack slots)"); + verifier_bug(env, "callback leftover stack slots %llx", + bt_stack_mask(bt)); return -EFAULT; } /* clear r1-r5 in callback subprog's mask */ @@ -4226,11 +4352,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, /* regular helper call sets R0 */ bt_clear_reg(bt, BPF_REG_0); if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { - /* if backtracing was looking for registers R1-R5 + /* if backtracking was looking for registers R1-R5 * they should have been found already. */ - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "backtracking call unexpected regs %x", + bt_reg_mask(bt)); return -EFAULT; } } else if (opcode == BPF_EXIT) { @@ -4248,8 +4374,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, for (i = BPF_REG_1; i <= BPF_REG_5; i++) bt_clear_reg(bt, i); if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { - verbose(env, "BUG regs %x\n", bt_reg_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "backtracking exit unexpected regs %x", + bt_reg_mask(bt)); return -EFAULT; } @@ -4284,8 +4410,10 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * before it would be equally necessary to * propagate it to dreg. */ - bt_set_reg(bt, dreg); - bt_set_reg(bt, sreg); + if (!hist || !(hist->flags & INSN_F_SRC_REG_STACK)) + bt_set_reg(bt, sreg); + if (!hist || !(hist->flags & INSN_F_DST_REG_STACK)) + bt_set_reg(bt, dreg); } else if (BPF_SRC(insn->code) == BPF_K) { /* dreg <cond> K * Only dreg still needs precision before @@ -4590,9 +4718,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) return 0; } - verbose(env, "BUG backtracking func entry subprog %d reg_mask %x stack_mask %llx\n", - st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt)); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "backtracking func entry subprog %d reg_mask %x stack_mask %llx", + st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt)); return -EFAULT; } @@ -4628,8 +4755,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) * It means the backtracking missed the spot where * particular register was initialized with a constant. */ - verbose(env, "BUG backtracking idx %d\n", i); - WARN_ONCE(1, "verifier backtracking bug"); + verifier_bug(env, "backtracking idx %d", i); return -EFAULT; } } @@ -4654,12 +4780,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr)); for_each_set_bit(i, mask, 64) { - if (i >= func->allocated_stack / BPF_REG_SIZE) { - verbose(env, "BUG backtracking (stack slot %d, total slots %d)\n", - i, func->allocated_stack / BPF_REG_SIZE); - WARN_ONCE(1, "verifier backtracking bug (stack slot out of bounds)"); + if (verifier_bug_if(i >= func->allocated_stack / BPF_REG_SIZE, + env, "stack slot %d, total slots %d", + i, func->allocated_stack / BPF_REG_SIZE)) return -EFAULT; - } if (!is_spilled_scalar_reg(&func->stack[i])) { bt_clear_frame_slot(bt, fr, i); @@ -5980,18 +6104,10 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, /* check access to 'struct bpf_context' fields. Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, - enum bpf_access_type t, enum bpf_reg_type *reg_type, - struct btf **btf, u32 *btf_id, bool *is_retval, bool is_ldsx) + enum bpf_access_type t, struct bpf_insn_access_aux *info) { - struct bpf_insn_access_aux info = { - .reg_type = *reg_type, - .log = &env->log, - .is_retval = false, - .is_ldsx = is_ldsx, - }; - if (env->ops->is_valid_access && - env->ops->is_valid_access(off, size, t, env->prog, &info)) { + env->ops->is_valid_access(off, size, t, env->prog, info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -5999,14 +6115,15 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, * will only allow for whole field access and rejects any other * type of narrower access. */ - *reg_type = info.reg_type; - *is_retval = info.is_retval; - - if (base_type(*reg_type) == PTR_TO_BTF_ID) { - *btf = info.btf; - *btf_id = info.btf_id; + if (base_type(info->reg_type) == PTR_TO_BTF_ID) { + if (info->ref_obj_id && + !find_reference_state(env->cur_state, info->ref_obj_id)) { + verbose(env, "invalid bpf_context access off=%d. Reference may already be released\n", + off); + return -EACCES; + } } else { - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; + env->insn_aux_data[insn_idx].ctx_field_size = info->ctx_field_size; } /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) @@ -6116,6 +6233,26 @@ static bool is_arena_reg(struct bpf_verifier_env *env, int regno) return reg->type == PTR_TO_ARENA; } +/* Return false if @regno contains a pointer whose type isn't supported for + * atomic instruction @insn. + */ +static bool atomic_ptr_type_ok(struct bpf_verifier_env *env, int regno, + struct bpf_insn *insn) +{ + if (is_ctx_reg(env, regno)) + return false; + if (is_pkt_reg(env, regno)) + return false; + if (is_flow_key_reg(env, regno)) + return false; + if (is_sk_reg(env, regno)) + return false; + if (is_arena_reg(env, regno)) + return bpf_jit_supports_insn(insn, true); + + return true; +} + static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { #ifdef CONFIG_NET [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], @@ -6419,21 +6556,18 @@ continue_func: /* find the callee */ next_insn = i + insn[i].imm + 1; sidx = find_subprog(env, next_insn); - if (sidx < 0) { - WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", - next_insn); + if (verifier_bug_if(sidx < 0, env, "callee not found at insn %d", next_insn)) return -EFAULT; - } if (subprog[sidx].is_async_cb) { if (subprog[sidx].has_tail_call) { - verbose(env, "verifier bug. subprog has tail_call and async cb\n"); + verifier_bug(env, "subprog has tail_call and async cb"); return -EFAULT; } /* async callbacks don't increase bpf prog stack size unless called directly */ if (!bpf_pseudo_call(insn + i)) continue; if (subprog[sidx].is_exception_cb) { - verbose(env, "insn %d cannot call exception cb directly\n", i); + verbose(env, "insn %d cannot call exception cb directly", i); return -EINVAL; } } @@ -6533,11 +6667,8 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, int start = idx + insn->imm + 1, subprog; subprog = find_subprog(env, start); - if (subprog < 0) { - WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", - start); + if (verifier_bug_if(subprog < 0, env, "get stack depth: no program at insn %d", start)) return -EFAULT; - } return env->subprog_info[subprog].stack_depth; } #endif @@ -6896,8 +7027,7 @@ BTF_TYPE_SAFE_TRUSTED(struct file) { struct inode *f_inode; }; -BTF_TYPE_SAFE_TRUSTED(struct dentry) { - /* no negative dentry-s in places where bpf can see it */ +BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry) { struct inode *d_inode; }; @@ -6935,7 +7065,6 @@ static bool type_is_trusted(struct bpf_verifier_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file)); - BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted"); } @@ -6945,6 +7074,7 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env, const char *field_name, u32 btf_id) { BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted_or_null"); @@ -7363,11 +7493,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { - bool is_retval = false; struct bpf_retval_range range; - enum bpf_reg_type reg_type = SCALAR_VALUE; - struct btf *btf = NULL; - u32 btf_id = 0; + struct bpf_insn_access_aux info = { + .reg_type = SCALAR_VALUE, + .is_ldsx = is_ldsx, + .log = &env->log, + }; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { @@ -7379,8 +7510,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err < 0) return err; - err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf, - &btf_id, &is_retval, is_ldsx); + err = check_ctx_access(env, insn_idx, off, size, t, &info); if (err) verbose_linfo(env, insn_idx, "; "); if (!err && t == BPF_READ && value_regno >= 0) { @@ -7388,8 +7518,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * PTR_TO_PACKET[_META,_END]. In the latter * case, we know the offset is zero. */ - if (reg_type == SCALAR_VALUE) { - if (is_retval && get_func_retval_range(env->prog, &range)) { + if (info.reg_type == SCALAR_VALUE) { + if (info.is_retval && get_func_retval_range(env->prog, &range)) { err = __mark_reg_s32_range(env, regs, value_regno, range.minval, range.maxval); if (err) @@ -7400,7 +7530,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else { mark_reg_known_zero(env, regs, value_regno); - if (type_may_be_null(reg_type)) + if (type_may_be_null(info.reg_type)) regs[value_regno].id = ++env->id_gen; /* A load of ctx field could have different * actual load size with the one encoded in the @@ -7408,12 +7538,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * a sub-register. */ regs[value_regno].subreg_def = DEF_NOT_SUBREG; - if (base_type(reg_type) == PTR_TO_BTF_ID) { - regs[value_regno].btf = btf; - regs[value_regno].btf_id = btf_id; + if (base_type(info.reg_type) == PTR_TO_BTF_ID) { + regs[value_regno].btf = info.btf; + regs[value_regno].btf_id = info.btf_id; + regs[value_regno].ref_obj_id = info.ref_obj_id; } } - regs[value_regno].type = reg_type; + regs[value_regno].type = info.reg_type; } } else if (reg->type == PTR_TO_STACK) { @@ -7516,27 +7647,72 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, bool allow_trust_mismatch); -static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) +static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn, + bool strict_alignment_once, bool is_ldsx, + bool allow_trust_mismatch, const char *ctx) { - int load_reg; + struct bpf_reg_state *regs = cur_regs(env); + enum bpf_reg_type src_reg_type; int err; - switch (insn->imm) { - case BPF_ADD: - case BPF_ADD | BPF_FETCH: - case BPF_AND: - case BPF_AND | BPF_FETCH: - case BPF_OR: - case BPF_OR | BPF_FETCH: - case BPF_XOR: - case BPF_XOR | BPF_FETCH: - case BPF_XCHG: - case BPF_CMPXCHG: - break; - default: - verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm); - return -EINVAL; - } + /* check src operand */ + err = check_reg_arg(env, insn->src_reg, SRC_OP); + if (err) + return err; + + /* check dst operand */ + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); + if (err) + return err; + + src_reg_type = regs[insn->src_reg].type; + + /* Check if (src_reg + off) is readable. The state of dst_reg will be + * updated by this call. + */ + err = check_mem_access(env, env->insn_idx, insn->src_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, insn->dst_reg, + strict_alignment_once, is_ldsx); + err = err ?: save_aux_ptr_type(env, src_reg_type, + allow_trust_mismatch); + err = err ?: reg_bounds_sanity_check(env, ®s[insn->dst_reg], ctx); + + return err; +} + +static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn, + bool strict_alignment_once) +{ + struct bpf_reg_state *regs = cur_regs(env); + enum bpf_reg_type dst_reg_type; + int err; + + /* check src1 operand */ + err = check_reg_arg(env, insn->src_reg, SRC_OP); + if (err) + return err; + + /* check src2 operand */ + err = check_reg_arg(env, insn->dst_reg, SRC_OP); + if (err) + return err; + + dst_reg_type = regs[insn->dst_reg].type; + + /* Check if (dst_reg + off) is writeable. */ + err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg, + strict_alignment_once, false); + err = err ?: save_aux_ptr_type(env, dst_reg_type, false); + + return err; +} + +static int check_atomic_rmw(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + int load_reg; + int err; if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) { verbose(env, "invalid atomic operand size\n"); @@ -7572,11 +7748,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i return -EACCES; } - if (is_ctx_reg(env, insn->dst_reg) || - is_pkt_reg(env, insn->dst_reg) || - is_flow_key_reg(env, insn->dst_reg) || - is_sk_reg(env, insn->dst_reg) || - (is_arena_reg(env, insn->dst_reg) && !bpf_jit_supports_insn(insn, true))) { + if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) { verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str(env, reg_state(env, insn->dst_reg)->type)); @@ -7603,12 +7775,12 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i /* Check whether we can read the memory, with second call for fetch * case to simulate the register fill. */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1, true, false); if (!err && load_reg >= 0) - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, load_reg, - true, false); + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_READ, load_reg, true, false); if (err) return err; @@ -7618,13 +7790,86 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i return err; } /* Check whether we can write into the same memory. */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); if (err) return err; return 0; } +static int check_atomic_load(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + int err; + + err = check_load_mem(env, insn, true, false, false, "atomic_load"); + if (err) + return err; + + if (!atomic_ptr_type_ok(env, insn->src_reg, insn)) { + verbose(env, "BPF_ATOMIC loads from R%d %s is not allowed\n", + insn->src_reg, + reg_type_str(env, reg_state(env, insn->src_reg)->type)); + return -EACCES; + } + + return 0; +} + +static int check_atomic_store(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + int err; + + err = check_store_reg(env, insn, true); + if (err) + return err; + + if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) { + verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", + insn->dst_reg, + reg_type_str(env, reg_state(env, insn->dst_reg)->type)); + return -EACCES; + } + + return 0; +} + +static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + switch (insn->imm) { + case BPF_ADD: + case BPF_ADD | BPF_FETCH: + case BPF_AND: + case BPF_AND | BPF_FETCH: + case BPF_OR: + case BPF_OR | BPF_FETCH: + case BPF_XOR: + case BPF_XOR | BPF_FETCH: + case BPF_XCHG: + case BPF_CMPXCHG: + return check_atomic_rmw(env, insn); + case BPF_LOAD_ACQ: + if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) { + verbose(env, + "64-bit load-acquires are only supported on 64-bit arches\n"); + return -EOPNOTSUPP; + } + return check_atomic_load(env, insn); + case BPF_STORE_REL: + if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) { + verbose(env, + "64-bit store-releases are only supported on 64-bit arches\n"); + return -EOPNOTSUPP; + } + return check_atomic_store(env, insn); + default: + verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", + insn->imm); + return -EINVAL; + } +} + /* When register 'regno' is used to read the stack (either directly or through * a helper function) make sure that it's within stack boundary and, depending * on the access type and privileges, that all elements of the stack are @@ -7727,7 +7972,7 @@ static int check_stack_range_initialized( slot = -i - 1; spi = slot / BPF_REG_SIZE; if (state->allocated_stack <= slot) { - verbose(env, "verifier bug: allocated_stack too small\n"); + verbose(env, "allocated_stack too small\n"); return -EFAULT; } @@ -7983,6 +8228,12 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg return err; } +enum { + PROCESS_SPIN_LOCK = (1 << 0), + PROCESS_RES_LOCK = (1 << 1), + PROCESS_LOCK_IRQ = (1 << 2), +}; + /* Implementation details: * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL. * bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL. @@ -8005,30 +8256,33 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg * env->cur_state->active_locks remembers which map value element or allocated * object got locked and clears it after bpf_spin_unlock. */ -static int process_spin_lock(struct bpf_verifier_env *env, int regno, - bool is_lock) +static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) { + bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK; + const char *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin"; struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; struct bpf_verifier_state *cur = env->cur_state; bool is_const = tnum_is_const(reg->var_off); + bool is_irq = flags & PROCESS_LOCK_IRQ; u64 val = reg->var_off.value; struct bpf_map *map = NULL; struct btf *btf = NULL; struct btf_record *rec; + u32 spin_lock_off; int err; if (!is_const) { verbose(env, - "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n", - regno); + "R%d doesn't have constant offset. %s_lock has to be at the constant offset\n", + regno, lock_str); return -EINVAL; } if (reg->type == PTR_TO_MAP_VALUE) { map = reg->map_ptr; if (!map->btf) { verbose(env, - "map '%s' has to have BTF in order to use bpf_spin_lock\n", - map->name); + "map '%s' has to have BTF in order to use %s_lock\n", + map->name, lock_str); return -EINVAL; } } else { @@ -8036,36 +8290,53 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, } rec = reg_btf_record(reg); - if (!btf_record_has_field(rec, BPF_SPIN_LOCK)) { - verbose(env, "%s '%s' has no valid bpf_spin_lock\n", map ? "map" : "local", - map ? map->name : "kptr"); + if (!btf_record_has_field(rec, is_res_lock ? BPF_RES_SPIN_LOCK : BPF_SPIN_LOCK)) { + verbose(env, "%s '%s' has no valid %s_lock\n", map ? "map" : "local", + map ? map->name : "kptr", lock_str); return -EINVAL; } - if (rec->spin_lock_off != val + reg->off) { - verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n", - val + reg->off, rec->spin_lock_off); + spin_lock_off = is_res_lock ? rec->res_spin_lock_off : rec->spin_lock_off; + if (spin_lock_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct %s_lock' that is at %d\n", + val + reg->off, lock_str, spin_lock_off); return -EINVAL; } if (is_lock) { void *ptr; + int type; if (map) ptr = map; else ptr = btf; - if (cur->active_locks) { - verbose(env, - "Locking two bpf_spin_locks are not allowed\n"); - return -EINVAL; + if (!is_res_lock && cur->active_locks) { + if (find_lock_state(env->cur_state, REF_TYPE_LOCK, 0, NULL)) { + verbose(env, + "Locking two bpf_spin_locks are not allowed\n"); + return -EINVAL; + } + } else if (is_res_lock && cur->active_locks) { + if (find_lock_state(env->cur_state, REF_TYPE_RES_LOCK | REF_TYPE_RES_LOCK_IRQ, reg->id, ptr)) { + verbose(env, "Acquiring the same lock again, AA deadlock detected\n"); + return -EINVAL; + } } - err = acquire_lock_state(env, env->insn_idx, REF_TYPE_LOCK, reg->id, ptr); + + if (is_res_lock && is_irq) + type = REF_TYPE_RES_LOCK_IRQ; + else if (is_res_lock) + type = REF_TYPE_RES_LOCK; + else + type = REF_TYPE_LOCK; + err = acquire_lock_state(env, env->insn_idx, type, reg->id, ptr); if (err < 0) { verbose(env, "Failed to acquire lock state\n"); return err; } } else { void *ptr; + int type; if (map) ptr = map; @@ -8073,12 +8344,26 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, ptr = btf; if (!cur->active_locks) { - verbose(env, "bpf_spin_unlock without taking a lock\n"); + verbose(env, "%s_unlock without taking a lock\n", lock_str); return -EINVAL; } - if (release_lock_state(env->cur_state, REF_TYPE_LOCK, reg->id, ptr)) { - verbose(env, "bpf_spin_unlock of different lock\n"); + if (is_res_lock && is_irq) + type = REF_TYPE_RES_LOCK_IRQ; + else if (is_res_lock) + type = REF_TYPE_RES_LOCK; + else + type = REF_TYPE_LOCK; + if (!find_lock_state(cur, type, reg->id, ptr)) { + verbose(env, "%s_unlock of different lock\n", lock_str); + return -EINVAL; + } + if (reg->id != cur->active_lock_id || ptr != cur->active_lock_ptr) { + verbose(env, "%s_unlock cannot be out of order\n", lock_str); + return -EINVAL; + } + if (release_lock_state(cur, type, reg->id, ptr)) { + verbose(env, "%s_unlock of different lock\n", lock_str); return -EINVAL; } @@ -8116,7 +8401,7 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, return -EINVAL; } if (meta->map_ptr) { - verbose(env, "verifier bug. Two map pointers in a timer helper\n"); + verifier_bug(env, "Two map pointers in a timer helper"); return -EFAULT; } meta->map_uid = reg->map_uid; @@ -8429,10 +8714,12 @@ static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env, { struct bpf_verifier_state_list *sl; struct bpf_verifier_state *st; + struct list_head *pos, *head; /* Explored states are pushed in stack order, most recent states come first */ - sl = *explored_state(env, insn_idx); - for (; sl; sl = sl->next) { + head = explored_state(env, insn_idx); + list_for_each(pos, head) { + sl = container_of(pos, struct bpf_verifier_state_list, node); /* If st->branches != 0 state is a part of current DFS verification path, * hence cur & st for a loop. */ @@ -9149,10 +9436,11 @@ static int check_reg_const_str(struct bpf_verifier_env *env, return 0; } -/* Returns constant key value if possible, else negative error */ -static s64 get_constant_map_key(struct bpf_verifier_env *env, +/* Returns constant key value in `value` if possible, else negative error */ +static int get_constant_map_key(struct bpf_verifier_env *env, struct bpf_reg_state *key, - u32 key_size) + u32 key_size, + s64 *value) { struct bpf_func_state *state = func(env, key); struct bpf_reg_state *reg; @@ -9179,8 +9467,10 @@ static s64 get_constant_map_key(struct bpf_verifier_env *env, /* First handle precisely tracked STACK_ZERO */ for (i = off; i >= 0 && stype[i] == STACK_ZERO; i--) zero_size++; - if (zero_size >= key_size) + if (zero_size >= key_size) { + *value = 0; return 0; + } /* Check that stack contains a scalar spill of expected size */ if (!is_spilled_scalar_reg(&state->stack[spi])) @@ -9203,9 +9493,12 @@ static s64 get_constant_map_key(struct bpf_verifier_env *env, if (err < 0) return err; - return reg->var_off.value; + *value = reg->var_off.value; + return 0; } +static bool can_elide_value_nullness(enum bpf_map_type type); + static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, const struct bpf_func_proto *fn, @@ -9354,9 +9647,16 @@ skip_type_check: err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL); if (err) return err; - meta->const_map_key = get_constant_map_key(env, reg, key_size); - if (meta->const_map_key < 0 && meta->const_map_key != -EOPNOTSUPP) - return meta->const_map_key; + if (can_elide_value_nullness(meta->map_ptr->map_type)) { + err = get_constant_map_key(env, reg, key_size, &meta->const_map_key); + if (err < 0) { + meta->const_map_key = -1; + if (err == -EOPNOTSUPP) + err = 0; + else + return err; + } + } break; case ARG_PTR_TO_MAP_VALUE: if (type_may_be_null(arg_type) && register_is_null(reg)) @@ -9389,11 +9689,11 @@ skip_type_check: return -EACCES; } if (meta->func_id == BPF_FUNC_spin_lock) { - err = process_spin_lock(env, regno, true); + err = process_spin_lock(env, regno, PROCESS_SPIN_LOCK); if (err) return err; } else if (meta->func_id == BPF_FUNC_spin_unlock) { - err = process_spin_lock(env, regno, false); + err = process_spin_lock(env, regno, 0); if (err) return err; } else { @@ -9651,7 +9951,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) { - verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n"); + verbose(env, "mixing of tail_calls and bpf-to-bpf calls is not supported\n"); return -EINVAL; } break; @@ -9973,8 +10273,7 @@ static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int calls } if (state->frame[state->curframe + 1]) { - verbose(env, "verifier bug. Frame %d already allocated\n", - state->curframe + 1); + verifier_bug(env, "Frame %d already allocated", state->curframe + 1); return -EFAULT; } @@ -10088,8 +10387,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, if (err) return err; } else { - bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n", - i, arg->arg_type); + verifier_bug(env, "unrecognized arg#%d type %d", i, arg->arg_type); return -EFAULT; } } @@ -10152,13 +10450,13 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins env->subprog_info[subprog].is_cb = true; if (bpf_pseudo_kfunc_call(insn) && !is_callback_calling_kfunc(insn->imm)) { - verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", - func_id_name(insn->imm), insn->imm); + verifier_bug(env, "kfunc %s#%d not marked as callback-calling", + func_id_name(insn->imm), insn->imm); return -EFAULT; } else if (!bpf_pseudo_kfunc_call(insn) && !is_callback_calling_function(insn->imm)) { /* helper */ - verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n", - func_id_name(insn->imm), insn->imm); + verifier_bug(env, "helper %s#%d not marked as callback-calling", + func_id_name(insn->imm), insn->imm); return -EFAULT; } @@ -10210,10 +10508,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, target_insn = *insn_idx + insn->imm + 1; subprog = find_subprog(env, target_insn); - if (subprog < 0) { - verbose(env, "verifier bug. No program starts at insn %d\n", target_insn); + if (verifier_bug_if(subprog < 0, env, "target of func call at insn %d is not a program", + target_insn)) return -EFAULT; - } caller = state->frame[state->curframe]; err = btf_check_subprog_call(env, subprog, caller->regs); @@ -10222,23 +10519,18 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (subprog_is_global(env, subprog)) { const char *sub_name = subprog_name(env, subprog); - /* Only global subprogs cannot be called with a lock held. */ if (env->cur_state->active_locks) { verbose(env, "global function calls are not allowed while holding a lock,\n" "use static function instead\n"); return -EINVAL; } - /* Only global subprogs cannot be called with preemption disabled. */ - if (env->cur_state->active_preempt_locks) { - verbose(env, "global function calls are not allowed with preemption disabled,\n" - "use static function instead\n"); - return -EINVAL; - } - - if (env->cur_state->active_irq_id) { - verbose(env, "global function calls are not allowed with IRQs disabled,\n" - "use static function instead\n"); + if (env->subprog_info[subprog].might_sleep && + (env->cur_state->active_rcu_lock || env->cur_state->active_preempt_locks || + env->cur_state->active_irq_id || !in_sleepable(env))) { + verbose(env, "global functions that may sleep are not allowed in non-sleepable context,\n" + "i.e., in a RCU/IRQ/preempt-disabled section, or in\n" + "a non-sleepable BPF program context\n"); return -EINVAL; } @@ -10737,6 +11029,8 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit) { struct bpf_verifier_state *state = env->cur_state; + enum bpf_prog_type type = resolve_prog_type(env->prog); + struct bpf_reg_state *reg = reg_state(env, BPF_REG_0); bool refs_lingering = false; int i; @@ -10746,6 +11040,12 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].type != REF_TYPE_PTR) continue; + /* Allow struct_ops programs to return a referenced kptr back to + * kernel. Type checks are performed later in check_return_code. + */ + if (type == BPF_PROG_TYPE_STRUCT_OPS && !exception_exit && + reg->ref_obj_id == state->refs[i].id) + continue; verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", state->refs[i].id, state->refs[i].insn_idx); refs_lingering = true; @@ -10809,7 +11109,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env, err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr, fmt_map_off); if (err) { - verbose(env, "verifier bug\n"); + verbose(env, "failed to retrieve map value address\n"); return -EFAULT; } fmt = (char *)(long)fmt_addr + fmt_map_off; @@ -11272,7 +11572,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].map_uid = meta.map_uid; regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; if (!type_may_be_null(ret_flag) && - btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) { + btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) { regs[BPF_REG_0].id = ++env->id_gen; } break; @@ -11444,10 +11744,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* mark_btf_func_reg_size() is used when the reg size is determined by * the BTF func_proto's return value size and argument. */ -static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, - size_t reg_size) +static void __mark_btf_func_reg_size(struct bpf_verifier_env *env, struct bpf_reg_state *regs, + u32 regno, size_t reg_size) { - struct bpf_reg_state *reg = &cur_regs(env)[regno]; + struct bpf_reg_state *reg = ®s[regno]; if (regno == BPF_REG_0) { /* Function return value */ @@ -11465,6 +11765,12 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, } } +static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, + size_t reg_size) +{ + return __mark_btf_func_reg_size(env, cur_regs(env), regno, reg_size); +} + static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_ACQUIRE; @@ -11576,6 +11882,11 @@ static bool is_kfunc_arg_irq_flag(const struct btf *btf, const struct btf_param return btf_param_match_suffix(btf, arg, "__irq_flag"); } +static bool is_kfunc_arg_prog(const struct btf *btf, const struct btf_param *arg) +{ + return btf_param_match_suffix(btf, arg, "__prog"); +} + static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, const struct btf_param *arg, const char *name) @@ -11602,6 +11913,7 @@ enum { KF_ARG_RB_ROOT_ID, KF_ARG_RB_NODE_ID, KF_ARG_WORKQUEUE_ID, + KF_ARG_RES_SPIN_LOCK_ID, }; BTF_ID_LIST(kf_arg_btf_ids) @@ -11611,6 +11923,7 @@ BTF_ID(struct, bpf_list_node) BTF_ID(struct, bpf_rb_root) BTF_ID(struct, bpf_rb_node) BTF_ID(struct, bpf_wq) +BTF_ID(struct, bpf_res_spin_lock) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -11659,6 +11972,21 @@ static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); } +static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID); +} + +static bool is_rbtree_node_type(const struct btf_type *t) +{ + return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_RB_NODE_ID]); +} + +static bool is_list_node_type(const struct btf_type *t) +{ + return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_LIST_NODE_ID]); +} + static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf, const struct btf_param *arg) { @@ -11730,6 +12058,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_MAP, KF_ARG_PTR_TO_WORKQUEUE, KF_ARG_PTR_TO_IRQ_FLAG, + KF_ARG_PTR_TO_RES_SPIN_LOCK, }; enum special_kfunc_type { @@ -11740,6 +12069,8 @@ enum special_kfunc_type { KF_bpf_list_push_back_impl, KF_bpf_list_pop_front, KF_bpf_list_pop_back, + KF_bpf_list_front, + KF_bpf_list_back, KF_bpf_cast_to_kern_ctx, KF_bpf_rdonly_cast, KF_bpf_rcu_read_lock, @@ -11747,6 +12078,9 @@ enum special_kfunc_type { KF_bpf_rbtree_remove, KF_bpf_rbtree_add_impl, KF_bpf_rbtree_first, + KF_bpf_rbtree_root, + KF_bpf_rbtree_left, + KF_bpf_rbtree_right, KF_bpf_dynptr_from_skb, KF_bpf_dynptr_from_xdp, KF_bpf_dynptr_slice, @@ -11766,37 +12100,15 @@ enum special_kfunc_type { KF_bpf_iter_num_new, KF_bpf_iter_num_next, KF_bpf_iter_num_destroy, + KF_bpf_set_dentry_xattr, + KF_bpf_remove_dentry_xattr, + KF_bpf_res_spin_lock, + KF_bpf_res_spin_unlock, + KF_bpf_res_spin_lock_irqsave, + KF_bpf_res_spin_unlock_irqrestore, + KF___bpf_trap, }; -BTF_SET_START(special_kfunc_set) -BTF_ID(func, bpf_obj_new_impl) -BTF_ID(func, bpf_obj_drop_impl) -BTF_ID(func, bpf_refcount_acquire_impl) -BTF_ID(func, bpf_list_push_front_impl) -BTF_ID(func, bpf_list_push_back_impl) -BTF_ID(func, bpf_list_pop_front) -BTF_ID(func, bpf_list_pop_back) -BTF_ID(func, bpf_cast_to_kern_ctx) -BTF_ID(func, bpf_rdonly_cast) -BTF_ID(func, bpf_rbtree_remove) -BTF_ID(func, bpf_rbtree_add_impl) -BTF_ID(func, bpf_rbtree_first) -#ifdef CONFIG_NET -BTF_ID(func, bpf_dynptr_from_skb) -BTF_ID(func, bpf_dynptr_from_xdp) -#endif -BTF_ID(func, bpf_dynptr_slice) -BTF_ID(func, bpf_dynptr_slice_rdwr) -BTF_ID(func, bpf_dynptr_clone) -BTF_ID(func, bpf_percpu_obj_new_impl) -BTF_ID(func, bpf_percpu_obj_drop_impl) -BTF_ID(func, bpf_throw) -BTF_ID(func, bpf_wq_set_callback_impl) -#ifdef CONFIG_CGROUPS -BTF_ID(func, bpf_iter_css_task_new) -#endif -BTF_SET_END(special_kfunc_set) - BTF_ID_LIST(special_kfunc_list) BTF_ID(func, bpf_obj_new_impl) BTF_ID(func, bpf_obj_drop_impl) @@ -11805,6 +12117,8 @@ BTF_ID(func, bpf_list_push_front_impl) BTF_ID(func, bpf_list_push_back_impl) BTF_ID(func, bpf_list_pop_front) BTF_ID(func, bpf_list_pop_back) +BTF_ID(func, bpf_list_front) +BTF_ID(func, bpf_list_back) BTF_ID(func, bpf_cast_to_kern_ctx) BTF_ID(func, bpf_rdonly_cast) BTF_ID(func, bpf_rcu_read_lock) @@ -11812,6 +12126,9 @@ BTF_ID(func, bpf_rcu_read_unlock) BTF_ID(func, bpf_rbtree_remove) BTF_ID(func, bpf_rbtree_add_impl) BTF_ID(func, bpf_rbtree_first) +BTF_ID(func, bpf_rbtree_root) +BTF_ID(func, bpf_rbtree_left) +BTF_ID(func, bpf_rbtree_right) #ifdef CONFIG_NET BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) @@ -11844,6 +12161,18 @@ BTF_ID(func, bpf_local_irq_restore) BTF_ID(func, bpf_iter_num_new) BTF_ID(func, bpf_iter_num_next) BTF_ID(func, bpf_iter_num_destroy) +#ifdef CONFIG_BPF_LSM +BTF_ID(func, bpf_set_dentry_xattr) +BTF_ID(func, bpf_remove_dentry_xattr) +#else +BTF_ID_UNUSED +BTF_ID_UNUSED +#endif +BTF_ID(func, bpf_res_spin_lock) +BTF_ID(func, bpf_res_spin_unlock) +BTF_ID(func, bpf_res_spin_lock_irqsave) +BTF_ID(func, bpf_res_spin_unlock_irqrestore) +BTF_ID(func, __bpf_trap) static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -11937,6 +12266,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) return KF_ARG_PTR_TO_IRQ_FLAG; + if (is_kfunc_arg_res_spin_lock(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_RES_SPIN_LOCK; + if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", @@ -12044,13 +12376,19 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno, struct bpf_kfunc_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + int err, kfunc_class = IRQ_NATIVE_KFUNC; bool irq_save; - int err; - if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save]) { + if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save] || + meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) { irq_save = true; - } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore]) { + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) + kfunc_class = IRQ_LOCK_KFUNC; + } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore] || + meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) { irq_save = false; + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) + kfunc_class = IRQ_LOCK_KFUNC; } else { verbose(env, "verifier internal error: unknown irq flags kfunc\n"); return -EFAULT; @@ -12066,7 +12404,7 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno, if (err) return err; - err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx); + err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx, kfunc_class); if (err) return err; } else { @@ -12080,7 +12418,7 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno, if (err) return err; - err = unmark_stack_slot_irq_flag(env, reg); + err = unmark_stack_slot_irq_flag(env, reg, kfunc_class); if (err) return err; } @@ -12207,7 +12545,7 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_ if (!env->cur_state->active_locks) return -EINVAL; - s = find_lock_state(env->cur_state, REF_TYPE_LOCK, id, ptr); + s = find_lock_state(env->cur_state, REF_TYPE_LOCK_MASK, id, ptr); if (!s) { verbose(env, "held lock and object are not in the same allocation\n"); return -EINVAL; @@ -12220,14 +12558,19 @@ static bool is_bpf_list_api_kfunc(u32 btf_id) return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] || btf_id == special_kfunc_list[KF_bpf_list_push_back_impl] || btf_id == special_kfunc_list[KF_bpf_list_pop_front] || - btf_id == special_kfunc_list[KF_bpf_list_pop_back]; + btf_id == special_kfunc_list[KF_bpf_list_pop_back] || + btf_id == special_kfunc_list[KF_bpf_list_front] || + btf_id == special_kfunc_list[KF_bpf_list_back]; } static bool is_bpf_rbtree_api_kfunc(u32 btf_id) { return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] || btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || - btf_id == special_kfunc_list[KF_bpf_rbtree_first]; + btf_id == special_kfunc_list[KF_bpf_rbtree_first] || + btf_id == special_kfunc_list[KF_bpf_rbtree_root] || + btf_id == special_kfunc_list[KF_bpf_rbtree_left] || + btf_id == special_kfunc_list[KF_bpf_rbtree_right]; } static bool is_bpf_iter_num_api_kfunc(u32 btf_id) @@ -12243,9 +12586,18 @@ static bool is_bpf_graph_api_kfunc(u32 btf_id) btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]; } +static bool is_bpf_res_spin_lock_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_res_spin_lock] || + btf_id == special_kfunc_list[KF_bpf_res_spin_unlock] || + btf_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || + btf_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]; +} + static bool kfunc_spin_allowed(u32 btf_id) { - return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id); + return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id) || + is_bpf_res_spin_lock_kfunc(btf_id); } static bool is_sync_callback_calling_kfunc(u32 btf_id) @@ -12318,7 +12670,9 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env, break; case BPF_RB_NODE: ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || - kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]); + kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] || + kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_left] || + kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_right]); break; default: verbose(env, "verifier internal error: unexpected graph node argument type %s\n", @@ -12538,6 +12892,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_arg_ignore(btf, &args[i])) continue; + if (is_kfunc_arg_prog(btf, &args[i])) { + /* Used to reject repeated use of __prog. */ + if (meta->arg_prog) { + verbose(env, "Only 1 prog->aux argument supported per-kfunc\n"); + return -EFAULT; + } + meta->arg_prog = true; + cur_aux(env)->arg_prog = regno; + continue; + } + if (btf_type_is_scalar(t)) { if (reg->type != SCALAR_VALUE) { verbose(env, "R%d is not a scalar\n", regno); @@ -12677,6 +13042,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_CONST_STR: case KF_ARG_PTR_TO_WORKQUEUE: case KF_ARG_PTR_TO_IRQ_FLAG: + case KF_ARG_PTR_TO_RES_SPIN_LOCK: break; default: WARN_ON_ONCE(1); @@ -12831,22 +13197,22 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return ret; break; case KF_ARG_PTR_TO_RB_NODE: - if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_remove]) { - if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) { - verbose(env, "rbtree_remove node input must be non-owning ref\n"); + if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { + if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + verbose(env, "arg#%d expected pointer to allocated object\n", i); return -EINVAL; } - if (in_rbtree_lock_required_cb(env)) { - verbose(env, "rbtree_remove not allowed in rbtree cb\n"); + if (!reg->ref_obj_id) { + verbose(env, "allocated object must be referenced\n"); return -EINVAL; } } else { - if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d expected pointer to allocated object\n", i); + if (!type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) { + verbose(env, "%s can only take non-owning or refcounted bpf_rb_node pointer\n", func_name); return -EINVAL; } - if (!reg->ref_obj_id) { - verbose(env, "allocated object must be referenced\n"); + if (in_rbtree_lock_required_cb(env)) { + verbose(env, "%s not allowed in rbtree cb\n", func_name); return -EINVAL; } } @@ -12975,6 +13341,28 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret < 0) return ret; break; + case KF_ARG_PTR_TO_RES_SPIN_LOCK: + { + int flags = PROCESS_RES_LOCK; + + if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + verbose(env, "arg#%d doesn't point to map value or allocated object\n", i); + return -EINVAL; + } + + if (!is_bpf_res_spin_lock_kfunc(meta->func_id)) + return -EFAULT; + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock] || + meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) + flags |= PROCESS_SPIN_LOCK; + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || + meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) + flags |= PROCESS_LOCK_IRQ; + ret = process_spin_lock(env, regno, flags); + if (ret < 0) + return ret; + break; + } } } @@ -13029,6 +13417,178 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env, return 0; } +/* check special kfuncs and return: + * 1 - not fall-through to 'else' branch, continue verification + * 0 - fall-through to 'else' branch + * < 0 - not fall-through to 'else' branch, return error + */ +static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, + struct bpf_reg_state *regs, struct bpf_insn_aux_data *insn_aux, + const struct btf_type *ptr_type, struct btf *desc_btf) +{ + const struct btf_type *ret_t; + int err = 0; + + if (meta->btf != btf_vmlinux) + return 0; + + if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] || + meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + struct btf_struct_meta *struct_meta; + struct btf *ret_btf; + u32 ret_btf_id; + + if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) + return -ENOMEM; + + if (((u64)(u32)meta->arg_constant.value) != meta->arg_constant.value) { + verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); + return -EINVAL; + } + + ret_btf = env->prog->aux->btf; + ret_btf_id = meta->arg_constant.value; + + /* This may be NULL due to user not supplying a BTF */ + if (!ret_btf) { + verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n"); + return -EINVAL; + } + + ret_t = btf_type_by_id(ret_btf, ret_btf_id); + if (!ret_t || !__btf_type_is_struct(ret_t)) { + verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n"); + return -EINVAL; + } + + if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) { + verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n", + ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE); + return -EINVAL; + } + + if (!bpf_global_percpu_ma_set) { + mutex_lock(&bpf_percpu_ma_lock); + if (!bpf_global_percpu_ma_set) { + /* Charge memory allocated with bpf_global_percpu_ma to + * root memcg. The obj_cgroup for root memcg is NULL. + */ + err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL); + if (!err) + bpf_global_percpu_ma_set = true; + } + mutex_unlock(&bpf_percpu_ma_lock); + if (err) + return err; + } + + mutex_lock(&bpf_percpu_ma_lock); + err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size); + mutex_unlock(&bpf_percpu_ma_lock); + if (err) + return err; + } + + struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); + if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) { + verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n"); + return -EINVAL; + } + + if (struct_meta) { + verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n"); + return -EINVAL; + } + } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; + regs[BPF_REG_0].btf = ret_btf; + regs[BPF_REG_0].btf_id = ret_btf_id; + if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) + regs[BPF_REG_0].type |= MEM_PERCPU; + + insn_aux->obj_new_size = ret_t->size; + insn_aux->kptr_struct_meta = struct_meta; + } else if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; + regs[BPF_REG_0].btf = meta->arg_btf; + regs[BPF_REG_0].btf_id = meta->arg_btf_id; + + insn_aux->kptr_struct_meta = + btf_find_struct_meta(meta->arg_btf, + meta->arg_btf_id); + } else if (is_list_node_type(ptr_type)) { + struct btf_field *field = meta->arg_list_head.field; + + mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); + } else if (is_rbtree_node_type(ptr_type)) { + struct btf_field *field = meta->arg_rbtree_root.field; + + mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); + } else if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED; + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].btf_id = meta->ret_btf_id; + } else if (meta->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { + ret_t = btf_type_by_id(desc_btf, meta->arg_constant.value); + if (!ret_t || !btf_type_is_struct(ret_t)) { + verbose(env, + "kfunc bpf_rdonly_cast type ID argument must be of a struct\n"); + return -EINVAL; + } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED; + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].btf_id = meta->arg_constant.value; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice] || + meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) { + enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->initialized_dynptr.type); + + mark_reg_known_zero(env, regs, BPF_REG_0); + + if (!meta->arg_constant.found) { + verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n"); + return -EFAULT; + } + + regs[BPF_REG_0].mem_size = meta->arg_constant.value; + + /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */ + regs[BPF_REG_0].type = PTR_TO_MEM | type_flag; + + if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice]) { + regs[BPF_REG_0].type |= MEM_RDONLY; + } else { + /* this will set env->seen_direct_write to true */ + if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) { + verbose(env, "the prog does not allow writes to packet data\n"); + return -EINVAL; + } + } + + if (!meta->initialized_dynptr.id) { + verbose(env, "verifier internal error: no dynptr id\n"); + return -EFAULT; + } + regs[BPF_REG_0].dynptr_id = meta->initialized_dynptr.id; + + /* we don't need to set BPF_REG_0's ref obj id + * because packet slices are not refcounted (see + * dynptr_type_refcounted) + */ + } else { + return 0; + } + + return 1; +} + static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name); static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, @@ -13043,7 +13603,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn_aux_data *insn_aux; int err, insn_idx = *insn_idx_p; const struct btf_param *args; - const struct btf_type *ret_t; struct btf *desc_btf; /* skip for now, but return error when we find this in fixup_kfunc_call */ @@ -13060,6 +13619,36 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn_aux->is_iter_next = is_iter_next_kfunc(&meta); + if (!insn->off && + (insn->imm == special_kfunc_list[KF_bpf_res_spin_lock] || + insn->imm == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])) { + struct bpf_verifier_state *branch; + struct bpf_reg_state *regs; + + branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); + if (!branch) { + verbose(env, "failed to push state for failed lock acquisition\n"); + return -ENOMEM; + } + + regs = branch->frame[branch->curframe]->regs; + + /* Clear r0-r5 registers in forked state */ + for (i = 0; i < CALLER_SAVED_REGS; i++) + mark_reg_not_init(env, regs, caller_saved[i]); + + mark_reg_unknown(env, regs, BPF_REG_0); + err = __mark_reg_s32_range(env, regs, BPF_REG_0, -MAX_ERRNO, -1); + if (err) { + verbose(env, "failed to mark s32 range for retval in forked state for lock\n"); + return err; + } + __mark_btf_func_reg_size(env, regs, BPF_REG_0, sizeof(u32)); + } else if (!insn->off && insn->imm == special_kfunc_list[KF___bpf_trap]) { + verbose(env, "unexpected __bpf_trap() due to uninitialized variable?\n"); + return -EFAULT; + } + if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) { verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n"); return -EACCES; @@ -13230,168 +13819,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (btf_type_is_scalar(t)) { mark_reg_unknown(env, regs, BPF_REG_0); + if (meta.btf == btf_vmlinux && (meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock] || + meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])) + __mark_reg_const_zero(env, ®s[BPF_REG_0]); mark_btf_func_reg_size(env, BPF_REG_0, t->size); } else if (btf_type_is_ptr(t)) { ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); - - if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { - if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] || - meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { - struct btf_struct_meta *struct_meta; - struct btf *ret_btf; - u32 ret_btf_id; - - if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) - return -ENOMEM; - - if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) { - verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); - return -EINVAL; - } - - ret_btf = env->prog->aux->btf; - ret_btf_id = meta.arg_constant.value; - - /* This may be NULL due to user not supplying a BTF */ - if (!ret_btf) { - verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n"); - return -EINVAL; - } - - ret_t = btf_type_by_id(ret_btf, ret_btf_id); - if (!ret_t || !__btf_type_is_struct(ret_t)) { - verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n"); - return -EINVAL; - } - - if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { - if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) { - verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n", - ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE); - return -EINVAL; - } - - if (!bpf_global_percpu_ma_set) { - mutex_lock(&bpf_percpu_ma_lock); - if (!bpf_global_percpu_ma_set) { - /* Charge memory allocated with bpf_global_percpu_ma to - * root memcg. The obj_cgroup for root memcg is NULL. - */ - err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL); - if (!err) - bpf_global_percpu_ma_set = true; - } - mutex_unlock(&bpf_percpu_ma_lock); - if (err) - return err; - } - - mutex_lock(&bpf_percpu_ma_lock); - err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size); - mutex_unlock(&bpf_percpu_ma_lock); - if (err) - return err; - } - - struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); - if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { - if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) { - verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n"); - return -EINVAL; - } - - if (struct_meta) { - verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n"); - return -EINVAL; - } - } - - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; - regs[BPF_REG_0].btf = ret_btf; - regs[BPF_REG_0].btf_id = ret_btf_id; - if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) - regs[BPF_REG_0].type |= MEM_PERCPU; - - insn_aux->obj_new_size = ret_t->size; - insn_aux->kptr_struct_meta = struct_meta; - } else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; - regs[BPF_REG_0].btf = meta.arg_btf; - regs[BPF_REG_0].btf_id = meta.arg_btf_id; - - insn_aux->kptr_struct_meta = - btf_find_struct_meta(meta.arg_btf, - meta.arg_btf_id); - } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] || - meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) { - struct btf_field *field = meta.arg_list_head.field; - - mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); - } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove] || - meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) { - struct btf_field *field = meta.arg_rbtree_root.field; - - mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); - } else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED; - regs[BPF_REG_0].btf = desc_btf; - regs[BPF_REG_0].btf_id = meta.ret_btf_id; - } else if (meta.func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { - ret_t = btf_type_by_id(desc_btf, meta.arg_constant.value); - if (!ret_t || !btf_type_is_struct(ret_t)) { - verbose(env, - "kfunc bpf_rdonly_cast type ID argument must be of a struct\n"); - return -EINVAL; - } - - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED; - regs[BPF_REG_0].btf = desc_btf; - regs[BPF_REG_0].btf_id = meta.arg_constant.value; - } else if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice] || - meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) { - enum bpf_type_flag type_flag = get_dynptr_type_flag(meta.initialized_dynptr.type); - - mark_reg_known_zero(env, regs, BPF_REG_0); - - if (!meta.arg_constant.found) { - verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n"); - return -EFAULT; - } - - regs[BPF_REG_0].mem_size = meta.arg_constant.value; - - /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */ - regs[BPF_REG_0].type = PTR_TO_MEM | type_flag; - - if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice]) { - regs[BPF_REG_0].type |= MEM_RDONLY; - } else { - /* this will set env->seen_direct_write to true */ - if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) { - verbose(env, "the prog does not allow writes to packet data\n"); - return -EINVAL; - } - } - - if (!meta.initialized_dynptr.id) { - verbose(env, "verifier internal error: no dynptr id\n"); - return -EFAULT; - } - regs[BPF_REG_0].dynptr_id = meta.initialized_dynptr.id; - - /* we don't need to set BPF_REG_0's ref obj id - * because packet slices are not refcounted (see - * dynptr_type_refcounted) - */ - } else { - verbose(env, "kernel function %s unhandled dynamic return type\n", - meta.func_name); - return -EFAULT; - } + err = check_special_kfunc(env, &meta, regs, insn_aux, ptr_type, desc_btf); + if (err) { + if (err < 0) + return err; } else if (btf_type_is_void(ptr_type)) { /* kfunc returning 'void *' is equivalent to returning scalar */ mark_reg_unknown(env, regs, BPF_REG_0); @@ -13460,14 +13897,14 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (is_kfunc_ret_null(&meta)) regs[BPF_REG_0].id = id; regs[BPF_REG_0].ref_obj_id = id; - } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) { + } else if (is_rbtree_node_type(ptr_type) || is_list_node_type(ptr_type)) { ref_set_non_owning(env, ®s[BPF_REG_0]); } if (reg_may_point_to_spin_lock(®s[BPF_REG_0]) && !regs[BPF_REG_0].id) regs[BPF_REG_0].id = ++env->id_gen; } else if (btf_type_is_void(t)) { - if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { + if (meta.btf == btf_vmlinux) { if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] || meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) { insn_aux->kptr_struct_meta = @@ -15956,6 +16393,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_reg_state *eq_branch_regs; struct linked_regs linked_regs = {}; u8 opcode = BPF_OP(insn->code); + int insn_flags = 0; bool is_jmp32; int pred = -1; int err; @@ -16014,6 +16452,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, insn->src_reg); return -EACCES; } + + if (src_reg->type == PTR_TO_STACK) + insn_flags |= INSN_F_SRC_REG_STACK; + if (dst_reg->type == PTR_TO_STACK) + insn_flags |= INSN_F_DST_REG_STACK; } else { if (insn->src_reg != BPF_REG_0) { verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); @@ -16023,6 +16466,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, memset(src_reg, 0, sizeof(*src_reg)); src_reg->type = SCALAR_VALUE; __mark_reg_known(src_reg, insn->imm); + + if (dst_reg->type == PTR_TO_STACK) + insn_flags |= INSN_F_DST_REG_STACK; + } + + if (insn_flags) { + err = push_insn_history(env, this_branch, insn_flags, 0); + if (err) + return err; } is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; @@ -16384,13 +16836,14 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char const char *exit_ctx = "At program exit"; struct tnum enforce_attach_type_range = tnum_unknown; const struct bpf_prog *prog = env->prog; - struct bpf_reg_state *reg; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_retval_range range = retval_range(0, 1); enum bpf_prog_type prog_type = resolve_prog_type(env->prog); int err; struct bpf_func_state *frame = env->cur_state->frame[0]; const bool is_subprog = frame->subprogno; bool return_32bit = false; + const struct btf_type *reg_type, *ret_type = NULL; /* LSM and struct_ops func-ptr's return type could be "void" */ if (!is_subprog || frame->in_exception_callback_fn) { @@ -16399,10 +16852,26 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char if (prog->expected_attach_type == BPF_LSM_CGROUP) /* See below, can be 0 or 0-1 depending on hook. */ break; - fallthrough; + if (!prog->aux->attach_func_proto->type) + return 0; + break; case BPF_PROG_TYPE_STRUCT_OPS: if (!prog->aux->attach_func_proto->type) return 0; + + if (frame->in_exception_callback_fn) + break; + + /* Allow a struct_ops program to return a referenced kptr if it + * matches the operator's return type and is in its unmodified + * form. A scalar zero (i.e., a null pointer) is also allowed. + */ + reg_type = reg->btf ? btf_type_by_id(reg->btf, reg->btf_id) : NULL; + ret_type = btf_type_resolve_ptr(prog->aux->attach_btf, + prog->aux->attach_func_proto->type, + NULL); + if (ret_type && ret_type == reg_type && reg->ref_obj_id) + return __check_ptr_off_reg(env, reg, regno, false); break; default: break; @@ -16424,8 +16893,6 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char return -EACCES; } - reg = cur_regs(env) + regno; - if (frame->in_async_callback_fn) { /* enforce return zero from async callbacks like timer */ exit_ctx = "At async callback return"; @@ -16524,6 +16991,11 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char case BPF_PROG_TYPE_NETFILTER: range = retval_range(NF_DROP, NF_ACCEPT); break; + case BPF_PROG_TYPE_STRUCT_OPS: + if (!ret_type) + return 0; + range = retval_range(0, 0); + break; case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value * depends on the to-be-replaced kernel func or bpf program. @@ -16567,6 +17039,14 @@ static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off) subprog->changes_pkt_data = true; } +static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off) +{ + struct bpf_subprog_info *subprog; + + subprog = find_containing_subprog(env, off); + subprog->might_sleep = true; +} + /* 't' is an index of a call-site. * 'w' is a callee entry point. * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED. @@ -16580,6 +17060,7 @@ static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) caller = find_containing_subprog(env, t); callee = find_containing_subprog(env, w); caller->changes_pkt_data |= callee->changes_pkt_data; + caller->might_sleep |= callee->might_sleep; } /* non-recursive DFS pseudo code @@ -16738,27 +17219,6 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, /* Bitmask with 1s for all caller saved registers */ #define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1) -/* Return a bitmask specifying which caller saved registers are - * clobbered by a call to a helper *as if* this helper follows - * bpf_fastcall contract: - * - includes R0 if function is non-void; - * - includes R1-R5 if corresponding parameter has is described - * in the function prototype. - */ -static u32 helper_fastcall_clobber_mask(const struct bpf_func_proto *fn) -{ - u32 mask; - int i; - - mask = 0; - if (fn->ret_type != RET_VOID) - mask |= BIT(BPF_REG_0); - for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i) - if (fn->arg_type[i] != ARG_DONTCARE) - mask |= BIT(BPF_REG_1 + i); - return mask; -} - /* True if do_misc_fixups() replaces calls to helper number 'imm', * replacement patch is presumed to follow bpf_fastcall contract * (see mark_fastcall_pattern_for_call() below). @@ -16775,24 +17235,54 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) } } -/* Same as helper_fastcall_clobber_mask() but for kfuncs, see comment above */ -static u32 kfunc_fastcall_clobber_mask(struct bpf_kfunc_call_arg_meta *meta) +struct call_summary { + u8 num_params; + bool is_void; + bool fastcall; +}; + +/* If @call is a kfunc or helper call, fills @cs and returns true, + * otherwise returns false. + */ +static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call, + struct call_summary *cs) { - u32 vlen, i, mask; + struct bpf_kfunc_call_arg_meta meta; + const struct bpf_func_proto *fn; + int i; - vlen = btf_type_vlen(meta->func_proto); - mask = 0; - if (!btf_type_is_void(btf_type_by_id(meta->btf, meta->func_proto->type))) - mask |= BIT(BPF_REG_0); - for (i = 0; i < vlen; ++i) - mask |= BIT(BPF_REG_1 + i); - return mask; -} + if (bpf_helper_call(call)) { -/* Same as verifier_inlines_helper_call() but for kfuncs, see comment above */ -static bool is_fastcall_kfunc_call(struct bpf_kfunc_call_arg_meta *meta) -{ - return meta->kfunc_flags & KF_FASTCALL; + if (get_helper_proto(env, call->imm, &fn) < 0) + /* error would be reported later */ + return false; + cs->fastcall = fn->allow_fastcall && + (verifier_inlines_helper_call(env, call->imm) || + bpf_jit_inlines_helper_call(call->imm)); + cs->is_void = fn->ret_type == RET_VOID; + cs->num_params = 0; + for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i) { + if (fn->arg_type[i] == ARG_DONTCARE) + break; + cs->num_params++; + } + return true; + } + + if (bpf_pseudo_kfunc_call(call)) { + int err; + + err = fetch_kfunc_meta(env, call, &meta, NULL); + if (err < 0) + /* error would be reported later */ + return false; + cs->num_params = btf_type_vlen(meta.func_proto); + cs->fastcall = meta.kfunc_flags & KF_FASTCALL; + cs->is_void = btf_type_is_void(btf_type_by_id(meta.btf, meta.func_proto->type)); + return true; + } + + return false; } /* LLVM define a bpf_fastcall function attribute. @@ -16875,39 +17365,23 @@ static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env, { struct bpf_insn *insns = env->prog->insnsi, *stx, *ldx; struct bpf_insn *call = &env->prog->insnsi[insn_idx]; - const struct bpf_func_proto *fn; - u32 clobbered_regs_mask = ALL_CALLER_SAVED_REGS; + u32 clobbered_regs_mask; + struct call_summary cs; u32 expected_regs_mask; - bool can_be_inlined = false; s16 off; int i; - if (bpf_helper_call(call)) { - if (get_helper_proto(env, call->imm, &fn) < 0) - /* error would be reported later */ - return; - clobbered_regs_mask = helper_fastcall_clobber_mask(fn); - can_be_inlined = fn->allow_fastcall && - (verifier_inlines_helper_call(env, call->imm) || - bpf_jit_inlines_helper_call(call->imm)); - } - - if (bpf_pseudo_kfunc_call(call)) { - struct bpf_kfunc_call_arg_meta meta; - int err; - - err = fetch_kfunc_meta(env, call, &meta, NULL); - if (err < 0) - /* error would be reported later */ - return; - - clobbered_regs_mask = kfunc_fastcall_clobber_mask(&meta); - can_be_inlined = is_fastcall_kfunc_call(&meta); - } - - if (clobbered_regs_mask == ALL_CALLER_SAVED_REGS) + if (!get_call_summary(env, call, &cs)) return; + /* A bitmask specifying which caller saved registers are clobbered + * by a call to a helper/kfunc *as if* this helper/kfunc follows + * bpf_fastcall contract: + * - includes R0 if function is non-void; + * - includes R1-R5 if corresponding parameter has is described + * in the function prototype. + */ + clobbered_regs_mask = GENMASK(cs.num_params, cs.is_void ? 1 : 0); /* e.g. if helper call clobbers r{0,1}, expect r{2,3,4,5} in the pattern */ expected_regs_mask = ~clobbered_regs_mask & ALL_CALLER_SAVED_REGS; @@ -16965,7 +17439,7 @@ static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env, * don't set 'fastcall_spills_num' for call B so that remove_fastcall_spills_fills() * does not remove spill/fill pair {4,6}. */ - if (can_be_inlined) + if (cs.fastcall) env->insn_aux_data[insn_idx].fastcall_spills_num = i - 1; else subprog->keep_fastcall_stack = 1; @@ -17047,9 +17521,20 @@ static int visit_insn(int t, struct bpf_verifier_env *env) mark_prune_point(env, t); mark_jmp_point(env, t); } - if (bpf_helper_call(insn) && bpf_helper_changes_pkt_data(insn->imm)) - mark_subprog_changes_pkt_data(env, t); - if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + if (bpf_helper_call(insn)) { + const struct bpf_func_proto *fp; + + ret = get_helper_proto(env, insn->imm, &fp); + /* If called in a non-sleepable context program will be + * rejected anyway, so we should end up with precise + * sleepable marks on subprogs, except for dead code + * elimination. + */ + if (ret == 0 && fp->might_sleep) + mark_subprog_might_sleep(env, t); + if (bpf_helper_changes_pkt_data(insn->imm)) + mark_subprog_changes_pkt_data(env, t); + } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { struct bpf_kfunc_call_arg_meta meta; ret = fetch_kfunc_meta(env, insn, &meta, NULL); @@ -17068,6 +17553,13 @@ static int visit_insn(int t, struct bpf_verifier_env *env) */ mark_force_checkpoint(env, t); } + /* Same as helpers, if called in a non-sleepable context + * program will be rejected anyway, so we should end up + * with precise sleepable marks on subprogs, except for + * dead code elimination. + */ + if (ret == 0 && is_kfunc_sleepable(&meta)) + mark_subprog_might_sleep(env, t); } return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); @@ -17110,9 +17602,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) static int check_cfg(struct bpf_verifier_env *env) { int insn_cnt = env->prog->len; - int *insn_stack, *insn_state; + int *insn_stack, *insn_state, *insn_postorder; int ex_insn_beg, i, ret = 0; - bool ex_done = false; insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) @@ -17124,6 +17615,17 @@ static int check_cfg(struct bpf_verifier_env *env) return -ENOMEM; } + insn_postorder = env->cfg.insn_postorder = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + if (!insn_postorder) { + kvfree(insn_state); + kvfree(insn_stack); + return -ENOMEM; + } + + ex_insn_beg = env->exception_callback_subprog + ? env->subprog_info[env->exception_callback_subprog].start + : 0; + insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */ insn_stack[0] = 0; /* 0 is the first instruction */ env->cfg.cur_stack = 1; @@ -17137,6 +17639,7 @@ walk_cfg: case DONE_EXPLORING: insn_state[t] = EXPLORED; env->cfg.cur_stack--; + insn_postorder[env->cfg.cur_postorder++] = t; break; case KEEP_EXPLORING: break; @@ -17155,13 +17658,10 @@ walk_cfg: goto err_free; } - if (env->exception_callback_subprog && !ex_done) { - ex_insn_beg = env->subprog_info[env->exception_callback_subprog].start; - + if (ex_insn_beg && insn_state[ex_insn_beg] != EXPLORED) { insn_state[ex_insn_beg] = DISCOVERED; insn_stack[0] = ex_insn_beg; env->cfg.cur_stack = 1; - ex_done = true; goto walk_cfg; } @@ -17184,6 +17684,7 @@ walk_cfg: } ret = 0; /* cfg looks good */ env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data; + env->prog->aux->might_sleep = env->subprog_info[0].might_sleep; err_free: kvfree(insn_state); @@ -17800,18 +18301,22 @@ static void clean_verifier_state(struct bpf_verifier_env *env, static void clean_live_states(struct bpf_verifier_env *env, int insn, struct bpf_verifier_state *cur) { + struct bpf_verifier_state *loop_entry; struct bpf_verifier_state_list *sl; + struct list_head *pos, *head; - sl = *explored_state(env, insn); - while (sl) { + head = explored_state(env, insn); + list_for_each(pos, head) { + sl = container_of(pos, struct bpf_verifier_state_list, node); if (sl->state.branches) - goto next; + continue; + loop_entry = get_loop_entry(env, &sl->state); + if (!IS_ERR_OR_NULL(loop_entry) && loop_entry->branches) + continue; if (sl->state.insn_idx != insn || !same_callsites(&sl->state, cur)) - goto next; + continue; clean_verifier_state(env, &sl->state); -next: - sl = sl->next; } } @@ -18112,7 +18617,8 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, case STACK_IRQ_FLAG: old_reg = &old->stack[spi].spilled_ptr; cur_reg = &cur->stack[spi].spilled_ptr; - if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) + if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) || + old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class) return false; break; case STACK_MISC: @@ -18147,6 +18653,10 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap)) return false; + if (!check_ids(old->active_lock_id, cur->active_lock_id, idmap) || + old->active_lock_ptr != cur->active_lock_ptr) + return false; + for (i = 0; i < old->acquired_refs; i++) { if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) || old->refs[i].type != cur->refs[i].type) @@ -18156,6 +18666,8 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c case REF_TYPE_IRQ: break; case REF_TYPE_LOCK: + case REF_TYPE_RES_LOCK: + case REF_TYPE_RES_LOCK_IRQ: if (old->refs[i].ptr != cur->refs[i].ptr) return false; break; @@ -18195,15 +18707,17 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c * the current state will reach 'bpf_exit' instruction safely */ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old, - struct bpf_func_state *cur, enum exact_level exact) + struct bpf_func_state *cur, u32 insn_idx, enum exact_level exact) { - int i; + u16 live_regs = env->insn_aux_data[insn_idx].live_regs_before; + u16 i; if (old->callback_depth > cur->callback_depth) return false; for (i = 0; i < MAX_BPF_REG; i++) - if (!regsafe(env, &old->regs[i], &cur->regs[i], + if (((1 << i) & live_regs) && + !regsafe(env, &old->regs[i], &cur->regs[i], &env->idmap_scratch, exact)) return false; @@ -18224,6 +18738,7 @@ static bool states_equal(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, enum exact_level exact) { + u32 insn_idx; int i; if (old->curframe != cur->curframe) @@ -18247,9 +18762,12 @@ static bool states_equal(struct bpf_verifier_env *env, * and all frame states need to be equivalent */ for (i = 0; i <= old->curframe; i++) { + insn_idx = i == old->curframe + ? env->insn_idx + : old->frame[i + 1]->callsite; if (old->frame[i]->callsite != cur->frame[i]->callsite) return false; - if (!func_states_equal(env, old->frame[i], cur->frame[i], exact)) + if (!func_states_equal(env, old->frame[i], cur->frame[i], insn_idx, exact)) return false; } return true; @@ -18502,10 +19020,11 @@ static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; - struct bpf_verifier_state_list *sl, **pprev; + struct bpf_verifier_state_list *sl; struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry; int i, j, n, err, states_cnt = 0; bool force_new_state, add_new_state, force_exact; + struct list_head *pos, *tmp, *head; force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || /* Avoid accumulating infinitely long jmp history */ @@ -18524,15 +19043,14 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) env->insn_processed - env->prev_insn_processed >= 8) add_new_state = true; - pprev = explored_state(env, insn_idx); - sl = *pprev; - clean_live_states(env, insn_idx, cur); - while (sl) { + head = explored_state(env, insn_idx); + list_for_each_safe(pos, tmp, head) { + sl = container_of(pos, struct bpf_verifier_state_list, node); states_cnt++; if (sl->state.insn_idx != insn_idx) - goto next; + continue; if (sl->state.branches) { struct bpf_func_state *frame = sl->state.frame[sl->state.curframe]; @@ -18606,7 +19124,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) spi = __get_spi(iter_reg->off + iter_reg->var_off.value); iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr; if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) { - update_loop_entry(cur, &sl->state); + update_loop_entry(env, cur, &sl->state); goto hit; } } @@ -18615,7 +19133,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) if (is_may_goto_insn_at(env, insn_idx)) { if (sl->state.may_goto_depth != cur->may_goto_depth && states_equal(env, &sl->state, cur, RANGE_WITHIN)) { - update_loop_entry(cur, &sl->state); + update_loop_entry(env, cur, &sl->state); goto hit; } } @@ -18682,11 +19200,13 @@ skip_inf_loop_check: * * Additional details are in the comment before get_loop_entry(). */ - loop_entry = get_loop_entry(&sl->state); + loop_entry = get_loop_entry(env, &sl->state); + if (IS_ERR(loop_entry)) + return PTR_ERR(loop_entry); force_exact = loop_entry && loop_entry->branches > 0; if (states_equal(env, &sl->state, cur, force_exact ? RANGE_WITHIN : NOT_EXACT)) { if (force_exact) - update_loop_entry(cur, loop_entry); + update_loop_entry(env, cur, loop_entry); hit: sl->hit_cnt++; /* reached equivalent register/stack state, @@ -18735,31 +19255,13 @@ miss: /* the state is unlikely to be useful. Remove it to * speed up verification */ - *pprev = sl->next; - if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE && - !sl->state.used_as_loop_entry) { - u32 br = sl->state.branches; - - WARN_ONCE(br, - "BUG live_done but branches_to_explore %d\n", - br); - free_verifier_state(&sl->state, false); - kfree(sl); - env->peak_states--; - } else { - /* cannot free this state, since parentage chain may - * walk it later. Add it for free_list instead to - * be freed at the end of verification - */ - sl->next = env->free_list; - env->free_list = sl; - } - sl = *pprev; - continue; + sl->in_free_list = true; + list_del(&sl->node); + list_add(&sl->node, &env->free_list); + env->free_list_size++; + env->explored_states_size--; + maybe_free_verifier_state(env, sl); } -next: - pprev = &sl->next; - sl = *pprev; } if (env->max_states_per_insn < states_cnt) @@ -18784,7 +19286,8 @@ next: if (!new_sl) return -ENOMEM; env->total_states++; - env->peak_states++; + env->explored_states_size++; + update_peak_states(env); env->prev_jmps_processed = env->jmps_processed; env->prev_insn_processed = env->insn_processed; @@ -18808,8 +19311,8 @@ next: cur->first_insn_idx = insn_idx; cur->insn_hist_start = cur->insn_hist_end; cur->dfs_depth = new->dfs_depth + 1; - new_sl->next = *explored_state(env, insn_idx); - *explored_state(env, insn_idx) = new_sl; + list_add(&new_sl->node, head); + /* connect new state to parentage chain. Current frame needs all * registers connected. Only r6 - r9 of the callers are alive (pushed * to the stack implicitly by JITs) so in callers' frames connect just @@ -18996,19 +19499,13 @@ static int do_check(struct bpf_verifier_env *env) } if (env->log.level & BPF_LOG_LEVEL) { - const struct bpf_insn_cbs cbs = { - .cb_call = disasm_kfunc_name, - .cb_print = verbose, - .private_data = env, - }; - if (verifier_state_scratched(env)) print_insn_state(env, state, state->curframe); verbose_linfo(env, env->insn_idx, "; "); env->prev_log_pos = env->log.end_pos; verbose(env, "%d: ", env->insn_idx); - print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); + verbose_insn(env, insn); env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos; env->prev_log_pos = env->log.end_pos; } @@ -19030,37 +19527,18 @@ static int do_check(struct bpf_verifier_env *env) return err; } else if (class == BPF_LDX) { - enum bpf_reg_type src_reg_type; + bool is_ldsx = BPF_MODE(insn->code) == BPF_MEMSX; - /* check for reserved fields is already done */ - - /* check src operand */ - err = check_reg_arg(env, insn->src_reg, SRC_OP); - if (err) - return err; - - err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); - if (err) - return err; - - src_reg_type = regs[insn->src_reg].type; - - /* check that memory (src_reg + off) is readable, - * the state of dst_reg will be updated by this func + /* Check for reserved fields is already done in + * resolve_pseudo_ldimm64(). */ - err = check_mem_access(env, env->insn_idx, insn->src_reg, - insn->off, BPF_SIZE(insn->code), - BPF_READ, insn->dst_reg, false, - BPF_MODE(insn->code) == BPF_MEMSX); - err = err ?: save_aux_ptr_type(env, src_reg_type, true); - err = err ?: reg_bounds_sanity_check(env, ®s[insn->dst_reg], "ldx"); + err = check_load_mem(env, insn, false, is_ldsx, true, + "ldx"); if (err) return err; } else if (class == BPF_STX) { - enum bpf_reg_type dst_reg_type; - if (BPF_MODE(insn->code) == BPF_ATOMIC) { - err = check_atomic(env, env->insn_idx, insn); + err = check_atomic(env, insn); if (err) return err; env->insn_idx++; @@ -19072,25 +19550,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - /* check src1 operand */ - err = check_reg_arg(env, insn->src_reg, SRC_OP); - if (err) - return err; - /* check src2 operand */ - err = check_reg_arg(env, insn->dst_reg, SRC_OP); - if (err) - return err; - - dst_reg_type = regs[insn->dst_reg].type; - - /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, env->insn_idx, insn->dst_reg, - insn->off, BPF_SIZE(insn->code), - BPF_WRITE, insn->src_reg, false, false); - if (err) - return err; - - err = save_aux_ptr_type(env, dst_reg_type, false); + err = check_store_reg(env, insn, false); if (err) return err; } else if (class == BPF_ST) { @@ -19230,6 +19690,9 @@ process_bpf_exit: return err; break; } else { + if (verifier_bug_if(env->cur_state->loop_entry, env, + "broken loop detection")) + return -EFAULT; do_print_state = true; continue; } @@ -19489,7 +19952,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } - if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) { + if (btf_record_has_field(map->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) { if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); return -EINVAL; @@ -20287,10 +20750,9 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, if (bpf_pseudo_kfunc_call(&insn)) continue; - if (WARN_ON(load_reg == -1)) { - verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n"); + if (verifier_bug_if(load_reg == -1, env, + "zext_dst is set, but no reg is defined")) return -EFAULT; - } zext_patch[0] = insn; zext_patch[1].dst_reg = load_reg; @@ -20319,7 +20781,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprogs = env->subprog_info; const struct bpf_verifier_ops *ops = env->ops; - int i, cnt, size, ctx_field_size, delta = 0, epilogue_cnt = 0; + int i, cnt, size, ctx_field_size, ret, delta = 0, epilogue_cnt = 0; const int insn_cnt = env->prog->len; struct bpf_insn *epilogue_buf = env->epilogue_buf; struct bpf_insn *insn_buf = env->insn_buf; @@ -20348,6 +20810,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) return -ENOMEM; env->prog = new_prog; delta += cnt - 1; + + ret = add_kfunc_in_insns(env, epilogue_buf, epilogue_cnt - 1); + if (ret < 0) + return ret; } } @@ -20368,6 +20834,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) env->prog = new_prog; delta += cnt - 1; + + ret = add_kfunc_in_insns(env, insn_buf, cnt - 1); + if (ret < 0) + return ret; } } @@ -20400,7 +20870,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->code == (BPF_ST | BPF_MEM | BPF_W) || insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { type = BPF_WRITE; - } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) || + } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_B) || + insn->code == (BPF_STX | BPF_ATOMIC | BPF_H) || + insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) || insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) && env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) { insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code); @@ -20597,11 +21069,9 @@ static int jit_subprogs(struct bpf_verifier_env *env) * propagated in any case. */ subprog = find_subprog(env, i + insn->imm + 1); - if (subprog < 0) { - WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", - i + insn->imm + 1); + if (verifier_bug_if(subprog < 0, env, "No program to jit at insn %d", + i + insn->imm + 1)) return -EFAULT; - } /* temporarily remember subprog id inside insn instead of * aux_data, since next loop will split up all insns into funcs */ @@ -20708,6 +21178,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb; func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data; + func[i]->aux->might_sleep = env->subprog_info[i].might_sleep; if (!i) func[i]->aux->exception_boundary = env->seen_exception; func[i] = bpf_int_jit_compile(func[i]); @@ -20924,6 +21395,14 @@ static void specialize_kfunc(struct bpf_verifier_env *env, */ env->seen_direct_write = seen_direct_write; } + + if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr] && + bpf_lsm_has_d_inode_locked(prog)) + *addr = (unsigned long)bpf_set_dentry_xattr_locked; + + if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr] && + bpf_lsm_has_d_inode_locked(prog)) + *addr = (unsigned long)bpf_remove_dentry_xattr_locked; } static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux, @@ -21035,13 +21514,17 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); *cnt = 1; - } else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) { - struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(BPF_REG_4, (long)env->prog->aux) }; + } - insn_buf[0] = ld_addrs[0]; - insn_buf[1] = ld_addrs[1]; - insn_buf[2] = *insn; - *cnt = 3; + if (env->insn_aux_data[insn_idx].arg_prog) { + u32 regno = env->insn_aux_data[insn_idx].arg_prog; + struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(regno, (long)env->prog->aux) }; + int idx = *cnt; + + insn_buf[idx++] = ld_addrs[0]; + insn_buf[idx++] = ld_addrs[1]; + insn_buf[idx++] = *insn; + *cnt = idx; } return 0; } @@ -21358,7 +21841,50 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto next_insn; } - if (is_may_goto_insn(insn)) { + if (is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) { + int stack_off_cnt = -stack_depth - 16; + + /* + * Two 8 byte slots, depth-16 stores the count, and + * depth-8 stores the start timestamp of the loop. + * + * The starting value of count is BPF_MAX_TIMED_LOOPS + * (0xffff). Every iteration loads it and subs it by 1, + * until the value becomes 0 in AX (thus, 1 in stack), + * after which we call arch_bpf_timed_may_goto, which + * either sets AX to 0xffff to keep looping, or to 0 + * upon timeout. AX is then stored into the stack. In + * the next iteration, we either see 0 and break out, or + * continue iterating until the next time value is 0 + * after subtraction, rinse and repeat. + */ + stack_depth_extra = 16; + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt); + if (insn->off >= 0) + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5); + else + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1); + insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1); + insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2); + /* + * AX is used as an argument to pass in stack_off_cnt + * (to add to r10/fp), and also as the return value of + * the call to arch_bpf_timed_may_goto. + */ + insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt); + insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto); + insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt); + cnt = 7; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } else if (is_may_goto_insn(insn)) { int stack_off = -stack_depth - 8; stack_depth_extra = 8; @@ -21687,12 +22213,12 @@ patch_map_ops_generic: if (insn->imm == BPF_FUNC_get_smp_processor_id && verifier_inlines_helper_call(env, insn->imm)) { /* BPF_FUNC_get_smp_processor_id inlining is an - * optimization, so if pcpu_hot.cpu_number is ever + * optimization, so if cpu_number is ever * changed in some incompatible and hard to support * way, it's fine to back out this inlining logic */ #ifdef CONFIG_SMP - insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number); + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&cpu_number); insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0); cnt = 3; @@ -21882,6 +22408,13 @@ next_insn: if (subprogs[cur_subprog + 1].start == i + delta + 1) { subprogs[cur_subprog].stack_depth += stack_depth_extra; subprogs[cur_subprog].stack_extra = stack_depth_extra; + + stack_depth = subprogs[cur_subprog].stack_depth; + if (stack_depth > MAX_BPF_STACK && !prog->jit_requested) { + verbose(env, "stack size %d(extra %d) is too large\n", + stack_depth, stack_depth_extra); + return -EINVAL; + } cur_subprog++; stack_depth = subprogs[cur_subprog].stack_depth; stack_depth_extra = 0; @@ -21892,23 +22425,33 @@ next_insn: env->prog->aux->stack_depth = subprogs[0].stack_depth; for (i = 0; i < env->subprog_cnt; i++) { + int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1; int subprog_start = subprogs[i].start; int stack_slots = subprogs[i].stack_extra / 8; + int slots = delta, cnt = 0; if (!stack_slots) continue; - if (stack_slots > 1) { - verbose(env, "verifier bug: stack_slots supports may_goto only\n"); + /* We need two slots in case timed may_goto is supported. */ + if (stack_slots > slots) { + verifier_bug(env, "stack_slots supports may_goto only"); return -EFAULT; } - /* Add ST insn to subprog prologue to init extra stack */ - insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, - -subprogs[i].stack_depth, BPF_MAX_LOOPS); + stack_depth = subprogs[i].stack_depth; + if (bpf_jit_supports_timed_may_goto()) { + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth, + BPF_MAX_TIMED_LOOPS); + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth + 8, 0); + } else { + /* Add ST insn to subprog prologue to init extra stack */ + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth, + BPF_MAX_LOOPS); + } /* Copy first actual insn to preserve it */ - insn_buf[1] = env->prog->insnsi[subprog_start]; + insn_buf[cnt++] = env->prog->insnsi[subprog_start]; - new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2); + new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt); if (!new_prog) return -ENOMEM; env->prog = prog = new_prog; @@ -21918,7 +22461,7 @@ next_insn: * to insn after BPF_ST that inits may_goto count. * Adjustment will succeed because bpf_patch_insn_data() didn't fail. */ - WARN_ON(adjust_jmp_off(env->prog, subprog_start, 1)); + WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta)); } /* Since poke tab is now finalized, publish aux to tracker. */ @@ -22116,31 +22659,29 @@ static int remove_fastcall_spills_fills(struct bpf_verifier_env *env) static void free_states(struct bpf_verifier_env *env) { - struct bpf_verifier_state_list *sl, *sln; + struct bpf_verifier_state_list *sl; + struct list_head *head, *pos, *tmp; int i; - sl = env->free_list; - while (sl) { - sln = sl->next; + list_for_each_safe(pos, tmp, &env->free_list) { + sl = container_of(pos, struct bpf_verifier_state_list, node); free_verifier_state(&sl->state, false); kfree(sl); - sl = sln; } - env->free_list = NULL; + INIT_LIST_HEAD(&env->free_list); if (!env->explored_states) return; for (i = 0; i < state_htab_size(env); i++) { - sl = env->explored_states[i]; + head = &env->explored_states[i]; - while (sl) { - sln = sl->next; + list_for_each_safe(pos, tmp, head) { + sl = container_of(pos, struct bpf_verifier_state_list, node); free_verifier_state(&sl->state, false); kfree(sl); - sl = sln; } - env->explored_states[i] = NULL; + INIT_LIST_HEAD(&env->explored_states[i]); } } @@ -22148,6 +22689,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) { bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_subprog_info *sub = subprog_info(env, subprog); + struct bpf_prog_aux *aux = env->prog->aux; struct bpf_verifier_state *state; struct bpf_reg_state *regs; int ret, i; @@ -22255,6 +22797,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) mark_reg_known_zero(env, regs, BPF_REG_1); } + /* Acquire references for struct_ops program arguments tagged with "__ref" */ + if (!subprog && env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) { + for (i = 0; i < aux->ctx_arg_info_size; i++) + aux->ctx_arg_info[i].ref_obj_id = aux->ctx_arg_info[i].refcounted ? + acquire_reference(env, 0) : 0; + } + ret = do_check(env); out: /* check for NULL is necessary, since cur_state can be freed inside @@ -22377,6 +22926,15 @@ static void print_verification_stats(struct bpf_verifier_env *env) env->peak_states, env->longest_mark_read_walk); } +int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, + const struct bpf_ctx_arg_aux *info, u32 cnt) +{ + prog->aux->ctx_arg_info = kmemdup_array(info, cnt, sizeof(*info), GFP_KERNEL); + prog->aux->ctx_arg_info_size = cnt; + + return prog->aux->ctx_arg_info ? 0 : -ENOMEM; +} + static int check_struct_ops_btf_id(struct bpf_verifier_env *env) { const struct btf_type *t, *func_proto; @@ -22384,10 +22942,11 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) const struct bpf_struct_ops *st_ops; const struct btf_member *member; struct bpf_prog *prog = env->prog; - u32 btf_id, member_idx; + bool has_refcounted_arg = false; + u32 btf_id, member_idx, member_off; struct btf *btf; const char *mname; - int err; + int i, err; if (!prog->gpl_compatible) { verbose(env, "struct ops programs must have a GPL compatible license\n"); @@ -22435,7 +22994,8 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) return -EINVAL; } - err = bpf_struct_ops_supported(st_ops, __btf_member_bit_offset(t, member) / 8); + member_off = __btf_member_bit_offset(t, member) / 8; + err = bpf_struct_ops_supported(st_ops, member_off); if (err) { verbose(env, "attach to unsupported member %s of struct %s\n", mname, st_ops->name); @@ -22457,17 +23017,32 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) return -EACCES; } - /* btf_ctx_access() used this to provide argument type info */ - prog->aux->ctx_arg_info = - st_ops_desc->arg_info[member_idx].info; - prog->aux->ctx_arg_info_size = - st_ops_desc->arg_info[member_idx].cnt; + for (i = 0; i < st_ops_desc->arg_info[member_idx].cnt; i++) { + if (st_ops_desc->arg_info[member_idx].info->refcounted) { + has_refcounted_arg = true; + break; + } + } + + /* Tail call is not allowed for programs with refcounted arguments since we + * cannot guarantee that valid refcounted kptrs will be passed to the callee. + */ + for (i = 0; i < env->subprog_cnt; i++) { + if (has_refcounted_arg && env->subprog_info[i].has_tail_call) { + verbose(env, "program with __ref argument cannot tail call\n"); + return -EINVAL; + } + } + + prog->aux->st_ops = st_ops; + prog->aux->attach_st_ops_member_off = member_off; prog->aux->attach_func_proto = func_proto; prog->aux->attach_func_name = mname; env->ops = st_ops->verifier_ops; - return 0; + return bpf_prog_ctx_arg_info_init(prog, st_ops_desc->arg_info[member_idx].info, + st_ops_desc->arg_info[member_idx].cnt); } #define SECURITY_PREFIX "security_" @@ -22543,6 +23118,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (tgt_prog) { struct bpf_prog_aux *aux = tgt_prog->aux; bool tgt_changes_pkt_data; + bool tgt_might_sleep; if (bpf_prog_is_dev_bound(prog->aux) && !bpf_prog_dev_bound_match(prog, tgt_prog)) { @@ -22585,6 +23161,15 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, "Extension program changes packet data, while original does not\n"); return -EINVAL; } + + tgt_might_sleep = aux->func + ? aux->func[subprog]->aux->might_sleep + : aux->might_sleep; + if (prog->aux->might_sleep && !tgt_might_sleep) { + bpf_log(log, + "Extension program may sleep, while original does not\n"); + return -EINVAL; + } } if (!tgt_prog->jited) { bpf_log(log, "Can attach to only JITed progs\n"); @@ -22841,6 +23426,33 @@ BTF_ID(func, __rcu_read_unlock) #endif BTF_SET_END(btf_id_deny) +/* fexit and fmod_ret can't be used to attach to __noreturn functions. + * Currently, we must manually list all __noreturn functions here. Once a more + * robust solution is implemented, this workaround can be removed. + */ +BTF_SET_START(noreturn_deny) +#ifdef CONFIG_IA32_EMULATION +BTF_ID(func, __ia32_sys_exit) +BTF_ID(func, __ia32_sys_exit_group) +#endif +#ifdef CONFIG_KUNIT +BTF_ID(func, __kunit_abort) +BTF_ID(func, kunit_try_catch_throw) +#endif +#ifdef CONFIG_MODULES +BTF_ID(func, __module_put_and_kthread_exit) +#endif +#ifdef CONFIG_X86_64 +BTF_ID(func, __x64_sys_exit) +BTF_ID(func, __x64_sys_exit_group) +#endif +BTF_ID(func, do_exit) +BTF_ID(func, do_group_exit) +BTF_ID(func, kthread_complete_and_exit) +BTF_ID(func, kthread_exit) +BTF_ID(func, make_task_dead) +BTF_SET_END(noreturn_deny) + static bool can_be_sleepable(struct bpf_prog *prog) { if (prog->type == BPF_PROG_TYPE_TRACING) { @@ -22917,9 +23529,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_btf_trace = true; return 0; } else if (prog->expected_attach_type == BPF_TRACE_ITER) { - if (!bpf_iter_prog_supported(prog)) - return -EINVAL; - return 0; + return bpf_iter_prog_supported(prog); } if (prog->type == BPF_PROG_TYPE_LSM) { @@ -22929,6 +23539,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } else if (prog->type == BPF_PROG_TYPE_TRACING && btf_id_set_contains(&btf_id_deny, btf_id)) { return -EINVAL; + } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT || + prog->expected_attach_type == BPF_MODIFY_RETURN) && + btf_id_set_contains(&noreturn_deny, btf_id)) { + verbose(env, "Attaching fexit/fmod_ret to __noreturn functions is rejected.\n"); + return -EINVAL; } key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id); @@ -23021,6 +23636,302 @@ static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr, return 0; } +static bool can_fallthrough(struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + u8 opcode = BPF_OP(insn->code); + + if (class != BPF_JMP && class != BPF_JMP32) + return true; + + if (opcode == BPF_EXIT || opcode == BPF_JA) + return false; + + return true; +} + +static bool can_jump(struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + u8 opcode = BPF_OP(insn->code); + + if (class != BPF_JMP && class != BPF_JMP32) + return false; + + switch (opcode) { + case BPF_JA: + case BPF_JEQ: + case BPF_JNE: + case BPF_JLT: + case BPF_JLE: + case BPF_JGT: + case BPF_JGE: + case BPF_JSGT: + case BPF_JSGE: + case BPF_JSLT: + case BPF_JSLE: + case BPF_JCOND: + return true; + } + + return false; +} + +static int insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) +{ + struct bpf_insn *insn = &prog->insnsi[idx]; + int i = 0, insn_sz; + u32 dst; + + insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; + if (can_fallthrough(insn) && idx + 1 < prog->len) + succ[i++] = idx + insn_sz; + + if (can_jump(insn)) { + dst = idx + jmp_offset(insn) + 1; + if (i == 0 || succ[0] != dst) + succ[i++] = dst; + } + + return i; +} + +/* Each field is a register bitmask */ +struct insn_live_regs { + u16 use; /* registers read by instruction */ + u16 def; /* registers written by instruction */ + u16 in; /* registers that may be alive before instruction */ + u16 out; /* registers that may be alive after instruction */ +}; + +/* Bitmask with 1s for all caller saved registers */ +#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1) + +/* Compute info->{use,def} fields for the instruction */ +static void compute_insn_live_regs(struct bpf_verifier_env *env, + struct bpf_insn *insn, + struct insn_live_regs *info) +{ + struct call_summary cs; + u8 class = BPF_CLASS(insn->code); + u8 code = BPF_OP(insn->code); + u8 mode = BPF_MODE(insn->code); + u16 src = BIT(insn->src_reg); + u16 dst = BIT(insn->dst_reg); + u16 r0 = BIT(0); + u16 def = 0; + u16 use = 0xffff; + + switch (class) { + case BPF_LD: + switch (mode) { + case BPF_IMM: + if (BPF_SIZE(insn->code) == BPF_DW) { + def = dst; + use = 0; + } + break; + case BPF_LD | BPF_ABS: + case BPF_LD | BPF_IND: + /* stick with defaults */ + break; + } + break; + case BPF_LDX: + switch (mode) { + case BPF_MEM: + case BPF_MEMSX: + def = dst; + use = src; + break; + } + break; + case BPF_ST: + switch (mode) { + case BPF_MEM: + def = 0; + use = dst; + break; + } + break; + case BPF_STX: + switch (mode) { + case BPF_MEM: + def = 0; + use = dst | src; + break; + case BPF_ATOMIC: + switch (insn->imm) { + case BPF_CMPXCHG: + use = r0 | dst | src; + def = r0; + break; + case BPF_LOAD_ACQ: + def = dst; + use = src; + break; + case BPF_STORE_REL: + def = 0; + use = dst | src; + break; + default: + use = dst | src; + if (insn->imm & BPF_FETCH) + def = src; + else + def = 0; + } + break; + } + break; + case BPF_ALU: + case BPF_ALU64: + switch (code) { + case BPF_END: + use = dst; + def = dst; + break; + case BPF_MOV: + def = dst; + if (BPF_SRC(insn->code) == BPF_K) + use = 0; + else + use = src; + break; + default: + def = dst; + if (BPF_SRC(insn->code) == BPF_K) + use = dst; + else + use = dst | src; + } + break; + case BPF_JMP: + case BPF_JMP32: + switch (code) { + case BPF_JA: + case BPF_JCOND: + def = 0; + use = 0; + break; + case BPF_EXIT: + def = 0; + use = r0; + break; + case BPF_CALL: + def = ALL_CALLER_SAVED_REGS; + use = def & ~BIT(BPF_REG_0); + if (get_call_summary(env, insn, &cs)) + use = GENMASK(cs.num_params, 1); + break; + default: + def = 0; + if (BPF_SRC(insn->code) == BPF_K) + use = dst; + else + use = dst | src; + } + break; + } + + info->def = def; + info->use = use; +} + +/* Compute may-live registers after each instruction in the program. + * The register is live after the instruction I if it is read by some + * instruction S following I during program execution and is not + * overwritten between I and S. + * + * Store result in env->insn_aux_data[i].live_regs. + */ +static int compute_live_registers(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *insn_aux = env->insn_aux_data; + struct bpf_insn *insns = env->prog->insnsi; + struct insn_live_regs *state; + int insn_cnt = env->prog->len; + int err = 0, i, j; + bool changed; + + /* Use the following algorithm: + * - define the following: + * - I.use : a set of all registers read by instruction I; + * - I.def : a set of all registers written by instruction I; + * - I.in : a set of all registers that may be alive before I execution; + * - I.out : a set of all registers that may be alive after I execution; + * - insn_successors(I): a set of instructions S that might immediately + * follow I for some program execution; + * - associate separate empty sets 'I.in' and 'I.out' with each instruction; + * - visit each instruction in a postorder and update + * state[i].in, state[i].out as follows: + * + * state[i].out = U [state[s].in for S in insn_successors(i)] + * state[i].in = (state[i].out / state[i].def) U state[i].use + * + * (where U stands for set union, / stands for set difference) + * - repeat the computation while {in,out} fields changes for + * any instruction. + */ + state = kvcalloc(insn_cnt, sizeof(*state), GFP_KERNEL); + if (!state) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < insn_cnt; ++i) + compute_insn_live_regs(env, &insns[i], &state[i]); + + changed = true; + while (changed) { + changed = false; + for (i = 0; i < env->cfg.cur_postorder; ++i) { + int insn_idx = env->cfg.insn_postorder[i]; + struct insn_live_regs *live = &state[insn_idx]; + int succ_num; + u32 succ[2]; + u16 new_out = 0; + u16 new_in = 0; + + succ_num = insn_successors(env->prog, insn_idx, succ); + for (int s = 0; s < succ_num; ++s) + new_out |= state[succ[s]].in; + new_in = (new_out & ~live->def) | live->use; + if (new_out != live->out || new_in != live->in) { + live->in = new_in; + live->out = new_out; + changed = true; + } + } + } + + for (i = 0; i < insn_cnt; ++i) + insn_aux[i].live_regs_before = state[i].in; + + if (env->log.level & BPF_LOG_LEVEL2) { + verbose(env, "Live regs before insn:\n"); + for (i = 0; i < insn_cnt; ++i) { + verbose(env, "%3d: ", i); + for (j = BPF_REG_0; j < BPF_REG_10; ++j) + if (insn_aux[i].live_regs_before & BIT(j)) + verbose(env, "%d", j); + else + verbose(env, "."); + verbose(env, " "); + verbose_insn(env, &insns[i]); + if (bpf_is_ldimm64(&insns[i])) + i++; + } + } + +out: + kvfree(state); + kvfree(env->cfg.insn_postorder); + env->cfg.insn_postorder = NULL; + env->cfg.cur_postorder = 0; + return err; +} + int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { u64 start_time = ktime_get_ns(); @@ -23098,12 +24009,16 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS; env->explored_states = kvcalloc(state_htab_size(env), - sizeof(struct bpf_verifier_state_list *), + sizeof(struct list_head), GFP_USER); ret = -ENOMEM; if (!env->explored_states) goto skip_full_check; + for (i = 0; i < state_htab_size(env); i++) + INIT_LIST_HEAD(&env->explored_states[i]); + INIT_LIST_HEAD(&env->free_list); + ret = check_btf_info_early(env, attr, uattr); if (ret < 0) goto skip_full_check; @@ -23138,6 +24053,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret) goto skip_full_check; + ret = compute_live_registers(env); + if (ret < 0) + goto skip_full_check; + ret = mark_fastcall_patterns(env); if (ret < 0) goto skip_full_check; @@ -23276,6 +24195,7 @@ err_unlock: vfree(env->insn_aux_data); kvfree(env->insn_hist); err_free_env: + kvfree(env->cfg.insn_postorder); kvfree(env); return ret; } diff --git a/kernel/capability.c b/kernel/capability.c index e089d2628c29..829f49ae07b9 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -286,22 +286,6 @@ bool has_ns_capability(struct task_struct *t, } /** - * has_capability - Does a task have a capability in init_user_ns - * @t: The task in question - * @cap: The capability to be tested for - * - * Return true if the specified task has the given superior capability - * currently in effect to the initial user namespace, false if not. - * - * Note that this does not set PF_SUPERPRIV on the task. - */ -bool has_capability(struct task_struct *t, int cap) -{ - return has_ns_capability(t, &init_user_ns, cap); -} -EXPORT_SYMBOL(has_capability); - -/** * has_ns_capability_noaudit - Does a task have a capability (unaudited) * in a specific user ns. * @t: The task in question diff --git a/kernel/cfi.c b/kernel/cfi.c index 08caad776717..422fa4f958ae 100644 --- a/kernel/cfi.c +++ b/kernel/cfi.c @@ -7,6 +7,8 @@ #include <linux/cfi.h> +bool cfi_warn __ro_after_init = IS_ENABLED(CONFIG_CFI_PERMISSIVE); + enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, unsigned long *target, u32 type) { @@ -17,7 +19,7 @@ enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, pr_err("CFI failure at %pS (no target information)\n", (void *)addr); - if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) { + if (cfi_warn) { __warn(NULL, 0, (void *)addr, 0, regs, NULL); return BUG_TRAP_TYPE_WARN; } @@ -71,14 +73,11 @@ static bool is_module_cfi_trap(unsigned long addr) struct module *mod; bool found = false; - rcu_read_lock_sched_notrace(); - + guard(rcu)(); mod = __module_address(addr); if (mod) found = is_trap(addr, mod->kcfi_traps, mod->kcfi_traps_end); - rcu_read_unlock_sched_notrace(); - return found; } #else /* CONFIG_MODULES */ diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index c964dd7ff967..b14e61c64a34 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -168,6 +168,7 @@ struct cgroup_mgctx { extern struct cgroup_subsys *cgroup_subsys[]; extern struct list_head cgroup_roots; +extern bool cgrp_dfl_visible; /* iterate across the hierarchies */ #define for_each_root(root) \ @@ -269,9 +270,9 @@ int cgroup_task_count(const struct cgroup *cgrp); /* * rstat.c */ -int cgroup_rstat_init(struct cgroup *cgrp); -void cgroup_rstat_exit(struct cgroup *cgrp); -void cgroup_rstat_boot(void); +int css_rstat_init(struct cgroup_subsys_state *css); +void css_rstat_exit(struct cgroup_subsys_state *css); +int ss_rstat_init(struct cgroup_subsys *ss); void cgroup_base_stat_cputime_show(struct seq_file *seq); /* diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index e28d5f0d20ed..fa24c032ed6f 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -673,6 +673,7 @@ struct cftype cgroup1_base_files[] = { int proc_cgroupstats_show(struct seq_file *m, void *v) { struct cgroup_subsys *ss; + bool cgrp_v1_visible = false; int i; seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); @@ -684,12 +685,18 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) for_each_subsys(ss, i) { if (cgroup1_subsys_absent(ss)) continue; + cgrp_v1_visible |= ss->root != &cgrp_dfl_root; + seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, atomic_read(&ss->root->nr_cgrps), cgroup_ssid_enabled(i)); } + if (cgrp_dfl_visible && !cgrp_v1_visible) + pr_info_once("/proc/cgroups lists only v1 controllers, use cgroup.controllers of root cgroup for v2 info\n"); + + return 0; } @@ -844,7 +851,7 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent if (kernfs_type(kn) != KERNFS_DIR) return -ENOTDIR; - if (kn->parent != new_parent) + if (rcu_access_pointer(kn->__parent) != new_parent) return -EIO; /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index afc665b7b1fe..a723b7dc6e4e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -90,11 +90,14 @@ DEFINE_MUTEX(cgroup_mutex); DEFINE_SPINLOCK(css_set_lock); -#ifdef CONFIG_PROVE_RCU +#if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP) EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); #endif +struct blocking_notifier_head cgroup_lifetime_notifier = + BLOCKING_NOTIFIER_INIT(cgroup_lifetime_notifier); + DEFINE_SPINLOCK(trace_cgroup_path_lock); char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; static bool cgroup_debug __read_mostly; @@ -161,17 +164,21 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = { }; #undef SUBSYS -static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu); +static DEFINE_PER_CPU(struct css_rstat_cpu, root_rstat_cpu); +static DEFINE_PER_CPU(struct cgroup_rstat_base_cpu, root_rstat_base_cpu); /* the default hierarchy */ -struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu }; +struct cgroup_root cgrp_dfl_root = { + .cgrp.self.rstat_cpu = &root_rstat_cpu, + .cgrp.rstat_base_cpu = &root_rstat_base_cpu, +}; EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* * The default hierarchy always exists but is hidden until mounted for the * first time. This is for backward compatibility. */ -static bool cgrp_dfl_visible; +bool cgrp_dfl_visible; /* some controllers are not supported in the default hierarchy */ static u16 cgrp_dfl_inhibit_ss_mask; @@ -633,9 +640,22 @@ int cgroup_task_count(const struct cgroup *cgrp) return count; } +static struct cgroup *kn_priv(struct kernfs_node *kn) +{ + struct kernfs_node *parent; + /* + * The parent can not be replaced due to KERNFS_ROOT_INVARIANT_PARENT. + * Therefore it is always safe to dereference this pointer outside of a + * RCU section. + */ + parent = rcu_dereference_check(kn->__parent, + kernfs_root_flags(kn) & KERNFS_ROOT_INVARIANT_PARENT); + return parent->priv; +} + struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { - struct cgroup *cgrp = of->kn->parent->priv; + struct cgroup *cgrp = kn_priv(of->kn); struct cftype *cft = of_cft(of); /* @@ -1322,6 +1342,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) { struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; + int ret; trace_cgroup_destroy_root(root); @@ -1330,6 +1351,10 @@ static void cgroup_destroy_root(struct cgroup_root *root) BUG_ON(atomic_read(&root->nr_cgrps)); BUG_ON(!list_empty(&cgrp->self.children)); + ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier, + CGROUP_LIFETIME_OFFLINE, cgrp); + WARN_ON_ONCE(notifier_to_errno(ret)); + /* Rebind all subsystems back to the default hierarchy */ WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask)); @@ -1358,7 +1383,6 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_unlock(); - cgroup_rstat_exit(cgrp); kernfs_destroy_root(root->kf_root); cgroup_free_root(root); } @@ -1612,7 +1636,7 @@ void cgroup_kn_unlock(struct kernfs_node *kn) if (kernfs_type(kn) == KERNFS_DIR) cgrp = kn->priv; else - cgrp = kn->parent->priv; + cgrp = kn_priv(kn); cgroup_unlock(); @@ -1644,7 +1668,7 @@ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline) if (kernfs_type(kn) == KERNFS_DIR) cgrp = kn->priv; else - cgrp = kn->parent->priv; + cgrp = kn_priv(kn); /* * We're gonna grab cgroup_mutex which nests outside kernfs @@ -1682,7 +1706,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) cfile->kn = NULL; spin_unlock_irq(&cgroup_file_kn_lock); - del_timer_sync(&cfile->notify_timer); + timer_delete_sync(&cfile->notify_timer); } kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); @@ -1702,7 +1726,7 @@ static void css_clear_dir(struct cgroup_subsys_state *css) css->flags &= ~CSS_VISIBLE; - if (!css->ss) { + if (css_is_self(css)) { if (cgroup_on_dfl(cgrp)) { cgroup_addrm_files(css, cgrp, cgroup_base_files, false); @@ -1734,7 +1758,7 @@ static int css_populate_dir(struct cgroup_subsys_state *css) if (css->flags & CSS_VISIBLE) return 0; - if (!css->ss) { + if (css_is_self(css)) { if (cgroup_on_dfl(cgrp)) { ret = cgroup_addrm_files(css, cgrp, cgroup_base_files, true); @@ -1863,13 +1887,6 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) } spin_unlock_irq(&css_set_lock); - if (ss->css_rstat_flush) { - list_del_rcu(&css->rstat_css_node); - synchronize_rcu(); - list_add_rcu(&css->rstat_css_node, - &dcgrp->rstat_css_list); - } - /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; if (dst_root == &cgrp_dfl_root) { @@ -2052,7 +2069,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) cgrp->dom_cgrp = cgrp; cgrp->max_descendants = INT_MAX; cgrp->max_depth = INT_MAX; - INIT_LIST_HEAD(&cgrp->rstat_css_list); prev_cputime_init(&cgrp->prev_cputime); for_each_subsys(ss, ssid) @@ -2118,7 +2134,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) root->kf_root = kernfs_create_root(kf_sops, KERNFS_ROOT_CREATE_DEACTIVATED | KERNFS_ROOT_SUPPORT_EXPORTOP | - KERNFS_ROOT_SUPPORT_USER_XATTR, + KERNFS_ROOT_SUPPORT_USER_XATTR | + KERNFS_ROOT_INVARIANT_PARENT, root_cgrp); if (IS_ERR(root->kf_root)) { ret = PTR_ERR(root->kf_root); @@ -2132,7 +2149,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto destroy_root; - ret = cgroup_rstat_init(root_cgrp); + ret = css_rstat_init(&root_cgrp->self); if (ret) goto destroy_root; @@ -2140,10 +2157,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto exit_stats; - if (root == &cgrp_dfl_root) { - ret = cgroup_bpf_inherit(root_cgrp); - WARN_ON_ONCE(ret); - } + ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier, + CGROUP_LIFETIME_ONLINE, root_cgrp); + WARN_ON_ONCE(notifier_to_errno(ret)); trace_cgroup_setup_root(root); @@ -2174,7 +2190,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) goto out; exit_stats: - cgroup_rstat_exit(root_cgrp); + css_rstat_exit(&root_cgrp->self); destroy_root: kernfs_destroy_root(root->kf_root); root->kf_root = NULL; @@ -2339,9 +2355,37 @@ static struct file_system_type cgroup2_fs_type = { }; #ifdef CONFIG_CPUSETS_V1 +enum cpuset_param { + Opt_cpuset_v2_mode, +}; + +static const struct fs_parameter_spec cpuset_fs_parameters[] = { + fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode), + {} +}; + +static int cpuset_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, cpuset_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_cpuset_v2_mode: + ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE; + return 0; + } + return -EINVAL; +} + static const struct fs_context_operations cpuset_fs_context_ops = { .get_tree = cgroup1_get_tree, .free = cgroup_fs_context_free, + .parse_param = cpuset_parse_param, }; /* @@ -2378,6 +2422,7 @@ static int cpuset_init_fs_context(struct fs_context *fc) static struct file_system_type cpuset_fs_type = { .name = "cpuset", .init_fs_context = cpuset_init_fs_context, + .parameters = cpuset_fs_parameters, .fs_flags = FS_USERNS_MOUNT, }; #endif @@ -4115,7 +4160,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup_file_ctx *ctx = of->priv; - struct cgroup *cgrp = of->kn->parent->priv; + struct cgroup *cgrp = kn_priv(of->kn); struct cftype *cft = of_cft(of); struct cgroup_subsys_state *css; int ret; @@ -4447,7 +4492,7 @@ int cgroup_rm_cftypes(struct cftype *cfts) * function currently returns 0 as long as @cfts registration is successful * even if some file creation attempts on existing cgroups fail. */ -static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; @@ -5401,8 +5446,9 @@ static void css_free_rwork_fn(struct work_struct *work) struct cgroup *cgrp = css->cgroup; percpu_ref_exit(&css->refcnt); + css_rstat_exit(css); - if (ss) { + if (!css_is_self(css)) { /* css free path */ struct cgroup_subsys_state *parent = css->parent; int id = css->id; @@ -5431,7 +5477,6 @@ static void css_free_rwork_fn(struct work_struct *work) cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); psi_cgroup_free(cgrp); - cgroup_rstat_exit(cgrp); kfree(cgrp); } else { /* @@ -5456,14 +5501,10 @@ static void css_release_work_fn(struct work_struct *work) css->flags |= CSS_RELEASED; list_del_rcu(&css->sibling); - if (ss) { + if (!css_is_self(css)) { struct cgroup *parent_cgrp; - /* css release path */ - if (!list_empty(&css->rstat_css_node)) { - cgroup_rstat_flush(cgrp); - list_del_rcu(&css->rstat_css_node); - } + css_rstat_flush(css); cgroup_idr_replace(&ss->css_idr, NULL, css->id); if (ss->css_released) @@ -5489,7 +5530,7 @@ static void css_release_work_fn(struct work_struct *work) /* cgroup release path */ TRACE_CGROUP_PATH(release, cgrp); - cgroup_rstat_flush(cgrp); + css_rstat_flush(&cgrp->self); spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; @@ -5537,7 +5578,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->id = -1; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); - INIT_LIST_HEAD(&css->rstat_css_node); css->serial_nr = css_serial_nr_next++; atomic_set(&css->online_cnt, 0); @@ -5546,9 +5586,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css_get(css->parent); } - if (ss->css_rstat_flush) - list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list); - BUG_ON(cgroup_css(cgrp, ss)); } @@ -5641,6 +5678,10 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, goto err_free_css; css->id = err; + err = css_rstat_init(css); + if (err) + goto err_free_css; + /* @css is ready to be brought online now, make it visible */ list_add_tail_rcu(&css->sibling, &parent_css->children); cgroup_idr_replace(&ss->css_idr, css, css->id); @@ -5654,7 +5695,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, err_list_del: list_del_rcu(&css->sibling); err_free_css: - list_del_rcu(&css->rstat_css_node); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); return ERR_PTR(err); @@ -5670,7 +5710,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, struct cgroup_root *root = parent->root; struct cgroup *cgrp, *tcgrp; struct kernfs_node *kn; - int level = parent->level + 1; + int i, level = parent->level + 1; int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ @@ -5682,17 +5722,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, if (ret) goto out_free_cgrp; - ret = cgroup_rstat_init(cgrp); - if (ret) - goto out_cancel_ref; - /* create the directory */ kn = kernfs_create_dir_ns(parent->kn, name, mode, current_fsuid(), current_fsgid(), cgrp, NULL); if (IS_ERR(kn)) { ret = PTR_ERR(kn); - goto out_stat_exit; + goto out_cancel_ref; } cgrp->kn = kn; @@ -5702,15 +5738,20 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, cgrp->root = root; cgrp->level = level; - ret = psi_cgroup_alloc(cgrp); + /* + * Now that init_cgroup_housekeeping() has been called and cgrp->self + * is setup, it is safe to perform rstat initialization on it. + */ + ret = css_rstat_init(&cgrp->self); if (ret) goto out_kernfs_remove; - if (cgrp->root == &cgrp_dfl_root) { - ret = cgroup_bpf_inherit(cgrp); - if (ret) - goto out_psi_free; - } + ret = psi_cgroup_alloc(cgrp); + if (ret) + goto out_stat_exit; + + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) + cgrp->ancestors[tcgrp->level] = tcgrp; /* * New cgroup inherits effective freeze counter, and @@ -5728,24 +5769,6 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, set_bit(CGRP_FROZEN, &cgrp->flags); } - spin_lock_irq(&css_set_lock); - for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { - cgrp->ancestors[tcgrp->level] = tcgrp; - - if (tcgrp != cgrp) { - tcgrp->nr_descendants++; - - /* - * If the new cgroup is frozen, all ancestor cgroups - * get a new frozen descendant, but their state can't - * change because of this. - */ - if (cgrp->freezer.e_freeze) - tcgrp->freezer.nr_frozen_descendants++; - } - } - spin_unlock_irq(&css_set_lock); - if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -5754,7 +5777,29 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, cgrp->self.serial_nr = css_serial_nr_next++; + ret = blocking_notifier_call_chain_robust(&cgroup_lifetime_notifier, + CGROUP_LIFETIME_ONLINE, + CGROUP_LIFETIME_OFFLINE, cgrp); + ret = notifier_to_errno(ret); + if (ret) + goto out_psi_free; + /* allocation complete, commit to creation */ + spin_lock_irq(&css_set_lock); + for (i = 0; i < level; i++) { + tcgrp = cgrp->ancestors[i]; + tcgrp->nr_descendants++; + + /* + * If the new cgroup is frozen, all ancestor cgroups get a new + * frozen descendant, but their state can't change because of + * this. + */ + if (cgrp->freezer.e_freeze) + tcgrp->freezer.nr_frozen_descendants++; + } + spin_unlock_irq(&css_set_lock); + list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); atomic_inc(&root->nr_cgrps); cgroup_get_live(parent); @@ -5772,10 +5817,10 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, out_psi_free: psi_cgroup_free(cgrp); +out_stat_exit: + css_rstat_exit(&cgrp->self); out_kernfs_remove: kernfs_remove(cgrp->kn); -out_stat_exit: - cgroup_rstat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: @@ -5831,7 +5876,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) } /* - * This extra ref will be put in cgroup_free_fn() and guarantees + * This extra ref will be put in css_free_rwork_fn() and guarantees * that @cgrp->kn is always accessible. */ kernfs_get(cgrp->kn); @@ -5909,6 +5954,12 @@ static void kill_css(struct cgroup_subsys_state *css) if (css->flags & CSS_DYING) return; + /* + * Call css_killed(), if defined, before setting the CSS_DYING flag + */ + if (css->ss->css_killed) + css->ss->css_killed(css); + css->flags |= CSS_DYING; /* @@ -5966,7 +6017,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) struct cgroup *tcgrp, *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; - int ssid; + int ssid, ret; lockdep_assert_held(&cgroup_mutex); @@ -6024,8 +6075,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) cgroup1_check_for_release(parent); - if (cgrp->root == &cgrp_dfl_root) - cgroup_bpf_offline(cgrp); + ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier, + CGROUP_LIFETIME_OFFLINE, cgrp); + WARN_ON_ONCE(notifier_to_errno(ret)); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); @@ -6087,6 +6139,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) } else { css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); + + BUG_ON(ss_rstat_init(ss)); + BUG_ON(css_rstat_init(css)); } /* Update the init_css_set to contain a subsys @@ -6135,6 +6190,8 @@ int __init cgroup_init_early(void) ss->id, ss->name); WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); + WARN(ss->early_init && ss->css_rstat_flush, + "cgroup rstat cannot be used with early init subsystem\n"); ss->id = i; ss->name = cgroup_subsys_name[i]; @@ -6163,7 +6220,7 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); - cgroup_rstat_boot(); + BUG_ON(ss_rstat_init(NULL)); get_user_ns(init_cgroup_ns.user_ns); @@ -6176,6 +6233,8 @@ int __init cgroup_init(void) hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); + cgroup_bpf_lifetime_notifier_init(); + BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); cgroup_unlock(); diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 976a8bc3ff60..383963e28ac6 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -33,6 +33,7 @@ enum prs_errcode { PERR_CPUSEMPTY, PERR_HKEEPING, PERR_ACCESS, + PERR_REMOTE, }; /* bits in struct cpuset flags field */ diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 25c1d7b77e2f..b69a7db67090 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include "cgroup-internal.h" #include "cpuset-internal.h" /* @@ -175,6 +176,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, switch (type) { case FILE_SCHED_RELAX_DOMAIN_LEVEL: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = update_relax_domain_level(cs, val); break; default: @@ -373,6 +375,46 @@ out: return ret; } +#ifdef CONFIG_PROC_PID_CPUSET +/* + * proc_cpuset_show() + * - Print tasks cpuset path into seq_file. + * - Used for /proc/<pid>/cpuset. + */ +int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk) +{ + char *buf; + struct cgroup_subsys_state *css; + int retval; + + retval = -ENOMEM; + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + goto out; + + rcu_read_lock(); + spin_lock_irq(&css_set_lock); + css = task_css(tsk, cpuset_cgrp_id); + retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + spin_unlock_irq(&css_set_lock); + rcu_read_unlock(); + + if (retval == -E2BIG) + retval = -ENAMETOOLONG; + if (retval < 0) + goto out_free; + seq_puts(m, buf); + seq_putc(m, '\n'); + retval = 0; +out_free: + kfree(buf); +out: + return retval; +} +#endif /* CONFIG_PROC_PID_CPUSET */ + static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { struct cpuset *cs = css_cs(css); @@ -424,24 +466,31 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val); break; case FILE_MEM_EXCLUSIVE: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val); break; case FILE_MEM_HARDWALL: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val); break; case FILE_SCHED_LOAD_BALANCE: + pr_info_once("cpuset.%s is deprecated, use cpuset.cpus.partition instead\n", cft->name); retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val); break; case FILE_MEMORY_MIGRATE: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val); break; case FILE_MEMORY_PRESSURE_ENABLED: + pr_info_once("cpuset.%s is deprecated, use memory.pressure with CONFIG_PSI instead\n", cft->name); cpuset_memory_pressure_enabled = !!val; break; case FILE_SPREAD_PAGE: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val); break; case FILE_SPREAD_SLAB: + pr_warn_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val); break; default: diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 0f910c828973..3bc4301466f3 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -21,7 +21,6 @@ * License. See the file COPYING in the main directory of the Linux * distribution for more details. */ -#include "cgroup-internal.h" #include "cpuset-internal.h" #include <linux/init.h> @@ -62,10 +61,17 @@ static const char * const perr_strings[] = { [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty", [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", [PERR_ACCESS] = "Enable partition not permitted", + [PERR_REMOTE] = "Have remote partition underneath", }; /* - * Exclusive CPUs distributed out to sub-partitions of top_cpuset + * For local partitions, update to subpartitions_cpus & isolated_cpus is done + * in update_parent_effective_cpumask(). For remote partitions, it is done in + * the remote_partition_*() and remote_cpus_update() helpers. + */ +/* + * Exclusive CPUs distributed out to local or remote sub-partitions of + * top_cpuset */ static cpumask_var_t subpartitions_cpus; @@ -87,7 +93,6 @@ static struct list_head remote_children; * A flag to force sched domain rebuild at the end of an operation. * It can be set in * - update_partition_sd_lb() - * - remote_partition_check() * - update_cpumasks_hier() * - cpuset_update_flag() * - cpuset_hotplug_update_tasks() @@ -187,6 +192,20 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs) WRITE_ONCE(cs->prs_err, PERR_NONE); } +/* + * The top_cpuset is always synchronized to cpu_active_mask and we should avoid + * using cpu_online_mask as much as possible. An active CPU is always an online + * CPU, but not vice versa. cpu_active_mask and cpu_online_mask can differ + * during hotplug operations. A CPU is marked active at the last stage of CPU + * bringup (CPUHP_AP_ACTIVE). It is also the stage where cpuset hotplug code + * will be called to update the sched domains so that the scheduler can move + * a normal task to a newly active CPU or remove tasks away from a newly + * inactivated CPU. The online bit is set much earlier in the CPU bringup + * process and cleared much later in CPU teardown. + * + * If cpu_online_mask is used while a hotunplug operation is happening in + * parallel, we may leave an offline CPU in cpu_allowed or some other masks. + */ static struct cpuset top_cpuset = { .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), @@ -350,18 +369,18 @@ static inline bool partition_is_populated(struct cpuset *cs, * appropriate cpus. * * One way or another, we guarantee to return some non-empty subset - * of cpu_online_mask. + * of cpu_active_mask. * * Call with callback_lock or cpuset_mutex held. */ -static void guarantee_online_cpus(struct task_struct *tsk, +static void guarantee_active_cpus(struct task_struct *tsk, struct cpumask *pmask) { const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); struct cpuset *cs; - if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask))) - cpumask_copy(pmask, cpu_online_mask); + if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask))) + cpumask_copy(pmask, cpu_active_mask); rcu_read_lock(); cs = task_cs(tsk); @@ -954,10 +973,12 @@ static void dl_update_tasks_root_domain(struct cpuset *cs) css_task_iter_end(&it); } -static void dl_rebuild_rd_accounting(void) +void dl_rebuild_rd_accounting(void) { struct cpuset *cs = NULL; struct cgroup_subsys_state *pos_css; + int cpu; + u64 cookie = ++dl_cookie; lockdep_assert_held(&cpuset_mutex); lockdep_assert_cpus_held(); @@ -965,11 +986,12 @@ static void dl_rebuild_rd_accounting(void) rcu_read_lock(); - /* - * Clear default root domain DL accounting, it will be computed again - * if a task belongs to it. - */ - dl_clear_root_domain(&def_root_domain); + for_each_possible_cpu(cpu) { + if (dl_bw_visited(cpu, cookie)) + continue; + + dl_clear_root_domain_cpu(cpu); + } cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { @@ -990,16 +1012,6 @@ static void dl_rebuild_rd_accounting(void) rcu_read_unlock(); } -static void -partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) -{ - mutex_lock(&sched_domains_mutex); - partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); - dl_rebuild_rd_accounting(); - mutex_unlock(&sched_domains_mutex); -} - /* * Rebuild scheduler domains. * @@ -1061,7 +1073,7 @@ void rebuild_sched_domains_locked(void) ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ - partition_and_rebuild_sched_domains(ndoms, doms, attr); + partition_sched_domains(ndoms, doms, attr); } #else /* !CONFIG_SMP */ void rebuild_sched_domains_locked(void) @@ -1083,6 +1095,13 @@ void rebuild_sched_domains(void) cpus_read_unlock(); } +void cpuset_reset_sched_domains(void) +{ + mutex_lock(&cpuset_mutex); + partition_sched_domains(1, NULL, NULL); + mutex_unlock(&cpuset_mutex); +} + /** * cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed @@ -1090,9 +1109,14 @@ void rebuild_sched_domains(void) * * Iterate through each task of @cs updating its cpus_allowed to the * effective cpuset's. As this function is called with cpuset_mutex held, - * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask() - * is used instead of effective_cpus to make sure all offline CPUs are also - * included as hotplug code won't update cpumasks for tasks in top_cpuset. + * cpuset membership stays stable. + * + * For top_cpuset, task_cpu_possible_mask() is used instead of effective_cpus + * to make sure all offline CPUs are also included as hotplug code won't + * update cpumasks for tasks in top_cpuset. + * + * As task_cpu_possible_mask() can be task dependent in arm64, we have to + * do cpu masking per task instead of doing it once for all. */ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) { @@ -1106,9 +1130,11 @@ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) if (top_cs) { /* - * Percpu kthreads in top_cpuset are ignored + * PF_NO_SETAFFINITY tasks are ignored. + * All per cpu kthreads should have PF_NO_SETAFFINITY + * flag set, see kthread_set_per_cpu(). */ - if (kthread_is_per_cpu(task)) + if (task->flags & PF_NO_SETAFFINITY) continue; cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus); } else { @@ -1152,7 +1178,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, * * Return: 0 if successful, an error code otherwise */ -static int update_partition_exclusive(struct cpuset *cs, int new_prs) +static int update_partition_exclusive_flag(struct cpuset *cs, int new_prs) { bool exclusive = (new_prs > PRS_MEMBER); @@ -1235,12 +1261,12 @@ static void reset_partition_data(struct cpuset *cs) } /* - * partition_xcpus_newstate - Exclusive CPUs state change + * isolated_cpus_update - Update the isolated_cpus mask * @old_prs: old partition_root_state * @new_prs: new partition_root_state * @xcpus: exclusive CPUs with state change */ -static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus) +static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus) { WARN_ON_ONCE(old_prs == new_prs); if (new_prs == PRS_ISOLATED) @@ -1274,8 +1300,8 @@ static bool partition_xcpus_add(int new_prs, struct cpuset *parent, isolcpus_updated = (new_prs != parent->partition_root_state); if (isolcpus_updated) - partition_xcpus_newstate(parent->partition_root_state, new_prs, - xcpus); + isolated_cpus_update(parent->partition_root_state, new_prs, + xcpus); cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus); return isolcpus_updated; @@ -1305,8 +1331,8 @@ static bool partition_xcpus_del(int old_prs, struct cpuset *parent, isolcpus_updated = (old_prs != parent->partition_root_state); if (isolcpus_updated) - partition_xcpus_newstate(old_prs, parent->partition_root_state, - xcpus); + isolated_cpus_update(old_prs, parent->partition_root_state, + xcpus); cpumask_and(xcpus, xcpus, cpu_active_mask); cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); @@ -1341,20 +1367,55 @@ EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); * compute_effective_exclusive_cpumask - compute effective exclusive CPUs * @cs: cpuset * @xcpus: effective exclusive CPUs value to be set - * Return: true if xcpus is not empty, false otherwise. + * @real_cs: the real cpuset (can be NULL) + * Return: 0 if there is no sibling conflict, > 0 otherwise * - * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set), - * it must be a subset of parent's effective_xcpus. + * If exclusive_cpus isn't explicitly set or a real_cs is provided, we have to + * scan the sibling cpusets and exclude their exclusive_cpus or effective_xcpus + * as well. The provision of real_cs means that a cpumask is being changed and + * the given cs is a trial one. */ -static bool compute_effective_exclusive_cpumask(struct cpuset *cs, - struct cpumask *xcpus) +static int compute_effective_exclusive_cpumask(struct cpuset *cs, + struct cpumask *xcpus, + struct cpuset *real_cs) { + struct cgroup_subsys_state *css; struct cpuset *parent = parent_cs(cs); + struct cpuset *sibling; + int retval = 0; if (!xcpus) xcpus = cs->effective_xcpus; - return cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus); + cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus); + + if (!real_cs) { + if (!cpumask_empty(cs->exclusive_cpus)) + return 0; + } else { + cs = real_cs; + } + + /* + * Exclude exclusive CPUs from siblings + */ + rcu_read_lock(); + cpuset_for_each_child(sibling, css, parent) { + if (sibling == cs) + continue; + + if (cpumask_intersects(xcpus, sibling->exclusive_cpus)) { + cpumask_andnot(xcpus, xcpus, sibling->exclusive_cpus); + retval++; + continue; + } + if (cpumask_intersects(xcpus, sibling->effective_xcpus)) { + cpumask_andnot(xcpus, xcpus, sibling->effective_xcpus); + retval++; + } + } + rcu_read_unlock(); + return retval; } static inline bool is_remote_partition(struct cpuset *cs) @@ -1392,21 +1453,26 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, * The requested exclusive_cpus must not be allocated to other * partitions and it can't use up all the root's effective_cpus. * - * Note that if there is any local partition root above it or - * remote partition root underneath it, its exclusive_cpus must - * have overlapped with subpartitions_cpus. + * The effective_xcpus mask can contain offline CPUs, but there must + * be at least one or more online CPUs present before it can be enabled. + * + * Note that creating a remote partition with any local partition root + * above it or remote partition root underneath it is not allowed. */ - compute_effective_exclusive_cpumask(cs, tmp->new_cpus); - if (cpumask_empty(tmp->new_cpus) || - cpumask_intersects(tmp->new_cpus, subpartitions_cpus) || + compute_effective_exclusive_cpumask(cs, tmp->new_cpus, NULL); + WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus)); + if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) || cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) return PERR_INVCPUS; spin_lock_irq(&callback_lock); isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus); list_add(&cs->remote_sibling, &remote_children); + cpumask_copy(cs->effective_xcpus, tmp->new_cpus); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); + cpuset_force_rebuild(); + cs->prs_err = 0; /* * Propagate changes in top_cpuset's effective_cpus down the hierarchy. @@ -1429,20 +1495,24 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) { bool isolcpus_updated; - compute_effective_exclusive_cpumask(cs, tmp->new_cpus); WARN_ON_ONCE(!is_remote_partition(cs)); - WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus)); + WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); spin_lock_irq(&callback_lock); list_del_init(&cs->remote_sibling); isolcpus_updated = partition_xcpus_del(cs->partition_root_state, - NULL, tmp->new_cpus); - cs->partition_root_state = -cs->partition_root_state; - if (!cs->prs_err) - cs->prs_err = PERR_INVCPUS; + NULL, cs->effective_xcpus); + if (cs->prs_err) + cs->partition_root_state = -cs->partition_root_state; + else + cs->partition_root_state = PRS_MEMBER; + + /* effective_xcpus may need to be changed */ + compute_effective_exclusive_cpumask(cs, NULL, NULL); reset_partition_data(cs); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); + cpuset_force_rebuild(); /* * Propagate changes in top_cpuset's effective_cpus down the hierarchy. @@ -1454,14 +1524,15 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) /* * remote_cpus_update - cpus_exclusive change of remote partition * @cs: the cpuset to be updated - * @newmask: the new effective_xcpus mask + * @xcpus: the new exclusive_cpus mask, if non-NULL + * @excpus: the new effective_xcpus mask * @tmp: temporary masks * * top_cpuset and subpartitions_cpus will be updated or partition can be * invalidated. */ -static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, - struct tmpmasks *tmp) +static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, + struct cpumask *excpus, struct tmpmasks *tmp) { bool adding, deleting; int prs = cs->partition_root_state; @@ -1472,29 +1543,46 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); - if (cpumask_empty(newmask)) + if (cpumask_empty(excpus)) { + cs->prs_err = PERR_CPUSEMPTY; goto invalidate; + } - adding = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus); - deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask); + adding = cpumask_andnot(tmp->addmask, excpus, cs->effective_xcpus); + deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, excpus); /* * Additions of remote CPUs is only allowed if those CPUs are * not allocated to other partitions and there are effective_cpus * left in the top cpuset. */ - if (adding && (!capable(CAP_SYS_ADMIN) || - cpumask_intersects(tmp->addmask, subpartitions_cpus) || - cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))) - goto invalidate; + if (adding) { + WARN_ON_ONCE(cpumask_intersects(tmp->addmask, subpartitions_cpus)); + if (!capable(CAP_SYS_ADMIN)) + cs->prs_err = PERR_ACCESS; + else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) || + cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)) + cs->prs_err = PERR_NOCPUS; + if (cs->prs_err) + goto invalidate; + } spin_lock_irq(&callback_lock); if (adding) isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask); if (deleting) isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask); + /* + * Need to update effective_xcpus and exclusive_cpus now as + * update_sibling_cpumasks() below may iterate back to the same cs. + */ + cpumask_copy(cs->effective_xcpus, excpus); + if (xcpus) + cpumask_copy(cs->exclusive_cpus, xcpus); spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); + if (adding || deleting) + cpuset_force_rebuild(); /* * Propagate changes in top_cpuset's effective_cpus down the hierarchy. @@ -1508,47 +1596,6 @@ invalidate: } /* - * remote_partition_check - check if a child remote partition needs update - * @cs: the cpuset to be updated - * @newmask: the new effective_xcpus mask - * @delmask: temporary mask for deletion (not in tmp) - * @tmp: temporary masks - * - * This should be called before the given cs has updated its cpus_allowed - * and/or effective_xcpus. - */ -static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask, - struct cpumask *delmask, struct tmpmasks *tmp) -{ - struct cpuset *child, *next; - int disable_cnt = 0; - - /* - * Compute the effective exclusive CPUs that will be deleted. - */ - if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) || - !cpumask_intersects(delmask, subpartitions_cpus)) - return; /* No deletion of exclusive CPUs in partitions */ - - /* - * Searching the remote children list to look for those that will - * be impacted by the deletion of exclusive CPUs. - * - * Since a cpuset must be removed from the remote children list - * before it can go offline and holding cpuset_mutex will prevent - * any change in cpuset status. RCU read lock isn't needed. - */ - lockdep_assert_held(&cpuset_mutex); - list_for_each_entry_safe(child, next, &remote_children, remote_sibling) - if (cpumask_intersects(child->effective_cpus, delmask)) { - remote_partition_disable(child, tmp); - disable_cnt++; - } - if (disable_cnt) - cpuset_force_rebuild(); -} - -/* * prstate_housekeeping_conflict - check for partition & housekeeping conflicts * @prstate: partition root state to be checked * @new_cpus: cpu mask @@ -1602,7 +1649,7 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) * The partcmd_update command is used by update_cpumasks_hier() with newmask * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used * by update_cpumask() with NULL newmask. In both cases, the callers won't - * check for error and so partition_root_state and prs_error will be updated + * check for error and so partition_root_state and prs_err will be updated * directly. */ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, @@ -1615,11 +1662,12 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, int old_prs, new_prs; int part_error = PERR_NONE; /* Partition error? */ int subparts_delta = 0; - struct cpumask *xcpus; /* cs effective_xcpus */ int isolcpus_updated = 0; + struct cpumask *xcpus = user_xcpus(cs); bool nocpu; lockdep_assert_held(&cpuset_mutex); + WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */ /* * new_prs will only be changed for the partcmd_update and @@ -1627,7 +1675,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, */ adding = deleting = false; old_prs = new_prs = cs->partition_root_state; - xcpus = user_xcpus(cs); if (cmd == partcmd_invalidate) { if (is_prs_invalid(old_prs)) @@ -1662,12 +1709,19 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { /* + * Need to call compute_effective_exclusive_cpumask() in case + * exclusive_cpus not set. Sibling conflict should only happen + * if exclusive_cpus isn't set. + */ + xcpus = tmp->delmask; + if (compute_effective_exclusive_cpumask(cs, xcpus, NULL)) + WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus)); + + /* * Enabling partition root is not allowed if its - * effective_xcpus is empty or doesn't overlap with - * parent's effective_xcpus. + * effective_xcpus is empty. */ - if (cpumask_empty(xcpus) || - !cpumask_intersects(xcpus, parent->effective_xcpus)) + if (cpumask_empty(xcpus)) return PERR_INVCPUS; if (prstate_housekeeping_conflict(new_prs, xcpus)) @@ -1680,19 +1734,33 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (nocpu) return PERR_NOCPUS; - cpumask_copy(tmp->delmask, xcpus); + /* + * This function will only be called when all the preliminary + * checks have passed. At this point, the following condition + * should hold. + * + * (cs->effective_xcpus & cpu_active_mask) ⊆ parent->effective_cpus + * + * Warn if it is not the case. + */ + cpumask_and(tmp->new_cpus, xcpus, cpu_active_mask); + WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus)); + deleting = true; subparts_delta++; new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; } else if (cmd == partcmd_disable) { /* - * May need to add cpus to parent's effective_cpus for - * valid partition root. + * May need to add cpus back to parent's effective_cpus + * (and maybe removed from subpartitions_cpus/isolated_cpus) + * for valid partition root. xcpus may contain CPUs that + * shouldn't be removed from the two global cpumasks. */ - adding = !is_prs_invalid(old_prs) && - cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); - if (adding) + if (is_partition_valid(cs)) { + cpumask_copy(tmp->addmask, cs->effective_xcpus); + adding = true; subparts_delta--; + } new_prs = PRS_MEMBER; } else if (newmask) { /* @@ -1702,6 +1770,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, part_error = PERR_CPUSEMPTY; goto write_error; } + /* Check newmask again, whether cpus are available for parent/cs */ nocpu |= tasks_nocpu_error(parent, cs, newmask); @@ -1733,6 +1802,15 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, parent->effective_xcpus); } /* + * The new CPUs to be removed from parent's effective CPUs + * must be present. + */ + if (deleting) { + cpumask_and(tmp->new_cpus, tmp->delmask, cpu_active_mask); + WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus)); + } + + /* * Make partition invalid if parent's effective_cpus could * become empty and there are tasks in the parent. */ @@ -1830,7 +1908,7 @@ write_error: * CPU lists in cs haven't been updated yet. So defer it to later. */ if ((old_prs != new_prs) && (cmd != partcmd_update)) { - int err = update_partition_exclusive(cs, new_prs); + int err = update_partition_exclusive_flag(cs, new_prs); if (err) return err; @@ -1868,7 +1946,7 @@ write_error: update_unbound_workqueue_cpumask(isolcpus_updated); if ((old_prs != new_prs) && (cmd == partcmd_update)) - update_partition_exclusive(cs, new_prs); + update_partition_exclusive_flag(cs, new_prs); if (adding || deleting) { cpuset_update_tasks_cpumask(parent, tmp->addmask); @@ -1918,7 +1996,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, * 2) All the effective_cpus will be used up and cp * has tasks */ - compute_effective_exclusive_cpumask(cs, new_ecpus); + compute_effective_exclusive_cpumask(cs, new_ecpus, NULL); cpumask_and(new_ecpus, new_ecpus, cpu_active_mask); rcu_read_lock(); @@ -1926,6 +2004,11 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, if (!is_partition_valid(child)) continue; + /* + * There shouldn't be a remote partition underneath another + * partition root. + */ + WARN_ON_ONCE(is_remote_partition(child)); child->prs_err = 0; if (!cpumask_subset(child->effective_xcpus, cs->effective_xcpus)) @@ -1981,32 +2064,39 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, bool remote = is_remote_partition(cp); bool update_parent = false; + old_prs = new_prs = cp->partition_root_state; + /* - * Skip descendent remote partition that acquires CPUs - * directly from top cpuset unless it is cs. + * For child remote partition root (!= cs), we need to call + * remote_cpus_update() if effective_xcpus will be changed. + * Otherwise, we can skip the whole subtree. + * + * remote_cpus_update() will reuse tmp->new_cpus only after + * its value is being processed. */ if (remote && (cp != cs)) { - pos_css = css_rightmost_descendant(pos_css); - continue; - } + compute_effective_exclusive_cpumask(cp, tmp->new_cpus, NULL); + if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + rcu_read_unlock(); + remote_cpus_update(cp, NULL, tmp->new_cpus, tmp); + rcu_read_lock(); - /* - * Update effective_xcpus if exclusive_cpus set. - * The case when exclusive_cpus isn't set is handled later. - */ - if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) { - spin_lock_irq(&callback_lock); - compute_effective_exclusive_cpumask(cp, NULL); - spin_unlock_irq(&callback_lock); + /* Remote partition may be invalidated */ + new_prs = cp->partition_root_state; + remote = (new_prs == old_prs); } - old_prs = new_prs = cp->partition_root_state; - if (remote || (is_partition_valid(parent) && - is_partition_valid(cp))) + if (remote || (is_partition_valid(parent) && is_partition_valid(cp))) compute_partition_effective_cpumask(cp, tmp->new_cpus); else compute_effective_cpumask(tmp->new_cpus, cp, parent); + if (remote) + goto get_css; /* Ready to update cpuset data */ + /* * A partition with no effective_cpus is allowed as long as * there is no task associated with it. Call @@ -2026,9 +2116,6 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) cpumask_copy(tmp->new_cpus, parent->effective_cpus); - if (remote) - goto get_css; - /* * Skip the whole subtree if * 1) the cpumask remains the same, @@ -2089,6 +2176,9 @@ get_css: spin_lock_irq(&callback_lock); cpumask_copy(cp->effective_cpus, tmp->new_cpus); cp->partition_root_state = new_prs; + if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) + compute_effective_exclusive_cpumask(cp, NULL, NULL); + /* * Make sure effective_xcpus is properly set for a valid * partition root. @@ -2175,7 +2265,14 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, parent); if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) continue; + } else if (is_remote_partition(sibling)) { + /* + * Change in a sibling cpuset won't affect a remote + * partition root. + */ + continue; } + if (!css_tryget_online(&sibling->css)) continue; @@ -2203,7 +2300,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, bool force = false; int old_prs = cs->partition_root_state; - /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ + /* top_cpuset.cpus_allowed tracks cpu_active_mask; it's read-only */ if (cs == &top_cpuset) return -EACCES; @@ -2232,8 +2329,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * trialcs->effective_xcpus is used as a temporary cpumask * for checking validity of the partition root. */ + trialcs->partition_root_state = PRS_MEMBER; if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs)) - compute_effective_exclusive_cpumask(trialcs, NULL); + compute_effective_exclusive_cpumask(trialcs, NULL, cs); } /* Nothing to do if the cpus didn't change */ @@ -2306,19 +2404,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Call remote_cpus_update() to handle valid remote partition */ if (is_remote_partition(cs)) - remote_cpus_update(cs, xcpus, &tmp); + remote_cpus_update(cs, NULL, xcpus, &tmp); else if (invalidate) update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, &tmp); else update_parent_effective_cpumask(cs, partcmd_update, xcpus, &tmp); - } else if (!cpumask_empty(cs->exclusive_cpus)) { - /* - * Use trialcs->effective_cpus as a temp cpumask - */ - remote_partition_check(cs, trialcs->effective_xcpus, - trialcs->effective_cpus, &tmp); } spin_lock_irq(&callback_lock); @@ -2370,8 +2462,15 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus)) return 0; - if (*buf) - compute_effective_exclusive_cpumask(trialcs, NULL); + if (*buf) { + trialcs->partition_root_state = PRS_MEMBER; + /* + * Reject the change if there is exclusive CPUs conflict with + * the siblings. + */ + if (compute_effective_exclusive_cpumask(trialcs, NULL, cs)) + return -EINVAL; + } /* * Check all the descendants in update_cpumasks_hier() if @@ -2402,8 +2501,8 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (invalidate) remote_partition_disable(cs, &tmp); else - remote_cpus_update(cs, trialcs->effective_xcpus, - &tmp); + remote_cpus_update(cs, trialcs->exclusive_cpus, + trialcs->effective_xcpus, &tmp); } else if (invalidate) { update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, &tmp); @@ -2411,12 +2510,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, update_parent_effective_cpumask(cs, partcmd_update, trialcs->effective_xcpus, &tmp); } - } else if (!cpumask_empty(trialcs->exclusive_cpus)) { - /* - * Use trialcs->effective_cpus as a temp cpumask - */ - remote_partition_check(cs, trialcs->effective_xcpus, - trialcs->effective_cpus, &tmp); } spin_lock_irq(&callback_lock); cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus); @@ -2783,7 +2876,7 @@ static int update_prstate(struct cpuset *cs, int new_prs) int err = PERR_NONE, old_prs = cs->partition_root_state; struct cpuset *parent = parent_cs(cs); struct tmpmasks tmpmask; - bool new_xcpus_state = false; + bool isolcpus_updated = false; if (old_prs == new_prs) return 0; @@ -2797,18 +2890,7 @@ static int update_prstate(struct cpuset *cs, int new_prs) if (alloc_cpumasks(NULL, &tmpmask)) return -ENOMEM; - /* - * Setup effective_xcpus if not properly set yet, it will be cleared - * later if partition becomes invalid. - */ - if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) { - spin_lock_irq(&callback_lock); - cpumask_and(cs->effective_xcpus, - cs->cpus_allowed, parent->effective_xcpus); - spin_unlock_irq(&callback_lock); - } - - err = update_partition_exclusive(cs, new_prs); + err = update_partition_exclusive_flag(cs, new_prs); if (err) goto out; @@ -2822,6 +2904,19 @@ static int update_prstate(struct cpuset *cs, int new_prs) } /* + * We don't support the creation of a new local partition with + * a remote partition underneath it. This unsupported + * setting can happen only if parent is the top_cpuset because + * a remote partition cannot be created underneath an existing + * local or remote partition. + */ + if ((parent == &top_cpuset) && + cpumask_intersects(cs->exclusive_cpus, subpartitions_cpus)) { + err = PERR_REMOTE; + goto out; + } + + /* * If parent is valid partition, enable local partiion. * Otherwise, enable a remote partition. */ @@ -2836,8 +2931,9 @@ static int update_prstate(struct cpuset *cs, int new_prs) } else if (old_prs && new_prs) { /* * A change in load balance state only, no change in cpumasks. + * Need to update isolated_cpus. */ - new_xcpus_state = true; + isolcpus_updated = true; } else { /* * Switching back to member is always allowed even if it @@ -2861,7 +2957,7 @@ out: */ if (err) { new_prs = -new_prs; - update_partition_exclusive(cs, new_prs); + update_partition_exclusive_flag(cs, new_prs); } spin_lock_irq(&callback_lock); @@ -2869,14 +2965,18 @@ out: WRITE_ONCE(cs->prs_err, err); if (!is_partition_valid(cs)) reset_partition_data(cs); - else if (new_xcpus_state) - partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus); + else if (isolcpus_updated) + isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus); spin_unlock_irq(&callback_lock); - update_unbound_workqueue_cpumask(new_xcpus_state); + update_unbound_workqueue_cpumask(isolcpus_updated); - /* Force update if switching back to member */ + /* Force update if switching back to member & update effective_xcpus */ update_cpumasks_hier(cs, &tmpmask, !new_prs); + /* A newly created partition must have effective_xcpus set */ + WARN_ON_ONCE(!old_prs && (new_prs > 0) + && cpumask_empty(cs->effective_xcpus)); + /* Update sched domains and load balance flag */ update_partition_sd_lb(cs, old_prs); @@ -3019,7 +3119,7 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) lockdep_assert_held(&cpuset_mutex); if (cs != &top_cpuset) - guarantee_online_cpus(task, cpus_attach); + guarantee_active_cpus(task, cpus_attach); else cpumask_andnot(cpus_attach, task_cpu_possible_mask(task), subpartitions_cpus); @@ -3209,7 +3309,7 @@ int cpuset_common_seq_show(struct seq_file *sf, void *v) return ret; } -static int sched_partition_show(struct seq_file *seq, void *v) +static int cpuset_partition_show(struct seq_file *seq, void *v) { struct cpuset *cs = css_cs(seq_css(seq)); const char *err, *type = NULL; @@ -3240,7 +3340,7 @@ static int sched_partition_show(struct seq_file *seq, void *v) return 0; } -static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, +static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cpuset *cs = css_cs(of_css(of)); @@ -3261,11 +3361,8 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, css_get(&cs->css); cpus_read_lock(); mutex_lock(&cpuset_mutex); - if (!is_cpuset_online(cs)) - goto out_unlock; - - retval = update_prstate(cs, val); -out_unlock: + if (is_cpuset_online(cs)) + retval = update_prstate(cs, val); mutex_unlock(&cpuset_mutex); cpus_read_unlock(); css_put(&cs->css); @@ -3309,8 +3406,8 @@ static struct cftype dfl_files[] = { { .name = "cpus.partition", - .seq_show = sched_partition_show, - .write = sched_partition_write, + .seq_show = cpuset_partition_show, + .write = cpuset_partition_write, .private = FILE_PARTITION_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .file_offset = offsetof(struct cpuset, partition_file), @@ -3464,11 +3561,7 @@ out_unlock: * will call rebuild_sched_domains_locked(). That is not needed * in the default hierarchy where only changes in partition * will cause repartitioning. - * - * If the cpuset has the 'sched.partition' flag enabled, simulate - * turning 'sched.partition" off. */ - static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); @@ -3476,9 +3569,6 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpus_read_lock(); mutex_lock(&cpuset_mutex); - if (is_partition_valid(cs)) - update_prstate(cs, 0); - if (!cpuset_v2() && is_sched_load_balance(cs)) cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); @@ -3489,6 +3579,27 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpus_read_unlock(); } +/* + * If a dying cpuset has the 'cpus.partition' enabled, turn it off by + * changing it back to member to free its exclusive CPUs back to the pool to + * be used by other online cpusets. + */ +static void cpuset_css_killed(struct cgroup_subsys_state *css) +{ + struct cpuset *cs = css_cs(css); + + cpus_read_lock(); + mutex_lock(&cpuset_mutex); + + /* Reset valid partition back to member */ + if (is_partition_valid(cs)) + update_prstate(cs, PRS_MEMBER); + + mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); + +} + static void cpuset_css_free(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); @@ -3610,6 +3721,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = cpuset_css_offline, + .css_killed = cpuset_css_killed, .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, @@ -3740,10 +3852,10 @@ retry: if (remote && cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) { + cs->prs_err = PERR_HOTPLUG; remote_partition_disable(cs, tmp); compute_effective_cpumask(&new_cpus, cs, parent); remote = false; - cpuset_force_rebuild(); } /* @@ -3952,7 +4064,7 @@ void __init cpuset_init_smp(void) * * Description: Returns the cpumask_var_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty - * subset of cpu_online_mask, even if this means going outside the + * subset of cpu_active_mask, even if this means going outside the * tasks cpuset, except when the task is in the top cpuset. **/ @@ -3966,7 +4078,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) cs = task_cs(tsk); if (cs != &top_cpuset) - guarantee_online_cpus(tsk, pmask); + guarantee_active_cpus(tsk, pmask); /* * Tasks in the top cpuset won't get update to their cpumasks * when a hotplug online/offline event happens. So we include all @@ -3980,7 +4092,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) * allowable online cpu left, we fall back to all possible cpus. */ cpumask_andnot(pmask, possible_mask, subpartitions_cpus); - if (!cpumask_intersects(pmask, cpu_online_mask)) + if (!cpumask_intersects(pmask, cpu_active_mask)) cpumask_copy(pmask, possible_mask); } @@ -4090,7 +4202,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) } /* - * cpuset_node_allowed - Can we allocate on a memory node? + * cpuset_current_node_allowed - Can current task allocate on a memory node? * @node: is this an allowed node? * @gfp_mask: memory allocation flags * @@ -4129,7 +4241,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ -bool cpuset_node_allowed(int node, gfp_t gfp_mask) +bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ bool allowed; /* is allocation in zone z allowed? */ @@ -4163,6 +4275,42 @@ bool cpuset_node_allowed(int node, gfp_t gfp_mask) return allowed; } +bool cpuset_node_allowed(struct cgroup *cgroup, int nid) +{ + struct cgroup_subsys_state *css; + struct cpuset *cs; + bool allowed; + + /* + * In v1, mem_cgroup and cpuset are unlikely in the same hierarchy + * and mems_allowed is likely to be empty even if we could get to it, + * so return true to avoid taking a global lock on the empty check. + */ + if (!cpuset_v2()) + return true; + + css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys); + if (!css) + return true; + + /* + * Normally, accessing effective_mems would require the cpuset_mutex + * or callback_lock - but node_isset is atomic and the reference + * taken via cgroup_get_e_css is sufficient to protect css. + * + * Since this interface is intended for use by migration paths, we + * relax locking here to avoid taking global locks - while accepting + * there may be rare scenarios where the result may be innaccurate. + * + * Reclaim and migration are subject to these same race conditions, and + * cannot make strong isolation guarantees, so this is acceptable. + */ + cs = container_of(css, struct cpuset, css); + allowed = node_isset(nid, cs->effective_mems); + css_put(css); + return allowed; +} + /** * cpuset_spread_node() - On which node to begin search for a page * @rotor: round robin rotor @@ -4244,50 +4392,6 @@ void cpuset_print_current_mems_allowed(void) rcu_read_unlock(); } -#ifdef CONFIG_PROC_PID_CPUSET -/* - * proc_cpuset_show() - * - Print tasks cpuset path into seq_file. - * - Used for /proc/<pid>/cpuset. - * - No need to task_lock(tsk) on this tsk->cpuset reference, as it - * doesn't really matter if tsk->cpuset changes after we read it, - * and we take cpuset_mutex, keeping cpuset_attach() from changing it - * anyway. - */ -int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *tsk) -{ - char *buf; - struct cgroup_subsys_state *css; - int retval; - - retval = -ENOMEM; - buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!buf) - goto out; - - rcu_read_lock(); - spin_lock_irq(&css_set_lock); - css = task_css(tsk, cpuset_cgrp_id); - retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, - current->nsproxy->cgroup_ns); - spin_unlock_irq(&css_set_lock); - rcu_read_unlock(); - - if (retval == -E2BIG) - retval = -ENAMETOOLONG; - if (retval < 0) - goto out_free; - seq_puts(m, buf); - seq_putc(m, '\n'); - retval = 0; -out_free: - kfree(buf); -out: - return retval; -} -#endif /* CONFIG_PROC_PID_CPUSET */ - /* Display task mems_allowed in /proc/<pid>/status file. */ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c index fbe34299673d..10b63433f057 100644 --- a/kernel/cgroup/dmem.c +++ b/kernel/cgroup/dmem.c @@ -220,60 +220,32 @@ dmem_cgroup_calculate_protection(struct dmem_cgroup_pool_state *limit_pool, struct dmem_cgroup_pool_state *test_pool) { struct page_counter *climit; - struct cgroup_subsys_state *css, *next_css; + struct cgroup_subsys_state *css; struct dmemcg_state *dmemcg_iter; - struct dmem_cgroup_pool_state *pool, *parent_pool; - bool found_descendant; + struct dmem_cgroup_pool_state *pool, *found_pool; climit = &limit_pool->cnt; rcu_read_lock(); - parent_pool = pool = limit_pool; - css = &limit_pool->cs->css; - /* - * This logic is roughly equivalent to css_foreach_descendant_pre, - * except we also track the parent pool to find out which pool we need - * to calculate protection values for. - * - * We can stop the traversal once we find test_pool among the - * descendants since we don't really care about any others. - */ - while (pool != test_pool) { - next_css = css_next_child(NULL, css); - if (next_css) { - parent_pool = pool; - } else { - while (css != &limit_pool->cs->css) { - next_css = css_next_child(css, css->parent); - if (next_css) - break; - css = css->parent; - parent_pool = pool_parent(parent_pool); - } - /* - * We can only hit this when test_pool is not a - * descendant of limit_pool. - */ - if (WARN_ON_ONCE(css == &limit_pool->cs->css)) - break; - } - css = next_css; - - found_descendant = false; + css_for_each_descendant_pre(css, &limit_pool->cs->css) { dmemcg_iter = container_of(css, struct dmemcg_state, css); + found_pool = NULL; list_for_each_entry_rcu(pool, &dmemcg_iter->pools, css_node) { - if (pool_parent(pool) == parent_pool) { - found_descendant = true; + if (pool->region == limit_pool->region) { + found_pool = pool; break; } } - if (!found_descendant) + if (!found_pool) continue; page_counter_calculate_protection( - climit, &pool->cnt, true); + climit, &found_pool->cnt, true); + + if (found_pool == test_pool) + break; } rcu_read_unlock(); } diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 074653f964c1..507b8f19a262 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -188,13 +188,12 @@ static void freezer_attach(struct cgroup_taskset *tset) if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { - freeze_task(task); - /* clear FROZEN and propagate upwards */ while (freezer && (freezer->state & CGROUP_FROZEN)) { freezer->state &= ~CGROUP_FROZEN; freezer = parent_freezer(freezer); } + freeze_task(task); } } @@ -430,9 +429,11 @@ static ssize_t freezer_write(struct kernfs_open_file *of, if (strcmp(buf, freezer_state_strs(0)) == 0) freeze = false; - else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) + else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) { + pr_info_once("Freezing with imperfect legacy cgroup freezer. " + "See cgroup.freeze of cgroup v2\n"); freeze = true; - else + } else return -EINVAL; freezer_change_state(css_freezer(of_css(of)), freeze); diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index 0e26068995a6..6a01d91ea4cb 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -24,6 +24,10 @@ static const char *const misc_res_name[] = { /* AMD SEV-ES ASIDs resource */ "sev_es", #endif +#ifdef CONFIG_INTEL_TDX_HOST + /* Intel TDX HKIDs resource */ + "tdx", +#endif }; /* Root misc cgroup */ @@ -68,22 +72,6 @@ static inline bool valid_type(enum misc_res_type type) } /** - * misc_cg_res_total_usage() - Get the current total usage of the resource. - * @type: misc res type. - * - * Context: Any context. - * Return: Current total usage of the resource. - */ -u64 misc_cg_res_total_usage(enum misc_res_type type) -{ - if (valid_type(type)) - return atomic64_read(&root_cg.res[type].usage); - - return 0; -} -EXPORT_SYMBOL_GPL(misc_cg_res_total_usage); - -/** * misc_cg_set_capacity() - Set the capacity of the misc cgroup res. * @type: Type of the misc res. * @capacity: Supported capacity of the misc res on the host. diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index aac91466279f..cbeaa499a96a 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -9,18 +9,64 @@ #include <trace/events/cgroup.h> -static DEFINE_SPINLOCK(cgroup_rstat_lock); -static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); +static DEFINE_SPINLOCK(rstat_base_lock); +static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock); static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); -static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) +/* + * Determines whether a given css can participate in rstat. + * css's that are cgroup::self use rstat for base stats. + * Other css's associated with a subsystem use rstat only when + * they define the ss->css_rstat_flush callback. + */ +static inline bool css_uses_rstat(struct cgroup_subsys_state *css) +{ + return css_is_self(css) || css->ss->css_rstat_flush != NULL; +} + +static struct css_rstat_cpu *css_rstat_cpu( + struct cgroup_subsys_state *css, int cpu) +{ + return per_cpu_ptr(css->rstat_cpu, cpu); +} + +static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu( + struct cgroup *cgrp, int cpu) +{ + return per_cpu_ptr(cgrp->rstat_base_cpu, cpu); +} + +static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss) { - return per_cpu_ptr(cgrp->rstat_cpu, cpu); + if (ss) + return &ss->rstat_ss_lock; + + return &rstat_base_lock; +} + +static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu) +{ + if (ss) { + /* + * Depending on config, the subsystem per-cpu lock type may be an + * empty struct. In enviromnents where this is the case, allocation + * of this field is not performed in ss_rstat_init(). Avoid a + * cpu-based offset relative to NULL by returning early. When the + * lock type is zero in size, the corresponding lock functions are + * no-ops so passing them NULL is acceptable. + */ + if (sizeof(*ss->rstat_ss_cpu_lock) == 0) + return NULL; + + return per_cpu_ptr(ss->rstat_ss_cpu_lock, cpu); + } + + return per_cpu_ptr(&rstat_base_cpu_lock, cpu); } /* - * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock). + * Helper functions for rstat per CPU locks. * * This makes it easier to diagnose locking issues and contention in * production environments. The parameter @fast_path determine the @@ -28,20 +74,23 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) * operations without handling high-frequency fast-path "update" events. */ static __always_inline -unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu, - struct cgroup *cgrp, const bool fast_path) +unsigned long _css_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu, + const bool fast_path) { + struct cgroup *cgrp = css->cgroup; + raw_spinlock_t *cpu_lock; unsigned long flags; bool contended; /* - * The _irqsave() is needed because cgroup_rstat_lock is - * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring - * this lock with the _irq() suffix only disables interrupts on - * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables - * interrupts on both configurations. The _irqsave() ensures - * that interrupts are always disabled and later restored. + * The _irqsave() is needed because the locks used for flushing are + * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock + * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT + * kernel. The raw_spinlock_t below disables interrupts on both + * configurations. The _irqsave() ensures that interrupts are always + * disabled and later restored. */ + cpu_lock = ss_rstat_cpu_lock(css->ss, cpu); contended = !raw_spin_trylock_irqsave(cpu_lock, flags); if (contended) { if (fast_path) @@ -61,50 +110,59 @@ unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu, } static __always_inline -void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu, - struct cgroup *cgrp, unsigned long flags, - const bool fast_path) +void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu, + unsigned long flags, const bool fast_path) { + struct cgroup *cgrp = css->cgroup; + raw_spinlock_t *cpu_lock; + if (fast_path) trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false); else trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false); + cpu_lock = ss_rstat_cpu_lock(css->ss, cpu); raw_spin_unlock_irqrestore(cpu_lock, flags); } /** - * cgroup_rstat_updated - keep track of updated rstat_cpu - * @cgrp: target cgroup + * css_rstat_updated - keep track of updated rstat_cpu + * @css: target cgroup subsystem state * @cpu: cpu on which rstat_cpu was updated * - * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching - * rstat_cpu->updated_children list. See the comment on top of - * cgroup_rstat_cpu definition for details. + * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching + * rstat_cpu->updated_children list. See the comment on top of + * css_rstat_cpu definition for details. */ -__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) +__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) { - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); unsigned long flags; /* + * Since bpf programs can call this function, prevent access to + * uninitialized rstat pointers. + */ + if (!css_uses_rstat(css)) + return; + + /* * Speculative already-on-list test. This may race leading to * temporary inaccuracies, which is fine. * * Because @parent's updated_children is terminated with @parent - * instead of NULL, we can tell whether @cgrp is on the list by + * instead of NULL, we can tell whether @css is on the list by * testing the next pointer for NULL. */ - if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next)) + if (data_race(css_rstat_cpu(css, cpu)->updated_next)) return; - flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true); + flags = _css_rstat_cpu_lock(css, cpu, true); - /* put @cgrp and all ancestors on the corresponding updated lists */ + /* put @css and all ancestors on the corresponding updated lists */ while (true) { - struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); - struct cgroup *parent = cgroup_parent(cgrp); - struct cgroup_rstat_cpu *prstatc; + struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); + struct cgroup_subsys_state *parent = css->parent; + struct css_rstat_cpu *prstatc; /* * Both additions and removals are bottom-up. If a cgroup @@ -115,53 +173,78 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) /* Root has no parent to link it to, but mark it busy */ if (!parent) { - rstatc->updated_next = cgrp; + rstatc->updated_next = css; break; } - prstatc = cgroup_rstat_cpu(parent, cpu); + prstatc = css_rstat_cpu(parent, cpu); rstatc->updated_next = prstatc->updated_children; - prstatc->updated_children = cgrp; + prstatc->updated_children = css; - cgrp = parent; + css = parent; } - _cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true); + _css_rstat_cpu_unlock(css, cpu, flags, true); } /** - * cgroup_rstat_push_children - push children cgroups into the given list + * css_rstat_push_children - push children css's into the given list * @head: current head of the list (= subtree root) * @child: first child of the root * @cpu: target cpu - * Return: A new singly linked list of cgroups to be flush + * Return: A new singly linked list of css's to be flushed * - * Iteratively traverse down the cgroup_rstat_cpu updated tree level by + * Iteratively traverse down the css_rstat_cpu updated tree level by * level and push all the parents first before their next level children - * into a singly linked list built from the tail backward like "pushing" - * cgroups into a stack. The root is pushed by the caller. + * into a singly linked list via the rstat_flush_next pointer built from the + * tail backward like "pushing" css's into a stack. The root is pushed by + * the caller. */ -static struct cgroup *cgroup_rstat_push_children(struct cgroup *head, - struct cgroup *child, int cpu) +static struct cgroup_subsys_state *css_rstat_push_children( + struct cgroup_subsys_state *head, + struct cgroup_subsys_state *child, int cpu) { - struct cgroup *chead = child; /* Head of child cgroup level */ - struct cgroup *ghead = NULL; /* Head of grandchild cgroup level */ - struct cgroup *parent, *grandchild; - struct cgroup_rstat_cpu *crstatc; + struct cgroup_subsys_state *cnext = child; /* Next head of child css level */ + struct cgroup_subsys_state *ghead = NULL; /* Head of grandchild css level */ + struct cgroup_subsys_state *parent, *grandchild; + struct css_rstat_cpu *crstatc; child->rstat_flush_next = NULL; + /* + * The subsystem rstat lock must be held for the whole duration from + * here as the rstat_flush_next list is being constructed to when + * it is consumed later in css_rstat_flush(). + */ + lockdep_assert_held(ss_rstat_lock(head->ss)); + + /* + * Notation: -> updated_next pointer + * => rstat_flush_next pointer + * + * Assuming the following sample updated_children lists: + * P: C1 -> C2 -> P + * C1: G11 -> G12 -> C1 + * C2: G21 -> G22 -> C2 + * + * After 1st iteration: + * head => C2 => C1 => NULL + * ghead => G21 => G11 => NULL + * + * After 2nd iteration: + * head => G12 => G11 => G22 => G21 => C2 => C1 => NULL + */ next_level: - while (chead) { - child = chead; - chead = child->rstat_flush_next; - parent = cgroup_parent(child); + while (cnext) { + child = cnext; + cnext = child->rstat_flush_next; + parent = child->parent; - /* updated_next is parent cgroup terminated */ + /* updated_next is parent cgroup terminated if !NULL */ while (child != parent) { child->rstat_flush_next = head; head = child; - crstatc = cgroup_rstat_cpu(child, cpu); + crstatc = css_rstat_cpu(child, cpu); grandchild = crstatc->updated_children; if (grandchild != child) { /* Push the grand child to the next level */ @@ -175,7 +258,7 @@ next_level: } if (ghead) { - chead = ghead; + cnext = ghead; ghead = NULL; goto next_level; } @@ -183,31 +266,31 @@ next_level: } /** - * cgroup_rstat_updated_list - return a list of updated cgroups to be flushed - * @root: root of the cgroup subtree to traverse + * css_rstat_updated_list - build a list of updated css's to be flushed + * @root: root of the css subtree to traverse * @cpu: target cpu - * Return: A singly linked list of cgroups to be flushed + * Return: A singly linked list of css's to be flushed * * Walks the updated rstat_cpu tree on @cpu from @root. During traversal, - * each returned cgroup is unlinked from the updated tree. + * each returned css is unlinked from the updated tree. * * The only ordering guarantee is that, for a parent and a child pair * covered by a given traversal, the child is before its parent in * the list. * * Note that updated_children is self terminated and points to a list of - * child cgroups if not empty. Whereas updated_next is like a sibling link - * within the children list and terminated by the parent cgroup. An exception - * here is the cgroup root whose updated_next can be self terminated. + * child css's if not empty. Whereas updated_next is like a sibling link + * within the children list and terminated by the parent css. An exception + * here is the css root whose updated_next can be self terminated. */ -static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) +static struct cgroup_subsys_state *css_rstat_updated_list( + struct cgroup_subsys_state *root, int cpu) { - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); - struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu); - struct cgroup *head = NULL, *parent, *child; + struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu); + struct cgroup_subsys_state *head = NULL, *parent, *child; unsigned long flags; - flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false); + flags = _css_rstat_cpu_lock(root, cpu, false); /* Return NULL if this subtree is not on-list */ if (!rstatc->updated_next) @@ -217,17 +300,17 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) * Unlink @root from its parent. As the updated_children list is * singly linked, we have to walk it to find the removal point. */ - parent = cgroup_parent(root); + parent = root->parent; if (parent) { - struct cgroup_rstat_cpu *prstatc; - struct cgroup **nextp; + struct css_rstat_cpu *prstatc; + struct cgroup_subsys_state **nextp; - prstatc = cgroup_rstat_cpu(parent, cpu); + prstatc = css_rstat_cpu(parent, cpu); nextp = &prstatc->updated_children; while (*nextp != root) { - struct cgroup_rstat_cpu *nrstatc; + struct css_rstat_cpu *nrstatc; - nrstatc = cgroup_rstat_cpu(*nextp, cpu); + nrstatc = css_rstat_cpu(*nextp, cpu); WARN_ON_ONCE(*nextp == parent); nextp = &nrstatc->updated_next; } @@ -242,16 +325,16 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) child = rstatc->updated_children; rstatc->updated_children = root; if (child != root) - head = cgroup_rstat_push_children(head, child, cpu); + head = css_rstat_push_children(head, child, cpu); unlock_ret: - _cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false); + _css_rstat_cpu_unlock(root, cpu, flags, false); return head; } /* * A hook for bpf stat collectors to attach to and flush their stats. - * Together with providing bpf kfuncs for cgroup_rstat_updated() and - * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that + * Together with providing bpf kfuncs for css_rstat_updated() and + * css_rstat_flush(), this enables a complete workflow where bpf progs that * collect cgroup stats can integrate with rstat for efficient flushing. * * A static noinline declaration here could cause the compiler to optimize away @@ -271,7 +354,7 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp, __bpf_hook_end(); /* - * Helper functions for locking cgroup_rstat_lock. + * Helper functions for locking. * * This makes it easier to diagnose locking issues and contention in * production environments. The parameter @cpu_in_loop indicate lock @@ -279,156 +362,181 @@ __bpf_hook_end(); * value -1 is used when obtaining the main lock else this is the CPU * number processed last. */ -static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop) - __acquires(&cgroup_rstat_lock) +static inline void __css_rstat_lock(struct cgroup_subsys_state *css, + int cpu_in_loop) + __acquires(ss_rstat_lock(css->ss)) { + struct cgroup *cgrp = css->cgroup; + spinlock_t *lock; bool contended; - contended = !spin_trylock_irq(&cgroup_rstat_lock); + lock = ss_rstat_lock(css->ss); + contended = !spin_trylock_irq(lock); if (contended) { trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended); - spin_lock_irq(&cgroup_rstat_lock); + spin_lock_irq(lock); } trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended); } -static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop) - __releases(&cgroup_rstat_lock) -{ - trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false); - spin_unlock_irq(&cgroup_rstat_lock); -} - -/* see cgroup_rstat_flush() */ -static void cgroup_rstat_flush_locked(struct cgroup *cgrp) - __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) +static inline void __css_rstat_unlock(struct cgroup_subsys_state *css, + int cpu_in_loop) + __releases(ss_rstat_lock(css->ss)) { - int cpu; - - lockdep_assert_held(&cgroup_rstat_lock); - - for_each_possible_cpu(cpu) { - struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu); - - for (; pos; pos = pos->rstat_flush_next) { - struct cgroup_subsys_state *css; - - cgroup_base_stat_flush(pos, cpu); - bpf_rstat_flush(pos, cgroup_parent(pos), cpu); - - rcu_read_lock(); - list_for_each_entry_rcu(css, &pos->rstat_css_list, - rstat_css_node) - css->ss->css_rstat_flush(css, cpu); - rcu_read_unlock(); - } + struct cgroup *cgrp = css->cgroup; + spinlock_t *lock; - /* play nice and yield if necessary */ - if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) { - __cgroup_rstat_unlock(cgrp, cpu); - if (!cond_resched()) - cpu_relax(); - __cgroup_rstat_lock(cgrp, cpu); - } - } + lock = ss_rstat_lock(css->ss); + trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false); + spin_unlock_irq(lock); } /** - * cgroup_rstat_flush - flush stats in @cgrp's subtree - * @cgrp: target cgroup + * css_rstat_flush - flush stats in @css's rstat subtree + * @css: target cgroup subsystem state * - * Collect all per-cpu stats in @cgrp's subtree into the global counters - * and propagate them upwards. After this function returns, all cgroups in - * the subtree have up-to-date ->stat. + * Collect all per-cpu stats in @css's subtree into the global counters + * and propagate them upwards. After this function returns, all rstat + * nodes in the subtree have up-to-date ->stat. * - * This also gets all cgroups in the subtree including @cgrp off the + * This also gets all rstat nodes in the subtree including @css off the * ->updated_children lists. * * This function may block. */ -__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) +__bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css) { - might_sleep(); + int cpu; + bool is_self = css_is_self(css); - __cgroup_rstat_lock(cgrp, -1); - cgroup_rstat_flush_locked(cgrp); - __cgroup_rstat_unlock(cgrp, -1); -} + /* + * Since bpf programs can call this function, prevent access to + * uninitialized rstat pointers. + */ + if (!css_uses_rstat(css)) + return; -/** - * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold - * @cgrp: target cgroup - * - * Flush stats in @cgrp's subtree and prevent further flushes. Must be - * paired with cgroup_rstat_flush_release(). - * - * This function may block. - */ -void cgroup_rstat_flush_hold(struct cgroup *cgrp) - __acquires(&cgroup_rstat_lock) -{ might_sleep(); - __cgroup_rstat_lock(cgrp, -1); - cgroup_rstat_flush_locked(cgrp); -} + for_each_possible_cpu(cpu) { + struct cgroup_subsys_state *pos; -/** - * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() - * @cgrp: cgroup used by tracepoint - */ -void cgroup_rstat_flush_release(struct cgroup *cgrp) - __releases(&cgroup_rstat_lock) -{ - __cgroup_rstat_unlock(cgrp, -1); + /* Reacquire for each CPU to avoid disabling IRQs too long */ + __css_rstat_lock(css, cpu); + pos = css_rstat_updated_list(css, cpu); + for (; pos; pos = pos->rstat_flush_next) { + if (is_self) { + cgroup_base_stat_flush(pos->cgroup, cpu); + bpf_rstat_flush(pos->cgroup, + cgroup_parent(pos->cgroup), cpu); + } else + pos->ss->css_rstat_flush(pos, cpu); + } + __css_rstat_unlock(css, cpu); + if (!cond_resched()) + cpu_relax(); + } } -int cgroup_rstat_init(struct cgroup *cgrp) +int css_rstat_init(struct cgroup_subsys_state *css) { + struct cgroup *cgrp = css->cgroup; int cpu; + bool is_self = css_is_self(css); + + if (is_self) { + /* the root cgrp has rstat_base_cpu preallocated */ + if (!cgrp->rstat_base_cpu) { + cgrp->rstat_base_cpu = alloc_percpu(struct cgroup_rstat_base_cpu); + if (!cgrp->rstat_base_cpu) + return -ENOMEM; + } + } else if (css->ss->css_rstat_flush == NULL) + return 0; + + /* the root cgrp's self css has rstat_cpu preallocated */ + if (!css->rstat_cpu) { + css->rstat_cpu = alloc_percpu(struct css_rstat_cpu); + if (!css->rstat_cpu) { + if (is_self) + free_percpu(cgrp->rstat_base_cpu); - /* the root cgrp has rstat_cpu preallocated */ - if (!cgrp->rstat_cpu) { - cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu); - if (!cgrp->rstat_cpu) return -ENOMEM; + } } /* ->updated_children list is self terminated */ for_each_possible_cpu(cpu) { - struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); + + rstatc->updated_children = css; + + if (is_self) { + struct cgroup_rstat_base_cpu *rstatbc; - rstatc->updated_children = cgrp; - u64_stats_init(&rstatc->bsync); + rstatbc = cgroup_rstat_base_cpu(cgrp, cpu); + u64_stats_init(&rstatbc->bsync); + } } return 0; } -void cgroup_rstat_exit(struct cgroup *cgrp) +void css_rstat_exit(struct cgroup_subsys_state *css) { int cpu; - cgroup_rstat_flush(cgrp); + if (!css_uses_rstat(css)) + return; + + css_rstat_flush(css); /* sanity check */ for_each_possible_cpu(cpu) { - struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); - if (WARN_ON_ONCE(rstatc->updated_children != cgrp) || + if (WARN_ON_ONCE(rstatc->updated_children != css) || WARN_ON_ONCE(rstatc->updated_next)) return; } - free_percpu(cgrp->rstat_cpu); - cgrp->rstat_cpu = NULL; + if (css_is_self(css)) { + struct cgroup *cgrp = css->cgroup; + + free_percpu(cgrp->rstat_base_cpu); + cgrp->rstat_base_cpu = NULL; + } + + free_percpu(css->rstat_cpu); + css->rstat_cpu = NULL; } -void __init cgroup_rstat_boot(void) +/** + * ss_rstat_init - subsystem-specific rstat initialization + * @ss: target subsystem + * + * If @ss is NULL, the static locks associated with the base stats + * are initialized. If @ss is non-NULL, the subsystem-specific locks + * are initialized. + */ +int __init ss_rstat_init(struct cgroup_subsys *ss) { int cpu; + /* + * Depending on config, the subsystem per-cpu lock type may be an empty + * struct. Avoid allocating a size of zero in this case. + */ + if (ss && sizeof(*ss->rstat_ss_cpu_lock)) { + ss->rstat_ss_cpu_lock = alloc_percpu(raw_spinlock_t); + if (!ss->rstat_ss_cpu_lock) + return -ENOMEM; + } + + spin_lock_init(ss_rstat_lock(ss)); for_each_possible_cpu(cpu) - raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu)); + raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu)); + + return 0; } /* @@ -461,9 +569,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { - struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct cgroup_rstat_base_cpu *rstatbc = cgroup_rstat_base_cpu(cgrp, cpu); struct cgroup *parent = cgroup_parent(cgrp); - struct cgroup_rstat_cpu *prstatc; + struct cgroup_rstat_base_cpu *prstatbc; struct cgroup_base_stat delta; unsigned seq; @@ -473,15 +581,15 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) /* fetch the current per-cpu values */ do { - seq = __u64_stats_fetch_begin(&rstatc->bsync); - delta = rstatc->bstat; - } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); + seq = __u64_stats_fetch_begin(&rstatbc->bsync); + delta = rstatbc->bstat; + } while (__u64_stats_fetch_retry(&rstatbc->bsync, seq)); /* propagate per-cpu delta to cgroup and per-cpu global statistics */ - cgroup_base_stat_sub(&delta, &rstatc->last_bstat); + cgroup_base_stat_sub(&delta, &rstatbc->last_bstat); cgroup_base_stat_add(&cgrp->bstat, &delta); - cgroup_base_stat_add(&rstatc->last_bstat, &delta); - cgroup_base_stat_add(&rstatc->subtree_bstat, &delta); + cgroup_base_stat_add(&rstatbc->last_bstat, &delta); + cgroup_base_stat_add(&rstatbc->subtree_bstat, &delta); /* propagate cgroup and per-cpu global delta to parent (unless that's root) */ if (cgroup_parent(parent)) { @@ -490,73 +598,73 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) cgroup_base_stat_add(&parent->bstat, &delta); cgroup_base_stat_add(&cgrp->last_bstat, &delta); - delta = rstatc->subtree_bstat; - prstatc = cgroup_rstat_cpu(parent, cpu); - cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat); - cgroup_base_stat_add(&prstatc->subtree_bstat, &delta); - cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta); + delta = rstatbc->subtree_bstat; + prstatbc = cgroup_rstat_base_cpu(parent, cpu); + cgroup_base_stat_sub(&delta, &rstatbc->last_subtree_bstat); + cgroup_base_stat_add(&prstatbc->subtree_bstat, &delta); + cgroup_base_stat_add(&rstatbc->last_subtree_bstat, &delta); } } -static struct cgroup_rstat_cpu * +static struct cgroup_rstat_base_cpu * cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags) { - struct cgroup_rstat_cpu *rstatc; + struct cgroup_rstat_base_cpu *rstatbc; - rstatc = get_cpu_ptr(cgrp->rstat_cpu); - *flags = u64_stats_update_begin_irqsave(&rstatc->bsync); - return rstatc; + rstatbc = get_cpu_ptr(cgrp->rstat_base_cpu); + *flags = u64_stats_update_begin_irqsave(&rstatbc->bsync); + return rstatbc; } static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, - struct cgroup_rstat_cpu *rstatc, + struct cgroup_rstat_base_cpu *rstatbc, unsigned long flags) { - u64_stats_update_end_irqrestore(&rstatc->bsync, flags); - cgroup_rstat_updated(cgrp, smp_processor_id()); - put_cpu_ptr(rstatc); + u64_stats_update_end_irqrestore(&rstatbc->bsync, flags); + css_rstat_updated(&cgrp->self, smp_processor_id()); + put_cpu_ptr(rstatbc); } void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) { - struct cgroup_rstat_cpu *rstatc; + struct cgroup_rstat_base_cpu *rstatbc; unsigned long flags; - rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); - rstatc->bstat.cputime.sum_exec_runtime += delta_exec; - cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags); + rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); + rstatbc->bstat.cputime.sum_exec_runtime += delta_exec; + cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags); } void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec) { - struct cgroup_rstat_cpu *rstatc; + struct cgroup_rstat_base_cpu *rstatbc; unsigned long flags; - rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); + rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); switch (index) { case CPUTIME_NICE: - rstatc->bstat.ntime += delta_exec; + rstatbc->bstat.ntime += delta_exec; fallthrough; case CPUTIME_USER: - rstatc->bstat.cputime.utime += delta_exec; + rstatbc->bstat.cputime.utime += delta_exec; break; case CPUTIME_SYSTEM: case CPUTIME_IRQ: case CPUTIME_SOFTIRQ: - rstatc->bstat.cputime.stime += delta_exec; + rstatbc->bstat.cputime.stime += delta_exec; break; #ifdef CONFIG_SCHED_CORE case CPUTIME_FORCEIDLE: - rstatc->bstat.forceidle_sum += delta_exec; + rstatbc->bstat.forceidle_sum += delta_exec; break; #endif default: break; } - cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags); + cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags); } /* @@ -612,42 +720,40 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat void cgroup_base_stat_cputime_show(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; - u64 usage, utime, stime, ntime; + struct cgroup_base_stat bstat; if (cgroup_parent(cgrp)) { - cgroup_rstat_flush_hold(cgrp); - usage = cgrp->bstat.cputime.sum_exec_runtime; + css_rstat_flush(&cgrp->self); + __css_rstat_lock(&cgrp->self, -1); + bstat = cgrp->bstat; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, - &utime, &stime); - ntime = cgrp->bstat.ntime; - cgroup_rstat_flush_release(cgrp); + &bstat.cputime.utime, &bstat.cputime.stime); + __css_rstat_unlock(&cgrp->self, -1); } else { - /* cgrp->bstat of root is not actually used, reuse it */ - root_cgroup_cputime(&cgrp->bstat); - usage = cgrp->bstat.cputime.sum_exec_runtime; - utime = cgrp->bstat.cputime.utime; - stime = cgrp->bstat.cputime.stime; - ntime = cgrp->bstat.ntime; + root_cgroup_cputime(&bstat); } - do_div(usage, NSEC_PER_USEC); - do_div(utime, NSEC_PER_USEC); - do_div(stime, NSEC_PER_USEC); - do_div(ntime, NSEC_PER_USEC); + do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC); + do_div(bstat.cputime.utime, NSEC_PER_USEC); + do_div(bstat.cputime.stime, NSEC_PER_USEC); + do_div(bstat.ntime, NSEC_PER_USEC); seq_printf(seq, "usage_usec %llu\n" "user_usec %llu\n" "system_usec %llu\n" "nice_usec %llu\n", - usage, utime, stime, ntime); + bstat.cputime.sum_exec_runtime, + bstat.cputime.utime, + bstat.cputime.stime, + bstat.ntime); - cgroup_force_idle_show(seq, &cgrp->bstat); + cgroup_force_idle_show(seq, &bstat); } -/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */ +/* Add bpf kfuncs for css_rstat_updated() and css_rstat_flush() */ BTF_KFUNCS_START(bpf_rstat_kfunc_ids) -BTF_ID_FLAGS(func, cgroup_rstat_updated) -BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE) +BTF_ID_FLAGS(func, css_rstat_updated) +BTF_ID_FLAGS(func, css_rstat_flush, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_rstat_kfunc_ids) static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = { diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index 20552f163930..e81327d2cd63 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -73,7 +73,6 @@ CONFIG_DEBUG_VM=y CONFIG_DEBUG_VM_PGFLAGS=y CONFIG_DEBUG_VM_RB=y CONFIG_DEBUG_VM_VMACACHE=y -CONFIG_GENERIC_PTDUMP=y CONFIG_KASAN=y CONFIG_KASAN_GENERIC=y CONFIG_KASAN_INLINE=y @@ -113,3 +112,8 @@ CONFIG_BRANCH_PROFILE_NONE=y CONFIG_DYNAMIC_FTRACE=y CONFIG_FTRACE=y CONFIG_FUNCTION_TRACER=y +# +# Preemption +# +CONFIG_DEBUG_PREEMPT=y +CONFIG_PREEMPT=y diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config index 3fabb8f55ef6..dd7c32fb5ac1 100644 --- a/kernel/configs/hardening.config +++ b/kernel/configs/hardening.config @@ -46,7 +46,7 @@ CONFIG_UBSAN_BOUNDS=y # CONFIG_UBSAN_SHIFT is not set # CONFIG_UBSAN_DIV_ZERO is not set # CONFIG_UBSAN_UNREACHABLE is not set -# CONFIG_UBSAN_SIGNED_WRAP is not set +# CONFIG_UBSAN_INTEGER_WRAP is not set # CONFIG_UBSAN_BOOL is not set # CONFIG_UBSAN_ENUM is not set # CONFIG_UBSAN_ALIGNMENT is not set diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index b753695c5a8f..5dd0f0a34a73 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -2,3 +2,4 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_KERNEL_XZ=y CONFIG_SLUB=y CONFIG_SLUB_TINY=y +CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config index 6878b9a49be8..1875a0a5047a 100644 --- a/kernel/configs/xen.config +++ b/kernel/configs/xen.config @@ -13,6 +13,8 @@ CONFIG_SCSI=y CONFIG_FB=y CONFIG_INPUT_MISC=y CONFIG_MEMORY_HOTPLUG=y +CONFIG_MEMORY_HOTREMOVE=y +CONFIG_ZONE_DEVICE=y CONFIG_TTY=y # Technically not required but otherwise produces # pretty useless systems starting from allnoconfig @@ -47,3 +49,4 @@ CONFIG_XEN_GNTDEV=m CONFIG_XEN_GRANT_DEV_ALLOC=m CONFIG_SWIOTLB_XEN=y CONFIG_XEN_PRIVCMD=m +CONFIG_XEN_UNPOPULATED_ALLOC=y diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 938c48952d26..fb5be6e9b423 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -80,17 +80,16 @@ static __always_inline void rcu_task_trace_heavyweight_exit(void) */ static noinstr void ct_kernel_exit_state(int offset) { - int seq; - /* * CPUs seeing atomic_add_return() must see prior RCU read-side * critical sections, and we also must force ordering with the * next idle sojourn. */ rcu_task_trace_heavyweight_enter(); // Before CT state update! - seq = ct_state_inc(offset); - // RCU is no longer watching. Better be in extended quiescent state! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & CT_RCU_WATCHING)); + // RCU is still watching. Better not be in extended quiescent state! + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !rcu_is_watching_curr_cpu()); + (void)ct_state_inc(offset); + // RCU is no longer watching. } /* diff --git a/kernel/cpu.c b/kernel/cpu.c index 07455d25329c..a59e009e0be4 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -526,6 +526,7 @@ void lockdep_assert_cpus_held(void) percpu_rwsem_assert_held(&cpu_hotplug_lock); } +EXPORT_SYMBOL_GPL(lockdep_assert_cpus_held); #ifdef CONFIG_LOCKDEP int lockdep_is_cpus_held(void) @@ -1453,11 +1454,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, out: cpus_write_unlock(); - /* - * Do post unplug cleanup. This is still protected against - * concurrent CPU hotplug via cpu_add_remove_lock. - */ - lockup_detector_cleanup(); arch_smt_update(); return ret; } @@ -2073,11 +2069,6 @@ static struct cpuhp_step cpuhp_hp_states[] = { .teardown.single = NULL, .cant_stop = true, }, - [CPUHP_PERF_PREPARE] = { - .name = "perf:prepare", - .startup.single = perf_event_init_cpu, - .teardown.single = perf_event_exit_cpu, - }, [CPUHP_RANDOM_PREPARE] = { .name = "random:prepare", .startup.single = random_prepare_cpu, diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 078fe5bc5a74..335b8425dd4b 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -436,7 +436,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) memset(&prstatus, 0, sizeof(prstatus)); prstatus.common.pr_pid = current->pid; elf_core_copy_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, + buf = append_elf_note(buf, NN_PRSTATUS, NT_PRSTATUS, &prstatus, sizeof(prstatus)); final_note(buf); } diff --git a/kernel/crash_dump_dm_crypt.c b/kernel/crash_dump_dm_crypt.c new file mode 100644 index 000000000000..401423ba477d --- /dev/null +++ b/kernel/crash_dump_dm_crypt.c @@ -0,0 +1,464 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/key.h> +#include <linux/keyctl.h> +#include <keys/user-type.h> +#include <linux/crash_dump.h> +#include <linux/cc_platform.h> +#include <linux/configfs.h> +#include <linux/module.h> + +#define KEY_NUM_MAX 128 /* maximum dm crypt keys */ +#define KEY_SIZE_MAX 256 /* maximum dm crypt key size */ +#define KEY_DESC_MAX_LEN 128 /* maximum dm crypt key description size */ + +static unsigned int key_count; + +struct dm_crypt_key { + unsigned int key_size; + char key_desc[KEY_DESC_MAX_LEN]; + u8 data[KEY_SIZE_MAX]; +}; + +static struct keys_header { + unsigned int total_keys; + struct dm_crypt_key keys[] __counted_by(total_keys); +} *keys_header; + +static size_t get_keys_header_size(size_t total_keys) +{ + return struct_size(keys_header, keys, total_keys); +} + +unsigned long long dm_crypt_keys_addr; +EXPORT_SYMBOL_GPL(dm_crypt_keys_addr); + +static int __init setup_dmcryptkeys(char *arg) +{ + char *end; + + if (!arg) + return -EINVAL; + dm_crypt_keys_addr = memparse(arg, &end); + if (end > arg) + return 0; + + dm_crypt_keys_addr = 0; + return -EINVAL; +} + +early_param("dmcryptkeys", setup_dmcryptkeys); + +/* + * Architectures may override this function to read dm crypt keys + */ +ssize_t __weak dm_crypt_keys_read(char *buf, size_t count, u64 *ppos) +{ + struct kvec kvec = { .iov_base = buf, .iov_len = count }; + struct iov_iter iter; + + iov_iter_kvec(&iter, READ, &kvec, 1, count); + return read_from_oldmem(&iter, count, ppos, cc_platform_has(CC_ATTR_MEM_ENCRYPT)); +} + +static int add_key_to_keyring(struct dm_crypt_key *dm_key, + key_ref_t keyring_ref) +{ + key_ref_t key_ref; + int r; + + /* create or update the requested key and add it to the target keyring */ + key_ref = key_create_or_update(keyring_ref, "user", dm_key->key_desc, + dm_key->data, dm_key->key_size, + KEY_USR_ALL, KEY_ALLOC_IN_QUOTA); + + if (!IS_ERR(key_ref)) { + r = key_ref_to_ptr(key_ref)->serial; + key_ref_put(key_ref); + kexec_dprintk("Success adding key %s", dm_key->key_desc); + } else { + r = PTR_ERR(key_ref); + kexec_dprintk("Error when adding key"); + } + + key_ref_put(keyring_ref); + return r; +} + +static void get_keys_from_kdump_reserved_memory(void) +{ + struct keys_header *keys_header_loaded; + + arch_kexec_unprotect_crashkres(); + + keys_header_loaded = kmap_local_page(pfn_to_page( + kexec_crash_image->dm_crypt_keys_addr >> PAGE_SHIFT)); + + memcpy(keys_header, keys_header_loaded, get_keys_header_size(key_count)); + kunmap_local(keys_header_loaded); + arch_kexec_protect_crashkres(); +} + +static int restore_dm_crypt_keys_to_thread_keyring(void) +{ + struct dm_crypt_key *key; + size_t keys_header_size; + key_ref_t keyring_ref; + u64 addr; + + /* find the target keyring (which must be writable) */ + keyring_ref = + lookup_user_key(KEY_SPEC_USER_KEYRING, 0x01, KEY_NEED_WRITE); + if (IS_ERR(keyring_ref)) { + kexec_dprintk("Failed to get the user keyring\n"); + return PTR_ERR(keyring_ref); + } + + addr = dm_crypt_keys_addr; + dm_crypt_keys_read((char *)&key_count, sizeof(key_count), &addr); + if (key_count < 0 || key_count > KEY_NUM_MAX) { + kexec_dprintk("Failed to read the number of dm-crypt keys\n"); + return -1; + } + + kexec_dprintk("There are %u keys\n", key_count); + addr = dm_crypt_keys_addr; + + keys_header_size = get_keys_header_size(key_count); + keys_header = kzalloc(keys_header_size, GFP_KERNEL); + if (!keys_header) + return -ENOMEM; + + dm_crypt_keys_read((char *)keys_header, keys_header_size, &addr); + + for (int i = 0; i < keys_header->total_keys; i++) { + key = &keys_header->keys[i]; + kexec_dprintk("Get key (size=%u)\n", key->key_size); + add_key_to_keyring(key, keyring_ref); + } + + return 0; +} + +static int read_key_from_user_keying(struct dm_crypt_key *dm_key) +{ + const struct user_key_payload *ukp; + struct key *key; + + kexec_dprintk("Requesting logon key %s", dm_key->key_desc); + key = request_key(&key_type_logon, dm_key->key_desc, NULL); + + if (IS_ERR(key)) { + pr_warn("No such logon key %s\n", dm_key->key_desc); + return PTR_ERR(key); + } + + ukp = user_key_payload_locked(key); + if (!ukp) + return -EKEYREVOKED; + + if (ukp->datalen > KEY_SIZE_MAX) { + pr_err("Key size %u exceeds maximum (%u)\n", ukp->datalen, KEY_SIZE_MAX); + return -EINVAL; + } + + memcpy(dm_key->data, ukp->data, ukp->datalen); + dm_key->key_size = ukp->datalen; + kexec_dprintk("Get dm crypt key (size=%u) %s: %8ph\n", dm_key->key_size, + dm_key->key_desc, dm_key->data); + return 0; +} + +struct config_key { + struct config_item item; + const char *description; +}; + +static inline struct config_key *to_config_key(struct config_item *item) +{ + return container_of(item, struct config_key, item); +} + +static ssize_t config_key_description_show(struct config_item *item, char *page) +{ + return sprintf(page, "%s\n", to_config_key(item)->description); +} + +static ssize_t config_key_description_store(struct config_item *item, + const char *page, size_t count) +{ + struct config_key *config_key = to_config_key(item); + size_t len; + int ret; + + ret = -EINVAL; + len = strcspn(page, "\n"); + + if (len > KEY_DESC_MAX_LEN) { + pr_err("The key description shouldn't exceed %u characters", KEY_DESC_MAX_LEN); + return ret; + } + + if (!len) + return ret; + + kfree(config_key->description); + ret = -ENOMEM; + config_key->description = kmemdup_nul(page, len, GFP_KERNEL); + if (!config_key->description) + return ret; + + return count; +} + +CONFIGFS_ATTR(config_key_, description); + +static struct configfs_attribute *config_key_attrs[] = { + &config_key_attr_description, + NULL, +}; + +static void config_key_release(struct config_item *item) +{ + kfree(to_config_key(item)); + key_count--; +} + +static struct configfs_item_operations config_key_item_ops = { + .release = config_key_release, +}; + +static const struct config_item_type config_key_type = { + .ct_item_ops = &config_key_item_ops, + .ct_attrs = config_key_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_item *config_keys_make_item(struct config_group *group, + const char *name) +{ + struct config_key *config_key; + + if (key_count > KEY_NUM_MAX) { + pr_err("Only %u keys at maximum to be created\n", KEY_NUM_MAX); + return ERR_PTR(-EINVAL); + } + + config_key = kzalloc(sizeof(struct config_key), GFP_KERNEL); + if (!config_key) + return ERR_PTR(-ENOMEM); + + config_item_init_type_name(&config_key->item, name, &config_key_type); + + key_count++; + + return &config_key->item; +} + +static ssize_t config_keys_count_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", key_count); +} + +CONFIGFS_ATTR_RO(config_keys_, count); + +static bool is_dm_key_reused; + +static ssize_t config_keys_reuse_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", is_dm_key_reused); +} + +static ssize_t config_keys_reuse_store(struct config_item *item, + const char *page, size_t count) +{ + if (!kexec_crash_image || !kexec_crash_image->dm_crypt_keys_addr) { + kexec_dprintk( + "dm-crypt keys haven't be saved to crash-reserved memory\n"); + return -EINVAL; + } + + if (kstrtobool(page, &is_dm_key_reused)) + return -EINVAL; + + if (is_dm_key_reused) + get_keys_from_kdump_reserved_memory(); + + return count; +} + +CONFIGFS_ATTR(config_keys_, reuse); + +static struct configfs_attribute *config_keys_attrs[] = { + &config_keys_attr_count, + &config_keys_attr_reuse, + NULL, +}; + +/* + * Note that, since no extra work is required on ->drop_item(), + * no ->drop_item() is provided. + */ +static struct configfs_group_operations config_keys_group_ops = { + .make_item = config_keys_make_item, +}; + +static const struct config_item_type config_keys_type = { + .ct_group_ops = &config_keys_group_ops, + .ct_attrs = config_keys_attrs, + .ct_owner = THIS_MODULE, +}; + +static bool restore; + +static ssize_t config_keys_restore_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", restore); +} + +static ssize_t config_keys_restore_store(struct config_item *item, + const char *page, size_t count) +{ + if (!restore) + restore_dm_crypt_keys_to_thread_keyring(); + + if (kstrtobool(page, &restore)) + return -EINVAL; + + return count; +} + +CONFIGFS_ATTR(config_keys_, restore); + +static struct configfs_attribute *kdump_config_keys_attrs[] = { + &config_keys_attr_restore, + NULL, +}; + +static const struct config_item_type kdump_config_keys_type = { + .ct_attrs = kdump_config_keys_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem config_keys_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "crash_dm_crypt_keys", + .ci_type = &config_keys_type, + }, + }, +}; + +static int build_keys_header(void) +{ + struct config_item *item = NULL; + struct config_key *key; + int i, r; + + if (keys_header != NULL) + kvfree(keys_header); + + keys_header = kzalloc(get_keys_header_size(key_count), GFP_KERNEL); + if (!keys_header) + return -ENOMEM; + + keys_header->total_keys = key_count; + + i = 0; + list_for_each_entry(item, &config_keys_subsys.su_group.cg_children, + ci_entry) { + if (item->ci_type != &config_key_type) + continue; + + key = to_config_key(item); + + if (!key->description) { + pr_warn("No key description for key %s\n", item->ci_name); + return -EINVAL; + } + + strscpy(keys_header->keys[i].key_desc, key->description, + KEY_DESC_MAX_LEN); + r = read_key_from_user_keying(&keys_header->keys[i]); + if (r != 0) { + kexec_dprintk("Failed to read key %s\n", + keys_header->keys[i].key_desc); + return r; + } + i++; + kexec_dprintk("Found key: %s\n", item->ci_name); + } + + return 0; +} + +int crash_load_dm_crypt_keys(struct kimage *image) +{ + struct kexec_buf kbuf = { + .image = image, + .buf_min = 0, + .buf_max = ULONG_MAX, + .top_down = false, + .random = true, + }; + int r; + + + if (key_count <= 0) { + kexec_dprintk("No dm-crypt keys\n"); + return -ENOENT; + } + + if (!is_dm_key_reused) { + image->dm_crypt_keys_addr = 0; + r = build_keys_header(); + if (r) + return r; + } + + kbuf.buffer = keys_header; + kbuf.bufsz = get_keys_header_size(key_count); + + kbuf.memsz = kbuf.bufsz; + kbuf.buf_align = ELF_CORE_HEADER_ALIGN; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + r = kexec_add_buffer(&kbuf); + if (r) { + kvfree((void *)kbuf.buffer); + return r; + } + image->dm_crypt_keys_addr = kbuf.mem; + image->dm_crypt_keys_sz = kbuf.bufsz; + kexec_dprintk( + "Loaded dm crypt keys to kexec_buffer bufsz=0x%lx memsz=0x%lx\n", + kbuf.bufsz, kbuf.memsz); + + return r; +} + +static int __init configfs_dmcrypt_keys_init(void) +{ + int ret; + + if (is_kdump_kernel()) { + config_keys_subsys.su_group.cg_item.ci_type = + &kdump_config_keys_type; + } + + config_group_init(&config_keys_subsys.su_group); + mutex_init(&config_keys_subsys.su_mutex); + ret = configfs_register_subsystem(&config_keys_subsys); + if (ret) { + pr_err("Error %d while registering subsystem %s\n", ret, + config_keys_subsys.su_group.cg_item.ci_namebuf); + goto out_unregister; + } + + return 0; + +out_unregister: + configfs_unregister_subsystem(&config_keys_subsys); + + return ret; +} + +module_init(configfs_dmcrypt_keys_init); diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index a620fb4b2116..acb6bf42e30d 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -131,7 +131,7 @@ static int __init parse_crashkernel_mem(char *cmdline, cur++; *crash_base = memparse(cur, &tmp); if (cur == tmp) { - pr_warn("crahskernel: Memory value expected after '@'\n"); + pr_warn("crashkernel: Memory value expected after '@'\n"); return -EINVAL; } } @@ -375,11 +375,10 @@ static int __init reserve_crashkernel_low(unsigned long long low_size) return 0; } -void __init reserve_crashkernel_generic(char *cmdline, - unsigned long long crash_size, - unsigned long long crash_base, - unsigned long long crash_low_size, - bool high) +void __init reserve_crashkernel_generic(unsigned long long crash_size, + unsigned long long crash_base, + unsigned long long crash_low_size, + bool high) { unsigned long long search_end = CRASH_ADDR_LOW_MAX, search_base = 0; bool fixed_base = false; diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index ce1bb2301c06..0b9495187fba 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -837,10 +837,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) { struct kgdb_state kgdb_var; struct kgdb_state *ks = &kgdb_var; - int ret = 0; - - if (arch_kgdb_ops.enable_nmi) - arch_kgdb_ops.enable_nmi(0); /* * Avoid entering the debugger if we were triggered due to an oops * but panic_timeout indicates the system should automatically @@ -858,15 +854,11 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) ks->linux_regs = regs; if (kgdb_reenter_check(ks)) - goto out; /* Ouch, double exception ! */ + return 0; /* Ouch, double exception ! */ if (kgdb_info[ks->cpu].enter_kgdb != 0) - goto out; + return 0; - ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); -out: - if (arch_kgdb_ops.enable_nmi) - arch_kgdb_ops.enable_nmi(1); - return ret; + return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); } NOKPROBE_SYMBOL(kgdb_handle_exception); diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 6a77f1c779c4..9b11b10b120c 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -334,7 +334,7 @@ poll_again: *cp = '\0'; p_tmp = strrchr(buffer, ' '); p_tmp = (p_tmp ? p_tmp + 1 : buffer); - strscpy(tmpbuffer, p_tmp, sizeof(tmpbuffer)); + strscpy(tmpbuffer, p_tmp); *cp = tmp; len = strlen(tmpbuffer); @@ -452,7 +452,7 @@ poll_again: char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt) { if (prompt && kdb_prompt_str != prompt) - strscpy(kdb_prompt_str, prompt, CMD_BUFLEN); + strscpy(kdb_prompt_str, prompt); kdb_printf("%s", kdb_prompt_str); kdb_nextline = 1; /* Prompt and input resets line number */ return kdb_read(buffer, bufsize); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 5f4be507d79f..7a4d2d4689a5 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -25,7 +25,6 @@ #include <linux/smp.h> #include <linux/utsname.h> #include <linux/vmalloc.h> -#include <linux/atomic.h> #include <linux/moduleparam.h> #include <linux/mm.h> #include <linux/init.h> @@ -105,7 +104,7 @@ static kdbmsg_t kdbmsgs[] = { KDBMSG(NOENVVALUE, "Environment variable should have value"), KDBMSG(NOTIMP, "Command not implemented"), KDBMSG(ENVFULL, "Environment full"), - KDBMSG(ENVBUFFULL, "Environment buffer full"), + KDBMSG(KMALLOCFAILED, "Failed to allocate memory"), KDBMSG(TOOMANYBPT, "Too many breakpoints defined"), #ifdef CONFIG_CPU_XSCALE KDBMSG(TOOMANYDBREGS, "More breakpoints than ibcr registers defined"), @@ -130,13 +129,9 @@ static const int __nkdb_err = ARRAY_SIZE(kdbmsgs); /* - * Initial environment. This is all kept static and local to - * this file. We don't want to rely on the memory allocation - * mechanisms in the kernel, so we use a very limited allocate-only - * heap for new and altered environment variables. The entire - * environment is limited to a fixed number of entries (add more - * to __env[] if required) and a fixed amount of heap (add more to - * KDB_ENVBUFSIZE if required). + * Initial environment. This is all kept static and local to this file. + * The entire environment is limited to a fixed number of entries + * (add more to __env[] if required) */ static char *__env[31] = { @@ -259,35 +254,6 @@ char *kdbgetenv(const char *match) } /* - * kdballocenv - This function is used to allocate bytes for - * environment entries. - * Parameters: - * bytes The number of bytes to allocate in the static buffer. - * Returns: - * A pointer to the allocated space in the buffer on success. - * NULL if bytes > size available in the envbuffer. - * Remarks: - * We use a static environment buffer (envbuffer) to hold the values - * of dynamically generated environment variables (see kdb_set). Buffer - * space once allocated is never free'd, so over time, the amount of space - * (currently 512 bytes) will be exhausted if env variables are changed - * frequently. - */ -static char *kdballocenv(size_t bytes) -{ -#define KDB_ENVBUFSIZE 512 - static char envbuffer[KDB_ENVBUFSIZE]; - static int envbufsize; - char *ep = NULL; - - if ((KDB_ENVBUFSIZE - envbufsize) >= bytes) { - ep = &envbuffer[envbufsize]; - envbufsize += bytes; - } - return ep; -} - -/* * kdbgetulenv - This function will return the value of an unsigned * long-valued environment variable. * Parameters: @@ -348,9 +314,9 @@ static int kdb_setenv(const char *var, const char *val) varlen = strlen(var); vallen = strlen(val); - ep = kdballocenv(varlen + vallen + 2); - if (ep == (char *)0) - return KDB_ENVBUFFULL; + ep = kmalloc(varlen + vallen + 2, GFP_KDB); + if (!ep) + return KDB_KMALLOCFAILED; sprintf(ep, "%s=%s", var, val); @@ -359,6 +325,7 @@ static int kdb_setenv(const char *var, const char *val) && ((strncmp(__env[i], var, varlen) == 0) && ((__env[i][varlen] == '\0') || (__env[i][varlen] == '=')))) { + kfree_const(__env[i]); __env[i] = ep; return 0; } @@ -2119,32 +2086,6 @@ static int kdb_dmesg(int argc, const char **argv) return 0; } #endif /* CONFIG_PRINTK */ - -/* Make sure we balance enable/disable calls, must disable first. */ -static atomic_t kdb_nmi_disabled; - -static int kdb_disable_nmi(int argc, const char *argv[]) -{ - if (atomic_read(&kdb_nmi_disabled)) - return 0; - atomic_set(&kdb_nmi_disabled, 1); - arch_kgdb_ops.enable_nmi(0); - return 0; -} - -static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp) -{ - if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0)) - return -EINVAL; - arch_kgdb_ops.enable_nmi(1); - return 0; -} - -static const struct kernel_param_ops kdb_param_ops_enable_nmi = { - .set = kdb_param_enable_nmi, -}; -module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600); - /* * kdb_cpu - This function implements the 'cpu' command. * cpu [<cpunum>] @@ -2836,20 +2777,10 @@ static kdbtab_t maintab[] = { }, }; -static kdbtab_t nmicmd = { - .name = "disable_nmi", - .func = kdb_disable_nmi, - .usage = "", - .help = "Disable NMI entry to KDB", - .flags = KDB_ENABLE_ALWAYS_SAFE, -}; - /* Initialize the kdb command table. */ static void __init kdb_inittab(void) { kdb_register_table(maintab, ARRAY_SIZE(maintab)); - if (arch_kgdb_ops.enable_nmi) - kdb_register_table(&nmicmd, 1); } /* Execute any commands defined in kdb_cmds. */ diff --git a/kernel/delayacct.c b/kernel/delayacct.c index eb63a021ac04..30e7912ebb0d 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -14,6 +14,15 @@ #include <linux/delayacct.h> #include <linux/module.h> +#define UPDATE_DELAY(type) \ +do { \ + d->type##_delay_max = tsk->delays->type##_delay_max; \ + d->type##_delay_min = tsk->delays->type##_delay_min; \ + tmp = d->type##_delay_total + tsk->delays->type##_delay; \ + d->type##_delay_total = (tmp < d->type##_delay_total) ? 0 : tmp; \ + d->type##_count += tsk->delays->type##_count; \ +} while (0) + DEFINE_STATIC_KEY_FALSE(delayacct_key); int delayacct_on __read_mostly; /* Delay accounting turned on/off */ struct kmem_cache *delayacct_cache; @@ -173,41 +182,13 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ raw_spin_lock_irqsave(&tsk->delays->lock, flags); - d->blkio_delay_max = tsk->delays->blkio_delay_max; - d->blkio_delay_min = tsk->delays->blkio_delay_min; - tmp = d->blkio_delay_total + tsk->delays->blkio_delay; - d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; - d->swapin_delay_max = tsk->delays->swapin_delay_max; - d->swapin_delay_min = tsk->delays->swapin_delay_min; - tmp = d->swapin_delay_total + tsk->delays->swapin_delay; - d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; - d->freepages_delay_max = tsk->delays->freepages_delay_max; - d->freepages_delay_min = tsk->delays->freepages_delay_min; - tmp = d->freepages_delay_total + tsk->delays->freepages_delay; - d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; - d->thrashing_delay_max = tsk->delays->thrashing_delay_max; - d->thrashing_delay_min = tsk->delays->thrashing_delay_min; - tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay; - d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; - d->compact_delay_max = tsk->delays->compact_delay_max; - d->compact_delay_min = tsk->delays->compact_delay_min; - tmp = d->compact_delay_total + tsk->delays->compact_delay; - d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp; - d->wpcopy_delay_max = tsk->delays->wpcopy_delay_max; - d->wpcopy_delay_min = tsk->delays->wpcopy_delay_min; - tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay; - d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 0 : tmp; - d->irq_delay_max = tsk->delays->irq_delay_max; - d->irq_delay_min = tsk->delays->irq_delay_min; - tmp = d->irq_delay_total + tsk->delays->irq_delay; - d->irq_delay_total = (tmp < d->irq_delay_total) ? 0 : tmp; - d->blkio_count += tsk->delays->blkio_count; - d->swapin_count += tsk->delays->swapin_count; - d->freepages_count += tsk->delays->freepages_count; - d->thrashing_count += tsk->delays->thrashing_count; - d->compact_count += tsk->delays->compact_count; - d->wpcopy_count += tsk->delays->wpcopy_count; - d->irq_count += tsk->delays->irq_count; + UPDATE_DELAY(blkio); + UPDATE_DELAY(swapin); + UPDATE_DELAY(freepages); + UPDATE_DELAY(thrashing); + UPDATE_DELAY(compact); + UPDATE_DELAY(wpcopy); + UPDATE_DELAY(irq); raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 3b2bdca9f1d4..77c8d9487a9a 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -336,16 +336,22 @@ static phys_addr_t dma_reserved_default_memory_size __initdata; static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev) { - if (!rmem->priv) { - struct dma_coherent_mem *mem; + struct dma_coherent_mem *mem = rmem->priv; + if (!mem) { mem = dma_init_coherent_memory(rmem->base, rmem->base, rmem->size, true); if (IS_ERR(mem)) return PTR_ERR(mem); rmem->priv = mem; } - dma_assign_coherent_memory(dev, rmem->priv); + + /* Warn if the device potentially can't use the reserved memory */ + if (mem->device_base + rmem->size - 1 > + min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit)) + dev_warn(dev, "reserved memory is beyond device's set DMA address range\n"); + + dma_assign_coherent_memory(dev, mem); return 0; } diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 055da410ac71..8df0dfaaca18 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -64,8 +64,7 @@ struct cma *dma_contiguous_default_area; * Users, who want to set the size of global CMA area for their system * should use cma= kernel parameter. */ -static const phys_addr_t size_bytes __initconst = - (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M; +#define size_bytes ((phys_addr_t)CMA_SIZE_MBYTES * SZ_1M) static phys_addr_t size_cmdline __initdata = -1; static phys_addr_t base_cmdline __initdata; static phys_addr_t limit_cmdline __initdata; diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 5b4e6d3bf7bc..24c359d9c879 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -13,6 +13,7 @@ #include <linux/vmalloc.h> #include <linux/set_memory.h> #include <linux/slab.h> +#include <linux/pci-p2pdma.h> #include "direct.h" /* @@ -462,34 +463,33 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { struct pci_p2pdma_map_state p2pdma_state = {}; - enum pci_p2pdma_map_type map; struct scatterlist *sg; int i, ret; for_each_sg(sgl, sg, nents, i) { - if (is_pci_p2pdma_page(sg_page(sg))) { - map = pci_p2pdma_map_segment(&p2pdma_state, dev, sg); - switch (map) { - case PCI_P2PDMA_MAP_BUS_ADDR: - continue; - case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: - /* - * Any P2P mapping that traverses the PCI - * host bridge must be mapped with CPU physical - * address and not PCI bus addresses. This is - * done with dma_direct_map_page() below. - */ - break; - default: - ret = -EREMOTEIO; + switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) { + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + /* + * Any P2P mapping that traverses the PCI host bridge + * must be mapped with CPU physical address and not PCI + * bus addresses. + */ + break; + case PCI_P2PDMA_MAP_NONE: + sg->dma_address = dma_direct_map_page(dev, sg_page(sg), + sg->offset, sg->length, dir, attrs); + if (sg->dma_address == DMA_MAPPING_ERROR) { + ret = -EIO; goto out_unmap; } - } - - sg->dma_address = dma_direct_map_page(dev, sg_page(sg), - sg->offset, sg->length, dir, attrs); - if (sg->dma_address == DMA_MAPPING_ERROR) { - ret = -EIO; + break; + case PCI_P2PDMA_MAP_BUS_ADDR: + sg->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state, + sg_phys(sg)); + sg_dma_mark_bus_address(sg); + continue; + default: + ret = -EREMOTEIO; goto out_unmap; } sg_dma_len(sg) = sg->length; @@ -584,6 +584,22 @@ int dma_direct_supported(struct device *dev, u64 mask) return mask >= phys_to_dma_unencrypted(dev, min_mask); } +static const struct bus_dma_region *dma_find_range(struct device *dev, + unsigned long start_pfn) +{ + const struct bus_dma_region *m; + + for (m = dev->dma_range_map; PFN_DOWN(m->size); m++) { + unsigned long cpu_start_pfn = PFN_DOWN(m->cpu_start); + + if (start_pfn >= cpu_start_pfn && + start_pfn - cpu_start_pfn < PFN_DOWN(m->size)) + return m; + } + + return NULL; +} + /* * To check whether all ram resource ranges are covered by dma range map * Returns 0 when further check is needed @@ -593,20 +609,12 @@ static int check_ram_in_range_map(unsigned long start_pfn, unsigned long nr_pages, void *data) { unsigned long end_pfn = start_pfn + nr_pages; - const struct bus_dma_region *bdr = NULL; - const struct bus_dma_region *m; struct device *dev = data; while (start_pfn < end_pfn) { - for (m = dev->dma_range_map; PFN_DOWN(m->size); m++) { - unsigned long cpu_start_pfn = PFN_DOWN(m->cpu_start); + const struct bus_dma_region *bdr; - if (start_pfn >= cpu_start_pfn && - start_pfn - cpu_start_pfn < PFN_DOWN(m->size)) { - bdr = m; - break; - } - } + bdr = dma_find_range(dev, start_pfn); if (!bdr) return 1; diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index cda127027e48..107e4a4d251d 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -443,6 +443,24 @@ bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr) } EXPORT_SYMBOL_GPL(__dma_need_sync); +/** + * dma_need_unmap - does this device need dma_unmap_* operations + * @dev: device to check + * + * If this function returns %false, drivers can skip calling dma_unmap_* after + * finishing an I/O. This function must be called after all mappings that might + * need to be unmapped have been performed. + */ +bool dma_need_unmap(struct device *dev) +{ + if (!dma_map_direct(dev, get_dma_ops(dev))) + return true; + if (!dev->dma_skip_sync) + return true; + return IS_ENABLED(CONFIG_DMA_API_DEBUG); +} +EXPORT_SYMBOL_GPL(dma_need_unmap); + static void dma_setup_need_sync(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -910,6 +928,19 @@ int dma_set_coherent_mask(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_set_coherent_mask); +static bool __dma_addressing_limited(struct device *dev) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) < + dma_get_required_mask(dev)) + return true; + + if (unlikely(ops) || use_dma_iommu(dev)) + return false; + return !dma_direct_all_ram_mapped(dev); +} + /** * dma_addressing_limited - return if the device is addressing limited * @dev: device to check @@ -920,15 +951,11 @@ EXPORT_SYMBOL(dma_set_coherent_mask); */ bool dma_addressing_limited(struct device *dev) { - const struct dma_map_ops *ops = get_dma_ops(dev); - - if (min_not_zero(dma_get_mask(dev), dev->bus_dma_limit) < - dma_get_required_mask(dev)) - return true; - - if (unlikely(ops) || use_dma_iommu(dev)) + if (!__dma_addressing_limited(dev)) return false; - return !dma_direct_all_ram_mapped(dev); + + dev_dbg(dev, "device is DMA addressing limited\n"); + return true; } EXPORT_SYMBOL_GPL(dma_addressing_limited); diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile index 095c775e001e..d4b8bd0af79b 100644 --- a/kernel/entry/Makefile +++ b/kernel/entry/Makefile @@ -6,6 +6,9 @@ KASAN_SANITIZE := n UBSAN_SANITIZE := n KCOV_INSTRUMENT := n +# Branch profiling isn't noinstr-safe +ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING + CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong CFLAGS_common.o += -fno-stack-protector diff --git a/kernel/entry/common.c b/kernel/entry/common.c index e33691d5adf7..a8dd1f27417c 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -49,7 +49,7 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall, /* Do seccomp after ptrace, to catch any tracer changes. */ if (work & SYSCALL_WORK_SECCOMP) { - ret = __secure_computing(NULL); + ret = __secure_computing(); if (ret == -1L) return ret; } @@ -146,7 +146,7 @@ static inline bool report_single_step(unsigned long work) return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP; } -static void syscall_exit_work(struct pt_regs *regs, unsigned long work) +void syscall_exit_work(struct pt_regs *regs, unsigned long work) { bool step; @@ -173,53 +173,6 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long work) ptrace_report_syscall_exit(regs, step); } -/* - * Syscall specific exit to user mode preparation. Runs with interrupts - * enabled. - */ -static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) -{ - unsigned long work = READ_ONCE(current_thread_info()->syscall_work); - unsigned long nr = syscall_get_nr(current, regs); - - CT_WARN_ON(ct_state() != CT_STATE_KERNEL); - - if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { - if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr)) - local_irq_enable(); - } - - rseq_syscall(regs); - - /* - * Do one-time syscall specific work. If these work items are - * enabled, we want to run them exactly once per syscall exit with - * interrupts enabled. - */ - if (unlikely(work & SYSCALL_WORK_EXIT)) - syscall_exit_work(regs, work); -} - -static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs) -{ - syscall_exit_to_user_mode_prepare(regs); - local_irq_disable_exit_to_user(); - exit_to_user_mode_prepare(regs); -} - -void syscall_exit_to_user_mode_work(struct pt_regs *regs) -{ - __syscall_exit_to_user_mode_work(regs); -} - -__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) -{ - instrumentation_begin(); - __syscall_exit_to_user_mode_work(regs); - instrumentation_end(); - exit_to_user_mode(); -} - noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) { enter_from_user_mode(regs); diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 8a47e52a454f..6c83ad674d01 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -22,6 +22,7 @@ struct callchain_cpus_entries { int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK; +static const int six_hundred_forty_kb = 640 * 1024; static inline size_t perf_callchain_entry__sizeof(void) { @@ -266,12 +267,8 @@ exit_put: return entry; } -/* - * Used for sysctl_perf_event_max_stack and - * sysctl_perf_event_max_contexts_per_stack. - */ -int perf_event_max_stack_handler(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int perf_event_max_stack_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *value = table->data; int new_value = *value, ret; @@ -292,3 +289,32 @@ int perf_event_max_stack_handler(const struct ctl_table *table, int write, return ret; } + +static const struct ctl_table callchain_sysctl_table[] = { + { + .procname = "perf_event_max_stack", + .data = &sysctl_perf_event_max_stack, + .maxlen = sizeof(sysctl_perf_event_max_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&six_hundred_forty_kb, + }, + { + .procname = "perf_event_max_contexts_per_stack", + .data = &sysctl_perf_event_max_contexts_per_stack, + .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_THOUSAND, + }, +}; + +static int __init init_callchain_sysctls(void) +{ + register_sysctl_init("kernel", callchain_sysctl_table); + return 0; +} +core_initcall(init_callchain_sysctls); + diff --git a/kernel/events/core.c b/kernel/events/core.c index bcb09e011e9e..7281230044d0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -55,6 +55,7 @@ #include <linux/pgtable.h> #include <linux/buildid.h> #include <linux/task_work.h> +#include <linux/percpu-rwsem.h> #include "internal.h" @@ -206,6 +207,19 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, __perf_ctx_unlock(&cpuctx->ctx); } +typedef struct { + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; +} class_perf_ctx_lock_t; + +static inline void class_perf_ctx_lock_destructor(class_perf_ctx_lock_t *_T) +{ perf_ctx_unlock(_T->cpuctx, _T->ctx); } + +static inline class_perf_ctx_lock_t +class_perf_ctx_lock_constructor(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ perf_ctx_lock(cpuctx, ctx); return (class_perf_ctx_lock_t){ cpuctx, ctx }; } + #define TASK_TOMBSTONE ((void *)-1L) static bool is_kernel_event(struct perf_event *event) @@ -452,8 +466,8 @@ static struct kmem_cache *perf_event_cache; */ int sysctl_perf_event_paranoid __read_mostly = 2; -/* Minimum for 512 kiB + 1 user control page */ -int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ +/* Minimum for 512 kiB + 1 user control page. 'free' kiB per user. */ +static int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* * max perf event sample rate @@ -463,6 +477,7 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' #define DEFAULT_CPU_TIME_MAX_PERCENT 25 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; +static int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; @@ -484,7 +499,7 @@ static void update_perf_cpu_limits(void) static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc); -int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, +static int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -506,9 +521,7 @@ int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, return 0; } -int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; - -int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, +static int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -528,6 +541,52 @@ int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, return 0; } +static const struct ctl_table events_core_sysctl_table[] = { + /* + * User-space relies on this file as a feature check for + * perf_events being enabled. It's an ABI, do not remove! + */ + { + .procname = "perf_event_paranoid", + .data = &sysctl_perf_event_paranoid, + .maxlen = sizeof(sysctl_perf_event_paranoid), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "perf_event_mlock_kb", + .data = &sysctl_perf_event_mlock, + .maxlen = sizeof(sysctl_perf_event_mlock), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "perf_event_max_sample_rate", + .data = &sysctl_perf_event_sample_rate, + .maxlen = sizeof(sysctl_perf_event_sample_rate), + .mode = 0644, + .proc_handler = perf_event_max_sample_rate_handler, + .extra1 = SYSCTL_ONE, + }, + { + .procname = "perf_cpu_time_max_percent", + .data = &sysctl_perf_cpu_time_max_percent, + .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), + .mode = 0644, + .proc_handler = perf_cpu_time_max_percent_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, +}; + +static int __init init_events_core_sysctls(void) +{ + register_sysctl_init("kernel", events_core_sysctl_table); + return 0; +} +core_initcall(init_events_core_sysctls); + + /* * perf samples are done in some very critical code paths (NMIs). * If they take too much CPU time, the system can lock up and not @@ -898,7 +957,13 @@ static void perf_cgroup_switch(struct task_struct *task) if (READ_ONCE(cpuctx->cgrp) == cgrp) return; - perf_ctx_lock(cpuctx, cpuctx->task_ctx); + guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); + /* + * Re-check, could've raced vs perf_remove_from_context(). + */ + if (READ_ONCE(cpuctx->cgrp) == NULL) + return; + perf_ctx_disable(&cpuctx->ctx, true); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); @@ -916,7 +981,6 @@ static void perf_cgroup_switch(struct task_struct *task) ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); perf_ctx_enable(&cpuctx->ctx, true); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static int perf_cgroup_ensure_storage(struct perf_event *event, @@ -1147,8 +1211,8 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu) cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); raw_spin_lock_init(&cpc->hrtimer_lock); - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - timer->function = perf_mux_hrtimer_handler; + hrtimer_setup(timer, perf_mux_hrtimer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED_HARD); } static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc) @@ -1172,42 +1236,40 @@ static int perf_mux_hrtimer_restart_ipi(void *arg) return perf_mux_hrtimer_restart(arg); } +static __always_inline struct perf_cpu_pmu_context *this_cpc(struct pmu *pmu) +{ + return *this_cpu_ptr(pmu->cpu_pmu_context); +} + void perf_pmu_disable(struct pmu *pmu) { - int *count = this_cpu_ptr(pmu->pmu_disable_count); + int *count = &this_cpc(pmu)->pmu_disable_count; if (!(*count)++) pmu->pmu_disable(pmu); } void perf_pmu_enable(struct pmu *pmu) { - int *count = this_cpu_ptr(pmu->pmu_disable_count); + int *count = &this_cpc(pmu)->pmu_disable_count; if (!--(*count)) pmu->pmu_enable(pmu); } static void perf_assert_pmu_disabled(struct pmu *pmu) { - WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0); + int *count = &this_cpc(pmu)->pmu_disable_count; + WARN_ON_ONCE(*count == 0); } -static void get_ctx(struct perf_event_context *ctx) +static inline void perf_pmu_read(struct perf_event *event) { - refcount_inc(&ctx->refcount); -} - -static void *alloc_task_ctx_data(struct pmu *pmu) -{ - if (pmu->task_ctx_cache) - return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); - - return NULL; + if (event->state == PERF_EVENT_STATE_ACTIVE) + event->pmu->read(event); } -static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) +static void get_ctx(struct perf_event_context *ctx) { - if (pmu->task_ctx_cache && task_ctx_data) - kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); + refcount_inc(&ctx->refcount); } static void free_ctx(struct rcu_head *head) @@ -1226,6 +1288,10 @@ static void put_ctx(struct perf_event_context *ctx) if (ctx->task && ctx->task != TASK_TOMBSTONE) put_task_struct(ctx->task); call_rcu(&ctx->rcu_head, free_ctx); + } else { + smp_mb__after_atomic(); /* pairs with wait_var_event() */ + if (ctx->task == TASK_TOMBSTONE) + wake_up_var(&ctx->refcount); } } @@ -2072,18 +2138,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader == event) del_event_from_groups(event, ctx); - /* - * If event was in error state, then keep it - * that way, otherwise bogus counts will be - * returned on read(). The only way to get out - * of error state is by explicit re-enabling - * of the event - */ - if (event->state > PERF_EVENT_STATE_OFF) { - perf_cgroup_event_disable(event, ctx); - perf_event_set_state(event, PERF_EVENT_STATE_OFF); - } - ctx->generation++; event->pmu_ctx->nr_events--; } @@ -2101,8 +2155,9 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) } static void put_event(struct perf_event *event); -static void event_sched_out(struct perf_event *event, - struct perf_event_context *ctx); +static void __event_disable(struct perf_event *event, + struct perf_event_context *ctx, + enum perf_event_state state); static void perf_put_aux_event(struct perf_event *event) { @@ -2123,7 +2178,7 @@ static void perf_put_aux_event(struct perf_event *event) * If the event is an aux_event, tear down all links to * it from other events. */ - for_each_sibling_event(iter, event->group_leader) { + for_each_sibling_event(iter, event) { if (iter->aux_event != event) continue; @@ -2135,8 +2190,7 @@ static void perf_put_aux_event(struct perf_event *event) * state so that we don't try to schedule it again. Note * that perf_event_enable() will clear the ERROR status. */ - event_sched_out(iter, ctx); - perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + __event_disable(iter, ctx, PERF_EVENT_STATE_ERROR); } } @@ -2194,18 +2248,6 @@ static inline struct list_head *get_event_list(struct perf_event *event) &event->pmu_ctx->flexible_active; } -/* - * Events that have PERF_EV_CAP_SIBLING require being part of a group and - * cannot exist on their own, schedule them out and move them into the ERROR - * state. Also see _perf_event_enable(), it will not be able to recover - * this ERROR state. - */ -static inline void perf_remove_sibling_event(struct perf_event *event) -{ - event_sched_out(event, event->ctx); - perf_event_set_state(event, PERF_EVENT_STATE_ERROR); -} - static void perf_group_detach(struct perf_event *event) { struct perf_event *leader = event->group_leader; @@ -2241,8 +2283,15 @@ static void perf_group_detach(struct perf_event *event) */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { + /* + * Events that have PERF_EV_CAP_SIBLING require being part of + * a group and cannot exist on their own, schedule them out + * and move them into the ERROR state. Also see + * _perf_event_enable(), it will not be able to recover this + * ERROR state. + */ if (sibling->event_caps & PERF_EV_CAP_SIBLING) - perf_remove_sibling_event(sibling); + __event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR); sibling->group_leader = sibling; list_del_init(&sibling->sibling_list); @@ -2281,7 +2330,11 @@ static void perf_child_detach(struct perf_event *event) if (WARN_ON_ONCE(!parent_event)) return; + /* + * Can't check this from an IPI, the holder is likey another CPU. + * lockdep_assert_held(&parent_event->child_mutex); + */ sync_child_event(event); list_del_init(&event->child_list); @@ -2299,11 +2352,16 @@ event_filter_match(struct perf_event *event) perf_cgroup_match(event); } +static inline bool is_event_in_freq_mode(struct perf_event *event) +{ + return event->attr.freq && event->attr.sample_freq; +} + static void event_sched_out(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; // XXX cpc serialization, probably per-cpu IRQ disabled @@ -2336,7 +2394,7 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx) if (!is_software_event(event)) cpc->active_oncpu--; - if (event->attr.freq && event->attr.sample_freq) { + if (is_event_in_freq_mode(event)) { ctx->nr_freq--; epc->nr_freq--; } @@ -2406,7 +2464,9 @@ ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event) #define DETACH_GROUP 0x01UL #define DETACH_CHILD 0x02UL -#define DETACH_DEAD 0x04UL +#define DETACH_EXIT 0x04UL +#define DETACH_REVOKE 0x08UL +#define DETACH_DEAD 0x10UL /* * Cross CPU call to remove a performance event @@ -2421,6 +2481,7 @@ __perf_remove_from_context(struct perf_event *event, void *info) { struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx; + enum perf_event_state state = PERF_EVENT_STATE_OFF; unsigned long flags = (unsigned long)info; ctx_time_update(cpuctx, ctx); @@ -2429,24 +2490,32 @@ __perf_remove_from_context(struct perf_event *event, * Ensure event_sched_out() switches to OFF, at the very least * this avoids raising perf_pending_task() at this time. */ + if (flags & DETACH_EXIT) + state = PERF_EVENT_STATE_EXIT; + if (flags & DETACH_REVOKE) + state = PERF_EVENT_STATE_REVOKED; if (flags & DETACH_DEAD) - event->pending_disable = 1; + state = PERF_EVENT_STATE_DEAD; + event_sched_out(event, ctx); + + if (event->state > PERF_EVENT_STATE_OFF) + perf_cgroup_event_disable(event, ctx); + + perf_event_set_state(event, min(event->state, state)); + if (flags & DETACH_GROUP) perf_group_detach(event); if (flags & DETACH_CHILD) perf_child_detach(event); list_del_event(event, ctx); - if (flags & DETACH_DEAD) - event->state = PERF_EVENT_STATE_DEAD; if (!pmu_ctx->nr_events) { pmu_ctx->rotate_necessary = 0; if (ctx->task && ctx->is_active) { - struct perf_cpu_pmu_context *cpc; + struct perf_cpu_pmu_context *cpc = this_cpc(pmu_ctx->pmu); - cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } @@ -2497,6 +2566,15 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla event_function_call(event, __perf_remove_from_context, (void *)flags); } +static void __event_disable(struct perf_event *event, + struct perf_event_context *ctx, + enum perf_event_state state) +{ + event_sched_out(event, ctx); + perf_cgroup_event_disable(event, ctx); + perf_event_set_state(event, state); +} + /* * Cross CPU call to disable a performance event */ @@ -2511,13 +2589,18 @@ static void __perf_event_disable(struct perf_event *event, perf_pmu_disable(event->pmu_ctx->pmu); ctx_time_update_event(ctx, event); + /* + * When disabling a group leader, the whole group becomes ineligible + * to run, so schedule out the full group. + */ if (event == event->group_leader) group_sched_out(event, ctx); - else - event_sched_out(event, ctx); - perf_event_set_state(event, PERF_EVENT_STATE_OFF); - perf_cgroup_event_disable(event, ctx); + /* + * But only mark the leader OFF; the siblings will remain + * INACTIVE. + */ + __event_disable(event, ctx, PERF_EVENT_STATE_OFF); perf_pmu_enable(event->pmu_ctx->pmu); } @@ -2580,11 +2663,46 @@ void perf_event_disable_inatomic(struct perf_event *event) static void perf_log_throttle(struct perf_event *event, int enable); static void perf_log_itrace_start(struct perf_event *event); +static void perf_event_unthrottle(struct perf_event *event, bool start) +{ + event->hw.interrupts = 0; + if (start) + event->pmu->start(event, 0); + if (event == event->group_leader) + perf_log_throttle(event, 1); +} + +static void perf_event_throttle(struct perf_event *event) +{ + event->hw.interrupts = MAX_INTERRUPTS; + event->pmu->stop(event, 0); + if (event == event->group_leader) + perf_log_throttle(event, 0); +} + +static void perf_event_unthrottle_group(struct perf_event *event, bool skip_start_event) +{ + struct perf_event *sibling, *leader = event->group_leader; + + perf_event_unthrottle(leader, skip_start_event ? leader != event : true); + for_each_sibling_event(sibling, leader) + perf_event_unthrottle(sibling, skip_start_event ? sibling != event : true); +} + +static void perf_event_throttle_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + + perf_event_throttle(leader); + for_each_sibling_event(sibling, leader) + perf_event_throttle(sibling); +} + static int event_sched_in(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); int ret = 0; WARN_ON_ONCE(event->ctx != ctx); @@ -2608,10 +2726,8 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx) * ticks already, also for a heavily scheduling task there is little * guarantee it'll get a tick in a timely manner. */ - if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { - perf_log_throttle(event, 1); - event->hw.interrupts = 0; - } + if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) + perf_event_unthrottle(event, false); perf_pmu_disable(event->pmu); @@ -2626,7 +2742,7 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx) if (!is_software_event(event)) cpc->active_oncpu++; - if (event->attr.freq && event->attr.sample_freq) { + if (is_event_in_freq_mode(event)) { ctx->nr_freq++; epc->nr_freq++; } @@ -2691,7 +2807,7 @@ error: static int group_can_go_on(struct perf_event *event, int can_add_hw) { struct perf_event_pmu_context *epc = event->pmu_ctx; - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); /* * Groups consisting entirely of software events can always go on. @@ -3314,9 +3430,8 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, struct pmu *pmu = pmu_ctx->pmu; if (ctx->task && !(ctx->is_active & EVENT_ALL)) { - struct perf_cpu_pmu_context *cpc; + struct perf_cpu_pmu_context *cpc = this_cpc(pmu); - cpc = this_cpu_ptr(pmu->cpu_pmu_context); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } @@ -3473,8 +3588,7 @@ static void __perf_event_sync_stat(struct perf_event *event, * we know the event must be on the current CPU, therefore we * don't need to use it. */ - if (event->state == PERF_EVENT_STATE_ACTIVE) - event->pmu->read(event); + perf_pmu_read(event); perf_event_update_time(event); @@ -3522,52 +3636,17 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, } } -#define double_list_for_each_entry(pos1, pos2, head1, head2, member) \ - for (pos1 = list_first_entry(head1, typeof(*pos1), member), \ - pos2 = list_first_entry(head2, typeof(*pos2), member); \ - !list_entry_is_head(pos1, head1, member) && \ - !list_entry_is_head(pos2, head2, member); \ - pos1 = list_next_entry(pos1, member), \ - pos2 = list_next_entry(pos2, member)) - -static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx, - struct perf_event_context *next_ctx) -{ - struct perf_event_pmu_context *prev_epc, *next_epc; - - if (!prev_ctx->nr_task_data) - return; - - double_list_for_each_entry(prev_epc, next_epc, - &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list, - pmu_ctx_entry) { - - if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu)) - continue; - - /* - * PMU specific parts of task perf context can require - * additional synchronization. As an example of such - * synchronization see implementation details of Intel - * LBR call stack data profiling; - */ - if (prev_epc->pmu->swap_task_ctx) - prev_epc->pmu->swap_task_ctx(prev_epc, next_epc); - else - swap(prev_epc->task_ctx_data, next_epc->task_ctx_data); - } -} - -static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in) +static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, + struct task_struct *task, bool sched_in) { struct perf_event_pmu_context *pmu_ctx; struct perf_cpu_pmu_context *cpc; list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + cpc = this_cpc(pmu_ctx->pmu); if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task) - pmu_ctx->pmu->sched_task(pmu_ctx, sched_in); + pmu_ctx->pmu->sched_task(pmu_ctx, task, sched_in); } } @@ -3630,17 +3709,16 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); - perf_ctx_sched_task_cb(ctx, false); - perf_event_swap_task_ctx_data(ctx, next_ctx); + perf_ctx_sched_task_cb(ctx, task, false); perf_ctx_enable(ctx, false); /* * RCU_INIT_POINTER here is safe because we've not * modified the ctx and the above modification of - * ctx->task and ctx->task_ctx_data are immaterial - * since those values are always verified under - * ctx->lock which we're now holding. + * ctx->task is immaterial since this value is + * always verified under ctx->lock which we're now + * holding. */ RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx); RCU_INIT_POINTER(next->perf_event_ctxp, ctx); @@ -3660,7 +3738,7 @@ unlock: perf_ctx_disable(ctx, false); inside_switch: - perf_ctx_sched_task_cb(ctx, false); + perf_ctx_sched_task_cb(ctx, task, false); task_ctx_sched_out(ctx, NULL, EVENT_ALL); perf_ctx_enable(ctx, false); @@ -3673,7 +3751,7 @@ static DEFINE_PER_CPU(int, perf_sched_cb_usages); void perf_sched_cb_dec(struct pmu *pmu) { - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(pmu); this_cpu_dec(perf_sched_cb_usages); barrier(); @@ -3685,7 +3763,7 @@ void perf_sched_cb_dec(struct pmu *pmu) void perf_sched_cb_inc(struct pmu *pmu) { - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(pmu); if (!cpc->sched_cb_usage++) list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); @@ -3702,7 +3780,8 @@ void perf_sched_cb_inc(struct pmu *pmu) * PEBS requires this to provide PID/TID information. This requires we flush * all queued PEBS records before we context switch to a new task. */ -static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in) +static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, + struct task_struct *task, bool sched_in) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu; @@ -3716,7 +3795,7 @@ static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_i perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); - pmu->sched_task(cpc->task_epc, sched_in); + pmu->sched_task(cpc->task_epc, task, sched_in); perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@ -3734,7 +3813,7 @@ static void perf_pmu_sched_task(struct task_struct *prev, return; list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry) - __perf_pmu_sched_task(cpc, sched_in); + __perf_pmu_sched_task(cpc, sched_in ? next : prev, sched_in); } static void perf_event_switch(struct task_struct *task, @@ -3802,7 +3881,7 @@ static void __link_epc(struct perf_event_pmu_context *pmu_ctx) if (!pmu_ctx->ctx->task) return; - cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + cpc = this_cpc(pmu_ctx->pmu); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = pmu_ctx; } @@ -3930,11 +4009,15 @@ static int merge_sched_in(struct perf_event *event, void *data) if (event->attr.pinned) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + + if (*perf_event_fasync(event)) + event->pending_kill = POLL_ERR; + + perf_event_wakeup(event); } else { - struct perf_cpu_pmu_context *cpc; + struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu); event->pmu_ctx->rotate_necessary = 1; - cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context); perf_mux_hrtimer_restart(cpc); group_update_userpage(event); } @@ -4029,7 +4112,7 @@ static void perf_event_context_sched_in(struct task_struct *task) perf_ctx_lock(cpuctx, ctx); perf_ctx_disable(ctx, false); - perf_ctx_sched_task_cb(ctx, true); + perf_ctx_sched_task_cb(ctx, task, true); perf_ctx_enable(ctx, false); perf_ctx_unlock(cpuctx, ctx); @@ -4060,7 +4143,7 @@ static void perf_event_context_sched_in(struct task_struct *task) perf_event_sched_in(cpuctx, ctx, NULL); - perf_ctx_sched_task_cb(cpuctx->task_ctx, true); + perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true); if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) perf_ctx_enable(&cpuctx->ctx, false); @@ -4222,14 +4305,10 @@ static void perf_adjust_freq_unthr_events(struct list_head *event_list) hwc = &event->hw; - if (hwc->interrupts == MAX_INTERRUPTS) { - hwc->interrupts = 0; - perf_log_throttle(event, 1); - if (!event->attr.freq || !event->attr.sample_freq) - event->pmu->start(event, 0); - } + if (hwc->interrupts == MAX_INTERRUPTS) + perf_event_unthrottle_group(event, is_event_in_freq_mode(event)); - if (!event->attr.freq || !event->attr.sample_freq) + if (!is_event_in_freq_mode(event)) continue; /* @@ -4501,7 +4580,8 @@ out: static void perf_remove_from_owner(struct perf_event *event); static void perf_event_exit_event(struct perf_event *event, - struct perf_event_context *ctx); + struct perf_event_context *ctx, + bool revoke); /* * Removes all events from the current task that have been marked @@ -4528,7 +4608,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx) modified = true; - perf_event_exit_event(event, ctx); + perf_event_exit_event(event, ctx, false); } raw_spin_lock_irqsave(&ctx->lock, flags); @@ -4618,15 +4698,8 @@ static void __perf_event_read(void *info) pmu->read(event); - for_each_sibling_event(sub, event) { - if (sub->state == PERF_EVENT_STATE_ACTIVE) { - /* - * Use sibling's PMU rather than @event's since - * sibling could be on different (eg: software) PMU. - */ - sub->pmu->read(sub); - } - } + for_each_sibling_event(sub, event) + perf_pmu_read(sub); data->ret = pmu->commit_txn(pmu); @@ -4883,7 +4956,7 @@ find_get_context(struct task_struct *task, struct perf_event *event) if (!task) { /* Must be root to operate on a CPU event: */ - err = perf_allow_cpu(&event->attr); + err = perf_allow_cpu(); if (err) return ERR_PTR(err); @@ -4950,8 +5023,7 @@ static struct perf_event_pmu_context * find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, struct perf_event *event) { - struct perf_event_pmu_context *new = NULL, *epc; - void *task_ctx_data = NULL; + struct perf_event_pmu_context *new = NULL, *pos = NULL, *epc; if (!ctx->task) { /* @@ -4961,11 +5033,14 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, */ struct perf_cpu_pmu_context *cpc; - cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); + cpc = *per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); epc = &cpc->epc; raw_spin_lock_irq(&ctx->lock); if (!epc->ctx) { - atomic_set(&epc->refcount, 1); + /* + * One extra reference for the pmu; see perf_pmu_free(). + */ + atomic_set(&epc->refcount, 2); epc->embedded = 1; list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); epc->ctx = ctx; @@ -4981,14 +5056,6 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, if (!new) return ERR_PTR(-ENOMEM); - if (event->attach_state & PERF_ATTACH_TASK_DATA) { - task_ctx_data = alloc_task_ctx_data(pmu); - if (!task_ctx_data) { - kfree(new); - return ERR_PTR(-ENOMEM); - } - } - __perf_init_event_pmu_context(new, pmu); /* @@ -5007,23 +5074,23 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, atomic_inc(&epc->refcount); goto found_epc; } + /* Make sure the pmu_ctx_list is sorted by PMU type: */ + if (!pos && epc->pmu->type > pmu->type) + pos = epc; } epc = new; new = NULL; - list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); + if (!pos) + list_add_tail(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); + else + list_add(&epc->pmu_ctx_entry, pos->pmu_ctx_entry.prev); + epc->ctx = ctx; found_epc: - if (task_ctx_data && !epc->task_ctx_data) { - epc->task_ctx_data = task_ctx_data; - task_ctx_data = NULL; - ctx->nr_task_data++; - } raw_spin_unlock_irq(&ctx->lock); - - free_task_ctx_data(pmu, task_ctx_data); kfree(new); return epc; @@ -5034,11 +5101,18 @@ static void get_pmu_ctx(struct perf_event_pmu_context *epc) WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount)); } +static void free_cpc_rcu(struct rcu_head *head) +{ + struct perf_cpu_pmu_context *cpc = + container_of(head, typeof(*cpc), epc.rcu_head); + + kfree(cpc); +} + static void free_epc_rcu(struct rcu_head *head) { struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head); - kfree(epc->task_ctx_data); kfree(epc); } @@ -5068,8 +5142,10 @@ static void put_pmu_ctx(struct perf_event_pmu_context *epc) raw_spin_unlock_irqrestore(&ctx->lock, flags); - if (epc->embedded) + if (epc->embedded) { + call_rcu(&epc->rcu_head, free_cpc_rcu); return; + } call_rcu(&epc->rcu_head, free_epc_rcu); } @@ -5114,6 +5190,7 @@ static bool is_sb_event(struct perf_event *event) attr->context_switch || attr->text_poke || attr->bpf_event) return true; + return false; } @@ -5145,6 +5222,225 @@ static void unaccount_freq_event(void) atomic_dec(&nr_freq_events); } + +static struct perf_ctx_data * +alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global) +{ + struct perf_ctx_data *cd; + + cd = kzalloc(sizeof(*cd), GFP_KERNEL); + if (!cd) + return NULL; + + cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL); + if (!cd->data) { + kfree(cd); + return NULL; + } + + cd->global = global; + cd->ctx_cache = ctx_cache; + refcount_set(&cd->refcount, 1); + + return cd; +} + +static void free_perf_ctx_data(struct perf_ctx_data *cd) +{ + kmem_cache_free(cd->ctx_cache, cd->data); + kfree(cd); +} + +static void __free_perf_ctx_data_rcu(struct rcu_head *rcu_head) +{ + struct perf_ctx_data *cd; + + cd = container_of(rcu_head, struct perf_ctx_data, rcu_head); + free_perf_ctx_data(cd); +} + +static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd) +{ + call_rcu(&cd->rcu_head, __free_perf_ctx_data_rcu); +} + +static int +attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache, + bool global) +{ + struct perf_ctx_data *cd, *old = NULL; + + cd = alloc_perf_ctx_data(ctx_cache, global); + if (!cd) + return -ENOMEM; + + for (;;) { + if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) { + if (old) + perf_free_ctx_data_rcu(old); + return 0; + } + + if (!old) { + /* + * After seeing a dead @old, we raced with + * removal and lost, try again to install @cd. + */ + continue; + } + + if (refcount_inc_not_zero(&old->refcount)) { + free_perf_ctx_data(cd); /* unused */ + return 0; + } + + /* + * @old is a dead object, refcount==0 is stable, try and + * replace it with @cd. + */ + } + return 0; +} + +static void __detach_global_ctx_data(void); +DEFINE_STATIC_PERCPU_RWSEM(global_ctx_data_rwsem); +static refcount_t global_ctx_data_ref; + +static int +attach_global_ctx_data(struct kmem_cache *ctx_cache) +{ + struct task_struct *g, *p; + struct perf_ctx_data *cd; + int ret; + + if (refcount_inc_not_zero(&global_ctx_data_ref)) + return 0; + + guard(percpu_write)(&global_ctx_data_rwsem); + if (refcount_inc_not_zero(&global_ctx_data_ref)) + return 0; +again: + /* Allocate everything */ + scoped_guard (rcu) { + for_each_process_thread(g, p) { + cd = rcu_dereference(p->perf_ctx_data); + if (cd && !cd->global) { + cd->global = 1; + if (!refcount_inc_not_zero(&cd->refcount)) + cd = NULL; + } + if (!cd) { + get_task_struct(p); + goto alloc; + } + } + } + + refcount_set(&global_ctx_data_ref, 1); + + return 0; +alloc: + ret = attach_task_ctx_data(p, ctx_cache, true); + put_task_struct(p); + if (ret) { + __detach_global_ctx_data(); + return ret; + } + goto again; +} + +static int +attach_perf_ctx_data(struct perf_event *event) +{ + struct task_struct *task = event->hw.target; + struct kmem_cache *ctx_cache = event->pmu->task_ctx_cache; + int ret; + + if (!ctx_cache) + return -ENOMEM; + + if (task) + return attach_task_ctx_data(task, ctx_cache, false); + + ret = attach_global_ctx_data(ctx_cache); + if (ret) + return ret; + + event->attach_state |= PERF_ATTACH_GLOBAL_DATA; + return 0; +} + +static void +detach_task_ctx_data(struct task_struct *p) +{ + struct perf_ctx_data *cd; + + scoped_guard (rcu) { + cd = rcu_dereference(p->perf_ctx_data); + if (!cd || !refcount_dec_and_test(&cd->refcount)) + return; + } + + /* + * The old ctx_data may be lost because of the race. + * Nothing is required to do for the case. + * See attach_task_ctx_data(). + */ + if (try_cmpxchg((struct perf_ctx_data **)&p->perf_ctx_data, &cd, NULL)) + perf_free_ctx_data_rcu(cd); +} + +static void __detach_global_ctx_data(void) +{ + struct task_struct *g, *p; + struct perf_ctx_data *cd; + +again: + scoped_guard (rcu) { + for_each_process_thread(g, p) { + cd = rcu_dereference(p->perf_ctx_data); + if (!cd || !cd->global) + continue; + cd->global = 0; + get_task_struct(p); + goto detach; + } + } + return; +detach: + detach_task_ctx_data(p); + put_task_struct(p); + goto again; +} + +static void detach_global_ctx_data(void) +{ + if (refcount_dec_not_one(&global_ctx_data_ref)) + return; + + guard(percpu_write)(&global_ctx_data_rwsem); + if (!refcount_dec_and_test(&global_ctx_data_ref)) + return; + + /* remove everything */ + __detach_global_ctx_data(); +} + +static void detach_perf_ctx_data(struct perf_event *event) +{ + struct task_struct *task = event->hw.target; + + event->attach_state &= ~PERF_ATTACH_TASK_DATA; + + if (task) + return detach_task_ctx_data(task); + + if (event->attach_state & PERF_ATTACH_GLOBAL_DATA) { + detach_global_ctx_data(); + event->attach_state &= ~PERF_ATTACH_GLOBAL_DATA; + } +} + static void unaccount_event(struct perf_event *event) { bool dec = false; @@ -5239,6 +5535,8 @@ static int exclusive_event_init(struct perf_event *event) return -EBUSY; } + event->attach_state |= PERF_ATTACH_EXCLUSIVE; + return 0; } @@ -5246,14 +5544,13 @@ static void exclusive_event_destroy(struct perf_event *event) { struct pmu *pmu = event->pmu; - if (!is_exclusive_pmu(pmu)) - return; - /* see comment in exclusive_event_init() */ if (event->attach_state & PERF_ATTACH_TASK) atomic_dec(&pmu->exclusive_cnt); else atomic_inc(&pmu->exclusive_cnt); + + event->attach_state &= ~PERF_ATTACH_EXCLUSIVE; } static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) @@ -5285,38 +5582,73 @@ static bool exclusive_event_installable(struct perf_event *event, return true; } -static void perf_addr_filters_splice(struct perf_event *event, - struct list_head *head); +static void perf_free_addr_filters(struct perf_event *event); -static void perf_pending_task_sync(struct perf_event *event) +/* vs perf_event_alloc() error */ +static void __free_event(struct perf_event *event) { - struct callback_head *head = &event->pending_task; + struct pmu *pmu = event->pmu; + + if (event->attach_state & PERF_ATTACH_CALLCHAIN) + put_callchain_buffers(); + + kfree(event->addr_filter_ranges); + + if (event->attach_state & PERF_ATTACH_EXCLUSIVE) + exclusive_event_destroy(event); + + if (is_cgroup_event(event)) + perf_detach_cgroup(event); + + if (event->attach_state & PERF_ATTACH_TASK_DATA) + detach_perf_ctx_data(event); + + if (event->destroy) + event->destroy(event); - if (!event->pending_work) - return; /* - * If the task is queued to the current task's queue, we - * obviously can't wait for it to complete. Simply cancel it. + * Must be after ->destroy(), due to uprobe_perf_close() using + * hw.target. */ - if (task_work_cancel(current, head)) { - event->pending_work = 0; - local_dec(&event->ctx->nr_no_switch_fast); - return; + if (event->hw.target) + put_task_struct(event->hw.target); + + if (event->pmu_ctx) { + /* + * put_pmu_ctx() needs an event->ctx reference, because of + * epc->ctx. + */ + WARN_ON_ONCE(!pmu); + WARN_ON_ONCE(!event->ctx); + WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx); + put_pmu_ctx(event->pmu_ctx); } /* - * All accesses related to the event are within the same RCU section in - * perf_pending_task(). The RCU grace period before the event is freed - * will make sure all those accesses are complete by then. + * perf_event_free_task() relies on put_ctx() being 'last', in + * particular all task references must be cleaned up. */ - rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE); + if (event->ctx) + put_ctx(event->ctx); + + if (pmu) { + module_put(pmu->module); + scoped_guard (spinlock, &pmu->events_lock) { + list_del(&event->pmu_list); + wake_up_var(pmu); + } + } + + call_rcu(&event->rcu_head, free_event_rcu); } +DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T)) + +/* vs perf_event_alloc() success */ static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending_irq); irq_work_sync(&event->pending_disable_irq); - perf_pending_task_sync(event); unaccount_event(event); @@ -5334,53 +5666,21 @@ static void _free_event(struct perf_event *event) mutex_unlock(&event->mmap_mutex); } - if (is_cgroup_event(event)) - perf_detach_cgroup(event); - - if (!event->parent) { - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - } - perf_event_free_bpf_prog(event); - perf_addr_filters_splice(event, NULL); - kfree(event->addr_filter_ranges); - - if (event->destroy) - event->destroy(event); - - /* - * Must be after ->destroy(), due to uprobe_perf_close() using - * hw.target. - */ - if (event->hw.target) - put_task_struct(event->hw.target); - - if (event->pmu_ctx) - put_pmu_ctx(event->pmu_ctx); + perf_free_addr_filters(event); - /* - * perf_event_free_task() relies on put_ctx() being 'last', in particular - * all task references must be cleaned up. - */ - if (event->ctx) - put_ctx(event->ctx); - - exclusive_event_destroy(event); - module_put(event->pmu->module); - - call_rcu(&event->rcu_head, free_event_rcu); + __free_event(event); } /* * Used to free events which have a known refcount of 1, such as in error paths - * where the event isn't exposed yet and inherited events. + * of inherited events. */ static void free_event(struct perf_event *event) { if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, - "unexpected event refcount: %ld; ptr=%p\n", - atomic_long_read(&event->refcount), event)) { + "unexpected event refcount: %ld; ptr=%p\n", + atomic_long_read(&event->refcount), event)) { /* leak to avoid use-after-free */ return; } @@ -5441,10 +5741,17 @@ static void perf_remove_from_owner(struct perf_event *event) static void put_event(struct perf_event *event) { + struct perf_event *parent; + if (!atomic_long_dec_and_test(&event->refcount)) return; + parent = event->parent; _free_event(event); + + /* Matches the refcount bump in inherit_event() */ + if (parent) + put_event(parent); } /* @@ -5456,7 +5763,6 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_event *child, *tmp; - LIST_HEAD(free_list); /* * If we got here through err_alloc: free_event(event); we will not @@ -5485,15 +5791,17 @@ int perf_event_release_kernel(struct perf_event *event) * Thus this guarantees that we will in fact observe and kill _ALL_ * child events. */ - perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); + if (event->state > PERF_EVENT_STATE_REVOKED) { + perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); + } else { + event->state = PERF_EVENT_STATE_DEAD; + } perf_event_ctx_unlock(event, ctx); again: mutex_lock(&event->child_mutex); list_for_each_entry(child, &event->child_list, child_list) { - void *var = NULL; - /* * Cannot change, child events are not migrated, see the * comment with perf_event_ctx_lock_nested(). @@ -5526,50 +5834,30 @@ again: tmp = list_first_entry_or_null(&event->child_list, struct perf_event, child_list); if (tmp == child) { - perf_remove_from_context(child, DETACH_GROUP); - list_move(&child->child_list, &free_list); - /* - * This matches the refcount bump in inherit_event(); - * this can't be the last reference. - */ - put_event(event); + perf_remove_from_context(child, DETACH_GROUP | DETACH_CHILD); } else { - var = &ctx->refcount; + child = NULL; } mutex_unlock(&event->child_mutex); mutex_unlock(&ctx->mutex); - put_ctx(ctx); - if (var) { - /* - * If perf_event_free_task() has deleted all events from the - * ctx while the child_mutex got released above, make sure to - * notify about the preceding put_ctx(). - */ - smp_mb(); /* pairs with wait_var_event() */ - wake_up_var(var); + if (child) { + /* Last reference unless ->pending_task work is pending */ + put_event(child); } + put_ctx(ctx); + goto again; } mutex_unlock(&event->child_mutex); - list_for_each_entry_safe(child, tmp, &free_list, child_list) { - void *var = &child->ctx->refcount; - - list_del(&child->child_list); - free_event(child); - - /* - * Wake any perf_event_free_task() waiting for this event to be - * freed. - */ - smp_mb(); /* pairs with wait_var_event() */ - wake_up_var(var); - } - no_ctx: - put_event(event); /* Must be the 'last' reference */ + /* + * Last reference unless ->pending_task work is pending on this event + * or any of its children. + */ + put_event(event); return 0; } EXPORT_SYMBOL_GPL(perf_event_release_kernel); @@ -5835,11 +6123,21 @@ static __poll_t perf_poll(struct file *file, poll_table *wait) struct perf_buffer *rb; __poll_t events = EPOLLHUP; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return EPOLLERR; + poll_wait(file, &event->waitq, wait); + if (event->state <= PERF_EVENT_STATE_REVOKED) + return EPOLLERR; + if (is_event_hup(event)) return events; + if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR && + event->attr.pinned)) + return EPOLLERR; + /* * Pin the event->rb by taking event->mmap_mutex; otherwise * perf_event_set_output() can swizzle our rb and make us miss wakeups. @@ -5930,14 +6228,6 @@ static void __perf_event_period(struct perf_event *event, active = (event->state == PERF_EVENT_STATE_ACTIVE); if (active) { perf_pmu_disable(event->pmu); - /* - * We could be throttled; unthrottle now to avoid the tick - * trying to unthrottle while we already re-started the event. - */ - if (event->hw.interrupts == MAX_INTERRUPTS) { - event->hw.interrupts = 0; - perf_log_throttle(event, 1); - } event->pmu->stop(event, PERF_EF_UPDATE); } @@ -5945,6 +6235,14 @@ static void __perf_event_period(struct perf_event *event, if (active) { event->pmu->start(event, PERF_EF_RELOAD); + /* + * Once the period is force-reset, the event starts immediately. + * But the event/group could be throttled. Unthrottle the + * event/group now to avoid the next tick trying to unthrottle + * while we already re-started the event/group. + */ + if (event->hw.interrupts == MAX_INTERRUPTS) + perf_event_unthrottle_group(event, true); perf_pmu_enable(event->pmu); } } @@ -5962,14 +6260,15 @@ static int _perf_event_period(struct perf_event *event, u64 value) if (!value) return -EINVAL; - if (event->attr.freq && value > sysctl_perf_event_sample_rate) - return -EINVAL; - - if (perf_event_check_period(event, value)) - return -EINVAL; - - if (!event->attr.freq && (value & (1ULL << 63))) - return -EINVAL; + if (event->attr.freq) { + if (value > sysctl_perf_event_sample_rate) + return -EINVAL; + } else { + if (perf_event_check_period(event, value)) + return -EINVAL; + if (value & (1ULL << 63)) + return -EINVAL; + } event_function_call(event, __perf_event_period, &value); @@ -6001,12 +6300,18 @@ static int perf_event_set_output(struct perf_event *event, static int perf_event_set_filter(struct perf_event *event, void __user *arg); static int perf_copy_attr(struct perf_event_attr __user *uattr, struct perf_event_attr *attr); +static int __perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { void (*func)(struct perf_event *); u32 flags = arg; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + switch (cmd) { case PERF_EVENT_IOC_ENABLE: func = _perf_event_enable; @@ -6063,7 +6368,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon if (IS_ERR(prog)) return PTR_ERR(prog); - err = perf_event_set_bpf_prog(event, prog, 0); + err = __perf_event_set_bpf_prog(event, prog, 0); if (err) { bpf_prog_put(prog); return err; @@ -6382,9 +6687,22 @@ void ring_buffer_put(struct perf_buffer *rb) call_rcu(&rb->rcu_head, rb_free_rcu); } +typedef void (*mapped_f)(struct perf_event *event, struct mm_struct *mm); + +#define get_mapped(event, func) \ +({ struct pmu *pmu; \ + mapped_f f = NULL; \ + guard(rcu)(); \ + pmu = READ_ONCE(event->pmu); \ + if (pmu) \ + f = pmu->func; \ + f; \ +}) + static void perf_mmap_open(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; + mapped_f mapped = get_mapped(event, event_mapped); atomic_inc(&event->mmap_count); atomic_inc(&event->rb->mmap_count); @@ -6392,8 +6710,8 @@ static void perf_mmap_open(struct vm_area_struct *vma) if (vma->vm_pgoff) atomic_inc(&event->rb->aux_mmap_count); - if (event->pmu->event_mapped) - event->pmu->event_mapped(event, vma->vm_mm); + if (mapped) + mapped(event, vma->vm_mm); } static void perf_pmu_output_stop(struct perf_event *event); @@ -6409,14 +6727,16 @@ static void perf_pmu_output_stop(struct perf_event *event); static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; + mapped_f unmapped = get_mapped(event, event_unmapped); struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); bool detach_rest = false; - if (event->pmu->event_unmapped) - event->pmu->event_unmapped(event, vma->vm_mm); + /* FIXIES vs perf_pmu_unregister() */ + if (unmapped) + unmapped(event, vma->vm_mm); /* * The AUX buffer is strictly a sub-buffer, serialize using aux_mutex @@ -6608,7 +6928,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long vma_size; unsigned long nr_pages; long user_extra = 0, extra = 0; - int ret = 0, flags = 0; + int ret, flags = 0; + mapped_f mapped; /* * Don't allow mmap() of inherited per-task counters. This would @@ -6626,9 +6947,64 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) return ret; vma_size = vma->vm_end - vma->vm_start; + nr_pages = vma_size / PAGE_SIZE; + + if (nr_pages > INT_MAX) + return -ENOMEM; + + if (vma_size != PAGE_SIZE * nr_pages) + return -EINVAL; + + user_extra = nr_pages; + + mutex_lock(&event->mmap_mutex); + ret = -EINVAL; + + /* + * This relies on __pmu_detach_event() taking mmap_mutex after marking + * the event REVOKED. Either we observe the state, or __pmu_detach_event() + * will detach the rb created here. + */ + if (event->state <= PERF_EVENT_STATE_REVOKED) { + ret = -ENODEV; + goto unlock; + } if (vma->vm_pgoff == 0) { - nr_pages = (vma_size / PAGE_SIZE) - 1; + nr_pages -= 1; + + /* + * If we have rb pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. + */ + if (nr_pages != 0 && !is_power_of_2(nr_pages)) + goto unlock; + + WARN_ON_ONCE(event->ctx->parent_ctx); + + if (event->rb) { + if (data_page_nr(event->rb) != nr_pages) + goto unlock; + + if (atomic_inc_not_zero(&event->rb->mmap_count)) { + /* + * Success -- managed to mmap() the same buffer + * multiple times. + */ + ret = 0; + /* We need the rb to map pages. */ + rb = event->rb; + goto unlock; + } + + /* + * Raced against perf_mmap_close()'s + * atomic_dec_and_mutex_lock() remove the + * event and continue as if !event->rb + */ + ring_buffer_attach(event, NULL); + } + } else { /* * AUX area mapping: if rb->aux_nr_pages != 0, it's already @@ -6637,16 +7013,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) */ u64 aux_offset, aux_size; - if (!event->rb) - return -EINVAL; - - nr_pages = vma_size / PAGE_SIZE; - if (nr_pages > INT_MAX) - return -ENOMEM; - - mutex_lock(&event->mmap_mutex); - ret = -EINVAL; - rb = event->rb; if (!rb) goto aux_unlock; @@ -6687,48 +7053,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) } atomic_set(&rb->aux_mmap_count, 1); - user_extra = nr_pages; - - goto accounting; } - /* - * If we have rb pages ensure they're a power-of-two number, so we - * can do bitmasks instead of modulo. - */ - if (nr_pages != 0 && !is_power_of_2(nr_pages)) - return -EINVAL; - - if (vma_size != PAGE_SIZE * (1 + nr_pages)) - return -EINVAL; - - WARN_ON_ONCE(event->ctx->parent_ctx); -again: - mutex_lock(&event->mmap_mutex); - if (event->rb) { - if (data_page_nr(event->rb) != nr_pages) { - ret = -EINVAL; - goto unlock; - } - - if (!atomic_inc_not_zero(&event->rb->mmap_count)) { - /* - * Raced against perf_mmap_close(); remove the - * event and try again. - */ - ring_buffer_attach(event, NULL); - mutex_unlock(&event->mmap_mutex); - goto again; - } - - /* We need the rb to map pages. */ - rb = event->rb; - goto unlock; - } - - user_extra = nr_pages + 1; - -accounting: user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); /* @@ -6796,6 +7122,8 @@ accounting: rb->aux_mmap_locked = extra; } + ret = 0; + unlock: if (!ret) { atomic_long_add(user_extra, &user->locked_vm); @@ -6820,8 +7148,9 @@ aux_unlock: if (!ret) ret = map_range(rb, vma); - if (event->pmu->event_mapped) - event->pmu->event_mapped(event, vma->vm_mm); + mapped = get_mapped(event, event_mapped); + if (mapped) + mapped(event, vma->vm_mm); return ret; } @@ -6832,6 +7161,9 @@ static int perf_fasync(int fd, struct file *filp, int on) struct perf_event *event = filp->private_data; int retval; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + inode_lock(inode); retval = fasync_helper(fd, filp, on, &event->fasync); inode_unlock(inode); @@ -6919,15 +7251,15 @@ static void __perf_pending_disable(struct perf_event *event) * CPU-A CPU-B * * perf_event_disable_inatomic() - * @pending_disable = CPU-A; + * @pending_disable = 1; * irq_work_queue(); * * sched-out - * @pending_disable = -1; + * @pending_disable = 0; * * sched-in * perf_event_disable_inatomic() - * @pending_disable = CPU-B; + * @pending_disable = 1; * irq_work_queue(); // FAILS * * irq_work_run() @@ -6983,12 +7315,6 @@ static void perf_pending_task(struct callback_head *head) int rctx; /* - * All accesses to the event must belong to the same implicit RCU read-side - * critical section as the ->pending_work reset. See comment in - * perf_pending_task_sync(). - */ - rcu_read_lock(); - /* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'. */ @@ -6998,9 +7324,8 @@ static void perf_pending_task(struct callback_head *head) event->pending_work = 0; perf_sigtrap(event); local_dec(&event->ctx->nr_no_switch_fast); - rcuwait_wake_up(&event->pending_work_wait); } - rcu_read_unlock(); + put_event(event); if (rctx >= 0) perf_swevent_put_recursion_context(rctx); @@ -7132,6 +7457,10 @@ perf_sample_ustack_size(u16 stack_size, u16 header_size, if (!regs) return 0; + /* No mm, no stack, no dump. */ + if (!current->mm) + return 0; + /* * Check if we fit in with the requested stack size into the: * - TASK_SIZE @@ -7444,9 +7773,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; - if ((leader != event) && - (leader->state == PERF_EVENT_STATE_ACTIVE)) - leader->pmu->read(leader); + if ((leader != event) && !handle->skip_read) + perf_pmu_read(leader); values[n++] = perf_event_count(leader, self); if (read_format & PERF_FORMAT_ID) @@ -7459,9 +7787,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, for_each_sibling_event(sub, leader) { n = 0; - if ((sub != event) && - (sub->state == PERF_EVENT_STATE_ACTIVE)) - sub->pmu->read(sub); + if ((sub != event) && !handle->skip_read) + perf_pmu_read(sub); values[n++] = perf_event_count(sub, self); if (read_format & PERF_FORMAT_ID) @@ -7520,6 +7847,9 @@ void perf_output_sample(struct perf_output_handle *handle, { u64 sample_type = data->type; + if (data->sample_flags & PERF_SAMPLE_READ) + handle->skip_read = 1; + perf_output_put(handle, *header); if (sample_type & PERF_SAMPLE_IDENTIFIER) @@ -7842,6 +8172,9 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) const u32 max_stack = event->attr.sample_max_stack; struct perf_callchain_entry *callchain; + if (!current->mm) + user = false; + if (!kernel && !user) return &__empty_callchain; @@ -8321,7 +8654,8 @@ void perf_event_exec(void) perf_event_enable_on_exec(ctx); perf_event_remove_on_exec(ctx); - perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); + scoped_guard(rcu) + perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); perf_unpin_context(ctx); put_ctx(ctx); @@ -8513,10 +8847,58 @@ static void perf_event_task(struct task_struct *task, task_ctx); } +/* + * Allocate data for a new task when profiling system-wide + * events which require PMU specific data + */ +static void +perf_event_alloc_task_data(struct task_struct *child, + struct task_struct *parent) +{ + struct kmem_cache *ctx_cache = NULL; + struct perf_ctx_data *cd; + + if (!refcount_read(&global_ctx_data_ref)) + return; + + scoped_guard (rcu) { + cd = rcu_dereference(parent->perf_ctx_data); + if (cd) + ctx_cache = cd->ctx_cache; + } + + if (!ctx_cache) + return; + + guard(percpu_read)(&global_ctx_data_rwsem); + scoped_guard (rcu) { + cd = rcu_dereference(child->perf_ctx_data); + if (!cd) { + /* + * A system-wide event may be unaccount, + * when attaching the perf_ctx_data. + */ + if (!refcount_read(&global_ctx_data_ref)) + return; + goto attach; + } + + if (!cd->global) { + cd->global = 1; + refcount_inc(&cd->refcount); + } + } + + return; +attach: + attach_task_ctx_data(child, ctx_cache, true); +} + void perf_event_fork(struct task_struct *task) { perf_event_task(task, NULL, 1); perf_event_namespaces(task); + perf_event_alloc_task_data(task, current); } /* @@ -8580,7 +8962,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) unsigned int size; memset(comm, 0, sizeof(comm)); - strscpy(comm, comm_event->task->comm, sizeof(comm)); + strscpy(comm, comm_event->task->comm); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; @@ -9024,7 +9406,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) } cpy_name: - strscpy(tmp, name, sizeof(tmp)); + strscpy(tmp, name); name = tmp; got_name: /* @@ -9448,7 +9830,7 @@ void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) goto err; - strscpy(name, sym, KSYM_NAME_LEN); + strscpy(name, sym); name_len = strlen(name) + 1; while (!IS_ALIGNED(name_len, sizeof(u64))) name[name_len++] = '\0'; @@ -9668,7 +10050,7 @@ void perf_event_text_poke(const void *addr, const void *old_bytes, void perf_event_itrace_started(struct perf_event *event) { - event->attach_state |= PERF_ATTACH_ITRACE; + WRITE_ONCE(event->attach_state, event->attach_state | PERF_ATTACH_ITRACE); } static void perf_log_itrace_start(struct perf_event *event) @@ -9751,14 +10133,13 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle) hwc->interrupts = 1; } else { hwc->interrupts++; - if (unlikely(throttle && - hwc->interrupts > max_samples_per_tick)) { - __this_cpu_inc(perf_throttled_count); - tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); - hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(event, 0); - ret = 1; - } + } + + if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) { + __this_cpu_inc(perf_throttled_count); + tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); + perf_event_throttle_group(event); + ret = 1; } if (event->attr.freq) { @@ -9945,6 +10326,7 @@ static int __perf_event_overflow(struct perf_event *event, !task_work_add(current, &event->pending_task, notify_mode)) { event->pending_work = pending_id; local_inc(&event->ctx->nr_no_switch_fast); + WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount)); event->pending_addr = 0; if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) @@ -10790,11 +11172,15 @@ static inline bool perf_event_is_tracing(struct perf_event *event) return false; } -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, - u64 bpf_cookie) +static int __perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) { bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + if (!perf_event_is_tracing(event)) return perf_event_set_bpf_handler(event, prog, bpf_cookie); @@ -10829,8 +11215,25 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, return perf_event_attach_bpf_prog(event, prog, bpf_cookie); } +int perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) +{ + struct perf_event_context *ctx; + int ret; + + ctx = perf_event_ctx_lock(event); + ret = __perf_event_set_bpf_prog(event, prog, bpf_cookie); + perf_event_ctx_unlock(event, ctx); + + return ret; +} + void perf_event_free_bpf_prog(struct perf_event *event) { + if (!event->prog) + return; + if (!perf_event_is_tracing(event)) { perf_event_free_bpf_handler(event); return; @@ -10848,7 +11251,15 @@ static void perf_event_free_filter(struct perf_event *event) { } -int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, +static int __perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) +{ + return -ENOENT; +} + +int perf_event_set_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, u64 bpf_cookie) { return -ENOENT; @@ -10929,6 +11340,17 @@ static void perf_addr_filters_splice(struct perf_event *event, free_filters_list(&list); } +static void perf_free_addr_filters(struct perf_event *event) +{ + /* + * Used during free paths, there is no concurrency. + */ + if (list_empty(&event->addr_filters.list)) + return; + + perf_addr_filters_splice(event, NULL); +} + /* * Scan through mm's vmas and see if one of them matches the * @filter; if so, adjust filter's address range. @@ -11352,7 +11774,12 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - if (is_sampling_event(event)) { + /* + * The throttle can be triggered in the hrtimer handler. + * The HRTIMER_NORESTART should be used to stop the timer, + * rather than hrtimer_cancel(). See perf_swevent_hrtimer() + */ + if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) { ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); local64_set(&hwc->period_left, ktime_to_ns(remaining)); @@ -11367,8 +11794,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) if (!is_sampling_event(event)) return; - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - hwc->hrtimer.function = perf_swevent_hrtimer; + hrtimer_setup(&hwc->hrtimer, perf_swevent_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); /* * Since hrtimers have a fixed rate, we can do a static freq->period @@ -11408,7 +11834,8 @@ static void cpu_clock_event_start(struct perf_event *event, int flags) static void cpu_clock_event_stop(struct perf_event *event, int flags) { perf_swevent_cancel_hrtimer(event); - cpu_clock_event_update(event); + if (flags & PERF_EF_UPDATE) + cpu_clock_event_update(event); } static int cpu_clock_event_add(struct perf_event *event, int flags) @@ -11486,7 +11913,8 @@ static void task_clock_event_start(struct perf_event *event, int flags) static void task_clock_event_stop(struct perf_event *event, int flags) { perf_swevent_cancel_hrtimer(event); - task_clock_event_update(event, event->ctx->time); + if (flags & PERF_EF_UPDATE) + task_clock_event_update(event, event->ctx->time); } static int task_clock_event_add(struct perf_event *event, int flags) @@ -11605,11 +12033,6 @@ static int perf_event_idx_default(struct perf_event *event) return 0; } -static void free_pmu_context(struct pmu *pmu) -{ - free_percpu(pmu->cpu_pmu_context); -} - /* * Let userspace know that this PMU supports address range filtering: */ @@ -11619,7 +12042,7 @@ static ssize_t nr_addr_filters_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); + return sysfs_emit(page, "%d\n", pmu->nr_addr_filters); } DEVICE_ATTR_RO(nr_addr_filters); @@ -11630,7 +12053,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); - return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type); + return sysfs_emit(page, "%d\n", pmu->type); } static DEVICE_ATTR_RO(type); @@ -11641,7 +12064,7 @@ perf_event_mux_interval_ms_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms); + return sysfs_emit(page, "%d\n", pmu->hrtimer_interval_ms); } static DEFINE_MUTEX(mux_interval_mutex); @@ -11672,7 +12095,7 @@ perf_event_mux_interval_ms_store(struct device *dev, cpus_read_lock(); for_each_online_cpu(cpu) { struct perf_cpu_pmu_context *cpc; - cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); + cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu); cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc); @@ -11815,62 +12238,107 @@ del_dev: free_dev: put_device(pmu->dev); + pmu->dev = NULL; goto out; } static struct lock_class_key cpuctx_mutex; static struct lock_class_key cpuctx_lock; -int perf_pmu_register(struct pmu *pmu, const char *name, int type) +static bool idr_cmpxchg(struct idr *idr, unsigned long id, void *old, void *new) { - int cpu, ret, max = PERF_TYPE_MAX; + void *tmp, *val = idr_find(idr, id); - mutex_lock(&pmus_lock); - ret = -ENOMEM; - pmu->pmu_disable_count = alloc_percpu(int); - if (!pmu->pmu_disable_count) - goto unlock; + if (val != old) + return false; - pmu->type = -1; - if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) { - ret = -EINVAL; - goto free_pdc; + tmp = idr_replace(idr, new, id); + if (IS_ERR(tmp)) + return false; + + WARN_ON_ONCE(tmp != val); + return true; +} + +static void perf_pmu_free(struct pmu *pmu) +{ + if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { + if (pmu->nr_addr_filters) + device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); + device_del(pmu->dev); + put_device(pmu->dev); } - if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) { - ret = -EINVAL; - goto free_pdc; + if (pmu->cpu_pmu_context) { + int cpu; + + for_each_possible_cpu(cpu) { + struct perf_cpu_pmu_context *cpc; + + cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu); + if (!cpc) + continue; + if (cpc->epc.embedded) { + /* refcount managed */ + put_pmu_ctx(&cpc->epc); + continue; + } + kfree(cpc); + } + free_percpu(pmu->cpu_pmu_context); } +} + +DEFINE_FREE(pmu_unregister, struct pmu *, if (_T) perf_pmu_free(_T)) + +int perf_pmu_register(struct pmu *_pmu, const char *name, int type) +{ + int cpu, max = PERF_TYPE_MAX; + + struct pmu *pmu __free(pmu_unregister) = _pmu; + guard(mutex)(&pmus_lock); + + if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) + return -EINVAL; + + if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, + "Can not register a pmu with an invalid scope.\n")) + return -EINVAL; pmu->name = name; if (type >= 0) max = type; - ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL); - if (ret < 0) - goto free_pdc; + CLASS(idr_alloc, pmu_type)(&pmu_idr, NULL, max, 0, GFP_KERNEL); + if (pmu_type.id < 0) + return pmu_type.id; - WARN_ON(type >= 0 && ret != type); + WARN_ON(type >= 0 && pmu_type.id != type); - type = ret; - pmu->type = type; + pmu->type = pmu_type.id; + atomic_set(&pmu->exclusive_cnt, 0); if (pmu_bus_running && !pmu->dev) { - ret = pmu_dev_alloc(pmu); + int ret = pmu_dev_alloc(pmu); if (ret) - goto free_idr; + return ret; } - ret = -ENOMEM; - pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context); + pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context *); if (!pmu->cpu_pmu_context) - goto free_dev; + return -ENOMEM; for_each_possible_cpu(cpu) { - struct perf_cpu_pmu_context *cpc; + struct perf_cpu_pmu_context *cpc = + kmalloc_node(sizeof(struct perf_cpu_pmu_context), + GFP_KERNEL | __GFP_ZERO, + cpu_to_node(cpu)); + + if (!cpc) + return -ENOMEM; - cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); + *per_cpu_ptr(pmu->cpu_pmu_context, cpu) = cpc; __perf_init_event_pmu_context(&cpc->epc, pmu); __perf_mux_hrtimer_init(cpc, cpu); } @@ -11903,51 +12371,159 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; + INIT_LIST_HEAD(&pmu->events); + spin_lock_init(&pmu->events_lock); + + /* + * Now that the PMU is complete, make it visible to perf_try_init_event(). + */ + if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu)) + return -EINVAL; list_add_rcu(&pmu->entry, &pmus); - atomic_set(&pmu->exclusive_cnt, 0); - ret = 0; -unlock: - mutex_unlock(&pmus_lock); - return ret; + take_idr_id(pmu_type); + _pmu = no_free_ptr(pmu); // let it rip + return 0; +} +EXPORT_SYMBOL_GPL(perf_pmu_register); -free_dev: - if (pmu->dev && pmu->dev != PMU_NULL_DEV) { - device_del(pmu->dev); - put_device(pmu->dev); +static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event, + struct perf_event_context *ctx) +{ + /* + * De-schedule the event and mark it REVOKED. + */ + perf_event_exit_event(event, ctx, true); + + /* + * All _free_event() bits that rely on event->pmu: + * + * Notably, perf_mmap() relies on the ordering here. + */ + scoped_guard (mutex, &event->mmap_mutex) { + WARN_ON_ONCE(pmu->event_unmapped); + /* + * Mostly an empty lock sequence, such that perf_mmap(), which + * relies on mmap_mutex, is sure to observe the state change. + */ + } + + perf_event_free_bpf_prog(event); + perf_free_addr_filters(event); + + if (event->destroy) { + event->destroy(event); + event->destroy = NULL; + } + + if (event->pmu_ctx) { + put_pmu_ctx(event->pmu_ctx); + event->pmu_ctx = NULL; } -free_idr: - idr_remove(&pmu_idr, pmu->type); + exclusive_event_destroy(event); + module_put(pmu->module); -free_pdc: - free_percpu(pmu->pmu_disable_count); - goto unlock; + event->pmu = NULL; /* force fault instead of UAF */ } -EXPORT_SYMBOL_GPL(perf_pmu_register); -void perf_pmu_unregister(struct pmu *pmu) +static void pmu_detach_event(struct pmu *pmu, struct perf_event *event) { - mutex_lock(&pmus_lock); - list_del_rcu(&pmu->entry); + struct perf_event_context *ctx; + + ctx = perf_event_ctx_lock(event); + __pmu_detach_event(pmu, event, ctx); + perf_event_ctx_unlock(event, ctx); + + scoped_guard (spinlock, &pmu->events_lock) + list_del(&event->pmu_list); +} + +static struct perf_event *pmu_get_event(struct pmu *pmu) +{ + struct perf_event *event; + + guard(spinlock)(&pmu->events_lock); + list_for_each_entry(event, &pmu->events, pmu_list) { + if (atomic_long_inc_not_zero(&event->refcount)) + return event; + } + + return NULL; +} + +static bool pmu_empty(struct pmu *pmu) +{ + guard(spinlock)(&pmu->events_lock); + return list_empty(&pmu->events); +} + +static void pmu_detach_events(struct pmu *pmu) +{ + struct perf_event *event; + + for (;;) { + event = pmu_get_event(pmu); + if (!event) + break; + + pmu_detach_event(pmu, event); + put_event(event); + } + + /* + * wait for pending _free_event()s + */ + wait_var_event(pmu, pmu_empty(pmu)); +} + +int perf_pmu_unregister(struct pmu *pmu) +{ + scoped_guard (mutex, &pmus_lock) { + if (!idr_cmpxchg(&pmu_idr, pmu->type, pmu, NULL)) + return -EINVAL; + + list_del_rcu(&pmu->entry); + } /* * We dereference the pmu list under both SRCU and regular RCU, so * synchronize against both of those. + * + * Notably, the entirety of event creation, from perf_init_event() + * (which will now fail, because of the above) until + * perf_install_in_context() should be under SRCU such that + * this synchronizes against event creation. This avoids trying to + * detach events that are not fully formed. */ synchronize_srcu(&pmus_srcu); synchronize_rcu(); - free_percpu(pmu->pmu_disable_count); - idr_remove(&pmu_idr, pmu->type); - if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { - if (pmu->nr_addr_filters) - device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); - device_del(pmu->dev); - put_device(pmu->dev); + if (pmu->event_unmapped && !pmu_empty(pmu)) { + /* + * Can't force remove events when pmu::event_unmapped() + * is used in perf_mmap_close(). + */ + guard(mutex)(&pmus_lock); + idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu); + list_add_rcu(&pmu->entry, &pmus); + return -EBUSY; } - free_pmu_context(pmu); - mutex_unlock(&pmus_lock); + + scoped_guard (mutex, &pmus_lock) + idr_remove(&pmu_idr, pmu->type); + + /* + * PMU is removed from the pmus list, so no new events will + * be created, now take care of the existing ones. + */ + pmu_detach_events(pmu); + + /* + * PMU is unused, make it go away. + */ + perf_pmu_free(pmu); + return 0; } EXPORT_SYMBOL_GPL(perf_pmu_unregister); @@ -11987,48 +12563,61 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) if (ctx) perf_event_ctx_unlock(event->group_leader, ctx); - if (!ret) { - if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && - has_extended_regs(event)) - ret = -EOPNOTSUPP; + if (ret) + goto err_pmu; - if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && - event_has_any_exclude_flag(event)) - ret = -EINVAL; + if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && + has_extended_regs(event)) { + ret = -EOPNOTSUPP; + goto err_destroy; + } - if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { - const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); - struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope); - int cpu; - - if (pmu_cpumask && cpumask) { - cpu = cpumask_any_and(pmu_cpumask, cpumask); - if (cpu >= nr_cpu_ids) - ret = -ENODEV; - else - event->event_caps |= PERF_EV_CAP_READ_SCOPE; - } else { - ret = -ENODEV; - } - } + if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && + event_has_any_exclude_flag(event)) { + ret = -EINVAL; + goto err_destroy; + } + + if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { + const struct cpumask *cpumask; + struct cpumask *pmu_cpumask; + int cpu; + + cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); + pmu_cpumask = perf_scope_cpumask(pmu->scope); + + ret = -ENODEV; + if (!pmu_cpumask || !cpumask) + goto err_destroy; + + cpu = cpumask_any_and(pmu_cpumask, cpumask); + if (cpu >= nr_cpu_ids) + goto err_destroy; - if (ret && event->destroy) - event->destroy(event); + event->event_caps |= PERF_EV_CAP_READ_SCOPE; } - if (ret) - module_put(pmu->module); + return 0; + +err_destroy: + if (event->destroy) { + event->destroy(event); + event->destroy = NULL; + } +err_pmu: + event->pmu = NULL; + module_put(pmu->module); return ret; } static struct pmu *perf_init_event(struct perf_event *event) { bool extended_type = false; - int idx, type, ret; struct pmu *pmu; + int type, ret; - idx = srcu_read_lock(&pmus_srcu); + guard(srcu)(&pmus_srcu); /* pmu idr/list access */ /* * Save original type before calling pmu->event_init() since certain @@ -12041,7 +12630,7 @@ static struct pmu *perf_init_event(struct perf_event *event) pmu = event->parent->pmu; ret = perf_try_init_event(pmu, event); if (!ret) - goto unlock; + return pmu; } /* @@ -12060,13 +12649,12 @@ static struct pmu *perf_init_event(struct perf_event *event) } again: - rcu_read_lock(); - pmu = idr_find(&pmu_idr, type); - rcu_read_unlock(); + scoped_guard (rcu) + pmu = idr_find(&pmu_idr, type); if (pmu) { if (event->attr.type != type && type != PERF_TYPE_RAW && !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE)) - goto fail; + return ERR_PTR(-ENOENT); ret = perf_try_init_event(pmu, event); if (ret == -ENOENT && event->attr.type != type && !extended_type) { @@ -12075,27 +12663,21 @@ again: } if (ret) - pmu = ERR_PTR(ret); + return ERR_PTR(ret); - goto unlock; + return pmu; } list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { ret = perf_try_init_event(pmu, event); if (!ret) - goto unlock; + return pmu; - if (ret != -ENOENT) { - pmu = ERR_PTR(ret); - goto unlock; - } + if (ret != -ENOENT) + return ERR_PTR(ret); } -fail: - pmu = ERR_PTR(-ENOENT); -unlock: - srcu_read_unlock(&pmus_srcu, idx); - return pmu; + return ERR_PTR(-ENOENT); } static void attach_sb_event(struct perf_event *event) @@ -12222,7 +12804,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, void *context, int cgroup_fd) { struct pmu *pmu; - struct perf_event *event; struct hw_perf_event *hwc; long err = -EINVAL; int node; @@ -12237,8 +12818,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } node = (cpu >= 0) ? cpu_to_node(cpu) : -1; - event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, - node); + struct perf_event *event __free(__free_event) = + kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, node); if (!event) return ERR_PTR(-ENOMEM); @@ -12260,13 +12841,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->active_entry); INIT_LIST_HEAD(&event->addr_filters.list); INIT_HLIST_NODE(&event->hlist_entry); + INIT_LIST_HEAD(&event->pmu_list); init_waitqueue_head(&event->waitq); init_irq_work(&event->pending_irq, perf_pending_irq); event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable); init_task_work(&event->pending_task, perf_pending_task); - rcuwait_init(&event->pending_work_wait); mutex_init(&event->mmap_mutex); raw_spin_lock_init(&event->addr_filters.lock); @@ -12332,7 +12913,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, hwc = &event->hw; hwc->sample_period = attr->sample_period; - if (attr->freq && attr->sample_freq) + if (is_event_in_freq_mode(event)) hwc->sample_period = 1; hwc->last_period = hwc->sample_period; @@ -12345,15 +12926,25 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * See perf_output_read(). */ if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID)) - goto err_ns; + return ERR_PTR(-EINVAL); if (!has_branch_stack(event)) event->attr.branch_sample_type = 0; pmu = perf_init_event(event); - if (IS_ERR(pmu)) { - err = PTR_ERR(pmu); - goto err_ns; + if (IS_ERR(pmu)) + return (void*)pmu; + + /* + * The PERF_ATTACH_TASK_DATA is set in the event_init()->hw_config(). + * The attach should be right after the perf_init_event(). + * Otherwise, the __free_event() would mistakenly detach the non-exist + * perf_ctx_data because of the other errors between them. + */ + if (event->attach_state & PERF_ATTACH_TASK_DATA) { + err = attach_perf_ctx_data(event); + if (err) + return ERR_PTR(err); } /* @@ -12361,49 +12952,39 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * events (they don't make sense as the cgroup will be different * on other CPUs in the uncore mask). */ - if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) { - err = -EINVAL; - goto err_pmu; - } + if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) + return ERR_PTR(-EINVAL); if (event->attr.aux_output && (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) || - event->attr.aux_pause || event->attr.aux_resume)) { - err = -EOPNOTSUPP; - goto err_pmu; - } + event->attr.aux_pause || event->attr.aux_resume)) + return ERR_PTR(-EOPNOTSUPP); - if (event->attr.aux_pause && event->attr.aux_resume) { - err = -EINVAL; - goto err_pmu; - } + if (event->attr.aux_pause && event->attr.aux_resume) + return ERR_PTR(-EINVAL); if (event->attr.aux_start_paused) { - if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) { - err = -EOPNOTSUPP; - goto err_pmu; - } + if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) + return ERR_PTR(-EOPNOTSUPP); event->hw.aux_paused = 1; } if (cgroup_fd != -1) { err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); if (err) - goto err_pmu; + return ERR_PTR(err); } err = exclusive_event_init(event); if (err) - goto err_pmu; + return ERR_PTR(err); if (has_addr_filter(event)) { event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters, sizeof(struct perf_addr_filter_range), GFP_KERNEL); - if (!event->addr_filter_ranges) { - err = -ENOMEM; - goto err_per_task; - } + if (!event->addr_filter_ranges) + return ERR_PTR(-ENOMEM); /* * Clone the parent's vma offsets: they are valid until exec() @@ -12427,42 +13008,26 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(attr->sample_max_stack); if (err) - goto err_addr_filters; + return ERR_PTR(err); + event->attach_state |= PERF_ATTACH_CALLCHAIN; } } err = security_perf_event_alloc(event); if (err) - goto err_callchain_buffer; + return ERR_PTR(err); /* symmetric to unaccount_event() in _free_event() */ account_event(event); - return event; - -err_callchain_buffer: - if (!event->parent) { - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - } -err_addr_filters: - kfree(event->addr_filter_ranges); - -err_per_task: - exclusive_event_destroy(event); - -err_pmu: - if (is_cgroup_event(event)) - perf_detach_cgroup(event); - if (event->destroy) - event->destroy(event); - module_put(pmu->module); -err_ns: - if (event->hw.target) - put_task_struct(event->hw.target); - call_rcu(&event->rcu_head, free_event_rcu); + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). + */ + lockdep_assert_held(&pmus_srcu); + scoped_guard (spinlock, &pmu->events_lock) + list_add(&event->pmu_list, &pmu->events); - return ERR_PTR(err); + return_ptr(event); } static int perf_copy_attr(struct perf_event_attr __user *uattr, @@ -12532,7 +13097,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, } /* privileged levels capture (kernel, hv): check permissions */ if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) { - ret = perf_allow_kernel(attr); + ret = perf_allow_kernel(); if (ret) return ret; } @@ -12661,6 +13226,9 @@ set: goto unlock; if (output_event) { + if (output_event->state <= PERF_EVENT_STATE_REVOKED) + goto unlock; + /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); if (!rb) @@ -12789,12 +13357,12 @@ SYSCALL_DEFINE5(perf_event_open, return err; /* Do we allow access to perf_event_open(2) ? */ - err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); + err = security_perf_event_open(PERF_SECURITY_OPEN); if (err) return err; if (!attr.exclude_kernel) { - err = perf_allow_kernel(&attr); + err = perf_allow_kernel(); if (err) return err; } @@ -12814,7 +13382,7 @@ SYSCALL_DEFINE5(perf_event_open, /* Only privileged users can get physical addresses */ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) { - err = perf_allow_kernel(&attr); + err = perf_allow_kernel(); if (err) return err; } @@ -12842,6 +13410,11 @@ SYSCALL_DEFINE5(perf_event_open, if (event_fd < 0) return event_fd; + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). + */ + guard(srcu)(&pmus_srcu); + CLASS(fd, group)(group_fd); // group_fd == -1 => empty if (group_fd != -1) { if (!is_perf_file(group)) { @@ -12849,6 +13422,10 @@ SYSCALL_DEFINE5(perf_event_open, goto err_fd; } group_leader = fd_file(group)->private_data; + if (group_leader->state <= PERF_EVENT_STATE_REVOKED) { + err = -ENODEV; + goto err_fd; + } if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) @@ -13145,7 +13722,7 @@ err_cred: if (task) up_read(&task->signal->exec_update_lock); err_alloc: - free_event(event); + put_event(event); err_task: if (task) put_task_struct(task); @@ -13182,6 +13759,11 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, if (attr->aux_output || attr->aux_action) return ERR_PTR(-EINVAL); + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). + */ + guard(srcu)(&pmus_srcu); + event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler, context, -1); if (IS_ERR(event)) { @@ -13253,7 +13835,7 @@ err_unlock: perf_unpin_context(ctx); put_ctx(ctx); err_alloc: - free_event(event); + put_event(event); err: return ERR_PTR(err); } @@ -13393,10 +13975,12 @@ static void sync_child_event(struct perf_event *child_event) } static void -perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) +perf_event_exit_event(struct perf_event *event, + struct perf_event_context *ctx, bool revoke) { struct perf_event *parent_event = event->parent; - unsigned long detach_flags = 0; + unsigned long detach_flags = DETACH_EXIT; + unsigned int attach_state; if (parent_event) { /* @@ -13411,28 +13995,38 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) * Do destroy all inherited groups, we don't care about those * and being thorough is better. */ - detach_flags = DETACH_GROUP | DETACH_CHILD; + detach_flags |= DETACH_GROUP | DETACH_CHILD; mutex_lock(&parent_event->child_mutex); + /* PERF_ATTACH_ITRACE might be set concurrently */ + attach_state = READ_ONCE(event->attach_state); } - perf_remove_from_context(event, detach_flags); - - raw_spin_lock_irq(&ctx->lock); - if (event->state > PERF_EVENT_STATE_EXIT) - perf_event_set_state(event, PERF_EVENT_STATE_EXIT); - raw_spin_unlock_irq(&ctx->lock); + if (revoke) + detach_flags |= DETACH_GROUP | DETACH_REVOKE; + perf_remove_from_context(event, detach_flags); /* * Child events can be freed. */ if (parent_event) { mutex_unlock(&parent_event->child_mutex); + /* - * Kick perf_poll() for is_event_hup(); + * Match the refcount initialization. Make sure it doesn't happen + * twice if pmu_detach_event() calls it on an already exited task. */ - perf_event_wakeup(parent_event); - free_event(event); - put_event(parent_event); + if (attach_state & PERF_ATTACH_CHILD) { + /* + * Kick perf_poll() for is_event_hup(); + */ + perf_event_wakeup(parent_event); + /* + * pmu_detach_event() will have an extra refcount. + * perf_pending_task() might have one too. + */ + put_event(event); + } + return; } @@ -13442,15 +14036,13 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) perf_event_wakeup(event); } -static void perf_event_exit_task_context(struct task_struct *child) +static void perf_event_exit_task_context(struct task_struct *task, bool exit) { - struct perf_event_context *child_ctx, *clone_ctx = NULL; + struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_event *child_event, *next; - WARN_ON_ONCE(child != current); - - child_ctx = perf_pin_task_context(child); - if (!child_ctx) + ctx = perf_pin_task_context(task); + if (!ctx) return; /* @@ -13463,27 +14055,28 @@ static void perf_event_exit_task_context(struct task_struct *child) * without ctx::mutex (it cannot because of the move_group double mutex * lock thing). See the comments in perf_install_in_context(). */ - mutex_lock(&child_ctx->mutex); + mutex_lock(&ctx->mutex); /* * In a single ctx::lock section, de-schedule the events and detach the * context from the task such that we cannot ever get it scheduled back * in. */ - raw_spin_lock_irq(&child_ctx->lock); - task_ctx_sched_out(child_ctx, NULL, EVENT_ALL); + raw_spin_lock_irq(&ctx->lock); + if (exit) + task_ctx_sched_out(ctx, NULL, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation * and mark the context dead. */ - RCU_INIT_POINTER(child->perf_event_ctxp, NULL); - put_ctx(child_ctx); /* cannot be last */ - WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); - put_task_struct(current); /* cannot be last */ + RCU_INIT_POINTER(task->perf_event_ctxp, NULL); + put_ctx(ctx); /* cannot be last */ + WRITE_ONCE(ctx->task, TASK_TOMBSTONE); + put_task_struct(task); /* cannot be last */ - clone_ctx = unclone_ctx(child_ctx); - raw_spin_unlock_irq(&child_ctx->lock); + clone_ctx = unclone_ctx(ctx); + raw_spin_unlock_irq(&ctx->lock); if (clone_ctx) put_ctx(clone_ctx); @@ -13493,28 +14086,48 @@ static void perf_event_exit_task_context(struct task_struct *child) * won't get any samples after PERF_RECORD_EXIT. We can however still * get a few PERF_RECORD_READ events. */ - perf_event_task(child, child_ctx, 0); + if (exit) + perf_event_task(task, ctx, 0); - list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) - perf_event_exit_event(child_event, child_ctx); + list_for_each_entry_safe(child_event, next, &ctx->event_list, event_entry) + perf_event_exit_event(child_event, ctx, false); - mutex_unlock(&child_ctx->mutex); + mutex_unlock(&ctx->mutex); - put_ctx(child_ctx); + if (!exit) { + /* + * perf_event_release_kernel() could still have a reference on + * this context. In that case we must wait for these events to + * have been freed (in particular all their references to this + * task must've been dropped). + * + * Without this copy_process() will unconditionally free this + * task (irrespective of its reference count) and + * _free_event()'s put_task_struct(event->hw.target) will be a + * use-after-free. + * + * Wait for all events to drop their context reference. + */ + wait_var_event(&ctx->refcount, + refcount_read(&ctx->refcount) == 1); + } + put_ctx(ctx); } /* - * When a child task exits, feed back event values to parent events. + * When a task exits, feed back event values to parent events. * * Can be called with exec_update_lock held when called from * setup_new_exec(). */ -void perf_event_exit_task(struct task_struct *child) +void perf_event_exit_task(struct task_struct *task) { struct perf_event *event, *tmp; - mutex_lock(&child->perf_event_mutex); - list_for_each_entry_safe(event, tmp, &child->perf_event_list, + WARN_ON_ONCE(task != current); + + mutex_lock(&task->perf_event_mutex); + list_for_each_entry_safe(event, tmp, &task->perf_event_list, owner_entry) { list_del_init(&event->owner_entry); @@ -13525,38 +14138,23 @@ void perf_event_exit_task(struct task_struct *child) */ smp_store_release(&event->owner, NULL); } - mutex_unlock(&child->perf_event_mutex); + mutex_unlock(&task->perf_event_mutex); - perf_event_exit_task_context(child); + perf_event_exit_task_context(task, true); /* * The perf_event_exit_task_context calls perf_event_task - * with child's task_ctx, which generates EXIT events for - * child contexts and sets child->perf_event_ctxp[] to NULL. + * with task's task_ctx, which generates EXIT events for + * task contexts and sets task->perf_event_ctxp[] to NULL. * At this point we need to send EXIT events to cpu contexts. */ - perf_event_task(child, NULL, 0); -} - -static void perf_free_event(struct perf_event *event, - struct perf_event_context *ctx) -{ - struct perf_event *parent = event->parent; - - if (WARN_ON_ONCE(!parent)) - return; + perf_event_task(task, NULL, 0); - mutex_lock(&parent->child_mutex); - list_del_init(&event->child_list); - mutex_unlock(&parent->child_mutex); - - put_event(parent); - - raw_spin_lock_irq(&ctx->lock); - perf_group_detach(event); - list_del_event(event, ctx); - raw_spin_unlock_irq(&ctx->lock); - free_event(event); + /* + * Detach the perf_ctx_data for the system-wide event. + */ + guard(percpu_read)(&global_ctx_data_rwsem); + detach_task_ctx_data(task); } /* @@ -13568,48 +14166,7 @@ static void perf_free_event(struct perf_event *event, */ void perf_event_free_task(struct task_struct *task) { - struct perf_event_context *ctx; - struct perf_event *event, *tmp; - - ctx = rcu_access_pointer(task->perf_event_ctxp); - if (!ctx) - return; - - mutex_lock(&ctx->mutex); - raw_spin_lock_irq(&ctx->lock); - /* - * Destroy the task <-> ctx relation and mark the context dead. - * - * This is important because even though the task hasn't been - * exposed yet the context has been (through child_list). - */ - RCU_INIT_POINTER(task->perf_event_ctxp, NULL); - WRITE_ONCE(ctx->task, TASK_TOMBSTONE); - put_task_struct(task); /* cannot be last */ - raw_spin_unlock_irq(&ctx->lock); - - - list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) - perf_free_event(event, ctx); - - mutex_unlock(&ctx->mutex); - - /* - * perf_event_release_kernel() could've stolen some of our - * child events and still have them on its free_list. In that - * case we must wait for these events to have been freed (in - * particular all their references to this task must've been - * dropped). - * - * Without this copy_process() will unconditionally free this - * task (irrespective of its reference count) and - * _free_event()'s put_task_struct(event->hw.target) will be a - * use-after-free. - * - * Wait for all events to drop their context reference. - */ - wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); - put_ctx(ctx); /* must be last */ + perf_event_exit_task_context(task, false); } void perf_event_delayed_put(struct task_struct *task) @@ -13647,12 +14204,12 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event) return &event->attr; } -int perf_allow_kernel(struct perf_event_attr *attr) +int perf_allow_kernel(void) { if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) return -EACCES; - return security_perf_event_open(attr, PERF_SECURITY_KERNEL); + return security_perf_event_open(PERF_SECURITY_KERNEL); } EXPORT_SYMBOL_GPL(perf_allow_kernel); @@ -13686,6 +14243,14 @@ inherit_event(struct perf_event *parent_event, if (parent_event->parent) parent_event = parent_event->parent; + if (parent_event->state <= PERF_EVENT_STATE_REVOKED) + return NULL; + + /* + * Event creation should be under SRCU, see perf_pmu_unregister(). + */ + guard(srcu)(&pmus_srcu); + child_event = perf_event_alloc(&parent_event->attr, parent_event->cpu, child, @@ -13694,6 +14259,9 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; + get_ctx(child_ctx); + child_event->ctx = child_ctx; + pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event); if (IS_ERR(pmu_ctx)) { free_event(child_event); @@ -13711,13 +14279,10 @@ inherit_event(struct perf_event *parent_event, if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { mutex_unlock(&parent_event->child_mutex); - /* task_ctx_data is freed with child_ctx */ free_event(child_event); return NULL; } - get_ctx(child_ctx); - /* * Make the child state follow the state of the parent event, * not its attr.disabled bit. We hold the parent's mutex, @@ -13738,7 +14303,6 @@ inherit_event(struct perf_event *parent_event, local64_set(&hwc->period_left, sample_period); } - child_event->ctx = child_ctx; child_event->overflow_handler = parent_event->overflow_handler; child_event->overflow_handler_context = parent_event->overflow_handler_context; @@ -13969,6 +14533,7 @@ int perf_event_init_task(struct task_struct *child, u64 clone_flags) child->perf_event_ctxp = NULL; mutex_init(&child->perf_event_mutex); INIT_LIST_HEAD(&child->perf_event_list); + child->perf_ctx_data = NULL; ret = perf_event_init_context(child, clone_flags); if (ret) { diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index bc4a61029b6d..8ec2cb688903 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -950,9 +950,10 @@ static int hw_breakpoint_event_init(struct perf_event *bp) return -ENOENT; /* - * no branch sampling for breakpoint events + * Check if breakpoint type is supported before proceeding. + * Also, no branch sampling for breakpoint events. */ - if (has_branch_stack(bp)) + if (!hw_breakpoint_slots_cached(find_slot_idx(bp->attr.bp_type)) || has_branch_stack(bp)) return -EOPNOTSUPP; err = register_perf_hw_breakpoint(bp); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 180509132d4b..aa9a759e824f 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -19,7 +19,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle) { - atomic_set(&handle->rb->poll, EPOLLIN); + atomic_set(&handle->rb->poll, EPOLLIN | EPOLLRDNORM); handle->event->pending_wakeup = 1; @@ -185,6 +185,7 @@ __perf_output_begin(struct perf_output_handle *handle, handle->rb = rb; handle->event = event; + handle->flags = 0; have_lost = local_read(&rb->lost); if (unlikely(have_lost)) { @@ -440,7 +441,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * store that will be enabled on successful return */ if (!handle->size) { /* A, matches D */ - event->pending_disable = smp_processor_id(); + perf_event_disable_inatomic(handle->event); perf_output_wakeup(handle); WRITE_ONCE(rb->aux_nest, 0); goto err_put; @@ -525,7 +526,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) if (wakeup) { if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) - handle->event->pending_disable = smp_processor_id(); + perf_event_disable_inatomic(handle->event); perf_output_wakeup(handle); } @@ -678,7 +679,15 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, { bool overwrite = !(flags & RING_BUFFER_WRITABLE); int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); - int ret = -ENOMEM, max_order; + bool use_contiguous_pages = event->pmu->capabilities & ( + PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_PREFER_LARGE); + /* + * Initialize max_order to 0 for page allocation. This allocates single + * pages to minimize memory fragmentation. This is overridden if the + * PMU needs or prefers contiguous pages (use_contiguous_pages = true). + */ + int max_order = 0; + int ret = -ENOMEM; if (!has_aux(event)) return -EOPNOTSUPP; @@ -688,8 +697,8 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, if (!overwrite) { /* - * Watermark defaults to half the buffer, and so does the - * max_order, to aid PMU drivers in double buffering. + * Watermark defaults to half the buffer, to aid PMU drivers + * in double buffering. */ if (!watermark) watermark = min_t(unsigned long, @@ -697,16 +706,19 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, (unsigned long)nr_pages << (PAGE_SHIFT - 1)); /* - * Use aux_watermark as the basis for chunking to - * help PMU drivers honor the watermark. + * If using contiguous pages, use aux_watermark as the basis + * for chunking to help PMU drivers honor the watermark. */ - max_order = get_order(watermark); + if (use_contiguous_pages) + max_order = get_order(watermark); } else { /* - * We need to start with the max_order that fits in nr_pages, - * not the other way around, hence ilog2() and not get_order. + * If using contiguous pages, we need to start with the + * max_order that fits in nr_pages, not the other way around, + * hence ilog2() and not get_order. */ - max_order = ilog2(nr_pages); + if (use_contiguous_pages) + max_order = ilog2(nr_pages); watermark = 0; } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2ca797cbe465..4c965ba77f9f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -29,6 +29,7 @@ #include <linux/workqueue.h> #include <linux/srcu.h> #include <linux/oom.h> /* check_stable_address_space */ +#include <linux/pagewalk.h> #include <linux/uprobes.h> @@ -152,80 +153,6 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) } /** - * __replace_page - replace page in vma by new page. - * based on replace_page in mm/ksm.c - * - * @vma: vma that holds the pte pointing to page - * @addr: address the old @page is mapped at - * @old_page: the page we are replacing by new_page - * @new_page: the modified page we replace page by - * - * If @new_page is NULL, only unmap @old_page. - * - * Returns 0 on success, negative error code otherwise. - */ -static int __replace_page(struct vm_area_struct *vma, unsigned long addr, - struct page *old_page, struct page *new_page) -{ - struct folio *old_folio = page_folio(old_page); - struct folio *new_folio; - struct mm_struct *mm = vma->vm_mm; - DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); - int err; - struct mmu_notifier_range range; - - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, - addr + PAGE_SIZE); - - if (new_page) { - new_folio = page_folio(new_page); - err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL); - if (err) - return err; - } - - /* For folio_free_swap() below */ - folio_lock(old_folio); - - mmu_notifier_invalidate_range_start(&range); - err = -EAGAIN; - if (!page_vma_mapped_walk(&pvmw)) - goto unlock; - VM_BUG_ON_PAGE(addr != pvmw.address, old_page); - - if (new_page) { - folio_get(new_folio); - folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE); - folio_add_lru_vma(new_folio, vma); - } else - /* no new page, just dec_mm_counter for old_page */ - dec_mm_counter(mm, MM_ANONPAGES); - - if (!folio_test_anon(old_folio)) { - dec_mm_counter(mm, mm_counter_file(old_folio)); - inc_mm_counter(mm, MM_ANONPAGES); - } - - flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte))); - ptep_clear_flush(vma, addr, pvmw.pte); - if (new_page) - set_pte_at(mm, addr, pvmw.pte, - mk_pte(new_page, vma->vm_page_prot)); - - folio_remove_rmap_pte(old_folio, old_page, vma); - if (!folio_mapped(old_folio)) - folio_free_swap(old_folio); - page_vma_mapped_walk_done(&pvmw); - folio_put(old_folio); - - err = 0; - unlock: - mmu_notifier_invalidate_range_end(&range); - folio_unlock(old_folio); - return err; -} - -/** * is_swbp_insn - check if instruction is breakpoint instruction. * @insn: instruction to be checked. * Default implementation of is_swbp_insn @@ -417,7 +344,7 @@ static void update_ref_ctr_warn(struct uprobe *uprobe, struct mm_struct *mm, short d) { pr_warn("ref_ctr %s failed for inode: 0x%lx offset: " - "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n", + "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n", d > 0 ? "increment" : "decrement", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, (unsigned long long) uprobe->ref_ctr_offset, mm); @@ -452,6 +379,95 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, return ret; } +static bool orig_page_is_identical(struct vm_area_struct *vma, + unsigned long vaddr, struct page *page, bool *pmd_mappable) +{ + const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT; + struct folio *orig_folio = filemap_get_folio(vma->vm_file->f_mapping, + index); + struct page *orig_page; + bool identical; + + if (IS_ERR(orig_folio)) + return false; + orig_page = folio_file_page(orig_folio, index); + + *pmd_mappable = folio_test_pmd_mappable(orig_folio); + identical = folio_test_uptodate(orig_folio) && + pages_identical(page, orig_page); + folio_put(orig_folio); + return identical; +} + +static int __uprobe_write_opcode(struct vm_area_struct *vma, + struct folio_walk *fw, struct folio *folio, + unsigned long opcode_vaddr, uprobe_opcode_t opcode) +{ + const unsigned long vaddr = opcode_vaddr & PAGE_MASK; + const bool is_register = !!is_swbp_insn(&opcode); + bool pmd_mappable; + + /* For now, we'll only handle PTE-mapped folios. */ + if (fw->level != FW_LEVEL_PTE) + return -EFAULT; + + /* + * See can_follow_write_pte(): we'd actually prefer a writable PTE here, + * but the VMA might not be writable. + */ + if (!pte_write(fw->pte)) { + if (!PageAnonExclusive(fw->page)) + return -EFAULT; + if (unlikely(userfaultfd_pte_wp(vma, fw->pte))) + return -EFAULT; + /* SOFTDIRTY is handled via pte_mkdirty() below. */ + } + + /* + * We'll temporarily unmap the page and flush the TLB, such that we can + * modify the page atomically. + */ + flush_cache_page(vma, vaddr, pte_pfn(fw->pte)); + fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep); + copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + + /* + * When unregistering, we may only zap a PTE if uffd is disabled and + * there are no unexpected folio references ... + */ + if (is_register || userfaultfd_missing(vma) || + (folio_ref_count(folio) != folio_mapcount(folio) + 1 + + folio_test_swapcache(folio) * folio_nr_pages(folio))) + goto remap; + + /* + * ... and the mapped page is identical to the original page that + * would get faulted in on next access. + */ + if (!orig_page_is_identical(vma, vaddr, fw->page, &pmd_mappable)) + goto remap; + + dec_mm_counter(vma->vm_mm, MM_ANONPAGES); + folio_remove_rmap_pte(folio, fw->page, vma); + if (!folio_mapped(folio) && folio_test_swapcache(folio) && + folio_trylock(folio)) { + folio_free_swap(folio); + folio_unlock(folio); + } + folio_put(folio); + + return pmd_mappable; +remap: + /* + * Make sure that our copy_to_page() changes become visible before the + * set_pte_at() write. + */ + smp_wmb(); + /* We modified the page. Make sure to mark the PTE dirty. */ + set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte)); + return 0; +} + /* * NOTE: * Expect the breakpoint instruction to be the smallest size instruction for @@ -463,141 +479,146 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, * * uprobe_write_opcode - write the opcode at a given virtual address. * @auprobe: arch specific probepoint information. - * @mm: the probed process address space. - * @vaddr: the virtual address to store the opcode. - * @opcode: opcode to be written at @vaddr. + * @vma: the probed virtual memory area. + * @opcode_vaddr: the virtual address to store the opcode. + * @opcode: opcode to be written at @opcode_vaddr. * * Called with mm->mmap_lock held for read or write. * Return 0 (success) or a negative errno. */ -int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, - unsigned long vaddr, uprobe_opcode_t opcode) +int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + const unsigned long opcode_vaddr, uprobe_opcode_t opcode) { + const unsigned long vaddr = opcode_vaddr & PAGE_MASK; + struct mm_struct *mm = vma->vm_mm; struct uprobe *uprobe; - struct page *old_page, *new_page; - struct vm_area_struct *vma; int ret, is_register, ref_ctr_updated = 0; - bool orig_page_huge = false; unsigned int gup_flags = FOLL_FORCE; + struct mmu_notifier_range range; + struct folio_walk fw; + struct folio *folio; + struct page *page; is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); -retry: + if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags))) + return -EINVAL; + + /* + * When registering, we have to break COW to get an exclusive anonymous + * page that we can safely modify. Use FOLL_WRITE to trigger a write + * fault if required. When unregistering, we might be lucky and the + * anon page is already gone. So defer write faults until really + * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode() + * cannot deal with PMDs yet. + */ if (is_register) - gup_flags |= FOLL_SPLIT_PMD; - /* Read the page with vaddr into memory */ - old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); - if (IS_ERR(old_page)) - return PTR_ERR(old_page); + gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; - ret = verify_opcode(old_page, vaddr, &opcode); +retry: + ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL); if (ret <= 0) - goto put_old; + goto out; + folio = page_folio(page); - if (WARN(!is_register && PageCompound(old_page), - "uprobe unregister should never work on compound page\n")) { - ret = -EINVAL; - goto put_old; + ret = verify_opcode(page, opcode_vaddr, &opcode); + if (ret <= 0) { + folio_put(folio); + goto out; } /* We are going to replace instruction, update ref_ctr. */ if (!ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); - if (ret) - goto put_old; + if (ret) { + folio_put(folio); + goto out; + } ref_ctr_updated = 1; } ret = 0; - if (!is_register && !PageAnon(old_page)) - goto put_old; - - ret = anon_vma_prepare(vma); - if (ret) - goto put_old; - - ret = -ENOMEM; - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); - if (!new_page) - goto put_old; - - __SetPageUptodate(new_page); - copy_highpage(new_page, old_page); - copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + if (unlikely(!folio_test_anon(folio))) { + VM_WARN_ON_ONCE(is_register); + folio_put(folio); + goto out; + } if (!is_register) { - struct page *orig_page; - pgoff_t index; - - VM_BUG_ON_PAGE(!PageAnon(old_page), old_page); - - index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT; - orig_page = find_get_page(vma->vm_file->f_inode->i_mapping, - index); - - if (orig_page) { - if (PageUptodate(orig_page) && - pages_identical(new_page, orig_page)) { - /* let go new_page */ - put_page(new_page); - new_page = NULL; - - if (PageCompound(orig_page)) - orig_page_huge = true; - } - put_page(orig_page); - } + /* + * In the common case, we'll be able to zap the page when + * unregistering. So trigger MMU notifiers now, as we won't + * be able to do it under PTL. + */ + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, + vaddr, vaddr + PAGE_SIZE); + mmu_notifier_invalidate_range_start(&range); + } + + ret = -EAGAIN; + /* Walk the page tables again, to perform the actual update. */ + if (folio_walk_start(&fw, vma, vaddr, 0)) { + if (fw.page == page) + ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode); + folio_walk_end(&fw, vma); } - ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page); - if (new_page) - put_page(new_page); -put_old: - put_page(old_page); + if (!is_register) + mmu_notifier_invalidate_range_end(&range); - if (unlikely(ret == -EAGAIN)) + folio_put(folio); + switch (ret) { + case -EFAULT: + gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD; + fallthrough; + case -EAGAIN: goto retry; + default: + break; + } +out: /* Revert back reference counter if instruction update failed. */ - if (ret && is_register && ref_ctr_updated) + if (ret < 0 && is_register && ref_ctr_updated) update_ref_ctr(uprobe, mm, -1); /* try collapse pmd for compound page */ - if (!ret && orig_page_huge) + if (ret > 0) collapse_pte_mapped_thp(mm, vaddr, false); - return ret; + return ret < 0 ? ret : 0; } /** * set_swbp - store breakpoint at a given address. * @auprobe: arch specific probepoint information. - * @mm: the probed process address space. + * @vma: the probed virtual memory area. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, store the breakpoint instruction at @vaddr. * Return 0 (success) or a negative errno. */ -int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) +int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma, + unsigned long vaddr) { - return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); + return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN); } /** * set_orig_insn - Restore the original instruction. - * @mm: the probed process address space. + * @vma: the probed virtual memory area. * @auprobe: arch specific probepoint information. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, restore the original opcode (opcode) at @vaddr. * Return 0 (success) or a negative errno. */ -int __weak -set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) +int __weak set_orig_insn(struct arch_uprobe *auprobe, + struct vm_area_struct *vma, unsigned long vaddr) { - return uprobe_write_opcode(auprobe, mm, vaddr, + return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn); } @@ -762,10 +783,14 @@ static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get) enum hprobe_state hstate; /* - * return_instance's hprobe is protected by RCU. - * Underlying uprobe is itself protected from reuse by SRCU. + * Caller should guarantee that return_instance is not going to be + * freed from under us. This can be achieved either through holding + * rcu_read_lock() or by owning return_instance in the first place. + * + * Underlying uprobe is itself protected from reuse by SRCU, so ensure + * SRCU lock is held properly. */ - lockdep_assert(rcu_read_lock_held() && srcu_read_lock_held(&uretprobes_srcu)); + lockdep_assert(srcu_read_lock_held(&uretprobes_srcu)); hstate = READ_ONCE(hprobe->state); switch (hstate) { @@ -1114,10 +1139,10 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) return ret; } -static int -install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long vaddr) +static int install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, + unsigned long vaddr) { + struct mm_struct *mm = vma->vm_mm; bool first_uprobe; int ret; @@ -1133,7 +1158,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (first_uprobe) set_bit(MMF_HAS_UPROBES, &mm->flags); - ret = set_swbp(&uprobe->arch, mm, vaddr); + ret = set_swbp(&uprobe->arch, vma, vaddr); if (!ret) clear_bit(MMF_RECALC_UPROBES, &mm->flags); else if (first_uprobe) @@ -1142,11 +1167,13 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, return ret; } -static int -remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) +static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, + unsigned long vaddr) { + struct mm_struct *mm = vma->vm_mm; + set_bit(MMF_RECALC_UPROBES, &mm->flags); - return set_orig_insn(&uprobe->arch, mm, vaddr); + return set_orig_insn(&uprobe->arch, vma, vaddr); } struct map_info { @@ -1276,10 +1303,10 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) if (is_register) { /* consult only the "caller", new consumer. */ if (consumer_filter(new, mm)) - err = install_breakpoint(uprobe, mm, vma, info->vaddr); + err = install_breakpoint(uprobe, vma, info->vaddr); } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { if (!filter_chain(uprobe, mm)) - err |= remove_breakpoint(uprobe, mm, info->vaddr); + err |= remove_breakpoint(uprobe, vma, info->vaddr); } unlock: @@ -1452,7 +1479,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) continue; vaddr = offset_to_vaddr(vma, uprobe->offset); - err |= remove_breakpoint(uprobe, mm, vaddr); + err |= remove_breakpoint(uprobe, vma, vaddr); } mmap_read_unlock(mm); @@ -1590,7 +1617,7 @@ int uprobe_mmap(struct vm_area_struct *vma) if (!fatal_signal_pending(current) && filter_chain(uprobe, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); - install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); + install_breakpoint(uprobe, vma, vaddr); } put_uprobe(uprobe); } @@ -1683,7 +1710,8 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) } vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE, - VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, + VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO| + VM_SEALED_SYSMAP, &xol_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); @@ -1935,6 +1963,9 @@ static void free_ret_instance(struct uprobe_task *utask, * to-be-reused return instances for future uretprobes. If ri_timer() * happens to be running right now, though, we fallback to safety and * just perform RCU-delated freeing of ri. + * Admittedly, this is a rather simple use of seqcount, but it nicely + * abstracts away all the necessary memory barriers, so we use + * a well-supported kernel primitive here. */ if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) { /* immediate reuse of ri without RCU GP is OK */ @@ -1995,12 +2026,20 @@ static void ri_timer(struct timer_list *timer) /* RCU protects return_instance from freeing. */ guard(rcu)(); - write_seqcount_begin(&utask->ri_seqcount); + /* + * See free_ret_instance() for notes on seqcount use. + * We also employ raw API variants to avoid lockdep false-positive + * warning complaining about enabled preemption. The timer can only be + * invoked once for a uprobe_task. Therefore there can only be one + * writer. The reader does not require an even sequence count to make + * progress, so it is OK to remain preemptible on PREEMPT_RT. + */ + raw_write_seqcount_begin(&utask->ri_seqcount); for_each_ret_instance_rcu(ri, utask->return_instances) hprobe_expire(&ri->hprobe, false); - write_seqcount_end(&utask->ri_seqcount); + raw_write_seqcount_end(&utask->ri_seqcount); } static struct uprobe_task *alloc_utask(void) @@ -2160,8 +2199,8 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags) */ unsigned long uprobe_get_trampoline_vaddr(void) { + unsigned long trampoline_vaddr = UPROBE_NO_TRAMPOLINE_VADDR; struct xol_area *area; - unsigned long trampoline_vaddr = -1; /* Pairs with xol_add_vma() smp_store_release() */ area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */ @@ -2302,9 +2341,8 @@ bool uprobe_deny_signal(void) WARN_ON_ONCE(utask->state != UTASK_SSTEP); if (task_sigpending(t)) { - spin_lock_irq(&t->sighand->siglock); + utask->signal_denied = true; clear_tsk_thread_flag(t, TIF_SIGPENDING); - spin_unlock_irq(&t->sighand->siglock); if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { utask->state = UTASK_SSTEP_TRAPPED; @@ -2737,9 +2775,10 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) utask->state = UTASK_RUNNING; xol_free_insn_slot(utask); - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); /* see uprobe_deny_signal() */ - spin_unlock_irq(¤t->sighand->siglock); + if (utask->signal_denied) { + set_thread_flag(TIF_SIGPENDING); + utask->signal_denied = false; + } if (unlikely(err)) { uprobe_warn(current, "execute the probed insn, sending SIGILL."); diff --git a/kernel/exit.c b/kernel/exit.c index 3485e5fc499e..bb184a67ac73 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -69,6 +69,7 @@ #include <linux/sysfs.h> #include <linux/user_events.h> #include <linux/uaccess.h> +#include <linux/pidfs.h> #include <uapi/linux/wait.h> @@ -122,14 +123,27 @@ static __init int kernel_exit_sysfs_init(void) late_initcall(kernel_exit_sysfs_init); #endif -static void __unhash_process(struct task_struct *p, bool group_dead) +/* + * For things release_task() would like to do *after* tasklist_lock is released. + */ +struct release_task_post { + struct pid *pids[PIDTYPE_MAX]; +}; + +static void __unhash_process(struct release_task_post *post, struct task_struct *p, + bool group_dead) { + struct pid *pid = task_pid(p); + nr_threads--; - detach_pid(p, PIDTYPE_PID); + + detach_pid(post->pids, p, PIDTYPE_PID); + wake_up_all(&pid->wait_pidfd); + if (group_dead) { - detach_pid(p, PIDTYPE_TGID); - detach_pid(p, PIDTYPE_PGID); - detach_pid(p, PIDTYPE_SID); + detach_pid(post->pids, p, PIDTYPE_TGID); + detach_pid(post->pids, p, PIDTYPE_PGID); + detach_pid(post->pids, p, PIDTYPE_SID); list_del_rcu(&p->tasks); list_del_init(&p->sibling); @@ -141,7 +155,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead) /* * This function expects the tasklist_lock write-locked. */ -static void __exit_signal(struct task_struct *tsk) +static void __exit_signal(struct release_task_post *post, struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; bool group_dead = thread_group_leader(tsk); @@ -174,9 +188,6 @@ static void __exit_signal(struct task_struct *tsk) sig->curr_target = next_thread(tsk); } - add_device_randomness((const void*) &tsk->se.sum_exec_runtime, - sizeof(unsigned long long)); - /* * Accumulate here the counters for all threads as they die. We could * skip the group leader because it is the last user of signal_struct, @@ -197,23 +208,15 @@ static void __exit_signal(struct task_struct *tsk) task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; - __unhash_process(tsk, group_dead); + __unhash_process(post, tsk, group_dead); write_sequnlock(&sig->stats_lock); - /* - * Do this under ->siglock, we can race with another thread - * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. - */ - flush_sigqueue(&tsk->pending); tsk->sighand = NULL; spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); - clear_tsk_thread_flag(tsk, TIF_SIGPENDING); - if (group_dead) { - flush_sigqueue(&sig->shared_pending); + if (group_dead) tty_kref_put(tty); - } } static void delayed_put_task_struct(struct rcu_head *rhp) @@ -239,22 +242,28 @@ void __weak release_thread(struct task_struct *dead_task) void release_task(struct task_struct *p) { + struct release_task_post post; struct task_struct *leader; struct pid *thread_pid; int zap_leader; repeat: + memset(&post, 0, sizeof(post)); + /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. But shut RCU-lockdep up */ rcu_read_lock(); dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); rcu_read_unlock(); + pidfs_exit(p); cgroup_release(p); + /* Retrieve @thread_pid before __unhash_process() may set it to NULL. */ + thread_pid = task_pid(p); + write_lock_irq(&tasklist_lock); ptrace_release_task(p); - thread_pid = get_pid(p->thread_pid); - __exit_signal(p); + __exit_signal(&post, p); /* * If we are the last non-leader member of the thread @@ -265,6 +274,9 @@ repeat: leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { + /* for pidfs_exit() and do_notify_parent() */ + if (leader->signal->flags & SIGNAL_GROUP_EXIT) + leader->exit_code = leader->signal->group_exit_code; /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, @@ -276,9 +288,22 @@ repeat: } write_unlock_irq(&tasklist_lock); + /* @thread_pid can't go away until free_pids() below */ proc_flush_pid(thread_pid); - put_pid(thread_pid); + add_device_randomness(&p->se.sum_exec_runtime, + sizeof(p->se.sum_exec_runtime)); + free_pids(post.pids); release_thread(p); + /* + * This task was already removed from the process/thread/pid lists + * and lock_task_sighand(p) can't succeed. Nobody else can touch + * ->pending or, if group dead, signal->shared_pending. We can call + * flush_sigqueue() lockless. + */ + flush_sigqueue(&p->pending); + if (thread_group_leader(p)) + flush_sigqueue(&p->signal->shared_pending); + put_task_struct_rcu_user(p); p = leader; @@ -396,44 +421,30 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) } } -static void coredump_task_exit(struct task_struct *tsk) +static void coredump_task_exit(struct task_struct *tsk, + struct core_state *core_state) { - struct core_state *core_state; + struct core_thread self; + self.task = tsk; + if (self.task->flags & PF_SIGNALED) + self.next = xchg(&core_state->dumper.next, &self); + else + self.task = NULL; /* - * Serialize with any possible pending coredump. - * We must hold siglock around checking core_state - * and setting PF_POSTCOREDUMP. The core-inducing thread - * will increment ->nr_threads for each thread in the - * group without PF_POSTCOREDUMP set. + * Implies mb(), the result of xchg() must be visible + * to core_state->dumper. */ - spin_lock_irq(&tsk->sighand->siglock); - tsk->flags |= PF_POSTCOREDUMP; - core_state = tsk->signal->core_state; - spin_unlock_irq(&tsk->sighand->siglock); - if (core_state) { - struct core_thread self; - - self.task = current; - if (self.task->flags & PF_SIGNALED) - self.next = xchg(&core_state->dumper.next, &self); - else - self.task = NULL; - /* - * Implies mb(), the result of xchg() must be visible - * to core_state->dumper. - */ - if (atomic_dec_and_test(&core_state->nr_threads)) - complete(&core_state->startup); + if (atomic_dec_and_test(&core_state->nr_threads)) + complete(&core_state->startup); - for (;;) { - set_current_state(TASK_IDLE|TASK_FREEZABLE); - if (!self.task) /* see coredump_finish() */ - break; - schedule(); - } - __set_current_state(TASK_RUNNING); + for (;;) { + set_current_state(TASK_IDLE|TASK_FREEZABLE); + if (!self.task) /* see coredump_finish() */ + break; + schedule(); } + __set_current_state(TASK_RUNNING); } #ifdef CONFIG_MEMCG @@ -740,12 +751,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); tsk->exit_state = EXIT_ZOMBIE; - /* - * sub-thread or delay_group_leader(), wake up the - * PIDFD_THREAD waiters. - */ - if (!thread_group_empty(tsk)) - do_notify_pidfd(tsk); if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && @@ -758,6 +763,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead) do_notify_parent(tsk, tsk->exit_signal); } else { autoreap = true; + /* untraced sub-thread */ + do_notify_pidfd(tsk); } if (autoreap) { @@ -861,6 +868,7 @@ static void synchronize_group_exit(struct task_struct *tsk, long code) { struct sighand_struct *sighand = tsk->sighand; struct signal_struct *signal = tsk->signal; + struct core_state *core_state; spin_lock_irq(&sighand->siglock); signal->quick_threads--; @@ -870,7 +878,19 @@ static void synchronize_group_exit(struct task_struct *tsk, long code) signal->group_exit_code = code; signal->group_stop_count = 0; } + /* + * Serialize with any possible pending coredump. + * We must hold siglock around checking core_state + * and setting PF_POSTCOREDUMP. The core-inducing thread + * will increment ->nr_threads for each thread in the + * group without PF_POSTCOREDUMP set. + */ + tsk->flags |= PF_POSTCOREDUMP; + core_state = signal->core_state; spin_unlock_irq(&sighand->siglock); + + if (unlikely(core_state)) + coredump_task_exit(tsk, core_state); } void __noreturn do_exit(long code) @@ -879,15 +899,12 @@ void __noreturn do_exit(long code) int group_dead; WARN_ON(irqs_disabled()); - - synchronize_group_exit(tsk, code); - WARN_ON(tsk->plug); kcov_task_exit(tsk); kmsan_task_exit(tsk); - coredump_task_exit(tsk); + synchronize_group_exit(tsk, code); ptrace_event(PTRACE_EVENT_EXIT, code); user_events_exit(tsk); @@ -921,12 +938,21 @@ void __noreturn do_exit(long code) tsk->exit_code = code; taskstats_exit(tsk, group_dead); + trace_sched_process_exit(tsk, group_dead); + + /* + * Since sampling can touch ->mm, make sure to stop everything before we + * tear it down. + * + * Also flushes inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_event_exit_task(tsk); exit_mm(); if (group_dead) acct_process(); - trace_sched_process_exit(tsk); exit_sem(tsk); exit_shm(tsk); @@ -938,14 +964,6 @@ void __noreturn do_exit(long code) exit_task_work(tsk); exit_thread(tsk); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - * - * because of cgroup mode, must be called before cgroup_exit() - */ - perf_event_exit_task(tsk); - sched_autogroup_exit_task(tsk); cgroup_exit(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 735405a9c5f3..1ee8eb11f38b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -112,6 +112,9 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> +/* For dup_mmap(). */ +#include "../mm/internal.h" + #include <trace/events/sched.h> #define CREATE_TRACE_POINTS @@ -311,11 +314,9 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) * so memcg accounting is performed manually on assigning/releasing * stacks to tasks. Drop __GFP_ACCOUNT. */ - stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, - VMALLOC_START, VMALLOC_END, + stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, THREADINFO_GFP & ~__GFP_ACCOUNT, - PAGE_KERNEL, - 0, node, __builtin_return_address(0)); + node, __builtin_return_address(0)); if (!stack) return -ENOMEM; @@ -430,112 +431,9 @@ struct kmem_cache *files_cachep; /* SLAB cache for fs_struct structures (tsk->fs) */ struct kmem_cache *fs_cachep; -/* SLAB cache for vm_area_struct structures */ -static struct kmem_cache *vm_area_cachep; - /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -#ifdef CONFIG_PER_VMA_LOCK - -/* SLAB cache for vm_area_struct.lock */ -static struct kmem_cache *vma_lock_cachep; - -static bool vma_lock_alloc(struct vm_area_struct *vma) -{ - vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL); - if (!vma->vm_lock) - return false; - - init_rwsem(&vma->vm_lock->lock); - vma->vm_lock_seq = UINT_MAX; - - return true; -} - -static inline void vma_lock_free(struct vm_area_struct *vma) -{ - kmem_cache_free(vma_lock_cachep, vma->vm_lock); -} - -#else /* CONFIG_PER_VMA_LOCK */ - -static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; } -static inline void vma_lock_free(struct vm_area_struct *vma) {} - -#endif /* CONFIG_PER_VMA_LOCK */ - -struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - - vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (!vma) - return NULL; - - vma_init(vma, mm); - if (!vma_lock_alloc(vma)) { - kmem_cache_free(vm_area_cachep, vma); - return NULL; - } - - return vma; -} - -struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) -{ - struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - - if (!new) - return NULL; - - ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); - ASSERT_EXCLUSIVE_WRITER(orig->vm_file); - /* - * orig->shared.rb may be modified concurrently, but the clone - * will be reinitialized. - */ - data_race(memcpy(new, orig, sizeof(*new))); - if (!vma_lock_alloc(new)) { - kmem_cache_free(vm_area_cachep, new); - return NULL; - } - INIT_LIST_HEAD(&new->anon_vma_chain); - vma_numab_state_init(new); - dup_anon_vma_name(orig, new); - - return new; -} - -void __vm_area_free(struct vm_area_struct *vma) -{ - vma_numab_state_free(vma); - free_anon_vma_name(vma); - vma_lock_free(vma); - kmem_cache_free(vm_area_cachep, vma); -} - -#ifdef CONFIG_PER_VMA_LOCK -static void vm_area_free_rcu_cb(struct rcu_head *head) -{ - struct vm_area_struct *vma = container_of(head, struct vm_area_struct, - vm_rcu); - - /* The vma should not be locked while being destroyed. */ - VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma); - __vm_area_free(vma); -} -#endif - -void vm_area_free(struct vm_area_struct *vma) -{ -#ifdef CONFIG_PER_VMA_LOCK - call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); -#else - __vm_area_free(vma); -#endif -} - static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { @@ -615,7 +513,7 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); -static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) +void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) { struct file *exe_file; @@ -630,178 +528,6 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) } #ifdef CONFIG_MMU -static __latent_entropy int dup_mmap(struct mm_struct *mm, - struct mm_struct *oldmm) -{ - struct vm_area_struct *mpnt, *tmp; - int retval; - unsigned long charge = 0; - LIST_HEAD(uf); - VMA_ITERATOR(vmi, mm, 0); - - if (mmap_write_lock_killable(oldmm)) - return -EINTR; - flush_cache_dup_mm(oldmm); - uprobe_dup_mmap(oldmm, mm); - /* - * Not linked in yet - no deadlock potential: - */ - mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); - - /* No ordering required: file already has been exposed. */ - dup_mm_exe_file(mm, oldmm); - - mm->total_vm = oldmm->total_vm; - mm->data_vm = oldmm->data_vm; - mm->exec_vm = oldmm->exec_vm; - mm->stack_vm = oldmm->stack_vm; - - /* Use __mt_dup() to efficiently build an identical maple tree. */ - retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); - if (unlikely(retval)) - goto out; - - mt_clear_in_rcu(vmi.mas.tree); - for_each_vma(vmi, mpnt) { - struct file *file; - - vma_start_write(mpnt); - if (mpnt->vm_flags & VM_DONTCOPY) { - retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, - mpnt->vm_end, GFP_KERNEL); - if (retval) - goto loop_out; - - vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); - continue; - } - charge = 0; - /* - * Don't duplicate many vmas if we've been oom-killed (for - * example) - */ - if (fatal_signal_pending(current)) { - retval = -EINTR; - goto loop_out; - } - if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned long len = vma_pages(mpnt); - - if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ - goto fail_nomem; - charge = len; - } - tmp = vm_area_dup(mpnt); - if (!tmp) - goto fail_nomem; - retval = vma_dup_policy(mpnt, tmp); - if (retval) - goto fail_nomem_policy; - tmp->vm_mm = mm; - retval = dup_userfaultfd(tmp, &uf); - if (retval) - goto fail_nomem_anon_vma_fork; - if (tmp->vm_flags & VM_WIPEONFORK) { - /* - * VM_WIPEONFORK gets a clean slate in the child. - * Don't prepare anon_vma until fault since we don't - * copy page for current vma. - */ - tmp->anon_vma = NULL; - } else if (anon_vma_fork(tmp, mpnt)) - goto fail_nomem_anon_vma_fork; - vm_flags_clear(tmp, VM_LOCKED_MASK); - /* - * Copy/update hugetlb private vma information. - */ - if (is_vm_hugetlb_page(tmp)) - hugetlb_dup_vma_private(tmp); - - /* - * Link the vma into the MT. After using __mt_dup(), memory - * allocation is not necessary here, so it cannot fail. - */ - vma_iter_bulk_store(&vmi, tmp); - - mm->map_count++; - - if (tmp->vm_ops && tmp->vm_ops->open) - tmp->vm_ops->open(tmp); - - file = tmp->vm_file; - if (file) { - struct address_space *mapping = file->f_mapping; - - get_file(file); - i_mmap_lock_write(mapping); - if (vma_is_shared_maywrite(tmp)) - mapping_allow_writable(mapping); - flush_dcache_mmap_lock(mapping); - /* insert tmp into the share list, just after mpnt */ - vma_interval_tree_insert_after(tmp, mpnt, - &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); - i_mmap_unlock_write(mapping); - } - - if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(tmp, mpnt); - - if (retval) { - mpnt = vma_next(&vmi); - goto loop_out; - } - } - /* a new mm has just been created */ - retval = arch_dup_mmap(oldmm, mm); -loop_out: - vma_iter_free(&vmi); - if (!retval) { - mt_set_in_rcu(vmi.mas.tree); - ksm_fork(mm, oldmm); - khugepaged_fork(mm, oldmm); - } else { - - /* - * The entire maple tree has already been duplicated. If the - * mmap duplication fails, mark the failure point with - * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, - * stop releasing VMAs that have not been duplicated after this - * point. - */ - if (mpnt) { - mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); - mas_store(&vmi.mas, XA_ZERO_ENTRY); - /* Avoid OOM iterating a broken tree */ - set_bit(MMF_OOM_SKIP, &mm->flags); - } - /* - * The mm_struct is going to exit, but the locks will be dropped - * first. Set the mm_struct as unstable is advisable as it is - * not fully initialised. - */ - set_bit(MMF_UNSTABLE, &mm->flags); - } -out: - mmap_write_unlock(mm); - flush_tlb_mm(oldmm); - mmap_write_unlock(oldmm); - if (!retval) - dup_userfaultfd_complete(&uf); - else - dup_userfaultfd_fail(&uf); - return retval; - -fail_nomem_anon_vma_fork: - mpol_put(vma_policy(tmp)); -fail_nomem_policy: - vm_area_free(tmp); -fail_nomem: - retval = -ENOMEM; - vm_unacct_memory(charge); - goto loop_out; -} - static inline int mm_alloc_pgd(struct mm_struct *mm) { mm->pgd = pgd_alloc(mm); @@ -815,17 +541,40 @@ static inline void mm_free_pgd(struct mm_struct *mm) pgd_free(mm, mm->pgd); } #else -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) -{ - mmap_write_lock(oldmm); - dup_mm_exe_file(mm, oldmm); - mmap_write_unlock(oldmm); - return 0; -} #define mm_alloc_pgd(mm) (0) #define mm_free_pgd(mm) #endif /* CONFIG_MMU */ +#ifdef CONFIG_MM_ID +static DEFINE_IDA(mm_ida); + +static inline int mm_alloc_id(struct mm_struct *mm) +{ + int ret; + + ret = ida_alloc_range(&mm_ida, MM_ID_MIN, MM_ID_MAX, GFP_KERNEL); + if (ret < 0) + return ret; + mm->mm_id = ret; + return 0; +} + +static inline void mm_free_id(struct mm_struct *mm) +{ + const mm_id_t id = mm->mm_id; + + mm->mm_id = MM_ID_DUMMY; + if (id == MM_ID_DUMMY) + return; + if (WARN_ON_ONCE(id < MM_ID_MIN || id > MM_ID_MAX)) + return; + ida_free(&mm_ida, id); +} +#else /* !CONFIG_MM_ID */ +static inline int mm_alloc_id(struct mm_struct *mm) { return 0; } +static inline void mm_free_id(struct mm_struct *mm) {} +#endif /* CONFIG_MM_ID */ + static void check_mm(struct mm_struct *mm) { int i; @@ -929,6 +678,7 @@ void __mmdrop(struct mm_struct *mm) WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); + mm_free_id(mm); destroy_context(mm); mmu_notifier_subscriptions_destroy(mm); check_mm(mm); @@ -1263,6 +1013,15 @@ static void mm_init_uprobes_state(struct mm_struct *mm) #endif } +static void mmap_init_lock(struct mm_struct *mm) +{ + init_rwsem(&mm->mmap_lock); + mm_lock_seqcount_init(mm); +#ifdef CONFIG_PER_VMA_LOCK + rcuwait_init(&mm->vma_writer_wait); +#endif +} + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { @@ -1287,6 +1046,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_subscriptions_init(mm); init_tlb_flush_pending(mm); + futex_mm_init(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) mm->pmd_huge_pte = NULL; #endif @@ -1304,6 +1064,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (mm_alloc_pgd(mm)) goto fail_nopgd; + if (mm_alloc_id(mm)) + goto fail_noid; + if (init_new_context(p, mm)) goto fail_nocontext; @@ -1323,6 +1086,8 @@ fail_pcpu: fail_cid: destroy_context(mm); fail_nocontext: + mm_free_id(mm); +fail_noid: mm_free_pgd(mm); fail_nopgd: free_mm(mm); @@ -1364,6 +1129,7 @@ static inline void __mmput(struct mm_struct *mm) if (mm->binfmt) module_put(mm->binfmt->module); lru_gen_del_mm(mm); + futex_hash_free(mm); mmdrop(mm); } @@ -1559,6 +1325,17 @@ struct mm_struct *get_task_mm(struct task_struct *task) } EXPORT_SYMBOL_GPL(get_task_mm); +static bool may_access_mm(struct mm_struct *mm, struct task_struct *task, unsigned int mode) +{ + if (mm == current->mm) + return true; + if (ptrace_may_access(task, mode)) + return true; + if ((mode & PTRACE_MODE_READ) && perfmon_capable()) + return true; + return false; +} + struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) { struct mm_struct *mm; @@ -1571,7 +1348,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) mm = get_task_mm(task); if (!mm) { mm = ERR_PTR(-ESRCH); - } else if (mm != current->mm && !ptrace_may_access(task, mode)) { + } else if (!may_access_mm(mm, task, mode)) { mmput(mm); mm = ERR_PTR(-EACCES); } @@ -1891,8 +1668,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) #ifdef CONFIG_POSIX_TIMERS INIT_HLIST_HEAD(&sig->posix_timers); INIT_HLIST_HEAD(&sig->ignored_posix_timers); - hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - sig->real_timer.function = it_real_fn; + hrtimer_setup(&sig->real_timer, it_real_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); #endif task_lock(current->group_leader); @@ -2003,17 +1779,16 @@ static inline void rcu_copy_process(struct task_struct *p) } /** - * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd + * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd * @pid: the struct pid for which to create a pidfd * @flags: flags of the new @pidfd - * @ret: Where to return the file for the pidfd. + * @ret_file: return the new pidfs file * * Allocate a new file that stashes @pid and reserve a new pidfd number in the * caller's file descriptor table. The pidfd is reserved but not installed yet. * - * The helper doesn't perform checks on @pid which makes it useful for pidfds - * created via CLONE_PIDFD where @pid has no task attached when the pidfd and - * pidfd file are prepared. + * The helper verifies that @pid is still in use, without PIDFD_THREAD the + * task identified by @pid must be a thread-group leader. * * If this function returns successfully the caller is responsible to either * call fd_install() passing the returned pidfd and pidfd file as arguments in @@ -2030,64 +1805,48 @@ static inline void rcu_copy_process(struct task_struct *p) * error, a negative error code is returned from the function and the * last argument remains unchanged. */ -static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) +int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file) { - int pidfd; - struct file *pidfd_file; + struct file *pidfs_file; - pidfd = get_unused_fd_flags(O_CLOEXEC); - if (pidfd < 0) - return pidfd; - - pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR); - if (IS_ERR(pidfd_file)) { - put_unused_fd(pidfd); - return PTR_ERR(pidfd_file); - } /* - * anon_inode_getfile() ignores everything outside of the - * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually. + * PIDFD_STALE is only allowed to be passed if the caller knows + * that @pid is already registered in pidfs and thus + * PIDFD_INFO_EXIT information is guaranteed to be available. */ - pidfd_file->f_flags |= (flags & PIDFD_THREAD); - *ret = pidfd_file; - return pidfd; -} + if (!(flags & PIDFD_STALE)) { + /* + * While holding the pidfd waitqueue lock removing the + * task linkage for the thread-group leader pid + * (PIDTYPE_TGID) isn't possible. Thus, if there's still + * task linkage for PIDTYPE_PID not having thread-group + * leader linkage for the pid means it wasn't a + * thread-group leader in the first place. + */ + guard(spinlock_irq)(&pid->wait_pidfd.lock); -/** - * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd - * @pid: the struct pid for which to create a pidfd - * @flags: flags of the new @pidfd - * @ret: Where to return the pidfd. - * - * Allocate a new file that stashes @pid and reserve a new pidfd number in the - * caller's file descriptor table. The pidfd is reserved but not installed yet. - * - * The helper verifies that @pid is still in use, without PIDFD_THREAD the - * task identified by @pid must be a thread-group leader. - * - * If this function returns successfully the caller is responsible to either - * call fd_install() passing the returned pidfd and pidfd file as arguments in - * order to install the pidfd into its file descriptor table or they must use - * put_unused_fd() and fput() on the returned pidfd and pidfd file - * respectively. - * - * This function is useful when a pidfd must already be reserved but there - * might still be points of failure afterwards and the caller wants to ensure - * that no pidfd is leaked into its file descriptor table. - * - * Return: On success, a reserved pidfd is returned from the function and a new - * pidfd file is returned in the last argument to the function. On - * error, a negative error code is returned from the function and the - * last argument remains unchanged. - */ -int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) -{ - bool thread = flags & PIDFD_THREAD; + /* Task has already been reaped. */ + if (!pid_has_task(pid, PIDTYPE_PID)) + return -ESRCH; + /* + * If this struct pid isn't used as a thread-group + * leader but the caller requested to create a + * thread-group leader pidfd then report ENOENT. + */ + if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID)) + return -ENOENT; + } - if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID)) - return -EINVAL; + CLASS(get_unused_fd, pidfd)(O_CLOEXEC); + if (pidfd < 0) + return pidfd; - return __pidfd_prepare(pid, flags, ret); + pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR); + if (IS_ERR(pidfs_file)) + return PTR_ERR(pidfs_file); + + *ret_file = pidfs_file; + return take_fd(pidfd); } static void __delayed_free_task(struct rcu_head *rhp) @@ -2136,6 +1895,13 @@ static void rv_task_fork(struct task_struct *p) #define rv_task_fork(p) do {} while (0) #endif +static bool need_futex_hash_allocate_default(u64 clone_flags) +{ + if ((clone_flags & (CLONE_THREAD | CLONE_VM)) != (CLONE_THREAD | CLONE_VM)) + return false; + return true; +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -2432,8 +2198,11 @@ __latent_entropy struct task_struct *copy_process( if (clone_flags & CLONE_PIDFD) { int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0; - /* Note that no task has been attached to @pid yet. */ - retval = __pidfd_prepare(pid, flags, &pidfile); + /* + * Note that no task has been attached to @pid yet indicate + * that via CLONE_PIDFD. + */ + retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; @@ -2514,6 +2283,21 @@ __latent_entropy struct task_struct *copy_process( goto bad_fork_cancel_cgroup; /* + * Allocate a default futex hash for the user process once the first + * thread spawns. + */ + if (need_futex_hash_allocate_default(clone_flags)) { + retval = futex_hash_allocate_default(); + if (retval) + goto bad_fork_core_free; + /* + * If we fail beyond this point we don't free the allocated + * futex hash map. We assume that another thread will be created + * and makes use of it. The hash map will be freed once the main + * thread terminates. + */ + } + /* * From this point on we must avoid any synchronous user-space * communication until we take the tasklist-lock. In particular, we do * not want user-space to be able to predict the process start-time by @@ -3200,11 +2984,6 @@ void __init proc_caches_init(void) sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); - - vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); -#ifdef CONFIG_PER_VMA_LOCK - vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT); -#endif mmap_init(); nsproxy_cache_init(); } diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 3db8567f5a44..90d53fb0ee9e 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -36,9 +36,15 @@ #include <linux/pagemap.h> #include <linux/debugfs.h> #include <linux/plist.h> +#include <linux/gfp.h> +#include <linux/vmalloc.h> #include <linux/memblock.h> #include <linux/fault-inject.h> #include <linux/slab.h> +#include <linux/prctl.h> +#include <linux/rcuref.h> +#include <linux/mempolicy.h> +#include <linux/mmap_lock.h> #include "futex.h" #include "../locking/rtmutex_common.h" @@ -49,12 +55,24 @@ * reside in the same cacheline. */ static struct { - struct futex_hash_bucket *queues; - unsigned long hashsize; + unsigned long hashmask; + unsigned int hashshift; + struct futex_hash_bucket *queues[MAX_NUMNODES]; } __futex_data __read_mostly __aligned(2*sizeof(long)); -#define futex_queues (__futex_data.queues) -#define futex_hashsize (__futex_data.hashsize) +#define futex_hashmask (__futex_data.hashmask) +#define futex_hashshift (__futex_data.hashshift) +#define futex_queues (__futex_data.queues) + +struct futex_private_hash { + rcuref_t users; + unsigned int hash_mask; + struct rcu_head rcu; + void *mm; + bool custom; + bool immutable; + struct futex_hash_bucket queues[]; +}; /* * Fault injections for futexes. @@ -107,21 +125,328 @@ late_initcall(fail_futex_debugfs); #endif /* CONFIG_FAIL_FUTEX */ +static struct futex_hash_bucket * +__futex_hash(union futex_key *key, struct futex_private_hash *fph); + +#ifdef CONFIG_FUTEX_PRIVATE_HASH +static inline bool futex_key_is_private(union futex_key *key) +{ + /* + * Relies on get_futex_key() to set either bit for shared + * futexes -- see comment with union futex_key. + */ + return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED)); +} + +bool futex_private_hash_get(struct futex_private_hash *fph) +{ + if (fph->immutable) + return true; + return rcuref_get(&fph->users); +} + +void futex_private_hash_put(struct futex_private_hash *fph) +{ + /* Ignore return value, last put is verified via rcuref_is_dead() */ + if (fph->immutable) + return; + if (rcuref_put(&fph->users)) + wake_up_var(fph->mm); +} + /** - * futex_hash - Return the hash bucket in the global hash - * @key: Pointer to the futex key for which the hash is calculated + * futex_hash_get - Get an additional reference for the local hash. + * @hb: ptr to the private local hash. * - * We hash on the keys returned from get_futex_key (see below) and return the - * corresponding hash bucket in the global hash. + * Obtain an additional reference for the already obtained hash bucket. The + * caller must already own an reference. */ +void futex_hash_get(struct futex_hash_bucket *hb) +{ + struct futex_private_hash *fph = hb->priv; + + if (!fph) + return; + WARN_ON_ONCE(!futex_private_hash_get(fph)); +} + +void futex_hash_put(struct futex_hash_bucket *hb) +{ + struct futex_private_hash *fph = hb->priv; + + if (!fph) + return; + futex_private_hash_put(fph); +} + +static struct futex_hash_bucket * +__futex_hash_private(union futex_key *key, struct futex_private_hash *fph) +{ + u32 hash; + + if (!futex_key_is_private(key)) + return NULL; + + if (!fph) + fph = rcu_dereference(key->private.mm->futex_phash); + if (!fph || !fph->hash_mask) + return NULL; + + hash = jhash2((void *)&key->private.address, + sizeof(key->private.address) / 4, + key->both.offset); + return &fph->queues[hash & fph->hash_mask]; +} + +static void futex_rehash_private(struct futex_private_hash *old, + struct futex_private_hash *new) +{ + struct futex_hash_bucket *hb_old, *hb_new; + unsigned int slots = old->hash_mask + 1; + unsigned int i; + + for (i = 0; i < slots; i++) { + struct futex_q *this, *tmp; + + hb_old = &old->queues[i]; + + spin_lock(&hb_old->lock); + plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) { + + plist_del(&this->list, &hb_old->chain); + futex_hb_waiters_dec(hb_old); + + WARN_ON_ONCE(this->lock_ptr != &hb_old->lock); + + hb_new = __futex_hash(&this->key, new); + futex_hb_waiters_inc(hb_new); + /* + * The new pointer isn't published yet but an already + * moved user can be unqueued due to timeout or signal. + */ + spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING); + plist_add(&this->list, &hb_new->chain); + this->lock_ptr = &hb_new->lock; + spin_unlock(&hb_new->lock); + } + spin_unlock(&hb_old->lock); + } +} + +static bool __futex_pivot_hash(struct mm_struct *mm, + struct futex_private_hash *new) +{ + struct futex_private_hash *fph; + + WARN_ON_ONCE(mm->futex_phash_new); + + fph = rcu_dereference_protected(mm->futex_phash, + lockdep_is_held(&mm->futex_hash_lock)); + if (fph) { + if (!rcuref_is_dead(&fph->users)) { + mm->futex_phash_new = new; + return false; + } + + futex_rehash_private(fph, new); + } + rcu_assign_pointer(mm->futex_phash, new); + kvfree_rcu(fph, rcu); + return true; +} + +static void futex_pivot_hash(struct mm_struct *mm) +{ + scoped_guard(mutex, &mm->futex_hash_lock) { + struct futex_private_hash *fph; + + fph = mm->futex_phash_new; + if (fph) { + mm->futex_phash_new = NULL; + __futex_pivot_hash(mm, fph); + } + } +} + +struct futex_private_hash *futex_private_hash(void) +{ + struct mm_struct *mm = current->mm; + /* + * Ideally we don't loop. If there is a replacement in progress + * then a new private hash is already prepared and a reference can't be + * obtained once the last user dropped it's. + * In that case we block on mm_struct::futex_hash_lock and either have + * to perform the replacement or wait while someone else is doing the + * job. Eitherway, on the second iteration we acquire a reference on the + * new private hash or loop again because a new replacement has been + * requested. + */ +again: + scoped_guard(rcu) { + struct futex_private_hash *fph; + + fph = rcu_dereference(mm->futex_phash); + if (!fph) + return NULL; + + if (fph->immutable) + return fph; + if (rcuref_get(&fph->users)) + return fph; + } + futex_pivot_hash(mm); + goto again; +} + struct futex_hash_bucket *futex_hash(union futex_key *key) { - u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, - key->both.offset); + struct futex_private_hash *fph; + struct futex_hash_bucket *hb; + +again: + scoped_guard(rcu) { + hb = __futex_hash(key, NULL); + fph = hb->priv; + + if (!fph || futex_private_hash_get(fph)) + return hb; + } + futex_pivot_hash(key->private.mm); + goto again; +} + +#else /* !CONFIG_FUTEX_PRIVATE_HASH */ + +static struct futex_hash_bucket * +__futex_hash_private(union futex_key *key, struct futex_private_hash *fph) +{ + return NULL; +} + +struct futex_hash_bucket *futex_hash(union futex_key *key) +{ + return __futex_hash(key, NULL); +} + +#endif /* CONFIG_FUTEX_PRIVATE_HASH */ + +#ifdef CONFIG_FUTEX_MPOL + +static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma = vma_lookup(mm, addr); + struct mempolicy *mpol; + int node = FUTEX_NO_NODE; + + if (!vma) + return FUTEX_NO_NODE; + + mpol = vma_policy(vma); + if (!mpol) + return FUTEX_NO_NODE; + + switch (mpol->mode) { + case MPOL_PREFERRED: + node = first_node(mpol->nodes); + break; + case MPOL_PREFERRED_MANY: + case MPOL_BIND: + if (mpol->home_node != NUMA_NO_NODE) + node = mpol->home_node; + break; + default: + break; + } + + return node; +} + +static int futex_key_to_node_opt(struct mm_struct *mm, unsigned long addr) +{ + int seq, node; + + guard(rcu)(); + + if (!mmap_lock_speculate_try_begin(mm, &seq)) + return -EBUSY; + + node = __futex_key_to_node(mm, addr); + + if (mmap_lock_speculate_retry(mm, seq)) + return -EAGAIN; + + return node; +} + +static int futex_mpol(struct mm_struct *mm, unsigned long addr) +{ + int node; + + node = futex_key_to_node_opt(mm, addr); + if (node >= FUTEX_NO_NODE) + return node; - return &futex_queues[hash & (futex_hashsize - 1)]; + guard(mmap_read_lock)(mm); + return __futex_key_to_node(mm, addr); } +#else /* !CONFIG_FUTEX_MPOL */ + +static int futex_mpol(struct mm_struct *mm, unsigned long addr) +{ + return FUTEX_NO_NODE; +} + +#endif /* CONFIG_FUTEX_MPOL */ + +/** + * __futex_hash - Return the hash bucket + * @key: Pointer to the futex key for which the hash is calculated + * @fph: Pointer to private hash if known + * + * We hash on the keys returned from get_futex_key (see below) and return the + * corresponding hash bucket. + * If the FUTEX is PROCESS_PRIVATE then a per-process hash bucket (from the + * private hash) is returned if existing. Otherwise a hash bucket from the + * global hash is returned. + */ +static struct futex_hash_bucket * +__futex_hash(union futex_key *key, struct futex_private_hash *fph) +{ + int node = key->both.node; + u32 hash; + + if (node == FUTEX_NO_NODE) { + struct futex_hash_bucket *hb; + + hb = __futex_hash_private(key, fph); + if (hb) + return hb; + } + + hash = jhash2((u32 *)key, + offsetof(typeof(*key), both.offset) / sizeof(u32), + key->both.offset); + + if (node == FUTEX_NO_NODE) { + /* + * In case of !FLAGS_NUMA, use some unused hash bits to pick a + * node -- this ensures regular futexes are interleaved across + * the nodes and avoids having to allocate multiple + * hash-tables. + * + * NOTE: this isn't perfectly uniform, but it is fast and + * handles sparse node masks. + */ + node = (hash >> futex_hashshift) % nr_node_ids; + if (!node_possible(node)) { + node = find_next_bit_wrap(node_possible_map.bits, + nr_node_ids, node); + } + } + + return &futex_queues[node][hash & futex_hashmask]; +} /** * futex_setup_timer - set up the sleeping hrtimer. @@ -206,7 +531,7 @@ static u64 get_inode_sequence_number(struct inode *inode) * * For shared mappings (when @fshared), the key is: * - * ( inode->i_sequence, page->index, offset_within_page ) + * ( inode->i_sequence, page offset within mapping, offset_within_page ) * * [ also see get_inode_sequence_number() ] * @@ -227,25 +552,60 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, struct page *page; struct folio *folio; struct address_space *mapping; - int err, ro = 0; + int node, err, size, ro = 0; + bool node_updated = false; bool fshared; fshared = flags & FLAGS_SHARED; + size = futex_size(flags); + if (flags & FLAGS_NUMA) + size *= 2; /* * The futex address must be "naturally" aligned. */ key->both.offset = address % PAGE_SIZE; - if (unlikely((address % sizeof(u32)) != 0)) + if (unlikely((address % size) != 0)) return -EINVAL; address -= key->both.offset; - if (unlikely(!access_ok(uaddr, sizeof(u32)))) + if (unlikely(!access_ok(uaddr, size))) return -EFAULT; if (unlikely(should_fail_futex(fshared))) return -EFAULT; + node = FUTEX_NO_NODE; + + if (flags & FLAGS_NUMA) { + u32 __user *naddr = (void *)uaddr + size / 2; + + if (futex_get_value(&node, naddr)) + return -EFAULT; + + if ((node != FUTEX_NO_NODE) && + ((unsigned int)node >= MAX_NUMNODES || !node_possible(node))) + return -EINVAL; + } + + if (node == FUTEX_NO_NODE && (flags & FLAGS_MPOL)) { + node = futex_mpol(mm, address); + node_updated = true; + } + + if (flags & FLAGS_NUMA) { + u32 __user *naddr = (void *)uaddr + size / 2; + + if (node == FUTEX_NO_NODE) { + node = numa_node_id(); + node_updated = true; + } + if (node_updated && futex_put_value(node, naddr)) + return -EFAULT; + } + + key->both.node = node; + /* * PROCESS_PRIVATE futexes are fast. * As the mm cannot disappear under us and the 'key' only needs @@ -502,13 +862,9 @@ void __futex_unqueue(struct futex_q *q) } /* The key must be already stored in q->key. */ -struct futex_hash_bucket *futex_q_lock(struct futex_q *q) +void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb) __acquires(&hb->lock) { - struct futex_hash_bucket *hb; - - hb = futex_hash(&q->key); - /* * Increment the counter before taking the lock so that * a potential waker won't miss a to-be-slept task that is @@ -522,14 +878,13 @@ struct futex_hash_bucket *futex_q_lock(struct futex_q *q) q->lock_ptr = &hb->lock; spin_lock(&hb->lock); - return hb; } void futex_q_unlock(struct futex_hash_bucket *hb) __releases(&hb->lock) { - spin_unlock(&hb->lock); futex_hb_waiters_dec(hb); + spin_unlock(&hb->lock); } void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, @@ -568,6 +923,8 @@ int futex_unqueue(struct futex_q *q) spinlock_t *lock_ptr; int ret = 0; + /* RCU so lock_ptr is not going away during locking. */ + guard(rcu)(); /* In the common case we don't take the spinlock, which is nice. */ retry: /* @@ -606,6 +963,24 @@ retry: return ret; } +void futex_q_lockptr_lock(struct futex_q *q) +{ + spinlock_t *lock_ptr; + + /* + * See futex_unqueue() why lock_ptr can change. + */ + guard(rcu)(); +retry: + lock_ptr = READ_ONCE(q->lock_ptr); + spin_lock(lock_ptr); + + if (unlikely(lock_ptr != q->lock_ptr)) { + spin_unlock(lock_ptr); + goto retry; + } +} + /* * PI futexes can not be requeued and must remove themselves from the hash * bucket. The hash bucket lock (i.e. lock_ptr) is held. @@ -949,10 +1324,20 @@ static void exit_pi_state_list(struct task_struct *curr) { struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; - struct futex_hash_bucket *hb; union futex_key key = FUTEX_KEY_INIT; /* + * The mutex mm_struct::futex_hash_lock might be acquired. + */ + might_sleep(); + /* + * Ensure the hash remains stable (no resize) during the while loop + * below. The hb pointer is acquired under the pi_lock so we can't block + * on the mutex. + */ + WARN_ON(curr != current); + guard(private_hash)(); + /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful * versus waiters unqueueing themselves: @@ -962,50 +1347,52 @@ static void exit_pi_state_list(struct task_struct *curr) next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; - hb = futex_hash(&key); - - /* - * We can race against put_pi_state() removing itself from the - * list (a waiter going away). put_pi_state() will first - * decrement the reference count and then modify the list, so - * its possible to see the list entry but fail this reference - * acquire. - * - * In that case; drop the locks to let put_pi_state() make - * progress and retry the loop. - */ - if (!refcount_inc_not_zero(&pi_state->refcount)) { + if (1) { + CLASS(hb, hb)(&key); + + /* + * We can race against put_pi_state() removing itself from the + * list (a waiter going away). put_pi_state() will first + * decrement the reference count and then modify the list, so + * its possible to see the list entry but fail this reference + * acquire. + * + * In that case; drop the locks to let put_pi_state() make + * progress and retry the loop. + */ + if (!refcount_inc_not_zero(&pi_state->refcount)) { + raw_spin_unlock_irq(&curr->pi_lock); + cpu_relax(); + raw_spin_lock_irq(&curr->pi_lock); + continue; + } raw_spin_unlock_irq(&curr->pi_lock); - cpu_relax(); - raw_spin_lock_irq(&curr->pi_lock); - continue; - } - raw_spin_unlock_irq(&curr->pi_lock); - spin_lock(&hb->lock); - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); - raw_spin_lock(&curr->pi_lock); - /* - * We dropped the pi-lock, so re-check whether this - * task still owns the PI-state: - */ - if (head->next != next) { - /* retain curr->pi_lock for the loop invariant */ - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + spin_lock(&hb->lock); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + raw_spin_lock(&curr->pi_lock); + /* + * We dropped the pi-lock, so re-check whether this + * task still owns the PI-state: + */ + if (head->next != next) { + /* retain curr->pi_lock for the loop invariant */ + raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + spin_unlock(&hb->lock); + put_pi_state(pi_state); + continue; + } + + WARN_ON(pi_state->owner != curr); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + pi_state->owner = NULL; + + raw_spin_unlock(&curr->pi_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); - put_pi_state(pi_state); - continue; } - WARN_ON(pi_state->owner != curr); - WARN_ON(list_empty(&pi_state->list)); - list_del_init(&pi_state->list); - pi_state->owner = NULL; - - raw_spin_unlock(&curr->pi_lock); - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - spin_unlock(&hb->lock); - rt_mutex_futex_unlock(&pi_state->pi_mutex); put_pi_state(pi_state); @@ -1125,29 +1512,314 @@ void futex_exit_release(struct task_struct *tsk) futex_cleanup_end(tsk, FUTEX_STATE_DEAD); } +static void futex_hash_bucket_init(struct futex_hash_bucket *fhb, + struct futex_private_hash *fph) +{ +#ifdef CONFIG_FUTEX_PRIVATE_HASH + fhb->priv = fph; +#endif + atomic_set(&fhb->waiters, 0); + plist_head_init(&fhb->chain); + spin_lock_init(&fhb->lock); +} + +#define FH_CUSTOM 0x01 +#define FH_IMMUTABLE 0x02 + +#ifdef CONFIG_FUTEX_PRIVATE_HASH +void futex_hash_free(struct mm_struct *mm) +{ + struct futex_private_hash *fph; + + kvfree(mm->futex_phash_new); + fph = rcu_dereference_raw(mm->futex_phash); + if (fph) { + WARN_ON_ONCE(rcuref_read(&fph->users) > 1); + kvfree(fph); + } +} + +static bool futex_pivot_pending(struct mm_struct *mm) +{ + struct futex_private_hash *fph; + + guard(rcu)(); + + if (!mm->futex_phash_new) + return true; + + fph = rcu_dereference(mm->futex_phash); + return rcuref_is_dead(&fph->users); +} + +static bool futex_hash_less(struct futex_private_hash *a, + struct futex_private_hash *b) +{ + /* user provided always wins */ + if (!a->custom && b->custom) + return true; + if (a->custom && !b->custom) + return false; + + /* zero-sized hash wins */ + if (!b->hash_mask) + return true; + if (!a->hash_mask) + return false; + + /* keep the biggest */ + if (a->hash_mask < b->hash_mask) + return true; + if (a->hash_mask > b->hash_mask) + return false; + + return false; /* equal */ +} + +static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) +{ + struct mm_struct *mm = current->mm; + struct futex_private_hash *fph; + bool custom = flags & FH_CUSTOM; + int i; + + if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots))) + return -EINVAL; + + /* + * Once we've disabled the global hash there is no way back. + */ + scoped_guard(rcu) { + fph = rcu_dereference(mm->futex_phash); + if (fph && (!fph->hash_mask || fph->immutable)) { + if (custom) + return -EBUSY; + return 0; + } + } + + fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); + if (!fph) + return -ENOMEM; + + rcuref_init(&fph->users, 1); + fph->hash_mask = hash_slots ? hash_slots - 1 : 0; + fph->custom = custom; + fph->immutable = !!(flags & FH_IMMUTABLE); + fph->mm = mm; + + for (i = 0; i < hash_slots; i++) + futex_hash_bucket_init(&fph->queues[i], fph); + + if (custom) { + /* + * Only let prctl() wait / retry; don't unduly delay clone(). + */ +again: + wait_var_event(mm, futex_pivot_pending(mm)); + } + + scoped_guard(mutex, &mm->futex_hash_lock) { + struct futex_private_hash *free __free(kvfree) = NULL; + struct futex_private_hash *cur, *new; + + cur = rcu_dereference_protected(mm->futex_phash, + lockdep_is_held(&mm->futex_hash_lock)); + new = mm->futex_phash_new; + mm->futex_phash_new = NULL; + + if (fph) { + if (cur && (!cur->hash_mask || cur->immutable)) { + /* + * If two threads simultaneously request the global + * hash then the first one performs the switch, + * the second one returns here. + */ + free = fph; + mm->futex_phash_new = new; + return -EBUSY; + } + if (cur && !new) { + /* + * If we have an existing hash, but do not yet have + * allocated a replacement hash, drop the initial + * reference on the existing hash. + */ + futex_private_hash_put(cur); + } + + if (new) { + /* + * Two updates raced; throw out the lesser one. + */ + if (futex_hash_less(new, fph)) { + free = new; + new = fph; + } else { + free = fph; + } + } else { + new = fph; + } + fph = NULL; + } + + if (new) { + /* + * Will set mm->futex_phash_new on failure; + * futex_private_hash_get() will try again. + */ + if (!__futex_pivot_hash(mm, new) && custom) + goto again; + } + } + return 0; +} + +int futex_hash_allocate_default(void) +{ + unsigned int threads, buckets, current_buckets = 0; + struct futex_private_hash *fph; + + if (!current->mm) + return 0; + + scoped_guard(rcu) { + threads = min_t(unsigned int, + get_nr_threads(current), + num_online_cpus()); + + fph = rcu_dereference(current->mm->futex_phash); + if (fph) { + if (fph->custom) + return 0; + + current_buckets = fph->hash_mask + 1; + } + } + + /* + * The default allocation will remain within + * 16 <= threads * 4 <= global hash size + */ + buckets = roundup_pow_of_two(4 * threads); + buckets = clamp(buckets, 16, futex_hashmask + 1); + + if (current_buckets >= buckets) + return 0; + + return futex_hash_allocate(buckets, 0); +} + +static int futex_hash_get_slots(void) +{ + struct futex_private_hash *fph; + + guard(rcu)(); + fph = rcu_dereference(current->mm->futex_phash); + if (fph && fph->hash_mask) + return fph->hash_mask + 1; + return 0; +} + +static int futex_hash_get_immutable(void) +{ + struct futex_private_hash *fph; + + guard(rcu)(); + fph = rcu_dereference(current->mm->futex_phash); + if (fph && fph->immutable) + return 1; + if (fph && !fph->hash_mask) + return 1; + return 0; +} + +#else + +static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) +{ + return -EINVAL; +} + +static int futex_hash_get_slots(void) +{ + return 0; +} + +static int futex_hash_get_immutable(void) +{ + return 0; +} +#endif + +int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) +{ + unsigned int flags = FH_CUSTOM; + int ret; + + switch (arg2) { + case PR_FUTEX_HASH_SET_SLOTS: + if (arg4 & ~FH_FLAG_IMMUTABLE) + return -EINVAL; + if (arg4 & FH_FLAG_IMMUTABLE) + flags |= FH_IMMUTABLE; + ret = futex_hash_allocate(arg3, flags); + break; + + case PR_FUTEX_HASH_GET_SLOTS: + ret = futex_hash_get_slots(); + break; + + case PR_FUTEX_HASH_GET_IMMUTABLE: + ret = futex_hash_get_immutable(); + break; + + default: + ret = -EINVAL; + break; + } + return ret; +} + static int __init futex_init(void) { - unsigned int futex_shift; - unsigned long i; + unsigned long hashsize, i; + unsigned int order, n; + unsigned long size; #ifdef CONFIG_BASE_SMALL - futex_hashsize = 16; + hashsize = 16; #else - futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); + hashsize = 256 * num_possible_cpus(); + hashsize /= num_possible_nodes(); + hashsize = max(4, hashsize); + hashsize = roundup_pow_of_two(hashsize); #endif + futex_hashshift = ilog2(hashsize); + size = sizeof(struct futex_hash_bucket) * hashsize; + order = get_order(size); + + for_each_node(n) { + struct futex_hash_bucket *table; + + if (order > MAX_PAGE_ORDER) + table = vmalloc_huge_node(size, GFP_KERNEL, n); + else + table = alloc_pages_exact_nid(n, size, GFP_KERNEL); + + BUG_ON(!table); - futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), - futex_hashsize, 0, 0, - &futex_shift, NULL, - futex_hashsize, futex_hashsize); - futex_hashsize = 1UL << futex_shift; + for (i = 0; i < hashsize; i++) + futex_hash_bucket_init(&table[i], NULL); - for (i = 0; i < futex_hashsize; i++) { - atomic_set(&futex_queues[i].waiters, 0); - plist_head_init(&futex_queues[i].chain); - spin_lock_init(&futex_queues[i].lock); + futex_queues[n] = table; } + futex_hashmask = hashsize - 1; + pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n", + hashsize, size, num_possible_nodes(), size * num_possible_nodes() / 1024, + order > MAX_PAGE_ORDER ? "vmalloc" : "linear"); return 0; } core_initcall(futex_init); diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index 6b2f4c7eb720..fcd1617212ee 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -7,6 +7,7 @@ #include <linux/sched/wake_q.h> #include <linux/compat.h> #include <linux/uaccess.h> +#include <linux/cleanup.h> #ifdef CONFIG_PREEMPT_RT #include <linux/rcuwait.h> @@ -38,6 +39,7 @@ #define FLAGS_HAS_TIMEOUT 0x0040 #define FLAGS_NUMA 0x0080 #define FLAGS_STRICT 0x0100 +#define FLAGS_MPOL 0x0200 /* FUTEX_ to FLAGS_ */ static inline unsigned int futex_to_flags(unsigned int op) @@ -53,7 +55,7 @@ static inline unsigned int futex_to_flags(unsigned int op) return flags; } -#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE) +#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_MPOL | FUTEX2_PRIVATE) /* FUTEX2_ to FLAGS_ */ static inline unsigned int futex2_to_flags(unsigned int flags2) @@ -66,6 +68,9 @@ static inline unsigned int futex2_to_flags(unsigned int flags2) if (flags2 & FUTEX2_NUMA) flags |= FLAGS_NUMA; + if (flags2 & FUTEX2_MPOL) + flags |= FLAGS_MPOL; + return flags; } @@ -86,6 +91,19 @@ static inline bool futex_flags_valid(unsigned int flags) if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32) return false; + /* + * Must be able to represent both FUTEX_NO_NODE and every valid nodeid + * in a futex word. + */ + if (flags & FLAGS_NUMA) { + int bits = 8 * futex_size(flags); + u64 max = ~0ULL; + + max >>= 64 - bits; + if (nr_node_ids >= max) + return false; + } + return true; } @@ -117,6 +135,7 @@ struct futex_hash_bucket { atomic_t waiters; spinlock_t lock; struct plist_head chain; + struct futex_private_hash *priv; } ____cacheline_aligned_in_smp; /* @@ -156,6 +175,7 @@ typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q); * @requeue_pi_key: the requeue_pi target futex key * @bitset: bitset for the optional bitmasked wakeup * @requeue_state: State field for futex_requeue_pi() + * @drop_hb_ref: Waiter should drop the extra hash bucket reference if true * @requeue_wait: RCU wait for futex_requeue_pi() (RT only) * * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so @@ -182,6 +202,7 @@ struct futex_q { union futex_key *requeue_pi_key; u32 bitset; atomic_t requeue_state; + bool drop_hb_ref; #ifdef CONFIG_PREEMPT_RT struct rcuwait requeue_wait; #endif @@ -196,12 +217,35 @@ enum futex_access { extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw); - +extern void futex_q_lockptr_lock(struct futex_q *q); extern struct hrtimer_sleeper * futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, int flags, u64 range_ns); extern struct futex_hash_bucket *futex_hash(union futex_key *key); +#ifdef CONFIG_FUTEX_PRIVATE_HASH +extern void futex_hash_get(struct futex_hash_bucket *hb); +extern void futex_hash_put(struct futex_hash_bucket *hb); + +extern struct futex_private_hash *futex_private_hash(void); +extern bool futex_private_hash_get(struct futex_private_hash *fph); +extern void futex_private_hash_put(struct futex_private_hash *fph); + +#else /* !CONFIG_FUTEX_PRIVATE_HASH */ +static inline void futex_hash_get(struct futex_hash_bucket *hb) { } +static inline void futex_hash_put(struct futex_hash_bucket *hb) { } +static inline struct futex_private_hash *futex_private_hash(void) { return NULL; } +static inline bool futex_private_hash_get(void) { return false; } +static inline void futex_private_hash_put(struct futex_private_hash *fph) { } +#endif + +DEFINE_CLASS(hb, struct futex_hash_bucket *, + if (_T) futex_hash_put(_T), + futex_hash(key), union futex_key *key); + +DEFINE_CLASS(private_hash, struct futex_private_hash *, + if (_T) futex_private_hash_put(_T), + futex_private_hash(), void); /** * futex_match - Check whether two futex keys are equal @@ -219,9 +263,9 @@ static inline int futex_match(union futex_key *key1, union futex_key *key2) } extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - struct futex_q *q, struct futex_hash_bucket **hb); -extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, - struct hrtimer_sleeper *timeout); + struct futex_q *q, union futex_key *key2, + struct task_struct *task); +extern void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout); extern bool __futex_wake_mark(struct futex_q *q); extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q); @@ -256,7 +300,7 @@ static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 * This looks a bit overkill, but generally just results in a couple * of instructions. */ -static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from) +static __always_inline int futex_get_value(u32 *dest, u32 __user *from) { u32 val; @@ -273,12 +317,26 @@ Efault: return -EFAULT; } +static __always_inline int futex_put_value(u32 val, u32 __user *to) +{ + if (can_do_masked_user_access()) + to = masked_user_access_begin(to); + else if (!user_read_access_begin(to, sizeof(*to))) + return -EFAULT; + unsafe_put_user(val, to, Efault); + user_read_access_end(); + return 0; +Efault: + user_read_access_end(); + return -EFAULT; +} + static inline int futex_get_value_locked(u32 *dest, u32 __user *from) { int ret; pagefault_disable(); - ret = futex_read_inatomic(dest, from); + ret = futex_get_value(dest, from); pagefault_enable(); return ret; @@ -354,7 +412,7 @@ static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb) #endif } -extern struct futex_hash_bucket *futex_q_lock(struct futex_q *q); +extern void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb); extern void futex_q_unlock(struct futex_hash_bucket *hb); diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c index 7a941845f7ee..dacb2330f1fb 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c @@ -806,7 +806,7 @@ handle_err: break; } - spin_lock(q->lock_ptr); + futex_q_lockptr_lock(q); raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); /* @@ -920,7 +920,6 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl struct hrtimer_sleeper timeout, *to; struct task_struct *exiting = NULL; struct rt_mutex_waiter rt_waiter; - struct futex_hash_bucket *hb; struct futex_q q = futex_q_init; DEFINE_WAKE_Q(wake_q); int res, ret; @@ -939,151 +938,183 @@ retry: goto out; retry_private: - hb = futex_q_lock(&q); + if (1) { + CLASS(hb, hb)(&q.key); - ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, - &exiting, 0); - if (unlikely(ret)) { - /* - * Atomic work succeeded and we got the lock, - * or failed. Either way, we do _not_ block. - */ - switch (ret) { - case 1: - /* We got the lock. */ - ret = 0; - goto out_unlock_put_key; - case -EFAULT: - goto uaddr_faulted; - case -EBUSY: - case -EAGAIN: - /* - * Two reasons for this: - * - EBUSY: Task is exiting and we just wait for the - * exit to complete. - * - EAGAIN: The user space value changed. - */ - futex_q_unlock(hb); + futex_q_lock(&q, hb); + + ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, + &exiting, 0); + if (unlikely(ret)) { /* - * Handle the case where the owner is in the middle of - * exiting. Wait for the exit to complete otherwise - * this task might loop forever, aka. live lock. + * Atomic work succeeded and we got the lock, + * or failed. Either way, we do _not_ block. */ - wait_for_owner_exiting(ret, exiting); - cond_resched(); - goto retry; - default: - goto out_unlock_put_key; + switch (ret) { + case 1: + /* We got the lock. */ + ret = 0; + goto out_unlock_put_key; + case -EFAULT: + goto uaddr_faulted; + case -EBUSY: + case -EAGAIN: + /* + * Two reasons for this: + * - EBUSY: Task is exiting and we just wait for the + * exit to complete. + * - EAGAIN: The user space value changed. + */ + futex_q_unlock(hb); + /* + * Handle the case where the owner is in the middle of + * exiting. Wait for the exit to complete otherwise + * this task might loop forever, aka. live lock. + */ + wait_for_owner_exiting(ret, exiting); + cond_resched(); + goto retry; + default: + goto out_unlock_put_key; + } } - } - WARN_ON(!q.pi_state); + WARN_ON(!q.pi_state); - /* - * Only actually queue now that the atomic ops are done: - */ - __futex_queue(&q, hb, current); + /* + * Only actually queue now that the atomic ops are done: + */ + __futex_queue(&q, hb, current); - if (trylock) { - ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); - /* Fixup the trylock return value: */ - ret = ret ? 0 : -EWOULDBLOCK; - goto no_block; - } + if (trylock) { + ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); + /* Fixup the trylock return value: */ + ret = ret ? 0 : -EWOULDBLOCK; + goto no_block; + } - /* - * Must be done before we enqueue the waiter, here is unfortunately - * under the hb lock, but that *should* work because it does nothing. - */ - rt_mutex_pre_schedule(); + /* + * Caution; releasing @hb in-scope. The hb->lock is still locked + * while the reference is dropped. The reference can not be dropped + * after the unlock because if a user initiated resize is in progress + * then we might need to wake him. This can not be done after the + * rt_mutex_pre_schedule() invocation. The hb will remain valid because + * the thread, performing resize, will block on hb->lock during + * the requeue. + */ + futex_hash_put(no_free_ptr(hb)); + /* + * Must be done before we enqueue the waiter, here is unfortunately + * under the hb lock, but that *should* work because it does nothing. + */ + rt_mutex_pre_schedule(); - rt_mutex_init_waiter(&rt_waiter); + rt_mutex_init_waiter(&rt_waiter); - /* - * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not - * hold it while doing rt_mutex_start_proxy(), because then it will - * include hb->lock in the blocking chain, even through we'll not in - * fact hold it while blocking. This will lead it to report -EDEADLK - * and BUG when futex_unlock_pi() interleaves with this. - * - * Therefore acquire wait_lock while holding hb->lock, but drop the - * latter before calling __rt_mutex_start_proxy_lock(). This - * interleaves with futex_unlock_pi() -- which does a similar lock - * handoff -- such that the latter can observe the futex_q::pi_state - * before __rt_mutex_start_proxy_lock() is done. - */ - raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); - spin_unlock(q.lock_ptr); - /* - * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter - * such that futex_unlock_pi() is guaranteed to observe the waiter when - * it sees the futex_q::pi_state. - */ - ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q); - raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q); + /* + * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not + * hold it while doing rt_mutex_start_proxy(), because then it will + * include hb->lock in the blocking chain, even through we'll not in + * fact hold it while blocking. This will lead it to report -EDEADLK + * and BUG when futex_unlock_pi() interleaves with this. + * + * Therefore acquire wait_lock while holding hb->lock, but drop the + * latter before calling __rt_mutex_start_proxy_lock(). This + * interleaves with futex_unlock_pi() -- which does a similar lock + * handoff -- such that the latter can observe the futex_q::pi_state + * before __rt_mutex_start_proxy_lock() is done. + */ + raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); + spin_unlock(q.lock_ptr); + /* + * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter + * such that futex_unlock_pi() is guaranteed to observe the waiter when + * it sees the futex_q::pi_state. + */ + ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q); + raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q); - if (ret) { - if (ret == 1) - ret = 0; - goto cleanup; - } + if (ret) { + if (ret == 1) + ret = 0; + goto cleanup; + } - if (unlikely(to)) - hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); + if (unlikely(to)) + hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); - ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); + ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); cleanup: - /* - * If we failed to acquire the lock (deadlock/signal/timeout), we must - * must unwind the above, however we canont lock hb->lock because - * rt_mutex already has a waiter enqueued and hb->lock can itself try - * and enqueue an rt_waiter through rtlock. - * - * Doing the cleanup without holding hb->lock can cause inconsistent - * state between hb and pi_state, but only in the direction of not - * seeing a waiter that is leaving. - * - * See futex_unlock_pi(), it deals with this inconsistency. - * - * There be dragons here, since we must deal with the inconsistency on - * the way out (here), it is impossible to detect/warn about the race - * the other way around (missing an incoming waiter). - * - * What could possibly go wrong... - */ - if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) - ret = 0; + /* + * If we failed to acquire the lock (deadlock/signal/timeout), we must + * unwind the above, however we canont lock hb->lock because + * rt_mutex already has a waiter enqueued and hb->lock can itself try + * and enqueue an rt_waiter through rtlock. + * + * Doing the cleanup without holding hb->lock can cause inconsistent + * state between hb and pi_state, but only in the direction of not + * seeing a waiter that is leaving. + * + * See futex_unlock_pi(), it deals with this inconsistency. + * + * There be dragons here, since we must deal with the inconsistency on + * the way out (here), it is impossible to detect/warn about the race + * the other way around (missing an incoming waiter). + * + * What could possibly go wrong... + */ + if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) + ret = 0; - /* - * Now that the rt_waiter has been dequeued, it is safe to use - * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up - * the - */ - spin_lock(q.lock_ptr); - /* - * Waiter is unqueued. - */ - rt_mutex_post_schedule(); + /* + * Now that the rt_waiter has been dequeued, it is safe to use + * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up + * the + */ + futex_q_lockptr_lock(&q); + /* + * Waiter is unqueued. + */ + rt_mutex_post_schedule(); no_block: - /* - * Fixup the pi_state owner and possibly acquire the lock if we - * haven't already. - */ - res = fixup_pi_owner(uaddr, &q, !ret); - /* - * If fixup_pi_owner() returned an error, propagate that. If it acquired - * the lock, clear our -ETIMEDOUT or -EINTR. - */ - if (res) - ret = (res < 0) ? res : 0; - - futex_unqueue_pi(&q); - spin_unlock(q.lock_ptr); - goto out; + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_pi_owner(uaddr, &q, !ret); + /* + * If fixup_pi_owner() returned an error, propagate that. If it acquired + * the lock, clear our -ETIMEDOUT or -EINTR. + */ + if (res) + ret = (res < 0) ? res : 0; + + futex_unqueue_pi(&q); + spin_unlock(q.lock_ptr); + if (q.drop_hb_ref) { + CLASS(hb, hb)(&q.key); + /* Additional reference from futex_unlock_pi() */ + futex_hash_put(hb); + } + goto out; out_unlock_put_key: - futex_q_unlock(hb); + futex_q_unlock(hb); + goto out; + +uaddr_faulted: + futex_q_unlock(hb); + + ret = fault_in_user_writeable(uaddr); + if (ret) + goto out; + + if (!(flags & FLAGS_SHARED)) + goto retry_private; + + goto retry; + } out: if (to) { @@ -1091,18 +1122,6 @@ out: destroy_hrtimer_on_stack(&to->timer); } return ret != -EINTR ? ret : -ERESTARTNOINTR; - -uaddr_faulted: - futex_q_unlock(hb); - - ret = fault_in_user_writeable(uaddr); - if (ret) - goto out; - - if (!(flags & FLAGS_SHARED)) - goto retry_private; - - goto retry; } /* @@ -1114,7 +1133,6 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { u32 curval, uval, vpid = task_pid_vnr(current); union futex_key key = FUTEX_KEY_INIT; - struct futex_hash_bucket *hb; struct futex_q *top_waiter; int ret; @@ -1134,7 +1152,7 @@ retry: if (ret) return ret; - hb = futex_hash(&key); + CLASS(hb, hb)(&key); spin_lock(&hb->lock); retry_hb: @@ -1187,6 +1205,12 @@ retry_hb: */ rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); if (!rt_waiter) { + /* + * Acquire a reference for the leaving waiter to ensure + * valid futex_q::lock_ptr. + */ + futex_hash_get(hb); + top_waiter->drop_hb_ref = true; __futex_unqueue(top_waiter); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); goto retry_hb; diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c index b47bb764b352..c716a66f8692 100644 --- a/kernel/futex/requeue.c +++ b/kernel/futex/requeue.c @@ -87,6 +87,11 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, futex_hb_waiters_inc(hb2); plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; + /* + * hb1 and hb2 belong to the same futex_hash_bucket_private + * because if we managed get a reference on hb1 then it can't be + * replaced. Therefore we avoid put(hb1)+get(hb2) here. + */ } q->key = *key2; } @@ -231,7 +236,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, WARN_ON(!q->rt_waiter); q->rt_waiter = NULL; - + /* + * Acquire a reference for the waiter to ensure valid + * futex_q::lock_ptr. + */ + futex_hash_get(hb); + q->drop_hb_ref = true; q->lock_ptr = &hb->lock; /* Signal locked state to the waiter */ @@ -371,7 +381,6 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags1, union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; int task_count = 0, ret; struct futex_pi_state *pi_state = NULL; - struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; DEFINE_WAKE_Q(wake_q); @@ -443,240 +452,242 @@ retry: if (requeue_pi && futex_match(&key1, &key2)) return -EINVAL; - hb1 = futex_hash(&key1); - hb2 = futex_hash(&key2); - retry_private: - futex_hb_waiters_inc(hb2); - double_lock_hb(hb1, hb2); + if (1) { + CLASS(hb, hb1)(&key1); + CLASS(hb, hb2)(&key2); - if (likely(cmpval != NULL)) { - u32 curval; + futex_hb_waiters_inc(hb2); + double_lock_hb(hb1, hb2); - ret = futex_get_value_locked(&curval, uaddr1); + if (likely(cmpval != NULL)) { + u32 curval; - if (unlikely(ret)) { - double_unlock_hb(hb1, hb2); - futex_hb_waiters_dec(hb2); + ret = futex_get_value_locked(&curval, uaddr1); - ret = get_user(curval, uaddr1); - if (ret) - return ret; + if (unlikely(ret)) { + futex_hb_waiters_dec(hb2); + double_unlock_hb(hb1, hb2); - if (!(flags1 & FLAGS_SHARED)) - goto retry_private; + ret = get_user(curval, uaddr1); + if (ret) + return ret; - goto retry; - } - if (curval != *cmpval) { - ret = -EAGAIN; - goto out_unlock; - } - } + if (!(flags1 & FLAGS_SHARED)) + goto retry_private; - if (requeue_pi) { - struct task_struct *exiting = NULL; + goto retry; + } + if (curval != *cmpval) { + ret = -EAGAIN; + goto out_unlock; + } + } - /* - * Attempt to acquire uaddr2 and wake the top waiter. If we - * intend to requeue waiters, force setting the FUTEX_WAITERS - * bit. We force this here where we are able to easily handle - * faults rather in the requeue loop below. - * - * Updates topwaiter::requeue_state if a top waiter exists. - */ - ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, - &key2, &pi_state, - &exiting, nr_requeue); + if (requeue_pi) { + struct task_struct *exiting = NULL; - /* - * At this point the top_waiter has either taken uaddr2 or - * is waiting on it. In both cases pi_state has been - * established and an initial refcount on it. In case of an - * error there's nothing. - * - * The top waiter's requeue_state is up to date: - * - * - If the lock was acquired atomically (ret == 1), then - * the state is Q_REQUEUE_PI_LOCKED. - * - * The top waiter has been dequeued and woken up and can - * return to user space immediately. The kernel/user - * space state is consistent. In case that there must be - * more waiters requeued the WAITERS bit in the user - * space futex is set so the top waiter task has to go - * into the syscall slowpath to unlock the futex. This - * will block until this requeue operation has been - * completed and the hash bucket locks have been - * dropped. - * - * - If the trylock failed with an error (ret < 0) then - * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing - * happened", or Q_REQUEUE_PI_IGNORE when there was an - * interleaved early wakeup. - * - * - If the trylock did not succeed (ret == 0) then the - * state is either Q_REQUEUE_PI_IN_PROGRESS or - * Q_REQUEUE_PI_WAIT if an early wakeup interleaved. - * This will be cleaned up in the loop below, which - * cannot fail because futex_proxy_trylock_atomic() did - * the same sanity checks for requeue_pi as the loop - * below does. - */ - switch (ret) { - case 0: - /* We hold a reference on the pi state. */ - break; - - case 1: /* - * futex_proxy_trylock_atomic() acquired the user space - * futex. Adjust task_count. + * Attempt to acquire uaddr2 and wake the top waiter. If we + * intend to requeue waiters, force setting the FUTEX_WAITERS + * bit. We force this here where we are able to easily handle + * faults rather in the requeue loop below. + * + * Updates topwaiter::requeue_state if a top waiter exists. */ - task_count++; - ret = 0; - break; + ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, + &key2, &pi_state, + &exiting, nr_requeue); - /* - * If the above failed, then pi_state is NULL and - * waiter::requeue_state is correct. - */ - case -EFAULT: - double_unlock_hb(hb1, hb2); - futex_hb_waiters_dec(hb2); - ret = fault_in_user_writeable(uaddr2); - if (!ret) - goto retry; - return ret; - case -EBUSY: - case -EAGAIN: - /* - * Two reasons for this: - * - EBUSY: Owner is exiting and we just wait for the - * exit to complete. - * - EAGAIN: The user space value changed. - */ - double_unlock_hb(hb1, hb2); - futex_hb_waiters_dec(hb2); /* - * Handle the case where the owner is in the middle of - * exiting. Wait for the exit to complete otherwise - * this task might loop forever, aka. live lock. + * At this point the top_waiter has either taken uaddr2 or + * is waiting on it. In both cases pi_state has been + * established and an initial refcount on it. In case of an + * error there's nothing. + * + * The top waiter's requeue_state is up to date: + * + * - If the lock was acquired atomically (ret == 1), then + * the state is Q_REQUEUE_PI_LOCKED. + * + * The top waiter has been dequeued and woken up and can + * return to user space immediately. The kernel/user + * space state is consistent. In case that there must be + * more waiters requeued the WAITERS bit in the user + * space futex is set so the top waiter task has to go + * into the syscall slowpath to unlock the futex. This + * will block until this requeue operation has been + * completed and the hash bucket locks have been + * dropped. + * + * - If the trylock failed with an error (ret < 0) then + * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing + * happened", or Q_REQUEUE_PI_IGNORE when there was an + * interleaved early wakeup. + * + * - If the trylock did not succeed (ret == 0) then the + * state is either Q_REQUEUE_PI_IN_PROGRESS or + * Q_REQUEUE_PI_WAIT if an early wakeup interleaved. + * This will be cleaned up in the loop below, which + * cannot fail because futex_proxy_trylock_atomic() did + * the same sanity checks for requeue_pi as the loop + * below does. */ - wait_for_owner_exiting(ret, exiting); - cond_resched(); - goto retry; - default: - goto out_unlock; - } - } - - plist_for_each_entry_safe(this, next, &hb1->chain, list) { - if (task_count - nr_wake >= nr_requeue) - break; - - if (!futex_match(&this->key, &key1)) - continue; - - /* - * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always - * be paired with each other and no other futex ops. - * - * We should never be requeueing a futex_q with a pi_state, - * which is awaiting a futex_unlock_pi(). - */ - if ((requeue_pi && !this->rt_waiter) || - (!requeue_pi && this->rt_waiter) || - this->pi_state) { - ret = -EINVAL; - break; + switch (ret) { + case 0: + /* We hold a reference on the pi state. */ + break; + + case 1: + /* + * futex_proxy_trylock_atomic() acquired the user space + * futex. Adjust task_count. + */ + task_count++; + ret = 0; + break; + + /* + * If the above failed, then pi_state is NULL and + * waiter::requeue_state is correct. + */ + case -EFAULT: + futex_hb_waiters_dec(hb2); + double_unlock_hb(hb1, hb2); + ret = fault_in_user_writeable(uaddr2); + if (!ret) + goto retry; + return ret; + case -EBUSY: + case -EAGAIN: + /* + * Two reasons for this: + * - EBUSY: Owner is exiting and we just wait for the + * exit to complete. + * - EAGAIN: The user space value changed. + */ + futex_hb_waiters_dec(hb2); + double_unlock_hb(hb1, hb2); + /* + * Handle the case where the owner is in the middle of + * exiting. Wait for the exit to complete otherwise + * this task might loop forever, aka. live lock. + */ + wait_for_owner_exiting(ret, exiting); + cond_resched(); + goto retry; + default: + goto out_unlock; + } } - /* Plain futexes just wake or requeue and are done */ - if (!requeue_pi) { - if (++task_count <= nr_wake) - this->wake(&wake_q, this); - else - requeue_futex(this, hb1, hb2, &key2); - continue; - } + plist_for_each_entry_safe(this, next, &hb1->chain, list) { + if (task_count - nr_wake >= nr_requeue) + break; - /* Ensure we requeue to the expected futex for requeue_pi. */ - if (!futex_match(this->requeue_pi_key, &key2)) { - ret = -EINVAL; - break; - } + if (!futex_match(&this->key, &key1)) + continue; - /* - * Requeue nr_requeue waiters and possibly one more in the case - * of requeue_pi if we couldn't acquire the lock atomically. - * - * Prepare the waiter to take the rt_mutex. Take a refcount - * on the pi_state and store the pointer in the futex_q - * object of the waiter. - */ - get_pi_state(pi_state); - - /* Don't requeue when the waiter is already on the way out. */ - if (!futex_requeue_pi_prepare(this, pi_state)) { /* - * Early woken waiter signaled that it is on the - * way out. Drop the pi_state reference and try the - * next waiter. @this->pi_state is still NULL. + * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always + * be paired with each other and no other futex ops. + * + * We should never be requeueing a futex_q with a pi_state, + * which is awaiting a futex_unlock_pi(). */ - put_pi_state(pi_state); - continue; - } + if ((requeue_pi && !this->rt_waiter) || + (!requeue_pi && this->rt_waiter) || + this->pi_state) { + ret = -EINVAL; + break; + } + + /* Plain futexes just wake or requeue and are done */ + if (!requeue_pi) { + if (++task_count <= nr_wake) + this->wake(&wake_q, this); + else + requeue_futex(this, hb1, hb2, &key2); + continue; + } + + /* Ensure we requeue to the expected futex for requeue_pi. */ + if (!futex_match(this->requeue_pi_key, &key2)) { + ret = -EINVAL; + break; + } - ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, - this->rt_waiter, - this->task); - - if (ret == 1) { - /* - * We got the lock. We do neither drop the refcount - * on pi_state nor clear this->pi_state because the - * waiter needs the pi_state for cleaning up the - * user space value. It will drop the refcount - * after doing so. this::requeue_state is updated - * in the wakeup as well. - */ - requeue_pi_wake_futex(this, &key2, hb2); - task_count++; - } else if (!ret) { - /* Waiter is queued, move it to hb2 */ - requeue_futex(this, hb1, hb2, &key2); - futex_requeue_pi_complete(this, 0); - task_count++; - } else { /* - * rt_mutex_start_proxy_lock() detected a potential - * deadlock when we tried to queue that waiter. - * Drop the pi_state reference which we took above - * and remove the pointer to the state from the - * waiters futex_q object. + * Requeue nr_requeue waiters and possibly one more in the case + * of requeue_pi if we couldn't acquire the lock atomically. + * + * Prepare the waiter to take the rt_mutex. Take a refcount + * on the pi_state and store the pointer in the futex_q + * object of the waiter. */ - this->pi_state = NULL; - put_pi_state(pi_state); - futex_requeue_pi_complete(this, ret); - /* - * We stop queueing more waiters and let user space - * deal with the mess. - */ - break; + get_pi_state(pi_state); + + /* Don't requeue when the waiter is already on the way out. */ + if (!futex_requeue_pi_prepare(this, pi_state)) { + /* + * Early woken waiter signaled that it is on the + * way out. Drop the pi_state reference and try the + * next waiter. @this->pi_state is still NULL. + */ + put_pi_state(pi_state); + continue; + } + + ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, + this->rt_waiter, + this->task); + + if (ret == 1) { + /* + * We got the lock. We do neither drop the refcount + * on pi_state nor clear this->pi_state because the + * waiter needs the pi_state for cleaning up the + * user space value. It will drop the refcount + * after doing so. this::requeue_state is updated + * in the wakeup as well. + */ + requeue_pi_wake_futex(this, &key2, hb2); + task_count++; + } else if (!ret) { + /* Waiter is queued, move it to hb2 */ + requeue_futex(this, hb1, hb2, &key2); + futex_requeue_pi_complete(this, 0); + task_count++; + } else { + /* + * rt_mutex_start_proxy_lock() detected a potential + * deadlock when we tried to queue that waiter. + * Drop the pi_state reference which we took above + * and remove the pointer to the state from the + * waiters futex_q object. + */ + this->pi_state = NULL; + put_pi_state(pi_state); + futex_requeue_pi_complete(this, ret); + /* + * We stop queueing more waiters and let user space + * deal with the mess. + */ + break; + } } - } - /* - * We took an extra initial reference to the pi_state in - * futex_proxy_trylock_atomic(). We need to drop it here again. - */ - put_pi_state(pi_state); + /* + * We took an extra initial reference to the pi_state in + * futex_proxy_trylock_atomic(). We need to drop it here again. + */ + put_pi_state(pi_state); out_unlock: - double_unlock_hb(hb1, hb2); + futex_hb_waiters_dec(hb2); + double_unlock_hb(hb1, hb2); + } wake_up_q(&wake_q); - futex_hb_waiters_dec(hb2); return ret ? ret : task_count; } @@ -769,7 +780,6 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, { struct hrtimer_sleeper timeout, *to; struct rt_mutex_waiter rt_waiter; - struct futex_hash_bucket *hb; union futex_key key2 = FUTEX_KEY_INIT; struct futex_q q = futex_q_init; struct rt_mutex_base *pi_mutex; @@ -805,35 +815,28 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * Prepare to wait on uaddr. On success, it holds hb->lock and q * is initialized. */ - ret = futex_wait_setup(uaddr, val, flags, &q, &hb); + ret = futex_wait_setup(uaddr, val, flags, &q, &key2, current); if (ret) goto out; - /* - * The check above which compares uaddrs is not sufficient for - * shared futexes. We need to compare the keys: - */ - if (futex_match(&q.key, &key2)) { - futex_q_unlock(hb); - ret = -EINVAL; - goto out; - } - /* Queue the futex_q, drop the hb lock, wait for wakeup. */ - futex_wait_queue(hb, &q, to); + futex_do_wait(&q, to); switch (futex_requeue_pi_wakeup_sync(&q)) { case Q_REQUEUE_PI_IGNORE: - /* The waiter is still on uaddr1 */ - spin_lock(&hb->lock); - ret = handle_early_requeue_pi_wakeup(hb, &q, to); - spin_unlock(&hb->lock); + { + CLASS(hb, hb)(&q.key); + /* The waiter is still on uaddr1 */ + spin_lock(&hb->lock); + ret = handle_early_requeue_pi_wakeup(hb, &q, to); + spin_unlock(&hb->lock); + } break; case Q_REQUEUE_PI_LOCKED: /* The requeue acquired the lock */ if (q.pi_state && (q.pi_state->owner != current)) { - spin_lock(q.lock_ptr); + futex_q_lockptr_lock(&q); ret = fixup_pi_owner(uaddr2, &q, true); /* * Drop the reference to the pi state which the @@ -860,7 +863,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) ret = 0; - spin_lock(q.lock_ptr); + futex_q_lockptr_lock(&q); debug_rt_mutex_free_waiter(&rt_waiter); /* * Fixup the pi_state owner and possibly acquire the lock if we @@ -892,6 +895,11 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, default: BUG(); } + if (q.drop_hb_ref) { + CLASS(hb, hb)(&q.key); + /* Additional reference from requeue_pi_wake_futex() */ + futex_hash_put(hb); + } out: if (to) { diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index 25877d4f2f8f..e2bbe5509ec2 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -154,7 +154,6 @@ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) */ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) { - struct futex_hash_bucket *hb; struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; DEFINE_WAKE_Q(wake_q); @@ -170,7 +169,7 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) if ((flags & FLAGS_STRICT) && !nr_wake) return 0; - hb = futex_hash(&key); + CLASS(hb, hb)(&key); /* Make sure we really have tasks to wakeup */ if (!futex_hb_waiters_pending(hb)) @@ -253,7 +252,6 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; - struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; int ret, op_ret; DEFINE_WAKE_Q(wake_q); @@ -266,67 +264,69 @@ retry: if (unlikely(ret != 0)) return ret; - hb1 = futex_hash(&key1); - hb2 = futex_hash(&key2); - retry_private: - double_lock_hb(hb1, hb2); - op_ret = futex_atomic_op_inuser(op, uaddr2); - if (unlikely(op_ret < 0)) { - double_unlock_hb(hb1, hb2); - - if (!IS_ENABLED(CONFIG_MMU) || - unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { - /* - * we don't get EFAULT from MMU faults if we don't have - * an MMU, but we might get them from range checking - */ - ret = op_ret; - return ret; - } - - if (op_ret == -EFAULT) { - ret = fault_in_user_writeable(uaddr2); - if (ret) + if (1) { + CLASS(hb, hb1)(&key1); + CLASS(hb, hb2)(&key2); + + double_lock_hb(hb1, hb2); + op_ret = futex_atomic_op_inuser(op, uaddr2); + if (unlikely(op_ret < 0)) { + double_unlock_hb(hb1, hb2); + + if (!IS_ENABLED(CONFIG_MMU) || + unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { + /* + * we don't get EFAULT from MMU faults if we don't have + * an MMU, but we might get them from range checking + */ + ret = op_ret; return ret; - } - - cond_resched(); - if (!(flags & FLAGS_SHARED)) - goto retry_private; - goto retry; - } + } - plist_for_each_entry_safe(this, next, &hb1->chain, list) { - if (futex_match (&this->key, &key1)) { - if (this->pi_state || this->rt_waiter) { - ret = -EINVAL; - goto out_unlock; + if (op_ret == -EFAULT) { + ret = fault_in_user_writeable(uaddr2); + if (ret) + return ret; } - this->wake(&wake_q, this); - if (++ret >= nr_wake) - break; + + cond_resched(); + if (!(flags & FLAGS_SHARED)) + goto retry_private; + goto retry; } - } - if (op_ret > 0) { - op_ret = 0; - plist_for_each_entry_safe(this, next, &hb2->chain, list) { - if (futex_match (&this->key, &key2)) { + plist_for_each_entry_safe(this, next, &hb1->chain, list) { + if (futex_match(&this->key, &key1)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; goto out_unlock; } this->wake(&wake_q, this); - if (++op_ret >= nr_wake2) + if (++ret >= nr_wake) break; } } - ret += op_ret; - } + + if (op_ret > 0) { + op_ret = 0; + plist_for_each_entry_safe(this, next, &hb2->chain, list) { + if (futex_match(&this->key, &key2)) { + if (this->pi_state || this->rt_waiter) { + ret = -EINVAL; + goto out_unlock; + } + this->wake(&wake_q, this); + if (++op_ret >= nr_wake2) + break; + } + } + ret += op_ret; + } out_unlock: - double_unlock_hb(hb1, hb2); + double_unlock_hb(hb1, hb2); + } wake_up_q(&wake_q); return ret; } @@ -334,23 +334,12 @@ out_unlock: static long futex_wait_restart(struct restart_block *restart); /** - * futex_wait_queue() - futex_queue() and wait for wakeup, timeout, or signal - * @hb: the futex hash bucket, must be locked by the caller + * futex_do_wait() - wait for wakeup, timeout, or signal * @q: the futex_q to queue up on * @timeout: the prepared hrtimer_sleeper, or null for no timeout */ -void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, - struct hrtimer_sleeper *timeout) +void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout) { - /* - * The task state is guaranteed to be set before another task can - * wake it. set_current_state() is implemented using smp_store_mb() and - * futex_queue() calls spin_unlock() upon completion, both serializing - * access to the hash list and forcing another memory barrier. - */ - set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); - futex_queue(q, hb, current); - /* Arm the timer */ if (timeout) hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); @@ -412,12 +401,17 @@ int futex_unqueue_multiple(struct futex_vector *v, int count) */ int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) { - struct futex_hash_bucket *hb; bool retry = false; int ret, i; u32 uval; /* + * Make sure to have a reference on the private_hash such that we + * don't block on rehash after changing the task state below. + */ + guard(private_hash)(); + + /* * Enqueuing multiple futexes is tricky, because we need to enqueue * each futex on the list before dealing with the next one to avoid * deadlocking on the hash bucket. But, before enqueuing, we need to @@ -451,20 +445,24 @@ retry: struct futex_q *q = &vs[i].q; u32 val = vs[i].w.val; - hb = futex_q_lock(q); - ret = futex_get_value_locked(&uval, uaddr); + if (1) { + CLASS(hb, hb)(&q->key); - if (!ret && uval == val) { - /* - * The bucket lock can't be held while dealing with the - * next futex. Queue each futex at this moment so hb can - * be unlocked. - */ - futex_queue(q, hb, current); - continue; - } + futex_q_lock(q, hb); + ret = futex_get_value_locked(&uval, uaddr); + + if (!ret && uval == val) { + /* + * The bucket lock can't be held while dealing with the + * next futex. Queue each futex at this moment so hb can + * be unlocked. + */ + futex_queue(q, hb, current); + continue; + } - futex_q_unlock(hb); + futex_q_unlock(hb); + } __set_current_state(TASK_RUNNING); /* @@ -578,7 +576,8 @@ int futex_wait_multiple(struct futex_vector *vs, unsigned int count, * @val: the expected value * @flags: futex flags (FLAGS_SHARED, etc.) * @q: the associated futex_q - * @hb: storage for hash_bucket pointer to be returned to caller + * @key2: the second futex_key if used for requeue PI + * @task: Task queueing this futex * * Setup the futex_q and locate the hash_bucket. Get the futex value and * compare it with the expected value. Handle atomic faults internally. @@ -586,10 +585,12 @@ int futex_wait_multiple(struct futex_vector *vs, unsigned int count, * * Return: * - 0 - uaddr contains val and hb has been locked; - * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked + * - <0 - On error and the hb is unlocked. A possible reason: the uaddr can not + * be read, does not contain the expected value or is not properly aligned. */ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - struct futex_q *q, struct futex_hash_bucket **hb) + struct futex_q *q, union futex_key *key2, + struct task_struct *task) { u32 uval; int ret; @@ -618,26 +619,45 @@ retry: return ret; retry_private: - *hb = futex_q_lock(q); + if (1) { + CLASS(hb, hb)(&q->key); - ret = futex_get_value_locked(&uval, uaddr); + futex_q_lock(q, hb); - if (ret) { - futex_q_unlock(*hb); + ret = futex_get_value_locked(&uval, uaddr); - ret = get_user(uval, uaddr); - if (ret) - return ret; + if (ret) { + futex_q_unlock(hb); - if (!(flags & FLAGS_SHARED)) - goto retry_private; + ret = get_user(uval, uaddr); + if (ret) + return ret; - goto retry; - } + if (!(flags & FLAGS_SHARED)) + goto retry_private; + + goto retry; + } - if (uval != val) { - futex_q_unlock(*hb); - ret = -EWOULDBLOCK; + if (uval != val) { + futex_q_unlock(hb); + return -EWOULDBLOCK; + } + + if (key2 && futex_match(&q->key, key2)) { + futex_q_unlock(hb); + return -EINVAL; + } + + /* + * The task state is guaranteed to be set before another task can + * wake it. set_current_state() is implemented using smp_store_mb() and + * futex_queue() calls spin_unlock() upon completion, both serializing + * access to the hash list and forcing another memory barrier. + */ + if (task == current) + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); + futex_queue(q, hb, task); } return ret; @@ -647,7 +667,6 @@ int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, struct hrtimer_sleeper *to, u32 bitset) { struct futex_q q = futex_q_init; - struct futex_hash_bucket *hb; int ret; if (!bitset) @@ -660,12 +679,12 @@ retry: * Prepare to wait on uaddr. On success, it holds hb->lock and q * is initialized. */ - ret = futex_wait_setup(uaddr, val, flags, &q, &hb); + ret = futex_wait_setup(uaddr, val, flags, &q, NULL, current); if (ret) return ret; /* futex_queue and wait for wakeup, timeout, or a signal. */ - futex_wait_queue(hb, &q, to); + futex_do_wait(&q, to); /* If we were woken (and unqueued), we succeeded, whatever. */ if (!futex_unqueue(&q)) diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index fd75b4a484d7..a08cc076f332 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -22,10 +22,6 @@ #define GCOV_COUNTERS 9 #elif (__GNUC__ >= 10) #define GCOV_COUNTERS 8 -#elif (__GNUC__ >= 7) -#define GCOV_COUNTERS 9 -#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) -#define GCOV_COUNTERS 10 #else #define GCOV_COUNTERS 9 #endif diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 00529c81cc40..c9e5dc068e85 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -89,7 +89,6 @@ rm -f "${tmpdir}.contents.txt" # Create archive and try to normalize metadata for reproducibility. tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ - --exclude=".__afs*" --exclude=".nfs*" \ --owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \ -I $XZ -cf $tarfile -C "${tmpdir}/" . > /dev/null diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 04efa7a6e69b..d2432df2b905 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -22,6 +22,7 @@ #include <linux/sched/signal.h> #include <linux/sched/debug.h> #include <linux/sched/sysctl.h> +#include <linux/hung_task.h> #include <trace/events/sched.h> @@ -93,6 +94,75 @@ static struct notifier_block panic_block = { .notifier_call = hung_task_panic, }; + +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER +static void debug_show_blocker(struct task_struct *task) +{ + struct task_struct *g, *t; + unsigned long owner, blocker, blocker_type; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held"); + + blocker = READ_ONCE(task->blocker); + if (!blocker) + return; + + blocker_type = hung_task_get_blocker_type(blocker); + + switch (blocker_type) { + case BLOCKER_TYPE_MUTEX: + owner = mutex_get_owner( + (struct mutex *)hung_task_blocker_to_lock(blocker)); + break; + case BLOCKER_TYPE_SEM: + owner = sem_last_holder( + (struct semaphore *)hung_task_blocker_to_lock(blocker)); + break; + default: + WARN_ON_ONCE(1); + return; + } + + + if (unlikely(!owner)) { + switch (blocker_type) { + case BLOCKER_TYPE_MUTEX: + pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n", + task->comm, task->pid); + break; + case BLOCKER_TYPE_SEM: + pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n", + task->comm, task->pid); + break; + } + return; + } + + /* Ensure the owner information is correct. */ + for_each_process_thread(g, t) { + if ((unsigned long)t != owner) + continue; + + switch (blocker_type) { + case BLOCKER_TYPE_MUTEX: + pr_err("INFO: task %s:%d is blocked on a mutex likely owned by task %s:%d.\n", + task->comm, task->pid, t->comm, t->pid); + break; + case BLOCKER_TYPE_SEM: + pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n", + task->comm, task->pid, t->comm, t->pid); + break; + } + sched_show_task(t); + return; + } +} +#else +static inline void debug_show_blocker(struct task_struct *task) +{ +} +#endif + static void check_hung_task(struct task_struct *t, unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; @@ -152,6 +222,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); + debug_show_blocker(t); hung_task_show_lock = true; if (sysctl_hung_task_all_cpu_backtrace) diff --git a/kernel/iomem.c b/kernel/iomem.c index dc2120776e1c..75e61c1c6bc0 100644 --- a/kernel/iomem.c +++ b/kernel/iomem.c @@ -6,7 +6,8 @@ #include <linux/ioremap.h> #ifndef arch_memremap_wb -static void *arch_memremap_wb(resource_size_t offset, unsigned long size) +static void *arch_memremap_wb(resource_size_t offset, unsigned long size, + unsigned long flags) { #ifdef ioremap_cache return (__force void *)ioremap_cache(offset, size); @@ -91,7 +92,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) if (is_ram == REGION_INTERSECTS) addr = try_ram_remap(offset, size, flags); if (!addr) - addr = arch_memremap_wb(offset, size); + addr = arch_memremap_wb(offset, size, flags); } /* diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 875f25ed6f71..3f02a0e45254 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -47,10 +47,6 @@ config GENERIC_IRQ_INJECTION config HARDIRQS_SW_RESEND bool -# Edge style eoi based handler (cell) -config IRQ_EDGE_EOI_HANDLER - bool - # Generic configurable interrupt chip implementation config GENERIC_IRQ_CHIP bool @@ -96,6 +92,7 @@ config GENERIC_MSI_IRQ bool select IRQ_DOMAIN_HIERARCHY +# irqchip drivers should select this if they call iommu_dma_prepare_msi() config IRQ_MSI_IOMMU bool diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index ae60cae24e9a..d0af8a8b3ae6 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -43,18 +43,16 @@ unsigned long probe_irq_on(void) * flush such a longstanding irq before considering it as spurious. */ for_each_irq_desc_reverse(i, desc) { - raw_spin_lock_irq(&desc->lock); + guard(raw_spinlock_irq)(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { /* * Some chips need to know about probing in * progress: */ if (desc->irq_data.chip->irq_set_type) - desc->irq_data.chip->irq_set_type(&desc->irq_data, - IRQ_TYPE_PROBE); + desc->irq_data.chip->irq_set_type(&desc->irq_data, IRQ_TYPE_PROBE); irq_activate_and_startup(desc, IRQ_NORESEND); } - raw_spin_unlock_irq(&desc->lock); } /* Wait for longstanding interrupts to trigger. */ @@ -66,13 +64,12 @@ unsigned long probe_irq_on(void) * happened in the previous stage, it may have masked itself) */ for_each_irq_desc_reverse(i, desc) { - raw_spin_lock_irq(&desc->lock); + guard(raw_spinlock_irq)(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; if (irq_activate_and_startup(desc, IRQ_NORESEND)) desc->istate |= IRQS_PENDING; } - raw_spin_unlock_irq(&desc->lock); } /* @@ -84,18 +81,16 @@ unsigned long probe_irq_on(void) * Now filter out any obviously spurious interrupts */ for_each_irq_desc(i, desc) { - raw_spin_lock_irq(&desc->lock); - + guard(raw_spinlock_irq)(&desc->lock); if (desc->istate & IRQS_AUTODETECT) { /* It triggered already - consider it spurious. */ if (!(desc->istate & IRQS_WAITING)) { desc->istate &= ~IRQS_AUTODETECT; irq_shutdown_and_deactivate(desc); - } else - if (i < 32) - mask |= 1 << i; + } else if (i < 32) { + mask |= 1 << i; + } } - raw_spin_unlock_irq(&desc->lock); } return mask; @@ -121,7 +116,7 @@ unsigned int probe_irq_mask(unsigned long val) int i; for_each_irq_desc(i, desc) { - raw_spin_lock_irq(&desc->lock); + guard(raw_spinlock_irq)(&desc->lock); if (desc->istate & IRQS_AUTODETECT) { if (i < 16 && !(desc->istate & IRQS_WAITING)) mask |= 1 << i; @@ -129,7 +124,6 @@ unsigned int probe_irq_mask(unsigned long val) desc->istate &= ~IRQS_AUTODETECT; irq_shutdown_and_deactivate(desc); } - raw_spin_unlock_irq(&desc->lock); } mutex_unlock(&probing_active); @@ -160,8 +154,7 @@ int probe_irq_off(unsigned long val) struct irq_desc *desc; for_each_irq_desc(i, desc) { - raw_spin_lock_irq(&desc->lock); - + guard(raw_spinlock_irq)(&desc->lock); if (desc->istate & IRQS_AUTODETECT) { if (!(desc->istate & IRQS_WAITING)) { if (!nr_of_irqs) @@ -171,7 +164,6 @@ int probe_irq_off(unsigned long val) desc->istate &= ~IRQS_AUTODETECT; irq_shutdown_and_deactivate(desc); } - raw_spin_unlock_irq(&desc->lock); } mutex_unlock(&probing_active); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c901436ebd9f..2b274007e8ba 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -34,98 +34,80 @@ struct irqaction chained_action = { }; /** - * irq_set_chip - set the irq chip for an irq - * @irq: irq number - * @chip: pointer to irq chip description structure + * irq_set_chip - set the irq chip for an irq + * @irq: irq number + * @chip: pointer to irq chip description structure */ int irq_set_chip(unsigned int irq, const struct irq_chip *chip) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); + int ret = -EINVAL; - if (!desc) - return -EINVAL; - - desc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip); - irq_put_desc_unlock(desc, flags); - /* - * For !CONFIG_SPARSE_IRQ make the irq show up in - * allocated_irqs. - */ - irq_mark_irq(irq); - return 0; + scoped_irqdesc_get_and_lock(irq, 0) { + scoped_irqdesc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip); + ret = 0; + } + /* For !CONFIG_SPARSE_IRQ make the irq show up in allocated_irqs. */ + if (!ret) + irq_mark_irq(irq); + return ret; } EXPORT_SYMBOL(irq_set_chip); /** - * irq_set_irq_type - set the irq trigger type for an irq - * @irq: irq number - * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h + * irq_set_irq_type - set the irq trigger type for an irq + * @irq: irq number + * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h */ int irq_set_irq_type(unsigned int irq, unsigned int type) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); - int ret = 0; - - if (!desc) - return -EINVAL; - - ret = __irq_set_trigger(desc, type); - irq_put_desc_busunlock(desc, flags); - return ret; + scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) + return __irq_set_trigger(scoped_irqdesc, type); + return -EINVAL; } EXPORT_SYMBOL(irq_set_irq_type); /** - * irq_set_handler_data - set irq handler data for an irq - * @irq: Interrupt number - * @data: Pointer to interrupt specific data + * irq_set_handler_data - set irq handler data for an irq + * @irq: Interrupt number + * @data: Pointer to interrupt specific data * - * Set the hardware irq controller data for an irq + * Set the hardware irq controller data for an irq */ int irq_set_handler_data(unsigned int irq, void *data) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - - if (!desc) - return -EINVAL; - desc->irq_common_data.handler_data = data; - irq_put_desc_unlock(desc, flags); - return 0; + scoped_irqdesc_get_and_lock(irq, 0) { + scoped_irqdesc->irq_common_data.handler_data = data; + return 0; + } + return -EINVAL; } EXPORT_SYMBOL(irq_set_handler_data); /** - * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset - * @irq_base: Interrupt number base - * @irq_offset: Interrupt number offset - * @entry: Pointer to MSI descriptor data + * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset + * @irq_base: Interrupt number base + * @irq_offset: Interrupt number offset + * @entry: Pointer to MSI descriptor data * - * Set the MSI descriptor entry for an irq at offset + * Set the MSI descriptor entry for an irq at offset */ -int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, - struct msi_desc *entry) -{ - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL); - - if (!desc) - return -EINVAL; - desc->irq_common_data.msi_desc = entry; - if (entry && !irq_offset) - entry->irq = irq_base; - irq_put_desc_unlock(desc, flags); - return 0; +int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, struct msi_desc *entry) +{ + scoped_irqdesc_get_and_lock(irq_base + irq_offset, IRQ_GET_DESC_CHECK_GLOBAL) { + scoped_irqdesc->irq_common_data.msi_desc = entry; + if (entry && !irq_offset) + entry->irq = irq_base; + return 0; + } + return -EINVAL; } /** - * irq_set_msi_desc - set MSI descriptor data for an irq - * @irq: Interrupt number - * @entry: Pointer to MSI descriptor data + * irq_set_msi_desc - set MSI descriptor data for an irq + * @irq: Interrupt number + * @entry: Pointer to MSI descriptor data * - * Set the MSI descriptor entry for an irq + * Set the MSI descriptor entry for an irq */ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) { @@ -133,22 +115,19 @@ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) } /** - * irq_set_chip_data - set irq chip data for an irq - * @irq: Interrupt number - * @data: Pointer to chip specific data + * irq_set_chip_data - set irq chip data for an irq + * @irq: Interrupt number + * @data: Pointer to chip specific data * - * Set the hardware irq chip data for an irq + * Set the hardware irq chip data for an irq */ int irq_set_chip_data(unsigned int irq, void *data) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - - if (!desc) - return -EINVAL; - desc->irq_data.chip_data = data; - irq_put_desc_unlock(desc, flags); - return 0; + scoped_irqdesc_get_and_lock(irq, 0) { + scoped_irqdesc->irq_data.chip_data = data; + return 0; + } + return -EINVAL; } EXPORT_SYMBOL(irq_set_chip_data); @@ -223,6 +202,27 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff, return IRQ_STARTUP_ABORT; return IRQ_STARTUP_MANAGED; } + +void irq_startup_managed(struct irq_desc *desc) +{ + struct irq_data *d = irq_desc_get_irq_data(desc); + + /* + * Clear managed-shutdown flag, so we don't repeat managed-startup for + * multiple hotplugs, and cause imbalanced disable depth. + */ + irqd_clr_managed_shutdown(d); + + /* + * Only start it up when the disable depth is 1, so that a disable, + * hotunplug, hotplug sequence does not end up enabling it during + * hotplug unconditionally. + */ + desc->depth--; + if (!desc->depth) + irq_startup(desc, IRQ_RESEND, IRQ_START_COND); +} + #else static __always_inline int __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff, @@ -232,6 +232,21 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff, } #endif +static void irq_enable(struct irq_desc *desc) +{ + if (!irqd_irq_disabled(&desc->irq_data)) { + unmask_irq(desc); + } else { + irq_state_clr_disabled(desc); + if (desc->irq_data.chip->irq_enable) { + desc->irq_data.chip->irq_enable(&desc->irq_data); + irq_state_clr_masked(desc); + } else { + unmask_irq(desc); + } + } +} + static int __irq_startup(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); @@ -275,6 +290,7 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force) ret = __irq_startup(desc); break; case IRQ_STARTUP_ABORT: + desc->depth = 1; irqd_set_managed_shutdown(d); return 0; } @@ -307,7 +323,13 @@ void irq_shutdown(struct irq_desc *desc) { if (irqd_is_started(&desc->irq_data)) { clear_irq_resend(desc); - desc->depth = 1; + /* + * Increment disable depth, so that a managed shutdown on + * CPU hotunplug preserves the actual disabled state when the + * CPU comes back online. See irq_startup_managed(). + */ + desc->depth++; + if (desc->irq_data.chip->irq_shutdown) { desc->irq_data.chip->irq_shutdown(&desc->irq_data); irq_state_set_disabled(desc); @@ -332,21 +354,6 @@ void irq_shutdown_and_deactivate(struct irq_desc *desc) irq_domain_deactivate_irq(&desc->irq_data); } -void irq_enable(struct irq_desc *desc) -{ - if (!irqd_irq_disabled(&desc->irq_data)) { - unmask_irq(desc); - } else { - irq_state_clr_disabled(desc); - if (desc->irq_data.chip->irq_enable) { - desc->irq_data.chip->irq_enable(&desc->irq_data); - irq_state_clr_masked(desc); - } else { - unmask_irq(desc); - } - } -} - static void __irq_disable(struct irq_desc *desc, bool mask) { if (irqd_irq_disabled(&desc->irq_data)) { @@ -450,48 +457,6 @@ void unmask_threaded_irq(struct irq_desc *desc) unmask_irq(desc); } -/* - * handle_nested_irq - Handle a nested irq from a irq thread - * @irq: the interrupt number - * - * Handle interrupts which are nested into a threaded interrupt - * handler. The handler function is called inside the calling - * threads context. - */ -void handle_nested_irq(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action; - irqreturn_t action_ret; - - might_sleep(); - - raw_spin_lock_irq(&desc->lock); - - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - - action = desc->action; - if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { - desc->istate |= IRQS_PENDING; - raw_spin_unlock_irq(&desc->lock); - return; - } - - kstat_incr_irqs_this_cpu(desc); - atomic_inc(&desc->threads_active); - raw_spin_unlock_irq(&desc->lock); - - action_ret = IRQ_NONE; - for_each_action_of_desc(desc, action) - action_ret |= action->thread_fn(action->irq, action->dev_id); - - if (!irq_settings_no_debug(desc)) - note_interrupt(desc, action_ret); - - wake_threads_waitq(desc); -} -EXPORT_SYMBOL_GPL(handle_nested_irq); - static bool irq_check_poll(struct irq_desc *desc) { if (!(desc->istate & IRQS_POLL_INPROGRESS)) @@ -499,7 +464,7 @@ static bool irq_check_poll(struct irq_desc *desc) return irq_wait_for_poll(desc); } -static bool irq_may_run(struct irq_desc *desc) +static bool irq_can_handle_pm(struct irq_desc *desc) { unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED; @@ -524,77 +489,111 @@ static bool irq_may_run(struct irq_desc *desc) return irq_check_poll(desc); } +static inline bool irq_can_handle_actions(struct irq_desc *desc) +{ + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; + return false; + } + return true; +} + +static inline bool irq_can_handle(struct irq_desc *desc) +{ + if (!irq_can_handle_pm(desc)) + return false; + + return irq_can_handle_actions(desc); +} + /** - * handle_simple_irq - Simple and software-decoded IRQs. - * @desc: the interrupt description structure for this irq - * - * Simple interrupts are either sent from a demultiplexing interrupt - * handler or come from hardware, where no interrupt hardware control - * is necessary. + * handle_nested_irq - Handle a nested irq from a irq thread + * @irq: the interrupt number * - * Note: The caller is expected to handle the ack, clear, mask and - * unmask issues if necessary. + * Handle interrupts which are nested into a threaded interrupt + * handler. The handler function is called inside the calling threads + * context. */ -void handle_simple_irq(struct irq_desc *desc) +void handle_nested_irq(unsigned int irq) { - raw_spin_lock(&desc->lock); + struct irq_desc *desc = irq_to_desc(irq); + struct irqaction *action; + irqreturn_t action_ret; - if (!irq_may_run(desc)) - goto out_unlock; + might_sleep(); - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + scoped_guard(raw_spinlock_irq, &desc->lock) { + if (!irq_can_handle_actions(desc)) + return; - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { - desc->istate |= IRQS_PENDING; - goto out_unlock; + action = desc->action; + kstat_incr_irqs_this_cpu(desc); + atomic_inc(&desc->threads_active); } + action_ret = IRQ_NONE; + for_each_action_of_desc(desc, action) + action_ret |= action->thread_fn(action->irq, action->dev_id); + + if (!irq_settings_no_debug(desc)) + note_interrupt(desc, action_ret); + + wake_threads_waitq(desc); +} +EXPORT_SYMBOL_GPL(handle_nested_irq); + +/** + * handle_simple_irq - Simple and software-decoded IRQs. + * @desc: the interrupt description structure for this irq + * + * Simple interrupts are either sent from a demultiplexing interrupt + * handler or come from hardware, where no interrupt hardware control is + * necessary. + * + * Note: The caller is expected to handle the ack, clear, mask and unmask + * issues if necessary. + */ +void handle_simple_irq(struct irq_desc *desc) +{ + guard(raw_spinlock)(&desc->lock); + + if (!irq_can_handle(desc)) + return; + kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); - -out_unlock: - raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_simple_irq); /** - * handle_untracked_irq - Simple and software-decoded IRQs. - * @desc: the interrupt description structure for this irq + * handle_untracked_irq - Simple and software-decoded IRQs. + * @desc: the interrupt description structure for this irq * - * Untracked interrupts are sent from a demultiplexing interrupt - * handler when the demultiplexer does not know which device it its - * multiplexed irq domain generated the interrupt. IRQ's handled - * through here are not subjected to stats tracking, randomness, or - * spurious interrupt detection. + * Untracked interrupts are sent from a demultiplexing interrupt handler + * when the demultiplexer does not know which device it its multiplexed irq + * domain generated the interrupt. IRQ's handled through here are not + * subjected to stats tracking, randomness, or spurious interrupt + * detection. * - * Note: Like handle_simple_irq, the caller is expected to handle - * the ack, clear, mask and unmask issues if necessary. + * Note: Like handle_simple_irq, the caller is expected to handle the ack, + * clear, mask and unmask issues if necessary. */ void handle_untracked_irq(struct irq_desc *desc) { - raw_spin_lock(&desc->lock); - - if (!irq_may_run(desc)) - goto out_unlock; - - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + scoped_guard(raw_spinlock, &desc->lock) { + if (!irq_can_handle(desc)) + return; - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { - desc->istate |= IRQS_PENDING; - goto out_unlock; + desc->istate &= ~IRQS_PENDING; + irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); } - desc->istate &= ~IRQS_PENDING; - irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); - raw_spin_unlock(&desc->lock); - __handle_irq_event_percpu(desc); - raw_spin_lock(&desc->lock); - irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); - -out_unlock: - raw_spin_unlock(&desc->lock); + scoped_guard(raw_spinlock, &desc->lock) + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); } EXPORT_SYMBOL_GPL(handle_untracked_irq); @@ -617,40 +616,26 @@ static void cond_unmask_irq(struct irq_desc *desc) } /** - * handle_level_irq - Level type irq handler - * @desc: the interrupt description structure for this irq + * handle_level_irq - Level type irq handler + * @desc: the interrupt description structure for this irq * - * Level type interrupts are active as long as the hardware line has - * the active level. This may require to mask the interrupt and unmask - * it after the associated handler has acknowledged the device, so the - * interrupt line is back to inactive. + * Level type interrupts are active as long as the hardware line has the + * active level. This may require to mask the interrupt and unmask it after + * the associated handler has acknowledged the device, so the interrupt + * line is back to inactive. */ void handle_level_irq(struct irq_desc *desc) { - raw_spin_lock(&desc->lock); + guard(raw_spinlock)(&desc->lock); mask_ack_irq(desc); - if (!irq_may_run(desc)) - goto out_unlock; - - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - - /* - * If its disabled or no action available - * keep it masked and get out of here - */ - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { - desc->istate |= IRQS_PENDING; - goto out_unlock; - } + if (!irq_can_handle(desc)) + return; kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); cond_unmask_irq(desc); - -out_unlock: - raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_level_irq); @@ -675,42 +660,43 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) } } +static inline void cond_eoi_irq(struct irq_chip *chip, struct irq_data *data) +{ + if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) + chip->irq_eoi(data); +} + /** - * handle_fasteoi_irq - irq handler for transparent controllers - * @desc: the interrupt description structure for this irq + * handle_fasteoi_irq - irq handler for transparent controllers + * @desc: the interrupt description structure for this irq * - * Only a single callback will be issued to the chip: an ->eoi() - * call when the interrupt has been serviced. This enables support - * for modern forms of interrupt handlers, which handle the flow - * details in hardware, transparently. + * Only a single callback will be issued to the chip: an ->eoi() call when + * the interrupt has been serviced. This enables support for modern forms + * of interrupt handlers, which handle the flow details in hardware, + * transparently. */ void handle_fasteoi_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; - raw_spin_lock(&desc->lock); + guard(raw_spinlock)(&desc->lock); /* * When an affinity change races with IRQ handling, the next interrupt * can arrive on the new CPU before the original CPU has completed * handling the previous one - it may need to be resent. */ - if (!irq_may_run(desc)) { + if (!irq_can_handle_pm(desc)) { if (irqd_needs_resend_when_in_progress(&desc->irq_data)) desc->istate |= IRQS_PENDING; - goto out; + cond_eoi_irq(chip, &desc->irq_data); + return; } - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - - /* - * If its disabled or no action available - * then mask it and get out of here: - */ - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { - desc->istate |= IRQS_PENDING; + if (!irq_can_handle_actions(desc)) { mask_irq(desc); - goto out; + cond_eoi_irq(chip, &desc->irq_data); + return; } kstat_incr_irqs_this_cpu(desc); @@ -726,13 +712,6 @@ void handle_fasteoi_irq(struct irq_desc *desc) */ if (unlikely(desc->istate & IRQS_PENDING)) check_irq_resend(desc, false); - - raw_spin_unlock(&desc->lock); - return; -out: - if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) - chip->irq_eoi(&desc->irq_data); - raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_fasteoi_irq); @@ -770,40 +749,27 @@ void handle_fasteoi_nmi(struct irq_desc *desc) EXPORT_SYMBOL_GPL(handle_fasteoi_nmi); /** - * handle_edge_irq - edge type IRQ handler - * @desc: the interrupt description structure for this irq + * handle_edge_irq - edge type IRQ handler + * @desc: the interrupt description structure for this irq * - * Interrupt occurs on the falling and/or rising edge of a hardware - * signal. The occurrence is latched into the irq controller hardware - * and must be acked in order to be reenabled. After the ack another - * interrupt can happen on the same source even before the first one - * is handled by the associated event handler. If this happens it - * might be necessary to disable (mask) the interrupt depending on the - * controller hardware. This requires to reenable the interrupt inside - * of the loop which handles the interrupts which have arrived while - * the handler was running. If all pending interrupts are handled, the - * loop is left. + * Interrupt occurs on the falling and/or rising edge of a hardware + * signal. The occurrence is latched into the irq controller hardware and + * must be acked in order to be reenabled. After the ack another interrupt + * can happen on the same source even before the first one is handled by + * the associated event handler. If this happens it might be necessary to + * disable (mask) the interrupt depending on the controller hardware. This + * requires to reenable the interrupt inside of the loop which handles the + * interrupts which have arrived while the handler was running. If all + * pending interrupts are handled, the loop is left. */ void handle_edge_irq(struct irq_desc *desc) { - raw_spin_lock(&desc->lock); - - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - - if (!irq_may_run(desc)) { - desc->istate |= IRQS_PENDING; - mask_ack_irq(desc); - goto out_unlock; - } + guard(raw_spinlock)(&desc->lock); - /* - * If its disabled or no action available then mask it and get - * out of here. - */ - if (irqd_irq_disabled(&desc->irq_data) || !desc->action) { + if (!irq_can_handle(desc)) { desc->istate |= IRQS_PENDING; mask_ack_irq(desc); - goto out_unlock; + return; } kstat_incr_irqs_this_cpu(desc); @@ -814,7 +780,7 @@ void handle_edge_irq(struct irq_desc *desc) do { if (unlikely(!desc->action)) { mask_irq(desc); - goto out_unlock; + return; } /* @@ -830,61 +796,10 @@ void handle_edge_irq(struct irq_desc *desc) handle_irq_event(desc); - } while ((desc->istate & IRQS_PENDING) && - !irqd_irq_disabled(&desc->irq_data)); - -out_unlock: - raw_spin_unlock(&desc->lock); + } while ((desc->istate & IRQS_PENDING) && !irqd_irq_disabled(&desc->irq_data)); } EXPORT_SYMBOL(handle_edge_irq); -#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER -/** - * handle_edge_eoi_irq - edge eoi type IRQ handler - * @desc: the interrupt description structure for this irq - * - * Similar as the above handle_edge_irq, but using eoi and w/o the - * mask/unmask logic. - */ -void handle_edge_eoi_irq(struct irq_desc *desc) -{ - struct irq_chip *chip = irq_desc_get_chip(desc); - - raw_spin_lock(&desc->lock); - - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - - if (!irq_may_run(desc)) { - desc->istate |= IRQS_PENDING; - goto out_eoi; - } - - /* - * If its disabled or no action available then mask it and get - * out of here. - */ - if (irqd_irq_disabled(&desc->irq_data) || !desc->action) { - desc->istate |= IRQS_PENDING; - goto out_eoi; - } - - kstat_incr_irqs_this_cpu(desc); - - do { - if (unlikely(!desc->action)) - goto out_eoi; - - handle_irq_event(desc); - - } while ((desc->istate & IRQS_PENDING) && - !irqd_irq_disabled(&desc->irq_data)); - -out_eoi: - chip->irq_eoi(&desc->irq_data); - raw_spin_unlock(&desc->lock); -} -#endif - /** * handle_percpu_irq - Per CPU local irq handler * @desc: the interrupt description structure for this irq @@ -1054,35 +969,23 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, } } -void -__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, - const char *name) +void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, + const char *name) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0); - - if (!desc) - return; - - __irq_do_set_handler(desc, handle, is_chained, name); - irq_put_desc_busunlock(desc, flags); + scoped_irqdesc_get_and_lock(irq, 0) + __irq_do_set_handler(scoped_irqdesc, handle, is_chained, name); } EXPORT_SYMBOL_GPL(__irq_set_handler); -void -irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, - void *data) +void irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, + void *data) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0); - - if (!desc) - return; + scoped_irqdesc_get_and_buslock(irq, 0) { + struct irq_desc *desc = scoped_irqdesc; - desc->irq_common_data.handler_data = data; - __irq_do_set_handler(desc, handle, 1, NULL); - - irq_put_desc_busunlock(desc, flags); + desc->irq_common_data.handler_data = data; + __irq_do_set_handler(desc, handle, 1, NULL); + } } EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data); @@ -1097,38 +1000,34 @@ EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name); void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { - unsigned long flags, trigger, tmp; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - - if (!desc) - return; - - /* - * Warn when a driver sets the no autoenable flag on an already - * active interrupt. - */ - WARN_ON_ONCE(!desc->depth && (set & _IRQ_NOAUTOEN)); - - irq_settings_clr_and_set(desc, clr, set); + scoped_irqdesc_get_and_lock(irq, 0) { + struct irq_desc *desc = scoped_irqdesc; + unsigned long trigger, tmp; + /* + * Warn when a driver sets the no autoenable flag on an already + * active interrupt. + */ + WARN_ON_ONCE(!desc->depth && (set & _IRQ_NOAUTOEN)); - trigger = irqd_get_trigger_type(&desc->irq_data); + irq_settings_clr_and_set(desc, clr, set); - irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | - IRQD_TRIGGER_MASK | IRQD_LEVEL); - if (irq_settings_has_no_balance_set(desc)) - irqd_set(&desc->irq_data, IRQD_NO_BALANCING); - if (irq_settings_is_per_cpu(desc)) - irqd_set(&desc->irq_data, IRQD_PER_CPU); - if (irq_settings_is_level(desc)) - irqd_set(&desc->irq_data, IRQD_LEVEL); + trigger = irqd_get_trigger_type(&desc->irq_data); - tmp = irq_settings_get_trigger_mask(desc); - if (tmp != IRQ_TYPE_NONE) - trigger = tmp; + irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | + IRQD_TRIGGER_MASK | IRQD_LEVEL); + if (irq_settings_has_no_balance_set(desc)) + irqd_set(&desc->irq_data, IRQD_NO_BALANCING); + if (irq_settings_is_per_cpu(desc)) + irqd_set(&desc->irq_data, IRQD_PER_CPU); + if (irq_settings_is_level(desc)) + irqd_set(&desc->irq_data, IRQD_LEVEL); - irqd_set(&desc->irq_data, trigger); + tmp = irq_settings_get_trigger_mask(desc); + if (tmp != IRQ_TYPE_NONE) + trigger = tmp; - irq_put_desc_unlock(desc, flags); + irqd_set(&desc->irq_data, trigger); + } } EXPORT_SYMBOL_GPL(irq_modify_status); @@ -1141,25 +1040,21 @@ EXPORT_SYMBOL_GPL(irq_modify_status); */ void irq_cpu_online(void) { - struct irq_desc *desc; - struct irq_chip *chip; - unsigned long flags; unsigned int irq; for_each_active_irq(irq) { - desc = irq_to_desc(irq); + struct irq_desc *desc = irq_to_desc(irq); + struct irq_chip *chip; + if (!desc) continue; - raw_spin_lock_irqsave(&desc->lock, flags); - + guard(raw_spinlock_irqsave)(&desc->lock); chip = irq_data_get_irq_chip(&desc->irq_data); if (chip && chip->irq_cpu_online && (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || !irqd_irq_disabled(&desc->irq_data))) chip->irq_cpu_online(&desc->irq_data); - - raw_spin_unlock_irqrestore(&desc->lock, flags); } } @@ -1171,25 +1066,21 @@ void irq_cpu_online(void) */ void irq_cpu_offline(void) { - struct irq_desc *desc; - struct irq_chip *chip; - unsigned long flags; unsigned int irq; for_each_active_irq(irq) { - desc = irq_to_desc(irq); + struct irq_desc *desc = irq_to_desc(irq); + struct irq_chip *chip; + if (!desc) continue; - raw_spin_lock_irqsave(&desc->lock, flags); - + guard(raw_spinlock_irqsave)(&desc->lock); chip = irq_data_get_irq_chip(&desc->irq_data); if (chip && chip->irq_cpu_offline && (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || !irqd_irq_disabled(&desc->irq_data))) chip->irq_cpu_offline(&desc->irq_data); - - raw_spin_unlock_irqrestore(&desc->lock, flags); } } #endif @@ -1198,102 +1089,69 @@ void irq_cpu_offline(void) #ifdef CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS /** - * handle_fasteoi_ack_irq - irq handler for edge hierarchy - * stacked on transparent controllers + * handle_fasteoi_ack_irq - irq handler for edge hierarchy stacked on + * transparent controllers * - * @desc: the interrupt description structure for this irq + * @desc: the interrupt description structure for this irq * - * Like handle_fasteoi_irq(), but for use with hierarchy where - * the irq_chip also needs to have its ->irq_ack() function - * called. + * Like handle_fasteoi_irq(), but for use with hierarchy where the irq_chip + * also needs to have its ->irq_ack() function called. */ void handle_fasteoi_ack_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; - raw_spin_lock(&desc->lock); - - if (!irq_may_run(desc)) - goto out; + guard(raw_spinlock)(&desc->lock); - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + if (!irq_can_handle_pm(desc)) { + cond_eoi_irq(chip, &desc->irq_data); + return; + } - /* - * If its disabled or no action available - * then mask it and get out of here: - */ - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { - desc->istate |= IRQS_PENDING; + if (unlikely(!irq_can_handle_actions(desc))) { mask_irq(desc); - goto out; + cond_eoi_irq(chip, &desc->irq_data); + return; } kstat_incr_irqs_this_cpu(desc); if (desc->istate & IRQS_ONESHOT) mask_irq(desc); - /* Start handling the irq */ desc->irq_data.chip->irq_ack(&desc->irq_data); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); - - raw_spin_unlock(&desc->lock); - return; -out: - if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) - chip->irq_eoi(&desc->irq_data); - raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_fasteoi_ack_irq); /** - * handle_fasteoi_mask_irq - irq handler for level hierarchy - * stacked on transparent controllers + * handle_fasteoi_mask_irq - irq handler for level hierarchy stacked on + * transparent controllers * - * @desc: the interrupt description structure for this irq + * @desc: the interrupt description structure for this irq * - * Like handle_fasteoi_irq(), but for use with hierarchy where - * the irq_chip also needs to have its ->irq_mask_ack() function - * called. + * Like handle_fasteoi_irq(), but for use with hierarchy where the irq_chip + * also needs to have its ->irq_mask_ack() function called. */ void handle_fasteoi_mask_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; - raw_spin_lock(&desc->lock); + guard(raw_spinlock)(&desc->lock); mask_ack_irq(desc); - if (!irq_may_run(desc)) - goto out; - - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - - /* - * If its disabled or no action available - * then mask it and get out of here: - */ - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { - desc->istate |= IRQS_PENDING; - mask_irq(desc); - goto out; + if (!irq_can_handle(desc)) { + cond_eoi_irq(chip, &desc->irq_data); + return; } kstat_incr_irqs_this_cpu(desc); - if (desc->istate & IRQS_ONESHOT) - mask_irq(desc); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); - - raw_spin_unlock(&desc->lock); - return; -out: - if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) - chip->irq_eoi(&desc->irq_data); - raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq); diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 15a7654eff68..755346ea9819 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -177,9 +177,8 @@ void irq_migrate_all_off_this_cpu(void) bool affinity_broken; desc = irq_to_desc(irq); - raw_spin_lock(&desc->lock); - affinity_broken = migrate_one_irq(desc); - raw_spin_unlock(&desc->lock); + scoped_guard(raw_spinlock, &desc->lock) + affinity_broken = migrate_one_irq(desc); if (affinity_broken) { pr_debug_ratelimited("IRQ %u: no longer affine to CPU%u\n", @@ -211,15 +210,8 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu) !irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity)) return; - /* - * Don't restore suspended interrupts here when a system comes back - * from S3. They are reenabled via resume_device_irqs(). - */ - if (desc->istate & IRQS_SUSPENDED) - return; - if (irqd_is_managed_and_shutdown(data)) - irq_startup(desc, IRQ_RESEND, IRQ_START_COND); + irq_startup_managed(desc); /* * If the interrupt can only be directed to a single target @@ -244,9 +236,8 @@ int irq_affinity_online_cpu(unsigned int cpu) irq_lock_sparse(); for_each_active_irq(irq) { desc = irq_to_desc(irq); - raw_spin_lock_irq(&desc->lock); - irq_restore_affinity_of_irq(desc, cpu); - raw_spin_unlock_irq(&desc->lock); + scoped_guard(raw_spinlock_irq, &desc->lock) + irq_restore_affinity_of_irq(desc, cpu); } irq_unlock_sparse(); diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index ca142b9a4db3..3527defd2890 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -160,7 +160,7 @@ static int irq_debug_show(struct seq_file *m, void *p) struct irq_desc *desc = m->private; struct irq_data *data; - raw_spin_lock_irq(&desc->lock); + guard(raw_spinlock_irq)(&desc->lock); data = irq_desc_get_irq_data(desc); seq_printf(m, "handler: %ps\n", desc->handle_irq); seq_printf(m, "device: %s\n", desc->dev_name); @@ -178,7 +178,6 @@ static int irq_debug_show(struct seq_file *m, void *p) seq_printf(m, "node: %d\n", irq_data_get_node(data)); irq_debug_show_masks(m, desc); irq_debug_show_data(m, data, 0); - raw_spin_unlock_irq(&desc->lock); return 0; } @@ -226,12 +225,12 @@ void irq_debugfs_copy_devname(int irq, struct device *dev) void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc) { - char name [10]; + char name [12]; if (!irq_dir || !desc || desc->debugfs_file) return; - sprintf(name, "%d", irq); + sprintf(name, "%u", irq); desc->debugfs_file = debugfs_create_file(name, 0644, irq_dir, desc, &dfs_irq_ops); } diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index c4a8bca5f2b0..bf59e37d650a 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -40,10 +40,9 @@ void irq_gc_mask_disable_reg(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); irq_reg_writel(gc, mask, ct->regs.disable); *ct->mask_cache &= ~mask; - irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_mask_disable_reg); @@ -60,10 +59,9 @@ void irq_gc_mask_set_bit(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); *ct->mask_cache |= mask; irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask); - irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit); @@ -80,10 +78,9 @@ void irq_gc_mask_clr_bit(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); *ct->mask_cache &= ~mask; irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask); - irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit); @@ -100,10 +97,9 @@ void irq_gc_unmask_enable_reg(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); irq_reg_writel(gc, mask, ct->regs.enable); *ct->mask_cache |= mask; - irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_unmask_enable_reg); @@ -117,9 +113,8 @@ void irq_gc_ack_set_bit(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); irq_reg_writel(gc, mask, ct->regs.ack); - irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit); @@ -133,9 +128,8 @@ void irq_gc_ack_clr_bit(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = ~d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); irq_reg_writel(gc, mask, ct->regs.ack); - irq_gc_unlock(gc); } /** @@ -156,11 +150,10 @@ void irq_gc_mask_disable_and_ack_set(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); irq_reg_writel(gc, mask, ct->regs.disable); *ct->mask_cache &= ~mask; irq_reg_writel(gc, mask, ct->regs.ack); - irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_mask_disable_and_ack_set); @@ -174,9 +167,8 @@ void irq_gc_eoi(struct irq_data *d) struct irq_chip_type *ct = irq_data_get_chip_type(d); u32 mask = d->mask; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); irq_reg_writel(gc, mask, ct->regs.eoi); - irq_gc_unlock(gc); } /** @@ -196,12 +188,11 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on) if (!(mask & gc->wake_enabled)) return -EINVAL; - irq_gc_lock(gc); + guard(raw_spinlock)(&gc->lock); if (on) gc->wake_active |= mask; else gc->wake_active &= ~mask; - irq_gc_unlock(gc); return 0; } EXPORT_SYMBOL_GPL(irq_gc_set_wake); @@ -288,7 +279,6 @@ int irq_domain_alloc_generic_chips(struct irq_domain *d, { struct irq_domain_chip_generic *dgc; struct irq_chip_generic *gc; - unsigned long flags; int numchips, i; size_t dgc_sz; size_t gc_sz; @@ -340,9 +330,8 @@ int irq_domain_alloc_generic_chips(struct irq_domain *d, goto err; } - raw_spin_lock_irqsave(&gc_lock, flags); - list_add_tail(&gc->list, &gc_list); - raw_spin_unlock_irqrestore(&gc_lock, flags); + scoped_guard (raw_spinlock_irqsave, &gc_lock) + list_add_tail(&gc->list, &gc_list); /* Calc pointer to the next generic chip */ tmp += gc_sz; } @@ -459,7 +448,6 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, struct irq_chip_generic *gc; struct irq_chip_type *ct; struct irq_chip *chip; - unsigned long flags; int idx; gc = __irq_get_domain_generic_chip(d, hw_irq); @@ -479,9 +467,8 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, /* We only init the cache for the first mapping of a generic chip */ if (!gc->installed) { - raw_spin_lock_irqsave(&gc->lock, flags); + guard(raw_spinlock_irqsave)(&gc->lock); irq_gc_init_mask_cache(gc, dgc->gc_flags); - raw_spin_unlock_irqrestore(&gc->lock, flags); } /* Mark the interrupt as installed */ @@ -548,9 +535,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, struct irq_chip *chip = &ct->chip; unsigned int i; - raw_spin_lock(&gc_lock); - list_add_tail(&gc->list, &gc_list); - raw_spin_unlock(&gc_lock); + scoped_guard (raw_spinlock, &gc_lock) + list_add_tail(&gc->list, &gc_list); irq_gc_init_mask_cache(gc, flags); @@ -616,9 +602,8 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, { unsigned int i, virq; - raw_spin_lock(&gc_lock); - list_del(&gc->list); - raw_spin_unlock(&gc_lock); + scoped_guard (raw_spinlock, &gc_lock) + list_del(&gc->list); for (i = 0; msk; msk >>= 1, i++) { if (!(msk & 0x01)) diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a979523640d0..aebfe225c9a6 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -87,10 +87,10 @@ extern void __enable_irq(struct irq_desc *desc); extern int irq_activate(struct irq_desc *desc); extern int irq_activate_and_startup(struct irq_desc *desc, bool resend); extern int irq_startup(struct irq_desc *desc, bool resend, bool force); +extern void irq_startup_managed(struct irq_desc *desc); extern void irq_shutdown(struct irq_desc *desc); extern void irq_shutdown_and_deactivate(struct irq_desc *desc); -extern void irq_enable(struct irq_desc *desc); extern void irq_disable(struct irq_desc *desc); extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu); extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu); @@ -98,18 +98,12 @@ extern void mask_irq(struct irq_desc *desc); extern void unmask_irq(struct irq_desc *desc); extern void unmask_threaded_irq(struct irq_desc *desc); -extern unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask); - #ifdef CONFIG_SPARSE_IRQ static inline void irq_mark_irq(unsigned int irq) { } #else extern void irq_mark_irq(unsigned int irq); #endif -extern int __irq_get_irqchip_state(struct irq_data *data, - enum irqchip_irq_state which, - bool *state); - irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event(struct irq_desc *desc); @@ -139,8 +133,6 @@ static inline void unregister_handler_proc(unsigned int irq, extern bool irq_can_set_affinity_usr(unsigned int irq); -extern void irq_set_thread_affinity(struct irq_desc *desc); - extern int irq_do_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force); @@ -150,6 +142,10 @@ extern int irq_setup_affinity(struct irq_desc *desc); static inline int irq_setup_affinity(struct irq_desc *desc) { return 0; } #endif + +#define for_each_action_of_desc(desc, act) \ + for (act = desc->action; act; act = act->next) + /* Inline functions for support of irq chips on slow busses */ static inline void chip_bus_lock(struct irq_desc *desc) { @@ -169,38 +165,33 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) #define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK) #define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU) -#define for_each_action_of_desc(desc, act) \ - for (act = desc->action; act; act = act->next) - -struct irq_desc * -__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, - unsigned int check); +struct irq_desc *__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, + unsigned int check); void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); -static inline struct irq_desc * -irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check) -{ - return __irq_get_desc_lock(irq, flags, true, check); -} +__DEFINE_CLASS_IS_CONDITIONAL(irqdesc_lock, true); +__DEFINE_UNLOCK_GUARD(irqdesc_lock, struct irq_desc, + __irq_put_desc_unlock(_T->lock, _T->flags, _T->bus), + unsigned long flags; bool bus); -static inline void -irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags) +static inline class_irqdesc_lock_t class_irqdesc_lock_constructor(unsigned int irq, bool bus, + unsigned int check) { - __irq_put_desc_unlock(desc, flags, true); -} + class_irqdesc_lock_t _t = { .bus = bus, }; -static inline struct irq_desc * -irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check) -{ - return __irq_get_desc_lock(irq, flags, false, check); -} + _t.lock = __irq_get_desc_lock(irq, &_t.flags, bus, check); -static inline void -irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) -{ - __irq_put_desc_unlock(desc, flags, false); + return _t; } +#define scoped_irqdesc_get_and_lock(_irq, _check) \ + scoped_guard(irqdesc_lock, _irq, false, _check) + +#define scoped_irqdesc_get_and_buslock(_irq, _check) \ + scoped_guard(irqdesc_lock, _irq, true, _check) + +#define scoped_irqdesc ((struct irq_desc *)(__guard_ptr(irqdesc_lock)(&scope))) + #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) static inline unsigned int irqd_get(struct irq_data *d) @@ -442,6 +433,7 @@ static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc) return desc->pending_mask; } bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear); +void irq_force_complete_move(struct irq_desc *desc); #else /* CONFIG_GENERIC_PENDING_IRQ */ static inline bool irq_can_move_pcntxt(struct irq_data *data) { @@ -467,6 +459,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) { return false; } +static inline void irq_force_complete_move(struct irq_desc *desc) { } #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 1a3d483548e2..ae4c9cbd1b4b 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -202,7 +202,7 @@ struct irq_domain *irq_domain_create_sim_full(struct fwnode_handle *fwnode, void *data) { struct irq_sim_work_ctx *work_ctx __free(kfree) = - kmalloc(sizeof(*work_ctx), GFP_KERNEL); + kzalloc(sizeof(*work_ctx), GFP_KERNEL); if (!work_ctx) return ERR_PTR(-ENOMEM); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 287830739783..b64c57b44c20 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -246,8 +246,7 @@ static struct kobject *irq_kobj_base; #define IRQ_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) -static ssize_t per_cpu_count_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t per_cpu_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); ssize_t ret = 0; @@ -257,112 +256,83 @@ static ssize_t per_cpu_count_show(struct kobject *kobj, for_each_possible_cpu(cpu) { unsigned int c = irq_desc_kstat_cpu(desc, cpu); - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c); + ret += sysfs_emit_at(buf, ret, "%s%u", p, c); p = ","; } - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + ret += sysfs_emit_at(buf, ret, "\n"); return ret; } IRQ_ATTR_RO(per_cpu_count); -static ssize_t chip_name_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t chip_name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); - ssize_t ret = 0; - - raw_spin_lock_irq(&desc->lock); - if (desc->irq_data.chip && desc->irq_data.chip->name) { - ret = scnprintf(buf, PAGE_SIZE, "%s\n", - desc->irq_data.chip->name); - } - raw_spin_unlock_irq(&desc->lock); - return ret; + guard(raw_spinlock_irq)(&desc->lock); + if (desc->irq_data.chip && desc->irq_data.chip->name) + return sysfs_emit(buf, "%s\n", desc->irq_data.chip->name); + return 0; } IRQ_ATTR_RO(chip_name); -static ssize_t hwirq_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t hwirq_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); - ssize_t ret = 0; - raw_spin_lock_irq(&desc->lock); + guard(raw_spinlock_irq)(&desc->lock); if (desc->irq_data.domain) - ret = sprintf(buf, "%lu\n", desc->irq_data.hwirq); - raw_spin_unlock_irq(&desc->lock); - - return ret; + return sysfs_emit(buf, "%lu\n", desc->irq_data.hwirq); + return 0; } IRQ_ATTR_RO(hwirq); -static ssize_t type_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); - ssize_t ret = 0; - raw_spin_lock_irq(&desc->lock); - ret = sprintf(buf, "%s\n", - irqd_is_level_type(&desc->irq_data) ? "level" : "edge"); - raw_spin_unlock_irq(&desc->lock); - - return ret; + guard(raw_spinlock_irq)(&desc->lock); + return sysfs_emit(buf, "%s\n", irqd_is_level_type(&desc->irq_data) ? "level" : "edge"); } IRQ_ATTR_RO(type); -static ssize_t wakeup_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t wakeup_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); - ssize_t ret = 0; - - raw_spin_lock_irq(&desc->lock); - ret = sprintf(buf, "%s\n", str_enabled_disabled(irqd_is_wakeup_set(&desc->irq_data))); - raw_spin_unlock_irq(&desc->lock); - - return ret; + guard(raw_spinlock_irq)(&desc->lock); + return sysfs_emit(buf, "%s\n", str_enabled_disabled(irqd_is_wakeup_set(&desc->irq_data))); } IRQ_ATTR_RO(wakeup); -static ssize_t name_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); - ssize_t ret = 0; - raw_spin_lock_irq(&desc->lock); + guard(raw_spinlock_irq)(&desc->lock); if (desc->name) - ret = scnprintf(buf, PAGE_SIZE, "%s\n", desc->name); - raw_spin_unlock_irq(&desc->lock); - - return ret; + return sysfs_emit(buf, "%s\n", desc->name); + return 0; } IRQ_ATTR_RO(name); -static ssize_t actions_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t actions_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); struct irqaction *action; ssize_t ret = 0; char *p = ""; - raw_spin_lock_irq(&desc->lock); - for_each_action_of_desc(desc, action) { - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", - p, action->name); - p = ","; + scoped_guard(raw_spinlock_irq, &desc->lock) { + for_each_action_of_desc(desc, action) { + ret += sysfs_emit_at(buf, ret, "%s%s", p, action->name); + p = ","; + } } - raw_spin_unlock_irq(&desc->lock); if (ret) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); - + ret += sysfs_emit_at(buf, ret, "\n"); return ret; } IRQ_ATTR_RO(actions); @@ -418,19 +388,14 @@ static int __init irq_sysfs_init(void) int irq; /* Prevent concurrent irq alloc/free */ - irq_lock_sparse(); - + guard(mutex)(&sparse_irq_lock); irq_kobj_base = kobject_create_and_add("irq", kernel_kobj); - if (!irq_kobj_base) { - irq_unlock_sparse(); + if (!irq_kobj_base) return -ENOMEM; - } /* Add the already allocated interrupts */ for_each_irq_desc(irq, desc) irq_sysfs_add(irq, desc); - irq_unlock_sparse(); - return 0; } postcore_initcall(irq_sysfs_init); @@ -573,12 +538,12 @@ err: return -ENOMEM; } -static int irq_expand_nr_irqs(unsigned int nr) +static bool irq_expand_nr_irqs(unsigned int nr) { if (nr > MAX_SPARSE_IRQS) - return -ENOMEM; + return false; nr_irqs = nr; - return 0; + return true; } int __init early_irq_init(void) @@ -656,11 +621,9 @@ EXPORT_SYMBOL(irq_to_desc); static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - raw_spin_lock_irqsave(&desc->lock, flags); - desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL); - raw_spin_unlock_irqrestore(&desc->lock, flags); + scoped_guard(raw_spinlock_irqsave, &desc->lock) + desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL); delete_irq_desc(irq); } @@ -679,16 +642,15 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, return start; } -static int irq_expand_nr_irqs(unsigned int nr) +static inline bool irq_expand_nr_irqs(unsigned int nr) { - return -ENOMEM; + return false; } void irq_mark_irq(unsigned int irq) { - mutex_lock(&sparse_irq_lock); + guard(mutex)(&sparse_irq_lock); irq_insert_desc(irq, irq_desc + irq); - mutex_unlock(&sparse_irq_lock); } #ifdef CONFIG_GENERIC_IRQ_LEGACY @@ -827,11 +789,9 @@ void irq_free_descs(unsigned int from, unsigned int cnt) if (from >= nr_irqs || (from + cnt) > nr_irqs) return; - mutex_lock(&sparse_irq_lock); + guard(mutex)(&sparse_irq_lock); for (i = 0; i < cnt; i++) free_desc(from + i); - - mutex_unlock(&sparse_irq_lock); } EXPORT_SYMBOL_GPL(irq_free_descs); @@ -848,11 +808,10 @@ EXPORT_SYMBOL_GPL(irq_free_descs); * * Returns the first irq number or error code */ -int __ref -__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, - struct module *owner, const struct irq_affinity_desc *affinity) +int __ref __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, + struct module *owner, const struct irq_affinity_desc *affinity) { - int start, ret; + int start; if (!cnt) return -EINVAL; @@ -870,22 +829,17 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, from = arch_dynirq_lower_bound(from); } - mutex_lock(&sparse_irq_lock); + guard(mutex)(&sparse_irq_lock); start = irq_find_free_area(from, cnt); - ret = -EEXIST; if (irq >=0 && start != irq) - goto unlock; + return -EEXIST; if (start + cnt > nr_irqs) { - ret = irq_expand_nr_irqs(start + cnt); - if (ret) - goto unlock; + if (!irq_expand_nr_irqs(start + cnt)) + return -ENOMEM; } - ret = alloc_descs(start, cnt, node, affinity, owner); -unlock: - mutex_unlock(&sparse_irq_lock); - return ret; + return alloc_descs(start, cnt, node, affinity, owner); } EXPORT_SYMBOL_GPL(__irq_alloc_descs); @@ -900,27 +854,27 @@ unsigned int irq_get_next_irq(unsigned int offset) return irq_find_at_or_after(offset); } -struct irq_desc * -__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, - unsigned int check) +struct irq_desc *__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, + unsigned int check) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; - if (desc) { - if (check & _IRQ_DESC_CHECK) { - if ((check & _IRQ_DESC_PERCPU) && - !irq_settings_is_per_cpu_devid(desc)) - return NULL; - - if (!(check & _IRQ_DESC_PERCPU) && - irq_settings_is_per_cpu_devid(desc)) - return NULL; - } + desc = irq_to_desc(irq); + if (!desc) + return NULL; + + if (check & _IRQ_DESC_CHECK) { + if ((check & _IRQ_DESC_PERCPU) && !irq_settings_is_per_cpu_devid(desc)) + return NULL; - if (bus) - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, *flags); + if (!(check & _IRQ_DESC_PERCPU) && irq_settings_is_per_cpu_devid(desc)) + return NULL; } + + if (bus) + chip_bus_lock(desc); + raw_spin_lock_irqsave(&desc->lock, *flags); + return desc; } @@ -991,7 +945,7 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) return desc && desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0; } -unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask) +static unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask) { unsigned int sum = 0; int cpu; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index ec6d8e72d980..c8b6de09047b 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -398,7 +398,7 @@ void irq_domain_remove(struct irq_domain *domain) * If the going away domain is the default one, reset it. */ if (unlikely(irq_default_domain == domain)) - irq_set_default_host(NULL); + irq_set_default_domain(NULL); mutex_unlock(&irq_domain_mutex); @@ -480,33 +480,6 @@ struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode, } EXPORT_SYMBOL_GPL(irq_domain_create_simple); -/** - * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. - * @of_node: pointer to interrupt controller's device tree node. - * @size: total number of irqs in legacy mapping - * @first_irq: first number of irq block assigned to the domain - * @first_hwirq: first hwirq number to use for the translation. Should normally - * be '0', but a positive integer can be used if the effective - * hwirqs numbering does not begin at zero. - * @ops: map/unmap domain callbacks - * @host_data: Controller private data pointer - * - * Note: the map() callback will be called before this function returns - * for all legacy interrupts except 0 (which is always the invalid irq for - * a legacy controller). - */ -struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, - unsigned int size, - unsigned int first_irq, - irq_hw_number_t first_hwirq, - const struct irq_domain_ops *ops, - void *host_data) -{ - return irq_domain_create_legacy(of_node_to_fwnode(of_node), size, - first_irq, first_hwirq, ops, host_data); -} -EXPORT_SYMBOL_GPL(irq_domain_add_legacy); - struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, unsigned int size, unsigned int first_irq, @@ -573,7 +546,7 @@ struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, EXPORT_SYMBOL_GPL(irq_find_matching_fwspec); /** - * irq_set_default_host() - Set a "default" irq domain + * irq_set_default_domain() - Set a "default" irq domain * @domain: default domain pointer * * For convenience, it's possible to set a "default" domain that will be used @@ -581,16 +554,16 @@ EXPORT_SYMBOL_GPL(irq_find_matching_fwspec); * platforms that want to manipulate a few hard coded interrupt numbers that * aren't properly represented in the device-tree. */ -void irq_set_default_host(struct irq_domain *domain) +void irq_set_default_domain(struct irq_domain *domain) { pr_debug("Default domain set to @0x%p\n", domain); irq_default_domain = domain; } -EXPORT_SYMBOL_GPL(irq_set_default_host); +EXPORT_SYMBOL_GPL(irq_set_default_domain); /** - * irq_get_default_host() - Retrieve the "default" irq domain + * irq_get_default_domain() - Retrieve the "default" irq domain * * Returns: the default domain, if any. * @@ -598,11 +571,11 @@ EXPORT_SYMBOL_GPL(irq_set_default_host); * systems that cannot implement a firmware->fwnode mapping (which * both DT and ACPI provide). */ -struct irq_domain *irq_get_default_host(void) +struct irq_domain *irq_get_default_domain(void) { return irq_default_domain; } -EXPORT_SYMBOL_GPL(irq_get_default_host); +EXPORT_SYMBOL_GPL(irq_get_default_domain); static bool irq_domain_is_nomap(struct irq_domain *domain) { @@ -885,7 +858,7 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, { int i; - fwspec->fwnode = of_node_to_fwnode(np); + fwspec->fwnode = of_fwnode_handle(np); fwspec->param_count = count; for (i = 0; i < count; i++) @@ -1133,6 +1106,31 @@ int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell); /** + * irq_domain_xlate_twothreecell() - Generic xlate for direct two or three cell bindings + * @d: Interrupt domain involved in the translation + * @ctrlr: The device tree node for the device whose interrupt is translated + * @intspec: The interrupt specifier data from the device tree + * @intsize: The number of entries in @intspec + * @out_hwirq: Pointer to storage for the hardware interrupt number + * @out_type: Pointer to storage for the interrupt type + * + * Device Tree interrupt specifier translation function for two or three + * cell bindings, where the cell values map directly to the hardware + * interrupt number and the type specifier. + */ +int irq_domain_xlate_twothreecell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_type) +{ + struct irq_fwspec fwspec; + + of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec); + + return irq_domain_translate_twothreecell(d, &fwspec, out_hwirq, out_type); +} +EXPORT_SYMBOL_GPL(irq_domain_xlate_twothreecell); + +/** * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings * @d: Interrupt domain involved in the translation * @ctrlr: The device tree node for the device whose interrupt is translated @@ -1216,6 +1214,37 @@ int irq_domain_translate_twocell(struct irq_domain *d, } EXPORT_SYMBOL_GPL(irq_domain_translate_twocell); +/** + * irq_domain_translate_twothreecell() - Generic translate for direct two or three cell + * bindings + * @d: Interrupt domain involved in the translation + * @fwspec: The firmware interrupt specifier to translate + * @out_hwirq: Pointer to storage for the hardware interrupt number + * @out_type: Pointer to storage for the interrupt type + * + * Firmware interrupt specifier translation function for two or three cell + * specifications, where the parameter values map directly to the hardware + * interrupt number and the type specifier. + */ +int irq_domain_translate_twothreecell(struct irq_domain *d, struct irq_fwspec *fwspec, + unsigned long *out_hwirq, unsigned int *out_type) +{ + if (fwspec->param_count == 2) { + *out_hwirq = fwspec->param[0]; + *out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK; + return 0; + } + + if (fwspec->param_count == 3) { + *out_hwirq = fwspec->param[1]; + *out_type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK; + return 0; + } + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(irq_domain_translate_twothreecell); + int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, int node, const struct irq_affinity_desc *affinity) { @@ -1252,47 +1281,6 @@ void irq_domain_reset_irq_data(struct irq_data *irq_data) EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY -/** - * irq_domain_create_hierarchy - Add a irqdomain into the hierarchy - * @parent: Parent irq domain to associate with the new domain - * @flags: Irq domain flags associated to the domain - * @size: Size of the domain. See below - * @fwnode: Optional fwnode of the interrupt controller - * @ops: Pointer to the interrupt domain callbacks - * @host_data: Controller private data pointer - * - * If @size is 0 a tree domain is created, otherwise a linear domain. - * - * If successful the parent is associated to the new domain and the - * domain flags are set. - * Returns pointer to IRQ domain, or NULL on failure. - */ -struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, - unsigned int flags, - unsigned int size, - struct fwnode_handle *fwnode, - const struct irq_domain_ops *ops, - void *host_data) -{ - struct irq_domain_info info = { - .fwnode = fwnode, - .size = size, - .hwirq_max = size, - .ops = ops, - .host_data = host_data, - .domain_flags = flags, - .parent = parent, - }; - struct irq_domain *d; - - if (!info.size) - info.hwirq_max = ~0U; - - d = irq_domain_instantiate(&info); - return IS_ERR(d) ? NULL : d; -} -EXPORT_SYMBOL_GPL(irq_domain_create_hierarchy); - static void irq_domain_insert_irq(int virq) { struct irq_data *data; @@ -1589,9 +1577,8 @@ static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain, } } -int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, - unsigned int irq_base, - unsigned int nr_irqs, void *arg) +static int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, + unsigned int nr_irqs, void *arg) { if (!domain->ops->alloc) { pr_debug("domain->ops->alloc() is NULL\n"); @@ -2009,7 +1996,7 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain) domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY; } #else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ -/** +/* * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain * @domain: domain to match * @virq: IRQ number to get irq_data @@ -2023,7 +2010,7 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, } EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); -/** +/* * irq_domain_set_info - Set the complete data for a @virq in @domain * @domain: Interrupt domain to match * @virq: IRQ number diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f300bb6be3bd..c94837382037 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -35,14 +35,14 @@ static int __init setup_forced_irqthreads(char *arg) early_param("threadirqs", setup_forced_irqthreads); #endif +static int __irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *state); + static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) { struct irq_data *irqd = irq_desc_get_irq_data(desc); bool inprogress; do { - unsigned long flags; - /* * Wait until we're out of the critical section. This might * give the wrong answer due to the lack of memory barriers. @@ -51,7 +51,7 @@ static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ - raw_spin_lock_irqsave(&desc->lock, flags); + guard(raw_spinlock_irqsave)(&desc->lock); inprogress = irqd_irq_inprogress(&desc->irq_data); /* @@ -67,33 +67,30 @@ static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) __irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE, &inprogress); } - raw_spin_unlock_irqrestore(&desc->lock, flags); - /* Oops, that failed? */ } while (inprogress); } /** - * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs) - * @irq: interrupt number to wait for + * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs) + * @irq: interrupt number to wait for * - * This function waits for any pending hard IRQ handlers for this - * interrupt to complete before returning. If you use this - * function while holding a resource the IRQ handler may need you - * will deadlock. It does not take associated threaded handlers - * into account. + * This function waits for any pending hard IRQ handlers for this interrupt + * to complete before returning. If you use this function while holding a + * resource the IRQ handler may need you will deadlock. It does not take + * associated threaded handlers into account. * - * Do not use this for shutdown scenarios where you must be sure - * that all parts (hardirq and threaded handler) have completed. + * Do not use this for shutdown scenarios where you must be sure that all + * parts (hardirq and threaded handler) have completed. * - * Returns: false if a threaded handler is active. + * Returns: false if a threaded handler is active. * - * This function may be called - with care - from IRQ context. + * This function may be called - with care - from IRQ context. * - * It does not check whether there is an interrupt in flight at the - * hardware level, but not serviced yet, as this might deadlock when - * called with interrupts disabled and the target CPU of the interrupt - * is the current CPU. + * It does not check whether there is an interrupt in flight at the + * hardware level, but not serviced yet, as this might deadlock when called + * with interrupts disabled and the target CPU of the interrupt is the + * current CPU. */ bool synchronize_hardirq(unsigned int irq) { @@ -119,19 +116,19 @@ static void __synchronize_irq(struct irq_desc *desc) } /** - * synchronize_irq - wait for pending IRQ handlers (on other CPUs) - * @irq: interrupt number to wait for + * synchronize_irq - wait for pending IRQ handlers (on other CPUs) + * @irq: interrupt number to wait for * - * This function waits for any pending IRQ handlers for this interrupt - * to complete before returning. If you use this function while - * holding a resource the IRQ handler may need you will deadlock. + * This function waits for any pending IRQ handlers for this interrupt to + * complete before returning. If you use this function while holding a + * resource the IRQ handler may need you will deadlock. * - * Can only be called from preemptible code as it might sleep when - * an interrupt thread is associated to @irq. + * Can only be called from preemptible code as it might sleep when + * an interrupt thread is associated to @irq. * - * It optionally makes sure (when the irq chip supports that method) - * that the interrupt is not pending in any CPU and waiting for - * service. + * It optionally makes sure (when the irq chip supports that method) + * that the interrupt is not pending in any CPU and waiting for + * service. */ void synchronize_irq(unsigned int irq) { @@ -154,8 +151,8 @@ static bool __irq_can_set_affinity(struct irq_desc *desc) } /** - * irq_can_set_affinity - Check if the affinity of a given irq can be set - * @irq: Interrupt to check + * irq_can_set_affinity - Check if the affinity of a given irq can be set + * @irq: Interrupt to check * */ int irq_can_set_affinity(unsigned int irq) @@ -179,15 +176,15 @@ bool irq_can_set_affinity_usr(unsigned int irq) } /** - * irq_set_thread_affinity - Notify irq threads to adjust affinity - * @desc: irq descriptor which has affinity changed + * irq_set_thread_affinity - Notify irq threads to adjust affinity + * @desc: irq descriptor which has affinity changed * - * We just set IRQTF_AFFINITY and delegate the affinity setting - * to the interrupt thread itself. We can not call - * set_cpus_allowed_ptr() here as we hold desc->lock and this - * code can be called from hard interrupt context. + * Just set IRQTF_AFFINITY and delegate the affinity setting to the + * interrupt thread itself. We can not call set_cpus_allowed_ptr() here as + * we hold desc->lock and this code can be called from hard interrupt + * context. */ -void irq_set_thread_affinity(struct irq_desc *desc) +static void irq_set_thread_affinity(struct irq_desc *desc) { struct irqaction *action; @@ -398,14 +395,8 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, * an interrupt which is already started or which has already been configured * as managed will also fail, as these mean invalid init state or double init. */ -int irq_update_affinity_desc(unsigned int irq, - struct irq_affinity_desc *affinity) +int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity) { - struct irq_desc *desc; - unsigned long flags; - bool activated; - int ret = 0; - /* * Supporting this with the reservation scheme used by x86 needs * some more thought. Fail it for now. @@ -413,60 +404,50 @@ int irq_update_affinity_desc(unsigned int irq, if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE)) return -EOPNOTSUPP; - desc = irq_get_desc_buslock(irq, &flags, 0); - if (!desc) - return -EINVAL; + scoped_irqdesc_get_and_buslock(irq, 0) { + struct irq_desc *desc = scoped_irqdesc; + bool activated; - /* Requires the interrupt to be shut down */ - if (irqd_is_started(&desc->irq_data)) { - ret = -EBUSY; - goto out_unlock; - } + /* Requires the interrupt to be shut down */ + if (irqd_is_started(&desc->irq_data)) + return -EBUSY; - /* Interrupts which are already managed cannot be modified */ - if (irqd_affinity_is_managed(&desc->irq_data)) { - ret = -EBUSY; - goto out_unlock; - } - - /* - * Deactivate the interrupt. That's required to undo - * anything an earlier activation has established. - */ - activated = irqd_is_activated(&desc->irq_data); - if (activated) - irq_domain_deactivate_irq(&desc->irq_data); - - if (affinity->is_managed) { - irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED); - irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN); - } + /* Interrupts which are already managed cannot be modified */ + if (irqd_affinity_is_managed(&desc->irq_data)) + return -EBUSY; + /* + * Deactivate the interrupt. That's required to undo + * anything an earlier activation has established. + */ + activated = irqd_is_activated(&desc->irq_data); + if (activated) + irq_domain_deactivate_irq(&desc->irq_data); - cpumask_copy(desc->irq_common_data.affinity, &affinity->mask); + if (affinity->is_managed) { + irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED); + irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN); + } - /* Restore the activation state */ - if (activated) - irq_domain_activate_irq(&desc->irq_data, false); + cpumask_copy(desc->irq_common_data.affinity, &affinity->mask); -out_unlock: - irq_put_desc_busunlock(desc, flags); - return ret; + /* Restore the activation state */ + if (activated) + irq_domain_activate_irq(&desc->irq_data, false); + return 0; + } + return -EINVAL; } static int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) { struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - int ret; if (!desc) return -EINVAL; - raw_spin_lock_irqsave(&desc->lock, flags); - ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force); - raw_spin_unlock_irqrestore(&desc->lock, flags); - return ret; + guard(raw_spinlock_irqsave)(&desc->lock); + return irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force); } /** @@ -499,39 +480,36 @@ int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) } EXPORT_SYMBOL_GPL(irq_force_affinity); -int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, - bool setaffinity) +int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, bool setaffinity) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); + int ret = -EINVAL; - if (!desc) - return -EINVAL; - desc->affinity_hint = m; - irq_put_desc_unlock(desc, flags); - if (m && setaffinity) + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + scoped_irqdesc->affinity_hint = m; + ret = 0; + } + + if (!ret && m && setaffinity) __irq_set_affinity(irq, m, false); - return 0; + return ret; } EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint); static void irq_affinity_notify(struct work_struct *work) { - struct irq_affinity_notify *notify = - container_of(work, struct irq_affinity_notify, work); + struct irq_affinity_notify *notify = container_of(work, struct irq_affinity_notify, work); struct irq_desc *desc = irq_to_desc(notify->irq); cpumask_var_t cpumask; - unsigned long flags; if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL)) goto out; - raw_spin_lock_irqsave(&desc->lock, flags); - if (irq_move_pending(&desc->irq_data)) - irq_get_pending(cpumask, desc); - else - cpumask_copy(cpumask, desc->irq_common_data.affinity); - raw_spin_unlock_irqrestore(&desc->lock, flags); + scoped_guard(raw_spinlock_irqsave, &desc->lock) { + if (irq_move_pending(&desc->irq_data)) + irq_get_pending(cpumask, desc); + else + cpumask_copy(cpumask, desc->irq_common_data.affinity); + } notify->notify(notify, cpumask); @@ -541,22 +519,20 @@ out: } /** - * irq_set_affinity_notifier - control notification of IRQ affinity changes - * @irq: Interrupt for which to enable/disable notification - * @notify: Context for notification, or %NULL to disable - * notification. Function pointers must be initialised; - * the other fields will be initialised by this function. - * - * Must be called in process context. Notification may only be enabled - * after the IRQ is allocated and must be disabled before the IRQ is - * freed using free_irq(). + * irq_set_affinity_notifier - control notification of IRQ affinity changes + * @irq: Interrupt for which to enable/disable notification + * @notify: Context for notification, or %NULL to disable + * notification. Function pointers must be initialised; + * the other fields will be initialised by this function. + * + * Must be called in process context. Notification may only be enabled + * after the IRQ is allocated and must be disabled before the IRQ is freed + * using free_irq(). */ -int -irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) +int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) { struct irq_desc *desc = irq_to_desc(irq); struct irq_affinity_notify *old_notify; - unsigned long flags; /* The release function is promised process context */ might_sleep(); @@ -571,10 +547,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) INIT_WORK(¬ify->work, irq_affinity_notify); } - raw_spin_lock_irqsave(&desc->lock, flags); - old_notify = desc->affinity_notify; - desc->affinity_notify = notify; - raw_spin_unlock_irqrestore(&desc->lock, flags); + scoped_guard(raw_spinlock_irqsave, &desc->lock) { + old_notify = desc->affinity_notify; + desc->affinity_notify = notify; + } if (old_notify) { if (cancel_work_sync(&old_notify->work)) { @@ -595,7 +571,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); int irq_setup_affinity(struct irq_desc *desc) { struct cpumask *set = irq_default_affinity; - int ret, node = irq_desc_get_node(desc); + int node = irq_desc_get_node(desc); + static DEFINE_RAW_SPINLOCK(mask_lock); static struct cpumask mask; @@ -603,7 +580,7 @@ int irq_setup_affinity(struct irq_desc *desc) if (!__irq_can_set_affinity(desc)) return 0; - raw_spin_lock(&mask_lock); + guard(raw_spinlock)(&mask_lock); /* * Preserve the managed affinity setting and a userspace affinity * setup, but make sure that one of the targets is online. @@ -628,9 +605,7 @@ int irq_setup_affinity(struct irq_desc *desc) if (cpumask_intersects(&mask, nodemask)) cpumask_and(&mask, &mask, nodemask); } - ret = irq_do_set_affinity(&desc->irq_data, &mask, false); - raw_spin_unlock(&mask_lock); - return ret; + return irq_do_set_affinity(&desc->irq_data, &mask, false); } #else /* Wrapper for ALPHA specific affinity selector magic */ @@ -643,44 +618,36 @@ int irq_setup_affinity(struct irq_desc *desc) /** - * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt - * @irq: interrupt number to set affinity - * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU - * specific data for percpu_devid interrupts - * - * This function uses the vCPU specific data to set the vCPU - * affinity for an irq. The vCPU specific data is passed from - * outside, such as KVM. One example code path is as below: - * KVM -> IOMMU -> irq_set_vcpu_affinity(). + * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt + * @irq: interrupt number to set affinity + * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU + * specific data for percpu_devid interrupts + * + * This function uses the vCPU specific data to set the vCPU affinity for + * an irq. The vCPU specific data is passed from outside, such as KVM. One + * example code path is as below: KVM -> IOMMU -> irq_set_vcpu_affinity(). */ int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - struct irq_data *data; - struct irq_chip *chip; - int ret = -ENOSYS; + scoped_irqdesc_get_and_lock(irq, 0) { + struct irq_desc *desc = scoped_irqdesc; + struct irq_data *data; + struct irq_chip *chip; - if (!desc) - return -EINVAL; - - data = irq_desc_get_irq_data(desc); - do { - chip = irq_data_get_irq_chip(data); - if (chip && chip->irq_set_vcpu_affinity) - break; -#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY - data = data->parent_data; -#else - data = NULL; -#endif - } while (data); + data = irq_desc_get_irq_data(desc); + do { + chip = irq_data_get_irq_chip(data); + if (chip && chip->irq_set_vcpu_affinity) + break; - if (data) - ret = chip->irq_set_vcpu_affinity(data, vcpu_info); - irq_put_desc_unlock(desc, flags); + data = irqd_get_parent_data(data); + } while (data); - return ret; + if (!data) + return -ENOSYS; + return chip->irq_set_vcpu_affinity(data, vcpu_info); + } + return -EINVAL; } EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity); @@ -692,26 +659,23 @@ void __disable_irq(struct irq_desc *desc) static int __disable_irq_nosync(unsigned int irq) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); - - if (!desc) - return -EINVAL; - __disable_irq(desc); - irq_put_desc_busunlock(desc, flags); - return 0; + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + __disable_irq(scoped_irqdesc); + return 0; + } + return -EINVAL; } /** - * disable_irq_nosync - disable an irq without waiting - * @irq: Interrupt to disable + * disable_irq_nosync - disable an irq without waiting + * @irq: Interrupt to disable * - * Disable the selected interrupt line. Disables and Enables are - * nested. - * Unlike disable_irq(), this function does not ensure existing - * instances of the IRQ handler have completed before returning. + * Disable the selected interrupt line. Disables and Enables are + * nested. + * Unlike disable_irq(), this function does not ensure existing + * instances of the IRQ handler have completed before returning. * - * This function may be called from IRQ context. + * This function may be called from IRQ context. */ void disable_irq_nosync(unsigned int irq) { @@ -720,17 +684,17 @@ void disable_irq_nosync(unsigned int irq) EXPORT_SYMBOL(disable_irq_nosync); /** - * disable_irq - disable an irq and wait for completion - * @irq: Interrupt to disable + * disable_irq - disable an irq and wait for completion + * @irq: Interrupt to disable * - * Disable the selected interrupt line. Enables and Disables are - * nested. - * This function waits for any pending IRQ handlers for this interrupt - * to complete before returning. If you use this function while - * holding a resource the IRQ handler may need you will deadlock. + * Disable the selected interrupt line. Enables and Disables are nested. * - * Can only be called from preemptible code as it might sleep when - * an interrupt thread is associated to @irq. + * This function waits for any pending IRQ handlers for this interrupt to + * complete before returning. If you use this function while holding a + * resource the IRQ handler may need you will deadlock. + * + * Can only be called from preemptible code as it might sleep when an + * interrupt thread is associated to @irq. * */ void disable_irq(unsigned int irq) @@ -742,40 +706,39 @@ void disable_irq(unsigned int irq) EXPORT_SYMBOL(disable_irq); /** - * disable_hardirq - disables an irq and waits for hardirq completion - * @irq: Interrupt to disable + * disable_hardirq - disables an irq and waits for hardirq completion + * @irq: Interrupt to disable + * + * Disable the selected interrupt line. Enables and Disables are nested. * - * Disable the selected interrupt line. Enables and Disables are - * nested. - * This function waits for any pending hard IRQ handlers for this - * interrupt to complete before returning. If you use this function while - * holding a resource the hard IRQ handler may need you will deadlock. + * This function waits for any pending hard IRQ handlers for this interrupt + * to complete before returning. If you use this function while holding a + * resource the hard IRQ handler may need you will deadlock. * - * When used to optimistically disable an interrupt from atomic context - * the return value must be checked. + * When used to optimistically disable an interrupt from atomic context the + * return value must be checked. * - * Returns: false if a threaded handler is active. + * Returns: false if a threaded handler is active. * - * This function may be called - with care - from IRQ context. + * This function may be called - with care - from IRQ context. */ bool disable_hardirq(unsigned int irq) { if (!__disable_irq_nosync(irq)) return synchronize_hardirq(irq); - return false; } EXPORT_SYMBOL_GPL(disable_hardirq); /** - * disable_nmi_nosync - disable an nmi without waiting - * @irq: Interrupt to disable - * - * Disable the selected interrupt line. Disables and enables are - * nested. - * The interrupt to disable must have been requested through request_nmi. - * Unlike disable_nmi(), this function does not ensure existing - * instances of the IRQ handler have completed before returning. + * disable_nmi_nosync - disable an nmi without waiting + * @irq: Interrupt to disable + * + * Disable the selected interrupt line. Disables and enables are nested. + * + * The interrupt to disable must have been requested through request_nmi. + * Unlike disable_nmi(), this function does not ensure existing + * instances of the IRQ handler have completed before returning. */ void disable_nmi_nosync(unsigned int irq) { @@ -815,41 +778,34 @@ void __enable_irq(struct irq_desc *desc) } /** - * enable_irq - enable handling of an irq - * @irq: Interrupt to enable + * enable_irq - enable handling of an irq + * @irq: Interrupt to enable * - * Undoes the effect of one call to disable_irq(). If this - * matches the last disable, processing of interrupts on this - * IRQ line is re-enabled. + * Undoes the effect of one call to disable_irq(). If this matches the + * last disable, processing of interrupts on this IRQ line is re-enabled. * - * This function may be called from IRQ context only when - * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! + * This function may be called from IRQ context only when + * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! */ void enable_irq(unsigned int irq) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + struct irq_desc *desc = scoped_irqdesc; - if (!desc) - return; - if (WARN(!desc->irq_data.chip, - KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) - goto out; - - __enable_irq(desc); -out: - irq_put_desc_busunlock(desc, flags); + if (WARN(!desc->irq_data.chip, "enable_irq before setup/request_irq: irq %u\n", irq)) + return; + __enable_irq(desc); + } } EXPORT_SYMBOL(enable_irq); /** - * enable_nmi - enable handling of an nmi - * @irq: Interrupt to enable + * enable_nmi - enable handling of an nmi + * @irq: Interrupt to enable * - * The interrupt to enable must have been requested through request_nmi. - * Undoes the effect of one call to disable_nmi(). If this - * matches the last disable, processing of interrupts on this - * IRQ line is re-enabled. + * The interrupt to enable must have been requested through request_nmi. + * Undoes the effect of one call to disable_nmi(). If this matches the last + * disable, processing of interrupts on this IRQ line is re-enabled. */ void enable_nmi(unsigned int irq) { @@ -871,65 +827,59 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) } /** - * irq_set_irq_wake - control irq power management wakeup - * @irq: interrupt to control - * @on: enable/disable power management wakeup - * - * Enable/disable power management wakeup mode, which is - * disabled by default. Enables and disables must match, - * just as they match for non-wakeup mode support. - * - * Wakeup mode lets this IRQ wake the system from sleep - * states like "suspend to RAM". - * - * Note: irq enable/disable state is completely orthogonal - * to the enable/disable state of irq wake. An irq can be - * disabled with disable_irq() and still wake the system as - * long as the irq has wake enabled. If this does not hold, - * then the underlying irq chip and the related driver need - * to be investigated. + * irq_set_irq_wake - control irq power management wakeup + * @irq: interrupt to control + * @on: enable/disable power management wakeup + * + * Enable/disable power management wakeup mode, which is disabled by + * default. Enables and disables must match, just as they match for + * non-wakeup mode support. + * + * Wakeup mode lets this IRQ wake the system from sleep states like + * "suspend to RAM". + * + * Note: irq enable/disable state is completely orthogonal to the + * enable/disable state of irq wake. An irq can be disabled with + * disable_irq() and still wake the system as long as the irq has wake + * enabled. If this does not hold, then the underlying irq chip and the + * related driver need to be investigated. */ int irq_set_irq_wake(unsigned int irq, unsigned int on) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); - int ret = 0; + scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + struct irq_desc *desc = scoped_irqdesc; + int ret = 0; - if (!desc) - return -EINVAL; - - /* Don't use NMIs as wake up interrupts please */ - if (irq_is_nmi(desc)) { - ret = -EINVAL; - goto out_unlock; - } + /* Don't use NMIs as wake up interrupts please */ + if (irq_is_nmi(desc)) + return -EINVAL; - /* wakeup-capable irqs can be shared between drivers that - * don't need to have the same sleep mode behaviors. - */ - if (on) { - if (desc->wake_depth++ == 0) { - ret = set_irq_wake_real(irq, on); - if (ret) - desc->wake_depth = 0; - else - irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE); - } - } else { - if (desc->wake_depth == 0) { - WARN(1, "Unbalanced IRQ %d wake disable\n", irq); - } else if (--desc->wake_depth == 0) { - ret = set_irq_wake_real(irq, on); - if (ret) - desc->wake_depth = 1; - else - irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); + /* + * wakeup-capable irqs can be shared between drivers that + * don't need to have the same sleep mode behaviors. + */ + if (on) { + if (desc->wake_depth++ == 0) { + ret = set_irq_wake_real(irq, on); + if (ret) + desc->wake_depth = 0; + else + irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE); + } + } else { + if (desc->wake_depth == 0) { + WARN(1, "Unbalanced IRQ %d wake disable\n", irq); + } else if (--desc->wake_depth == 0) { + ret = set_irq_wake_real(irq, on); + if (ret) + desc->wake_depth = 1; + else + irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); + } } + return ret; } - -out_unlock: - irq_put_desc_busunlock(desc, flags); - return ret; + return -EINVAL; } EXPORT_SYMBOL(irq_set_irq_wake); @@ -938,22 +888,17 @@ EXPORT_SYMBOL(irq_set_irq_wake); * particular irq has been exclusively allocated or is available * for driver use. */ -int can_request_irq(unsigned int irq, unsigned long irqflags) +bool can_request_irq(unsigned int irq, unsigned long irqflags) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - int canrequest = 0; - - if (!desc) - return 0; + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + struct irq_desc *desc = scoped_irqdesc; - if (irq_settings_can_request(desc)) { - if (!desc->action || - irqflags & desc->action->flags & IRQF_SHARED) - canrequest = 1; + if (irq_settings_can_request(desc)) { + if (!desc->action || irqflags & desc->action->flags & IRQF_SHARED) + return true; + } } - irq_put_desc_unlock(desc, flags); - return canrequest; + return false; } int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) @@ -1014,16 +959,11 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) #ifdef CONFIG_HARDIRQS_SW_RESEND int irq_set_parent(int irq, int parent_irq) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); - - if (!desc) - return -EINVAL; - - desc->parent_irq = parent_irq; - - irq_put_desc_unlock(desc, flags); - return 0; + scoped_irqdesc_get_and_lock(irq, 0) { + scoped_irqdesc->parent_irq = parent_irq; + return 0; + } + return -EINVAL; } EXPORT_SYMBOL_GPL(irq_set_parent); #endif @@ -1077,19 +1017,19 @@ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *a return; } - raw_spin_lock_irq(&desc->lock); - /* - * This code is triggered unconditionally. Check the affinity - * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. - */ - if (cpumask_available(desc->irq_common_data.affinity)) { - const struct cpumask *m; + scoped_guard(raw_spinlock_irq, &desc->lock) { + /* + * This code is triggered unconditionally. Check the affinity + * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. + */ + if (cpumask_available(desc->irq_common_data.affinity)) { + const struct cpumask *m; - m = irq_data_get_effective_affinity_mask(&desc->irq_data); - cpumask_copy(mask, m); - valid = true; + m = irq_data_get_effective_affinity_mask(&desc->irq_data); + cpumask_copy(mask, m); + valid = true; + } } - raw_spin_unlock_irq(&desc->lock); if (valid) set_cpus_allowed_ptr(current, mask); @@ -1257,9 +1197,8 @@ static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action) if (WARN_ON_ONCE(!secondary)) return; - raw_spin_lock_irq(&desc->lock); + guard(raw_spinlock_irq)(&desc->lock); __irq_wake_thread(desc, secondary); - raw_spin_unlock_irq(&desc->lock); } /* @@ -1332,21 +1271,19 @@ static int irq_thread(void *data) } /** - * irq_wake_thread - wake the irq thread for the action identified by dev_id - * @irq: Interrupt line - * @dev_id: Device identity for which the thread should be woken - * + * irq_wake_thread - wake the irq thread for the action identified by dev_id + * @irq: Interrupt line + * @dev_id: Device identity for which the thread should be woken */ void irq_wake_thread(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; - unsigned long flags; if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return; - raw_spin_lock_irqsave(&desc->lock, flags); + guard(raw_spinlock_irqsave)(&desc->lock); for_each_action_of_desc(desc, action) { if (action->dev_id == dev_id) { if (action->thread) @@ -1354,7 +1291,6 @@ void irq_wake_thread(unsigned int irq, void *dev_id) break; } } - raw_spin_unlock_irqrestore(&desc->lock, flags); } EXPORT_SYMBOL_GPL(irq_wake_thread); @@ -1985,9 +1921,8 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) * There is no interrupt on the fly anymore. Deactivate it * completely. */ - raw_spin_lock_irqsave(&desc->lock, flags); - irq_domain_deactivate_irq(&desc->irq_data); - raw_spin_unlock_irqrestore(&desc->lock, flags); + scoped_guard(raw_spinlock_irqsave, &desc->lock) + irq_domain_deactivate_irq(&desc->irq_data); irq_release_resources(desc); chip_bus_sync_unlock(desc); @@ -2003,20 +1938,19 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) } /** - * free_irq - free an interrupt allocated with request_irq - * @irq: Interrupt line to free - * @dev_id: Device identity to free + * free_irq - free an interrupt allocated with request_irq + * @irq: Interrupt line to free + * @dev_id: Device identity to free * - * Remove an interrupt handler. The handler is removed and if the - * interrupt line is no longer in use by any driver it is disabled. - * On a shared IRQ the caller must ensure the interrupt is disabled - * on the card it drives before calling this function. The function - * does not return until any executing interrupts for this IRQ - * have completed. + * Remove an interrupt handler. The handler is removed and if the interrupt + * line is no longer in use by any driver it is disabled. On a shared IRQ + * the caller must ensure the interrupt is disabled on the card it drives + * before calling this function. The function does not return until any + * executing interrupts for this IRQ have completed. * - * This function must not be called from interrupt context. + * This function must not be called from interrupt context. * - * Returns the devname argument passed to request_irq. + * Returns the devname argument passed to request_irq. */ const void *free_irq(unsigned int irq, void *dev_id) { @@ -2073,8 +2007,6 @@ static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc) const void *free_nmi(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - const void *devname; if (!desc || WARN_ON(!irq_is_nmi(desc))) return NULL; @@ -2086,53 +2018,46 @@ const void *free_nmi(unsigned int irq, void *dev_id) if (WARN_ON(desc->depth == 0)) disable_nmi_nosync(irq); - raw_spin_lock_irqsave(&desc->lock, flags); - + guard(raw_spinlock_irqsave)(&desc->lock); irq_nmi_teardown(desc); - devname = __cleanup_nmi(irq, desc); - - raw_spin_unlock_irqrestore(&desc->lock, flags); - - return devname; + return __cleanup_nmi(irq, desc); } /** - * request_threaded_irq - allocate an interrupt line - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs. - * Primary handler for threaded interrupts. - * If handler is NULL and thread_fn != NULL - * the default primary handler is installed. - * @thread_fn: Function called from the irq handler thread - * If NULL, no irq thread is created - * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device - * @dev_id: A cookie passed back to the handler function - * - * This call allocates interrupt resources and enables the - * interrupt line and IRQ handling. From the point this - * call is made your handler function may be invoked. Since - * your handler function must clear any interrupt the board - * raises, you must take care both to initialise your hardware - * and to set up the interrupt handler in the right order. - * - * If you want to set up a threaded irq handler for your device - * then you need to supply @handler and @thread_fn. @handler is - * still called in hard interrupt context and has to check - * whether the interrupt originates from the device. If yes it - * needs to disable the interrupt on the device and return - * IRQ_WAKE_THREAD which will wake up the handler thread and run - * @thread_fn. This split handler design is necessary to support - * shared interrupts. - * - * Dev_id must be globally unique. Normally the address of the - * device data structure is used as the cookie. Since the handler - * receives this value it makes sense to use it. - * - * If your interrupt is shared you must pass a non NULL dev_id - * as this is required when freeing the interrupt. - * - * Flags: + * request_threaded_irq - allocate an interrupt line + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * Primary handler for threaded interrupts. + * If handler is NULL and thread_fn != NULL + * the default primary handler is installed. + * @thread_fn: Function called from the irq handler thread + * If NULL, no irq thread is created + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the interrupt line + * and IRQ handling. From the point this call is made your handler function + * may be invoked. Since your handler function must clear any interrupt the + * board raises, you must take care both to initialise your hardware and to + * set up the interrupt handler in the right order. + * + * If you want to set up a threaded irq handler for your device then you + * need to supply @handler and @thread_fn. @handler is still called in hard + * interrupt context and has to check whether the interrupt originates from + * the device. If yes it needs to disable the interrupt on the device and + * return IRQ_WAKE_THREAD which will wake up the handler thread and run + * @thread_fn. This split handler design is necessary to support shared + * interrupts. + * + * @dev_id must be globally unique. Normally the address of the device data + * structure is used as the cookie. Since the handler receives this value + * it makes sense to use it. + * + * If your interrupt is shared you must pass a non NULL dev_id as this is + * required when freeing the interrupt. + * + * Flags: * * IRQF_SHARED Interrupt is shared * IRQF_TRIGGER_* Specify active edge(s) or level @@ -2230,21 +2155,20 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, EXPORT_SYMBOL(request_threaded_irq); /** - * request_any_context_irq - allocate an interrupt line - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs. - * Threaded handler for threaded interrupts. - * @flags: Interrupt type flags - * @name: An ascii name for the claiming device - * @dev_id: A cookie passed back to the handler function - * - * This call allocates interrupt resources and enables the - * interrupt line and IRQ handling. It selects either a - * hardirq or threaded handling method depending on the - * context. - * - * On failure, it returns a negative value. On success, - * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED. + * request_any_context_irq - allocate an interrupt line + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * Threaded handler for threaded interrupts. + * @flags: Interrupt type flags + * @name: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the interrupt line + * and IRQ handling. It selects either a hardirq or threaded handling + * method depending on the context. + * + * Returns: On failure, it returns a negative value. On success, it returns either + * IRQC_IS_HARDIRQ or IRQC_IS_NESTED. */ int request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id) @@ -2271,37 +2195,35 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler, EXPORT_SYMBOL_GPL(request_any_context_irq); /** - * request_nmi - allocate an interrupt line for NMI delivery - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs. - * Threaded handler for threaded interrupts. - * @irqflags: Interrupt type flags - * @name: An ascii name for the claiming device - * @dev_id: A cookie passed back to the handler function - * - * This call allocates interrupt resources and enables the - * interrupt line and IRQ handling. It sets up the IRQ line - * to be handled as an NMI. - * - * An interrupt line delivering NMIs cannot be shared and IRQ handling - * cannot be threaded. - * - * Interrupt lines requested for NMI delivering must produce per cpu - * interrupts and have auto enabling setting disabled. - * - * Dev_id must be globally unique. Normally the address of the - * device data structure is used as the cookie. Since the handler - * receives this value it makes sense to use it. - * - * If the interrupt line cannot be used to deliver NMIs, function - * will fail and return a negative value. + * request_nmi - allocate an interrupt line for NMI delivery + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * Threaded handler for threaded interrupts. + * @irqflags: Interrupt type flags + * @name: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the interrupt line + * and IRQ handling. It sets up the IRQ line to be handled as an NMI. + * + * An interrupt line delivering NMIs cannot be shared and IRQ handling + * cannot be threaded. + * + * Interrupt lines requested for NMI delivering must produce per cpu + * interrupts and have auto enabling setting disabled. + * + * @dev_id must be globally unique. Normally the address of the device data + * structure is used as the cookie. Since the handler receives this value + * it makes sense to use it. + * + * If the interrupt line cannot be used to deliver NMIs, function will fail + * and return a negative value. */ int request_nmi(unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *name, void *dev_id) { struct irqaction *action; struct irq_desc *desc; - unsigned long flags; int retval; if (irq == IRQ_NOTCONNECTED) @@ -2343,21 +2265,17 @@ int request_nmi(unsigned int irq, irq_handler_t handler, if (retval) goto err_irq_setup; - raw_spin_lock_irqsave(&desc->lock, flags); - - /* Setup NMI state */ - desc->istate |= IRQS_NMI; - retval = irq_nmi_setup(desc); - if (retval) { - __cleanup_nmi(irq, desc); - raw_spin_unlock_irqrestore(&desc->lock, flags); - return -EINVAL; + scoped_guard(raw_spinlock_irqsave, &desc->lock) { + /* Setup NMI state */ + desc->istate |= IRQS_NMI; + retval = irq_nmi_setup(desc); + if (retval) { + __cleanup_nmi(irq, desc); + return -EINVAL; + } + return 0; } - raw_spin_unlock_irqrestore(&desc->lock, flags); - - return 0; - err_irq_setup: irq_chip_pm_put(&desc->irq_data); err_out: @@ -2368,35 +2286,25 @@ err_out: void enable_percpu_irq(unsigned int irq, unsigned int type) { - unsigned int cpu = smp_processor_id(); - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) { + struct irq_desc *desc = scoped_irqdesc; - if (!desc) - return; - - /* - * If the trigger type is not specified by the caller, then - * use the default for this interrupt. - */ - type &= IRQ_TYPE_SENSE_MASK; - if (type == IRQ_TYPE_NONE) - type = irqd_get_trigger_type(&desc->irq_data); - - if (type != IRQ_TYPE_NONE) { - int ret; - - ret = __irq_set_trigger(desc, type); - - if (ret) { - WARN(1, "failed to set type for IRQ%d\n", irq); - goto out; + /* + * If the trigger type is not specified by the caller, then + * use the default for this interrupt. + */ + type &= IRQ_TYPE_SENSE_MASK; + if (type == IRQ_TYPE_NONE) + type = irqd_get_trigger_type(&desc->irq_data); + + if (type != IRQ_TYPE_NONE) { + if (__irq_set_trigger(desc, type)) { + WARN(1, "failed to set type for IRQ%d\n", irq); + return; + } } + irq_percpu_enable(desc, smp_processor_id()); } - - irq_percpu_enable(desc, cpu); -out: - irq_put_desc_unlock(desc, flags); } EXPORT_SYMBOL_GPL(enable_percpu_irq); @@ -2414,33 +2322,16 @@ void enable_percpu_nmi(unsigned int irq, unsigned int type) */ bool irq_percpu_is_enabled(unsigned int irq) { - unsigned int cpu = smp_processor_id(); - struct irq_desc *desc; - unsigned long flags; - bool is_enabled; - - desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); - if (!desc) - return false; - - is_enabled = cpumask_test_cpu(cpu, desc->percpu_enabled); - irq_put_desc_unlock(desc, flags); - - return is_enabled; + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) + return cpumask_test_cpu(smp_processor_id(), scoped_irqdesc->percpu_enabled); + return false; } EXPORT_SYMBOL_GPL(irq_percpu_is_enabled); void disable_percpu_irq(unsigned int irq) { - unsigned int cpu = smp_processor_id(); - unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); - - if (!desc) - return; - - irq_percpu_disable(desc, cpu); - irq_put_desc_unlock(desc, flags); + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) + irq_percpu_disable(scoped_irqdesc, smp_processor_id()); } EXPORT_SYMBOL_GPL(disable_percpu_irq); @@ -2456,71 +2347,47 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_ { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; - unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); if (!desc) return NULL; - raw_spin_lock_irqsave(&desc->lock, flags); + scoped_guard(raw_spinlock_irqsave, &desc->lock) { + action = desc->action; + if (!action || action->percpu_dev_id != dev_id) { + WARN(1, "Trying to free already-free IRQ %d\n", irq); + return NULL; + } - action = desc->action; - if (!action || action->percpu_dev_id != dev_id) { - WARN(1, "Trying to free already-free IRQ %d\n", irq); - goto bad; - } + if (!cpumask_empty(desc->percpu_enabled)) { + WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", + irq, cpumask_first(desc->percpu_enabled)); + return NULL; + } - if (!cpumask_empty(desc->percpu_enabled)) { - WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", - irq, cpumask_first(desc->percpu_enabled)); - goto bad; + /* Found it - now remove it from the list of entries: */ + desc->action = NULL; + desc->istate &= ~IRQS_NMI; } - /* Found it - now remove it from the list of entries: */ - desc->action = NULL; - - desc->istate &= ~IRQS_NMI; - - raw_spin_unlock_irqrestore(&desc->lock, flags); - unregister_handler_proc(irq, action); - irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); return action; - -bad: - raw_spin_unlock_irqrestore(&desc->lock, flags); - return NULL; } /** - * remove_percpu_irq - free a per-cpu interrupt - * @irq: Interrupt line to free - * @act: irqaction for the interrupt + * free_percpu_irq - free an interrupt allocated with request_percpu_irq + * @irq: Interrupt line to free + * @dev_id: Device identity to free * - * Used to remove interrupts statically setup by the early boot process. - */ -void remove_percpu_irq(unsigned int irq, struct irqaction *act) -{ - struct irq_desc *desc = irq_to_desc(irq); - - if (desc && irq_settings_is_per_cpu_devid(desc)) - __free_percpu_irq(irq, act->percpu_dev_id); -} - -/** - * free_percpu_irq - free an interrupt allocated with request_percpu_irq - * @irq: Interrupt line to free - * @dev_id: Device identity to free + * Remove a percpu interrupt handler. The handler is removed, but the + * interrupt line is not disabled. This must be done on each CPU before + * calling this function. The function does not return until any executing + * interrupts for this IRQ have completed. * - * Remove a percpu interrupt handler. The handler is removed, but - * the interrupt line is not disabled. This must be done on each - * CPU before calling this function. The function does not return - * until any executing interrupts for this IRQ have completed. - * - * This function must not be called from interrupt context. + * This function must not be called from interrupt context. */ void free_percpu_irq(unsigned int irq, void __percpu *dev_id) { @@ -2549,9 +2416,9 @@ void free_percpu_nmi(unsigned int irq, void __percpu *dev_id) } /** - * setup_percpu_irq - setup a per-cpu interrupt - * @irq: Interrupt line to setup - * @act: irqaction for the interrupt + * setup_percpu_irq - setup a per-cpu interrupt + * @irq: Interrupt line to setup + * @act: irqaction for the interrupt * * Used to statically setup per-cpu interrupts in the early boot process. */ @@ -2576,21 +2443,20 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) } /** - * __request_percpu_irq - allocate a percpu interrupt line - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs. - * @flags: Interrupt type flags (IRQF_TIMER only) - * @devname: An ascii name for the claiming device - * @dev_id: A percpu cookie passed back to the handler function - * - * This call allocates interrupt resources and enables the - * interrupt on the local CPU. If the interrupt is supposed to be - * enabled on other CPUs, it has to be done on each CPU using - * enable_percpu_irq(). - * - * Dev_id must be globally unique. It is a per-cpu variable, and - * the handler gets called with the interrupted CPU's instance of - * that variable. + * __request_percpu_irq - allocate a percpu interrupt line + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * @flags: Interrupt type flags (IRQF_TIMER only) + * @devname: An ascii name for the claiming device + * @dev_id: A percpu cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the interrupt on the + * local CPU. If the interrupt is supposed to be enabled on other CPUs, it + * has to be done on each CPU using enable_percpu_irq(). + * + * @dev_id must be globally unique. It is a per-cpu variable, and + * the handler gets called with the interrupted CPU's instance of + * that variable. */ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *devname, @@ -2638,32 +2504,31 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, EXPORT_SYMBOL_GPL(__request_percpu_irq); /** - * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs. - * @name: An ascii name for the claiming device - * @dev_id: A percpu cookie passed back to the handler function + * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * @name: An ascii name for the claiming device + * @dev_id: A percpu cookie passed back to the handler function * - * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs - * have to be setup on each CPU by calling prepare_percpu_nmi() before - * being enabled on the same CPU by using enable_percpu_nmi(). + * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs + * have to be setup on each CPU by calling prepare_percpu_nmi() before + * being enabled on the same CPU by using enable_percpu_nmi(). * - * Dev_id must be globally unique. It is a per-cpu variable, and - * the handler gets called with the interrupted CPU's instance of - * that variable. + * @dev_id must be globally unique. It is a per-cpu variable, and the + * handler gets called with the interrupted CPU's instance of that + * variable. * - * Interrupt lines requested for NMI delivering should have auto enabling - * setting disabled. + * Interrupt lines requested for NMI delivering should have auto enabling + * setting disabled. * - * If the interrupt line cannot be used to deliver NMIs, function - * will fail returning a negative value. + * If the interrupt line cannot be used to deliver NMIs, function + * will fail returning a negative value. */ int request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name, void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; - unsigned long flags; int retval; if (!handler) @@ -2699,10 +2564,8 @@ int request_percpu_nmi(unsigned int irq, irq_handler_t handler, if (retval) goto err_irq_setup; - raw_spin_lock_irqsave(&desc->lock, flags); - desc->istate |= IRQS_NMI; - raw_spin_unlock_irqrestore(&desc->lock, flags); - + scoped_guard(raw_spinlock_irqsave, &desc->lock) + desc->istate |= IRQS_NMI; return 0; err_irq_setup: @@ -2714,83 +2577,58 @@ err_out: } /** - * prepare_percpu_nmi - performs CPU local setup for NMI delivery - * @irq: Interrupt line to prepare for NMI delivery + * prepare_percpu_nmi - performs CPU local setup for NMI delivery + * @irq: Interrupt line to prepare for NMI delivery * - * This call prepares an interrupt line to deliver NMI on the current CPU, - * before that interrupt line gets enabled with enable_percpu_nmi(). + * This call prepares an interrupt line to deliver NMI on the current CPU, + * before that interrupt line gets enabled with enable_percpu_nmi(). * - * As a CPU local operation, this should be called from non-preemptible - * context. + * As a CPU local operation, this should be called from non-preemptible + * context. * - * If the interrupt line cannot be used to deliver NMIs, function - * will fail returning a negative value. + * If the interrupt line cannot be used to deliver NMIs, function will fail + * returning a negative value. */ int prepare_percpu_nmi(unsigned int irq) { - unsigned long flags; - struct irq_desc *desc; - int ret = 0; + int ret = -EINVAL; WARN_ON(preemptible()); - desc = irq_get_desc_lock(irq, &flags, - IRQ_GET_DESC_CHECK_PERCPU); - if (!desc) - return -EINVAL; - - if (WARN(!irq_is_nmi(desc), - KERN_ERR "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n", - irq)) { - ret = -EINVAL; - goto out; - } + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) { + if (WARN(!irq_is_nmi(scoped_irqdesc), + "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n", irq)) + return -EINVAL; - ret = irq_nmi_setup(desc); - if (ret) { - pr_err("Failed to setup NMI delivery: irq %u\n", irq); - goto out; + ret = irq_nmi_setup(scoped_irqdesc); + if (ret) + pr_err("Failed to setup NMI delivery: irq %u\n", irq); } - -out: - irq_put_desc_unlock(desc, flags); return ret; } /** - * teardown_percpu_nmi - undoes NMI setup of IRQ line - * @irq: Interrupt line from which CPU local NMI configuration should be - * removed - * - * This call undoes the setup done by prepare_percpu_nmi(). + * teardown_percpu_nmi - undoes NMI setup of IRQ line + * @irq: Interrupt line from which CPU local NMI configuration should be removed * - * IRQ line should not be enabled for the current CPU. + * This call undoes the setup done by prepare_percpu_nmi(). * - * As a CPU local operation, this should be called from non-preemptible - * context. + * IRQ line should not be enabled for the current CPU. + * As a CPU local operation, this should be called from non-preemptible + * context. */ void teardown_percpu_nmi(unsigned int irq) { - unsigned long flags; - struct irq_desc *desc; - WARN_ON(preemptible()); - desc = irq_get_desc_lock(irq, &flags, - IRQ_GET_DESC_CHECK_PERCPU); - if (!desc) - return; - - if (WARN_ON(!irq_is_nmi(desc))) - goto out; - - irq_nmi_teardown(desc); -out: - irq_put_desc_unlock(desc, flags); + scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) { + if (WARN_ON(!irq_is_nmi(scoped_irqdesc))) + return; + irq_nmi_teardown(scoped_irqdesc); + } } -int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, - bool *state) +static int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state) { struct irq_chip *chip; int err = -EINVAL; @@ -2814,87 +2652,62 @@ int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, } /** - * irq_get_irqchip_state - returns the irqchip state of a interrupt. - * @irq: Interrupt line that is forwarded to a VM - * @which: One of IRQCHIP_STATE_* the caller wants to know about - * @state: a pointer to a boolean where the state is to be stored + * irq_get_irqchip_state - returns the irqchip state of a interrupt. + * @irq: Interrupt line that is forwarded to a VM + * @which: One of IRQCHIP_STATE_* the caller wants to know about + * @state: a pointer to a boolean where the state is to be stored * - * This call snapshots the internal irqchip state of an - * interrupt, returning into @state the bit corresponding to - * stage @which + * This call snapshots the internal irqchip state of an interrupt, + * returning into @state the bit corresponding to stage @which * - * This function should be called with preemption disabled if the - * interrupt controller has per-cpu registers. + * This function should be called with preemption disabled if the interrupt + * controller has per-cpu registers. */ -int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, - bool *state) +int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool *state) { - struct irq_desc *desc; - struct irq_data *data; - unsigned long flags; - int err = -EINVAL; - - desc = irq_get_desc_buslock(irq, &flags, 0); - if (!desc) - return err; + scoped_irqdesc_get_and_buslock(irq, 0) { + struct irq_data *data = irq_desc_get_irq_data(scoped_irqdesc); - data = irq_desc_get_irq_data(desc); - - err = __irq_get_irqchip_state(data, which, state); - - irq_put_desc_busunlock(desc, flags); - return err; + return __irq_get_irqchip_state(data, which, state); + } + return -EINVAL; } EXPORT_SYMBOL_GPL(irq_get_irqchip_state); /** - * irq_set_irqchip_state - set the state of a forwarded interrupt. - * @irq: Interrupt line that is forwarded to a VM - * @which: State to be restored (one of IRQCHIP_STATE_*) - * @val: Value corresponding to @which + * irq_set_irqchip_state - set the state of a forwarded interrupt. + * @irq: Interrupt line that is forwarded to a VM + * @which: State to be restored (one of IRQCHIP_STATE_*) + * @val: Value corresponding to @which * - * This call sets the internal irqchip state of an interrupt, - * depending on the value of @which. + * This call sets the internal irqchip state of an interrupt, depending on + * the value of @which. * - * This function should be called with migration disabled if the - * interrupt controller has per-cpu registers. + * This function should be called with migration disabled if the interrupt + * controller has per-cpu registers. */ -int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, - bool val) +int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool val) { - struct irq_desc *desc; - struct irq_data *data; - struct irq_chip *chip; - unsigned long flags; - int err = -EINVAL; + scoped_irqdesc_get_and_buslock(irq, 0) { + struct irq_data *data = irq_desc_get_irq_data(scoped_irqdesc); + struct irq_chip *chip; - desc = irq_get_desc_buslock(irq, &flags, 0); - if (!desc) - return err; + do { + chip = irq_data_get_irq_chip(data); - data = irq_desc_get_irq_data(desc); + if (WARN_ON_ONCE(!chip)) + return -ENODEV; - do { - chip = irq_data_get_irq_chip(data); - if (WARN_ON_ONCE(!chip)) { - err = -ENODEV; - goto out_unlock; - } - if (chip->irq_set_irqchip_state) - break; -#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY - data = data->parent_data; -#else - data = NULL; -#endif - } while (data); + if (chip->irq_set_irqchip_state) + break; - if (data) - err = chip->irq_set_irqchip_state(data, which, val); + data = irqd_get_parent_data(data); + } while (data); -out_unlock: - irq_put_desc_busunlock(desc, flags); - return err; + if (data) + return chip->irq_set_irqchip_state(data, which, val); + } + return -EINVAL; } EXPORT_SYMBOL_GPL(irq_set_irqchip_state); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index eb150afd671f..f2b2929986ff 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -35,6 +35,16 @@ bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear) return true; } +void irq_force_complete_move(struct irq_desc *desc) +{ + for (struct irq_data *d = irq_desc_get_irq_data(desc); d; d = irqd_get_parent_data(d)) { + if (d->chip && d->chip->irq_force_complete_move) { + d->chip->irq_force_complete_move(d); + return; + } + } +} + void irq_move_masked_irq(struct irq_data *idata) { struct irq_desc *desc = irq_data_to_desc(idata); @@ -117,3 +127,13 @@ void __irq_move_irq(struct irq_data *idata) if (!masked) idata->chip->irq_unmask(idata); } + +bool irq_can_move_in_process_context(struct irq_data *data) +{ + /* + * Get the top level irq_data in the hierarchy, which is optimized + * away when CONFIG_IRQ_DOMAIN_HIERARCHY is disabled. + */ + data = irq_desc_get_irq_data(irq_data_to_desc(data)); + return irq_can_move_pcntxt(data); +} diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 396a067a8a56..9febe797a5f6 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -15,6 +15,7 @@ #include <linux/mutex.h> #include <linux/pci.h> #include <linux/slab.h> +#include <linux/seq_file.h> #include <linux/sysfs.h> #include <linux/types.h> #include <linux/xarray.h> @@ -58,7 +59,8 @@ struct msi_ctrl { static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl); static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid); static inline int msi_sysfs_create_group(struct device *dev); - +static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *arg); /** * msi_alloc_desc - Allocate an initialized msi_desc @@ -342,26 +344,30 @@ int msi_setup_device_data(struct device *dev) } /** - * msi_lock_descs - Lock the MSI descriptor storage of a device + * __msi_lock_descs - Lock the MSI descriptor storage of a device * @dev: Device to operate on + * + * Internal function for guard(msi_descs_lock). Don't use in code. */ -void msi_lock_descs(struct device *dev) +void __msi_lock_descs(struct device *dev) { mutex_lock(&dev->msi.data->mutex); } -EXPORT_SYMBOL_GPL(msi_lock_descs); +EXPORT_SYMBOL_GPL(__msi_lock_descs); /** - * msi_unlock_descs - Unlock the MSI descriptor storage of a device + * __msi_unlock_descs - Unlock the MSI descriptor storage of a device * @dev: Device to operate on + * + * Internal function for guard(msi_descs_lock). Don't use in code. */ -void msi_unlock_descs(struct device *dev) +void __msi_unlock_descs(struct device *dev) { /* Invalidate the index which was cached by the iterator */ dev->msi.data->__iter_idx = MSI_XA_MAX_INDEX; mutex_unlock(&dev->msi.data->mutex); } -EXPORT_SYMBOL_GPL(msi_unlock_descs); +EXPORT_SYMBOL_GPL(__msi_unlock_descs); static struct msi_desc *msi_find_desc(struct msi_device_data *md, unsigned int domid, enum msi_desc_filter filter) @@ -447,7 +453,6 @@ EXPORT_SYMBOL_GPL(msi_next_desc); unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index) { struct msi_desc *desc; - unsigned int ret = 0; bool pcimsi = false; struct xarray *xa; @@ -461,7 +466,7 @@ unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigne if (dev_is_pci(dev) && domid == MSI_DEFAULT_DOMAIN) pcimsi = to_pci_dev(dev)->msi_enabled; - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); xa = &dev->msi.data->__domains[domid].store; desc = xa_load(xa, pcimsi ? 0 : index); if (desc && desc->irq) { @@ -470,16 +475,12 @@ unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigne * PCI-MSIX and platform MSI use a descriptor per * interrupt. */ - if (pcimsi) { - if (index < desc->nvec_used) - ret = desc->irq + index; - } else { - ret = desc->irq; - } + if (!pcimsi) + return desc->irq; + if (index < desc->nvec_used) + return desc->irq + index; } - - msi_unlock_descs(dev); - return ret; + return 0; } EXPORT_SYMBOL_GPL(msi_domain_get_virq); @@ -756,12 +757,30 @@ static int msi_domain_translate(struct irq_domain *domain, struct irq_fwspec *fw return info->ops->msi_translate(domain, fwspec, hwirq, type); } +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS +static void msi_domain_debug_show(struct seq_file *m, struct irq_domain *d, + struct irq_data *irqd, int ind) +{ + struct msi_desc *desc = irqd ? irq_data_get_msi_desc(irqd) : NULL; + + if (!desc) + return; + + seq_printf(m, "\n%*saddress_hi: 0x%08x", ind + 1, "", desc->msg.address_hi); + seq_printf(m, "\n%*saddress_lo: 0x%08x", ind + 1, "", desc->msg.address_lo); + seq_printf(m, "\n%*smsg_data: 0x%08x\n", ind + 1, "", desc->msg.data); +} +#endif + static const struct irq_domain_ops msi_domain_ops = { .alloc = msi_domain_alloc, .free = msi_domain_free, .activate = msi_domain_activate, .deactivate = msi_domain_deactivate, .translate = msi_domain_translate, +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS + .debug_show = msi_domain_debug_show, +#endif }; static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info, @@ -777,6 +796,10 @@ static int msi_domain_ops_prepare(struct irq_domain *domain, struct device *dev, return 0; } +static void msi_domain_ops_teardown(struct irq_domain *domain, msi_alloc_info_t *arg) +{ +} + static void msi_domain_ops_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) { @@ -802,6 +825,7 @@ static struct msi_domain_ops msi_domain_ops_default = { .get_hwirq = msi_domain_ops_get_hwirq, .msi_init = msi_domain_ops_init, .msi_prepare = msi_domain_ops_prepare, + .msi_teardown = msi_domain_ops_teardown, .set_desc = msi_domain_ops_set_desc, }; @@ -823,6 +847,8 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info) ops->msi_init = msi_domain_ops_default.msi_init; if (ops->msi_prepare == NULL) ops->msi_prepare = msi_domain_ops_default.msi_prepare; + if (ops->msi_teardown == NULL) + ops->msi_teardown = msi_domain_ops_default.msi_teardown; if (ops->set_desc == NULL) ops->set_desc = msi_domain_ops_default.set_desc; } @@ -886,6 +912,32 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, } /** + * msi_create_parent_irq_domain - Create an MSI-parent interrupt domain + * @info: MSI irqdomain creation info + * @msi_parent_ops: MSI parent callbacks and configuration + * + * Return: pointer to the created &struct irq_domain or %NULL on failure + */ +struct irq_domain *msi_create_parent_irq_domain(struct irq_domain_info *info, + const struct msi_parent_ops *msi_parent_ops) +{ + struct irq_domain *d; + + info->hwirq_max = max(info->hwirq_max, info->size); + info->size = info->hwirq_max; + info->domain_flags |= IRQ_DOMAIN_FLAG_MSI_PARENT; + info->bus_token = msi_parent_ops->bus_select_token; + + d = irq_domain_instantiate(info); + if (IS_ERR(d)) + return NULL; + + d->msi_parent_ops = msi_parent_ops; + return d; +} +EXPORT_SYMBOL_GPL(msi_create_parent_irq_domain); + +/** * msi_parent_init_dev_msi_info - Delegate initialization of device MSI info down * in the domain hierarchy * @dev: The device for which the domain should be created @@ -979,9 +1031,8 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, void *chip_data) { struct irq_domain *domain, *parent = dev->msi.domain; - struct fwnode_handle *fwnode, *fwnalloced = NULL; - struct msi_domain_template *bundle; const struct msi_parent_ops *pops; + struct fwnode_handle *fwnode; if (!irq_domain_is_msi_parent(parent)) return false; @@ -989,7 +1040,8 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, if (domid >= MSI_MAX_DEVICE_IRQDOMAINS) return false; - bundle = kmemdup(template, sizeof(*bundle), GFP_KERNEL); + struct msi_domain_template *bundle __free(kfree) = + kmemdup(template, sizeof(*bundle), GFP_KERNEL); if (!bundle) return false; @@ -998,6 +1050,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, bundle->info.ops = &bundle->ops; bundle->info.data = domain_data; bundle->info.chip_data = chip_data; + bundle->info.alloc_data = &bundle->alloc_info; pops = parent->msi_parent_ops; snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s", @@ -1012,41 +1065,43 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, * node as they are not guaranteed to have a fwnode. They are never * looked up and always handled in the context of the device. */ - if (bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE) - fwnode = dev->fwnode; + struct fwnode_handle *fwnode_alloced __free(irq_domain_free_fwnode) = NULL; + + if (!(bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE)) + fwnode = fwnode_alloced = irq_domain_alloc_named_fwnode(bundle->name); else - fwnode = fwnalloced = irq_domain_alloc_named_fwnode(bundle->name); + fwnode = dev->fwnode; if (!fwnode) - goto free_bundle; + return false; if (msi_setup_device_data(dev)) - goto free_fwnode; - - msi_lock_descs(dev); + return false; + guard(msi_descs_lock)(dev); if (WARN_ON_ONCE(msi_get_device_domain(dev, domid))) - goto fail; + return false; if (!pops->init_dev_msi_info(dev, parent, parent, &bundle->info)) - goto fail; + return false; domain = __msi_create_irq_domain(fwnode, &bundle->info, IRQ_DOMAIN_FLAG_MSI_DEVICE, parent); if (!domain) - goto fail; + return false; domain->dev = dev; dev->msi.data->__domains[domid].domain = domain; - msi_unlock_descs(dev); - return true; -fail: - msi_unlock_descs(dev); -free_fwnode: - irq_domain_free_fwnode(fwnalloced); -free_bundle: - kfree(bundle); - return false; + if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->alloc_info)) { + dev->msi.data->__domains[domid].domain = NULL; + irq_domain_remove(domain); + return false; + } + + /* @bundle and @fwnode_alloced are now in use. Prevent cleanup */ + retain_and_null_ptr(bundle); + retain_and_null_ptr(fwnode_alloced); + return true; } /** @@ -1060,23 +1115,21 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid) struct msi_domain_info *info; struct irq_domain *domain; - msi_lock_descs(dev); - + guard(msi_descs_lock)(dev); domain = msi_get_device_domain(dev, domid); - if (!domain || !irq_domain_is_msi_device(domain)) - goto unlock; + return; dev->msi.data->__domains[domid].domain = NULL; info = domain->host_data; + + info->ops->msi_teardown(domain, info->alloc_data); + if (irq_domain_is_msi_device(domain)) fwnode = domain->fwnode; irq_domain_remove(domain); irq_domain_free_fwnode(fwnode); kfree(container_of(info, struct msi_domain_template, info)); - -unlock: - msi_unlock_descs(dev); } /** @@ -1092,16 +1145,14 @@ bool msi_match_device_irq_domain(struct device *dev, unsigned int domid, { struct msi_domain_info *info; struct irq_domain *domain; - bool ret = false; - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); domain = msi_get_device_domain(dev, domid); if (domain && irq_domain_is_msi_device(domain)) { info = domain->host_data; - ret = info->bus_token == bus_token; + return info->bus_token == bus_token; } - msi_unlock_descs(dev); - return ret; + return false; } static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, @@ -1143,7 +1194,7 @@ static bool msi_check_reservation_mode(struct irq_domain *domain, if (!(info->flags & MSI_FLAG_MUST_REACTIVATE)) return false; - if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask) + if (info->flags & MSI_FLAG_NO_MASK) return false; /* @@ -1219,6 +1270,24 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag return 0; } +static int populate_alloc_info(struct irq_domain *domain, struct device *dev, + unsigned int nirqs, msi_alloc_info_t *arg) +{ + struct msi_domain_info *info = domain->host_data; + + /* + * If the caller has provided a template alloc info, use that. Once + * all users of msi_create_irq_domain() have been eliminated, this + * should be the only source of allocation information, and the + * prepare call below should be finally removed. + */ + if (!info->alloc_data) + return msi_domain_prepare_irqs(domain, dev, nirqs, arg); + + *arg = *info->alloc_data; + return 0; +} + static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain, struct msi_ctrl *ctrl) { @@ -1231,7 +1300,7 @@ static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain unsigned long idx; int i, ret, virq; - ret = msi_domain_prepare_irqs(domain, dev, ctrl->nirqs, &arg); + ret = populate_alloc_info(domain, dev, ctrl->nirqs, &arg); if (ret) return ret; @@ -1372,12 +1441,9 @@ int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid, int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last) { - int ret; - msi_lock_descs(dev); - ret = msi_domain_alloc_irqs_range_locked(dev, domid, first, last); - msi_unlock_descs(dev); - return ret; + guard(msi_descs_lock)(dev); + return msi_domain_alloc_irqs_range_locked(dev, domid, first, last); } EXPORT_SYMBOL_GPL(msi_domain_alloc_irqs_range); @@ -1481,12 +1547,8 @@ struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, u const struct irq_affinity_desc *affdesc, union msi_instance_cookie *icookie) { - struct msi_map map; - - msi_lock_descs(dev); - map = __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie); - msi_unlock_descs(dev); - return map; + guard(msi_descs_lock)(dev); + return __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie); } /** @@ -1523,13 +1585,11 @@ int msi_device_domain_alloc_wired(struct irq_domain *domain, unsigned int hwirq, icookie.value = ((u64)type << 32) | hwirq; - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); if (WARN_ON_ONCE(msi_get_device_domain(dev, domid) != domain)) map.index = -EINVAL; else map = __msi_domain_alloc_irq_at(dev, domid, MSI_ANY_INDEX, NULL, &icookie); - msi_unlock_descs(dev); - return map.index >= 0 ? map.virq : map.index; } @@ -1622,9 +1682,8 @@ void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, void msi_domain_free_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last) { - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); msi_domain_free_irqs_range_locked(dev, domid, first, last); - msi_unlock_descs(dev); } EXPORT_SYMBOL_GPL(msi_domain_free_irqs_all); @@ -1654,9 +1713,8 @@ void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid) */ void msi_domain_free_irqs_all(struct device *dev, unsigned int domid) { - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); msi_domain_free_irqs_all_locked(dev, domid); - msi_unlock_descs(dev); } /** @@ -1675,12 +1733,11 @@ void msi_device_domain_free_wired(struct irq_domain *domain, unsigned int virq) if (WARN_ON_ONCE(!dev || !desc || domain->bus_token != DOMAIN_BUS_WIRED_TO_MSI)) return; - msi_lock_descs(dev); - if (!WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain)) { - msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index, - desc->msi_index); - } - msi_unlock_descs(dev); + guard(msi_descs_lock)(dev); + if (WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain)) + return; + msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index, + desc->msi_index); } /** diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index c556bc49d213..445912d51033 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -46,8 +46,7 @@ void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) desc->cond_suspend_depth++; WARN_ON_ONCE(desc->no_suspend_depth && - (desc->no_suspend_depth + - desc->cond_suspend_depth) != desc->nr_actions); + (desc->no_suspend_depth + desc->cond_suspend_depth) != desc->nr_actions); } /* @@ -134,14 +133,12 @@ void suspend_device_irqs(void) int irq; for_each_irq_desc(irq, desc) { - unsigned long flags; bool sync; if (irq_settings_is_nested_thread(desc)) continue; - raw_spin_lock_irqsave(&desc->lock, flags); - sync = suspend_device_irq(desc); - raw_spin_unlock_irqrestore(&desc->lock, flags); + scoped_guard(raw_spinlock_irqsave, &desc->lock) + sync = suspend_device_irq(desc); if (sync) synchronize_irq(irq); @@ -186,18 +183,15 @@ static void resume_irqs(bool want_early) int irq; for_each_irq_desc(irq, desc) { - unsigned long flags; - bool is_early = desc->action && - desc->action->flags & IRQF_EARLY_RESUME; + bool is_early = desc->action && desc->action->flags & IRQF_EARLY_RESUME; if (!is_early && want_early) continue; if (irq_settings_is_nested_thread(desc)) continue; - raw_spin_lock_irqsave(&desc->lock, flags); + guard(raw_spinlock_irqsave)(&desc->lock); resume_irq(desc); - raw_spin_unlock_irqrestore(&desc->lock, flags); } } @@ -207,22 +201,16 @@ static void resume_irqs(bool want_early) */ void rearm_wake_irq(unsigned int irq) { - unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); + scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { + struct irq_desc *desc = scoped_irqdesc; - if (!desc) - return; - - if (!(desc->istate & IRQS_SUSPENDED) || - !irqd_is_wakeup_set(&desc->irq_data)) - goto unlock; - - desc->istate &= ~IRQS_SUSPENDED; - irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); - __enable_irq(desc); + if (!(desc->istate & IRQS_SUSPENDED) || !irqd_is_wakeup_set(&desc->irq_data)) + return; -unlock: - irq_put_desc_busunlock(desc, flags); + desc->istate &= ~IRQS_SUSPENDED; + irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); + __enable_irq(desc); + } } /** diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 8e29809de38d..29c2404e743b 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -81,20 +81,18 @@ static int show_irq_affinity(int type, struct seq_file *m) static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); - unsigned long flags; cpumask_var_t mask; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; - raw_spin_lock_irqsave(&desc->lock, flags); - if (desc->affinity_hint) - cpumask_copy(mask, desc->affinity_hint); - raw_spin_unlock_irqrestore(&desc->lock, flags); + scoped_guard(raw_spinlock_irq, &desc->lock) { + if (desc->affinity_hint) + cpumask_copy(mask, desc->affinity_hint); + } seq_printf(m, "%*pb\n", cpumask_pr_args(mask)); free_cpumask_var(mask); - return 0; } @@ -295,32 +293,26 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v) #define MAX_NAMELEN 128 -static int name_unique(unsigned int irq, struct irqaction *new_action) +static bool name_unique(unsigned int irq, struct irqaction *new_action) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; - unsigned long flags; - int ret = 1; - raw_spin_lock_irqsave(&desc->lock, flags); + guard(raw_spinlock_irq)(&desc->lock); for_each_action_of_desc(desc, action) { if ((action != new_action) && action->name && - !strcmp(new_action->name, action->name)) { - ret = 0; - break; - } + !strcmp(new_action->name, action->name)) + return false; } - raw_spin_unlock_irqrestore(&desc->lock, flags); - return ret; + return true; } void register_handler_proc(unsigned int irq, struct irqaction *action) { - char name [MAX_NAMELEN]; + char name[MAX_NAMELEN]; struct irq_desc *desc = irq_to_desc(irq); - if (!desc->dir || action->dir || !action->name || - !name_unique(irq, action)) + if (!desc->dir || action->dir || !action->name || !name_unique(irq, action)) return; snprintf(name, MAX_NAMELEN, "%s", action->name); @@ -347,17 +339,16 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) * added, not when the descriptor is created, so multiple * tasks might try to register at the same time. */ - mutex_lock(®ister_lock); + guard(mutex)(®ister_lock); if (desc->dir) - goto out_unlock; - - sprintf(name, "%d", irq); + return; /* create /proc/irq/1234 */ + sprintf(name, "%u", irq); desc->dir = proc_mkdir(name, root_irq_dir); if (!desc->dir) - goto out_unlock; + return; #ifdef CONFIG_SMP umode_t umode = S_IRUGO; @@ -366,31 +357,27 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) umode |= S_IWUSR; /* create /proc/irq/<irq>/smp_affinity */ - proc_create_data("smp_affinity", umode, desc->dir, - &irq_affinity_proc_ops, irqp); + proc_create_data("smp_affinity", umode, desc->dir, &irq_affinity_proc_ops, irqp); /* create /proc/irq/<irq>/affinity_hint */ proc_create_single_data("affinity_hint", 0444, desc->dir, - irq_affinity_hint_proc_show, irqp); + irq_affinity_hint_proc_show, irqp); /* create /proc/irq/<irq>/smp_affinity_list */ proc_create_data("smp_affinity_list", umode, desc->dir, &irq_affinity_list_proc_ops, irqp); - proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show, - irqp); + proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show, irqp); # ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK proc_create_single_data("effective_affinity", 0444, desc->dir, - irq_effective_aff_proc_show, irqp); + irq_effective_aff_proc_show, irqp); proc_create_single_data("effective_affinity_list", 0444, desc->dir, - irq_effective_aff_list_proc_show, irqp); + irq_effective_aff_list_proc_show, irqp); # endif #endif proc_create_single_data("spurious", 0444, desc->dir, - irq_spurious_proc_show, (void *)(long)irq); + irq_spurious_proc_show, (void *)(long)irq); -out_unlock: - mutex_unlock(®ister_lock); } void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) @@ -468,7 +455,6 @@ int show_interrupts(struct seq_file *p, void *v) int i = *(loff_t *) v, j; struct irqaction *action; struct irq_desc *desc; - unsigned long flags; if (i > ACTUAL_NR_IRQS) return 0; @@ -487,13 +473,13 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } - rcu_read_lock(); + guard(rcu)(); desc = irq_to_desc(i); if (!desc || irq_settings_is_hidden(desc)) - goto outsparse; + return 0; if (!desc->action || irq_desc_is_chained(desc) || !desc->kstat_irqs) - goto outsparse; + return 0; seq_printf(p, "%*d:", prec, i); for_each_online_cpu(j) { @@ -503,7 +489,7 @@ int show_interrupts(struct seq_file *p, void *v) } seq_putc(p, ' '); - raw_spin_lock_irqsave(&desc->lock, flags); + guard(raw_spinlock_irq)(&desc->lock); if (desc->irq_data.chip) { if (desc->irq_data.chip->irq_print_chip) desc->irq_data.chip->irq_print_chip(&desc->irq_data, p); @@ -532,9 +518,6 @@ int show_interrupts(struct seq_file *p, void *v) } seq_putc(p, '\n'); - raw_spin_unlock_irqrestore(&desc->lock, flags); -outsparse: - rcu_read_unlock(); return 0; } #endif diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 1b7fa72968bd..ca9cc1b806a9 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -30,18 +30,17 @@ static DEFINE_RAW_SPINLOCK(irq_resend_lock); */ static void resend_irqs(struct tasklet_struct *unused) { - struct irq_desc *desc; - - raw_spin_lock_irq(&irq_resend_lock); + guard(raw_spinlock_irq)(&irq_resend_lock); while (!hlist_empty(&irq_resend_list)) { - desc = hlist_entry(irq_resend_list.first, struct irq_desc, - resend_node); + struct irq_desc *desc; + + desc = hlist_entry(irq_resend_list.first, struct irq_desc, resend_node); hlist_del_init(&desc->resend_node); + raw_spin_unlock(&irq_resend_lock); desc->handle_irq(desc); raw_spin_lock(&irq_resend_lock); } - raw_spin_unlock_irq(&irq_resend_lock); } /* Tasklet to handle resend: */ @@ -75,19 +74,18 @@ static int irq_sw_resend(struct irq_desc *desc) } /* Add to resend_list and activate the softirq: */ - raw_spin_lock(&irq_resend_lock); - if (hlist_unhashed(&desc->resend_node)) - hlist_add_head(&desc->resend_node, &irq_resend_list); - raw_spin_unlock(&irq_resend_lock); + scoped_guard(raw_spinlock, &irq_resend_lock) { + if (hlist_unhashed(&desc->resend_node)) + hlist_add_head(&desc->resend_node, &irq_resend_list); + } tasklet_schedule(&resend_tasklet); return 0; } void clear_irq_resend(struct irq_desc *desc) { - raw_spin_lock(&irq_resend_lock); + guard(raw_spinlock)(&irq_resend_lock); hlist_del_init(&desc->resend_node); - raw_spin_unlock(&irq_resend_lock); } void irq_resend_init(struct irq_desc *desc) @@ -172,30 +170,24 @@ int check_irq_resend(struct irq_desc *desc, bool inject) */ int irq_inject_interrupt(unsigned int irq) { - struct irq_desc *desc; - unsigned long flags; - int err; + int err = -EINVAL; /* Try the state injection hardware interface first */ if (!irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, true)) return 0; /* That failed, try via the resend mechanism */ - desc = irq_get_desc_buslock(irq, &flags, 0); - if (!desc) - return -EINVAL; + scoped_irqdesc_get_and_buslock(irq, 0) { + struct irq_desc *desc = scoped_irqdesc; - /* - * Only try to inject when the interrupt is: - * - not NMI type - * - activated - */ - if (irq_is_nmi(desc) || !irqd_is_activated(&desc->irq_data)) - err = -EINVAL; - else - err = check_irq_resend(desc, true); - - irq_put_desc_busunlock(desc, flags); + /* + * Only try to inject when the interrupt is: + * - not NMI type + * - activated + */ + if (!irq_is_nmi(desc) && irqd_is_activated(&desc->irq_data)) + err = check_irq_resend(desc, true); + } return err; } EXPORT_SYMBOL_GPL(irq_inject_interrupt); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 02b2daf07441..8f26982e7300 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -34,8 +34,9 @@ static atomic_t irq_poll_active; * true and let the handler run. */ bool irq_wait_for_poll(struct irq_desc *desc) - __must_hold(&desc->lock) { + lockdep_assert_held(&desc->lock); + if (WARN_ONCE(irq_poll_cpu == smp_processor_id(), "irq poll in progress on cpu %d for irq %d\n", smp_processor_id(), desc->irq_data.irq)) @@ -59,37 +60,35 @@ bool irq_wait_for_poll(struct irq_desc *desc) /* * Recovery handler for misrouted interrupts. */ -static int try_one_irq(struct irq_desc *desc, bool force) +static bool try_one_irq(struct irq_desc *desc, bool force) { - irqreturn_t ret = IRQ_NONE; struct irqaction *action; + bool ret = false; - raw_spin_lock(&desc->lock); + guard(raw_spinlock)(&desc->lock); /* * PER_CPU, nested thread interrupts and interrupts explicitly * marked polled are excluded from polling. */ - if (irq_settings_is_per_cpu(desc) || - irq_settings_is_nested_thread(desc) || + if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc) || irq_settings_is_polled(desc)) - goto out; + return false; /* * Do not poll disabled interrupts unless the spurious * disabled poller asks explicitly. */ if (irqd_irq_disabled(&desc->irq_data) && !force) - goto out; + return false; /* * All handlers must agree on IRQF_SHARED, so we test just the * first. */ action = desc->action; - if (!action || !(action->flags & IRQF_SHARED) || - (action->flags & __IRQF_TIMER)) - goto out; + if (!action || !(action->flags & IRQF_SHARED) || (action->flags & __IRQF_TIMER)) + return false; /* Already running on another processor */ if (irqd_irq_inprogress(&desc->irq_data)) { @@ -98,21 +97,19 @@ static int try_one_irq(struct irq_desc *desc, bool force) * CPU to go looking for our mystery interrupt too */ desc->istate |= IRQS_PENDING; - goto out; + return false; } /* Mark it poll in progress */ desc->istate |= IRQS_POLL_INPROGRESS; do { if (handle_irq_event(desc) == IRQ_HANDLED) - ret = IRQ_HANDLED; + ret = true; /* Make sure that there is still a valid action */ action = desc->action; } while ((desc->istate & IRQS_PENDING) && action); desc->istate &= ~IRQS_POLL_INPROGRESS; -out: - raw_spin_unlock(&desc->lock); - return ret == IRQ_HANDLED; + return ret; } static int misrouted_irq(int irq) @@ -157,8 +154,7 @@ static void poll_spurious_irqs(struct timer_list *unused) continue; /* Racy but it doesn't matter */ - state = desc->istate; - barrier(); + state = READ_ONCE(desc->istate); if (!(state & IRQS_SPURIOUS_DISABLED)) continue; @@ -168,8 +164,7 @@ static void poll_spurious_irqs(struct timer_list *unused) } out: atomic_dec(&irq_poll_active); - mod_timer(&poll_spurious_irq_timer, - jiffies + POLL_SPURIOUS_IRQ_INTERVAL); + mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } static inline int bad_action_ret(irqreturn_t action_ret) @@ -193,17 +188,13 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) { unsigned int irq = irq_desc_get_irq(desc); struct irqaction *action; - unsigned long flags; - if (bad_action_ret(action_ret)) { - printk(KERN_ERR "irq event %d: bogus return value %x\n", - irq, action_ret); - } else { - printk(KERN_ERR "irq %d: nobody cared (try booting with " - "the \"irqpoll\" option)\n", irq); - } + if (bad_action_ret(action_ret)) + pr_err("irq event %d: bogus return value %x\n", irq, action_ret); + else + pr_err("irq %d: nobody cared (try booting with the \"irqpoll\" option)\n", irq); dump_stack(); - printk(KERN_ERR "handlers:\n"); + pr_err("handlers:\n"); /* * We need to take desc->lock here. note_interrupt() is called @@ -211,15 +202,13 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) * with something else removing an action. It's ok to take * desc->lock here. See synchronize_irq(). */ - raw_spin_lock_irqsave(&desc->lock, flags); + guard(raw_spinlock_irqsave)(&desc->lock); for_each_action_of_desc(desc, action) { - printk(KERN_ERR "[<%p>] %ps", action->handler, action->handler); + pr_err("[<%p>] %ps", action->handler, action->handler); if (action->thread_fn) - printk(KERN_CONT " threaded [<%p>] %ps", - action->thread_fn, action->thread_fn); - printk(KERN_CONT "\n"); + pr_cont(" threaded [<%p>] %ps", action->thread_fn, action->thread_fn); + pr_cont("\n"); } - raw_spin_unlock_irqrestore(&desc->lock, flags); } static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) @@ -232,18 +221,17 @@ static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) } } -static inline int -try_misrouted_irq(unsigned int irq, struct irq_desc *desc, - irqreturn_t action_ret) +static inline bool try_misrouted_irq(unsigned int irq, struct irq_desc *desc, + irqreturn_t action_ret) { struct irqaction *action; if (!irqfixup) - return 0; + return false; /* We didn't actually handle the IRQ - see if it was misrouted? */ if (action_ret == IRQ_NONE) - return 1; + return true; /* * But for 'irqfixup == 2' we also do it for handled interrupts if @@ -251,19 +239,16 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, * traditional PC timer interrupt.. Legacy) */ if (irqfixup < 2) - return 0; + return false; if (!irq) - return 1; + return true; /* * Since we don't get the descriptor lock, "action" can - * change under us. We don't really care, but we don't - * want to follow a NULL pointer. So tell the compiler to - * just load it once by using a barrier. + * change under us. */ - action = desc->action; - barrier(); + action = READ_ONCE(desc->action); return action && (action->flags & IRQF_IRQPOLL); } @@ -273,8 +258,7 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret) { unsigned int irq; - if (desc->istate & IRQS_POLL_INPROGRESS || - irq_settings_is_polled(desc)) + if (desc->istate & IRQS_POLL_INPROGRESS || irq_settings_is_polled(desc)) return; if (bad_action_ret(action_ret)) { @@ -420,13 +404,12 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret) /* * Now kill the IRQ */ - printk(KERN_EMERG "Disabling IRQ #%d\n", irq); + pr_emerg("Disabling IRQ #%d\n", irq); desc->istate |= IRQS_SPURIOUS_DISABLED; desc->depth++; irq_disable(desc); - mod_timer(&poll_spurious_irq_timer, - jiffies + POLL_SPURIOUS_IRQ_INTERVAL); + mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } desc->irqs_unhandled = 0; } @@ -436,11 +419,9 @@ bool noirqdebug __read_mostly; int noirqdebug_setup(char *str) { noirqdebug = 1; - printk(KERN_INFO "IRQ lockup detection disabled\n"); - + pr_info("IRQ lockup detection disabled\n"); return 1; } - __setup("noirqdebug", noirqdebug_setup); module_param(noirqdebug, bool, 0644); MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); @@ -452,12 +433,10 @@ static int __init irqfixup_setup(char *str) return 1; } irqfixup = 1; - printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); - printk(KERN_WARNING "This may impact system performance.\n"); - + pr_warn("Misrouted IRQ fixup support enabled.\n"); + pr_warn("This may impact system performance.\n"); return 1; } - __setup("irqfixup", irqfixup_setup); module_param(irqfixup, int, 0644); @@ -468,11 +447,8 @@ static int __init irqpoll_setup(char *str) return 1; } irqfixup = 2; - printk(KERN_WARNING "Misrouted IRQ fixup and polling support " - "enabled\n"); - printk(KERN_WARNING "This may significantly impact system " - "performance\n"); + pr_warn("Misrouted IRQ fixup and polling support enabled\n"); + pr_warn("This may significantly impact system performance\n"); return 1; } - __setup("irqpoll", irqpoll_setup); diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 93a822d3c468..7cb19e601426 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -653,13 +653,12 @@ static int __jump_label_mod_text_reserved(void *start, void *end) struct module *mod; int ret; - preempt_disable(); - mod = __module_text_address((unsigned long)start); - WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); - if (!try_module_get(mod)) - mod = NULL; - preempt_enable(); - + scoped_guard(rcu) { + mod = __module_text_address((unsigned long)start); + WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + if (!try_module_get(mod)) + mod = NULL; + } if (!mod) return 0; @@ -746,9 +745,9 @@ static int jump_label_add_module(struct module *mod) kfree(jlm); return -ENOMEM; } - preempt_disable(); - jlm2->mod = __module_address((unsigned long)key); - preempt_enable(); + scoped_guard(rcu) + jlm2->mod = __module_address((unsigned long)key); + jlm2->entries = static_key_entries(key); jlm2->next = NULL; static_key_set_mod(key, jlm2); @@ -906,13 +905,13 @@ static void jump_label_update(struct static_key *key) return; } - preempt_disable(); - mod = __module_address((unsigned long)key); - if (mod) { - stop = mod->jump_entries + mod->num_jump_entries; - init = mod->state == MODULE_STATE_COMING; + scoped_guard(rcu) { + mod = __module_address((unsigned long)key); + if (mod) { + stop = mod->jump_entries + mod->num_jump_entries; + init = mod->state == MODULE_STATE_COMING; + } } - preempt_enable(); #endif entry = static_key_entries(key); /* if there are no users, entry can be NULL */ diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index a9a0ca605d4a..4198f30aac3c 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -148,16 +148,8 @@ static unsigned int get_symbol_offset(unsigned long pos) unsigned long kallsyms_sym_address(int idx) { - /* values are unsigned offsets if --absolute-percpu is not in effect */ - if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU)) - return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; - - /* ...otherwise, positive offsets are absolute values */ - if (kallsyms_offsets[idx] >= 0) - return kallsyms_offsets[idx]; - - /* ...and negative offsets are relative to kallsyms_relative_base - 1 */ - return kallsyms_relative_base - 1 - kallsyms_offsets[idx]; + /* values are unsigned offsets */ + return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; } static unsigned int get_symbol_seq(int index) diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 2c596851f8a9..7c1a65bd5f8d 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -145,7 +145,7 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, */ task1 = find_task_by_vpid(pid1); task2 = find_task_by_vpid(pid2); - if (!task1 || !task2) + if (unlikely(!task1 || !task2)) goto err_no_task; get_task_struct(task1); diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 117d9d4d3c3b..c2871180edcc 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -1500,8 +1500,8 @@ static int access_thread(void *arg) func(); } } while (!torture_must_stop()); - del_timer_sync(&timer); - destroy_timer_on_stack(&timer); + timer_delete_sync(&timer); + timer_destroy_on_stack(&timer); torture_kthread_stopping("access_thread"); return 0; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index c0bdc1686154..9c59fa480b0b 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -210,6 +210,16 @@ int sanity_check_segment_list(struct kimage *image) } #endif + /* + * The destination addresses are searched from system RAM rather than + * being allocated from the buddy allocator, so they are not guaranteed + * to be accepted by the current kernel. Accept the destination + * addresses before kexec swaps their content with the segments' source + * pages to avoid accessing memory before it is accepted. + */ + for (i = 0; i < nr_segments; i++) + accept_memory(image->segment[i].mem, image->segment[i].memsz); + return 0; } @@ -867,6 +877,60 @@ int kimage_load_segment(struct kimage *image, return result; } +void *kimage_map_segment(struct kimage *image, + unsigned long addr, unsigned long size) +{ + unsigned long src_page_addr, dest_page_addr = 0; + unsigned long eaddr = addr + size; + kimage_entry_t *ptr, entry; + struct page **src_pages; + unsigned int npages; + void *vaddr = NULL; + int i; + + /* + * Collect the source pages and map them in a contiguous VA range. + */ + npages = PFN_UP(eaddr) - PFN_DOWN(addr); + src_pages = kmalloc_array(npages, sizeof(*src_pages), GFP_KERNEL); + if (!src_pages) { + pr_err("Could not allocate ima pages array.\n"); + return NULL; + } + + i = 0; + for_each_kimage_entry(image, ptr, entry) { + if (entry & IND_DESTINATION) { + dest_page_addr = entry & PAGE_MASK; + } else if (entry & IND_SOURCE) { + if (dest_page_addr >= addr && dest_page_addr < eaddr) { + src_page_addr = entry & PAGE_MASK; + src_pages[i++] = + virt_to_page(__va(src_page_addr)); + if (i == npages) + break; + dest_page_addr += PAGE_SIZE; + } + } + } + + /* Sanity check. */ + WARN_ON(i < npages); + + vaddr = vmap(src_pages, npages, VM_MAP, PAGE_KERNEL); + kfree(src_pages); + + if (!vaddr) + pr_err("Could not map ima buffer.\n"); + + return vaddr; +} + +void kimage_unmap_segment(void *segment_buffer) +{ + vunmap(segment_buffer); +} + struct kexec_load_limit { /* Mutex protects the limit count. */ struct mutex mutex; @@ -1013,7 +1077,7 @@ int kernel_kexec(void) error = -EBUSY; goto Restore_console; } - suspend_console(); + console_suspend_all(); error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Resume_console; @@ -1072,7 +1136,7 @@ int kernel_kexec(void) Resume_devices: dpm_resume_end(PMSG_RESTORE); Resume_console: - resume_console(); + console_resume_all(); thaw_processes(); Restore_console: pm_restore_console(); diff --git a/kernel/kexec_elf.c b/kernel/kexec_elf.c index d3689632e8b9..3a5c25b2adc9 100644 --- a/kernel/kexec_elf.c +++ b/kernel/kexec_elf.c @@ -390,7 +390,7 @@ int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, struct kexec_buf *kbuf, unsigned long *lowest_load_addr) { - unsigned long lowest_addr = UINT_MAX; + unsigned long lowest_addr = ULONG_MAX; int ret; size_t i; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 3eedb8c226ad..69fe76fd9233 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -19,7 +19,6 @@ #include <linux/list.h> #include <linux/fs.h> #include <linux/ima.h> -#include <crypto/hash.h> #include <crypto/sha2.h> #include <linux/elf.h> #include <linux/elfcore.h> @@ -38,6 +37,21 @@ void set_kexec_sig_enforced(void) } #endif +#ifdef CONFIG_IMA_KEXEC +static bool check_ima_segment_index(struct kimage *image, int i) +{ + if (image->is_ima_segment_index_set && i == image->ima_segment_index) + return true; + else + return false; +} +#else +static bool check_ima_segment_index(struct kimage *image, int i) +{ + return false; +} +#endif + static int kexec_calculate_store_digests(struct kimage *image); /* Maximum size in bytes for kernel/initrd files. */ @@ -186,6 +200,15 @@ kimage_validate_signature(struct kimage *image) } #endif +static int kexec_post_load(struct kimage *image, unsigned long flags) +{ +#ifdef CONFIG_IMA_KEXEC + if (!(flags & KEXEC_FILE_ON_CRASH)) + ima_kexec_post_load(image); +#endif + return machine_kexec_post_load(image); +} + /* * In file mode list of segments is prepared by kernel. Copy relevant * data from user space, do error checking, prepare segment list @@ -253,6 +276,11 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, /* IMA needs to pass the measurement list to the next kernel. */ ima_add_kexec_buffer(image); + /* If KHO is active, add its images to the list */ + ret = kho_fill_kimage(image); + if (ret) + goto out; + /* Call image load handler */ ldata = kexec_image_load_default(image); @@ -413,7 +441,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, kimage_terminate(image); - ret = machine_kexec_post_load(image); + ret = kexec_post_load(image, flags); if (ret) goto out; @@ -445,6 +473,7 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end, temp_end = min(end, kbuf->buf_max); temp_start = temp_end - kbuf->memsz + 1; + kexec_random_range_start(temp_start, temp_end, kbuf, &temp_start); do { /* align down start */ @@ -464,6 +493,12 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end, continue; } + /* Make sure this does not conflict with exclude range */ + if (arch_check_excluded_range(image, temp_start, temp_end)) { + temp_start = temp_start - PAGE_SIZE; + continue; + } + /* We found a suitable memory range */ break; } while (1); @@ -483,6 +518,8 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, temp_start = max(start, kbuf->buf_min); + kexec_random_range_start(temp_start, end, kbuf, &temp_start); + do { temp_start = ALIGN(temp_start, kbuf->buf_align); temp_end = temp_start + kbuf->memsz - 1; @@ -498,6 +535,12 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, continue; } + /* Make sure this does not conflict with exclude range */ + if (arch_check_excluded_range(image, temp_start, temp_end)) { + temp_start = temp_start + PAGE_SIZE; + continue; + } + /* We found a suitable memory range */ break; } while (1); @@ -636,6 +679,14 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN) return 0; + /* + * If KHO is active, only use KHO scratch memory. All other memory + * could potentially be handed over. + */ + ret = kho_locate_mem_hole(kbuf, locate_mem_hole_callback); + if (ret <= 0) + return ret; + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); else @@ -700,11 +751,10 @@ int kexec_add_buffer(struct kexec_buf *kbuf) /* Calculate and store the digest of segments */ static int kexec_calculate_store_digests(struct kimage *image) { - struct crypto_shash *tfm; - struct shash_desc *desc; + struct sha256_state state; int ret = 0, i, j, zero_buf_sz, sha_region_sz; - size_t desc_size, nullsz; - char *digest; + size_t nullsz; + u8 digest[SHA256_DIGEST_SIZE]; void *zero_buf; struct kexec_sha_region *sha_regions; struct purgatory_info *pi = &image->purgatory_info; @@ -715,37 +765,12 @@ static int kexec_calculate_store_digests(struct kimage *image) zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); zero_buf_sz = PAGE_SIZE; - tfm = crypto_alloc_shash("sha256", 0, 0); - if (IS_ERR(tfm)) { - ret = PTR_ERR(tfm); - goto out; - } - - desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); - desc = kzalloc(desc_size, GFP_KERNEL); - if (!desc) { - ret = -ENOMEM; - goto out_free_tfm; - } - sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); sha_regions = vzalloc(sha_region_sz); - if (!sha_regions) { - ret = -ENOMEM; - goto out_free_desc; - } - - desc->tfm = tfm; - - ret = crypto_shash_init(desc); - if (ret < 0) - goto out_free_sha_regions; + if (!sha_regions) + return -ENOMEM; - digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); - if (!digest) { - ret = -ENOMEM; - goto out_free_sha_regions; - } + sha256_init(&state); for (j = i = 0; i < image->nr_segments; i++) { struct kexec_segment *ksegment; @@ -764,10 +789,14 @@ static int kexec_calculate_store_digests(struct kimage *image) if (ksegment->kbuf == pi->purgatory_buf) continue; - ret = crypto_shash_update(desc, ksegment->kbuf, - ksegment->bufsz); - if (ret) - break; + /* + * Skip the segment if ima_segment_index is set and matches + * the current index + */ + if (check_ima_segment_index(image, i)) + continue; + + sha256_update(&state, ksegment->kbuf, ksegment->bufsz); /* * Assume rest of the buffer is filled with zero and @@ -779,44 +808,26 @@ static int kexec_calculate_store_digests(struct kimage *image) if (bytes > zero_buf_sz) bytes = zero_buf_sz; - ret = crypto_shash_update(desc, zero_buf, bytes); - if (ret) - break; + sha256_update(&state, zero_buf, bytes); nullsz -= bytes; } - if (ret) - break; - sha_regions[j].start = ksegment->mem; sha_regions[j].len = ksegment->memsz; j++; } - if (!ret) { - ret = crypto_shash_final(desc, digest); - if (ret) - goto out_free_digest; - ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions", - sha_regions, sha_region_sz, 0); - if (ret) - goto out_free_digest; + sha256_final(&state, digest); - ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha256_digest", - digest, SHA256_DIGEST_SIZE, 0); - if (ret) - goto out_free_digest; - } + ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions", + sha_regions, sha_region_sz, 0); + if (ret) + goto out_free_sha_regions; -out_free_digest: - kfree(digest); + ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha256_digest", + digest, SHA256_DIGEST_SIZE, 0); out_free_sha_regions: vfree(sha_regions); -out_free_desc: - kfree(desc); -out_free_tfm: - kfree(tfm); -out: return ret; } diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c new file mode 100644 index 000000000000..5a21dbe17950 --- /dev/null +++ b/kernel/kexec_handover.c @@ -0,0 +1,1271 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kexec_handover.c - kexec handover metadata processing + * Copyright (C) 2023 Alexander Graf <graf@amazon.com> + * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org> + * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com> + */ + +#define pr_fmt(fmt) "KHO: " fmt + +#include <linux/cma.h> +#include <linux/count_zeros.h> +#include <linux/debugfs.h> +#include <linux/kexec.h> +#include <linux/kexec_handover.h> +#include <linux/libfdt.h> +#include <linux/list.h> +#include <linux/memblock.h> +#include <linux/notifier.h> +#include <linux/page-isolation.h> + +#include <asm/early_ioremap.h> + +/* + * KHO is tightly coupled with mm init and needs access to some of mm + * internal APIs. + */ +#include "../mm/internal.h" +#include "kexec_internal.h" + +#define KHO_FDT_COMPATIBLE "kho-v1" +#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map" +#define PROP_SUB_FDT "fdt" + +static bool kho_enable __ro_after_init; + +bool kho_is_enabled(void) +{ + return kho_enable; +} +EXPORT_SYMBOL_GPL(kho_is_enabled); + +static int __init kho_parse_enable(char *p) +{ + return kstrtobool(p, &kho_enable); +} +early_param("kho", kho_parse_enable); + +/* + * Keep track of memory that is to be preserved across KHO. + * + * The serializing side uses two levels of xarrays to manage chunks of per-order + * 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a + * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations + * each bitmap will cover 16M of address space. Thus, for 16G of memory at most + * 512K of bitmap memory will be needed for order 0. + * + * This approach is fully incremental, as the serialization progresses folios + * can continue be aggregated to the tracker. The final step, immediately prior + * to kexec would serialize the xarray information into a linked list for the + * successor kernel to parse. + */ + +#define PRESERVE_BITS (512 * 8) + +struct kho_mem_phys_bits { + DECLARE_BITMAP(preserve, PRESERVE_BITS); +}; + +struct kho_mem_phys { + /* + * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized + * to order. + */ + struct xarray phys_bits; +}; + +struct kho_mem_track { + /* Points to kho_mem_phys, each order gets its own bitmap tree */ + struct xarray orders; +}; + +struct khoser_mem_chunk; + +struct kho_serialization { + struct page *fdt; + struct list_head fdt_list; + struct dentry *sub_fdt_dir; + struct kho_mem_track track; + /* First chunk of serialized preserved memory map */ + struct khoser_mem_chunk *preserved_mem_map; +}; + +static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz) +{ + void *elm, *res; + + elm = xa_load(xa, index); + if (elm) + return elm; + + elm = kzalloc(sz, GFP_KERNEL); + if (!elm) + return ERR_PTR(-ENOMEM); + + res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL); + if (xa_is_err(res)) + res = ERR_PTR(xa_err(res)); + + if (res) { + kfree(elm); + return res; + } + + return elm; +} + +static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, + unsigned long end_pfn) +{ + struct kho_mem_phys_bits *bits; + struct kho_mem_phys *physxa; + + while (pfn < end_pfn) { + const unsigned int order = + min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); + const unsigned long pfn_high = pfn >> order; + + physxa = xa_load(&track->orders, order); + if (!physxa) + continue; + + bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); + if (!bits) + continue; + + clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); + + pfn += 1 << order; + } +} + +static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, + unsigned int order) +{ + struct kho_mem_phys_bits *bits; + struct kho_mem_phys *physxa; + const unsigned long pfn_high = pfn >> order; + + might_sleep(); + + physxa = xa_load_or_alloc(&track->orders, order, sizeof(*physxa)); + if (IS_ERR(physxa)) + return PTR_ERR(physxa); + + bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS, + sizeof(*bits)); + if (IS_ERR(bits)) + return PTR_ERR(bits); + + set_bit(pfn_high % PRESERVE_BITS, bits->preserve); + + return 0; +} + +/* almost as free_reserved_page(), just don't free the page */ +static void kho_restore_page(struct page *page, unsigned int order) +{ + unsigned int nr_pages = (1 << order); + + /* Head page gets refcount of 1. */ + set_page_count(page, 1); + + /* For higher order folios, tail pages get a page count of zero. */ + for (unsigned int i = 1; i < nr_pages; i++) + set_page_count(page + i, 0); + + if (order > 0) + prep_compound_page(page, order); + + adjust_managed_page_count(page, nr_pages); +} + +/** + * kho_restore_folio - recreates the folio from the preserved memory. + * @phys: physical address of the folio. + * + * Return: pointer to the struct folio on success, NULL on failure. + */ +struct folio *kho_restore_folio(phys_addr_t phys) +{ + struct page *page = pfn_to_online_page(PHYS_PFN(phys)); + unsigned long order; + + if (!page) + return NULL; + + order = page->private; + if (order > MAX_PAGE_ORDER) + return NULL; + + kho_restore_page(page, order); + return page_folio(page); +} +EXPORT_SYMBOL_GPL(kho_restore_folio); + +/* Serialize and deserialize struct kho_mem_phys across kexec + * + * Record all the bitmaps in a linked list of pages for the next kernel to + * process. Each chunk holds bitmaps of the same order and each block of bitmaps + * starts at a given physical address. This allows the bitmaps to be sparse. The + * xarray is used to store them in a tree while building up the data structure, + * but the KHO successor kernel only needs to process them once in order. + * + * All of this memory is normal kmalloc() memory and is not marked for + * preservation. The successor kernel will remain isolated to the scratch space + * until it completes processing this list. Once processed all the memory + * storing these ranges will be marked as free. + */ + +struct khoser_mem_bitmap_ptr { + phys_addr_t phys_start; + DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *); +}; + +struct khoser_mem_chunk_hdr { + DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *); + unsigned int order; + unsigned int num_elms; +}; + +#define KHOSER_BITMAP_SIZE \ + ((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \ + sizeof(struct khoser_mem_bitmap_ptr)) + +struct khoser_mem_chunk { + struct khoser_mem_chunk_hdr hdr; + struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE]; +}; + +static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE); + +static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk, + unsigned long order) +{ + struct khoser_mem_chunk *chunk; + + chunk = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!chunk) + return NULL; + chunk->hdr.order = order; + if (cur_chunk) + KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk); + return chunk; +} + +static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) +{ + struct khoser_mem_chunk *chunk = first_chunk; + + while (chunk) { + struct khoser_mem_chunk *tmp = chunk; + + chunk = KHOSER_LOAD_PTR(chunk->hdr.next); + kfree(tmp); + } +} + +static int kho_mem_serialize(struct kho_serialization *ser) +{ + struct khoser_mem_chunk *first_chunk = NULL; + struct khoser_mem_chunk *chunk = NULL; + struct kho_mem_phys *physxa; + unsigned long order; + + xa_for_each(&ser->track.orders, order, physxa) { + struct kho_mem_phys_bits *bits; + unsigned long phys; + + chunk = new_chunk(chunk, order); + if (!chunk) + goto err_free; + + if (!first_chunk) + first_chunk = chunk; + + xa_for_each(&physxa->phys_bits, phys, bits) { + struct khoser_mem_bitmap_ptr *elm; + + if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) { + chunk = new_chunk(chunk, order); + if (!chunk) + goto err_free; + } + + elm = &chunk->bitmaps[chunk->hdr.num_elms]; + chunk->hdr.num_elms++; + elm->phys_start = (phys * PRESERVE_BITS) + << (order + PAGE_SHIFT); + KHOSER_STORE_PTR(elm->bitmap, bits); + } + } + + ser->preserved_mem_map = first_chunk; + + return 0; + +err_free: + kho_mem_ser_free(first_chunk); + return -ENOMEM; +} + +static void deserialize_bitmap(unsigned int order, + struct khoser_mem_bitmap_ptr *elm) +{ + struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap); + unsigned long bit; + + for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) { + int sz = 1 << (order + PAGE_SHIFT); + phys_addr_t phys = + elm->phys_start + (bit << (order + PAGE_SHIFT)); + struct page *page = phys_to_page(phys); + + memblock_reserve(phys, sz); + memblock_reserved_mark_noinit(phys, sz); + page->private = order; + } +} + +static void __init kho_mem_deserialize(const void *fdt) +{ + struct khoser_mem_chunk *chunk; + const phys_addr_t *mem; + int len; + + mem = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len); + + if (!mem || len != sizeof(*mem)) { + pr_err("failed to get preserved memory bitmaps\n"); + return; + } + + chunk = *mem ? phys_to_virt(*mem) : NULL; + while (chunk) { + unsigned int i; + + for (i = 0; i != chunk->hdr.num_elms; i++) + deserialize_bitmap(chunk->hdr.order, + &chunk->bitmaps[i]); + chunk = KHOSER_LOAD_PTR(chunk->hdr.next); + } +} + +/* + * With KHO enabled, memory can become fragmented because KHO regions may + * be anywhere in physical address space. The scratch regions give us a + * safe zones that we will never see KHO allocations from. This is where we + * can later safely load our new kexec images into and then use the scratch + * area for early allocations that happen before page allocator is + * initialized. + */ +static struct kho_scratch *kho_scratch; +static unsigned int kho_scratch_cnt; + +/* + * The scratch areas are scaled by default as percent of memory allocated from + * memblock. A user can override the scale with command line parameter: + * + * kho_scratch=N% + * + * It is also possible to explicitly define size for a lowmem, a global and + * per-node scratch areas: + * + * kho_scratch=l[KMG],n[KMG],m[KMG] + * + * The explicit size definition takes precedence over scale definition. + */ +static unsigned int scratch_scale __initdata = 200; +static phys_addr_t scratch_size_global __initdata; +static phys_addr_t scratch_size_pernode __initdata; +static phys_addr_t scratch_size_lowmem __initdata; + +static int __init kho_parse_scratch_size(char *p) +{ + size_t len; + unsigned long sizes[3]; + int i; + + if (!p) + return -EINVAL; + + len = strlen(p); + if (!len) + return -EINVAL; + + /* parse nn% */ + if (p[len - 1] == '%') { + /* unsigned int max is 4,294,967,295, 10 chars */ + char s_scale[11] = {}; + int ret = 0; + + if (len > ARRAY_SIZE(s_scale)) + return -EINVAL; + + memcpy(s_scale, p, len - 1); + ret = kstrtouint(s_scale, 10, &scratch_scale); + if (!ret) + pr_notice("scratch scale is %d%%\n", scratch_scale); + return ret; + } + + /* parse ll[KMG],mm[KMG],nn[KMG] */ + for (i = 0; i < ARRAY_SIZE(sizes); i++) { + char *endp = p; + + if (i > 0) { + if (*p != ',') + return -EINVAL; + p += 1; + } + + sizes[i] = memparse(p, &endp); + if (!sizes[i] || endp == p) + return -EINVAL; + p = endp; + } + + scratch_size_lowmem = sizes[0]; + scratch_size_global = sizes[1]; + scratch_size_pernode = sizes[2]; + scratch_scale = 0; + + pr_notice("scratch areas: lowmem: %lluMiB global: %lluMiB pernode: %lldMiB\n", + (u64)(scratch_size_lowmem >> 20), + (u64)(scratch_size_global >> 20), + (u64)(scratch_size_pernode >> 20)); + + return 0; +} +early_param("kho_scratch", kho_parse_scratch_size); + +static void __init scratch_size_update(void) +{ + phys_addr_t size; + + if (!scratch_scale) + return; + + size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT, + NUMA_NO_NODE); + size = size * scratch_scale / 100; + scratch_size_lowmem = round_up(size, CMA_MIN_ALIGNMENT_BYTES); + + size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE, + NUMA_NO_NODE); + size = size * scratch_scale / 100 - scratch_size_lowmem; + scratch_size_global = round_up(size, CMA_MIN_ALIGNMENT_BYTES); +} + +static phys_addr_t __init scratch_size_node(int nid) +{ + phys_addr_t size; + + if (scratch_scale) { + size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE, + nid); + size = size * scratch_scale / 100; + } else { + size = scratch_size_pernode; + } + + return round_up(size, CMA_MIN_ALIGNMENT_BYTES); +} + +/** + * kho_reserve_scratch - Reserve a contiguous chunk of memory for kexec + * + * With KHO we can preserve arbitrary pages in the system. To ensure we still + * have a large contiguous region of memory when we search the physical address + * space for target memory, let's make sure we always have a large CMA region + * active. This CMA region will only be used for movable pages which are not a + * problem for us during KHO because we can just move them somewhere else. + */ +static void __init kho_reserve_scratch(void) +{ + phys_addr_t addr, size; + int nid, i = 0; + + if (!kho_enable) + return; + + scratch_size_update(); + + /* FIXME: deal with node hot-plug/remove */ + kho_scratch_cnt = num_online_nodes() + 2; + size = kho_scratch_cnt * sizeof(*kho_scratch); + kho_scratch = memblock_alloc(size, PAGE_SIZE); + if (!kho_scratch) + goto err_disable_kho; + + /* + * reserve scratch area in low memory for lowmem allocations in the + * next kernel + */ + size = scratch_size_lowmem; + addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0, + ARCH_LOW_ADDRESS_LIMIT); + if (!addr) + goto err_free_scratch_desc; + + kho_scratch[i].addr = addr; + kho_scratch[i].size = size; + i++; + + /* reserve large contiguous area for allocations without nid */ + size = scratch_size_global; + addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES); + if (!addr) + goto err_free_scratch_areas; + + kho_scratch[i].addr = addr; + kho_scratch[i].size = size; + i++; + + for_each_online_node(nid) { + size = scratch_size_node(nid); + addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES, + 0, MEMBLOCK_ALLOC_ACCESSIBLE, + nid, true); + if (!addr) + goto err_free_scratch_areas; + + kho_scratch[i].addr = addr; + kho_scratch[i].size = size; + i++; + } + + return; + +err_free_scratch_areas: + for (i--; i >= 0; i--) + memblock_phys_free(kho_scratch[i].addr, kho_scratch[i].size); +err_free_scratch_desc: + memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch)); +err_disable_kho: + kho_enable = false; +} + +struct fdt_debugfs { + struct list_head list; + struct debugfs_blob_wrapper wrapper; + struct dentry *file; +}; + +static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir, + const char *name, const void *fdt) +{ + struct fdt_debugfs *f; + struct dentry *file; + + f = kmalloc(sizeof(*f), GFP_KERNEL); + if (!f) + return -ENOMEM; + + f->wrapper.data = (void *)fdt; + f->wrapper.size = fdt_totalsize(fdt); + + file = debugfs_create_blob(name, 0400, dir, &f->wrapper); + if (IS_ERR(file)) { + kfree(f); + return PTR_ERR(file); + } + + f->file = file; + list_add(&f->list, list); + + return 0; +} + +/** + * kho_add_subtree - record the physical address of a sub FDT in KHO root tree. + * @ser: serialization control object passed by KHO notifiers. + * @name: name of the sub tree. + * @fdt: the sub tree blob. + * + * Creates a new child node named @name in KHO root FDT and records + * the physical address of @fdt. The pages of @fdt must also be preserved + * by KHO for the new kernel to retrieve it after kexec. + * + * A debugfs blob entry is also created at + * ``/sys/kernel/debug/kho/out/sub_fdts/@name``. + * + * Return: 0 on success, error code on failure + */ +int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt) +{ + int err = 0; + u64 phys = (u64)virt_to_phys(fdt); + void *root = page_to_virt(ser->fdt); + + err |= fdt_begin_node(root, name); + err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys)); + err |= fdt_end_node(root); + + if (err) + return err; + + return kho_debugfs_fdt_add(&ser->fdt_list, ser->sub_fdt_dir, name, fdt); +} +EXPORT_SYMBOL_GPL(kho_add_subtree); + +struct kho_out { + struct blocking_notifier_head chain_head; + + struct dentry *dir; + + struct mutex lock; /* protects KHO FDT finalization */ + + struct kho_serialization ser; + bool finalized; +}; + +static struct kho_out kho_out = { + .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head), + .lock = __MUTEX_INITIALIZER(kho_out.lock), + .ser = { + .fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list), + .track = { + .orders = XARRAY_INIT(kho_out.ser.track.orders, 0), + }, + }, + .finalized = false, +}; + +int register_kho_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&kho_out.chain_head, nb); +} +EXPORT_SYMBOL_GPL(register_kho_notifier); + +int unregister_kho_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&kho_out.chain_head, nb); +} +EXPORT_SYMBOL_GPL(unregister_kho_notifier); + +/** + * kho_preserve_folio - preserve a folio across kexec. + * @folio: folio to preserve. + * + * Instructs KHO to preserve the whole folio across kexec. The order + * will be preserved as well. + * + * Return: 0 on success, error code on failure + */ +int kho_preserve_folio(struct folio *folio) +{ + const unsigned long pfn = folio_pfn(folio); + const unsigned int order = folio_order(folio); + struct kho_mem_track *track = &kho_out.ser.track; + + if (kho_out.finalized) + return -EBUSY; + + return __kho_preserve_order(track, pfn, order); +} +EXPORT_SYMBOL_GPL(kho_preserve_folio); + +/** + * kho_preserve_phys - preserve a physically contiguous range across kexec. + * @phys: physical address of the range. + * @size: size of the range. + * + * Instructs KHO to preserve the memory range from @phys to @phys + @size + * across kexec. + * + * Return: 0 on success, error code on failure + */ +int kho_preserve_phys(phys_addr_t phys, size_t size) +{ + unsigned long pfn = PHYS_PFN(phys); + unsigned long failed_pfn = 0; + const unsigned long start_pfn = pfn; + const unsigned long end_pfn = PHYS_PFN(phys + size); + int err = 0; + struct kho_mem_track *track = &kho_out.ser.track; + + if (kho_out.finalized) + return -EBUSY; + + if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size)) + return -EINVAL; + + while (pfn < end_pfn) { + const unsigned int order = + min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); + + err = __kho_preserve_order(track, pfn, order); + if (err) { + failed_pfn = pfn; + break; + } + + pfn += 1 << order; + } + + if (err) + __kho_unpreserve(track, start_pfn, failed_pfn); + + return err; +} +EXPORT_SYMBOL_GPL(kho_preserve_phys); + +/* Handling for debug/kho/out */ + +static struct dentry *debugfs_root; + +static int kho_out_update_debugfs_fdt(void) +{ + int err = 0; + struct fdt_debugfs *ff, *tmp; + + if (kho_out.finalized) { + err = kho_debugfs_fdt_add(&kho_out.ser.fdt_list, kho_out.dir, + "fdt", page_to_virt(kho_out.ser.fdt)); + } else { + list_for_each_entry_safe(ff, tmp, &kho_out.ser.fdt_list, list) { + debugfs_remove(ff->file); + list_del(&ff->list); + kfree(ff); + } + } + + return err; +} + +static int kho_abort(void) +{ + int err; + unsigned long order; + struct kho_mem_phys *physxa; + + xa_for_each(&kho_out.ser.track.orders, order, physxa) { + struct kho_mem_phys_bits *bits; + unsigned long phys; + + xa_for_each(&physxa->phys_bits, phys, bits) + kfree(bits); + + xa_destroy(&physxa->phys_bits); + kfree(physxa); + } + xa_destroy(&kho_out.ser.track.orders); + + if (kho_out.ser.preserved_mem_map) { + kho_mem_ser_free(kho_out.ser.preserved_mem_map); + kho_out.ser.preserved_mem_map = NULL; + } + + err = blocking_notifier_call_chain(&kho_out.chain_head, KEXEC_KHO_ABORT, + NULL); + err = notifier_to_errno(err); + + if (err) + pr_err("Failed to abort KHO finalization: %d\n", err); + + return err; +} + +static int kho_finalize(void) +{ + int err = 0; + u64 *preserved_mem_map; + void *fdt = page_to_virt(kho_out.ser.fdt); + + err |= fdt_create(fdt, PAGE_SIZE); + err |= fdt_finish_reservemap(fdt); + err |= fdt_begin_node(fdt, ""); + err |= fdt_property_string(fdt, "compatible", KHO_FDT_COMPATIBLE); + /** + * Reserve the preserved-memory-map property in the root FDT, so + * that all property definitions will precede subnodes created by + * KHO callers. + */ + err |= fdt_property_placeholder(fdt, PROP_PRESERVED_MEMORY_MAP, + sizeof(*preserved_mem_map), + (void **)&preserved_mem_map); + if (err) + goto abort; + + err = kho_preserve_folio(page_folio(kho_out.ser.fdt)); + if (err) + goto abort; + + err = blocking_notifier_call_chain(&kho_out.chain_head, + KEXEC_KHO_FINALIZE, &kho_out.ser); + err = notifier_to_errno(err); + if (err) + goto abort; + + err = kho_mem_serialize(&kho_out.ser); + if (err) + goto abort; + + *preserved_mem_map = (u64)virt_to_phys(kho_out.ser.preserved_mem_map); + + err |= fdt_end_node(fdt); + err |= fdt_finish(fdt); + +abort: + if (err) { + pr_err("Failed to convert KHO state tree: %d\n", err); + kho_abort(); + } + + return err; +} + +static int kho_out_finalize_get(void *data, u64 *val) +{ + mutex_lock(&kho_out.lock); + *val = kho_out.finalized; + mutex_unlock(&kho_out.lock); + + return 0; +} + +static int kho_out_finalize_set(void *data, u64 _val) +{ + int ret = 0; + bool val = !!_val; + + mutex_lock(&kho_out.lock); + + if (val == kho_out.finalized) { + if (kho_out.finalized) + ret = -EEXIST; + else + ret = -ENOENT; + goto unlock; + } + + if (val) + ret = kho_finalize(); + else + ret = kho_abort(); + + if (ret) + goto unlock; + + kho_out.finalized = val; + ret = kho_out_update_debugfs_fdt(); + +unlock: + mutex_unlock(&kho_out.lock); + return ret; +} + +DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get, + kho_out_finalize_set, "%llu\n"); + +static int scratch_phys_show(struct seq_file *m, void *v) +{ + for (int i = 0; i < kho_scratch_cnt; i++) + seq_printf(m, "0x%llx\n", kho_scratch[i].addr); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(scratch_phys); + +static int scratch_len_show(struct seq_file *m, void *v) +{ + for (int i = 0; i < kho_scratch_cnt; i++) + seq_printf(m, "0x%llx\n", kho_scratch[i].size); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(scratch_len); + +static __init int kho_out_debugfs_init(void) +{ + struct dentry *dir, *f, *sub_fdt_dir; + + dir = debugfs_create_dir("out", debugfs_root); + if (IS_ERR(dir)) + return -ENOMEM; + + sub_fdt_dir = debugfs_create_dir("sub_fdts", dir); + if (IS_ERR(sub_fdt_dir)) + goto err_rmdir; + + f = debugfs_create_file("scratch_phys", 0400, dir, NULL, + &scratch_phys_fops); + if (IS_ERR(f)) + goto err_rmdir; + + f = debugfs_create_file("scratch_len", 0400, dir, NULL, + &scratch_len_fops); + if (IS_ERR(f)) + goto err_rmdir; + + f = debugfs_create_file("finalize", 0600, dir, NULL, + &fops_kho_out_finalize); + if (IS_ERR(f)) + goto err_rmdir; + + kho_out.dir = dir; + kho_out.ser.sub_fdt_dir = sub_fdt_dir; + return 0; + +err_rmdir: + debugfs_remove_recursive(dir); + return -ENOENT; +} + +struct kho_in { + struct dentry *dir; + phys_addr_t fdt_phys; + phys_addr_t scratch_phys; + struct list_head fdt_list; +}; + +static struct kho_in kho_in = { + .fdt_list = LIST_HEAD_INIT(kho_in.fdt_list), +}; + +static const void *kho_get_fdt(void) +{ + return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL; +} + +/** + * kho_retrieve_subtree - retrieve a preserved sub FDT by its name. + * @name: the name of the sub FDT passed to kho_add_subtree(). + * @phys: if found, the physical address of the sub FDT is stored in @phys. + * + * Retrieve a preserved sub FDT named @name and store its physical + * address in @phys. + * + * Return: 0 on success, error code on failure + */ +int kho_retrieve_subtree(const char *name, phys_addr_t *phys) +{ + const void *fdt = kho_get_fdt(); + const u64 *val; + int offset, len; + + if (!fdt) + return -ENOENT; + + if (!phys) + return -EINVAL; + + offset = fdt_subnode_offset(fdt, 0, name); + if (offset < 0) + return -ENOENT; + + val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len); + if (!val || len != sizeof(*val)) + return -EINVAL; + + *phys = (phys_addr_t)*val; + + return 0; +} +EXPORT_SYMBOL_GPL(kho_retrieve_subtree); + +/* Handling for debugfs/kho/in */ + +static __init int kho_in_debugfs_init(const void *fdt) +{ + struct dentry *sub_fdt_dir; + int err, child; + + kho_in.dir = debugfs_create_dir("in", debugfs_root); + if (IS_ERR(kho_in.dir)) + return PTR_ERR(kho_in.dir); + + sub_fdt_dir = debugfs_create_dir("sub_fdts", kho_in.dir); + if (IS_ERR(sub_fdt_dir)) { + err = PTR_ERR(sub_fdt_dir); + goto err_rmdir; + } + + err = kho_debugfs_fdt_add(&kho_in.fdt_list, kho_in.dir, "fdt", fdt); + if (err) + goto err_rmdir; + + fdt_for_each_subnode(child, fdt, 0) { + int len = 0; + const char *name = fdt_get_name(fdt, child, NULL); + const u64 *fdt_phys; + + fdt_phys = fdt_getprop(fdt, child, "fdt", &len); + if (!fdt_phys) + continue; + if (len != sizeof(*fdt_phys)) { + pr_warn("node `%s`'s prop `fdt` has invalid length: %d\n", + name, len); + continue; + } + err = kho_debugfs_fdt_add(&kho_in.fdt_list, sub_fdt_dir, name, + phys_to_virt(*fdt_phys)); + if (err) { + pr_warn("failed to add fdt `%s` to debugfs: %d\n", name, + err); + continue; + } + } + + return 0; + +err_rmdir: + debugfs_remove_recursive(kho_in.dir); + return err; +} + +static __init int kho_init(void) +{ + int err = 0; + const void *fdt = kho_get_fdt(); + + if (!kho_enable) + return 0; + + kho_out.ser.fdt = alloc_page(GFP_KERNEL); + if (!kho_out.ser.fdt) { + err = -ENOMEM; + goto err_free_scratch; + } + + debugfs_root = debugfs_create_dir("kho", NULL); + if (IS_ERR(debugfs_root)) { + err = -ENOENT; + goto err_free_fdt; + } + + err = kho_out_debugfs_init(); + if (err) + goto err_free_fdt; + + if (fdt) { + err = kho_in_debugfs_init(fdt); + /* + * Failure to create /sys/kernel/debug/kho/in does not prevent + * reviving state from KHO and setting up KHO for the next + * kexec. + */ + if (err) + pr_err("failed exposing handover FDT in debugfs: %d\n", + err); + + return 0; + } + + for (int i = 0; i < kho_scratch_cnt; i++) { + unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr); + unsigned long count = kho_scratch[i].size >> PAGE_SHIFT; + unsigned long pfn; + + for (pfn = base_pfn; pfn < base_pfn + count; + pfn += pageblock_nr_pages) + init_cma_reserved_pageblock(pfn_to_page(pfn)); + } + + return 0; + +err_free_fdt: + put_page(kho_out.ser.fdt); + kho_out.ser.fdt = NULL; +err_free_scratch: + for (int i = 0; i < kho_scratch_cnt; i++) { + void *start = __va(kho_scratch[i].addr); + void *end = start + kho_scratch[i].size; + + free_reserved_area(start, end, -1, ""); + } + kho_enable = false; + return err; +} +late_initcall(kho_init); + +static void __init kho_release_scratch(void) +{ + phys_addr_t start, end; + u64 i; + + memmap_init_kho_scratch_pages(); + + /* + * Mark scratch mem as CMA before we return it. That way we + * ensure that no kernel allocations happen on it. That means + * we can reuse it as scratch memory again later. + */ + __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, + MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) { + ulong start_pfn = pageblock_start_pfn(PFN_DOWN(start)); + ulong end_pfn = pageblock_align(PFN_UP(end)); + ulong pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) + set_pageblock_migratetype(pfn_to_page(pfn), + MIGRATE_CMA); + } +} + +void __init kho_memory_init(void) +{ + struct folio *folio; + + if (kho_in.scratch_phys) { + kho_scratch = phys_to_virt(kho_in.scratch_phys); + kho_release_scratch(); + + kho_mem_deserialize(kho_get_fdt()); + folio = kho_restore_folio(kho_in.fdt_phys); + if (!folio) + pr_warn("failed to restore folio for KHO fdt\n"); + } else { + kho_reserve_scratch(); + } +} + +void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, + phys_addr_t scratch_phys, u64 scratch_len) +{ + void *fdt = NULL; + struct kho_scratch *scratch = NULL; + int err = 0; + unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch); + + /* Validate the input FDT */ + fdt = early_memremap(fdt_phys, fdt_len); + if (!fdt) { + pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys); + err = -EFAULT; + goto out; + } + err = fdt_check_header(fdt); + if (err) { + pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n", + fdt_phys, err); + err = -EINVAL; + goto out; + } + err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE); + if (err) { + pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n", + fdt_phys, KHO_FDT_COMPATIBLE, err); + err = -EINVAL; + goto out; + } + + scratch = early_memremap(scratch_phys, scratch_len); + if (!scratch) { + pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n", + scratch_phys, scratch_len); + err = -EFAULT; + goto out; + } + + /* + * We pass a safe contiguous blocks of memory to use for early boot + * purporses from the previous kernel so that we can resize the + * memblock array as needed. + */ + for (int i = 0; i < scratch_cnt; i++) { + struct kho_scratch *area = &scratch[i]; + u64 size = area->size; + + memblock_add(area->addr, size); + err = memblock_mark_kho_scratch(area->addr, size); + if (WARN_ON(err)) { + pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %d", + &area->addr, &size, err); + goto out; + } + pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size); + } + + memblock_reserve(scratch_phys, scratch_len); + + /* + * Now that we have a viable region of scratch memory, let's tell + * the memblocks allocator to only use that for any allocations. + * That way we ensure that nothing scribbles over in use data while + * we initialize the page tables which we will need to ingest all + * memory reservations from the previous kernel. + */ + memblock_set_kho_scratch_only(); + + kho_in.fdt_phys = fdt_phys; + kho_in.scratch_phys = scratch_phys; + kho_scratch_cnt = scratch_cnt; + pr_info("found kexec handover data. Will skip init for some devices\n"); + +out: + if (fdt) + early_memunmap(fdt, fdt_len); + if (scratch) + early_memunmap(scratch, scratch_len); + if (err) + pr_warn("disabling KHO revival: %d\n", err); +} + +/* Helper functions for kexec_file_load */ + +int kho_fill_kimage(struct kimage *image) +{ + ssize_t scratch_size; + int err = 0; + struct kexec_buf scratch; + + if (!kho_enable) + return 0; + + image->kho.fdt = page_to_phys(kho_out.ser.fdt); + + scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt; + scratch = (struct kexec_buf){ + .image = image, + .buffer = kho_scratch, + .bufsz = scratch_size, + .mem = KEXEC_BUF_MEM_UNKNOWN, + .memsz = scratch_size, + .buf_align = SZ_64K, /* Makes it easier to map */ + .buf_max = ULONG_MAX, + .top_down = true, + }; + err = kexec_add_buffer(&scratch); + if (err) + return err; + image->kho.scratch = &image->segment[image->nr_segments - 1]; + + return 0; +} + +static int kho_walk_scratch(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + int ret = 0; + int i; + + for (i = 0; i < kho_scratch_cnt; i++) { + struct resource res = { + .start = kho_scratch[i].addr, + .end = kho_scratch[i].addr + kho_scratch[i].size - 1, + }; + + /* Try to fit the kimage into our KHO scratch region */ + ret = func(&res, kbuf); + if (ret) + break; + } + + return ret; +} + +int kho_locate_mem_hole(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + int ret; + + if (!kho_enable || kbuf->image->type == KEXEC_TYPE_CRASH) + return 1; + + ret = kho_walk_scratch(kbuf, func); + + return ret == 1 ? 0 : -EADDRNOTAVAIL; +} diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index d35d9792402d..30a733a55a67 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -39,4 +39,20 @@ extern size_t kexec_purgatory_size; #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } #endif /* CONFIG_KEXEC_FILE */ + +struct kexec_buf; + +#ifdef CONFIG_KEXEC_HANDOVER +int kho_locate_mem_hole(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)); +int kho_fill_kimage(struct kimage *image); +#else +static inline int kho_locate_mem_hole(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + return 1; +} + +static inline int kho_fill_kimage(struct kimage *image) { return 0; } +#endif /* CONFIG_KEXEC_HANDOVER */ #endif /* LINUX_KEXEC_INTERNAL_H */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 88aeac84e4c0..ffe0c3d52306 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1547,7 +1547,7 @@ static int check_kprobe_address_safe(struct kprobe *p, /* Ensure the address is in a text area, and find a module if exists. */ *probed_mod = NULL; if (!core_kernel_text((unsigned long) p->addr)) { - guard(preempt)(); + guard(rcu)(); *probed_mod = __module_text_address((unsigned long) p->addr); if (!(*probed_mod)) return -EINVAL; diff --git a/kernel/kthread.c b/kernel/kthread.c index 5dc5b0d7238e..85fc068f0083 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1207,7 +1207,8 @@ EXPORT_SYMBOL_GPL(kthread_queue_work); */ void kthread_delayed_work_timer_fn(struct timer_list *t) { - struct kthread_delayed_work *dwork = from_timer(dwork, t, timer); + struct kthread_delayed_work *dwork = timer_container_of(dwork, t, + timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; unsigned long flags; @@ -1362,14 +1363,14 @@ static void kthread_cancel_delayed_work_timer(struct kthread_work *work, struct kthread_worker *worker = work->worker; /* - * del_timer_sync() must be called to make sure that the timer + * timer_delete_sync() must be called to make sure that the timer * callback is not running. The lock must be temporary released * to avoid a deadlock with the callback. In the meantime, * any queuing is blocked by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, *flags); - del_timer_sync(&dwork->timer); + timer_delete_sync(&dwork->timer); raw_spin_lock_irqsave(&worker->lock, *flags); work->canceling--; } diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 0cd39954d5a1..0e73fac55f8e 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -59,7 +59,7 @@ static void klp_find_object_module(struct klp_object *obj) if (!klp_is_module(obj)) return; - rcu_read_lock_sched(); + guard(rcu)(); /* * We do not want to block removal of patched modules and therefore * we do not take a reference here. The patches are removed by @@ -75,8 +75,6 @@ static void klp_find_object_module(struct klp_object *obj) */ if (mod && mod->klp_alive) obj->mod = mod; - - rcu_read_unlock_sched(); } static bool klp_initialized(void) @@ -601,9 +599,12 @@ static int klp_add_object_nops(struct klp_patch *patch, } /* - * Add 'nop' functions which simply return to the caller to run - * the original function. The 'nop' functions are added to a - * patch to facilitate a 'replace' mode. + * Add 'nop' functions which simply return to the caller to run the + * original function. + * + * They are added only when the atomic replace mode is used and only for + * functions which are currently livepatched but are no longer included + * in the new livepatch. */ static int klp_add_nops(struct klp_patch *patch) { diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index ba069459c101..2351a19ac2a9 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -29,22 +29,13 @@ static unsigned int klp_signals_cnt; /* * When a livepatch is in progress, enable klp stack checking in - * cond_resched(). This helps CPU-bound kthreads get patched. + * schedule(). This helps CPU-bound kthreads get patched. */ -#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) - -#define klp_cond_resched_enable() sched_dynamic_klp_enable() -#define klp_cond_resched_disable() sched_dynamic_klp_disable() - -#else /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ DEFINE_STATIC_KEY_FALSE(klp_sched_try_switch_key); -EXPORT_SYMBOL(klp_sched_try_switch_key); -#define klp_cond_resched_enable() static_branch_enable(&klp_sched_try_switch_key) -#define klp_cond_resched_disable() static_branch_disable(&klp_sched_try_switch_key) - -#endif /* CONFIG_PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ +#define klp_resched_enable() static_branch_enable(&klp_sched_try_switch_key) +#define klp_resched_disable() static_branch_disable(&klp_sched_try_switch_key) /* * This work can be performed periodically to finish patching or unpatching any @@ -365,26 +356,18 @@ static bool klp_try_switch_task(struct task_struct *task) void __klp_sched_try_switch(void) { - if (likely(!klp_patch_pending(current))) - return; - /* - * This function is called from cond_resched() which is called in many - * places throughout the kernel. Using the klp_mutex here might - * deadlock. - * - * Instead, disable preemption to prevent racing with other callers of - * klp_try_switch_task(). Thanks to task_call_func() they won't be - * able to switch this task while it's running. + * This function is called from __schedule() while a context switch is + * about to happen. Preemption is already disabled and klp_mutex + * can't be acquired. + * Disabled preemption is used to prevent racing with other callers of + * klp_try_switch_task(). Thanks to task_call_func() they won't be + * able to switch to this task while it's running. */ - preempt_disable(); + lockdep_assert_preemption_disabled(); - /* - * Make sure current didn't get patched between the above check and - * preempt_disable(). - */ - if (unlikely(!klp_patch_pending(current))) - goto out; + if (likely(!klp_patch_pending(current))) + return; /* * Enforce the order of the TIF_PATCH_PENDING read above and the @@ -395,11 +378,7 @@ void __klp_sched_try_switch(void) smp_rmb(); klp_try_switch_task(current); - -out: - preempt_enable(); } -EXPORT_SYMBOL(__klp_sched_try_switch); /* * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set. @@ -508,7 +487,7 @@ void klp_try_complete_transition(void) } /* Done! Now cleanup the data structures. */ - klp_cond_resched_disable(); + klp_resched_disable(); patch = klp_transition_patch; klp_complete_transition(); @@ -560,7 +539,7 @@ void klp_start_transition(void) set_tsk_thread_flag(task, TIF_PATCH_PENDING); } - klp_cond_resched_enable(); + klp_resched_enable(); klp_signals_cnt = 0; } diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 0db4093d17b8..a114949eeed5 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -5,7 +5,8 @@ KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o -# Avoid recursion lockdep -> sanitizer -> ... -> lockdep. +# Avoid recursion lockdep -> sanitizer -> ... -> lockdep & improve performance. +KASAN_SANITIZE_lockdep.o := n KCSAN_SANITIZE_lockdep.o := n ifdef CONFIG_FUNCTION_TRACER diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index 97fb6f3f840a..4e36258cc34f 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -50,6 +50,11 @@ LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */ #endif /* CONFIG_QUEUED_SPINLOCKS */ /* + * Locking events for Resilient Queued Spin Lock + */ +LOCK_EVENT(rqspinlock_lock_timeout) /* # of locking ops that timeout */ + +/* * Locking events for rwsem */ LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */ @@ -67,3 +72,31 @@ LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */ LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */ LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */ LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */ + +/* + * Locking events for rtlock_slowlock() + */ +LOCK_EVENT(rtlock_slowlock) /* # of rtlock_slowlock() calls */ +LOCK_EVENT(rtlock_slow_acq1) /* # of locks acquired after wait_lock */ +LOCK_EVENT(rtlock_slow_acq2) /* # of locks acquired in for loop */ +LOCK_EVENT(rtlock_slow_sleep) /* # of sleeps */ +LOCK_EVENT(rtlock_slow_wake) /* # of wakeup's */ + +/* + * Locking events for rt_mutex_slowlock() + */ +LOCK_EVENT(rtmutex_slowlock) /* # of rt_mutex_slowlock() calls */ +LOCK_EVENT(rtmutex_slow_block) /* # of rt_mutex_slowlock_block() calls */ +LOCK_EVENT(rtmutex_slow_acq1) /* # of locks acquired after wait_lock */ +LOCK_EVENT(rtmutex_slow_acq2) /* # of locks acquired at the end */ +LOCK_EVENT(rtmutex_slow_acq3) /* # of locks acquired in *block() */ +LOCK_EVENT(rtmutex_slow_sleep) /* # of sleeps */ +LOCK_EVENT(rtmutex_slow_wake) /* # of wakeup's */ +LOCK_EVENT(rtmutex_deadlock) /* # of rt_mutex_handle_deadlock()'s */ + +/* + * Locking events for lockdep + */ +LOCK_EVENT(lockdep_acquire) +LOCK_EVENT(lockdep_lock) +LOCK_EVENT(lockdep_nocheck) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4470680f0226..dd2bbf73718b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -57,10 +57,12 @@ #include <linux/lockdep.h> #include <linux/context_tracking.h> #include <linux/console.h> +#include <linux/kasan.h> #include <asm/sections.h> #include "lockdep_internals.h" +#include "lock_events.h" #include <trace/events/lock.h> @@ -170,6 +172,7 @@ static struct task_struct *lockdep_selftest_task_struct; static int graph_lock(void) { lockdep_lock(); + lockevent_inc(lockdep_lock); /* * Make sure that if another CPU detected a bug while * walking the graph we dont change it (while the other @@ -216,6 +219,7 @@ static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES); static struct hlist_head lock_keys_hash[KEYHASH_SIZE]; unsigned long nr_lock_classes; unsigned long nr_zapped_classes; +unsigned long nr_dynamic_keys; unsigned long max_lock_class_idx; struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS); @@ -1235,6 +1239,7 @@ void lockdep_register_key(struct lock_class_key *key) goto out_unlock; } hlist_add_head_rcu(&key->hash_entry, hash_head); + nr_dynamic_keys++; out_unlock: graph_unlock(); restore_irqs: @@ -1974,41 +1979,6 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, } /* - * We are about to add A -> B into the dependency graph, and in __bfs() a - * strong dependency path A -> .. -> B is found: hlock_class equals - * entry->class. - * - * If A -> .. -> B can replace A -> B in any __bfs() search (means the former - * is _stronger_ than or equal to the latter), we consider A -> B as redundant. - * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A - * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the - * dependency graph, as any strong path ..-> A -> B ->.. we can get with - * having dependency A -> B, we could already get a equivalent path ..-> A -> - * .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant. - * - * We need to make sure both the start and the end of A -> .. -> B is not - * weaker than A -> B. For the start part, please see the comment in - * check_redundant(). For the end part, we need: - * - * Either - * - * a) A -> B is -(*R)-> (everything is not weaker than that) - * - * or - * - * b) A -> .. -> B is -(*N)-> (nothing is stronger than this) - * - */ -static inline bool hlock_equal(struct lock_list *entry, void *data) -{ - struct held_lock *hlock = (struct held_lock *)data; - - return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */ - (hlock->read == 2 || /* A -> B is -(*R)-> */ - !entry->only_xr); /* A -> .. -> B is -(*N)-> */ -} - -/* * We are about to add B -> A into the dependency graph, and in __bfs() a * strong dependency path A -> .. -> B is found: hlock_class equals * entry->class. @@ -2913,6 +2883,41 @@ static inline bool usage_skip(struct lock_list *entry, void *mask) #ifdef CONFIG_LOCKDEP_SMALL /* + * We are about to add A -> B into the dependency graph, and in __bfs() a + * strong dependency path A -> .. -> B is found: hlock_class equals + * entry->class. + * + * If A -> .. -> B can replace A -> B in any __bfs() search (means the former + * is _stronger_ than or equal to the latter), we consider A -> B as redundant. + * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A + * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the + * dependency graph, as any strong path ..-> A -> B ->.. we can get with + * having dependency A -> B, we could already get a equivalent path ..-> A -> + * .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant. + * + * We need to make sure both the start and the end of A -> .. -> B is not + * weaker than A -> B. For the start part, please see the comment in + * check_redundant(). For the end part, we need: + * + * Either + * + * a) A -> B is -(*R)-> (everything is not weaker than that) + * + * or + * + * b) A -> .. -> B is -(*N)-> (nothing is stronger than this) + * + */ +static inline bool hlock_equal(struct lock_list *entry, void *data) +{ + struct held_lock *hlock = (struct held_lock *)data; + + return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */ + (hlock->read == 2 || /* A -> B is -(*R)-> */ + !entry->only_xr); /* A -> .. -> B is -(*N)-> */ +} + +/* * Check that the dependency graph starting at <src> can lead to * <target> or not. If it can, <src> -> <target> dependency is already * in the graph. @@ -5091,8 +5096,15 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (unlikely(lock->key == &__lockdep_no_track__)) return 0; - if (!prove_locking || lock->key == &__lockdep_no_validate__) + lockevent_inc(lockdep_acquire); + + if (!prove_locking || lock->key == &__lockdep_no_validate__) { check = 0; + lockevent_inc(lockdep_nocheck); + } + + if (DEBUG_LOCKS_WARN_ON(subclass >= MAX_LOCKDEP_SUBCLASSES)) + return 0; if (subclass < NR_LOCKDEP_CACHING_CLASSES) class = lock->class_cache[subclass]; @@ -5824,6 +5836,14 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!debug_locks) return; + /* + * As KASAN instrumentation is disabled and lock_acquire() is usually + * the first lockdep call when a task tries to acquire a lock, add + * kasan_check_byte() here to check for use-after-free and other + * memory errors. + */ + kasan_check_byte(lock); + if (unlikely(!lockdep_enabled())) { /* XXX allow trylock from NMI ?!? */ if (lockdep_nmi() && !trylock) { @@ -6249,6 +6269,9 @@ static void zap_class(struct pending_free *pf, struct lock_class *class) hlist_del_rcu(&class->hash_entry); WRITE_ONCE(class->key, NULL); WRITE_ONCE(class->name, NULL); + /* Class allocated but not used, -1 in nr_unused_locks */ + if (class->usage_mask == 0) + debug_atomic_dec(nr_unused_locks); nr_lock_classes--; __clear_bit(class - lock_classes, lock_classes_in_use); if (class - lock_classes == max_lock_class_idx) @@ -6588,6 +6611,7 @@ void lockdep_unregister_key(struct lock_class_key *key) pf = get_pending_free(); __lockdep_free_key_range(pf, key, 1); need_callback = prepare_call_rcu_zapped(pf); + nr_dynamic_keys--; } lockdep_unlock(); raw_local_irq_restore(flags); diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 20f9ef58d3d0..82156caf77d1 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -138,6 +138,7 @@ extern unsigned long nr_lock_classes; extern unsigned long nr_zapped_classes; extern unsigned long nr_zapped_lock_chains; extern unsigned long nr_list_entries; +extern unsigned long nr_dynamic_keys; long lockdep_next_lockchain(long i); unsigned long lock_chain_count(void); extern unsigned long nr_stack_trace_entries; diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 6db0f43fc4df..b52c07c4707c 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -286,6 +286,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v) #endif seq_printf(m, " lock-classes: %11lu [max: %lu]\n", nr_lock_classes, MAX_LOCKDEP_KEYS); + seq_printf(m, " dynamic-keys: %11lu\n", + nr_dynamic_keys); seq_printf(m, " direct dependencies: %11lu [max: %lu]\n", nr_list_entries, MAX_LOCKDEP_ENTRIES); seq_printf(m, " indirect dependencies: %11lu\n", diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index cc33470f4de9..ce0362f0a871 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -362,6 +362,60 @@ static struct lock_torture_ops raw_spin_lock_irq_ops = { .name = "raw_spin_lock_irq" }; +#ifdef CONFIG_BPF_SYSCALL + +#include <asm/rqspinlock.h> +static rqspinlock_t rqspinlock; + +static int torture_raw_res_spin_write_lock(int tid __maybe_unused) +{ + raw_res_spin_lock(&rqspinlock); + return 0; +} + +static void torture_raw_res_spin_write_unlock(int tid __maybe_unused) +{ + raw_res_spin_unlock(&rqspinlock); +} + +static struct lock_torture_ops raw_res_spin_lock_ops = { + .writelock = torture_raw_res_spin_write_lock, + .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_rt_boost, + .writeunlock = torture_raw_res_spin_write_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "raw_res_spin_lock" +}; + +static int torture_raw_res_spin_write_lock_irq(int tid __maybe_unused) +{ + unsigned long flags; + + raw_res_spin_lock_irqsave(&rqspinlock, flags); + cxt.cur_ops->flags = flags; + return 0; +} + +static void torture_raw_res_spin_write_unlock_irq(int tid __maybe_unused) +{ + raw_res_spin_unlock_irqrestore(&rqspinlock, cxt.cur_ops->flags); +} + +static struct lock_torture_ops raw_res_spin_lock_irq_ops = { + .writelock = torture_raw_res_spin_write_lock_irq, + .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_rt_boost, + .writeunlock = torture_raw_res_spin_write_unlock_irq, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "raw_res_spin_lock_irq" +}; + +#endif + static DEFINE_RWLOCK(torture_rwlock); static int torture_rwlock_write_lock(int tid __maybe_unused) @@ -1168,6 +1222,9 @@ static int __init lock_torture_init(void) &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, &raw_spin_lock_ops, &raw_spin_lock_irq_ops, +#ifdef CONFIG_BPF_SYSCALL + &raw_res_spin_lock_ops, &raw_res_spin_lock_irq_ops, +#endif &rw_lock_ops, &rw_lock_irq_ops, &mutex_lock_ops, &ww_mutex_lock_ops, diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 85251d8771d9..5c92ba199b90 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -15,12 +15,6 @@ #include <asm/mcs_spinlock.h> -struct mcs_spinlock { - struct mcs_spinlock *next; - int locked; /* 1 if lock acquired */ - int count; /* nesting count, see qspinlock.c */ -}; - #ifndef arch_mcs_spin_lock_contended /* * Using smp_cond_load_acquire() provides the acquire semantics @@ -30,9 +24,7 @@ struct mcs_spinlock { * spinning, and smp_cond_load_acquire() provides that behavior. */ #define arch_mcs_spin_lock_contended(l) \ -do { \ - smp_cond_load_acquire(l, VAL); \ -} while (0) + smp_cond_load_acquire(l, VAL) #endif #ifndef arch_mcs_spin_unlock_contended diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index b36f23de48f1..a39ecccbd106 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -29,6 +29,7 @@ #include <linux/interrupt.h> #include <linux/debug_locks.h> #include <linux/osq_lock.h> +#include <linux/hung_task.h> #define CREATE_TRACE_POINTS #include <trace/events/lock.h> @@ -72,6 +73,14 @@ static inline unsigned long __owner_flags(unsigned long owner) return owner & MUTEX_FLAGS; } +/* Do not use the return value as a pointer directly. */ +unsigned long mutex_get_owner(struct mutex *lock) +{ + unsigned long owner = atomic_long_read(&lock->owner); + + return (unsigned long)__owner_task(owner); +} + /* * Returns: __mutex_owner(lock) on failure or NULL on success. */ @@ -143,6 +152,8 @@ static __always_inline bool __mutex_trylock_fast(struct mutex *lock) unsigned long curr = (unsigned long)current; unsigned long zero = 0UL; + MUTEX_WARN_ON(lock->magic != lock); + if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr)) return true; @@ -180,6 +191,9 @@ static void __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct list_head *list) { +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER + hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX); +#endif debug_mutex_add_waiter(lock, waiter, current); list_add_tail(&waiter->list, list); @@ -195,6 +209,9 @@ __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter) __mutex_clear_flag(lock, MUTEX_FLAGS); debug_mutex_remove_waiter(lock, waiter, current); +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER + hung_task_clear_blocker(); +#endif } /* @@ -792,11 +809,12 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); int __sched -mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) +_mutex_lock_killable(struct mutex *lock, unsigned int subclass, + struct lockdep_map *nest) { - return __mutex_lock(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); + return __mutex_lock(lock, TASK_KILLABLE, subclass, nest, _RET_IP_); } -EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); +EXPORT_SYMBOL_GPL(_mutex_lock_killable); int __sched mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) @@ -1046,6 +1064,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, #endif +#ifndef CONFIG_DEBUG_LOCK_ALLOC /** * mutex_trylock - try to acquire the mutex, without waiting * @lock: the mutex to be acquired @@ -1062,17 +1081,24 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, */ int __sched mutex_trylock(struct mutex *lock) { + MUTEX_WARN_ON(lock->magic != lock); + return __mutex_trylock(lock); +} +EXPORT_SYMBOL(mutex_trylock); +#else +int __sched _mutex_trylock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock) +{ bool locked; MUTEX_WARN_ON(lock->magic != lock); - locked = __mutex_trylock(lock); if (locked) - mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + mutex_acquire_nest(&lock->dep_map, 0, 1, nest_lock, _RET_IP_); return locked; } -EXPORT_SYMBOL(mutex_trylock); +EXPORT_SYMBOL(_mutex_trylock_nest_lock); +#endif #ifndef CONFIG_DEBUG_LOCK_ALLOC int __sched diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 6083883c4fe0..ef234469baac 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -138,7 +138,8 @@ static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry, return !reader; /* wake (readers until) 1 writer */ } -static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) +static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader, + bool freeze) { DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function); bool wait; @@ -156,7 +157,8 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) spin_unlock_irq(&sem->waiters.lock); while (wait) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE | + (freeze ? TASK_FREEZABLE : 0)); if (!smp_load_acquire(&wq_entry.private)) break; schedule(); @@ -164,7 +166,8 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) __set_current_state(TASK_RUNNING); } -bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) +bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try, + bool freeze) { if (__percpu_down_read_trylock(sem)) return true; @@ -174,7 +177,7 @@ bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ); preempt_enable(); - percpu_rwsem_wait(sem, /* .reader = */ true); + percpu_rwsem_wait(sem, /* .reader = */ true, freeze); preempt_disable(); trace_contention_end(sem, 0); @@ -184,7 +187,7 @@ EXPORT_SYMBOL_GPL(__percpu_down_read); #define per_cpu_sum(var) \ ({ \ - typeof(var) __sum = 0; \ + TYPEOF_UNQUAL(var) __sum = 0; \ int cpu; \ compiletime_assert_atomic_type(__sum); \ for_each_possible_cpu(cpu) \ @@ -237,7 +240,7 @@ void __sched percpu_down_write(struct percpu_rw_semaphore *sem) */ if (!__percpu_down_write_trylock(sem)) { trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE); - percpu_rwsem_wait(sem, /* .reader = */ false); + percpu_rwsem_wait(sem, /* .reader = */ false, false); contended = true; } diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 7d96bed718e4..af8d122bb649 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -25,8 +25,9 @@ #include <trace/events/lock.h> /* - * Include queued spinlock statistics code + * Include queued spinlock definitions and statistics code */ +#include "qspinlock.h" #include "qspinlock_stat.h" /* @@ -67,36 +68,6 @@ */ #include "mcs_spinlock.h" -#define MAX_NODES 4 - -/* - * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in - * size and four of them will fit nicely in one 64-byte cacheline. For - * pvqspinlock, however, we need more space for extra data. To accommodate - * that, we insert two more long words to pad it up to 32 bytes. IOW, only - * two of them can fit in a cacheline in this case. That is OK as it is rare - * to have more than 2 levels of slowpath nesting in actual use. We don't - * want to penalize pvqspinlocks to optimize for a rare case in native - * qspinlocks. - */ -struct qnode { - struct mcs_spinlock mcs; -#ifdef CONFIG_PARAVIRT_SPINLOCKS - long reserved[2]; -#endif -}; - -/* - * The pending bit spinning loop count. - * This heuristic is used to limit the number of lockword accesses - * made by atomic_cond_read_relaxed when waiting for the lock to - * transition out of the "== _Q_PENDING_VAL" state. We don't spin - * indefinitely because there's no guarantee that we'll make forward - * progress. - */ -#ifndef _Q_PENDING_LOOPS -#define _Q_PENDING_LOOPS 1 -#endif /* * Per-CPU queue node structures; we can never have more than 4 nested @@ -106,161 +77,7 @@ struct qnode { * * PV doubles the storage and uses the second cacheline for PV state. */ -static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]); - -/* - * We must be able to distinguish between no-tail and the tail at 0:0, - * therefore increment the cpu number by one. - */ - -static inline __pure u32 encode_tail(int cpu, int idx) -{ - u32 tail; - - tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; - tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ - - return tail; -} - -static inline __pure struct mcs_spinlock *decode_tail(u32 tail) -{ - int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; - int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; - - return per_cpu_ptr(&qnodes[idx].mcs, cpu); -} - -static inline __pure -struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx) -{ - return &((struct qnode *)base + idx)->mcs; -} - -#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) - -#if _Q_PENDING_BITS == 8 -/** - * clear_pending - clear the pending bit. - * @lock: Pointer to queued spinlock structure - * - * *,1,* -> *,0,* - */ -static __always_inline void clear_pending(struct qspinlock *lock) -{ - WRITE_ONCE(lock->pending, 0); -} - -/** - * clear_pending_set_locked - take ownership and clear the pending bit. - * @lock: Pointer to queued spinlock structure - * - * *,1,0 -> *,0,1 - * - * Lock stealing is not allowed if this function is used. - */ -static __always_inline void clear_pending_set_locked(struct qspinlock *lock) -{ - WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); -} - -/* - * xchg_tail - Put in the new queue tail code word & retrieve previous one - * @lock : Pointer to queued spinlock structure - * @tail : The new queue tail code word - * Return: The previous queue tail code word - * - * xchg(lock, tail), which heads an address dependency - * - * p,*,* -> n,*,* ; prev = xchg(lock, node) - */ -static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) -{ - /* - * We can use relaxed semantics since the caller ensures that the - * MCS node is properly initialized before updating the tail. - */ - return (u32)xchg_relaxed(&lock->tail, - tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; -} - -#else /* _Q_PENDING_BITS == 8 */ - -/** - * clear_pending - clear the pending bit. - * @lock: Pointer to queued spinlock structure - * - * *,1,* -> *,0,* - */ -static __always_inline void clear_pending(struct qspinlock *lock) -{ - atomic_andnot(_Q_PENDING_VAL, &lock->val); -} - -/** - * clear_pending_set_locked - take ownership and clear the pending bit. - * @lock: Pointer to queued spinlock structure - * - * *,1,0 -> *,0,1 - */ -static __always_inline void clear_pending_set_locked(struct qspinlock *lock) -{ - atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); -} - -/** - * xchg_tail - Put in the new queue tail code word & retrieve previous one - * @lock : Pointer to queued spinlock structure - * @tail : The new queue tail code word - * Return: The previous queue tail code word - * - * xchg(lock, tail) - * - * p,*,* -> n,*,* ; prev = xchg(lock, node) - */ -static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) -{ - u32 old, new; - - old = atomic_read(&lock->val); - do { - new = (old & _Q_LOCKED_PENDING_MASK) | tail; - /* - * We can use relaxed semantics since the caller ensures that - * the MCS node is properly initialized before updating the - * tail. - */ - } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); - - return old; -} -#endif /* _Q_PENDING_BITS == 8 */ - -/** - * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending - * @lock : Pointer to queued spinlock structure - * Return: The previous lock value - * - * *,*,* -> *,1,* - */ -#ifndef queued_fetch_set_pending_acquire -static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) -{ - return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val); -} -#endif - -/** - * set_locked - Set the lock bit and own the lock - * @lock: Pointer to queued spinlock structure - * - * *,*,0 -> *,0,1 - */ -static __always_inline void set_locked(struct qspinlock *lock) -{ - WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); -} - +static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[_Q_MAX_NODES]); /* * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for @@ -410,7 +227,7 @@ pv_queue: * any MCS node. This is not the most elegant solution, but is * simple enough. */ - if (unlikely(idx >= MAX_NODES)) { + if (unlikely(idx >= _Q_MAX_NODES)) { lockevent_inc(lock_no_node); while (!queued_spin_trylock(lock)) cpu_relax(); @@ -465,7 +282,7 @@ pv_queue: * head of the waitqueue. */ if (old & _Q_TAIL_MASK) { - prev = decode_tail(old); + prev = decode_tail(old, qnodes); /* Link @node into the waitqueue. */ WRITE_ONCE(prev->next, node); diff --git a/kernel/locking/qspinlock.h b/kernel/locking/qspinlock.h new file mode 100644 index 000000000000..d69958a844f7 --- /dev/null +++ b/kernel/locking/qspinlock.h @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Queued spinlock defines + * + * This file contains macro definitions and functions shared between different + * qspinlock slow path implementations. + */ +#ifndef __LINUX_QSPINLOCK_H +#define __LINUX_QSPINLOCK_H + +#include <asm-generic/percpu.h> +#include <linux/percpu-defs.h> +#include <asm-generic/qspinlock.h> +#include <asm-generic/mcs_spinlock.h> + +#define _Q_MAX_NODES 4 + +/* + * The pending bit spinning loop count. + * This heuristic is used to limit the number of lockword accesses + * made by atomic_cond_read_relaxed when waiting for the lock to + * transition out of the "== _Q_PENDING_VAL" state. We don't spin + * indefinitely because there's no guarantee that we'll make forward + * progress. + */ +#ifndef _Q_PENDING_LOOPS +#define _Q_PENDING_LOOPS 1 +#endif + +/* + * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in + * size and four of them will fit nicely in one 64-byte cacheline. For + * pvqspinlock, however, we need more space for extra data. To accommodate + * that, we insert two more long words to pad it up to 32 bytes. IOW, only + * two of them can fit in a cacheline in this case. That is OK as it is rare + * to have more than 2 levels of slowpath nesting in actual use. We don't + * want to penalize pvqspinlocks to optimize for a rare case in native + * qspinlocks. + */ +struct qnode { + struct mcs_spinlock mcs; +#ifdef CONFIG_PARAVIRT_SPINLOCKS + long reserved[2]; +#endif +}; + +/* + * We must be able to distinguish between no-tail and the tail at 0:0, + * therefore increment the cpu number by one. + */ + +static inline __pure u32 encode_tail(int cpu, int idx) +{ + u32 tail; + + tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; + tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ + + return tail; +} + +static inline __pure struct mcs_spinlock *decode_tail(u32 tail, + struct qnode __percpu *qnodes) +{ + int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; + int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; + + return per_cpu_ptr(&qnodes[idx].mcs, cpu); +} + +static inline __pure +struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx) +{ + return &((struct qnode *)base + idx)->mcs; +} + +#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) + +#if _Q_PENDING_BITS == 8 +/** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,* -> *,0,* + */ +static __always_inline void clear_pending(struct qspinlock *lock) +{ + WRITE_ONCE(lock->pending, 0); +} + +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + * + * Lock stealing is not allowed if this function is used. + */ +static __always_inline void clear_pending_set_locked(struct qspinlock *lock) +{ + WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); +} + +/* + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail), which heads an address dependency + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) +{ + /* + * We can use relaxed semantics since the caller ensures that the + * MCS node is properly initialized before updating the tail. + */ + return (u32)xchg_relaxed(&lock->tail, + tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; +} + +#else /* _Q_PENDING_BITS == 8 */ + +/** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,* -> *,0,* + */ +static __always_inline void clear_pending(struct qspinlock *lock) +{ + atomic_andnot(_Q_PENDING_VAL, &lock->val); +} + +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + */ +static __always_inline void clear_pending_set_locked(struct qspinlock *lock) +{ + atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); +} + +/** + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail) + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) +{ + u32 old, new; + + old = atomic_read(&lock->val); + do { + new = (old & _Q_LOCKED_PENDING_MASK) | tail; + /* + * We can use relaxed semantics since the caller ensures that + * the MCS node is properly initialized before updating the + * tail. + */ + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); + + return old; +} +#endif /* _Q_PENDING_BITS == 8 */ + +/** + * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending + * @lock : Pointer to queued spinlock structure + * Return: The previous lock value + * + * *,*,* -> *,1,* + */ +#ifndef queued_fetch_set_pending_acquire +static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) +{ + return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val); +} +#endif + +/** + * set_locked - Set the lock bit and own the lock + * @lock: Pointer to queued spinlock structure + * + * *,*,0 -> *,0,1 + */ +static __always_inline void set_locked(struct qspinlock *lock) +{ + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); +} + +#endif /* __LINUX_QSPINLOCK_H */ diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 4a8df1800cbb..c80902eacd79 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -27,6 +27,7 @@ #include <trace/events/lock.h> #include "rtmutex_common.h" +#include "lock_events.h" #ifndef WW_RT # define build_ww_mutex() (false) @@ -1612,10 +1613,13 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, struct task_struct *owner; int ret = 0; + lockevent_inc(rtmutex_slow_block); for (;;) { /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock, current, waiter)) + if (try_to_take_rt_mutex(lock, current, waiter)) { + lockevent_inc(rtmutex_slow_acq3); break; + } if (timeout && !timeout->task) { ret = -ETIMEDOUT; @@ -1638,8 +1642,10 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, owner = NULL; raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); - if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) + if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) { + lockevent_inc(rtmutex_slow_sleep); rt_mutex_schedule(); + } raw_spin_lock_irq(&lock->wait_lock); set_current_state(state); @@ -1694,6 +1700,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, int ret; lockdep_assert_held(&lock->wait_lock); + lockevent_inc(rtmutex_slowlock); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock, current, NULL)) { @@ -1701,6 +1708,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); ww_mutex_lock_acquired(ww, ww_ctx); } + lockevent_inc(rtmutex_slow_acq1); return 0; } @@ -1719,10 +1727,12 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); ww_mutex_lock_acquired(ww, ww_ctx); } + lockevent_inc(rtmutex_slow_acq2); } else { __set_current_state(TASK_RUNNING); remove_waiter(lock, waiter); rt_mutex_handle_deadlock(ret, chwalk, lock, waiter); + lockevent_inc(rtmutex_deadlock); } /* @@ -1751,6 +1761,7 @@ static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock, &waiter, wake_q); debug_rt_mutex_free_waiter(&waiter); + lockevent_cond_inc(rtmutex_slow_wake, !wake_q_empty(wake_q)); return ret; } @@ -1823,9 +1834,12 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, struct task_struct *owner; lockdep_assert_held(&lock->wait_lock); + lockevent_inc(rtlock_slowlock); - if (try_to_take_rt_mutex(lock, current, NULL)) + if (try_to_take_rt_mutex(lock, current, NULL)) { + lockevent_inc(rtlock_slow_acq1); return; + } rt_mutex_init_rtlock_waiter(&waiter); @@ -1838,8 +1852,10 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, for (;;) { /* Try to acquire the lock again */ - if (try_to_take_rt_mutex(lock, current, &waiter)) + if (try_to_take_rt_mutex(lock, current, &waiter)) { + lockevent_inc(rtlock_slow_acq2); break; + } if (&waiter == rt_mutex_top_waiter(lock)) owner = rt_mutex_owner(lock); @@ -1847,8 +1863,10 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, owner = NULL; raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); - if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) + if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) { + lockevent_inc(rtlock_slow_sleep); schedule_rtlock(); + } raw_spin_lock_irq(&lock->wait_lock); set_current_state(TASK_RTLOCK_WAIT); @@ -1865,6 +1883,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, debug_rt_mutex_free_waiter(&waiter); trace_contention_end(lock, 0); + lockevent_cond_inc(rtlock_slow_wake, !wake_q_empty(wake_q)); } static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock) diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 191e4720e546..2d933528a0fa 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -544,12 +544,12 @@ int __sched mutex_lock_interruptible_nested(struct mutex *lock, } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); -int __sched mutex_lock_killable_nested(struct mutex *lock, - unsigned int subclass) +int __sched _mutex_lock_killable(struct mutex *lock, unsigned int subclass, + struct lockdep_map *nest_lock) { - return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); + return __mutex_lock_common(lock, TASK_KILLABLE, subclass, nest_lock, _RET_IP_); } -EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); +EXPORT_SYMBOL_GPL(_mutex_lock_killable); void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) { @@ -563,6 +563,21 @@ void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) } EXPORT_SYMBOL_GPL(mutex_lock_io_nested); +int __sched _mutex_trylock_nest_lock(struct mutex *lock, + struct lockdep_map *nest_lock) +{ + int ret; + + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) + return 0; + + ret = __rt_mutex_trylock(&lock->rtmutex); + if (ret) + mutex_acquire_nest(&lock->dep_map, 0, 1, nest_lock, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL_GPL(_mutex_trylock_nest_lock); #else /* CONFIG_DEBUG_LOCK_ALLOC */ void __sched mutex_lock(struct mutex *lock) @@ -591,22 +606,16 @@ void __sched mutex_lock_io(struct mutex *lock) io_schedule_finish(token); } EXPORT_SYMBOL(mutex_lock_io); -#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ int __sched mutex_trylock(struct mutex *lock) { - int ret; - if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task())) return 0; - ret = __rt_mutex_trylock(&lock->rtmutex); - if (ret) - mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); - - return ret; + return __rt_mutex_trylock(&lock->rtmutex); } EXPORT_SYMBOL(mutex_trylock); +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ void __sched mutex_unlock(struct mutex *lock) { diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index c38a2d2d4a7e..78dd3d8c6554 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -59,8 +59,8 @@ struct rt_mutex_waiter { }; /** - * rt_wake_q_head - Wrapper around regular wake_q_head to support - * "sleeping" spinlocks on RT + * struct rt_wake_q_head - Wrapper around regular wake_q_head to support + * "sleeping" spinlocks on RT * @head: The regular wake_q_head for sleeping lock variants * @rtlock_task: Task pointer for RT lock (spin/rwlock) wakeups */ diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 34bfae72f295..3ef032e22f7e 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -29,16 +29,53 @@ #include <linux/export.h> #include <linux/sched.h> #include <linux/sched/debug.h> +#include <linux/sched/wake_q.h> #include <linux/semaphore.h> #include <linux/spinlock.h> #include <linux/ftrace.h> #include <trace/events/lock.h> +#include <linux/hung_task.h> static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); static noinline int __down_killable(struct semaphore *sem); static noinline int __down_timeout(struct semaphore *sem, long timeout); -static noinline void __up(struct semaphore *sem); +static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q); + +#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER +static inline void hung_task_sem_set_holder(struct semaphore *sem) +{ + WRITE_ONCE((sem)->last_holder, (unsigned long)current); +} + +static inline void hung_task_sem_clear_if_holder(struct semaphore *sem) +{ + if (READ_ONCE((sem)->last_holder) == (unsigned long)current) + WRITE_ONCE((sem)->last_holder, 0UL); +} + +unsigned long sem_last_holder(struct semaphore *sem) +{ + return READ_ONCE(sem->last_holder); +} +#else +static inline void hung_task_sem_set_holder(struct semaphore *sem) +{ +} +static inline void hung_task_sem_clear_if_holder(struct semaphore *sem) +{ +} +unsigned long sem_last_holder(struct semaphore *sem) +{ + return 0UL; +} +#endif + +static inline void __sem_acquire(struct semaphore *sem) +{ + sem->count--; + hung_task_sem_set_holder(sem); +} /** * down - acquire the semaphore @@ -58,7 +95,7 @@ void __sched down(struct semaphore *sem) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else __down(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -82,7 +119,7 @@ int __sched down_interruptible(struct semaphore *sem) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else result = __down_interruptible(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -109,7 +146,7 @@ int __sched down_killable(struct semaphore *sem) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else result = __down_killable(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -139,7 +176,7 @@ int __sched down_trylock(struct semaphore *sem) raw_spin_lock_irqsave(&sem->lock, flags); count = sem->count - 1; if (likely(count >= 0)) - sem->count = count; + __sem_acquire(sem); raw_spin_unlock_irqrestore(&sem->lock, flags); return (count < 0); @@ -164,7 +201,7 @@ int __sched down_timeout(struct semaphore *sem, long timeout) might_sleep(); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) - sem->count--; + __sem_acquire(sem); else result = __down_timeout(sem, timeout); raw_spin_unlock_irqrestore(&sem->lock, flags); @@ -183,13 +220,19 @@ EXPORT_SYMBOL(down_timeout); void __sched up(struct semaphore *sem) { unsigned long flags; + DEFINE_WAKE_Q(wake_q); raw_spin_lock_irqsave(&sem->lock, flags); + + hung_task_sem_clear_if_holder(sem); + if (likely(list_empty(&sem->wait_list))) sem->count++; else - __up(sem); + __up(sem, &wake_q); raw_spin_unlock_irqrestore(&sem->lock, flags); + if (!wake_q_empty(&wake_q)) + wake_up_q(&wake_q); } EXPORT_SYMBOL(up); @@ -224,8 +267,10 @@ static inline int __sched ___down_common(struct semaphore *sem, long state, raw_spin_unlock_irq(&sem->lock); timeout = schedule_timeout(timeout); raw_spin_lock_irq(&sem->lock); - if (waiter.up) + if (waiter.up) { + hung_task_sem_set_holder(sem); return 0; + } } timed_out: @@ -242,10 +287,14 @@ static inline int __sched __down_common(struct semaphore *sem, long state, { int ret; + hung_task_set_blocker(sem, BLOCKER_TYPE_SEM); + trace_contention_begin(sem, 0); ret = ___down_common(sem, state, timeout); trace_contention_end(sem, ret); + hung_task_clear_blocker(); + return ret; } @@ -269,11 +318,12 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout) return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout); } -static noinline void __sched __up(struct semaphore *sem) +static noinline void __sched __up(struct semaphore *sem, + struct wake_q_head *wake_q) { struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, struct semaphore_waiter, list); list_del(&waiter->list); waiter->up = true; - wake_up_process(waiter->task); + wake_q_add(wake_q, waiter->task); } diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index d7762ef5949a..39278737bb68 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -192,6 +192,11 @@ config GENDWARFKSYMS depends on !DEBUG_INFO_REDUCED && !DEBUG_INFO_SPLIT # Requires ELF object files. depends on !LTO + # To avoid conflicts with the discarded __gendwarfksyms_ptr symbols on + # X86, requires pahole before commit 47dcb534e253 ("btf_encoder: Stop + # indexing symbols for VARs") or after commit 9810758003ce ("btf_encoder: + # Verify 0 address DWARF variables are in ELF section"). + depends on !X86 || !DEBUG_INFO_BTF || PAHOLE_VERSION < 128 || PAHOLE_VERSION > 129 help Calculate symbol versions from DWARF debugging information using gendwarfksyms. Requires DEBUG_INFO to be enabled. diff --git a/kernel/module/internal.h b/kernel/module/internal.h index d09b46ef032f..8d74b0a21c82 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -124,17 +124,6 @@ char *module_next_tag_pair(char *string, unsigned long *secsize); #define for_each_modinfo_entry(entry, info, name) \ for (entry = get_modinfo(info, name); entry; entry = get_next_modinfo(info, name, entry)) -static inline void module_assert_mutex_or_preempt(void) -{ -#ifdef CONFIG_LOCKDEP - if (unlikely(!debug_locks)) - return; - - WARN_ON_ONCE(!rcu_read_lock_sched_held() && - !lockdep_is_held(&module_mutex)); -#endif -} - static inline unsigned long kernel_symbol_value(const struct kernel_symbol *sym) { #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS @@ -333,8 +322,11 @@ int module_enable_rodata_ro(const struct module *mod); int module_enable_rodata_ro_after_init(const struct module *mod); int module_enable_data_nx(const struct module *mod); int module_enable_text_rox(const struct module *mod); -int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, - char *secstrings, struct module *mod); +int module_enforce_rwx_sections(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, + const char *secstrings, + const struct module *mod); +void module_mark_ro_after_init(const Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + const char *secstrings); #ifdef CONFIG_MODULE_SIG int module_sig_check(struct load_info *info, int flags); diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index bf65e0c3c86f..00a60796327c 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -177,19 +177,15 @@ void add_kallsyms(struct module *mod, const struct load_info *info) unsigned long strtab_size; void *data_base = mod->mem[MOD_DATA].base; void *init_data_base = mod->mem[MOD_INIT_DATA].base; + struct mod_kallsyms *kallsyms; - /* Set up to point into init section. */ - mod->kallsyms = (void __rcu *)init_data_base + - info->mod_kallsyms_init_off; + kallsyms = init_data_base + info->mod_kallsyms_init_off; - rcu_read_lock(); - /* The following is safe since this pointer cannot change */ - rcu_dereference(mod->kallsyms)->symtab = (void *)symsec->sh_addr; - rcu_dereference(mod->kallsyms)->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + kallsyms->symtab = (void *)symsec->sh_addr; + kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); /* Make sure we get permanent strtab: don't use info->strtab. */ - rcu_dereference(mod->kallsyms)->strtab = - (void *)info->sechdrs[info->index.str].sh_addr; - rcu_dereference(mod->kallsyms)->typetab = init_data_base + info->init_typeoffs; + kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; + kallsyms->typetab = init_data_base + info->init_typeoffs; /* * Now populate the cut down core kallsyms for after init @@ -199,20 +195,19 @@ void add_kallsyms(struct module *mod, const struct load_info *info) mod->core_kallsyms.strtab = s = data_base + info->stroffs; mod->core_kallsyms.typetab = data_base + info->core_typeoffs; strtab_size = info->core_typeoffs - info->stroffs; - src = rcu_dereference(mod->kallsyms)->symtab; - for (ndst = i = 0; i < rcu_dereference(mod->kallsyms)->num_symtab; i++) { - rcu_dereference(mod->kallsyms)->typetab[i] = elf_type(src + i, info); + src = kallsyms->symtab; + for (ndst = i = 0; i < kallsyms->num_symtab; i++) { + kallsyms->typetab[i] = elf_type(src + i, info); if (i == 0 || is_livepatch_module(mod) || is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum, info->index.pcpu)) { ssize_t ret; mod->core_kallsyms.typetab[ndst] = - rcu_dereference(mod->kallsyms)->typetab[i]; + kallsyms->typetab[i]; dst[ndst] = src[i]; dst[ndst++].st_name = s - mod->core_kallsyms.strtab; - ret = strscpy(s, - &rcu_dereference(mod->kallsyms)->strtab[src[i].st_name], + ret = strscpy(s, &kallsyms->strtab[src[i].st_name], strtab_size); if (ret < 0) break; @@ -220,7 +215,9 @@ void add_kallsyms(struct module *mod, const struct load_info *info) strtab_size -= ret + 1; } } - rcu_read_unlock(); + + /* Set up to point into init section. */ + rcu_assign_pointer(mod->kallsyms, kallsyms); mod->core_kallsyms.num_symtab = ndst; } @@ -260,7 +257,7 @@ static const char *find_kallsyms_symbol(struct module *mod, { unsigned int i, best = 0; unsigned long nextval, bestval; - struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); + struct mod_kallsyms *kallsyms = rcu_dereference(mod->kallsyms); struct module_memory *mod_mem; /* At worse, next value is at end of module */ @@ -319,7 +316,7 @@ void * __weak dereference_module_function_descriptor(struct module *mod, /* * For kallsyms to ask for address resolution. NULL means not found. Careful - * not to lock to avoid deadlock on oopses, simply disable preemption. + * not to lock to avoid deadlock on oopses, RCU is enough. */ int module_address_lookup(unsigned long addr, unsigned long *size, @@ -332,7 +329,7 @@ int module_address_lookup(unsigned long addr, int ret = 0; struct module *mod; - preempt_disable(); + guard(rcu)(); mod = __module_address(addr); if (mod) { if (modname) @@ -350,8 +347,6 @@ int module_address_lookup(unsigned long addr, if (sym) ret = strscpy(namebuf, sym, KSYM_NAME_LEN); } - preempt_enable(); - return ret; } @@ -359,7 +354,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) { struct module *mod; - preempt_disable(); + guard(rcu)(); list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; @@ -371,12 +366,10 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) goto out; strscpy(symname, sym, KSYM_NAME_LEN); - preempt_enable(); return 0; } } out: - preempt_enable(); return -ERANGE; } @@ -385,13 +378,13 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, { struct module *mod; - preempt_disable(); + guard(rcu)(); list_for_each_entry_rcu(mod, &modules, list) { struct mod_kallsyms *kallsyms; if (mod->state == MODULE_STATE_UNFORMED) continue; - kallsyms = rcu_dereference_sched(mod->kallsyms); + kallsyms = rcu_dereference(mod->kallsyms); if (symnum < kallsyms->num_symtab) { const Elf_Sym *sym = &kallsyms->symtab[symnum]; @@ -400,12 +393,10 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, strscpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); strscpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, *value, mod); - preempt_enable(); return 0; } symnum -= kallsyms->num_symtab; } - preempt_enable(); return -ERANGE; } @@ -413,7 +404,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, static unsigned long __find_kallsyms_symbol_value(struct module *mod, const char *name) { unsigned int i; - struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); + struct mod_kallsyms *kallsyms = rcu_dereference(mod->kallsyms); for (i = 0; i < kallsyms->num_symtab; i++) { const Elf_Sym *sym = &kallsyms->symtab[i]; @@ -453,23 +444,15 @@ static unsigned long __module_kallsyms_lookup_name(const char *name) /* Look for this name: can be of form module:name. */ unsigned long module_kallsyms_lookup_name(const char *name) { - unsigned long ret; - /* Don't lock: we're in enough trouble already. */ - preempt_disable(); - ret = __module_kallsyms_lookup_name(name); - preempt_enable(); - return ret; + guard(rcu)(); + return __module_kallsyms_lookup_name(name); } unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) { - unsigned long ret; - - preempt_disable(); - ret = __find_kallsyms_symbol_value(mod, name); - preempt_enable(); - return ret; + guard(rcu)(); + return __find_kallsyms_symbol_value(mod, name); } int module_kallsyms_on_each_symbol(const char *modname, @@ -490,10 +473,8 @@ int module_kallsyms_on_each_symbol(const char *modname, if (modname && strcmp(modname, mod->name)) continue; - /* Use rcu_dereference_sched() to remain compliant with the sparse tool */ - preempt_disable(); - kallsyms = rcu_dereference_sched(mod->kallsyms); - preempt_enable(); + kallsyms = rcu_dereference_check(mod->kallsyms, + lockdep_is_held(&module_mutex)); for (i = 0; i < kallsyms->num_symtab; i++) { const Elf_Sym *sym = &kallsyms->symtab[i]; diff --git a/kernel/module/main.c b/kernel/module/main.c index 1fb9ad289a6f..413ac6ea3702 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -67,7 +67,7 @@ /* * Mutex protects: - * 1) List of modules (also safely readable with preempt_disable), + * 1) List of modules (also safely readable within RCU read section), * 2) module_use links, * 3) mod_tree.addr_min/mod_tree.addr_max. * (delete and add uses RCU list operations). @@ -170,6 +170,30 @@ static inline void add_taint_module(struct module *mod, unsigned flag, } /* + * Like strncmp(), except s/-/_/g as per scripts/Makefile.lib:name-fix-token rule. + */ +static int mod_strncmp(const char *str_a, const char *str_b, size_t n) +{ + for (int i = 0; i < n; i++) { + char a = str_a[i]; + char b = str_b[i]; + int d; + + if (a == '-') a = '_'; + if (b == '-') b = '_'; + + d = a - b; + if (d) + return d; + + if (!a) + break; + } + + return 0; +} + +/* * A thread that wants to hold a reference to a module only while it * is running can call this to safely exit. */ @@ -331,7 +355,7 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms, /* * Find an exported symbol and return it, along with, (optional) crc and - * (optional) module which owns it. Needs preempt disabled or module_mutex. + * (optional) module which owns it. Needs RCU or module_mutex. */ bool find_symbol(struct find_symbol_arg *fsa) { @@ -345,8 +369,6 @@ bool find_symbol(struct find_symbol_arg *fsa) struct module *mod; unsigned int i; - module_assert_mutex_or_preempt(); - for (i = 0; i < ARRAY_SIZE(arr); i++) if (find_exported_symbol_in_section(&arr[i], NULL, fsa)) return true; @@ -374,16 +396,14 @@ bool find_symbol(struct find_symbol_arg *fsa) } /* - * Search for module by name: must hold module_mutex (or preempt disabled - * for read-only access). + * Search for module by name: must hold module_mutex (or RCU for read-only + * access). */ struct module *find_module_all(const char *name, size_t len, bool even_unformed) { struct module *mod; - module_assert_mutex_or_preempt(); - list_for_each_entry_rcu(mod, &modules, list, lockdep_is_held(&module_mutex)) { if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) @@ -454,8 +474,7 @@ bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) struct module *mod; unsigned int cpu; - preempt_disable(); - + guard(rcu)(); list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; @@ -472,13 +491,10 @@ bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) per_cpu_ptr(mod->percpu, get_boot_cpu_id()); } - preempt_enable(); return true; } } } - - preempt_enable(); return false; } @@ -795,8 +811,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, async_synchronize_full(); /* Store the name and taints of the last unloaded module for diagnostic purposes */ - strscpy(last_unloaded_module.name, mod->name, sizeof(last_unloaded_module.name)); - strscpy(last_unloaded_module.taints, module_flags(mod, buf, false), sizeof(last_unloaded_module.taints)); + strscpy(last_unloaded_module.name, mod->name); + strscpy(last_unloaded_module.taints, module_flags(mod, buf, false)); free_module(mod); /* someone could wait for the module in add_unformed_module() */ @@ -814,10 +830,9 @@ void __symbol_put(const char *symbol) .gplok = true, }; - preempt_disable(); + guard(rcu)(); BUG_ON(!find_symbol(&fsa)); module_put(fsa.owner); - preempt_enable(); } EXPORT_SYMBOL(__symbol_put); @@ -832,13 +847,12 @@ void symbol_put_addr(void *addr) /* * Even though we hold a reference on the module; we still need to - * disable preemption in order to safely traverse the data structure. + * RCU read section in order to safely traverse the data structure. */ - preempt_disable(); + guard(rcu)(); modaddr = __module_text_address(a); BUG_ON(!modaddr); module_put(modaddr); - preempt_enable(); } EXPORT_SYMBOL_GPL(symbol_put_addr); @@ -1093,6 +1107,46 @@ static char *get_modinfo(const struct load_info *info, const char *tag) return get_next_modinfo(info, tag, NULL); } +/** + * verify_module_namespace() - does @modname have access to this symbol's @namespace + * @namespace: export symbol namespace + * @modname: module name + * + * If @namespace is prefixed with "module:" to indicate it is a module namespace + * then test if @modname matches any of the comma separated patterns. + * + * The patterns only support tail-glob. + */ +static bool verify_module_namespace(const char *namespace, const char *modname) +{ + size_t len, modlen = strlen(modname); + const char *prefix = "module:"; + const char *sep; + bool glob; + + if (!strstarts(namespace, prefix)) + return false; + + for (namespace += strlen(prefix); *namespace; namespace = sep) { + sep = strchrnul(namespace, ','); + len = sep - namespace; + + glob = false; + if (sep[-1] == '*') { + len--; + glob = true; + } + + if (*sep) + sep++; + + if (mod_strncmp(namespace, modname, len) == 0 && (glob || len == modlen)) + return true; + } + + return false; +} + static int verify_namespace_is_imported(const struct load_info *info, const struct kernel_symbol *sym, struct module *mod) @@ -1102,6 +1156,10 @@ static int verify_namespace_is_imported(const struct load_info *info, namespace = kernel_symbol_namespace(sym); if (namespace && namespace[0]) { + + if (verify_module_namespace(namespace, mod->name)) + return 0; + for_each_modinfo_entry(imported_namespace, info, "import_ns") { if (strcmp(namespace, imported_namespace) == 0) return 0; @@ -1189,7 +1247,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, getname: /* We must make copy under the lock if we failed to get ref. */ - strncpy(ownername, module_name(fsa.owner), MODULE_NAME_LEN); + strscpy(ownername, module_name(fsa.owner), MODULE_NAME_LEN); unlock: mutex_unlock(&module_mutex); return fsa.sym; @@ -1221,18 +1279,6 @@ void __weak module_arch_freeing_init(struct module *mod) { } -void *__module_writable_address(struct module *mod, void *loc) -{ - for_class_mod_mem_type(type, text) { - struct module_memory *mem = &mod->mem[type]; - - if (loc >= mem->base && loc < mem->base + mem->size) - return loc + (mem->rw_copy - mem->base); - } - - return loc; -} - static int module_memory_alloc(struct module *mod, enum mod_mem_type type) { unsigned int size = PAGE_ALIGN(mod->mem[type].size); @@ -1250,21 +1296,15 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type) if (!ptr) return -ENOMEM; - mod->mem[type].base = ptr; - if (execmem_is_rox(execmem_type)) { - ptr = vzalloc(size); + int err = execmem_make_temp_rw(ptr, size); - if (!ptr) { - execmem_free(mod->mem[type].base); + if (err) { + execmem_free(ptr); return -ENOMEM; } - mod->mem[type].rw_copy = ptr; mod->mem[type].is_rox = true; - } else { - mod->mem[type].rw_copy = mod->mem[type].base; - memset(mod->mem[type].base, 0, size); } /* @@ -1278,18 +1318,29 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type) * *do* eventually get freed, but let's just keep things simple * and avoid *any* false positives. */ - kmemleak_not_leak(ptr); + if (!mod->mem[type].is_rox) + kmemleak_not_leak(ptr); + + memset(ptr, 0, size); + mod->mem[type].base = ptr; return 0; } +static void module_memory_restore_rox(struct module *mod) +{ + for_class_mod_mem_type(type, text) { + struct module_memory *mem = &mod->mem[type]; + + if (mem->is_rox) + execmem_restore_rox(mem->base, mem->size); + } +} + static void module_memory_free(struct module *mod, enum mod_mem_type type) { struct module_memory *mem = &mod->mem[type]; - if (mem->is_rox) - vfree(mem->rw_copy); - execmem_free(mem->base); } @@ -1348,7 +1399,7 @@ static void free_module(struct module *mod) mod_tree_remove(mod); /* Remove this module from bug list, this uses list_del_rcu */ module_bug_cleanup(mod); - /* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */ + /* Wait for RCU synchronizing before releasing mod->list and buglist. */ synchronize_rcu(); if (try_add_tainted_module(mod)) pr_err("%s: adding tainted module to the unloaded tainted modules list failed.\n", @@ -1371,21 +1422,18 @@ void *__symbol_get(const char *symbol) .warn = true, }; - preempt_disable(); - if (!find_symbol(&fsa)) - goto fail; - if (fsa.license != GPL_ONLY) { - pr_warn("failing symbol_get of non-GPLONLY symbol %s.\n", - symbol); - goto fail; + scoped_guard(rcu) { + if (!find_symbol(&fsa)) + return NULL; + if (fsa.license != GPL_ONLY) { + pr_warn("failing symbol_get of non-GPLONLY symbol %s.\n", + symbol); + return NULL; + } + if (strong_try_module_get(fsa.owner)) + return NULL; } - if (strong_try_module_get(fsa.owner)) - goto fail; - preempt_enable(); return (void *)kernel_symbol_value(fsa.sym); -fail: - preempt_enable(); - return NULL; } EXPORT_SYMBOL_GPL(__symbol_get); @@ -1582,12 +1630,11 @@ static void __layout_sections(struct module *mod, struct load_info *info, bool i { unsigned int m, i; + /* + * { Mask of required section header flags, + * Mask of excluded section header flags } + */ static const unsigned long masks[][2] = { - /* - * NOTE: all executable code must be the first section - * in this array; otherwise modify the text_size - * finder in the two loops below - */ { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL }, { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL }, { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL }, @@ -1679,15 +1726,30 @@ static void module_license_taint_check(struct module *mod, const char *license) } } -static void setup_modinfo(struct module *mod, struct load_info *info) +static int setup_modinfo(struct module *mod, struct load_info *info) { const struct module_attribute *attr; + char *imported_namespace; int i; for (i = 0; (attr = modinfo_attrs[i]); i++) { if (attr->setup) attr->setup(mod, get_modinfo(info, attr->attr.name)); } + + for_each_modinfo_entry(imported_namespace, info, "import_ns") { + /* + * 'module:' prefixed namespaces are implicit, disallow + * explicit imports. + */ + if (strstarts(imported_namespace, "module:")) { + pr_err("%s: module tries to import module namespace: %s\n", + mod->name, imported_namespace); + return -EPERM; + } + } + + return 0; } static void free_modinfo(struct module *mod) @@ -2642,7 +2704,6 @@ static int move_module(struct module *mod, struct load_info *info) for_each_mod_mem_type(type) { if (!mod->mem[type].size) { mod->mem[type].base = NULL; - mod->mem[type].rw_copy = NULL; continue; } @@ -2659,7 +2720,6 @@ static int move_module(struct module *mod, struct load_info *info) void *dest; Elf_Shdr *shdr = &info->sechdrs[i]; const char *sname; - unsigned long addr; if (!(shdr->sh_flags & SHF_ALLOC)) continue; @@ -2680,14 +2740,12 @@ static int move_module(struct module *mod, struct load_info *info) ret = PTR_ERR(dest); goto out_err; } - addr = (unsigned long)dest; codetag_section_found = true; } else { enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT; unsigned long offset = shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK; - addr = (unsigned long)mod->mem[type].base + offset; - dest = mod->mem[type].rw_copy + offset; + dest = mod->mem[type].base + offset; } if (shdr->sh_type != SHT_NOBITS) { @@ -2710,13 +2768,14 @@ static int move_module(struct module *mod, struct load_info *info) * users of info can keep taking advantage and using the newly * minted official memory area. */ - shdr->sh_addr = addr; + shdr->sh_addr = (unsigned long)dest; pr_debug("\t0x%lx 0x%.8lx %s\n", (long)shdr->sh_addr, (long)shdr->sh_size, info->secstrings + shdr->sh_name); } return 0; out_err: + module_memory_restore_rox(mod); for (t--; t >= 0; t--) module_memory_free(mod, t); if (codetag_section_found) @@ -2791,7 +2850,6 @@ core_param(module_blacklist, module_blacklist, charp, 0400); static struct module *layout_and_allocate(struct load_info *info, int flags) { struct module *mod; - unsigned int ndx; int err; /* Allow arches to frob section contents and sizes. */ @@ -2809,22 +2867,11 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC; /* - * Mark ro_after_init section with SHF_RO_AFTER_INIT so that - * layout_sections() can put it in the right place. + * Mark relevant sections as SHF_RO_AFTER_INIT so layout_sections() can + * put them in the right place. * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set. */ - ndx = find_sec(info, ".data..ro_after_init"); - if (ndx) - info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT; - /* - * Mark the __jump_table section as ro_after_init as well: these data - * structures are never modified, with the exception of entries that - * refer to code in the __init section, which are annotated as such - * at module load time. - */ - ndx = find_sec(info, "__jump_table"); - if (ndx) - info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT; + module_mark_ro_after_init(info->hdr, info->sechdrs, info->secstrings); /* * Determine total sizes, and put offsets in sh_entsize. For now @@ -2852,6 +2899,7 @@ static void module_deallocate(struct module *mod, struct load_info *info) { percpu_modfree(mod); module_arch_freeing_init(mod); + codetag_free_module_sections(mod); free_mod_mem(mod); } @@ -2863,17 +2911,8 @@ int __weak module_finalize(const Elf_Ehdr *hdr, return 0; } -int __weak module_post_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *me) -{ - return 0; -} - static int post_relocation(struct module *mod, const struct load_info *info) { - int ret; - /* Sort exception table now relocations are done. */ sort_extable(mod->extable, mod->extable + mod->num_exentries); @@ -2885,24 +2924,7 @@ static int post_relocation(struct module *mod, const struct load_info *info) add_kallsyms(mod, info); /* Arch-specific module finalizing. */ - ret = module_finalize(info->hdr, info->sechdrs, mod); - if (ret) - return ret; - - for_each_mod_mem_type(type) { - struct module_memory *mem = &mod->mem[type]; - - if (mem->is_rox) { - if (!execmem_update_copy(mem->base, mem->rw_copy, - mem->size)) - return -ENOMEM; - - vfree(mem->rw_copy); - mem->rw_copy = NULL; - } - } - - return module_post_finalize(info->hdr, info->sechdrs, mod); + return module_finalize(info->hdr, info->sechdrs, mod); } /* Call module constructors. */ @@ -3049,7 +3071,7 @@ static noinline int do_init_module(struct module *mod) #endif /* * We want to free module_init, but be aware that kallsyms may be - * walking this with preempt disabled. In all the failure paths, we + * walking this within an RCU read section. In all the failure paths, we * call synchronize_rcu(), but we don't want to slow down the success * path. execmem_free() cannot be called in an interrupt, so do the * work and call synchronize_rcu() in a work queue. @@ -3384,7 +3406,9 @@ static int load_module(struct load_info *info, const char __user *uargs, goto free_unload; /* Set up MODINFO_ATTR fields */ - setup_modinfo(mod, info); + err = setup_modinfo(mod, info); + if (err) + goto free_modinfo; /* Fix up syms, so that st_value is a pointer to location. */ err = simplify_symbols(mod, info); @@ -3447,11 +3471,12 @@ static int load_module(struct load_info *info, const char __user *uargs, goto sysfs_cleanup; } + if (codetag_load_module(mod)) + goto sysfs_cleanup; + /* Get rid of temporary copy. */ free_copy(info, flags); - codetag_load_module(mod); - /* Done! */ trace_module_load(mod); @@ -3499,6 +3524,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mod->mem[type].size); } + module_memory_restore_rox(mod); module_deallocate(mod, info); free_copy: /* @@ -3715,28 +3741,23 @@ out: /* Given an address, look for it in the module exception tables. */ const struct exception_table_entry *search_module_extables(unsigned long addr) { - const struct exception_table_entry *e = NULL; struct module *mod; - preempt_disable(); + guard(rcu)(); mod = __module_address(addr); if (!mod) - goto out; + return NULL; if (!mod->num_exentries) - goto out; - - e = search_extable(mod->extable, - mod->num_exentries, - addr); -out: - preempt_enable(); - + return NULL; /* - * Now, if we found one, we are running inside it now, hence - * we cannot unload the module, hence no refcnt needed. + * The address passed here belongs to a module that is currently + * invoked (we are running inside it). Therefore its module::refcnt + * needs already be >0 to ensure that it is not removed at this stage. + * All other user need to invoke this function within a RCU read + * section. */ - return e; + return search_extable(mod->extable, mod->num_exentries, addr); } /** @@ -3748,20 +3769,15 @@ out: */ bool is_module_address(unsigned long addr) { - bool ret; - - preempt_disable(); - ret = __module_address(addr) != NULL; - preempt_enable(); - - return ret; + guard(rcu)(); + return __module_address(addr) != NULL; } /** * __module_address() - get the module which contains an address. * @addr: the address. * - * Must be called with preempt disabled or module mutex held so that + * Must be called within RCU read section or module mutex held so that * module doesn't get freed during this. */ struct module *__module_address(unsigned long addr) @@ -3779,8 +3795,6 @@ struct module *__module_address(unsigned long addr) return NULL; lookup: - module_assert_mutex_or_preempt(); - mod = mod_find(addr, &mod_tree); if (mod) { BUG_ON(!within_module(addr, mod)); @@ -3800,20 +3814,28 @@ lookup: */ bool is_module_text_address(unsigned long addr) { - bool ret; + guard(rcu)(); + return __module_text_address(addr) != NULL; +} - preempt_disable(); - ret = __module_text_address(addr) != NULL; - preempt_enable(); +void module_for_each_mod(int(*func)(struct module *mod, void *data), void *data) +{ + struct module *mod; - return ret; + guard(rcu)(); + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; + if (func(mod, data)) + break; + } } /** * __module_text_address() - get the module whose code contains an address. * @addr: the address. * - * Must be called with preempt disabled or module mutex held so that + * Must be called within RCU read section or module mutex held so that * module doesn't get freed during this. */ struct module *__module_text_address(unsigned long addr) @@ -3836,7 +3858,7 @@ void print_modules(void) printk(KERN_DEFAULT "Modules linked in:"); /* Most callers should already have preempt disabled, but make sure */ - preempt_disable(); + guard(rcu)(); list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; @@ -3844,7 +3866,6 @@ void print_modules(void) } print_unloaded_tainted_modules(); - preempt_enable(); if (last_unloaded_module.name[0]) pr_cont(" [last unloaded: %s%s]", last_unloaded_module.name, last_unloaded_module.taints); diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c index 74834ba15615..8fd438529fbc 100644 --- a/kernel/module/strict_rwx.c +++ b/kernel/module/strict_rwx.c @@ -9,6 +9,7 @@ #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/set_memory.h> +#include <linux/execmem.h> #include "internal.h" static int module_set_memory(const struct module *mod, enum mod_mem_type type, @@ -32,12 +33,12 @@ static int module_set_memory(const struct module *mod, enum mod_mem_type type, int module_enable_text_rox(const struct module *mod) { for_class_mod_mem_type(type, text) { + const struct module_memory *mem = &mod->mem[type]; int ret; - if (mod->mem[type].is_rox) - continue; - - if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + if (mem->is_rox) + ret = execmem_restore_rox(mem->base, mem->size); + else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) ret = module_set_memory(mod, type, set_memory_rox); else ret = module_set_memory(mod, type, set_memory_x); @@ -86,8 +87,9 @@ int module_enable_data_nx(const struct module *mod) return 0; } -int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, - char *secstrings, struct module *mod) +int module_enforce_rwx_sections(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, + const char *secstrings, + const struct module *mod) { const unsigned long shf_wx = SHF_WRITE | SHF_EXECINSTR; int i; @@ -105,3 +107,45 @@ int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, return 0; } + +static const char *const ro_after_init[] = { + /* + * Section .data..ro_after_init holds data explicitly annotated by + * __ro_after_init. + */ + ".data..ro_after_init", + + /* + * Section __jump_table holds data structures that are never modified, + * with the exception of entries that refer to code in the __init + * section, which are marked as such at module load time. + */ + "__jump_table", + +#ifdef CONFIG_HAVE_STATIC_CALL_INLINE + /* + * Section .static_call_sites holds data structures that need to be + * sorted and processed at module load time but are never modified + * afterwards. + */ + ".static_call_sites", +#endif +}; + +void module_mark_ro_after_init(const Elf_Ehdr *hdr, Elf_Shdr *sechdrs, + const char *secstrings) +{ + int i, j; + + for (i = 1; i < hdr->e_shnum; i++) { + Elf_Shdr *shdr = &sechdrs[i]; + + for (j = 0; j < ARRAY_SIZE(ro_after_init); j++) { + if (strcmp(secstrings + shdr->sh_name, + ro_after_init[j]) == 0) { + shdr->sh_flags |= SHF_RO_AFTER_INIT; + break; + } + } + } +} diff --git a/kernel/module/tracking.c b/kernel/module/tracking.c index 16742d1c630c..4fefec5b683c 100644 --- a/kernel/module/tracking.c +++ b/kernel/module/tracking.c @@ -21,8 +21,6 @@ int try_add_tainted_module(struct module *mod) { struct mod_unload_taint *mod_taint; - module_assert_mutex_or_preempt(); - if (!mod->taints) goto out; diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c index 277197977d43..d3204c5c74eb 100644 --- a/kernel/module/tree_lookup.c +++ b/kernel/module/tree_lookup.c @@ -12,11 +12,11 @@ /* * Use a latched RB-tree for __module_address(); this allows us to use - * RCU-sched lookups of the address from any context. + * RCU lookups of the address from any context. * - * This is conditional on PERF_EVENTS || TRACING because those can really hit - * __module_address() hard by doing a lot of stack unwinding; potentially from - * NMI context. + * This is conditional on PERF_EVENTS || TRACING || CFI_CLANG because those can + * really hit __module_address() hard by doing a lot of stack unwinding; + * potentially from NMI context. */ static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n) diff --git a/kernel/module/version.c b/kernel/module/version.c index 3718a8868321..2beefeba82d9 100644 --- a/kernel/module/version.c +++ b/kernel/module/version.c @@ -79,17 +79,17 @@ int check_modstruct_version(const struct load_info *info, .name = "module_layout", .gplok = true, }; + bool have_symbol; /* * Since this should be found in kernel (which can't be removed), no - * locking is necessary -- use preempt_disable() to placate lockdep. + * locking is necessary. Regardless use a RCU read section to keep + * lockdep happy. */ - preempt_disable(); - if (!find_symbol(&fsa)) { - preempt_enable(); - BUG(); - } - preempt_enable(); + scoped_guard(rcu) + have_symbol = find_symbol(&fsa); + BUG_ON(!have_symbol); + return check_version(info, "module_layout", mod, fsa.crc); } diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index c9d97ed20122..5f31fdff8a38 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -128,17 +128,13 @@ out_time: out_net: put_cgroup_ns(new_nsp->cgroup_ns); out_cgroup: - if (new_nsp->pid_ns_for_children) - put_pid_ns(new_nsp->pid_ns_for_children); + put_pid_ns(new_nsp->pid_ns_for_children); out_pid: - if (new_nsp->ipc_ns) - put_ipc_ns(new_nsp->ipc_ns); + put_ipc_ns(new_nsp->ipc_ns); out_ipc: - if (new_nsp->uts_ns) - put_uts_ns(new_nsp->uts_ns); + put_uts_ns(new_nsp->uts_ns); out_uts: - if (new_nsp->mnt_ns) - put_mnt_ns(new_nsp->mnt_ns); + put_mnt_ns(new_nsp->mnt_ns); out_ns: kmem_cache_free(nsproxy_cachep, new_nsp); return ERR_PTR(err); @@ -189,18 +185,12 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) void free_nsproxy(struct nsproxy *ns) { - if (ns->mnt_ns) - put_mnt_ns(ns->mnt_ns); - if (ns->uts_ns) - put_uts_ns(ns->uts_ns); - if (ns->ipc_ns) - put_ipc_ns(ns->ipc_ns); - if (ns->pid_ns_for_children) - put_pid_ns(ns->pid_ns_for_children); - if (ns->time_ns) - put_time_ns(ns->time_ns); - if (ns->time_ns_for_children) - put_time_ns(ns->time_ns_for_children); + put_mnt_ns(ns->mnt_ns); + put_uts_ns(ns->uts_ns); + put_ipc_ns(ns->ipc_ns); + put_pid_ns(ns->pid_ns_for_children); + put_time_ns(ns->time_ns); + put_time_ns(ns->time_ns_for_children); put_cgroup_ns(ns->cgroup_ns); put_net(ns->net_ns); kmem_cache_free(nsproxy_cachep, ns); diff --git a/kernel/padata.c b/kernel/padata.c index 418987056340..7eee94166357 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -290,7 +290,7 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd, if (remove_object) { list_del_init(&padata->list); ++pd->processed; - pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, false); + pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu); } spin_unlock(&reorder->lock); @@ -358,7 +358,8 @@ static void padata_reorder(struct parallel_data *pd) * To avoid UAF issue, add pd ref here, and put pd ref after reorder_work finish. */ padata_get_pd(pd); - queue_work(pinst->serial_wq, &pd->reorder_work); + if (!queue_work(pinst->serial_wq, &pd->reorder_work)) + padata_put_pd(pd); } } diff --git a/kernel/panic.c b/kernel/panic.c index d8635d5cecb2..b0b9a8bf4560 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -97,6 +97,36 @@ static const struct ctl_table kern_panic_table[] = { }, #endif { + .procname = "panic", + .data = &panic_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "panic_on_oops", + .data = &panic_on_oops, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "panic_print", + .data = &panic_print, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "panic_on_warn", + .data = &panic_on_warn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { .procname = "warn_limit", .data = &warn_limit, .maxlen = sizeof(warn_limit), @@ -277,12 +307,10 @@ static void panic_other_cpus_shutdown(bool crash_kexec) } /** - * panic - halt the system - * @fmt: The text string to print - * - * Display a message, then perform cleanups. + * panic - halt the system + * @fmt: The text string to print * - * This function never returns. + * Display a message, then perform cleanups. This function never returns. */ void panic(const char *fmt, ...) { @@ -511,6 +539,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { TAINT_FLAG(AUX, 'X', ' ', true), TAINT_FLAG(RANDSTRUCT, 'T', ' ', true), TAINT_FLAG(TEST, 'N', ' ', true), + TAINT_FLAG(FWCTL, 'J', ' ', true), }; #undef TAINT_FLAG @@ -832,9 +861,15 @@ device_initcall(register_warn_debugfs); */ __visible noinstr void __stack_chk_fail(void) { + unsigned long flags; + instrumentation_begin(); + flags = user_access_save(); + panic("stack-protector: Kernel stack is corrupted in: %pB", __builtin_return_address(0)); + + user_access_restore(flags); instrumentation_end(); } EXPORT_SYMBOL(__stack_chk_fail); diff --git a/kernel/params.c b/kernel/params.c index 0074d29c9b80..b92d64161b75 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -551,7 +551,7 @@ struct module_param_attrs { unsigned int num; struct attribute_group grp; - struct param_attribute attrs[]; + struct param_attribute attrs[] __counted_by(num); }; #ifdef CONFIG_SYSFS @@ -651,35 +651,32 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, } /* Enlarge allocations. */ - new_mp = krealloc(mk->mp, - sizeof(*mk->mp) + - sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1), + new_mp = krealloc(mk->mp, struct_size(mk->mp, attrs, mk->mp->num + 1), GFP_KERNEL); if (!new_mp) return -ENOMEM; mk->mp = new_mp; + mk->mp->num++; /* Extra pointer for NULL terminator */ - new_attrs = krealloc(mk->mp->grp.attrs, - sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2), - GFP_KERNEL); + new_attrs = krealloc_array(mk->mp->grp.attrs, mk->mp->num + 1, + sizeof(mk->mp->grp.attrs[0]), GFP_KERNEL); if (!new_attrs) return -ENOMEM; mk->mp->grp.attrs = new_attrs; /* Tack new one on the end. */ - memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0])); - sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr); - mk->mp->attrs[mk->mp->num].param = kp; - mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show; + memset(&mk->mp->attrs[mk->mp->num - 1], 0, sizeof(mk->mp->attrs[0])); + sysfs_attr_init(&mk->mp->attrs[mk->mp->num - 1].mattr.attr); + mk->mp->attrs[mk->mp->num - 1].param = kp; + mk->mp->attrs[mk->mp->num - 1].mattr.show = param_attr_show; /* Do not allow runtime DAC changes to make param writable. */ if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) - mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store; + mk->mp->attrs[mk->mp->num - 1].mattr.store = param_attr_store; else - mk->mp->attrs[mk->mp->num].mattr.store = NULL; - mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name; - mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm; - mk->mp->num++; + mk->mp->attrs[mk->mp->num - 1].mattr.store = NULL; + mk->mp->attrs[mk->mp->num - 1].mattr.attr.name = (char *)name; + mk->mp->attrs[mk->mp->num - 1].mattr.attr.mode = kp->perm; /* Fix up all the pointers, since krealloc can move us */ for (i = 0; i < mk->mp->num; i++) @@ -763,38 +760,35 @@ void destroy_params(const struct kernel_param *params, unsigned num) params[i].ops->free(params[i].arg); } -static struct module_kobject * __init locate_module_kobject(const char *name) +struct module_kobject __modinit * lookup_or_create_module_kobject(const char *name) { struct module_kobject *mk; struct kobject *kobj; int err; kobj = kset_find_obj(module_kset, name); - if (kobj) { - mk = to_module_kobject(kobj); - } else { - mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); - BUG_ON(!mk); - - mk->mod = THIS_MODULE; - mk->kobj.kset = module_kset; - err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, - "%s", name); -#ifdef CONFIG_MODULES - if (!err) - err = sysfs_create_file(&mk->kobj, &module_uevent.attr); -#endif - if (err) { - kobject_put(&mk->kobj); - pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n", - name, err); - return NULL; - } + if (kobj) + return to_module_kobject(kobj); - /* So that we hold reference in both cases. */ - kobject_get(&mk->kobj); + mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); + if (!mk) + return NULL; + + mk->mod = THIS_MODULE; + mk->kobj.kset = module_kset; + err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name); + if (IS_ENABLED(CONFIG_MODULES) && !err) + err = sysfs_create_file(&mk->kobj, &module_uevent.attr); + if (err) { + kobject_put(&mk->kobj); + pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n", + name, err); + return NULL; } + /* So that we hold reference in both cases. */ + kobject_get(&mk->kobj); + return mk; } @@ -805,7 +799,7 @@ static void __init kernel_add_sysfs_param(const char *name, struct module_kobject *mk; int err; - mk = locate_module_kobject(name); + mk = lookup_or_create_module_kobject(name); if (!mk) return; @@ -876,7 +870,7 @@ static void __init version_sysfs_builtin(void) int err; for (vattr = __start___modver; vattr < __stop___modver; vattr++) { - mk = locate_module_kobject(vattr->module_name); + mk = lookup_or_create_module_kobject(vattr->module_name); if (mk) { err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); WARN_ON_ONCE(err); @@ -949,7 +943,9 @@ struct kset *module_kset; static void module_kobj_release(struct kobject *kobj) { struct module_kobject *mk = to_module_kobject(kobj); - complete(mk->kobj_completion); + + if (mk->kobj_completion) + complete(mk->kobj_completion); } const struct kobj_type module_ktype = { diff --git a/kernel/pid.c b/kernel/pid.c index 924084713be8..8317bcbc7cf7 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -88,20 +88,6 @@ struct pid_namespace init_pid_ns = { }; EXPORT_SYMBOL_GPL(init_pid_ns); -/* - * Note: disable interrupts while the pidmap_lock is held as an - * interrupt might come in and do read_lock(&tasklist_lock). - * - * If we don't disable interrupts there is a nasty deadlock between - * detach_pid()->free_pid() and another cpu that does - * spin_lock(&pidmap_lock) followed by an interrupt routine that does - * read_lock(&tasklist_lock); - * - * After we clean up the tasklist_lock and know there are no - * irq handlers that take it we can leave the interrupts enabled. - * For now it is easier to be safe than to prove it can't happen. - */ - static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock); @@ -114,6 +100,7 @@ void put_pid(struct pid *pid) ns = pid->numbers[pid->level].ns; if (refcount_dec_and_test(&pid->count)) { + WARN_ON_ONCE(pid->stashed); kmem_cache_free(ns->pid_cachep, pid); put_pid_ns(ns); } @@ -128,11 +115,11 @@ static void delayed_put_pid(struct rcu_head *rhp) void free_pid(struct pid *pid) { - /* We can be called with write_lock_irq(&tasklist_lock) held */ int i; - unsigned long flags; - spin_lock_irqsave(&pidmap_lock, flags); + lockdep_assert_not_held(&tasklist_lock); + + spin_lock(&pidmap_lock); for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; struct pid_namespace *ns = upid->ns; @@ -155,11 +142,23 @@ void free_pid(struct pid *pid) idr_remove(&ns->idr, upid->nr); } pidfs_remove_pid(pid); - spin_unlock_irqrestore(&pidmap_lock, flags); + spin_unlock(&pidmap_lock); call_rcu(&pid->rcu, delayed_put_pid); } +void free_pids(struct pid **pids) +{ + int tmp; + + /* + * This can batch pidmap_lock. + */ + for (tmp = PIDTYPE_MAX; --tmp >= 0; ) + if (pids[tmp]) + free_pid(pids[tmp]); +} + struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, size_t set_tid_size) { @@ -211,7 +210,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, } idr_preload(GFP_KERNEL); - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); if (tid) { nr = idr_alloc(&tmp->idr, NULL, tid, @@ -238,7 +237,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, pid_max, GFP_ATOMIC); } - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); idr_preload_end(); if (nr < 0) { @@ -272,7 +271,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, upid = pid->numbers + ns->level; idr_preload(GFP_KERNEL); - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; pidfs_add_pid(pid); @@ -281,18 +280,18 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->pid_allocated++; } - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); idr_preload_end(); return pid; out_unlock: - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); idr_preload_end(); put_pid_ns(ns); out_free: - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); while (++i <= ns->level) { upid = pid->numbers + i; idr_remove(&upid->ns->idr, upid->nr); @@ -302,7 +301,7 @@ out_free: if (ns->pid_allocated == PIDNS_ADDING) idr_set_cursor(&ns->idr, 0); - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); kmem_cache_free(ns->pid_cachep, pid); return ERR_PTR(retval); @@ -310,9 +309,9 @@ out_free: void disable_pid_allocation(struct pid_namespace *ns) { - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); ns->pid_allocated &= ~PIDNS_ADDING; - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); } struct pid *find_pid_ns(int nr, struct pid_namespace *ns) @@ -339,43 +338,45 @@ static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type) */ void attach_pid(struct task_struct *task, enum pid_type type) { - struct pid *pid = *task_pid_ptr(task, type); + struct pid *pid; + + lockdep_assert_held_write(&tasklist_lock); + + pid = *task_pid_ptr(task, type); hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]); } -static void __change_pid(struct task_struct *task, enum pid_type type, - struct pid *new) +static void __change_pid(struct pid **pids, struct task_struct *task, + enum pid_type type, struct pid *new) { - struct pid **pid_ptr = task_pid_ptr(task, type); - struct pid *pid; + struct pid **pid_ptr, *pid; int tmp; + lockdep_assert_held_write(&tasklist_lock); + + pid_ptr = task_pid_ptr(task, type); pid = *pid_ptr; hlist_del_rcu(&task->pid_links[type]); *pid_ptr = new; - if (type == PIDTYPE_PID) { - WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID)); - wake_up_all(&pid->wait_pidfd); - } - for (tmp = PIDTYPE_MAX; --tmp >= 0; ) if (pid_has_task(pid, tmp)) return; - free_pid(pid); + WARN_ON(pids[type]); + pids[type] = pid; } -void detach_pid(struct task_struct *task, enum pid_type type) +void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type type) { - __change_pid(task, type, NULL); + __change_pid(pids, task, type, NULL); } -void change_pid(struct task_struct *task, enum pid_type type, +void change_pid(struct pid **pids, struct task_struct *task, enum pid_type type, struct pid *pid) { - __change_pid(task, type, pid); + __change_pid(pids, task, type, pid); attach_pid(task, type); } @@ -386,6 +387,8 @@ void exchange_tids(struct task_struct *left, struct task_struct *right) struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID]; struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID]; + lockdep_assert_held_write(&tasklist_lock); + /* Swap the single entry tid lists */ hlists_swap_heads_rcu(head1, head2); @@ -403,6 +406,7 @@ void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) { WARN_ON_ONCE(type == PIDTYPE_PID); + lockdep_assert_held_write(&tasklist_lock); hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]); } @@ -564,15 +568,29 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) */ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags) { - unsigned int f_flags; + unsigned int f_flags = 0; struct pid *pid; struct task_struct *task; + enum pid_type type; - pid = pidfd_get_pid(pidfd, &f_flags); - if (IS_ERR(pid)) - return ERR_CAST(pid); + switch (pidfd) { + case PIDFD_SELF_THREAD: + type = PIDTYPE_PID; + pid = get_task_pid(current, type); + break; + case PIDFD_SELF_THREAD_GROUP: + type = PIDTYPE_TGID; + pid = get_task_pid(current, type); + break; + default: + pid = pidfd_get_pid(pidfd, &f_flags); + if (IS_ERR(pid)) + return ERR_CAST(pid); + type = PIDTYPE_TGID; + break; + } - task = get_pid_task(pid, PIDTYPE_TGID); + task = get_pid_task(pid, type); put_pid(pid); if (!task) return ERR_PTR(-ESRCH); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 8f6cfec87555..7098ed44e717 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -107,7 +107,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns goto out_free_idr; ns->ns.ops = &pidns_operations; - ns->pid_max = parent_pid_ns->pid_max; + ns->pid_max = PID_MAX_LIMIT; err = register_pidns_sysctls(ns); if (err) goto out_free_inum; diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ca947ed32e3d..54a623680019 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -380,8 +380,7 @@ config CPU_PM config ENERGY_MODEL bool "Energy Model for devices with DVFS (CPUs, GPUs, etc)" - depends on SMP - depends on CPU_FREQ + depends on CPU_FREQ || PM_DEVFREQ help Several subsystems (thermal and/or the task scheduler for example) can leverage information about the energy consumed by devices to diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 3874f0e97651..ea7995a25780 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -161,22 +161,10 @@ static void em_debug_create_pd(struct device *dev) {} static void em_debug_remove_pd(struct device *dev) {} #endif -static void em_destroy_table_rcu(struct rcu_head *rp) -{ - struct em_perf_table __rcu *table; - - table = container_of(rp, struct em_perf_table, rcu); - kfree(table); -} - static void em_release_table_kref(struct kref *kref) { - struct em_perf_table __rcu *table; - /* It was the last owner of this table so we can free */ - table = container_of(kref, struct em_perf_table, kref); - - call_rcu(&table->rcu, em_destroy_table_rcu); + kfree_rcu(container_of(kref, struct em_perf_table, kref), rcu); } /** @@ -185,7 +173,7 @@ static void em_release_table_kref(struct kref *kref) * * No return values. */ -void em_table_free(struct em_perf_table __rcu *table) +void em_table_free(struct em_perf_table *table) { kref_put(&table->kref, em_release_table_kref); } @@ -198,9 +186,9 @@ void em_table_free(struct em_perf_table __rcu *table) * has a user. * Returns allocated table or NULL. */ -struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd) +struct em_perf_table *em_table_alloc(struct em_perf_domain *pd) { - struct em_perf_table __rcu *table; + struct em_perf_table *table; int table_size; table_size = sizeof(struct em_perf_state) * pd->nr_perf_states; @@ -239,12 +227,16 @@ static void em_init_performance(struct device *dev, struct em_perf_domain *pd, } static int em_compute_costs(struct device *dev, struct em_perf_state *table, - struct em_data_callback *cb, int nr_states, + const struct em_data_callback *cb, int nr_states, unsigned long flags) { unsigned long prev_cost = ULONG_MAX; int i, ret; + /* This is needed only for CPUs and EAS skip other devices */ + if (!_is_cpu_device(dev)) + return 0; + /* Compute the cost of each performance state. */ for (i = nr_states - 1; i >= 0; i--) { unsigned long power_res, cost; @@ -308,9 +300,9 @@ int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, * Return 0 on success or an error code on failure. */ int em_dev_update_perf_domain(struct device *dev, - struct em_perf_table __rcu *new_table) + struct em_perf_table *new_table) { - struct em_perf_table __rcu *old_table; + struct em_perf_table *old_table; struct em_perf_domain *pd; if (!dev) @@ -327,7 +319,8 @@ int em_dev_update_perf_domain(struct device *dev, kref_get(&new_table->kref); - old_table = pd->em_table; + old_table = rcu_dereference_protected(pd->em_table, + lockdep_is_held(&em_pd_mutex)); rcu_assign_pointer(pd->em_table, new_table); em_cpufreq_update_efficiencies(dev, new_table->state); @@ -341,7 +334,7 @@ EXPORT_SYMBOL_GPL(em_dev_update_perf_domain); static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, struct em_perf_state *table, - struct em_data_callback *cb, + const struct em_data_callback *cb, unsigned long flags) { unsigned long power, freq, prev_freq = 0; @@ -396,10 +389,11 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, } static int em_create_pd(struct device *dev, int nr_states, - struct em_data_callback *cb, cpumask_t *cpus, + const struct em_data_callback *cb, + const cpumask_t *cpus, unsigned long flags) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; struct em_perf_domain *pd; struct device *cpu_dev; int cpu, ret, num_cpus; @@ -556,9 +550,10 @@ EXPORT_SYMBOL_GPL(em_cpu_get); * Return 0 on success */ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, - struct em_data_callback *cb, cpumask_t *cpus, - bool microwatts) + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts) { + struct em_perf_table *em_table; unsigned long cap, prev_cap = 0; unsigned long flags = 0; int cpu, ret; @@ -631,7 +626,9 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, dev->em_pd->min_perf_state = 0; dev->em_pd->max_perf_state = nr_states - 1; - em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state); + em_table = rcu_dereference_protected(dev->em_pd->em_table, + lockdep_is_held(&em_pd_mutex)); + em_cpufreq_update_efficiencies(dev, em_table->state); em_debug_create_pd(dev); dev_info(dev, "EM: created perf domain\n"); @@ -668,7 +665,8 @@ void em_dev_unregister_perf_domain(struct device *dev) mutex_lock(&em_pd_mutex); em_debug_remove_pd(dev); - em_table_free(dev->em_pd->em_table); + em_table_free(rcu_dereference_protected(dev->em_pd->em_table, + lockdep_is_held(&em_pd_mutex))); kfree(dev->em_pd); dev->em_pd = NULL; @@ -676,9 +674,9 @@ void em_dev_unregister_perf_domain(struct device *dev) } EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain); -static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd) +static struct em_perf_table *em_table_dup(struct em_perf_domain *pd) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; struct em_perf_state *ps, *new_ps; int ps_size; @@ -700,14 +698,16 @@ static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd) } static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd, - struct em_perf_table __rcu *em_table) + struct em_perf_table *em_table) { int ret; - ret = em_compute_costs(dev, em_table->state, NULL, pd->nr_perf_states, - pd->flags); - if (ret) - goto free_em_table; + if (!em_is_artificial(pd)) { + ret = em_compute_costs(dev, em_table->state, NULL, + pd->nr_perf_states, pd->flags); + if (ret) + goto free_em_table; + } ret = em_dev_update_perf_domain(dev, em_table); if (ret) @@ -727,11 +727,24 @@ free_em_table: * Adjustment of CPU performance values after boot, when all CPUs capacites * are correctly calculated. */ -static void em_adjust_new_capacity(struct device *dev, - struct em_perf_domain *pd, - u64 max_cap) +static void em_adjust_new_capacity(unsigned int cpu, struct device *dev, + struct em_perf_domain *pd) { - struct em_perf_table __rcu *em_table; + unsigned long cpu_capacity = arch_scale_cpu_capacity(cpu); + struct em_perf_table *em_table; + struct em_perf_state *table; + unsigned long em_max_perf; + + rcu_read_lock(); + table = em_perf_state_from_pd(pd); + em_max_perf = table[pd->nr_perf_states - 1].performance; + rcu_read_unlock(); + + if (em_max_perf == cpu_capacity) + return; + + pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n", cpu, + cpu_capacity, em_max_perf); em_table = em_table_dup(pd); if (!em_table) { @@ -744,12 +757,27 @@ static void em_adjust_new_capacity(struct device *dev, em_recalc_and_update(dev, pd, em_table); } +/** + * em_adjust_cpu_capacity() - Adjust the EM for a CPU after a capacity update. + * @cpu: Target CPU. + * + * Adjust the existing EM for @cpu after a capacity update under the assumption + * that the capacity has been updated in the same way for all of the CPUs in + * the same perf domain. + */ +void em_adjust_cpu_capacity(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + struct em_perf_domain *pd; + + pd = em_pd_get(dev); + if (pd) + em_adjust_new_capacity(cpu, dev, pd); +} + static void em_check_capacity_update(void) { cpumask_var_t cpu_done_mask; - struct em_perf_state *table; - struct em_perf_domain *pd; - unsigned long cpu_capacity; int cpu; if (!zalloc_cpumask_var(&cpu_done_mask, GFP_KERNEL)) { @@ -760,7 +788,7 @@ static void em_check_capacity_update(void) /* Check if CPUs capacity has changed than update EM */ for_each_possible_cpu(cpu) { struct cpufreq_policy *policy; - unsigned long em_max_perf; + struct em_perf_domain *pd; struct device *dev; if (cpumask_test_cpu(cpu, cpu_done_mask)) @@ -775,32 +803,15 @@ static void em_check_capacity_update(void) } cpufreq_cpu_put(policy); - pd = em_cpu_get(cpu); + dev = get_cpu_device(cpu); + pd = em_pd_get(dev); if (!pd || em_is_artificial(pd)) continue; cpumask_or(cpu_done_mask, cpu_done_mask, em_span_cpus(pd)); - cpu_capacity = arch_scale_cpu_capacity(cpu); - - rcu_read_lock(); - table = em_perf_state_from_pd(pd); - em_max_perf = table[pd->nr_perf_states - 1].performance; - rcu_read_unlock(); - - /* - * Check if the CPU capacity has been adjusted during boot - * and trigger the update for new performance values. - */ - if (em_max_perf == cpu_capacity) - continue; - - pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n", - cpu, cpu_capacity, em_max_perf); - - dev = get_cpu_device(cpu); - em_adjust_new_capacity(dev, pd, cpu_capacity); + em_adjust_new_capacity(cpu, dev, pd); } free_cpumask_var(cpu_done_mask); @@ -822,7 +833,7 @@ static void em_update_workfn(struct work_struct *work) */ int em_dev_update_chip_binning(struct device *dev) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; struct em_perf_domain *pd; int i, ret; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 10a01af63a80..519fb09de5e0 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -11,6 +11,7 @@ #define pr_fmt(fmt) "PM: hibernation: " fmt +#include <crypto/acompress.h> #include <linux/blkdev.h> #include <linux/export.h> #include <linux/suspend.h> @@ -89,6 +90,11 @@ void hibernate_release(void) atomic_inc(&hibernate_atomic); } +bool hibernation_in_progress(void) +{ + return !atomic_read(&hibernate_atomic); +} + bool hibernation_available(void) { return nohibernate == 0 && @@ -132,10 +138,15 @@ bool system_entering_hibernation(void) EXPORT_SYMBOL(system_entering_hibernation); #ifdef CONFIG_PM_DEBUG +static unsigned int pm_test_delay = 5; +module_param(pm_test_delay, uint, 0644); +MODULE_PARM_DESC(pm_test_delay, + "Number of seconds to wait before resuming from hibernation test"); static void hibernation_debug_sleep(void) { - pr_info("debug: Waiting for 5 seconds.\n"); - mdelay(5000); + pr_info("hibernation debug: Waiting for %d second(s).\n", + pm_test_delay); + mdelay(pm_test_delay * 1000); } static int hibernation_test(int level) @@ -411,7 +422,7 @@ int hibernation_snapshot(int platform_mode) goto Thaw; } - suspend_console(); + console_suspend_all(); pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE); @@ -437,7 +448,7 @@ int hibernation_snapshot(int platform_mode) if (error || !in_suspend) pm_restore_gfp_mask(); - resume_console(); + console_resume_all(); dpm_complete(msg); Close: @@ -547,7 +558,7 @@ int hibernation_restore(int platform_mode) int error; pm_prepare_console(); - suspend_console(); + console_suspend_all(); pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { @@ -561,7 +572,7 @@ int hibernation_restore(int platform_mode) } dpm_resume_end(PMSG_RECOVER); pm_restore_gfp_mask(); - resume_console(); + console_resume_all(); pm_restore_console(); return error; } @@ -586,7 +597,7 @@ int hibernation_platform_enter(void) goto Close; entering_platform_hibernation = true; - suspend_console(); + console_suspend_all(); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -639,7 +650,7 @@ int hibernation_platform_enter(void) Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); - resume_console(); + console_resume_all(); Close: hibernation_ops->end(); @@ -756,8 +767,8 @@ int hibernate(void) * Query for the compression algorithm support if compression is enabled. */ if (!nocompress) { - strscpy(hib_comp_algo, hibernate_compressor, sizeof(hib_comp_algo)); - if (crypto_has_comp(hib_comp_algo, 0, 0) != 1) { + strscpy(hib_comp_algo, hibernate_compressor); + if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) { pr_err("%s compression is not available\n", hib_comp_algo); return -EOPNOTSUPP; } @@ -777,6 +788,8 @@ int hibernate(void) goto Restore; ksys_sync_helper(); + if (filesystem_freeze_enabled) + filesystems_freeze(); error = freeze_processes(); if (error) @@ -845,6 +858,7 @@ int hibernate(void) /* Don't bother checking whether freezer_test_done is true */ freezer_test_done = false; Exit: + filesystems_thaw(); pm_notifier_call_chain(PM_POST_HIBERNATION); Restore: pm_restore_console(); @@ -881,6 +895,9 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data) if (error) goto restore; + if (filesystem_freeze_enabled) + filesystems_freeze(); + error = freeze_processes(); if (error) goto exit; @@ -901,7 +918,7 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data) if (error) goto dpm_complete; - suspend_console(); + console_suspend_all(); error = dpm_suspend(PMSG_FREEZE); if (error) @@ -925,7 +942,7 @@ skip: dpm_resume: dpm_resume(PMSG_THAW); - resume_console(); + console_resume_all(); dpm_complete: dpm_complete(PMSG_THAW); @@ -940,6 +957,7 @@ thaw: thaw_processes(); exit: + filesystems_thaw(); pm_notifier_call_chain(PM_POST_HIBERNATION); restore: @@ -1005,10 +1023,10 @@ static int software_resume(void) */ if (!(swsusp_header_flags & SF_NOCOMPRESS_MODE)) { if (swsusp_header_flags & SF_COMPRESSION_ALG_LZ4) - strscpy(hib_comp_algo, COMPRESSION_ALGO_LZ4, sizeof(hib_comp_algo)); + strscpy(hib_comp_algo, COMPRESSION_ALGO_LZ4); else - strscpy(hib_comp_algo, COMPRESSION_ALGO_LZO, sizeof(hib_comp_algo)); - if (crypto_has_comp(hib_comp_algo, 0, 0) != 1) { + strscpy(hib_comp_algo, COMPRESSION_ALGO_LZO); + if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) { pr_err("%s compression is not available\n", hib_comp_algo); error = -EOPNOTSUPP; goto Unlock; @@ -1028,19 +1046,26 @@ static int software_resume(void) if (error) goto Restore; + if (filesystem_freeze_enabled) + filesystems_freeze(); + pm_pr_dbg("Preparing processes for hibernation restore.\n"); error = freeze_processes(); - if (error) + if (error) { + filesystems_thaw(); goto Close_Finish; + } error = freeze_kernel_threads(); if (error) { thaw_processes(); + filesystems_thaw(); goto Close_Finish; } error = load_image_and_restore(); thaw_processes(); + filesystems_thaw(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); Restore: @@ -1446,22 +1471,21 @@ static const char * const comp_alg_enabled[] = { static int hibernate_compressor_param_set(const char *compressor, const struct kernel_param *kp) { - unsigned int sleep_flags; int index, ret; - sleep_flags = lock_system_sleep(); + if (!mutex_trylock(&system_transition_mutex)) + return -EBUSY; index = sysfs_match_string(comp_alg_enabled, compressor); if (index >= 0) { ret = param_set_copystring(comp_alg_enabled[index], kp); if (!ret) - strscpy(hib_comp_algo, comp_alg_enabled[index], - sizeof(hib_comp_algo)); + strscpy(hib_comp_algo, comp_alg_enabled[index]); } else { ret = index; } - unlock_system_sleep(sleep_flags); + mutex_unlock(&system_transition_mutex); if (ret) pr_debug("Cannot set specified compressor %s\n", diff --git a/kernel/power/main.c b/kernel/power/main.c index 6254814d4817..3d484630505a 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -557,6 +557,10 @@ static int __init pm_debugfs_init(void) late_initcall(pm_debugfs_init); #endif /* CONFIG_DEBUG_FS */ +bool pm_sleep_transition_in_progress(void) +{ + return pm_suspend_in_progress() || hibernation_in_progress(); +} #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_SLEEP_DEBUG @@ -594,7 +598,7 @@ power_attr(pm_print_times); static inline void pm_print_times_init(void) { - pm_print_times_enabled = !!initcall_debug; + pm_print_times_enabled = initcall_debug; } static ssize_t pm_wakeup_irq_show(struct kobject *kobj, @@ -613,7 +617,7 @@ bool pm_debug_messages_on __read_mostly; bool pm_debug_messages_should_print(void) { - return pm_debug_messages_on && pm_suspend_target_state != PM_SUSPEND_ON; + return pm_debug_messages_on && pm_sleep_transition_in_progress(); } EXPORT_SYMBOL_GPL(pm_debug_messages_should_print); @@ -962,6 +966,34 @@ power_attr(pm_freeze_timeout); #endif /* CONFIG_FREEZER*/ +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) +bool filesystem_freeze_enabled = false; + +static ssize_t freeze_filesystems_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", filesystem_freeze_enabled); +} + +static ssize_t freeze_filesystems_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + filesystem_freeze_enabled = !!val; + return n; +} + +power_attr(freeze_filesystems); +#endif /* CONFIG_SUSPEND || CONFIG_HIBERNATION */ + static struct attribute * g[] = { &state_attr.attr, #ifdef CONFIG_PM_TRACE @@ -992,6 +1024,9 @@ static struct attribute * g[] = { #ifdef CONFIG_FREEZER &pm_freeze_timeout_attr.attr, #endif +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) + &freeze_filesystems_attr.attr, +#endif NULL, }; diff --git a/kernel/power/power.h b/kernel/power/power.h index c352dea2f67b..cb1d71562002 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -18,6 +18,10 @@ struct swsusp_info { unsigned long size; } __aligned(PAGE_SIZE); +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) +extern bool filesystem_freeze_enabled; +#endif + #ifdef CONFIG_HIBERNATION /* kernel/power/snapshot.c */ extern void __init hibernate_reserved_size_init(void); @@ -71,10 +75,14 @@ extern void enable_restore_image_protection(void); static inline void enable_restore_image_protection(void) {} #endif /* CONFIG_STRICT_KERNEL_RWX */ +extern bool hibernation_in_progress(void); + #else /* !CONFIG_HIBERNATION */ static inline void hibernate_reserved_size_init(void) {} static inline void hibernate_image_size_init(void) {} + +static inline bool hibernation_in_progress(void) { return false; } #endif /* !CONFIG_HIBERNATION */ #define power_attr(_name) \ diff --git a/kernel/power/process.c b/kernel/power/process.c index 66ac067d9ae6..dc0dfc349f22 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -189,7 +189,7 @@ void thaw_processes(void) oom_killer_enable(); - pr_info("Restarting tasks ... "); + pr_info("Restarting tasks: Starting\n"); __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); @@ -208,7 +208,7 @@ void thaw_processes(void) usermodehelper_enable(); schedule(); - pr_cont("done.\n"); + pr_info("Restarting tasks: Done\n"); trace_suspend_resume(TPS("thaw_processes"), 0, false); } @@ -217,7 +217,7 @@ void thaw_kernel_threads(void) struct task_struct *g, *p; pm_nosig_freezing = false; - pr_info("Restarting kernel threads ... "); + pr_info("Restarting kernel threads ...\n"); thaw_workqueues(); @@ -229,5 +229,5 @@ void thaw_kernel_threads(void) read_unlock(&tasklist_lock); schedule(); - pr_cont("done.\n"); + pr_info("Done restarting kernel threads.\n"); } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index c9fb559a6399..2af36cfe35cd 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1094,16 +1094,15 @@ static void mark_nosave_pages(struct memory_bitmap *bm) ((unsigned long long) region->end_pfn << PAGE_SHIFT) - 1); - for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) - if (pfn_valid(pfn)) { - /* - * It is safe to ignore the result of - * mem_bm_set_bit_check() here, since we won't - * touch the PFNs for which the error is - * returned anyway. - */ - mem_bm_set_bit_check(bm, pfn); - } + for_each_valid_pfn(pfn, region->start_pfn, region->end_pfn) { + /* + * It is safe to ignore the result of + * mem_bm_set_bit_check() here, since we won't + * touch the PFNs for which the error is + * returned anyway. + */ + mem_bm_set_bit_check(bm, pfn); + } } } @@ -1255,21 +1254,20 @@ static void mark_free_pages(struct zone *zone) spin_lock_irqsave(&zone->lock, flags); max_zone_pfn = zone_end_pfn(zone); - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); + for_each_valid_pfn(pfn, zone->zone_start_pfn, max_zone_pfn) { + page = pfn_to_page(pfn); - if (!--page_count) { - touch_nmi_watchdog(); - page_count = WD_PAGE_COUNT; - } + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } - if (page_zone(page) != zone) - continue; + if (page_zone(page) != zone) + continue; - if (!swsusp_page_is_forbidden(page)) - swsusp_unset_page_free(page); - } + if (!swsusp_page_is_forbidden(page)) + swsusp_unset_page_free(page); + } for_each_migratetype_order(order, t) { list_for_each_entry(page, @@ -2270,9 +2268,9 @@ int snapshot_read_next(struct snapshot_handle *handle) */ void *kaddr; - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); copy_page(buffer, kaddr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); handle->buffer = buffer; } else { handle->buffer = page_address(page); @@ -2561,9 +2559,9 @@ static void copy_last_highmem_page(void) if (last_highmem_page) { void *dst; - dst = kmap_atomic(last_highmem_page); + dst = kmap_local_page(last_highmem_page); copy_page(dst, buffer); - kunmap_atomic(dst); + kunmap_local(dst); last_highmem_page = NULL; } } @@ -2881,13 +2879,13 @@ static inline void swap_two_pages_data(struct page *p1, struct page *p2, { void *kaddr1, *kaddr2; - kaddr1 = kmap_atomic(p1); - kaddr2 = kmap_atomic(p2); + kaddr1 = kmap_local_page(p1); + kaddr2 = kmap_local_page(p2); copy_page(buf, kaddr1); copy_page(kaddr1, kaddr2); copy_page(kaddr2, buf); - kunmap_atomic(kaddr2); - kunmap_atomic(kaddr1); + kunmap_local(kaddr2); + kunmap_local(kaddr1); } /** diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 09f8397bae15..76b141b9aac0 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -30,6 +30,7 @@ #include <trace/events/power.h> #include <linux/compiler.h> #include <linux/moduleparam.h> +#include <linux/fs.h> #include "power.h" @@ -91,6 +92,16 @@ static void s2idle_enter(void) { trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true); + /* + * The correctness of the code below depends on the number of online + * CPUs being stable, but CPUs cannot be taken offline or put online + * while it is running. + * + * The s2idle_lock must be acquired before the pending wakeup check to + * prevent pm_system_wakeup() from running as a whole between that check + * and the subsequent s2idle_state update in which case a wakeup event + * would get lost. + */ raw_spin_lock_irq(&s2idle_lock); if (pm_wakeup_pending()) goto out; @@ -98,8 +109,6 @@ static void s2idle_enter(void) s2idle_state = S2IDLE_STATE_ENTER; raw_spin_unlock_irq(&s2idle_lock); - cpus_read_lock(); - /* Push all the CPUs into the idle loop. */ wake_up_all_idle_cpus(); /* Make the current CPU wait so it can enter the idle loop too. */ @@ -112,8 +121,6 @@ static void s2idle_enter(void) */ wake_up_all_idle_cpus(); - cpus_read_unlock(); - raw_spin_lock_irq(&s2idle_lock); out: @@ -368,6 +375,8 @@ static int suspend_prepare(suspend_state_t state) if (error) goto Restore; + if (filesystem_freeze_enabled) + filesystems_freeze(); trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); trace_suspend_resume(TPS("freeze_processes"), 0, false); @@ -502,7 +511,7 @@ int suspend_devices_and_enter(suspend_state_t state) if (error) goto Close; - suspend_console(); + console_suspend_all(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -521,9 +530,9 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); - trace_suspend_resume(TPS("resume_console"), state, true); - resume_console(); - trace_suspend_resume(TPS("resume_console"), state, false); + trace_suspend_resume(TPS("console_resume_all"), state, true); + console_resume_all(); + trace_suspend_resume(TPS("console_resume_all"), state, false); Close: platform_resume_end(state); @@ -544,6 +553,7 @@ int suspend_devices_and_enter(suspend_state_t state) static void suspend_finish(void) { suspend_thaw_processes(); + filesystems_thaw(); pm_notifier_call_chain(PM_POST_SUSPEND); pm_restore_console(); } @@ -582,6 +592,8 @@ static int enter_state(suspend_state_t state) ksys_sync_helper(); trace_suspend_resume(TPS("sync_filesystems"), 0, false); } + if (filesystem_freeze_enabled) + filesystems_freeze(); pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]); pm_suspend_clear_flags(); @@ -603,6 +615,7 @@ static int enter_state(suspend_state_t state) pm_pr_dbg("Finishing wakeup.\n"); suspend_finish(); Unlock: + filesystems_thaw(); mutex_unlock(&system_transition_mutex); return error; } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 82b884b67152..ad13c461b657 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) "PM: " fmt +#include <crypto/acompress.h> #include <linux/module.h> #include <linux/file.h> #include <linux/delay.h> @@ -267,35 +268,26 @@ static void hib_end_io(struct bio *bio) bio_put(bio); } -static int hib_submit_io(blk_opf_t opf, pgoff_t page_off, void *addr, +static int hib_submit_io_sync(blk_opf_t opf, pgoff_t page_off, void *addr) +{ + return bdev_rw_virt(file_bdev(hib_resume_bdev_file), + page_off * (PAGE_SIZE >> 9), addr, PAGE_SIZE, opf); +} + +static int hib_submit_io_async(blk_opf_t opf, pgoff_t page_off, void *addr, struct hib_bio_batch *hb) { - struct page *page = virt_to_page(addr); struct bio *bio; - int error = 0; bio = bio_alloc(file_bdev(hib_resume_bdev_file), 1, opf, GFP_NOIO | __GFP_HIGH); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - pr_err("Adding page to bio failed at %llu\n", - (unsigned long long)bio->bi_iter.bi_sector); - bio_put(bio); - return -EFAULT; - } - - if (hb) { - bio->bi_end_io = hib_end_io; - bio->bi_private = hb; - atomic_inc(&hb->count); - submit_bio(bio); - } else { - error = submit_bio_wait(bio); - bio_put(bio); - } - - return error; + bio_add_virt_nofail(bio, addr, PAGE_SIZE); + bio->bi_end_io = hib_end_io; + bio->bi_private = hb; + atomic_inc(&hb->count); + submit_bio(bio); + return 0; } static int hib_wait_io(struct hib_bio_batch *hb) @@ -315,7 +307,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; - hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL); + hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block, swsusp_header); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); @@ -328,8 +320,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) swsusp_header->flags = flags; if (flags & SF_CRC32_MODE) swsusp_header->crc32 = handle->crc32; - error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, - swsusp_resume_block, swsusp_header, NULL); + error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC, + swsusp_resume_block, swsusp_header); } else { pr_err("Swap header not found!\n"); error = -ENODEV; @@ -379,36 +371,30 @@ static int swsusp_swap_check(void) static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) { + gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; void *src; int ret; if (!offset) return -ENOSPC; - if (hb) { - src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN | - __GFP_NORETRY); - if (src) { - copy_page(src, buf); - } else { - ret = hib_wait_io(hb); /* Free pages */ - if (ret) - return ret; - src = (void *)__get_free_page(GFP_NOIO | - __GFP_NOWARN | - __GFP_NORETRY); - if (src) { - copy_page(src, buf); - } else { - WARN_ON_ONCE(1); - hb = NULL; /* Go synchronous */ - src = buf; - } - } - } else { - src = buf; + if (!hb) + goto sync_io; + + src = (void *)__get_free_page(gfp); + if (!src) { + ret = hib_wait_io(hb); /* Free pages */ + if (ret) + return ret; + src = (void *)__get_free_page(gfp); + if (WARN_ON_ONCE(!src)) + goto sync_io; } - return hib_submit_io(REQ_OP_WRITE | REQ_SYNC, offset, src, hb); + + copy_page(src, buf); + return hib_submit_io_async(REQ_OP_WRITE | REQ_SYNC, offset, src, hb); +sync_io: + return hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC, offset, buf); } static void release_swap_writer(struct swap_map_handle *handle) @@ -635,7 +621,8 @@ static int crc32_threadfn(void *data) */ struct cmp_data { struct task_struct *thr; /* thread */ - struct crypto_comp *cc; /* crypto compressor stream */ + struct crypto_acomp *cc; /* crypto compressor */ + struct acomp_req *cr; /* crypto request */ atomic_t ready; /* ready to start flag */ atomic_t stop; /* ready to stop flag */ int ret; /* return code */ @@ -656,7 +643,6 @@ static atomic_t compressed_size = ATOMIC_INIT(0); static int compress_threadfn(void *data) { struct cmp_data *d = data; - unsigned int cmp_len = 0; while (1) { wait_event(d->go, atomic_read_acquire(&d->ready) || @@ -670,11 +656,13 @@ static int compress_threadfn(void *data) } atomic_set(&d->ready, 0); - cmp_len = CMP_SIZE - CMP_HEADER; - d->ret = crypto_comp_compress(d->cc, d->unc, d->unc_len, - d->cmp + CMP_HEADER, - &cmp_len); - d->cmp_len = cmp_len; + acomp_request_set_callback(d->cr, CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); + acomp_request_set_src_nondma(d->cr, d->unc, d->unc_len); + acomp_request_set_dst_nondma(d->cr, d->cmp + CMP_HEADER, + CMP_SIZE - CMP_HEADER); + d->ret = crypto_acomp_compress(d->cr); + d->cmp_len = d->cr->dlen; atomic_set(&compressed_size, atomic_read(&compressed_size) + d->cmp_len); atomic_set_release(&d->stop, 1); @@ -745,13 +733,20 @@ static int save_compressed_image(struct swap_map_handle *handle, init_waitqueue_head(&data[thr].go); init_waitqueue_head(&data[thr].done); - data[thr].cc = crypto_alloc_comp(hib_comp_algo, 0, 0); + data[thr].cc = crypto_alloc_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC); if (IS_ERR_OR_NULL(data[thr].cc)) { pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc)); ret = -EFAULT; goto out_clean; } + data[thr].cr = acomp_request_alloc(data[thr].cc); + if (!data[thr].cr) { + pr_err("Could not allocate comp request\n"); + ret = -ENOMEM; + goto out_clean; + } + data[thr].thr = kthread_run(compress_threadfn, &data[thr], "image_compress/%u", thr); @@ -899,8 +894,8 @@ out_clean: for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) kthread_stop(data[thr].thr); - if (data[thr].cc) - crypto_free_comp(data[thr].cc); + acomp_request_free(data[thr].cr); + crypto_free_acomp(data[thr].cc); } vfree(data); } @@ -1031,7 +1026,7 @@ static int get_swap_reader(struct swap_map_handle *handle, return -ENOMEM; } - error = hib_submit_io(REQ_OP_READ, offset, tmp->map, NULL); + error = hib_submit_io_sync(REQ_OP_READ, offset, tmp->map); if (error) { release_swap_reader(handle); return error; @@ -1055,7 +1050,10 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = hib_submit_io(REQ_OP_READ, offset, buf, hb); + if (hb) + error = hib_submit_io_async(REQ_OP_READ, offset, buf, hb); + else + error = hib_submit_io_sync(REQ_OP_READ, offset, buf); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { @@ -1142,7 +1140,8 @@ static int load_image(struct swap_map_handle *handle, */ struct dec_data { struct task_struct *thr; /* thread */ - struct crypto_comp *cc; /* crypto compressor stream */ + struct crypto_acomp *cc; /* crypto compressor */ + struct acomp_req *cr; /* crypto request */ atomic_t ready; /* ready to start flag */ atomic_t stop; /* ready to stop flag */ int ret; /* return code */ @@ -1160,7 +1159,6 @@ struct dec_data { static int decompress_threadfn(void *data) { struct dec_data *d = data; - unsigned int unc_len = 0; while (1) { wait_event(d->go, atomic_read_acquire(&d->ready) || @@ -1174,10 +1172,13 @@ static int decompress_threadfn(void *data) } atomic_set(&d->ready, 0); - unc_len = UNC_SIZE; - d->ret = crypto_comp_decompress(d->cc, d->cmp + CMP_HEADER, d->cmp_len, - d->unc, &unc_len); - d->unc_len = unc_len; + acomp_request_set_callback(d->cr, CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); + acomp_request_set_src_nondma(d->cr, d->cmp + CMP_HEADER, + d->cmp_len); + acomp_request_set_dst_nondma(d->cr, d->unc, UNC_SIZE); + d->ret = crypto_acomp_decompress(d->cr); + d->unc_len = d->cr->dlen; if (clean_pages_on_decompress) flush_icache_range((unsigned long)d->unc, @@ -1254,13 +1255,20 @@ static int load_compressed_image(struct swap_map_handle *handle, init_waitqueue_head(&data[thr].go); init_waitqueue_head(&data[thr].done); - data[thr].cc = crypto_alloc_comp(hib_comp_algo, 0, 0); + data[thr].cc = crypto_alloc_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC); if (IS_ERR_OR_NULL(data[thr].cc)) { pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc)); ret = -EFAULT; goto out_clean; } + data[thr].cr = acomp_request_alloc(data[thr].cc); + if (!data[thr].cr) { + pr_err("Could not allocate comp request\n"); + ret = -ENOMEM; + goto out_clean; + } + data[thr].thr = kthread_run(decompress_threadfn, &data[thr], "image_decompress/%u", thr); @@ -1507,8 +1515,8 @@ out_clean: for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) kthread_stop(data[thr].thr); - if (data[thr].cc) - crypto_free_comp(data[thr].cc); + acomp_request_free(data[thr].cr); + crypto_free_acomp(data[thr].cc); } vfree(data); } @@ -1570,8 +1578,8 @@ int swsusp_check(bool exclusive) BLK_OPEN_READ, holder, NULL); if (!IS_ERR(hib_resume_bdev_file)) { clear_page(swsusp_header); - error = hib_submit_io(REQ_OP_READ, swsusp_resume_block, - swsusp_header, NULL); + error = hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block, + swsusp_header); if (error) goto put; @@ -1579,9 +1587,9 @@ int swsusp_check(bool exclusive) memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); swsusp_header_flags = swsusp_header->flags; /* Reset swap signature now */ - error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, + error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC, swsusp_resume_block, - swsusp_header, NULL); + swsusp_header); } else { error = -EINVAL; } @@ -1630,13 +1638,12 @@ int swsusp_unmark(void) { int error; - hib_submit_io(REQ_OP_READ, swsusp_resume_block, - swsusp_header, NULL); + hib_submit_io_sync(REQ_OP_READ, swsusp_resume_block, swsusp_header); if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); - error = hib_submit_io(REQ_OP_WRITE | REQ_SYNC, + error = hib_submit_io_sync(REQ_OP_WRITE | REQ_SYNC, swsusp_resume_block, - swsusp_header, NULL); + swsusp_header); } else { pr_err("Cannot find swsusp signature!\n"); error = -ENODEV; diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index 52571dcad768..4e941999a53b 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -49,6 +49,9 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active) len += sysfs_emit_at(buf, len, "%s ", wl->name); } + if (len > 0) + --len; + len += sysfs_emit_at(buf, len, "\n"); mutex_unlock(&wakelocks_lock); diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index a91bdf802967..48a24e7b309d 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -64,6 +64,7 @@ struct dev_printk_info; extern struct printk_ringbuffer *prb; extern bool printk_kthreads_running; +extern bool debug_non_panic_cpus; __printf(4, 0) int vprintk_store(int facility, int level, diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 07668433644b..1eea80d0648e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2375,6 +2375,22 @@ void printk_legacy_allow_panic_sync(void) } } +bool __read_mostly debug_non_panic_cpus; + +#ifdef CONFIG_PRINTK_CALLER +static int __init debug_non_panic_cpus_setup(char *str) +{ + debug_non_panic_cpus = true; + pr_info("allow messages from non-panic CPUs in panic()\n"); + + return 0; +} +early_param("debug_non_panic_cpus", debug_non_panic_cpus_setup); +module_param(debug_non_panic_cpus, bool, 0644); +MODULE_PARM_DESC(debug_non_panic_cpus, + "allow messages from non-panic CPUs in panic()"); +#endif + asmlinkage int vprintk_emit(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) @@ -2391,7 +2407,9 @@ asmlinkage int vprintk_emit(int facility, int level, * non-panic CPUs are generating any messages, they will be * silently dropped. */ - if (other_cpu_in_panic() && !panic_triggering_all_cpu_backtrace) + if (other_cpu_in_panic() && + !debug_non_panic_cpus && + !panic_triggering_all_cpu_backtrace) return 0; printk_get_console_flush_type(&ft); @@ -2461,7 +2479,6 @@ asmlinkage __visible int _printk(const char *fmt, ...) } EXPORT_SYMBOL(_printk); -static bool pr_flush(int timeout_ms, bool reset_on_progress); static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); #else /* CONFIG_PRINTK */ @@ -2474,7 +2491,6 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre static u64 syslog_seq; -static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; } static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } #endif /* CONFIG_PRINTK */ @@ -2733,11 +2749,11 @@ module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc"); /** - * suspend_console - suspend the console subsystem + * console_suspend_all - suspend the console subsystem * * This disables printk() while we go into suspend states */ -void suspend_console(void) +void console_suspend_all(void) { struct console *con; @@ -2760,7 +2776,7 @@ void suspend_console(void) synchronize_srcu(&console_srcu); } -void resume_console(void) +void console_resume_all(void) { struct console_flush_type ft; struct console *con; @@ -3342,7 +3358,12 @@ void console_unblank(void) */ cookie = console_srcu_read_lock(); for_each_console_srcu(c) { - if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) { + short flags = console_srcu_read_flags(c); + + if (flags & CON_SUSPENDED) + continue; + + if ((flags & CON_ENABLED) && c->unblank) { found_unblank = true; break; } @@ -3379,7 +3400,12 @@ void console_unblank(void) cookie = console_srcu_read_lock(); for_each_console_srcu(c) { - if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) + short flags = console_srcu_read_flags(c); + + if (flags & CON_SUSPENDED) + continue; + + if ((flags & CON_ENABLED) && c->unblank) c->unblank(); } console_srcu_read_unlock(cookie); @@ -3497,10 +3523,10 @@ struct tty_driver *console_device(int *index) /* * Prevent further output on the passed console device so that (for example) - * serial drivers can disable console output before suspending a port, and can + * serial drivers can suspend console output before suspending a port, and can * re-enable output afterwards. */ -void console_stop(struct console *console) +void console_suspend(struct console *console) { __pr_flush(console, 1000, true); console_list_lock(); @@ -3515,9 +3541,9 @@ void console_stop(struct console *console) */ synchronize_srcu(&console_srcu); } -EXPORT_SYMBOL(console_stop); +EXPORT_SYMBOL(console_suspend); -void console_start(struct console *console) +void console_resume(struct console *console) { struct console_flush_type ft; bool is_nbcon; @@ -3542,7 +3568,7 @@ void console_start(struct console *console) __pr_flush(console, 1000, true); } -EXPORT_SYMBOL(console_start); +EXPORT_SYMBOL(console_resume); #ifdef CONFIG_PRINTK static int unregister_console_locked(struct console *console); @@ -4277,6 +4303,11 @@ void __init console_init(void) initcall_t call; initcall_entry_t *ce; +#ifdef CONFIG_NULL_TTY_DEFAULT_CONSOLE + if (!console_set_on_cmdline) + add_preferred_console("ttynull", 0, NULL); +#endif + /* Setup the default TTY line discipline. */ n_tty_init(); @@ -4466,7 +4497,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre * Context: Process context. May sleep while acquiring console lock. * Return: true if all usable printers are caught up. */ -static bool pr_flush(int timeout_ms, bool reset_on_progress) +bool pr_flush(int timeout_ms, bool reset_on_progress) { return __pr_flush(NULL, timeout_ms, reset_on_progress); } diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 88e8f3a61922..d9fb053cff67 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -2133,9 +2133,9 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, * there may be other finalized records beyond that * need to be printed for a panic situation. If this * is the panic CPU, skip this - * non-existent/non-finalized record unless it is - * at or beyond the head, in which case it is not - * possible to continue. + * non-existent/non-finalized record unless non-panic + * CPUs are still running and their debugging is + * explicitly enabled. * * Note that new messages printed on panic CPU are * finalized when we are here. The only exception @@ -2143,10 +2143,13 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, * But it would have the sequence number returned * by "prb_next_reserve_seq() - 1". */ - if (this_cpu_in_panic() && ((*seq + 1) < prb_next_reserve_seq(rb))) + if (this_cpu_in_panic() && + (!debug_non_panic_cpus || legacy_allow_panic_sync) && + ((*seq + 1) < prb_next_reserve_seq(rb))) { (*seq)++; - else + } else { return false; + } } } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index d5f89f9ef29f..75a84efad40f 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -921,7 +921,6 @@ ptrace_get_syscall_info_entry(struct task_struct *child, struct pt_regs *regs, unsigned long args[ARRAY_SIZE(info->entry.args)]; int i; - info->op = PTRACE_SYSCALL_INFO_ENTRY; info->entry.nr = syscall_get_nr(child, regs); syscall_get_arguments(child, regs, args); for (i = 0; i < ARRAY_SIZE(args); i++) @@ -943,10 +942,12 @@ ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs, * diverge significantly enough. */ ptrace_get_syscall_info_entry(child, regs, info); - info->op = PTRACE_SYSCALL_INFO_SECCOMP; info->seccomp.ret_data = child->ptrace_message; - /* ret_data is the last field in struct ptrace_syscall_info.seccomp */ + /* + * ret_data is the last non-reserved field + * in struct ptrace_syscall_info.seccomp + */ return offsetofend(struct ptrace_syscall_info, seccomp.ret_data); } @@ -954,7 +955,6 @@ static unsigned long ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, struct ptrace_syscall_info *info) { - info->op = PTRACE_SYSCALL_INFO_EXIT; info->exit.rval = syscall_get_error(child, regs); info->exit.is_error = !!info->exit.rval; if (!info->exit.is_error) @@ -965,19 +965,8 @@ ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, } static int -ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, - void __user *datavp) +ptrace_get_syscall_info_op(struct task_struct *child) { - struct pt_regs *regs = task_pt_regs(child); - struct ptrace_syscall_info info = { - .op = PTRACE_SYSCALL_INFO_NONE, - .arch = syscall_get_arch(child), - .instruction_pointer = instruction_pointer(regs), - .stack_pointer = user_stack_pointer(regs), - }; - unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry); - unsigned long write_size; - /* * This does not need lock_task_sighand() to access * child->last_siginfo because ptrace_freeze_traced() @@ -988,24 +977,160 @@ ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, case SIGTRAP | 0x80: switch (child->ptrace_message) { case PTRACE_EVENTMSG_SYSCALL_ENTRY: - actual_size = ptrace_get_syscall_info_entry(child, regs, - &info); - break; + return PTRACE_SYSCALL_INFO_ENTRY; case PTRACE_EVENTMSG_SYSCALL_EXIT: - actual_size = ptrace_get_syscall_info_exit(child, regs, - &info); - break; + return PTRACE_SYSCALL_INFO_EXIT; + default: + return PTRACE_SYSCALL_INFO_NONE; } - break; case SIGTRAP | (PTRACE_EVENT_SECCOMP << 8): - actual_size = ptrace_get_syscall_info_seccomp(child, regs, - &info); + return PTRACE_SYSCALL_INFO_SECCOMP; + default: + return PTRACE_SYSCALL_INFO_NONE; + } +} + +static int +ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, + void __user *datavp) +{ + struct pt_regs *regs = task_pt_regs(child); + struct ptrace_syscall_info info = { + .op = ptrace_get_syscall_info_op(child), + .arch = syscall_get_arch(child), + .instruction_pointer = instruction_pointer(regs), + .stack_pointer = user_stack_pointer(regs), + }; + unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry); + unsigned long write_size; + + switch (info.op) { + case PTRACE_SYSCALL_INFO_ENTRY: + actual_size = ptrace_get_syscall_info_entry(child, regs, &info); + break; + case PTRACE_SYSCALL_INFO_EXIT: + actual_size = ptrace_get_syscall_info_exit(child, regs, &info); + break; + case PTRACE_SYSCALL_INFO_SECCOMP: + actual_size = ptrace_get_syscall_info_seccomp(child, regs, &info); break; } write_size = min(actual_size, user_size); return copy_to_user(datavp, &info, write_size) ? -EFAULT : actual_size; } + +static int +ptrace_set_syscall_info_entry(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + unsigned long args[ARRAY_SIZE(info->entry.args)]; + int nr = info->entry.nr; + int i; + + /* + * Check that the syscall number specified in info->entry.nr + * is either a value of type "int" or a sign-extended value + * of type "int". + */ + if (nr != info->entry.nr) + return -ERANGE; + + for (i = 0; i < ARRAY_SIZE(args); i++) { + args[i] = info->entry.args[i]; + /* + * Check that the syscall argument specified in + * info->entry.args[i] is either a value of type + * "unsigned long" or a sign-extended value of type "long". + */ + if (args[i] != info->entry.args[i]) + return -ERANGE; + } + + syscall_set_nr(child, regs, nr); + /* + * If the syscall number is set to -1, setting syscall arguments is not + * just pointless, it would also clobber the syscall return value on + * those architectures that share the same register both for the first + * argument of syscall and its return value. + */ + if (nr != -1) + syscall_set_arguments(child, regs, args); + + return 0; +} + +static int +ptrace_set_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + /* + * info->entry is currently a subset of info->seccomp, + * info->seccomp.ret_data is currently ignored. + */ + return ptrace_set_syscall_info_entry(child, regs, info); +} + +static int +ptrace_set_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + long rval = info->exit.rval; + + /* + * Check that the return value specified in info->exit.rval + * is either a value of type "long" or a sign-extended value + * of type "long". + */ + if (rval != info->exit.rval) + return -ERANGE; + + if (info->exit.is_error) + syscall_set_return_value(child, regs, rval, 0); + else + syscall_set_return_value(child, regs, 0, rval); + + return 0; +} + +static int +ptrace_set_syscall_info(struct task_struct *child, unsigned long user_size, + const void __user *datavp) +{ + struct pt_regs *regs = task_pt_regs(child); + struct ptrace_syscall_info info; + + if (user_size < sizeof(info)) + return -EINVAL; + + /* + * The compatibility is tracked by info.op and info.flags: if user-space + * does not instruct us to use unknown extra bits from future versions + * of ptrace_syscall_info, we are not going to read them either. + */ + if (copy_from_user(&info, datavp, sizeof(info))) + return -EFAULT; + + /* Reserved for future use. */ + if (info.flags || info.reserved) + return -EINVAL; + + /* Changing the type of the system call stop is not supported yet. */ + if (ptrace_get_syscall_info_op(child) != info.op) + return -EINVAL; + + switch (info.op) { + case PTRACE_SYSCALL_INFO_ENTRY: + return ptrace_set_syscall_info_entry(child, regs, &info); + case PTRACE_SYSCALL_INFO_EXIT: + return ptrace_set_syscall_info_exit(child, regs, &info); + case PTRACE_SYSCALL_INFO_SECCOMP: + return ptrace_set_syscall_info_seccomp(child, regs, &info); + default: + /* Other types of system call stops are not supported yet. */ + return -EINVAL; + } +} #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ int ptrace_request(struct task_struct *child, long request, @@ -1224,6 +1349,10 @@ int ptrace_request(struct task_struct *child, long request, case PTRACE_GET_SYSCALL_INFO: ret = ptrace_get_syscall_info(child, addr, datavp); break; + + case PTRACE_SET_SYSCALL_INFO: + ret = ptrace_set_syscall_info(child, addr, datavp); + break; #endif case PTRACE_SECCOMP_GET_FILTER: diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index b9b6bc55185d..4d9b21f69eaa 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -18,7 +18,7 @@ config TREE_RCU config PREEMPT_RCU bool - default y if PREEMPTION + default y if (PREEMPT || PREEMPT_RT || PREEMPT_DYNAMIC) select TREE_RCU help This option selects the RCU implementation that is @@ -65,6 +65,19 @@ config TREE_SRCU help This option selects the full-fledged version of SRCU. +config FORCE_NEED_SRCU_NMI_SAFE + bool "Force selection of NEED_SRCU_NMI_SAFE" + depends on !TINY_SRCU + depends on RCU_EXPERT + depends on ARCH_HAS_NMI_SAFE_THIS_CPU_OPS + select NEED_SRCU_NMI_SAFE + default n + help + This option forces selection of the NEED_SRCU_NMI_SAFE + Kconfig option, allowing testing of srcu_read_lock_nmisafe() + and srcu_read_unlock_nmisafe() on architectures (like x86) + that select the ARCH_HAS_NMI_SAFE_THIS_CPU_OPS Kconfig option. + config NEED_SRCU_NMI_SAFE def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU @@ -91,7 +104,7 @@ config NEED_TASKS_RCU config TASKS_RCU bool - default NEED_TASKS_RCU && (PREEMPTION || PREEMPT_AUTO) + default NEED_TASKS_RCU && PREEMPTION select IRQ_WORK config FORCE_TASKS_RUDE_RCU @@ -323,21 +336,27 @@ config RCU_LAZY depends on RCU_NOCB_CPU default n help - To save power, batch RCU callbacks and flush after delay, memory - pressure, or callback list growing too big. + To save power, batch RCU callbacks and delay starting the + corresponding grace period for multiple seconds. The grace + period will be started after this delay, in case of memory + pressure, or if the corresponding CPU's callback list grows + too large. - Requires rcu_nocbs=all to be set. + These delays happen only on rcu_nocbs CPUs, that is, CPUs + whose callbacks have been offloaded. - Use rcutree.enable_rcu_lazy=0 to turn it off at boot time. + Use the rcutree.enable_rcu_lazy=0 kernel-boot parameter to + globally disable these delays. config RCU_LAZY_DEFAULT_OFF bool "Turn RCU lazy invocation off by default" depends on RCU_LAZY default n help - Allows building the kernel with CONFIG_RCU_LAZY=y yet keep it default - off. Boot time param rcutree.enable_rcu_lazy=1 can be used to switch - it back on. + Build the kernel with CONFIG_RCU_LAZY=y, but cause the kernel + to boot with these energy-efficiency delays disabled. Use the + rcutree.enable_rcu_lazy=0 kernel-boot parameter to override + the this option at boot time, thus re-enabling these delays. config RCU_DOUBLE_CHECK_CB_TIME bool "RCU callback-batch backup time check" diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 6af90510a1ca..12e4c64ebae1 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -54,7 +54,7 @@ config RCU_TORTURE_TEST Say N if you are unsure. config RCU_TORTURE_TEST_CHK_RDR_STATE - tristate "Check rcutorture reader state" + bool "Check rcutorture reader state" depends on RCU_TORTURE_TEST default n help @@ -70,7 +70,7 @@ config RCU_TORTURE_TEST_CHK_RDR_STATE Say N if you are unsure. config RCU_TORTURE_TEST_LOG_CPU - tristate "Log CPU for rcutorture failures" + bool "Log CPU for rcutorture failures" depends on RCU_TORTURE_TEST default n help @@ -84,6 +84,20 @@ config RCU_TORTURE_TEST_LOG_CPU Say Y here if you want CPU IDs logged. Say N if you are unsure. +config RCU_TORTURE_TEST_LOG_GP + bool "Log grace-period numbers for rcutorture failures" + depends on RCU_TORTURE_TEST + default n + help + This option causes rcutorture to decorate each entry of its + log of failure/close-call rcutorture reader segments with the + corresponding grace-period sequence numbers. This information + can be useful, but it does incur additional overhead, overhead + that can make both failures and close calls less probable. + + Say Y here if you want grace-period sequence numbers logged. + Say N if you are unsure. + config RCU_REF_SCALE_TEST tristate "Scalability tests for read-side synchronization (RCU and others)" depends on DEBUG_KERNEL diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index feb3ac1dc5d5..9cf01832a6c3 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -57,6 +57,9 @@ /* Low-order bit definition for polled grace-period APIs. */ #define RCU_GET_STATE_COMPLETED 0x1 +/* A complete grace period count */ +#define RCU_SEQ_GP (RCU_SEQ_STATE_MASK + 1) + extern int sysctl_sched_rt_runtime; /* @@ -157,12 +160,21 @@ static inline bool rcu_seq_done(unsigned long *sp, unsigned long s) * Given a snapshot from rcu_seq_snap(), determine whether or not a * full update-side operation has occurred, but do not allow the * (ULONG_MAX / 2) safety-factor/guard-band. + * + * The token returned by get_state_synchronize_rcu_full() is based on + * rcu_state.gp_seq but it is tested in poll_state_synchronize_rcu_full() + * against the root rnp->gp_seq. Since rcu_seq_start() is first called + * on rcu_state.gp_seq and only later reflected on the root rnp->gp_seq, + * it is possible that rcu_seq_snap(rcu_state.gp_seq) returns 2 full grace + * periods ahead of the root rnp->gp_seq. To prevent false-positives with the + * full polling API that a wrap around instantly completed the GP, when nothing + * like that happened, adjust for the 2 GPs in the ULONG_CMP_LT(). */ static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s) { unsigned long cur_s = READ_ONCE(*sp); - return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1)); + return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_GP)); } /* @@ -572,6 +584,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename, unsigned long c_old, unsigned long c); void rcu_gp_set_torture_wait(int duration); +void rcu_set_gpwrap_lag(unsigned long lag); +int rcu_get_gpwrap_count(int cpu); #else static inline void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq) { @@ -589,7 +603,11 @@ void do_trace_rcu_torture_read(const char *rcutorturename, do { } while (0) #endif static inline void rcu_gp_set_torture_wait(int duration) { } +static inline void rcu_set_gpwrap_lag(unsigned long lag) { } +static inline int rcu_get_gpwrap_count(int cpu) { return 0; } #endif +unsigned long long rcutorture_gather_gp_seqs(void); +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len); #ifdef CONFIG_TINY_SRCU @@ -611,8 +629,6 @@ void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags, static inline bool rcu_watching_zero_in_eqs(int cpu, int *vp) { return false; } static inline unsigned long rcu_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } -static inline unsigned long -srcu_batches_completed(struct srcu_struct *sp) { return 0; } static inline void rcu_force_quiescent_state(void) { } static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { return true; } static inline void show_rcu_gp_kthreads(void) { } @@ -624,7 +640,6 @@ static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { } bool rcu_watching_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); -unsigned long srcu_batches_completed(struct srcu_struct *sp); bool rcu_check_boost_fail(unsigned long gp_state, int *cpup); void show_rcu_gp_kthreads(void); int rcu_get_gp_kthreads_prio(void); @@ -636,6 +651,12 @@ void rcu_gp_slow_register(atomic_t *rgssp); void rcu_gp_slow_unregister(atomic_t *rgssp); #endif /* #else #ifdef CONFIG_TINY_RCU */ +#ifdef CONFIG_TINY_SRCU +static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) { return 0; } +#else // #ifdef CONFIG_TINY_SRCU +unsigned long srcu_batches_completed(struct srcu_struct *sp); +#endif // #else // #ifdef CONFIG_TINY_SRCU + #ifdef CONFIG_RCU_NOCB_CPU void rcu_bind_current_to_nocb(void); #else diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 0f3059b1b80d..b521d0455992 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -762,7 +762,7 @@ kfree_scale_thread(void *arg) } for (i = 0; i < kfree_alloc_num; i++) { - alloc_ptr = kmalloc(kfree_mult * sizeof(struct kfree_obj), GFP_KERNEL); + alloc_ptr = kcalloc(kfree_mult, sizeof(struct kfree_obj), GFP_KERNEL); if (!alloc_ptr) return -ENOMEM; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d26fb1d33ed9..70ec0f21abc3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -115,6 +115,10 @@ torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing"); torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable"); +torture_param(bool, gpwrap_lag, true, "Enable grace-period wrap lag testing"); +torture_param(int, gpwrap_lag_gps, 8, "Value to set for set_gpwrap_lag during an active testing period."); +torture_param(int, gpwrap_lag_cycle_mins, 30, "Total cycle duration for gpwrap lag testing (in minutes)"); +torture_param(int, gpwrap_lag_active_mins, 5, "Duration for which gpwrap lag is active within each cycle (in minutes)"); torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable"); torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)"); torture_param(int, preempt_duration, 0, "Preemption duration (ms), zero to disable"); @@ -135,6 +139,7 @@ torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s torture_param(int, stutter, 5, "Number of seconds to run/halt test"); torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds."); +torture_param(int, test_boost_holdoff, 0, "Holdoff time from rcutorture start, seconds."); torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); torture_param(int, test_nmis, 0, "End-test NMI tests, 0 to disable."); torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); @@ -147,6 +152,7 @@ MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)"); static int nrealnocbers; static int nrealreaders; +static int nrealfakewriters; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; @@ -272,6 +278,9 @@ struct rt_read_seg { bool rt_preempted; int rt_cpu; int rt_end_cpu; + unsigned long long rt_gp_seq; + unsigned long long rt_gp_seq_end; + u64 rt_ts; }; static int err_segs_recorded; static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS]; @@ -406,6 +415,10 @@ struct rcu_torture_ops { void (*gp_slow_register)(atomic_t *rgssp); void (*gp_slow_unregister)(atomic_t *rgssp); bool (*reader_blocked)(void); + unsigned long long (*gather_gp_seqs)(void); + void (*format_gp_seqs)(unsigned long long seqs, char *cp, size_t len); + void (*set_gpwrap_lag)(unsigned long lag); + int (*get_gpwrap_count)(int cpu); long cbflood_max; int irq_capable; int can_boost; @@ -610,6 +623,10 @@ static struct rcu_torture_ops rcu_ops = { .reader_blocked = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU) ? has_rcu_reader_blocked : NULL, + .gather_gp_seqs = rcutorture_gather_gp_seqs, + .format_gp_seqs = rcutorture_format_gp_seqs, + .set_gpwrap_lag = rcu_set_gpwrap_lag, + .get_gpwrap_count = rcu_get_gpwrap_count, .irq_capable = 1, .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), .extendables = RCUTORTURE_MAX_EXTEND, @@ -655,6 +672,8 @@ static struct rcu_torture_ops rcu_busted_ops = { .sync = synchronize_rcu_busted, .exp_sync = synchronize_rcu_busted, .call = call_rcu_busted, + .gather_gp_seqs = rcutorture_gather_gp_seqs, + .format_gp_seqs = rcutorture_format_gp_seqs, .irq_capable = 1, .extendables = RCUTORTURE_MAX_EXTEND, .name = "busted" @@ -677,8 +696,11 @@ static void srcu_get_gp_data(int *flags, unsigned long *gp_seq) static int srcu_torture_read_lock(void) { int idx; + struct srcu_ctr __percpu *scp; int ret = 0; + WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL); + if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) { idx = srcu_read_lock(srcu_ctlp); WARN_ON_ONCE(idx & ~0x1); @@ -694,6 +716,12 @@ static int srcu_torture_read_lock(void) WARN_ON_ONCE(idx & ~0x1); ret += idx << 2; } + if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + scp = srcu_read_lock_fast(srcu_ctlp); + idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx << 3; + } return ret; } @@ -719,6 +747,8 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) static void srcu_torture_read_unlock(int idx) { WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); + if (reader_flavor & SRCU_READ_FLAVOR_FAST) + srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); if (reader_flavor & SRCU_READ_FLAVOR_LITE) srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2); if (reader_flavor & SRCU_READ_FLAVOR_NMI) @@ -791,6 +821,7 @@ static struct rcu_torture_ops srcu_ops = { .readunlock = srcu_torture_read_unlock, .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, + .gp_diff = rcu_seq_diff, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, @@ -834,6 +865,7 @@ static struct rcu_torture_ops srcud_ops = { .readunlock = srcu_torture_read_unlock, .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, + .gp_diff = rcu_seq_diff, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, @@ -1148,8 +1180,19 @@ static int rcu_torture_boost(void *arg) unsigned long gp_state; unsigned long gp_state_time; unsigned long oldstarttime; + unsigned long booststarttime = get_torture_init_jiffies() + test_boost_holdoff * HZ; - VERBOSE_TOROUT_STRING("rcu_torture_boost started"); + if (test_boost_holdoff <= 0 || time_after(jiffies, booststarttime)) { + VERBOSE_TOROUT_STRING("rcu_torture_boost started"); + } else { + VERBOSE_TOROUT_STRING("rcu_torture_boost started holdoff period"); + while (time_before(jiffies, booststarttime)) { + schedule_timeout_idle(HZ); + if (kthread_should_stop()) + goto cleanup; + } + VERBOSE_TOROUT_STRING("rcu_torture_boost finished holdoff period"); + } /* Set real-time priority. */ sched_set_fifo_low(current); @@ -1225,6 +1268,7 @@ checkwait: if (stutter_wait("rcu_torture_boost")) sched_set_fifo_low(current); } while (!torture_must_stop()); +cleanup: /* Clean up and exit. */ while (!kthread_should_stop()) { torture_shutdown_absorb("rcu_torture_boost"); @@ -1728,7 +1772,7 @@ rcu_torture_fakewriter(void *arg) do { torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand); if (cur_ops->cb_barrier != NULL && - torture_random(&rand) % (nfakewriters * 8) == 0) { + torture_random(&rand) % (nrealfakewriters * 8) == 0) { cur_ops->cb_barrier(); } else { switch (synctype[torture_random(&rand) % nsynctypes]) { @@ -1873,6 +1917,8 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, #define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x preempt_count() %#x\n", __func__, s, curstate, new, old, preempt_count() static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, bool insoftirq) { + int mask; + if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE)) return; @@ -1899,11 +1945,27 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, WARN_ONCE(cur_ops->extendables && !(curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) && (preempt_count() & SOFTIRQ_MASK), ROEC_ARGS); - WARN_ONCE(cur_ops->extendables && - !(curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) && + + /* + * non-preemptible RCU in a preemptible kernel uses preempt_disable() + * as rcu_read_lock(). + */ + mask = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED; + if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) + mask |= RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2; + + WARN_ONCE(cur_ops->extendables && !(curstate & mask) && (preempt_count() & PREEMPT_MASK), ROEC_ARGS); - WARN_ONCE(cur_ops->readlock_nesting && - !(curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) && + + /* + * non-preemptible RCU in a preemptible kernel uses "preempt_count() & + * PREEMPT_MASK" as ->readlock_nesting(). + */ + mask = RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2; + if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) + mask |= RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED; + + WARN_ONCE(cur_ops->readlock_nesting && !(curstate & mask) && cur_ops->readlock_nesting() > 0, ROEC_ARGS); } @@ -1965,6 +2027,13 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, rtrsp[-1].rt_preempted = cur_ops->reader_blocked(); } } + // Sample grace-period sequence number, as good a place as any. + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && cur_ops->gather_gp_seqs) { + rtrsp->rt_gp_seq = cur_ops->gather_gp_seqs(); + rtrsp->rt_ts = ktime_get_mono_fast_ns(); + if (!first) + rtrsp[-1].rt_gp_seq_end = rtrsp->rt_gp_seq; + } /* * Next, remove old protection, in decreasing order of strength @@ -2103,53 +2172,70 @@ rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_sta return &rtrsp[j]; } -/* - * Do one read-side critical section, returning false if there was - * no data to read. Can be invoked both from process context and - * from a timer handler. - */ -static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) -{ - bool checkpolling = !(torture_random(trsp) & 0xfff); +struct rcu_torture_one_read_state { + bool checkpolling; unsigned long cookie; struct rcu_gp_oldstate cookie_full; - int i; unsigned long started; - unsigned long completed; - int newstate; struct rcu_torture *p; - int pipe_count; - bool preempted = false; - int readstate = 0; - struct rt_read_seg rtseg[RCUTORTURE_RDR_MAX_SEGS] = { { 0 } }; - struct rt_read_seg *rtrsp = &rtseg[0]; - struct rt_read_seg *rtrsp1; + int readstate; + struct rt_read_seg rtseg[RCUTORTURE_RDR_MAX_SEGS]; + struct rt_read_seg *rtrsp; unsigned long long ts; +}; - WARN_ON_ONCE(!rcu_is_watching()); - newstate = rcutorture_extend_mask(readstate, trsp); - rcutorture_one_extend(&readstate, newstate, myid < 0, trsp, rtrsp++); - if (checkpolling) { +static void init_rcu_torture_one_read_state(struct rcu_torture_one_read_state *rtorsp, + struct torture_random_state *trsp) +{ + memset(rtorsp, 0, sizeof(*rtorsp)); + rtorsp->checkpolling = !(torture_random(trsp) & 0xfff); + rtorsp->rtrsp = &rtorsp->rtseg[0]; +} + +/* + * Set up the first segment of a series of overlapping read-side + * critical sections. The caller must have actually initiated the + * outermost read-side critical section. + */ +static bool rcu_torture_one_read_start(struct rcu_torture_one_read_state *rtorsp, + struct torture_random_state *trsp, long myid) +{ + if (rtorsp->checkpolling) { if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - cookie = cur_ops->get_gp_state(); + rtorsp->cookie = cur_ops->get_gp_state(); if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) - cur_ops->get_gp_state_full(&cookie_full); + cur_ops->get_gp_state_full(&rtorsp->cookie_full); } - started = cur_ops->get_gp_seq(); - ts = rcu_trace_clock_local(); - p = rcu_dereference_check(rcu_torture_current, + rtorsp->started = cur_ops->get_gp_seq(); + rtorsp->ts = rcu_trace_clock_local(); + rtorsp->p = rcu_dereference_check(rcu_torture_current, !cur_ops->readlock_held || cur_ops->readlock_held()); - if (p == NULL) { + if (rtorsp->p == NULL) { /* Wait for rcu_torture_writer to get underway */ - rcutorture_one_extend(&readstate, 0, myid < 0, trsp, rtrsp); + rcutorture_one_extend(&rtorsp->readstate, 0, myid < 0, trsp, rtorsp->rtrsp); return false; } - if (p->rtort_mbtest == 0) + if (rtorsp->p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); - rcu_torture_reader_do_mbchk(myid, p, trsp); - rtrsp = rcutorture_loop_extend(&readstate, myid < 0, trsp, rtrsp); + rcu_torture_reader_do_mbchk(myid, rtorsp->p, trsp); + return true; +} + +/* + * Complete the last segment of a series of overlapping read-side + * critical sections and check for errors. + */ +static void rcu_torture_one_read_end(struct rcu_torture_one_read_state *rtorsp, + struct torture_random_state *trsp, long myid) +{ + int i; + unsigned long completed; + int pipe_count; + bool preempted = false; + struct rt_read_seg *rtrsp1; + preempt_disable(); - pipe_count = READ_ONCE(p->rtort_pipe_count); + pipe_count = READ_ONCE(rtorsp->p->rtort_pipe_count); if (pipe_count > RCU_TORTURE_PIPE_LEN) { // Should not happen in a correct RCU implementation, // happens quite often for torture_type=busted. @@ -2157,28 +2243,28 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) } completed = cur_ops->get_gp_seq(); if (pipe_count > 1) { - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, - ts, started, completed); + do_trace_rcu_torture_read(cur_ops->name, &rtorsp->p->rtort_rcu, + rtorsp->ts, rtorsp->started, completed); rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = rcutorture_seq_diff(completed, started); + completed = rcutorture_seq_diff(completed, rtorsp->started); if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; } __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); - if (checkpolling) { + if (rtorsp->checkpolling) { if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - WARN_ONCE(cur_ops->poll_gp_state(cookie), + WARN_ONCE(cur_ops->poll_gp_state(rtorsp->cookie), "%s: Cookie check 2 failed %s(%d) %lu->%lu\n", __func__, rcu_torture_writer_state_getname(), rcu_torture_writer_state, - cookie, cur_ops->get_gp_state()); + rtorsp->cookie, cur_ops->get_gp_state()); if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) - WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full), + WARN_ONCE(cur_ops->poll_gp_state_full(&rtorsp->cookie_full), "%s: Cookie check 6 failed %s(%d) online %*pbl\n", __func__, rcu_torture_writer_state_getname(), @@ -2187,21 +2273,42 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) } if (cur_ops->reader_blocked) preempted = cur_ops->reader_blocked(); - rcutorture_one_extend(&readstate, 0, myid < 0, trsp, rtrsp); - WARN_ON_ONCE(readstate); + rcutorture_one_extend(&rtorsp->readstate, 0, myid < 0, trsp, rtorsp->rtrsp); + WARN_ON_ONCE(rtorsp->readstate); // This next splat is expected behavior if leakpointer, especially // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels. - WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1); + WARN_ON_ONCE(leakpointer && READ_ONCE(rtorsp->p->rtort_pipe_count) > 1); /* If error or close call, record the sequence of reader protections. */ if ((pipe_count > 1 || completed > 1) && !xchg(&err_segs_recorded, 1)) { i = 0; - for (rtrsp1 = &rtseg[0]; rtrsp1 < rtrsp; rtrsp1++) + for (rtrsp1 = &rtorsp->rtseg[0]; rtrsp1 < rtorsp->rtrsp; rtrsp1++) err_segs[i++] = *rtrsp1; rt_read_nsegs = i; rt_read_preempted = preempted; } +} +/* + * Do one read-side critical section, returning false if there was + * no data to read. Can be invoked both from process context and + * from a timer handler. + */ +static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) +{ + int newstate; + struct rcu_torture_one_read_state rtors; + + WARN_ON_ONCE(!rcu_is_watching()); + init_rcu_torture_one_read_state(&rtors, trsp); + newstate = rcutorture_extend_mask(rtors.readstate, trsp); + rcutorture_one_extend(&rtors.readstate, newstate, myid < 0, trsp, rtors.rtrsp++); + if (!rcu_torture_one_read_start(&rtors, trsp, myid)) { + rcutorture_one_extend(&rtors.readstate, 0, myid < 0, trsp, rtors.rtrsp); + return false; + } + rtors.rtrsp = rcutorture_loop_extend(&rtors.readstate, myid < 0, trsp, rtors.rtrsp); + rcu_torture_one_read_end(&rtors, trsp, myid); return true; } @@ -2246,7 +2353,7 @@ rcu_torture_reader(void *arg) set_user_nice(current, MAX_NICE); if (irqreader && cur_ops->irq_capable) timer_setup_on_stack(&t, rcu_torture_timer, 0); - tick_dep_set_task(current, TICK_DEP_BIT_RCU); + tick_dep_set_task(current, TICK_DEP_BIT_RCU); // CPU bound, so need tick. do { if (irqreader && cur_ops->irq_capable) { if (!timer_pending(&t)) @@ -2263,8 +2370,8 @@ rcu_torture_reader(void *arg) stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) { - del_timer_sync(&t); - destroy_timer_on_stack(&t); + timer_delete_sync(&t); + timer_destroy_on_stack(&t); } tick_dep_clear_task(current, TICK_DEP_BIT_RCU); torture_kthread_stopping("rcu_torture_reader"); @@ -2333,6 +2440,7 @@ rcu_torture_stats_print(void) int i; long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + long n_gpwraps = 0; struct rcu_torture *rtcp; static unsigned long rtcv_snap = ULONG_MAX; static bool splatted; @@ -2343,6 +2451,8 @@ rcu_torture_stats_print(void) pipesummary[i] += READ_ONCE(per_cpu(rcu_torture_count, cpu)[i]); batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]); } + if (cur_ops->get_gpwrap_count) + n_gpwraps += cur_ops->get_gpwrap_count(cpu); } for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) { if (pipesummary[i] != 0) @@ -2374,8 +2484,9 @@ rcu_torture_stats_print(void) data_race(n_barrier_attempts), data_race(n_rcu_torture_barrier_error)); pr_cont("read-exits: %ld ", data_race(n_read_exits)); // Statistic. - pr_cont("nocb-toggles: %ld:%ld\n", + pr_cont("nocb-toggles: %ld:%ld ", atomic_long_read(&n_nocb_offload), atomic_long_read(&n_nocb_deoffload)); + pr_cont("gpwraps: %ld\n", n_gpwraps); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) || @@ -2512,7 +2623,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "shuffle_interval=%d stutter=%d irqreader=%d " "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " "test_boost=%d/%d test_boost_interval=%d " - "test_boost_duration=%d shutdown_secs=%d " + "test_boost_duration=%d test_boost_holdoff=%d shutdown_secs=%d " "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " "stall_cpu_block=%d stall_cpu_repeat=%d " "n_barrier_cbs=%d " @@ -2522,11 +2633,11 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "nocbs_nthreads=%d nocbs_toggle=%d " "test_nmis=%d " "preempt_duration=%d preempt_interval=%d\n", - torture_type, tag, nrealreaders, nfakewriters, + torture_type, tag, nrealreaders, nrealfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, test_boost, cur_ops->can_boost, - test_boost_interval, test_boost_duration, shutdown_secs, + test_boost_interval, test_boost_duration, test_boost_holdoff, shutdown_secs, stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, stall_cpu_block, stall_cpu_repeat, n_barrier_cbs, @@ -2975,7 +3086,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) cver = READ_ONCE(rcu_torture_current_version); gps = cur_ops->get_gp_seq(); rfp->rcu_launder_gp_seq_start = gps; - tick_dep_set_task(current, TICK_DEP_BIT_RCU); + tick_dep_set_task(current, TICK_DEP_BIT_RCU); // CPU bound, so need tick. while (time_before(jiffies, stopat) && !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { @@ -3546,6 +3657,57 @@ static int rcu_torture_preempt(void *unused) static enum cpuhp_state rcutor_hp; +static struct hrtimer gpwrap_lag_timer; +static bool gpwrap_lag_active; + +/* Timer handler for toggling RCU grace-period sequence overflow test lag value */ +static enum hrtimer_restart rcu_gpwrap_lag_timer(struct hrtimer *timer) +{ + ktime_t next_delay; + + if (gpwrap_lag_active) { + pr_alert("rcu-torture: Disabling gpwrap lag (value=0)\n"); + cur_ops->set_gpwrap_lag(0); + gpwrap_lag_active = false; + next_delay = ktime_set((gpwrap_lag_cycle_mins - gpwrap_lag_active_mins) * 60, 0); + } else { + pr_alert("rcu-torture: Enabling gpwrap lag (value=%d)\n", gpwrap_lag_gps); + cur_ops->set_gpwrap_lag(gpwrap_lag_gps); + gpwrap_lag_active = true; + next_delay = ktime_set(gpwrap_lag_active_mins * 60, 0); + } + + if (torture_must_stop_irq()) + return HRTIMER_NORESTART; + + hrtimer_forward_now(timer, next_delay); + return HRTIMER_RESTART; +} + +static int rcu_gpwrap_lag_init(void) +{ + if (!gpwrap_lag) + return 0; + + if (gpwrap_lag_cycle_mins <= 0 || gpwrap_lag_active_mins <= 0) { + pr_alert("rcu-torture: lag timing parameters must be positive\n"); + return -EINVAL; + } + + hrtimer_setup(&gpwrap_lag_timer, rcu_gpwrap_lag_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + gpwrap_lag_active = false; + hrtimer_start(&gpwrap_lag_timer, + ktime_set((gpwrap_lag_cycle_mins - gpwrap_lag_active_mins) * 60, 0), HRTIMER_MODE_REL); + + return 0; +} + +static void rcu_gpwrap_lag_cleanup(void) +{ + hrtimer_cancel(&gpwrap_lag_timer); + cur_ops->set_gpwrap_lag(0); + gpwrap_lag_active = false; +} static void rcu_torture_cleanup(void) { @@ -3553,6 +3715,7 @@ rcu_torture_cleanup(void) int flags = 0; unsigned long gp_seq = 0; int i; + int j; if (torture_cleanup_begin()) { if (cur_ops->cb_barrier != NULL) { @@ -3597,7 +3760,7 @@ rcu_torture_cleanup(void) rcu_torture_reader_mbchk = NULL; if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) + for (i = 0; i < nrealfakewriters; i++) torture_stop_kthread(rcu_torture_fakewriter, fakewriter_tasks[i]); kfree(fakewriter_tasks); @@ -3635,7 +3798,11 @@ rcu_torture_cleanup(void) pr_alert("\t: No segments recorded!!!\n"); firsttime = 1; for (i = 0; i < rt_read_nsegs; i++) { - pr_alert("\t%d: %#4x", i, err_segs[i].rt_readstate); + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP)) + pr_alert("\t%lluus ", div64_u64(err_segs[i].rt_ts, 1000ULL)); + else + pr_alert("\t"); + pr_cont("%d: %#4x", i, err_segs[i].rt_readstate); if (err_segs[i].rt_delay_jiffies != 0) { pr_cont("%s%ldjiffies", firsttime ? "" : "+", err_segs[i].rt_delay_jiffies); @@ -3648,6 +3815,27 @@ rcu_torture_cleanup(void) else pr_cont(" ..."); } + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && + cur_ops->gather_gp_seqs && cur_ops->format_gp_seqs) { + char buf1[20+1]; + char buf2[20+1]; + char sepchar = '-'; + + cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq, + buf1, ARRAY_SIZE(buf1)); + cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq_end, + buf2, ARRAY_SIZE(buf2)); + if (err_segs[i].rt_gp_seq == err_segs[i].rt_gp_seq_end) { + if (buf2[0]) { + for (j = 0; buf2[j]; j++) + buf2[j] = '.'; + if (j) + buf2[j - 1] = ' '; + } + sepchar = ' '; + } + pr_cont(" %s%c%s", buf1, sepchar, buf2); + } if (err_segs[i].rt_delay_ms != 0) { pr_cont(" %s%ldms", firsttime ? "" : "+", err_segs[i].rt_delay_ms); @@ -3689,6 +3877,9 @@ rcu_torture_cleanup(void) torture_cleanup_end(); if (cur_ops->gp_slow_unregister) cur_ops->gp_slow_unregister(NULL); + + if (gpwrap_lag && cur_ops->set_gpwrap_lag) + rcu_gpwrap_lag_cleanup(); } static void rcu_torture_leak_cb(struct rcu_head *rhp) @@ -3994,6 +4185,14 @@ rcu_torture_init(void) rcu_torture_init_srcu_lockdep(); + if (nfakewriters >= 0) { + nrealfakewriters = nfakewriters; + } else { + nrealfakewriters = num_online_cpus() - 2 - nfakewriters; + if (nrealfakewriters <= 0) + nrealfakewriters = 1; + } + if (nreaders >= 0) { nrealreaders = nreaders; } else { @@ -4050,8 +4249,9 @@ rcu_torture_init(void) writer_task); if (torture_init_error(firsterr)) goto unwind; - if (nfakewriters > 0) { - fakewriter_tasks = kcalloc(nfakewriters, + + if (nrealfakewriters > 0) { + fakewriter_tasks = kcalloc(nrealfakewriters, sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { @@ -4060,7 +4260,7 @@ rcu_torture_init(void) goto unwind; } } - for (i = 0; i < nfakewriters; i++) { + for (i = 0; i < nrealfakewriters; i++) { firsterr = torture_create_kthread(rcu_torture_fakewriter, NULL, fakewriter_tasks[i]); if (torture_init_error(firsterr)) @@ -4176,9 +4376,17 @@ rcu_torture_init(void) } if (object_debug) rcu_test_debug_objects(); - torture_init_end(); + if (cur_ops->gp_slow_register && !WARN_ON_ONCE(!cur_ops->gp_slow_unregister)) cur_ops->gp_slow_register(&rcu_fwd_cb_nodelay); + + if (gpwrap_lag && cur_ops->set_gpwrap_lag) { + firsterr = rcu_gpwrap_lag_init(); + if (torture_init_error(firsterr)) + goto unwind; + } + + torture_init_end(); return 0; unwind: diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 1b47376acdc4..f11a7c2af778 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -216,6 +216,36 @@ static const struct ref_scale_ops srcu_ops = { .name = "srcu" }; +static void srcu_fast_ref_scale_read_section(const int nloops) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast(srcu_ctlp); + srcu_read_unlock_fast(srcu_ctlp, scp); + } +} + +static void srcu_fast_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock_fast(srcu_ctlp, scp); + } +} + +static const struct ref_scale_ops srcu_fast_ops = { + .init = rcu_sync_scale_init, + .readsection = srcu_fast_ref_scale_read_section, + .delaysection = srcu_fast_ref_scale_delay_section, + .name = "srcu-fast" +}; + static void srcu_lite_ref_scale_read_section(const int nloops) { int i; @@ -1163,7 +1193,7 @@ ref_scale_init(void) long i; int firsterr = 0; static const struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS + &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &sched_clock_ops, &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 4dcbf8aa80ff..6e9fe2ce1075 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -20,7 +20,11 @@ #include "rcu_segcblist.h" #include "rcu.h" +#ifndef CONFIG_TREE_RCU int rcu_scheduler_active __read_mostly; +#else // #ifndef CONFIG_TREE_RCU +extern int rcu_scheduler_active; +#endif // #else // #ifndef CONFIG_TREE_RCU static LIST_HEAD(srcu_boot_list); static bool srcu_init_done; @@ -98,7 +102,7 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { int newval; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); preempt_enable(); @@ -120,7 +124,7 @@ void srcu_drive_gp(struct work_struct *wp) struct srcu_struct *ssp; ssp = container_of(wp, struct srcu_struct, srcu_work); - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) { preempt_enable(); return; /* Already running or nothing to do. */ @@ -138,7 +142,7 @@ void srcu_drive_gp(struct work_struct *wp) WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ preempt_enable(); swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx])); - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); preempt_enable(); @@ -159,7 +163,7 @@ void srcu_drive_gp(struct work_struct *wp) * at interrupt level, but the ->srcu_gp_running checks will * straighten that out. */ - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY WRITE_ONCE(ssp->srcu_gp_running, false); idx = ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)); preempt_enable(); @@ -172,7 +176,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp) { unsigned long cookie; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY cookie = get_state_synchronize_srcu(ssp); if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) { preempt_enable(); @@ -199,7 +203,7 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rhp->func = func; rhp->next = NULL; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY local_irq_save(flags); *ssp->srcu_cb_tail = rhp; ssp->srcu_cb_tail = &rhp->next; @@ -261,7 +265,7 @@ unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp) { unsigned long ret; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY ret = get_state_synchronize_srcu(ssp); srcu_gp_start_if_needed(ssp); preempt_enable(); @@ -282,11 +286,13 @@ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) } EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); +#ifndef CONFIG_TREE_RCU /* Lockdep diagnostics. */ void __init rcu_scheduler_starting(void) { rcu_scheduler_active = RCU_SCHEDULER_RUNNING; } +#endif // #ifndef CONFIG_TREE_RCU /* * Queue work for srcu_struct structures with early boot callbacks. diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index b83c74c4dcc0..48047260697e 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -116,8 +116,9 @@ do { \ /* * Initialize SRCU per-CPU data. Note that statically allocated * srcu_struct structures might already have srcu_read_lock() and - * srcu_read_unlock() running against them. So if the is_static parameter - * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. + * srcu_read_unlock() running against them. So if the is_static + * parameter is set, don't initialize ->srcu_ctrs[].srcu_locks and + * ->srcu_ctrs[].srcu_unlocks. */ static void init_srcu_struct_data(struct srcu_struct *ssp) { @@ -128,8 +129,6 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) * Initialize the per-CPU srcu_data array, which feeds into the * leaves of the srcu_node tree. */ - BUILD_BUG_ON(ARRAY_SIZE(sdp->srcu_lock_count) != - ARRAY_SIZE(sdp->srcu_unlock_count)); for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); @@ -247,15 +246,16 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->srcu_sup->node = NULL; mutex_init(&ssp->srcu_sup->srcu_cb_mutex); mutex_init(&ssp->srcu_sup->srcu_gp_mutex); - ssp->srcu_idx = 0; ssp->srcu_sup->srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL; ssp->srcu_sup->srcu_barrier_seq = 0; mutex_init(&ssp->srcu_sup->srcu_barrier_mutex); atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0); INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu); ssp->srcu_sup->sda_is_static = is_static; - if (!is_static) + if (!is_static) { ssp->sda = alloc_percpu(struct srcu_data); + ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0]; + } if (!ssp->sda) goto err_free_sup; init_srcu_struct_data(ssp); @@ -429,10 +429,10 @@ static bool srcu_gp_is_expedited(struct srcu_struct *ssp) } /* - * Computes approximate total of the readers' ->srcu_lock_count[] values - * for the rank of per-CPU counters specified by idx, and returns true if - * the caller did the proper barrier (gp), and if the count of the locks - * matches that of the unlocks passed in. + * Computes approximate total of the readers' ->srcu_ctrs[].srcu_locks + * values for the rank of per-CPU counters specified by idx, and returns + * true if the caller did the proper barrier (gp), and if the count of + * the locks matches that of the unlocks passed in. */ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, unsigned long unlocks) { @@ -443,20 +443,20 @@ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, uns for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&sdp->srcu_lock_count[idx]); + sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks); if (IS_ENABLED(CONFIG_PROVE_RCU)) mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), "Mixed reader flavors for srcu_struct at %ps.\n", ssp); - if (mask & SRCU_READ_FLAVOR_LITE && !gp) + if (mask & SRCU_READ_FLAVOR_SLOWGP && !gp) return false; return sum == unlocks; } /* - * Returns approximate total of the readers' ->srcu_unlock_count[] values - * for the rank of per-CPU counters specified by idx. + * Returns approximate total of the readers' ->srcu_ctrs[].srcu_unlocks + * values for the rank of per-CPU counters specified by idx. */ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm) { @@ -467,7 +467,7 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, u for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&sdp->srcu_unlock_count[idx]); + sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks); mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), @@ -487,7 +487,7 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) unsigned long unlocks; unlocks = srcu_readers_unlock_idx(ssp, idx, &rdm); - did_gp = !!(rdm & SRCU_READ_FLAVOR_LITE); + did_gp = !!(rdm & SRCU_READ_FLAVOR_SLOWGP); /* * Make sure that a lock is always counted if the corresponding @@ -509,48 +509,49 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) * If the locks are the same as the unlocks, then there must have * been no readers on this index at some point in this function. * But there might be more readers, as a task might have read - * the current ->srcu_idx but not yet have incremented its CPU's - * ->srcu_lock_count[idx] counter. In fact, it is possible + * the current ->srcu_ctrp but not yet have incremented its CPU's + * ->srcu_ctrs[idx].srcu_locks counter. In fact, it is possible * that most of the tasks have been preempted between fetching - * ->srcu_idx and incrementing ->srcu_lock_count[idx]. And there - * could be almost (ULONG_MAX / sizeof(struct task_struct)) tasks - * in a system whose address space was fully populated with memory. - * Call this quantity Nt. + * ->srcu_ctrp and incrementing ->srcu_ctrs[idx].srcu_locks. And + * there could be almost (ULONG_MAX / sizeof(struct task_struct)) + * tasks in a system whose address space was fully populated + * with memory. Call this quantity Nt. * - * So suppose that the updater is preempted at this point in the - * code for a long time. That now-preempted updater has already - * flipped ->srcu_idx (possibly during the preceding grace period), - * done an smp_mb() (again, possibly during the preceding grace - * period), and summed up the ->srcu_unlock_count[idx] counters. - * How many times can a given one of the aforementioned Nt tasks - * increment the old ->srcu_idx value's ->srcu_lock_count[idx] - * counter, in the absence of nesting? + * So suppose that the updater is preempted at this + * point in the code for a long time. That now-preempted + * updater has already flipped ->srcu_ctrp (possibly during + * the preceding grace period), done an smp_mb() (again, + * possibly during the preceding grace period), and summed up + * the ->srcu_ctrs[idx].srcu_unlocks counters. How many times + * can a given one of the aforementioned Nt tasks increment the + * old ->srcu_ctrp value's ->srcu_ctrs[idx].srcu_locks counter, + * in the absence of nesting? * * It can clearly do so once, given that it has already fetched - * the old value of ->srcu_idx and is just about to use that value - * to index its increment of ->srcu_lock_count[idx]. But as soon as - * it leaves that SRCU read-side critical section, it will increment - * ->srcu_unlock_count[idx], which must follow the updater's above - * read from that same value. Thus, as soon the reading task does - * an smp_mb() and a later fetch from ->srcu_idx, that task will be - * guaranteed to get the new index. Except that the increment of - * ->srcu_unlock_count[idx] in __srcu_read_unlock() is after the - * smp_mb(), and the fetch from ->srcu_idx in __srcu_read_lock() - * is before the smp_mb(). Thus, that task might not see the new - * value of ->srcu_idx until the -second- __srcu_read_lock(), - * which in turn means that this task might well increment - * ->srcu_lock_count[idx] for the old value of ->srcu_idx twice, - * not just once. + * the old value of ->srcu_ctrp and is just about to use that + * value to index its increment of ->srcu_ctrs[idx].srcu_locks. + * But as soon as it leaves that SRCU read-side critical section, + * it will increment ->srcu_ctrs[idx].srcu_unlocks, which must + * follow the updater's above read from that same value. Thus, + as soon the reading task does an smp_mb() and a later fetch from + * ->srcu_ctrp, that task will be guaranteed to get the new index. + * Except that the increment of ->srcu_ctrs[idx].srcu_unlocks + * in __srcu_read_unlock() is after the smp_mb(), and the fetch + * from ->srcu_ctrp in __srcu_read_lock() is before the smp_mb(). + * Thus, that task might not see the new value of ->srcu_ctrp until + * the -second- __srcu_read_lock(), which in turn means that this + * task might well increment ->srcu_ctrs[idx].srcu_locks for the + * old value of ->srcu_ctrp twice, not just once. * * However, it is important to note that a given smp_mb() takes * effect not just for the task executing it, but also for any * later task running on that same CPU. * - * That is, there can be almost Nt + Nc further increments of - * ->srcu_lock_count[idx] for the old index, where Nc is the number - * of CPUs. But this is OK because the size of the task_struct - * structure limits the value of Nt and current systems limit Nc - * to a few thousand. + * That is, there can be almost Nt + Nc further increments + * of ->srcu_ctrs[idx].srcu_locks for the old index, where Nc + * is the number of CPUs. But this is OK because the size of + * the task_struct structure limits the value of Nt and current + * systems limit Nc to a few thousand. * * OK, but what about nesting? This does impose a limit on * nesting of half of the size of the task_struct structure @@ -581,10 +582,10 @@ static bool srcu_readers_active(struct srcu_struct *ssp) for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&sdp->srcu_lock_count[0]); - sum += atomic_long_read(&sdp->srcu_lock_count[1]); - sum -= atomic_long_read(&sdp->srcu_unlock_count[0]); - sum -= atomic_long_read(&sdp->srcu_unlock_count[1]); + sum += atomic_long_read(&sdp->srcu_ctrs[0].srcu_locks); + sum += atomic_long_read(&sdp->srcu_ctrs[1].srcu_locks); + sum -= atomic_long_read(&sdp->srcu_ctrs[0].srcu_unlocks); + sum -= atomic_long_read(&sdp->srcu_ctrs[1].srcu_unlocks); } return sum; } @@ -647,6 +648,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) unsigned long jbase = SRCU_INTERVAL; struct srcu_usage *sup = ssp->srcu_sup; + lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); if (srcu_gp_is_expedited(ssp)) jbase = 0; if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) { @@ -674,9 +676,13 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) void cleanup_srcu_struct(struct srcu_struct *ssp) { int cpu; + unsigned long delay; struct srcu_usage *sup = ssp->srcu_sup; - if (WARN_ON(!srcu_get_delay(ssp))) + spin_lock_irq_rcu_node(ssp->srcu_sup); + delay = srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); + if (WARN_ON(!delay)) return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(ssp))) return; /* Just leak it! */ @@ -684,7 +690,7 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - del_timer_sync(&sdp->delay_work); + timer_delete_sync(&sdp->delay_work); flush_work(&sdp->work); if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist))) return; /* Forgot srcu_barrier(), so just leak it! */ @@ -743,12 +749,11 @@ EXPORT_SYMBOL_GPL(__srcu_check_read_flavor); */ int __srcu_read_lock(struct srcu_struct *ssp) { - int idx; + struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); - idx = READ_ONCE(ssp->srcu_idx) & 0x1; - this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); + this_cpu_inc(scp->srcu_locks.counter); smp_mb(); /* B */ /* Avoid leaking the critical section. */ - return idx; + return __srcu_ptr_to_ctr(ssp, scp); } EXPORT_SYMBOL_GPL(__srcu_read_lock); @@ -760,7 +765,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter); + this_cpu_inc(__srcu_ctr_to_ptr(ssp, idx)->srcu_unlocks.counter); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -773,13 +778,12 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); */ int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) { - int idx; - struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); + struct srcu_ctr __percpu *scpp = READ_ONCE(ssp->srcu_ctrp); + struct srcu_ctr *scp = raw_cpu_ptr(scpp); - idx = READ_ONCE(ssp->srcu_idx) & 0x1; - atomic_long_inc(&sdp->srcu_lock_count[idx]); + atomic_long_inc(&scp->srcu_locks); smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */ - return idx; + return __srcu_ptr_to_ctr(ssp, scpp); } EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe); @@ -790,10 +794,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe); */ void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) { - struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); - smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */ - atomic_long_inc(&sdp->srcu_unlock_count[idx]); + atomic_long_inc(&raw_cpu_ptr(__srcu_ctr_to_ptr(ssp, idx))->srcu_unlocks); } EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe); @@ -1096,13 +1098,15 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, /* * Wait until all readers counted by array index idx complete, but * loop an additional time if there is an expedited grace period pending. - * The caller must ensure that ->srcu_idx is not changed while checking. + * The caller must ensure that ->srcu_ctrp is not changed while checking. */ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) { unsigned long curdelay; + spin_lock_irq_rcu_node(ssp->srcu_sup); curdelay = !srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); for (;;) { if (srcu_readers_active_idx_check(ssp, idx)) @@ -1114,30 +1118,30 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) } /* - * Increment the ->srcu_idx counter so that future SRCU readers will + * Increment the ->srcu_ctrp counter so that future SRCU readers will * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows * us to wait for pre-existing readers in a starvation-free manner. */ static void srcu_flip(struct srcu_struct *ssp) { /* - * Because the flip of ->srcu_idx is executed only if the + * Because the flip of ->srcu_ctrp is executed only if the * preceding call to srcu_readers_active_idx_check() found that - * the ->srcu_unlock_count[] and ->srcu_lock_count[] sums matched - * and because that summing uses atomic_long_read(), there is - * ordering due to a control dependency between that summing and - * the WRITE_ONCE() in this call to srcu_flip(). This ordering - * ensures that if this updater saw a given reader's increment from - * __srcu_read_lock(), that reader was using a value of ->srcu_idx - * from before the previous call to srcu_flip(), which should be - * quite rare. This ordering thus helps forward progress because - * the grace period could otherwise be delayed by additional - * calls to __srcu_read_lock() using that old (soon to be new) - * value of ->srcu_idx. + * the ->srcu_ctrs[].srcu_unlocks and ->srcu_ctrs[].srcu_locks sums + * matched and because that summing uses atomic_long_read(), + * there is ordering due to a control dependency between that + * summing and the WRITE_ONCE() in this call to srcu_flip(). + * This ordering ensures that if this updater saw a given reader's + * increment from __srcu_read_lock(), that reader was using a value + * of ->srcu_ctrp from before the previous call to srcu_flip(), + * which should be quite rare. This ordering thus helps forward + * progress because the grace period could otherwise be delayed + * by additional calls to __srcu_read_lock() using that old (soon + * to be new) value of ->srcu_ctrp. * * This sum-equality check and ordering also ensures that if * a given call to __srcu_read_lock() uses the new value of - * ->srcu_idx, this updater's earlier scans cannot have seen + * ->srcu_ctrp, this updater's earlier scans cannot have seen * that reader's increments, which is all to the good, because * this grace period need not wait on that reader. After all, * if those earlier scans had seen that reader, there would have @@ -1152,7 +1156,8 @@ static void srcu_flip(struct srcu_struct *ssp) */ smp_mb(); /* E */ /* Pairs with B and C. */ - WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); // Flip the counter. + WRITE_ONCE(ssp->srcu_ctrp, + &ssp->sda->srcu_ctrs[!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0])]); /* * Ensure that if the updater misses an __srcu_read_unlock() @@ -1198,7 +1203,7 @@ static bool srcu_should_expedite(struct srcu_struct *ssp) check_init_srcu_struct(ssp); /* If _lite() readers, don't do unsolicited expediting. */ - if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE) + if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_SLOWGP) return false; /* If the local srcu_data structure has callbacks, not idle. */ sdp = raw_cpu_ptr(ssp->sda); @@ -1398,8 +1403,12 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, * read-side critical sections are delimited by srcu_read_lock() and * srcu_read_unlock(), and may be nested. * - * The callback will be invoked from process context, but must nevertheless - * be fast and must not block. + * The callback will be invoked from process context, but with bh + * disabled. The callback function must therefore be fast and must + * not block. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rcu_callback_t func) @@ -1465,8 +1474,9 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * * Wait for the count to drain to zero of both indexes. To avoid the * possible starvation of synchronize_srcu(), it waits for the count of - * the index=((->srcu_idx & 1) ^ 1) to drain to zero at first, - * and then flip the srcu_idx and wait for the count of the other index. + * the index=!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]) to drain to zero + * at first, and then flip the ->srcu_ctrp and wait for the count of the + * other index. * * Can block; must be called from process context. * @@ -1579,7 +1589,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) { if (cookie != SRCU_GET_STATE_COMPLETED && - !rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie)) + !rcu_seq_done_exact(&ssp->srcu_sup->srcu_gp_seq, cookie)) return false; // Ensure that the end of the SRCU grace period happens before // any subsequent code that the caller might execute. @@ -1675,7 +1685,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); */ unsigned long srcu_batches_completed(struct srcu_struct *ssp) { - return READ_ONCE(ssp->srcu_idx); + return READ_ONCE(ssp->srcu_sup->srcu_gp_seq); } EXPORT_SYMBOL_GPL(srcu_batches_completed); @@ -1692,7 +1702,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) /* * Because readers might be delayed for an extended period after - * fetching ->srcu_idx for their index, at any point in time there + * fetching ->srcu_ctrp for their index, at any point in time there * might well be readers using both idx=0 and idx=1. We therefore * need to wait for readers to clear from both index values before * invoking a callback. @@ -1720,7 +1730,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) } if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) { - idx = 1 ^ (ssp->srcu_idx & 1); + idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]); if (!try_check_zero(ssp, idx, 1)) { mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ @@ -1738,7 +1748,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) * SRCU read-side critical sections are normally short, * so check at least twice in quick succession after a flip. */ - idx = 1 ^ (ssp->srcu_idx & 1); + idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]); if (!try_check_zero(ssp, idx, 2)) { mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ @@ -1849,7 +1859,9 @@ static void process_srcu(struct work_struct *work) ssp = sup->srcu_ssp; srcu_advance_state(ssp); + spin_lock_irq_rcu_node(ssp->srcu_sup); curdelay = srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); if (curdelay) { WRITE_ONCE(sup->reschedule_count, 0); } else { @@ -1896,7 +1908,7 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state); int ss_state_idx = ss_state; - idx = ssp->srcu_idx & 0x1; + idx = ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]; if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name)) ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1; pr_alert("%s%s Tree SRCU g%ld state %d (%s)", @@ -1914,8 +1926,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) struct srcu_data *sdp; sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = data_race(atomic_long_read(&sdp->srcu_unlock_count[!idx])); - u1 = data_race(atomic_long_read(&sdp->srcu_unlock_count[idx])); + u0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_unlocks)); + u1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks)); /* * Make sure that a lock is always counted if the corresponding @@ -1923,8 +1935,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) */ smp_rmb(); - l0 = data_race(atomic_long_read(&sdp->srcu_lock_count[!idx])); - l1 = data_race(atomic_long_read(&sdp->srcu_lock_count[idx])); + l0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_locks)); + l1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks)); c0 = l0 - u0; c1 = l1 - u1; @@ -2001,6 +2013,7 @@ static int srcu_module_coming(struct module *mod) ssp->sda = alloc_percpu(struct srcu_data); if (WARN_ON_ONCE(!ssp->sda)) return -ENOMEM; + ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0]; } return 0; } diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 59314da5eb60..f92443561d36 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -316,7 +316,8 @@ static void call_rcu_tasks_generic_timer(struct timer_list *tlp) unsigned long flags; bool needwake = false; struct rcu_tasks *rtp; - struct rcu_tasks_percpu *rtpcp = from_timer(rtpcp, tlp, lazy_timer); + struct rcu_tasks_percpu *rtpcp = timer_container_of(rtpcp, tlp, + lazy_timer); rtp = rtpcp->rtpp; raw_spin_lock_irqsave_rcu_node(rtpcp, flags); @@ -1086,7 +1087,7 @@ static void rcu_tasks_postscan(struct list_head *hop) } if (!IS_ENABLED(CONFIG_TINY_RCU)) - del_timer_sync(&tasks_rcu_exit_srcu_stall_timer); + timer_delete_sync(&tasks_rcu_exit_srcu_stall_timer); } /* See if tasks are still holding out, complain if so. */ @@ -2256,7 +2257,7 @@ void __init tasks_cblist_init_generic(void) #endif } -void __init rcu_init_tasks_generic(void) +static int __init rcu_init_tasks_generic(void) { #ifdef CONFIG_TASKS_RCU rcu_spawn_tasks_kthread(); @@ -2272,7 +2273,10 @@ void __init rcu_init_tasks_generic(void) // Run the self-tests. rcu_tasks_initiate_self_tests(); + + return 0; } +core_initcall(rcu_init_tasks_generic); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void rcu_tasks_bootup_oddness(void) {} diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 4b3f31911465..c1ebfd51768b 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -85,15 +85,8 @@ void rcu_sched_clock_irq(int user) static inline bool rcu_reclaim_tiny(struct rcu_head *head) { rcu_callback_t f; - unsigned long offset = (unsigned long)head->func; rcu_lock_acquire(&rcu_callback_map); - if (__is_kvfree_rcu_offset(offset)) { - trace_rcu_invoke_kvfree_callback("", head, offset); - kvfree((void *)head - offset); - rcu_lock_release(&rcu_callback_map); - return true; - } trace_rcu_invoke_callback("", head); f = head->func; @@ -159,10 +152,6 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -static void tiny_rcu_leak_callback(struct rcu_head *rhp) -{ -} - /* * Post an RCU callback to be invoked after the end of an RCU grace * period. But since we have but one CPU, that would be after any @@ -178,9 +167,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func); mem_dump_obj(head); } - - if (!__is_kvfree_rcu_offset((unsigned long)head->func)) - WRITE_ONCE(head->func, tiny_rcu_leak_callback); return; } @@ -246,15 +232,18 @@ bool poll_state_synchronize_rcu(unsigned long oldstate) } EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); -#ifdef CONFIG_KASAN_GENERIC -void kvfree_call_rcu(struct rcu_head *head, void *ptr) +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) +unsigned long long rcutorture_gather_gp_seqs(void) { - if (head) - kasan_record_aux_stack(ptr); + return READ_ONCE(rcu_ctrlblk.gp_seq) & 0xffffULL; +} +EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); - __kvfree_call_rcu(head, ptr); +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len) +{ + snprintf(cp, len, "g%04llx", seqs & 0xffffULL); } -EXPORT_SYMBOL_GPL(kvfree_call_rcu); +EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); #endif void __init rcu_init(void) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 475f31deed14..14d4499c6fc3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -80,6 +80,15 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *); static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .gpwrap = true, }; + +int rcu_get_gpwrap_count(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + return READ_ONCE(rdp->gpwrap_count); +} +EXPORT_SYMBOL_GPL(rcu_get_gpwrap_count); + static struct rcu_state rcu_state = { .level = { &rcu_state.node[0] }, .gp_state = RCU_GP_IDLE, @@ -538,6 +547,26 @@ void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq) } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); +/* Gather grace-period sequence numbers for rcutorture diagnostics. */ +unsigned long long rcutorture_gather_gp_seqs(void) +{ + return ((READ_ONCE(rcu_state.gp_seq) & 0xffffULL) << 40) | + ((READ_ONCE(rcu_state.expedited_sequence) & 0xffffffULL) << 16) | + (READ_ONCE(rcu_state.gp_seq_polled) & 0xffffULL); +} +EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); + +/* Format grace-period sequence numbers for rcutorture diagnostics. */ +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len) +{ + unsigned int egp = (seqs >> 16) & 0xffffffULL; + unsigned int ggp = (seqs >> 40) & 0xffffULL; + unsigned int pgp = seqs & 0xffffULL; + + snprintf(cp, len, "g%04x:e%06x:p%04x", ggp, egp, pgp); +} +EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); + #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) /* * An empty function that will trigger a reschedule on @@ -737,6 +766,25 @@ void rcu_request_urgent_qs_task(struct task_struct *t) smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true); } +static unsigned long seq_gpwrap_lag = ULONG_MAX / 4; + +/** + * rcu_set_gpwrap_lag - Set RCU GP sequence overflow lag value. + * @lag_gps: Set overflow lag to this many grace period worth of counters + * which is used by rcutorture to quickly force a gpwrap situation. + * @lag_gps = 0 means we reset it back to the boot-time value. + */ +void rcu_set_gpwrap_lag(unsigned long lag_gps) +{ + unsigned long lag_seq_count; + + lag_seq_count = (lag_gps == 0) + ? ULONG_MAX / 4 + : lag_gps << RCU_SEQ_CTR_SHIFT; + WRITE_ONCE(seq_gpwrap_lag, lag_seq_count); +} +EXPORT_SYMBOL_GPL(rcu_set_gpwrap_lag); + /* * When trying to report a quiescent state on behalf of some other CPU, * it is our responsibility to check for and handle potential overflow @@ -747,9 +795,11 @@ void rcu_request_urgent_qs_task(struct task_struct *t) static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) { raw_lockdep_assert_held_rcu_node(rnp); - if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4, - rnp->gp_seq)) + if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + seq_gpwrap_lag, + rnp->gp_seq)) { WRITE_ONCE(rdp->gpwrap, true); + WRITE_ONCE(rdp->gpwrap_count, READ_ONCE(rdp->gpwrap_count) + 1); + } if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / 4, rnp->gp_seq)) rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / 4; } @@ -781,6 +831,10 @@ static int rcu_watching_snap_save(struct rcu_data *rdp) return 0; } +#ifndef arch_irq_stat_cpu +#define arch_irq_stat_cpu(cpu) 0 +#endif + /* * Returns positive if the specified CPU has passed through a quiescent state * by virtue of being in or having passed through an dynticks idle state since @@ -916,9 +970,9 @@ static int rcu_watching_snap_recheck(struct rcu_data *rdp) rsrp->cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu); rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu); rsrp->cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu); - rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu); - rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu); - rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu); + rsrp->nr_hardirqs = kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu); + rsrp->nr_softirqs = kstat_cpu_softirqs_sum(cpu); + rsrp->nr_csw = nr_context_switches_cpu(cpu); rsrp->jiffies = jiffies; rsrp->gp_seq = rdp->gp_seq; } @@ -1040,38 +1094,6 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp) return needmore; } -static void swake_up_one_online_ipi(void *arg) -{ - struct swait_queue_head *wqh = arg; - - swake_up_one(wqh); -} - -static void swake_up_one_online(struct swait_queue_head *wqh) -{ - int cpu = get_cpu(); - - /* - * If called from rcutree_report_cpu_starting(), wake up - * is dangerous that late in the CPU-down hotplug process. The - * scheduler might queue an ignored hrtimer. Defer the wake up - * to an online CPU instead. - */ - if (unlikely(cpu_is_offline(cpu))) { - int target; - - target = cpumask_any_and(housekeeping_cpumask(HK_TYPE_RCU), - cpu_online_mask); - - smp_call_function_single(target, swake_up_one_online_ipi, - wqh, 0); - put_cpu(); - } else { - put_cpu(); - swake_up_one(wqh); - } -} - /* * Awaken the grace-period kthread. Don't do a self-awaken (unless in an * interrupt or softirq handler, in which case we just might immediately @@ -1096,7 +1118,7 @@ static void rcu_gp_kthread_wake(void) return; WRITE_ONCE(rcu_state.gp_wake_time, jiffies); WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq)); - swake_up_one_online(&rcu_state.gp_wq); + swake_up_one(&rcu_state.gp_wq); } /* @@ -1254,7 +1276,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) /* Handle the ends of any preceding grace periods first. */ if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || - unlikely(READ_ONCE(rdp->gpwrap))) { + unlikely(rdp->gpwrap)) { if (!offloaded) ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */ rdp->core_needs_qs = false; @@ -1268,7 +1290,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) /* Now handle the beginnings of any new-to-this-CPU grace periods. */ if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) || - unlikely(READ_ONCE(rdp->gpwrap))) { + unlikely(rdp->gpwrap)) { /* * If the current grace period is waiting for this CPU, * set up to detect a quiescent state, otherwise don't @@ -1283,7 +1305,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); - if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap)) + if (IS_ENABLED(CONFIG_PROVE_RCU) && rdp->gpwrap) WRITE_ONCE(rdp->last_sched_clock, jiffies); WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); @@ -1612,12 +1634,10 @@ static void rcu_sr_normal_complete(struct llist_node *node) { struct rcu_synchronize *rs = container_of( (struct rcu_head *) node, struct rcu_synchronize, head); - unsigned long oldstate = (unsigned long) rs->head.func; WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && - !poll_state_synchronize_rcu(oldstate), - "A full grace period is not passed yet: %lu", - rcu_seq_diff(get_state_synchronize_rcu(), oldstate)); + !poll_state_synchronize_rcu_full(&rs->oldstate), + "A full grace period is not passed yet!\n"); /* Finally. */ complete(&rs->completion); @@ -1780,6 +1800,7 @@ static noinline_for_stack bool rcu_gp_init(void) struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(); bool start_new_poll; + unsigned long old_gp_seq; WRITE_ONCE(rcu_state.gp_activity, jiffies); raw_spin_lock_irq_rcu_node(rnp); @@ -1801,10 +1822,19 @@ static noinline_for_stack bool rcu_gp_init(void) /* Advance to a new grace period and initialize state. */ record_gp_stall_check_time(); + /* + * A new wait segment must be started before gp_seq advanced, so + * that previous gp waiters won't observe the new gp_seq. + */ + start_new_poll = rcu_sr_normal_gp_init(); /* Record GP times before starting GP, hence rcu_seq_start(). */ + old_gp_seq = rcu_state.gp_seq; rcu_seq_start(&rcu_state.gp_seq); + /* Ensure that rcu_seq_done_exact() guardband doesn't give false positives. */ + WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && + rcu_seq_done_exact(&old_gp_seq, rcu_seq_snap(&rcu_state.gp_seq))); + ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); - start_new_poll = rcu_sr_normal_gp_init(); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); @@ -2931,13 +2961,8 @@ static int __init rcu_spawn_core_kthreads(void) static void rcutree_enqueue(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func) { rcu_segcblist_enqueue(&rdp->cblist, head); - if (__is_kvfree_rcu_offset((unsigned long)func)) - trace_rcu_kvfree_callback(rcu_state.name, head, - (unsigned long)func, - rcu_segcblist_n_cbs(&rdp->cblist)); - else - trace_rcu_callback(rcu_state.name, head, - rcu_segcblist_n_cbs(&rdp->cblist)); + trace_rcu_callback(rcu_state.name, head, + rcu_segcblist_n_cbs(&rdp->cblist)); trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); } @@ -3047,6 +3072,10 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) /* Misaligned rcu_head! */ WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); + /* Avoid NULL dereference if callback is NULL. */ + if (WARN_ON_ONCE(!func)) + return; + if (debug_rcu_head_queue(head)) { /* * Probable double call_rcu(), so leak the callback. @@ -3107,7 +3136,7 @@ module_param(enable_rcu_lazy, bool, 0444); * critical sections have completed. * * Use this API instead of call_rcu() if you don't want the callback to be - * invoked after very long periods of time, which can happen on systems without + * delayed for very long periods of time, which can happen on systems without * memory pressure and on systems which are lightly loaded or mostly idle. * This function will cause callbacks to be invoked sooner than later at the * expense of extra power. Other than that, this function is identical to, and @@ -3138,6 +3167,12 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry); * might well execute concurrently with RCU read-side critical sections * that started after call_rcu() was invoked. * + * It is perfectly legal to repost an RCU callback, potentially with + * a different callback function, from within its callback function. + * The specified function will be invoked after another full grace period + * has elapsed. This use case is similar in form to the common practice + * of reposting a timer from within its own handler. + * * RCU read-side critical sections are delimited by rcu_read_lock() * and rcu_read_unlock(), and may be nested. In addition, but only in * v5.0 and later, regions of code across which interrupts, preemption, @@ -3166,6 +3201,13 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry); * * Implementation of these memory-ordering guarantees is described here: * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. + * + * Specific to call_rcu() (as opposed to the other call_rcu*() functions), + * in kernels built with CONFIG_RCU_LAZY=y, call_rcu() might delay for many + * seconds before starting the grace period needed by the corresponding + * callback. This delay can significantly improve energy-efficiency + * on low-utilization battery-powered devices. To avoid this delay, + * in latency-sensitive kernel code, use call_rcu_hurry(). */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { @@ -3214,7 +3256,7 @@ static void synchronize_rcu_normal(void) * snapshot before adding a request. */ if (IS_ENABLED(CONFIG_PROVE_RCU)) - rs.head.func = (void *) get_state_synchronize_rcu(); + get_state_synchronize_rcu_full(&rs.oldstate); rcu_sr_normal_add_req(&rs); @@ -3357,14 +3399,17 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); */ void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) { - struct rcu_node *rnp = rcu_get_root(); - /* * Any prior manipulation of RCU-protected data must happen * before the loads from ->gp_seq and ->expedited_sequence. */ smp_mb(); /* ^^^ */ - rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq); + + // Yes, rcu_state.gp_seq, not rnp_root->gp_seq, the latter's use + // in poll_state_synchronize_rcu_full() notwithstanding. Use of + // the latter here would result in too-short grace periods due to + // interactions with newly onlined CPUs. + rgosp->rgos_norm = rcu_seq_snap(&rcu_state.gp_seq); rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a9a811d9d7a3..3830c19cf2f6 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -168,7 +168,7 @@ struct rcu_snap_record { u64 cputime_irq; /* Accumulated cputime of hard irqs */ u64 cputime_softirq;/* Accumulated cputime of soft irqs */ u64 cputime_system; /* Accumulated cputime of kernel tasks */ - unsigned long nr_hardirqs; /* Accumulated number of hard irqs */ + u64 nr_hardirqs; /* Accumulated number of hard irqs */ unsigned int nr_softirqs; /* Accumulated number of soft irqs */ unsigned long long nr_csw; /* Accumulated number of task switches */ unsigned long jiffies; /* Track jiffies value */ @@ -183,6 +183,7 @@ struct rcu_data { bool core_needs_qs; /* Core waits for quiescent state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible ->gp_seq wrap. */ + unsigned int gpwrap_count; /* Count of GP sequence wrap. */ bool cpu_started; /* RCU watching this onlining CPU. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 77efed89c79e..c36c7d5575ca 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -200,7 +200,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, if (rnp->parent == NULL) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (wake) - swake_up_one_online(&rcu_state.expedited_wq); + swake_up_one(&rcu_state.expedited_wq); break; } @@ -230,17 +230,19 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake) * specified leaf rcu_node structure, which is acquired by the caller. */ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long flags, - unsigned long mask, bool wake) + unsigned long mask_in, bool wake) __releases(rnp->lock) { int cpu; + unsigned long mask; struct rcu_data *rdp; raw_lockdep_assert_held_rcu_node(rnp); - if (!(rnp->expmask & mask)) { + if (!(rnp->expmask & mask_in)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } + mask = mask_in & rnp->expmask; WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); for_each_leaf_node_cpu_mask(rnp, cpu, mask) { rdp = per_cpu_ptr(&rcu_data, cpu); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 2605dd234a13..b473ff056f49 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -206,7 +206,7 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp, if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - del_timer(&rdp_gp->nocb_timer); + timer_delete(&rdp_gp->nocb_timer); } if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { @@ -216,7 +216,7 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp, raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); if (needwake) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); - swake_up_one_online(&rdp_gp->nocb_gp_wq); + swake_up_one(&rdp_gp->nocb_gp_wq); } return needwake; @@ -554,19 +554,13 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY, TPS("WakeLazy")); - } else if (!irqs_disabled_flags(flags) && cpu_online(rdp->cpu)) { + } else if (!irqs_disabled_flags(flags)) { /* ... if queue was empty ... */ rcu_nocb_unlock(rdp); wake_nocb_gp(rdp, false); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeEmpty")); } else { - /* - * Don't do the wake-up upfront on fragile paths. - * Also offline CPUs can't call swake_up_one_online() from - * (soft-)IRQs. Rely on the final deferred wake-up from - * rcutree_report_cpu_dead() - */ rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, TPS("WakeEmptyIsDeferred")); @@ -822,7 +816,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - del_timer(&my_rdp->nocb_timer); + timer_delete(&my_rdp->nocb_timer); } WRITE_ONCE(my_rdp->nocb_gp_sleep, true); raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); @@ -991,7 +985,7 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp, static void do_nocb_deferred_wakeup_timer(struct timer_list *t) { unsigned long flags; - struct rcu_data *rdp = from_timer(rdp, t, nocb_timer); + struct rcu_data *rdp = timer_container_of(rdp, t, nocb_timer); WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer")); @@ -1557,8 +1551,11 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp) /* Dump out nocb kthread state for the specified rcu_data structure. */ static void show_rcu_nocb_state(struct rcu_data *rdp) { - char bufw[20]; - char bufr[20]; + char bufd[22]; + char bufw[45]; + char bufr[45]; + char bufn[22]; + char bufb[22]; struct rcu_data *nocb_next_rdp; struct rcu_segcblist *rsclp = &rdp->cblist; bool waslocked; @@ -1572,9 +1569,13 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) typeof(*rdp), nocb_entry_rdp); - sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); - sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); - pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", + sprintf(bufd, "%ld", rsclp->seglen[RCU_DONE_TAIL]); + sprintf(bufw, "%ld(%ld)", rsclp->seglen[RCU_WAIT_TAIL], rsclp->gp_seq[RCU_WAIT_TAIL]); + sprintf(bufr, "%ld(%ld)", rsclp->seglen[RCU_NEXT_READY_TAIL], + rsclp->gp_seq[RCU_NEXT_READY_TAIL]); + sprintf(bufn, "%ld", rsclp->seglen[RCU_NEXT_TAIL]); + sprintf(bufb, "%ld", rcu_cblist_n_cbs(&rdp->nocb_bypass)); + pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%s%c%s%c%s%c%s%c%s q%ld %c CPU %d%s\n", rdp->cpu, rdp->nocb_gp_rdp->cpu, nocb_next_rdp ? nocb_next_rdp->cpu : -1, "kK"[!!rdp->nocb_cb_kthread], @@ -1586,12 +1587,15 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) jiffies - rdp->nocb_nobypass_last, rdp->nocb_nobypass_count, ".D"[rcu_segcblist_ready_cbs(rsclp)], + rcu_segcblist_segempty(rsclp, RCU_DONE_TAIL) ? "" : bufd, ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)], rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw, ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)], rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr, ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)], + rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL) ? "" : bufn, ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], + !rcu_cblist_n_cbs(&rdp->nocb_bypass) ? "" : bufb, rcu_segcblist_n_cbs(&rdp->cblist), rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.', rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1, diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3600152b858e..0b0f56f6abc8 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -29,7 +29,7 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) || lockdep_is_held(&rdp->nocb_lock) || lockdep_is_held(&rcu_state.nocb_mutex) || - (!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) && + ((!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) || softirq_count()) && rdp == this_cpu_ptr(&rcu_data)) || rcu_current_is_nocb_kthread(rdp)), "Unsafe read of RCU_NOCB offloaded state" @@ -833,8 +833,17 @@ void rcu_read_unlock_strict(void) { struct rcu_data *rdp; - if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) + if (irqs_disabled() || in_atomic_preempt_off() || !rcu_state.gp_kthread) return; + + /* + * rcu_report_qs_rdp() can only be invoked with a stable rdp and + * from the local CPU. + * + * The in_atomic_preempt_off() check ensures that we come here holding + * the last preempt_count (which will get dropped once we return to + * __rcu_read_unlock(). + */ rdp = this_cpu_ptr(&rcu_data); rdp->cpu_no_qs.b.norm = false; rcu_report_qs_rdp(rdp); @@ -975,13 +984,16 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) */ static void rcu_flavor_sched_clock_irq(int user) { - if (user || rcu_is_cpu_rrupt_from_idle()) { + if (user || rcu_is_cpu_rrupt_from_idle() || + (IS_ENABLED(CONFIG_PREEMPT_COUNT) && + (preempt_count() == HARDIRQ_OFFSET))) { /* * Get here if this CPU took its interrupt from user - * mode or from the idle loop, and if this is not a - * nested interrupt. In this case, the CPU is in - * a quiescent state, so note it. + * mode, from the idle loop without this being a nested + * interrupt, or while not holding the task preempt count + * (with PREEMPT_COUNT=y). In this case, the CPU is in a + * quiescent state, so note it. * * No memory barrier is required here because rcu_qs() * references only CPU-local variables that other CPUs diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 925fcdad5dea..486c00536207 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -20,6 +20,28 @@ int sysctl_panic_on_rcu_stall __read_mostly; int sysctl_max_rcu_stall_to_panic __read_mostly; +#ifdef CONFIG_SYSFS + +static unsigned int rcu_stall_count; + +static ssize_t rcu_stall_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%u\n", rcu_stall_count); +} + +static struct kobj_attribute rcu_stall_count_attr = __ATTR_RO(rcu_stall_count); + +static __init int kernel_rcu_stall_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &rcu_stall_count_attr.attr, NULL); + return 0; +} + +late_initcall(kernel_rcu_stall_sysfs_init); + +#endif // CONFIG_SYSFS + #ifdef CONFIG_PROVE_RCU #define RCU_STALL_DELAY_DELTA (5 * HZ) #else @@ -435,8 +457,8 @@ static void print_cpu_stat_info(int cpu) rsr.cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu); pr_err("\t hardirqs softirqs csw/system\n"); - pr_err("\t number: %8ld %10d %12lld\n", - kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs, + pr_err("\t number: %8lld %10d %12lld\n", + kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu) - rsrp->nr_hardirqs, kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs, nr_context_switches_cpu(cpu) - rsrp->nr_csw); pr_err("\tcputime: %8lld %10lld %12lld ==> %d(ms)\n", @@ -784,6 +806,10 @@ static void check_cpu_stall(struct rcu_data *rdp) if (kvm_check_and_clear_guest_paused()) return; +#ifdef CONFIG_SYSFS + ++rcu_stall_count; +#endif + rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps); if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) { pr_err("INFO: %s detected stall, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name); diff --git a/kernel/reboot.c b/kernel/reboot.c index b5a8569e5d81..ec087827c85c 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -36,6 +36,8 @@ enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; EXPORT_SYMBOL_GPL(reboot_mode); enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED; +static enum hw_protection_action hw_protection_action = HWPROT_ACT_SHUTDOWN; + /* * This variable is used privately to keep track of whether or not * reboot_type is still set to its default value (i.e., reboot= hasn't @@ -229,6 +231,9 @@ EXPORT_SYMBOL(unregister_restart_handler); /** * do_kernel_restart - Execute kernel restart handler call chain * + * @cmd: pointer to buffer containing command to execute for restart + * or %NULL + * * Calls functions registered with register_restart_handler. * * Expected to be called from machine_restart as last step of the restart @@ -704,6 +709,7 @@ void kernel_power_off(void) migrate_to_reboot_cpu(); syscore_shutdown(); pr_emerg("Power down\n"); + pr_flush(1000, true); kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_power_off(); } @@ -932,61 +938,86 @@ void orderly_reboot(void) } EXPORT_SYMBOL_GPL(orderly_reboot); +static const char *hw_protection_action_str(enum hw_protection_action action) +{ + switch (action) { + case HWPROT_ACT_SHUTDOWN: + return "shutdown"; + case HWPROT_ACT_REBOOT: + return "reboot"; + default: + return "undefined"; + } +} + +static enum hw_protection_action hw_failure_emergency_action; + /** - * hw_failure_emergency_poweroff_func - emergency poweroff work after a known delay - * @work: work_struct associated with the emergency poweroff function + * hw_failure_emergency_action_func - emergency action work after a known delay + * @work: work_struct associated with the emergency action function * * This function is called in very critical situations to force - * a kernel poweroff after a configurable timeout value. + * a kernel poweroff or reboot after a configurable timeout value. */ -static void hw_failure_emergency_poweroff_func(struct work_struct *work) +static void hw_failure_emergency_action_func(struct work_struct *work) { + const char *action_str = hw_protection_action_str(hw_failure_emergency_action); + + pr_emerg("Hardware protection timed-out. Trying forced %s\n", + action_str); + /* - * We have reached here after the emergency shutdown waiting period has - * expired. This means orderly_poweroff has not been able to shut off - * the system for some reason. + * We have reached here after the emergency action waiting period has + * expired. This means orderly_poweroff/reboot has not been able to + * shut off the system for some reason. * - * Try to shut down the system immediately using kernel_power_off - * if populated + * Try to shut off the system immediately if possible */ - pr_emerg("Hardware protection timed-out. Trying forced poweroff\n"); - kernel_power_off(); + + if (hw_failure_emergency_action == HWPROT_ACT_REBOOT) + kernel_restart(NULL); + else + kernel_power_off(); /* * Worst of the worst case trigger emergency restart */ - pr_emerg("Hardware protection shutdown failed. Trying emergency restart\n"); + pr_emerg("Hardware protection %s failed. Trying emergency restart\n", + action_str); emergency_restart(); } -static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work, - hw_failure_emergency_poweroff_func); +static DECLARE_DELAYED_WORK(hw_failure_emergency_action_work, + hw_failure_emergency_action_func); /** - * hw_failure_emergency_poweroff - Trigger an emergency system poweroff + * hw_failure_emergency_schedule - Schedule an emergency system shutdown or reboot + * + * @action: The hardware protection action to be taken + * @action_delay_ms: Time in milliseconds to elapse before triggering action * * This may be called from any critical situation to trigger a system shutdown - * after a given period of time. If time is negative this is not scheduled. + * or reboot after a given period of time. + * If time is negative this is not scheduled. */ -static void hw_failure_emergency_poweroff(int poweroff_delay_ms) +static void hw_failure_emergency_schedule(enum hw_protection_action action, + int action_delay_ms) { - if (poweroff_delay_ms <= 0) + if (action_delay_ms <= 0) return; - schedule_delayed_work(&hw_failure_emergency_poweroff_work, - msecs_to_jiffies(poweroff_delay_ms)); + hw_failure_emergency_action = action; + schedule_delayed_work(&hw_failure_emergency_action_work, + msecs_to_jiffies(action_delay_ms)); } /** - * __hw_protection_shutdown - Trigger an emergency system shutdown or reboot + * __hw_protection_trigger - Trigger an emergency system shutdown or reboot * * @reason: Reason of emergency shutdown or reboot to be printed. * @ms_until_forced: Time to wait for orderly shutdown or reboot before * triggering it. Negative value disables the forced * shutdown or reboot. - * @shutdown: If true, indicates that a shutdown will happen - * after the critical tempeature is reached. - * If false, indicates that a reboot will happen - * after the critical tempeature is reached. + * @action: The hardware protection action to be taken. * * Initiate an emergency system shutdown or reboot in order to protect * hardware from further damage. Usage examples include a thermal protection. @@ -994,11 +1025,16 @@ static void hw_failure_emergency_poweroff(int poweroff_delay_ms) * pending even if the previous request has given a large timeout for forced * shutdown/reboot. */ -void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown) +void __hw_protection_trigger(const char *reason, int ms_until_forced, + enum hw_protection_action action) { static atomic_t allow_proceed = ATOMIC_INIT(1); - pr_emerg("HARDWARE PROTECTION shutdown (%s)\n", reason); + if (action == HWPROT_ACT_DEFAULT) + action = hw_protection_action; + + pr_emerg("HARDWARE PROTECTION %s (%s)\n", + hw_protection_action_str(action), reason); /* Shutdown should be initiated only once. */ if (!atomic_dec_and_test(&allow_proceed)) @@ -1008,13 +1044,55 @@ void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shut * Queue a backup emergency shutdown in the event of * orderly_poweroff failure */ - hw_failure_emergency_poweroff(ms_until_forced); - if (shutdown) + hw_failure_emergency_schedule(action, ms_until_forced); + if (action == HWPROT_ACT_REBOOT) + orderly_reboot(); + else orderly_poweroff(true); +} +EXPORT_SYMBOL_GPL(__hw_protection_trigger); + +static bool hw_protection_action_parse(const char *str, + enum hw_protection_action *action) +{ + if (sysfs_streq(str, "shutdown")) + *action = HWPROT_ACT_SHUTDOWN; + else if (sysfs_streq(str, "reboot")) + *action = HWPROT_ACT_REBOOT; else - orderly_reboot(); + return false; + + return true; +} + +static int __init hw_protection_setup(char *str) +{ + hw_protection_action_parse(str, &hw_protection_action); + return 1; +} +__setup("hw_protection=", hw_protection_setup); + +#ifdef CONFIG_SYSFS +static ssize_t hw_protection_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", + hw_protection_action_str(hw_protection_action)); +} +static ssize_t hw_protection_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, + size_t count) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!hw_protection_action_parse(buf, &hw_protection_action)) + return -EINVAL; + + return count; } -EXPORT_SYMBOL_GPL(__hw_protection_shutdown); +static struct kobj_attribute hw_protection_attr = __ATTR_RW(hw_protection); +#endif static int __init reboot_setup(char *str) { @@ -1275,6 +1353,7 @@ static struct kobj_attribute reboot_cpu_attr = __ATTR_RW(cpu); #endif static struct attribute *reboot_attrs[] = { + &hw_protection_attr.attr, &reboot_mode_attr.attr, #ifdef CONFIG_X86 &reboot_force_attr.attr, diff --git a/kernel/relay.c b/kernel/relay.c index a8ae436dc77e..c0c93a04d4ce 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -351,10 +351,9 @@ static struct dentry *relay_create_buf_file(struct rchan *chan, struct dentry *dentry; char *tmpname; - tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL); + tmpname = kasprintf(GFP_KERNEL, "%s%d", chan->base_filename, cpu); if (!tmpname) return NULL; - snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu); /* Create file in fs */ dentry = chan->cb->create_buf_file(tmpname, chan->parent, @@ -453,7 +452,7 @@ int relay_prepare_cpu(unsigned int cpu) /** * relay_open - create a new relay channel - * @base_filename: base name of files to create, %NULL for buffering only + * @base_filename: base name of files to create * @parent: dentry of parent directory, %NULL for root directory or buffer * @subbuf_size: size of sub-buffers * @n_subbufs: number of sub-buffers @@ -466,10 +465,6 @@ int relay_prepare_cpu(unsigned int cpu) * attributes specified. The created channel buffer files * will be named base_filename0...base_filenameN-1. File * permissions will be %S_IRUSR. - * - * If opening a buffer (@parent = NULL) that you later wish to register - * in a filesystem, call relay_late_setup_files() once the @parent dentry - * is available. */ struct rchan *relay_open(const char *base_filename, struct dentry *parent, @@ -541,111 +536,6 @@ struct rchan_percpu_buf_dispatcher { struct dentry *dentry; }; -/* Called in atomic context. */ -static void __relay_set_buf_dentry(void *info) -{ - struct rchan_percpu_buf_dispatcher *p = info; - - relay_set_buf_dentry(p->buf, p->dentry); -} - -/** - * relay_late_setup_files - triggers file creation - * @chan: channel to operate on - * @base_filename: base name of files to create - * @parent: dentry of parent directory, %NULL for root directory - * - * Returns 0 if successful, non-zero otherwise. - * - * Use to setup files for a previously buffer-only channel created - * by relay_open() with a NULL parent dentry. - * - * For example, this is useful for perfomring early tracing in kernel, - * before VFS is up and then exposing the early results once the dentry - * is available. - */ -int relay_late_setup_files(struct rchan *chan, - const char *base_filename, - struct dentry *parent) -{ - int err = 0; - unsigned int i, curr_cpu; - unsigned long flags; - struct dentry *dentry; - struct rchan_buf *buf; - struct rchan_percpu_buf_dispatcher disp; - - if (!chan || !base_filename) - return -EINVAL; - - strscpy(chan->base_filename, base_filename, NAME_MAX); - - mutex_lock(&relay_channels_mutex); - /* Is chan already set up? */ - if (unlikely(chan->has_base_filename)) { - mutex_unlock(&relay_channels_mutex); - return -EEXIST; - } - chan->has_base_filename = 1; - chan->parent = parent; - - if (chan->is_global) { - err = -EINVAL; - buf = *per_cpu_ptr(chan->buf, 0); - if (!WARN_ON_ONCE(!buf)) { - dentry = relay_create_buf_file(chan, buf, 0); - if (dentry && !WARN_ON_ONCE(!chan->is_global)) { - relay_set_buf_dentry(buf, dentry); - err = 0; - } - } - mutex_unlock(&relay_channels_mutex); - return err; - } - - curr_cpu = get_cpu(); - /* - * The CPU hotplug notifier ran before us and created buffers with - * no files associated. So it's safe to call relay_setup_buf_file() - * on all currently online CPUs. - */ - for_each_online_cpu(i) { - buf = *per_cpu_ptr(chan->buf, i); - if (unlikely(!buf)) { - WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n"); - err = -EINVAL; - break; - } - - dentry = relay_create_buf_file(chan, buf, i); - if (unlikely(!dentry)) { - err = -EINVAL; - break; - } - - if (curr_cpu == i) { - local_irq_save(flags); - relay_set_buf_dentry(buf, dentry); - local_irq_restore(flags); - } else { - disp.buf = buf; - disp.dentry = dentry; - smp_mb(); - /* relay_channels_mutex must be held, so wait. */ - err = smp_call_function_single(i, - __relay_set_buf_dentry, - &disp, 1); - } - if (unlikely(err)) - break; - } - put_cpu(); - mutex_unlock(&relay_channels_mutex); - - return err; -} -EXPORT_SYMBOL_GPL(relay_late_setup_files); - /** * relay_switch_subbuf - switch to a new sub-buffer * @buf: channel buffer diff --git a/kernel/resource.c b/kernel/resource.c index 12004452d999..8d3e6ed0bdc1 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -561,8 +561,7 @@ static int __region_intersects(struct resource *parent, resource_size_t start, struct resource res, o; bool covered; - res.start = start; - res.end = start + size - 1; + res = DEFINE_RES(start, size, 0); for (p = parent->child; p ; p = p->sibling) { if (!resource_intersection(p, &res, &o)) @@ -1714,18 +1713,13 @@ static int __init reserve_setup(char *str) * I/O port space; otherwise assume it's memory. */ if (io_start < 0x10000) { - res->flags = IORESOURCE_IO; + *res = DEFINE_RES_IO_NAMED(io_start, io_num, "reserved"); parent = &ioport_resource; } else { - res->flags = IORESOURCE_MEM; + *res = DEFINE_RES_MEM_NAMED(io_start, io_num, "reserved"); parent = &iomem_resource; } - res->name = "reserved"; - res->start = io_start; - res->end = io_start + io_num - 1; res->flags |= IORESOURCE_BUSY; - res->desc = IORES_DESC_NONE; - res->child = NULL; if (request_resource(parent, res) == 0) reserved = x+1; } @@ -1975,11 +1969,7 @@ get_free_mem_region(struct device *dev, struct resource *base, */ revoke_iomem(res); } else { - res->start = addr; - res->end = addr + size - 1; - res->name = name; - res->desc = desc; - res->flags = IORESOURCE_MEM; + *res = DEFINE_RES_NAMED_DESC(addr, size, name, IORESOURCE_MEM, desc); /* * Only succeed if the resource hosts an exclusive diff --git a/kernel/rseq.c b/kernel/rseq.c index 442aba29bc4c..b7a1ec327e81 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -78,24 +78,24 @@ efault: return -EFAULT; } -static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id, - u32 node_id, u32 mm_cid) -{ - rseq_kernel_fields(t)->cpu_id_start = cpu_id; - rseq_kernel_fields(t)->cpu_id = cpu_id; - rseq_kernel_fields(t)->node_id = node_id; - rseq_kernel_fields(t)->mm_cid = mm_cid; -} +/* + * Update an rseq field and its in-kernel copy in lock-step to keep a coherent + * state. + */ +#define rseq_unsafe_put_user(t, value, field, error_label) \ + do { \ + unsafe_put_user(value, &t->rseq->field, error_label); \ + rseq_kernel_fields(t)->field = value; \ + } while (0) + #else static int rseq_validate_ro_fields(struct task_struct *t) { return 0; } -static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id, - u32 node_id, u32 mm_cid) -{ -} +#define rseq_unsafe_put_user(t, value, field, error_label) \ + unsafe_put_user(value, &t->rseq->field, error_label) #endif /* @@ -173,17 +173,18 @@ static int rseq_update_cpu_node_id(struct task_struct *t) WARN_ON_ONCE((int) mm_cid < 0); if (!user_write_access_begin(rseq, t->rseq_len)) goto efault; - unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end); - unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end); - unsafe_put_user(node_id, &rseq->node_id, efault_end); - unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end); + + rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end); + rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); + rseq_unsafe_put_user(t, node_id, node_id, efault_end); + rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); + /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally updated only if * t->rseq_len != ORIG_RSEQ_SIZE. */ user_write_access_end(); - rseq_set_ro_fields(t, cpu_id, cpu_id, node_id, mm_cid); trace_rseq_update(t); return 0; @@ -195,6 +196,7 @@ efault: static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) { + struct rseq __user *rseq = t->rseq; u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, mm_cid = 0; @@ -202,40 +204,61 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) * Validate read-only rseq fields. */ if (rseq_validate_ro_fields(t)) - return -EFAULT; - /* - * Reset cpu_id_start to its initial state (0). - */ - if (put_user(cpu_id_start, &t->rseq->cpu_id_start)) - return -EFAULT; - /* - * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming - * in after unregistration can figure out that rseq needs to be - * registered again. - */ - if (put_user(cpu_id, &t->rseq->cpu_id)) - return -EFAULT; - /* - * Reset node_id to its initial state (0). - */ - if (put_user(node_id, &t->rseq->node_id)) - return -EFAULT; + goto efault; + + if (!user_write_access_begin(rseq, t->rseq_len)) + goto efault; + /* - * Reset mm_cid to its initial state (0). + * Reset all fields to their initial state. + * + * All fields have an initial state of 0 except cpu_id which is set to + * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after + * unregistration can figure out that rseq needs to be registered + * again. */ - if (put_user(mm_cid, &t->rseq->mm_cid)) - return -EFAULT; - - rseq_set_ro_fields(t, cpu_id_start, cpu_id, node_id, mm_cid); + rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end); + rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); + rseq_unsafe_put_user(t, node_id, node_id, efault_end); + rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally reset only if * t->rseq_len != ORIG_RSEQ_SIZE. */ + user_write_access_end(); + return 0; + +efault_end: + user_write_access_end(); +efault: + return -EFAULT; +} + +/* + * Get the user-space pointer value stored in the 'rseq_cs' field. + */ +static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs) +{ + if (!rseq_cs) + return -EFAULT; + +#ifdef CONFIG_64BIT + if (get_user(*rseq_cs, &rseq->rseq_cs)) + return -EFAULT; +#else + if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs))) + return -EFAULT; +#endif + return 0; } +/* + * If the rseq_cs field of 'struct rseq' contains a valid pointer to + * user-space, copy 'struct rseq_cs' from user-space and validate its fields. + */ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) { struct rseq_cs __user *urseq_cs; @@ -244,17 +267,16 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) u32 sig; int ret; -#ifdef CONFIG_64BIT - if (get_user(ptr, &t->rseq->rseq_cs)) - return -EFAULT; -#else - if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr))) - return -EFAULT; -#endif + ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr); + if (ret) + return ret; + + /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */ if (!ptr) { memset(rseq_cs, 0, sizeof(*rseq_cs)); return 0; } + /* Check that the pointer value fits in the user-space process space. */ if (ptr >= TASK_SIZE) return -EINVAL; urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; @@ -330,7 +352,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags) return !!event_mask; } -static int clear_rseq_cs(struct task_struct *t) +static int clear_rseq_cs(struct rseq __user *rseq) { /* * The rseq_cs field is set to NULL on preemption or signal @@ -341,9 +363,9 @@ static int clear_rseq_cs(struct task_struct *t) * Set rseq_cs to NULL. */ #ifdef CONFIG_64BIT - return put_user(0UL, &t->rseq->rseq_cs); + return put_user(0UL, &rseq->rseq_cs); #else - if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs))) + if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs))) return -EFAULT; return 0; #endif @@ -375,11 +397,11 @@ static int rseq_ip_fixup(struct pt_regs *regs) * Clear the rseq_cs pointer and return. */ if (!in_rseq_cs(ip, &rseq_cs)) - return clear_rseq_cs(t); + return clear_rseq_cs(t->rseq); ret = rseq_need_restart(t, rseq_cs.flags); if (ret <= 0) return ret; - ret = clear_rseq_cs(t); + ret = clear_rseq_cs(t->rseq); if (ret) return ret; trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, @@ -453,6 +475,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { int ret; + u64 rseq_cs; if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) @@ -507,9 +530,19 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, return -EINVAL; if (!access_ok(rseq, rseq_len)) return -EFAULT; - current->rseq = rseq; - current->rseq_len = rseq_len; - current->rseq_sig = sig; + + /* + * If the rseq_cs pointer is non-NULL on registration, clear it to + * avoid a potential segfault on return to user-space. The proper thing + * to do would have been to fail the registration but this would break + * older libcs that reuse the rseq area for new threads without + * clearing the fields. + */ + if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs)) + return -EFAULT; + if (rseq_cs && clear_rseq_cs(rseq)) + return -EFAULT; + #ifdef CONFIG_DEBUG_RSEQ /* * Initialize the in-kernel rseq fields copy for validation of @@ -522,6 +555,14 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, return -EFAULT; #endif /* + * Activate the registration by setting the rseq area address, length + * and signature in the task struct. + */ + current->rseq = rseq; + current->rseq_len = rseq_len; + current->rseq_sig = sig; + + /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields * are updated before returning to user-space. diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 976092b7bd45..8ae86371ddcd 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -22,6 +22,11 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif +# Branch profiling isn't noinstr-safe +ifdef CONFIG_TRACE_BRANCH_PROFILING +CFLAGS_build_policy.o += -DDISABLE_BRANCH_PROFILING +CFLAGS_build_utility.o += -DDISABLE_BRANCH_PROFILING +endif # # Build efficiency: # diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index fae1f5c921eb..72d97aa8b726 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -61,6 +61,7 @@ #ifdef CONFIG_SCHED_CLASS_EXT # include "ext.c" +# include "ext_idle.c" #endif #include "syscalls.c" diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index 80a3df49ab47..bf9d8db94b70 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -68,9 +68,7 @@ # include "cpufreq_schedutil.c" #endif -#ifdef CONFIG_SCHED_DEBUG -# include "debug.c" -#endif +#include "debug.c" #ifdef CONFIG_SCHEDSTATS # include "stats.c" diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9aecd914ac69..8988d38d46a3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -66,6 +66,7 @@ #include <linux/vtime.h> #include <linux/wait_api.h> #include <linux/workqueue_api.h> +#include <linux/livepatch_sched.h> #ifdef CONFIG_PREEMPT_DYNAMIC # ifdef CONFIG_GENERIC_ENTRY @@ -91,7 +92,6 @@ #include "autogroup.h" #include "pelt.h" #include "smp.h" -#include "stats.h" #include "../workqueue_internal.h" #include "../../io_uring/io-wq.h" @@ -119,7 +119,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -#ifdef CONFIG_SCHED_DEBUG /* * Debugging: various feature bits * @@ -129,7 +128,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); */ #define SCHED_FEAT(name, enabled) \ (1UL << __SCHED_FEAT_##name) * enabled | -const_debug unsigned int sysctl_sched_features = +__read_mostly unsigned int sysctl_sched_features = #include "features.h" 0; #undef SCHED_FEAT @@ -143,13 +142,12 @@ const_debug unsigned int sysctl_sched_features = */ __read_mostly int sysctl_resched_latency_warn_ms = 100; __read_mostly int sysctl_resched_latency_warn_once = 1; -#endif /* CONFIG_SCHED_DEBUG */ /* * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. */ -const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK; +__read_mostly unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK; __read_mostly int scheduler_running; @@ -491,6 +489,16 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } #endif /* CONFIG_SCHED_CORE */ +/* need a wrapper since we may need to trace from modules */ +EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp); + +/* Call via the helper macro trace_set_current_state. */ +void __trace_set_current_state(int state_value) +{ + trace_sched_set_state_tp(current, state_value); +} +EXPORT_SYMBOL(__trace_set_current_state); + /* * Serialization rules: * @@ -800,11 +808,10 @@ void update_rq_clock(struct rq *rq) if (rq->clock_update_flags & RQCF_ACT_SKIP) return; -#ifdef CONFIG_SCHED_DEBUG if (sched_feat(WARN_DOUBLE_CLOCK)) - SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED); + WARN_ON_ONCE(rq->clock_update_flags & RQCF_UPDATED); rq->clock_update_flags |= RQCF_UPDATED; -#endif + clock = sched_clock_cpu(cpu_of(rq)); scx_rq_clock_update(rq, clock); @@ -916,8 +923,7 @@ static void hrtick_rq_init(struct rq *rq) #ifdef CONFIG_SMP INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); #endif - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - rq->hrtick_timer.function = hrtick; + hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) @@ -1720,7 +1726,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, bucket = &uc_rq->bucket[uc_se->bucket_id]; - SCHED_WARN_ON(!bucket->tasks); + WARN_ON_ONCE(!bucket->tasks); if (likely(bucket->tasks)) bucket->tasks--; @@ -1740,14 +1746,14 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, * Defensive programming: this should never happen. If it happens, * e.g. due to future modification, warn and fix up the expected value. */ - SCHED_WARN_ON(bucket->value > rq_clamp); + WARN_ON_ONCE(bucket->value > rq_clamp); if (bucket->value >= rq_clamp) { bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value); uclamp_rq_set(rq, clamp_id, bkt_clamp); } } -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { enum uclamp_id clamp_id; @@ -1757,13 +1763,14 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) * The condition is constructed such that a NOP is generated when * sched_uclamp_used is disabled. */ - if (!static_branch_unlikely(&sched_uclamp_used)) + if (!uclamp_is_used()) return; if (unlikely(!p->sched_class->uclamp_enabled)) return; - if (p->se.sched_delayed) + /* Only inc the delayed task which being woken up. */ + if (p->se.sched_delayed && !(flags & ENQUEUE_DELAYED)) return; for_each_clamp_id(clamp_id) @@ -1784,7 +1791,7 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) * The condition is constructed such that a NOP is generated when * sched_uclamp_used is disabled. */ - if (!static_branch_unlikely(&sched_uclamp_used)) + if (!uclamp_is_used()) return; if (unlikely(!p->sched_class->uclamp_enabled)) @@ -1942,12 +1949,12 @@ static int sysctl_sched_uclamp_handler(const struct ctl_table *table, int write, } if (update_root_tg) { - static_branch_enable(&sched_uclamp_used); + sched_uclamp_enable(); uclamp_update_root_tg(); } if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) { - static_branch_enable(&sched_uclamp_used); + sched_uclamp_enable(); uclamp_sync_util_min_rt_default(); } @@ -2031,7 +2038,7 @@ static void __init init_uclamp(void) } #else /* !CONFIG_UCLAMP_TASK */ -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { } static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } static inline void uclamp_fork(struct task_struct *p) { } static inline void uclamp_post_fork(struct task_struct *p) { } @@ -2067,12 +2074,14 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq); - p->sched_class->enqueue_task(rq, p, flags); /* - * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear - * ->sched_delayed. + * Can be before ->enqueue_task() because uclamp considers the + * ENQUEUE_DELAYED task before its ->sched_delayed gets cleared + * in ->enqueue_task(). */ - uclamp_rq_inc(rq, p); + uclamp_rq_inc(rq, p, flags); + + p->sched_class->enqueue_task(rq, p, flags); psi_enqueue(p, flags); @@ -2122,7 +2131,7 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { - SCHED_WARN_ON(flags & DEQUEUE_SLEEP); + WARN_ON_ONCE(flags & DEQUEUE_SLEEP); WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ASSERT_EXCLUSIVE_WRITER(p->on_rq); @@ -2278,6 +2287,12 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state * just go back and repeat. */ rq = task_rq_lock(p, &rf); + /* + * If task is sched_delayed, force dequeue it, to avoid always + * hitting the tick timeout in the queued case + */ + if (p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); trace_sched_wait_task(p); running = task_on_cpu(rq, p); queued = task_on_rq_queued(p); @@ -2727,7 +2742,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) * XXX do further audits, this smells like something putrid. */ if (ctx->flags & SCA_MIGRATE_DISABLE) - SCHED_WARN_ON(!p->on_cpu); + WARN_ON_ONCE(!p->on_cpu); else lockdep_assert_held(&p->pi_lock); @@ -3292,7 +3307,6 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p) void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { -#ifdef CONFIG_SCHED_DEBUG unsigned int state = READ_ONCE(p->__state); /* @@ -3330,7 +3344,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) WARN_ON_ONCE(!cpu_online(new_cpu)); WARN_ON_ONCE(is_migration_disabled(p)); -#endif trace_sched_migrate_task(p, new_cpu); @@ -3349,6 +3362,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) #ifdef CONFIG_NUMA_BALANCING static void __migrate_swap_task(struct task_struct *p, int cpu) { + __schedstat_inc(p->stats.numa_task_swapped); + count_vm_numa_event(NUMA_TASK_SWAP); + count_memcg_event_mm(p->mm, NUMA_TASK_SWAP); + if (task_on_rq_queued(p)) { struct rq *src_rq, *dst_rq; struct rq_flags srf, drf; @@ -3922,13 +3939,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu) static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { - /* - * The BPF scheduler may depend on select_task_rq() being invoked during - * wakeups. In addition, @p may end up executing on a different CPU - * regardless of what happens in the wakeup path making the ttwu_queue - * optimization less meaningful. Skip if on SCX. - */ - if (task_on_scx(p)) + /* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */ + if (!scx_allow_ttwu_queue(p)) return false; /* @@ -4196,7 +4208,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * - we're serialized against set_special_state() by virtue of * it disabling IRQs (this allows not taking ->pi_lock). */ - SCHED_WARN_ON(p->se.sched_delayed); + WARN_ON_ONCE(p->se.sched_delayed); if (!ttwu_state_match(p, state, &success)) goto out; @@ -4490,7 +4502,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) INIT_LIST_HEAD(&p->se.group_node); /* A delayed task cannot be in clone(). */ - SCHED_WARN_ON(p->se.sched_delayed); + WARN_ON_ONCE(p->se.sched_delayed); #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; @@ -5307,6 +5319,12 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) */ finish_task_switch(prev); + /* + * This is a special case: the newly created task has just + * switched the context for the first time. It is returning from + * schedule for the first time in this path. + */ + trace_sched_exit_tp(true, CALLER_ADDR0); preempt_enable(); if (current->set_child_tid) @@ -5578,7 +5596,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -#ifdef CONFIG_SCHED_DEBUG static u64 cpu_resched_latency(struct rq *rq) { int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms); @@ -5623,9 +5640,6 @@ static int __init setup_resched_latency_warn_ms(char *str) return 1; } __setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms); -#else -static inline u64 cpu_resched_latency(struct rq *rq) { return 0; } -#endif /* CONFIG_SCHED_DEBUG */ /* * This function gets called by the timer code, with HZ frequency. @@ -5746,7 +5760,7 @@ static void sched_tick_remote(struct work_struct *work) * we are always sure that there is no proxy (only a * single task is running). */ - SCHED_WARN_ON(rq->curr != rq->donor); + WARN_ON_ONCE(rq->curr != rq->donor); update_rq_clock(rq); if (!is_idle_task(curr)) { @@ -5966,7 +5980,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) preempt_count_set(PREEMPT_DISABLED); } rcu_sleep_check(); - SCHED_WARN_ON(ct_state() == CT_STATE_USER); + WARN_ON_ONCE(ct_state() == CT_STATE_USER); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -6571,12 +6585,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * Otherwise marks the task's __state as RUNNING */ static bool try_to_block_task(struct rq *rq, struct task_struct *p, - unsigned long task_state) + unsigned long *task_state_p) { + unsigned long task_state = *task_state_p; int flags = DEQUEUE_NOCLOCK; if (signal_pending_state(task_state, p)) { WRITE_ONCE(p->__state, TASK_RUNNING); + *task_state_p = TASK_RUNNING; return false; } @@ -6650,12 +6666,15 @@ static void __sched notrace __schedule(int sched_mode) * as a preemption by schedule_debug() and RCU. */ bool preempt = sched_mode > SM_NONE; + bool is_switch = false; unsigned long *switch_count; unsigned long prev_state; struct rq_flags rf; struct rq *rq; int cpu; + trace_sched_entry_tp(preempt, CALLER_ADDR0); + cpu = smp_processor_id(); rq = cpu_rq(cpu); prev = rq->curr; @@ -6665,6 +6684,8 @@ static void __sched notrace __schedule(int sched_mode) if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) hrtick_clear(rq); + klp_sched_try_switch(prev); + local_irq_disable(); rcu_note_context_switch(preempt); @@ -6710,7 +6731,7 @@ static void __sched notrace __schedule(int sched_mode) goto picked; } } else if (!preempt && prev_state) { - try_to_block_task(rq, prev, prev_state); + try_to_block_task(rq, prev, &prev_state); switch_count = &prev->nvcsw; } @@ -6719,11 +6740,10 @@ static void __sched notrace __schedule(int sched_mode) picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); -#ifdef CONFIG_SCHED_DEBUG rq->last_seen_need_resched_ns = 0; -#endif - if (likely(prev != next)) { + is_switch = prev != next; + if (likely(is_switch)) { rq->nr_switches++; /* * RCU users of rcu_dereference(rq->curr) may not see @@ -6768,6 +6788,7 @@ picked: __balance_callbacks(rq); raw_spin_rq_unlock_irq(rq); } + trace_sched_exit_tp(is_switch, CALLER_ADDR0); } void __noreturn do_task_dead(void) @@ -6812,7 +6833,7 @@ static inline void sched_submit_work(struct task_struct *tsk) * deadlock if the callback attempts to acquire a lock which is * already acquired. */ - SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT); + WARN_ON_ONCE(current->__state & TASK_RTLOCK_WAIT); /* * If we are going to sleep and we have plugged IO queued, @@ -7095,7 +7116,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { - WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU)); + WARN_ON_ONCE(wake_flags & ~(WF_SYNC|WF_CURRENT_CPU)); return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); @@ -7285,12 +7306,12 @@ out_unlock: #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) int __sched __cond_resched(void) { - if (should_resched(0)) { + if (should_resched(0) && !irqs_disabled()) { preempt_schedule_common(); return 1; } /* - * In preemptible kernels, ->rcu_read_lock_nesting tells the tick + * In PREEMPT_RCU kernels, ->rcu_read_lock_nesting tells the tick * whether the current CPU is in an RCU read-side critical section, * so the tick can report quiescent states even for CPUs looping * in kernel context. In contrast, in non-preemptible kernels, @@ -7299,6 +7320,8 @@ int __sched __cond_resched(void) * RCU quiescent state. Therefore, the following code causes * cond_resched() to report a quiescent state, but only when RCU * is in urgent need of one. + * A third case, preemptible, but non-PREEMPT_RCU provides for + * urgently needed quiescent states via rcu_flavor_sched_clock_irq(). */ #ifndef CONFIG_PREEMPT_RCU rcu_all_qs(); @@ -7323,7 +7346,6 @@ EXPORT_STATIC_CALL_TRAMP(might_resched); static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); int __sched dynamic_cond_resched(void) { - klp_sched_try_switch(); if (!static_branch_unlikely(&sk_dynamic_cond_resched)) return 0; return __cond_resched(); @@ -7495,7 +7517,6 @@ int sched_dynamic_mode(const char *str) #endif static DEFINE_MUTEX(sched_dynamic_mutex); -static bool klp_override; static void __sched_dynamic_update(int mode) { @@ -7503,8 +7524,7 @@ static void __sched_dynamic_update(int mode) * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in * the ZERO state, which is invalid. */ - if (!klp_override) - preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(might_resched); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); @@ -7513,8 +7533,7 @@ static void __sched_dynamic_update(int mode) switch (mode) { case preempt_dynamic_none: - if (!klp_override) - preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(cond_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); @@ -7525,8 +7544,7 @@ static void __sched_dynamic_update(int mode) break; case preempt_dynamic_voluntary: - if (!klp_override) - preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(might_resched); preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); @@ -7537,8 +7555,7 @@ static void __sched_dynamic_update(int mode) break; case preempt_dynamic_full: - if (!klp_override) - preempt_dynamic_disable(cond_resched); + preempt_dynamic_disable(cond_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); @@ -7549,8 +7566,7 @@ static void __sched_dynamic_update(int mode) break; case preempt_dynamic_lazy: - if (!klp_override) - preempt_dynamic_disable(cond_resched); + preempt_dynamic_disable(cond_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); @@ -7571,36 +7587,6 @@ void sched_dynamic_update(int mode) mutex_unlock(&sched_dynamic_mutex); } -#ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL - -static int klp_cond_resched(void) -{ - __klp_sched_try_switch(); - return __cond_resched(); -} - -void sched_dynamic_klp_enable(void) -{ - mutex_lock(&sched_dynamic_mutex); - - klp_override = true; - static_call_update(cond_resched, klp_cond_resched); - - mutex_unlock(&sched_dynamic_mutex); -} - -void sched_dynamic_klp_disable(void) -{ - mutex_lock(&sched_dynamic_mutex); - - klp_override = false; - __sched_dynamic_update(preempt_dynamic_mode); - - mutex_unlock(&sched_dynamic_mutex); -} - -#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ - static int __init setup_preempt_mode(char *str) { int mode = sched_dynamic_mode(str); @@ -7647,10 +7633,57 @@ PREEMPT_MODEL_ACCESSOR(lazy); #else /* !CONFIG_PREEMPT_DYNAMIC: */ +#define preempt_dynamic_mode -1 + static inline void preempt_dynamic_init(void) { } #endif /* CONFIG_PREEMPT_DYNAMIC */ +const char *preempt_modes[] = { + "none", "voluntary", "full", "lazy", NULL, +}; + +const char *preempt_model_str(void) +{ + bool brace = IS_ENABLED(CONFIG_PREEMPT_RT) && + (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC) || + IS_ENABLED(CONFIG_PREEMPT_LAZY)); + static char buf[128]; + + if (IS_ENABLED(CONFIG_PREEMPT_BUILD)) { + struct seq_buf s; + + seq_buf_init(&s, buf, sizeof(buf)); + seq_buf_puts(&s, "PREEMPT"); + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + seq_buf_printf(&s, "%sRT%s", + brace ? "_{" : "_", + brace ? "," : ""); + + if (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC)) { + seq_buf_printf(&s, "(%s)%s", + preempt_dynamic_mode > 0 ? + preempt_modes[preempt_dynamic_mode] : "undef", + brace ? "}" : ""); + return seq_buf_str(&s); + } + + if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) { + seq_buf_printf(&s, "LAZY%s", + brace ? "}" : ""); + return seq_buf_str(&s); + } + + return seq_buf_str(&s); + } + + if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BUILD)) + return "VOLUNTARY"; + + return "NONE"; +} + int io_schedule_prepare(void) { int old_iowait = current->in_iowait; @@ -7765,10 +7798,9 @@ void show_state_filter(unsigned int state_filter) sched_show_task(p); } -#ifdef CONFIG_SCHED_DEBUG if (!state_filter) sysrq_sched_debug_show(); -#endif + rcu_read_unlock(); /* * Only show locks if all tasks are dumped: @@ -7902,8 +7934,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu) if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) return -EINVAL; - /* TODO: This is not properly updating schedstats */ - + __schedstat_inc(p->stats.numa_task_migrated); + count_vm_numa_event(NUMA_TASK_MIGRATE); + count_memcg_event_mm(p->mm, NUMA_TASK_MIGRATE); trace_sched_move_numa(p, curr_cpu, target_cpu); return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); } @@ -8183,7 +8216,7 @@ static void cpuset_cpu_active(void) * operation in the resume sequence, just build a single sched * domain, ignoring cpusets. */ - partition_sched_domains(1, NULL, NULL); + cpuset_reset_sched_domains(); if (--num_cpus_frozen) return; /* @@ -8202,7 +8235,7 @@ static void cpuset_cpu_inactive(unsigned int cpu) cpuset_update_active_cpus(); } else { num_cpus_frozen++; - partition_sched_domains(1, NULL, NULL); + cpuset_reset_sched_domains(); } } @@ -8424,9 +8457,9 @@ void __init sched_init_smp(void) * CPU masks are stable and all blatant races in the below code cannot * happen. */ - mutex_lock(&sched_domains_mutex); + sched_domains_mutex_lock(); sched_init_domains(cpu_active_mask); - mutex_unlock(&sched_domains_mutex); + sched_domains_mutex_unlock(); /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0) @@ -8512,7 +8545,7 @@ void __init sched_init(void) init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_EXT_GROUP_SCHED - root_task_group.scx_weight = CGROUP_WEIGHT_DFL; + scx_tg_init(&root_task_group); #endif /* CONFIG_EXT_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; @@ -8952,7 +8985,7 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; - scx_group_set_weight(tg, CGROUP_WEIGHT_DFL); + scx_tg_init(tg); alloc_uclamp_sched_group(tg, parent); return tg; @@ -8967,7 +9000,7 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) unsigned long flags; spin_lock_irqsave(&task_group_lock, flags); - list_add_rcu(&tg->list, &task_groups); + list_add_tail_rcu(&tg->list, &task_groups); /* Root should already exist: */ WARN_ON(!parent); @@ -9016,7 +9049,7 @@ void sched_release_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -static struct task_group *sched_get_task_group(struct task_struct *tsk) +static void sched_change_group(struct task_struct *tsk) { struct task_group *tg; @@ -9028,13 +9061,7 @@ static struct task_group *sched_get_task_group(struct task_struct *tsk) tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), struct task_group, css); tg = autogroup_task_group(tsk, tg); - - return tg; -} - -static void sched_change_group(struct task_struct *tsk, struct task_group *group) -{ - tsk->sched_task_group = group; + tsk->sched_task_group = tg; #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) @@ -9055,20 +9082,11 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup) { int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; - struct task_group *group; struct rq *rq; CLASS(task_rq_lock, rq_guard)(tsk); rq = rq_guard.rq; - /* - * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous - * group changes. - */ - group = sched_get_task_group(tsk); - if (group == tsk->sched_task_group) - return; - update_rq_clock(rq); running = task_current_donor(rq, tsk); @@ -9079,7 +9097,7 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup) if (running) put_prev_task(rq, tsk); - sched_change_group(tsk, group); + sched_change_group(tsk); if (!for_autogroup) scx_cgroup_move_task(tsk); @@ -9168,11 +9186,15 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) struct task_struct *task; struct cgroup_subsys_state *css; + if (!rt_group_sched_enabled()) + goto scx_check; + cgroup_taskset_for_each(task, css, tset) { if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; } -#endif +scx_check: +#endif /* CONFIG_RT_GROUP_SCHED */ return scx_cgroup_can_attach(tset); } @@ -9203,7 +9225,7 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css) unsigned int clamps; lockdep_assert_held(&uclamp_mutex); - SCHED_WARN_ON(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held()); css_for_each_descendant_pre(css, top_css) { uc_parent = css_tg(css)->parent @@ -9295,7 +9317,7 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, if (req.ret) return req.ret; - static_branch_enable(&sched_uclamp_used); + sched_uclamp_enable(); guard(mutex)(&uclamp_mutex); guard(rcu)(); @@ -9825,18 +9847,6 @@ static struct cftype cpu_legacy_files[] = { .seq_show = cpu_cfs_local_stat_show, }, #endif -#ifdef CONFIG_RT_GROUP_SCHED - { - .name = "rt_runtime_us", - .read_s64 = cpu_rt_runtime_read, - .write_s64 = cpu_rt_runtime_write, - }, - { - .name = "rt_period_us", - .read_u64 = cpu_rt_period_read_uint, - .write_u64 = cpu_rt_period_write_uint, - }, -#endif #ifdef CONFIG_UCLAMP_TASK_GROUP { .name = "uclamp.min", @@ -9854,6 +9864,55 @@ static struct cftype cpu_legacy_files[] = { { } /* Terminate */ }; +#ifdef CONFIG_RT_GROUP_SCHED +static struct cftype rt_group_files[] = { + { + .name = "rt_runtime_us", + .read_s64 = cpu_rt_runtime_read, + .write_s64 = cpu_rt_runtime_write, + }, + { + .name = "rt_period_us", + .read_u64 = cpu_rt_period_read_uint, + .write_u64 = cpu_rt_period_write_uint, + }, + { } /* Terminate */ +}; + +# ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED +DEFINE_STATIC_KEY_FALSE(rt_group_sched); +# else +DEFINE_STATIC_KEY_TRUE(rt_group_sched); +# endif + +static int __init setup_rt_group_sched(char *str) +{ + long val; + + if (kstrtol(str, 0, &val) || val < 0 || val > 1) { + pr_warn("Unable to set rt_group_sched\n"); + return 1; + } + if (val) + static_branch_enable(&rt_group_sched); + else + static_branch_disable(&rt_group_sched); + + return 1; +} +__setup("rt_group_sched=", setup_rt_group_sched); + +static int __init cpu_rt_group_init(void) +{ + if (!rt_group_sched_enabled()) + return 0; + + WARN_ON(cgroup_add_legacy_cftypes(&cpu_cgrp_subsys, rt_group_files)); + return 0; +} +subsys_initcall(cpu_rt_group_init); +#endif /* CONFIG_RT_GROUP_SCHED */ + static int cpu_extra_stat_show(struct seq_file *sf, struct cgroup_subsys_state *css) { @@ -10538,7 +10597,7 @@ static void task_mm_cid_work(struct callback_head *work) struct mm_struct *mm; int weight, cpu; - SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work)); + WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work)); work->next = work; /* Prevent double-add */ if (t->flags & PF_EXITING) @@ -10667,7 +10726,6 @@ void sched_mm_cid_after_execve(struct task_struct *t) smp_mb(); t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm); } - rseq_set_notify_resume(t); } void sched_mm_cid_fork(struct task_struct *t) diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 1ef98a93eb1d..c4606ca89210 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -65,7 +65,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p, * a cookie until after we've removed it, we must have core scheduling * enabled here. */ - SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq)); + WARN_ON_ONCE((p->core_cookie || cookie) && !sched_core_enabled(rq)); if (sched_core_enqueued(p)) sched_core_dequeue(rq, p, DEQUEUE_SAVE); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 1a19d69b91ed..461242ec958a 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -81,9 +81,23 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) if (!cpufreq_this_cpu_can_update(sg_policy->policy)) return false; - if (unlikely(sg_policy->limits_changed)) { - sg_policy->limits_changed = false; - sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); + if (unlikely(READ_ONCE(sg_policy->limits_changed))) { + WRITE_ONCE(sg_policy->limits_changed, false); + sg_policy->need_freq_update = true; + + /* + * The above limits_changed update must occur before the reads + * of policy limits in cpufreq_driver_resolve_freq() or a policy + * limits update might be missed, so use a memory barrier to + * ensure it. + * + * This pairs with the write memory barrier in sugov_limits(). + */ + smp_mb(); + + return true; + } else if (sg_policy->need_freq_update) { + /* ignore_dl_rate_limit() wants a new frequency to be found. */ return true; } @@ -95,10 +109,22 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { - if (sg_policy->need_freq_update) + if (sg_policy->need_freq_update) { sg_policy->need_freq_update = false; - else if (sg_policy->next_freq == next_freq) + /* + * The policy limits have changed, but if the return value of + * cpufreq_driver_resolve_freq() after applying the new limits + * is still equal to the previously selected frequency, the + * driver callback need not be invoked unless the driver + * specifically wants that to happen on every update of the + * policy limits. + */ + if (sg_policy->next_freq == next_freq && + !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS)) + return false; + } else if (sg_policy->next_freq == next_freq) { return false; + } sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; @@ -365,7 +391,7 @@ static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; } static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) { if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_min) - sg_cpu->sg_policy->limits_changed = true; + sg_cpu->sg_policy->need_freq_update = true; } static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, @@ -604,7 +630,7 @@ static const struct kobj_type sugov_tunables_ktype = { /********************** cpufreq governor interface *********************/ -struct cpufreq_governor schedutil_gov; +static struct cpufreq_governor schedutil_gov; static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) { @@ -871,10 +897,19 @@ static void sugov_limits(struct cpufreq_policy *policy) mutex_unlock(&sg_policy->work_lock); } - sg_policy->limits_changed = true; + /* + * The limits_changed update below must take place before the updates + * of policy limits in cpufreq_set_policy() or a policy limits update + * might be missed, so use a memory barrier to ensure it. + * + * This pairs with the memory barrier in sugov_should_update_freq(). + */ + smp_wmb(); + + WRITE_ONCE(sg_policy->limits_changed, true); } -struct cpufreq_governor schedutil_gov = { +static struct cpufreq_governor schedutil_gov = { .name = "schedutil", .owner = THIS_MODULE, .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING, @@ -892,4 +927,9 @@ struct cpufreq_governor *cpufreq_default_governor(void) } #endif +bool sugov_is_governor(struct cpufreq_policy *policy) +{ + return policy->governor == &schedutil_gov; +} + cpufreq_governor_init(schedutil_gov); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5d9143dd0879..6dab4854c6c0 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -9,8 +9,6 @@ #ifdef CONFIG_IRQ_TIME_ACCOUNTING -DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime); - /* * There are no locks covering percpu hardirq/softirq time. * They are only modified in vtime_account, on corresponding CPU @@ -24,14 +22,16 @@ DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime); */ DEFINE_PER_CPU(struct irqtime, cpu_irqtime); +int sched_clock_irqtime; + void enable_sched_clock_irqtime(void) { - static_branch_enable(&sched_clock_irqtime); + sched_clock_irqtime = 1; } void disable_sched_clock_irqtime(void) { - static_branch_disable(&sched_clock_irqtime); + sched_clock_irqtime = 0; } static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 38e4537790af..ad45a8fea245 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -166,14 +166,14 @@ static inline unsigned long dl_bw_capacity(int i) } } -static inline bool dl_bw_visited(int cpu, u64 gen) +bool dl_bw_visited(int cpu, u64 cookie) { struct root_domain *rd = cpu_rq(cpu)->rd; - if (rd->visit_gen == gen) + if (rd->visit_cookie == cookie) return true; - rd->visit_gen = gen; + rd->visit_cookie = cookie; return false; } @@ -207,7 +207,7 @@ static inline unsigned long dl_bw_capacity(int i) return SCHED_CAPACITY_SCALE; } -static inline bool dl_bw_visited(int cpu, u64 gen) +bool dl_bw_visited(int cpu, u64 cookie) { return false; } @@ -249,8 +249,8 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw += dl_bw; - SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ - SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); + WARN_ON_ONCE(dl_rq->running_bw < old); /* overflow */ + WARN_ON_ONCE(dl_rq->running_bw > dl_rq->this_bw); /* kick cpufreq (see the comment in kernel/sched/sched.h). */ cpufreq_update_util(rq_of_dl_rq(dl_rq), 0); } @@ -262,7 +262,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw -= dl_bw; - SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ + WARN_ON_ONCE(dl_rq->running_bw > old); /* underflow */ if (dl_rq->running_bw > old) dl_rq->running_bw = 0; /* kick cpufreq (see the comment in kernel/sched/sched.h). */ @@ -276,7 +276,7 @@ void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw += dl_bw; - SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */ + WARN_ON_ONCE(dl_rq->this_bw < old); /* overflow */ } static inline @@ -286,10 +286,10 @@ void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw -= dl_bw; - SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */ + WARN_ON_ONCE(dl_rq->this_bw > old); /* underflow */ if (dl_rq->this_bw > old) dl_rq->this_bw = 0; - SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); + WARN_ON_ONCE(dl_rq->running_bw > dl_rq->this_bw); } static inline @@ -1382,8 +1382,7 @@ static void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - timer->function = dl_task_timer; + hrtimer_setup(timer, dl_task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } /* @@ -1839,8 +1838,7 @@ static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->inactive_timer; - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - timer->function = inactive_task_timer; + hrtimer_setup(timer, inactive_task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } #define __node_2_dle(node) \ @@ -2956,7 +2954,7 @@ void dl_add_task_root_domain(struct task_struct *p) struct dl_bw *dl_b; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); - if (!dl_task(p)) { + if (!dl_task(p) || dl_entity_is_special(&p->dl)) { raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); return; } @@ -2981,18 +2979,22 @@ void dl_clear_root_domain(struct root_domain *rd) rd->dl_bw.total_bw = 0; /* - * dl_server bandwidth is only restored when CPUs are attached to root - * domains (after domains are created or CPUs moved back to the - * default root doamin). + * dl_servers are not tasks. Since dl_add_task_root_domain ignores + * them, we need to account for them here explicitly. */ for_each_cpu(i, rd->span) { struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server; if (dl_server(dl_se) && cpu_active(i)) - rd->dl_bw.total_bw += dl_se->dl_bw; + __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(i)); } } +void dl_clear_root_domain_cpu(int cpu) +{ + dl_clear_root_domain(cpu_rq(cpu)->rd); +} + #endif /* CONFIG_SMP */ static void switched_from_dl(struct rq *rq, struct task_struct *p) @@ -3171,15 +3173,18 @@ DEFINE_SCHED_CLASS(dl) = { #endif }; -/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */ -static u64 dl_generation; +/* + * Used for dl_bw check and update, used under sched_rt_handler()::mutex and + * sched_domains_mutex. + */ +u64 dl_cookie; int sched_dl_global_validate(void) { u64 runtime = global_rt_runtime(); u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); - u64 gen = ++dl_generation; + u64 cookie = ++dl_cookie; struct dl_bw *dl_b; int cpu, cpus, ret = 0; unsigned long flags; @@ -3189,10 +3194,10 @@ int sched_dl_global_validate(void) * value smaller than the currently allocated bandwidth in * any of the root_domains. */ - for_each_possible_cpu(cpu) { + for_each_online_cpu(cpu) { rcu_read_lock_sched(); - if (dl_bw_visited(cpu, gen)) + if (dl_bw_visited(cpu, cookie)) goto next; dl_b = dl_bw_of(cpu); @@ -3229,7 +3234,7 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) void sched_dl_do_global(void) { u64 new_bw = -1; - u64 gen = ++dl_generation; + u64 cookie = ++dl_cookie; struct dl_bw *dl_b; int cpu; unsigned long flags; @@ -3240,7 +3245,7 @@ void sched_dl_do_global(void) for_each_possible_cpu(cpu) { rcu_read_lock_sched(); - if (dl_bw_visited(cpu, gen)) { + if (dl_bw_visited(cpu, cookie)) { rcu_read_unlock_sched(); continue; } @@ -3567,9 +3572,7 @@ void dl_bw_free(int cpu, u64 dl_bw) } #endif -#ifdef CONFIG_SCHED_DEBUG void print_dl_stats(struct seq_file *m, int cpu) { print_dl_rq(m, cpu, &cpu_rq(cpu)->dl); } -#endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index ef047add7f9e..9d71baf08075 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -244,11 +244,13 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, static int sched_dynamic_show(struct seq_file *m, void *v) { - static const char * preempt_modes[] = { - "none", "voluntary", "full", "lazy", - }; - int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY); int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2; + int j; + + /* Count entries in NULL terminated preempt_modes */ + for (j = 0; preempt_modes[j]; j++) + ; + j -= !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY); for (; i < j; i++) { if (preempt_dynamic_mode == i) @@ -292,7 +294,7 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf, bool orig; cpus_read_lock(); - mutex_lock(&sched_domains_mutex); + sched_domains_mutex_lock(); orig = sched_debug_verbose; result = debugfs_write_file_bool(filp, ubuf, cnt, ppos); @@ -304,7 +306,7 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf, sd_dentry = NULL; } - mutex_unlock(&sched_domains_mutex); + sched_domains_mutex_unlock(); cpus_read_unlock(); return result; @@ -515,9 +517,9 @@ static __init int sched_init_debug(void) debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); - mutex_lock(&sched_domains_mutex); + sched_domains_mutex_lock(); update_sched_domain_debugfs(); - mutex_unlock(&sched_domains_mutex); + sched_domains_mutex_unlock(); #endif #ifdef CONFIG_NUMA_BALANCING @@ -586,6 +588,10 @@ static void register_sd(struct sched_domain *sd, struct dentry *parent) debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops); debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops); debugfs_create_u32("level", 0444, parent, (u32 *)&sd->level); + + if (sd->flags & SD_ASYM_PACKING) + debugfs_create_u32("group_asym_prefer_cpu", 0444, parent, + (u32 *)&sd->groups->asym_prefer_cpu); } void update_sched_domain_debugfs(void) @@ -1204,6 +1210,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P_SCHEDSTAT(nr_failed_migrations_running); P_SCHEDSTAT(nr_failed_migrations_hot); P_SCHEDSTAT(nr_forced_migrations); +#ifdef CONFIG_NUMA_BALANCING + P_SCHEDSTAT(numa_task_migrated); + P_SCHEDSTAT(numa_task_swapped); +#endif P_SCHEDSTAT(nr_wakeups); P_SCHEDSTAT(nr_wakeups_sync); P_SCHEDSTAT(nr_wakeups_migrate); diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 5a81d9a1e31f..b498d867ba21 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6,6 +6,9 @@ * Copyright (c) 2022 Tejun Heo <tj@kernel.org> * Copyright (c) 2022 David Vernet <dvernet@meta.com> */ +#include <linux/btf_ids.h> +#include "ext_idle.h" + #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) enum scx_consts { @@ -23,7 +26,7 @@ enum scx_consts { * Iterating all tasks may take a while. Periodically drop * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. */ - SCX_OPS_TASK_ITER_BATCH = 32, + SCX_TASK_ITER_BATCH = 32, }; enum scx_exit_kind { @@ -41,9 +44,9 @@ enum scx_exit_kind { }; /* - * An exit code can be specified when exiting with scx_bpf_exit() or - * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN - * respectively. The codes are 64bit of the format: + * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(), + * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes + * are 64bit of the format: * * Bits: [63 .. 48 47 .. 32 31 .. 0] * [ SYS ACT ] [ SYS RSN ] [ USR ] @@ -93,7 +96,7 @@ enum scx_ops_flags { /* * Keep built-in idle tracking even if ops.update_idle() is implemented. */ - SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, + SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, /* * By default, if there are no other task to run on the CPU, ext core @@ -101,7 +104,7 @@ enum scx_ops_flags { * flag is specified, such tasks are passed to ops.enqueue() with * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. */ - SCX_OPS_ENQ_LAST = 1LLU << 1, + SCX_OPS_ENQ_LAST = 1LLU << 1, /* * An exiting task may schedule after PF_EXITING is set. In such cases, @@ -114,13 +117,13 @@ enum scx_ops_flags { * depend on pid lookups and wants to handle these tasks directly, the * following flag can be used. */ - SCX_OPS_ENQ_EXITING = 1LLU << 2, + SCX_OPS_ENQ_EXITING = 1LLU << 2, /* * If set, only tasks with policy set to SCHED_EXT are attached to * sched_ext. If clear, SCHED_NORMAL tasks are also included. */ - SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, + SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, /* * A migration disabled task can only execute on its current CPU. By @@ -133,19 +136,48 @@ enum scx_ops_flags { * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr * and thus may disagree with cpumask_weight(p->cpus_ptr). */ - SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, + SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, + + /* + * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes + * ops.enqueue() on the ops.select_cpu() selected or the wakee's + * previous CPU via IPI (inter-processor interrupt) to reduce cacheline + * transfers. When this optimization is enabled, ops.select_cpu() is + * skipped in some cases (when racing against the wakee switching out). + * As the BPF scheduler may depend on ops.select_cpu() being invoked + * during wakeups, queued wakeup is disabled by default. + * + * If this ops flag is set, queued wakeup optimization is enabled and + * the BPF scheduler must be able to handle ops.enqueue() invoked on the + * wakee's CPU without preceding ops.select_cpu() even for tasks which + * may be executed on multiple CPUs. + */ + SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, + + /* + * If set, enable per-node idle cpumasks. If clear, use a single global + * flat idle cpumask. + */ + SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, /* * CPU cgroup support flags */ - SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* cpu.weight */ + SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */ - SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | - SCX_OPS_ENQ_LAST | - SCX_OPS_ENQ_EXITING | - SCX_OPS_ENQ_MIGRATION_DISABLED | - SCX_OPS_SWITCH_PARTIAL | - SCX_OPS_HAS_CGROUP_WEIGHT, + SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | + SCX_OPS_ENQ_LAST | + SCX_OPS_ENQ_EXITING | + SCX_OPS_ENQ_MIGRATION_DISABLED | + SCX_OPS_ALLOW_QUEUED_WAKEUP | + SCX_OPS_SWITCH_PARTIAL | + SCX_OPS_BUILTIN_IDLE_PER_NODE | + SCX_OPS_HAS_CGROUP_WEIGHT, + + /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */ + __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56, + + SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56, }; /* argument container for ops.init_task() */ @@ -341,6 +373,15 @@ struct sched_ext_ops { * @running: A task is starting to run on its associated CPU * @p: task starting to run * + * Note that this callback may be called from a CPU other than the + * one the task is going to run on. This can happen when a task + * property is changed (i.e., affinity), since scx_next_task_scx(), + * which triggers this callback, may run on a CPU different from + * the task's assigned CPU. + * + * Therefore, always use scx_bpf_task_cpu(@p) to determine the + * target CPU the task is going to use. + * * See ->runnable() for explanation on the task state notifiers. */ void (*running)(struct task_struct *p); @@ -350,6 +391,15 @@ struct sched_ext_ops { * @p: task stopping to run * @runnable: is task @p still runnable? * + * Note that this callback may be called from a CPU other than the + * one the task was running on. This can happen when a task + * property is changed (i.e., affinity), since dequeue_task_scx(), + * which triggers this callback, may run on a CPU different from + * the task's assigned CPU. + * + * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU + * the task was running on. + * * See ->runnable() for explanation on the task state notifiers. If * !@runnable, ->quiescent() will be invoked after this operation * returns. @@ -438,6 +488,7 @@ struct sched_ext_ops { * idle CPU tracking and the following helpers become unavailable: * * - scx_bpf_select_cpu_dfl() + * - scx_bpf_select_cpu_and() * - scx_bpf_test_and_clear_cpu_idle() * - scx_bpf_pick_idle_cpu() * @@ -701,6 +752,9 @@ struct sched_ext_ops { * BPF scheduler is enabled. */ char name[SCX_OPS_NAME_LEN]; + + /* internal use only, must be NULL */ + void *priv; }; enum scx_opi { @@ -712,6 +766,98 @@ enum scx_opi { SCX_OPI_END = SCX_OP_IDX(init), }; +/* + * Collection of event counters. Event types are placed in descending order. + */ +struct scx_event_stats { + /* + * If ops.select_cpu() returns a CPU which can't be used by the task, + * the core scheduler code silently picks a fallback CPU. + */ + s64 SCX_EV_SELECT_CPU_FALLBACK; + + /* + * When dispatching to a local DSQ, the CPU may have gone offline in + * the meantime. In this case, the task is bounced to the global DSQ. + */ + s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; + + /* + * If SCX_OPS_ENQ_LAST is not set, the number of times that a task + * continued to run because there were no other tasks on the CPU. + */ + s64 SCX_EV_DISPATCH_KEEP_LAST; + + /* + * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task + * is dispatched to a local DSQ when exiting. + */ + s64 SCX_EV_ENQ_SKIP_EXITING; + + /* + * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a + * migration disabled task skips ops.enqueue() and is dispatched to its + * local DSQ. + */ + s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; + + /* + * Total number of times a task's time slice was refilled with the + * default value (SCX_SLICE_DFL). + */ + s64 SCX_EV_REFILL_SLICE_DFL; + + /* + * The total duration of bypass modes in nanoseconds. + */ + s64 SCX_EV_BYPASS_DURATION; + + /* + * The number of tasks dispatched in the bypassing mode. + */ + s64 SCX_EV_BYPASS_DISPATCH; + + /* + * The number of times the bypassing mode has been activated. + */ + s64 SCX_EV_BYPASS_ACTIVATE; +}; + +struct scx_sched { + struct sched_ext_ops ops; + DECLARE_BITMAP(has_op, SCX_OPI_END); + + /* + * Dispatch queues. + * + * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. + * This is to avoid live-locking in bypass mode where all tasks are + * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If + * per-node split isn't sufficient, it can be further split. + */ + struct rhashtable dsq_hash; + struct scx_dispatch_q **global_dsqs; + + /* + * The event counters are in a per-CPU variable to minimize the + * accounting overhead. A system-wide view on the event counter is + * constructed when requested by scx_bpf_events(). + */ + struct scx_event_stats __percpu *event_stats_cpu; + + bool warned_zero_slice; + + atomic_t exit_kind; + struct scx_exit_info *exit_info; + + struct kobject kobj; + + struct kthread_worker *helper; + struct irq_work error_irq_work; + struct kthread_work disable_work; + struct rcu_work rcu_work; +}; + enum scx_wake_flags { /* expose select WF_* flags as enums */ SCX_WAKE_FORK = WF_FORK, @@ -779,6 +925,7 @@ enum scx_deq_flags { enum scx_pick_idle_cpu_flags { SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ + SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ }; enum scx_kick_flags { @@ -810,18 +957,18 @@ enum scx_tg_flags { SCX_TG_INITED = 1U << 1, }; -enum scx_ops_enable_state { - SCX_OPS_ENABLING, - SCX_OPS_ENABLED, - SCX_OPS_DISABLING, - SCX_OPS_DISABLED, +enum scx_enable_state { + SCX_ENABLING, + SCX_ENABLED, + SCX_DISABLING, + SCX_DISABLED, }; -static const char *scx_ops_enable_state_str[] = { - [SCX_OPS_ENABLING] = "enabling", - [SCX_OPS_ENABLED] = "enabled", - [SCX_OPS_DISABLING] = "disabling", - [SCX_OPS_DISABLED] = "disabled", +static const char *scx_enable_state_str[] = { + [SCX_ENABLING] = "enabling", + [SCX_ENABLED] = "enabled", + [SCX_DISABLING] = "disabling", + [SCX_DISABLED] = "disabled", }; /* @@ -870,6 +1017,16 @@ enum scx_ops_state { #define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) /* + * NOTE: sched_ext is in the process of growing multiple scheduler support and + * scx_root usage is in a transitional state. Naked dereferences are safe if the + * caller is one of the tasks attached to SCX and explicit RCU dereference is + * necessary otherwise. Naked scx_root dereferences trigger sparse warnings but + * are used as temporary markers to indicate that the dereferences need to be + * updated to point to the associated scheduler instances rather than scx_root. + */ +static struct scx_sched __rcu *scx_root; + +/* * During exit, a task may schedule after losing its PIDs. When disabling the * BPF scheduler, we need to be able to iterate tasks in every state to * guarantee system safety. Maintain a dedicated task list which contains every @@ -879,38 +1036,17 @@ static DEFINE_SPINLOCK(scx_tasks_lock); static LIST_HEAD(scx_tasks); /* ops enable/disable */ -static struct kthread_worker *scx_ops_helper; -static DEFINE_MUTEX(scx_ops_enable_mutex); -DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); +static DEFINE_MUTEX(scx_enable_mutex); +DEFINE_STATIC_KEY_FALSE(__scx_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); -static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); +static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); static unsigned long scx_in_softlockup; -static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0); -static int scx_ops_bypass_depth; -static bool scx_ops_init_task_enabled; +static atomic_t scx_breather_depth = ATOMIC_INIT(0); +static int scx_bypass_depth; +static bool scx_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); -static struct sched_ext_ops scx_ops; -static bool scx_warned_zero_slice; - -static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); -static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); -static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled); -static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); -static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); - -#ifdef CONFIG_SMP -static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); -static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); -#endif - -static struct static_key_false scx_has_op[SCX_OPI_END] = - { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; - -static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); -static struct scx_exit_info *scx_exit_info; - static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); @@ -924,7 +1060,7 @@ static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0); /* * The maximum amount of time in jiffies that a task may be runnable without * being scheduled on a CPU. If this timeout is exceeded, it will trigger - * scx_ops_error(). + * scx_error(). */ static unsigned long scx_watchdog_timeout; @@ -938,21 +1074,6 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; static struct delayed_work scx_watchdog_work; -/* idle tracking */ -#ifdef CONFIG_SMP -#ifdef CONFIG_CPUMASK_OFFSTACK -#define CL_ALIGNED_IF_ONSTACK -#else -#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp -#endif - -static struct { - cpumask_var_t cpu; - cpumask_var_t smt; -} idle_masks CL_ALIGNED_IF_ONSTACK; - -#endif /* CONFIG_SMP */ - /* for %SCX_KICK_WAIT */ static unsigned long __percpu *scx_kick_cpus_pnt_seqs; @@ -965,23 +1086,12 @@ static unsigned long __percpu *scx_kick_cpus_pnt_seqs; */ static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); -/* - * Dispatch queues. - * - * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is - * to avoid live-locking in bypass mode where all tasks are dispatched to - * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't - * sufficient, it can be further split. - */ -static struct scx_dispatch_q **global_dsqs; - static const struct rhashtable_params dsq_hash_params = { .key_len = sizeof_field(struct scx_dispatch_q, id), .key_offset = offsetof(struct scx_dispatch_q, id), .head_offset = offsetof(struct scx_dispatch_q, hash_node), }; -static struct rhashtable dsq_hash; static LLIST_HEAD(dsqs_to_free); /* dispatch buf */ @@ -1028,27 +1138,46 @@ static struct scx_dump_data scx_dump_data = { /* /sys/kernel/sched_ext interface */ static struct kset *scx_kset; -static struct kobject *scx_root_kobj; #define CREATE_TRACE_POINTS #include <trace/events/sched_ext.h> static void process_ddsp_deferred_locals(struct rq *rq); static void scx_bpf_kick_cpu(s32 cpu, u64 flags); -static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, - s64 exit_code, - const char *fmt, ...); +static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, + s64 exit_code, const char *fmt, va_list args); -#define scx_ops_error_kind(err, fmt, args...) \ - scx_ops_exit_kind((err), 0, fmt, ##args) +static __printf(4, 5) void scx_exit(struct scx_sched *sch, + enum scx_exit_kind kind, s64 exit_code, + const char *fmt, ...) +{ + va_list args; -#define scx_ops_exit(code, fmt, args...) \ - scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, (code), fmt, ##args) + va_start(args, fmt); + scx_vexit(sch, kind, exit_code, fmt, args); + va_end(args); +} + +static __printf(3, 4) void scx_kf_exit(enum scx_exit_kind kind, s64 exit_code, + const char *fmt, ...) +{ + struct scx_sched *sch; + va_list args; + + rcu_read_lock(); + sch = rcu_dereference(scx_root); + if (sch) { + va_start(args, fmt); + scx_vexit(sch, kind, exit_code, fmt, args); + va_end(args); + } + rcu_read_unlock(); +} -#define scx_ops_error(fmt, args...) \ - scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args) +#define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args) +#define scx_kf_error(fmt, args...) scx_kf_exit(SCX_EXIT_ERROR, 0, fmt, ##args) -#define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)]) +#define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op) static long jiffies_delta_msecs(unsigned long at, unsigned long now) { @@ -1078,12 +1207,14 @@ static bool u32_before(u32 a, u32 b) static struct scx_dispatch_q *find_global_dsq(struct task_struct *p) { - return global_dsqs[cpu_to_node(task_cpu(p))]; + struct scx_sched *sch = scx_root; + + return sch->global_dsqs[cpu_to_node(task_cpu(p))]; } -static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) +static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id) { - return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); + return rhashtable_lookup_fast(&sch->dsq_hash, &dsq_id, dsq_hash_params); } /* @@ -1110,27 +1241,61 @@ static void scx_kf_disallow(u32 mask) current->scx.kf_mask &= ~mask; } -#define SCX_CALL_OP(mask, op, args...) \ +/* + * Track the rq currently locked. + * + * This allows kfuncs to safely operate on rq from any scx ops callback, + * knowing which rq is already locked. + */ +static DEFINE_PER_CPU(struct rq *, locked_rq); + +static inline void update_locked_rq(struct rq *rq) +{ + /* + * Check whether @rq is actually locked. This can help expose bugs + * or incorrect assumptions about the context in which a kfunc or + * callback is executed. + */ + if (rq) + lockdep_assert_rq_held(rq); + __this_cpu_write(locked_rq, rq); +} + +/* + * Return the rq currently locked from an scx callback, or NULL if no rq is + * locked. + */ +static inline struct rq *scx_locked_rq(void) +{ + return __this_cpu_read(locked_rq); +} + +#define SCX_CALL_OP(sch, mask, op, rq, args...) \ do { \ + update_locked_rq(rq); \ if (mask) { \ scx_kf_allow(mask); \ - scx_ops.op(args); \ + (sch)->ops.op(args); \ scx_kf_disallow(mask); \ } else { \ - scx_ops.op(args); \ + (sch)->ops.op(args); \ } \ + update_locked_rq(NULL); \ } while (0) -#define SCX_CALL_OP_RET(mask, op, args...) \ +#define SCX_CALL_OP_RET(sch, mask, op, rq, args...) \ ({ \ - __typeof__(scx_ops.op(args)) __ret; \ + __typeof__((sch)->ops.op(args)) __ret; \ + \ + update_locked_rq(rq); \ if (mask) { \ scx_kf_allow(mask); \ - __ret = scx_ops.op(args); \ + __ret = (sch)->ops.op(args); \ scx_kf_disallow(mask); \ } else { \ - __ret = scx_ops.op(args); \ + __ret = (sch)->ops.op(args); \ } \ + update_locked_rq(NULL); \ __ret; \ }) @@ -1145,31 +1310,31 @@ do { \ * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on * the specific task. */ -#define SCX_CALL_OP_TASK(mask, op, task, args...) \ +#define SCX_CALL_OP_TASK(sch, mask, op, rq, task, args...) \ do { \ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ current->scx.kf_tasks[0] = task; \ - SCX_CALL_OP(mask, op, task, ##args); \ + SCX_CALL_OP((sch), mask, op, rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ } while (0) -#define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \ +#define SCX_CALL_OP_TASK_RET(sch, mask, op, rq, task, args...) \ ({ \ - __typeof__(scx_ops.op(task, ##args)) __ret; \ + __typeof__((sch)->ops.op(task, ##args)) __ret; \ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ current->scx.kf_tasks[0] = task; \ - __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \ + __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ __ret; \ }) -#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) \ +#define SCX_CALL_OP_2TASKS_RET(sch, mask, op, rq, task0, task1, args...) \ ({ \ - __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \ + __typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ current->scx.kf_tasks[0] = task0; \ current->scx.kf_tasks[1] = task1; \ - __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \ + __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task0, task1, ##args); \ current->scx.kf_tasks[0] = NULL; \ current->scx.kf_tasks[1] = NULL; \ __ret; \ @@ -1179,8 +1344,8 @@ do { \ static __always_inline bool scx_kf_allowed(u32 mask) { if (unlikely(!(current->scx.kf_mask & mask))) { - scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", - mask, current->scx.kf_mask); + scx_kf_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", + mask, current->scx.kf_mask); return false; } @@ -1193,13 +1358,13 @@ static __always_inline bool scx_kf_allowed(u32 mask) */ if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { - scx_ops_error("cpu_release kfunc called from a nested operation"); + scx_kf_error("cpu_release kfunc called from a nested operation"); return false; } if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { - scx_ops_error("dispatch kfunc called from a nested operation"); + scx_kf_error("dispatch kfunc called from a nested operation"); return false; } @@ -1215,18 +1380,13 @@ static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, if (unlikely((p != current->scx.kf_tasks[0] && p != current->scx.kf_tasks[1]))) { - scx_ops_error("called on a task not being operated on"); + scx_kf_error("called on a task not being operated on"); return false; } return true; } -static bool scx_kf_allowed_if_unlocked(void) -{ - return !current->scx.kf_mask; -} - /** * nldsq_next_task - Iterate to the next task in a non-local DSQ * @dsq: user dsq being iterated @@ -1394,15 +1554,15 @@ static void scx_task_iter_stop(struct scx_task_iter *iter) * @iter: iterator to walk * * Visit the next task. See scx_task_iter_start() for details. Locks are dropped - * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing - * stalls by holding scx_tasks_lock for too long. + * and re-acquired every %SCX_TASK_ITER_BATCH iterations to avoid causing stalls + * by holding scx_tasks_lock for too long. */ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) { struct list_head *cursor = &iter->cursor.tasks_node; struct sched_ext_entity *pos; - if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) { + if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) { scx_task_iter_unlock(iter); cond_resched(); scx_task_iter_relock(iter); @@ -1473,23 +1633,72 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) return p; } -static enum scx_ops_enable_state scx_ops_enable_state(void) +/** + * scx_add_event - Increase an event counter for 'name' by 'cnt' + * @sch: scx_sched to account events for + * @name: an event name defined in struct scx_event_stats + * @cnt: the number of the event occured + * + * This can be used when preemption is not disabled. + */ +#define scx_add_event(sch, name, cnt) do { \ + this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ + trace_sched_ext_event(#name, (cnt)); \ +} while(0) + +/** + * __scx_add_event - Increase an event counter for 'name' by 'cnt' + * @sch: scx_sched to account events for + * @name: an event name defined in struct scx_event_stats + * @cnt: the number of the event occured + * + * This should be used only when preemption is disabled. + */ +#define __scx_add_event(sch, name, cnt) do { \ + __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ + trace_sched_ext_event(#name, cnt); \ +} while(0) + +/** + * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e' + * @dst_e: destination event stats + * @src_e: source event stats + * @kind: a kind of event to be aggregated + */ +#define scx_agg_event(dst_e, src_e, kind) do { \ + (dst_e)->kind += READ_ONCE((src_e)->kind); \ +} while(0) + +/** + * scx_dump_event - Dump an event 'kind' in 'events' to 's' + * @s: output seq_buf + * @events: event stats + * @kind: a kind of event to dump + */ +#define scx_dump_event(s, events, kind) do { \ + dump_line(&(s), "%40s: %16lld", #kind, (events)->kind); \ +} while (0) + + +static void scx_read_events(struct scx_sched *sch, + struct scx_event_stats *events); + +static enum scx_enable_state scx_enable_state(void) { - return atomic_read(&scx_ops_enable_state_var); + return atomic_read(&scx_enable_state_var); } -static enum scx_ops_enable_state -scx_ops_set_enable_state(enum scx_ops_enable_state to) +static enum scx_enable_state scx_set_enable_state(enum scx_enable_state to) { - return atomic_xchg(&scx_ops_enable_state_var, to); + return atomic_xchg(&scx_enable_state_var, to); } -static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to, - enum scx_ops_enable_state from) +static bool scx_tryset_enable_state(enum scx_enable_state to, + enum scx_enable_state from) { int from_v = from; - return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to); + return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to); } static bool scx_rq_bypassing(struct rq *rq) @@ -1514,8 +1723,14 @@ static void wait_ops_state(struct task_struct *p, unsigned long opss) } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); } +static inline bool __cpu_valid(s32 cpu) +{ + return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu)); +} + /** - * ops_cpu_valid - Verify a cpu number + * ops_cpu_valid - Verify a cpu number, to be used on ops input args + * @sch: scx_sched to abort on error * @cpu: cpu number which came from a BPF ops * @where: extra information reported on error * @@ -1523,35 +1738,52 @@ static void wait_ops_state(struct task_struct *p, unsigned long opss) * Verify that it is in range and one of the possible cpus. If invalid, trigger * an ops error. */ -static bool ops_cpu_valid(s32 cpu, const char *where) +static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where) { - if (likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu))) { + if (__cpu_valid(cpu)) { return true; } else { - scx_ops_error("invalid CPU %d%s%s", cpu, - where ? " " : "", where ?: ""); + scx_error(sch, "invalid CPU %d%s%s", cpu, where ? " " : "", where ?: ""); + return false; + } +} + +/** + * kf_cpu_valid - Verify a CPU number, to be used on kfunc input args + * @cpu: cpu number which came from a BPF ops + * @where: extra information reported on error + * + * The same as ops_cpu_valid() but @sch is implicit. + */ +static bool kf_cpu_valid(u32 cpu, const char *where) +{ + if (__cpu_valid(cpu)) { + return true; + } else { + scx_kf_error("invalid CPU %d%s%s", cpu, where ? " " : "", where ?: ""); return false; } } /** * ops_sanitize_err - Sanitize a -errno value + * @sch: scx_sched to error out on error * @ops_name: operation to blame on failure * @err: -errno value to sanitize * - * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return + * Verify @err is a valid -errno. If not, trigger scx_error() and return * -%EPROTO. This is necessary because returning a rogue -errno up the chain can * cause misbehaviors. For an example, a large negative return from * ops.init_task() triggers an oops when passed up the call chain because the * value fails IS_ERR() test after being encoded with ERR_PTR() and then is * handled as a pointer. */ -static int ops_sanitize_err(const char *ops_name, s32 err) +static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err) { if (err < 0 && err >= -MAX_ERRNO) return err; - scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err); + scx_error(sch, "ops.%s() returned an invalid errno %d", ops_name, err); return -EPROTO; } @@ -1658,7 +1890,7 @@ static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p) lockdep_assert_rq_held(rq); #ifdef CONFIG_SCHED_CORE - if (SCX_HAS_OP(core_sched_before)) + if (unlikely(SCX_HAS_OP(scx_root, core_sched_before))) touch_core_sched(rq, p); #endif } @@ -1696,8 +1928,14 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) WRITE_ONCE(dsq->nr, dsq->nr + delta); } -static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, - u64 enq_flags) +static void refill_task_slice_dfl(struct task_struct *p) +{ + p->scx.slice = SCX_SLICE_DFL; + __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1); +} + +static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, + struct task_struct *p, u64 enq_flags) { bool is_local = dsq->id == SCX_DSQ_LOCAL; @@ -1708,7 +1946,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, if (!is_local) { raw_spin_lock(&dsq->lock); if (unlikely(dsq->id == SCX_DSQ_INVALID)) { - scx_ops_error("attempting to dispatch to a destroyed dsq"); + scx_error(sch, "attempting to dispatch to a destroyed dsq"); /* fall back to the global dsq */ raw_spin_unlock(&dsq->lock); dsq = find_global_dsq(p); @@ -1725,7 +1963,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, * disallow any internal DSQ from doing vtime ordering of * tasks. */ - scx_ops_error("cannot use vtime ordering for built-in DSQs"); + scx_error(sch, "cannot use vtime ordering for built-in DSQs"); enq_flags &= ~SCX_ENQ_DSQ_PRIQ; } @@ -1739,8 +1977,8 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, */ if (unlikely(RB_EMPTY_ROOT(&dsq->priq) && nldsq_next_task(dsq, NULL, false))) - scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks", - dsq->id); + scx_error(sch, "DSQ ID 0x%016llx already had FIFO-enqueued tasks", + dsq->id); p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less); @@ -1761,8 +1999,8 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, } else { /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ if (unlikely(!RB_EMPTY_ROOT(&dsq->priq))) - scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks", - dsq->id); + scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks", + dsq->id); if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) list_add(&p->scx.dsq_list.node, &dsq->list); @@ -1877,7 +2115,8 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p) raw_spin_unlock(&dsq->lock); } -static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, +static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, + struct rq *rq, u64 dsq_id, struct task_struct *p) { struct scx_dispatch_q *dsq; @@ -1888,7 +2127,7 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; - if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) + if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) return find_global_dsq(p); return &cpu_rq(cpu)->scx.local_dsq; @@ -1897,11 +2136,11 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, if (dsq_id == SCX_DSQ_GLOBAL) dsq = find_global_dsq(p); else - dsq = find_user_dsq(dsq_id); + dsq = find_user_dsq(sch, dsq_id); if (unlikely(!dsq)) { - scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", - dsq_id, p->comm, p->pid); + scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]", + dsq_id, p->comm, p->pid); return find_global_dsq(p); } @@ -1922,12 +2161,12 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task, /* @p must match the task on the enqueue path */ if (unlikely(p != ddsp_task)) { if (IS_ERR(ddsp_task)) - scx_ops_error("%s[%d] already direct-dispatched", - p->comm, p->pid); + scx_kf_error("%s[%d] already direct-dispatched", + p->comm, p->pid); else - scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", - ddsp_task->comm, ddsp_task->pid, - p->comm, p->pid); + scx_kf_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", + ddsp_task->comm, ddsp_task->pid, + p->comm, p->pid); return; } @@ -1938,11 +2177,12 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task, p->scx.ddsp_enq_flags = enq_flags; } -static void direct_dispatch(struct task_struct *p, u64 enq_flags) +static void direct_dispatch(struct scx_sched *sch, struct task_struct *p, + u64 enq_flags) { struct rq *rq = task_rq(p); struct scx_dispatch_q *dsq = - find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p); + find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p); touch_core_sched_dispatch(rq, p); @@ -1983,7 +2223,8 @@ static void direct_dispatch(struct task_struct *p, u64 enq_flags) return; } - dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); + dispatch_enqueue(sch, dsq, p, + p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); } static bool scx_rq_online(struct rq *rq) @@ -2001,6 +2242,7 @@ static bool scx_rq_online(struct rq *rq) static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, int sticky_cpu) { + struct scx_sched *sch = scx_root; struct task_struct **ddsp_taskp; unsigned long qseq; @@ -2018,23 +2260,29 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (!scx_rq_online(rq)) goto local; - if (scx_rq_bypassing(rq)) + if (scx_rq_bypassing(rq)) { + __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); goto global; + } if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) goto direct; /* see %SCX_OPS_ENQ_EXITING */ - if (!static_branch_unlikely(&scx_ops_enq_exiting) && - unlikely(p->flags & PF_EXITING)) + if (!(sch->ops.flags & SCX_OPS_ENQ_EXITING) && + unlikely(p->flags & PF_EXITING)) { + __scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1); goto local; + } /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */ - if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) && - is_migration_disabled(p)) + if (!(sch->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) && + is_migration_disabled(p)) { + __scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1); goto local; + } - if (!SCX_HAS_OP(enqueue)) + if (unlikely(!SCX_HAS_OP(sch, enqueue))) goto global; /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ @@ -2047,7 +2295,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, WARN_ON_ONCE(*ddsp_taskp); *ddsp_taskp = p; - SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags); + SCX_CALL_OP_TASK(sch, SCX_KF_ENQUEUE, enqueue, rq, p, enq_flags); *ddsp_taskp = NULL; if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) @@ -2061,7 +2309,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, return; direct: - direct_dispatch(p, enq_flags); + direct_dispatch(sch, p, enq_flags); return; local: @@ -2071,15 +2319,15 @@ local: * higher priority it becomes from scx_prio_less()'s POV. */ touch_core_sched(rq, p); - p->scx.slice = SCX_SLICE_DFL; + refill_task_slice_dfl(p); local_norefill: - dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags); + dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); return; global: touch_core_sched(rq, p); /* see the comment in local: */ - p->scx.slice = SCX_SLICE_DFL; - dispatch_enqueue(find_global_dsq(p), p, enq_flags); + refill_task_slice_dfl(p); + dispatch_enqueue(sch, find_global_dsq(p), p, enq_flags); } static bool task_runnable(const struct task_struct *p) @@ -2097,7 +2345,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p) } /* - * list_add_tail() must be used. scx_ops_bypass() depends on tasks being + * list_add_tail() must be used. scx_bypass() depends on tasks being * appended to the runnable_list. */ list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); @@ -2112,6 +2360,7 @@ static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) { + struct scx_sched *sch = scx_root; int sticky_cpu = p->scx.sticky_cpu; if (enq_flags & ENQUEUE_WAKEUP) @@ -2141,8 +2390,8 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags rq->scx.nr_running++; add_nr_running(rq, 1); - if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p)) - SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags); + if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, runnable, rq, p, enq_flags); if (enq_flags & SCX_ENQ_WAKEUP) touch_core_sched(rq, p); @@ -2150,10 +2399,15 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags do_enqueue_task(rq, p, enq_flags, sticky_cpu); out: rq->scx.flags &= ~SCX_RQ_IN_WAKEUP; + + if ((enq_flags & SCX_ENQ_CPU_SELECTED) && + unlikely(cpu_of(rq) != p->scx.selected_cpu)) + __scx_add_event(sch, SCX_EV_SELECT_CPU_FALLBACK, 1); } -static void ops_dequeue(struct task_struct *p, u64 deq_flags) +static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) { + struct scx_sched *sch = scx_root; unsigned long opss; /* dequeue is always temporary, don't reset runnable_at */ @@ -2172,8 +2426,9 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags) */ BUG(); case SCX_OPSS_QUEUED: - if (SCX_HAS_OP(dequeue)) - SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags); + if (SCX_HAS_OP(sch, dequeue)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq, + p, deq_flags); if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, SCX_OPSS_NONE)) @@ -2201,12 +2456,14 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags) static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) { + struct scx_sched *sch = scx_root; + if (!(p->scx.flags & SCX_TASK_QUEUED)) { WARN_ON_ONCE(task_runnable(p)); return true; } - ops_dequeue(p, deq_flags); + ops_dequeue(rq, p, deq_flags); /* * A currently running task which is going off @rq first gets dequeued @@ -2220,13 +2477,13 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags * information meaningful to the BPF scheduler and can be suppressed by * skipping the callbacks if the task is !QUEUED. */ - if (SCX_HAS_OP(stopping) && task_current(rq, p)) { + if (SCX_HAS_OP(sch, stopping) && task_current(rq, p)) { update_curr_scx(rq); - SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false); + SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, false); } - if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p)) - SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags); + if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, quiescent, rq, p, deq_flags); if (deq_flags & SCX_DEQ_SLEEP) p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; @@ -2243,20 +2500,23 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags static void yield_task_scx(struct rq *rq) { + struct scx_sched *sch = scx_root; struct task_struct *p = rq->curr; - if (SCX_HAS_OP(yield)) - SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL); + if (SCX_HAS_OP(sch, yield)) + SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL); else p->scx.slice = 0; } static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) { + struct scx_sched *sch = scx_root; struct task_struct *from = rq->curr; - if (SCX_HAS_OP(yield)) - return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to); + if (SCX_HAS_OP(sch, yield)) + return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, + from, to); else return false; } @@ -2336,12 +2596,13 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, * * The caller must ensure that @p and @rq are on different CPUs. */ -static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, - bool trigger_error) +static bool task_can_run_on_remote_rq(struct scx_sched *sch, + struct task_struct *p, struct rq *rq, + bool enforce) { int cpu = cpu_of(rq); - SCHED_WARN_ON(task_cpu(p) == cpu); + WARN_ON_ONCE(task_cpu(p) == cpu); /* * If @p has migration disabled, @p->cpus_ptr is updated to contain only @@ -2356,9 +2617,9 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, * easily be masked if task_allowed_on_cpu() is done first. */ if (unlikely(is_migration_disabled(p))) { - if (trigger_error) - scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d", - p->comm, p->pid, task_cpu(p), cpu); + if (enforce) + scx_error(sch, "SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d", + p->comm, p->pid, task_cpu(p), cpu); return false; } @@ -2369,14 +2630,18 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, * picked CPU is outside the allowed mask. */ if (!task_allowed_on_cpu(p, cpu)) { - if (trigger_error) - scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]", - cpu, p->comm, p->pid); + if (enforce) + scx_error(sch, "SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]", + cpu, p->comm, p->pid); return false; } - if (!scx_rq_online(rq)) + if (!scx_rq_online(rq)) { + if (enforce) + __scx_add_event(scx_root, + SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); return false; + } return true; } @@ -2446,12 +2711,13 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p, } #else /* CONFIG_SMP */ static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); } -static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; } +static inline bool task_can_run_on_remote_rq(struct scx_sched *sch, struct task_struct *p, struct rq *rq, bool enforce) { return false; } static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } #endif /* CONFIG_SMP */ /** * move_task_between_dsqs() - Move a task from one DSQ to another + * @sch: scx_sched being operated on * @p: target task * @enq_flags: %SCX_ENQ_* * @src_dsq: DSQ @p is currently on, must not be a local DSQ @@ -2465,7 +2731,8 @@ static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p * On return, @src_dsq is unlocked and only @p's new task_rq, which is the * return value, is locked. */ -static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, +static struct rq *move_task_between_dsqs(struct scx_sched *sch, + struct task_struct *p, u64 enq_flags, struct scx_dispatch_q *src_dsq, struct scx_dispatch_q *dst_dsq) { @@ -2478,7 +2745,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, if (dst_dsq->id == SCX_DSQ_LOCAL) { dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); if (src_rq != dst_rq && - unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { + unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { dst_dsq = find_global_dsq(p); dst_rq = src_rq; } @@ -2512,7 +2779,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, p->scx.dsq = NULL; raw_spin_unlock(&src_dsq->lock); - dispatch_enqueue(dst_dsq, p, enq_flags); + dispatch_enqueue(sch, dst_dsq, p, enq_flags); } return dst_rq; @@ -2524,13 +2791,13 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, * to the bypass mode can take a long time. Inject artificial delays while the * bypass mode is switching to guarantee timely completion. */ -static void scx_ops_breather(struct rq *rq) +static void scx_breather(struct rq *rq) { u64 until; lockdep_assert_rq_held(rq); - if (likely(!atomic_read(&scx_ops_breather_depth))) + if (likely(!atomic_read(&scx_breather_depth))) return; raw_spin_rq_unlock(rq); @@ -2539,25 +2806,26 @@ static void scx_ops_breather(struct rq *rq) do { int cnt = 1024; - while (atomic_read(&scx_ops_breather_depth) && --cnt) + while (atomic_read(&scx_breather_depth) && --cnt) cpu_relax(); - } while (atomic_read(&scx_ops_breather_depth) && + } while (atomic_read(&scx_breather_depth) && time_before64(ktime_get_ns(), until)); raw_spin_rq_lock(rq); } -static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq) +static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq, + struct scx_dispatch_q *dsq) { struct task_struct *p; retry: /* - * This retry loop can repeatedly race against scx_ops_bypass() - * dequeueing tasks from @dsq trying to put the system into the bypass - * mode. On some multi-socket machines (e.g. 2x Intel 8480c), this can - * live-lock the machine into soft lockups. Give a breather. + * This retry loop can repeatedly race against scx_bypass() dequeueing + * tasks from @dsq trying to put the system into the bypass mode. On + * some multi-socket machines (e.g. 2x Intel 8480c), this can live-lock + * the machine into soft lockups. Give a breather. */ - scx_ops_breather(rq); + scx_breather(rq); /* * The caller can't expect to successfully consume a task if the task's @@ -2579,7 +2847,7 @@ retry: return true; } - if (task_can_run_on_remote_rq(p, rq, false)) { + if (task_can_run_on_remote_rq(sch, p, rq, false)) { if (likely(consume_remote_task(rq, p, dsq, task_rq))) return true; goto retry; @@ -2590,15 +2858,16 @@ retry: return false; } -static bool consume_global_dsq(struct rq *rq) +static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq) { int node = cpu_to_node(cpu_of(rq)); - return consume_dispatch_q(rq, global_dsqs[node]); + return consume_dispatch_q(sch, rq, sch->global_dsqs[node]); } /** * dispatch_to_local_dsq - Dispatch a task to a local dsq + * @sch: scx_sched being operated on * @rq: current rq which is locked * @dst_dsq: destination DSQ * @p: task to dispatch @@ -2611,7 +2880,8 @@ static bool consume_global_dsq(struct rq *rq) * The caller must have exclusive ownership of @p (e.g. through * %SCX_OPSS_DISPATCHING). */ -static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, +static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, + struct scx_dispatch_q *dst_dsq, struct task_struct *p, u64 enq_flags) { struct rq *src_rq = task_rq(p); @@ -2627,14 +2897,15 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, * If dispatching to @rq that @p is already on, no lock dancing needed. */ if (rq == src_rq && rq == dst_rq) { - dispatch_enqueue(dst_dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); + dispatch_enqueue(sch, dst_dsq, p, + enq_flags | SCX_ENQ_CLEAR_OPSS); return; } #ifdef CONFIG_SMP if (src_rq != dst_rq && - unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { - dispatch_enqueue(find_global_dsq(p), p, + unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { + dispatch_enqueue(sch, find_global_dsq(p), p, enq_flags | SCX_ENQ_CLEAR_OPSS); return; } @@ -2672,7 +2943,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, */ if (src_rq == dst_rq) { p->scx.holding_cpu = -1; - dispatch_enqueue(&dst_rq->scx.local_dsq, p, enq_flags); + dispatch_enqueue(sch, &dst_rq->scx.local_dsq, p, + enq_flags); } else { move_remote_task_to_local_dsq(p, enq_flags, src_rq, dst_rq); @@ -2714,7 +2986,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, * was valid in the first place. Make sure that the task is still owned by the * BPF scheduler and claim the ownership before dispatching. */ -static void finish_dispatch(struct rq *rq, struct task_struct *p, +static void finish_dispatch(struct scx_sched *sch, struct rq *rq, + struct task_struct *p, unsigned long qseq_at_dispatch, u64 dsq_id, u64 enq_flags) { @@ -2767,15 +3040,15 @@ retry: BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); - dsq = find_dsq_for_dispatch(this_rq(), dsq_id, p); + dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, p); if (dsq->id == SCX_DSQ_LOCAL) - dispatch_to_local_dsq(rq, dsq, p, enq_flags); + dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags); else - dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); + dispatch_enqueue(sch, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); } -static void flush_dispatch_buf(struct rq *rq) +static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); u32 u; @@ -2783,7 +3056,7 @@ static void flush_dispatch_buf(struct rq *rq) for (u = 0; u < dspc->cursor; u++) { struct scx_dsp_buf_ent *ent = &dspc->buf[u]; - finish_dispatch(rq, ent->task, ent->qseq, ent->dsq_id, + finish_dispatch(sch, rq, ent->task, ent->qseq, ent->dsq_id, ent->enq_flags); } @@ -2793,6 +3066,7 @@ static void flush_dispatch_buf(struct rq *rq) static int balance_one(struct rq *rq, struct task_struct *prev) { + struct scx_sched *sch = scx_root; struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); bool prev_on_scx = prev->sched_class == &ext_sched_class; bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED; @@ -2802,7 +3076,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) rq->scx.flags |= SCX_RQ_IN_BALANCE; rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP); - if (static_branch_unlikely(&scx_ops_cpu_preempt) && + if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) && unlikely(rq->scx.cpu_released)) { /* * If the previous sched_class for the current CPU was not SCX, @@ -2810,8 +3084,9 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * core. This callback complements ->cpu_release(), which is * emitted in switch_class(). */ - if (SCX_HAS_OP(cpu_acquire)) - SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL); + if (SCX_HAS_OP(sch, cpu_acquire)) + SCX_CALL_OP(sch, SCX_KF_REST, cpu_acquire, rq, + cpu_of(rq), NULL); rq->scx.cpu_released = false; } @@ -2825,8 +3100,8 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * scheduler wants to handle this explicitly, it should * implement ->cpu_release(). * - * See scx_ops_disable_workfn() for the explanation on the - * bypassing test. + * See scx_disable_workfn() for the explanation on the bypassing + * test. */ if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) { rq->scx.flags |= SCX_RQ_BAL_KEEP; @@ -2838,10 +3113,11 @@ static int balance_one(struct rq *rq, struct task_struct *prev) if (rq->scx.local_dsq.nr) goto has_tasks; - if (consume_global_dsq(rq)) + if (consume_global_dsq(sch, rq)) goto has_tasks; - if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq)) + if (unlikely(!SCX_HAS_OP(sch, dispatch)) || + scx_rq_bypassing(rq) || !scx_rq_online(rq)) goto no_tasks; dspc->rq = rq; @@ -2856,10 +3132,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev) do { dspc->nr_tasks = 0; - SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq), - prev_on_scx ? prev : NULL); + SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, + cpu_of(rq), prev_on_scx ? prev : NULL); - flush_dispatch_buf(rq); + flush_dispatch_buf(sch, rq); if (prev_on_rq && prev->scx.slice) { rq->scx.flags |= SCX_RQ_BAL_KEEP; @@ -2867,7 +3143,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) } if (rq->scx.local_dsq.nr) goto has_tasks; - if (consume_global_dsq(rq)) + if (consume_global_dsq(sch, rq)) goto has_tasks; /* @@ -2890,9 +3166,10 @@ no_tasks: * Didn't find another task to run. Keep running @prev unless * %SCX_OPS_ENQ_LAST is in effect. */ - if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) || - scx_rq_bypassing(rq))) { + if (prev_on_rq && + (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) { rq->scx.flags |= SCX_RQ_BAL_KEEP; + __scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1); goto has_tasks; } rq->scx.flags &= ~SCX_RQ_IN_BALANCE; @@ -2952,32 +3229,36 @@ static void process_ddsp_deferred_locals(struct rq *rq) */ while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals, struct task_struct, scx.dsq_list.node))) { + struct scx_sched *sch = scx_root; struct scx_dispatch_q *dsq; list_del_init(&p->scx.dsq_list.node); - dsq = find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p); + dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p); if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) - dispatch_to_local_dsq(rq, dsq, p, p->scx.ddsp_enq_flags); + dispatch_to_local_dsq(sch, rq, dsq, p, + p->scx.ddsp_enq_flags); } } static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) { + struct scx_sched *sch = scx_root; + if (p->scx.flags & SCX_TASK_QUEUED) { /* * Core-sched might decide to execute @p before it is * dispatched. Call ops_dequeue() to notify the BPF scheduler. */ - ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC); + ops_dequeue(rq, p, SCX_DEQ_CORE_SCHED_EXEC); dispatch_dequeue(rq, p); } p->se.exec_start = rq_clock_task(rq); /* see dequeue_task_scx() on why we skip when !QUEUED */ - if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED)) - SCX_CALL_OP_TASK(SCX_KF_REST, running, p); + if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, running, rq, p); clr_task_runnable(p, true); @@ -3020,6 +3301,7 @@ preempt_reason_from_class(const struct sched_class *class) static void switch_class(struct rq *rq, struct task_struct *next) { + struct scx_sched *sch = scx_root; const struct sched_class *next_class = next->sched_class; #ifdef CONFIG_SMP @@ -3030,7 +3312,7 @@ static void switch_class(struct rq *rq, struct task_struct *next) */ smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); #endif - if (!static_branch_unlikely(&scx_ops_cpu_preempt)) + if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT)) return; /* @@ -3052,14 +3334,14 @@ static void switch_class(struct rq *rq, struct task_struct *next) * next time that balance_scx() is invoked. */ if (!rq->scx.cpu_released) { - if (SCX_HAS_OP(cpu_release)) { + if (SCX_HAS_OP(sch, cpu_release)) { struct scx_cpu_release_args args = { .reason = preempt_reason_from_class(next_class), .task = next, }; - SCX_CALL_OP(SCX_KF_CPU_RELEASE, - cpu_release, cpu_of(rq), &args); + SCX_CALL_OP(sch, SCX_KF_CPU_RELEASE, cpu_release, rq, + cpu_of(rq), &args); } rq->scx.cpu_released = true; } @@ -3068,11 +3350,12 @@ static void switch_class(struct rq *rq, struct task_struct *next) static void put_prev_task_scx(struct rq *rq, struct task_struct *p, struct task_struct *next) { + struct scx_sched *sch = scx_root; update_curr_scx(rq); /* see dequeue_task_scx() on why we skip when !QUEUED */ - if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) - SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true); + if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, true); if (p->scx.flags & SCX_TASK_QUEUED) { set_task_runnable(rq, p); @@ -3084,7 +3367,8 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, * DSQ. */ if (p->scx.slice && !scx_rq_bypassing(rq)) { - dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); + dispatch_enqueue(sch, &rq->scx.local_dsq, p, + SCX_ENQ_HEAD); goto switch_class; } @@ -3095,7 +3379,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, * which should trigger an explicit follow-up scheduling event. */ if (sched_class_above(&ext_sched_class, next->sched_class)) { - WARN_ON_ONCE(!static_branch_unlikely(&scx_ops_enq_last)); + WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST)); do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); } else { do_enqueue_task(rq, p, 0, -1); @@ -3117,7 +3401,6 @@ static struct task_struct *pick_task_scx(struct rq *rq) { struct task_struct *prev = rq->curr; struct task_struct *p; - bool prev_on_scx = prev->sched_class == &ext_sched_class; bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; bool kick_idle = false; @@ -3137,15 +3420,19 @@ static struct task_struct *pick_task_scx(struct rq *rq) * if pick_task_scx() is called without preceding balance_scx(). */ if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) { - if (prev_on_scx) { + if (prev->scx.flags & SCX_TASK_QUEUED) { keep_prev = true; } else { keep_prev = false; kick_idle = true; } - } else if (unlikely(keep_prev && !prev_on_scx)) { - /* only allowed during transitions */ - WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED); + } else if (unlikely(keep_prev && + prev->sched_class != &ext_sched_class)) { + /* + * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is + * conditional on scx_enabled() and may have been skipped. + */ + WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED); keep_prev = false; } @@ -3157,7 +3444,7 @@ static struct task_struct *pick_task_scx(struct rq *rq) if (keep_prev) { p = prev; if (!p->scx.slice) - p->scx.slice = SCX_SLICE_DFL; + refill_task_slice_dfl(p); } else { p = first_local_task(rq); if (!p) { @@ -3167,12 +3454,14 @@ static struct task_struct *pick_task_scx(struct rq *rq) } if (unlikely(!p->scx.slice)) { - if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) { + struct scx_sched *sch = scx_root; + + if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) { printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", p->comm, p->pid, __func__); - scx_warned_zero_slice = true; + sch->warned_zero_slice = true; } - p->scx.slice = SCX_SLICE_DFL; + refill_task_slice_dfl(p); } } @@ -3201,13 +3490,17 @@ static struct task_struct *pick_task_scx(struct rq *rq) bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, bool in_fi) { + struct scx_sched *sch = scx_root; + /* * The const qualifiers are dropped from task_struct pointers when * calling ops.core_sched_before(). Accesses are controlled by the * verifier. */ - if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a))) - return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, + if (SCX_HAS_OP(sch, core_sched_before) && + !scx_rq_bypassing(task_rq(a))) + return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, core_sched_before, + NULL, (struct task_struct *)a, (struct task_struct *)b); else @@ -3217,418 +3510,11 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, #ifdef CONFIG_SMP -static bool test_and_clear_cpu_idle(int cpu) -{ -#ifdef CONFIG_SCHED_SMT - /* - * SMT mask should be cleared whether we can claim @cpu or not. The SMT - * cluster is not wholly idle either way. This also prevents - * scx_pick_idle_cpu() from getting caught in an infinite loop. - */ - if (sched_smt_active()) { - const struct cpumask *smt = cpu_smt_mask(cpu); - - /* - * If offline, @cpu is not its own sibling and - * scx_pick_idle_cpu() can get caught in an infinite loop as - * @cpu is never cleared from idle_masks.smt. Ensure that @cpu - * is eventually cleared. - * - * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to - * reduce memory writes, which may help alleviate cache - * coherence pressure. - */ - if (cpumask_intersects(smt, idle_masks.smt)) - cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); - else if (cpumask_test_cpu(cpu, idle_masks.smt)) - __cpumask_clear_cpu(cpu, idle_masks.smt); - } -#endif - return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); -} - -static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) -{ - int cpu; - -retry: - if (sched_smt_active()) { - cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); - if (cpu < nr_cpu_ids) - goto found; - - if (flags & SCX_PICK_IDLE_CORE) - return -EBUSY; - } - - cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); - if (cpu >= nr_cpu_ids) - return -EBUSY; - -found: - if (test_and_clear_cpu_idle(cpu)) - return cpu; - else - goto retry; -} - -/* - * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC - * domain is not defined). - */ -static unsigned int llc_weight(s32 cpu) -{ - struct sched_domain *sd; - - sd = rcu_dereference(per_cpu(sd_llc, cpu)); - if (!sd) - return 0; - - return sd->span_weight; -} - -/* - * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC - * domain is not defined). - */ -static struct cpumask *llc_span(s32 cpu) -{ - struct sched_domain *sd; - - sd = rcu_dereference(per_cpu(sd_llc, cpu)); - if (!sd) - return 0; - - return sched_domain_span(sd); -} - -/* - * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the - * NUMA domain is not defined). - */ -static unsigned int numa_weight(s32 cpu) -{ - struct sched_domain *sd; - struct sched_group *sg; - - sd = rcu_dereference(per_cpu(sd_numa, cpu)); - if (!sd) - return 0; - sg = sd->groups; - if (!sg) - return 0; - - return sg->group_weight; -} - -/* - * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA - * domain is not defined). - */ -static struct cpumask *numa_span(s32 cpu) -{ - struct sched_domain *sd; - struct sched_group *sg; - - sd = rcu_dereference(per_cpu(sd_numa, cpu)); - if (!sd) - return NULL; - sg = sd->groups; - if (!sg) - return NULL; - - return sched_group_span(sg); -} - -/* - * Return true if the LLC domains do not perfectly overlap with the NUMA - * domains, false otherwise. - */ -static bool llc_numa_mismatch(void) -{ - int cpu; - - /* - * We need to scan all online CPUs to verify whether their scheduling - * domains overlap. - * - * While it is rare to encounter architectures with asymmetric NUMA - * topologies, CPU hotplugging or virtualized environments can result - * in asymmetric configurations. - * - * For example: - * - * NUMA 0: - * - LLC 0: cpu0..cpu7 - * - LLC 1: cpu8..cpu15 [offline] - * - * NUMA 1: - * - LLC 0: cpu16..cpu23 - * - LLC 1: cpu24..cpu31 - * - * In this case, if we only check the first online CPU (cpu0), we might - * incorrectly assume that the LLC and NUMA domains are fully - * overlapping, which is incorrect (as NUMA 1 has two distinct LLC - * domains). - */ - for_each_online_cpu(cpu) - if (llc_weight(cpu) != numa_weight(cpu)) - return true; - - return false; -} - -/* - * Initialize topology-aware scheduling. - * - * Detect if the system has multiple LLC or multiple NUMA domains and enable - * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle - * selection policy. - * - * Assumption: the kernel's internal topology representation assumes that each - * CPU belongs to a single LLC domain, and that each LLC domain is entirely - * contained within a single NUMA node. - */ -static void update_selcpu_topology(void) -{ - bool enable_llc = false, enable_numa = false; - unsigned int nr_cpus; - s32 cpu = cpumask_first(cpu_online_mask); - - /* - * Enable LLC domain optimization only when there are multiple LLC - * domains among the online CPUs. If all online CPUs are part of a - * single LLC domain, the idle CPU selection logic can choose any - * online CPU without bias. - * - * Note that it is sufficient to check the LLC domain of the first - * online CPU to determine whether a single LLC domain includes all - * CPUs. - */ - rcu_read_lock(); - nr_cpus = llc_weight(cpu); - if (nr_cpus > 0) { - if (nr_cpus < num_online_cpus()) - enable_llc = true; - pr_debug("sched_ext: LLC=%*pb weight=%u\n", - cpumask_pr_args(llc_span(cpu)), llc_weight(cpu)); - } - - /* - * Enable NUMA optimization only when there are multiple NUMA domains - * among the online CPUs and the NUMA domains don't perfectly overlaps - * with the LLC domains. - * - * If all CPUs belong to the same NUMA node and the same LLC domain, - * enabling both NUMA and LLC optimizations is unnecessary, as checking - * for an idle CPU in the same domain twice is redundant. - */ - nr_cpus = numa_weight(cpu); - if (nr_cpus > 0) { - if (nr_cpus < num_online_cpus() && llc_numa_mismatch()) - enable_numa = true; - pr_debug("sched_ext: NUMA=%*pb weight=%u\n", - cpumask_pr_args(numa_span(cpu)), numa_weight(cpu)); - } - rcu_read_unlock(); - - pr_debug("sched_ext: LLC idle selection %s\n", - str_enabled_disabled(enable_llc)); - pr_debug("sched_ext: NUMA idle selection %s\n", - str_enabled_disabled(enable_numa)); - - if (enable_llc) - static_branch_enable_cpuslocked(&scx_selcpu_topo_llc); - else - static_branch_disable_cpuslocked(&scx_selcpu_topo_llc); - if (enable_numa) - static_branch_enable_cpuslocked(&scx_selcpu_topo_numa); - else - static_branch_disable_cpuslocked(&scx_selcpu_topo_numa); -} - -/* - * Built-in CPU idle selection policy: - * - * 1. Prioritize full-idle cores: - * - always prioritize CPUs from fully idle cores (both logical CPUs are - * idle) to avoid interference caused by SMT. - * - * 2. Reuse the same CPU: - * - prefer the last used CPU to take advantage of cached data (L1, L2) and - * branch prediction optimizations. - * - * 3. Pick a CPU within the same LLC (Last-Level Cache): - * - if the above conditions aren't met, pick a CPU that shares the same LLC - * to maintain cache locality. - * - * 4. Pick a CPU within the same NUMA node, if enabled: - * - choose a CPU from the same NUMA node to reduce memory access latency. - * - * 5. Pick any idle CPU usable by the task. - * - * Step 3 and 4 are performed only if the system has, respectively, multiple - * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and - * scx_selcpu_topo_numa). - * - * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because - * we never call ops.select_cpu() for them, see select_task_rq(). - */ -static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, - u64 wake_flags, bool *found) -{ - const struct cpumask *llc_cpus = NULL; - const struct cpumask *numa_cpus = NULL; - s32 cpu; - - *found = false; - - /* - * This is necessary to protect llc_cpus. - */ - rcu_read_lock(); - - /* - * Determine the scheduling domain only if the task is allowed to run - * on all CPUs. - * - * This is done primarily for efficiency, as it avoids the overhead of - * updating a cpumask every time we need to select an idle CPU (which - * can be costly in large SMP systems), but it also aligns logically: - * if a task's scheduling domain is restricted by user-space (through - * CPU affinity), the task will simply use the flat scheduling domain - * defined by user-space. - */ - if (p->nr_cpus_allowed >= num_possible_cpus()) { - if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) - numa_cpus = numa_span(prev_cpu); - - if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) - llc_cpus = llc_span(prev_cpu); - } - - /* - * If WAKE_SYNC, try to migrate the wakee to the waker's CPU. - */ - if (wake_flags & SCX_WAKE_SYNC) { - cpu = smp_processor_id(); - - /* - * If the waker's CPU is cache affine and prev_cpu is idle, - * then avoid a migration. - */ - if (cpus_share_cache(cpu, prev_cpu) && - test_and_clear_cpu_idle(prev_cpu)) { - cpu = prev_cpu; - goto cpu_found; - } - - /* - * If the waker's local DSQ is empty, and the system is under - * utilized, try to wake up @p to the local DSQ of the waker. - * - * Checking only for an empty local DSQ is insufficient as it - * could give the wakee an unfair advantage when the system is - * oversaturated. - * - * Checking only for the presence of idle CPUs is also - * insufficient as the local DSQ of the waker could have tasks - * piled up on it even if there is an idle core elsewhere on - * the system. - */ - if (!cpumask_empty(idle_masks.cpu) && - !(current->flags & PF_EXITING) && - cpu_rq(cpu)->scx.local_dsq.nr == 0) { - if (cpumask_test_cpu(cpu, p->cpus_ptr)) - goto cpu_found; - } - } - - /* - * If CPU has SMT, any wholly idle CPU is likely a better pick than - * partially idle @prev_cpu. - */ - if (sched_smt_active()) { - /* - * Keep using @prev_cpu if it's part of a fully idle core. - */ - if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && - test_and_clear_cpu_idle(prev_cpu)) { - cpu = prev_cpu; - goto cpu_found; - } - - /* - * Search for any fully idle core in the same LLC domain. - */ - if (llc_cpus) { - cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any fully idle core in the same NUMA node. - */ - if (numa_cpus) { - cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any full idle core usable by the task. - */ - cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Use @prev_cpu if it's idle. - */ - if (test_and_clear_cpu_idle(prev_cpu)) { - cpu = prev_cpu; - goto cpu_found; - } - - /* - * Search for any idle CPU in the same LLC domain. - */ - if (llc_cpus) { - cpu = scx_pick_idle_cpu(llc_cpus, 0); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any idle CPU in the same NUMA node. - */ - if (numa_cpus) { - cpu = scx_pick_idle_cpu(numa_cpus, 0); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any idle CPU usable by the task. - */ - cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); - if (cpu >= 0) - goto cpu_found; - - rcu_read_unlock(); - return prev_cpu; - -cpu_found: - rcu_read_unlock(); - - *found = true; - return cpu; -} - static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) { + struct scx_sched *sch = scx_root; + bool rq_bypass; + /* * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it * can be a good migration opportunity with low cache and memory @@ -3642,7 +3528,8 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag if (unlikely(wake_flags & WF_EXEC)) return prev_cpu; - if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) { + rq_bypass = scx_rq_bypassing(task_rq(p)); + if (likely(SCX_HAS_OP(sch, select_cpu)) && !rq_bypass) { s32 cpu; struct task_struct **ddsp_taskp; @@ -3650,22 +3537,30 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag WARN_ON_ONCE(*ddsp_taskp); *ddsp_taskp = p; - cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, - select_cpu, p, prev_cpu, wake_flags); + cpu = SCX_CALL_OP_TASK_RET(sch, + SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, + select_cpu, NULL, p, prev_cpu, + wake_flags); + p->scx.selected_cpu = cpu; *ddsp_taskp = NULL; - if (ops_cpu_valid(cpu, "from ops.select_cpu()")) + if (ops_cpu_valid(sch, cpu, "from ops.select_cpu()")) return cpu; else return prev_cpu; } else { - bool found; s32 cpu; - cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found); - if (found) { - p->scx.slice = SCX_SLICE_DFL; + cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0); + if (cpu >= 0) { + refill_task_slice_dfl(p); p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; + } else { + cpu = prev_cpu; } + p->scx.selected_cpu = cpu; + + if (rq_bypass) + __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); return cpu; } } @@ -3678,6 +3573,8 @@ static void task_woken_scx(struct rq *rq, struct task_struct *p) static void set_cpus_allowed_scx(struct task_struct *p, struct affinity_context *ac) { + struct scx_sched *sch = scx_root; + set_cpus_allowed_common(p, ac); /* @@ -3688,112 +3585,38 @@ static void set_cpus_allowed_scx(struct task_struct *p, * Fine-grained memory write control is enforced by BPF making the const * designation pointless. Cast it away when calling the operation. */ - if (SCX_HAS_OP(set_cpumask)) - SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, - (struct cpumask *)p->cpus_ptr); -} - -static void reset_idle_masks(void) -{ - /* - * Consider all online cpus idle. Should converge to the actual state - * quickly. - */ - cpumask_copy(idle_masks.cpu, cpu_online_mask); - cpumask_copy(idle_masks.smt, cpu_online_mask); + if (SCX_HAS_OP(sch, set_cpumask)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, NULL, + p, (struct cpumask *)p->cpus_ptr); } -static void update_builtin_idle(int cpu, bool idle) -{ - assign_cpu(cpu, idle_masks.cpu, idle); - -#ifdef CONFIG_SCHED_SMT - if (sched_smt_active()) { - const struct cpumask *smt = cpu_smt_mask(cpu); - - if (idle) { - /* - * idle_masks.smt handling is racy but that's fine as - * it's only for optimization and self-correcting. - */ - if (!cpumask_subset(smt, idle_masks.cpu)) - return; - cpumask_or(idle_masks.smt, idle_masks.smt, smt); - } else { - cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); - } - } -#endif -} - -/* - * Update the idle state of a CPU to @idle. - * - * If @do_notify is true, ops.update_idle() is invoked to notify the scx - * scheduler of an actual idle state transition (idle to busy or vice - * versa). If @do_notify is false, only the idle state in the idle masks is - * refreshed without invoking ops.update_idle(). - * - * This distinction is necessary, because an idle CPU can be "reserved" and - * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as - * busy even if no tasks are dispatched. In this case, the CPU may return - * to idle without a true state transition. Refreshing the idle masks - * without invoking ops.update_idle() ensures accurate idle state tracking - * while avoiding unnecessary updates and maintaining balanced state - * transitions. - */ -void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) +static void handle_hotplug(struct rq *rq, bool online) { + struct scx_sched *sch = scx_root; int cpu = cpu_of(rq); - lockdep_assert_rq_held(rq); - - /* - * Trigger ops.update_idle() only when transitioning from a task to - * the idle thread and vice versa. - * - * Idle transitions are indicated by do_notify being set to true, - * managed by put_prev_task_idle()/set_next_task_idle(). - */ - if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) - SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); + atomic_long_inc(&scx_hotplug_seq); /* - * Update the idle masks: - * - for real idle transitions (do_notify == true) - * - for idle-to-idle transitions (indicated by the previous task - * being the idle thread, managed by pick_task_idle()) - * - * Skip updating idle masks if the previous task is not the idle - * thread, since set_next_task_idle() has already handled it when - * transitioning from a task to the idle thread (calling this - * function with do_notify == true). - * - * In this way we can avoid updating the idle masks twice, - * unnecessarily. + * scx_root updates are protected by cpus_read_lock() and will stay + * stable here. Note that we can't depend on scx_enabled() test as the + * hotplug ops need to be enabled before __scx_enabled is set. */ - if (static_branch_likely(&scx_builtin_idle_enabled)) - if (do_notify || is_idle_task(rq->curr)) - update_builtin_idle(cpu, idle); -} - -static void handle_hotplug(struct rq *rq, bool online) -{ - int cpu = cpu_of(rq); - - atomic_long_inc(&scx_hotplug_seq); + if (unlikely(!sch)) + return; if (scx_enabled()) - update_selcpu_topology(); + scx_idle_update_selcpu_topology(&sch->ops); - if (online && SCX_HAS_OP(cpu_online)) - SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu); - else if (!online && SCX_HAS_OP(cpu_offline)) - SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, cpu); + if (online && SCX_HAS_OP(sch, cpu_online)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_online, NULL, cpu); + else if (!online && SCX_HAS_OP(sch, cpu_offline)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_offline, NULL, cpu); else - scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, - "cpu %d going %s, exiting scheduler", cpu, - online ? "online" : "offline"); + scx_exit(sch, SCX_EXIT_UNREG_KERN, + SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "cpu %d going %s, exiting scheduler", cpu, + online ? "online" : "offline"); } void scx_rq_activate(struct rq *rq) @@ -3816,21 +3639,20 @@ static void rq_offline_scx(struct rq *rq) rq->scx.flags &= ~SCX_RQ_ONLINE; } -#else /* CONFIG_SMP */ - -static bool test_and_clear_cpu_idle(int cpu) { return false; } -static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } -static void reset_idle_masks(void) {} - #endif /* CONFIG_SMP */ static bool check_rq_for_timeouts(struct rq *rq) { + struct scx_sched *sch; struct task_struct *p; struct rq_flags rf; bool timed_out = false; rq_lock_irqsave(rq, &rf); + sch = rcu_dereference_bh(scx_root); + if (unlikely(!sch)) + goto out_unlock; + list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { unsigned long last_runnable = p->scx.runnable_at; @@ -3838,16 +3660,15 @@ static bool check_rq_for_timeouts(struct rq *rq) last_runnable + scx_watchdog_timeout))) { u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); - scx_ops_error_kind(SCX_EXIT_ERROR_STALL, - "%s[%d] failed to run for %u.%03us", - p->comm, p->pid, - dur_ms / 1000, dur_ms % 1000); + scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, + "%s[%d] failed to run for %u.%03us", + p->comm, p->pid, dur_ms / 1000, dur_ms % 1000); timed_out = true; break; } } +out_unlock: rq_unlock_irqrestore(rq, &rf); - return timed_out; } @@ -3869,19 +3690,24 @@ static void scx_watchdog_workfn(struct work_struct *work) void scx_tick(struct rq *rq) { + struct scx_sched *sch; unsigned long last_check; if (!scx_enabled()) return; + sch = rcu_dereference_bh(scx_root); + if (unlikely(!sch)) + return; + last_check = READ_ONCE(scx_watchdog_timestamp); if (unlikely(time_after(jiffies, last_check + READ_ONCE(scx_watchdog_timeout)))) { u32 dur_ms = jiffies_to_msecs(jiffies - last_check); - scx_ops_error_kind(SCX_EXIT_ERROR_STALL, - "watchdog failed to check in for %u.%03us", - dur_ms / 1000, dur_ms % 1000); + scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, + "watchdog failed to check in for %u.%03us", + dur_ms / 1000, dur_ms % 1000); } update_other_load_avgs(rq); @@ -3889,6 +3715,8 @@ void scx_tick(struct rq *rq) static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) { + struct scx_sched *sch = scx_root; + update_curr_scx(rq); /* @@ -3898,8 +3726,8 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) if (scx_rq_bypassing(rq)) { curr->scx.slice = 0; touch_core_sched(rq, curr); - } else if (SCX_HAS_OP(tick)) { - SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr); + } else if (SCX_HAS_OP(sch, tick)) { + SCX_CALL_OP_TASK(sch, SCX_KF_REST, tick, rq, curr); } if (!curr->scx.slice) @@ -3964,21 +3792,23 @@ static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) p->scx.flags |= state << SCX_TASK_STATE_SHIFT; } -static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool fork) +static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork) { + struct scx_sched *sch = scx_root; int ret; p->scx.disallow = false; - if (SCX_HAS_OP(init_task)) { + if (SCX_HAS_OP(sch, init_task)) { struct scx_init_task_args args = { SCX_INIT_TASK_ARGS_CGROUP(tg) .fork = fork, }; - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, p, &args); + ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init_task, NULL, + p, &args); if (unlikely(ret)) { - ret = ops_sanitize_err("init_task", ret); + ret = ops_sanitize_err(sch, "init_task", ret); return ret; } } @@ -4006,8 +3836,8 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool task_rq_unlock(rq, p, &rf); } else if (p->policy == SCHED_EXT) { - scx_ops_error("ops.init_task() set task->scx.disallow for %s[%d] during fork", - p->comm, p->pid); + scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork", + p->comm, p->pid); } } @@ -4015,11 +3845,13 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool return 0; } -static void scx_ops_enable_task(struct task_struct *p) +static void scx_enable_task(struct task_struct *p) { + struct scx_sched *sch = scx_root; + struct rq *rq = task_rq(p); u32 weight; - lockdep_assert_rq_held(task_rq(p)); + lockdep_assert_rq_held(rq); /* * Set the weight before calling ops.enable() so that the scheduler @@ -4032,26 +3864,31 @@ static void scx_ops_enable_task(struct task_struct *p) p->scx.weight = sched_weight_to_cgroup(weight); - if (SCX_HAS_OP(enable)) - SCX_CALL_OP_TASK(SCX_KF_REST, enable, p); + if (SCX_HAS_OP(sch, enable)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, enable, rq, p); scx_set_task_state(p, SCX_TASK_ENABLED); - if (SCX_HAS_OP(set_weight)) - SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); + if (SCX_HAS_OP(sch, set_weight)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq, + p, p->scx.weight); } -static void scx_ops_disable_task(struct task_struct *p) +static void scx_disable_task(struct task_struct *p) { - lockdep_assert_rq_held(task_rq(p)); + struct scx_sched *sch = scx_root; + struct rq *rq = task_rq(p); + + lockdep_assert_rq_held(rq); WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); - if (SCX_HAS_OP(disable)) - SCX_CALL_OP_TASK(SCX_KF_REST, disable, p); + if (SCX_HAS_OP(sch, disable)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p); scx_set_task_state(p, SCX_TASK_READY); } -static void scx_ops_exit_task(struct task_struct *p) +static void scx_exit_task(struct task_struct *p) { + struct scx_sched *sch = scx_root; struct scx_exit_task_args args = { .cancelled = false, }; @@ -4067,15 +3904,16 @@ static void scx_ops_exit_task(struct task_struct *p) case SCX_TASK_READY: break; case SCX_TASK_ENABLED: - scx_ops_disable_task(p); + scx_disable_task(p); break; default: WARN_ON_ONCE(true); return; } - if (SCX_HAS_OP(exit_task)) - SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args); + if (SCX_HAS_OP(sch, exit_task)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p), + p, &args); scx_set_task_state(p, SCX_TASK_NONE); } @@ -4107,15 +3945,15 @@ int scx_fork(struct task_struct *p) { percpu_rwsem_assert_held(&scx_fork_rwsem); - if (scx_ops_init_task_enabled) - return scx_ops_init_task(p, task_group(p), true); + if (scx_init_task_enabled) + return scx_init_task(p, task_group(p), true); else return 0; } void scx_post_fork(struct task_struct *p) { - if (scx_ops_init_task_enabled) { + if (scx_init_task_enabled) { scx_set_task_state(p, SCX_TASK_READY); /* @@ -4128,7 +3966,7 @@ void scx_post_fork(struct task_struct *p) struct rq *rq; rq = task_rq_lock(p, &rf); - scx_ops_enable_task(p); + scx_enable_task(p); task_rq_unlock(rq, p, &rf); } } @@ -4148,7 +3986,7 @@ void scx_cancel_fork(struct task_struct *p) rq = task_rq_lock(p, &rf); WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); - scx_ops_exit_task(p); + scx_exit_task(p); task_rq_unlock(rq, p, &rf); } @@ -4164,15 +4002,15 @@ void sched_ext_free(struct task_struct *p) spin_unlock_irqrestore(&scx_tasks_lock, flags); /* - * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY -> - * ENABLED transitions can't race us. Disable ops for @p. + * @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED + * transitions can't race us. Disable ops for @p. */ if (scx_get_task_state(p) != SCX_TASK_NONE) { struct rq_flags rf; struct rq *rq; rq = task_rq_lock(p, &rf); - scx_ops_exit_task(p); + scx_exit_task(p); task_rq_unlock(rq, p, &rf); } } @@ -4180,11 +4018,14 @@ void sched_ext_free(struct task_struct *p) static void reweight_task_scx(struct rq *rq, struct task_struct *p, const struct load_weight *lw) { + struct scx_sched *sch = scx_root; + lockdep_assert_rq_held(task_rq(p)); p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight)); - if (SCX_HAS_OP(set_weight)) - SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); + if (SCX_HAS_OP(sch, set_weight)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq, + p, p->scx.weight); } static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) @@ -4193,20 +4034,22 @@ static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) static void switching_to_scx(struct rq *rq, struct task_struct *p) { - scx_ops_enable_task(p); + struct scx_sched *sch = scx_root; + + scx_enable_task(p); /* * set_cpus_allowed_scx() is not called while @p is associated with a * different scheduler class. Keep the BPF scheduler up-to-date. */ - if (SCX_HAS_OP(set_cpumask)) - SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, - (struct cpumask *)p->cpus_ptr); + if (SCX_HAS_OP(sch, set_cpumask)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, rq, + p, (struct cpumask *)p->cpus_ptr); } static void switched_from_scx(struct rq *rq, struct task_struct *p) { - scx_ops_disable_task(p); + scx_disable_task(p); } static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} @@ -4248,55 +4091,30 @@ bool scx_can_stop_tick(struct rq *rq) DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); static bool scx_cgroup_enabled; -static bool cgroup_warned_missing_weight; -static bool cgroup_warned_missing_idle; - -static void scx_cgroup_warn_missing_weight(struct task_group *tg) -{ - if (scx_ops_enable_state() == SCX_OPS_DISABLED || - cgroup_warned_missing_weight) - return; - - if ((scx_ops.flags & SCX_OPS_HAS_CGROUP_WEIGHT) || !tg->css.parent) - return; - - pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.weight\n", - scx_ops.name); - cgroup_warned_missing_weight = true; -} -static void scx_cgroup_warn_missing_idle(struct task_group *tg) +void scx_tg_init(struct task_group *tg) { - if (!scx_cgroup_enabled || cgroup_warned_missing_idle) - return; - - if (!tg->idle) - return; - - pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.idle\n", - scx_ops.name); - cgroup_warned_missing_idle = true; + tg->scx_weight = CGROUP_WEIGHT_DFL; } int scx_tg_online(struct task_group *tg) { + struct scx_sched *sch = scx_root; int ret = 0; WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); percpu_down_read(&scx_cgroup_rwsem); - scx_cgroup_warn_missing_weight(tg); - if (scx_cgroup_enabled) { - if (SCX_HAS_OP(cgroup_init)) { + if (SCX_HAS_OP(sch, cgroup_init)) { struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, - tg->css.cgroup, &args); + ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, + NULL, tg->css.cgroup, &args); if (ret) - ret = ops_sanitize_err("cgroup_init", ret); + ret = ops_sanitize_err(sch, "cgroup_init", ret); } if (ret == 0) tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; @@ -4310,12 +4128,16 @@ int scx_tg_online(struct task_group *tg) void scx_tg_offline(struct task_group *tg) { + struct scx_sched *sch = scx_root; + WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); percpu_down_read(&scx_cgroup_rwsem); - if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED)) - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup); + if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) && + (tg->scx_flags & SCX_TG_INITED)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, + tg->css.cgroup); tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); percpu_up_read(&scx_cgroup_rwsem); @@ -4323,6 +4145,7 @@ void scx_tg_offline(struct task_group *tg) int scx_cgroup_can_attach(struct cgroup_taskset *tset) { + struct scx_sched *sch = scx_root; struct cgroup_subsys_state *css; struct task_struct *p; int ret; @@ -4347,8 +4170,9 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) if (from == to) continue; - if (SCX_HAS_OP(cgroup_prep_move)) { - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move, + if (SCX_HAS_OP(sch, cgroup_prep_move)) { + ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, + cgroup_prep_move, NULL, p, from, css->cgroup); if (ret) goto err; @@ -4361,18 +4185,21 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) err: cgroup_taskset_for_each(p, css, tset) { - if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, - p->scx.cgrp_moving_from, css->cgroup); + if (SCX_HAS_OP(sch, cgroup_cancel_move) && + p->scx.cgrp_moving_from) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL, + p, p->scx.cgrp_moving_from, css->cgroup); p->scx.cgrp_moving_from = NULL; } percpu_up_read(&scx_cgroup_rwsem); - return ops_sanitize_err("cgroup_prep_move", ret); + return ops_sanitize_err(sch, "cgroup_prep_move", ret); } void scx_cgroup_move_task(struct task_struct *p) { + struct scx_sched *sch = scx_root; + if (!scx_cgroup_enabled) return; @@ -4380,9 +4207,11 @@ void scx_cgroup_move_task(struct task_struct *p) * @p must have ops.cgroup_prep_move() called on it and thus * cgrp_moving_from set. */ - if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) - SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p, - p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); + if (SCX_HAS_OP(sch, cgroup_move) && + !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) + SCX_CALL_OP_TASK(sch, SCX_KF_UNLOCKED, cgroup_move, NULL, + p, p->scx.cgrp_moving_from, + tg_cgrp(task_group(p))); p->scx.cgrp_moving_from = NULL; } @@ -4393,6 +4222,7 @@ void scx_cgroup_finish_attach(void) void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) { + struct scx_sched *sch = scx_root; struct cgroup_subsys_state *css; struct task_struct *p; @@ -4400,9 +4230,10 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) goto out_unlock; cgroup_taskset_for_each(p, css, tset) { - if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, - p->scx.cgrp_moving_from, css->cgroup); + if (SCX_HAS_OP(sch, cgroup_cancel_move) && + p->scx.cgrp_moving_from) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL, + p, p->scx.cgrp_moving_from, css->cgroup); p->scx.cgrp_moving_from = NULL; } out_unlock: @@ -4411,23 +4242,23 @@ out_unlock: void scx_group_set_weight(struct task_group *tg, unsigned long weight) { + struct scx_sched *sch = scx_root; + percpu_down_read(&scx_cgroup_rwsem); - if (scx_cgroup_enabled && tg->scx_weight != weight) { - if (SCX_HAS_OP(cgroup_set_weight)) - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, - tg_cgrp(tg), weight); - tg->scx_weight = weight; - } + if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && + tg->scx_weight != weight) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL, + tg_cgrp(tg), weight); + + tg->scx_weight = weight; percpu_up_read(&scx_cgroup_rwsem); } void scx_group_set_idle(struct task_group *tg, bool idle) { - percpu_down_read(&scx_cgroup_rwsem); - scx_cgroup_warn_missing_idle(tg); - percpu_up_read(&scx_cgroup_rwsem); + /* TODO: Implement ops->cgroup_set_idle() */ } static void scx_cgroup_lock(void) @@ -4506,29 +4337,6 @@ static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) dsq->id = dsq_id; } -static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) -{ - struct scx_dispatch_q *dsq; - int ret; - - if (dsq_id & SCX_DSQ_FLAG_BUILTIN) - return ERR_PTR(-EINVAL); - - dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); - if (!dsq) - return ERR_PTR(-ENOMEM); - - init_dsq(dsq, dsq_id); - - ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node, - dsq_hash_params); - if (ret) { - kfree(dsq); - return ERR_PTR(ret); - } - return dsq; -} - static void free_dsq_irq_workfn(struct irq_work *irq_work) { struct llist_node *to_free = llist_del_all(&dsqs_to_free); @@ -4540,26 +4348,27 @@ static void free_dsq_irq_workfn(struct irq_work *irq_work) static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); -static void destroy_dsq(u64 dsq_id) +static void destroy_dsq(struct scx_sched *sch, u64 dsq_id) { struct scx_dispatch_q *dsq; unsigned long flags; rcu_read_lock(); - dsq = find_user_dsq(dsq_id); + dsq = find_user_dsq(sch, dsq_id); if (!dsq) goto out_unlock_rcu; raw_spin_lock_irqsave(&dsq->lock, flags); if (dsq->nr) { - scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)", - dsq->id, dsq->nr); + scx_error(sch, "attempting to destroy in-use dsq 0x%016llx (nr=%u)", + dsq->id, dsq->nr); goto out_unlock_dsq; } - if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params)) + if (rhashtable_remove_fast(&sch->dsq_hash, &dsq->hash_node, + dsq_hash_params)) goto out_unlock_dsq; /* @@ -4579,7 +4388,7 @@ out_unlock_rcu: } #ifdef CONFIG_EXT_GROUP_SCHED -static void scx_cgroup_exit(void) +static void scx_cgroup_exit(struct scx_sched *sch) { struct cgroup_subsys_state *css; @@ -4599,14 +4408,15 @@ static void scx_cgroup_exit(void) continue; tg->scx_flags &= ~SCX_TG_INITED; - if (!scx_ops.cgroup_exit) + if (!sch->ops.cgroup_exit) continue; if (WARN_ON_ONCE(!css_tryget(css))) continue; rcu_read_unlock(); - SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup); + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, + css->cgroup); rcu_read_lock(); css_put(css); @@ -4614,16 +4424,13 @@ static void scx_cgroup_exit(void) rcu_read_unlock(); } -static int scx_cgroup_init(void) +static int scx_cgroup_init(struct scx_sched *sch) { struct cgroup_subsys_state *css; int ret; percpu_rwsem_assert_held(&scx_cgroup_rwsem); - cgroup_warned_missing_weight = false; - cgroup_warned_missing_idle = false; - /* * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk * cgroups and init, all online cgroups are initialized. @@ -4633,14 +4440,11 @@ static int scx_cgroup_init(void) struct task_group *tg = css_tg(css); struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; - scx_cgroup_warn_missing_weight(tg); - scx_cgroup_warn_missing_idle(tg); - if ((tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) continue; - if (!scx_ops.cgroup_init) { + if (!sch->ops.cgroup_init) { tg->scx_flags |= SCX_TG_INITED; continue; } @@ -4649,11 +4453,11 @@ static int scx_cgroup_init(void) continue; rcu_read_unlock(); - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, + ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL, css->cgroup, &args); if (ret) { css_put(css); - scx_ops_error("ops.cgroup_init() failed (%d)", ret); + scx_error(sch, "ops.cgroup_init() failed (%d)", ret); return ret; } tg->scx_flags |= SCX_TG_INITED; @@ -4670,8 +4474,8 @@ static int scx_cgroup_init(void) } #else -static void scx_cgroup_exit(void) {} -static int scx_cgroup_init(void) { return 0; } +static void scx_cgroup_exit(struct scx_sched *sch) {} +static int scx_cgroup_init(struct scx_sched *sch) { return 0; } #endif @@ -4688,8 +4492,7 @@ static int scx_cgroup_init(void) { return 0; } static ssize_t scx_attr_state_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { - return sysfs_emit(buf, "%s\n", - scx_ops_enable_state_str[scx_ops_enable_state()]); + return sysfs_emit(buf, "%s\n", scx_enable_state_str[scx_enable_state()]); } SCX_ATTR(state); @@ -4734,20 +4537,82 @@ static const struct attribute_group scx_global_attr_group = { .attrs = scx_global_attrs, }; +static void free_exit_info(struct scx_exit_info *ei); + +static void scx_sched_free_rcu_work(struct work_struct *work) +{ + struct rcu_work *rcu_work = to_rcu_work(work); + struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work); + struct rhashtable_iter rht_iter; + struct scx_dispatch_q *dsq; + int node; + + kthread_stop(sch->helper->task); + free_percpu(sch->event_stats_cpu); + + for_each_node_state(node, N_POSSIBLE) + kfree(sch->global_dsqs[node]); + kfree(sch->global_dsqs); + + rhashtable_walk_enter(&sch->dsq_hash, &rht_iter); + do { + rhashtable_walk_start(&rht_iter); + + while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) + destroy_dsq(sch, dsq->id); + + rhashtable_walk_stop(&rht_iter); + } while (dsq == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&rht_iter); + + rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL); + free_exit_info(sch->exit_info); + kfree(sch); +} + static void scx_kobj_release(struct kobject *kobj) { - kfree(kobj); + struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); + + INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work); + queue_rcu_work(system_unbound_wq, &sch->rcu_work); } static ssize_t scx_attr_ops_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { - return sysfs_emit(buf, "%s\n", scx_ops.name); + return sysfs_emit(buf, "%s\n", scx_root->ops.name); } SCX_ATTR(ops); +#define scx_attr_event_show(buf, at, events, kind) ({ \ + sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind); \ +}) + +static ssize_t scx_attr_events_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); + struct scx_event_stats events; + int at = 0; + + scx_read_events(sch, &events); + at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK); + at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST); + at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING); + at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + at += scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL); + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION); + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH); + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE); + return at; +} +SCX_ATTR(events); + static struct attribute *scx_sched_attrs[] = { &scx_attr_ops.attr, + &scx_attr_events.attr, NULL, }; ATTRIBUTE_GROUPS(scx_sched); @@ -4760,7 +4625,7 @@ static const struct kobj_type scx_ktype = { static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) { - return add_uevent_var(env, "SCXOPS=%s", scx_ops.name); + return add_uevent_var(env, "SCXOPS=%s", scx_root->ops.name); } static const struct kset_uevent_ops scx_uevent_ops = { @@ -4773,14 +4638,20 @@ static const struct kset_uevent_ops scx_uevent_ops = { */ bool task_should_scx(int policy) { - if (!scx_enabled() || - unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING)) + if (!scx_enabled() || unlikely(scx_enable_state() == SCX_DISABLING)) return false; if (READ_ONCE(scx_switching_all)) return true; return policy == SCHED_EXT; } +bool scx_allow_ttwu_queue(const struct task_struct *p) +{ + return !scx_enabled() || + (scx_root->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) || + p->sched_class != &ext_sched_class; +} + /** * scx_softlockup - sched_ext softlockup handler * @dur_s: number of seconds of CPU stuck due to soft lockup @@ -4793,40 +4664,48 @@ bool task_should_scx(int policy) */ void scx_softlockup(u32 dur_s) { - switch (scx_ops_enable_state()) { - case SCX_OPS_ENABLING: - case SCX_OPS_ENABLED: + struct scx_sched *sch; + + rcu_read_lock(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + goto out_unlock; + + switch (scx_enable_state()) { + case SCX_ENABLING: + case SCX_ENABLED: break; default: - return; + goto out_unlock; } - /* allow only one instance, cleared at the end of scx_ops_bypass() */ + /* allow only one instance, cleared at the end of scx_bypass() */ if (test_and_set_bit(0, &scx_in_softlockup)) - return; + goto out_unlock; printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n", - smp_processor_id(), dur_s, scx_ops.name); + smp_processor_id(), dur_s, scx_root->ops.name); /* * Some CPUs may be trapped in the dispatch paths. Enable breather - * immediately; otherwise, we might even be able to get to - * scx_ops_bypass(). + * immediately; otherwise, we might even be able to get to scx_bypass(). */ - atomic_inc(&scx_ops_breather_depth); + atomic_inc(&scx_breather_depth); - scx_ops_error("soft lockup - CPU#%d stuck for %us", - smp_processor_id(), dur_s); + scx_error(sch, "soft lockup - CPU#%d stuck for %us", smp_processor_id(), dur_s); +out_unlock: + rcu_read_unlock(); } static void scx_clear_softlockup(void) { if (test_and_clear_bit(0, &scx_in_softlockup)) - atomic_dec(&scx_ops_breather_depth); + atomic_dec(&scx_breather_depth); } /** - * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress + * scx_bypass - [Un]bypass scx_ops and guarantee forward progress * @bypass: true for bypass, false for unbypass * * Bypassing guarantees that all runnable tasks make forward progress without @@ -4856,26 +4735,36 @@ static void scx_clear_softlockup(void) * * - scx_prio_less() reverts to the default core_sched_at order. */ -static void scx_ops_bypass(bool bypass) +static void scx_bypass(bool bypass) { static DEFINE_RAW_SPINLOCK(bypass_lock); - int cpu; + static unsigned long bypass_timestamp; + struct scx_sched *sch; unsigned long flags; + int cpu; raw_spin_lock_irqsave(&bypass_lock, flags); + sch = rcu_dereference_bh(scx_root); + if (bypass) { - scx_ops_bypass_depth++; - WARN_ON_ONCE(scx_ops_bypass_depth <= 0); - if (scx_ops_bypass_depth != 1) + scx_bypass_depth++; + WARN_ON_ONCE(scx_bypass_depth <= 0); + if (scx_bypass_depth != 1) goto unlock; + bypass_timestamp = ktime_get_ns(); + if (sch) + scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1); } else { - scx_ops_bypass_depth--; - WARN_ON_ONCE(scx_ops_bypass_depth < 0); - if (scx_ops_bypass_depth != 0) + scx_bypass_depth--; + WARN_ON_ONCE(scx_bypass_depth < 0); + if (scx_bypass_depth != 0) goto unlock; + if (sch) + scx_add_event(sch, SCX_EV_BYPASS_DURATION, + ktime_get_ns() - bypass_timestamp); } - atomic_inc(&scx_ops_breather_depth); + atomic_inc(&scx_breather_depth); /* * No task property is changing. We just need to make sure all currently @@ -4933,7 +4822,7 @@ static void scx_ops_bypass(bool bypass) raw_spin_rq_unlock(rq); } - atomic_dec(&scx_ops_breather_depth); + atomic_dec(&scx_breather_depth); unlock: raw_spin_unlock_irqrestore(&bypass_lock, flags); scx_clear_softlockup(); @@ -4941,7 +4830,7 @@ unlock: static void free_exit_info(struct scx_exit_info *ei) { - kfree(ei->dump); + kvfree(ei->dump); kfree(ei->msg); kfree(ei->bt); kfree(ei); @@ -4957,7 +4846,7 @@ static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL); ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); - ei->dump = kzalloc(exit_dump_len, GFP_KERNEL); + ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL); if (!ei->bt || !ei->msg || !ei->dump) { free_exit_info(ei); @@ -4989,42 +4878,36 @@ static const char *scx_exit_reason(enum scx_exit_kind kind) } } -static void scx_ops_disable_workfn(struct kthread_work *work) +static void scx_disable_workfn(struct kthread_work *work) { - struct scx_exit_info *ei = scx_exit_info; + struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); + struct scx_exit_info *ei = sch->exit_info; struct scx_task_iter sti; struct task_struct *p; - struct rhashtable_iter rht_iter; - struct scx_dispatch_q *dsq; - int i, kind, cpu; + int kind, cpu; - kind = atomic_read(&scx_exit_kind); + kind = atomic_read(&sch->exit_kind); while (true) { - /* - * NONE indicates that a new scx_ops has been registered since - * disable was scheduled - don't kill the new ops. DONE - * indicates that the ops has already been disabled. - */ - if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE) + if (kind == SCX_EXIT_DONE) /* already disabled? */ return; - if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE)) + WARN_ON_ONCE(kind == SCX_EXIT_NONE); + if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE)) break; } ei->kind = kind; ei->reason = scx_exit_reason(ei->kind); /* guarantee forward progress by bypassing scx_ops */ - scx_ops_bypass(true); + scx_bypass(true); - switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) { - case SCX_OPS_DISABLING: + switch (scx_set_enable_state(SCX_DISABLING)) { + case SCX_DISABLING: WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); break; - case SCX_OPS_DISABLED: + case SCX_DISABLED: pr_warn("sched_ext: ops error detected without ops (%s)\n", - scx_exit_info->msg); - WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != - SCX_OPS_DISABLING); + sch->exit_info->msg); + WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); goto done; default: break; @@ -5035,17 +4918,17 @@ static void scx_ops_disable_workfn(struct kthread_work *work) * we can safely use blocking synchronization constructs. Actually * disable ops. */ - mutex_lock(&scx_ops_enable_mutex); + mutex_lock(&scx_enable_mutex); static_branch_disable(&__scx_switched_all); WRITE_ONCE(scx_switching_all, false); /* * Shut down cgroup support before tasks so that the cgroup attach path - * doesn't race against scx_ops_exit_task(). + * doesn't race against scx_exit_task(). */ scx_cgroup_lock(); - scx_cgroup_exit(); + scx_cgroup_exit(sch); scx_cgroup_unlock(); /* @@ -5054,7 +4937,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) */ percpu_down_write(&scx_fork_rwsem); - scx_ops_init_task_enabled = false; + scx_init_task_enabled = false; scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { @@ -5074,7 +4957,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) sched_enq_and_set_task(&ctx); check_class_changed(task_rq(p), p, old_class, p->prio); - scx_ops_exit_task(p); + scx_exit_task(p); } scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); @@ -5089,97 +4972,71 @@ static void scx_ops_disable_workfn(struct kthread_work *work) } /* no task is on scx, turn off all the switches and flush in-progress calls */ - static_branch_disable(&__scx_ops_enabled); - for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) - static_branch_disable(&scx_has_op[i]); - static_branch_disable(&scx_ops_enq_last); - static_branch_disable(&scx_ops_enq_exiting); - static_branch_disable(&scx_ops_enq_migration_disabled); - static_branch_disable(&scx_ops_cpu_preempt); - static_branch_disable(&scx_builtin_idle_enabled); + static_branch_disable(&__scx_enabled); + bitmap_zero(sch->has_op, SCX_OPI_END); + scx_idle_disable(); synchronize_rcu(); if (ei->kind >= SCX_EXIT_ERROR) { pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", - scx_ops.name, ei->reason); + sch->ops.name, ei->reason); if (ei->msg[0] != '\0') - pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg); + pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg); #ifdef CONFIG_STACKTRACE stack_trace_print(ei->bt, ei->bt_len, 2); #endif } else { pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", - scx_ops.name, ei->reason); + sch->ops.name, ei->reason); } - if (scx_ops.exit) - SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei); + if (sch->ops.exit) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei); cancel_delayed_work_sync(&scx_watchdog_work); /* - * Delete the kobject from the hierarchy eagerly in addition to just - * dropping a reference. Otherwise, if the object is deleted - * asynchronously, sysfs could observe an object of the same name still - * in the hierarchy when another scheduler is loaded. + * scx_root clearing must be inside cpus_read_lock(). See + * handle_hotplug(). */ - kobject_del(scx_root_kobj); - kobject_put(scx_root_kobj); - scx_root_kobj = NULL; - - memset(&scx_ops, 0, sizeof(scx_ops)); - - rhashtable_walk_enter(&dsq_hash, &rht_iter); - do { - rhashtable_walk_start(&rht_iter); - - while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) - destroy_dsq(dsq->id); + cpus_read_lock(); + RCU_INIT_POINTER(scx_root, NULL); + cpus_read_unlock(); - rhashtable_walk_stop(&rht_iter); - } while (dsq == ERR_PTR(-EAGAIN)); - rhashtable_walk_exit(&rht_iter); + /* + * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs + * could observe an object of the same name still in the hierarchy when + * the next scheduler is loaded. + */ + kobject_del(&sch->kobj); free_percpu(scx_dsp_ctx); scx_dsp_ctx = NULL; scx_dsp_max_batch = 0; - free_exit_info(scx_exit_info); - scx_exit_info = NULL; - - mutex_unlock(&scx_ops_enable_mutex); + mutex_unlock(&scx_enable_mutex); - WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != - SCX_OPS_DISABLING); + WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); done: - scx_ops_bypass(false); -} - -static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn); - -static void schedule_scx_ops_disable_work(void) -{ - struct kthread_worker *helper = READ_ONCE(scx_ops_helper); - - /* - * We may be called spuriously before the first bpf_sched_ext_reg(). If - * scx_ops_helper isn't set up yet, there's nothing to do. - */ - if (helper) - kthread_queue_work(helper, &scx_ops_disable_work); + scx_bypass(false); } -static void scx_ops_disable(enum scx_exit_kind kind) +static void scx_disable(enum scx_exit_kind kind) { int none = SCX_EXIT_NONE; + struct scx_sched *sch; if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) kind = SCX_EXIT_ERROR; - atomic_try_cmpxchg(&scx_exit_kind, &none, kind); - - schedule_scx_ops_disable_work(); + rcu_read_lock(); + sch = rcu_dereference(scx_root); + if (sch) { + atomic_try_cmpxchg(&sch->exit_kind, &none, kind); + kthread_queue_work(sch->helper, &sch->disable_work); + } + rcu_read_unlock(); } static void dump_newline(struct seq_buf *s) @@ -5297,6 +5154,7 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, struct task_struct *p, char marker) { static unsigned long bt[SCX_EXIT_BT_LEN]; + struct scx_sched *sch = scx_root; char dsq_id_buf[19] = "(n/a)"; unsigned long ops_state = atomic_long_read(&p->scx.ops_state); unsigned int bt_len = 0; @@ -5319,9 +5177,9 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, p->scx.dsq_vtime, p->scx.slice, p->scx.weight); dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); - if (SCX_HAS_OP(dump_task)) { + if (SCX_HAS_OP(sch, dump_task)) { ops_dump_init(s, " "); - SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p); + SCX_CALL_OP(sch, SCX_KF_REST, dump_task, NULL, dctx, p); ops_dump_exit(); } @@ -5338,6 +5196,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) { static DEFINE_SPINLOCK(dump_lock); static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; + struct scx_sched *sch = scx_root; struct scx_dump_ctx dctx = { .kind = ei->kind, .exit_code = ei->exit_code, @@ -5346,6 +5205,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) .at_jiffies = jiffies, }; struct seq_buf s; + struct scx_event_stats events; unsigned long flags; char *buf; int cpu; @@ -5365,9 +5225,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) dump_stack_trace(&s, " ", ei->bt, ei->bt_len); } - if (SCX_HAS_OP(dump)) { + if (SCX_HAS_OP(sch, dump)) { ops_dump_init(&s, ""); - SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx); + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, dump, NULL, &dctx); ops_dump_exit(); } @@ -5388,7 +5248,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) idle = list_empty(&rq->scx.runnable_list) && rq->curr->sched_class == &idle_sched_class; - if (idle && !SCX_HAS_OP(dump_cpu)) + if (idle && !SCX_HAS_OP(sch, dump_cpu)) goto next; /* @@ -5422,9 +5282,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) cpumask_pr_args(rq->scx.cpus_to_wait)); used = seq_buf_used(&ns); - if (SCX_HAS_OP(dump_cpu)) { + if (SCX_HAS_OP(sch, dump_cpu)) { ops_dump_init(&ns, " "); - SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle); + SCX_CALL_OP(sch, SCX_KF_REST, dump_cpu, NULL, + &dctx, cpu, idle); ops_dump_exit(); } @@ -5454,6 +5315,21 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) rq_unlock(rq, &rf); } + dump_newline(&s); + dump_line(&s, "Event counters"); + dump_line(&s, "--------------"); + + scx_read_events(sch, &events); + scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK); + scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); + scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); + scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL); + scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION); + scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH); + scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE); + if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) memcpy(ei->dump + dump_len - sizeof(trunc_marker), trunc_marker, sizeof(trunc_marker)); @@ -5461,27 +5337,25 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) spin_unlock_irqrestore(&dump_lock, flags); } -static void scx_ops_error_irq_workfn(struct irq_work *irq_work) +static void scx_error_irq_workfn(struct irq_work *irq_work) { - struct scx_exit_info *ei = scx_exit_info; + struct scx_sched *sch = container_of(irq_work, struct scx_sched, error_irq_work); + struct scx_exit_info *ei = sch->exit_info; if (ei->kind >= SCX_EXIT_ERROR) - scx_dump_state(ei, scx_ops.exit_dump_len); + scx_dump_state(ei, sch->ops.exit_dump_len); - schedule_scx_ops_disable_work(); + kthread_queue_work(sch->helper, &sch->disable_work); } -static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn); - -static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, - s64 exit_code, - const char *fmt, ...) +static void scx_vexit(struct scx_sched *sch, + enum scx_exit_kind kind, s64 exit_code, + const char *fmt, va_list args) { - struct scx_exit_info *ei = scx_exit_info; + struct scx_exit_info *ei = sch->exit_info; int none = SCX_EXIT_NONE; - va_list args; - if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind)) + if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind)) return; ei->exit_code = exit_code; @@ -5489,31 +5363,98 @@ static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, if (kind >= SCX_EXIT_ERROR) ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); #endif - va_start(args, fmt); vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); - va_end(args); /* * Set ei->kind and ->reason for scx_dump_state(). They'll be set again - * in scx_ops_disable_workfn(). + * in scx_disable_workfn(). */ ei->kind = kind; ei->reason = scx_exit_reason(ei->kind); - irq_work_queue(&scx_ops_error_irq_work); + irq_work_queue(&sch->error_irq_work); } -static struct kthread_worker *scx_create_rt_helper(const char *name) +static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) { - struct kthread_worker *helper; + struct scx_sched *sch; + int node, ret; - helper = kthread_run_worker(0, name); - if (helper) - sched_set_fifo(helper->task); - return helper; -} + sch = kzalloc(sizeof(*sch), GFP_KERNEL); + if (!sch) + return ERR_PTR(-ENOMEM); -static void check_hotplug_seq(const struct sched_ext_ops *ops) + sch->exit_info = alloc_exit_info(ops->exit_dump_len); + if (!sch->exit_info) { + ret = -ENOMEM; + goto err_free_sch; + } + + ret = rhashtable_init(&sch->dsq_hash, &dsq_hash_params); + if (ret < 0) + goto err_free_ei; + + sch->global_dsqs = kcalloc(nr_node_ids, sizeof(sch->global_dsqs[0]), + GFP_KERNEL); + if (!sch->global_dsqs) { + ret = -ENOMEM; + goto err_free_hash; + } + + for_each_node_state(node, N_POSSIBLE) { + struct scx_dispatch_q *dsq; + + dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node); + if (!dsq) { + ret = -ENOMEM; + goto err_free_gdsqs; + } + + init_dsq(dsq, SCX_DSQ_GLOBAL); + sch->global_dsqs[node] = dsq; + } + + sch->event_stats_cpu = alloc_percpu(struct scx_event_stats); + if (!sch->event_stats_cpu) + goto err_free_gdsqs; + + sch->helper = kthread_run_worker(0, "sched_ext_helper"); + if (!sch->helper) + goto err_free_event_stats; + sched_set_fifo(sch->helper->task); + + atomic_set(&sch->exit_kind, SCX_EXIT_NONE); + init_irq_work(&sch->error_irq_work, scx_error_irq_workfn); + kthread_init_work(&sch->disable_work, scx_disable_workfn); + sch->ops = *ops; + ops->priv = sch; + + sch->kobj.kset = scx_kset; + ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); + if (ret < 0) + goto err_stop_helper; + + return sch; + +err_stop_helper: + kthread_stop(sch->helper->task); +err_free_event_stats: + free_percpu(sch->event_stats_cpu); +err_free_gdsqs: + for_each_node_state(node, N_POSSIBLE) + kfree(sch->global_dsqs[node]); + kfree(sch->global_dsqs); +err_free_hash: + rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL); +err_free_ei: + free_exit_info(sch->exit_info); +err_free_sch: + kfree(sch); + return ERR_PTR(ret); +} + +static void check_hotplug_seq(struct scx_sched *sch, + const struct sched_ext_ops *ops) { unsigned long long global_hotplug_seq; @@ -5525,33 +5466,48 @@ static void check_hotplug_seq(const struct sched_ext_ops *ops) if (ops->hotplug_seq) { global_hotplug_seq = atomic_long_read(&scx_hotplug_seq); if (ops->hotplug_seq != global_hotplug_seq) { - scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, - "expected hotplug seq %llu did not match actual %llu", - ops->hotplug_seq, global_hotplug_seq); + scx_exit(sch, SCX_EXIT_UNREG_KERN, + SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "expected hotplug seq %llu did not match actual %llu", + ops->hotplug_seq, global_hotplug_seq); } } } -static int validate_ops(const struct sched_ext_ops *ops) +static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) { /* * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the * ops.enqueue() callback isn't implemented. */ if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) { - scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); + scx_error(sch, "SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); + return -EINVAL; + } + + /* + * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle + * selection policy to be enabled. + */ + if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) && + (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) { + scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled"); return -EINVAL; } + if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT) + pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n"); + return 0; } -static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) +static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) { + struct scx_sched *sch; struct scx_task_iter sti; struct task_struct *p; unsigned long timeout; - int i, cpu, node, ret; + int i, cpu, ret; if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), cpu_possible_mask)) { @@ -5559,78 +5515,25 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) return -EINVAL; } - mutex_lock(&scx_ops_enable_mutex); - - if (!scx_ops_helper) { - WRITE_ONCE(scx_ops_helper, - scx_create_rt_helper("sched_ext_ops_helper")); - if (!scx_ops_helper) { - ret = -ENOMEM; - goto err_unlock; - } - } - - if (!global_dsqs) { - struct scx_dispatch_q **dsqs; - - dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL); - if (!dsqs) { - ret = -ENOMEM; - goto err_unlock; - } - - for_each_node_state(node, N_POSSIBLE) { - struct scx_dispatch_q *dsq; - - dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node); - if (!dsq) { - for_each_node_state(node, N_POSSIBLE) - kfree(dsqs[node]); - kfree(dsqs); - ret = -ENOMEM; - goto err_unlock; - } + mutex_lock(&scx_enable_mutex); - init_dsq(dsq, SCX_DSQ_GLOBAL); - dsqs[node] = dsq; - } - - global_dsqs = dsqs; - } - - if (scx_ops_enable_state() != SCX_OPS_DISABLED) { + if (scx_enable_state() != SCX_DISABLED) { ret = -EBUSY; goto err_unlock; } - scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL); - if (!scx_root_kobj) { - ret = -ENOMEM; + sch = scx_alloc_and_add_sched(ops); + if (IS_ERR(sch)) { + ret = PTR_ERR(sch); goto err_unlock; } - scx_root_kobj->kset = scx_kset; - ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root"); - if (ret < 0) - goto err; - - scx_exit_info = alloc_exit_info(ops->exit_dump_len); - if (!scx_exit_info) { - ret = -ENOMEM; - goto err_del; - } - /* - * Set scx_ops, transition to ENABLING and clear exit info to arm the - * disable path. Failure triggers full disabling from here on. + * Transition to ENABLING and clear exit info to arm the disable path. + * Failure triggers full disabling from here on. */ - scx_ops = *ops; - - WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_ENABLING) != - SCX_OPS_DISABLED); - - atomic_set(&scx_exit_kind, SCX_EXIT_NONE); - scx_warned_zero_slice = false; + WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED); + WARN_ON_ONCE(scx_root); atomic_long_set(&scx_nr_rejected, 0); @@ -5643,27 +5546,34 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) */ cpus_read_lock(); - if (scx_ops.init) { - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init); + /* + * Make the scheduler instance visible. Must be inside cpus_read_lock(). + * See handle_hotplug(). + */ + rcu_assign_pointer(scx_root, sch); + + scx_idle_enable(ops); + + if (sch->ops.init) { + ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL); if (ret) { - ret = ops_sanitize_err("init", ret); + ret = ops_sanitize_err(sch, "init", ret); cpus_read_unlock(); - scx_ops_error("ops.init() failed (%d)", ret); + scx_error(sch, "ops.init() failed (%d)", ret); goto err_disable; } } for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) if (((void (**)(void))ops)[i]) - static_branch_enable_cpuslocked(&scx_has_op[i]); + set_bit(i, sch->has_op); + + check_hotplug_seq(sch, ops); + scx_idle_update_selcpu_topology(ops); - check_hotplug_seq(ops); -#ifdef CONFIG_SMP - update_selcpu_topology(); -#endif cpus_read_unlock(); - ret = validate_ops(ops); + ret = validate_ops(sch, ops); if (ret) goto err_disable; @@ -5688,33 +5598,19 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_watchdog_timeout / 2); /* - * Once __scx_ops_enabled is set, %current can be switched to SCX - * anytime. This can lead to stalls as some BPF schedulers (e.g. - * userspace scheduling) may not function correctly before all tasks are - * switched. Init in bypass mode to guarantee forward progress. + * Once __scx_enabled is set, %current can be switched to SCX anytime. + * This can lead to stalls as some BPF schedulers (e.g. userspace + * scheduling) may not function correctly before all tasks are switched. + * Init in bypass mode to guarantee forward progress. */ - scx_ops_bypass(true); + scx_bypass(true); for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) if (((void (**)(void))ops)[i]) - static_branch_enable(&scx_has_op[i]); - - if (ops->flags & SCX_OPS_ENQ_LAST) - static_branch_enable(&scx_ops_enq_last); + set_bit(i, sch->has_op); - if (ops->flags & SCX_OPS_ENQ_EXITING) - static_branch_enable(&scx_ops_enq_exiting); - if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED) - static_branch_enable(&scx_ops_enq_migration_disabled); - if (scx_ops.cpu_acquire || scx_ops.cpu_release) - static_branch_enable(&scx_ops_cpu_preempt); - - if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { - reset_idle_masks(); - static_branch_enable(&scx_builtin_idle_enabled); - } else { - static_branch_disable(&scx_builtin_idle_enabled); - } + if (sch->ops.cpu_acquire || sch->ops.cpu_release) + sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT; /* * Lock out forks, cgroup on/offlining and moves before opening the @@ -5722,8 +5618,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) */ percpu_down_write(&scx_fork_rwsem); - WARN_ON_ONCE(scx_ops_init_task_enabled); - scx_ops_init_task_enabled = true; + WARN_ON_ONCE(scx_init_task_enabled); + scx_init_task_enabled = true; /* * Enable ops for every task. Fork is excluded by scx_fork_rwsem @@ -5732,14 +5628,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) * tasks. Prep all tasks first and then enable them with preemption * disabled. * - * All cgroups should be initialized before scx_ops_init_task() so that - * the BPF scheduler can reliably track each task's cgroup membership - * from scx_ops_init_task(). Lock out cgroup on/offlining and task - * migrations while tasks are being initialized so that - * scx_cgroup_can_attach() never sees uninitialized tasks. + * All cgroups should be initialized before scx_init_task() so that the + * BPF scheduler can reliably track each task's cgroup membership from + * scx_init_task(). Lock out cgroup on/offlining and task migrations + * while tasks are being initialized so that scx_cgroup_can_attach() + * never sees uninitialized tasks. */ scx_cgroup_lock(); - ret = scx_cgroup_init(); + ret = scx_cgroup_init(sch); if (ret) goto err_disable_unlock_all; @@ -5755,13 +5651,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_task_iter_unlock(&sti); - ret = scx_ops_init_task(p, task_group(p), false); + ret = scx_init_task(p, task_group(p), false); if (ret) { put_task_struct(p); scx_task_iter_relock(&sti); scx_task_iter_stop(&sti); - scx_ops_error("ops.init_task() failed (%d) for %s[%d]", - ret, p->comm, p->pid); + scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", + ret, p->comm, p->pid); goto err_disable_unlock_all; } @@ -5779,7 +5675,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) * all eligible tasks. */ WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); - static_branch_enable(&__scx_ops_enabled); + static_branch_enable(&__scx_enabled); /* * We're fully committed and can't fail. The task READY -> ENABLED @@ -5810,10 +5706,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); - scx_ops_bypass(false); + scx_bypass(false); - if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { - WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); + if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) { + WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE); goto err_disable; } @@ -5821,44 +5717,35 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) static_branch_enable(&__scx_switched_all); pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n", - scx_ops.name, scx_switched_all() ? "" : " (partial)"); - kobject_uevent(scx_root_kobj, KOBJ_ADD); - mutex_unlock(&scx_ops_enable_mutex); + sch->ops.name, scx_switched_all() ? "" : " (partial)"); + kobject_uevent(&sch->kobj, KOBJ_ADD); + mutex_unlock(&scx_enable_mutex); atomic_long_inc(&scx_enable_seq); return 0; -err_del: - kobject_del(scx_root_kobj); -err: - kobject_put(scx_root_kobj); - scx_root_kobj = NULL; - if (scx_exit_info) { - free_exit_info(scx_exit_info); - scx_exit_info = NULL; - } err_unlock: - mutex_unlock(&scx_ops_enable_mutex); + mutex_unlock(&scx_enable_mutex); return ret; err_disable_unlock_all: scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); - scx_ops_bypass(false); + scx_bypass(false); err_disable: - mutex_unlock(&scx_ops_enable_mutex); + mutex_unlock(&scx_enable_mutex); /* * Returning an error code here would not pass all the error information - * to userspace. Record errno using scx_ops_error() for cases - * scx_ops_error() wasn't already invoked and exit indicating success so - * that the error is notified through ops.exit() with all the details. + * to userspace. Record errno using scx_error() for cases scx_error() + * wasn't already invoked and exit indicating success so that the error + * is notified through ops.exit() with all the details. * - * Flush scx_ops_disable_work to ensure that error is reported before - * init completion. + * Flush scx_disable_work to ensure that error is reported before init + * completion. sch's base reference will be put by bpf_scx_unreg(). */ - scx_ops_error("scx_ops_enable() failed (%d)", ret); - kthread_flush_work(&scx_ops_disable_work); + scx_error(sch, "scx_enable() failed (%d)", ret); + kthread_flush_work(&sch->disable_work); return 0; } @@ -5909,21 +5796,8 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, return -EACCES; } -static const struct bpf_func_proto * -bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -{ - switch (func_id) { - case BPF_FUNC_task_storage_get: - return &bpf_task_storage_get_proto; - case BPF_FUNC_task_storage_delete: - return &bpf_task_storage_delete_proto; - default: - return bpf_base_func_proto(func_id, prog); - } -} - static const struct bpf_verifier_ops bpf_scx_verifier_ops = { - .get_func_proto = bpf_scx_get_func_proto, + .get_func_proto = bpf_base_func_proto, .is_valid_access = bpf_scx_is_valid_access, .btf_struct_access = bpf_scx_btf_struct_access, }; @@ -6002,13 +5876,17 @@ static int bpf_scx_check_member(const struct btf_type *t, static int bpf_scx_reg(void *kdata, struct bpf_link *link) { - return scx_ops_enable(kdata, link); + return scx_enable(kdata, link); } static void bpf_scx_unreg(void *kdata, struct bpf_link *link) { - scx_ops_disable(SCX_EXIT_UNREG); - kthread_flush_work(&scx_ops_disable_work); + struct sched_ext_ops *ops = kdata; + struct scx_sched *sch = ops->priv; + + scx_disable(SCX_EXIT_UNREG); + kthread_flush_work(&sch->disable_work); + kobject_put(&sch->kobj); } static int bpf_scx_init(struct btf *btf) @@ -6130,10 +6008,7 @@ static struct bpf_struct_ops bpf_sched_ext_ops = { static void sysrq_handle_sched_ext_reset(u8 key) { - if (scx_ops_helper) - scx_ops_disable(SCX_EXIT_SYSRQ); - else - pr_info("sched_ext: BPF scheduler not yet used\n"); + scx_disable(SCX_EXIT_SYSRQ); } static const struct sysrq_key_op sysrq_sched_ext_reset_op = { @@ -6281,13 +6156,14 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) */ void print_scx_info(const char *log_lvl, struct task_struct *p) { - enum scx_ops_enable_state state = scx_ops_enable_state(); + struct scx_sched *sch = scx_root; + enum scx_enable_state state = scx_enable_state(); const char *all = READ_ONCE(scx_switching_all) ? "+all" : ""; char runnable_at_buf[22] = "?"; struct sched_class *class; unsigned long runnable_at; - if (state == SCX_OPS_DISABLED) + if (state == SCX_DISABLED) return; /* @@ -6296,8 +6172,8 @@ void print_scx_info(const char *log_lvl, struct task_struct *p) */ if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || class != &ext_sched_class) { - printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name, - scx_ops_enable_state_str[state], all); + printk("%sSched_ext: %s (%s%s)", log_lvl, sch->ops.name, + scx_enable_state_str[state], all); return; } @@ -6308,7 +6184,7 @@ void print_scx_info(const char *log_lvl, struct task_struct *p) /* print everything onto one line to conserve console space */ printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", - log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all, + log_lvl, sch->ops.name, scx_enable_state_str[state], all, runnable_at_buf); } @@ -6324,12 +6200,12 @@ static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void * case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: case PM_RESTORE_PREPARE: - scx_ops_bypass(true); + scx_bypass(true); break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: case PM_POST_RESTORE: - scx_ops_bypass(false); + scx_bypass(false); break; } @@ -6352,11 +6228,8 @@ void __init init_sched_ext_class(void) WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT | SCX_TG_ONLINE); - BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); -#ifdef CONFIG_SMP - BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); - BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); -#endif + scx_idle_init_masks(); + scx_kick_cpus_pnt_seqs = __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, __alignof__(scx_kick_cpus_pnt_seqs[0])); @@ -6364,15 +6237,16 @@ void __init init_sched_ext_class(void) for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); + int n = cpu_to_node(cpu); init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); INIT_LIST_HEAD(&rq->scx.runnable_list); INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL)); - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn); init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); @@ -6389,62 +6263,6 @@ void __init init_sched_ext_class(void) /******************************************************************************** * Helpers that can be called from the BPF scheduler. */ -#include <linux/btf_ids.h> - -__bpf_kfunc_start_defs(); - -static bool check_builtin_idle_enabled(void) -{ - if (static_branch_likely(&scx_builtin_idle_enabled)) - return true; - - scx_ops_error("built-in idle tracking is disabled"); - return false; -} - -/** - * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() - * @p: task_struct to select a CPU for - * @prev_cpu: CPU @p was on previously - * @wake_flags: %SCX_WAKE_* flags - * @is_idle: out parameter indicating whether the returned CPU is idle - * - * Can only be called from ops.select_cpu() if the built-in CPU selection is - * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. - * @p, @prev_cpu and @wake_flags match ops.select_cpu(). - * - * Returns the picked CPU with *@is_idle indicating whether the picked CPU is - * currently idle and thus a good candidate for direct dispatching. - */ -__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, - u64 wake_flags, bool *is_idle) -{ - if (!check_builtin_idle_enabled()) - goto prev_cpu; - - if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) - goto prev_cpu; - -#ifdef CONFIG_SMP - return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); -#endif - -prev_cpu: - *is_idle = false; - return prev_cpu; -} - -__bpf_kfunc_end_defs(); - -BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) -BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) -BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) - -static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { - .owner = THIS_MODULE, - .set = &scx_kfunc_ids_select_cpu, -}; - static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags) { if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) @@ -6453,12 +6271,12 @@ static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags) lockdep_assert_irqs_disabled(); if (unlikely(!p)) { - scx_ops_error("called with NULL task"); + scx_kf_error("called with NULL task"); return false; } if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { - scx_ops_error("invalid enq_flags 0x%llx", enq_flags); + scx_kf_error("invalid enq_flags 0x%llx", enq_flags); return false; } @@ -6478,7 +6296,7 @@ static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id, } if (unlikely(dspc->cursor >= scx_dsp_max_batch)) { - scx_ops_error("dispatch buffer overflow"); + scx_kf_error("dispatch buffer overflow"); return; } @@ -6610,6 +6428,7 @@ static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, struct task_struct *p, u64 dsq_id, u64 enq_flags) { + struct scx_sched *sch = scx_root; struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; struct rq *this_rq, *src_rq, *locked_rq; bool dispatched = false; @@ -6644,7 +6463,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, * cause similar live-lock conditions as consume_dispatch_q(). Insert a * breather if necessary. */ - scx_ops_breather(src_rq); + scx_breather(src_rq); locked_rq = src_rq; raw_spin_lock(&src_dsq->lock); @@ -6662,7 +6481,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, } /* @p is still on $src_dsq and stable, determine the destination */ - dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p); + dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, p); /* * Apply vtime and slice updates before moving so that the new time is @@ -6675,7 +6494,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, p->scx.slice = kit->slice; /* execute move */ - locked_rq = move_task_between_dsqs(p, enq_flags, src_dsq, dst_dsq); + locked_rq = move_task_between_dsqs(sch, p, enq_flags, src_dsq, dst_dsq); dispatched = true; out: if (in_balance) { @@ -6723,7 +6542,7 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void) if (dspc->cursor > 0) dspc->cursor--; else - scx_ops_error("dispatch buffer underflow"); + scx_kf_error("dispatch buffer underflow"); } /** @@ -6742,21 +6561,22 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void) */ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) { + struct scx_sched *sch = scx_root; struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct scx_dispatch_q *dsq; if (!scx_kf_allowed(SCX_KF_DISPATCH)) return false; - flush_dispatch_buf(dspc->rq); + flush_dispatch_buf(sch, dspc->rq); - dsq = find_user_dsq(dsq_id); + dsq = find_user_dsq(sch, dsq_id); if (unlikely(!dsq)) { - scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); + scx_error(sch, "invalid DSQ ID 0x%016llx", dsq_id); return false; } - if (consume_dispatch_q(dspc->rq, dsq)) { + if (consume_dispatch_q(sch, dspc->rq, dsq)) { /* * A successfully consumed task can be dequeued before it starts * running while the CPU is trying to migrate other dispatched @@ -6966,8 +6786,12 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void) * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to * the current local DSQ for running tasks and thus are not * visible to the BPF scheduler. + * + * Also skip re-enqueueing tasks that can only run on this + * CPU, as they would just be re-added to the same local + * DSQ without any benefit. */ - if (p->migration_pending) + if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1) continue; dispatch_dequeue(rq, p); @@ -7006,10 +6830,36 @@ __bpf_kfunc_start_defs(); */ __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) { + struct scx_dispatch_q *dsq; + struct scx_sched *sch; + s32 ret; + if (unlikely(node >= (int)nr_node_ids || (node < 0 && node != NUMA_NO_NODE))) return -EINVAL; - return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node)); + + if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) + return -EINVAL; + + dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); + if (!dsq) + return -ENOMEM; + + init_dsq(dsq, dsq_id); + + rcu_read_lock(); + + sch = rcu_dereference(scx_root); + if (sch) + ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node, + dsq_hash_params); + else + ret = -ENODEV; + + rcu_read_unlock(); + if (ret) + kfree(dsq); + return ret; } __bpf_kfunc_end_defs(); @@ -7048,7 +6898,7 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) struct rq *this_rq; unsigned long irq_flags; - if (!ops_cpu_valid(cpu, NULL)) + if (!kf_cpu_valid(cpu, NULL)) return; local_irq_save(irq_flags); @@ -7072,7 +6922,7 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) struct rq *target_rq = cpu_rq(cpu); if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) - scx_ops_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); + scx_kf_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); if (raw_spin_rq_trylock(target_rq)) { if (can_skip_idle_kick(target_rq)) { @@ -7105,23 +6955,30 @@ out: */ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) { + struct scx_sched *sch; struct scx_dispatch_q *dsq; s32 ret; preempt_disable(); + sch = rcu_dereference_sched(scx_root); + if (unlikely(!sch)) { + ret = -ENODEV; + goto out; + } + if (dsq_id == SCX_DSQ_LOCAL) { ret = READ_ONCE(this_rq()->scx.local_dsq.nr); goto out; } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; - if (ops_cpu_valid(cpu, NULL)) { + if (ops_cpu_valid(sch, cpu, NULL)) { ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr); goto out; } } else { - dsq = find_user_dsq(dsq_id); + dsq = find_user_dsq(sch, dsq_id); if (dsq) { ret = READ_ONCE(dsq->nr); goto out; @@ -7144,7 +7001,13 @@ out: */ __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) { - destroy_dsq(dsq_id); + struct scx_sched *sch; + + rcu_read_lock(); + sch = rcu_dereference(scx_root); + if (sch) + destroy_dsq(sch, dsq_id); + rcu_read_unlock(); } /** @@ -7161,16 +7024,27 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) { struct bpf_iter_scx_dsq_kern *kit = (void *)it; + struct scx_sched *sch; BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) > sizeof(struct bpf_iter_scx_dsq)); BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != __alignof__(struct bpf_iter_scx_dsq)); + /* + * next() and destroy() will be called regardless of the return value. + * Always clear $kit->dsq. + */ + kit->dsq = NULL; + + sch = rcu_dereference_check(scx_root, rcu_read_lock_bh_held()); + if (unlikely(!sch)) + return -ENODEV; + if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) return -EINVAL; - kit->dsq = find_user_dsq(dsq_id); + kit->dsq = find_user_dsq(sch, dsq_id); if (!kit->dsq) return -ENOENT; @@ -7260,21 +7134,20 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || (data__sz && !data)) { - scx_ops_error("invalid data=%p and data__sz=%u", - (void *)data, data__sz); + scx_kf_error("invalid data=%p and data__sz=%u", (void *)data, data__sz); return -EINVAL; } ret = copy_from_kernel_nofault(data_buf, data, data__sz); if (ret < 0) { - scx_ops_error("failed to read data fields (%d)", ret); + scx_kf_error("failed to read data fields (%d)", ret); return ret; } ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, &bprintf_data); if (ret < 0) { - scx_ops_error("format preparation failed (%d)", ret); + scx_kf_error("format preparation failed (%d)", ret); return ret; } @@ -7282,8 +7155,7 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, bprintf_data.bin_args); bpf_bprintf_cleanup(&bprintf_data); if (ret < 0) { - scx_ops_error("(\"%s\", %p, %u) failed to format", - fmt, data, data__sz); + scx_kf_error("(\"%s\", %p, %u) failed to format", fmt, data, data__sz); return ret; } @@ -7316,8 +7188,7 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) - scx_ops_exit_kind(SCX_EXIT_UNREG_BPF, exit_code, "%s", - scx_exit_bstr_buf.line); + scx_kf_exit(SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); } @@ -7337,8 +7208,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) - scx_ops_exit_kind(SCX_EXIT_ERROR_BPF, 0, "%s", - scx_exit_bstr_buf.line); + scx_kf_exit(SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); } @@ -7362,7 +7232,7 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, s32 ret; if (raw_smp_processor_id() != dd->cpu) { - scx_ops_error("scx_bpf_dump() must only be called from ops.dump() and friends"); + scx_kf_error("scx_bpf_dump() must only be called from ops.dump() and friends"); return; } @@ -7403,7 +7273,7 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, */ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) { - if (ops_cpu_valid(cpu, NULL)) + if (kf_cpu_valid(cpu, NULL)) return arch_scale_cpu_capacity(cpu); else return SCX_CPUPERF_ONE; @@ -7425,7 +7295,7 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) */ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) { - if (ops_cpu_valid(cpu, NULL)) + if (kf_cpu_valid(cpu, NULL)) return arch_scale_freq_capacity(cpu); else return SCX_CPUPERF_ONE; @@ -7448,22 +7318,51 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) { if (unlikely(perf > SCX_CPUPERF_ONE)) { - scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu); + scx_kf_error("Invalid cpuperf target %u for CPU %d", perf, cpu); return; } - if (ops_cpu_valid(cpu, NULL)) { - struct rq *rq = cpu_rq(cpu); + if (kf_cpu_valid(cpu, NULL)) { + struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq(); + struct rq_flags rf; + + /* + * When called with an rq lock held, restrict the operation + * to the corresponding CPU to prevent ABBA deadlocks. + */ + if (locked_rq && rq != locked_rq) { + scx_kf_error("Invalid target CPU %d", cpu); + return; + } + + /* + * If no rq lock is held, allow to operate on any CPU by + * acquiring the corresponding rq lock. + */ + if (!locked_rq) { + rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); + } rq->scx.cpuperf_target = perf; + cpufreq_update_util(rq, 0); - rcu_read_lock_sched_notrace(); - cpufreq_update_util(cpu_rq(cpu), 0); - rcu_read_unlock_sched_notrace(); + if (!locked_rq) + rq_unlock_irqrestore(rq, &rf); } } /** + * scx_bpf_nr_node_ids - Return the number of possible node IDs + * + * All valid node IDs in the system are smaller than the returned value. + */ +__bpf_kfunc u32 scx_bpf_nr_node_ids(void) +{ + return nr_node_ids; +} + +/** * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs * * All valid CPU IDs in the system are smaller than the returned value. @@ -7504,142 +7403,6 @@ __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask) } /** - * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking - * per-CPU cpumask. - * - * Returns NULL if idle tracking is not enabled, or running on a UP kernel. - */ -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) -{ - if (!check_builtin_idle_enabled()) - return cpu_none_mask; - -#ifdef CONFIG_SMP - return idle_masks.cpu; -#else - return cpu_none_mask; -#endif -} - -/** - * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, - * per-physical-core cpumask. Can be used to determine if an entire physical - * core is free. - * - * Returns NULL if idle tracking is not enabled, or running on a UP kernel. - */ -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) -{ - if (!check_builtin_idle_enabled()) - return cpu_none_mask; - -#ifdef CONFIG_SMP - if (sched_smt_active()) - return idle_masks.smt; - else - return idle_masks.cpu; -#else - return cpu_none_mask; -#endif -} - -/** - * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to - * either the percpu, or SMT idle-tracking cpumask. - * @idle_mask: &cpumask to use - */ -__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) -{ - /* - * Empty function body because we aren't actually acquiring or releasing - * a reference to a global idle cpumask, which is read-only in the - * caller and is never released. The acquire / release semantics here - * are just used to make the cpumask a trusted pointer in the caller. - */ -} - -/** - * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state - * @cpu: cpu to test and clear idle for - * - * Returns %true if @cpu was idle and its idle state was successfully cleared. - * %false otherwise. - * - * Unavailable if ops.update_idle() is implemented and - * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. - */ -__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) -{ - if (!check_builtin_idle_enabled()) - return false; - - if (ops_cpu_valid(cpu, NULL)) - return test_and_clear_cpu_idle(cpu); - else - return false; -} - -/** - * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu - * @cpus_allowed: Allowed cpumask - * @flags: %SCX_PICK_IDLE_CPU_* flags - * - * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu - * number on success. -%EBUSY if no matching cpu was found. - * - * Idle CPU tracking may race against CPU scheduling state transitions. For - * example, this function may return -%EBUSY as CPUs are transitioning into the - * idle state. If the caller then assumes that there will be dispatch events on - * the CPUs as they were all busy, the scheduler may end up stalling with CPUs - * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and - * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch - * event in the near future. - * - * Unavailable if ops.update_idle() is implemented and - * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. - */ -__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, - u64 flags) -{ - if (!check_builtin_idle_enabled()) - return -EBUSY; - - return scx_pick_idle_cpu(cpus_allowed, flags); -} - -/** - * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU - * @cpus_allowed: Allowed cpumask - * @flags: %SCX_PICK_IDLE_CPU_* flags - * - * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any - * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu - * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is - * empty. - * - * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not - * set, this function can't tell which CPUs are idle and will always pick any - * CPU. - */ -__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, - u64 flags) -{ - s32 cpu; - - if (static_branch_likely(&scx_builtin_idle_enabled)) { - cpu = scx_pick_idle_cpu(cpus_allowed, flags); - if (cpu >= 0) - return cpu; - } - - cpu = cpumask_any_distribute(cpus_allowed); - if (cpu < nr_cpu_ids) - return cpu; - else - return -EBUSY; -} - -/** * scx_bpf_task_running - Is task currently running? * @p: task of interest */ @@ -7663,7 +7426,7 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) */ __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu) { - if (!ops_cpu_valid(cpu, NULL)) + if (!kf_cpu_valid(cpu, NULL)) return NULL; return cpu_rq(cpu); @@ -7759,6 +7522,57 @@ __bpf_kfunc u64 scx_bpf_now(void) return clock; } +static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events) +{ + struct scx_event_stats *e_cpu; + int cpu; + + /* Aggregate per-CPU event counters into @events. */ + memset(events, 0, sizeof(*events)); + for_each_possible_cpu(cpu) { + e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu); + scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); + scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); + scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING); + scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL); + scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION); + scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH); + scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE); + } +} + +/* + * scx_bpf_events - Get a system-wide event counter to + * @events: output buffer from a BPF program + * @events__sz: @events len, must end in '__sz'' for the verifier + */ +__bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, + size_t events__sz) +{ + struct scx_sched *sch; + struct scx_event_stats e_sys; + + rcu_read_lock(); + sch = rcu_dereference(scx_root); + if (sch) + scx_read_events(sch, &e_sys); + else + memset(&e_sys, 0, sizeof(e_sys)); + rcu_read_unlock(); + + /* + * We cannot entirely trust a BPF-provided size since a BPF program + * might be compiled against a different vmlinux.h, of which + * scx_event_stats would be larger (a newer vmlinux.h) or smaller + * (an older vmlinux.h). Hence, we use the smaller size to avoid + * memory corruption. + */ + events__sz = min(events__sz, sizeof(*events)); + memcpy(events, &e_sys, events__sz); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_any) @@ -7774,16 +7588,11 @@ BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) BTF_ID_FLAGS(func, scx_bpf_cpuperf_set) +BTF_ID_FLAGS(func, scx_bpf_nr_node_ids) BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) -BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) -BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) -BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) -BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) -BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_cpu_rq) @@ -7791,6 +7600,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_rq) BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) #endif BTF_ID_FLAGS(func, scx_bpf_now) +BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS) BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { @@ -7814,8 +7624,6 @@ static int __init scx_init(void) * check using scx_kf_allowed(). */ if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, - &scx_kfunc_set_select_cpu)) || - (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_enqueue_dispatch)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_dispatch)) || @@ -7835,6 +7643,12 @@ static int __init scx_init(void) return ret; } + ret = scx_idle_init(); + if (ret) { + pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret); + return ret; + } + ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); if (ret) { pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 1079b56b0f7a..a75835c23f15 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -8,6 +8,13 @@ */ #ifdef CONFIG_SCHED_CLASS_EXT +static inline bool scx_kf_allowed_if_unlocked(void) +{ + return !current->scx.kf_mask; +} + +DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); + void scx_tick(struct rq *rq); void init_scx_entity(struct sched_ext_entity *scx); void scx_pre_fork(struct task_struct *p); @@ -19,6 +26,7 @@ void scx_rq_activate(struct rq *rq); void scx_rq_deactivate(struct rq *rq); int scx_check_setscheduler(struct task_struct *p, int policy); bool task_should_scx(int policy); +bool scx_allow_ttwu_queue(const struct task_struct *p); void init_sched_ext_class(void); static inline u32 scx_cpuperf_target(s32 cpu) @@ -52,6 +60,7 @@ static inline void scx_rq_activate(struct rq *rq) {} static inline void scx_rq_deactivate(struct rq *rq) {} static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } static inline bool task_on_scx(const struct task_struct *p) { return false; } +static inline bool scx_allow_ttwu_queue(const struct task_struct *p) { return true; } static inline void init_sched_ext_class(void) {} #endif /* CONFIG_SCHED_CLASS_EXT */ @@ -70,6 +79,7 @@ static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {} #ifdef CONFIG_CGROUP_SCHED #ifdef CONFIG_EXT_GROUP_SCHED +void scx_tg_init(struct task_group *tg); int scx_tg_online(struct task_group *tg); void scx_tg_offline(struct task_group *tg); int scx_cgroup_can_attach(struct cgroup_taskset *tset); @@ -79,6 +89,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); void scx_group_set_idle(struct task_group *tg, bool idle); #else /* CONFIG_EXT_GROUP_SCHED */ +static inline void scx_tg_init(struct task_group *tg) {} static inline int scx_tg_online(struct task_group *tg) { return 0; } static inline void scx_tg_offline(struct task_group *tg) {} static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c new file mode 100644 index 000000000000..6d29d3cbc670 --- /dev/null +++ b/kernel/sched/ext_idle.c @@ -0,0 +1,1315 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Built-in idle CPU tracking policy. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com> + */ +#include "ext_idle.h" + +/* Enable/disable built-in idle CPU selection policy */ +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); + +/* Enable/disable per-node idle cpumasks */ +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node); + +#ifdef CONFIG_SMP +/* Enable/disable LLC aware optimizations */ +static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); + +/* Enable/disable NUMA aware optimizations */ +static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); + +/* + * cpumasks to track idle CPUs within each NUMA node. + * + * If SCX_OPS_BUILTIN_IDLE_PER_NODE is not enabled, a single global cpumask + * from is used to track all the idle CPUs in the system. + */ +struct scx_idle_cpus { + cpumask_var_t cpu; + cpumask_var_t smt; +}; + +/* + * Global host-wide idle cpumasks (used when SCX_OPS_BUILTIN_IDLE_PER_NODE + * is not enabled). + */ +static struct scx_idle_cpus scx_idle_global_masks; + +/* + * Per-node idle cpumasks. + */ +static struct scx_idle_cpus **scx_idle_node_masks; + +/* + * Local per-CPU cpumasks (used to generate temporary idle cpumasks). + */ +static DEFINE_PER_CPU(cpumask_var_t, local_idle_cpumask); +static DEFINE_PER_CPU(cpumask_var_t, local_llc_idle_cpumask); +static DEFINE_PER_CPU(cpumask_var_t, local_numa_idle_cpumask); + +/* + * Return the idle masks associated to a target @node. + * + * NUMA_NO_NODE identifies the global idle cpumask. + */ +static struct scx_idle_cpus *idle_cpumask(int node) +{ + return node == NUMA_NO_NODE ? &scx_idle_global_masks : scx_idle_node_masks[node]; +} + +/* + * Returns the NUMA node ID associated with a @cpu, or NUMA_NO_NODE if + * per-node idle cpumasks are disabled. + */ +static int scx_cpu_node_if_enabled(int cpu) +{ + if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) + return NUMA_NO_NODE; + + return cpu_to_node(cpu); +} + +bool scx_idle_test_and_clear_cpu(int cpu) +{ + int node = scx_cpu_node_if_enabled(cpu); + struct cpumask *idle_cpus = idle_cpumask(node)->cpu; + +#ifdef CONFIG_SCHED_SMT + /* + * SMT mask should be cleared whether we can claim @cpu or not. The SMT + * cluster is not wholly idle either way. This also prevents + * scx_pick_idle_cpu() from getting caught in an infinite loop. + */ + if (sched_smt_active()) { + const struct cpumask *smt = cpu_smt_mask(cpu); + struct cpumask *idle_smts = idle_cpumask(node)->smt; + + /* + * If offline, @cpu is not its own sibling and + * scx_pick_idle_cpu() can get caught in an infinite loop as + * @cpu is never cleared from the idle SMT mask. Ensure that + * @cpu is eventually cleared. + * + * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to + * reduce memory writes, which may help alleviate cache + * coherence pressure. + */ + if (cpumask_intersects(smt, idle_smts)) + cpumask_andnot(idle_smts, idle_smts, smt); + else if (cpumask_test_cpu(cpu, idle_smts)) + __cpumask_clear_cpu(cpu, idle_smts); + } +#endif + + return cpumask_test_and_clear_cpu(cpu, idle_cpus); +} + +/* + * Pick an idle CPU in a specific NUMA node. + */ +static s32 pick_idle_cpu_in_node(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + int cpu; + +retry: + if (sched_smt_active()) { + cpu = cpumask_any_and_distribute(idle_cpumask(node)->smt, cpus_allowed); + if (cpu < nr_cpu_ids) + goto found; + + if (flags & SCX_PICK_IDLE_CORE) + return -EBUSY; + } + + cpu = cpumask_any_and_distribute(idle_cpumask(node)->cpu, cpus_allowed); + if (cpu >= nr_cpu_ids) + return -EBUSY; + +found: + if (scx_idle_test_and_clear_cpu(cpu)) + return cpu; + else + goto retry; +} + +#ifdef CONFIG_NUMA +/* + * Tracks nodes that have not yet been visited when searching for an idle + * CPU across all available nodes. + */ +static DEFINE_PER_CPU(nodemask_t, per_cpu_unvisited); + +/* + * Search for an idle CPU across all nodes, excluding @node. + */ +static s32 pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + nodemask_t *unvisited; + s32 cpu = -EBUSY; + + preempt_disable(); + unvisited = this_cpu_ptr(&per_cpu_unvisited); + + /* + * Restrict the search to the online nodes (excluding the current + * node that has been visited already). + */ + nodes_copy(*unvisited, node_states[N_ONLINE]); + node_clear(node, *unvisited); + + /* + * Traverse all nodes in order of increasing distance, starting + * from @node. + * + * This loop is O(N^2), with N being the amount of NUMA nodes, + * which might be quite expensive in large NUMA systems. However, + * this complexity comes into play only when a scheduler enables + * SCX_OPS_BUILTIN_IDLE_PER_NODE and it's requesting an idle CPU + * without specifying a target NUMA node, so it shouldn't be a + * bottleneck is most cases. + * + * As a future optimization we may want to cache the list of nodes + * in a per-node array, instead of actually traversing them every + * time. + */ + for_each_node_numadist(node, *unvisited) { + cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags); + if (cpu >= 0) + break; + } + preempt_enable(); + + return cpu; +} +#else +static inline s32 +pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + return -EBUSY; +} +#endif + +/* + * Find an idle CPU in the system, starting from @node. + */ +s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + s32 cpu; + + /* + * Always search in the starting node first (this is an + * optimization that can save some cycles even when the search is + * not limited to a single node). + */ + cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags); + if (cpu >= 0) + return cpu; + + /* + * Stop the search if we are using only a single global cpumask + * (NUMA_NO_NODE) or if the search is restricted to the first node + * only. + */ + if (node == NUMA_NO_NODE || flags & SCX_PICK_IDLE_IN_NODE) + return -EBUSY; + + /* + * Extend the search to the other online nodes. + */ + return pick_idle_cpu_from_online_nodes(cpus_allowed, node, flags); +} + +/* + * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC + * domain is not defined). + */ +static unsigned int llc_weight(s32 cpu) +{ + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (!sd) + return 0; + + return sd->span_weight; +} + +/* + * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC + * domain is not defined). + */ +static struct cpumask *llc_span(s32 cpu) +{ + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (!sd) + return 0; + + return sched_domain_span(sd); +} + +/* + * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the + * NUMA domain is not defined). + */ +static unsigned int numa_weight(s32 cpu) +{ + struct sched_domain *sd; + struct sched_group *sg; + + sd = rcu_dereference(per_cpu(sd_numa, cpu)); + if (!sd) + return 0; + sg = sd->groups; + if (!sg) + return 0; + + return sg->group_weight; +} + +/* + * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA + * domain is not defined). + */ +static struct cpumask *numa_span(s32 cpu) +{ + struct sched_domain *sd; + struct sched_group *sg; + + sd = rcu_dereference(per_cpu(sd_numa, cpu)); + if (!sd) + return NULL; + sg = sd->groups; + if (!sg) + return NULL; + + return sched_group_span(sg); +} + +/* + * Return true if the LLC domains do not perfectly overlap with the NUMA + * domains, false otherwise. + */ +static bool llc_numa_mismatch(void) +{ + int cpu; + + /* + * We need to scan all online CPUs to verify whether their scheduling + * domains overlap. + * + * While it is rare to encounter architectures with asymmetric NUMA + * topologies, CPU hotplugging or virtualized environments can result + * in asymmetric configurations. + * + * For example: + * + * NUMA 0: + * - LLC 0: cpu0..cpu7 + * - LLC 1: cpu8..cpu15 [offline] + * + * NUMA 1: + * - LLC 0: cpu16..cpu23 + * - LLC 1: cpu24..cpu31 + * + * In this case, if we only check the first online CPU (cpu0), we might + * incorrectly assume that the LLC and NUMA domains are fully + * overlapping, which is incorrect (as NUMA 1 has two distinct LLC + * domains). + */ + for_each_online_cpu(cpu) + if (llc_weight(cpu) != numa_weight(cpu)) + return true; + + return false; +} + +/* + * Initialize topology-aware scheduling. + * + * Detect if the system has multiple LLC or multiple NUMA domains and enable + * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle + * selection policy. + * + * Assumption: the kernel's internal topology representation assumes that each + * CPU belongs to a single LLC domain, and that each LLC domain is entirely + * contained within a single NUMA node. + */ +void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) +{ + bool enable_llc = false, enable_numa = false; + unsigned int nr_cpus; + s32 cpu = cpumask_first(cpu_online_mask); + + /* + * Enable LLC domain optimization only when there are multiple LLC + * domains among the online CPUs. If all online CPUs are part of a + * single LLC domain, the idle CPU selection logic can choose any + * online CPU without bias. + * + * Note that it is sufficient to check the LLC domain of the first + * online CPU to determine whether a single LLC domain includes all + * CPUs. + */ + rcu_read_lock(); + nr_cpus = llc_weight(cpu); + if (nr_cpus > 0) { + if (nr_cpus < num_online_cpus()) + enable_llc = true; + pr_debug("sched_ext: LLC=%*pb weight=%u\n", + cpumask_pr_args(llc_span(cpu)), llc_weight(cpu)); + } + + /* + * Enable NUMA optimization only when there are multiple NUMA domains + * among the online CPUs and the NUMA domains don't perfectly overlaps + * with the LLC domains. + * + * If all CPUs belong to the same NUMA node and the same LLC domain, + * enabling both NUMA and LLC optimizations is unnecessary, as checking + * for an idle CPU in the same domain twice is redundant. + * + * If SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled ignore the NUMA + * optimization, as we would naturally select idle CPUs within + * specific NUMA nodes querying the corresponding per-node cpumask. + */ + if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) { + nr_cpus = numa_weight(cpu); + if (nr_cpus > 0) { + if (nr_cpus < num_online_cpus() && llc_numa_mismatch()) + enable_numa = true; + pr_debug("sched_ext: NUMA=%*pb weight=%u\n", + cpumask_pr_args(numa_span(cpu)), nr_cpus); + } + } + rcu_read_unlock(); + + pr_debug("sched_ext: LLC idle selection %s\n", + str_enabled_disabled(enable_llc)); + pr_debug("sched_ext: NUMA idle selection %s\n", + str_enabled_disabled(enable_numa)); + + if (enable_llc) + static_branch_enable_cpuslocked(&scx_selcpu_topo_llc); + else + static_branch_disable_cpuslocked(&scx_selcpu_topo_llc); + if (enable_numa) + static_branch_enable_cpuslocked(&scx_selcpu_topo_numa); + else + static_branch_disable_cpuslocked(&scx_selcpu_topo_numa); +} + +/* + * Return true if @p can run on all possible CPUs, false otherwise. + */ +static inline bool task_affinity_all(const struct task_struct *p) +{ + return p->nr_cpus_allowed >= num_possible_cpus(); +} + +/* + * Built-in CPU idle selection policy: + * + * 1. Prioritize full-idle cores: + * - always prioritize CPUs from fully idle cores (both logical CPUs are + * idle) to avoid interference caused by SMT. + * + * 2. Reuse the same CPU: + * - prefer the last used CPU to take advantage of cached data (L1, L2) and + * branch prediction optimizations. + * + * 3. Pick a CPU within the same LLC (Last-Level Cache): + * - if the above conditions aren't met, pick a CPU that shares the same + * LLC, if the LLC domain is a subset of @cpus_allowed, to maintain + * cache locality. + * + * 4. Pick a CPU within the same NUMA node, if enabled: + * - choose a CPU from the same NUMA node, if the node cpumask is a + * subset of @cpus_allowed, to reduce memory access latency. + * + * 5. Pick any idle CPU within the @cpus_allowed domain. + * + * Step 3 and 4 are performed only if the system has, respectively, + * multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and + * scx_selcpu_topo_numa) and they don't contain the same subset of CPUs. + * + * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, the search will always + * begin in @prev_cpu's node and proceed to other nodes in order of + * increasing distance. + * + * Return the picked CPU if idle, or a negative value otherwise. + * + * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because + * we never call ops.select_cpu() for them, see select_task_rq(). + */ +s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + const struct cpumask *cpus_allowed, u64 flags) +{ + const struct cpumask *llc_cpus = NULL, *numa_cpus = NULL; + const struct cpumask *allowed = cpus_allowed ?: p->cpus_ptr; + int node = scx_cpu_node_if_enabled(prev_cpu); + bool is_prev_allowed; + s32 cpu; + + preempt_disable(); + + /* + * Check whether @prev_cpu is still within the allowed set. If not, + * we can still try selecting a nearby CPU. + */ + is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed); + + /* + * Determine the subset of CPUs usable by @p within @cpus_allowed. + */ + if (allowed != p->cpus_ptr) { + struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_idle_cpumask); + + if (task_affinity_all(p)) { + allowed = cpus_allowed; + } else if (cpumask_and(local_cpus, cpus_allowed, p->cpus_ptr)) { + allowed = local_cpus; + } else { + cpu = -EBUSY; + goto out_enable; + } + } + + /* + * This is necessary to protect llc_cpus. + */ + rcu_read_lock(); + + /* + * Determine the subset of CPUs that the task can use in its + * current LLC and node. + * + * If the task can run on all CPUs, use the node and LLC cpumasks + * directly. + */ + if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) { + struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_numa_idle_cpumask); + const struct cpumask *cpus = numa_span(prev_cpu); + + if (allowed == p->cpus_ptr && task_affinity_all(p)) + numa_cpus = cpus; + else if (cpus && cpumask_and(local_cpus, allowed, cpus)) + numa_cpus = local_cpus; + } + + if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) { + struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_llc_idle_cpumask); + const struct cpumask *cpus = llc_span(prev_cpu); + + if (allowed == p->cpus_ptr && task_affinity_all(p)) + llc_cpus = cpus; + else if (cpus && cpumask_and(local_cpus, allowed, cpus)) + llc_cpus = local_cpus; + } + + /* + * If WAKE_SYNC, try to migrate the wakee to the waker's CPU. + */ + if (wake_flags & SCX_WAKE_SYNC) { + int waker_node; + + /* + * If the waker's CPU is cache affine and prev_cpu is idle, + * then avoid a migration. + */ + cpu = smp_processor_id(); + if (is_prev_allowed && cpus_share_cache(cpu, prev_cpu) && + scx_idle_test_and_clear_cpu(prev_cpu)) { + cpu = prev_cpu; + goto out_unlock; + } + + /* + * If the waker's local DSQ is empty, and the system is under + * utilized, try to wake up @p to the local DSQ of the waker. + * + * Checking only for an empty local DSQ is insufficient as it + * could give the wakee an unfair advantage when the system is + * oversaturated. + * + * Checking only for the presence of idle CPUs is also + * insufficient as the local DSQ of the waker could have tasks + * piled up on it even if there is an idle core elsewhere on + * the system. + */ + waker_node = cpu_to_node(cpu); + if (!(current->flags & PF_EXITING) && + cpu_rq(cpu)->scx.local_dsq.nr == 0 && + (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) && + !cpumask_empty(idle_cpumask(waker_node)->cpu)) { + if (cpumask_test_cpu(cpu, allowed)) + goto out_unlock; + } + } + + /* + * If CPU has SMT, any wholly idle CPU is likely a better pick than + * partially idle @prev_cpu. + */ + if (sched_smt_active()) { + /* + * Keep using @prev_cpu if it's part of a fully idle core. + */ + if (is_prev_allowed && + cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) && + scx_idle_test_and_clear_cpu(prev_cpu)) { + cpu = prev_cpu; + goto out_unlock; + } + + /* + * Search for any fully idle core in the same LLC domain. + */ + if (llc_cpus) { + cpu = pick_idle_cpu_in_node(llc_cpus, node, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any fully idle core in the same NUMA node. + */ + if (numa_cpus) { + cpu = pick_idle_cpu_in_node(numa_cpus, node, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any full-idle core usable by the task. + * + * If the node-aware idle CPU selection policy is enabled + * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always + * begin in prev_cpu's node and proceed to other nodes in + * order of increasing distance. + */ + cpu = scx_pick_idle_cpu(allowed, node, flags | SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto out_unlock; + + /* + * Give up if we're strictly looking for a full-idle SMT + * core. + */ + if (flags & SCX_PICK_IDLE_CORE) { + cpu = -EBUSY; + goto out_unlock; + } + } + + /* + * Use @prev_cpu if it's idle. + */ + if (is_prev_allowed && scx_idle_test_and_clear_cpu(prev_cpu)) { + cpu = prev_cpu; + goto out_unlock; + } + + /* + * Search for any idle CPU in the same LLC domain. + */ + if (llc_cpus) { + cpu = pick_idle_cpu_in_node(llc_cpus, node, 0); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any idle CPU in the same NUMA node. + */ + if (numa_cpus) { + cpu = pick_idle_cpu_in_node(numa_cpus, node, 0); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any idle CPU usable by the task. + * + * If the node-aware idle CPU selection policy is enabled + * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always begin + * in prev_cpu's node and proceed to other nodes in order of + * increasing distance. + */ + cpu = scx_pick_idle_cpu(allowed, node, flags); + +out_unlock: + rcu_read_unlock(); +out_enable: + preempt_enable(); + + return cpu; +} + +/* + * Initialize global and per-node idle cpumasks. + */ +void scx_idle_init_masks(void) +{ + int i; + + /* Allocate global idle cpumasks */ + BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.smt, GFP_KERNEL)); + + /* Allocate per-node idle cpumasks */ + scx_idle_node_masks = kcalloc(num_possible_nodes(), + sizeof(*scx_idle_node_masks), GFP_KERNEL); + BUG_ON(!scx_idle_node_masks); + + for_each_node(i) { + scx_idle_node_masks[i] = kzalloc_node(sizeof(**scx_idle_node_masks), + GFP_KERNEL, i); + BUG_ON(!scx_idle_node_masks[i]); + + BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->cpu, GFP_KERNEL, i)); + BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->smt, GFP_KERNEL, i)); + } + + /* Allocate local per-cpu idle cpumasks */ + for_each_possible_cpu(i) { + BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_idle_cpumask, i), + GFP_KERNEL, cpu_to_node(i))); + BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_llc_idle_cpumask, i), + GFP_KERNEL, cpu_to_node(i))); + BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_numa_idle_cpumask, i), + GFP_KERNEL, cpu_to_node(i))); + } +} + +static void update_builtin_idle(int cpu, bool idle) +{ + int node = scx_cpu_node_if_enabled(cpu); + struct cpumask *idle_cpus = idle_cpumask(node)->cpu; + + assign_cpu(cpu, idle_cpus, idle); + +#ifdef CONFIG_SCHED_SMT + if (sched_smt_active()) { + const struct cpumask *smt = cpu_smt_mask(cpu); + struct cpumask *idle_smts = idle_cpumask(node)->smt; + + if (idle) { + /* + * idle_smt handling is racy but that's fine as it's + * only for optimization and self-correcting. + */ + if (!cpumask_subset(smt, idle_cpus)) + return; + cpumask_or(idle_smts, idle_smts, smt); + } else { + cpumask_andnot(idle_smts, idle_smts, smt); + } + } +#endif +} + +/* + * Update the idle state of a CPU to @idle. + * + * If @do_notify is true, ops.update_idle() is invoked to notify the scx + * scheduler of an actual idle state transition (idle to busy or vice + * versa). If @do_notify is false, only the idle state in the idle masks is + * refreshed without invoking ops.update_idle(). + * + * This distinction is necessary, because an idle CPU can be "reserved" and + * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as + * busy even if no tasks are dispatched. In this case, the CPU may return + * to idle without a true state transition. Refreshing the idle masks + * without invoking ops.update_idle() ensures accurate idle state tracking + * while avoiding unnecessary updates and maintaining balanced state + * transitions. + */ +void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) +{ + struct scx_sched *sch = scx_root; + int cpu = cpu_of(rq); + + lockdep_assert_rq_held(rq); + + /* + * Update the idle masks: + * - for real idle transitions (do_notify == true) + * - for idle-to-idle transitions (indicated by the previous task + * being the idle thread, managed by pick_task_idle()) + * + * Skip updating idle masks if the previous task is not the idle + * thread, since set_next_task_idle() has already handled it when + * transitioning from a task to the idle thread (calling this + * function with do_notify == true). + * + * In this way we can avoid updating the idle masks twice, + * unnecessarily. + */ + if (static_branch_likely(&scx_builtin_idle_enabled)) + if (do_notify || is_idle_task(rq->curr)) + update_builtin_idle(cpu, idle); + + /* + * Trigger ops.update_idle() only when transitioning from a task to + * the idle thread and vice versa. + * + * Idle transitions are indicated by do_notify being set to true, + * managed by put_prev_task_idle()/set_next_task_idle(). + * + * This must come after builtin idle update so that BPF schedulers can + * create interlocking between ops.update_idle() and ops.enqueue() - + * either enqueue() sees the idle bit or update_idle() sees the task + * that enqueue() queued. + */ + if (SCX_HAS_OP(sch, update_idle) && do_notify && !scx_rq_bypassing(rq)) + SCX_CALL_OP(sch, SCX_KF_REST, update_idle, rq, cpu_of(rq), idle); +} + +static void reset_idle_masks(struct sched_ext_ops *ops) +{ + int node; + + /* + * Consider all online cpus idle. Should converge to the actual state + * quickly. + */ + if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) { + cpumask_copy(idle_cpumask(NUMA_NO_NODE)->cpu, cpu_online_mask); + cpumask_copy(idle_cpumask(NUMA_NO_NODE)->smt, cpu_online_mask); + return; + } + + for_each_node(node) { + const struct cpumask *node_mask = cpumask_of_node(node); + + cpumask_and(idle_cpumask(node)->cpu, cpu_online_mask, node_mask); + cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask); + } +} +#endif /* CONFIG_SMP */ + +void scx_idle_enable(struct sched_ext_ops *ops) +{ + if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) + static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); + else + static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); + + if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) + static_branch_enable_cpuslocked(&scx_builtin_idle_per_node); + else + static_branch_disable_cpuslocked(&scx_builtin_idle_per_node); + +#ifdef CONFIG_SMP + reset_idle_masks(ops); +#endif +} + +void scx_idle_disable(void) +{ + static_branch_disable(&scx_builtin_idle_enabled); + static_branch_disable(&scx_builtin_idle_per_node); +} + +/******************************************************************************** + * Helpers that can be called from the BPF scheduler. + */ + +static int validate_node(int node) +{ + if (!static_branch_likely(&scx_builtin_idle_per_node)) { + scx_kf_error("per-node idle tracking is disabled"); + return -EOPNOTSUPP; + } + + /* Return no entry for NUMA_NO_NODE (not a critical scx error) */ + if (node == NUMA_NO_NODE) + return -ENOENT; + + /* Make sure node is in a valid range */ + if (node < 0 || node >= nr_node_ids) { + scx_kf_error("invalid node %d", node); + return -EINVAL; + } + + /* Make sure the node is part of the set of possible nodes */ + if (!node_possible(node)) { + scx_kf_error("unavailable node %d", node); + return -EINVAL; + } + + return node; +} + +__bpf_kfunc_start_defs(); + +static bool check_builtin_idle_enabled(void) +{ + if (static_branch_likely(&scx_builtin_idle_enabled)) + return true; + + scx_kf_error("built-in idle tracking is disabled"); + return false; +} + +s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + const struct cpumask *allowed, u64 flags) +{ + struct rq *rq; + struct rq_flags rf; + s32 cpu; + + if (!kf_cpu_valid(prev_cpu, NULL)) + return -EINVAL; + + if (!check_builtin_idle_enabled()) + return -EBUSY; + + /* + * If called from an unlocked context, acquire the task's rq lock, + * so that we can safely access p->cpus_ptr and p->nr_cpus_allowed. + * + * Otherwise, allow to use this kfunc only from ops.select_cpu() + * and ops.select_enqueue(). + */ + if (scx_kf_allowed_if_unlocked()) { + rq = task_rq_lock(p, &rf); + } else { + if (!scx_kf_allowed(SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE)) + return -EPERM; + rq = scx_locked_rq(); + } + + /* + * Validate locking correctness to access p->cpus_ptr and + * p->nr_cpus_allowed: if we're holding an rq lock, we're safe; + * otherwise, assert that p->pi_lock is held. + */ + if (!rq) + lockdep_assert_held(&p->pi_lock); + +#ifdef CONFIG_SMP + /* + * This may also be called from ops.enqueue(), so we need to handle + * per-CPU tasks as well. For these tasks, we can skip all idle CPU + * selection optimizations and simply check whether the previously + * used CPU is idle and within the allowed cpumask. + */ + if (p->nr_cpus_allowed == 1) { + if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) && + scx_idle_test_and_clear_cpu(prev_cpu)) + cpu = prev_cpu; + else + cpu = -EBUSY; + } else { + cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, + allowed ?: p->cpus_ptr, flags); + } +#else + cpu = -EBUSY; +#endif + if (scx_kf_allowed_if_unlocked()) + task_rq_unlock(rq, p, &rf); + + return cpu; +} + +/** + * scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or + * trigger an error if @cpu is invalid + * @cpu: target CPU + */ +__bpf_kfunc int scx_bpf_cpu_node(s32 cpu) +{ +#ifdef CONFIG_NUMA + if (!kf_cpu_valid(cpu, NULL)) + return NUMA_NO_NODE; + + return cpu_to_node(cpu); +#else + return 0; +#endif +} + +/** + * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() + * @p: task_struct to select a CPU for + * @prev_cpu: CPU @p was on previously + * @wake_flags: %SCX_WAKE_* flags + * @is_idle: out parameter indicating whether the returned CPU is idle + * + * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked + * context such as a BPF test_run() call, as long as built-in CPU selection + * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE + * is set. + * + * Returns the picked CPU with *@is_idle indicating whether the picked CPU is + * currently idle and thus a good candidate for direct dispatching. + */ +__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, + u64 wake_flags, bool *is_idle) +{ + s32 cpu; + + cpu = select_cpu_from_kfunc(p, prev_cpu, wake_flags, NULL, 0); + if (cpu >= 0) { + *is_idle = true; + return cpu; + } + *is_idle = false; + + return prev_cpu; +} + +/** + * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p, + * prioritizing those in @cpus_allowed + * @p: task_struct to select a CPU for + * @prev_cpu: CPU @p was on previously + * @wake_flags: %SCX_WAKE_* flags + * @cpus_allowed: cpumask of allowed CPUs + * @flags: %SCX_PICK_IDLE* flags + * + * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked + * context such as a BPF test_run() call, as long as built-in CPU selection + * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE + * is set. + * + * @p, @prev_cpu and @wake_flags match ops.select_cpu(). + * + * Returns the selected idle CPU, which will be automatically awakened upon + * returning from ops.select_cpu() and can be used for direct dispatch, or + * a negative value if no idle CPU is available. + */ +__bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + const struct cpumask *cpus_allowed, u64 flags) +{ + return select_cpu_from_kfunc(p, prev_cpu, wake_flags, cpus_allowed, flags); +} + +/** + * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the + * idle-tracking per-CPU cpumask of a target NUMA node. + * @node: target NUMA node + * + * Returns an empty cpumask if idle tracking is not enabled, if @node is + * not valid, or running on a UP kernel. In this case the actual error will + * be reported to the BPF scheduler via scx_error(). + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) +{ + node = validate_node(node); + if (node < 0) + return cpu_none_mask; + +#ifdef CONFIG_SMP + return idle_cpumask(node)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking + * per-CPU cpumask. + * + * Returns an empty mask if idle tracking is not enabled, or running on a + * UP kernel. + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) +{ + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { + scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + return cpu_none_mask; + } + + if (!check_builtin_idle_enabled()) + return cpu_none_mask; + +#ifdef CONFIG_SMP + return idle_cpumask(NUMA_NO_NODE)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_get_idle_smtmask_node - Get a referenced kptr to the + * idle-tracking, per-physical-core cpumask of a target NUMA node. Can be + * used to determine if an entire physical core is free. + * @node: target NUMA node + * + * Returns an empty cpumask if idle tracking is not enabled, if @node is + * not valid, or running on a UP kernel. In this case the actual error will + * be reported to the BPF scheduler via scx_error(). + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) +{ + node = validate_node(node); + if (node < 0) + return cpu_none_mask; + +#ifdef CONFIG_SMP + if (sched_smt_active()) + return idle_cpumask(node)->smt; + else + return idle_cpumask(node)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, + * per-physical-core cpumask. Can be used to determine if an entire physical + * core is free. + * + * Returns an empty mask if idle tracking is not enabled, or running on a + * UP kernel. + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) +{ + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { + scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + return cpu_none_mask; + } + + if (!check_builtin_idle_enabled()) + return cpu_none_mask; + +#ifdef CONFIG_SMP + if (sched_smt_active()) + return idle_cpumask(NUMA_NO_NODE)->smt; + else + return idle_cpumask(NUMA_NO_NODE)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to + * either the percpu, or SMT idle-tracking cpumask. + * @idle_mask: &cpumask to use + */ +__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) +{ + /* + * Empty function body because we aren't actually acquiring or releasing + * a reference to a global idle cpumask, which is read-only in the + * caller and is never released. The acquire / release semantics here + * are just used to make the cpumask a trusted pointer in the caller. + */ +} + +/** + * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state + * @cpu: cpu to test and clear idle for + * + * Returns %true if @cpu was idle and its idle state was successfully cleared. + * %false otherwise. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + */ +__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) +{ + if (!check_builtin_idle_enabled()) + return false; + + if (kf_cpu_valid(cpu, NULL)) + return scx_idle_test_and_clear_cpu(cpu); + else + return false; +} + +/** + * scx_bpf_pick_idle_cpu_node - Pick and claim an idle cpu from @node + * @cpus_allowed: Allowed cpumask + * @node: target NUMA node + * @flags: %SCX_PICK_IDLE_* flags + * + * Pick and claim an idle cpu in @cpus_allowed from the NUMA node @node. + * + * Returns the picked idle cpu number on success, or -%EBUSY if no matching + * cpu was found. + * + * The search starts from @node and proceeds to other online NUMA nodes in + * order of increasing distance (unless SCX_PICK_IDLE_IN_NODE is specified, + * in which case the search is limited to the target @node). + * + * Always returns an error if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set, or if + * %SCX_OPS_BUILTIN_IDLE_PER_NODE is not set. + */ +__bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, + int node, u64 flags) +{ + node = validate_node(node); + if (node < 0) + return node; + + return scx_pick_idle_cpu(cpus_allowed, node, flags); +} + +/** + * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu + * number on success. -%EBUSY if no matching cpu was found. + * + * Idle CPU tracking may race against CPU scheduling state transitions. For + * example, this function may return -%EBUSY as CPUs are transitioning into the + * idle state. If the caller then assumes that there will be dispatch events on + * the CPUs as they were all busy, the scheduler may end up stalling with CPUs + * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and + * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch + * event in the near future. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + * + * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use + * scx_bpf_pick_idle_cpu_node() instead. + */ +__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, + u64 flags) +{ + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { + scx_kf_error("per-node idle tracking is enabled"); + return -EBUSY; + } + + if (!check_builtin_idle_enabled()) + return -EBUSY; + + return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); +} + +/** + * scx_bpf_pick_any_cpu_node - Pick and claim an idle cpu if available + * or pick any CPU from @node + * @cpus_allowed: Allowed cpumask + * @node: target NUMA node + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any + * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu + * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is + * empty. + * + * The search starts from @node and proceeds to other online NUMA nodes in + * order of increasing distance (unless %SCX_PICK_IDLE_IN_NODE is specified, + * in which case the search is limited to the target @node, regardless of + * the CPU idle state). + * + * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not + * set, this function can't tell which CPUs are idle and will always pick any + * CPU. + */ +__bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, + int node, u64 flags) +{ + s32 cpu; + + node = validate_node(node); + if (node < 0) + return node; + + cpu = scx_pick_idle_cpu(cpus_allowed, node, flags); + if (cpu >= 0) + return cpu; + + if (flags & SCX_PICK_IDLE_IN_NODE) + cpu = cpumask_any_and_distribute(cpumask_of_node(node), cpus_allowed); + else + cpu = cpumask_any_distribute(cpus_allowed); + if (cpu < nr_cpu_ids) + return cpu; + else + return -EBUSY; +} + +/** + * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any + * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu + * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is + * empty. + * + * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not + * set, this function can't tell which CPUs are idle and will always pick any + * CPU. + * + * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use + * scx_bpf_pick_any_cpu_node() instead. + */ +__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, + u64 flags) +{ + s32 cpu; + + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { + scx_kf_error("per-node idle tracking is enabled"); + return -EBUSY; + } + + if (static_branch_likely(&scx_builtin_idle_enabled)) { + cpu = scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); + if (cpu >= 0) + return cpu; + } + + cpu = cpumask_any_distribute(cpus_allowed); + if (cpu < nr_cpu_ids) + return cpu; + else + return -EBUSY; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_idle) +BTF_ID_FLAGS(func, scx_bpf_cpu_node) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) +BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_idle) + +static const struct btf_kfunc_id_set scx_kfunc_set_idle = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_idle, +}; + +int scx_idle_init(void) +{ + int ret; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) || + register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) || + register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle); + + return ret; +} diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h new file mode 100644 index 000000000000..37be78a7502b --- /dev/null +++ b/kernel/sched/ext_idle.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com> + */ +#ifndef _KERNEL_SCHED_EXT_IDLE_H +#define _KERNEL_SCHED_EXT_IDLE_H + +struct sched_ext_ops; + +#ifdef CONFIG_SMP +void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops); +void scx_idle_init_masks(void); +bool scx_idle_test_and_clear_cpu(int cpu); +s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags); +#else /* !CONFIG_SMP */ +static inline void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) {} +static inline void scx_idle_init_masks(void) {} +static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; } +static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + return -EBUSY; +} +#endif /* CONFIG_SMP */ + +s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + const struct cpumask *cpus_allowed, u64 flags); +void scx_idle_enable(struct sched_ext_ops *ops); +void scx_idle_disable(void); +int scx_idle_init(void); + +#endif /* _KERNEL_SCHED_EXT_IDLE_H */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1c0ef435a7aa..7a14da5396fb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -74,12 +74,12 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; /* * Minimal preemption granularity for CPU-bound tasks: * - * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_base_slice = 750000ULL; -static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; +unsigned int sysctl_sched_base_slice = 700000ULL; +static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; -const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +__read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; static int __init setup_sched_thermal_decay_shift(char *str) { @@ -399,7 +399,7 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) static inline void assert_list_leaf_cfs_rq(struct rq *rq) { - SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); + WARN_ON_ONCE(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); } /* Iterate through all leaf cfs_rq's on a runqueue */ @@ -696,7 +696,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) { s64 vlag, limit; - SCHED_WARN_ON(!se->on_rq); + WARN_ON_ONCE(!se->on_rq); vlag = avg_vruntime(cfs_rq) - se->vruntime; limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); @@ -884,6 +884,26 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) } /* + * HACK, stash a copy of deadline at the point of pick in vlag, + * which isn't used until dequeue. + */ +static inline void set_protect_slice(struct sched_entity *se) +{ + se->vlag = se->deadline; +} + +static inline bool protect_slice(struct sched_entity *se) +{ + return se->vlag == se->deadline; +} + +static inline void cancel_protect_slice(struct sched_entity *se) +{ + if (protect_slice(se)) + se->vlag = se->deadline + 1; +} + +/* * Earliest Eligible Virtual Deadline First * * In order to provide latency guarantees for different request sizes @@ -919,11 +939,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; - /* - * Once selected, run a task until it either becomes non-eligible or - * until it gets a new slice. See the HACK in set_next_entity(). - */ - if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline) + if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr)) return curr; /* Pick the leftmost entity if it's eligible */ @@ -967,7 +983,6 @@ found: return best; } -#ifdef CONFIG_SCHED_DEBUG struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); @@ -994,7 +1009,6 @@ int sched_update_scaling(void) return 0; } #endif -#endif static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); @@ -2259,7 +2273,8 @@ static bool task_numa_compare(struct task_numa_env *env, rcu_read_lock(); cur = rcu_dereference(dst_rq->curr); - if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) + if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) || + !cur->mm)) cur = NULL; /* @@ -3301,7 +3316,7 @@ static void task_numa_work(struct callback_head *work) bool vma_pids_skipped; bool vma_pids_forced = false; - SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); + WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); work->next = work; /* @@ -3315,6 +3330,15 @@ static void task_numa_work(struct callback_head *work) if (p->flags & PF_EXITING) return; + /* + * Memory is pinned to only one NUMA node via cpuset.mems, naturally + * no page can be migrated. + */ + if (cpusets_enabled() && nodes_weight(cpuset_current_mems_allowed) == 1) { + trace_sched_skip_cpuset_numa(current, &cpuset_current_mems_allowed); + return; + } + if (!mm->numa_next_scan) { mm->numa_next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); @@ -3781,6 +3805,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, update_entity_lag(cfs_rq, se); se->deadline -= se->vruntime; se->rel_deadline = 1; + cfs_rq->nr_queued--; if (!curr) __dequeue_entity(cfs_rq, se); update_load_sub(&cfs_rq->load, se->load.weight); @@ -3807,10 +3832,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, enqueue_load_avg(cfs_rq, se); if (se->on_rq) { - update_load_add(&cfs_rq->load, se->load.weight); place_entity(cfs_rq, se, 0); + update_load_add(&cfs_rq->load, se->load.weight); if (!curr) __enqueue_entity(cfs_rq, se); + cfs_rq->nr_queued++; /* * The entity's vruntime has been adjusted, so let's check @@ -4020,7 +4046,7 @@ static inline bool load_avg_is_decayed(struct sched_avg *sa) * Make sure that rounding and/or propagation of PELT values never * break this. */ - SCHED_WARN_ON(sa->load_avg || + WARN_ON_ONCE(sa->load_avg || sa->util_avg || sa->runnable_avg); @@ -4045,15 +4071,17 @@ static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq) { struct cfs_rq *prev_cfs_rq; struct list_head *prev; + struct rq *rq = rq_of(cfs_rq); if (cfs_rq->on_list) { prev = cfs_rq->leaf_cfs_rq_list.prev; } else { - struct rq *rq = rq_of(cfs_rq); - prev = rq->tmp_alone_branch; } + if (prev == &rq->leaf_cfs_rq_list) + return false; + prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list); return (prev_cfs_rq->tg->parent == cfs_rq->tg); @@ -4917,13 +4945,6 @@ static inline void util_est_update(struct cfs_rq *cfs_rq, goto done; /* - * To avoid overestimation of actual task utilization, skip updates if - * we cannot grant there is idle time in this CPU. - */ - if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)))) - return; - - /* * To avoid underestimate of task utilization, skip updates of EWMA if * we cannot grant that thread got all CPU time it wanted. */ @@ -5442,7 +5463,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) clear_buddies(cfs_rq, se); if (flags & DEQUEUE_DELAYED) { - SCHED_WARN_ON(!se->sched_delayed); + WARN_ON_ONCE(!se->sched_delayed); } else { bool delay = sleep; /* @@ -5452,7 +5473,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & DEQUEUE_SPECIAL) delay = false; - SCHED_WARN_ON(delay && se->sched_delayed); + WARN_ON_ONCE(delay && se->sched_delayed); if (sched_feat(DELAY_DEQUEUE) && delay && !entity_eligible(cfs_rq, se)) { @@ -5528,15 +5549,12 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_end_fair(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); - /* - * HACK, stash a copy of deadline at the point of pick in vlag, - * which isn't used until dequeue. - */ - se->vlag = se->deadline; + + set_protect_slice(se); } update_stats_curr_start(cfs_rq, se); - SCHED_WARN_ON(cfs_rq->curr); + WARN_ON_ONCE(cfs_rq->curr); cfs_rq->curr = se; /* @@ -5577,7 +5595,7 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) if (sched_feat(PICK_BUDDY) && cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { /* ->next will never be delayed */ - SCHED_WARN_ON(cfs_rq->next->sched_delayed); + WARN_ON_ONCE(cfs_rq->next->sched_delayed); return cfs_rq->next; } @@ -5613,7 +5631,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* in !on_rq case, update occurred at dequeue */ update_load_avg(cfs_rq, prev, 0); } - SCHED_WARN_ON(cfs_rq->curr != prev); + WARN_ON_ONCE(cfs_rq->curr != prev); cfs_rq->curr = NULL; } @@ -5836,7 +5854,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttled_clock_self = 0; - if (SCHED_WARN_ON((s64)delta < 0)) + if (WARN_ON_ONCE((s64)delta < 0)) delta = 0; cfs_rq->throttled_clock_self_time += delta; @@ -5856,7 +5874,7 @@ static int tg_throttle_down(struct task_group *tg, void *data) cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); list_del_leaf_cfs_rq(cfs_rq); - SCHED_WARN_ON(cfs_rq->throttled_clock_self); + WARN_ON_ONCE(cfs_rq->throttled_clock_self); if (cfs_rq->nr_queued) cfs_rq->throttled_clock_self = rq_clock(rq); } @@ -5965,7 +5983,7 @@ done: * throttled-list. rq->lock protects completion. */ cfs_rq->throttled = 1; - SCHED_WARN_ON(cfs_rq->throttled_clock); + WARN_ON_ONCE(cfs_rq->throttled_clock); if (cfs_rq->nr_queued) cfs_rq->throttled_clock = rq_clock(rq); return true; @@ -6121,7 +6139,7 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) } /* Already enqueued */ - if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list))) + if (WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_csd_list))) return; first = list_empty(&rq->cfsb_csd_list); @@ -6140,7 +6158,7 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) { lockdep_assert_rq_held(rq_of(cfs_rq)); - if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) || + if (WARN_ON_ONCE(!cfs_rq_throttled(cfs_rq) || cfs_rq->runtime_remaining <= 0)) return; @@ -6176,7 +6194,7 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) goto next; /* By the above checks, this should never be true */ - SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); + WARN_ON_ONCE(cfs_rq->runtime_remaining > 0); raw_spin_lock(&cfs_b->lock); runtime = -cfs_rq->runtime_remaining + 1; @@ -6197,7 +6215,7 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) * We currently only expect to be unthrottling * a single cfs_rq locally. */ - SCHED_WARN_ON(!list_empty(&local_unthrottle)); + WARN_ON_ONCE(!list_empty(&local_unthrottle)); list_add_tail(&cfs_rq->throttled_csd_list, &local_unthrottle); } @@ -6222,7 +6240,7 @@ next: rq_unlock_irqrestore(rq, &rf); } - SCHED_WARN_ON(!list_empty(&local_unthrottle)); + WARN_ON_ONCE(!list_empty(&local_unthrottle)); rcu_read_unlock(); @@ -6539,14 +6557,14 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *paren cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF; INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); - hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); - cfs_b->period_timer.function = sched_cfs_period_timer; + hrtimer_setup(&cfs_b->period_timer, sched_cfs_period_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED); /* Add a random offset so that timers interleave */ hrtimer_set_expires(&cfs_b->period_timer, get_random_u32_below(cfs_b->period)); - hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cfs_b->slack_timer.function = sched_cfs_slack_timer; + hrtimer_setup(&cfs_b->slack_timer, sched_cfs_slack_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); cfs_b->slack_started = false; } @@ -6774,7 +6792,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; - SCHED_WARN_ON(task_rq(p) != rq); + WARN_ON_ONCE(task_rq(p) != rq); if (rq->cfs.h_nr_queued > 1) { u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; @@ -6885,8 +6903,8 @@ requeue_delayed_entity(struct sched_entity *se) * Because a delayed entity is one that is still on * the runqueue competing until elegibility. */ - SCHED_WARN_ON(!se->sched_delayed); - SCHED_WARN_ON(!se->on_rq); + WARN_ON_ONCE(!se->sched_delayed); + WARN_ON_ONCE(!se->on_rq); if (sched_feat(DELAY_ZERO)) { update_entity_lag(cfs_rq, se); @@ -6928,7 +6946,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * Let's add the task's estimated utilization to the cfs_rq's * estimated utilization, before we update schedutil. */ - if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) + if (!p->se.sched_delayed || (flags & ENQUEUE_DELAYED)) util_est_enqueue(&rq->cfs, p); if (flags & ENQUEUE_DELAYED) { @@ -6989,6 +7007,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_group(se); se->slice = slice; + if (se != cfs_rq->curr) + min_vruntime_cb_propagate(&se->run_node, NULL); slice = cfs_rq_min_slice(cfs_rq); cfs_rq->h_nr_runnable += h_nr_runnable; @@ -7066,9 +7086,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) h_nr_idle = task_has_idle_policy(p); if (task_sleep || task_delayed || !se->sched_delayed) h_nr_runnable = 1; - } else { - cfs_rq = group_cfs_rq(se); - slice = cfs_rq_min_slice(cfs_rq); } for_each_sched_entity(se) { @@ -7078,6 +7095,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (p && &p->se == se) return -1; + slice = cfs_rq_min_slice(cfs_rq); break; } @@ -7118,6 +7136,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) update_cfs_group(se); se->slice = slice; + if (se != cfs_rq->curr) + min_vruntime_cb_propagate(&se->run_node, NULL); slice = cfs_rq_min_slice(cfs_rq); cfs_rq->h_nr_runnable -= h_nr_runnable; @@ -7142,8 +7162,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) rq->next_balance = jiffies; if (p && task_delayed) { - SCHED_WARN_ON(!task_sleep); - SCHED_WARN_ON(p->on_rq != 1); + WARN_ON_ONCE(!task_sleep); + WARN_ON_ONCE(p->on_rq != 1); /* Fix-up what dequeue_task_fair() skipped */ hrtick_update(rq); @@ -7166,7 +7186,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) */ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { - if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) + if (!p->se.sched_delayed) util_est_dequeue(&rq->cfs, p); util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); @@ -7181,6 +7201,11 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) return true; } +static inline unsigned int cfs_h_nr_delayed(struct rq *rq) +{ + return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable); +} + #ifdef CONFIG_SMP /* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */ @@ -7342,8 +7367,12 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu; - if (sync && cpu_rq(this_cpu)->nr_running == 1) - return this_cpu; + if (sync) { + struct rq *rq = cpu_rq(this_cpu); + + if ((rq->nr_running - cfs_h_nr_delayed(rq)) == 1) + return this_cpu; + } if (available_idle_cpu(prev_cpu)) return prev_cpu; @@ -8721,7 +8750,7 @@ static inline void set_task_max_allowed_capacity(struct task_struct *p) {} static void set_next_buddy(struct sched_entity *se) { for_each_sched_entity(se) { - if (SCHED_WARN_ON(!se->on_rq)) + if (WARN_ON_ONCE(!se->on_rq)) return; if (se_is_idle(se)) return; @@ -8781,8 +8810,15 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * Preempt an idle entity in favor of a non-idle entity (and don't preempt * in the inverse case). */ - if (cse_is_idle && !pse_is_idle) + if (cse_is_idle && !pse_is_idle) { + /* + * When non-idle entity preempt an idle entity, + * don't give idle entity slice protection. + */ + cancel_protect_slice(se); goto preempt; + } + if (cse_is_idle != pse_is_idle) return; @@ -8801,8 +8837,8 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * Note that even if @p does not turn out to be the most eligible * task at this moment, current's slice protection will be lost. */ - if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline) - se->vlag = se->deadline + 1; + if (do_preempt_short(cfs_rq, pse, se)) + cancel_protect_slice(se); /* * If @p has become the most eligible task, force preemption. @@ -9415,12 +9451,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 0; /* Prevent to re-select dst_cpu via env's CPUs: */ - for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { - if (cpumask_test_cpu(cpu, p->cpus_ptr)) { - env->flags |= LBF_DST_PINNED; - env->new_dst_cpu = cpu; - break; - } + cpu = cpumask_first_and_and(env->dst_grpmask, env->cpus, p->cpus_ptr); + + if (cpu < nr_cpu_ids) { + env->flags |= LBF_DST_PINNED; + env->new_dst_cpu = cpu; } return 0; @@ -10235,7 +10270,7 @@ sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group (sgs->group_weight - sgs->idle_cpus != 1)) return false; - return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu); + return sched_asym(env->sd, env->dst_cpu, READ_ONCE(group->asym_prefer_cpu)); } /* One group has more than one SMT CPU while the other group does not */ @@ -10472,7 +10507,8 @@ static bool update_sd_pick_busiest(struct lb_env *env, case group_asym_packing: /* Prefer to move from lowest priority CPU's work */ - return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu); + return sched_asym_prefer(READ_ONCE(sds->busiest->asym_prefer_cpu), + READ_ONCE(sg->asym_prefer_cpu)); case group_misfit_task: /* @@ -12459,7 +12495,7 @@ unlock: void nohz_balance_exit_idle(struct rq *rq) { - SCHED_WARN_ON(rq != this_rq()); + WARN_ON_ONCE(rq != this_rq()); if (likely(!rq->nohz_tick_stopped)) return; @@ -12495,7 +12531,7 @@ void nohz_balance_enter_idle(int cpu) { struct rq *rq = cpu_rq(cpu); - SCHED_WARN_ON(cpu != smp_processor_id()); + WARN_ON_ONCE(cpu != smp_processor_id()); /* If this CPU is going down, then nothing needs to be done: */ if (!cpu_active(cpu)) @@ -12578,7 +12614,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) int balance_cpu; struct rq *rq; - SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); + WARN_ON_ONCE((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); /* * We assume there will be no idle load after this update and clear @@ -13018,7 +13054,7 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, struct cfs_rq *cfs_rqb; s64 delta; - SCHED_WARN_ON(task_rq(b)->core != rq->core); + WARN_ON_ONCE(task_rq(b)->core != rq->core); #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -13221,7 +13257,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) static void switched_to_fair(struct rq *rq, struct task_struct *p) { - SCHED_WARN_ON(p->se.sched_delayed); + WARN_ON_ONCE(p->se.sched_delayed); attach_task_cfs_rq(p); @@ -13256,7 +13292,7 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs if (!first) return; - SCHED_WARN_ON(se->sched_delayed); + WARN_ON_ONCE(se->sched_delayed); if (hrtick_enabled_fair(rq)) hrtick_start_fair(rq, p); @@ -13643,7 +13679,6 @@ DEFINE_SCHED_CLASS(fair) = { #endif }; -#ifdef CONFIG_SCHED_DEBUG void print_cfs_stats(struct seq_file *m, int cpu) { struct cfs_rq *cfs_rq, *pos; @@ -13677,7 +13712,6 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) rcu_read_unlock(); } #endif /* CONFIG_NUMA_BALANCING */ -#endif /* CONFIG_SCHED_DEBUG */ __init void init_sched_fair_class(void) { diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 81bc8b329ef1..93b038d48900 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -40,7 +40,7 @@ int housekeeping_any_cpu(enum hk_type type) if (cpu < nr_cpu_ids) return cpu; - cpu = cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask); + cpu = cpumask_any_and_distribute(housekeeping.cpumasks[type], cpu_online_mask); if (likely(cpu < nr_cpu_ids)) return cpu; /* diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index bb56805e3d47..ad04a5c3162a 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -733,7 +733,7 @@ static int psi_rtpoll_worker(void *data) static void poll_timer_fn(struct timer_list *t) { - struct psi_group *group = from_timer(group, t, rtpoll_timer); + struct psi_group *group = timer_container_of(group, t, rtpoll_timer); atomic_set(&group->rtpoll_wakeup, 1); wake_up_interruptible(&group->rtpoll_wait); @@ -1440,7 +1440,7 @@ void psi_trigger_destroy(struct psi_trigger *t) group->rtpoll_task, lockdep_is_held(&group->rtpoll_trigger_lock)); rcu_assign_pointer(group->rtpoll_task, NULL); - del_timer(&group->rtpoll_timer); + timer_delete(&group->rtpoll_timer); } } mutex_unlock(&group->rtpoll_trigger_lock); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4b8e33c615b1..e40422c37033 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -89,6 +89,7 @@ void init_rt_rq(struct rt_rq *rt_rq) rt_rq->rt_throttled = 0; rt_rq->rt_runtime = 0; raw_spin_lock_init(&rt_rq->rt_runtime_lock); + rt_rq->tg = &root_task_group; #endif } @@ -127,9 +128,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) raw_spin_lock_init(&rt_b->rt_runtime_lock); - hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_HARD); - rt_b->rt_period_timer.function = sched_rt_period_timer; + hrtimer_setup(&rt_b->rt_period_timer, sched_rt_period_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_HARD); } static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b) @@ -169,19 +169,21 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) { -#ifdef CONFIG_SCHED_DEBUG WARN_ON_ONCE(!rt_entity_is_task(rt_se)); -#endif + return container_of(rt_se, struct task_struct, rt); } static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { + /* Cannot fold with non-CONFIG_RT_GROUP_SCHED version, layout */ + WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group); return rt_rq->rq; } static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) { + WARN_ON(!rt_group_sched_enabled() && rt_se->rt_rq->tg != &root_task_group); return rt_se->rt_rq; } @@ -189,11 +191,15 @@ static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) { struct rt_rq *rt_rq = rt_se->rt_rq; + WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group); return rt_rq->rq; } void unregister_rt_sched_group(struct task_group *tg) { + if (!rt_group_sched_enabled()) + return; + if (tg->rt_se) destroy_rt_bandwidth(&tg->rt_bandwidth); } @@ -202,6 +208,9 @@ void free_rt_sched_group(struct task_group *tg) { int i; + if (!rt_group_sched_enabled()) + return; + for_each_possible_cpu(i) { if (tg->rt_rq) kfree(tg->rt_rq[i]); @@ -246,6 +255,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) struct sched_rt_entity *rt_se; int i; + if (!rt_group_sched_enabled()) + return 1; + tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL); if (!tg->rt_rq) goto err; @@ -484,9 +496,6 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) { - if (!rt_rq->tg) - return RUNTIME_INF; - return rt_rq->rt_runtime; } @@ -499,6 +508,11 @@ typedef struct task_group *rt_rq_iter_t; static inline struct task_group *next_task_group(struct task_group *tg) { + if (!rt_group_sched_enabled()) { + WARN_ON(tg != &root_task_group); + return NULL; + } + do { tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list); @@ -511,9 +525,9 @@ static inline struct task_group *next_task_group(struct task_group *tg) } #define for_each_rt_rq(rt_rq, iter, rq) \ - for (iter = container_of(&task_groups, typeof(*iter), list); \ - (iter = next_task_group(iter)) && \ - (rt_rq = iter->rt_rq[cpu_of(rq)]);) + for (iter = &root_task_group; \ + iter && (rt_rq = iter->rt_rq[cpu_of(rq)]); \ + iter = next_task_group(iter)) #define for_each_sched_rt_entity(rt_se) \ for (; rt_se; rt_se = rt_se->parent) @@ -1068,13 +1082,12 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); -#ifdef CONFIG_RT_GROUP_SCHED /* * Change rq's cpupri only if rt_rq is the top queue. */ - if (&rq->rt != rt_rq) + if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq) return; -#endif + if (rq->online && prio < prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, prio); } @@ -1084,13 +1097,12 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); -#ifdef CONFIG_RT_GROUP_SCHED /* * Change rq's cpupri only if rt_rq is the top queue. */ - if (&rq->rt != rt_rq) + if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq) return; -#endif + if (rq->online && rt_rq->highest_prio.curr != prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); } @@ -1158,8 +1170,7 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) if (rt_se_boosted(rt_se)) rt_rq->rt_nr_boosted++; - if (rt_rq->tg) - start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); + start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); } static void @@ -1259,11 +1270,9 @@ static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_arr static inline struct sched_statistics * __schedstats_from_rt_se(struct sched_rt_entity *rt_se) { -#ifdef CONFIG_RT_GROUP_SCHED /* schedstats is not supported for rt group. */ if (!rt_entity_is_task(rt_se)) return NULL; -#endif return &rt_task_of(rt_se)->stats; } @@ -1713,7 +1722,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq) BUG_ON(idx >= MAX_RT_PRIO); queue = array->queue + idx; - if (SCHED_WARN_ON(list_empty(queue))) + if (WARN_ON_ONCE(list_empty(queue))) return NULL; next = list_entry(queue->next, struct sched_rt_entity, run_list); @@ -1885,6 +1894,27 @@ static int find_lowest_rq(struct task_struct *task) return -1; } +static struct task_struct *pick_next_pushable_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_tasks(rq)) + return NULL; + + p = plist_first_entry(&rq->rt.pushable_tasks, + struct task_struct, pushable_tasks); + + BUG_ON(rq->cpu != task_cpu(p)); + BUG_ON(task_current(rq, p)); + BUG_ON(task_current_donor(rq, p)); + BUG_ON(p->nr_cpus_allowed <= 1); + + BUG_ON(!task_on_rq_queued(p)); + BUG_ON(!rt_task(p)); + + return p; +} + /* Will lock the rq it finds */ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) { @@ -1915,18 +1945,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) /* * We had to unlock the run queue. In * the mean time, task could have - * migrated already or had its affinity changed. - * Also make sure that it wasn't scheduled on its rq. + * migrated already or had its affinity changed, + * therefore check if the task is still at the + * head of the pushable tasks list. * It is possible the task was scheduled, set * "migrate_disabled" and then got preempted, so we must * check the task migration disable flag here too. */ - if (unlikely(task_rq(task) != rq || + if (unlikely(is_migration_disabled(task) || !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || - task_on_cpu(rq, task) || - !rt_task(task) || - is_migration_disabled(task) || - !task_on_rq_queued(task))) { + task != pick_next_pushable_task(rq))) { double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; @@ -1946,27 +1974,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) return lowest_rq; } -static struct task_struct *pick_next_pushable_task(struct rq *rq) -{ - struct task_struct *p; - - if (!has_pushable_tasks(rq)) - return NULL; - - p = plist_first_entry(&rq->rt.pushable_tasks, - struct task_struct, pushable_tasks); - - BUG_ON(rq->cpu != task_cpu(p)); - BUG_ON(task_current(rq, p)); - BUG_ON(task_current_donor(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); - - BUG_ON(!task_on_rq_queued(p)); - BUG_ON(!rt_task(p)); - - return p; -} - /* * If the current CPU has more than one RT task, see if the non * running task can migrate over to a CPU that is running a task @@ -2604,8 +2611,9 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu) { struct rt_rq *rt_rq; -#ifdef CONFIG_RT_GROUP_SCHED +#ifdef CONFIG_RT_GROUP_SCHED // XXX maybe add task_rt_rq(), see also sched_rt_period_rt_rq rt_rq = task_group(p)->rt_rq[cpu]; + WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group); #else rt_rq = &cpu_rq(cpu)->rt; #endif @@ -2715,6 +2723,9 @@ static int tg_rt_schedulable(struct task_group *tg, void *data) tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg)) return -EBUSY; + if (WARN_ON(!rt_group_sched_enabled() && tg != &root_task_group)) + return -EBUSY; + total = to_ratio(period, runtime); /* @@ -2870,7 +2881,7 @@ static int sched_rt_global_constraints(void) int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) { /* Don't accept real-time tasks when there is no way for them to run */ - if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) + if (rt_group_sched_enabled() && rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) return 0; return 1; @@ -2910,6 +2921,7 @@ static int sched_rt_handler(const struct ctl_table *table, int write, void *buff int ret; mutex_lock(&mutex); + sched_domains_mutex_lock(); old_period = sysctl_sched_rt_period; old_runtime = sysctl_sched_rt_runtime; @@ -2936,6 +2948,7 @@ undo: sysctl_sched_rt_period = old_period; sysctl_sched_rt_runtime = old_runtime; } + sched_domains_mutex_unlock(); mutex_unlock(&mutex); return ret; @@ -2967,7 +2980,6 @@ static int sched_rr_handler(const struct ctl_table *table, int write, void *buff } #endif /* CONFIG_SYSCTL */ -#ifdef CONFIG_SCHED_DEBUG void print_rt_stats(struct seq_file *m, int cpu) { rt_rq_iter_t iter; @@ -2978,4 +2990,3 @@ void print_rt_stats(struct seq_file *m, int cpu) print_rt_rq(m, cpu, rt_rq); rcu_read_unlock(); } -#endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b93c8c3dc05a..475bb5998295 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -91,12 +91,6 @@ struct cpuidle_state; #include "cpupri.h" #include "cpudeadline.h" -#ifdef CONFIG_SCHED_DEBUG -# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) -#else -# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) -#endif - /* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 #define TASK_ON_RQ_MIGRATING 2 @@ -819,15 +813,17 @@ struct rt_rq { #ifdef CONFIG_RT_GROUP_SCHED int rt_throttled; - u64 rt_time; - u64 rt_runtime; + u64 rt_time; /* consumed RT time, goes up in update_curr_rt */ + u64 rt_runtime; /* allotted RT time, "slice" from rt_bandwidth, RT sharing/balancing */ /* Nests inside the rq lock: */ raw_spinlock_t rt_runtime_lock; unsigned int rt_nr_boosted; - struct rq *rq; - struct task_group *tg; + struct rq *rq; /* this is always top-level rq, cache? */ +#endif +#ifdef CONFIG_CGROUP_SCHED + struct task_group *tg; /* this tg has "this" rt_rq on given CPU for runnable entities */ #endif }; @@ -998,7 +994,7 @@ struct root_domain { * Also, some corner cases, like 'wrap around' is dangerous, but given * that u64 is 'big enough'. So that shouldn't be a concern. */ - u64 visit_gen; + u64 visit_cookie; #ifdef HAVE_RT_PUSH_IPI /* @@ -1180,10 +1176,8 @@ struct rq { atomic_t nr_iowait; -#ifdef CONFIG_SCHED_DEBUG u64 last_seen_need_resched_ns; int ticks_without_resched; -#endif #ifdef CONFIG_MEMBARRIER int membarrier_state; @@ -1506,6 +1500,23 @@ static inline bool sched_group_cookie_match(struct rq *rq, } #endif /* !CONFIG_SCHED_CORE */ +#ifdef CONFIG_RT_GROUP_SCHED +# ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED +DECLARE_STATIC_KEY_FALSE(rt_group_sched); +static inline bool rt_group_sched_enabled(void) +{ + return static_branch_unlikely(&rt_group_sched); +} +# else +DECLARE_STATIC_KEY_TRUE(rt_group_sched); +static inline bool rt_group_sched_enabled(void) +{ + return static_branch_likely(&rt_group_sched); +} +# endif /* CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */ +#else +# define rt_group_sched_enabled() false +#endif /* CONFIG_RT_GROUP_SCHED */ static inline void lockdep_assert_rq_held(struct rq *rq) { @@ -1571,7 +1582,7 @@ static inline void update_idle_core(struct rq *rq) { } static inline struct task_struct *task_of(struct sched_entity *se) { - SCHED_WARN_ON(!entity_is_task(se)); + WARN_ON_ONCE(!entity_is_task(se)); return container_of(se, struct task_struct, se); } @@ -1652,7 +1663,7 @@ static inline void assert_clock_updated(struct rq *rq) * The only reason for not seeing a clock update since the * last rq_pin_lock() is if we're currently skipping updates. */ - SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP); + WARN_ON_ONCE(rq->clock_update_flags < RQCF_ACT_SKIP); } static inline u64 rq_clock(struct rq *rq) @@ -1699,7 +1710,7 @@ static inline void rq_clock_cancel_skipupdate(struct rq *rq) static inline void rq_clock_start_loop_update(struct rq *rq) { lockdep_assert_rq_held(rq); - SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP); + WARN_ON_ONCE(rq->clock_update_flags & RQCF_ACT_SKIP); rq->clock_update_flags |= RQCF_ACT_SKIP; } @@ -1712,14 +1723,12 @@ static inline void rq_clock_stop_loop_update(struct rq *rq) struct rq_flags { unsigned long flags; struct pin_cookie cookie; -#ifdef CONFIG_SCHED_DEBUG /* * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the * current pin context is stashed here in case it needs to be * restored in rq_repin_lock(). */ unsigned int clock_update_flags; -#endif }; extern struct balance_callback balance_push_callback; @@ -1727,10 +1736,10 @@ extern struct balance_callback balance_push_callback; #ifdef CONFIG_SCHED_CLASS_EXT extern const struct sched_class ext_sched_class; -DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); /* SCX BPF scheduler loaded */ +DECLARE_STATIC_KEY_FALSE(__scx_enabled); /* SCX BPF scheduler loaded */ DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */ -#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) +#define scx_enabled() static_branch_unlikely(&__scx_enabled) #define scx_switched_all() static_branch_unlikely(&__scx_switched_all) static inline void scx_rq_clock_update(struct rq *rq, u64 clock) @@ -1770,21 +1779,18 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) { rf->cookie = lockdep_pin_lock(__rq_lockp(rq)); -#ifdef CONFIG_SCHED_DEBUG rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; -# ifdef CONFIG_SMP - SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback); -# endif +#ifdef CONFIG_SMP + WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback); #endif } static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) { -#ifdef CONFIG_SCHED_DEBUG if (rq->clock_update_flags > RQCF_ACT_SKIP) rf->clock_update_flags = RQCF_UPDATED; -#endif + scx_rq_clock_invalidate(rq); lockdep_unpin_lock(__rq_lockp(rq), rf->cookie); } @@ -1793,12 +1799,10 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) { lockdep_repin_lock(__rq_lockp(rq), rf->cookie); -#ifdef CONFIG_SCHED_DEBUG /* * Restore the value we stashed in @rf for this pin context. */ rq->clock_update_flags |= rf->clock_update_flags; -#endif } extern @@ -2072,9 +2076,7 @@ struct sched_group_capacity { unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ -#ifdef CONFIG_SCHED_DEBUG int id; -#endif unsigned long cpumask[]; /* Balance mask */ }; @@ -2114,13 +2116,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg) extern int group_balance_cpu(struct sched_group *sg); -#ifdef CONFIG_SCHED_DEBUG extern void update_sched_domain_debugfs(void); extern void dirty_sched_domain_sysctl(int cpu); -#else -static inline void update_sched_domain_debugfs(void) { } -static inline void dirty_sched_domain_sysctl(int cpu) { } -#endif extern int sched_update_scaling(void); @@ -2168,6 +2165,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif #ifdef CONFIG_RT_GROUP_SCHED + /* + * p->rt.rt_rq is NULL initially and it is easier to assign + * root_task_group's rt_rq than switching in rt_rq_of_se() + * Clobbers tg(!) + */ + if (!rt_group_sched_enabled()) + tg = &root_task_group; p->rt.rt_rq = tg->rt_rq[cpu]; p->rt.parent = tg->rt_se[cpu]; #endif @@ -2200,13 +2204,8 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) } /* - * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + * Tunables: */ -#ifdef CONFIG_SCHED_DEBUG -# define const_debug __read_mostly -#else -# define const_debug const -#endif #define SCHED_FEAT(name, enabled) \ __SCHED_FEAT_##name , @@ -2218,13 +2217,11 @@ enum { #undef SCHED_FEAT -#ifdef CONFIG_SCHED_DEBUG - /* * To support run-time toggling of sched features, all the translation units * (but core.c) reference the sysctl_sched_features defined in core.c. */ -extern const_debug unsigned int sysctl_sched_features; +extern __read_mostly unsigned int sysctl_sched_features; #ifdef CONFIG_JUMP_LABEL @@ -2246,24 +2243,6 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #endif /* !CONFIG_JUMP_LABEL */ -#else /* !SCHED_DEBUG: */ - -/* - * Each translation unit has its own copy of sysctl_sched_features to allow - * constants propagation at compile time and compiler optimization based on - * features default. - */ -#define SCHED_FEAT(name, enabled) \ - (1UL << __SCHED_FEAT_##name) * enabled | -static const_debug __maybe_unused unsigned int sysctl_sched_features = -#include "features.h" - 0; -#undef SCHED_FEAT - -#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) - -#endif /* !SCHED_DEBUG */ - extern struct static_key_false sched_numa_balancing; extern struct static_key_false sched_schedstats; @@ -2685,7 +2664,7 @@ static inline void idle_set_state(struct rq *rq, static inline struct cpuidle_state *idle_get_state(struct rq *rq) { - SCHED_WARN_ON(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held()); return rq->idle_state; } @@ -2843,12 +2822,11 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); # define SCHED_NR_MIGRATE_BREAK 32 #endif -extern const_debug unsigned int sysctl_sched_nr_migrate; -extern const_debug unsigned int sysctl_sched_migration_cost; +extern __read_mostly unsigned int sysctl_sched_nr_migrate; +extern __read_mostly unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_base_slice; -#ifdef CONFIG_SCHED_DEBUG extern int sysctl_resched_latency_warn_ms; extern int sysctl_resched_latency_warn_once; @@ -2859,7 +2837,6 @@ extern unsigned int sysctl_numa_balancing_scan_period_min; extern unsigned int sysctl_numa_balancing_scan_period_max; extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_numa_balancing_hot_threshold; -#endif #ifdef CONFIG_SCHED_HRTICK @@ -2932,7 +2909,6 @@ unsigned long arch_scale_freq_capacity(int cpu) } #endif -#ifdef CONFIG_SCHED_DEBUG /* * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to * acquire rq lock instead of rq_lock(). So at the end of these two functions @@ -2947,9 +2923,6 @@ static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); #endif } -#else -static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) { } -#endif #define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \ __DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \ @@ -3162,7 +3135,6 @@ extern struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); -#ifdef CONFIG_SCHED_DEBUG extern bool sched_debug_verbose; extern void print_cfs_stats(struct seq_file *m, int cpu); @@ -3173,15 +3145,12 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); extern void resched_latency_warn(int cpu, u64 latency); -# ifdef CONFIG_NUMA_BALANCING +#ifdef CONFIG_NUMA_BALANCING extern void show_numa_stats(struct task_struct *p, struct seq_file *m); extern void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, unsigned long tpf, unsigned long gsf, unsigned long gpf); -# endif /* CONFIG_NUMA_BALANCING */ -#else /* !CONFIG_SCHED_DEBUG: */ -static inline void resched_latency_warn(int cpu, u64 latency) { } -#endif /* !CONFIG_SCHED_DEBUG */ +#endif /* CONFIG_NUMA_BALANCING */ extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq); @@ -3259,11 +3228,11 @@ struct irqtime { }; DECLARE_PER_CPU(struct irqtime, cpu_irqtime); -DECLARE_STATIC_KEY_FALSE(sched_clock_irqtime); +extern int sched_clock_irqtime; static inline int irqtime_enabled(void) { - return static_branch_likely(&sched_clock_irqtime); + return sched_clock_irqtime; } /* @@ -3394,6 +3363,31 @@ static inline bool update_other_load_avgs(struct rq *rq) { return false; } unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); +/* + * When uclamp is compiled in, the aggregation at rq level is 'turned off' + * by default in the fast path and only gets turned on once userspace performs + * an operation that requires it. + * + * Returns true if userspace opted-in to use uclamp and aggregation at rq level + * hence is active. + */ +static inline bool uclamp_is_used(void) +{ + return static_branch_likely(&sched_uclamp_used); +} + +/* + * Enabling static branches would get the cpus_read_lock(), + * check whether uclamp_is_used before enable it to avoid always + * calling cpus_read_lock(). Because we never disable this + * static key once enable it. + */ +static inline void sched_uclamp_enable(void) +{ + if (!uclamp_is_used()) + static_branch_enable(&sched_uclamp_used); +} + static inline unsigned long uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id) { @@ -3417,7 +3411,7 @@ static inline bool uclamp_rq_is_capped(struct rq *rq) unsigned long rq_util; unsigned long max_util; - if (!static_branch_likely(&sched_uclamp_used)) + if (!uclamp_is_used()) return false; rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq); @@ -3426,19 +3420,6 @@ static inline bool uclamp_rq_is_capped(struct rq *rq) return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util; } -/* - * When uclamp is compiled in, the aggregation at rq level is 'turned off' - * by default in the fast path and only gets turned on once userspace performs - * an operation that requires it. - * - * Returns true if userspace opted-in to use uclamp and aggregation at rq level - * hence is active. - */ -static inline bool uclamp_is_used(void) -{ - return static_branch_likely(&sched_uclamp_used); -} - #define for_each_clamp_id(clamp_id) \ for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++) @@ -3486,6 +3467,8 @@ static inline bool uclamp_is_used(void) return false; } +static inline void sched_uclamp_enable(void) {} + static inline unsigned long uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id) { @@ -3552,8 +3535,6 @@ static inline bool sched_energy_enabled(void) return static_branch_unlikely(&sched_energy_present); } -extern struct cpufreq_governor schedutil_gov; - #else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ #define perf_domain_span(pd) NULL @@ -3619,6 +3600,7 @@ extern int preempt_dynamic_mode; extern int sched_dynamic_mode(const char *str); extern void sched_dynamic_update(int mode); #endif +extern const char *preempt_modes[]; #ifdef CONFIG_SCHED_MM_CID @@ -3698,10 +3680,28 @@ static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) { struct cpumask *cidmask = mm_cidmask(mm); struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid = __this_cpu_read(pcpu_cid->recent_cid); + int cid, max_nr_cid, allowed_max_nr_cid; + /* + * After shrinking the number of threads or reducing the number + * of allowed cpus, reduce the value of max_nr_cid so expansion + * of cid allocation will preserve cache locality if the number + * of threads or allowed cpus increase again. + */ + max_nr_cid = atomic_read(&mm->max_nr_cid); + while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), + atomic_read(&mm->mm_users))), + max_nr_cid > allowed_max_nr_cid) { + /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */ + if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) { + max_nr_cid = allowed_max_nr_cid; + break; + } + } /* Try to re-use recent cid. This improves cache locality. */ - if (!mm_cid_is_unset(cid) && !cpumask_test_and_set_cpu(cid, cidmask)) + cid = __this_cpu_read(pcpu_cid->recent_cid); + if (!mm_cid_is_unset(cid) && cid < max_nr_cid && + !cpumask_test_and_set_cpu(cid, cidmask)) return cid; /* * Expand cid allocation if the maximum number of concurrency @@ -3709,8 +3709,9 @@ static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) * and number of threads. Expanding cid allocation as much as * possible improves cache locality. */ - cid = atomic_read(&mm->max_nr_cid); + cid = max_nr_cid; while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) { + /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */ if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) continue; if (!cpumask_test_and_set_cpu(cid, cidmask)) diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 19cdbe96f93d..452826df6ae1 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -144,7 +144,7 @@ static inline void psi_enqueue(struct task_struct *p, int flags) if (p->se.sched_delayed) { /* CPU migration of "sleeping" task */ - SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED)); + WARN_ON_ONCE(!(flags & ENQUEUE_MIGRATED)); if (p->in_memstall) set |= TSK_MEMSTALL; if (p->in_iowait) diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 456d339be98f..547c1f05b667 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -368,7 +368,7 @@ static int uclamp_validate(struct task_struct *p, * blocking operation which obviously cannot be done while holding * scheduler locks. */ - static_branch_enable(&sched_uclamp_used); + sched_uclamp_enable(); return 0; } @@ -634,13 +634,14 @@ change: * Do not allow real-time tasks into groups that have no runtime * assigned. */ - if (rt_bandwidth_enabled() && rt_policy(policy) && + if (rt_group_sched_enabled() && + rt_bandwidth_enabled() && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0 && !task_group_is_autogroup(task_group(p))) { retval = -EPERM; goto unlock; } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SMP if (dl_bandwidth_enabled() && dl_policy(policy) && !(attr->sched_flags & SCHED_FLAG_SUGOV)) { @@ -875,7 +876,7 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { struct sched_param lparam; - if (!param || pid < 0) + if (unlikely(!param || pid < 0)) return -EINVAL; if (copy_from_user(&lparam, param, sizeof(struct sched_param))) return -EFAULT; @@ -984,7 +985,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, struct sched_attr attr; int retval; - if (!uattr || pid < 0 || flags) + if (unlikely(!uattr || pid < 0 || flags)) return -EINVAL; retval = sched_copy_attr(uattr, &attr); @@ -1049,7 +1050,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) struct task_struct *p; int retval; - if (!param || pid < 0) + if (unlikely(!param || pid < 0)) return -EINVAL; scoped_guard (rcu) { @@ -1085,8 +1086,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, struct task_struct *p; int retval; - if (!uattr || pid < 0 || usize > PAGE_SIZE || - usize < SCHED_ATTR_SIZE_VER0 || flags) + if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE || + usize < SCHED_ATTR_SIZE_VER0 || flags)) return -EINVAL; scoped_guard (rcu) { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index c49aea8c1025..b958fe48e020 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -6,13 +6,19 @@ #include <linux/bsearch.h> DEFINE_MUTEX(sched_domains_mutex); +void sched_domains_mutex_lock(void) +{ + mutex_lock(&sched_domains_mutex); +} +void sched_domains_mutex_unlock(void) +{ + mutex_unlock(&sched_domains_mutex); +} /* Protected by sched_domains_mutex: */ static cpumask_var_t sched_domains_tmpmask; static cpumask_var_t sched_domains_tmpmask2; -#ifdef CONFIG_SCHED_DEBUG - static int __init sched_debug_setup(char *str) { sched_debug_verbose = true; @@ -151,15 +157,6 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) break; } } -#else /* !CONFIG_SCHED_DEBUG */ - -# define sched_debug_verbose 0 -# define sched_domain_debug(sd, cpu) do { } while (0) -static inline bool sched_debug(void) -{ - return false; -} -#endif /* CONFIG_SCHED_DEBUG */ /* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */ #define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) | @@ -215,8 +212,6 @@ static bool sched_energy_update; static bool sched_is_eas_possible(const struct cpumask *cpu_mask) { bool any_asym_capacity = false; - struct cpufreq_policy *policy; - struct cpufreq_governor *gov; int i; /* EAS is enabled for asymmetric CPU capacity topologies. */ @@ -251,25 +246,12 @@ static bool sched_is_eas_possible(const struct cpumask *cpu_mask) return false; } - /* Do not attempt EAS if schedutil is not being used. */ - for_each_cpu(i, cpu_mask) { - policy = cpufreq_cpu_get(i); - if (!policy) { - if (sched_debug()) { - pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d", - cpumask_pr_args(cpu_mask), i); - } - return false; - } - gov = policy->governor; - cpufreq_cpu_put(policy); - if (gov != &schedutil_gov) { - if (sched_debug()) { - pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n", - cpumask_pr_args(cpu_mask)); - } - return false; + if (!cpufreq_ready_for_eas(cpu_mask)) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS: cpufreq is not ready\n", + cpumask_pr_args(cpu_mask)); } + return false; } return true; @@ -560,7 +542,7 @@ static int init_rootdomain(struct root_domain *rd) rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func); #endif - rd->visit_gen = 0; + rd->visit_cookie = 0; init_dl_bw(&rd->dl_bw); if (cpudl_init(&rd->cpudl) != 0) goto free_rto_mask; @@ -1336,6 +1318,64 @@ next: update_group_capacity(sd, cpu); } +#ifdef CONFIG_SMP + +/* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */ +void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) +{ + int asym_prefer_cpu = cpu; + struct sched_domain *sd; + + guard(rcu)(); + + for_each_domain(cpu, sd) { + struct sched_group *sg; + int group_cpu; + + if (!(sd->flags & SD_ASYM_PACKING)) + continue; + + /* + * Groups of overlapping domain are replicated per NUMA + * node and will require updating "asym_prefer_cpu" on + * each local copy. + * + * If you are hitting this warning, consider moving + * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu" + * which is shared by all the overlapping groups. + */ + WARN_ON_ONCE(sd->flags & SD_OVERLAP); + + sg = sd->groups; + if (cpu != sg->asym_prefer_cpu) { + /* + * Since the parent is a superset of the current group, + * if the cpu is not the "asym_prefer_cpu" at the + * current level, it cannot be the preferred CPU at a + * higher levels either. + */ + if (!sched_asym_prefer(cpu, sg->asym_prefer_cpu)) + return; + + WRITE_ONCE(sg->asym_prefer_cpu, cpu); + continue; + } + + /* Ranking has improved; CPU is still the preferred one. */ + if (new_prio >= old_prio) + continue; + + for_each_cpu(group_cpu, sched_group_span(sg)) { + if (sched_asym_prefer(group_cpu, asym_prefer_cpu)) + asym_prefer_cpu = group_cpu; + } + + WRITE_ONCE(sg->asym_prefer_cpu, asym_prefer_cpu); + } +} + +#endif /* CONFIG_SMP */ + /* * Set of available CPUs grouped by their corresponding capacities * Each list entry contains a CPU mask reflecting CPUs that share the same @@ -2101,7 +2141,7 @@ int sched_numa_find_closest(const struct cpumask *cpus, int cpu) for (i = 0; i < sched_domains_numa_levels; i++) { if (!masks[i][j]) break; - cpu = cpumask_any_and(cpus, masks[i][j]); + cpu = cpumask_any_and_distribute(cpus, masks[i][j]); if (cpu < nr_cpu_ids) { found = cpu; break; @@ -2275,9 +2315,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sgc) return -ENOMEM; -#ifdef CONFIG_SCHED_DEBUG sgc->id = j; -#endif *per_cpu_ptr(sdd->sgc, j) = sgc; } @@ -2352,35 +2390,54 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve /* * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for - * any two given CPUs at this (non-NUMA) topology level. + * any two given CPUs on non-NUMA topology levels. */ -static bool topology_span_sane(struct sched_domain_topology_level *tl, - const struct cpumask *cpu_map, int cpu) +static bool topology_span_sane(const struct cpumask *cpu_map) { - int i = cpu + 1; + struct sched_domain_topology_level *tl; + struct cpumask *covered, *id_seen; + int cpu; - /* NUMA levels are allowed to overlap */ - if (tl->flags & SDTL_OVERLAP) - return true; + lockdep_assert_held(&sched_domains_mutex); + covered = sched_domains_tmpmask; + id_seen = sched_domains_tmpmask2; + + for_each_sd_topology(tl) { + + /* NUMA levels are allowed to overlap */ + if (tl->flags & SDTL_OVERLAP) + continue; + + cpumask_clear(covered); + cpumask_clear(id_seen); - /* - * Non-NUMA levels cannot partially overlap - they must be either - * completely equal or completely disjoint. Otherwise we can end up - * breaking the sched_group lists - i.e. a later get_group() pass - * breaks the linking done for an earlier span. - */ - for_each_cpu_from(i, cpu_map) { /* - * We should 'and' all those masks with 'cpu_map' to exactly - * match the topology we're about to build, but that can only - * remove CPUs, which only lessens our ability to detect - * overlaps + * Non-NUMA levels cannot partially overlap - they must be either + * completely equal or completely disjoint. Otherwise we can end up + * breaking the sched_group lists - i.e. a later get_group() pass + * breaks the linking done for an earlier span. */ - if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) && - cpumask_intersects(tl->mask(cpu), tl->mask(i))) - return false; - } + for_each_cpu(cpu, cpu_map) { + const struct cpumask *tl_cpu_mask = tl->mask(cpu); + int id; + + /* lowest bit set in this mask is used as a unique id */ + id = cpumask_first(tl_cpu_mask); + if (cpumask_test_cpu(id, id_seen)) { + /* First CPU has already been seen, ensure identical spans */ + if (!cpumask_equal(tl->mask(id), tl_cpu_mask)) + return false; + } else { + /* First CPU hasn't been seen before, ensure it's a completely new span */ + if (cpumask_intersects(tl_cpu_mask, covered)) + return false; + + cpumask_or(covered, covered, tl_cpu_mask); + cpumask_set_cpu(id, id_seen); + } + } + } return true; } @@ -2413,9 +2470,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att sd = NULL; for_each_sd_topology(tl) { - if (WARN_ON(!topology_span_sane(tl, cpu_map, i))) - goto error; - sd = build_sched_domain(tl, cpu_map, attr, sd, i); has_asym |= sd->flags & SD_ASYM_CPUCAPACITY; @@ -2429,6 +2483,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } } + if (WARN_ON(!topology_span_sane(cpu_map))) + goto error; + /* Build the groups for the domains */ for_each_cpu(i, cpu_map) { for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { @@ -2680,7 +2737,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * * Call with hotplug lock and sched_domains_mutex held */ -void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], +static void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { bool __maybe_unused has_eas = false; @@ -2712,21 +2769,8 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { if (cpumask_equal(doms_cur[i], doms_new[j]) && - dattrs_equal(dattr_cur, i, dattr_new, j)) { - struct root_domain *rd; - - /* - * This domain won't be destroyed and as such - * its dl_bw->total_bw needs to be cleared. - * Tasks contribution will be then recomputed - * in function dl_update_tasks_root_domain(), - * dl_servers contribution in function - * dl_restore_server_root_domain(). - */ - rd = cpu_rq(cpumask_any(doms_cur[i]))->rd; - dl_clear_root_domain(rd); + dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; - } } /* No match - a current sched domain not in new doms_new[] */ detach_destroy_domains(doms_cur[i]); @@ -2783,6 +2827,7 @@ match3: ndoms_cur = ndoms_new; update_sched_domain_debugfs(); + dl_rebuild_rd_accounting(); } /* @@ -2791,7 +2836,7 @@ match3: void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { - mutex_lock(&sched_domains_mutex); + sched_domains_mutex_lock(); partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); - mutex_unlock(&sched_domains_mutex); + sched_domains_mutex_unlock(); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 7bbb408431eb..41aa761c7738 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -29,13 +29,11 @@ #include <linux/syscalls.h> #include <linux/sysctl.h> +#include <asm/syscall.h> + /* Not exposed in headers: strictly internal use only. */ #define SECCOMP_MODE_DEAD (SECCOMP_MODE_FILTER + 1) -#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER -#include <asm/syscall.h> -#endif - #ifdef CONFIG_SECCOMP_FILTER #include <linux/file.h> #include <linux/filter.h> @@ -576,6 +574,9 @@ void seccomp_filter_release(struct task_struct *tsk) if (WARN_ON((tsk->flags & PF_EXITING) == 0)) return; + if (READ_ONCE(tsk->seccomp.filter) == NULL) + return; + spin_lock_irq(&tsk->sighand->siglock); orig = tsk->seccomp.filter; /* Detach task from its filter tree. */ @@ -601,6 +602,13 @@ static inline void seccomp_sync_threads(unsigned long flags) BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); assert_spin_locked(¤t->sighand->siglock); + /* + * Don't touch any of the threads if the process is being killed. + * This allows for a lockless check in seccomp_filter_release. + */ + if (current->signal->flags & SIGNAL_GROUP_EXIT) + return; + /* Synchronize all threads. */ caller = current; for_each_thread(caller, thread) { @@ -1074,6 +1082,13 @@ void secure_computing_strict(int this_syscall) else BUG(); } +int __secure_computing(void) +{ + int this_syscall = syscall_get_nr(current, current_pt_regs()); + + secure_computing_strict(this_syscall); + return 0; +} #else #ifdef CONFIG_SECCOMP_FILTER @@ -1225,13 +1240,12 @@ out: return -1; } -static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, - const bool recheck_after_trace) +static int __seccomp_filter(int this_syscall, const bool recheck_after_trace) { u32 filter_ret, action; + struct seccomp_data sd; struct seccomp_filter *match = NULL; int data; - struct seccomp_data sd_local; /* * Make sure that any changes to mode from another thread have @@ -1239,12 +1253,9 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, */ smp_rmb(); - if (!sd) { - populate_seccomp_data(&sd_local); - sd = &sd_local; - } + populate_seccomp_data(&sd); - filter_ret = seccomp_run_filters(sd, &match); + filter_ret = seccomp_run_filters(&sd, &match); data = filter_ret & SECCOMP_RET_DATA; action = filter_ret & SECCOMP_RET_ACTION_FULL; @@ -1302,13 +1313,13 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, * a reload of all registers. This does not goto skip since * a skip would have already been reported. */ - if (__seccomp_filter(this_syscall, NULL, true)) + if (__seccomp_filter(this_syscall, true)) return -1; return 0; case SECCOMP_RET_USER_NOTIF: - if (seccomp_do_user_notification(this_syscall, match, sd)) + if (seccomp_do_user_notification(this_syscall, match, &sd)) goto skip; return 0; @@ -1350,8 +1361,7 @@ skip: return -1; } #else -static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, - const bool recheck_after_trace) +static int __seccomp_filter(int this_syscall, const bool recheck_after_trace) { BUG(); @@ -1359,7 +1369,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, } #endif -int __secure_computing(const struct seccomp_data *sd) +int __secure_computing(void) { int mode = current->seccomp.mode; int this_syscall; @@ -1368,15 +1378,14 @@ int __secure_computing(const struct seccomp_data *sd) unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) return 0; - this_syscall = sd ? sd->nr : - syscall_get_nr(current, current_pt_regs()); + this_syscall = syscall_get_nr(current, current_pt_regs()); switch (mode) { case SECCOMP_MODE_STRICT: __secure_computing_strict(this_syscall); /* may call do_exit */ return 0; case SECCOMP_MODE_FILTER: - return __seccomp_filter(this_syscall, sd, false); + return __seccomp_filter(this_syscall, false); /* Surviving SECCOMP_RET_KILL_* must be proactively impossible. */ case SECCOMP_MODE_DEAD: WARN_ON_ONCE(1); diff --git a/kernel/signal.c b/kernel/signal.c index 875e97f6205a..148082db9a55 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -176,9 +176,10 @@ static bool recalc_sigpending_tsk(struct task_struct *t) void recalc_sigpending(void) { - if (!recalc_sigpending_tsk(current) && !freezing(current)) - clear_thread_flag(TIF_SIGPENDING); - + if (!recalc_sigpending_tsk(current) && !freezing(current)) { + if (unlikely(test_thread_flag(TIF_SIGPENDING))) + clear_thread_flag(TIF_SIGPENDING); + } } EXPORT_SYMBOL(recalc_sigpending); @@ -2092,7 +2093,7 @@ static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueu * from a non-periodic timer, then just drop the reference * count. Otherwise queue it on the ignored list. */ - if (tmr->it_signal && tmr->it_sig_periodic) + if (posixtimer_valid(tmr) && tmr->it_sig_periodic) hlist_add_head(&tmr->ignored_list, &tsk->signal->ignored_posix_timers); else posixtimer_putref(tmr); @@ -2179,12 +2180,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig) WARN_ON_ONCE(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); - /* - * tsk is a group leader and has no threads, wake up the - * non-PIDFD_THREAD waiters. - */ - if (thread_group_empty(tsk)) - do_notify_pidfd(tsk); + + /* ptraced, or group-leader without sub-threads */ + do_notify_pidfd(tsk); if (sig != SIGCHLD) { /* @@ -4009,6 +4007,47 @@ static struct pid *pidfd_to_pid(const struct file *file) (PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \ PIDFD_SIGNAL_PROCESS_GROUP) +static int do_pidfd_send_signal(struct pid *pid, int sig, enum pid_type type, + siginfo_t __user *info, unsigned int flags) +{ + kernel_siginfo_t kinfo; + + switch (flags) { + case PIDFD_SIGNAL_THREAD: + type = PIDTYPE_PID; + break; + case PIDFD_SIGNAL_THREAD_GROUP: + type = PIDTYPE_TGID; + break; + case PIDFD_SIGNAL_PROCESS_GROUP: + type = PIDTYPE_PGID; + break; + } + + if (info) { + int ret; + + ret = copy_siginfo_from_user_any(&kinfo, info); + if (unlikely(ret)) + return ret; + + if (unlikely(sig != kinfo.si_signo)) + return -EINVAL; + + /* Only allow sending arbitrary signals to yourself. */ + if ((task_pid(current) != pid || type > PIDTYPE_TGID) && + (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) + return -EPERM; + } else { + prepare_kill_siginfo(sig, &kinfo, type); + } + + if (type == PIDTYPE_PGID) + return kill_pgrp_info(sig, &kinfo, pid); + + return kill_pid_info_type(sig, &kinfo, pid, type); +} + /** * sys_pidfd_send_signal - Signal a process through a pidfd * @pidfd: file descriptor of the process @@ -4026,9 +4065,7 @@ static struct pid *pidfd_to_pid(const struct file *file) SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, siginfo_t __user *, info, unsigned int, flags) { - int ret; struct pid *pid; - kernel_siginfo_t kinfo; enum pid_type type; /* Enforce flags be set to 0 until we add an extension. */ @@ -4039,57 +4076,39 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1) return -EINVAL; - CLASS(fd, f)(pidfd); - if (fd_empty(f)) - return -EBADF; + switch (pidfd) { + case PIDFD_SELF_THREAD: + pid = get_task_pid(current, PIDTYPE_PID); + type = PIDTYPE_PID; + break; + case PIDFD_SELF_THREAD_GROUP: + pid = get_task_pid(current, PIDTYPE_TGID); + type = PIDTYPE_TGID; + break; + default: { + CLASS(fd, f)(pidfd); + if (fd_empty(f)) + return -EBADF; - /* Is this a pidfd? */ - pid = pidfd_to_pid(fd_file(f)); - if (IS_ERR(pid)) - return PTR_ERR(pid); + /* Is this a pidfd? */ + pid = pidfd_to_pid(fd_file(f)); + if (IS_ERR(pid)) + return PTR_ERR(pid); - if (!access_pidfd_pidns(pid)) - return -EINVAL; + if (!access_pidfd_pidns(pid)) + return -EINVAL; - switch (flags) { - case 0: /* Infer scope from the type of pidfd. */ if (fd_file(f)->f_flags & PIDFD_THREAD) type = PIDTYPE_PID; else type = PIDTYPE_TGID; - break; - case PIDFD_SIGNAL_THREAD: - type = PIDTYPE_PID; - break; - case PIDFD_SIGNAL_THREAD_GROUP: - type = PIDTYPE_TGID; - break; - case PIDFD_SIGNAL_PROCESS_GROUP: - type = PIDTYPE_PGID; - break; - } - if (info) { - ret = copy_siginfo_from_user_any(&kinfo, info); - if (unlikely(ret)) - return ret; - - if (unlikely(sig != kinfo.si_signo)) - return -EINVAL; - - /* Only allow sending arbitrary signals to yourself. */ - if ((task_pid(current) != pid || type > PIDTYPE_TGID) && - (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) - return -EPERM; - } else { - prepare_kill_siginfo(sig, &kinfo, type); + return do_pidfd_send_signal(pid, sig, type, info, flags); + } } - if (type == PIDTYPE_PGID) - return kill_pgrp_info(sig, &kinfo, pid); - else - return kill_pid_info_type(sig, &kinfo, pid, type); + return do_pidfd_send_signal(pid, sig, type, info, flags); } static int @@ -4962,9 +4981,20 @@ static const struct ctl_table signal_debug_table[] = { #endif }; +static const struct ctl_table signal_table[] = { + { + .procname = "print-fatal-signals", + .data = &print_fatal_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; + static int __init init_signal_sysctls(void) { register_sysctl_init("debug", signal_debug_table); + register_sysctl_init("kernel", signal_table); return 0; } early_initcall(init_signal_sysctls); diff --git a/kernel/softirq.c b/kernel/softirq.c index 4dae6ac2e83f..513b1945987c 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -126,6 +126,18 @@ static DEFINE_PER_CPU(struct softirq_ctrl, softirq_ctrl) = { .lock = INIT_LOCAL_LOCK(softirq_ctrl.lock), }; +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key bh_lock_key; +struct lockdep_map bh_lock_map = { + .name = "local_bh", + .key = &bh_lock_key, + .wait_type_outer = LD_WAIT_FREE, + .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT makes BH preemptible. */ + .lock_type = LD_LOCK_PERCPU, +}; +EXPORT_SYMBOL_GPL(bh_lock_map); +#endif + /** * local_bh_blocked() - Check for idle whether BH processing is blocked * @@ -148,6 +160,8 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) WARN_ON_ONCE(in_hardirq()); + lock_map_acquire_read(&bh_lock_map); + /* First entry of a task into a BH disabled section? */ if (!current->softirq_disable_cnt) { if (preemptible()) { @@ -211,6 +225,8 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) WARN_ON_ONCE(in_hardirq()); lockdep_assert_irqs_enabled(); + lock_map_release(&bh_lock_map); + local_irq_save(flags); curcnt = __this_cpu_read(softirq_ctrl.cnt); @@ -261,6 +277,8 @@ static inline void ksoftirqd_run_begin(void) /* Counterpart to ksoftirqd_run_begin() */ static inline void ksoftirqd_run_end(void) { + /* pairs with the lock_map_acquire_read() in ksoftirqd_run_begin() */ + lock_map_release(&bh_lock_map); __local_bh_enable(SOFTIRQ_OFFSET, true); WARN_ON_ONCE(in_interrupt()); local_irq_enable(); diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c index bb7d066a7c39..269683d41aa9 100644 --- a/kernel/static_call_inline.c +++ b/kernel/static_call_inline.c @@ -206,7 +206,7 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func) continue; } - arch_static_call_transform(site_addr, NULL, func, + arch_static_call_transform(site_addr, tramp, func, static_call_is_tail(site)); } } @@ -325,13 +325,12 @@ static int __static_call_mod_text_reserved(void *start, void *end) struct module *mod; int ret; - preempt_disable(); - mod = __module_text_address((unsigned long)start); - WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); - if (!try_module_get(mod)) - mod = NULL; - preempt_enable(); - + scoped_guard(rcu) { + mod = __module_text_address((unsigned long)start); + WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + if (!try_module_get(mod)) + mod = NULL; + } if (!mod) return 0; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 8896d844d738..5d2d0562115b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -250,6 +250,7 @@ static int multi_cpu_stop(void *data) * be detected and reported on their side. */ touch_nmi_watchdog(); + /* Also suppress RCU CPU stall warnings. */ rcu_momentary_eqs(); } } while (curstate != MULTI_STOP_EXIT); diff --git a/kernel/sys.c b/kernel/sys.c index cb366ff8703a..adc0de0aa364 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -52,6 +52,7 @@ #include <linux/user_namespace.h> #include <linux/time_namespace.h> #include <linux/binfmts.h> +#include <linux/futex.h> #include <linux/sched.h> #include <linux/sched/autogroup.h> @@ -1085,6 +1086,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) { struct task_struct *p; struct task_struct *group_leader = current->group_leader; + struct pid *pids[PIDTYPE_MAX] = { 0 }; struct pid *pgrp; int err; @@ -1142,13 +1144,14 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) goto out; if (task_pgrp(p) != pgrp) - change_pid(p, PIDTYPE_PGID, pgrp); + change_pid(pids, p, PIDTYPE_PGID, pgrp); err = 0; out: /* All paths lead to here, thus we are safe. -DaveM */ write_unlock_irq(&tasklist_lock); rcu_read_unlock(); + free_pids(pids); return err; } @@ -1222,21 +1225,22 @@ out: return retval; } -static void set_special_pids(struct pid *pid) +static void set_special_pids(struct pid **pids, struct pid *pid) { struct task_struct *curr = current->group_leader; if (task_session(curr) != pid) - change_pid(curr, PIDTYPE_SID, pid); + change_pid(pids, curr, PIDTYPE_SID, pid); if (task_pgrp(curr) != pid) - change_pid(curr, PIDTYPE_PGID, pid); + change_pid(pids, curr, PIDTYPE_PGID, pid); } int ksys_setsid(void) { struct task_struct *group_leader = current->group_leader; struct pid *sid = task_pid(group_leader); + struct pid *pids[PIDTYPE_MAX] = { 0 }; pid_t session = pid_vnr(sid); int err = -EPERM; @@ -1252,13 +1256,14 @@ int ksys_setsid(void) goto out; group_leader->signal->leader = 1; - set_special_pids(sid); + set_special_pids(pids, sid); proc_clear_tty(group_leader); err = session; out: write_unlock_irq(&tasklist_lock); + free_pids(pids); if (err > 0) { proc_sid_connector(group_leader); sched_autogroup_create_attach(group_leader); @@ -2811,6 +2816,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; error = arch_lock_shadow_stack_status(me, arg2); break; + case PR_TIMER_CREATE_RESTORE_IDS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = posixtimer_create_prctl(arg2); + break; + case PR_FUTEX_HASH: + error = futex_hash_prctl(arg2, arg3, arg4); + break; default: trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5); error = -EINVAL; diff --git a/kernel/sysctl-test.c b/kernel/sysctl-test.c index eb2842bd0557..92f94ea28957 100644 --- a/kernel/sysctl-test.c +++ b/kernel/sysctl-test.c @@ -367,54 +367,6 @@ static void sysctl_test_api_dointvec_write_single_greater_int_max( KUNIT_EXPECT_EQ(test, 0, *((int *)table.data)); } -/* - * Test that registering an invalid extra value is not allowed. - */ -static void sysctl_test_register_sysctl_sz_invalid_extra_value( - struct kunit *test) -{ - unsigned char data = 0; - const struct ctl_table table_foo[] = { - { - .procname = "foo", - .data = &data, - .maxlen = sizeof(u8), - .mode = 0644, - .proc_handler = proc_dou8vec_minmax, - .extra1 = SYSCTL_FOUR, - .extra2 = SYSCTL_ONE_THOUSAND, - }, - }; - - const struct ctl_table table_bar[] = { - { - .procname = "bar", - .data = &data, - .maxlen = sizeof(u8), - .mode = 0644, - .proc_handler = proc_dou8vec_minmax, - .extra1 = SYSCTL_NEG_ONE, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - }; - - const struct ctl_table table_qux[] = { - { - .procname = "qux", - .data = &data, - .maxlen = sizeof(u8), - .mode = 0644, - .proc_handler = proc_dou8vec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO_HUNDRED, - }, - }; - - KUNIT_EXPECT_NULL(test, register_sysctl("foo", table_foo)); - KUNIT_EXPECT_NULL(test, register_sysctl("foo", table_bar)); - KUNIT_EXPECT_NOT_NULL(test, register_sysctl("foo", table_qux)); -} - static struct kunit_case sysctl_test_cases[] = { KUNIT_CASE(sysctl_test_api_dointvec_null_tbl_data), KUNIT_CASE(sysctl_test_api_dointvec_table_maxlen_unset), @@ -426,7 +378,6 @@ static struct kunit_case sysctl_test_cases[] = { KUNIT_CASE(sysctl_test_dointvec_write_happy_single_negative), KUNIT_CASE(sysctl_test_api_dointvec_write_single_less_int_min), KUNIT_CASE(sysctl_test_api_dointvec_write_single_greater_int_max), - KUNIT_CASE(sysctl_test_register_sysctl_sz_invalid_extra_value), {} }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cb57da499ebb..9b4f0cff76ea 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -20,18 +20,12 @@ */ #include <linux/module.h> -#include <linux/mm.h> -#include <linux/swap.h> -#include <linux/slab.h> #include <linux/sysctl.h> #include <linux/bitmap.h> -#include <linux/signal.h> -#include <linux/panic.h> #include <linux/printk.h> #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/ctype.h> -#include <linux/kmemleak.h> #include <linux/filter.h> #include <linux/fs.h> #include <linux/init.h> @@ -42,26 +36,19 @@ #include <linux/highuid.h> #include <linux/writeback.h> #include <linux/ratelimit.h> -#include <linux/hugetlb.h> #include <linux/initrd.h> #include <linux/key.h> #include <linux/times.h> #include <linux/limits.h> -#include <linux/dcache.h> #include <linux/syscalls.h> -#include <linux/vmstat.h> #include <linux/nfs_fs.h> #include <linux/acpi.h> #include <linux/reboot.h> -#include <linux/ftrace.h> -#include <linux/perf_event.h> -#include <linux/oom.h> #include <linux/kmod.h> #include <linux/capability.h> #include <linux/binfmts.h> #include <linux/sched/sysctl.h> #include <linux/mount.h> -#include <linux/userfaultfd_k.h> #include <linux/pid.h> #include "../lib/kstrtox.h" @@ -71,12 +58,8 @@ #ifdef CONFIG_X86 #include <asm/nmi.h> -#include <asm/stacktrace.h> #include <asm/io.h> #endif -#ifdef CONFIG_SPARC -#include <asm/setup.h> -#endif #ifdef CONFIG_RT_MUTEXES #include <linux/rtmutex.h> #endif @@ -91,12 +74,6 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); #if defined(CONFIG_SYSCTL) /* Constants used for minimum and maximum */ - -#ifdef CONFIG_PERF_EVENTS -static const int six_hundred_forty_kb = 640 * 1024; -#endif - - static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -129,12 +106,6 @@ enum sysctl_writes_mode { static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; #endif /* CONFIG_PROC_SYSCTL */ - -#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ - defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) -int sysctl_legacy_va_layout; -#endif - #endif /* CONFIG_SYSCTL */ /* @@ -1610,13 +1581,6 @@ int proc_do_static_key(const struct ctl_table *table, int write, } static const struct ctl_table kern_table[] = { - { - .procname = "panic", - .data = &panic_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", @@ -1634,45 +1598,6 @@ static const struct ctl_table kern_table[] = { .extra2 = SYSCTL_ONE, }, #endif - { - .procname = "print-fatal-signals", - .data = &print_fatal_signals, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#ifdef CONFIG_SPARC - { - .procname = "reboot-cmd", - .data = reboot_command, - .maxlen = 256, - .mode = 0644, - .proc_handler = proc_dostring, - }, - { - .procname = "stop-a", - .data = &stop_a_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "scons-poweroff", - .data = &scons_pwroff, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_SPARC64 - { - .procname = "tsb-ratio", - .data = &sysctl_tsb_ratio, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_PARISC { .procname = "soft-power", @@ -1691,38 +1616,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_STACK_TRACER - { - .procname = "stack_tracer_enabled", - .data = &stack_tracer_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = stack_trace_sysctl, - }, -#endif -#ifdef CONFIG_TRACING - { - .procname = "ftrace_dump_on_oops", - .data = &ftrace_dump_on_oops, - .maxlen = MAX_TRACER_SIZE, - .mode = 0644, - .proc_handler = proc_dostring, - }, - { - .procname = "traceoff_on_warning", - .data = &__disable_trace_on_warning, - .maxlen = sizeof(__disable_trace_on_warning), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tracepoint_printk", - .data = &tracepoint_printk, - .maxlen = sizeof(tracepoint_printk), - .mode = 0644, - .proc_handler = tracepoint_printk_sysctl, - }, -#endif #ifdef CONFIG_MODULES { .procname = "modprobe", @@ -1794,29 +1687,6 @@ static const struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_MAXOLDUID, }, -#ifdef CONFIG_S390 - { - .procname = "userprocess_debug", - .data = &show_unhandled_signals, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - { - .procname = "panic_on_oops", - .data = &panic_on_oops, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "panic_print", - .data = &panic_print, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, { .procname = "ngroups_max", .data = (void *)&ngroups_max, @@ -1831,16 +1701,6 @@ static const struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) - { - .procname = "unknown_nmi_panic", - .data = &unknown_nmi_panic, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - #if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \ defined(CONFIG_DEBUG_STACKOVERFLOW) { @@ -1851,43 +1711,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#if defined(CONFIG_X86) - { - .procname = "panic_on_unrecovered_nmi", - .data = &panic_on_unrecovered_nmi, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "panic_on_io_nmi", - .data = &panic_on_io_nmi, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "bootloader_type", - .data = &bootloader_type, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "bootloader_version", - .data = &bootloader_version, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "io_delay_type", - .data = &io_delay_type, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #if defined(CONFIG_MMU) { .procname = "randomize_va_space", @@ -1897,24 +1720,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#if defined(CONFIG_S390) && defined(CONFIG_SMP) - { - .procname = "spin_retry", - .data = &spin_retry, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) - { - .procname = "acpi_video_flags", - .data = &acpi_realmode_flags, - .maxlen = sizeof (unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#endif #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN { .procname = "ignore-unaligned-usertrap", @@ -1933,72 +1738,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_PERF_EVENTS - /* - * User-space scripts rely on the existence of this file - * as a feature check for perf_events being enabled. - * - * So it's an ABI, do not remove! - */ - { - .procname = "perf_event_paranoid", - .data = &sysctl_perf_event_paranoid, - .maxlen = sizeof(sysctl_perf_event_paranoid), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "perf_event_mlock_kb", - .data = &sysctl_perf_event_mlock, - .maxlen = sizeof(sysctl_perf_event_mlock), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "perf_event_max_sample_rate", - .data = &sysctl_perf_event_sample_rate, - .maxlen = sizeof(sysctl_perf_event_sample_rate), - .mode = 0644, - .proc_handler = perf_event_max_sample_rate_handler, - .extra1 = SYSCTL_ONE, - }, - { - .procname = "perf_cpu_time_max_percent", - .data = &sysctl_perf_cpu_time_max_percent, - .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), - .mode = 0644, - .proc_handler = perf_cpu_time_max_percent_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "perf_event_max_stack", - .data = &sysctl_perf_event_max_stack, - .maxlen = sizeof(sysctl_perf_event_max_stack), - .mode = 0644, - .proc_handler = perf_event_max_stack_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&six_hundred_forty_kb, - }, - { - .procname = "perf_event_max_contexts_per_stack", - .data = &sysctl_perf_event_max_contexts_per_stack, - .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), - .mode = 0644, - .proc_handler = perf_event_max_stack_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_THOUSAND, - }, -#endif - { - .procname = "panic_on_warn", - .data = &panic_on_warn, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, #ifdef CONFIG_TREE_RCU { .procname = "panic_on_rcu_stall", @@ -2021,215 +1760,9 @@ static const struct ctl_table kern_table[] = { #endif }; -static const struct ctl_table vm_table[] = { - { - .procname = "overcommit_memory", - .data = &sysctl_overcommit_memory, - .maxlen = sizeof(sysctl_overcommit_memory), - .mode = 0644, - .proc_handler = overcommit_policy_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, - { - .procname = "overcommit_ratio", - .data = &sysctl_overcommit_ratio, - .maxlen = sizeof(sysctl_overcommit_ratio), - .mode = 0644, - .proc_handler = overcommit_ratio_handler, - }, - { - .procname = "overcommit_kbytes", - .data = &sysctl_overcommit_kbytes, - .maxlen = sizeof(sysctl_overcommit_kbytes), - .mode = 0644, - .proc_handler = overcommit_kbytes_handler, - }, - { - .procname = "page-cluster", - .data = &page_cluster, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&page_cluster_max, - }, - { - .procname = "dirtytime_expire_seconds", - .data = &dirtytime_expire_interval, - .maxlen = sizeof(dirtytime_expire_interval), - .mode = 0644, - .proc_handler = dirtytime_interval_handler, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "swappiness", - .data = &vm_swappiness, - .maxlen = sizeof(vm_swappiness), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO_HUNDRED, - }, -#ifdef CONFIG_NUMA - { - .procname = "numa_stat", - .data = &sysctl_vm_numa_stat, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_vm_numa_stat_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif - { - .procname = "drop_caches", - .data = &sysctl_drop_caches, - .maxlen = sizeof(int), - .mode = 0200, - .proc_handler = drop_caches_sysctl_handler, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_FOUR, - }, - { - .procname = "page_lock_unfairness", - .data = &sysctl_page_lock_unfairness, - .maxlen = sizeof(sysctl_page_lock_unfairness), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#ifdef CONFIG_MMU - { - .procname = "max_map_count", - .data = &sysctl_max_map_count, - .maxlen = sizeof(sysctl_max_map_count), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#else - { - .procname = "nr_trim_pages", - .data = &sysctl_nr_trim_pages, - .maxlen = sizeof(sysctl_nr_trim_pages), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#endif - { - .procname = "vfs_cache_pressure", - .data = &sysctl_vfs_cache_pressure, - .maxlen = sizeof(sysctl_vfs_cache_pressure), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ - defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) - { - .procname = "legacy_va_layout", - .data = &sysctl_legacy_va_layout, - .maxlen = sizeof(sysctl_legacy_va_layout), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#endif -#ifdef CONFIG_NUMA - { - .procname = "zone_reclaim_mode", - .data = &node_reclaim_mode, - .maxlen = sizeof(node_reclaim_mode), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#endif -#ifdef CONFIG_SMP - { - .procname = "stat_interval", - .data = &sysctl_stat_interval, - .maxlen = sizeof(sysctl_stat_interval), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "stat_refresh", - .data = NULL, - .maxlen = 0, - .mode = 0600, - .proc_handler = vmstat_refresh, - }, -#endif -#ifdef CONFIG_MMU - { - .procname = "mmap_min_addr", - .data = &dac_mmap_min_addr, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = mmap_min_addr_handler, - }, -#endif -#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ - (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) - { - .procname = "vdso_enabled", -#ifdef CONFIG_X86_32 - .data = &vdso32_enabled, - .maxlen = sizeof(vdso32_enabled), -#else - .data = &vdso_enabled, - .maxlen = sizeof(vdso_enabled), -#endif - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = SYSCTL_ZERO, - }, -#endif - { - .procname = "user_reserve_kbytes", - .data = &sysctl_user_reserve_kbytes, - .maxlen = sizeof(sysctl_user_reserve_kbytes), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "admin_reserve_kbytes", - .data = &sysctl_admin_reserve_kbytes, - .maxlen = sizeof(sysctl_admin_reserve_kbytes), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS - { - .procname = "mmap_rnd_bits", - .data = &mmap_rnd_bits, - .maxlen = sizeof(mmap_rnd_bits), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&mmap_rnd_bits_min, - .extra2 = (void *)&mmap_rnd_bits_max, - }, -#endif -#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS - { - .procname = "mmap_rnd_compat_bits", - .data = &mmap_rnd_compat_bits, - .maxlen = sizeof(mmap_rnd_compat_bits), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&mmap_rnd_compat_bits_min, - .extra2 = (void *)&mmap_rnd_compat_bits_max, - }, -#endif -}; - int __init sysctl_init_bases(void) { register_sysctl_init("kernel", kern_table); - register_sysctl_init("vm", vm_table); return 0; } diff --git a/kernel/time/Makefile b/kernel/time/Makefile index fe0ae82124fe..e6e9b85d4db5 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 + +# Branch profiling isn't noinstr-safe +ifdef CONFIG_TRACE_BRANCH_PROFILING +CFLAGS_sched_clock.o += -DDISABLE_BRANCH_PROFILING +endif + obj-y += time.o timer.o hrtimer.o sleep_timeout.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o obj-y += timeconv.o timecounter.o alarmtimer.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 0ddccdff119a..577f0e6842d4 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -70,12 +70,10 @@ static DEFINE_SPINLOCK(rtcdev_lock); */ struct rtc_device *alarmtimer_get_rtcdev(void) { - unsigned long flags; struct rtc_device *ret; - spin_lock_irqsave(&rtcdev_lock, flags); + guard(spinlock_irqsave)(&rtcdev_lock); ret = rtcdev; - spin_unlock_irqrestore(&rtcdev_lock, flags); return ret; } @@ -83,7 +81,6 @@ EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev); static int alarmtimer_rtc_add_device(struct device *dev) { - unsigned long flags; struct rtc_device *rtc = to_rtc_device(dev); struct platform_device *pdev; int ret = 0; @@ -101,25 +98,18 @@ static int alarmtimer_rtc_add_device(struct device *dev) if (!IS_ERR(pdev)) device_init_wakeup(&pdev->dev, true); - spin_lock_irqsave(&rtcdev_lock, flags); - if (!IS_ERR(pdev) && !rtcdev) { - if (!try_module_get(rtc->owner)) { + scoped_guard(spinlock_irqsave, &rtcdev_lock) { + if (!IS_ERR(pdev) && !rtcdev && try_module_get(rtc->owner)) { + rtcdev = rtc; + /* hold a reference so it doesn't go away */ + get_device(dev); + pdev = NULL; + } else { ret = -1; - goto unlock; } - - rtcdev = rtc; - /* hold a reference so it doesn't go away */ - get_device(dev); - pdev = NULL; - } else { - ret = -1; } -unlock: - spin_unlock_irqrestore(&rtcdev_lock, flags); platform_device_unregister(pdev); - return ret; } @@ -198,7 +188,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) struct alarm *alarm = container_of(timer, struct alarm, timer); struct alarm_base *base = &alarm_bases[alarm->type]; - scoped_guard (spinlock_irqsave, &base->lock) + scoped_guard(spinlock_irqsave, &base->lock) alarmtimer_dequeue(base, alarm); if (alarm->function) @@ -228,17 +218,16 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining); static int alarmtimer_suspend(struct device *dev) { ktime_t min, now, expires; - int i, ret, type; struct rtc_device *rtc; - unsigned long flags; struct rtc_time tm; + int i, ret, type; - spin_lock_irqsave(&freezer_delta_lock, flags); - min = freezer_delta; - expires = freezer_expires; - type = freezer_alarmtype; - freezer_delta = 0; - spin_unlock_irqrestore(&freezer_delta_lock, flags); + scoped_guard(spinlock_irqsave, &freezer_delta_lock) { + min = freezer_delta; + expires = freezer_expires; + type = freezer_alarmtype; + freezer_delta = 0; + } rtc = alarmtimer_get_rtcdev(); /* If we have no rtcdev, just return */ @@ -251,9 +240,8 @@ static int alarmtimer_suspend(struct device *dev) struct timerqueue_node *next; ktime_t delta; - spin_lock_irqsave(&base->lock, flags); - next = timerqueue_getnext(&base->timerqueue); - spin_unlock_irqrestore(&base->lock, flags); + scoped_guard(spinlock_irqsave, &base->lock) + next = timerqueue_getnext(&base->timerqueue); if (!next) continue; delta = ktime_sub(next->expires, base->get_ktime()); @@ -352,13 +340,12 @@ EXPORT_SYMBOL_GPL(alarm_init); void alarm_start(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; - unsigned long flags; - spin_lock_irqsave(&base->lock, flags); - alarm->node.expires = start; - alarmtimer_enqueue(base, alarm); - hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); - spin_unlock_irqrestore(&base->lock, flags); + scoped_guard(spinlock_irqsave, &base->lock) { + alarm->node.expires = start; + alarmtimer_enqueue(base, alarm); + hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); + } trace_alarmtimer_start(alarm, base->get_ktime()); } @@ -381,13 +368,11 @@ EXPORT_SYMBOL_GPL(alarm_start_relative); void alarm_restart(struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; - unsigned long flags; - spin_lock_irqsave(&base->lock, flags); + guard(spinlock_irqsave)(&base->lock); hrtimer_set_expires(&alarm->timer, alarm->node.expires); hrtimer_restart(&alarm->timer); alarmtimer_enqueue(base, alarm); - spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(alarm_restart); @@ -401,14 +386,13 @@ EXPORT_SYMBOL_GPL(alarm_restart); int alarm_try_to_cancel(struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; - unsigned long flags; int ret; - spin_lock_irqsave(&base->lock, flags); - ret = hrtimer_try_to_cancel(&alarm->timer); - if (ret >= 0) - alarmtimer_dequeue(base, alarm); - spin_unlock_irqrestore(&base->lock, flags); + scoped_guard(spinlock_irqsave, &base->lock) { + ret = hrtimer_try_to_cancel(&alarm->timer); + if (ret >= 0) + alarmtimer_dequeue(base, alarm); + } trace_alarmtimer_cancel(alarm, base->get_ktime()); return ret; @@ -479,7 +463,6 @@ EXPORT_SYMBOL_GPL(alarm_forward_now); static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) { struct alarm_base *base; - unsigned long flags; ktime_t delta; switch(type) { @@ -498,13 +481,12 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) delta = ktime_sub(absexp, base->get_ktime()); - spin_lock_irqsave(&freezer_delta_lock, flags); + guard(spinlock_irqsave)(&freezer_delta_lock); if (!freezer_delta || (delta < freezer_delta)) { freezer_delta = delta; freezer_expires = absexp; freezer_alarmtype = type; } - spin_unlock_irqrestore(&freezer_delta_lock, flags); } /** @@ -515,9 +497,9 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) { if (clockid == CLOCK_REALTIME_ALARM) return ALARM_REALTIME; - if (clockid == CLOCK_BOOTTIME_ALARM) - return ALARM_BOOTTIME; - return -1; + + WARN_ON_ONCE(clockid != CLOCK_BOOTTIME_ALARM); + return ALARM_BOOTTIME; } /** diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 2a7802ec480c..6a8bc7da9062 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -310,7 +310,7 @@ static void clocksource_verify_choose_cpus(void) { int cpu, i, n = verify_n_cpus; - if (n < 0) { + if (n < 0 || n >= num_online_cpus()) { /* Check all of the CPUs. */ cpumask_copy(&cpus_chosen, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), &cpus_chosen); @@ -619,7 +619,7 @@ static inline void clocksource_stop_watchdog(void) { if (!watchdog_running || (watchdog && !list_empty(&watchdog_list))) return; - del_timer(&watchdog_timer); + timer_delete(&watchdog_timer); watchdog_running = 0; } @@ -1510,7 +1510,7 @@ static int __init boot_override_clocksource(char* str) { mutex_lock(&clocksource_mutex); if (str) - strscpy(override_name, str, sizeof(override_name)); + strscpy(override_name, str); mutex_unlock(&clocksource_mutex); return 1; } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index deb1aa32814e..30899a8cc52c 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -117,16 +117,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .csd = CSD_INIT(retrigger_next_event, NULL) }; -static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { - /* Make sure we catch unsupported clockids */ - [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, - - [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, - [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, - [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, - [CLOCK_TAI] = HRTIMER_BASE_TAI, -}; - static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) { if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) @@ -376,7 +366,7 @@ static const struct debug_obj_descr hrtimer_debug_descr; static void *hrtimer_debug_hint(void *addr) { - return ((struct hrtimer *) addr)->function; + return ACCESS_PRIVATE((struct hrtimer *)addr, function); } /* @@ -475,19 +465,17 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer, static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif -static inline void -debug_init(struct hrtimer *timer, clockid_t clockid, - enum hrtimer_mode mode) +static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init(timer); - trace_hrtimer_init(timer, clockid, mode); + trace_hrtimer_setup(timer, clockid, mode); } -static inline void debug_init_on_stack(struct hrtimer *timer, clockid_t clockid, - enum hrtimer_mode mode) +static inline void debug_setup_on_stack(struct hrtimer *timer, clockid_t clockid, + enum hrtimer_mode mode) { debug_hrtimer_init_on_stack(timer); - trace_hrtimer_init(timer, clockid, mode); + trace_hrtimer_setup(timer, clockid, mode); } static inline void debug_activate(struct hrtimer *timer, @@ -1326,8 +1314,6 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, struct hrtimer_clock_base *base; unsigned long flags; - if (WARN_ON_ONCE(!timer->function)) - return; /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard @@ -1439,7 +1425,7 @@ static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) * running. * * This prevents priority inversion: if the soft irq thread is preempted - * in the middle of a timer callback, then calling del_timer_sync() can + * in the middle of a timer callback, then calling hrtimer_cancel() can * lead to two issues: * * - If the caller is on a remote CPU then it has to spin wait for the timer @@ -1587,23 +1573,24 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude) static inline int hrtimer_clockid_to_base(clockid_t clock_id) { - if (likely(clock_id < MAX_CLOCKS)) { - int base = hrtimer_clock_to_base_table[clock_id]; - - if (likely(base != HRTIMER_MAX_CLOCK_BASES)) - return base; + switch (clock_id) { + case CLOCK_REALTIME: + return HRTIMER_BASE_REALTIME; + case CLOCK_MONOTONIC: + return HRTIMER_BASE_MONOTONIC; + case CLOCK_BOOTTIME: + return HRTIMER_BASE_BOOTTIME; + case CLOCK_TAI: + return HRTIMER_BASE_TAI; + default: + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return HRTIMER_BASE_MONOTONIC; } - WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); - return HRTIMER_BASE_MONOTONIC; -} - -static enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused) -{ - return HRTIMER_NORESTART; } -static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) +static void __hrtimer_setup(struct hrtimer *timer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode) { bool softtimer = !!(mode & HRTIMER_MODE_SOFT); struct hrtimer_cpu_base *cpu_base; @@ -1636,41 +1623,14 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); -} - -static void __hrtimer_setup(struct hrtimer *timer, - enum hrtimer_restart (*function)(struct hrtimer *), - clockid_t clock_id, enum hrtimer_mode mode) -{ - __hrtimer_init(timer, clock_id, mode); if (WARN_ON_ONCE(!function)) - timer->function = hrtimer_dummy_timeout; + ACCESS_PRIVATE(timer, function) = hrtimer_dummy_timeout; else - timer->function = function; + ACCESS_PRIVATE(timer, function) = function; } /** - * hrtimer_init - initialize a timer to the given clock - * @timer: the timer to be initialized - * @clock_id: the clock to be used - * @mode: The modes which are relevant for initialization: - * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, - * HRTIMER_MODE_REL_SOFT - * - * The PINNED variants of the above can be handed in, - * but the PINNED bit is ignored as pinning happens - * when the hrtimer is started - */ -void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) -{ - debug_init(timer, clock_id, mode); - __hrtimer_init(timer, clock_id, mode); -} -EXPORT_SYMBOL_GPL(hrtimer_init); - -/** * hrtimer_setup - initialize a timer to the given clock * @timer: the timer to be initialized * @function: the callback function @@ -1686,7 +1646,7 @@ EXPORT_SYMBOL_GPL(hrtimer_init); void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { - debug_init(timer, clock_id, mode); + debug_setup(timer, clock_id, mode); __hrtimer_setup(timer, function, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup); @@ -1705,7 +1665,7 @@ void hrtimer_setup_on_stack(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { - debug_init_on_stack(timer, clock_id, mode); + debug_setup_on_stack(timer, clock_id, mode); __hrtimer_setup(timer, function, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack); @@ -1779,7 +1739,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, raw_write_seqcount_barrier(&base->seq); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); - fn = timer->function; + fn = ACCESS_PRIVATE(timer, function); /* * Clear the 'is relative' flag for the TIME_LOW_RES case. If the @@ -2054,7 +2014,7 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, * Make the enqueue delivery mode check work on RT. If the sleeper * was initialized for hard interrupt delivery, force the mode bit. * This is a special case for hrtimer_sleepers because - * __hrtimer_init_sleeper() determines the delivery mode on RT so the + * __hrtimer_setup_sleeper() determines the delivery mode on RT so the * fiddling with this decision is avoided at the call sites. */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) @@ -2064,8 +2024,8 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, } EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); -static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, - clockid_t clock_id, enum hrtimer_mode mode) +static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl, + clockid_t clock_id, enum hrtimer_mode mode) { /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly @@ -2091,8 +2051,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, mode |= HRTIMER_MODE_HARD; } - __hrtimer_init(&sl->timer, clock_id, mode); - sl->timer.function = hrtimer_wakeup; + __hrtimer_setup(&sl->timer, hrtimer_wakeup, clock_id, mode); sl->task = current; } @@ -2105,8 +2064,8 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { - debug_init_on_stack(&sl->timer, clock_id, mode); - __hrtimer_init_sleeper(sl, clock_id, mode); + debug_setup_on_stack(&sl->timer, clock_id, mode); + __hrtimer_setup_sleeper(sl, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index bc4db9e5ab70..34eeacac2253 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -75,13 +75,11 @@ struct clocksource * __init __weak clocksource_default_clock(void) static struct clocksource refined_jiffies; -int register_refined_jiffies(long cycles_per_second) +void __init register_refined_jiffies(long cycles_per_second) { u64 nsec_per_tick, shift_hz; long cycles_per_tick; - - refined_jiffies = clocksource_jiffies; refined_jiffies.name = "refined-jiffies"; refined_jiffies.rating++; @@ -100,5 +98,4 @@ int register_refined_jiffies(long cycles_per_second) refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; __clocksource_register(&refined_jiffies); - return 0; } diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 0775b9ec952a..e3642278df43 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -165,26 +165,26 @@ static struct timens_offset offset_from_ts(struct timespec64 off) * HVCLOCK * VVAR * - * The check for vdso_data->clock_mode is in the unlikely path of + * The check for vdso_clock->clock_mode is in the unlikely path of * the seq begin magic. So for the non-timens case most of the time * 'seq' is even, so the branch is not taken. * * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check - * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the + * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the * update to finish and for 'seq' to become even anyway. * - * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which + * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which * enforces the time namespace handling path. */ -static void timens_setup_vdso_data(struct vdso_data *vdata, - struct time_namespace *ns) +static void timens_setup_vdso_clock_data(struct vdso_clock *vc, + struct time_namespace *ns) { - struct timens_offset *offset = vdata->offset; + struct timens_offset *offset = vc->offset; struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); - vdata->seq = 1; - vdata->clock_mode = VDSO_CLOCKMODE_TIMENS; + vc->seq = 1; + vc->clock_mode = VDSO_CLOCKMODE_TIMENS; offset[CLOCK_MONOTONIC] = monotonic; offset[CLOCK_MONOTONIC_RAW] = monotonic; offset[CLOCK_MONOTONIC_COARSE] = monotonic; @@ -219,7 +219,8 @@ static DEFINE_MUTEX(offset_lock); static void timens_set_vvar_page(struct task_struct *task, struct time_namespace *ns) { - struct vdso_data *vdata; + struct vdso_time_data *vdata; + struct vdso_clock *vc; unsigned int i; if (ns == &init_time_ns) @@ -235,10 +236,11 @@ static void timens_set_vvar_page(struct task_struct *task, goto out; ns->frozen_offsets = true; - vdata = arch_get_vdso_data(page_address(ns->vvar_page)); + vdata = page_address(ns->vvar_page); + vc = vdata->clock_data; for (i = 0; i < CS_BASES; i++) - timens_setup_vdso_data(&vdata[i], ns); + timens_setup_vdso_clock_data(&vc[i], ns); out: mutex_unlock(&offset_lock); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 163e7a2033b6..b837d3d9d325 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -678,8 +678,7 @@ void ntp_notify_cmos_timer(bool offset_set) static void __init ntp_init_cmos_sync(void) { - hrtimer_init(&sync_hrtimer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - sync_hrtimer.function = sync_timer_callback; + hrtimer_setup(&sync_hrtimer, sync_timer_callback, CLOCK_REALTIME, HRTIMER_MODE_ABS); } #else /* CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */ static inline void __init ntp_init_cmos_sync(void) { } diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 1af0bb2cc45c..101a0f7c43e0 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -90,26 +90,6 @@ static long posix_clock_ioctl(struct file *fp, return err; } -#ifdef CONFIG_COMPAT -static long posix_clock_compat_ioctl(struct file *fp, - unsigned int cmd, unsigned long arg) -{ - struct posix_clock_context *pccontext = fp->private_data; - struct posix_clock *clk = get_posix_clock(fp); - int err = -ENOTTY; - - if (!clk) - return -ENODEV; - - if (clk->ops.ioctl) - err = clk->ops.ioctl(pccontext, cmd, arg); - - put_posix_clock(clk); - - return err; -} -#endif - static int posix_clock_open(struct inode *inode, struct file *fp) { int err; @@ -129,6 +109,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp) goto out; } pccontext->clk = clk; + pccontext->fp = fp; if (clk->ops.open) { err = clk->ops.open(pccontext, fp->f_mode); if (err) { @@ -171,11 +152,9 @@ static const struct file_operations posix_clock_file_operations = { .read = posix_clock_read, .poll = posix_clock_poll, .unlocked_ioctl = posix_clock_ioctl, + .compat_ioctl = posix_clock_ioctl, .open = posix_clock_open, .release = posix_clock_release, -#ifdef CONFIG_COMPAT - .compat_ioctl = posix_clock_compat_ioctl, -#endif }; int posix_clock_register(struct posix_clock *clk, struct device *dev) @@ -251,7 +230,7 @@ static int pc_clock_adjtime(clockid_t id, struct __kernel_timex *tx) if (err) return err; - if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + if (tx->modes && (cd.fp->f_mode & FMODE_WRITE) == 0) { err = -EACCES; goto out; } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 50e8d04ab661..2e5b89d7d866 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1406,6 +1406,15 @@ void run_posix_cpu_timers(void) lockdep_assert_irqs_disabled(); /* + * Ensure that release_task(tsk) can't happen while + * handle_posix_cpu_timers() is running. Otherwise, a concurrent + * posix_cpu_timer_del() may fail to lock_task_sighand(tsk) and + * miss timer->it.cpu.firing != 0. + */ + if (tsk->exit_state) + return; + + /* * If the actual expiry is deferred to task work context and the * work is already scheduled there is no point to do anything here. */ diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 1b675aee99a9..2053b1a4c9e4 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -9,34 +9,27 @@ * * These are all the functions necessary to implement POSIX clocks & timers */ -#include <linux/mm.h> +#include <linux/compat.h> +#include <linux/compiler.h> +#include <linux/init.h> +#include <linux/jhash.h> #include <linux/interrupt.h> -#include <linux/slab.h> -#include <linux/time.h> -#include <linux/mutex.h> -#include <linux/sched/task.h> - -#include <linux/uaccess.h> #include <linux/list.h> -#include <linux/init.h> -#include <linux/compiler.h> -#include <linux/hash.h> +#include <linux/memblock.h> +#include <linux/nospec.h> #include <linux/posix-clock.h> #include <linux/posix-timers.h> +#include <linux/prctl.h> +#include <linux/sched/task.h> +#include <linux/slab.h> #include <linux/syscalls.h> -#include <linux/wait.h> -#include <linux/workqueue.h> -#include <linux/export.h> -#include <linux/hashtable.h> -#include <linux/compat.h> -#include <linux/nospec.h> +#include <linux/time.h> #include <linux/time_namespace.h> +#include <linux/uaccess.h> #include "timekeeping.h" #include "posix-timers.h" -static struct kmem_cache *posix_timers_cache; - /* * Timers are managed in a hash table for lockless lookup. The hash key is * constructed from current::signal and the timer ID and the timer is @@ -46,39 +39,67 @@ static struct kmem_cache *posix_timers_cache; * This allows checkpoint/restore to reconstruct the exact timer IDs for * a process. */ -static DEFINE_HASHTABLE(posix_timers_hashtable, 9); -static DEFINE_SPINLOCK(hash_lock); +struct timer_hash_bucket { + spinlock_t lock; + struct hlist_head head; +}; + +static struct { + struct timer_hash_bucket *buckets; + unsigned long mask; + struct kmem_cache *cache; +} __timer_data __ro_after_init __aligned(4*sizeof(long)); + +#define timer_buckets (__timer_data.buckets) +#define timer_hashmask (__timer_data.mask) +#define posix_timers_cache (__timer_data.cache) static const struct k_clock * const posix_clocks[]; static const struct k_clock *clockid_to_kclock(const clockid_t id); static const struct k_clock clock_realtime, clock_monotonic; +#define TIMER_ANY_ID INT_MIN + /* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */ #if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" #endif -static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); +static struct k_itimer *__lock_timer(timer_t timer_id); -#define lock_timer(tid, flags) \ -({ struct k_itimer *__timr; \ - __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ - __timr; \ +#define lock_timer(tid) \ +({ struct k_itimer *__timr; \ + __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid)); \ + __timr; \ }) -static int hash(struct signal_struct *sig, unsigned int nr) +static inline void unlock_timer(struct k_itimer *timr) { - return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable)); + if (likely((timr))) + spin_unlock_irq(&timr->it_lock); } -static struct k_itimer *__posix_timers_find(struct hlist_head *head, - struct signal_struct *sig, - timer_t id) +#define scoped_timer_get_or_fail(_id) \ + scoped_cond_guard(lock_timer, return -EINVAL, _id) + +#define scoped_timer (scope) + +DEFINE_CLASS(lock_timer, struct k_itimer *, unlock_timer(_T), __lock_timer(id), timer_t id); +DEFINE_CLASS_IS_COND_GUARD(lock_timer); + +static struct timer_hash_bucket *hash_bucket(struct signal_struct *sig, unsigned int nr) { + return &timer_buckets[jhash2((u32 *)&sig, sizeof(sig) / sizeof(u32), nr) & timer_hashmask]; +} + +static struct k_itimer *posix_timer_by_id(timer_t id) +{ + struct signal_struct *sig = current->signal; + struct timer_hash_bucket *bucket = hash_bucket(sig, id); struct k_itimer *timer; - hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&hash_lock)) { + hlist_for_each_entry_rcu(timer, &bucket->head, t_hash) { /* timer->it_signal can be set concurrently */ if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id)) return timer; @@ -86,46 +107,88 @@ static struct k_itimer *__posix_timers_find(struct hlist_head *head, return NULL; } -static struct k_itimer *posix_timer_by_id(timer_t id) +static inline struct signal_struct *posix_sig_owner(const struct k_itimer *timer) { - struct signal_struct *sig = current->signal; - struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)]; + unsigned long val = (unsigned long)timer->it_signal; - return __posix_timers_find(head, sig, id); + /* + * Mask out bit 0, which acts as invalid marker to prevent + * posix_timer_by_id() detecting it as valid. + */ + return (struct signal_struct *)(val & ~1UL); } -static int posix_timer_add(struct k_itimer *timer) +static bool posix_timer_hashed(struct timer_hash_bucket *bucket, struct signal_struct *sig, + timer_t id) { - struct signal_struct *sig = current->signal; - struct hlist_head *head; - unsigned int cnt, id; + struct hlist_head *head = &bucket->head; + struct k_itimer *timer; - /* - * FIXME: Replace this by a per signal struct xarray once there is - * a plan to handle the resulting CRIU regression gracefully. - */ - for (cnt = 0; cnt <= INT_MAX; cnt++) { - spin_lock(&hash_lock); - id = sig->next_posix_timer_id; + hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&bucket->lock)) { + if ((posix_sig_owner(timer) == sig) && (timer->it_id == id)) + return true; + } + return false; +} - /* Write the next ID back. Clamp it to the positive space */ - sig->next_posix_timer_id = (id + 1) & INT_MAX; +static bool posix_timer_add_at(struct k_itimer *timer, struct signal_struct *sig, unsigned int id) +{ + struct timer_hash_bucket *bucket = hash_bucket(sig, id); - head = &posix_timers_hashtable[hash(sig, id)]; - if (!__posix_timers_find(head, sig, id)) { - hlist_add_head_rcu(&timer->t_hash, head); - spin_unlock(&hash_lock); - return id; + scoped_guard (spinlock, &bucket->lock) { + /* + * Validate under the lock as this could have raced against + * another thread ending up with the same ID, which is + * highly unlikely, but possible. + */ + if (!posix_timer_hashed(bucket, sig, id)) { + /* + * Set the timer ID and the signal pointer to make + * it identifiable in the hash table. The signal + * pointer has bit 0 set to indicate that it is not + * yet fully initialized. posix_timer_hashed() + * masks this bit out, but the syscall lookup fails + * to match due to it being set. This guarantees + * that there can't be duplicate timer IDs handed + * out. + */ + timer->it_id = (timer_t)id; + timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL); + hlist_add_head_rcu(&timer->t_hash, &bucket->head); + return true; } - spin_unlock(&hash_lock); } - /* POSIX return code when no timer ID could be allocated */ - return -EAGAIN; + return false; } -static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) +static int posix_timer_add(struct k_itimer *timer, int req_id) { - spin_unlock_irqrestore(&timr->it_lock, flags); + struct signal_struct *sig = current->signal; + + if (unlikely(req_id != TIMER_ANY_ID)) { + if (!posix_timer_add_at(timer, sig, req_id)) + return -EBUSY; + + /* + * Move the ID counter past the requested ID, so that after + * switching back to normal mode the IDs are outside of the + * exact allocated region. That avoids ID collisions on the + * next regular timer_create() invocations. + */ + atomic_set(&sig->next_posix_timer_id, req_id + 1); + return req_id; + } + + for (unsigned int cnt = 0; cnt <= INT_MAX; cnt++) { + /* Get the next timer ID and clamp it to positive space */ + unsigned int id = atomic_fetch_inc(&sig->next_posix_timer_id) & INT_MAX; + + if (posix_timer_add_at(timer, sig, id)) + return id; + cond_resched(); + } + /* POSIX return code when no timer ID could be allocated */ + return -EAGAIN; } static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) @@ -220,15 +283,6 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) return 0; } -static __init int init_posix_timers(void) -{ - posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof(struct k_itimer), 0, - SLAB_PANIC | SLAB_ACCOUNT, NULL); - return 0; -} -__initcall(init_posix_timers); - /* * The siginfo si_overrun field and the return value of timer_getoverrun(2) * are of type int. Clamp the overrun value to INT_MAX @@ -259,7 +313,7 @@ static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_it * since the signal was queued. In either case, don't rearm and * drop the signal. */ - if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!timr->it_signal)) + if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!posixtimer_valid(timr))) return false; if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING)) @@ -304,6 +358,9 @@ void posix_timer_queue_signal(struct k_itimer *timr) { lockdep_assert_held(&timr->it_lock); + if (!posixtimer_valid(timr)) + return; + timr->it_status = timr->it_interval ? POSIX_TIMER_REQUEUE_PENDING : POSIX_TIMER_DISARMED; posixtimer_send_sigqueue(timr); } @@ -324,6 +381,21 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } +long posixtimer_create_prctl(unsigned long ctrl) +{ + switch (ctrl) { + case PR_TIMER_CREATE_RESTORE_IDS_OFF: + current->signal->timer_create_restore_ids = 0; + return 0; + case PR_TIMER_CREATE_RESTORE_IDS_ON: + current->signal->timer_create_restore_ids = 1; + return 0; + case PR_TIMER_CREATE_RESTORE_IDS_GET: + return current->signal->timer_create_restore_ids; + } + return -EINVAL; +} + static struct pid *good_sigevent(sigevent_t * event) { struct pid *pid = task_tgid(current); @@ -350,8 +422,12 @@ static struct pid *good_sigevent(sigevent_t * event) static struct k_itimer *alloc_posix_timer(void) { - struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); + struct k_itimer *tmr; + if (unlikely(!posix_timers_cache)) + return NULL; + + tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); if (!tmr) return tmr; @@ -373,15 +449,16 @@ void posixtimer_free_timer(struct k_itimer *tmr) static void posix_timer_unhash_and_free(struct k_itimer *tmr) { - spin_lock(&hash_lock); - hlist_del_rcu(&tmr->t_hash); - spin_unlock(&hash_lock); + struct timer_hash_bucket *bucket = hash_bucket(posix_sig_owner(tmr), tmr->it_id); + + scoped_guard (spinlock, &bucket->lock) + hlist_del_rcu(&tmr->t_hash); posixtimer_putref(tmr); } static int common_timer_create(struct k_itimer *new_timer) { - hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); + hrtimer_setup(&new_timer->it.real.timer, posix_timer_fn, new_timer->it_clock, 0); return 0; } @@ -390,6 +467,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, timer_t __user *created_timer_id) { const struct k_clock *kc = clockid_to_kclock(which_clock); + timer_t req_id = TIMER_ANY_ID; struct k_itimer *new_timer; int error, new_timer_id; @@ -404,26 +482,32 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, spin_lock_init(&new_timer->it_lock); + /* Special case for CRIU to restore timers with a given timer ID. */ + if (unlikely(current->signal->timer_create_restore_ids)) { + if (copy_from_user(&req_id, created_timer_id, sizeof(req_id))) + return -EFAULT; + /* Valid IDs are 0..INT_MAX */ + if ((unsigned int)req_id > INT_MAX) + return -EINVAL; + } + /* * Add the timer to the hash table. The timer is not yet valid - * because new_timer::it_signal is still NULL. The timer id is also - * not yet visible to user space. + * after insertion, but has a unique ID allocated. */ - new_timer_id = posix_timer_add(new_timer); + new_timer_id = posix_timer_add(new_timer, req_id); if (new_timer_id < 0) { posixtimer_free_timer(new_timer); return new_timer_id; } - new_timer->it_id = (timer_t) new_timer_id; new_timer->it_clock = which_clock; new_timer->kclock = kc; new_timer->it_overrun = -1LL; if (event) { - rcu_read_lock(); - new_timer->it_pid = get_pid(good_sigevent(event)); - rcu_read_unlock(); + scoped_guard (rcu) + new_timer->it_pid = get_pid(good_sigevent(event)); if (!new_timer->it_pid) { error = -EINVAL; goto out; @@ -434,7 +518,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, } else { new_timer->it_sigev_notify = SIGEV_SIGNAL; new_timer->sigq.info.si_signo = SIGALRM; - memset(&new_timer->sigq.info.si_value, 0, sizeof(sigval_t)); new_timer->sigq.info.si_value.sival_int = new_timer->it_id; new_timer->it_pid = get_pid(task_tgid(current)); } @@ -453,7 +536,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, } /* * After succesful copy out, the timer ID is visible to user space - * now but not yet valid because new_timer::signal is still NULL. + * now but not yet valid because new_timer::signal low order bit is 1. * * Complete the initialization with the clock specific create * callback. @@ -462,14 +545,25 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, if (error) goto out; - spin_lock_irq(¤t->sighand->siglock); - /* This makes the timer valid in the hash table */ - WRITE_ONCE(new_timer->it_signal, current->signal); - hlist_add_head(&new_timer->list, ¤t->signal->posix_timers); - spin_unlock_irq(¤t->sighand->siglock); /* - * After unlocking sighand::siglock @new_timer is subject to - * concurrent removal and cannot be touched anymore + * timer::it_lock ensures that __lock_timer() observes a fully + * initialized timer when it observes a valid timer::it_signal. + * + * sighand::siglock is required to protect signal::posix_timers. + */ + scoped_guard (spinlock_irq, &new_timer->it_lock) { + guard(spinlock)(¤t->sighand->siglock); + /* + * new_timer::it_signal contains the signal pointer with + * bit 0 set, which makes it invalid for syscall operations. + * Store the unmodified signal pointer to make it valid. + */ + WRITE_ONCE(new_timer->it_signal, current->signal); + hlist_add_head_rcu(&new_timer->list, ¤t->signal->posix_timers); + } + /* + * After unlocking @new_timer is subject to concurrent removal and + * cannot be touched anymore */ return 0; out: @@ -507,7 +601,7 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, } #endif -static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) +static struct k_itimer *__lock_timer(timer_t timer_id) { struct k_itimer *timr; @@ -522,11 +616,11 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * The hash lookup and the timers are RCU protected. * * Timers are added to the hash in invalid state where - * timr::it_signal == NULL. timer::it_signal is only set after the - * rest of the initialization succeeded. + * timr::it_signal is marked invalid. timer::it_signal is only set + * after the rest of the initialization succeeded. * * Timer destruction happens in steps: - * 1) Set timr::it_signal to NULL with timr::it_lock held + * 1) Set timr::it_signal marked invalid with timr::it_lock held * 2) Release timr::it_lock * 3) Remove from the hash under hash_lock * 4) Put the reference count. @@ -543,25 +637,21 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * * The lookup validates locklessly that timr::it_signal == * current::it_signal and timr::it_id == @timer_id. timr::it_id - * can't change, but timr::it_signal becomes NULL during - * destruction. + * can't change, but timr::it_signal can become invalid during + * destruction, which makes the locked check fail. */ - rcu_read_lock(); + guard(rcu)(); timr = posix_timer_by_id(timer_id); if (timr) { - spin_lock_irqsave(&timr->it_lock, *flags); + spin_lock_irq(&timr->it_lock); /* * Validate under timr::it_lock that timr::it_signal is * still valid. Pairs with #1 above. */ - if (timr->it_signal == current->signal) { - rcu_read_unlock(); + if (timr->it_signal == current->signal) return timr; - } - spin_unlock_irqrestore(&timr->it_lock, *flags); + spin_unlock_irq(&timr->it_lock); } - rcu_read_unlock(); - return NULL; } @@ -652,24 +742,10 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) { - const struct k_clock *kc; - struct k_itimer *timr; - unsigned long flags; - int ret = 0; - - timr = lock_timer(timer_id, &flags); - if (!timr) - return -EINVAL; - memset(setting, 0, sizeof(*setting)); - kc = timr->kclock; - if (WARN_ON_ONCE(!kc || !kc->timer_get)) - ret = -EINVAL; - else - kc->timer_get(timr, setting); - - unlock_timer(timr, flags); - return ret; + scoped_timer_get_or_fail(timer_id) + scoped_timer->kclock->timer_get(scoped_timer, setting); + return 0; } /* Get the time remaining on a POSIX.1b interval timer. */ @@ -723,18 +799,8 @@ SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id, */ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) { - struct k_itimer *timr; - unsigned long flags; - int overrun; - - timr = lock_timer(timer_id, &flags); - if (!timr) - return -EINVAL; - - overrun = timer_overrun_to_int(timr); - unlock_timer(timr, flags); - - return overrun; + scoped_timer_get_or_fail(timer_id) + return timer_overrun_to_int(scoped_timer); } static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, @@ -747,7 +813,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, /* * Posix magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they become CLOCK_MONOTONIC based under the - * hood. See hrtimer_init(). Update timr->kclock, so the generic + * hood. See hrtimer_setup(). Update timr->kclock, so the generic * functions which use timr->kclock->clock_get_*() work. * * Note: it_clock stays unmodified, because the next timer_set() might @@ -756,8 +822,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, if (timr->it_clock == CLOCK_REALTIME) timr->kclock = absolute ? &clock_realtime : &clock_monotonic; - hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); - timr->it.real.timer.function = posix_timer_fn; + hrtimer_setup(&timr->it.real.timer, posix_timer_fn, timr->it_clock, mode); if (!absolute) expires = ktime_add_safe(expires, timer->base->get_time()); @@ -791,26 +856,13 @@ static void common_timer_wait_running(struct k_itimer *timer) * when the task which tries to delete or disarm the timer has preempted * the task which runs the expiry in task work context. */ -static struct k_itimer *timer_wait_running(struct k_itimer *timer, - unsigned long *flags) +static void timer_wait_running(struct k_itimer *timer) { - const struct k_clock *kc = READ_ONCE(timer->kclock); - timer_t timer_id = READ_ONCE(timer->it_id); - - /* Prevent kfree(timer) after dropping the lock */ - rcu_read_lock(); - unlock_timer(timer, *flags); - /* * kc->timer_wait_running() might drop RCU lock. So @timer * cannot be touched anymore after the function returns! */ - if (!WARN_ON_ONCE(!kc->timer_wait_running)) - kc->timer_wait_running(timer); - - rcu_read_unlock(); - /* Relock the timer. It might be not longer hashed. */ - return lock_timer(timer_id, flags); + timer->kclock->timer_wait_running(timer); } /* @@ -865,15 +917,9 @@ int common_timer_set(struct k_itimer *timr, int flags, return 0; } -static int do_timer_settime(timer_t timer_id, int tmr_flags, - struct itimerspec64 *new_spec64, +static int do_timer_settime(timer_t timer_id, int tmr_flags, struct itimerspec64 *new_spec64, struct itimerspec64 *old_spec64) { - const struct k_clock *kc; - struct k_itimer *timr; - unsigned long flags; - int error; - if (!timespec64_valid(&new_spec64->it_interval) || !timespec64_valid(&new_spec64->it_value)) return -EINVAL; @@ -881,33 +927,28 @@ static int do_timer_settime(timer_t timer_id, int tmr_flags, if (old_spec64) memset(old_spec64, 0, sizeof(*old_spec64)); - timr = lock_timer(timer_id, &flags); -retry: - if (!timr) - return -EINVAL; + for (; ; old_spec64 = NULL) { + struct k_itimer *timr; - if (old_spec64) - old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); + scoped_timer_get_or_fail(timer_id) { + timr = scoped_timer; - /* Prevent signal delivery and rearming. */ - timr->it_signal_seq++; + if (old_spec64) + old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); - kc = timr->kclock; - if (WARN_ON_ONCE(!kc || !kc->timer_set)) - error = -EINVAL; - else - error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64); - - if (error == TIMER_RETRY) { - // We already got the old time... - old_spec64 = NULL; - /* Unlocks and relocks the timer if it still exists */ - timr = timer_wait_running(timr, &flags); - goto retry; - } - unlock_timer(timr, flags); + /* Prevent signal delivery and rearming. */ + timr->it_signal_seq++; - return error; + int ret = timr->kclock->timer_set(timr, tmr_flags, new_spec64, old_spec64); + if (ret != TIMER_RETRY) + return ret; + + /* Protect the timer from being freed when leaving the lock scope */ + rcu_read_lock(); + } + timer_wait_running(timr); + rcu_read_unlock(); + } } /* Set a POSIX.1b interval timer */ @@ -978,110 +1019,58 @@ static inline void posix_timer_cleanup_ignored(struct k_itimer *tmr) } } -static inline int timer_delete_hook(struct k_itimer *timer) +static void posix_timer_delete(struct k_itimer *timer) { - const struct k_clock *kc = timer->kclock; - - /* Prevent signal delivery and rearming. */ + /* + * Invalidate the timer, remove it from the linked list and remove + * it from the ignored list if pending. + * + * The invalidation must be written with siglock held so that the + * signal code observes the invalidated timer::it_signal in + * do_sigaction(), which prevents it from moving a pending signal + * of a deleted timer to the ignore list. + * + * The invalidation also prevents signal queueing, signal delivery + * and therefore rearming from the signal delivery path. + * + * A concurrent lookup can still find the timer in the hash, but it + * will check timer::it_signal with timer::it_lock held and observe + * bit 0 set, which invalidates it. That also prevents the timer ID + * from being handed out before this timer is completely gone. + */ timer->it_signal_seq++; - if (WARN_ON_ONCE(!kc || !kc->timer_del)) - return -EINVAL; - return kc->timer_del(timer); + scoped_guard (spinlock, ¤t->sighand->siglock) { + unsigned long sig = (unsigned long)timer->it_signal | 1UL; + + WRITE_ONCE(timer->it_signal, (struct signal_struct *)sig); + hlist_del_rcu(&timer->list); + posix_timer_cleanup_ignored(timer); + } + + while (timer->kclock->timer_del(timer) == TIMER_RETRY) { + guard(rcu)(); + spin_unlock_irq(&timer->it_lock); + timer_wait_running(timer); + spin_lock_irq(&timer->it_lock); + } } /* Delete a POSIX.1b interval timer. */ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) { struct k_itimer *timer; - unsigned long flags; - timer = lock_timer(timer_id, &flags); - -retry_delete: - if (!timer) - return -EINVAL; - - if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) { - /* Unlocks and relocks the timer if it still exists */ - timer = timer_wait_running(timer, &flags); - goto retry_delete; + scoped_timer_get_or_fail(timer_id) { + timer = scoped_timer; + posix_timer_delete(timer); } - - spin_lock(¤t->sighand->siglock); - hlist_del(&timer->list); - posix_timer_cleanup_ignored(timer); - /* - * A concurrent lookup could check timer::it_signal lockless. It - * will reevaluate with timer::it_lock held and observe the NULL. - * - * It must be written with siglock held so that the signal code - * observes timer->it_signal == NULL in do_sigaction(SIG_IGN), - * which prevents it from moving a pending signal of a deleted - * timer to the ignore list. - */ - WRITE_ONCE(timer->it_signal, NULL); - spin_unlock(¤t->sighand->siglock); - - unlock_timer(timer, flags); + /* Remove it from the hash, which frees up the timer ID */ posix_timer_unhash_and_free(timer); return 0; } /* - * Delete a timer if it is armed, remove it from the hash and schedule it - * for RCU freeing. - */ -static void itimer_delete(struct k_itimer *timer) -{ - unsigned long flags; - - /* - * irqsave is required to make timer_wait_running() work. - */ - spin_lock_irqsave(&timer->it_lock, flags); - -retry_delete: - /* - * Even if the timer is not longer accessible from other tasks - * it still might be armed and queued in the underlying timer - * mechanism. Worse, that timer mechanism might run the expiry - * function concurrently. - */ - if (timer_delete_hook(timer) == TIMER_RETRY) { - /* - * Timer is expired concurrently, prevent livelocks - * and pointless spinning on RT. - * - * timer_wait_running() drops timer::it_lock, which opens - * the possibility for another task to delete the timer. - * - * That's not possible here because this is invoked from - * do_exit() only for the last thread of the thread group. - * So no other task can access and delete that timer. - */ - if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer)) - return; - - goto retry_delete; - } - hlist_del(&timer->list); - - posix_timer_cleanup_ignored(timer); - - /* - * Setting timer::it_signal to NULL is technically not required - * here as nothing can access the timer anymore legitimately via - * the hash table. Set it to NULL nevertheless so that all deletion - * paths are consistent. - */ - WRITE_ONCE(timer->it_signal, NULL); - - spin_unlock_irqrestore(&timer->it_lock, flags); - posix_timer_unhash_and_free(timer); -} - -/* * Invoked from do_exit() when the last thread of a thread group exits. * At that point no other task can access the timers of the dying * task anymore. @@ -1089,18 +1078,26 @@ retry_delete: void exit_itimers(struct task_struct *tsk) { struct hlist_head timers; + struct hlist_node *next; + struct k_itimer *timer; + + /* Clear restore mode for exec() */ + tsk->signal->timer_create_restore_ids = 0; if (hlist_empty(&tsk->signal->posix_timers)) return; /* Protect against concurrent read via /proc/$PID/timers */ - spin_lock_irq(&tsk->sighand->siglock); - hlist_move_list(&tsk->signal->posix_timers, &timers); - spin_unlock_irq(&tsk->sighand->siglock); + scoped_guard (spinlock_irq, &tsk->sighand->siglock) + hlist_move_list(&tsk->signal->posix_timers, &timers); /* The timers are not longer accessible via tsk::signal */ - while (!hlist_empty(&timers)) - itimer_delete(hlist_entry(timers.first, struct k_itimer, list)); + hlist_for_each_entry_safe(timer, next, &timers, list) { + scoped_guard (spinlock_irq, &timer->it_lock) + posix_timer_delete(timer); + posix_timer_unhash_and_free(timer); + cond_resched(); + } /* * There should be no timers on the ignored list. itimer_delete() has @@ -1545,3 +1542,31 @@ static const struct k_clock *clockid_to_kclock(const clockid_t id) return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))]; } + +static int __init posixtimer_init(void) +{ + unsigned long i, size; + unsigned int shift; + + posix_timers_cache = kmem_cache_create("posix_timers_cache", + sizeof(struct k_itimer), + __alignof__(struct k_itimer), + SLAB_ACCOUNT, NULL); + + if (IS_ENABLED(CONFIG_BASE_SMALL)) + size = 512; + else + size = roundup_pow_of_two(512 * num_possible_cpus()); + + timer_buckets = alloc_large_system_hash("posixtimers", sizeof(*timer_buckets), + size, 0, 0, &shift, NULL, size, size); + size = 1UL << shift; + timer_hashmask = size - 1; + + for (i = 0; i < size; i++) { + spin_lock_init(&timer_buckets[i].lock); + INIT_HLIST_HEAD(&timer_buckets[i].head); + } + return 0; +} +core_initcall(posixtimer_init); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index fcca4e72f1ef..cc15fe293719 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -263,8 +263,7 @@ void __init generic_sched_clock_init(void) * Start the timer to keep sched_clock() properly updated and * sets the initial epoch. */ - hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - sched_clock_timer.function = sched_clock_poll; + hrtimer_setup(&sched_clock_timer, sched_clock_poll, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); } diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c index dfe939f6e4ec..3c90574bd904 100644 --- a/kernel/time/sleep_timeout.c +++ b/kernel/time/sleep_timeout.c @@ -22,7 +22,7 @@ struct process_timer { static void process_timeout(struct timer_list *t) { - struct process_timer *timeout = from_timer(timeout, t, timer); + struct process_timer *timeout = timer_container_of(timeout, t, timer); wake_up_process(timeout->task); } @@ -97,10 +97,10 @@ signed long __sched schedule_timeout(signed long timeout) timer.timer.expires = expire; add_timer(&timer.timer); schedule(); - del_timer_sync(&timer.timer); + timer_delete_sync(&timer.timer); /* Remove the timer from the object tracker */ - destroy_timer_on_stack(&timer.timer); + timer_destroy_on_stack(&timer.timer); timeout = expire - jiffies; diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index e28f9210f8a1..a88b72b0f35e 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -100,7 +100,6 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) void tick_setup_hrtimer_broadcast(void) { - hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - bctimer.function = bc_handler; + hrtimer_setup(&bctimer, bc_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); clockevents_register_device(&ce_broadcast_hrtimer); } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index a47bcf71defc..9a3859443c04 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -509,6 +509,7 @@ void tick_resume(void) #ifdef CONFIG_SUSPEND static DEFINE_RAW_SPINLOCK(tick_freeze_lock); +static DEFINE_WAIT_OVERRIDE_MAP(tick_freeze_map, LD_WAIT_SLEEP); static unsigned int tick_freeze_depth; /** @@ -528,9 +529,22 @@ void tick_freeze(void) if (tick_freeze_depth == num_online_cpus()) { trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), true); + /* + * All other CPUs have their interrupts disabled and are + * suspended to idle. Other tasks have been frozen so there + * is no scheduling happening. This means that there is no + * concurrency in the system at this point. Therefore it is + * okay to acquire a sleeping lock on PREEMPT_RT, such as a + * spinlock, because the lock cannot be held by other CPUs + * or threads and acquiring it cannot block. + * + * Inform lockdep about the situation. + */ + lock_map_acquire_try(&tick_freeze_map); system_state = SYSTEM_SUSPEND; sched_clock_suspend(); timekeeping_suspend(); + lock_map_release(&tick_freeze_map); } else { tick_suspend_local(); } @@ -552,8 +566,16 @@ void tick_unfreeze(void) raw_spin_lock(&tick_freeze_lock); if (tick_freeze_depth == num_online_cpus()) { + /* + * Similar to tick_freeze(). On resumption the first CPU may + * acquire uncontended sleeping locks while other CPUs block on + * tick_freeze_lock. + */ + lock_map_acquire_try(&tick_freeze_map); timekeeping_resume(); sched_clock_resume(); + lock_map_release(&tick_freeze_map); + system_state = SYSTEM_RUNNING; trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), false); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index fa058510af9c..c527b421c865 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1573,12 +1573,10 @@ void tick_setup_sched_timer(bool hrtimer) struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); /* Emulate tick processing via per-CPU hrtimers: */ - hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + hrtimer_setup(&ts->sched_timer, tick_nohz_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) { + if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) tick_sched_flag_set(ts, TS_FLAG_HIGHRES); - ts->sched_timer.function = tick_nohz_handler; - } /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1e67d076f195..a009c91f7b05 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -164,10 +164,34 @@ static inline struct timespec64 tk_xtime(const struct timekeeper *tk) return ts; } +static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk) +{ + struct timespec64 ts; + + ts.tv_sec = tk->xtime_sec; + ts.tv_nsec = tk->coarse_nsec; + return ts; +} + +/* + * Update the nanoseconds part for the coarse time keepers. They can't rely + * on xtime_nsec because xtime_nsec could be adjusted by a small negative + * amount when the multiplication factor of the clock is adjusted, which + * could cause the coarse clocks to go slightly backwards. See + * timekeeping_apply_adjustment(). Thus we keep a separate copy for the coarse + * clockids which only is updated when the clock has been set or we have + * accumulated time. + */ +static inline void tk_update_coarse_nsecs(struct timekeeper *tk) +{ + tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; +} + static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; + tk_update_coarse_nsecs(tk); } static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) @@ -175,6 +199,7 @@ static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) tk->xtime_sec += ts->tv_sec; tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; tk_normalize_xtime(tk); + tk_update_coarse_nsecs(tk); } static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) @@ -708,6 +733,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk_normalize_xtime(tk); delta -= incr; } + tk_update_coarse_nsecs(tk); } /** @@ -804,8 +830,8 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset); ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned int seq; ktime_t base, *offset = offsets[offs]; + unsigned int seq; u64 nsecs; WARN_ON(timekeeping_suspended); @@ -813,7 +839,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); - nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsecs = tk->coarse_nsec; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -2161,7 +2187,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) struct timekeeper *real_tk = &tk_core.timekeeper; unsigned int clock_set = 0; int shift = 0, maxshift; - u64 offset; + u64 offset, orig_offset; guard(raw_spinlock_irqsave)(&tk_core.lock); @@ -2172,7 +2198,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), tk->tkr_mono.cycle_last, tk->tkr_mono.mask, tk->tkr_mono.clock->max_raw_delta); - + orig_offset = offset; /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) return false; @@ -2205,6 +2231,14 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) */ clock_set |= accumulate_nsecs_to_secs(tk); + /* + * To avoid inconsistencies caused adjtimex TK_ADV_FREQ calls + * making small negative adjustments to the base xtime_nsec + * value, only update the coarse clocks if we accumulated time + */ + if (orig_offset != offset) + tk_update_coarse_nsecs(tk); + timekeeping_update_from_shadow(&tk_core, clock_set); return !!clock_set; @@ -2248,7 +2282,7 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - *ts = tk_xtime(tk); + *ts = tk_xtime_coarse(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); } EXPORT_SYMBOL(ktime_get_coarse_real_ts64); @@ -2271,7 +2305,7 @@ void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - *ts = tk_xtime(tk); + *ts = tk_xtime_coarse(tk); offset = tk_core.timekeeper.offs_real; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -2350,12 +2384,12 @@ void ktime_get_coarse_ts64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - now = tk_xtime(tk); + now = tk_xtime_coarse(tk); mono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec, - now.tv_nsec + mono.tv_nsec); + now.tv_nsec + mono.tv_nsec); } EXPORT_SYMBOL(ktime_get_coarse_ts64); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index c8f776dc6ee0..553fa469d7cc 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -386,32 +386,6 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu, } /** - * __round_jiffies - function to round jiffies to a full second - * @j: the time in (absolute) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * __round_jiffies() rounds an absolute time in the future (in jiffies) - * up or down to (approximately) full seconds. This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. - * - * The exact rounding is skewed for each processor to avoid all - * processors firing at the exact same time, which could lead - * to lock contention or spurious cache line bouncing. - * - * The return value is the rounded version of the @j parameter. - */ -unsigned long __round_jiffies(unsigned long j, int cpu) -{ - return round_jiffies_common(j, cpu, false); -} -EXPORT_SYMBOL_GPL(__round_jiffies); - -/** * __round_jiffies_relative - function to round jiffies to a full second * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen @@ -483,22 +457,6 @@ unsigned long round_jiffies_relative(unsigned long j) EXPORT_SYMBOL_GPL(round_jiffies_relative); /** - * __round_jiffies_up - function to round jiffies up to a full second - * @j: the time in (absolute) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * This is the same as __round_jiffies() except that it will never - * round down. This is useful for timeouts for which the exact time - * of firing does not matter too much, as long as they don't fire too - * early. - */ -unsigned long __round_jiffies_up(unsigned long j, int cpu) -{ - return round_jiffies_common(j, cpu, true); -} -EXPORT_SYMBOL_GPL(__round_jiffies_up); - -/** * __round_jiffies_up_relative - function to round jiffies up to a full second * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen @@ -744,7 +702,7 @@ static bool timer_fixup_init(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_ACTIVE: - del_timer_sync(timer); + timer_delete_sync(timer); debug_object_init(timer, &timer_debug_descr); return true; default: @@ -790,7 +748,7 @@ static bool timer_fixup_free(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_ACTIVE: - del_timer_sync(timer); + timer_delete_sync(timer); debug_object_free(timer, &timer_debug_descr); return true; default: @@ -850,7 +808,7 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags, const char *name, struct lock_class_key *key); -void init_timer_on_stack_key(struct timer_list *timer, +void timer_init_key_on_stack(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) @@ -858,13 +816,13 @@ void init_timer_on_stack_key(struct timer_list *timer, debug_object_init_on_stack(timer, &timer_debug_descr); do_init_timer(timer, func, flags, name, key); } -EXPORT_SYMBOL_GPL(init_timer_on_stack_key); +EXPORT_SYMBOL_GPL(timer_init_key_on_stack); -void destroy_timer_on_stack(struct timer_list *timer) +void timer_destroy_on_stack(struct timer_list *timer) { debug_object_free(timer, &timer_debug_descr); } -EXPORT_SYMBOL_GPL(destroy_timer_on_stack); +EXPORT_SYMBOL_GPL(timer_destroy_on_stack); #else static inline void debug_timer_init(struct timer_list *timer) { } @@ -904,7 +862,7 @@ static void do_init_timer(struct timer_list *timer, } /** - * init_timer_key - initialize a timer + * timer_init_key - initialize a timer * @timer: the timer to be initialized * @func: timer callback function * @flags: timer flags @@ -912,17 +870,17 @@ static void do_init_timer(struct timer_list *timer, * @key: lockdep class key of the fake lock used for tracking timer * sync lock dependencies * - * init_timer_key() must be done to a timer prior to calling *any* of the + * timer_init_key() must be done to a timer prior to calling *any* of the * other timer functions. */ -void init_timer_key(struct timer_list *timer, +void timer_init_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { debug_init(timer); do_init_timer(timer, func, flags, name, key); } -EXPORT_SYMBOL(init_timer_key); +EXPORT_SYMBOL(timer_init_key); static inline void detach_timer(struct timer_list *timer, bool clear_pending) { @@ -1212,10 +1170,10 @@ EXPORT_SYMBOL(mod_timer_pending); * * mod_timer(timer, expires) is equivalent to: * - * del_timer(timer); timer->expires = expires; add_timer(timer); + * timer_delete(timer); timer->expires = expires; add_timer(timer); * * mod_timer() is more efficient than the above open coded sequence. In - * case that the timer is inactive, the del_timer() part is a NOP. The + * case that the timer is inactive, the timer_delete() part is a NOP. The * timer is in any case activated with the new expiry time @expires. * * Note that if there are multiple unserialized concurrent users of the @@ -1511,7 +1469,7 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) } /** - * try_to_del_timer_sync - Try to deactivate a timer + * timer_delete_sync_try - Try to deactivate a timer * @timer: Timer to deactivate * * This function tries to deactivate a timer. On success the timer is not @@ -1526,11 +1484,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) * * %1 - The timer was pending and deactivated * * %-1 - The timer callback function is running on a different CPU */ -int try_to_del_timer_sync(struct timer_list *timer) +int timer_delete_sync_try(struct timer_list *timer) { return __try_to_del_timer_sync(timer, false); } -EXPORT_SYMBOL(try_to_del_timer_sync); +EXPORT_SYMBOL(timer_delete_sync_try); #ifdef CONFIG_PREEMPT_RT static __init void timer_base_init_expiry_lock(struct timer_base *base) @@ -1900,7 +1858,7 @@ static void timer_recalc_next_expiry(struct timer_base *base) unsigned long clk, next, adj; unsigned lvl, offset = 0; - next = base->clk + NEXT_TIMER_MAX_DELTA; + next = base->clk + TIMER_NEXT_MAX_DELTA; clk = base->clk; for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { int pos = next_pending_bucket(base, offset, clk & LVL_MASK); @@ -1963,7 +1921,7 @@ static void timer_recalc_next_expiry(struct timer_base *base) WRITE_ONCE(base->next_expiry, next); base->next_expiry_recalc = false; - base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA); + base->timers_pending = !(next == base->clk + TIMER_NEXT_MAX_DELTA); } #ifdef CONFIG_NO_HZ_COMMON @@ -2015,7 +1973,7 @@ static unsigned long next_timer_interrupt(struct timer_base *base, * easy comparable to find out which base holds the first pending timer. */ if (!base->timers_pending) - WRITE_ONCE(base->next_expiry, basej + NEXT_TIMER_MAX_DELTA); + WRITE_ONCE(base->next_expiry, basej + TIMER_NEXT_MAX_DELTA); return base->next_expiry; } @@ -2399,7 +2357,7 @@ static inline void __run_timers(struct timer_base *base) * timer at this clk are that all matching timers have been * dequeued or no timer has been queued since * base::next_expiry was set to base::clk + - * NEXT_TIMER_MAX_DELTA. + * TIMER_NEXT_MAX_DELTA. */ WARN_ON_ONCE(!levels && !base->next_expiry_recalc && base->timers_pending); @@ -2544,7 +2502,7 @@ int timers_prepare_cpu(unsigned int cpu) for (b = 0; b < NR_BASES; b++) { base = per_cpu_ptr(&timer_bases[b], cpu); base->clk = jiffies; - base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA; base->next_expiry_recalc = false; base->timers_pending = false; base->is_idle = false; @@ -2599,7 +2557,7 @@ static void __init init_timer_cpu(int cpu) base->cpu = cpu; raw_spin_lock_init(&base->lock); base->clk = jiffies; - base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA; timer_base_init_expiry_lock(base); } } @@ -2612,7 +2570,7 @@ static void __init init_timer_cpus(void) init_timer_cpu(cpu); } -void __init init_timers(void) +void __init timers_init(void) { init_timer_cpus(); posix_cputimers_init_work(); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 1c311c46da50..b03d0ada6469 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -46,7 +46,7 @@ static void print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { - SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function); + SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, ACCESS_PRIVATE(timer, function)); SEQ_printf(m, ", S:%02x", timer->state); SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", @@ -98,7 +98,7 @@ next_one: static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { - SEQ_printf(m, " .base: %pK\n", base); + SEQ_printf(m, " .base: %p\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 05d383143165..32ef27c71b57 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -15,29 +15,29 @@ #include "timekeeping_internal.h" -static inline void update_vdso_data(struct vdso_data *vdata, - struct timekeeper *tk) +static inline void update_vdso_time_data(struct vdso_time_data *vdata, struct timekeeper *tk) { + struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; u64 nsec, sec; - vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; + vc[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; #ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vdata[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles; + vc[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles; #endif - vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; - vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult; - vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift; - vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; + vc[CS_HRES_COARSE].mask = tk->tkr_mono.mask; + vc[CS_HRES_COARSE].mult = tk->tkr_mono.mult; + vc[CS_HRES_COARSE].shift = tk->tkr_mono.shift; + vc[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; #ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vdata[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles; + vc[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles; #endif - vdata[CS_RAW].mask = tk->tkr_raw.mask; - vdata[CS_RAW].mult = tk->tkr_raw.mult; - vdata[CS_RAW].shift = tk->tkr_raw.shift; + vc[CS_RAW].mask = tk->tkr_raw.mask; + vc[CS_RAW].mult = tk->tkr_raw.mult; + vc[CS_RAW].shift = tk->tkr_raw.shift; /* CLOCK_MONOTONIC */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec; @@ -55,7 +55,7 @@ static inline void update_vdso_data(struct vdso_data *vdata, nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift; /* CLOCK_BOOTTIME */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; vdso_ts->sec = sec; while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { @@ -65,19 +65,20 @@ static inline void update_vdso_data(struct vdso_data *vdata, vdso_ts->nsec = nsec; /* CLOCK_MONOTONIC_RAW */ - vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; + vdso_ts = &vc[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; vdso_ts->sec = tk->raw_sec; vdso_ts->nsec = tk->tkr_raw.xtime_nsec; /* CLOCK_TAI */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_TAI]; vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; } void update_vsyscall(struct timekeeper *tk) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; + struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; s32 clock_mode; u64 nsec; @@ -86,54 +87,53 @@ void update_vsyscall(struct timekeeper *tk) vdso_write_begin(vdata); clock_mode = tk->tkr_mono.clock->vdso_clock_mode; - vdata[CS_HRES_COARSE].clock_mode = clock_mode; - vdata[CS_RAW].clock_mode = clock_mode; + vc[CS_HRES_COARSE].clock_mode = clock_mode; + vc[CS_RAW].clock_mode = clock_mode; /* CLOCK_REALTIME also required for time() */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; /* CLOCK_REALTIME_COARSE */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; vdso_ts->sec = tk->xtime_sec; - vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + vdso_ts->nsec = tk->coarse_nsec; /* CLOCK_MONOTONIC_COARSE */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsec = tk->coarse_nsec; nsec = nsec + tk->wall_to_monotonic.tv_nsec; vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); /* * Read without the seqlock held by clock_getres(). - * Note: No need to have a second copy. */ - WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); + WRITE_ONCE(vdata->hrtimer_res, hrtimer_resolution); /* * If the current clocksource is not VDSO capable, then spare the * update of the high resolution parts. */ if (clock_mode != VDSO_CLOCKMODE_NONE) - update_vdso_data(vdata, tk); + update_vdso_time_data(vdata, tk); __arch_update_vsyscall(vdata); vdso_write_end(vdata); - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); } void update_vsyscall_tz(void) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; - vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; - vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; + vdata->tz_minuteswest = sys_tz.tz_minuteswest; + vdata->tz_dsttime = sys_tz.tz_dsttime; - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); } /** @@ -150,7 +150,7 @@ void update_vsyscall_tz(void) */ unsigned long vdso_update_begin(void) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; unsigned long flags = timekeeper_lock_irqsave(); vdso_write_begin(vdata); @@ -167,9 +167,9 @@ unsigned long vdso_update_begin(void) */ void vdso_update_end(unsigned long flags) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; vdso_write_end(vdata); - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); timekeeper_unlock_irqrestore(flags); } diff --git a/kernel/torture.c b/kernel/torture.c index dede150aef01..3a0a8cc60401 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -792,6 +792,8 @@ static void torture_stutter_cleanup(void) stutter_task = NULL; } +static unsigned long torture_init_jiffies; + static void torture_print_module_parms(void) { @@ -821,6 +823,7 @@ bool torture_init_begin(char *ttype, int v) torture_type = ttype; verbose = v; fullstop = FULLSTOP_DONTSTOP; + WRITE_ONCE(torture_init_jiffies, jiffies); // Lockless reads. torture_print_module_parms(); return true; } @@ -837,6 +840,15 @@ void torture_init_end(void) EXPORT_SYMBOL_GPL(torture_init_end); /* + * Get the torture_init_begin()-time value of the jiffies counter. + */ +unsigned long get_torture_init_jiffies(void) +{ + return READ_ONCE(torture_init_jiffies); +} +EXPORT_SYMBOL_GPL(get_torture_init_jiffies); + +/* * Clean up torture module. Please note that this is -not- invoked via * the usual module_exit() mechanism, but rather by an explicit call from * the client torture module. Returns true if a race with system shutdown diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d570b8b9c0a9..a3f35c7d83b6 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -263,6 +263,17 @@ config FUNCTION_GRAPH_RETADDR the function is called. This feature is off by default, and you can enable it via the trace option funcgraph-retaddr. +config FUNCTION_TRACE_ARGS + bool + depends on PROBE_EVENTS_BTF_ARGS + default y + help + If supported with function argument access API and BTF, then + the function tracer and function graph tracer will support printing + of function arguments. This feature is off by default, and can be + enabled via the trace option func-args (for the function tracer) and + funcgraph-args (for the function graph tracer) + config DYNAMIC_FTRACE bool "enable/disable function tracing dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 3679a6d18934..3f6a7bdc6edf 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -893,11 +893,6 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, rcu_read_unlock(); } -static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio) -{ - blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0); -} - static void blk_add_trace_bio_complete(void *ignore, struct request_queue *q, struct bio *bio) { @@ -1089,8 +1084,6 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); WARN_ON(ret); - ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); - WARN_ON(ret); ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); WARN_ON(ret); ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); @@ -1125,7 +1118,6 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); - unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL); @@ -1462,7 +1454,6 @@ static const struct { [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, - [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, }; @@ -1896,6 +1887,8 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf) rwbs[i++] = 'S'; if (opf & REQ_META) rwbs[i++] = 'M'; + if (opf & REQ_ATOMIC) + rwbs[i++] = 'U'; rwbs[i] = '\0'; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index adc947587eb8..132c8be6f635 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -392,7 +392,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { .arg2_type = ARG_CONST_SIZE, }; -static void __set_printk_clr_event(void) +static void __set_printk_clr_event(struct work_struct *work) { /* * This program might be calling bpf_trace_printk, @@ -405,10 +405,11 @@ static void __set_printk_clr_event(void) if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1)) pr_warn_ratelimited("could not enable bpf_trace_printk events"); } +static DECLARE_WORK(set_printk_work, __set_printk_clr_event); const struct bpf_func_proto *bpf_get_trace_printk_proto(void) { - __set_printk_clr_event(); + schedule_work(&set_printk_work); return &bpf_trace_printk_proto; } @@ -451,7 +452,7 @@ static const struct bpf_func_proto bpf_trace_vprintk_proto = { const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void) { - __set_printk_clr_event(); + schedule_work(&set_printk_work); return &bpf_trace_vprintk_proto; } @@ -571,7 +572,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) return value; } -static const struct bpf_func_proto bpf_perf_event_read_proto = { +const struct bpf_func_proto bpf_perf_event_read_proto = { .func = bpf_perf_event_read, .gpl_only = true, .ret_type = RET_INTEGER, @@ -606,6 +607,11 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { .arg4_type = ARG_CONST_SIZE, }; +const struct bpf_func_proto *bpf_get_perf_event_read_value_proto(void) +{ + return &bpf_perf_event_read_value_proto; +} + static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, u64 flags, struct perf_raw_record *raw, @@ -843,7 +849,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struc if (unlikely(is_global_init(task))) return -EPERM; - if (!preemptible()) { + if (preempt_count() != 0 || irqs_disabled()) { /* Do an early check on signal validity. Otherwise, * the error is lost in deferred irq_work. */ @@ -876,7 +882,7 @@ BPF_CALL_1(bpf_send_signal, u32, sig) return bpf_send_signal_common(sig, PIDTYPE_TGID, NULL, 0); } -static const struct bpf_func_proto bpf_send_signal_proto = { +const struct bpf_func_proto bpf_send_signal_proto = { .func = bpf_send_signal, .gpl_only = false, .ret_type = RET_INTEGER, @@ -888,7 +894,7 @@ BPF_CALL_1(bpf_send_signal_thread, u32, sig) return bpf_send_signal_common(sig, PIDTYPE_PID, NULL, 0); } -static const struct bpf_func_proto bpf_send_signal_thread_proto = { +const struct bpf_func_proto bpf_send_signal_thread_proto = { .func = bpf_send_signal_thread, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1038,27 +1044,14 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { .arg1_type = ARG_PTR_TO_CTX, }; -#ifdef CONFIG_X86_KERNEL_IBT -static unsigned long get_entry_ip(unsigned long fentry_ip) +static inline unsigned long get_entry_ip(unsigned long fentry_ip) { - u32 instr; - - /* We want to be extra safe in case entry ip is on the page edge, - * but otherwise we need to avoid get_kernel_nofault()'s overhead. - */ - if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) { - if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE))) - return fentry_ip; - } else { - instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE); - } - if (is_endbr(instr)) +#ifdef CONFIG_X86_KERNEL_IBT + if (is_endbr((void *)(fentry_ip - ENDBR_INSN_SIZE))) fentry_ip -= ENDBR_INSN_SIZE; +#endif return fentry_ip; } -#else -#define get_entry_ip(fentry_ip) fentry_ip -#endif BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) { @@ -1192,7 +1185,7 @@ BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) return entry_cnt * br_entry_size; } -static const struct bpf_func_proto bpf_get_branch_snapshot_proto = { +const struct bpf_func_proto bpf_get_branch_snapshot_proto = { .func = bpf_get_branch_snapshot, .gpl_only = true, .ret_type = RET_INTEGER, @@ -1437,56 +1430,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) const struct bpf_func_proto *func_proto; switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_map_push_elem: - return &bpf_map_push_elem_proto; - case BPF_FUNC_map_pop_elem: - return &bpf_map_pop_elem_proto; - case BPF_FUNC_map_peek_elem: - return &bpf_map_peek_elem_proto; - case BPF_FUNC_map_lookup_percpu_elem: - return &bpf_map_lookup_percpu_elem_proto; - case BPF_FUNC_ktime_get_ns: - return &bpf_ktime_get_ns_proto; - case BPF_FUNC_ktime_get_boot_ns: - return &bpf_ktime_get_boot_ns_proto; - case BPF_FUNC_tail_call: - return &bpf_tail_call_proto; - case BPF_FUNC_get_current_task: - return &bpf_get_current_task_proto; - case BPF_FUNC_get_current_task_btf: - return &bpf_get_current_task_btf_proto; - case BPF_FUNC_task_pt_regs: - return &bpf_task_pt_regs_proto; - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; - case BPF_FUNC_trace_printk: - return bpf_get_trace_printk_proto(); case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; - case BPF_FUNC_get_numa_node_id: - return &bpf_get_numa_node_id_proto; - case BPF_FUNC_perf_event_read: - return &bpf_perf_event_read_proto; - case BPF_FUNC_get_prandom_u32: - return &bpf_get_prandom_u32_proto; - case BPF_FUNC_probe_read_user: - return &bpf_probe_read_user_proto; - case BPF_FUNC_probe_read_kernel: - return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? - NULL : &bpf_probe_read_kernel_proto; - case BPF_FUNC_probe_read_user_str: - return &bpf_probe_read_user_str_proto; - case BPF_FUNC_probe_read_kernel_str: - return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? - NULL : &bpf_probe_read_kernel_str_proto; #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE case BPF_FUNC_probe_read: return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? @@ -1495,65 +1440,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_compat_str_proto; #endif -#ifdef CONFIG_CGROUPS - case BPF_FUNC_cgrp_storage_get: - return &bpf_cgrp_storage_get_proto; - case BPF_FUNC_cgrp_storage_delete: - return &bpf_cgrp_storage_delete_proto; - case BPF_FUNC_current_task_under_cgroup: - return &bpf_current_task_under_cgroup_proto; -#endif - case BPF_FUNC_send_signal: - return &bpf_send_signal_proto; - case BPF_FUNC_send_signal_thread: - return &bpf_send_signal_thread_proto; - case BPF_FUNC_perf_event_read_value: - return &bpf_perf_event_read_value_proto; - case BPF_FUNC_ringbuf_output: - return &bpf_ringbuf_output_proto; - case BPF_FUNC_ringbuf_reserve: - return &bpf_ringbuf_reserve_proto; - case BPF_FUNC_ringbuf_submit: - return &bpf_ringbuf_submit_proto; - case BPF_FUNC_ringbuf_discard: - return &bpf_ringbuf_discard_proto; - case BPF_FUNC_ringbuf_query: - return &bpf_ringbuf_query_proto; - case BPF_FUNC_jiffies64: - return &bpf_jiffies64_proto; - case BPF_FUNC_get_task_stack: - return prog->sleepable ? &bpf_get_task_stack_sleepable_proto - : &bpf_get_task_stack_proto; - case BPF_FUNC_copy_from_user: - return &bpf_copy_from_user_proto; - case BPF_FUNC_copy_from_user_task: - return &bpf_copy_from_user_task_proto; - case BPF_FUNC_snprintf_btf: - return &bpf_snprintf_btf_proto; - case BPF_FUNC_per_cpu_ptr: - return &bpf_per_cpu_ptr_proto; - case BPF_FUNC_this_cpu_ptr: - return &bpf_this_cpu_ptr_proto; - case BPF_FUNC_task_storage_get: - if (bpf_prog_check_recur(prog)) - return &bpf_task_storage_get_recur_proto; - return &bpf_task_storage_get_proto; - case BPF_FUNC_task_storage_delete: - if (bpf_prog_check_recur(prog)) - return &bpf_task_storage_delete_recur_proto; - return &bpf_task_storage_delete_proto; - case BPF_FUNC_for_each_map_elem: - return &bpf_for_each_map_elem_proto; - case BPF_FUNC_snprintf: - return &bpf_snprintf_proto; case BPF_FUNC_get_func_ip: return &bpf_get_func_ip_proto_tracing; - case BPF_FUNC_get_branch_snapshot: - return &bpf_get_branch_snapshot_proto; - case BPF_FUNC_find_vma: - return &bpf_find_vma_proto; - case BPF_FUNC_trace_vprintk: - return bpf_get_trace_vprintk_proto(); default: break; } @@ -1865,7 +1753,7 @@ static struct pt_regs *get_bpf_raw_tp_regs(void) struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs); int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level); - if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) { + if (nest_level > ARRAY_SIZE(tp_regs->regs)) { this_cpu_dec(bpf_raw_tp_nest_level); return ERR_PTR(-EBUSY); } @@ -2345,10 +2233,9 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) { struct module *mod; - preempt_disable(); + guard(rcu)(); mod = __module_address((unsigned long)btp); module_put(mod); - preempt_enable(); } static __always_inline @@ -2932,18 +2819,21 @@ static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u3 u32 i, err = 0; for (i = 0; i < addrs_cnt; i++) { + bool skip_add = false; struct module *mod; - preempt_disable(); - mod = __module_address(addrs[i]); - /* Either no module or we it's already stored */ - if (!mod || has_module(&arr, mod)) { - preempt_enable(); - continue; + scoped_guard(rcu) { + mod = __module_address(addrs[i]); + /* Either no module or it's already stored */ + if (!mod || has_module(&arr, mod)) { + skip_add = true; + break; /* scoped_guard */ + } + if (!try_module_get(mod)) + err = -EINVAL; } - if (!try_module_get(mod)) - err = -EINVAL; - preempt_enable(); + if (skip_add) + continue; if (err) break; err = add_module(&arr, mod); @@ -2992,6 +2882,9 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; + if (attr->link_create.flags) + return -EINVAL; + if (!is_kprobe_multi(prog)) return -EINVAL; @@ -3381,6 +3274,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (sizeof(u64) != sizeof(void *)) return -EOPNOTSUPP; + if (attr->link_create.flags) + return -EINVAL; + if (!is_uprobe_multi(prog)) return -EINVAL; @@ -3422,7 +3318,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr } if (pid) { + rcu_read_lock(); task = get_pid_task(find_vpid(pid), PIDTYPE_TGID); + rcu_read_unlock(); if (!task) { err = -ESRCH; goto error_path_put; @@ -3570,6 +3468,146 @@ static int __init bpf_kprobe_multi_kfuncs_init(void) late_initcall(bpf_kprobe_multi_kfuncs_init); +typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struct *tsk); + +/* + * The __always_inline is to make sure the compiler doesn't + * generate indirect calls into callbacks, which is expensive, + * on some kernel configurations. This allows compiler to put + * direct calls into all the specific callback implementations + * (copy_user_data_sleepable, copy_user_data_nofault, and so on) + */ +static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 doff, u32 size, + const void *unsafe_src, + copy_fn_t str_copy_fn, + struct task_struct *tsk) +{ + struct bpf_dynptr_kern *dst; + u32 chunk_sz, off; + void *dst_slice; + int cnt, err; + char buf[256]; + + dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size); + if (likely(dst_slice)) + return str_copy_fn(dst_slice, unsafe_src, size, tsk); + + dst = (struct bpf_dynptr_kern *)dptr; + if (bpf_dynptr_check_off_len(dst, doff, size)) + return -E2BIG; + + for (off = 0; off < size; off += chunk_sz - 1) { + chunk_sz = min_t(u32, sizeof(buf), size - off); + /* Expect str_copy_fn to return count of copied bytes, including + * zero terminator. Next iteration increment off by chunk_sz - 1 to + * overwrite NUL. + */ + cnt = str_copy_fn(buf, unsafe_src + off, chunk_sz, tsk); + if (cnt < 0) + return cnt; + err = __bpf_dynptr_write(dst, doff + off, buf, cnt, 0); + if (err) + return err; + if (cnt < chunk_sz || chunk_sz == 1) /* we are done */ + return off + cnt; + } + return off; +} + +static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 doff, + u32 size, const void *unsafe_src, + copy_fn_t copy_fn, struct task_struct *tsk) +{ + struct bpf_dynptr_kern *dst; + void *dst_slice; + char buf[256]; + u32 off, chunk_sz; + int err; + + dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size); + if (likely(dst_slice)) + return copy_fn(dst_slice, unsafe_src, size, tsk); + + dst = (struct bpf_dynptr_kern *)dptr; + if (bpf_dynptr_check_off_len(dst, doff, size)) + return -E2BIG; + + for (off = 0; off < size; off += chunk_sz) { + chunk_sz = min_t(u32, sizeof(buf), size - off); + err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk); + if (err) + return err; + err = __bpf_dynptr_write(dst, doff + off, buf, chunk_sz, 0); + if (err) + return err; + } + return 0; +} + +static __always_inline int copy_user_data_nofault(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + return copy_from_user_nofault(dst, (const void __user *)unsafe_src, size); +} + +static __always_inline int copy_user_data_sleepable(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + int ret; + + if (!tsk) { /* Read from the current task */ + ret = copy_from_user(dst, (const void __user *)unsafe_src, size); + if (ret) + return -EFAULT; + return 0; + } + + ret = access_process_vm(tsk, (unsigned long)unsafe_src, dst, size, 0); + if (ret != size) + return -EFAULT; + return 0; +} + +static __always_inline int copy_kernel_data_nofault(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + return copy_from_kernel_nofault(dst, unsafe_src, size); +} + +static __always_inline int copy_user_str_nofault(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + return strncpy_from_user_nofault(dst, (const void __user *)unsafe_src, size); +} + +static __always_inline int copy_user_str_sleepable(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + int ret; + + if (unlikely(size == 0)) + return 0; + + if (tsk) { + ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_src, dst, size, 0); + } else { + ret = strncpy_from_user(dst, (const void __user *)unsafe_src, size - 1); + /* strncpy_from_user does not guarantee NUL termination */ + if (ret >= 0) + ((char *)dst)[ret] = '\0'; + } + + if (ret < 0) + return ret; + return ret + 1; +} + +static __always_inline int copy_kernel_str_nofault(void *dst, const void *unsafe_src, + u32 size, struct task_struct *tsk) +{ + return strncpy_from_kernel_nofault(dst, unsafe_src, size); +} + __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid_type type, @@ -3581,4 +3619,62 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid return bpf_send_signal_common(sig, type, task, value); } +__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_data_nofault, NULL); +} + +__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign, + copy_kernel_data_nofault, NULL); +} + +__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_str_nofault, NULL); +} + +__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign, + copy_kernel_str_nofault, NULL); +} + +__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_data_sleepable, NULL); +} + +__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign) +{ + return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_str_sleepable, NULL); +} + +__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign, + struct task_struct *tsk) +{ + return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_data_sleepable, tsk); +} + +__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u32 off, + u32 size, const void __user *unsafe_ptr__ign, + struct task_struct *tsk) +{ + return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + copy_user_str_sleepable, tsk); +} + __bpf_kfunc_end_defs(); diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 5dddfc2149f6..c5b207992fb4 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -865,7 +865,7 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe } /* - * After all architecures have selected HAVE_FUNCTION_GRAPH_FREGS, we can + * After all architectures have selected HAVE_FUNCTION_GRAPH_FREGS, we can * leave only ftrace_return_to_handler(fregs). */ #ifdef CONFIG_HAVE_FUNCTION_GRAPH_FREGS @@ -1382,6 +1382,8 @@ int register_ftrace_graph(struct fgraph_ops *gops) /* Always save the function, and reset at unregistering */ gops->saved_func = gops->entryfunc; + gops->ops.flags |= FTRACE_OPS_FL_GRAPH; + ret = ftrace_startup_subops(&graph_ops, &gops->ops, command); if (!ret) fgraph_array[i] = gops; diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 2560b312ad57..ba7ff14f5339 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -89,8 +89,11 @@ static bool delete_fprobe_node(struct fprobe_hlist_node *node) { lockdep_assert_held(&fprobe_mutex); - WRITE_ONCE(node->fp, NULL); - hlist_del_rcu(&node->hlist); + /* Avoid double deleting */ + if (READ_ONCE(node->fp) != NULL) { + WRITE_ONCE(node->fp, NULL); + hlist_del_rcu(&node->hlist); + } return !!find_first_fprobe_node(node->addr); } @@ -403,14 +406,110 @@ static void fprobe_graph_remove_ips(unsigned long *addrs, int num) lockdep_assert_held(&fprobe_mutex); fprobe_graph_active--; - if (!fprobe_graph_active) { - /* Q: should we unregister it ? */ + /* Q: should we unregister it ? */ + if (!fprobe_graph_active) unregister_ftrace_graph(&fprobe_graph_ops); - return; + + if (num) + ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0); +} + +#ifdef CONFIG_MODULES + +#define FPROBE_IPS_BATCH_INIT 8 +/* instruction pointer address list */ +struct fprobe_addr_list { + int index; + int size; + unsigned long *addrs; +}; + +static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long addr) +{ + unsigned long *addrs; + + if (alist->index >= alist->size) + return -ENOMEM; + + alist->addrs[alist->index++] = addr; + if (alist->index < alist->size) + return 0; + + /* Expand the address list */ + addrs = kcalloc(alist->size * 2, sizeof(*addrs), GFP_KERNEL); + if (!addrs) + return -ENOMEM; + + memcpy(addrs, alist->addrs, alist->size * sizeof(*addrs)); + alist->size *= 2; + kfree(alist->addrs); + alist->addrs = addrs; + + return 0; +} + +static void fprobe_remove_node_in_module(struct module *mod, struct hlist_head *head, + struct fprobe_addr_list *alist) +{ + struct fprobe_hlist_node *node; + int ret = 0; + + hlist_for_each_entry_rcu(node, head, hlist, + lockdep_is_held(&fprobe_mutex)) { + if (!within_module(node->addr, mod)) + continue; + if (delete_fprobe_node(node)) + continue; + /* + * If failed to update alist, just continue to update hlist. + * Therefore, at list user handler will not hit anymore. + */ + if (!ret) + ret = fprobe_addr_list_add(alist, node->addr); } +} + +/* Handle module unloading to manage fprobe_ip_table. */ +static int fprobe_module_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct fprobe_addr_list alist = {.size = FPROBE_IPS_BATCH_INIT}; + struct module *mod = data; + int i; + + if (val != MODULE_STATE_GOING) + return NOTIFY_DONE; - ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0); + alist.addrs = kcalloc(alist.size, sizeof(*alist.addrs), GFP_KERNEL); + /* If failed to alloc memory, we can not remove ips from hash. */ + if (!alist.addrs) + return NOTIFY_DONE; + + mutex_lock(&fprobe_mutex); + for (i = 0; i < FPROBE_IP_TABLE_SIZE; i++) + fprobe_remove_node_in_module(mod, &fprobe_ip_table[i], &alist); + + if (alist.index < alist.size && alist.index > 0) + ftrace_set_filter_ips(&fprobe_graph_ops.ops, + alist.addrs, alist.index, 1, 0); + mutex_unlock(&fprobe_mutex); + + kfree(alist.addrs); + + return NOTIFY_DONE; +} + +static struct notifier_block fprobe_module_nb = { + .notifier_call = fprobe_module_callback, + .priority = 0, +}; + +static int __init init_fprobe_module(void) +{ + return register_module_notifier(&fprobe_module_nb); } +early_initcall(init_fprobe_module); +#endif static int symbols_cmp(const void *a, const void *b) { @@ -446,6 +545,7 @@ struct filter_match_data { size_t index; size_t size; unsigned long *addrs; + struct module **mods; }; static int filter_match_callback(void *data, const char *name, unsigned long addr) @@ -459,30 +559,47 @@ static int filter_match_callback(void *data, const char *name, unsigned long add if (!ftrace_location(addr)) return 0; - if (match->addrs) - match->addrs[match->index] = addr; + if (match->addrs) { + struct module *mod = __module_text_address(addr); + if (mod && !try_module_get(mod)) + return 0; + + match->mods[match->index] = mod; + match->addrs[match->index] = addr; + } match->index++; return match->index == match->size; } /* * Make IP list from the filter/no-filter glob patterns. - * Return the number of matched symbols, or -ENOENT. + * Return the number of matched symbols, or errno. + * If @addrs == NULL, this just counts the number of matched symbols. If @addrs + * is passed with an array, we need to pass the an @mods array of the same size + * to increment the module refcount for each symbol. + * This means we also need to call `module_put` for each element of @mods after + * using the @addrs. */ -static int ip_list_from_filter(const char *filter, const char *notfilter, - unsigned long *addrs, size_t size) +static int get_ips_from_filter(const char *filter, const char *notfilter, + unsigned long *addrs, struct module **mods, + size_t size) { struct filter_match_data match = { .filter = filter, .notfilter = notfilter, - .index = 0, .size = size, .addrs = addrs}; + .index = 0, .size = size, .addrs = addrs, .mods = mods}; int ret; + if (addrs && !mods) + return -EINVAL; + ret = kallsyms_on_each_symbol(filter_match_callback, &match); if (ret < 0) return ret; - ret = module_kallsyms_on_each_symbol(NULL, filter_match_callback, &match); - if (ret < 0) - return ret; + if (IS_ENABLED(CONFIG_MODULES)) { + ret = module_kallsyms_on_each_symbol(NULL, filter_match_callback, &match); + if (ret < 0) + return ret; + } return match.index ?: -ENOENT; } @@ -544,24 +661,35 @@ static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num) */ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter) { - unsigned long *addrs; - int ret; + unsigned long *addrs __free(kfree) = NULL; + struct module **mods __free(kfree) = NULL; + int ret, num; if (!fp || !filter) return -EINVAL; - ret = ip_list_from_filter(filter, notfilter, NULL, FPROBE_IPS_MAX); - if (ret < 0) - return ret; + num = get_ips_from_filter(filter, notfilter, NULL, NULL, FPROBE_IPS_MAX); + if (num < 0) + return num; - addrs = kcalloc(ret, sizeof(unsigned long), GFP_KERNEL); + addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL); if (!addrs) return -ENOMEM; - ret = ip_list_from_filter(filter, notfilter, addrs, ret); - if (ret > 0) - ret = register_fprobe_ips(fp, addrs, ret); - kfree(addrs); + mods = kcalloc(num, sizeof(*mods), GFP_KERNEL); + if (!mods) + return -ENOMEM; + + ret = get_ips_from_filter(filter, notfilter, addrs, mods, num); + if (ret < 0) + return ret; + + ret = register_fprobe_ips(fp, addrs, ret); + + for (int i = 0; i < num; i++) { + if (mods[i]) + module_put(mods[i]); + } return ret; } EXPORT_SYMBOL_GPL(register_fprobe); @@ -679,8 +807,7 @@ int unregister_fprobe(struct fprobe *fp) } del_fprobe_hash(fp); - if (count) - fprobe_graph_remove_ips(addrs, count); + fprobe_graph_remove_ips(addrs, count); kfree_rcu(hlist_array, rcu); fp->hlist_array = NULL; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 728ecda6e8d4..4203fad56b6c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -188,7 +188,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, op->saved_func(ip, parent_ip, op, fregs); } -static void ftrace_sync_ipi(void *data) +void ftrace_sync_ipi(void *data) { /* Probably not needed, but do it anyway */ smp_rmb(); @@ -540,6 +540,7 @@ static int function_stat_show(struct seq_file *m, void *v) static struct trace_seq s; unsigned long long avg; unsigned long long stddev; + unsigned long long stddev_denom; #endif guard(mutex)(&ftrace_profile_lock); @@ -559,23 +560,19 @@ static int function_stat_show(struct seq_file *m, void *v) #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_puts(m, " "); - /* Sample standard deviation (s^2) */ - if (rec->counter <= 1) - stddev = 0; - else { - /* - * Apply Welford's method: - * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) - */ + /* + * Variance formula: + * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) + * Maybe Welford's method is better here? + * Divide only by 1000 for ns^2 -> us^2 conversion. + * trace_print_graph_duration will divide by 1000 again. + */ + stddev = 0; + stddev_denom = rec->counter * (rec->counter - 1) * 1000; + if (stddev_denom) { stddev = rec->counter * rec->time_squared - rec->time * rec->time; - - /* - * Divide only 1000 for ns^2 -> us^2 conversion. - * trace_print_graph_duration will divide 1000 again. - */ - stddev = div64_ul(stddev, - rec->counter * (rec->counter - 1) * 1000); + stddev = div64_ul(stddev, stddev_denom); } trace_seq_init(&s); @@ -1296,8 +1293,12 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) void ftrace_free_filter(struct ftrace_ops *ops) { ftrace_ops_init(ops); + if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) + return; free_ftrace_hash(ops->func_hash->filter_hash); free_ftrace_hash(ops->func_hash->notrace_hash); + ops->func_hash->filter_hash = EMPTY_HASH; + ops->func_hash->notrace_hash = EMPTY_HASH; } EXPORT_SYMBOL_GPL(ftrace_free_filter); @@ -3220,15 +3221,22 @@ static struct ftrace_hash *copy_hash(struct ftrace_hash *src) * The filter_hash updates uses just the append_hash() function * and the notrace_hash does not. */ -static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash) +static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash, + int size_bits) { struct ftrace_func_entry *entry; int size; int i; - /* An empty hash does everything */ - if (ftrace_hash_empty(*hash)) - return 0; + if (*hash) { + /* An empty hash does everything */ + if (ftrace_hash_empty(*hash)) + return 0; + } else { + *hash = alloc_ftrace_hash(size_bits); + if (!*hash) + return -ENOMEM; + } /* If new_hash has everything make hash have everything */ if (ftrace_hash_empty(new_hash)) { @@ -3250,6 +3258,31 @@ static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash) } /* + * Remove functions from @hash that are in @notrace_hash + */ +static void remove_hash(struct ftrace_hash *hash, struct ftrace_hash *notrace_hash) +{ + struct ftrace_func_entry *entry; + struct hlist_node *tmp; + int size; + int i; + + /* If the notrace hash is empty, there's nothing to do */ + if (ftrace_hash_empty(notrace_hash)) + return; + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry_safe(entry, tmp, &hash->buckets[i], hlist) { + if (!__ftrace_lookup_ip(notrace_hash, entry->ip)) + continue; + remove_hash_entry(hash, entry); + kfree(entry); + } + } +} + +/* * Add to @hash only those that are in both @new_hash1 and @new_hash2 * * The notrace_hash updates uses just the intersect_hash() function @@ -3289,64 +3322,6 @@ static int intersect_hash(struct ftrace_hash **hash, struct ftrace_hash *new_has return 0; } -/* Return a new hash that has a union of all @ops->filter_hash entries */ -static struct ftrace_hash *append_hashes(struct ftrace_ops *ops) -{ - struct ftrace_hash *new_hash; - struct ftrace_ops *subops; - int ret; - - new_hash = alloc_ftrace_hash(ops->func_hash->filter_hash->size_bits); - if (!new_hash) - return NULL; - - list_for_each_entry(subops, &ops->subop_list, list) { - ret = append_hash(&new_hash, subops->func_hash->filter_hash); - if (ret < 0) { - free_ftrace_hash(new_hash); - return NULL; - } - /* Nothing more to do if new_hash is empty */ - if (ftrace_hash_empty(new_hash)) - break; - } - return new_hash; -} - -/* Make @ops trace evenything except what all its subops do not trace */ -static struct ftrace_hash *intersect_hashes(struct ftrace_ops *ops) -{ - struct ftrace_hash *new_hash = NULL; - struct ftrace_ops *subops; - int size_bits; - int ret; - - list_for_each_entry(subops, &ops->subop_list, list) { - struct ftrace_hash *next_hash; - - if (!new_hash) { - size_bits = subops->func_hash->notrace_hash->size_bits; - new_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->notrace_hash); - if (!new_hash) - return NULL; - continue; - } - size_bits = new_hash->size_bits; - next_hash = new_hash; - new_hash = alloc_ftrace_hash(size_bits); - ret = intersect_hash(&new_hash, next_hash, subops->func_hash->notrace_hash); - free_ftrace_hash(next_hash); - if (ret < 0) { - free_ftrace_hash(new_hash); - return NULL; - } - /* Nothing more to do if new_hash is empty */ - if (ftrace_hash_empty(new_hash)) - break; - } - return new_hash; -} - static bool ops_equal(struct ftrace_hash *A, struct ftrace_hash *B) { struct ftrace_func_entry *entry; @@ -3418,6 +3393,95 @@ static int ftrace_update_ops(struct ftrace_ops *ops, struct ftrace_hash *filter_ return 0; } +static int add_first_hash(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash, + struct ftrace_ops_hash *func_hash) +{ + /* If the filter hash is not empty, simply remove the nohash from it */ + if (!ftrace_hash_empty(func_hash->filter_hash)) { + *filter_hash = copy_hash(func_hash->filter_hash); + if (!*filter_hash) + return -ENOMEM; + remove_hash(*filter_hash, func_hash->notrace_hash); + *notrace_hash = EMPTY_HASH; + + } else { + *notrace_hash = copy_hash(func_hash->notrace_hash); + if (!*notrace_hash) + return -ENOMEM; + *filter_hash = EMPTY_HASH; + } + return 0; +} + +static int add_next_hash(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash, + struct ftrace_ops_hash *ops_hash, struct ftrace_ops_hash *subops_hash) +{ + int size_bits; + int ret; + + /* If the subops trace all functions so must the main ops */ + if (ftrace_hash_empty(ops_hash->filter_hash) || + ftrace_hash_empty(subops_hash->filter_hash)) { + *filter_hash = EMPTY_HASH; + } else { + /* + * The main ops filter hash is not empty, so its + * notrace_hash had better be, as the notrace hash + * is only used for empty main filter hashes. + */ + WARN_ON_ONCE(!ftrace_hash_empty(ops_hash->notrace_hash)); + + size_bits = max(ops_hash->filter_hash->size_bits, + subops_hash->filter_hash->size_bits); + + /* Copy the subops hash */ + *filter_hash = alloc_and_copy_ftrace_hash(size_bits, subops_hash->filter_hash); + if (!*filter_hash) + return -ENOMEM; + /* Remove any notrace functions from the copy */ + remove_hash(*filter_hash, subops_hash->notrace_hash); + + ret = append_hash(filter_hash, ops_hash->filter_hash, + size_bits); + if (ret < 0) { + free_ftrace_hash(*filter_hash); + *filter_hash = EMPTY_HASH; + return ret; + } + } + + /* + * Only process notrace hashes if the main filter hash is empty + * (tracing all functions), otherwise the filter hash will just + * remove the notrace hash functions, and the notrace hash is + * not needed. + */ + if (ftrace_hash_empty(*filter_hash)) { + /* + * Intersect the notrace functions. That is, if two + * subops are not tracing a set of functions, the + * main ops will only not trace the functions that are + * in both subops, but has to trace the functions that + * are only notrace in one of the subops, for the other + * subops to be able to trace them. + */ + size_bits = max(ops_hash->notrace_hash->size_bits, + subops_hash->notrace_hash->size_bits); + *notrace_hash = alloc_ftrace_hash(size_bits); + if (!*notrace_hash) + return -ENOMEM; + + ret = intersect_hash(notrace_hash, ops_hash->notrace_hash, + subops_hash->notrace_hash); + if (ret < 0) { + free_ftrace_hash(*notrace_hash); + *notrace_hash = EMPTY_HASH; + return ret; + } + } + return 0; +} + /** * ftrace_startup_subops - enable tracing for subops of an ops * @ops: Manager ops (used to pick all the functions of its subops) @@ -3430,11 +3494,10 @@ static int ftrace_update_ops(struct ftrace_ops *ops, struct ftrace_hash *filter_ */ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command) { - struct ftrace_hash *filter_hash; - struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash = EMPTY_HASH; + struct ftrace_hash *notrace_hash = EMPTY_HASH; struct ftrace_hash *save_filter_hash; struct ftrace_hash *save_notrace_hash; - int size_bits; int ret; if (unlikely(ftrace_disabled)) @@ -3458,14 +3521,14 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int /* For the first subops to ops just enable it normally */ if (list_empty(&ops->subop_list)) { - /* Just use the subops hashes */ - filter_hash = copy_hash(subops->func_hash->filter_hash); - notrace_hash = copy_hash(subops->func_hash->notrace_hash); - if (!filter_hash || !notrace_hash) { - free_ftrace_hash(filter_hash); - free_ftrace_hash(notrace_hash); - return -ENOMEM; - } + + /* The ops was empty, should have empty hashes */ + WARN_ON_ONCE(!ftrace_hash_empty(ops->func_hash->filter_hash)); + WARN_ON_ONCE(!ftrace_hash_empty(ops->func_hash->notrace_hash)); + + ret = add_first_hash(&filter_hash, ¬race_hash, subops->func_hash); + if (ret < 0) + return ret; save_filter_hash = ops->func_hash->filter_hash; save_notrace_hash = ops->func_hash->notrace_hash; @@ -3491,47 +3554,16 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int /* * Here there's already something attached. Here are the rules: - * o If either filter_hash is empty then the final stays empty - * o Otherwise, the final is a superset of both hashes - * o If either notrace_hash is empty then the final stays empty - * o Otherwise, the final is an intersection between the hashes + * If the new subops and main ops filter hashes are not empty: + * o Make a copy of the subops filter hash + * o Remove all functions in the nohash from it. + * o Add in the main hash filter functions + * o Remove any of these functions from the main notrace hash */ - if (ftrace_hash_empty(ops->func_hash->filter_hash) || - ftrace_hash_empty(subops->func_hash->filter_hash)) { - filter_hash = EMPTY_HASH; - } else { - size_bits = max(ops->func_hash->filter_hash->size_bits, - subops->func_hash->filter_hash->size_bits); - filter_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->filter_hash); - if (!filter_hash) - return -ENOMEM; - ret = append_hash(&filter_hash, subops->func_hash->filter_hash); - if (ret < 0) { - free_ftrace_hash(filter_hash); - return ret; - } - } - if (ftrace_hash_empty(ops->func_hash->notrace_hash) || - ftrace_hash_empty(subops->func_hash->notrace_hash)) { - notrace_hash = EMPTY_HASH; - } else { - size_bits = max(ops->func_hash->filter_hash->size_bits, - subops->func_hash->filter_hash->size_bits); - notrace_hash = alloc_ftrace_hash(size_bits); - if (!notrace_hash) { - free_ftrace_hash(filter_hash); - return -ENOMEM; - } - - ret = intersect_hash(¬race_hash, ops->func_hash->filter_hash, - subops->func_hash->filter_hash); - if (ret < 0) { - free_ftrace_hash(filter_hash); - free_ftrace_hash(notrace_hash); - return ret; - } - } + ret = add_next_hash(&filter_hash, ¬race_hash, ops->func_hash, subops->func_hash); + if (ret < 0) + return ret; list_add(&subops->list, &ops->subop_list); @@ -3547,6 +3579,45 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int return ret; } +static int rebuild_hashes(struct ftrace_hash **filter_hash, struct ftrace_hash **notrace_hash, + struct ftrace_ops *ops) +{ + struct ftrace_ops_hash temp_hash; + struct ftrace_ops *subops; + bool first = true; + int ret; + + temp_hash.filter_hash = EMPTY_HASH; + temp_hash.notrace_hash = EMPTY_HASH; + + list_for_each_entry(subops, &ops->subop_list, list) { + *filter_hash = EMPTY_HASH; + *notrace_hash = EMPTY_HASH; + + if (first) { + ret = add_first_hash(filter_hash, notrace_hash, subops->func_hash); + if (ret < 0) + return ret; + first = false; + } else { + ret = add_next_hash(filter_hash, notrace_hash, + &temp_hash, subops->func_hash); + if (ret < 0) { + free_ftrace_hash(temp_hash.filter_hash); + free_ftrace_hash(temp_hash.notrace_hash); + return ret; + } + } + + free_ftrace_hash(temp_hash.filter_hash); + free_ftrace_hash(temp_hash.notrace_hash); + + temp_hash.filter_hash = *filter_hash; + temp_hash.notrace_hash = *notrace_hash; + } + return 0; +} + /** * ftrace_shutdown_subops - Remove a subops from a manager ops * @ops: A manager ops to remove @subops from @@ -3561,8 +3632,8 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int */ int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command) { - struct ftrace_hash *filter_hash; - struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash = EMPTY_HASH; + struct ftrace_hash *notrace_hash = EMPTY_HASH; int ret; if (unlikely(ftrace_disabled)) @@ -3595,14 +3666,9 @@ int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, in } /* Rebuild the hashes without subops */ - filter_hash = append_hashes(ops); - notrace_hash = intersect_hashes(ops); - if (!filter_hash || !notrace_hash) { - free_ftrace_hash(filter_hash); - free_ftrace_hash(notrace_hash); - list_add(&subops->list, &ops->subop_list); - return -ENOMEM; - } + ret = rebuild_hashes(&filter_hash, ¬race_hash, ops); + if (ret < 0) + return ret; ret = ftrace_update_ops(ops, filter_hash, notrace_hash); if (ret < 0) { @@ -3618,11 +3684,11 @@ int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, in static int ftrace_hash_move_and_update_subops(struct ftrace_ops *subops, struct ftrace_hash **orig_subhash, - struct ftrace_hash *hash, - int enable) + struct ftrace_hash *hash) { struct ftrace_ops *ops = subops->managed; - struct ftrace_hash **orig_hash; + struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash; struct ftrace_hash *save_hash; struct ftrace_hash *new_hash; int ret; @@ -3639,24 +3705,18 @@ static int ftrace_hash_move_and_update_subops(struct ftrace_ops *subops, return -ENOMEM; } - /* Create a new_hash to hold the ops new functions */ - if (enable) { - orig_hash = &ops->func_hash->filter_hash; - new_hash = append_hashes(ops); - } else { - orig_hash = &ops->func_hash->notrace_hash; - new_hash = intersect_hashes(ops); + ret = rebuild_hashes(&filter_hash, ¬race_hash, ops); + if (!ret) { + ret = ftrace_update_ops(ops, filter_hash, notrace_hash); + free_ftrace_hash(filter_hash); + free_ftrace_hash(notrace_hash); } - /* Move the hash over to the new hash */ - ret = __ftrace_hash_move_and_update_ops(ops, orig_hash, new_hash, enable); - - free_ftrace_hash(new_hash); - if (ret) { /* Put back the original hash */ - free_ftrace_hash_rcu(*orig_subhash); + new_hash = *orig_subhash; *orig_subhash = save_hash; + free_ftrace_hash_rcu(new_hash); } else { free_ftrace_hash_rcu(save_hash); } @@ -4313,6 +4373,42 @@ static inline int print_rec(struct seq_file *m, unsigned long ip) } #endif +static void print_subops(struct seq_file *m, struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + struct ftrace_ops *subops; + bool first = true; + + list_for_each_entry(subops, &ops->subop_list, list) { + if (!((subops->flags & FTRACE_OPS_FL_ENABLED) && + hash_contains_ip(rec->ip, subops->func_hash))) + continue; + if (first) { + seq_printf(m, "\tsubops:"); + first = false; + } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (subops->flags & FTRACE_OPS_FL_GRAPH) { + struct fgraph_ops *gops; + + gops = container_of(subops, struct fgraph_ops, ops); + seq_printf(m, " {ent:%pS ret:%pS}", + (void *)gops->entryfunc, + (void *)gops->retfunc); + continue; + } +#endif + if (subops->trampoline) { + seq_printf(m, " {%pS (%pS)}", + (void *)subops->trampoline, + (void *)subops->func); + add_trampoline_func(m, subops, rec); + } else { + seq_printf(m, " {%pS}", + (void *)subops->func); + } + } +} + static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; @@ -4365,6 +4461,7 @@ static int t_show(struct seq_file *m, void *v) (void *)ops->trampoline, (void *)ops->func); add_trampoline_func(m, ops, rec); + print_subops(m, ops, rec); ops = ftrace_find_tramp_ops_next(rec, ops); } while (ops); } else @@ -4377,6 +4474,7 @@ static int t_show(struct seq_file *m, void *v) if (ops) { seq_printf(m, "\tops: %pS (%pS)", ops, ops->func); + print_subops(m, ops, rec); } else { seq_puts(m, "\tops: ERROR!"); } @@ -4880,7 +4978,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, int enable) { if (ops->flags & FTRACE_OPS_FL_SUBOP) - return ftrace_hash_move_and_update_subops(ops, orig_hash, hash, enable); + return ftrace_hash_move_and_update_subops(ops, orig_hash, hash); /* * If this ops is not enabled, it could be sharing its filters @@ -4899,7 +4997,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, list_for_each_entry(subops, &op->subop_list, list) { if ((subops->flags & FTRACE_OPS_FL_ENABLED) && subops->func_hash == ops->func_hash) { - return ftrace_hash_move_and_update_subops(subops, orig_hash, hash, enable); + return ftrace_hash_move_and_update_subops(subops, orig_hash, hash); } } } while_for_each_ftrace_op(op); @@ -5110,8 +5208,12 @@ struct ftrace_func_map { void *data; }; +/* + * Note, ftrace_func_mapper is freed by free_ftrace_hash(&mapper->hash). + * The hash field must be the first field. + */ struct ftrace_func_mapper { - struct ftrace_hash hash; + struct ftrace_hash hash; /* Must be first! */ }; /** @@ -5246,6 +5348,7 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, } } } + /* This also frees the mapper itself */ free_ftrace_hash(&mapper->hash); } @@ -5707,6 +5810,9 @@ __ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return -ENOENT; free_hash_entry(hash, entry); return 0; + } else if (__ftrace_lookup_ip(hash, ip) != NULL) { + /* Already exists */ + return 0; } entry = add_hash_entry(hash, ip); @@ -5901,9 +6007,10 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) /* Make a copy hash to place the new and the old entries in */ size = hash->count + direct_functions->count; - if (size > 32) - size = 32; - new_hash = alloc_ftrace_hash(fls(size)); + size = fls(size); + if (size > FTRACE_HASH_MAX_BITS) + size = FTRACE_HASH_MAX_BITS; + new_hash = alloc_ftrace_hash(size); if (!new_hash) goto out_unlock; @@ -6842,6 +6949,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer) } } } + cond_resched(); } while_for_each_ftrace_rec(); return fail ? -EINVAL : 0; @@ -7005,6 +7113,7 @@ static int ftrace_process_locs(struct module *mod, unsigned long *p; unsigned long addr; unsigned long flags = 0; /* Shut up gcc */ + unsigned long pages; int ret = -ENOMEM; count = end - start; @@ -7012,6 +7121,8 @@ static int ftrace_process_locs(struct module *mod, if (!count) return 0; + pages = DIV_ROUND_UP(count, ENTRIES_PER_PAGE); + /* * Sorting mcount in vmlinux at build time depend on * CONFIG_BUILDTIME_MCOUNT_SORT, while mcount loc in @@ -7056,7 +7167,9 @@ static int ftrace_process_locs(struct module *mod, pg = start_pg; while (p < end) { unsigned long end_offset; - addr = ftrace_call_adjust(*p++); + + addr = *p++; + /* * Some architecture linkers will pad between * the different mcount_loc sections of different @@ -7068,6 +7181,19 @@ static int ftrace_process_locs(struct module *mod, continue; } + /* + * If this is core kernel, make sure the address is in core + * or inittext, as weak functions get zeroed and KASLR can + * move them to something other than zero. It just will not + * move it to an area where kernel text is. + */ + if (!mod && !(is_kernel_text(addr) || is_kernel_inittext(addr))) { + skipped++; + continue; + } + + addr = ftrace_call_adjust(addr); + end_offset = (pg->index+1) * sizeof(pg->records[0]); if (end_offset > PAGE_SIZE << pg->order) { /* We should have allocated enough */ @@ -7107,11 +7233,41 @@ static int ftrace_process_locs(struct module *mod, /* We should have used all pages unless we skipped some */ if (pg_unuse) { - WARN_ON(!skipped); + unsigned long pg_remaining, remaining = 0; + unsigned long skip; + + /* Count the number of entries unused and compare it to skipped. */ + pg_remaining = (ENTRIES_PER_PAGE << pg->order) - pg->index; + + if (!WARN(skipped < pg_remaining, "Extra allocated pages for ftrace")) { + + skip = skipped - pg_remaining; + + for (pg = pg_unuse; pg; pg = pg->next) + remaining += 1 << pg->order; + + pages -= remaining; + + skip = DIV_ROUND_UP(skip, ENTRIES_PER_PAGE); + + /* + * Check to see if the number of pages remaining would + * just fit the number of entries skipped. + */ + WARN(skip != remaining, "Extra allocated pages for ftrace: %lu with %lu skipped", + remaining, skipped); + } /* Need to synchronize with ftrace_location_range() */ synchronize_rcu(); ftrace_free_pages(pg_unuse); } + + if (!mod) { + count -= skipped; + pr_info("ftrace: allocating %ld entries in %ld pages\n", + count, pages); + } + return ret; } @@ -7282,9 +7438,10 @@ void ftrace_release_mod(struct module *mod) mutex_lock(&ftrace_lock); - if (ftrace_disabled) - goto out_unlock; - + /* + * To avoid the UAF problem after the module is unloaded, the + * 'mod_map' resource needs to be released unconditionally. + */ list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) { if (mod_map->mod == mod) { list_del_rcu(&mod_map->list); @@ -7293,6 +7450,9 @@ void ftrace_release_mod(struct module *mod) } } + if (ftrace_disabled) + goto out_unlock; + /* * Each module has its own ftrace_pages, remove * them from the list. @@ -7471,6 +7631,9 @@ allocate_ftrace_mod_map(struct module *mod, { struct ftrace_mod_map *mod_map; + if (ftrace_disabled) + return NULL; + mod_map = kmalloc(sizeof(*mod_map), GFP_KERNEL); if (!mod_map) return NULL; @@ -7757,9 +7920,6 @@ void __init ftrace_init(void) goto failed; } - pr_info("ftrace: allocating %ld entries in %ld pages\n", - count, DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); - ret = ftrace_process_locs(NULL, __start_mcount_loc, __stop_mcount_loc); diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index c62b9b3cfb3d..090bb5ea4a19 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -81,13 +81,9 @@ static inline bool upper_empty(union upper_chunk *chunk) { /* * If chunk->data has no lower chunks, it will be the same - * as a zeroed bitmask. Use find_first_bit() to test it - * and if it doesn't find any bits set, then the array - * is empty. + * as a zeroed bitmask. */ - int bit = find_first_bit((unsigned long *)chunk->data, - sizeof(chunk->data) * 8); - return bit >= sizeof(chunk->data) * 8; + return bitmap_empty((unsigned long *)chunk->data, BITS_PER_TYPE(chunk->data)); } static inline int pid_split(unsigned int pid, unsigned int *upper1, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bb6089c2951e..00fc38d70e86 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -31,6 +31,7 @@ #include <asm/local64.h> #include <asm/local.h> +#include <asm/setup.h> #include "trace.h" @@ -48,9 +49,12 @@ static void update_pages_handler(struct work_struct *work); struct ring_buffer_meta { int magic; - int struct_size; - unsigned long text_addr; - unsigned long data_addr; + int struct_sizes; + unsigned long total_size; + unsigned long buffers_offset; +}; + +struct ring_buffer_cpu_meta { unsigned long first_buffer; unsigned long head_buffer; unsigned long commit_buffer; @@ -517,7 +521,7 @@ struct ring_buffer_per_cpu { struct mutex mapping_lock; unsigned long *subbuf_ids; /* ID to subbuf VA */ struct trace_buffer_meta *meta_page; - struct ring_buffer_meta *ring_meta; + struct ring_buffer_cpu_meta *ring_meta; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; @@ -550,8 +554,7 @@ struct trace_buffer { unsigned long range_addr_start; unsigned long range_addr_end; - long last_text_delta; - long last_data_delta; + struct ring_buffer_meta *meta; unsigned int subbuf_size; unsigned int subbuf_order; @@ -1271,7 +1274,7 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) rb_set_list_to_head(head->list.prev); if (cpu_buffer->ring_meta) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; meta->head_buffer = (unsigned long)head->page; } } @@ -1569,7 +1572,7 @@ out_locked: static unsigned long rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs) { - addr += sizeof(struct ring_buffer_meta) + + addr += sizeof(struct ring_buffer_cpu_meta) + sizeof(int) * nr_subbufs; return ALIGN(addr, subbuf_size); } @@ -1580,19 +1583,22 @@ rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs) static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu) { int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE; - unsigned long ptr = buffer->range_addr_start; - struct ring_buffer_meta *meta; + struct ring_buffer_cpu_meta *meta; + struct ring_buffer_meta *bmeta; + unsigned long ptr; int nr_subbufs; - if (!ptr) + bmeta = buffer->meta; + if (!bmeta) return NULL; + ptr = (unsigned long)bmeta + bmeta->buffers_offset; + meta = (struct ring_buffer_cpu_meta *)ptr; + /* When nr_pages passed in is zero, the first meta has already been initialized */ if (!nr_pages) { - meta = (struct ring_buffer_meta *)ptr; nr_subbufs = meta->nr_subbufs; } else { - meta = NULL; /* Include the reader page */ nr_subbufs = nr_pages + 1; } @@ -1624,7 +1630,7 @@ static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu) } /* Return the start of subbufs given the meta pointer */ -static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta) +static void *rb_subbufs_from_meta(struct ring_buffer_cpu_meta *meta) { int subbuf_size = meta->subbuf_size; unsigned long ptr; @@ -1640,7 +1646,7 @@ static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta) */ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) { - struct ring_buffer_meta *meta; + struct ring_buffer_cpu_meta *meta; unsigned long ptr; int subbuf_size; @@ -1666,14 +1672,77 @@ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) } /* + * See if the existing memory contains a valid meta section. + * if so, use that, otherwise initialize it. + */ +static bool rb_meta_init(struct trace_buffer *buffer, int scratch_size) +{ + unsigned long ptr = buffer->range_addr_start; + struct ring_buffer_meta *bmeta; + unsigned long total_size; + int struct_sizes; + + bmeta = (struct ring_buffer_meta *)ptr; + buffer->meta = bmeta; + + total_size = buffer->range_addr_end - buffer->range_addr_start; + + struct_sizes = sizeof(struct ring_buffer_cpu_meta); + struct_sizes |= sizeof(*bmeta) << 16; + + /* The first buffer will start word size after the meta page */ + ptr += sizeof(*bmeta); + ptr = ALIGN(ptr, sizeof(long)); + ptr += scratch_size; + + if (bmeta->magic != RING_BUFFER_META_MAGIC) { + pr_info("Ring buffer boot meta mismatch of magic\n"); + goto init; + } + + if (bmeta->struct_sizes != struct_sizes) { + pr_info("Ring buffer boot meta mismatch of struct size\n"); + goto init; + } + + if (bmeta->total_size != total_size) { + pr_info("Ring buffer boot meta mismatch of total size\n"); + goto init; + } + + if (bmeta->buffers_offset > bmeta->total_size) { + pr_info("Ring buffer boot meta mismatch of offset outside of total size\n"); + goto init; + } + + if (bmeta->buffers_offset != (void *)ptr - (void *)bmeta) { + pr_info("Ring buffer boot meta mismatch of first buffer offset\n"); + goto init; + } + + return true; + + init: + bmeta->magic = RING_BUFFER_META_MAGIC; + bmeta->struct_sizes = struct_sizes; + bmeta->total_size = total_size; + bmeta->buffers_offset = (void *)ptr - (void *)bmeta; + + /* Zero out the scatch pad */ + memset((void *)bmeta + sizeof(*bmeta), 0, bmeta->buffers_offset - sizeof(*bmeta)); + + return false; +} + +/* * See if the existing memory contains valid ring buffer data. * As the previous kernel must be the same as this kernel, all * the calculations (size of buffers and number of buffers) * must be the same. */ -static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, - struct trace_buffer *buffer, int nr_pages, - unsigned long *subbuf_mask) +static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu, + struct trace_buffer *buffer, int nr_pages, + unsigned long *subbuf_mask) { int subbuf_size = PAGE_SIZE; struct buffer_data_page *subbuf; @@ -1684,20 +1753,6 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, if (!subbuf_mask) return false; - /* Check the meta magic and meta struct size */ - if (meta->magic != RING_BUFFER_META_MAGIC || - meta->struct_size != sizeof(*meta)) { - pr_info("Ring buffer boot meta[%d] mismatch of magic or struct size\n", cpu); - return false; - } - - /* The subbuffer's size and number of subbuffers must match */ - if (meta->subbuf_size != subbuf_size || - meta->nr_subbufs != nr_pages + 1) { - pr_info("Ring buffer boot meta [%d] mismatch of subbuf_size/nr_pages\n", cpu); - return false; - } - buffers_start = meta->first_buffer; buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs); @@ -1743,7 +1798,7 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, return true; } -static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf); +static int rb_meta_subbuf_idx(struct ring_buffer_cpu_meta *meta, void *subbuf); static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu, unsigned long long *timestamp, u64 *delta_ptr) @@ -1810,7 +1865,7 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu) /* If the meta data has been validated, now validate the events */ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; struct buffer_page *head_page; unsigned long entry_bytes = 0; unsigned long entries = 0; @@ -1832,10 +1887,12 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) head_page = cpu_buffer->head_page; - /* If both the head and commit are on the reader_page then we are done. */ - if (head_page == cpu_buffer->reader_page && - head_page == cpu_buffer->commit_page) + /* If the commit_buffer is the reader page, update the commit page */ + if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) { + cpu_buffer->commit_page = cpu_buffer->reader_page; + /* Nothing more to do, the only page is the reader page */ goto done; + } /* Iterate until finding the commit page */ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) { @@ -1891,24 +1948,13 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) } } -/* Used to calculate data delta */ -static char rb_data_ptr[] = ""; - -#define THIS_TEXT_PTR ((unsigned long)rb_meta_init_text_addr) -#define THIS_DATA_PTR ((unsigned long)rb_data_ptr) - -static void rb_meta_init_text_addr(struct ring_buffer_meta *meta) -{ - meta->text_addr = THIS_TEXT_PTR; - meta->data_addr = THIS_DATA_PTR; -} - -static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) +static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages, int scratch_size) { - struct ring_buffer_meta *meta; + struct ring_buffer_cpu_meta *meta; unsigned long *subbuf_mask; unsigned long delta; void *subbuf; + bool valid = false; int cpu; int i; @@ -1916,20 +1962,21 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) subbuf_mask = bitmap_alloc(nr_pages + 1, GFP_KERNEL); /* If subbuf_mask fails to allocate, then rb_meta_valid() will return false */ + if (rb_meta_init(buffer, scratch_size)) + valid = true; + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { void *next_meta; meta = rb_range_meta(buffer, nr_pages, cpu); - if (rb_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) { + if (valid && rb_cpu_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) { /* Make the mappings match the current address */ subbuf = rb_subbufs_from_meta(meta); delta = (unsigned long)subbuf - meta->first_buffer; meta->first_buffer += delta; meta->head_buffer += delta; meta->commit_buffer += delta; - buffer->last_text_delta = THIS_TEXT_PTR - meta->text_addr; - buffer->last_data_delta = THIS_DATA_PTR - meta->data_addr; continue; } @@ -1940,16 +1987,12 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) memset(meta, 0, next_meta - (void *)meta); - meta->magic = RING_BUFFER_META_MAGIC; - meta->struct_size = sizeof(*meta); - meta->nr_subbufs = nr_pages + 1; meta->subbuf_size = PAGE_SIZE; subbuf = rb_subbufs_from_meta(meta); meta->first_buffer = (unsigned long)subbuf; - rb_meta_init_text_addr(meta); /* * The buffers[] array holds the order of the sub-buffers @@ -1971,7 +2014,7 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) static void *rbm_start(struct seq_file *m, loff_t *pos) { struct ring_buffer_per_cpu *cpu_buffer = m->private; - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; unsigned long val; if (!meta) @@ -1996,7 +2039,7 @@ static void *rbm_next(struct seq_file *m, void *v, loff_t *pos) static int rbm_show(struct seq_file *m, void *v) { struct ring_buffer_per_cpu *cpu_buffer = m->private; - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; unsigned long val = (unsigned long)v; if (val == 1) { @@ -2045,7 +2088,7 @@ int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, in static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; if (meta->head_buffer == (unsigned long)bpage->page) cpu_buffer->head_page = bpage; @@ -2060,7 +2103,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, long nr_pages, struct list_head *pages) { struct trace_buffer *buffer = cpu_buffer->buffer; - struct ring_buffer_meta *meta = NULL; + struct ring_buffer_cpu_meta *meta = NULL; struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; gfp_t mflags; @@ -2183,8 +2226,8 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) { - struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_meta *meta; + struct ring_buffer_per_cpu *cpu_buffer __free(kfree) = NULL; + struct ring_buffer_cpu_meta *meta; struct buffer_page *bpage; struct page *page; int ret; @@ -2209,7 +2252,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); if (!bpage) - goto fail_free_buffer; + return NULL; rb_check_bpage(cpu_buffer, bpage); @@ -2275,13 +2318,11 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) rb_head_page_activate(cpu_buffer); } - return cpu_buffer; + return_ptr(cpu_buffer); fail_free_reader: free_buffer_page(cpu_buffer->reader_page); - fail_free_buffer: - kfree(cpu_buffer); return NULL; } @@ -2313,9 +2354,10 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long end, + unsigned long scratch_size, struct lock_class_key *key) { - struct trace_buffer *buffer; + struct trace_buffer *buffer __free(kfree) = NULL; long nr_pages; int subbuf_size; int bsize; @@ -2329,7 +2371,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, return NULL; if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) - goto fail_free_buffer; + return NULL; buffer->subbuf_order = order; subbuf_size = (PAGE_SIZE << order); @@ -2355,10 +2397,23 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, /* If start/end are specified, then that overrides size */ if (start && end) { + unsigned long buffers_start; unsigned long ptr; int n; - size = end - start; + /* Make sure that start is word aligned */ + start = ALIGN(start, sizeof(long)); + + /* scratch_size needs to be aligned too */ + scratch_size = ALIGN(scratch_size, sizeof(long)); + + /* Subtract the buffer meta data and word aligned */ + buffers_start = start + sizeof(struct ring_buffer_cpu_meta); + buffers_start = ALIGN(buffers_start, sizeof(long)); + buffers_start += scratch_size; + + /* Calculate the size for the per CPU data */ + size = end - buffers_start; size = size / nr_cpu_ids; /* @@ -2368,7 +2423,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, * needed, plus account for the integer array index that * will be appended to the meta data. */ - nr_pages = (size - sizeof(struct ring_buffer_meta)) / + nr_pages = (size - sizeof(struct ring_buffer_cpu_meta)) / (subbuf_size + sizeof(int)); /* Need at least two pages plus the reader page */ if (nr_pages < 3) @@ -2376,8 +2431,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, again: /* Make sure that the size fits aligned */ - for (n = 0, ptr = start; n < nr_cpu_ids; n++) { - ptr += sizeof(struct ring_buffer_meta) + + for (n = 0, ptr = buffers_start; n < nr_cpu_ids; n++) { + ptr += sizeof(struct ring_buffer_cpu_meta) + sizeof(int) * nr_pages; ptr = ALIGN(ptr, subbuf_size); ptr += subbuf_size * nr_pages; @@ -2394,7 +2449,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, buffer->range_addr_start = start; buffer->range_addr_end = end; - rb_range_meta_init(buffer, nr_pages); + rb_range_meta_init(buffer, nr_pages, scratch_size); } else { /* need at least two pages */ @@ -2415,7 +2470,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, mutex_init(&buffer->mutex); - return buffer; + return_ptr(buffer); fail_free_buffers: for_each_buffer_cpu(buffer, cpu) { @@ -2427,8 +2482,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, fail_free_cpumask: free_cpumask_var(buffer->cpumask); - fail_free_buffer: - kfree(buffer); return NULL; } @@ -2447,7 +2500,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key) { /* Default buffer page size - one system page */ - return alloc_buffer(size, flags, 0, 0, 0,key); + return alloc_buffer(size, flags, 0, 0, 0, 0, key); } EXPORT_SYMBOL_GPL(__ring_buffer_alloc); @@ -2459,6 +2512,7 @@ EXPORT_SYMBOL_GPL(__ring_buffer_alloc); * @order: sub-buffer order * @start: start of allocated range * @range_size: size of allocated range + * @scratch_size: size of scratch area (for preallocated memory buffers) * @key: ring buffer reader_lock_key. * * Currently the only flag that is available is the RB_FL_OVERWRITE @@ -2469,32 +2523,29 @@ EXPORT_SYMBOL_GPL(__ring_buffer_alloc); struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long range_size, + unsigned long scratch_size, struct lock_class_key *key) { - return alloc_buffer(size, flags, order, start, start + range_size, key); + return alloc_buffer(size, flags, order, start, start + range_size, + scratch_size, key); } -/** - * ring_buffer_last_boot_delta - return the delta offset from last boot - * @buffer: The buffer to return the delta from - * @text: Return text delta - * @data: Return data delta - * - * Returns: The true if the delta is non zero - */ -bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text, - long *data) +void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size) { - if (!buffer) - return false; + struct ring_buffer_meta *meta; + void *ptr; - if (!buffer->last_text_delta) - return false; + if (!buffer || !buffer->meta) + return NULL; - *text = buffer->last_text_delta; - *data = buffer->last_data_delta; + meta = buffer->meta; - return true; + ptr = (void *)ALIGN((unsigned long)meta + sizeof(*meta), sizeof(long)); + + if (size) + *size = (void *)meta + meta->buffers_offset - ptr; + + return ptr; } /** @@ -2794,6 +2845,12 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, if (nr_pages < 2) nr_pages = 2; + /* + * Keep CPUs from coming online while resizing to synchronize + * with new per CPU buffers being created. + */ + guard(cpus_read_lock)(); + /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); atomic_inc(&buffer->resizing); @@ -2838,7 +2895,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, cond_resched(); } - cpus_read_lock(); /* * Fire off all the required work handlers * We can't schedule on offline CPUs, but it's not necessary @@ -2878,7 +2934,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, cpu_buffer->nr_pages_to_update = 0; } - cpus_read_unlock(); } else { cpu_buffer = buffer->buffers[cpu_id]; @@ -2906,8 +2961,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, goto out_err; } - cpus_read_lock(); - /* Can't run something on an offline CPU. */ if (!cpu_online(cpu_id)) rb_update_pages(cpu_buffer); @@ -2926,7 +2979,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, } cpu_buffer->nr_pages_to_update = 0; - cpus_read_unlock(); } out: @@ -3105,7 +3157,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) } /* Return the index into the sub-buffers for a given sub-buffer */ -static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf) +static int rb_meta_subbuf_idx(struct ring_buffer_cpu_meta *meta, void *subbuf) { void *subbuf_array; @@ -3117,7 +3169,7 @@ static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf) static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *next_page) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; unsigned long old_head = (unsigned long)next_page->page; unsigned long new_head; @@ -3134,7 +3186,7 @@ static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer, static void rb_update_meta_reader(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *reader) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; void *old_reader = cpu_buffer->reader_page->page; void *new_reader = reader->page; int id; @@ -3763,7 +3815,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) rb_page_write(cpu_buffer->commit_page)); rb_inc_page(&cpu_buffer->commit_page); if (cpu_buffer->ring_meta) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; meta->commit_buffer = (unsigned long)cpu_buffer->commit_page->page; } /* add barrier to keep gcc from optimizing too much */ @@ -4629,10 +4681,7 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer, RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); rb_decrement_entry(cpu_buffer, event); - if (rb_try_to_discard(cpu_buffer, event)) - goto out; - - out: + rb_try_to_discard(cpu_buffer, event); rb_end_commit(cpu_buffer); trace_recursive_unlock(cpu_buffer); @@ -4830,6 +4879,24 @@ bool ring_buffer_record_is_set_on(struct trace_buffer *buffer) } /** + * ring_buffer_record_is_on_cpu - return true if the ring buffer can write + * @buffer: The ring buffer to see if write is enabled + * @cpu: The CPU to test if the ring buffer can write too + * + * Returns true if the ring buffer is in a state that it accepts writes + * for a particular CPU. + */ +bool ring_buffer_record_is_on_cpu(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + cpu_buffer = buffer->buffers[cpu]; + + return ring_buffer_record_is_set_on(buffer) && + !atomic_read(&cpu_buffer->record_disabled); +} + +/** * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer * @buffer: The ring buffer to stop writes to. * @cpu: The CPU buffer to stop @@ -5318,7 +5385,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) * moving it. The page before the header page has the * flag bit '1' set if it is pointing to the page we want. * but if the writer is in the process of moving it - * than it will be '2' or already moved '0'. + * then it will be '2' or already moved '0'. */ ret = rb_head_page_replace(reader, cpu_buffer->reader_page); @@ -5947,6 +6014,39 @@ static void rb_clear_buffer_page(struct buffer_page *page) page->read = 0; } +/* + * When the buffer is memory mapped to user space, each sub buffer + * has a unique id that is used by the meta data to tell the user + * where the current reader page is. + * + * For a normal allocated ring buffer, the id is saved in the buffer page + * id field, and updated via this function. + * + * But for a fixed memory mapped buffer, the id is already assigned for + * fixed memory ording in the memory layout and can not be used. Instead + * the index of where the page lies in the memory layout is used. + * + * For the normal pages, set the buffer page id with the passed in @id + * value and return that. + * + * For fixed memory mapped pages, get the page index in the memory layout + * and return that as the id. + */ +static int rb_page_id(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *bpage, int id) +{ + /* + * For boot buffers, the id is the index, + * otherwise, set the buffer page with this id + */ + if (cpu_buffer->ring_meta) + id = rb_meta_subbuf_idx(cpu_buffer->ring_meta, bpage->page); + else + bpage->id = id; + + return id; +} + static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) { struct trace_buffer_meta *meta = cpu_buffer->meta_page; @@ -5955,7 +6055,9 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) return; meta->reader.read = cpu_buffer->reader_page->read; - meta->reader.id = cpu_buffer->reader_page->id; + meta->reader.id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, + cpu_buffer->reader_page->id); + meta->reader.lost_events = cpu_buffer->lost_events; meta->entries = local_read(&cpu_buffer->entries); @@ -5963,7 +6065,7 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) meta->read = cpu_buffer->read; /* Some archs do not have data cache coherency between kernel and user-space */ - flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page)); + flush_kernel_vmap_range(cpu_buffer->meta_page, PAGE_SIZE); } static void @@ -6016,7 +6118,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) if (cpu_buffer->mapped) { rb_update_meta_page(cpu_buffer); if (cpu_buffer->ring_meta) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; meta->commit_buffer = meta->head_buffer; } } @@ -6025,21 +6127,16 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) /* Must have disabled the cpu buffer then done a synchronize_rcu */ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { - unsigned long flags; - - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) - goto out; + return; arch_spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); arch_spin_unlock(&cpu_buffer->lock); - - out: - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } /** @@ -6050,7 +6147,6 @@ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; - struct ring_buffer_meta *meta; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; @@ -6069,11 +6165,6 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->resize_disabled); - /* Make sure persistent meta now uses this buffer's addresses */ - meta = rb_range_meta(buffer, 0, cpu_buffer->cpu); - if (meta) - rb_meta_init_text_addr(meta); - mutex_unlock(&buffer->mutex); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -6088,7 +6179,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_meta *meta; int cpu; /* prevent another thread from changing buffer sizes */ @@ -6116,11 +6206,6 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) reset_disabled_cpu_buffer(cpu_buffer); - /* Make sure persistent meta now uses this buffer's addresses */ - meta = rb_range_meta(buffer, 0, cpu_buffer->cpu); - if (meta) - rb_meta_init_text_addr(meta); - atomic_dec(&cpu_buffer->record_disabled); atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled); } @@ -6239,37 +6324,33 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || !cpumask_test_cpu(cpu, buffer_b->cpumask)) - goto out; + return -EINVAL; cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; /* It's up to the callers to not try to swap mapped buffers */ - if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) { - ret = -EBUSY; - goto out; - } + if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) + return -EBUSY; /* At least make sure the two buffers are somewhat the same */ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) - goto out; + return -EINVAL; if (buffer_a->subbuf_order != buffer_b->subbuf_order) - goto out; - - ret = -EAGAIN; + return -EINVAL; if (atomic_read(&buffer_a->record_disabled)) - goto out; + return -EAGAIN; if (atomic_read(&buffer_b->record_disabled)) - goto out; + return -EAGAIN; if (atomic_read(&cpu_buffer_a->record_disabled)) - goto out; + return -EAGAIN; if (atomic_read(&cpu_buffer_b->record_disabled)) - goto out; + return -EAGAIN; /* * We can't do a synchronize_rcu here because this @@ -6306,7 +6387,6 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, out_dec: atomic_dec(&cpu_buffer_a->record_disabled); atomic_dec(&cpu_buffer_b->record_disabled); -out: return ret; } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); @@ -6465,38 +6545,37 @@ int ring_buffer_read_page(struct trace_buffer *buffer, struct buffer_data_page *bpage; struct buffer_page *reader; unsigned long missed_events; - unsigned long flags; unsigned int commit; unsigned int read; u64 save_timestamp; - int ret = -1; if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return -1; /* * If len is not big enough to hold the page header, then * we can not copy anything. */ if (len <= BUF_PAGE_HDR_SIZE) - goto out; + return -1; len -= BUF_PAGE_HDR_SIZE; if (!data_page || !data_page->data) - goto out; + return -1; + if (data_page->order != buffer->subbuf_order) - goto out; + return -1; bpage = data_page->data; if (!bpage) - goto out; + return -1; - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); reader = rb_get_reader_page(cpu_buffer); if (!reader) - goto out_unlock; + return -1; event = rb_reader_event(cpu_buffer); @@ -6530,7 +6609,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, if (full && (!read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page)) - goto out_unlock; + return -1; if (len > (commit - read)) len = (commit - read); @@ -6539,7 +6618,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, size = rb_event_ts_length(event); if (len < size) - goto out_unlock; + return -1; /* save the current timestamp, since the user will need it */ save_timestamp = cpu_buffer->read_stamp; @@ -6597,7 +6676,6 @@ int ring_buffer_read_page(struct trace_buffer *buffer, if (reader->real_end) local_set(&bpage->commit, reader->real_end); } - ret = read; cpu_buffer->lost_events = 0; @@ -6624,11 +6702,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, if (commit < buffer->subbuf_size) memset(&bpage->data[commit], 0, buffer->subbuf_size - commit); - out_unlock: - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - - out: - return ret; + return read; } EXPORT_SYMBOL_GPL(ring_buffer_read_page); @@ -6721,7 +6795,7 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) old_size = buffer->subbuf_size; /* prevent another thread from changing buffer sizes */ - mutex_lock(&buffer->mutex); + guard(mutex)(&buffer->mutex); atomic_inc(&buffer->record_disabled); /* Make sure all commits have finished */ @@ -6826,7 +6900,6 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) } atomic_dec(&buffer->record_disabled); - mutex_unlock(&buffer->mutex); return 0; @@ -6835,7 +6908,6 @@ error: buffer->subbuf_size = old_size; atomic_dec(&buffer->record_disabled); - mutex_unlock(&buffer->mutex); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; @@ -6883,23 +6955,29 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, struct trace_buffer_meta *meta = cpu_buffer->meta_page; unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; struct buffer_page *first_subbuf, *subbuf; + int cnt = 0; int id = 0; - subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; - cpu_buffer->reader_page->id = id++; + id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, id); + subbuf_ids[id++] = (unsigned long)cpu_buffer->reader_page->page; + cnt++; first_subbuf = subbuf = rb_set_head_page(cpu_buffer); do { + id = rb_page_id(cpu_buffer, subbuf, id); + if (WARN_ON(id >= nr_subbufs)) break; subbuf_ids[id] = (unsigned long)subbuf->page; - subbuf->id = id; rb_inc_page(&subbuf); id++; + cnt++; } while (subbuf != first_subbuf); + WARN_ON(cnt != nr_subbufs); + /* install subbuf ID to kern VA translation */ cpu_buffer->subbuf_ids = subbuf_ids; @@ -6991,7 +7069,7 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, { unsigned long nr_subbufs, nr_pages, nr_vma_pages, pgoff = vma->vm_pgoff; unsigned int subbuf_pages, subbuf_order; - struct page **pages; + struct page **pages __free(kfree) = NULL; int p = 0, s = 0; int err; @@ -7059,10 +7137,8 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, struct page *page; int off = 0; - if (WARN_ON_ONCE(s >= nr_subbufs)) { - err = -EINVAL; - goto out; - } + if (WARN_ON_ONCE(s >= nr_subbufs)) + return -EINVAL; page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); @@ -7077,9 +7153,6 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); -out: - kfree(pages); - return err; } #else @@ -7095,36 +7168,34 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags, *subbuf_ids; - int err = 0; + int err; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; - mutex_lock(&cpu_buffer->mapping_lock); + guard(mutex)(&cpu_buffer->mapping_lock); if (cpu_buffer->user_mapped) { err = __rb_map_vma(cpu_buffer, vma); if (!err) err = __rb_inc_dec_mapped(cpu_buffer, true); - mutex_unlock(&cpu_buffer->mapping_lock); return err; } /* prevent another thread from changing buffer/sub-buffer sizes */ - mutex_lock(&buffer->mutex); + guard(mutex)(&buffer->mutex); err = rb_alloc_meta_page(cpu_buffer); if (err) - goto unlock; + return err; /* subbuf_ids include the reader while nr_pages does not */ subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL); if (!subbuf_ids) { rb_free_meta_page(cpu_buffer); - err = -ENOMEM; - goto unlock; + return -ENOMEM; } atomic_inc(&cpu_buffer->resize_disabled); @@ -7152,35 +7223,29 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, atomic_dec(&cpu_buffer->resize_disabled); } -unlock: - mutex_unlock(&buffer->mutex); - mutex_unlock(&cpu_buffer->mapping_lock); - - return err; + return 0; } int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; - int err = 0; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; - mutex_lock(&cpu_buffer->mapping_lock); + guard(mutex)(&cpu_buffer->mapping_lock); if (!cpu_buffer->user_mapped) { - err = -ENODEV; - goto out; + return -ENODEV; } else if (cpu_buffer->user_mapped > 1) { __rb_inc_dec_mapped(cpu_buffer, false); - goto out; + return 0; } - mutex_lock(&buffer->mutex); + guard(mutex)(&buffer->mutex); raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* This is the last user space mapping */ @@ -7195,12 +7260,7 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) rb_free_meta_page(cpu_buffer); atomic_dec(&cpu_buffer->resize_disabled); - mutex_unlock(&buffer->mutex); - -out: - mutex_unlock(&cpu_buffer->mapping_lock); - - return err; + return 0; } int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu) @@ -7241,8 +7301,8 @@ consume: /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; - if (cpu_buffer->reader_page != cpu_buffer->commit_page) { - if (missed_events) { + if (missed_events) { + if (cpu_buffer->reader_page != cpu_buffer->commit_page) { struct buffer_data_page *bpage = reader->page; unsigned int commit; /* @@ -7263,13 +7323,23 @@ consume: local_add(RB_MISSED_STORED, &bpage->commit); } local_add(RB_MISSED_EVENTS, &bpage->commit); + } else if (!WARN_ONCE(cpu_buffer->reader_page == cpu_buffer->tail_page, + "Reader on commit with %ld missed events", + missed_events)) { + /* + * There shouldn't be any missed events if the tail_page + * is on the reader page. But if the tail page is not on the + * reader page and the commit_page is, that would mean that + * there's a commit_overrun (an interrupt preempted an + * addition of an event and then filled the buffer + * with new events). In this case it's not an + * error, but it should still be reported. + * + * TODO: Add missed events to the page for user space to know. + */ + pr_info("Ring buffer [%d] commit overrun lost %ld events at timestamp:%lld\n", + cpu, missed_events, cpu_buffer->reader_page->page->time_stamp); } - } else { - /* - * There really shouldn't be any missed events if the commit - * is on the reader page. - */ - WARN_ON_ONCE(missed_events); } cpu_buffer->lost_events = 0; @@ -7278,7 +7348,8 @@ consume: out: /* Some archs do not have data cache coherency between kernel and user-space */ - flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page)); + flush_kernel_vmap_range(cpu_buffer->reader_page->page, + buffer->subbuf_size + BUF_PAGE_HDR_SIZE); rb_update_meta_page(cpu_buffer); @@ -7411,9 +7482,9 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested) /* Ignore dropped events before test starts. */ if (started) { if (nested) - data->bytes_dropped += len; - else data->bytes_dropped_nested += len; + else + data->bytes_dropped += len; } return len; } diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 8226352a0062..b39f36013ef2 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -27,6 +27,13 @@ menuconfig RV source "kernel/trace/rv/monitors/wip/Kconfig" source "kernel/trace/rv/monitors/wwnr/Kconfig" +source "kernel/trace/rv/monitors/sched/Kconfig" +source "kernel/trace/rv/monitors/tss/Kconfig" +source "kernel/trace/rv/monitors/sco/Kconfig" +source "kernel/trace/rv/monitors/snroc/Kconfig" +source "kernel/trace/rv/monitors/scpd/Kconfig" +source "kernel/trace/rv/monitors/snep/Kconfig" +source "kernel/trace/rv/monitors/sncid/Kconfig" # Add new monitors here config RV_REACTORS diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index 188b64668e1f..f9b2cd0483c3 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -5,6 +5,13 @@ ccflags-y += -I $(src) # needed for trace events obj-$(CONFIG_RV) += rv.o obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o +obj-$(CONFIG_RV_MON_SCHED) += monitors/sched/sched.o +obj-$(CONFIG_RV_MON_TSS) += monitors/tss/tss.o +obj-$(CONFIG_RV_MON_SCO) += monitors/sco/sco.o +obj-$(CONFIG_RV_MON_SNROC) += monitors/snroc/snroc.o +obj-$(CONFIG_RV_MON_SCPD) += monitors/scpd/scpd.o +obj-$(CONFIG_RV_MON_SNEP) += monitors/snep/snep.o +obj-$(CONFIG_RV_MON_SNCID) += monitors/sncid/sncid.o # Add new monitors here obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o diff --git a/kernel/trace/rv/monitors/sched/Kconfig b/kernel/trace/rv/monitors/sched/Kconfig new file mode 100644 index 000000000000..ae3eb410abd7 --- /dev/null +++ b/kernel/trace/rv/monitors/sched/Kconfig @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SCHED + depends on RV + bool "sched monitor" + help + Collection of monitors to check the scheduler behaves according to specifications. + Enable this to enable all scheduler specification supported by the current kernel. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/sched/sched.c b/kernel/trace/rv/monitors/sched/sched.c new file mode 100644 index 000000000000..905e03c3c934 --- /dev/null +++ b/kernel/trace/rv/monitors/sched/sched.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> + +#define MODULE_NAME "sched" + +#include "sched.h" + +struct rv_monitor rv_sched; + +struct rv_monitor rv_sched = { + .name = "sched", + .description = "container for several scheduler monitor specifications.", + .enable = NULL, + .disable = NULL, + .reset = NULL, + .enabled = 0, +}; + +static int __init register_sched(void) +{ + rv_register_monitor(&rv_sched, NULL); + return 0; +} + +static void __exit unregister_sched(void) +{ + rv_unregister_monitor(&rv_sched); +} + +module_init(register_sched); +module_exit(unregister_sched); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("sched: container for several scheduler monitor specifications."); diff --git a/kernel/trace/rv/monitors/sched/sched.h b/kernel/trace/rv/monitors/sched/sched.h new file mode 100644 index 000000000000..ba148dd8d48b --- /dev/null +++ b/kernel/trace/rv/monitors/sched/sched.h @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +extern struct rv_monitor rv_sched; diff --git a/kernel/trace/rv/monitors/sco/Kconfig b/kernel/trace/rv/monitors/sco/Kconfig new file mode 100644 index 000000000000..097c96cccdd7 --- /dev/null +++ b/kernel/trace/rv/monitors/sco/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SCO + depends on RV + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "sco monitor" + help + Monitor to ensure sched_set_state happens only in thread context. + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/sco/sco.c b/kernel/trace/rv/monitors/sco/sco.c new file mode 100644 index 000000000000..4cff59220bfc --- /dev/null +++ b/kernel/trace/rv/monitors/sco/sco.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "sco" + +#include <trace/events/sched.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "sco.h" + +static struct rv_monitor rv_sco; +DECLARE_DA_MON_PER_CPU(sco, unsigned char); + +static void handle_sched_set_state(void *data, struct task_struct *tsk, int state) +{ + da_handle_start_event_sco(sched_set_state_sco); +} + +static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +{ + da_handle_event_sco(schedule_entry_sco); +} + +static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +{ + da_handle_start_event_sco(schedule_exit_sco); +} + +static int enable_sco(void) +{ + int retval; + + retval = da_monitor_init_sco(); + if (retval) + return retval; + + rv_attach_trace_probe("sco", sched_set_state_tp, handle_sched_set_state); + rv_attach_trace_probe("sco", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("sco", sched_exit_tp, handle_schedule_exit); + + return 0; +} + +static void disable_sco(void) +{ + rv_sco.enabled = 0; + + rv_detach_trace_probe("sco", sched_set_state_tp, handle_sched_set_state); + rv_detach_trace_probe("sco", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("sco", sched_exit_tp, handle_schedule_exit); + + da_monitor_destroy_sco(); +} + +static struct rv_monitor rv_sco = { + .name = "sco", + .description = "scheduling context operations.", + .enable = enable_sco, + .disable = disable_sco, + .reset = da_monitor_reset_all_sco, + .enabled = 0, +}; + +static int __init register_sco(void) +{ + rv_register_monitor(&rv_sco, &rv_sched); + return 0; +} + +static void __exit unregister_sco(void) +{ + rv_unregister_monitor(&rv_sco); +} + +module_init(register_sco); +module_exit(unregister_sco); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("sco: scheduling context operations."); diff --git a/kernel/trace/rv/monitors/sco/sco.h b/kernel/trace/rv/monitors/sco/sco.h new file mode 100644 index 000000000000..7a4c1f2d5ca1 --- /dev/null +++ b/kernel/trace/rv/monitors/sco/sco.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of sco automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_sco { + thread_context_sco = 0, + scheduling_context_sco, + state_max_sco +}; + +#define INVALID_STATE state_max_sco + +enum events_sco { + sched_set_state_sco = 0, + schedule_entry_sco, + schedule_exit_sco, + event_max_sco +}; + +struct automaton_sco { + char *state_names[state_max_sco]; + char *event_names[event_max_sco]; + unsigned char function[state_max_sco][event_max_sco]; + unsigned char initial_state; + bool final_states[state_max_sco]; +}; + +static const struct automaton_sco automaton_sco = { + .state_names = { + "thread_context", + "scheduling_context" + }, + .event_names = { + "sched_set_state", + "schedule_entry", + "schedule_exit" + }, + .function = { + { thread_context_sco, scheduling_context_sco, INVALID_STATE }, + { INVALID_STATE, INVALID_STATE, thread_context_sco }, + }, + .initial_state = thread_context_sco, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/sco/sco_trace.h b/kernel/trace/rv/monitors/sco/sco_trace.h new file mode 100644 index 000000000000..b711cd9024ec --- /dev/null +++ b/kernel/trace/rv/monitors/sco/sco_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SCO +DEFINE_EVENT(event_da_monitor, event_sco, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_sco, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_SCO */ diff --git a/kernel/trace/rv/monitors/scpd/Kconfig b/kernel/trace/rv/monitors/scpd/Kconfig new file mode 100644 index 000000000000..b9114fbf680f --- /dev/null +++ b/kernel/trace/rv/monitors/scpd/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SCPD + depends on RV + depends on PREEMPT_TRACER + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "scpd monitor" + help + Monitor to ensure schedule is called with preemption disabled. + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/scpd/scpd.c b/kernel/trace/rv/monitors/scpd/scpd.c new file mode 100644 index 000000000000..cbdd6a5f8d7f --- /dev/null +++ b/kernel/trace/rv/monitors/scpd/scpd.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "scpd" + +#include <trace/events/sched.h> +#include <trace/events/preemptirq.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "scpd.h" + +static struct rv_monitor rv_scpd; +DECLARE_DA_MON_PER_CPU(scpd, unsigned char); + +static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_scpd(preempt_disable_scpd); +} + +static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_start_event_scpd(preempt_enable_scpd); +} + +static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +{ + da_handle_event_scpd(schedule_entry_scpd); +} + +static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +{ + da_handle_event_scpd(schedule_exit_scpd); +} + +static int enable_scpd(void) +{ + int retval; + + retval = da_monitor_init_scpd(); + if (retval) + return retval; + + rv_attach_trace_probe("scpd", preempt_disable, handle_preempt_disable); + rv_attach_trace_probe("scpd", preempt_enable, handle_preempt_enable); + rv_attach_trace_probe("scpd", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("scpd", sched_exit_tp, handle_schedule_exit); + + return 0; +} + +static void disable_scpd(void) +{ + rv_scpd.enabled = 0; + + rv_detach_trace_probe("scpd", preempt_disable, handle_preempt_disable); + rv_detach_trace_probe("scpd", preempt_enable, handle_preempt_enable); + rv_detach_trace_probe("scpd", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("scpd", sched_exit_tp, handle_schedule_exit); + + da_monitor_destroy_scpd(); +} + +static struct rv_monitor rv_scpd = { + .name = "scpd", + .description = "schedule called with preemption disabled.", + .enable = enable_scpd, + .disable = disable_scpd, + .reset = da_monitor_reset_all_scpd, + .enabled = 0, +}; + +static int __init register_scpd(void) +{ + rv_register_monitor(&rv_scpd, &rv_sched); + return 0; +} + +static void __exit unregister_scpd(void) +{ + rv_unregister_monitor(&rv_scpd); +} + +module_init(register_scpd); +module_exit(unregister_scpd); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("scpd: schedule called with preemption disabled."); diff --git a/kernel/trace/rv/monitors/scpd/scpd.h b/kernel/trace/rv/monitors/scpd/scpd.h new file mode 100644 index 000000000000..295f735a5811 --- /dev/null +++ b/kernel/trace/rv/monitors/scpd/scpd.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of scpd automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_scpd { + cant_sched_scpd = 0, + can_sched_scpd, + state_max_scpd +}; + +#define INVALID_STATE state_max_scpd + +enum events_scpd { + preempt_disable_scpd = 0, + preempt_enable_scpd, + schedule_entry_scpd, + schedule_exit_scpd, + event_max_scpd +}; + +struct automaton_scpd { + char *state_names[state_max_scpd]; + char *event_names[event_max_scpd]; + unsigned char function[state_max_scpd][event_max_scpd]; + unsigned char initial_state; + bool final_states[state_max_scpd]; +}; + +static const struct automaton_scpd automaton_scpd = { + .state_names = { + "cant_sched", + "can_sched" + }, + .event_names = { + "preempt_disable", + "preempt_enable", + "schedule_entry", + "schedule_exit" + }, + .function = { + { can_sched_scpd, INVALID_STATE, INVALID_STATE, INVALID_STATE }, + { INVALID_STATE, cant_sched_scpd, can_sched_scpd, can_sched_scpd }, + }, + .initial_state = cant_sched_scpd, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/scpd/scpd_trace.h b/kernel/trace/rv/monitors/scpd/scpd_trace.h new file mode 100644 index 000000000000..6b0f4aa4732e --- /dev/null +++ b/kernel/trace/rv/monitors/scpd/scpd_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SCPD +DEFINE_EVENT(event_da_monitor, event_scpd, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_scpd, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_SCPD */ diff --git a/kernel/trace/rv/monitors/sncid/Kconfig b/kernel/trace/rv/monitors/sncid/Kconfig new file mode 100644 index 000000000000..76bcfef4fd10 --- /dev/null +++ b/kernel/trace/rv/monitors/sncid/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SNCID + depends on RV + depends on IRQSOFF_TRACER + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "sncid monitor" + help + Monitor to ensure schedule is not called with interrupt disabled. + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/sncid/sncid.c b/kernel/trace/rv/monitors/sncid/sncid.c new file mode 100644 index 000000000000..f5037cd6214c --- /dev/null +++ b/kernel/trace/rv/monitors/sncid/sncid.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "sncid" + +#include <trace/events/sched.h> +#include <trace/events/preemptirq.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "sncid.h" + +static struct rv_monitor rv_sncid; +DECLARE_DA_MON_PER_CPU(sncid, unsigned char); + +static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_sncid(irq_disable_sncid); +} + +static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_start_event_sncid(irq_enable_sncid); +} + +static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +{ + da_handle_start_event_sncid(schedule_entry_sncid); +} + +static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +{ + da_handle_start_event_sncid(schedule_exit_sncid); +} + +static int enable_sncid(void) +{ + int retval; + + retval = da_monitor_init_sncid(); + if (retval) + return retval; + + rv_attach_trace_probe("sncid", irq_disable, handle_irq_disable); + rv_attach_trace_probe("sncid", irq_enable, handle_irq_enable); + rv_attach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit); + + return 0; +} + +static void disable_sncid(void) +{ + rv_sncid.enabled = 0; + + rv_detach_trace_probe("sncid", irq_disable, handle_irq_disable); + rv_detach_trace_probe("sncid", irq_enable, handle_irq_enable); + rv_detach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit); + + da_monitor_destroy_sncid(); +} + +static struct rv_monitor rv_sncid = { + .name = "sncid", + .description = "schedule not called with interrupt disabled.", + .enable = enable_sncid, + .disable = disable_sncid, + .reset = da_monitor_reset_all_sncid, + .enabled = 0, +}; + +static int __init register_sncid(void) +{ + rv_register_monitor(&rv_sncid, &rv_sched); + return 0; +} + +static void __exit unregister_sncid(void) +{ + rv_unregister_monitor(&rv_sncid); +} + +module_init(register_sncid); +module_exit(unregister_sncid); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("sncid: schedule not called with interrupt disabled."); diff --git a/kernel/trace/rv/monitors/sncid/sncid.h b/kernel/trace/rv/monitors/sncid/sncid.h new file mode 100644 index 000000000000..21304725142b --- /dev/null +++ b/kernel/trace/rv/monitors/sncid/sncid.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of sncid automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_sncid { + can_sched_sncid = 0, + cant_sched_sncid, + state_max_sncid +}; + +#define INVALID_STATE state_max_sncid + +enum events_sncid { + irq_disable_sncid = 0, + irq_enable_sncid, + schedule_entry_sncid, + schedule_exit_sncid, + event_max_sncid +}; + +struct automaton_sncid { + char *state_names[state_max_sncid]; + char *event_names[event_max_sncid]; + unsigned char function[state_max_sncid][event_max_sncid]; + unsigned char initial_state; + bool final_states[state_max_sncid]; +}; + +static const struct automaton_sncid automaton_sncid = { + .state_names = { + "can_sched", + "cant_sched" + }, + .event_names = { + "irq_disable", + "irq_enable", + "schedule_entry", + "schedule_exit" + }, + .function = { + { cant_sched_sncid, INVALID_STATE, can_sched_sncid, can_sched_sncid }, + { INVALID_STATE, can_sched_sncid, INVALID_STATE, INVALID_STATE }, + }, + .initial_state = can_sched_sncid, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/sncid/sncid_trace.h b/kernel/trace/rv/monitors/sncid/sncid_trace.h new file mode 100644 index 000000000000..3ce42a57671d --- /dev/null +++ b/kernel/trace/rv/monitors/sncid/sncid_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SNCID +DEFINE_EVENT(event_da_monitor, event_sncid, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_sncid, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_SNCID */ diff --git a/kernel/trace/rv/monitors/snep/Kconfig b/kernel/trace/rv/monitors/snep/Kconfig new file mode 100644 index 000000000000..77527f971232 --- /dev/null +++ b/kernel/trace/rv/monitors/snep/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SNEP + depends on RV + depends on PREEMPT_TRACER + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "snep monitor" + help + Monitor to ensure schedule does not enable preempt. + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/snep/snep.c b/kernel/trace/rv/monitors/snep/snep.c new file mode 100644 index 000000000000..0076ba6d7ea4 --- /dev/null +++ b/kernel/trace/rv/monitors/snep/snep.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "snep" + +#include <trace/events/sched.h> +#include <trace/events/preemptirq.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "snep.h" + +static struct rv_monitor rv_snep; +DECLARE_DA_MON_PER_CPU(snep, unsigned char); + +static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_start_event_snep(preempt_disable_snep); +} + +static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_start_event_snep(preempt_enable_snep); +} + +static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +{ + da_handle_event_snep(schedule_entry_snep); +} + +static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +{ + da_handle_start_event_snep(schedule_exit_snep); +} + +static int enable_snep(void) +{ + int retval; + + retval = da_monitor_init_snep(); + if (retval) + return retval; + + rv_attach_trace_probe("snep", preempt_disable, handle_preempt_disable); + rv_attach_trace_probe("snep", preempt_enable, handle_preempt_enable); + rv_attach_trace_probe("snep", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("snep", sched_exit_tp, handle_schedule_exit); + + return 0; +} + +static void disable_snep(void) +{ + rv_snep.enabled = 0; + + rv_detach_trace_probe("snep", preempt_disable, handle_preempt_disable); + rv_detach_trace_probe("snep", preempt_enable, handle_preempt_enable); + rv_detach_trace_probe("snep", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("snep", sched_exit_tp, handle_schedule_exit); + + da_monitor_destroy_snep(); +} + +static struct rv_monitor rv_snep = { + .name = "snep", + .description = "schedule does not enable preempt.", + .enable = enable_snep, + .disable = disable_snep, + .reset = da_monitor_reset_all_snep, + .enabled = 0, +}; + +static int __init register_snep(void) +{ + rv_register_monitor(&rv_snep, &rv_sched); + return 0; +} + +static void __exit unregister_snep(void) +{ + rv_unregister_monitor(&rv_snep); +} + +module_init(register_snep); +module_exit(unregister_snep); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("snep: schedule does not enable preempt."); diff --git a/kernel/trace/rv/monitors/snep/snep.h b/kernel/trace/rv/monitors/snep/snep.h new file mode 100644 index 000000000000..6d16b9ad931e --- /dev/null +++ b/kernel/trace/rv/monitors/snep/snep.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of snep automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_snep { + non_scheduling_context_snep = 0, + scheduling_contex_snep, + state_max_snep +}; + +#define INVALID_STATE state_max_snep + +enum events_snep { + preempt_disable_snep = 0, + preempt_enable_snep, + schedule_entry_snep, + schedule_exit_snep, + event_max_snep +}; + +struct automaton_snep { + char *state_names[state_max_snep]; + char *event_names[event_max_snep]; + unsigned char function[state_max_snep][event_max_snep]; + unsigned char initial_state; + bool final_states[state_max_snep]; +}; + +static const struct automaton_snep automaton_snep = { + .state_names = { + "non_scheduling_context", + "scheduling_contex" + }, + .event_names = { + "preempt_disable", + "preempt_enable", + "schedule_entry", + "schedule_exit" + }, + .function = { + { non_scheduling_context_snep, non_scheduling_context_snep, scheduling_contex_snep, INVALID_STATE }, + { INVALID_STATE, INVALID_STATE, INVALID_STATE, non_scheduling_context_snep }, + }, + .initial_state = non_scheduling_context_snep, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/snep/snep_trace.h b/kernel/trace/rv/monitors/snep/snep_trace.h new file mode 100644 index 000000000000..01aad49a949a --- /dev/null +++ b/kernel/trace/rv/monitors/snep/snep_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SNEP +DEFINE_EVENT(event_da_monitor, event_snep, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_snep, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_SNEP */ diff --git a/kernel/trace/rv/monitors/snroc/Kconfig b/kernel/trace/rv/monitors/snroc/Kconfig new file mode 100644 index 000000000000..6e4365a2fea3 --- /dev/null +++ b/kernel/trace/rv/monitors/snroc/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SNROC + depends on RV + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_ID + bool "snroc monitor" + help + Monitor to ensure sched_set_state happens only in the respective task's context. + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/snroc/snroc.c b/kernel/trace/rv/monitors/snroc/snroc.c new file mode 100644 index 000000000000..bb1f60d55296 --- /dev/null +++ b/kernel/trace/rv/monitors/snroc/snroc.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "snroc" + +#include <trace/events/sched.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "snroc.h" + +static struct rv_monitor rv_snroc; +DECLARE_DA_MON_PER_TASK(snroc, unsigned char); + +static void handle_sched_set_state(void *data, struct task_struct *tsk, int state) +{ + da_handle_event_snroc(tsk, sched_set_state_snroc); +} + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + da_handle_start_event_snroc(prev, sched_switch_out_snroc); + da_handle_event_snroc(next, sched_switch_in_snroc); +} + +static int enable_snroc(void) +{ + int retval; + + retval = da_monitor_init_snroc(); + if (retval) + return retval; + + rv_attach_trace_probe("snroc", sched_set_state_tp, handle_sched_set_state); + rv_attach_trace_probe("snroc", sched_switch, handle_sched_switch); + + return 0; +} + +static void disable_snroc(void) +{ + rv_snroc.enabled = 0; + + rv_detach_trace_probe("snroc", sched_set_state_tp, handle_sched_set_state); + rv_detach_trace_probe("snroc", sched_switch, handle_sched_switch); + + da_monitor_destroy_snroc(); +} + +static struct rv_monitor rv_snroc = { + .name = "snroc", + .description = "set non runnable on its own context.", + .enable = enable_snroc, + .disable = disable_snroc, + .reset = da_monitor_reset_all_snroc, + .enabled = 0, +}; + +static int __init register_snroc(void) +{ + rv_register_monitor(&rv_snroc, &rv_sched); + return 0; +} + +static void __exit unregister_snroc(void) +{ + rv_unregister_monitor(&rv_snroc); +} + +module_init(register_snroc); +module_exit(unregister_snroc); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("snroc: set non runnable on its own context."); diff --git a/kernel/trace/rv/monitors/snroc/snroc.h b/kernel/trace/rv/monitors/snroc/snroc.h new file mode 100644 index 000000000000..c3650a2b1b10 --- /dev/null +++ b/kernel/trace/rv/monitors/snroc/snroc.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of snroc automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_snroc { + other_context_snroc = 0, + own_context_snroc, + state_max_snroc +}; + +#define INVALID_STATE state_max_snroc + +enum events_snroc { + sched_set_state_snroc = 0, + sched_switch_in_snroc, + sched_switch_out_snroc, + event_max_snroc +}; + +struct automaton_snroc { + char *state_names[state_max_snroc]; + char *event_names[event_max_snroc]; + unsigned char function[state_max_snroc][event_max_snroc]; + unsigned char initial_state; + bool final_states[state_max_snroc]; +}; + +static const struct automaton_snroc automaton_snroc = { + .state_names = { + "other_context", + "own_context" + }, + .event_names = { + "sched_set_state", + "sched_switch_in", + "sched_switch_out" + }, + .function = { + { INVALID_STATE, own_context_snroc, INVALID_STATE }, + { own_context_snroc, INVALID_STATE, other_context_snroc }, + }, + .initial_state = other_context_snroc, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/snroc/snroc_trace.h b/kernel/trace/rv/monitors/snroc/snroc_trace.h new file mode 100644 index 000000000000..50114cef5122 --- /dev/null +++ b/kernel/trace/rv/monitors/snroc/snroc_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SNROC +DEFINE_EVENT(event_da_monitor_id, event_snroc, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_snroc, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); +#endif /* CONFIG_RV_MON_SNROC */ diff --git a/kernel/trace/rv/monitors/tss/Kconfig b/kernel/trace/rv/monitors/tss/Kconfig new file mode 100644 index 000000000000..479f86f52e60 --- /dev/null +++ b/kernel/trace/rv/monitors/tss/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_TSS + depends on RV + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "tss monitor" + help + Monitor to ensure sched_switch happens only in scheduling context. + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/tss/tss.c b/kernel/trace/rv/monitors/tss/tss.c new file mode 100644 index 000000000000..542787e6524f --- /dev/null +++ b/kernel/trace/rv/monitors/tss/tss.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "tss" + +#include <trace/events/sched.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "tss.h" + +static struct rv_monitor rv_tss; +DECLARE_DA_MON_PER_CPU(tss, unsigned char); + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + da_handle_event_tss(sched_switch_tss); +} + +static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +{ + da_handle_event_tss(schedule_entry_tss); +} + +static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +{ + da_handle_start_event_tss(schedule_exit_tss); +} + +static int enable_tss(void) +{ + int retval; + + retval = da_monitor_init_tss(); + if (retval) + return retval; + + rv_attach_trace_probe("tss", sched_switch, handle_sched_switch); + rv_attach_trace_probe("tss", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("tss", sched_exit_tp, handle_schedule_exit); + + return 0; +} + +static void disable_tss(void) +{ + rv_tss.enabled = 0; + + rv_detach_trace_probe("tss", sched_switch, handle_sched_switch); + rv_detach_trace_probe("tss", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("tss", sched_exit_tp, handle_schedule_exit); + + da_monitor_destroy_tss(); +} + +static struct rv_monitor rv_tss = { + .name = "tss", + .description = "task switch while scheduling.", + .enable = enable_tss, + .disable = disable_tss, + .reset = da_monitor_reset_all_tss, + .enabled = 0, +}; + +static int __init register_tss(void) +{ + rv_register_monitor(&rv_tss, &rv_sched); + return 0; +} + +static void __exit unregister_tss(void) +{ + rv_unregister_monitor(&rv_tss); +} + +module_init(register_tss); +module_exit(unregister_tss); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("tss: task switch while scheduling."); diff --git a/kernel/trace/rv/monitors/tss/tss.h b/kernel/trace/rv/monitors/tss/tss.h new file mode 100644 index 000000000000..f0a36fda1b87 --- /dev/null +++ b/kernel/trace/rv/monitors/tss/tss.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of tss automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_tss { + thread_tss = 0, + sched_tss, + state_max_tss +}; + +#define INVALID_STATE state_max_tss + +enum events_tss { + sched_switch_tss = 0, + schedule_entry_tss, + schedule_exit_tss, + event_max_tss +}; + +struct automaton_tss { + char *state_names[state_max_tss]; + char *event_names[event_max_tss]; + unsigned char function[state_max_tss][event_max_tss]; + unsigned char initial_state; + bool final_states[state_max_tss]; +}; + +static const struct automaton_tss automaton_tss = { + .state_names = { + "thread", + "sched" + }, + .event_names = { + "sched_switch", + "schedule_entry", + "schedule_exit" + }, + .function = { + { INVALID_STATE, sched_tss, INVALID_STATE }, + { sched_tss, INVALID_STATE, thread_tss }, + }, + .initial_state = thread_tss, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/tss/tss_trace.h b/kernel/trace/rv/monitors/tss/tss_trace.h new file mode 100644 index 000000000000..4619dbb50cc0 --- /dev/null +++ b/kernel/trace/rv/monitors/tss/tss_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_TSS +DEFINE_EVENT(event_da_monitor, event_tss, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_tss, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_TSS */ diff --git a/kernel/trace/rv/monitors/wip/Kconfig b/kernel/trace/rv/monitors/wip/Kconfig index 3ef664b5cd90..e464b9294865 100644 --- a/kernel/trace/rv/monitors/wip/Kconfig +++ b/kernel/trace/rv/monitors/wip/Kconfig @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +# config RV_MON_WIP depends on RV depends on PREEMPT_TRACER diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c index db7389157c87..ed758fec8608 100644 --- a/kernel/trace/rv/monitors/wip/wip.c +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -71,7 +71,7 @@ static struct rv_monitor rv_wip = { static int __init register_wip(void) { - rv_register_monitor(&rv_wip); + rv_register_monitor(&rv_wip, NULL); return 0; } diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h index 2e373f2c65ed..c7193748bf36 100644 --- a/kernel/trace/rv/monitors/wip/wip.h +++ b/kernel/trace/rv/monitors/wip/wip.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Automatically generated C representation of wip automaton * For further information about this format, see kernel documentation: diff --git a/kernel/trace/rv/monitors/wwnr/Kconfig b/kernel/trace/rv/monitors/wwnr/Kconfig index ee741aa6d6b8..d3bfc20037db 100644 --- a/kernel/trace/rv/monitors/wwnr/Kconfig +++ b/kernel/trace/rv/monitors/wwnr/Kconfig @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +# config RV_MON_WWNR depends on RV select DA_MON_EVENTS_ID diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c index 3b16994a9984..172f31c4b0f3 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.c +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -70,7 +70,7 @@ static struct rv_monitor rv_wwnr = { static int __init register_wwnr(void) { - rv_register_monitor(&rv_wwnr); + rv_register_monitor(&rv_wwnr, NULL); return 0; } diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h index d0d9c4b8121b..0a59d23edf61 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.h +++ b/kernel/trace/rv/monitors/wwnr/wwnr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Automatically generated C representation of wwnr automaton * For further information about this format, see kernel documentation: diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 8657fc8806e7..e4077500a91d 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -162,7 +162,7 @@ struct dentry *get_monitors_root(void) /* * Interface for the monitor register. */ -static LIST_HEAD(rv_monitors_list); +LIST_HEAD(rv_monitors_list); static int task_monitor_count; static bool task_monitor_slots[RV_PER_TASK_MONITORS]; @@ -207,6 +207,35 @@ void rv_put_task_monitor_slot(int slot) } /* + * Monitors with a parent are nested, + * Monitors without a parent could be standalone or containers. + */ +bool rv_is_nested_monitor(struct rv_monitor_def *mdef) +{ + return mdef->parent != NULL; +} + +/* + * We set our list to have nested monitors listed after their parent + * if a monitor has a child element its a container. + * Containers can be also identified based on their function pointers: + * as they are not real monitors they do not need function definitions + * for enable()/disable(). Use this condition to find empty containers. + * Keep both conditions in case we have some non-compliant containers. + */ +bool rv_is_container_monitor(struct rv_monitor_def *mdef) +{ + struct rv_monitor_def *next; + + if (list_is_last(&mdef->list, &rv_monitors_list)) + return false; + + next = list_next_entry(mdef, list); + + return next->parent == mdef->monitor || !mdef->monitor->enable; +} + +/* * This section collects the monitor/ files and folders. */ static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count, @@ -229,7 +258,8 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) if (mdef->monitor->enabled) { mdef->monitor->enabled = 0; - mdef->monitor->disable(); + if (mdef->monitor->disable) + mdef->monitor->disable(); /* * Wait for the execution of all events to finish. @@ -243,6 +273,60 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) return 0; } +static void rv_disable_single(struct rv_monitor_def *mdef) +{ + __rv_disable_monitor(mdef, true); +} + +static int rv_enable_single(struct rv_monitor_def *mdef) +{ + int retval; + + lockdep_assert_held(&rv_interface_lock); + + if (mdef->monitor->enabled) + return 0; + + retval = mdef->monitor->enable(); + + if (!retval) + mdef->monitor->enabled = 1; + + return retval; +} + +static void rv_disable_container(struct rv_monitor_def *mdef) +{ + struct rv_monitor_def *p = mdef; + int enabled = 0; + + list_for_each_entry_continue(p, &rv_monitors_list, list) { + if (p->parent != mdef->monitor) + break; + enabled += __rv_disable_monitor(p, false); + } + if (enabled) + tracepoint_synchronize_unregister(); + mdef->monitor->enabled = 0; +} + +static int rv_enable_container(struct rv_monitor_def *mdef) +{ + struct rv_monitor_def *p = mdef; + int retval = 0; + + list_for_each_entry_continue(p, &rv_monitors_list, list) { + if (retval || p->parent != mdef->monitor) + break; + retval = rv_enable_single(p); + } + if (retval) + rv_disable_container(mdef); + else + mdef->monitor->enabled = 1; + return retval; +} + /** * rv_disable_monitor - disable a given runtime monitor * @mdef: Pointer to the monitor definition structure. @@ -251,7 +335,11 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) */ int rv_disable_monitor(struct rv_monitor_def *mdef) { - __rv_disable_monitor(mdef, true); + if (rv_is_container_monitor(mdef)) + rv_disable_container(mdef); + else + rv_disable_single(mdef); + return 0; } @@ -265,15 +353,10 @@ int rv_enable_monitor(struct rv_monitor_def *mdef) { int retval; - lockdep_assert_held(&rv_interface_lock); - - if (mdef->monitor->enabled) - return 0; - - retval = mdef->monitor->enable(); - - if (!retval) - mdef->monitor->enabled = 1; + if (rv_is_container_monitor(mdef)) + retval = rv_enable_container(mdef); + else + retval = rv_enable_single(mdef); return retval; } @@ -336,9 +419,9 @@ static const struct file_operations interface_desc_fops = { * the monitor dir, where the specific options of the monitor * are exposed. */ -static int create_monitor_dir(struct rv_monitor_def *mdef) +static int create_monitor_dir(struct rv_monitor_def *mdef, struct rv_monitor_def *parent) { - struct dentry *root = get_monitors_root(); + struct dentry *root = parent ? parent->root_d : get_monitors_root(); const char *name = mdef->monitor->name; struct dentry *tmp; int retval; @@ -377,7 +460,11 @@ static int monitors_show(struct seq_file *m, void *p) { struct rv_monitor_def *mon_def = p; - seq_printf(m, "%s\n", mon_def->monitor->name); + if (mon_def->parent) + seq_printf(m, "%s:%s\n", mon_def->parent->name, + mon_def->monitor->name); + else + seq_printf(m, "%s\n", mon_def->monitor->name); return 0; } @@ -514,7 +601,7 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user struct rv_monitor_def *mdef; int retval = -EINVAL; bool enable = true; - char *ptr; + char *ptr, *tmp; int len; if (count < 1 || count > MAX_RV_MONITOR_NAME_SIZE + 1) @@ -541,6 +628,11 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user retval = -EINVAL; + /* we support 1 nesting level, trim the parent */ + tmp = strstr(ptr, ":"); + if (tmp) + ptr = tmp+1; + list_for_each_entry(mdef, &rv_monitors_list, list) { if (strcmp(ptr, mdef->monitor->name) != 0) continue; @@ -613,7 +705,7 @@ static void reset_all_monitors(void) struct rv_monitor_def *mdef; list_for_each_entry(mdef, &rv_monitors_list, list) { - if (mdef->monitor->enabled) + if (mdef->monitor->enabled && mdef->monitor->reset) mdef->monitor->reset(); } } @@ -685,18 +777,19 @@ static void destroy_monitor_dir(struct rv_monitor_def *mdef) /** * rv_register_monitor - register a rv monitor. * @monitor: The rv_monitor to be registered. + * @parent: The parent of the monitor to be registered, NULL if not nested. * * Returns 0 if successful, error otherwise. */ -int rv_register_monitor(struct rv_monitor *monitor) +int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent) { - struct rv_monitor_def *r; + struct rv_monitor_def *r, *p = NULL; int retval = 0; if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) { pr_info("Monitor %s has a name longer than %d\n", monitor->name, MAX_RV_MONITOR_NAME_SIZE); - return -1; + return -EINVAL; } mutex_lock(&rv_interface_lock); @@ -704,11 +797,27 @@ int rv_register_monitor(struct rv_monitor *monitor) list_for_each_entry(r, &rv_monitors_list, list) { if (strcmp(monitor->name, r->monitor->name) == 0) { pr_info("Monitor %s is already registered\n", monitor->name); - retval = -1; + retval = -EEXIST; goto out_unlock; } } + if (parent) { + list_for_each_entry(r, &rv_monitors_list, list) { + if (strcmp(parent->name, r->monitor->name) == 0) { + p = r; + break; + } + } + } + + if (p && rv_is_nested_monitor(p)) { + pr_info("Parent monitor %s is already nested, cannot nest further\n", + parent->name); + retval = -EINVAL; + goto out_unlock; + } + r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL); if (!r) { retval = -ENOMEM; @@ -716,14 +825,19 @@ int rv_register_monitor(struct rv_monitor *monitor) } r->monitor = monitor; + r->parent = parent; - retval = create_monitor_dir(r); + retval = create_monitor_dir(r, p); if (retval) { kfree(r); goto out_unlock; } - list_add_tail(&r->list, &rv_monitors_list); + /* keep children close to the parent for easier visualisation */ + if (p) + list_add(&r->list, &p->list); + else + list_add_tail(&r->list, &rv_monitors_list); out_unlock: mutex_unlock(&rv_interface_lock); diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h index db6cb0913dbd..98fca0a1adbc 100644 --- a/kernel/trace/rv/rv.h +++ b/kernel/trace/rv/rv.h @@ -21,6 +21,7 @@ struct rv_interface { #define MAX_RV_REACTOR_NAME_SIZE 32 extern struct mutex rv_interface_lock; +extern struct list_head rv_monitors_list; #ifdef CONFIG_RV_REACTORS struct rv_reactor_def { @@ -34,6 +35,7 @@ struct rv_reactor_def { struct rv_monitor_def { struct list_head list; struct rv_monitor *monitor; + struct rv_monitor *parent; struct dentry *root_d; #ifdef CONFIG_RV_REACTORS struct rv_reactor_def *rdef; @@ -45,6 +47,8 @@ struct rv_monitor_def { struct dentry *get_monitors_root(void); int rv_disable_monitor(struct rv_monitor_def *mdef); int rv_enable_monitor(struct rv_monitor_def *mdef); +bool rv_is_container_monitor(struct rv_monitor_def *mdef); +bool rv_is_nested_monitor(struct rv_monitor_def *mdef); #ifdef CONFIG_RV_REACTORS int reactor_populate_monitor(struct rv_monitor_def *mdef); diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index 7b49cbe388d4..9501ca886d83 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -158,8 +158,9 @@ static const struct seq_operations monitor_reactors_seq_ops = { .show = monitor_reactor_show }; -static void monitor_swap_reactors(struct rv_monitor_def *mdef, struct rv_reactor_def *rdef, - bool reacting) +static void monitor_swap_reactors_single(struct rv_monitor_def *mdef, + struct rv_reactor_def *rdef, + bool reacting, bool nested) { bool monitor_enabled; @@ -179,10 +180,31 @@ static void monitor_swap_reactors(struct rv_monitor_def *mdef, struct rv_reactor mdef->reacting = reacting; mdef->monitor->react = rdef->reactor->react; - if (monitor_enabled) + /* enable only once if iterating through a container */ + if (monitor_enabled && !nested) rv_enable_monitor(mdef); } +static void monitor_swap_reactors(struct rv_monitor_def *mdef, + struct rv_reactor_def *rdef, bool reacting) +{ + struct rv_monitor_def *p = mdef; + + if (rv_is_container_monitor(mdef)) + list_for_each_entry_continue(p, &rv_monitors_list, list) { + if (p->parent != mdef->monitor) + break; + monitor_swap_reactors_single(p, rdef, reacting, true); + } + /* + * This call enables and disables the monitor if they were active. + * In case of a container, we already disabled all and will enable all. + * All nested monitors are enabled also if they were off, we may refine + * this logic in the future. + */ + monitor_swap_reactors_single(mdef, rdef, reacting, false); +} + static ssize_t monitor_reactors_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h index 5e65097423ba..422b75f58891 100644 --- a/kernel/trace/rv/rv_trace.h +++ b/kernel/trace/rv/rv_trace.h @@ -58,6 +58,11 @@ DECLARE_EVENT_CLASS(error_da_monitor, ); #include <monitors/wip/wip_trace.h> +#include <monitors/tss/tss_trace.h> +#include <monitors/sco/sco_trace.h> +#include <monitors/scpd/scpd_trace.h> +#include <monitors/snep/snep_trace.h> +#include <monitors/sncid/sncid_trace.h> // Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here #endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */ @@ -118,6 +123,7 @@ DECLARE_EVENT_CLASS(error_da_monitor_id, ); #include <monitors/wwnr/wwnr_trace.h> +#include <monitors/snroc/snroc_trace.h> // Add new monitors based on CONFIG_DA_MON_EVENTS_ID here #endif /* CONFIG_DA_MON_EVENTS_ID */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0e6d517e74e0..95ae7c4e5835 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -49,6 +49,9 @@ #include <linux/fsnotify.h> #include <linux/irq_work.h> #include <linux/workqueue.h> +#include <linux/sort.h> +#include <linux/io.h> /* vmap_page_range() */ +#include <linux/fs_context.h> #include <asm/setup.h> /* COMMAND_LINE_SIZE */ @@ -87,6 +90,7 @@ void __init disable_tracing_selftest(const char *reason) static struct trace_iterator *tracepoint_print_iter; int tracepoint_printk; static bool tracepoint_printk_stop_on_boot __initdata; +static bool traceoff_after_boot __initdata; static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key); /* For tracers that don't implement custom flags */ @@ -117,6 +121,7 @@ static int tracing_disabled = 1; cpumask_var_t __read_mostly tracing_buffer_mask; +#define MAX_TRACER_SIZE 100 /* * ftrace_dump_on_oops - variable to dump ftrace buffer on oops * @@ -139,7 +144,40 @@ cpumask_var_t __read_mostly tracing_buffer_mask; char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0"; /* When set, tracing will stop when a WARN*() is hit */ -int __disable_trace_on_warning; +static int __disable_trace_on_warning; + +int tracepoint_printk_sysctl(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +static const struct ctl_table trace_sysctl_table[] = { + { + .procname = "ftrace_dump_on_oops", + .data = &ftrace_dump_on_oops, + .maxlen = MAX_TRACER_SIZE, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "traceoff_on_warning", + .data = &__disable_trace_on_warning, + .maxlen = sizeof(__disable_trace_on_warning), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tracepoint_printk", + .data = &tracepoint_printk, + .maxlen = sizeof(tracepoint_printk), + .mode = 0644, + .proc_handler = tracepoint_printk_sysctl, + }, +}; + +static int __init init_trace_sysctls(void) +{ + register_sysctl_init("kernel", trace_sysctl_table); + return 0; +} +subsys_initcall(init_trace_sysctls); #ifdef CONFIG_TRACE_EVAL_MAP_FILE /* Map of enums to their values, for "eval_map" file */ @@ -330,6 +368,13 @@ static int __init set_tracepoint_printk_stop(char *str) } __setup("tp_printk_stop_on_boot", set_tracepoint_printk_stop); +static int __init set_traceoff_after_boot(char *str) +{ + traceoff_after_boot = true; + return 1; +} +__setup("traceoff_after_boot", set_traceoff_after_boot); + unsigned long long ns2usecs(u64 nsec) { nsec += 500; @@ -483,7 +528,8 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export); TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \ TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \ TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \ - TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK) + TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK | \ + TRACE_ITER_COPY_MARKER) /* trace_options that are only supported by global_trace */ #define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \ @@ -491,7 +537,8 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export); /* trace_flags that are default zero for instances */ #define ZEROED_TRACE_FLAGS \ - (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK) + (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK | \ + TRACE_ITER_COPY_MARKER) /* * The global_trace is the descriptor that holds the top-level tracing @@ -503,6 +550,9 @@ static struct trace_array global_trace = { static struct trace_array *printk_trace = &global_trace; +/* List of trace_arrays interested in the top level trace_marker */ +static LIST_HEAD(marker_copies); + static __always_inline bool printk_binsafe(struct trace_array *tr) { /* @@ -524,6 +574,28 @@ static void update_printk_trace(struct trace_array *tr) tr->trace_flags |= TRACE_ITER_TRACE_PRINTK; } +/* Returns true if the status of tr changed */ +static bool update_marker_trace(struct trace_array *tr, int enabled) +{ + lockdep_assert_held(&event_mutex); + + if (enabled) { + if (!list_empty(&tr->marker_list)) + return false; + + list_add_rcu(&tr->marker_list, &marker_copies); + tr->trace_flags |= TRACE_ITER_COPY_MARKER; + return true; + } + + if (list_empty(&tr->marker_list)) + return false; + + list_del_init(&tr->marker_list); + tr->trace_flags &= ~TRACE_ITER_COPY_MARKER; + return true; +} + void trace_set_ring_buffer_expanded(struct trace_array *tr) { if (!tr) @@ -1573,6 +1645,39 @@ void tracer_tracing_off(struct trace_array *tr) } /** + * tracer_tracing_disable() - temporary disable the buffer from write + * @tr: The trace array to disable its buffer for + * + * Expects trace_tracing_enable() to re-enable tracing. + * The difference between this and tracer_tracing_off() is that this + * is a counter and can nest, whereas, tracer_tracing_off() can + * be called multiple times and a single trace_tracing_on() will + * enable it. + */ +void tracer_tracing_disable(struct trace_array *tr) +{ + if (WARN_ON_ONCE(!tr->array_buffer.buffer)) + return; + + ring_buffer_record_disable(tr->array_buffer.buffer); +} + +/** + * tracer_tracing_enable() - counter part of tracer_tracing_disable() + * @tr: The trace array that had tracer_tracincg_disable() called on it + * + * This is called after tracer_tracing_disable() has been called on @tr, + * when it's safe to re-enable tracing. + */ +void tracer_tracing_enable(struct trace_array *tr) +{ + if (WARN_ON_ONCE(!tr->array_buffer.buffer)) + return; + + ring_buffer_record_enable(tr->array_buffer.buffer); +} + +/** * tracing_off - turn off tracing buffers * * This function stops the tracing buffers from recording data. @@ -2878,13 +2983,16 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, void trace_function(struct trace_array *tr, unsigned long ip, unsigned long - parent_ip, unsigned int trace_ctx) + parent_ip, unsigned int trace_ctx, struct ftrace_regs *fregs) { struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; + int size = sizeof(*entry); + + size += FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long); - event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), + event = __trace_buffer_lock_reserve(buffer, TRACE_FN, size, trace_ctx); if (!event) return; @@ -2892,6 +3000,13 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long entry->ip = ip; entry->parent_ip = parent_ip; +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + if (fregs) { + for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++) + entry->args[i] = ftrace_regs_get_argument(fregs, i); + } +#endif + if (static_branch_unlikely(&trace_function_exports_enabled)) ftrace_exports(event, TRACE_EXPORT_FUNCTION); __buffer_unlock_commit(buffer, event); @@ -3322,10 +3437,9 @@ out_nobuffer: } EXPORT_SYMBOL_GPL(trace_vbprintk); -__printf(3, 0) -static int -__trace_array_vprintk(struct trace_buffer *buffer, - unsigned long ip, const char *fmt, va_list args) +static __printf(3, 0) +int __trace_array_vprintk(struct trace_buffer *buffer, + unsigned long ip, const char *fmt, va_list args) { struct ring_buffer_event *event; int len = 0, size; @@ -3375,7 +3489,6 @@ out_nobuffer: return len; } -__printf(3, 0) int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { @@ -3405,7 +3518,6 @@ int trace_array_vprintk(struct trace_array *tr, * Note, trace_array_init_printk() must be called on @tr before this * can be used. */ -__printf(3, 0) int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...) { @@ -3450,7 +3562,6 @@ int trace_array_init_printk(struct trace_array *tr) } EXPORT_SYMBOL_GPL(trace_array_init_printk); -__printf(3, 4) int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...) { @@ -3466,7 +3577,6 @@ int trace_array_printk_buf(struct trace_buffer *buffer, return ret; } -__printf(2, 0) int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { return trace_array_vprintk(printk_trace, ip, fmt, args); @@ -4100,12 +4210,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) entries, total, buf->cpu, - preempt_model_none() ? "server" : - preempt_model_voluntary() ? "desktop" : - preempt_model_full() ? "preempt" : - preempt_model_lazy() ? "lazy" : - preempt_model_rt() ? "preempt_rt" : - "unknown", + preempt_model_str(), /* These are reserved for later use */ 0, 0, 0, 0); #ifdef CONFIG_SMP @@ -4193,7 +4298,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) * safe to use if the array has delta offsets * Force printing via the fields. */ - if ((tr->text_delta || tr->data_delta) && + if ((tr->text_delta) && event->type > __TRACE_LAST_TYPE) return print_event_fields(iter, event); @@ -5038,7 +5143,6 @@ int tracing_set_cpumask(struct trace_array *tr, */ if (cpumask_test_cpu(cpu, tr->tracing_cpumask) && !cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu); #ifdef CONFIG_TRACER_MAX_TRACE ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu); @@ -5046,7 +5150,6 @@ int tracing_set_cpumask(struct trace_array *tr, } if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu); #ifdef CONFIG_TRACER_MAX_TRACE ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu); @@ -5179,7 +5282,8 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) { if ((mask == TRACE_ITER_RECORD_TGID) || (mask == TRACE_ITER_RECORD_CMD) || - (mask == TRACE_ITER_TRACE_PRINTK)) + (mask == TRACE_ITER_TRACE_PRINTK) || + (mask == TRACE_ITER_COPY_MARKER)) lockdep_assert_held(&event_mutex); /* do nothing if flag is already set */ @@ -5210,6 +5314,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) } } + if (mask == TRACE_ITER_COPY_MARKER) + update_marker_trace(tr, enabled); + if (enabled) tr->trace_flags |= mask; else @@ -5988,11 +6095,137 @@ ssize_t tracing_resize_ring_buffer(struct trace_array *tr, return __tracing_resize_ring_buffer(tr, size, cpu_id); } +struct trace_mod_entry { + unsigned long mod_addr; + char mod_name[MODULE_NAME_LEN]; +}; + +struct trace_scratch { + unsigned int clock_id; + unsigned long text_addr; + unsigned long nr_entries; + struct trace_mod_entry entries[]; +}; + +static DEFINE_MUTEX(scratch_mutex); + +static int cmp_mod_entry(const void *key, const void *pivot) +{ + unsigned long addr = (unsigned long)key; + const struct trace_mod_entry *ent = pivot; + + if (addr >= ent[0].mod_addr && addr < ent[1].mod_addr) + return 0; + else + return addr - ent->mod_addr; +} + +/** + * trace_adjust_address() - Adjust prev boot address to current address. + * @tr: Persistent ring buffer's trace_array. + * @addr: Address in @tr which is adjusted. + */ +unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr) +{ + struct trace_module_delta *module_delta; + struct trace_scratch *tscratch; + struct trace_mod_entry *entry; + unsigned long raddr; + int idx = 0, nr_entries; + + /* If we don't have last boot delta, return the address */ + if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + return addr; + + /* tr->module_delta must be protected by rcu. */ + guard(rcu)(); + tscratch = tr->scratch; + /* if there is no tscrach, module_delta must be NULL. */ + module_delta = READ_ONCE(tr->module_delta); + if (!module_delta || !tscratch->nr_entries || + tscratch->entries[0].mod_addr > addr) { + raddr = addr + tr->text_delta; + return __is_kernel(raddr) || is_kernel_core_data(raddr) || + is_kernel_rodata(raddr) ? raddr : addr; + } + + /* Note that entries must be sorted. */ + nr_entries = tscratch->nr_entries; + if (nr_entries == 1 || + tscratch->entries[nr_entries - 1].mod_addr < addr) + idx = nr_entries - 1; + else { + entry = __inline_bsearch((void *)addr, + tscratch->entries, + nr_entries - 1, + sizeof(tscratch->entries[0]), + cmp_mod_entry); + if (entry) + idx = entry - tscratch->entries; + } + + return addr + module_delta->delta[idx]; +} + +#ifdef CONFIG_MODULES +static int save_mod(struct module *mod, void *data) +{ + struct trace_array *tr = data; + struct trace_scratch *tscratch; + struct trace_mod_entry *entry; + unsigned int size; + + tscratch = tr->scratch; + if (!tscratch) + return -1; + size = tr->scratch_size; + + if (struct_size(tscratch, entries, tscratch->nr_entries + 1) > size) + return -1; + + entry = &tscratch->entries[tscratch->nr_entries]; + + tscratch->nr_entries++; + + entry->mod_addr = (unsigned long)mod->mem[MOD_TEXT].base; + strscpy(entry->mod_name, mod->name); + + return 0; +} +#else +static int save_mod(struct module *mod, void *data) +{ + return 0; +} +#endif + static void update_last_data(struct trace_array *tr) { - if (!tr->text_delta && !tr->data_delta) + struct trace_module_delta *module_delta; + struct trace_scratch *tscratch; + + if (!(tr->flags & TRACE_ARRAY_FL_BOOT)) + return; + + if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) return; + /* Only if the buffer has previous boot data clear and update it. */ + tr->flags &= ~TRACE_ARRAY_FL_LAST_BOOT; + + /* Reset the module list and reload them */ + if (tr->scratch) { + struct trace_scratch *tscratch = tr->scratch; + + tscratch->clock_id = tr->clock_id; + memset(tscratch->entries, 0, + flex_array_size(tscratch, entries, tscratch->nr_entries)); + tscratch->nr_entries = 0; + + guard(mutex)(&scratch_mutex); + module_for_each_mod(save_mod, tr); + } + /* * Need to clear all CPU buffers as there cannot be events * from the previous boot mixed with events with this boot @@ -6003,7 +6236,17 @@ static void update_last_data(struct trace_array *tr) /* Using current data now */ tr->text_delta = 0; - tr->data_delta = 0; + + if (!tr->scratch) + return; + + tscratch = tr->scratch; + module_delta = READ_ONCE(tr->module_delta); + WRITE_ONCE(tr->module_delta, NULL); + kfree_rcu(module_delta, rcu); + + /* Set the persistent ring buffer meta data to this address */ + tscratch->text_addr = (unsigned long)_text; } /** @@ -6469,6 +6712,22 @@ static int tracing_wait_pipe(struct file *filp) return 1; } +static bool update_last_data_if_empty(struct trace_array *tr) +{ + if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + return false; + + if (!ring_buffer_empty(tr->array_buffer.buffer)) + return false; + + /* + * If the buffer contains the last boot data and all per-cpu + * buffers are empty, reset it from the kernel side. + */ + update_last_data(tr); + return true; +} + /* * Consumer reader. */ @@ -6500,6 +6759,9 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, } waitagain: + if (update_last_data_if_empty(iter->tr)) + return 0; + sret = tracing_wait_pipe(filp); if (sret <= 0) return sret; @@ -6682,13 +6944,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, /* Copy the data into the page, so we can start over. */ ret = trace_seq_to_buffer(&iter->seq, page_address(spd.pages[i]), - trace_seq_used(&iter->seq)); + min((size_t)trace_seq_used(&iter->seq), + (size_t)PAGE_SIZE)); if (ret < 0) { __free_page(spd.pages[i]); break; } spd.partial[i].offset = 0; - spd.partial[i].len = trace_seq_used(&iter->seq); + spd.partial[i].len = ret; trace_seq_init(&iter->seq); } @@ -6812,19 +7075,102 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -static ssize_t -tracing_last_boot_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +#define LAST_BOOT_HEADER ((void *)1) + +static void *l_next(struct seq_file *m, void *v, loff_t *pos) { - struct trace_array *tr = filp->private_data; - struct seq_buf seq; - char buf[64]; + struct trace_array *tr = m->private; + struct trace_scratch *tscratch = tr->scratch; + unsigned int index = *pos; - seq_buf_init(&seq, buf, 64); + (*pos)++; - seq_buf_printf(&seq, "text delta:\t%ld\n", tr->text_delta); - seq_buf_printf(&seq, "data delta:\t%ld\n", tr->data_delta); + if (*pos == 1) + return LAST_BOOT_HEADER; + + /* Only show offsets of the last boot data */ + if (!tscratch || !(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + return NULL; + + /* *pos 0 is for the header, 1 is for the first module */ + index--; + + if (index >= tscratch->nr_entries) + return NULL; - return simple_read_from_buffer(ubuf, cnt, ppos, buf, seq_buf_used(&seq)); + return &tscratch->entries[index]; +} + +static void *l_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&scratch_mutex); + + return l_next(m, NULL, pos); +} + +static void l_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&scratch_mutex); +} + +static void show_last_boot_header(struct seq_file *m, struct trace_array *tr) +{ + struct trace_scratch *tscratch = tr->scratch; + + /* + * Do not leak KASLR address. This only shows the KASLR address of + * the last boot. When the ring buffer is started, the LAST_BOOT + * flag gets cleared, and this should only report "current". + * Otherwise it shows the KASLR address from the previous boot which + * should not be the same as the current boot. + */ + if (tscratch && (tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) + seq_printf(m, "%lx\t[kernel]\n", tscratch->text_addr); + else + seq_puts(m, "# Current\n"); +} + +static int l_show(struct seq_file *m, void *v) +{ + struct trace_array *tr = m->private; + struct trace_mod_entry *entry = v; + + if (v == LAST_BOOT_HEADER) { + show_last_boot_header(m, tr); + return 0; + } + + seq_printf(m, "%lx\t%s\n", entry->mod_addr, entry->mod_name); + return 0; +} + +static const struct seq_operations last_boot_seq_ops = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show, +}; + +static int tracing_last_boot_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + struct seq_file *m; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + ret = seq_open(file, &last_boot_seq_ops); + if (ret) { + trace_array_put(tr); + return ret; + } + + m = file->private_data; + m->private = tr; + + return 0; } static int tracing_buffer_meta_open(struct inode *inode, struct file *filp) @@ -6875,11 +7221,9 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) #define TRACE_MARKER_MAX_SIZE 4096 -static ssize_t -tracing_mark_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *fpos) +static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user *ubuf, + size_t cnt, unsigned long ip) { - struct trace_array *tr = filp->private_data; struct ring_buffer_event *event; enum event_trigger_type tt = ETT_NONE; struct trace_buffer *buffer; @@ -6893,18 +7237,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, #define FAULTED_STR "<faulted>" #define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */ - if (tracing_disabled) - return -EINVAL; - - if (!(tr->trace_flags & TRACE_ITER_MARKERS)) - return -EINVAL; - - if ((ssize_t)cnt < 0) - return -EINVAL; - - if (cnt > TRACE_MARKER_MAX_SIZE) - cnt = TRACE_MARKER_MAX_SIZE; - meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */ again: size = cnt + meta_size; @@ -6937,7 +7269,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, } entry = ring_buffer_event_data(event); - entry->ip = _THIS_IP_; + entry->ip = ip; len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); if (len) { @@ -6970,18 +7302,12 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, } static ssize_t -tracing_mark_raw_write(struct file *filp, const char __user *ubuf, +tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { struct trace_array *tr = filp->private_data; - struct ring_buffer_event *event; - struct trace_buffer *buffer; - struct raw_data_entry *entry; - ssize_t written; - int size; - int len; - -#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int)) + ssize_t written = -ENODEV; + unsigned long ip; if (tracing_disabled) return -EINVAL; @@ -6989,10 +7315,42 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, if (!(tr->trace_flags & TRACE_ITER_MARKERS)) return -EINVAL; - /* The marker must at least have a tag id */ - if (cnt < sizeof(unsigned int)) + if ((ssize_t)cnt < 0) return -EINVAL; + if (cnt > TRACE_MARKER_MAX_SIZE) + cnt = TRACE_MARKER_MAX_SIZE; + + /* The selftests expect this function to be the IP address */ + ip = _THIS_IP_; + + /* The global trace_marker can go to multiple instances */ + if (tr == &global_trace) { + guard(rcu)(); + list_for_each_entry_rcu(tr, &marker_copies, marker_list) { + written = write_marker_to_buffer(tr, ubuf, cnt, ip); + if (written < 0) + break; + } + } else { + written = write_marker_to_buffer(tr, ubuf, cnt, ip); + } + + return written; +} + +static ssize_t write_raw_marker_to_buffer(struct trace_array *tr, + const char __user *ubuf, size_t cnt) +{ + struct ring_buffer_event *event; + struct trace_buffer *buffer; + struct raw_data_entry *entry; + ssize_t written; + int size; + int len; + +#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int)) + size = sizeof(*entry) + cnt; if (cnt < FAULT_SIZE_ID) size += FAULT_SIZE_ID - cnt; @@ -7023,6 +7381,40 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, return written; } +static ssize_t +tracing_mark_raw_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + struct trace_array *tr = filp->private_data; + ssize_t written = -ENODEV; + +#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int)) + + if (tracing_disabled) + return -EINVAL; + + if (!(tr->trace_flags & TRACE_ITER_MARKERS)) + return -EINVAL; + + /* The marker must at least have a tag id */ + if (cnt < sizeof(unsigned int)) + return -EINVAL; + + /* The global trace_marker_raw can go to multiple instances */ + if (tr == &global_trace) { + guard(rcu)(); + list_for_each_entry_rcu(tr, &marker_copies, marker_list) { + written = write_raw_marker_to_buffer(tr, ubuf, cnt); + if (written < 0) + break; + } + } else { + written = write_raw_marker_to_buffer(tr, ubuf, cnt); + } + + return written; +} + static int tracing_clock_show(struct seq_file *m, void *v) { struct trace_array *tr = m->private; @@ -7067,6 +7459,12 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr) tracing_reset_online_cpus(&tr->max_buffer); #endif + if (tr->scratch && !(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) { + struct trace_scratch *tscratch = tr->scratch; + + tscratch->clock_id = i; + } + mutex_unlock(&trace_types_lock); return 0; @@ -7453,10 +7851,10 @@ static const struct file_operations trace_time_stamp_mode_fops = { }; static const struct file_operations last_boot_fops = { - .open = tracing_open_generic_tr, - .read = tracing_last_boot_read, - .llseek = generic_file_llseek, - .release = tracing_release_generic_tr, + .open = tracing_last_boot_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_seq_release, }; #ifdef CONFIG_TRACER_SNAPSHOT @@ -7942,6 +8340,9 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (ret < 0) { if (trace_empty(iter) && !iter->closed) { + if (update_last_data_if_empty(iter->tr)) + return 0; + if ((filp->f_flags & O_NONBLOCK)) return -EAGAIN; @@ -8279,8 +8680,8 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) struct trace_iterator *iter = &info->iter; int ret = 0; - /* Currently the boot mapped buffer is not supported for mmap */ - if (iter->tr->flags & TRACE_ARRAY_FL_BOOT) + /* A memmap'ed buffer is not supported for user space mmap */ + if (iter->tr->flags & TRACE_ARRAY_FL_MEMMAP) return -ENODEV; ret = get_snapshot_map(iter->tr); @@ -9196,22 +9597,134 @@ static struct dentry *trace_instance_dir; static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); +#ifdef CONFIG_MODULES +static int make_mod_delta(struct module *mod, void *data) +{ + struct trace_module_delta *module_delta; + struct trace_scratch *tscratch; + struct trace_mod_entry *entry; + struct trace_array *tr = data; + int i; + + tscratch = tr->scratch; + module_delta = READ_ONCE(tr->module_delta); + for (i = 0; i < tscratch->nr_entries; i++) { + entry = &tscratch->entries[i]; + if (strcmp(mod->name, entry->mod_name)) + continue; + if (mod->state == MODULE_STATE_GOING) + module_delta->delta[i] = 0; + else + module_delta->delta[i] = (unsigned long)mod->mem[MOD_TEXT].base + - entry->mod_addr; + break; + } + return 0; +} +#else +static int make_mod_delta(struct module *mod, void *data) +{ + return 0; +} +#endif + +static int mod_addr_comp(const void *a, const void *b, const void *data) +{ + const struct trace_mod_entry *e1 = a; + const struct trace_mod_entry *e2 = b; + + return e1->mod_addr > e2->mod_addr ? 1 : -1; +} + +static void setup_trace_scratch(struct trace_array *tr, + struct trace_scratch *tscratch, unsigned int size) +{ + struct trace_module_delta *module_delta; + struct trace_mod_entry *entry; + int i, nr_entries; + + if (!tscratch) + return; + + tr->scratch = tscratch; + tr->scratch_size = size; + + if (tscratch->text_addr) + tr->text_delta = (unsigned long)_text - tscratch->text_addr; + + if (struct_size(tscratch, entries, tscratch->nr_entries) > size) + goto reset; + + /* Check if each module name is a valid string */ + for (i = 0; i < tscratch->nr_entries; i++) { + int n; + + entry = &tscratch->entries[i]; + + for (n = 0; n < MODULE_NAME_LEN; n++) { + if (entry->mod_name[n] == '\0') + break; + if (!isprint(entry->mod_name[n])) + goto reset; + } + if (n == MODULE_NAME_LEN) + goto reset; + } + + /* Sort the entries so that we can find appropriate module from address. */ + nr_entries = tscratch->nr_entries; + sort_r(tscratch->entries, nr_entries, sizeof(struct trace_mod_entry), + mod_addr_comp, NULL, NULL); + + if (IS_ENABLED(CONFIG_MODULES)) { + module_delta = kzalloc(struct_size(module_delta, delta, nr_entries), GFP_KERNEL); + if (!module_delta) { + pr_info("module_delta allocation failed. Not able to decode module address."); + goto reset; + } + init_rcu_head(&module_delta->rcu); + } else + module_delta = NULL; + WRITE_ONCE(tr->module_delta, module_delta); + + /* Scan modules to make text delta for modules. */ + module_for_each_mod(make_mod_delta, tr); + + /* Set trace_clock as the same of the previous boot. */ + if (tscratch->clock_id != tr->clock_id) { + if (tscratch->clock_id >= ARRAY_SIZE(trace_clocks) || + tracing_set_clock(tr, trace_clocks[tscratch->clock_id].name) < 0) { + pr_info("the previous trace_clock info is not valid."); + goto reset; + } + } + return; + reset: + /* Invalid trace modules */ + memset(tscratch, 0, size); +} + static int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size) { enum ring_buffer_flags rb_flags; + struct trace_scratch *tscratch; + unsigned int scratch_size = 0; rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; buf->tr = tr; if (tr->range_addr_start && tr->range_addr_size) { + /* Add scratch buffer to handle 128 modules */ buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0, tr->range_addr_start, - tr->range_addr_size); + tr->range_addr_size, + struct_size(tscratch, entries, 128)); + + tscratch = ring_buffer_meta_scratch(buf->buffer, &scratch_size); + setup_trace_scratch(tr, tscratch, scratch_size); - ring_buffer_last_boot_delta(buf->buffer, - &tr->text_delta, &tr->data_delta); /* * This is basically the same as a mapped buffer, * with the same restrictions. @@ -9280,6 +9793,7 @@ static void free_trace_buffers(struct trace_array *tr) return; free_trace_buffer(&tr->array_buffer); + kfree(tr->module_delta); #ifdef CONFIG_TRACER_MAX_TRACE free_trace_buffer(&tr->max_buffer); @@ -9408,6 +9922,7 @@ trace_array_create_systems(const char *name, const char *systems, INIT_LIST_HEAD(&tr->events); INIT_LIST_HEAD(&tr->hist_vars); INIT_LIST_HEAD(&tr->err_log); + INIT_LIST_HEAD(&tr->marker_list); #ifdef CONFIG_MODULES INIT_LIST_HEAD(&tr->mod_events); @@ -9445,6 +9960,7 @@ trace_array_create_systems(const char *name, const char *systems, free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); kfree_const(tr->system_names); + kfree(tr->range_name); kfree(tr->name); kfree(tr); @@ -9475,30 +9991,35 @@ static int instance_mkdir(const char *name) return ret; } -static u64 map_pages(u64 start, u64 size) +#ifdef CONFIG_MMU +static u64 map_pages(unsigned long start, unsigned long size) { - struct page **pages; - phys_addr_t page_start; - unsigned int page_count; - unsigned int i; - void *vaddr; - - page_count = DIV_ROUND_UP(size, PAGE_SIZE); + unsigned long vmap_start, vmap_end; + struct vm_struct *area; + int ret; - page_start = start; - pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL); - if (!pages) + area = get_vm_area(size, VM_IOREMAP); + if (!area) return 0; - for (i = 0; i < page_count; i++) { - phys_addr_t addr = page_start + i * PAGE_SIZE; - pages[i] = pfn_to_page(addr >> PAGE_SHIFT); + vmap_start = (unsigned long) area->addr; + vmap_end = vmap_start + size; + + ret = vmap_page_range(vmap_start, vmap_end, + start, pgprot_nx(PAGE_KERNEL)); + if (ret < 0) { + free_vm_area(area); + return 0; } - vaddr = vmap(pages, page_count, VM_MAP, PAGE_KERNEL); - kfree(pages); - return (u64)(unsigned long)vaddr; + return (u64)vmap_start; +} +#else +static inline u64 map_pages(unsigned long start, unsigned long size) +{ + return 0; } +#endif /** * trace_array_get_by_name - Create/Lookup a trace array, given its name. @@ -9561,6 +10082,9 @@ static int __remove_instance(struct trace_array *tr) if (printk_trace == tr) update_printk_trace(&global_trace); + if (update_marker_trace(tr, 0)) + synchronize_rcu(); + tracing_set_nop(tr); clear_ftrace_function_probes(tr); event_trace_del_tracer(tr); @@ -9571,6 +10095,11 @@ static int __remove_instance(struct trace_array *tr) free_trace_buffers(tr); clear_tracing_err_log(tr); + if (tr->range_name) { + reserve_mem_release_by_name(tr->range_name); + kfree(tr->range_name); + } + for (i = 0; i < tr->nr_topts; i++) { kfree(tr->topts[i].topts); } @@ -9731,6 +10260,8 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) { struct vfsmount *mnt; struct file_system_type *type; + struct fs_context *fc; + int ret; /* * To maintain backward compatibility for tools that mount @@ -9740,12 +10271,20 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) type = get_fs_type("tracefs"); if (!type) return NULL; - mnt = vfs_submount(mntpt, type, "tracefs", NULL); + + fc = fs_context_for_submount(type, mntpt); put_filesystem(type); - if (IS_ERR(mnt)) - return NULL; - mntget(mnt); + if (IS_ERR(fc)) + return ERR_CAST(fc); + + ret = vfs_parse_fs_string(fc, "source", + "tracefs", strlen("tracefs")); + if (!ret) + mnt = fc_mount(fc); + else + mnt = ERR_PTR(ret); + put_fs_context(fc); return mnt; } @@ -9892,6 +10431,24 @@ static void trace_module_remove_evals(struct module *mod) static inline void trace_module_remove_evals(struct module *mod) { } #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ +static void trace_module_record(struct module *mod, bool add) +{ + struct trace_array *tr; + unsigned long flags; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + flags = tr->flags & (TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT); + /* Update any persistent trace array that has already been started */ + if (flags == TRACE_ARRAY_FL_BOOT && add) { + guard(mutex)(&scratch_mutex); + save_mod(mod, tr); + } else if (flags & TRACE_ARRAY_FL_LAST_BOOT) { + /* Update delta if the module loaded in previous boot */ + make_mod_delta(mod, tr); + } + } +} + static int trace_module_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -9900,9 +10457,11 @@ static int trace_module_notify(struct notifier_block *self, switch (val) { case MODULE_STATE_COMING: trace_module_add_evals(mod); + trace_module_record(mod, true); break; case MODULE_STATE_GOING: trace_module_remove_evals(mod); + trace_module_record(mod, false); break; } @@ -10084,7 +10643,7 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m static struct trace_iterator iter; unsigned int old_userobj; unsigned long flags; - int cnt = 0, cpu; + int cnt = 0; /* * Always turn off tracing when we dump. @@ -10101,9 +10660,8 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m /* Simulate the iterator */ trace_init_iter(&iter, tr); - for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); - } + /* While dumping, do not allow the buffer to be enable */ + tracer_tracing_disable(tr); old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ; @@ -10162,9 +10720,7 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m tr->trace_flags |= old_userobj; - for_each_tracing_cpu(cpu) { - atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); - } + tracer_tracing_enable(tr); local_irq_restore(flags); } @@ -10351,6 +10907,7 @@ static inline void do_allocate_snapshot(const char *name) { } __init static void enable_instances(void) { struct trace_array *tr; + bool memmap_area = false; char *curr_str; char *name; char *str; @@ -10368,6 +10925,7 @@ __init static void enable_instances(void) bool traceoff = false; char *flag_delim; char *addr_delim; + char *rname __free(kfree) = NULL; tok = strsep(&curr_str, ","); @@ -10418,16 +10976,31 @@ __init static void enable_instances(void) name); continue; } + memmap_area = true; } else if (tok) { if (!reserve_mem_find_by_name(tok, &start, &size)) { start = 0; pr_warn("Failed to map boot instance %s to %s\n", name, tok); continue; } + rname = kstrdup(tok, GFP_KERNEL); } if (start) { - addr = map_pages(start, size); + /* Start and size must be page aligned */ + if (start & ~PAGE_MASK) { + pr_warn("Tracing: mapping start addr %pa is not page aligned\n", &start); + continue; + } + if (size & ~PAGE_MASK) { + pr_warn("Tracing: mapping size %pa is not page aligned\n", &size); + continue; + } + + if (memmap_area) + addr = map_pages(start, size); + else + addr = (unsigned long)phys_to_virt(start); if (addr) { pr_info("Tracing: mapped boot instance %s at physical memory %pa of size 0x%lx\n", name, &start, (unsigned long)size); @@ -10454,15 +11027,18 @@ __init static void enable_instances(void) update_printk_trace(tr); /* - * If start is set, then this is a mapped buffer, and - * cannot be deleted by user space, so keep the reference - * to it. + * memmap'd buffers can not be freed. */ - if (start) { - tr->flags |= TRACE_ARRAY_FL_BOOT; + if (memmap_area) { + tr->flags |= TRACE_ARRAY_FL_MEMMAP; tr->ref++; } + if (start) { + tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT; + tr->range_name = no_free_ptr(rname); + } + while ((tok = strsep(&curr_str, ","))) { early_enable_events(tr, tok, true); } @@ -10584,6 +11160,7 @@ __init static int tracer_alloc_buffers(void) INIT_LIST_HEAD(&global_trace.events); INIT_LIST_HEAD(&global_trace.hist_vars); INIT_LIST_HEAD(&global_trace.err_log); + list_add(&global_trace.marker_list, &marker_copies); list_add(&global_trace.list, &ftrace_trace_arrays); apply_trace_boot_options(); @@ -10704,6 +11281,9 @@ __init static int late_trace_init(void) tracepoint_printk = 0; } + if (traceoff_after_boot) + tracing_off(); + tracing_set_default_clock(); clear_boot_tracer(); return 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9c21ba45b7af..bd084953a98b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -21,6 +21,7 @@ #include <linux/workqueue.h> #include <linux/ctype.h> #include <linux/once_lite.h> +#include <linux/ftrace_regs.h> #include "pid_list.h" @@ -182,8 +183,7 @@ struct trace_array; * the trace, etc.) */ struct trace_array_cpu { - atomic_t disabled; - void *buffer_page; /* ring buffer spare */ + local_t disabled; unsigned long entries; unsigned long saved_latency; @@ -312,6 +312,11 @@ struct trace_func_repeats { u64 ts_last_call; }; +struct trace_module_delta { + struct rcu_head rcu; + long delta[]; +}; + /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. @@ -348,8 +353,13 @@ struct trace_array { unsigned int mapped; unsigned long range_addr_start; unsigned long range_addr_size; + char *range_name; long text_delta; - long data_delta; + struct trace_module_delta *module_delta; + void *scratch; /* pointer in persistent memory */ + int scratch_size; + + int buffer_disabled; struct trace_pid_list __rcu *filtered_pids; struct trace_pid_list __rcu *filtered_no_pids; @@ -367,7 +377,6 @@ struct trace_array { * CONFIG_TRACER_MAX_TRACE. */ arch_spinlock_t max_lock; - int buffer_disabled; #ifdef CONFIG_FTRACE_SYSCALLS int sys_refcount_enter; int sys_refcount_exit; @@ -394,6 +403,7 @@ struct trace_array { struct trace_options *topts; struct list_head systems; struct list_head events; + struct list_head marker_list; struct trace_event_file *trace_marker_file; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ /* one per_cpu trace_pipe can be opened by only one user */ @@ -433,9 +443,11 @@ struct trace_array { }; enum { - TRACE_ARRAY_FL_GLOBAL = BIT(0), - TRACE_ARRAY_FL_BOOT = BIT(1), - TRACE_ARRAY_FL_MOD_INIT = BIT(2), + TRACE_ARRAY_FL_GLOBAL = BIT(0), + TRACE_ARRAY_FL_BOOT = BIT(1), + TRACE_ARRAY_FL_LAST_BOOT = BIT(2), + TRACE_ARRAY_FL_MOD_INIT = BIT(3), + TRACE_ARRAY_FL_MEMMAP = BIT(4), }; #ifdef CONFIG_MODULES @@ -462,6 +474,8 @@ extern int tracing_set_clock(struct trace_array *tr, const char *clockstr); extern bool trace_clock_in_ns(struct trace_array *tr); +extern unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr); + /* * The global tracer (top) should be the first trace array added, * but we check the flag anyway. @@ -651,12 +665,29 @@ bool tracing_is_disabled(void); bool tracer_tracing_is_on(struct trace_array *tr); void tracer_tracing_on(struct trace_array *tr); void tracer_tracing_off(struct trace_array *tr); +void tracer_tracing_disable(struct trace_array *tr); +void tracer_tracing_enable(struct trace_array *tr); struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops); + +/** + * tracer_tracing_is_on_cpu - show real state of ring buffer enabled on for a cpu + * @tr : the trace array to know if ring buffer is enabled + * @cpu: The cpu buffer to check if enabled + * + * Shows real state of the per CPU buffer if it is enabled or not. + */ +static inline bool tracer_tracing_is_on_cpu(struct trace_array *tr, int cpu) +{ + if (tr->array_buffer.buffer) + return ring_buffer_record_is_on_cpu(tr->array_buffer.buffer, cpu); + return false; +} + int tracing_init_dentry(void); struct ring_buffer_event; @@ -697,7 +728,8 @@ unsigned long trace_total_entries(struct trace_array *tr); void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, - unsigned int trace_ctx); + unsigned int trace_ctx, + struct ftrace_regs *regs); void trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, @@ -783,6 +815,8 @@ extern void trace_find_cmdline(int pid, char comm[]); extern int trace_find_tgid(int pid); extern void trace_event_follow_fork(struct trace_array *tr, bool enable); +extern int trace_events_enabled(struct trace_array *tr, const char *system); + #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; extern unsigned long ftrace_number_of_pages; @@ -836,13 +870,15 @@ static inline void __init disable_tracing_selftest(const char *reason) extern void *head_page(struct trace_array_cpu *data); extern unsigned long long ns2usecs(u64 nsec); -extern int -trace_vbprintk(unsigned long ip, const char *fmt, va_list args); -extern int -trace_vprintk(unsigned long ip, const char *fmt, va_list args); -extern int -trace_array_vprintk(struct trace_array *tr, - unsigned long ip, const char *fmt, va_list args); + +__printf(2, 0) +int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); +__printf(2, 0) +int trace_vprintk(unsigned long ip, const char *fmt, va_list args); +__printf(3, 0) +int trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args); +__printf(3, 4) int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...); void trace_printk_seq(struct trace_seq *s); @@ -897,6 +933,7 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) #define TRACE_GRAPH_PRINT_RETVAL 0x800 #define TRACE_GRAPH_PRINT_RETVAL_HEX 0x1000 #define TRACE_GRAPH_PRINT_RETADDR 0x2000 +#define TRACE_GRAPH_ARGS 0x4000 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) @@ -1348,6 +1385,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(MARKERS, "markers"), \ C(EVENT_FORK, "event-fork"), \ C(TRACE_PRINTK, "trace_printk_dest"), \ + C(COPY_MARKER, "copy_trace_marker"),\ C(PAUSE_ON_TRACE, "pause-on-trace"), \ C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \ FUNCTION_FLAGS \ @@ -1714,7 +1752,7 @@ struct event_trigger_data { unsigned long count; int ref; int flags; - struct event_trigger_ops *ops; + const struct event_trigger_ops *ops; struct event_command *cmd_ops; struct event_filter __rcu *filter; char *filter_str; @@ -1752,6 +1790,9 @@ extern int event_enable_register_trigger(char *glob, extern void event_enable_unregister_trigger(char *glob, struct event_trigger_data *test, struct trace_event_file *file); +extern struct event_trigger_data * +trigger_data_alloc(struct event_command *cmd_ops, char *cmd, char *param, + void *private_data); extern void trigger_data_free(struct event_trigger_data *data); extern int event_trigger_init(struct event_trigger_data *data); extern int trace_event_trigger_enable_disable(struct trace_event_file *file, @@ -1778,11 +1819,6 @@ extern bool event_trigger_check_remove(const char *glob); extern bool event_trigger_empty_param(const char *param); extern int event_trigger_separate_filter(char *param_and_filter, char **param, char **filter, bool param_required); -extern struct event_trigger_data * -event_trigger_alloc(struct event_command *cmd_ops, - char *cmd, - char *param, - void *private_data); extern int event_trigger_parse_num(char *trigger, struct event_trigger_data *trigger_data); extern int event_trigger_set_filter(struct event_command *cmd_ops, @@ -1959,7 +1995,7 @@ struct event_command { int (*set_filter)(char *filter_str, struct event_trigger_data *data, struct trace_event_file *file); - struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); + const struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); }; /** diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 6d08a5523ce0..6809b370e991 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -32,7 +32,6 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) { struct trace_array *tr = branch_tracer; struct trace_buffer *buffer; - struct trace_array_cpu *data; struct ring_buffer_event *event; struct trace_branch *entry; unsigned long flags; @@ -54,8 +53,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) raw_local_irq_save(flags); current->trace_recursion |= TRACE_BRANCH_BIT; - data = this_cpu_ptr(tr->array_buffer.data); - if (atomic_read(&data->disabled)) + if (!tracer_tracing_is_on_cpu(tr, raw_smp_processor_id())) goto out; trace_ctx = tracing_gen_ctx_flags(flags); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index a322e4f249a5..5d64a18cacac 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -16,7 +16,7 @@ #include "trace_output.h" /* for trace_event_sem */ #include "trace_dynevent.h" -static DEFINE_MUTEX(dyn_event_ops_mutex); +DEFINE_MUTEX(dyn_event_ops_mutex); static LIST_HEAD(dyn_event_ops_list); bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call) @@ -116,6 +116,20 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type return ret; } +/* + * Locked version of event creation. The event creation must be protected by + * dyn_event_ops_mutex because of protecting trace_probe_log. + */ +int dyn_event_create(const char *raw_command, struct dyn_event_operations *type) +{ + int ret; + + mutex_lock(&dyn_event_ops_mutex); + ret = type->create(raw_command); + mutex_unlock(&dyn_event_ops_mutex); + return ret; +} + static int create_dyn_event(const char *raw_command) { struct dyn_event_operations *ops; diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index 936477a111d3..beee3f8d7544 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -100,6 +100,7 @@ void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos); void dyn_event_seq_stop(struct seq_file *m, void *v); int dyn_events_release_all(struct dyn_event_operations *type); int dyn_event_release(const char *raw_command, struct dyn_event_operations *type); +int dyn_event_create(const char *raw_command, struct dyn_event_operations *type); /* * for_each_dyn_event - iterate over the dyn_event list diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index fbfb396905a6..de294ae2c5c5 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -61,8 +61,9 @@ FTRACE_ENTRY_REG(function, ftrace_entry, TRACE_FN, F_STRUCT( - __field_fn( unsigned long, ip ) - __field_fn( unsigned long, parent_ip ) + __field_fn( unsigned long, ip ) + __field_fn( unsigned long, parent_ip ) + __dynamic_array( unsigned long, args ) ), F_printk(" %ps <-- %ps", @@ -72,17 +73,18 @@ FTRACE_ENTRY_REG(function, ftrace_entry, ); /* Function call entry */ -FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, +FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, TRACE_GRAPH_ENT, F_STRUCT( __field_struct( struct ftrace_graph_ent, graph_ent ) __field_packed( unsigned long, graph_ent, func ) - __field_packed( int, graph_ent, depth ) + __field_packed( unsigned int, graph_ent, depth ) + __dynamic_array(unsigned long, args ) ), - F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth) + F_printk("--> %ps (%u)", (void *)__entry->func, __entry->depth) ); #ifdef CONFIG_FUNCTION_GRAPH_RETADDR @@ -95,11 +97,11 @@ FTRACE_ENTRY_PACKED(fgraph_retaddr_entry, fgraph_retaddr_ent_entry, F_STRUCT( __field_struct( struct fgraph_retaddr_ent, graph_ent ) __field_packed( unsigned long, graph_ent, func ) - __field_packed( int, graph_ent, depth ) + __field_packed( unsigned int, graph_ent, depth ) __field_packed( unsigned long, graph_ent, retaddr ) ), - F_printk("--> %ps (%d) <- %ps", (void *)__entry->func, __entry->depth, + F_printk("--> %ps (%u) <- %ps", (void *)__entry->func, __entry->depth, (void *)__entry->retaddr) ); @@ -122,13 +124,13 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, __field_struct( struct ftrace_graph_ret, ret ) __field_packed( unsigned long, ret, func ) __field_packed( unsigned long, ret, retval ) - __field_packed( int, ret, depth ) + __field_packed( unsigned int, ret, depth ) __field_packed( unsigned int, ret, overrun ) __field(unsigned long long, calltime ) __field(unsigned long long, rettime ) ), - F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d retval: %lx", + F_printk("<-- %ps (%u) (start: %llx end: %llx) over: %u retval: %lx", (void *)__entry->func, __entry->depth, __entry->calltime, __entry->rettime, __entry->depth, __entry->retval) @@ -144,13 +146,13 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, F_STRUCT( __field_struct( struct ftrace_graph_ret, ret ) __field_packed( unsigned long, ret, func ) - __field_packed( int, ret, depth ) + __field_packed( unsigned int, ret, depth ) __field_packed( unsigned int, ret, overrun ) __field(unsigned long long, calltime ) __field(unsigned long long, rettime ) ), - F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d", + F_printk("<-- %ps (%u) (start: %llx end: %llx) over: %u", (void *)__entry->func, __entry->depth, __entry->calltime, __entry->rettime, __entry->depth) diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 82fd637cfc19..916555f0de81 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -478,7 +478,7 @@ static void eprobe_trigger_func(struct event_trigger_data *data, __eprobe_trace_func(edata, rec); } -static struct event_trigger_ops eprobe_trigger_ops = { +static const struct event_trigger_ops eprobe_trigger_ops = { .trigger = eprobe_trigger_func, .print = eprobe_trigger_print, .init = eprobe_trigger_init, @@ -507,8 +507,8 @@ static void eprobe_trigger_unreg_func(char *glob, } -static struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd, - char *param) +static const struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd, + char *param) { return &eprobe_trigger_ops; } @@ -913,6 +913,8 @@ static int __trace_eprobe_create(int argc, const char *argv[]) } if (argc - 2 > MAX_TRACE_ARGS) { + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_ARGS); ret = -E2BIG; goto error; } @@ -967,10 +969,13 @@ static int __trace_eprobe_create(int argc, const char *argv[]) goto error; } } + trace_probe_log_clear(); return ret; + parse_error: ret = -EINVAL; error: + trace_probe_log_clear(); trace_event_probe_cleanup(ep); return ret; } diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 3ff9caa4a71b..a6bb7577e8c5 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -49,7 +49,7 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event, /* The ftrace function trace is allowed only for root. */ if (ftrace_event_is_function(tp_event)) { - ret = perf_allow_tracepoint(&p_event->attr); + ret = perf_allow_tracepoint(); if (ret) return ret; @@ -86,7 +86,7 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event, * ...otherwise raw tracepoint data can be a severe data leak, * only allow root to have these. */ - ret = perf_allow_tracepoint(&p_event->attr); + ret = perf_allow_tracepoint(); if (ret) return ret; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4cb275316e51..120531268abf 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -400,6 +400,20 @@ static bool process_string(const char *fmt, int len, struct trace_event_call *ca return true; } +static void handle_dereference_arg(const char *arg_str, u64 string_flags, int len, + u64 *dereference_flags, int arg, + struct trace_event_call *call) +{ + if (string_flags & (1ULL << arg)) { + if (process_string(arg_str, len, call)) + *dereference_flags &= ~(1ULL << arg); + } else if (process_pointer(arg_str, len, call)) + *dereference_flags &= ~(1ULL << arg); + else + pr_warn("TRACE EVENT ERROR: Bad dereference argument: '%.*s'\n", + len, arg_str); +} + /* * Examine the print fmt of the event looking for unsafe dereference * pointers using %p* that could be recorded in the trace event and @@ -470,6 +484,7 @@ static void test_event_printk(struct trace_event_call *call) case '%': continue; case 'p': + do_pointer: /* Find dereferencing fields */ switch (fmt[i + 1]) { case 'B': case 'R': case 'r': @@ -498,6 +513,12 @@ static void test_event_printk(struct trace_event_call *call) continue; if (fmt[i + j] == '*') { star = true; + /* Handle %*pbl case */ + if (!j && fmt[i + 1] == 'p') { + arg++; + i++; + goto do_pointer; + } continue; } if ((fmt[i + j] == 's')) { @@ -556,11 +577,9 @@ static void test_event_printk(struct trace_event_call *call) } if (dereference_flags & (1ULL << arg)) { - if (string_flags & (1ULL << arg)) { - if (process_string(fmt + start_arg, e - start_arg, call)) - dereference_flags &= ~(1ULL << arg); - } else if (process_pointer(fmt + start_arg, e - start_arg, call)) - dereference_flags &= ~(1ULL << arg); + handle_dereference_arg(fmt + start_arg, string_flags, + e - start_arg, + &dereference_flags, arg, call); } start_arg = i; @@ -571,11 +590,9 @@ static void test_event_printk(struct trace_event_call *call) } if (dereference_flags & (1ULL << arg)) { - if (string_flags & (1ULL << arg)) { - if (process_string(fmt + start_arg, i - start_arg, call)) - dereference_flags &= ~(1ULL << arg); - } else if (process_pointer(fmt + start_arg, i - start_arg, call)) - dereference_flags &= ~(1ULL << arg); + handle_dereference_arg(fmt + start_arg, string_flags, + i - start_arg, + &dereference_flags, arg, call); } /* @@ -615,7 +632,6 @@ EXPORT_SYMBOL_GPL(trace_event_raw_init); bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) { struct trace_array *tr = trace_file->tr; - struct trace_array_cpu *data; struct trace_pid_list *no_pid_list; struct trace_pid_list *pid_list; @@ -625,9 +641,11 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) if (!pid_list && !no_pid_list) return false; - data = this_cpu_ptr(tr->array_buffer.data); - - return data->ignore_pid; + /* + * This is recorded at every sched_switch for this task. + * Thus, even if the task migrates the ignore value will be the same. + */ + return this_cpu_read(tr->array_buffer.data->ignore_pid) != 0; } EXPORT_SYMBOL_GPL(trace_event_ignore_this_pid); @@ -790,7 +808,9 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } - call->class->reg(call, TRACE_REG_UNREGISTER, file); + ret = call->class->reg(call, TRACE_REG_UNREGISTER, file); + + WARN_ON_ONCE(ret); } /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ if (file->flags & EVENT_FILE_FL_SOFT_MODE) @@ -1591,6 +1611,13 @@ s_next(struct seq_file *m, void *v, loff_t *pos) return iter; #endif + /* + * The iter is allocated in s_start() and passed via the 'v' + * parameter. To stop the iterator, NULL must be returned. But + * the return value is what the 'v' parameter in s_stop() receives + * and frees. Free iter here as it will no longer be used. + */ + kfree(iter); return NULL; } @@ -1667,9 +1694,9 @@ static int s_show(struct seq_file *m, void *v) } #endif -static void s_stop(struct seq_file *m, void *p) +static void s_stop(struct seq_file *m, void *v) { - kfree(p); + kfree(v); t_stop(m, NULL); } @@ -1811,28 +1838,28 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } -static ssize_t -system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) +/* + * Returns: + * 0 : no events exist? + * 1 : all events are disabled + * 2 : all events are enabled + * 3 : some events are enabled and some are enabled + */ +int trace_events_enabled(struct trace_array *tr, const char *system) { - const char set_to_char[4] = { '?', '0', '1', 'X' }; - struct trace_subsystem_dir *dir = filp->private_data; - struct event_subsystem *system = dir->subsystem; struct trace_event_call *call; struct trace_event_file *file; - struct trace_array *tr = dir->tr; - char buf[2]; int set = 0; - int ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); + list_for_each_entry(file, &tr->events, list) { call = file->event_call; if ((call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) || !trace_event_name(call) || !call->class || !call->class->reg) continue; - if (system && strcmp(call->class->system, system->name) != 0) + if (system && strcmp(call->class->system, system) != 0) continue; /* @@ -1848,7 +1875,23 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, if (set == 3) break; } - mutex_unlock(&event_mutex); + + return set; +} + +static ssize_t +system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + const char set_to_char[4] = { '?', '0', '1', 'X' }; + struct trace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; + struct trace_array *tr = dir->tr; + char buf[2]; + int set; + int ret; + + set = trace_events_enabled(tr, system ? system->name : NULL); buf[0] = set_to_char[set]; buf[1] = '\n'; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0993dfc1c5c1..3885aadc434d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -808,7 +808,7 @@ static __always_inline char *test_string(char *str) kstr = ubuf->buffer; /* For safety, do not trust the string pointer */ - if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE)) + if (strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE) < 0) return NULL; return kstr; } @@ -827,7 +827,7 @@ static __always_inline char *test_ustring(char *str) /* user space address? */ ustr = (char __user *)str; - if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE)) + if (strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE) < 0) return NULL; return kstr; @@ -1250,7 +1250,9 @@ static void append_filter_err(struct trace_array *tr, static inline struct event_filter *event_filter(struct trace_event_file *file) { - return file->filter; + return rcu_dereference_protected(file->filter, + lockdep_is_held(&event_mutex)); + } /* caller must hold event_mutex */ @@ -1320,7 +1322,7 @@ void free_event_filter(struct event_filter *filter) static inline void __remove_filter(struct trace_event_file *file) { filter_disable(file); - remove_filter_string(file->filter); + remove_filter_string(event_filter(file)); } static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, @@ -1335,22 +1337,137 @@ static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, } } +struct filter_list { + struct list_head list; + struct event_filter *filter; +}; + +struct filter_head { + struct list_head list; + struct rcu_head rcu; +}; + + +static void free_filter_list(struct rcu_head *rhp) +{ + struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu); + struct filter_list *filter_item, *tmp; + + list_for_each_entry_safe(filter_item, tmp, &filter_list->list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } + kfree(filter_list); +} + +static void free_filter_list_tasks(struct rcu_head *rhp) +{ + call_rcu(rhp, free_filter_list); +} + +/* + * The tracepoint_synchronize_unregister() is a double rcu call. + * It calls synchronize_rcu_tasks_trace() followed by synchronize_rcu(). + * Instead of waiting for it, simply call these via the call_rcu*() + * variants. + */ +static void delay_free_filter(struct filter_head *head) +{ + call_rcu_tasks_trace(&head->rcu, free_filter_list_tasks); +} + +static void try_delay_free_filter(struct event_filter *filter) +{ + struct filter_head *head; + struct filter_list *item; + + head = kmalloc(sizeof(*head), GFP_KERNEL); + if (!head) + goto free_now; + + INIT_LIST_HEAD(&head->list); + + item = kmalloc(sizeof(*item), GFP_KERNEL); + if (!item) { + kfree(head); + goto free_now; + } + + item->filter = filter; + list_add_tail(&item->list, &head->list); + delay_free_filter(head); + return; + + free_now: + /* Make sure the filter is not being used */ + tracepoint_synchronize_unregister(); + __free_filter(filter); +} + static inline void __free_subsystem_filter(struct trace_event_file *file) { - __free_filter(file->filter); + __free_filter(event_filter(file)); file->filter = NULL; } +static inline void event_set_filter(struct trace_event_file *file, + struct event_filter *filter) +{ + rcu_assign_pointer(file->filter, filter); +} + +static inline void event_clear_filter(struct trace_event_file *file) +{ + RCU_INIT_POINTER(file->filter, NULL); +} + static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, - struct trace_array *tr) + struct trace_array *tr, + struct event_filter *filter) { struct trace_event_file *file; + struct filter_head *head; + struct filter_list *item; + + head = kmalloc(sizeof(*head), GFP_KERNEL); + if (!head) + goto free_now; + + INIT_LIST_HEAD(&head->list); list_for_each_entry(file, &tr->events, list) { if (file->system != dir) continue; + item = kmalloc(sizeof(*item), GFP_KERNEL); + if (!item) + goto free_now; + item->filter = event_filter(file); + list_add_tail(&item->list, &head->list); + event_clear_filter(file); + } + + item = kmalloc(sizeof(*item), GFP_KERNEL); + if (!item) + goto free_now; + + item->filter = filter; + list_add_tail(&item->list, &head->list); + + delay_free_filter(head); + return; + free_now: + tracepoint_synchronize_unregister(); + + if (head) + free_filter_list(&head->rcu); + + list_for_each_entry(file, &tr->events, list) { + if (file->system != dir || !file->filter) + continue; __free_subsystem_filter(file); } + __free_filter(filter); } int filter_assign_type(const char *type) @@ -2120,22 +2237,6 @@ static inline void event_set_filtered_flag(struct trace_event_file *file) trace_buffered_event_enable(); } -static inline void event_set_filter(struct trace_event_file *file, - struct event_filter *filter) -{ - rcu_assign_pointer(file->filter, filter); -} - -static inline void event_clear_filter(struct trace_event_file *file) -{ - RCU_INIT_POINTER(file->filter, NULL); -} - -struct filter_list { - struct list_head list; - struct event_filter *filter; -}; - static int process_system_preds(struct trace_subsystem_dir *dir, struct trace_array *tr, struct filter_parse_error *pe, @@ -2144,11 +2245,16 @@ static int process_system_preds(struct trace_subsystem_dir *dir, struct trace_event_file *file; struct filter_list *filter_item; struct event_filter *filter = NULL; - struct filter_list *tmp; - LIST_HEAD(filter_list); + struct filter_head *filter_list; bool fail = true; int err; + filter_list = kmalloc(sizeof(*filter_list), GFP_KERNEL); + if (!filter_list) + return -ENOMEM; + + INIT_LIST_HEAD(&filter_list->list); + list_for_each_entry(file, &tr->events, list) { if (file->system != dir) @@ -2175,7 +2281,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, if (!filter_item) goto fail_mem; - list_add_tail(&filter_item->list, &filter_list); + list_add_tail(&filter_item->list, &filter_list->list); /* * Regardless of if this returned an error, we still * replace the filter for the call. @@ -2195,31 +2301,22 @@ static int process_system_preds(struct trace_subsystem_dir *dir, * Do a synchronize_rcu() and to ensure all calls are * done with them before we free them. */ - tracepoint_synchronize_unregister(); - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - __free_filter(filter_item->filter); - list_del(&filter_item->list); - kfree(filter_item); - } + delay_free_filter(filter_list); return 0; fail: /* No call succeeded */ - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - list_del(&filter_item->list); - kfree(filter_item); - } + free_filter_list(&filter_list->rcu); parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); return -EINVAL; fail_mem: __free_filter(filter); + /* If any call succeeded, we still need to sync */ if (!fail) - tracepoint_synchronize_unregister(); - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - __free_filter(filter_item->filter); - list_del(&filter_item->list); - kfree(filter_item); - } + delay_free_filter(filter_list); + else + free_filter_list(&filter_list->rcu); + return -ENOMEM; } @@ -2361,9 +2458,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) event_clear_filter(file); - /* Make sure the filter is not being used */ - tracepoint_synchronize_unregister(); - __free_filter(filter); + try_delay_free_filter(filter); return 0; } @@ -2387,11 +2482,8 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) event_set_filter(file, filter); - if (tmp) { - /* Make sure the call is done with the filter */ - tracepoint_synchronize_unregister(); - __free_filter(tmp); - } + if (tmp) + try_delay_free_filter(tmp); } return err; @@ -2417,9 +2509,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, filter = system->filter; system->filter = NULL; /* Ensure all filters are no longer used */ - tracepoint_synchronize_unregister(); - filter_free_subsystem_filters(dir, tr); - __free_filter(filter); + filter_free_subsystem_filters(dir, tr, filter); return 0; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 261163b00137..1d536219b624 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -114,6 +114,7 @@ enum hist_field_fn { HIST_FIELD_FN_BUCKET, HIST_FIELD_FN_TIMESTAMP, HIST_FIELD_FN_CPU, + HIST_FIELD_FN_COMM, HIST_FIELD_FN_STRING, HIST_FIELD_FN_DYNSTRING, HIST_FIELD_FN_RELDYNSTRING, @@ -506,6 +507,7 @@ enum hist_field_flags { HIST_FIELD_FL_CONST = 1 << 18, HIST_FIELD_FL_PERCENT = 1 << 19, HIST_FIELD_FL_GRAPH = 1 << 20, + HIST_FIELD_FL_COMM = 1 << 21, }; struct var_defs { @@ -885,6 +887,15 @@ static u64 hist_field_cpu(struct hist_field *hist_field, return cpu; } +static u64 hist_field_comm(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + return (u64)(unsigned long)current->comm; +} + /** * check_field_for_var_ref - Check if a VAR_REF field references a variable * @hist_field: The VAR_REF field to check @@ -1338,6 +1349,8 @@ static const char *hist_field_name(struct hist_field *field, field_name = hist_field_name(field->operands[0], ++level); else if (field->flags & HIST_FIELD_FL_CPU) field_name = "common_cpu"; + else if (field->flags & HIST_FIELD_FL_COMM) + field_name = "common_comm"; else if (field->flags & HIST_FIELD_FL_EXPR || field->flags & HIST_FIELD_FL_VAR_REF) { if (field->system) { @@ -2015,6 +2028,13 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, goto out; } + if (flags & HIST_FIELD_FL_COMM) { + hist_field->fn_num = HIST_FIELD_FN_COMM; + hist_field->size = MAX_FILTER_STR_VAL; + hist_field->type = "char[]"; + goto out; + } + if (WARN_ON_ONCE(!field)) goto out; @@ -2359,9 +2379,11 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, hist_data->attrs->ts_in_usecs = true; } else if (strcmp(field_name, "common_stacktrace") == 0) { *flags |= HIST_FIELD_FL_STACKTRACE; - } else if (strcmp(field_name, "common_cpu") == 0) + } else if (strcmp(field_name, "common_cpu") == 0) { *flags |= HIST_FIELD_FL_CPU; - else if (strcmp(field_name, "hitcount") == 0) + } else if (strcmp(field_name, "common_comm") == 0) { + *flags |= HIST_FIELD_FL_COMM | HIST_FIELD_FL_STRING; + } else if (strcmp(field_name, "hitcount") == 0) *flags |= HIST_FIELD_FL_HITCOUNT; else { field = trace_find_event_field(file->event_call, field_name); @@ -2377,6 +2399,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, *flags |= HIST_FIELD_FL_CPU; } else if (field && field->filter_type == FILTER_STACKTRACE) { *flags |= HIST_FIELD_FL_STACKTRACE; + } else if (field && field->filter_type == FILTER_COMM) { + *flags |= HIST_FIELD_FL_COMM | HIST_FIELD_FL_STRING; } else { hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); @@ -4327,6 +4351,8 @@ static u64 hist_fn_call(struct hist_field *hist_field, return hist_field_timestamp(hist_field, elt, buffer, rbe, event); case HIST_FIELD_FN_CPU: return hist_field_cpu(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_COMM: + return hist_field_comm(hist_field, elt, buffer, rbe, event); case HIST_FIELD_FN_STRING: return hist_field_string(hist_field, elt, buffer, rbe, event); case HIST_FIELD_FN_DYNSTRING: @@ -5212,22 +5238,25 @@ static inline void add_to_key(char *compound_key, void *key, size_t size = key_field->size; if (key_field->flags & HIST_FIELD_FL_STRING) { - struct ftrace_event_field *field; - field = key_field->field; - if (field->filter_type == FILTER_DYN_STRING || - field->filter_type == FILTER_RDYN_STRING) - size = *(u32 *)(rec + field->offset) >> 16; - else if (field->filter_type == FILTER_STATIC_STRING) - size = field->size; + if (key_field->flags & HIST_FIELD_FL_COMM) { + size = strlen((char *)key); + } else { + struct ftrace_event_field *field; + + field = key_field->field; + if (field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING) + size = *(u32 *)(rec + field->offset) >> 16; + else if (field->filter_type == FILTER_STATIC_STRING) + size = field->size; + } /* ensure NULL-termination */ if (size > key_field->size - 1) size = key_field->size - 1; - - strncpy(compound_key + key_field->offset, (char *)key, size); - } else - memcpy(compound_key + key_field->offset, key, size); + } + memcpy(compound_key + key_field->offset, key, size); } static void @@ -5246,17 +5275,94 @@ hist_trigger_actions(struct hist_trigger_data *hist_data, } } +/* + * The hist_pad structure is used to save information to create + * a histogram from the histogram trigger. It's too big to store + * on the stack, so when the histogram trigger is initialized + * a percpu array of 4 hist_pad structures is allocated. + * This will cover every context from normal, softirq, irq and NMI + * in the very unlikely event that a tigger happens at each of + * these contexts and interrupts a currently active trigger. + */ +struct hist_pad { + unsigned long entries[HIST_STACKTRACE_DEPTH]; + u64 var_ref_vals[TRACING_MAP_VARS_MAX]; + char compound_key[HIST_KEY_SIZE_MAX]; +}; + +static struct hist_pad __percpu *hist_pads; +static DEFINE_PER_CPU(int, hist_pad_cnt); +static refcount_t hist_pad_ref; + +/* One hist_pad for every context (normal, softirq, irq, NMI) */ +#define MAX_HIST_CNT 4 + +static int alloc_hist_pad(void) +{ + lockdep_assert_held(&event_mutex); + + if (refcount_read(&hist_pad_ref)) { + refcount_inc(&hist_pad_ref); + return 0; + } + + hist_pads = __alloc_percpu(sizeof(struct hist_pad) * MAX_HIST_CNT, + __alignof__(struct hist_pad)); + if (!hist_pads) + return -ENOMEM; + + refcount_set(&hist_pad_ref, 1); + return 0; +} + +static void free_hist_pad(void) +{ + lockdep_assert_held(&event_mutex); + + if (!refcount_dec_and_test(&hist_pad_ref)) + return; + + free_percpu(hist_pads); + hist_pads = NULL; +} + +static struct hist_pad *get_hist_pad(void) +{ + struct hist_pad *hist_pad; + int cnt; + + if (WARN_ON_ONCE(!hist_pads)) + return NULL; + + preempt_disable(); + + hist_pad = per_cpu_ptr(hist_pads, smp_processor_id()); + + if (this_cpu_read(hist_pad_cnt) == MAX_HIST_CNT) { + preempt_enable(); + return NULL; + } + + cnt = this_cpu_inc_return(hist_pad_cnt) - 1; + + return &hist_pad[cnt]; +} + +static void put_hist_pad(void) +{ + this_cpu_dec(hist_pad_cnt); + preempt_enable(); +} + static void event_hist_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe) { struct hist_trigger_data *hist_data = data->private_data; bool use_compound_key = (hist_data->n_keys > 1); - unsigned long entries[HIST_STACKTRACE_DEPTH]; - u64 var_ref_vals[TRACING_MAP_VARS_MAX]; - char compound_key[HIST_KEY_SIZE_MAX]; struct tracing_map_elt *elt = NULL; struct hist_field *key_field; + struct hist_pad *hist_pad; u64 field_contents; void *key = NULL; unsigned int i; @@ -5264,12 +5370,18 @@ static void event_hist_trigger(struct event_trigger_data *data, if (unlikely(!rbe)) return; - memset(compound_key, 0, hist_data->key_size); + hist_pad = get_hist_pad(); + if (!hist_pad) + return; + + memset(hist_pad->compound_key, 0, hist_data->key_size); for_each_hist_key_field(i, hist_data) { key_field = hist_data->fields[i]; if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { + unsigned long *entries = hist_pad->entries; + memset(entries, 0, HIST_STACKTRACE_SIZE); if (key_field->field) { unsigned long *stack, n_entries; @@ -5293,26 +5405,31 @@ static void event_hist_trigger(struct event_trigger_data *data, } if (use_compound_key) - add_to_key(compound_key, key, key_field, rec); + add_to_key(hist_pad->compound_key, key, key_field, rec); } if (use_compound_key) - key = compound_key; + key = hist_pad->compound_key; if (hist_data->n_var_refs && - !resolve_var_refs(hist_data, key, var_ref_vals, false)) - return; + !resolve_var_refs(hist_data, key, hist_pad->var_ref_vals, false)) + goto out; elt = tracing_map_insert(hist_data->map, key); if (!elt) - return; + goto out; - hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, var_ref_vals); + hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, hist_pad->var_ref_vals); - if (resolve_var_refs(hist_data, key, var_ref_vals, true)) - hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals); + if (resolve_var_refs(hist_data, key, hist_pad->var_ref_vals, true)) { + hist_trigger_actions(hist_data, elt, buffer, rec, rbe, + key, hist_pad->var_ref_vals); + } hist_poll_wakeup(); + + out: + put_hist_pad(); } static void hist_trigger_stacktrace_print(struct seq_file *m, @@ -5689,12 +5806,16 @@ static int event_hist_open(struct inode *inode, struct file *file) guard(mutex)(&event_mutex); event_file = event_file_data(file); - if (!event_file) - return -ENODEV; + if (!event_file) { + ret = -ENODEV; + goto err; + } hist_file = kzalloc(sizeof(*hist_file), GFP_KERNEL); - if (!hist_file) - return -ENOMEM; + if (!hist_file) { + ret = -ENOMEM; + goto err; + } hist_file->file = file; hist_file->last_act = get_hist_hit_count(event_file); @@ -5702,9 +5823,14 @@ static int event_hist_open(struct inode *inode, struct file *file) /* Clear private_data to avoid warning in single_open() */ file->private_data = NULL; ret = single_open(file, hist_show, hist_file); - if (ret) + if (ret) { kfree(hist_file); + goto err; + } + return 0; +err: + tracing_release_file_tr(inode, file); return ret; } @@ -5979,7 +6105,10 @@ static int event_hist_debug_open(struct inode *inode, struct file *file) /* Clear private_data to avoid warning in single_open() */ file->private_data = NULL; - return single_open(file, hist_debug_show, file); + ret = single_open(file, hist_debug_show, file); + if (ret) + tracing_release_file_tr(inode, file); + return ret; } const struct file_operations event_hist_debug_fops = { @@ -5999,6 +6128,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) if (hist_field->flags & HIST_FIELD_FL_CPU) seq_puts(m, "common_cpu"); + if (hist_field->flags & HIST_FIELD_FL_COMM) + seq_puts(m, "common_comm"); else if (hist_field->flags & HIST_FIELD_FL_CONST) seq_printf(m, "%llu", hist_field->constant); else if (field_name) { @@ -6145,6 +6276,9 @@ static int event_hist_trigger_init(struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; + if (alloc_hist_pad() < 0) + return -ENOMEM; + if (!data->ref && hist_data->attrs->name) save_named_trigger(hist_data->attrs->name, data); @@ -6189,9 +6323,10 @@ static void event_hist_trigger_free(struct event_trigger_data *data) destroy_hist_data(hist_data); } + free_hist_pad(); } -static struct event_trigger_ops event_hist_trigger_ops = { +static const struct event_trigger_ops event_hist_trigger_ops = { .trigger = event_hist_trigger, .print = event_hist_trigger_print, .init = event_hist_trigger_init, @@ -6204,9 +6339,7 @@ static int event_hist_trigger_named_init(struct event_trigger_data *data) save_named_trigger(data->named_data->name, data); - event_hist_trigger_init(data->named_data); - - return 0; + return event_hist_trigger_init(data->named_data); } static void event_hist_trigger_named_free(struct event_trigger_data *data) @@ -6223,15 +6356,15 @@ static void event_hist_trigger_named_free(struct event_trigger_data *data) } } -static struct event_trigger_ops event_hist_trigger_named_ops = { +static const struct event_trigger_ops event_hist_trigger_named_ops = { .trigger = event_hist_trigger, .print = event_hist_trigger_print, .init = event_hist_trigger_named_init, .free = event_hist_trigger_named_free, }; -static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, - char *param) +static const struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, + char *param) { return &event_hist_trigger_ops; } @@ -6693,7 +6826,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, return PTR_ERR(hist_data); } - trigger_data = event_trigger_alloc(cmd_ops, cmd, param, hist_data); + trigger_data = trigger_data_alloc(cmd_ops, cmd, param, hist_data); if (!trigger_data) { ret = -ENOMEM; goto out_free; @@ -6724,27 +6857,27 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, if (existing_hist_update_only(glob, trigger_data, file)) goto out_free; - ret = event_trigger_register(cmd_ops, file, glob, trigger_data); - if (ret < 0) - goto out_free; + if (!get_named_trigger_data(trigger_data)) { - if (get_named_trigger_data(trigger_data)) - goto enable; + ret = create_actions(hist_data); + if (ret) + goto out_free; - ret = create_actions(hist_data); - if (ret) - goto out_unreg; + if (has_hist_vars(hist_data) || hist_data->n_var_refs) { + ret = save_hist_vars(hist_data); + if (ret) + goto out_free; + } - if (has_hist_vars(hist_data) || hist_data->n_var_refs) { - ret = save_hist_vars(hist_data); + ret = tracing_map_init(hist_data->map); if (ret) - goto out_unreg; + goto out_free; } - ret = tracing_map_init(hist_data->map); - if (ret) - goto out_unreg; -enable: + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + if (ret < 0) + goto out_free; + ret = hist_trigger_enable(trigger_data, file); if (ret) goto out_unreg; @@ -6826,38 +6959,38 @@ hist_enable_count_trigger(struct event_trigger_data *data, hist_enable_trigger(data, buffer, rec, event); } -static struct event_trigger_ops hist_enable_trigger_ops = { +static const struct event_trigger_ops hist_enable_trigger_ops = { .trigger = hist_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops hist_enable_count_trigger_ops = { +static const struct event_trigger_ops hist_enable_count_trigger_ops = { .trigger = hist_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops hist_disable_trigger_ops = { +static const struct event_trigger_ops hist_disable_trigger_ops = { .trigger = hist_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops hist_disable_count_trigger_ops = { +static const struct event_trigger_ops hist_disable_count_trigger_ops = { .trigger = hist_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops * +static const struct event_trigger_ops * hist_enable_get_trigger_ops(char *cmd, char *param) { - struct event_trigger_ops *ops; + const struct event_trigger_ops *ops; bool enable; enable = (strcmp(cmd, ENABLE_HIST_STR) == 0); diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index e3f7d09e5512..33cfbd4ed76d 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -207,7 +207,7 @@ static int synth_field_string_size(char *type) if (len == 0) return 0; /* variable-length string */ - strncpy(buf, start, len); + memcpy(buf, start, len); buf[len] = '\0'; err = kstrtouint(buf, 0, &size); @@ -305,7 +305,7 @@ static const char *synth_field_fmt(char *type) else if (strcmp(type, "gfp_t") == 0) fmt = "%x"; else if (synth_field_is_string(type)) - fmt = "%.*s"; + fmt = "%s"; else if (synth_field_is_stack(type)) fmt = "%s"; @@ -370,7 +370,6 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, union trace_synth_field *data = &entry->fields[n_u64]; trace_seq_printf(s, print_fmt, se->fields[i]->name, - STR_VAR_LEN_MAX, (char *)entry + data->as_dynamic.offset, i == se->n_fields - 1 ? "" : " "); n_u64++; @@ -612,7 +611,7 @@ static int __set_synth_event_print_fmt(struct synth_event *event, fmt = synth_field_fmt(event->fields[i]->type); pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s", event->fields[i]->name, fmt, - i == event->n_fields - 1 ? "" : ", "); + i == event->n_fields - 1 ? "" : " "); } pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); @@ -852,6 +851,38 @@ static struct trace_event_fields synth_event_fields_array[] = { {} }; +static int synth_event_reg(struct trace_event_call *call, + enum trace_reg type, void *data) +{ + struct synth_event *event = container_of(call, struct synth_event, call); + + switch (type) { +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: +#endif + case TRACE_REG_REGISTER: + if (!try_module_get(event->mod)) + return -EBUSY; + break; + default: + break; + } + + int ret = trace_event_reg(call, type, data); + + switch (type) { +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_UNREGISTER: +#endif + case TRACE_REG_UNREGISTER: + module_put(event->mod); + break; + default: + break; + } + return ret; +} + static int register_synth_event(struct synth_event *event) { struct trace_event_call *call = &event->call; @@ -881,7 +912,7 @@ static int register_synth_event(struct synth_event *event) goto out; } call->flags = TRACE_EVENT_FL_TRACEPOINT; - call->class->reg = trace_event_reg; + call->class->reg = synth_event_reg; call->class->probe = trace_event_raw_event_synth; call->data = event; call->tp = event->tp; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d45448947094..cbfc306c0159 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -552,16 +552,14 @@ static int register_trigger(char *glob, lockdep_assert_held(&event_mutex); list_for_each_entry(test, &file->triggers, list) { - if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) { - ret = -EEXIST; - goto out; - } + if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) + return -EEXIST; } if (data->ops->init) { ret = data->ops->init(data); if (ret < 0) - goto out; + return ret; } list_add_rcu(&data->list, &file->triggers); @@ -572,7 +570,6 @@ static int register_trigger(char *glob, list_del_rcu(&data->list); update_cond_flag(file); } -out: return ret; } @@ -770,7 +767,7 @@ int event_trigger_separate_filter(char *param_and_filter, char **param, if (!param_and_filter) { if (param_required) ret = -EINVAL; - goto out; + return ret; } /* @@ -781,7 +778,7 @@ int event_trigger_separate_filter(char *param_and_filter, char **param, */ if (!param_required && param_and_filter && !isdigit(param_and_filter[0])) { *filter = param_and_filter; - goto out; + return ret; } /* @@ -799,12 +796,11 @@ int event_trigger_separate_filter(char *param_and_filter, char **param, if (!**filter) *filter = NULL; } -out: return ret; } /** - * event_trigger_alloc - allocate and init event_trigger_data for a trigger + * trigger_data_alloc - allocate and init event_trigger_data for a trigger * @cmd_ops: The event_command operations for the trigger * @cmd: The cmd string * @param: The param string @@ -815,17 +811,17 @@ out: * trigger_ops to assign to the event_trigger_data. @private_data can * also be passed in and associated with the event_trigger_data. * - * Use event_trigger_free() to free an event_trigger_data object. + * Use trigger_data_free() to free an event_trigger_data object. * * Return: The trigger_data object success, NULL otherwise */ -struct event_trigger_data *event_trigger_alloc(struct event_command *cmd_ops, - char *cmd, - char *param, - void *private_data) +struct event_trigger_data *trigger_data_alloc(struct event_command *cmd_ops, + char *cmd, + char *param, + void *private_data) { struct event_trigger_data *trigger_data; - struct event_trigger_ops *trigger_ops; + const struct event_trigger_ops *trigger_ops; trigger_ops = cmd_ops->get_trigger_ops(cmd, param); @@ -989,15 +985,14 @@ event_trigger_parse(struct event_command *cmd_ops, return ret; ret = -ENOMEM; - trigger_data = event_trigger_alloc(cmd_ops, cmd, param, file); + trigger_data = trigger_data_alloc(cmd_ops, cmd, param, file); if (!trigger_data) - goto out; + return ret; if (remove) { event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); - kfree(trigger_data); - ret = 0; - goto out; + trigger_data_free(trigger_data); + return 0; } ret = event_trigger_parse_num(param, trigger_data); @@ -1017,13 +1012,12 @@ event_trigger_parse(struct event_command *cmd_ops, /* Down the counter of trigger_data or free it if not used anymore */ event_trigger_free(trigger_data); - out: return ret; out_free: event_trigger_reset_filter(cmd_ops, trigger_data); - kfree(trigger_data); - goto out; + trigger_data_free(trigger_data); + return ret; } /** @@ -1057,10 +1051,10 @@ int set_trigger_filter(char *filter_str, s = strsep(&filter_str, " \t"); if (!strlen(s) || strcmp(s, "if") != 0) - goto out; + return ret; if (!filter_str) - goto out; + return ret; /* The filter is for the 'trigger' event, not the triggered event */ ret = create_event_filter(file->tr, file->event_call, @@ -1104,7 +1098,6 @@ int set_trigger_filter(char *filter_str, ret = -ENOMEM; } } - out: return ret; } @@ -1367,38 +1360,38 @@ traceoff_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static struct event_trigger_ops traceon_trigger_ops = { +static const struct event_trigger_ops traceon_trigger_ops = { .trigger = traceon_trigger, .print = traceon_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops traceon_count_trigger_ops = { +static const struct event_trigger_ops traceon_count_trigger_ops = { .trigger = traceon_count_trigger, .print = traceon_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops traceoff_trigger_ops = { +static const struct event_trigger_ops traceoff_trigger_ops = { .trigger = traceoff_trigger, .print = traceoff_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops traceoff_count_trigger_ops = { +static const struct event_trigger_ops traceoff_count_trigger_ops = { .trigger = traceoff_count_trigger, .print = traceoff_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops * +static const struct event_trigger_ops * onoff_get_trigger_ops(char *cmd, char *param) { - struct event_trigger_ops *ops; + const struct event_trigger_ops *ops; /* we register both traceon and traceoff to this callback */ if (strcmp(cmd, "traceon") == 0) @@ -1491,21 +1484,21 @@ snapshot_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static struct event_trigger_ops snapshot_trigger_ops = { +static const struct event_trigger_ops snapshot_trigger_ops = { .trigger = snapshot_trigger, .print = snapshot_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops snapshot_count_trigger_ops = { +static const struct event_trigger_ops snapshot_count_trigger_ops = { .trigger = snapshot_count_trigger, .print = snapshot_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops * +static const struct event_trigger_ops * snapshot_get_trigger_ops(char *cmd, char *param) { return param ? &snapshot_count_trigger_ops : &snapshot_trigger_ops; @@ -1560,7 +1553,7 @@ stacktrace_trigger(struct event_trigger_data *data, struct trace_event_file *file = data->private_data; if (file) - __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP); + __trace_stack(file->tr, tracing_gen_ctx_dec(), STACK_SKIP); else trace_dump_stack(STACK_SKIP); } @@ -1586,21 +1579,21 @@ stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static struct event_trigger_ops stacktrace_trigger_ops = { +static const struct event_trigger_ops stacktrace_trigger_ops = { .trigger = stacktrace_trigger, .print = stacktrace_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops stacktrace_count_trigger_ops = { +static const struct event_trigger_ops stacktrace_count_trigger_ops = { .trigger = stacktrace_count_trigger, .print = stacktrace_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops * +static const struct event_trigger_ops * stacktrace_get_trigger_ops(char *cmd, char *param) { return param ? &stacktrace_count_trigger_ops : &stacktrace_trigger_ops; @@ -1711,28 +1704,28 @@ void event_enable_trigger_free(struct event_trigger_data *data) } } -static struct event_trigger_ops event_enable_trigger_ops = { +static const struct event_trigger_ops event_enable_trigger_ops = { .trigger = event_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops event_enable_count_trigger_ops = { +static const struct event_trigger_ops event_enable_count_trigger_ops = { .trigger = event_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops event_disable_trigger_ops = { +static const struct event_trigger_ops event_disable_trigger_ops = { .trigger = event_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops event_disable_count_trigger_ops = { +static const struct event_trigger_ops event_disable_count_trigger_ops = { .trigger = event_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, @@ -1772,7 +1765,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, ret = -EINVAL; event_enable_file = find_event_file(tr, system, event); if (!event_enable_file) - goto out; + return ret; #ifdef CONFIG_HIST_TRIGGERS hist = ((strcmp(cmd, ENABLE_HIST_STR) == 0) || @@ -1787,16 +1780,16 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL); if (!enable_data) - goto out; + return ret; enable_data->hist = hist; enable_data->enable = enable; enable_data->file = event_enable_file; - trigger_data = event_trigger_alloc(cmd_ops, cmd, param, enable_data); + trigger_data = trigger_data_alloc(cmd_ops, cmd, param, enable_data); if (!trigger_data) { kfree(enable_data); - goto out; + return ret; } if (remove) { @@ -1804,7 +1797,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, kfree(trigger_data); kfree(enable_data); ret = 0; - goto out; + return ret; } /* Up the trigger_data count to make sure nothing frees it on failure */ @@ -1834,7 +1827,6 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, goto out_disable; event_trigger_free(trigger_data); - out: return ret; out_disable: trace_event_enable_disable(event_enable_file, 0, 1); @@ -1845,7 +1837,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, event_trigger_free(trigger_data); kfree(enable_data); - goto out; + return ret; } int event_enable_register_trigger(char *glob, @@ -1865,15 +1857,14 @@ int event_enable_register_trigger(char *glob, (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) && (test_enable_data->file == enable_data->file)) { - ret = -EEXIST; - goto out; + return -EEXIST; } } if (data->ops->init) { ret = data->ops->init(data); if (ret < 0) - goto out; + return ret; } list_add_rcu(&data->list, &file->triggers); @@ -1884,7 +1875,6 @@ int event_enable_register_trigger(char *glob, list_del_rcu(&data->list); update_cond_flag(file); } -out: return ret; } @@ -1916,10 +1906,10 @@ void event_enable_unregister_trigger(char *glob, data->ops->free(data); } -static struct event_trigger_ops * +static const struct event_trigger_ops * event_enable_get_trigger_ops(char *cmd, char *param) { - struct event_trigger_ops *ops; + const struct event_trigger_ops *ops; bool enable; #ifdef CONFIG_HIST_TRIGGERS diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 97325fbd6283..af42aaa3d172 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -455,7 +455,7 @@ static void user_event_enabler_fault_fixup(struct work_struct *work) if (ret && ret != -ENOENT) { struct user_event *user = enabler->event; - pr_warn("user_events: Fault for mm: 0x%pK @ 0x%llx event: %s\n", + pr_warn("user_events: Fault for mm: 0x%p @ 0x%llx event: %s\n", mm->mm, (unsigned long long)uaddr, EVENT_NAME(user)); } @@ -2793,11 +2793,8 @@ static int user_seq_show(struct seq_file *m, void *p) seq_printf(m, "%s", EVENT_TP_NAME(user)); - if (status != 0) - seq_puts(m, " #"); - if (status != 0) { - seq_puts(m, " Used by"); + seq_puts(m, " # Used by"); if (status & EVENT_STATUS_FTRACE) seq_puts(m, " ftrace"); if (status & EVENT_STATUS_PERF) diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index b8f3c4ba309b..b40fa59159ac 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -919,14 +919,15 @@ static void __find_tracepoint_module_cb(struct tracepoint *tp, struct module *mo struct __find_tracepoint_cb_data *data = priv; if (!data->tpoint && !strcmp(data->tp_name, tp->name)) { - data->tpoint = tp; - if (!data->mod) { + /* If module is not specified, try getting module refcount. */ + if (!data->mod && mod) { + /* If failed to get refcount, ignore this tracepoint. */ + if (!try_module_get(mod)) + return; + data->mod = mod; - if (!try_module_get(data->mod)) { - data->tpoint = NULL; - data->mod = NULL; - } } + data->tpoint = tp; } } @@ -939,11 +940,9 @@ static void __find_tracepoint_cb(struct tracepoint *tp, void *priv) } /* - * Find a tracepoint from kernel and module. If the tracepoint is in a module, - * this increments the module refcount to prevent unloading until the - * trace_fprobe is registered to the list. After registering the trace_fprobe - * on the trace_fprobe list, the module refcount is decremented because - * tracepoint_probe_module_cb will handle it. + * Find a tracepoint from kernel and module. If the tracepoint is on the module, + * the module's refcount is incremented and returned as *@tp_mod. Thus, if it is + * not NULL, caller must call module_put(*tp_mod) after used the tracepoint. */ static struct tracepoint *find_tracepoint(const char *tp_name, struct module **tp_mod) @@ -973,6 +972,10 @@ static void reenable_trace_fprobe(struct trace_fprobe *tf) } } +/* + * Find a tracepoint from specified module. In this case, this does not get the + * module's refcount. The caller must ensure the module is not freed. + */ static struct tracepoint *find_tracepoint_in_module(struct module *mod, const char *tp_name) { @@ -1008,10 +1011,13 @@ static int __tracepoint_probe_module_cb(struct notifier_block *self, reenable_trace_fprobe(tf); } } else if (val == MODULE_STATE_GOING && tp_mod->mod == tf->mod) { - tracepoint_probe_unregister(tf->tpoint, + unregister_fprobe(&tf->fp); + if (trace_fprobe_is_tracepoint(tf)) { + tracepoint_probe_unregister(tf->tpoint, tf->tpoint->probestub, NULL); - tf->tpoint = NULL; - tf->mod = NULL; + tf->tpoint = TRACEPOINT_STUB; + tf->mod = NULL; + } } } mutex_unlock(&event_mutex); @@ -1049,6 +1055,19 @@ static int parse_symbol_and_return(int argc, const char *argv[], if (*is_return) return 0; + if (is_tracepoint) { + tmp = *symbol; + while (*tmp && (isalnum(*tmp) || *tmp == '_')) + tmp++; + if (*tmp) { + /* find a wrong character. */ + trace_probe_log_err(tmp - *symbol, BAD_TP_NAME); + kfree(*symbol); + *symbol = NULL; + return -EINVAL; + } + } + /* If there is $retval, this should be a return fprobe. */ for (i = 2; i < argc; i++) { tmp = strstr(argv[i], "$retval"); @@ -1056,6 +1075,8 @@ static int parse_symbol_and_return(int argc, const char *argv[], if (is_tracepoint) { trace_probe_log_set_index(i); trace_probe_log_err(tmp - argv[i], RETVAL_ON_PROBE); + kfree(*symbol); + *symbol = NULL; return -EINVAL; } *is_return = true; @@ -1186,8 +1207,11 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], argc = new_argc; argv = new_argv; } - if (argc > MAX_TRACE_ARGS) + if (argc > MAX_TRACE_ARGS) { + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_ARGS); return -E2BIG; + } ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); if (ret) @@ -1215,6 +1239,11 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], if (is_return && tf->tp.entry_arg) { tf->fp.entry_handler = trace_fprobe_entry_handler; tf->fp.entry_data_size = traceprobe_get_entry_data_size(&tf->tp); + if (ALIGN(tf->fp.entry_data_size, sizeof(long)) > MAX_FPROBE_DATA_SIZE) { + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_EARGS); + return -E2BIG; + } } ret = traceprobe_set_print_fmt(&tf->tp, diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index d358c9935164..d17c18934445 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -25,6 +25,9 @@ static void function_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); static void +function_args_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); +static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); static void @@ -42,9 +45,10 @@ enum { TRACE_FUNC_NO_OPTS = 0x0, /* No flags set. */ TRACE_FUNC_OPT_STACK = 0x1, TRACE_FUNC_OPT_NO_REPEATS = 0x2, + TRACE_FUNC_OPT_ARGS = 0x4, /* Update this to next highest bit. */ - TRACE_FUNC_OPT_HIGHEST_BIT = 0x4 + TRACE_FUNC_OPT_HIGHEST_BIT = 0x8 }; #define TRACE_FUNC_OPT_MASK (TRACE_FUNC_OPT_HIGHEST_BIT - 1) @@ -114,6 +118,8 @@ static ftrace_func_t select_trace_function(u32 flags_val) switch (flags_val & TRACE_FUNC_OPT_MASK) { case TRACE_FUNC_NO_OPTS: return function_trace_call; + case TRACE_FUNC_OPT_ARGS: + return function_args_trace_call; case TRACE_FUNC_OPT_STACK: return function_stack_trace_call; case TRACE_FUNC_OPT_NO_REPEATS: @@ -203,7 +209,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = op->private; - struct trace_array_cpu *data; unsigned int trace_ctx; int bit; @@ -216,11 +221,31 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, parent_ip = function_get_true_parent_ip(parent_ip, fregs); + trace_ctx = tracing_gen_ctx_dec(); + + trace_function(tr, ip, parent_ip, trace_ctx, NULL); + + ftrace_test_recursion_unlock(bit); +} + +static void +function_args_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + struct trace_array *tr = op->private; + unsigned int trace_ctx; + int bit; + + if (unlikely(!tr->function_enabled)) + return; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + trace_ctx = tracing_gen_ctx(); - data = this_cpu_ptr(tr->array_buffer.data); - if (!atomic_read(&data->disabled)) - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, fregs); ftrace_test_recursion_unlock(bit); } @@ -266,11 +291,11 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, parent_ip = function_get_true_parent_ip(parent_ip, fregs); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&data->disabled); + disabled = local_inc_return(&data->disabled); if (likely(disabled == 1)) { trace_ctx = tracing_gen_ctx_flags(flags); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); #ifdef CONFIG_UNWINDER_FRAME_POINTER if (ftrace_pids_enabled(op)) skip++; @@ -278,7 +303,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, __trace_stack(tr, trace_ctx, skip); } - atomic_dec(&data->disabled); + local_dec(&data->disabled); local_irq_restore(flags); } @@ -319,9 +344,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, { struct trace_func_repeats *last_info; struct trace_array *tr = op->private; - struct trace_array_cpu *data; unsigned int trace_ctx; - unsigned long flags; int bit; if (unlikely(!tr->function_enabled)) @@ -332,8 +355,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, return; parent_ip = function_get_true_parent_ip(parent_ip, fregs); - data = this_cpu_ptr(tr->array_buffer.data); - if (atomic_read(&data->disabled)) + if (!tracer_tracing_is_on(tr)) goto out; /* @@ -347,11 +369,10 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, if (is_repeat_check(tr, last_info, ip, parent_ip)) goto out; - local_save_flags(flags); - trace_ctx = tracing_gen_ctx_flags(flags); + trace_ctx = tracing_gen_ctx_dec(); process_repeats(tr, ip, parent_ip, last_info, trace_ctx); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); out: ftrace_test_recursion_unlock(bit); @@ -381,7 +402,7 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, parent_ip = function_get_true_parent_ip(parent_ip, fregs); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&data->disabled); + disabled = local_inc_return(&data->disabled); if (likely(disabled == 1)) { last_info = per_cpu_ptr(tr->last_func_repeats, cpu); @@ -391,12 +412,12 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, trace_ctx = tracing_gen_ctx_flags(flags); process_repeats(tr, ip, parent_ip, last_info, trace_ctx); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); __trace_stack(tr, trace_ctx, STACK_SKIP); } out: - atomic_dec(&data->disabled); + local_dec(&data->disabled); local_irq_restore(flags); } @@ -405,6 +426,9 @@ static struct tracer_opt func_opts[] = { { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, #endif { TRACER_OPT(func-no-repeats, TRACE_FUNC_OPT_NO_REPEATS) }, +#ifdef CONFIG_FUNCTION_TRACE_ARGS + { TRACER_OPT(func-args, TRACE_FUNC_OPT_ARGS) }, +#endif { } /* Always set a last empty entry */ }; @@ -599,11 +623,7 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, static __always_inline void trace_stack(struct trace_array *tr) { - unsigned int trace_ctx; - - trace_ctx = tracing_gen_ctx(); - - __trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP); + __trace_stack(tr, tracing_gen_ctx_dec(), FTRACE_STACK_SKIP); } static void diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 136c750b0b4d..14d74a7491b8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -71,6 +71,10 @@ static struct tracer_opt trace_opts[] = { /* Display function return address ? */ { TRACER_OPT(funcgraph-retaddr, TRACE_GRAPH_PRINT_RETADDR) }, #endif +#ifdef CONFIG_FUNCTION_TRACE_ARGS + /* Display function arguments ? */ + { TRACER_OPT(funcgraph-args, TRACE_GRAPH_ARGS) }, +#endif /* Include sleep time (scheduled out) between entry and return */ { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) }, @@ -110,25 +114,43 @@ static void print_graph_duration(struct trace_array *tr, unsigned long long duration, struct trace_seq *s, u32 flags); -int __trace_graph_entry(struct trace_array *tr, - struct ftrace_graph_ent *trace, - unsigned int trace_ctx) +static int __graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, + unsigned int trace_ctx, struct ftrace_regs *fregs) { struct ring_buffer_event *event; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ftrace_graph_ent_entry *entry; + int size; - event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, - sizeof(*entry), trace_ctx); + /* If fregs is defined, add FTRACE_REGS_MAX_ARGS long size words */ + size = sizeof(*entry) + (FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long)); + + event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, size, trace_ctx); if (!event) return 0; - entry = ring_buffer_event_data(event); - entry->graph_ent = *trace; + + entry = ring_buffer_event_data(event); + entry->graph_ent = *trace; + +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + if (fregs) { + for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++) + entry->args[i] = ftrace_regs_get_argument(fregs, i); + } +#endif + trace_buffer_unlock_commit_nostack(buffer, event); return 1; } +int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned int trace_ctx) +{ + return __graph_entry(tr, trace, trace_ctx, NULL); +} + #ifdef CONFIG_FUNCTION_GRAPH_RETADDR int __trace_graph_retaddr_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, @@ -174,18 +196,15 @@ struct fgraph_times { unsigned long long sleeptime; /* may be optional! */ }; -int trace_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops, - struct ftrace_regs *fregs) +static int graph_entry(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { unsigned long *task_var = fgraph_get_task_var(gops); struct trace_array *tr = gops->private; - struct trace_array_cpu *data; struct fgraph_times *ftimes; unsigned int trace_ctx; - long disabled; int ret = 0; - int cpu; if (*task_var & TRACE_GRAPH_NOTRACE) return 0; @@ -235,25 +254,32 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, if (tracing_thresh) return 1; - preempt_disable_notrace(); - cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_read(&data->disabled); - if (likely(!disabled)) { - trace_ctx = tracing_gen_ctx(); - if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && - tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR)) { - unsigned long retaddr = ftrace_graph_top_ret_addr(current); - ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr); - } else { - ret = __trace_graph_entry(tr, trace, trace_ctx); - } + trace_ctx = tracing_gen_ctx(); + if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && + tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR)) { + unsigned long retaddr = ftrace_graph_top_ret_addr(current); + ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr); + } else { + ret = __graph_entry(tr, trace, trace_ctx, fregs); } - preempt_enable_notrace(); return ret; } +int trace_graph_entry(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) +{ + return graph_entry(trace, gops, NULL); +} + +static int trace_graph_entry_args(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) +{ + return graph_entry(trace, gops, fregs); +} + static void __trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned int trace_ctx) @@ -315,13 +341,10 @@ void trace_graph_return(struct ftrace_graph_ret *trace, { unsigned long *task_var = fgraph_get_task_var(gops); struct trace_array *tr = gops->private; - struct trace_array_cpu *data; struct fgraph_times *ftimes; unsigned int trace_ctx; u64 calltime, rettime; - long disabled; int size; - int cpu; rettime = trace_clock_local(); @@ -340,15 +363,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace, calltime = ftimes->calltime; - preempt_disable_notrace(); - cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_read(&data->disabled); - if (likely(!disabled)) { - trace_ctx = tracing_gen_ctx(); - __trace_graph_return(tr, trace, trace_ctx, calltime, rettime); - } - preempt_enable_notrace(); + trace_ctx = tracing_gen_ctx(); + __trace_graph_return(tr, trace, trace_ctx, calltime, rettime); } static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, @@ -418,14 +434,17 @@ static int graph_trace_init(struct trace_array *tr) { int ret; - tr->gops->entryfunc = trace_graph_entry; + if (tracer_flags_is_set(TRACE_GRAPH_ARGS)) + tr->gops->entryfunc = trace_graph_entry_args; + else + tr->gops->entryfunc = trace_graph_entry; if (tracing_thresh) tr->gops->retfunc = trace_graph_thresh_return; else tr->gops->retfunc = trace_graph_return; - /* Make gops functions are visible before we start tracing */ + /* Make gops functions visible before we start tracing */ smp_mb(); ret = register_ftrace_graph(tr->gops); @@ -436,6 +455,34 @@ static int graph_trace_init(struct trace_array *tr) return 0; } +static struct tracer graph_trace; + +static int ftrace_graph_trace_args(struct trace_array *tr, int set) +{ + trace_func_graph_ent_t entry; + + /* Do nothing if the current tracer is not this tracer */ + if (tr->current_trace != &graph_trace) + return 0; + + if (set) + entry = trace_graph_entry_args; + else + entry = trace_graph_entry; + + /* See if there's any changes */ + if (tr->gops->entryfunc == entry) + return 0; + + unregister_ftrace_graph(tr->gops); + + tr->gops->entryfunc = entry; + + /* Make gops functions visible before we start tracing */ + smp_mb(); + return register_ftrace_graph(tr->gops); +} + static void graph_trace_reset(struct trace_array *tr) { tracing_stop_cmdline_record(); @@ -775,7 +822,7 @@ static void print_graph_retaddr(struct trace_seq *s, struct fgraph_retaddr_ent_e static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entry *entry, struct ftrace_graph_ret *graph_ret, void *func, - u32 opt_flags, u32 trace_flags) + u32 opt_flags, u32 trace_flags, int args_size) { unsigned long err_code = 0; unsigned long retval = 0; @@ -809,11 +856,16 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr if (entry->ent.type != TRACE_GRAPH_RETADDR_ENT) print_retaddr = false; - trace_seq_printf(s, "%ps();", func); + trace_seq_printf(s, "%ps", func); + + if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) { + print_function_args(s, entry->args, (unsigned long)func); + trace_seq_putc(s, ';'); + } else + trace_seq_puts(s, "();"); + if (print_retval || print_retaddr) trace_seq_puts(s, " /*"); - else - trace_seq_putc(s, '\n'); } else { print_retaddr = false; trace_seq_printf(s, "} /* %ps", func); @@ -831,12 +883,13 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr } if (!entry || print_retval || print_retaddr) - trace_seq_puts(s, " */\n"); + trace_seq_puts(s, " */"); } #else -#define print_graph_retval(_seq, _ent, _ret, _func, _opt_flags, _trace_flags) do {} while (0) +#define print_graph_retval(_seq, _ent, _ret, _func, _opt_flags, _trace_flags, args_size) \ + do {} while (0) #endif @@ -852,16 +905,17 @@ print_graph_entry_leaf(struct trace_iterator *iter, struct ftrace_graph_ret *graph_ret; struct ftrace_graph_ent *call; unsigned long long duration; - unsigned long func; + unsigned long ret_func; + int args_size; int cpu = iter->cpu; int i; + args_size = iter->ent_size - offsetof(struct ftrace_graph_ent_entry, args); + graph_ret = &ret_entry->ret; call = &entry->graph_ent; duration = ret_entry->rettime - ret_entry->calltime; - func = call->func + iter->tr->text_delta; - if (data) { struct fgraph_cpu_data *cpu_data; @@ -887,16 +941,25 @@ print_graph_entry_leaf(struct trace_iterator *iter, for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) trace_seq_putc(s, ' '); + ret_func = graph_ret->func + iter->tr->text_delta; + /* * Write out the function return value or return address */ if (flags & (__TRACE_GRAPH_PRINT_RETVAL | __TRACE_GRAPH_PRINT_RETADDR)) { print_graph_retval(s, entry, graph_ret, (void *)graph_ret->func + iter->tr->text_delta, - flags, tr->trace_flags); + flags, tr->trace_flags, args_size); } else { - trace_seq_printf(s, "%ps();\n", (void *)func); + trace_seq_printf(s, "%ps", (void *)ret_func); + + if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) { + print_function_args(s, entry->args, ret_func); + trace_seq_putc(s, ';'); + } else + trace_seq_puts(s, "();"); } + trace_seq_putc(s, '\n'); print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET, cpu, iter->ent->pid, flags); @@ -913,6 +976,7 @@ print_graph_entry_nested(struct trace_iterator *iter, struct fgraph_data *data = iter->private; struct trace_array *tr = iter->tr; unsigned long func; + int args_size; int i; if (data) { @@ -937,7 +1001,17 @@ print_graph_entry_nested(struct trace_iterator *iter, func = call->func + iter->tr->text_delta; - trace_seq_printf(s, "%ps() {", (void *)func); + trace_seq_printf(s, "%ps", (void *)func); + + args_size = iter->ent_size - offsetof(struct ftrace_graph_ent_entry, args); + + if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) + print_function_args(s, entry->args, func); + else + trace_seq_puts(s, "()"); + + trace_seq_puts(s, " {"); + if (flags & __TRACE_GRAPH_PRINT_RETADDR && entry->ent.type == TRACE_GRAPH_RETADDR_ENT) print_graph_retaddr(s, (struct fgraph_retaddr_ent_entry *)entry, @@ -1107,21 +1181,38 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, struct trace_iterator *iter, u32 flags) { struct fgraph_data *data = iter->private; - struct ftrace_graph_ent *call = &field->graph_ent; + struct ftrace_graph_ent *call; struct ftrace_graph_ret_entry *leaf_ret; static enum print_line_t ret; int cpu = iter->cpu; + /* + * print_graph_entry() may consume the current event, + * thus @field may become invalid, so we need to save it. + * sizeof(struct ftrace_graph_ent_entry) is very small, + * it can be safely saved at the stack. + */ + struct ftrace_graph_ent_entry *entry; + u8 save_buf[sizeof(*entry) + FTRACE_REGS_MAX_ARGS * sizeof(long)]; + + /* The ent_size is expected to be as big as the entry */ + if (iter->ent_size > sizeof(save_buf)) + iter->ent_size = sizeof(save_buf); + + entry = (void *)save_buf; + memcpy(entry, field, iter->ent_size); + + call = &entry->graph_ent; if (check_irq_entry(iter, flags, call->func, call->depth)) return TRACE_TYPE_HANDLED; print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags); - leaf_ret = get_return_for_leaf(iter, field); + leaf_ret = get_return_for_leaf(iter, entry); if (leaf_ret) - ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags); + ret = print_graph_entry_leaf(iter, entry, leaf_ret, s, flags); else - ret = print_graph_entry_nested(iter, field, s, cpu, flags); + ret = print_graph_entry_nested(iter, entry, s, cpu, flags); if (data) { /* @@ -1195,7 +1286,8 @@ print_graph_return(struct ftrace_graph_ret_entry *retentry, struct trace_seq *s, * funcgraph-retval option is enabled. */ if (flags & __TRACE_GRAPH_PRINT_RETVAL) { - print_graph_retval(s, NULL, trace, (void *)func, flags, tr->trace_flags); + print_graph_retval(s, NULL, trace, (void *)func, flags, + tr->trace_flags, 0); } else { /* * If the return function does not have a matching entry, @@ -1205,10 +1297,11 @@ print_graph_return(struct ftrace_graph_ret_entry *retentry, struct trace_seq *s, * that if the funcgraph-tail option is enabled. */ if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) - trace_seq_puts(s, "}\n"); + trace_seq_puts(s, "}"); else - trace_seq_printf(s, "} /* %ps */\n", (void *)func); + trace_seq_printf(s, "} /* %ps */", (void *)func); } + trace_seq_putc(s, '\n'); /* Overrun */ if (flags & TRACE_GRAPH_PRINT_OVERRUN) @@ -1323,16 +1416,8 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) switch (entry->type) { case TRACE_GRAPH_ENT: { - /* - * print_graph_entry() may consume the current event, - * thus @field may become invalid, so we need to save it. - * sizeof(struct ftrace_graph_ent_entry) is very small, - * it can be safely saved at the stack. - */ - struct ftrace_graph_ent_entry saved; trace_assign_type(field, entry); - saved = *field; - return print_graph_entry(&saved, s, iter, flags); + return print_graph_entry(field, s, iter, flags); } #ifdef CONFIG_FUNCTION_GRAPH_RETADDR case TRACE_GRAPH_RETADDR_ENT: { @@ -1511,6 +1596,7 @@ void graph_trace_close(struct trace_iterator *iter) if (data) { free_percpu(data->cpu_data); kfree(data); + iter->private = NULL; } } @@ -1526,6 +1612,9 @@ func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (bit == TRACE_GRAPH_GRAPH_TIME) ftrace_graph_graph_time_control(set); + if (bit == TRACE_GRAPH_ARGS) + return ftrace_graph_trace_args(tr, set); + return 0; } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 7294ad676379..5496758b6c76 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -123,12 +123,12 @@ static int func_prolog_dec(struct trace_array *tr, return 0; *data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&(*data)->disabled); + disabled = local_inc_return(&(*data)->disabled); if (likely(disabled == 1)) return 1; - atomic_dec(&(*data)->disabled); + local_dec(&(*data)->disabled); return 0; } @@ -150,9 +150,9 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, trace_ctx = tracing_gen_ctx_flags(flags); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, fregs); - atomic_dec(&data->disabled); + local_dec(&data->disabled); } #endif /* CONFIG_FUNCTION_TRACER */ @@ -209,7 +209,7 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace, trace_ctx = tracing_gen_ctx_flags(flags); ret = __trace_graph_entry(tr, trace, trace_ctx); - atomic_dec(&data->disabled); + local_dec(&data->disabled); return ret; } @@ -238,7 +238,7 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace, trace_ctx = tracing_gen_ctx_flags(flags); __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime); - atomic_dec(&data->disabled); + local_dec(&data->disabled); } static struct fgraph_ops fgraph_ops = { @@ -250,8 +250,6 @@ static void irqsoff_trace_open(struct trace_iterator *iter) { if (is_graph(iter->tr)) graph_trace_open(iter); - else - iter->private = NULL; } static void irqsoff_trace_close(struct trace_iterator *iter) @@ -295,11 +293,17 @@ __trace_function(struct trace_array *tr, if (is_graph(tr)) trace_graph_function(tr, ip, parent_ip, trace_ctx); else - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); } #else -#define __trace_function trace_function +static inline void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned int trace_ctx) +{ + return trace_function(tr, ip, parent_ip, trace_ctx, NULL); +} static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) { @@ -393,6 +397,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) int cpu; struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; + long disabled; if (!tracer_enabled || !tracing_is_enabled()) return; @@ -404,20 +409,22 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) data = per_cpu_ptr(tr->array_buffer.data, cpu); - if (unlikely(!data) || atomic_read(&data->disabled)) + if (unlikely(!data) || local_read(&data->disabled)) return; - atomic_inc(&data->disabled); + disabled = local_inc_return(&data->disabled); - data->critical_sequence = max_sequence; - data->preempt_timestamp = ftrace_now(cpu); - data->critical_start = parent_ip ? : ip; + if (disabled == 1) { + data->critical_sequence = max_sequence; + data->preempt_timestamp = ftrace_now(cpu); + data->critical_start = parent_ip ? : ip; - __trace_function(tr, ip, parent_ip, tracing_gen_ctx()); + __trace_function(tr, ip, parent_ip, tracing_gen_ctx()); - per_cpu(tracing_cpu, cpu) = 1; + per_cpu(tracing_cpu, cpu) = 1; + } - atomic_dec(&data->disabled); + local_dec(&data->disabled); } static nokprobe_inline void @@ -427,6 +434,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned int trace_ctx; + long disabled; cpu = raw_smp_processor_id(); /* Always clear the tracing cpu on stopping the trace */ @@ -441,16 +449,19 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) data = per_cpu_ptr(tr->array_buffer.data, cpu); if (unlikely(!data) || - !data->critical_start || atomic_read(&data->disabled)) + !data->critical_start || local_read(&data->disabled)) return; - atomic_inc(&data->disabled); + disabled = local_inc_return(&data->disabled); - trace_ctx = tracing_gen_ctx(); - __trace_function(tr, ip, parent_ip, trace_ctx); - check_critical_timing(tr, data, parent_ip ? : ip, cpu); - data->critical_start = 0; - atomic_dec(&data->disabled); + if (disabled == 1) { + trace_ctx = tracing_gen_ctx(); + __trace_function(tr, ip, parent_ip, trace_ctx); + check_critical_timing(tr, data, parent_ip ? : ip, cpu); + data->critical_start = 0; + } + + local_dec(&data->disabled); } /* start and stop critical timings used to for stoppage (in idle) */ diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 1e72d20b3c2f..d7b135de958a 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -98,7 +98,6 @@ static int kdb_ftdump(int argc, const char **argv) long cpu_file; int err; int cnt; - int cpu; if (argc > 2) return KDB_ARGCOUNT; @@ -120,9 +119,7 @@ static int kdb_ftdump(int argc, const char **argv) trace_init_global_iter(&iter); iter.buffer_iter = buffer_iter; - for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); - } + tracer_tracing_disable(iter.tr); /* A negative skip_entries means skip all but the last entries */ if (skip_entries < 0) { @@ -135,9 +132,7 @@ static int kdb_ftdump(int argc, const char **argv) ftrace_dump_buf(skip_entries, cpu_file); - for_each_tracing_cpu(cpu) { - atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); - } + tracer_tracing_enable(iter.tr); kdb_trap_printk--; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d8d5f18a141a..3e5c47b6d7b2 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -124,9 +124,8 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) if (!p) return true; *p = '\0'; - rcu_read_lock_sched(); - ret = !!find_module(tk->symbol); - rcu_read_unlock_sched(); + scoped_guard(rcu) + ret = !!find_module(tk->symbol); *p = ':'; return ret; @@ -796,12 +795,10 @@ static struct module *try_module_get_by_name(const char *name) { struct module *mod; - rcu_read_lock_sched(); + guard(rcu)(); mod = find_module(name); if (mod && !try_module_get(mod)) mod = NULL; - rcu_read_unlock_sched(); - return mod; } #else @@ -1007,8 +1004,11 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], argc = new_argc; argv = new_argv; } - if (argc > MAX_TRACE_ARGS) + if (argc > MAX_TRACE_ARGS) { + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_ARGS); return -E2BIG; + } ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); if (ret) @@ -1089,7 +1089,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command) if (raw_command[0] == '-') return dyn_event_release(raw_command, &trace_kprobe_ops); - ret = trace_kprobe_create(raw_command); + ret = dyn_event_create(raw_command, &trace_kprobe_ops); return ret == -ECANCELED ? -EINVAL : ret; } diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index ba5858866b2f..c706544be60c 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -291,7 +291,6 @@ __init static int init_mmio_trace(void) device_initcall(init_mmio_trace); static void __trace_mmiotrace_rw(struct trace_array *tr, - struct trace_array_cpu *data, struct mmiotrace_rw *rw) { struct trace_buffer *buffer = tr->array_buffer.buffer; @@ -315,12 +314,10 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, void mmio_trace_rw(struct mmiotrace_rw *rw) { struct trace_array *tr = mmio_trace_array; - struct trace_array_cpu *data = per_cpu_ptr(tr->array_buffer.data, smp_processor_id()); - __trace_mmiotrace_rw(tr, data, rw); + __trace_mmiotrace_rw(tr, rw); } static void __trace_mmiotrace_map(struct trace_array *tr, - struct trace_array_cpu *data, struct mmiotrace_map *map) { struct trace_buffer *buffer = tr->array_buffer.buffer; @@ -344,12 +341,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, void mmio_trace_mapping(struct mmiotrace_map *map) { struct trace_array *tr = mmio_trace_array; - struct trace_array_cpu *data; - - preempt_disable(); - data = per_cpu_ptr(tr->array_buffer.data, smp_processor_id()); - __trace_mmiotrace_map(tr, data, map); - preempt_enable(); + __trace_mmiotrace_map(tr, map); } int mmio_trace_printk(const char *fmt, va_list args) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index f3a2722ee4c0..6819b93309ce 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -316,33 +316,6 @@ static inline void osn_var_reset_all(void) bool trace_osnoise_callback_enabled; /* - * osnoise sample structure definition. Used to store the statistics of a - * sample run. - */ -struct osnoise_sample { - u64 runtime; /* runtime */ - u64 noise; /* noise */ - u64 max_sample; /* max single noise sample */ - int hw_count; /* # HW (incl. hypervisor) interference */ - int nmi_count; /* # NMIs during this sample */ - int irq_count; /* # IRQs during this sample */ - int softirq_count; /* # softirqs during this sample */ - int thread_count; /* # threads during this sample */ -}; - -#ifdef CONFIG_TIMERLAT_TRACER -/* - * timerlat sample structure definition. Used to store the statistics of - * a sample run. - */ -struct timerlat_sample { - u64 timer_latency; /* timer_latency */ - unsigned int seqnum; /* unique sequence */ - int context; /* timer context */ -}; -#endif - -/* * Tracer data. */ static struct osnoise_data { @@ -497,7 +470,7 @@ static void print_osnoise_headers(struct seq_file *s) * Record an osnoise_sample into the tracer buffer. */ static void -__trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer) +__record_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer) { struct ring_buffer_event *event; struct osnoise_entry *entry; @@ -520,17 +493,19 @@ __trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffe } /* - * Record an osnoise_sample on all osnoise instances. + * Record an osnoise_sample on all osnoise instances and fire trace event. */ -static void trace_osnoise_sample(struct osnoise_sample *sample) +static void record_osnoise_sample(struct osnoise_sample *sample) { struct osnoise_instance *inst; struct trace_buffer *buffer; + trace_osnoise_sample(sample); + rcu_read_lock(); list_for_each_entry_rcu(inst, &osnoise_instances, list) { buffer = inst->tr->array_buffer.buffer; - __trace_osnoise_sample(sample, buffer); + __record_osnoise_sample(sample, buffer); } rcu_read_unlock(); } @@ -574,7 +549,7 @@ static void print_timerlat_headers(struct seq_file *s) #endif /* CONFIG_PREEMPT_RT */ static void -__trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer) +__record_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer) { struct ring_buffer_event *event; struct timerlat_entry *entry; @@ -594,15 +569,17 @@ __trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buf /* * Record an timerlat_sample into the tracer buffer. */ -static void trace_timerlat_sample(struct timerlat_sample *sample) +static void record_timerlat_sample(struct timerlat_sample *sample) { struct osnoise_instance *inst; struct trace_buffer *buffer; + trace_timerlat_sample(sample); + rcu_read_lock(); list_for_each_entry_rcu(inst, &osnoise_instances, list) { buffer = inst->tr->array_buffer.buffer; - __trace_timerlat_sample(sample, buffer); + __record_timerlat_sample(sample, buffer); } rcu_read_unlock(); } @@ -1542,27 +1519,25 @@ static int run_osnoise(void) /* * In some cases, notably when running on a nohz_full CPU with - * a stopped tick PREEMPT_RCU has no way to account for QSs. - * This will eventually cause unwarranted noise as PREEMPT_RCU - * will force preemption as the means of ending the current - * grace period. We avoid this problem by calling - * rcu_momentary_eqs(), which performs a zero duration - * EQS allowing PREEMPT_RCU to end the current grace period. - * This call shouldn't be wrapped inside an RCU critical - * section. + * a stopped tick PREEMPT_RCU or PREEMPT_LAZY have no way to + * account for QSs. This will eventually cause unwarranted + * noise as RCU forces preemption as the means of ending the + * current grace period. We avoid this by calling + * rcu_momentary_eqs(), which performs a zero duration EQS + * allowing RCU to end the current grace period. This call + * shouldn't be wrapped inside an RCU critical section. * - * Note that in non PREEMPT_RCU kernels QSs are handled through - * cond_resched() + * Normally QSs for other cases are handled through cond_resched(). + * For simplicity, however, we call rcu_momentary_eqs() for all + * configurations here. */ - if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { - if (!disable_irq) - local_irq_disable(); + if (!disable_irq) + local_irq_disable(); - rcu_momentary_eqs(); + rcu_momentary_eqs(); - if (!disable_irq) - local_irq_enable(); - } + if (!disable_irq) + local_irq_enable(); /* * For the non-preemptive kernel config: let threads runs, if @@ -1608,7 +1583,7 @@ static int run_osnoise(void) /* Save interference stats info */ diff_osn_sample_stats(osn_var, &s); - trace_osnoise_sample(&s); + record_osnoise_sample(&s); notify_new_max_latency(max_noise); @@ -1803,7 +1778,7 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer) s.timer_latency = diff; s.context = IRQ_CONTEXT; - trace_timerlat_sample(&s); + record_timerlat_sample(&s); if (osnoise_data.stop_tracing) { if (time_to_us(diff) >= osnoise_data.stop_tracing) { @@ -1901,8 +1876,7 @@ static int timerlat_main(void *data) tlat->count = 0; tlat->tracing_thread = false; - hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - tlat->timer.function = timerlat_irq; + hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); tlat->kthread = current; osn_var->pid = current->pid; /* @@ -1923,7 +1897,7 @@ static int timerlat_main(void *data) s.timer_latency = diff; s.context = THREAD_CONTEXT; - trace_timerlat_sample(&s); + record_timerlat_sample(&s); notify_new_max_latency(diff); @@ -2032,7 +2006,6 @@ static int start_kthread(unsigned int cpu) if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); - stop_per_cpu_kthreads(); return -ENOMEM; } @@ -2329,7 +2302,7 @@ osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count, * osnoise_cpus_write - Write function for "cpus" entry * @filp: The active open file structure * @ubuf: The user buffer that contains the value to write - * @cnt: The maximum number of bytes to write to "file" + * @count: The maximum number of bytes to write to "file" * @ppos: The current position in @file * * This function provides a write implementation for the "cpus" @@ -2347,10 +2320,11 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, { cpumask_var_t osnoise_cpumask_new; int running, err; - char buf[256]; + char *buf __free(kfree) = NULL; - if (count >= 256) - return -EINVAL; + buf = kmalloc(count, GFP_KERNEL); + if (!buf) + return -ENOMEM; if (copy_from_user(buf, ubuf, count)) return -EFAULT; @@ -2456,8 +2430,7 @@ static int timerlat_fd_open(struct inode *inode, struct file *file) tlat = this_cpu_tmr_var(); tlat->count = 0; - hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - tlat->timer.function = timerlat_irq; + hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); migrate_enable(); return 0; @@ -2529,7 +2502,7 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count, s.timer_latency = diff; s.context = THREAD_URET; - trace_timerlat_sample(&s); + record_timerlat_sample(&s); notify_new_max_latency(diff); @@ -2564,7 +2537,7 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count, s.timer_latency = diff; s.context = THREAD_CONTEXT; - trace_timerlat_sample(&s); + record_timerlat_sample(&s); if (osnoise_data.stop_tracing_total) { if (time_to_us(diff) >= osnoise_data.stop_tracing_total) { diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 03d56f711ad1..0b3db02030a7 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -5,6 +5,7 @@ * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> * */ +#include "trace.h" #include <linux/module.h> #include <linux/mutex.h> #include <linux/ftrace.h> @@ -12,15 +13,19 @@ #include <linux/sched/clock.h> #include <linux/sched/mm.h> #include <linux/idr.h> +#include <linux/btf.h> +#include <linux/bpf.h> +#include <linux/hashtable.h> #include "trace_output.h" +#include "trace_btf.h" -/* must be a power of 2 */ -#define EVENT_HASHSIZE 128 +/* 2^7 = 128 */ +#define EVENT_HASH_BITS 7 DECLARE_RWSEM(trace_event_sem); -static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; +static DEFINE_HASHTABLE(event_hash, EVENT_HASH_BITS); enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) { @@ -684,6 +689,88 @@ int trace_print_lat_context(struct trace_iterator *iter) return !trace_seq_has_overflowed(s); } +#ifdef CONFIG_FUNCTION_TRACE_ARGS +void print_function_args(struct trace_seq *s, unsigned long *args, + unsigned long func) +{ + const struct btf_param *param; + const struct btf_type *t; + const char *param_name; + char name[KSYM_NAME_LEN]; + unsigned long arg; + struct btf *btf; + s32 tid, nr = 0; + int a, p, x; + + trace_seq_printf(s, "("); + + if (!args) + goto out; + if (lookup_symbol_name(func, name)) + goto out; + + /* TODO: Pass module name here too */ + t = btf_find_func_proto(name, &btf); + if (IS_ERR_OR_NULL(t)) + goto out; + + param = btf_get_func_param(t, &nr); + if (!param) + goto out_put; + + for (a = 0, p = 0; p < nr; a++, p++) { + if (p) + trace_seq_puts(s, ", "); + + /* This only prints what the arch allows (6 args by default) */ + if (a == FTRACE_REGS_MAX_ARGS) { + trace_seq_puts(s, "..."); + break; + } + + arg = args[a]; + + param_name = btf_name_by_offset(btf, param[p].name_off); + if (param_name) + trace_seq_printf(s, "%s=", param_name); + t = btf_type_skip_modifiers(btf, param[p].type, &tid); + + switch (t ? BTF_INFO_KIND(t->info) : BTF_KIND_UNKN) { + case BTF_KIND_UNKN: + trace_seq_putc(s, '?'); + /* Still print unknown type values */ + fallthrough; + case BTF_KIND_PTR: + trace_seq_printf(s, "0x%lx", arg); + break; + case BTF_KIND_INT: + trace_seq_printf(s, "%ld", arg); + break; + case BTF_KIND_ENUM: + trace_seq_printf(s, "%ld", arg); + break; + default: + /* This does not handle complex arguments */ + trace_seq_printf(s, "(%s)[0x%lx", btf_type_str(t), arg); + for (x = sizeof(long); x < t->size; x += sizeof(long)) { + trace_seq_putc(s, ':'); + if (++a == FTRACE_REGS_MAX_ARGS) { + trace_seq_puts(s, "...]"); + goto out_put; + } + trace_seq_printf(s, "0x%lx", args[a]); + } + trace_seq_putc(s, ']'); + break; + } + } +out_put: + btf_put(btf); +out: + trace_seq_printf(s, ")"); +} +#endif + /** * ftrace_find_event - find a registered event * @type: the type of event to look for @@ -694,11 +781,8 @@ int trace_print_lat_context(struct trace_iterator *iter) struct trace_event *ftrace_find_event(int type) { struct trace_event *event; - unsigned key; - - key = type & (EVENT_HASHSIZE - 1); - hlist_for_each_entry(event, &event_hash[key], node) { + hash_for_each_possible(event_hash, event, node, type) { if (event->type == type) return event; } @@ -753,7 +837,6 @@ void trace_event_read_unlock(void) */ int register_trace_event(struct trace_event *event) { - unsigned key; int ret = 0; down_write(&trace_event_sem); @@ -786,9 +869,7 @@ int register_trace_event(struct trace_event *event) if (event->funcs->binary == NULL) event->funcs->binary = trace_nop_print; - key = event->type & (EVENT_HASHSIZE - 1); - - hlist_add_head(&event->node, &event_hash[key]); + hash_add(event_hash, &event->node, event->type); ret = event->type; out: @@ -803,7 +884,7 @@ EXPORT_SYMBOL_GPL(register_trace_event); */ int __unregister_trace_event(struct trace_event *event) { - hlist_del(&event->node); + hash_del(&event->node); free_trace_event_type(event->type); return 0; } @@ -857,6 +938,9 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c struct list_head *head) { struct ftrace_event_field *field; + struct trace_array *tr = iter->tr; + unsigned long long laddr; + unsigned long addr; int offset; int len; int ret; @@ -893,8 +977,8 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c case FILTER_PTR_STRING: if (!iter->fmt_size) trace_iter_expand_format(iter); - pos = *(void **)pos; - ret = strncpy_from_kernel_nofault(iter->fmt, pos, + addr = trace_adjust_address(tr, *(unsigned long *)pos); + ret = strncpy_from_kernel_nofault(iter->fmt, (void *)addr, iter->fmt_size); if (ret < 0) trace_seq_printf(&iter->seq, "(0x%px)", pos); @@ -903,8 +987,8 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c pos, iter->fmt); break; case FILTER_TRACE_FN: - pos = *(void **)pos; - trace_seq_printf(&iter->seq, "%pS", pos); + addr = trace_adjust_address(tr, *(unsigned long *)pos); + trace_seq_printf(&iter->seq, "%pS", (void *)addr); break; case FILTER_CPU: case FILTER_OTHER: @@ -934,14 +1018,36 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c break; } - trace_seq_printf(&iter->seq, "0x%x (%d)", - *(unsigned int *)pos, - *(unsigned int *)pos); + addr = *(unsigned int *)pos; + + /* Some fields reference offset from _stext. */ + if (!strcmp(field->name, "caller_offs") || + !strcmp(field->name, "parent_offs")) { + unsigned long ip; + + ip = addr + (unsigned long)_stext; + ip = trace_adjust_address(tr, ip); + trace_seq_printf(&iter->seq, "%pS ", (void *)ip); + } + + if (sizeof(long) == 4) { + addr = trace_adjust_address(tr, addr); + trace_seq_printf(&iter->seq, "%pS (%d)", + (void *)addr, (int)addr); + } else { + trace_seq_printf(&iter->seq, "0x%x (%d)", + (unsigned int)addr, (int)addr); + } break; case 8: - trace_seq_printf(&iter->seq, "0x%llx (%lld)", - *(unsigned long long *)pos, - *(unsigned long long *)pos); + laddr = *(unsigned long long *)pos; + if (sizeof(long) == 8) { + laddr = trace_adjust_address(tr, (unsigned long)laddr); + trace_seq_printf(&iter->seq, "%pS (%lld)", + (void *)(long)laddr, laddr); + } else { + trace_seq_printf(&iter->seq, "0x%llx (%lld)", laddr, laddr); + } break; default: trace_seq_puts(&iter->seq, "<INVALID-SIZE>"); @@ -961,11 +1067,12 @@ enum print_line_t print_event_fields(struct trace_iterator *iter, struct trace_event_call *call; struct list_head *head; + lockdep_assert_held_read(&trace_event_sem); + /* ftrace defined events have separate call structures */ if (event->type <= __TRACE_LAST_TYPE) { bool found = false; - down_read(&trace_event_sem); list_for_each_entry(call, &ftrace_events, list) { if (call->event.type == event->type) { found = true; @@ -975,7 +1082,6 @@ enum print_line_t print_event_fields(struct trace_iterator *iter, if (call->event.type > __TRACE_LAST_TYPE) break; } - up_read(&trace_event_sem); if (!found) { trace_seq_printf(&iter->seq, "UNKNOWN TYPE %d\n", event->type); goto out; @@ -1005,12 +1111,15 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, } static void print_fn_trace(struct trace_seq *s, unsigned long ip, - unsigned long parent_ip, long delta, int flags) + unsigned long parent_ip, unsigned long *args, + struct trace_array *tr, int flags) { - ip += delta; - parent_ip += delta; + ip = trace_adjust_address(tr, ip); + parent_ip = trace_adjust_address(tr, parent_ip); seq_print_ip_sym(s, ip, flags); + if (args) + print_function_args(s, args, ip); if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) { trace_seq_puts(s, " <-"); @@ -1024,10 +1133,18 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; + unsigned long *args; + int args_size; trace_assign_type(field, iter->ent); - print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags); + args_size = iter->ent_size - offsetof(struct ftrace_entry, args); + if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) + args = field->args; + else + args = NULL; + + print_fn_trace(s, field->ip, field->parent_ip, args, iter->tr, flags); trace_seq_putc(s, '\n'); return trace_handle_return(s); @@ -1248,7 +1365,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, struct trace_seq *s = &iter->seq; unsigned long *p; unsigned long *end; - long delta = iter->tr->text_delta; trace_assign_type(field, iter->ent); end = (unsigned long *)((long)iter->ent + iter->ent_size); @@ -1265,7 +1381,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_seq_puts(s, "[FTRACE TRAMPOLINE]\n"); continue; } - seq_print_ip_sym(s, (*p) + delta, flags); + seq_print_ip_sym(s, trace_adjust_address(iter->tr, *p), flags); trace_seq_putc(s, '\n'); } @@ -1614,7 +1730,7 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - ip = field->ip + iter->tr->text_delta; + ip = trace_adjust_address(iter->tr, field->ip); seq_print_ip_sym(s, ip, flags); trace_seq_printf(s, ": %s", field->buf); @@ -1700,7 +1816,7 @@ trace_func_repeats_print(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags); + print_fn_trace(s, field->ip, field->parent_ip, NULL, iter->tr, flags); trace_seq_printf(s, " (repeats: %u, last_ts:", field->count); trace_print_time(s, iter, iter->ts - FUNC_REPEATS_GET_DELTA_TS(field)); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index dca40f1f1da4..2e305364f2a9 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -41,5 +41,14 @@ extern struct rw_semaphore trace_event_sem; #define SEQ_PUT_HEX_FIELD(s, x) \ trace_seq_putmem_hex(s, &(x), sizeof(x)) +#ifdef CONFIG_FUNCTION_TRACE_ARGS +void print_function_args(struct trace_seq *s, unsigned long *args, + unsigned long func); +#else +static inline void print_function_args(struct trace_seq *s, unsigned long *args, + unsigned long func) { + trace_seq_puts(s, "()"); +} +#endif #endif diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 8f58ee1e8858..424751cdf31f 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -154,9 +154,12 @@ fail: } static struct trace_probe_log trace_probe_log; +extern struct mutex dyn_event_ops_mutex; void trace_probe_log_init(const char *subsystem, int argc, const char **argv) { + lockdep_assert_held(&dyn_event_ops_mutex); + trace_probe_log.subsystem = subsystem; trace_probe_log.argc = argc; trace_probe_log.argv = argv; @@ -165,11 +168,15 @@ void trace_probe_log_init(const char *subsystem, int argc, const char **argv) void trace_probe_log_clear(void) { + lockdep_assert_held(&dyn_event_ops_mutex); + memset(&trace_probe_log, 0, sizeof(trace_probe_log)); } void trace_probe_log_set_index(int index) { + lockdep_assert_held(&dyn_event_ops_mutex); + trace_probe_log.index = index; } @@ -178,6 +185,8 @@ void __trace_probe_log_err(int offset, int err_type) char *command, *p; int i, len = 0, pos = 0; + lockdep_assert_held(&dyn_event_ops_mutex); + if (!trace_probe_log.argv) return; @@ -770,6 +779,10 @@ static int check_prepare_btf_string_fetch(char *typename, #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API +/* + * Add the entry code to store the 'argnum'th parameter and return the offset + * in the entry data buffer where the data will be stored. + */ static int __store_entry_arg(struct trace_probe *tp, int argnum) { struct probe_entry_arg *earg = tp->entry_arg; @@ -793,6 +806,20 @@ static int __store_entry_arg(struct trace_probe *tp, int argnum) tp->entry_arg = earg; } + /* + * The entry code array is repeating the pair of + * [FETCH_OP_ARG(argnum)][FETCH_OP_ST_EDATA(offset of entry data buffer)] + * and the rest of entries are filled with [FETCH_OP_END]. + * + * To reduce the redundant function parameter fetching, we scan the entry + * code array to find the FETCH_OP_ARG which already fetches the 'argnum' + * parameter. If it doesn't match, update 'offset' to find the last + * offset. + * If we find the FETCH_OP_END without matching FETCH_OP_ARG entry, we + * will save the entry with FETCH_OP_ARG and FETCH_OP_ST_EDATA, and + * return data offset so that caller can find the data offset in the entry + * data buffer. + */ offset = 0; for (i = 0; i < earg->size - 1; i++) { switch (earg->code[i].op) { @@ -826,6 +853,16 @@ int traceprobe_get_entry_data_size(struct trace_probe *tp) if (!earg) return 0; + /* + * earg->code[] array has an operation sequence which is run in + * the entry handler. + * The sequence stopped by FETCH_OP_END and each data stored in + * the entry data buffer by FETCH_OP_ST_EDATA. The FETCH_OP_ST_EDATA + * stores the data at the data buffer + its offset, and all data are + * "unsigned long" size. The offset must be increased when a data is + * stored. Thus we need to find the last FETCH_OP_ST_EDATA in the + * code array. + */ for (i = 0; i < earg->size; i++) { switch (earg->code[i].op) { case FETCH_OP_END: diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 5803e6a41570..854e5668f5ee 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -36,7 +36,6 @@ #define MAX_BTF_ARGS_LEN 128 #define MAX_DENTRY_ARGS_LEN 256 #define MAX_STRING_SIZE PATH_MAX -#define MAX_ARG_BUF_LEN (MAX_TRACE_ARGS * MAX_ARG_NAME_LEN) /* Reserved field names */ #define FIELD_STRING_IP "__probe_ip" @@ -481,6 +480,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NON_UNIQ_SYMBOL, "The symbol is not unique"), \ C(BAD_RETPROBE, "Retprobe address must be an function entry"), \ C(NO_TRACEPOINT, "Tracepoint is not found"), \ + C(BAD_TP_NAME, "Invalid character in tracepoint name"),\ C(BAD_ADDR_SUFFIX, "Invalid probed address suffix"), \ C(NO_GROUP_NAME, "Group name is not specified"), \ C(GROUP_TOO_LONG, "Group name is too long"), \ @@ -544,7 +544,9 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NO_BTF_FIELD, "This field is not found."), \ C(BAD_BTF_TID, "Failed to get BTF type info."),\ C(BAD_TYPE4STR, "This type does not fit for string."),\ - C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"), + C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),\ + C(TOO_MANY_ARGS, "Too many arguments are specified"), \ + C(TOO_MANY_EARGS, "Too many entry arguments specified"), #undef C #define C(a, b) TP_ERR_##a diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index af30586f1aea..bf1cb80742ae 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -83,14 +83,14 @@ func_prolog_preempt_disable(struct trace_array *tr, goto out_enable; *data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&(*data)->disabled); + disabled = local_inc_return(&(*data)->disabled); if (unlikely(disabled != 1)) goto out; return 1; out: - atomic_dec(&(*data)->disabled); + local_dec(&(*data)->disabled); out_enable: preempt_enable_notrace(); @@ -144,7 +144,7 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace, *calltime = trace_clock_local(); ret = __trace_graph_entry(tr, trace, trace_ctx); - atomic_dec(&data->disabled); + local_dec(&data->disabled); preempt_enable_notrace(); return ret; @@ -173,7 +173,7 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace, return; __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime); - atomic_dec(&data->disabled); + local_dec(&data->disabled); preempt_enable_notrace(); return; @@ -188,8 +188,6 @@ static void wakeup_trace_open(struct trace_iterator *iter) { if (is_graph(iter->tr)) graph_trace_open(iter); - else - iter->private = NULL; } static void wakeup_trace_close(struct trace_iterator *iter) @@ -242,10 +240,10 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, return; local_irq_save(flags); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, fregs); local_irq_restore(flags); - atomic_dec(&data->disabled); + local_dec(&data->disabled); preempt_enable_notrace(); } @@ -327,7 +325,7 @@ __trace_function(struct trace_array *tr, if (is_graph(tr)) trace_graph_function(tr, ip, parent_ip, trace_ctx); else - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); } static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) @@ -473,7 +471,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, /* disable local data, not wakeup_cpu data */ cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + disabled = local_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); if (likely(disabled != 1)) goto out; @@ -510,7 +508,7 @@ out_unlock: arch_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: - atomic_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + local_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); } static void __wakeup_reset(struct trace_array *tr) @@ -565,7 +563,7 @@ probe_wakeup(void *ignore, struct task_struct *p) (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) return; - disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + disabled = local_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); if (unlikely(disabled != 1)) goto out; @@ -612,7 +610,7 @@ probe_wakeup(void *ignore, struct task_struct *p) out_locked: arch_spin_unlock(&wakeup_lock); out: - atomic_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + local_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); } static void start_wakeup_tracer(struct trace_array *tr) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 14c6f272c4d8..0aa2514a6593 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -32,7 +32,7 @@ static arch_spinlock_t stack_trace_max_lock = DEFINE_PER_CPU(int, disable_stack_tracer); static DEFINE_MUTEX(stack_sysctl_mutex); -int stack_tracer_enabled; +static int stack_tracer_enabled; static void print_max_stack(void) { @@ -542,7 +542,7 @@ static __init int enable_stacktrace(char *str) int len; if ((len = str_has_prefix(str, "_filter="))) - strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE); + strscpy(stack_trace_filter_buf, str + len); stack_tracer_enabled = 1; return 1; @@ -578,3 +578,23 @@ static __init int stack_trace_init(void) } device_initcall(stack_trace_init); + + +static const struct ctl_table trace_stack_sysctl_table[] = { + { + .procname = "stack_tracer_enabled", + .data = &stack_tracer_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = stack_trace_sysctl, + }, +}; + +static int __init init_trace_stack_sysctls(void) +{ + register_sysctl_init("kernel", trace_stack_sysctl_table); + return 0; +} +subsys_initcall(init_trace_stack_sysctls); + + diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index ccc762fbb69c..f95a2c3d5b1b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -562,8 +562,14 @@ static int __trace_uprobe_create(int argc, const char **argv) if (argc < 2) return -ECANCELED; - if (argc - 2 > MAX_TRACE_ARGS) + + trace_probe_log_init("trace_uprobe", argc, argv); + + if (argc - 2 > MAX_TRACE_ARGS) { + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_ARGS); return -E2BIG; + } if (argv[0][1] == ':') event = &argv[0][2]; @@ -582,7 +588,6 @@ static int __trace_uprobe_create(int argc, const char **argv) return -ECANCELED; } - trace_probe_log_init("trace_uprobe", argc, argv); trace_probe_log_set_index(1); /* filename is the 2nd argument */ *arg++ = '\0'; @@ -736,7 +741,7 @@ static int create_or_delete_trace_uprobe(const char *raw_command) if (raw_command[0] == '-') return dyn_event_release(raw_command, &trace_uprobe_ops); - ret = trace_uprobe_create(raw_command); + ret = dyn_event_create(raw_command, &trace_uprobe_ops); return ret == -ECANCELED ? -EINVAL : ret; } @@ -1484,7 +1489,7 @@ int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, : BPF_FD_TYPE_UPROBE; *filename = tu->filename; *probe_offset = tu->offset; - *probe_addr = 0; + *probe_addr = tu->ref_ctr_offset; return 0; } #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 1848ce7e2976..62719d2941c9 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -127,7 +127,7 @@ static void debug_print_probes(struct tracepoint_func *funcs) return; for (i = 0; funcs[i].func; i++) - printk(KERN_DEBUG "Probe %d : %p\n", i, funcs[i].func); + printk(KERN_DEBUG "Probe %d : %pSb\n", i, funcs[i].func); } static struct tracepoint_func * diff --git a/kernel/ucount.c b/kernel/ucount.c index 86c5f1c0bad9..8686e329b8f2 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -11,11 +11,14 @@ struct ucounts init_ucounts = { .ns = &init_user_ns, .uid = GLOBAL_ROOT_UID, - .count = ATOMIC_INIT(1), + .count = RCUREF_INIT(1), }; #define UCOUNTS_HASHTABLE_BITS 10 -static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)]; +#define UCOUNTS_HASHTABLE_ENTRIES (1 << UCOUNTS_HASHTABLE_BITS) +static struct hlist_nulls_head ucounts_hashtable[UCOUNTS_HASHTABLE_ENTRIES] = { + [0 ... UCOUNTS_HASHTABLE_ENTRIES - 1] = HLIST_NULLS_HEAD_INIT(0) +}; static DEFINE_SPINLOCK(ucounts_lock); #define ucounts_hashfn(ns, uid) \ @@ -24,7 +27,6 @@ static DEFINE_SPINLOCK(ucounts_lock); #define ucounts_hashentry(ns, uid) \ (ucounts_hashtable + ucounts_hashfn(ns, uid)) - #ifdef CONFIG_SYSCTL static struct ctl_table_set * set_lookup(struct ctl_table_root *root) @@ -127,88 +129,73 @@ void retire_userns_sysctls(struct user_namespace *ns) #endif } -static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent) +static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, + struct hlist_nulls_head *hashent) { struct ucounts *ucounts; + struct hlist_nulls_node *pos; - hlist_for_each_entry(ucounts, hashent, node) { - if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) - return ucounts; + guard(rcu)(); + hlist_nulls_for_each_entry_rcu(ucounts, pos, hashent, node) { + if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) { + if (rcuref_get(&ucounts->count)) + return ucounts; + } } return NULL; } static void hlist_add_ucounts(struct ucounts *ucounts) { - struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid); + struct hlist_nulls_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid); + spin_lock_irq(&ucounts_lock); - hlist_add_head(&ucounts->node, hashent); + hlist_nulls_add_head_rcu(&ucounts->node, hashent); spin_unlock_irq(&ucounts_lock); } -static inline bool get_ucounts_or_wrap(struct ucounts *ucounts) +struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { - /* Returns true on a successful get, false if the count wraps. */ - return !atomic_add_negative(1, &ucounts->count); -} + struct hlist_nulls_head *hashent = ucounts_hashentry(ns, uid); + struct ucounts *ucounts, *new; -struct ucounts *get_ucounts(struct ucounts *ucounts) -{ - if (!get_ucounts_or_wrap(ucounts)) { - put_ucounts(ucounts); - ucounts = NULL; - } - return ucounts; -} + ucounts = find_ucounts(ns, uid, hashent); + if (ucounts) + return ucounts; -struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) -{ - struct hlist_head *hashent = ucounts_hashentry(ns, uid); - bool wrapped; - struct ucounts *ucounts, *new = NULL; + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + new->ns = ns; + new->uid = uid; + rcuref_init(&new->count, 1); spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); - if (!ucounts) { + if (ucounts) { spin_unlock_irq(&ucounts_lock); - - new = kzalloc(sizeof(*new), GFP_KERNEL); - if (!new) - return NULL; - - new->ns = ns; - new->uid = uid; - atomic_set(&new->count, 1); - - spin_lock_irq(&ucounts_lock); - ucounts = find_ucounts(ns, uid, hashent); - if (!ucounts) { - hlist_add_head(&new->node, hashent); - get_user_ns(new->ns); - spin_unlock_irq(&ucounts_lock); - return new; - } + kfree(new); + return ucounts; } - wrapped = !get_ucounts_or_wrap(ucounts); + hlist_nulls_add_head_rcu(&new->node, hashent); + get_user_ns(new->ns); spin_unlock_irq(&ucounts_lock); - kfree(new); - if (wrapped) { - put_ucounts(ucounts); - return NULL; - } - return ucounts; + return new; } void put_ucounts(struct ucounts *ucounts) { unsigned long flags; - if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) { - hlist_del_init(&ucounts->node); + if (rcuref_put(&ucounts->count)) { + spin_lock_irqsave(&ucounts_lock, flags); + hlist_nulls_del_rcu(&ucounts->node); spin_unlock_irqrestore(&ucounts_lock, flags); + put_user_ns(ucounts->ns); - kfree(ucounts); + kfree_rcu(ucounts, rcu); } } diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index aa0b2e47f2f2..682f40d5632d 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -238,7 +238,7 @@ EXPORT_SYMBOL(__put_user_ns); struct idmap_key { bool map_up; /* true -> id from kid; false -> kid from id */ u32 id; /* id to find */ - u32 count; /* == 0 unless used with map_id_range_down() */ + u32 count; }; /* @@ -343,16 +343,19 @@ u32 map_id_down(struct uid_gid_map *map, u32 id) * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * -map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) +map_id_range_up_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; - u32 first, last; + u32 first, last, id2; + + id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; - if (id >= first && id <= last) + if (id >= first && id <= last && + (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; @@ -363,28 +366,28 @@ map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * -map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) +map_id_range_up_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = true; - key.count = 1; + key.count = count; key.id = id; return bsearch(&key, map->reverse, extents, sizeof(struct uid_gid_extent), cmp_map_id); } -u32 map_id_up(struct uid_gid_map *map, u32 id) +u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) - extent = map_id_up_base(extents, map, id); + extent = map_id_range_up_base(extents, map, id, count); else - extent = map_id_up_max(extents, map, id); + extent = map_id_range_up_max(extents, map, id, count); /* Map the id or note failure */ if (extent) @@ -395,6 +398,11 @@ u32 map_id_up(struct uid_gid_map *map, u32 id) return id; } +u32 map_id_up(struct uid_gid_map *map, u32 id) +{ + return map_id_range_up(map, id, 1); +} + /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c index 8800f5acc007..2f844c279a3e 100644 --- a/kernel/vhost_task.c +++ b/kernel/vhost_task.c @@ -111,7 +111,7 @@ EXPORT_SYMBOL_GPL(vhost_task_stop); * @arg: data to be passed to fn and handled_kill * @name: the thread's name * - * This returns a specialized task for use by the vhost layer or NULL on + * This returns a specialized task for use by the vhost layer or ERR_PTR() on * failure. The returned task is inactive, and the caller must fire it up * through vhost_task_start(). */ @@ -133,7 +133,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *), vtsk = kzalloc(sizeof(*vtsk), GFP_KERNEL); if (!vtsk) - return NULL; + return ERR_PTR(-ENOMEM); init_completion(&vtsk->exited); mutex_init(&vtsk->exit_mutex); vtsk->data = arg; @@ -145,7 +145,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *), tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args); if (IS_ERR(tsk)) { kfree(vtsk); - return NULL; + return ERR_PTR(PTR_ERR(tsk)); } vtsk->task = tsk; diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index 1fec61603ef3..e066d31d08f8 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -210,6 +210,10 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PAGE_HUGETLB_MAPCOUNT_VALUE); #define PAGE_OFFLINE_MAPCOUNT_VALUE (PGTY_offline << 24) VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); +#ifdef CONFIG_UNACCEPTED_MEMORY +#define PAGE_UNACCEPTED_MAPCOUNT_VALUE (PGTY_unaccepted << 24) + VMCOREINFO_NUMBER(PAGE_UNACCEPTED_MAPCOUNT_VALUE); +#endif #ifdef CONFIG_KALLSYMS VMCOREINFO_SYMBOL(kallsyms_names); diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 5267adeaa403..7e45559521af 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -101,12 +101,11 @@ static bool post_one_notification(struct watch_queue *wqueue, struct pipe_inode_info *pipe = wqueue->pipe; struct pipe_buffer *buf; struct page *page; - unsigned int head, tail, mask, note, offset, len; + unsigned int head, tail, note, offset, len; bool done = false; spin_lock_irq(&pipe->rd_wait.lock); - mask = pipe->ring_size - 1; head = pipe->head; tail = pipe->tail; if (pipe_full(head, tail, pipe->ring_size)) @@ -124,7 +123,7 @@ static bool post_one_notification(struct watch_queue *wqueue, memcpy(p + offset, n, len); kunmap_atomic(p); - buf = &pipe->bufs[head & mask]; + buf = pipe_buf(pipe, head); buf->page = page; buf->private = (unsigned long)wqueue; buf->ops = &watch_queue_pipe_buf_ops; @@ -147,7 +146,7 @@ out: return done; lost: - buf = &pipe->bufs[(head - 1) & mask]; + buf = pipe_buf(pipe, head - 1); buf->flags |= PIPE_BUF_FLAG_LOSS; goto out; } @@ -269,6 +268,15 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) if (ret < 0) goto error; + /* + * pipe_resize_ring() does not update nr_accounted for watch_queue + * pipes, because the above vastly overprovisions. Set nr_accounted on + * and max_usage this pipe to the number that was actually charged to + * the user above via account_pipe_buffers. + */ + pipe->max_usage = nr_pages; + pipe->nr_accounted = nr_pages; + ret = -ENOMEM; pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b2da7de39d06..80b56c002c7f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -47,6 +47,7 @@ int __read_mostly watchdog_user_enabled = 1; static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT; static int __read_mostly watchdog_softlockup_user_enabled = 1; int __read_mostly watchdog_thresh = 10; +static int __read_mostly watchdog_thresh_next; static int __read_mostly watchdog_hardlockup_available; struct cpumask watchdog_cpumask __read_mostly; @@ -63,6 +64,29 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace; */ unsigned int __read_mostly hardlockup_panic = IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC); + +#ifdef CONFIG_SYSFS + +static unsigned int hardlockup_count; + +static ssize_t hardlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%u\n", hardlockup_count); +} + +static struct kobj_attribute hardlockup_count_attr = __ATTR_RO(hardlockup_count); + +static __init int kernel_hardlockup_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &hardlockup_count_attr.attr, NULL); + return 0; +} + +late_initcall(kernel_hardlockup_sysfs_init); + +#endif // CONFIG_SYSFS + /* * We may not want to enable hard lockup detection by default in all cases, * for example when running the kernel as a guest on a hypervisor. In these @@ -169,6 +193,10 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) unsigned int this_cpu = smp_processor_id(); unsigned long flags; +#ifdef CONFIG_SYSFS + ++hardlockup_count; +#endif + /* Only print hardlockups once. */ if (per_cpu(watchdog_hardlockup_warned, cpu)) return; @@ -311,6 +339,28 @@ unsigned int __read_mostly softlockup_panic = static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; +#ifdef CONFIG_SYSFS + +static unsigned int softlockup_count; + +static ssize_t softlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%u\n", softlockup_count); +} + +static struct kobj_attribute softlockup_count_attr = __ATTR_RO(softlockup_count); + +static __init int kernel_softlockup_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &softlockup_count_attr.attr, NULL); + return 0; +} + +late_initcall(kernel_softlockup_sysfs_init); + +#endif // CONFIG_SYSFS + /* Timestamp taken after the last successful reschedule. */ static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); /* Timestamp of the last softlockup report. */ @@ -347,8 +397,6 @@ static int __init watchdog_thresh_setup(char *str) } __setup("watchdog_thresh=", watchdog_thresh_setup); -static void __lockup_detector_cleanup(void); - #ifdef CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM enum stats_per_group { STATS_SYSTEM, @@ -744,6 +792,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) touch_ts = __this_cpu_read(watchdog_touch_ts); duration = is_softlockup(touch_ts, period_ts, now); if (unlikely(duration)) { +#ifdef CONFIG_SYSFS + ++softlockup_count; +#endif + /* * Prevent multiple soft-lockup reports if one cpu is already * engaged in dumping all cpu back traces. @@ -797,8 +849,7 @@ static void watchdog_enable(unsigned int cpu) * Start the timer first to prevent the hardlockup watchdog triggering * before the timer has a chance to fire. */ - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - hrtimer->function = watchdog_timer_fn; + hrtimer_setup(hrtimer, watchdog_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL_PINNED_HARD); @@ -873,12 +924,20 @@ int lockup_detector_offline_cpu(unsigned int cpu) return 0; } -static void __lockup_detector_reconfigure(void) +static void __lockup_detector_reconfigure(bool thresh_changed) { cpus_read_lock(); watchdog_hardlockup_stop(); softlockup_stop_all(); + /* + * To prevent watchdog_timer_fn from using the old interval and + * the new watchdog_thresh at the same time, which could lead to + * false softlockup reports, it is necessary to update the + * watchdog_thresh after the softlockup is completed. + */ + if (thresh_changed) + watchdog_thresh = READ_ONCE(watchdog_thresh_next); set_sample_period(); lockup_detector_update_enable(); if (watchdog_enabled && watchdog_thresh) @@ -886,17 +945,12 @@ static void __lockup_detector_reconfigure(void) watchdog_hardlockup_start(); cpus_read_unlock(); - /* - * Must be called outside the cpus locked section to prevent - * recursive locking in the perf code. - */ - __lockup_detector_cleanup(); } void lockup_detector_reconfigure(void) { mutex_lock(&watchdog_mutex); - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(false); mutex_unlock(&watchdog_mutex); } @@ -916,48 +970,32 @@ static __init void lockup_detector_setup(void) return; mutex_lock(&watchdog_mutex); - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(false); softlockup_initialized = true; mutex_unlock(&watchdog_mutex); } #else /* CONFIG_SOFTLOCKUP_DETECTOR */ -static void __lockup_detector_reconfigure(void) +static void __lockup_detector_reconfigure(bool thresh_changed) { cpus_read_lock(); watchdog_hardlockup_stop(); + if (thresh_changed) + watchdog_thresh = READ_ONCE(watchdog_thresh_next); lockup_detector_update_enable(); watchdog_hardlockup_start(); cpus_read_unlock(); } void lockup_detector_reconfigure(void) { - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(false); } static inline void lockup_detector_setup(void) { - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(false); } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ -static void __lockup_detector_cleanup(void) -{ - lockdep_assert_held(&watchdog_mutex); - hardlockup_detector_perf_cleanup(); -} - -/** - * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes - * - * Caller must not hold the cpu hotplug rwsem. - */ -void lockup_detector_cleanup(void) -{ - mutex_lock(&watchdog_mutex); - __lockup_detector_cleanup(); - mutex_unlock(&watchdog_mutex); -} - /** * lockup_detector_soft_poweroff - Interface to stop lockup detector(s) * @@ -972,11 +1010,11 @@ void lockup_detector_soft_poweroff(void) #ifdef CONFIG_SYSCTL /* Propagate any changes to the watchdog infrastructure */ -static void proc_watchdog_update(void) +static void proc_watchdog_update(bool thresh_changed) { /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); - __lockup_detector_reconfigure(); + __lockup_detector_reconfigure(thresh_changed); } /* @@ -1010,7 +1048,7 @@ static int proc_watchdog_common(int which, const struct ctl_table *table, int wr } else { err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!err && old != READ_ONCE(*param)) - proc_watchdog_update(); + proc_watchdog_update(false); } mutex_unlock(&watchdog_mutex); return err; @@ -1061,11 +1099,13 @@ static int proc_watchdog_thresh(const struct ctl_table *table, int write, mutex_lock(&watchdog_mutex); - old = READ_ONCE(watchdog_thresh); + watchdog_thresh_next = READ_ONCE(watchdog_thresh); + + old = watchdog_thresh_next; err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!err && write && old != READ_ONCE(watchdog_thresh)) - proc_watchdog_update(); + if (!err && write && old != READ_ONCE(watchdog_thresh_next)) + proc_watchdog_update(true); mutex_unlock(&watchdog_mutex); return err; @@ -1086,7 +1126,7 @@ static int proc_watchdog_cpumask(const struct ctl_table *table, int write, err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); if (!err && write) - proc_watchdog_update(); + proc_watchdog_update(false); mutex_unlock(&watchdog_mutex); return err; @@ -1106,7 +1146,7 @@ static const struct ctl_table watchdog_sysctls[] = { }, { .procname = "watchdog_thresh", - .data = &watchdog_thresh, + .data = &watchdog_thresh_next, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_watchdog_thresh, diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 59c1d86a73a2..75af12ff774e 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -21,8 +21,6 @@ #include <linux/perf_event.h> static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); -static DEFINE_PER_CPU(struct perf_event *, dead_event); -static struct cpumask dead_events_mask; static atomic_t watchdog_cpus = ATOMIC_INIT(0); @@ -146,6 +144,7 @@ static int hardlockup_detector_event_create(void) PTR_ERR(evt)); return PTR_ERR(evt); } + WARN_ONCE(this_cpu_read(watchdog_ev), "unexpected watchdog_ev leak"); this_cpu_write(watchdog_ev, evt); return 0; } @@ -181,37 +180,13 @@ void watchdog_hardlockup_disable(unsigned int cpu) if (event) { perf_event_disable(event); + perf_event_release_kernel(event); this_cpu_write(watchdog_ev, NULL); - this_cpu_write(dead_event, event); - cpumask_set_cpu(smp_processor_id(), &dead_events_mask); atomic_dec(&watchdog_cpus); } } /** - * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them - * - * Called from lockup_detector_cleanup(). Serialized by the caller. - */ -void hardlockup_detector_perf_cleanup(void) -{ - int cpu; - - for_each_cpu(cpu, &dead_events_mask) { - struct perf_event *event = per_cpu(dead_event, cpu); - - /* - * Required because for_each_cpu() reports unconditionally - * CPU0 as set on UP kernels. Sigh. - */ - if (event) - perf_event_release_kernel(event); - per_cpu(dead_event, cpu) = NULL; - } - cpumask_clear(&dead_events_mask); -} - -/** * hardlockup_detector_perf_stop - Globally stop watchdog events * * Special interface for x86 to handle the perf HT bug. @@ -294,12 +269,10 @@ void __init hardlockup_config_perf_event(const char *str) } else { unsigned int len = comma - str; - if (len >= sizeof(buf)) + if (len > sizeof(buf)) return; - if (strscpy(buf, str, sizeof(buf)) < 0) - return; - buf[len] = 0; + strscpy(buf, str, len); if (kstrtoull(buf, 16, &config)) return; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 97152f2250fe..9f9148075828 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -686,7 +686,7 @@ EXPORT_SYMBOL_GPL(destroy_work_on_stack); void destroy_delayed_work_on_stack(struct delayed_work *work) { - destroy_timer_on_stack(&work->timer); + timer_destroy_on_stack(&work->timer); debug_object_free(&work->work, &work_debug_descr); } EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack); @@ -2057,11 +2057,11 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags, struct delayed_work *dwork = to_delayed_work(work); /* - * dwork->timer is irqsafe. If del_timer() fails, it's + * dwork->timer is irqsafe. If timer_delete() fails, it's * guaranteed that the timer is not queued anywhere and not * running on the local CPU. */ - if (likely(del_timer(&dwork->timer))) + if (likely(timer_delete(&dwork->timer))) return 1; } @@ -2254,8 +2254,10 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, * queues a new work item to a wq after destroy_workqueue(wq). */ if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) && - WARN_ON_ONCE(!is_chained_work(wq)))) + WARN_ONCE(!is_chained_work(wq), "workqueue: cannot queue %ps on wq %s\n", + work->func, wq->name))) { return; + } rcu_read_lock(); retry: /* pwq which will be used unless @work is executing elsewhere */ @@ -2479,7 +2481,7 @@ EXPORT_SYMBOL_GPL(queue_work_node); void delayed_work_timer_fn(struct timer_list *t) { - struct delayed_work *dwork = from_timer(dwork, t, timer); + struct delayed_work *dwork = timer_container_of(dwork, t, timer); /* should have been called from irqsafe timer with irq already off */ __queue_work(dwork->cpu, dwork->wq, &dwork->work); @@ -2907,7 +2909,7 @@ static void set_worker_dying(struct worker *worker, struct list_head *list) */ static void idle_worker_timeout(struct timer_list *t) { - struct worker_pool *pool = from_timer(pool, t, idle_timer); + struct worker_pool *pool = timer_container_of(pool, t, idle_timer); bool do_cull = false; if (work_pending(&pool->idle_cull_work)) @@ -3006,7 +3008,7 @@ static void send_mayday(struct work_struct *work) static void pool_mayday_timeout(struct timer_list *t) { - struct worker_pool *pool = from_timer(pool, t, mayday_timer); + struct worker_pool *pool = timer_container_of(pool, t, mayday_timer); struct work_struct *work; raw_spin_lock_irq(&pool->lock); @@ -3067,7 +3069,7 @@ restart: break; } - del_timer_sync(&pool->mayday_timer); + timer_delete_sync(&pool->mayday_timer); raw_spin_lock_irq(&pool->lock); /* * This is necessary even after a new worker was just successfully @@ -3239,7 +3241,7 @@ __acquires(&pool->lock) * point will only record its address. */ trace_workqueue_execute_end(work, worker->current_func); - pwq->stats[PWQ_STAT_COMPLETED]++; + lock_map_release(&lockdep_map); if (!bh_draining) lock_map_release(pwq->wq->lockdep_map); @@ -3270,6 +3272,8 @@ __acquires(&pool->lock) raw_spin_lock_irq(&pool->lock); + pwq->stats[PWQ_STAT_COMPLETED]++; + /* * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked * CPU intensive by wq_worker_tick() if @work hogged CPU longer than @@ -4279,7 +4283,7 @@ EXPORT_SYMBOL_GPL(flush_work); bool flush_delayed_work(struct delayed_work *dwork) { local_irq_disable(); - if (del_timer_sync(&dwork->timer)) + if (timer_delete_sync(&dwork->timer)) __queue_work(dwork->cpu, dwork->wq, &dwork->work); local_irq_enable(); return flush_work(&dwork->work); @@ -4982,9 +4986,9 @@ static void put_unbound_pool(struct worker_pool *pool) reap_dying_workers(&cull_list); /* shut down the timers */ - del_timer_sync(&pool->idle_timer); + timer_delete_sync(&pool->idle_timer); cancel_work_sync(&pool->idle_cull_work); - del_timer_sync(&pool->mayday_timer); + timer_delete_sync(&pool->mayday_timer); /* RCU protected to allow dereferences from get_work_pool() */ call_rcu(&pool->rcu, rcu_free_pool); @@ -5835,6 +5839,17 @@ static bool pwq_busy(struct pool_workqueue *pwq) * @wq: target workqueue * * Safely destroy a workqueue. All work currently pending will be done first. + * + * This function does NOT guarantee that non-pending work that has been + * submitted with queue_delayed_work() and similar functions will be done + * before destroying the workqueue. The fundamental problem is that, currently, + * the workqueue has no way of accessing non-pending delayed_work. delayed_work + * is only linked on the timer-side. All delayed_work must, therefore, be + * canceled before calling this function. + * + * TODO: It would be better if the problem described above wouldn't exist and + * destroy_workqueue() would cleanly cancel all pending and non-pending + * delayed_work. */ void destroy_workqueue(struct workqueue_struct *wq) { @@ -7635,7 +7650,7 @@ notrace void wq_watchdog_touch(int cpu) static void wq_watchdog_set_thresh(unsigned long thresh) { wq_watchdog_thresh = 0; - del_timer_sync(&wq_watchdog_timer); + timer_delete_sync(&wq_watchdog_timer); if (thresh) { wq_watchdog_thresh = thresh; @@ -7752,7 +7767,8 @@ void __init workqueue_init_early(void) restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask); cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask); - + cpumask_andnot(wq_isolated_cpumask, cpu_possible_mask, + housekeeping_cpumask(HK_TYPE_DOMAIN)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs(); |