diff options
Diffstat (limited to 'kernel')
81 files changed, 2058 insertions, 1369 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 547c88be8a28..93d0b87f3283 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -64,6 +64,7 @@ #include <uapi/linux/limits.h> #include <uapi/linux/netfilter/nf_tables.h> #include <uapi/linux/openat2.h> // struct open_how +#include <uapi/linux/fanotify.h> #include "audit.h" @@ -2252,7 +2253,7 @@ static inline int audit_copy_fcaps(struct audit_names *name, if (!dentry) return 0; - rc = get_vfs_caps_from_disk(&init_user_ns, dentry, &caps); + rc = get_vfs_caps_from_disk(&nop_mnt_idmap, dentry, &caps); if (rc) return rc; @@ -2807,7 +2808,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, ax->d.next = context->aux; context->aux = (void *)ax; - get_vfs_caps_from_disk(&init_user_ns, + get_vfs_caps_from_disk(&nop_mnt_idmap, bprm->file->f_path.dentry, &vcaps); ax->fcap.permitted = vcaps.permitted; @@ -2877,10 +2878,21 @@ void __audit_log_kern_module(char *name) context->type = AUDIT_KERN_MODULE; } -void __audit_fanotify(unsigned int response) +void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { - audit_log(audit_context(), GFP_KERNEL, - AUDIT_FANOTIFY, "resp=%u", response); + /* {subj,obj}_trust values are {0,1,2}: no,yes,unknown */ + switch (friar->hdr.type) { + case FAN_RESPONSE_INFO_NONE: + audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY, + "resp=%u fan_type=%u fan_info=0 subj_trust=2 obj_trust=2", + response, FAN_RESPONSE_INFO_NONE); + break; + case FAN_RESPONSE_INFO_AUDIT_RULE: + audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY, + "resp=%u fan_type=%u fan_info=%X subj_trust=%u obj_trust=%u", + response, friar->hdr.type, friar->rule_number, + friar->subj_trust, friar->obj_trust); + } } void __audit_tk_injoffset(struct timespec64 offset) diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 9ea42a45da47..e14c822f8911 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -51,7 +51,6 @@ BTF_SET_END(bpf_lsm_current_hooks) */ BTF_SET_START(bpf_lsm_locked_sockopt_hooks) #ifdef CONFIG_SECURITY_NETWORK -BTF_ID(func, bpf_lsm_socket_sock_rcv_skb) BTF_ID(func, bpf_lsm_sock_graft) BTF_ID(func, bpf_lsm_inet_csk_clone) BTF_ID(func, bpf_lsm_inet_conn_established) @@ -351,8 +350,10 @@ BTF_ID(func, bpf_lsm_bpf_prog_alloc_security) BTF_ID(func, bpf_lsm_bpf_prog_free_security) BTF_ID(func, bpf_lsm_file_alloc_security) BTF_ID(func, bpf_lsm_file_free_security) +#ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_sk_alloc_security) BTF_ID(func, bpf_lsm_sk_free_security) +#endif /* CONFIG_SECURITY_NETWORK */ BTF_ID(func, bpf_lsm_task_free) BTF_SET_END(untrusted_lsm_hooks) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f7dd8af06413..b7017cae6fd1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7782,9 +7782,9 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL); - return 0; end: - btf_free_dtor_kfunc_tab(btf); + if (ret) + btf_free_dtor_kfunc_tab(btf); btf_put(btf); return ret; } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 5aa2b5525f79..66bded144377 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -152,7 +152,7 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab, { unsigned long flags; - hash = hash & HASHTAB_MAP_LOCK_MASK; + hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); preempt_disable(); if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { @@ -171,7 +171,7 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab, struct bucket *b, u32 hash, unsigned long flags) { - hash = hash & HASHTAB_MAP_LOCK_MASK; + hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); raw_spin_unlock_irqrestore(&b->raw_lock, flags); __this_cpu_dec(*(htab->map_locked[hash])); preempt_enable(); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 4f841e16779e..9948b542a470 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -122,7 +122,7 @@ static struct inode *bpf_get_inode(struct super_block *sb, inode->i_mtime = inode->i_atime; inode->i_ctime = inode->i_atime; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); return inode; } @@ -152,7 +152,7 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, dir->i_ctime = dir->i_mtime; } -static int bpf_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -382,7 +382,7 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) return simple_lookup(dir, dentry, flags); } -static int bpf_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *target) { char *link = kstrdup(target, GFP_USER | __GFP_NOWARN); @@ -559,7 +559,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags) static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) { struct bpf_prog *prog; - int ret = inode_permission(&init_user_ns, inode, MAY_READ); + int ret = inode_permission(&nop_mnt_idmap, inode, MAY_READ); if (ret) return ERR_PTR(ret); diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index ebcc3dd0fa19..1db156405b68 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -71,7 +71,7 @@ static int bpf_mem_cache_idx(size_t size) if (size <= 192) return size_index[(size - 1) / 8] - 1; - return fls(size - 1) - 1; + return fls(size - 1) - 2; } #define NUM_CACHES 11 diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 13e4efc971e6..190d9f9dc987 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -216,9 +216,6 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog) if (offload->dev_state) offload->offdev->ops->destroy(prog); - /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */ - bpf_prog_free_id(prog, true); - list_del_init(&offload->offloads); kfree(offload); prog->aux->offload = NULL; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 64131f88c553..ecca9366c7a6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1972,7 +1972,7 @@ static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) return; if (audit_enabled == AUDIT_OFF) return; - if (op == BPF_AUDIT_LOAD) + if (!in_irq() && !irqs_disabled()) ctx = audit_context(); ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF); if (unlikely(!ab)) @@ -2001,7 +2001,7 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog) return id > 0 ? 0 : id; } -void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) +void bpf_prog_free_id(struct bpf_prog *prog) { unsigned long flags; @@ -2013,18 +2013,10 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) if (!prog->aux->id) return; - if (do_idr_lock) - spin_lock_irqsave(&prog_idr_lock, flags); - else - __acquire(&prog_idr_lock); - + spin_lock_irqsave(&prog_idr_lock, flags); idr_remove(&prog_idr, prog->aux->id); prog->aux->id = 0; - - if (do_idr_lock) - spin_unlock_irqrestore(&prog_idr_lock, flags); - else - __release(&prog_idr_lock); + spin_unlock_irqrestore(&prog_idr_lock, flags); } static void __bpf_prog_put_rcu(struct rcu_head *rcu) @@ -2067,17 +2059,15 @@ static void bpf_prog_put_deferred(struct work_struct *work) prog = aux->prog; perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); bpf_audit_prog(prog, BPF_AUDIT_UNLOAD); + bpf_prog_free_id(prog); __bpf_prog_put_noref(prog, true); } -static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) +static void __bpf_prog_put(struct bpf_prog *prog) { struct bpf_prog_aux *aux = prog->aux; if (atomic64_dec_and_test(&aux->refcnt)) { - /* bpf_prog_free_id() must be called first */ - bpf_prog_free_id(prog, do_idr_lock); - if (in_irq() || irqs_disabled()) { INIT_WORK(&aux->work, bpf_prog_put_deferred); schedule_work(&aux->work); @@ -2089,7 +2079,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) void bpf_prog_put(struct bpf_prog *prog) { - __bpf_prog_put(prog, true); + __bpf_prog_put(prog); } EXPORT_SYMBOL_GPL(bpf_prog_put); diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index c2a2182ce570..c4ab9d6cdbe9 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -438,6 +438,7 @@ struct bpf_iter_seq_task_vma_info { */ struct bpf_iter_seq_task_common common; struct task_struct *task; + struct mm_struct *mm; struct vm_area_struct *vma; u32 tid; unsigned long prev_vm_start; @@ -456,16 +457,19 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) enum bpf_task_vma_iter_find_op op; struct vm_area_struct *curr_vma; struct task_struct *curr_task; + struct mm_struct *curr_mm; u32 saved_tid = info->tid; /* If this function returns a non-NULL vma, it holds a reference to - * the task_struct, and holds read lock on vma->mm->mmap_lock. + * the task_struct, holds a refcount on mm->mm_users, and holds + * read lock on vma->mm->mmap_lock. * If this function returns NULL, it does not hold any reference or * lock. */ if (info->task) { curr_task = info->task; curr_vma = info->vma; + curr_mm = info->mm; /* In case of lock contention, drop mmap_lock to unblock * the writer. * @@ -504,13 +508,15 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) * 4.2) VMA2 and VMA2' covers different ranges, process * VMA2'. */ - if (mmap_lock_is_contended(curr_task->mm)) { + if (mmap_lock_is_contended(curr_mm)) { info->prev_vm_start = curr_vma->vm_start; info->prev_vm_end = curr_vma->vm_end; op = task_vma_iter_find_vma; - mmap_read_unlock(curr_task->mm); - if (mmap_read_lock_killable(curr_task->mm)) + mmap_read_unlock(curr_mm); + if (mmap_read_lock_killable(curr_mm)) { + mmput(curr_mm); goto finish; + } } else { op = task_vma_iter_next_vma; } @@ -535,42 +541,47 @@ again: op = task_vma_iter_find_vma; } - if (!curr_task->mm) + curr_mm = get_task_mm(curr_task); + if (!curr_mm) goto next_task; - if (mmap_read_lock_killable(curr_task->mm)) + if (mmap_read_lock_killable(curr_mm)) { + mmput(curr_mm); goto finish; + } } switch (op) { case task_vma_iter_first_vma: - curr_vma = find_vma(curr_task->mm, 0); + curr_vma = find_vma(curr_mm, 0); break; case task_vma_iter_next_vma: - curr_vma = find_vma(curr_task->mm, curr_vma->vm_end); + curr_vma = find_vma(curr_mm, curr_vma->vm_end); break; case task_vma_iter_find_vma: /* We dropped mmap_lock so it is necessary to use find_vma * to find the next vma. This is similar to the mechanism * in show_smaps_rollup(). */ - curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1); + curr_vma = find_vma(curr_mm, info->prev_vm_end - 1); /* case 1) and 4.2) above just use curr_vma */ /* check for case 2) or case 4.1) above */ if (curr_vma && curr_vma->vm_start == info->prev_vm_start && curr_vma->vm_end == info->prev_vm_end) - curr_vma = find_vma(curr_task->mm, curr_vma->vm_end); + curr_vma = find_vma(curr_mm, curr_vma->vm_end); break; } if (!curr_vma) { /* case 3) above, or case 2) 4.1) with vma->next == NULL */ - mmap_read_unlock(curr_task->mm); + mmap_read_unlock(curr_mm); + mmput(curr_mm); goto next_task; } info->task = curr_task; info->vma = curr_vma; + info->mm = curr_mm; return curr_vma; next_task: @@ -579,6 +590,7 @@ next_task: put_task_struct(curr_task); info->task = NULL; + info->mm = NULL; info->tid++; goto again; @@ -587,6 +599,7 @@ finish: put_task_struct(curr_task); info->task = NULL; info->vma = NULL; + info->mm = NULL; return NULL; } @@ -658,7 +671,9 @@ static void task_vma_seq_stop(struct seq_file *seq, void *v) */ info->prev_vm_start = ~0UL; info->prev_vm_end = info->vma->vm_end; - mmap_read_unlock(info->task->mm); + mmap_read_unlock(info->mm); + mmput(info->mm); + info->mm = NULL; put_task_struct(info->task); info->task = NULL; } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 11f5ec0b8016..d0ed7d6f5eec 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -488,6 +488,10 @@ again: /* reset fops->func and fops->trampoline for re-register */ tr->fops->func = NULL; tr->fops->trampoline = 0; + + /* reset im->image memory attr for arch_prepare_bpf_trampoline */ + set_memory_nx((long)im->image, 1); + set_memory_rw((long)im->image, 1); goto again; } #endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a5255a0dcbb6..7ee218827259 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1054,6 +1054,8 @@ static void print_insn_state(struct bpf_verifier_env *env, */ static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t flags) { + size_t alloc_bytes; + void *orig = dst; size_t bytes; if (ZERO_OR_NULL_PTR(src)) @@ -1062,11 +1064,11 @@ static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; - if (ksize(dst) < ksize(src)) { - kfree(dst); - dst = kmalloc_track_caller(kmalloc_size_roundup(bytes), flags); - if (!dst) - return NULL; + alloc_bytes = max(ksize(orig), kmalloc_size_roundup(bytes)); + dst = krealloc(orig, alloc_bytes, flags); + if (!dst) { + kfree(orig); + return NULL; } memcpy(dst, src, bytes); @@ -2746,6 +2748,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, */ if (insn->src_reg == 0 && is_callback_calling_function(insn->imm)) return -ENOTSUPP; + /* kfunc with imm==0 is invalid and fixup_kfunc_call will + * catch this error later. Make backtracking conservative + * with ENOTSUPP. + */ + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0) + return -ENOTSUPP; /* regular helper call sets R0 */ *reg_mask &= ~1; if (*reg_mask & 0x3f) { @@ -3235,13 +3243,24 @@ static bool __is_pointer_value(bool allow_ptr_leaks, return reg->type != SCALAR_VALUE; } +/* Copy src state preserving dst->parent and dst->live fields */ +static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src) +{ + struct bpf_reg_state *parent = dst->parent; + enum bpf_reg_liveness live = dst->live; + + *dst = *src; + dst->parent = parent; + dst->live = live; +} + static void save_register_state(struct bpf_func_state *state, int spi, struct bpf_reg_state *reg, int size) { int i; - state->stack[spi].spilled_ptr = *reg; + copy_register_state(&state->stack[spi].spilled_ptr, reg); if (size == BPF_REG_SIZE) state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; @@ -3287,7 +3306,9 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, bool sanitize = reg && is_spillable_regtype(reg->type); for (i = 0; i < size; i++) { - if (state->stack[spi].slot_type[i] == STACK_INVALID) { + u8 type = state->stack[spi].slot_type[i]; + + if (type != STACK_MISC && type != STACK_ZERO) { sanitize = true; break; } @@ -3567,7 +3588,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, */ s32 subreg_def = state->regs[dst_regno].subreg_def; - state->regs[dst_regno] = *reg; + copy_register_state(&state->regs[dst_regno], reg); state->regs[dst_regno].subreg_def = subreg_def; } else { for (i = 0; i < size; i++) { @@ -3588,7 +3609,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, if (dst_regno >= 0) { /* restore register state from stack */ - state->regs[dst_regno] = *reg; + copy_register_state(&state->regs[dst_regno], reg); /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions @@ -9582,7 +9603,7 @@ do_sim: */ if (!ptr_is_dst_reg) { tmp = *dst_reg; - *dst_reg = *ptr_reg; + copy_register_state(dst_reg, ptr_reg); } ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1, env->insn_idx); @@ -10835,7 +10856,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) * to propagate min/max range. */ src_reg->id = ++env->id_gen; - *dst_reg = *src_reg; + copy_register_state(dst_reg, src_reg); dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = DEF_NOT_SUBREG; } else { @@ -10846,7 +10867,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->src_reg); return -EACCES; } else if (src_reg->type == SCALAR_VALUE) { - *dst_reg = *src_reg; + copy_register_state(dst_reg, src_reg); /* Make sure ID is cleared otherwise * dst_reg min/max could be incorrectly * propagated into src_reg by find_equal_scalars() @@ -11645,7 +11666,7 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate, bpf_for_each_reg_in_vstate(vstate, state, reg, ({ if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) - *reg = *known_reg; + copy_register_state(reg, known_reg); })); } @@ -11822,10 +11843,17 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, * register B - not null * for JNE A, B, ... - A is not null in the false branch; * for JEQ A, B, ... - A is not null in the true branch. + * + * Since PTR_TO_BTF_ID points to a kernel struct that does + * not need to be null checked by the BPF program, i.e., + * could be null even without PTR_MAYBE_NULL marking, so + * only propagate nullness when neither reg is that type. */ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_X && __is_pointer_value(false, src_reg) && __is_pointer_value(false, dst_reg) && - type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type)) { + type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type) && + base_type(src_reg->type) != PTR_TO_BTF_ID && + base_type(dst_reg->type) != PTR_TO_BTF_ID) { eq_branch_regs = NULL; switch (opcode) { case BPF_JEQ: diff --git a/kernel/capability.c b/kernel/capability.c index 860fd22117c1..339a44dfe2f4 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -486,11 +486,11 @@ EXPORT_SYMBOL(file_ns_capable); * Return true if the inode uid and gid are within the namespace. */ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, const struct inode *inode) { - return vfsuid_has_mapping(ns, i_uid_into_vfsuid(mnt_userns, inode)) && - vfsgid_has_mapping(ns, i_gid_into_vfsgid(mnt_userns, inode)); + return vfsuid_has_mapping(ns, i_uid_into_vfsuid(idmap, inode)) && + vfsgid_has_mapping(ns, i_gid_into_vfsgid(idmap, inode)); } /** @@ -502,13 +502,13 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, * its own user namespace and that the given inode's uid and gid are * mapped into the current user namespace. */ -bool capable_wrt_inode_uidgid(struct user_namespace *mnt_userns, +bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap, const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); return ns_capable(ns, cap) && - privileged_wrt_inode_uidgid(ns, mnt_userns, inode); + privileged_wrt_inode_uidgid(ns, idmap, inode); } EXPORT_SYMBOL(capable_wrt_inode_uidgid); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c099cf3fa02d..935e8121b21e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5065,7 +5065,7 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) if (!inode) return -ENOMEM; - ret = inode_permission(&init_user_ns, inode, MAY_WRITE); + ret = inode_permission(&nop_mnt_idmap, inode, MAY_WRITE); iput(inode); return ret; } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index a585ced99e1e..636f1c682ac0 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1205,12 +1205,13 @@ void rebuild_sched_domains(void) /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed + * @new_cpus: the temp variable for the new effective_cpus mask * * Iterate through each task of @cs updating its cpus_allowed to the * effective cpuset's. As this function is called with cpuset_rwsem held, * cpuset membership stays stable. */ -static void update_tasks_cpumask(struct cpuset *cs) +static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) { struct css_task_iter it; struct task_struct *task; @@ -1224,7 +1225,10 @@ static void update_tasks_cpumask(struct cpuset *cs) if (top_cs && (task->flags & PF_KTHREAD) && kthread_is_per_cpu(task)) continue; - set_cpus_allowed_ptr(task, cs->effective_cpus); + + cpumask_and(new_cpus, cs->effective_cpus, + task_cpu_possible_mask(task)); + set_cpus_allowed_ptr(task, new_cpus); } css_task_iter_end(&it); } @@ -1346,7 +1350,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, * A parent can be left with no CPU as long as there is no * task directly associated with the parent partition. */ - if (!cpumask_intersects(cs->cpus_allowed, parent->effective_cpus) && + if (cpumask_subset(parent->effective_cpus, cs->cpus_allowed) && partition_is_populated(parent, cs)) return PERR_NOCPUS; @@ -1509,7 +1513,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, spin_unlock_irq(&callback_lock); if (adding || deleting) - update_tasks_cpumask(parent); + update_tasks_cpumask(parent, tmp->new_cpus); /* * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary. @@ -1661,7 +1665,7 @@ update_parent_subparts: WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); - update_tasks_cpumask(cp); + update_tasks_cpumask(cp, tmp->new_cpus); /* * On legacy hierarchy, if the effective cpumask of any non- @@ -2309,7 +2313,7 @@ static int update_prstate(struct cpuset *cs, int new_prs) } } - update_tasks_cpumask(parent); + update_tasks_cpumask(parent, tmpmask.new_cpus); if (parent->child_ecpus_count) update_sibling_cpumasks(parent, cs, &tmpmask); @@ -2324,6 +2328,7 @@ out: new_prs = -new_prs; spin_lock_irq(&callback_lock); cs->partition_root_state = new_prs; + WRITE_ONCE(cs->prs_err, err); spin_unlock_irq(&callback_lock); /* * Update child cpusets, if present. @@ -3345,7 +3350,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs, * as the tasks will be migrated to an ancestor. */ if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) - update_tasks_cpumask(cs); + update_tasks_cpumask(cs, new_cpus); if (mems_updated && !nodes_empty(cs->mems_allowed)) update_tasks_nodemask(cs); @@ -3382,7 +3387,7 @@ hotplug_update_tasks(struct cpuset *cs, spin_unlock_irq(&callback_lock); if (cpus_updated) - update_tasks_cpumask(cs); + update_tasks_cpumask(cs, new_cpus); if (mems_updated) update_tasks_nodemask(cs); } @@ -3689,15 +3694,38 @@ void __init cpuset_init_smp(void) * Description: Returns the cpumask_var_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty * subset of cpu_online_mask, even if this means going outside the - * tasks cpuset. + * tasks cpuset, except when the task is in the top cpuset. **/ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { unsigned long flags; + struct cpuset *cs; spin_lock_irqsave(&callback_lock, flags); - guarantee_online_cpus(tsk, pmask); + rcu_read_lock(); + + cs = task_cs(tsk); + if (cs != &top_cpuset) + guarantee_online_cpus(tsk, pmask); + /* + * Tasks in the top cpuset won't get update to their cpumasks + * when a hotplug online/offline event happens. So we include all + * offline cpus in the allowed cpu list. + */ + if ((cs == &top_cpuset) || cpumask_empty(pmask)) { + const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); + + /* + * We first exclude cpus allocated to partitions. If there is no + * allowable online cpu left, we fall back to all possible cpus. + */ + cpumask_andnot(pmask, possible_mask, top_cpuset.subparts_cpus); + if (!cpumask_intersects(pmask, cpu_online_mask)) + cpumask_copy(pmask, possible_mask); + } + + rcu_read_unlock(); spin_unlock_irqrestore(&callback_lock, flags); } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 77978e372377..a09f1c19336a 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -510,7 +510,7 @@ void noinstr __ct_user_enter(enum ctx_state state) * In this we case we don't care about any concurrency/ordering. */ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) - atomic_set(&ct->state, state); + arch_atomic_set(&ct->state, state); } else { /* * Even if context tracking is disabled on this CPU, because it's outside @@ -527,7 +527,7 @@ void noinstr __ct_user_enter(enum ctx_state state) */ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) { /* Tracking for vtime only, no concurrent RCU EQS accounting */ - atomic_set(&ct->state, state); + arch_atomic_set(&ct->state, state); } else { /* * Tracking for vtime and RCU EQS. Make sure we don't race @@ -535,7 +535,7 @@ void noinstr __ct_user_enter(enum ctx_state state) * RCU only requires RCU_DYNTICKS_IDX increments to be fully * ordered. */ - atomic_add(state, &ct->state); + arch_atomic_add(state, &ct->state); } } } @@ -630,12 +630,12 @@ void noinstr __ct_user_exit(enum ctx_state state) * In this we case we don't care about any concurrency/ordering. */ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) - atomic_set(&ct->state, CONTEXT_KERNEL); + arch_atomic_set(&ct->state, CONTEXT_KERNEL); } else { if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) { /* Tracking for vtime only, no concurrent RCU EQS accounting */ - atomic_set(&ct->state, CONTEXT_KERNEL); + arch_atomic_set(&ct->state, CONTEXT_KERNEL); } else { /* * Tracking for vtime and RCU EQS. Make sure we don't race @@ -643,7 +643,7 @@ void noinstr __ct_user_exit(enum ctx_state state) * RCU only requires RCU_DYNTICKS_IDX increments to be fully * ordered. */ - atomic_sub(state, &ct->state); + arch_atomic_sub(state, &ct->state); } } } diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index ba4ba71facf9..b0f0d15085db 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -30,16 +30,9 @@ static int cpu_pm_notify(enum cpu_pm_event event) { int ret; - /* - * This introduces a RCU read critical section, which could be - * disfunctional in cpu idle. Copy RCU_NONIDLE code to let RCU know - * this. - */ - ct_irq_enter_irqson(); rcu_read_lock(); ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL); rcu_read_unlock(); - ct_irq_exit_irqson(); return notifier_to_errno(ret); } @@ -49,11 +42,9 @@ static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev unsigned long flags; int ret; - ct_irq_enter_irqson(); raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags); ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL); raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags); - ct_irq_exit_irqson(); return notifier_to_errno(ret); } diff --git a/kernel/events/core.c b/kernel/events/core.c index d56328e5080e..7099c77bc53b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4813,19 +4813,17 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); epc = &cpc->epc; - + raw_spin_lock_irq(&ctx->lock); if (!epc->ctx) { atomic_set(&epc->refcount, 1); epc->embedded = 1; - raw_spin_lock_irq(&ctx->lock); list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); epc->ctx = ctx; - raw_spin_unlock_irq(&ctx->lock); } else { WARN_ON_ONCE(epc->ctx != ctx); atomic_inc(&epc->refcount); } - + raw_spin_unlock_irq(&ctx->lock); return epc; } @@ -4896,33 +4894,30 @@ static void free_epc_rcu(struct rcu_head *head) static void put_pmu_ctx(struct perf_event_pmu_context *epc) { + struct perf_event_context *ctx = epc->ctx; unsigned long flags; - if (!atomic_dec_and_test(&epc->refcount)) + /* + * XXX + * + * lockdep_assert_held(&ctx->mutex); + * + * can't because of the call-site in _free_event()/put_event() + * which isn't always called under ctx->mutex. + */ + if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags)) return; - if (epc->ctx) { - struct perf_event_context *ctx = epc->ctx; + WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry)); - /* - * XXX - * - * lockdep_assert_held(&ctx->mutex); - * - * can't because of the call-site in _free_event()/put_event() - * which isn't always called under ctx->mutex. - */ - - WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry)); - raw_spin_lock_irqsave(&ctx->lock, flags); - list_del_init(&epc->pmu_ctx_entry); - epc->ctx = NULL; - raw_spin_unlock_irqrestore(&ctx->lock, flags); - } + list_del_init(&epc->pmu_ctx_entry); + epc->ctx = NULL; WARN_ON_ONCE(!list_empty(&epc->pinned_active)); WARN_ON_ONCE(!list_empty(&epc->flexible_active)); + raw_spin_unlock_irqrestore(&ctx->lock, flags); + if (epc->embedded) return; @@ -7046,13 +7041,20 @@ out_put: ring_buffer_put(rb); } -static void __perf_event_header__init_id(struct perf_event_header *header, - struct perf_sample_data *data, +/* + * A set of common sample data types saved even for non-sample records + * when event->attr.sample_id_all is set. + */ +#define PERF_SAMPLE_ID_ALL (PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \ + PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID | \ + PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER) + +static void __perf_event_header__init_id(struct perf_sample_data *data, struct perf_event *event, u64 sample_type) { data->type = event->attr.sample_type; - header->size += event->id_header_size; + data->sample_flags |= data->type & PERF_SAMPLE_ID_ALL; if (sample_type & PERF_SAMPLE_TID) { /* namespace issues */ @@ -7079,8 +7081,10 @@ void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) { - if (event->attr.sample_id_all) - __perf_event_header__init_id(header, data, event, event->attr.sample_type); + if (event->attr.sample_id_all) { + header->size += event->id_header_size; + __perf_event_header__init_id(data, event, event->attr.sample_type); + } } static void __perf_event__output_id_sample(struct perf_output_handle *handle, @@ -7310,7 +7314,7 @@ void perf_output_sample(struct perf_output_handle *handle, } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { - if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { + if (data->br_stack) { size_t size; size = data->br_stack->nr @@ -7554,83 +7558,68 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) return callchain ?: &__empty_callchain; } -void perf_prepare_sample(struct perf_event_header *header, - struct perf_sample_data *data, +static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d) +{ + return d * !!(flags & s); +} + +void perf_prepare_sample(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs) { u64 sample_type = event->attr.sample_type; u64 filtered_sample_type; - header->type = PERF_RECORD_SAMPLE; - header->size = sizeof(*header) + event->header_size; - - header->misc = 0; - header->misc |= perf_misc_flags(regs); - /* - * Clear the sample flags that have already been done by the - * PMU driver. + * Add the sample flags that are dependent to others. And clear the + * sample flags that have already been done by the PMU driver. */ - filtered_sample_type = sample_type & ~data->sample_flags; - __perf_event_header__init_id(header, data, event, filtered_sample_type); - - if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) - data->ip = perf_instruction_pointer(regs); - - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - int size = 1; - - if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) - data->callchain = perf_callchain(event, regs); + filtered_sample_type = sample_type; + filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE, + PERF_SAMPLE_IP); + filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE | + PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR); + filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER, + PERF_SAMPLE_REGS_USER); + filtered_sample_type &= ~data->sample_flags; - size += data->callchain->nr; - - header->size += size * sizeof(u64); + if (filtered_sample_type == 0) { + /* Make sure it has the correct data->type for output */ + data->type = event->attr.sample_type; + return; } - if (sample_type & PERF_SAMPLE_RAW) { - struct perf_raw_record *raw = data->raw; - int size; + __perf_event_header__init_id(data, event, filtered_sample_type); - if (raw && (data->sample_flags & PERF_SAMPLE_RAW)) { - struct perf_raw_frag *frag = &raw->frag; - u32 sum = 0; + if (filtered_sample_type & PERF_SAMPLE_IP) { + data->ip = perf_instruction_pointer(regs); + data->sample_flags |= PERF_SAMPLE_IP; + } - do { - sum += frag->size; - if (perf_raw_frag_last(frag)) - break; - frag = frag->next; - } while (1); + if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) + perf_sample_save_callchain(data, event, regs); - size = round_up(sum + sizeof(u32), sizeof(u64)); - raw->size = size - sizeof(u32); - frag->pad = raw->size - sum; - } else { - size = sizeof(u64); - data->raw = NULL; - } - - header->size += size; + if (filtered_sample_type & PERF_SAMPLE_RAW) { + data->raw = NULL; + data->dyn_size += sizeof(u64); + data->sample_flags |= PERF_SAMPLE_RAW; } - if (sample_type & PERF_SAMPLE_BRANCH_STACK) { - int size = sizeof(u64); /* nr */ - if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { - if (branch_sample_hw_index(event)) - size += sizeof(u64); - - size += data->br_stack->nr - * sizeof(struct perf_branch_entry); - } - header->size += size; + if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) { + data->br_stack = NULL; + data->dyn_size += sizeof(u64); + data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } - if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) + if (filtered_sample_type & PERF_SAMPLE_REGS_USER) perf_sample_regs_user(&data->regs_user, regs); - if (sample_type & PERF_SAMPLE_REGS_USER) { + /* + * It cannot use the filtered_sample_type here as REGS_USER can be set + * by STACK_USER (using __cond_set() above) and we don't want to update + * the dyn_size if it's not requested by users. + */ + if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) { /* regs dump ABI info */ int size = sizeof(u64); @@ -7639,10 +7628,11 @@ void perf_prepare_sample(struct perf_event_header *header, size += hweight64(mask) * sizeof(u64); } - header->size += size; + data->dyn_size += size; + data->sample_flags |= PERF_SAMPLE_REGS_USER; } - if (sample_type & PERF_SAMPLE_STACK_USER) { + if (filtered_sample_type & PERF_SAMPLE_STACK_USER) { /* * Either we need PERF_SAMPLE_STACK_USER bit to be always * processed as the last one or have additional check added @@ -7650,9 +7640,10 @@ void perf_prepare_sample(struct perf_event_header *header, * up the rest of the sample size. */ u16 stack_size = event->attr.sample_stack_user; + u16 header_size = perf_sample_data_size(data, event); u16 size = sizeof(u64); - stack_size = perf_sample_ustack_size(stack_size, header->size, + stack_size = perf_sample_ustack_size(stack_size, header_size, data->regs_user.regs); /* @@ -7664,24 +7655,31 @@ void perf_prepare_sample(struct perf_event_header *header, size += sizeof(u64) + stack_size; data->stack_user_size = stack_size; - header->size += size; + data->dyn_size += size; + data->sample_flags |= PERF_SAMPLE_STACK_USER; } - if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) + if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) { data->weight.full = 0; + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } - if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) + if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) { data->data_src.val = PERF_MEM_NA; + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } - if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) + if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = 0; + data->sample_flags |= PERF_SAMPLE_TRANSACTION; + } - if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_DATA_PAGE_SIZE)) { - if (filtered_sample_type & PERF_SAMPLE_ADDR) - data->addr = 0; + if (filtered_sample_type & PERF_SAMPLE_ADDR) { + data->addr = 0; + data->sample_flags |= PERF_SAMPLE_ADDR; } - if (sample_type & PERF_SAMPLE_REGS_INTR) { + if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); @@ -7693,20 +7691,23 @@ void perf_prepare_sample(struct perf_event_header *header, size += hweight64(mask) * sizeof(u64); } - header->size += size; + data->dyn_size += size; + data->sample_flags |= PERF_SAMPLE_REGS_INTR; } - if (sample_type & PERF_SAMPLE_PHYS_ADDR && - filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) + if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) { data->phys_addr = perf_virt_to_phys(data->addr); + data->sample_flags |= PERF_SAMPLE_PHYS_ADDR; + } #ifdef CONFIG_CGROUP_PERF - if (sample_type & PERF_SAMPLE_CGROUP) { + if (filtered_sample_type & PERF_SAMPLE_CGROUP) { struct cgroup *cgrp; /* protected by RCU */ cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup; data->cgroup = cgroup_id(cgrp); + data->sample_flags |= PERF_SAMPLE_CGROUP; } #endif @@ -7715,16 +7716,21 @@ void perf_prepare_sample(struct perf_event_header *header, * require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr, * but the value will not dump to the userspace. */ - if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) + if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) { data->data_page_size = perf_get_page_size(data->addr); + data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE; + } - if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) + if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) { data->code_page_size = perf_get_page_size(data->ip); + data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE; + } - if (sample_type & PERF_SAMPLE_AUX) { + if (filtered_sample_type & PERF_SAMPLE_AUX) { u64 size; + u16 header_size = perf_sample_data_size(data, event); - header->size += sizeof(u64); /* size */ + header_size += sizeof(u64); /* size */ /* * Given the 16bit nature of header::size, an AUX sample can @@ -7732,14 +7738,26 @@ void perf_prepare_sample(struct perf_event_header *header, * Make sure this doesn't happen by using up to U16_MAX bytes * per sample in total (rounded down to 8 byte boundary). */ - size = min_t(size_t, U16_MAX - header->size, + size = min_t(size_t, U16_MAX - header_size, event->attr.aux_sample_size); size = rounddown(size, 8); size = perf_prepare_sample_aux(event, data, size); - WARN_ON_ONCE(size + header->size > U16_MAX); - header->size += size; + WARN_ON_ONCE(size + header_size > U16_MAX); + data->dyn_size += size + sizeof(u64); /* size above */ + data->sample_flags |= PERF_SAMPLE_AUX; } +} + +void perf_prepare_header(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event, + struct pt_regs *regs) +{ + header->type = PERF_RECORD_SAMPLE; + header->size = perf_sample_data_size(data, event); + header->misc = perf_misc_flags(regs); + /* * If you're adding more sample types here, you likely need to do * something about the overflowing header::size, like repurpose the @@ -7767,7 +7785,8 @@ __perf_event_output(struct perf_event *event, /* protect the callchain buffers */ rcu_read_lock(); - perf_prepare_sample(&header, data, event, regs); + perf_prepare_sample(data, event, regs); + perf_prepare_header(&header, data, event, regs); err = output_begin(&handle, data, event, header.size); if (err) @@ -10125,8 +10144,7 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, }; perf_sample_data_init(&data, 0, 0); - data.raw = &raw; - data.sample_flags |= PERF_SAMPLE_RAW; + perf_sample_save_raw_data(&data, &raw); perf_trace_buf_update(record, event_type); @@ -10333,13 +10351,7 @@ static void bpf_overflow_handler(struct perf_event *event, rcu_read_lock(); prog = READ_ONCE(event->prog); if (prog) { - if (prog->call_get_stack && - (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) && - !(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) { - data->callchain = perf_callchain(event, regs); - data->sample_flags |= PERF_SAMPLE_CALLCHAIN; - } - + perf_prepare_sample(data, event, regs); ret = bpf_prog_run(prog, &ctx); } rcu_read_unlock(); diff --git a/kernel/exit.c b/kernel/exit.c index 15dc2ec80c46..bccfa4218356 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -807,6 +807,8 @@ void __noreturn do_exit(long code) struct task_struct *tsk = current; int group_dead; + WARN_ON(irqs_disabled()); + synchronize_group_exit(tsk, code); WARN_ON(tsk->plug); @@ -938,6 +940,11 @@ void __noreturn make_task_dead(int signr) if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); + if (unlikely(irqs_disabled())) { + pr_info("note: %s[%d] exited with irqs disabled\n", + current->comm, task_pid_nr(current)); + local_irq_enable(); + } if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), diff --git a/kernel/fork.c b/kernel/fork.c index 9f7fe3541897..038b898dad52 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1044,7 +1044,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #endif #ifdef CONFIG_BLK_CGROUP - tsk->throttle_queue = NULL; + tsk->throttle_disk = NULL; tsk->use_memdelay = 0; #endif @@ -1060,6 +1060,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->reported_split_lock = 0; #endif +#ifdef CONFIG_SCHED_MM_CID + tsk->mm_cid = -1; + tsk->mm_cid_active = 0; +#endif return tsk; free_stack: @@ -1169,6 +1173,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->user_ns = get_user_ns(user_ns); lru_gen_init_mm(mm); + mm_init_cid(mm); return mm; fail_pcpu: @@ -1601,6 +1606,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) tsk->mm = mm; tsk->active_mm = mm; + sched_mm_cid_fork(tsk); return 0; } @@ -3034,7 +3040,7 @@ void __init mm_cache_init(void) * dynamically sized based on the maximum CPU number this system * can have, taking hotplug into account (nr_cpu_ids). */ - mm_size = sizeof(struct mm_struct) + cpumask_size(); + mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size(); mm_cachep = kmem_cache_create_usercopy("mm_struct", mm_size, ARCH_MIN_MMSTRUCT_ALIGN, diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 473036b43c83..81b97f0f6556 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -14,6 +14,8 @@ include/ arch/$SRCARCH/include/ " +type cpio > /dev/null + # Support incremental builds by skipping archive generation # if timestamps of files being archived are not changed. diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index b64c44ae4c25..2531f3496ab6 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -86,6 +86,11 @@ config GENERIC_IRQ_IPI depends on SMP select IRQ_DOMAIN_HIERARCHY +# Generic IRQ IPI Mux support +config GENERIC_IRQ_IPI_MUX + bool + depends on SMP + # Generic MSI hierarchical interrupt domain support config GENERIC_MSI_IRQ bool diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index b4f53717d143..f19d3080bf11 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o obj-$(CONFIG_PM_SLEEP) += pm.o obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o +obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o obj-$(CONFIG_SMP) += affinity.o obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index d9a5c1d65a79..44a4eba80315 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -7,398 +7,7 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/cpu.h> -#include <linux/sort.h> - -static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, - unsigned int cpus_per_vec) -{ - const struct cpumask *siblmsk; - int cpu, sibl; - - for ( ; cpus_per_vec > 0; ) { - cpu = cpumask_first(nmsk); - - /* Should not happen, but I'm too lazy to think about it */ - if (cpu >= nr_cpu_ids) - return; - - cpumask_clear_cpu(cpu, nmsk); - cpumask_set_cpu(cpu, irqmsk); - cpus_per_vec--; - - /* If the cpu has siblings, use them first */ - siblmsk = topology_sibling_cpumask(cpu); - for (sibl = -1; cpus_per_vec > 0; ) { - sibl = cpumask_next(sibl, siblmsk); - if (sibl >= nr_cpu_ids) - break; - if (!cpumask_test_and_clear_cpu(sibl, nmsk)) - continue; - cpumask_set_cpu(sibl, irqmsk); - cpus_per_vec--; - } - } -} - -static cpumask_var_t *alloc_node_to_cpumask(void) -{ - cpumask_var_t *masks; - int node; - - masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL); - if (!masks) - return NULL; - - for (node = 0; node < nr_node_ids; node++) { - if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL)) - goto out_unwind; - } - - return masks; - -out_unwind: - while (--node >= 0) - free_cpumask_var(masks[node]); - kfree(masks); - return NULL; -} - -static void free_node_to_cpumask(cpumask_var_t *masks) -{ - int node; - - for (node = 0; node < nr_node_ids; node++) - free_cpumask_var(masks[node]); - kfree(masks); -} - -static void build_node_to_cpumask(cpumask_var_t *masks) -{ - int cpu; - - for_each_possible_cpu(cpu) - cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); -} - -static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, - const struct cpumask *mask, nodemask_t *nodemsk) -{ - int n, nodes = 0; - - /* Calculate the number of nodes in the supplied affinity mask */ - for_each_node(n) { - if (cpumask_intersects(mask, node_to_cpumask[n])) { - node_set(n, *nodemsk); - nodes++; - } - } - return nodes; -} - -struct node_vectors { - unsigned id; - - union { - unsigned nvectors; - unsigned ncpus; - }; -}; - -static int ncpus_cmp_func(const void *l, const void *r) -{ - const struct node_vectors *ln = l; - const struct node_vectors *rn = r; - - return ln->ncpus - rn->ncpus; -} - -/* - * Allocate vector number for each node, so that for each node: - * - * 1) the allocated number is >= 1 - * - * 2) the allocated numbver is <= active CPU number of this node - * - * The actual allocated total vectors may be less than @numvecs when - * active total CPU number is less than @numvecs. - * - * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' - * for each node. - */ -static void alloc_nodes_vectors(unsigned int numvecs, - cpumask_var_t *node_to_cpumask, - const struct cpumask *cpu_mask, - const nodemask_t nodemsk, - struct cpumask *nmsk, - struct node_vectors *node_vectors) -{ - unsigned n, remaining_ncpus = 0; - - for (n = 0; n < nr_node_ids; n++) { - node_vectors[n].id = n; - node_vectors[n].ncpus = UINT_MAX; - } - - for_each_node_mask(n, nodemsk) { - unsigned ncpus; - - cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); - ncpus = cpumask_weight(nmsk); - - if (!ncpus) - continue; - remaining_ncpus += ncpus; - node_vectors[n].ncpus = ncpus; - } - - numvecs = min_t(unsigned, remaining_ncpus, numvecs); - - sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]), - ncpus_cmp_func, NULL); - - /* - * Allocate vectors for each node according to the ratio of this - * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is - * bigger than number of active numa nodes. Always start the - * allocation from the node with minimized nr_cpus. - * - * This way guarantees that each active node gets allocated at - * least one vector, and the theory is simple: over-allocation - * is only done when this node is assigned by one vector, so - * other nodes will be allocated >= 1 vector, since 'numvecs' is - * bigger than number of numa nodes. - * - * One perfect invariant is that number of allocated vectors for - * each node is <= CPU count of this node: - * - * 1) suppose there are two nodes: A and B - * ncpu(X) is CPU count of node X - * vecs(X) is the vector count allocated to node X via this - * algorithm - * - * ncpu(A) <= ncpu(B) - * ncpu(A) + ncpu(B) = N - * vecs(A) + vecs(B) = V - * - * vecs(A) = max(1, round_down(V * ncpu(A) / N)) - * vecs(B) = V - vecs(A) - * - * both N and V are integer, and 2 <= V <= N, suppose - * V = N - delta, and 0 <= delta <= N - 2 - * - * 2) obviously vecs(A) <= ncpu(A) because: - * - * if vecs(A) is 1, then vecs(A) <= ncpu(A) given - * ncpu(A) >= 1 - * - * otherwise, - * vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N - * - * 3) prove how vecs(B) <= ncpu(B): - * - * if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be - * over-allocated, so vecs(B) <= ncpu(B), - * - * otherwise: - * - * vecs(A) = - * round_down(V * ncpu(A) / N) = - * round_down((N - delta) * ncpu(A) / N) = - * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >= - * round_down((N * ncpu(A) - delta * N) / N) = - * cpu(A) - delta - * - * then: - * - * vecs(A) - V >= ncpu(A) - delta - V - * => - * V - vecs(A) <= V + delta - ncpu(A) - * => - * vecs(B) <= N - ncpu(A) - * => - * vecs(B) <= cpu(B) - * - * For nodes >= 3, it can be thought as one node and another big - * node given that is exactly what this algorithm is implemented, - * and we always re-calculate 'remaining_ncpus' & 'numvecs', and - * finally for each node X: vecs(X) <= ncpu(X). - * - */ - for (n = 0; n < nr_node_ids; n++) { - unsigned nvectors, ncpus; - - if (node_vectors[n].ncpus == UINT_MAX) - continue; - - WARN_ON_ONCE(numvecs == 0); - - ncpus = node_vectors[n].ncpus; - nvectors = max_t(unsigned, 1, - numvecs * ncpus / remaining_ncpus); - WARN_ON_ONCE(nvectors > ncpus); - - node_vectors[n].nvectors = nvectors; - - remaining_ncpus -= ncpus; - numvecs -= nvectors; - } -} - -static int __irq_build_affinity_masks(unsigned int startvec, - unsigned int numvecs, - unsigned int firstvec, - cpumask_var_t *node_to_cpumask, - const struct cpumask *cpu_mask, - struct cpumask *nmsk, - struct irq_affinity_desc *masks) -{ - unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0; - unsigned int last_affv = firstvec + numvecs; - unsigned int curvec = startvec; - nodemask_t nodemsk = NODE_MASK_NONE; - struct node_vectors *node_vectors; - - if (cpumask_empty(cpu_mask)) - return 0; - - nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); - - /* - * If the number of nodes in the mask is greater than or equal the - * number of vectors we just spread the vectors across the nodes. - */ - if (numvecs <= nodes) { - for_each_node_mask(n, nodemsk) { - /* Ensure that only CPUs which are in both masks are set */ - cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); - cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk); - if (++curvec == last_affv) - curvec = firstvec; - } - return numvecs; - } - - node_vectors = kcalloc(nr_node_ids, - sizeof(struct node_vectors), - GFP_KERNEL); - if (!node_vectors) - return -ENOMEM; - - /* allocate vector number for each node */ - alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask, - nodemsk, nmsk, node_vectors); - - for (i = 0; i < nr_node_ids; i++) { - unsigned int ncpus, v; - struct node_vectors *nv = &node_vectors[i]; - - if (nv->nvectors == UINT_MAX) - continue; - - /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]); - ncpus = cpumask_weight(nmsk); - if (!ncpus) - continue; - - WARN_ON_ONCE(nv->nvectors > ncpus); - - /* Account for rounding errors */ - extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors); - - /* Spread allocated vectors on CPUs of the current node */ - for (v = 0; v < nv->nvectors; v++, curvec++) { - cpus_per_vec = ncpus / nv->nvectors; - - /* Account for extra vectors to compensate rounding errors */ - if (extra_vecs) { - cpus_per_vec++; - --extra_vecs; - } - - /* - * wrapping has to be considered given 'startvec' - * may start anywhere - */ - if (curvec >= last_affv) - curvec = firstvec; - irq_spread_init_one(&masks[curvec].mask, nmsk, - cpus_per_vec); - } - done += nv->nvectors; - } - kfree(node_vectors); - return done; -} - -/* - * build affinity in two stages: - * 1) spread present CPU on these vectors - * 2) spread other possible CPUs on these vectors - */ -static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, - unsigned int firstvec, - struct irq_affinity_desc *masks) -{ - unsigned int curvec = startvec, nr_present = 0, nr_others = 0; - cpumask_var_t *node_to_cpumask; - cpumask_var_t nmsk, npresmsk; - int ret = -ENOMEM; - - if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) - return ret; - - if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) - goto fail_nmsk; - - node_to_cpumask = alloc_node_to_cpumask(); - if (!node_to_cpumask) - goto fail_npresmsk; - - /* Stabilize the cpumasks */ - cpus_read_lock(); - build_node_to_cpumask(node_to_cpumask); - - /* Spread on present CPUs starting from affd->pre_vectors */ - ret = __irq_build_affinity_masks(curvec, numvecs, firstvec, - node_to_cpumask, cpu_present_mask, - nmsk, masks); - if (ret < 0) - goto fail_build_affinity; - nr_present = ret; - - /* - * Spread on non present CPUs starting from the next vector to be - * handled. If the spreading of present CPUs already exhausted the - * vector space, assign the non present CPUs to the already spread - * out vectors. - */ - if (nr_present >= numvecs) - curvec = firstvec; - else - curvec = firstvec + nr_present; - cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); - ret = __irq_build_affinity_masks(curvec, numvecs, firstvec, - node_to_cpumask, npresmsk, nmsk, - masks); - if (ret >= 0) - nr_others = ret; - - fail_build_affinity: - cpus_read_unlock(); - - if (ret >= 0) - WARN_ON(nr_present + nr_others < numvecs); - - free_node_to_cpumask(node_to_cpumask); - - fail_npresmsk: - free_cpumask_var(npresmsk); - - fail_nmsk: - free_cpumask_var(nmsk); - return ret < 0 ? ret : 0; -} +#include <linux/group_cpus.h> static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs) { @@ -461,14 +70,18 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) */ for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) { unsigned int this_vecs = affd->set_size[i]; - int ret; + int j; + struct cpumask *result = group_cpus_evenly(this_vecs); - ret = irq_build_affinity_masks(curvec, this_vecs, - curvec, masks); - if (ret) { + if (!result) { kfree(masks); return NULL; } + + for (j = 0; j < this_vecs; j++) + cpumask_copy(&masks[curvec + j].mask, &result[j]); + kfree(result); + curvec += this_vecs; usedvecs += this_vecs; } diff --git a/kernel/irq/ipi-mux.c b/kernel/irq/ipi-mux.c new file mode 100644 index 000000000000..fa4fc18c6131 --- /dev/null +++ b/kernel/irq/ipi-mux.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Multiplex several virtual IPIs over a single HW IPI. + * + * Copyright The Asahi Linux Contributors + * Copyright (c) 2022 Ventana Micro Systems Inc. + */ + +#define pr_fmt(fmt) "ipi-mux: " fmt +#include <linux/cpu.h> +#include <linux/init.h> +#include <linux/irq.h> +#include <linux/irqchip.h> +#include <linux/irqchip/chained_irq.h> +#include <linux/irqdomain.h> +#include <linux/jump_label.h> +#include <linux/percpu.h> +#include <linux/smp.h> + +struct ipi_mux_cpu { + atomic_t enable; + atomic_t bits; +}; + +static struct ipi_mux_cpu __percpu *ipi_mux_pcpu; +static struct irq_domain *ipi_mux_domain; +static void (*ipi_mux_send)(unsigned int cpu); + +static void ipi_mux_mask(struct irq_data *d) +{ + struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu); + + atomic_andnot(BIT(irqd_to_hwirq(d)), &icpu->enable); +} + +static void ipi_mux_unmask(struct irq_data *d) +{ + struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu); + u32 ibit = BIT(irqd_to_hwirq(d)); + + atomic_or(ibit, &icpu->enable); + + /* + * The atomic_or() above must complete before the atomic_read() + * below to avoid racing ipi_mux_send_mask(). + */ + smp_mb__after_atomic(); + + /* If a pending IPI was unmasked, raise a parent IPI immediately. */ + if (atomic_read(&icpu->bits) & ibit) + ipi_mux_send(smp_processor_id()); +} + +static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask) +{ + struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu); + u32 ibit = BIT(irqd_to_hwirq(d)); + unsigned long pending; + int cpu; + + for_each_cpu(cpu, mask) { + icpu = per_cpu_ptr(ipi_mux_pcpu, cpu); + + /* + * This sequence is the mirror of the one in ipi_mux_unmask(); + * see the comment there. Additionally, release semantics + * ensure that the vIPI flag set is ordered after any shared + * memory accesses that precede it. This therefore also pairs + * with the atomic_fetch_andnot in ipi_mux_process(). + */ + pending = atomic_fetch_or_release(ibit, &icpu->bits); + + /* + * The atomic_fetch_or_release() above must complete + * before the atomic_read() below to avoid racing with + * ipi_mux_unmask(). + */ + smp_mb__after_atomic(); + + /* + * The flag writes must complete before the physical IPI is + * issued to another CPU. This is implied by the control + * dependency on the result of atomic_read() below, which is + * itself already ordered after the vIPI flag write. + */ + if (!(pending & ibit) && (atomic_read(&icpu->enable) & ibit)) + ipi_mux_send(cpu); + } +} + +static const struct irq_chip ipi_mux_chip = { + .name = "IPI Mux", + .irq_mask = ipi_mux_mask, + .irq_unmask = ipi_mux_unmask, + .ipi_send_mask = ipi_mux_send_mask, +}; + +static int ipi_mux_domain_alloc(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_set_percpu_devid(virq + i); + irq_domain_set_info(d, virq + i, i, &ipi_mux_chip, NULL, + handle_percpu_devid_irq, NULL, NULL); + } + + return 0; +} + +static const struct irq_domain_ops ipi_mux_domain_ops = { + .alloc = ipi_mux_domain_alloc, + .free = irq_domain_free_irqs_top, +}; + +/** + * ipi_mux_process - Process multiplexed virtual IPIs + */ +void ipi_mux_process(void) +{ + struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu); + irq_hw_number_t hwirq; + unsigned long ipis; + unsigned int en; + + /* + * Reading enable mask does not need to be ordered as long as + * this function is called from interrupt handler because only + * the CPU itself can change it's own enable mask. + */ + en = atomic_read(&icpu->enable); + + /* + * Clear the IPIs we are about to handle. This pairs with the + * atomic_fetch_or_release() in ipi_mux_send_mask(). + */ + ipis = atomic_fetch_andnot(en, &icpu->bits) & en; + + for_each_set_bit(hwirq, &ipis, BITS_PER_TYPE(int)) + generic_handle_domain_irq(ipi_mux_domain, hwirq); +} + +/** + * ipi_mux_create - Create virtual IPIs multiplexed on top of a single + * parent IPI. + * @nr_ipi: number of virtual IPIs to create. This should + * be <= BITS_PER_TYPE(int) + * @mux_send: callback to trigger parent IPI for a particular CPU + * + * Returns first virq of the newly created virtual IPIs upon success + * or <=0 upon failure + */ +int ipi_mux_create(unsigned int nr_ipi, void (*mux_send)(unsigned int cpu)) +{ + struct fwnode_handle *fwnode; + struct irq_domain *domain; + int rc; + + if (ipi_mux_domain) + return -EEXIST; + + if (BITS_PER_TYPE(int) < nr_ipi || !mux_send) + return -EINVAL; + + ipi_mux_pcpu = alloc_percpu(typeof(*ipi_mux_pcpu)); + if (!ipi_mux_pcpu) + return -ENOMEM; + + fwnode = irq_domain_alloc_named_fwnode("IPI-Mux"); + if (!fwnode) { + pr_err("unable to create IPI Mux fwnode\n"); + rc = -ENOMEM; + goto fail_free_cpu; + } + + domain = irq_domain_create_linear(fwnode, nr_ipi, + &ipi_mux_domain_ops, NULL); + if (!domain) { + pr_err("unable to add IPI Mux domain\n"); + rc = -ENOMEM; + goto fail_free_fwnode; + } + + domain->flags |= IRQ_DOMAIN_FLAG_IPI_SINGLE; + irq_domain_update_bus_token(domain, DOMAIN_BUS_IPI); + + rc = irq_domain_alloc_irqs(domain, nr_ipi, NUMA_NO_NODE, NULL); + if (rc <= 0) { + pr_err("unable to alloc IRQs from IPI Mux domain\n"); + goto fail_free_domain; + } + + ipi_mux_domain = domain; + ipi_mux_send = mux_send; + + return rc; + +fail_free_domain: + irq_domain_remove(domain); +fail_free_fwnode: + irq_domain_free_fwnode(fwnode); +fail_free_cpu: + free_percpu(ipi_mux_pcpu); + return rc; +} diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8fe1da9614ee..aa5b7eeeceb8 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -25,6 +25,9 @@ static DEFINE_MUTEX(irq_domain_mutex); static struct irq_domain *irq_default_domain; +static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base, + unsigned int nr_irqs, int node, void *arg, + bool realloc, const struct irq_affinity_desc *affinity); static void irq_domain_check_hierarchy(struct irq_domain *domain); struct irqchip_fwid { @@ -114,7 +117,7 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode) { struct irqchip_fwid *fwid; - if (WARN_ON(!is_fwnode_irqchip(fwnode))) + if (!fwnode || WARN_ON(!is_fwnode_irqchip(fwnode))) return; fwid = container_of(fwnode, struct irqchip_fwid, fwnode); @@ -123,23 +126,12 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode) } EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); -/** - * __irq_domain_add() - Allocate a new irq_domain data structure - * @fwnode: firmware node for the interrupt controller - * @size: Size of linear map; 0 for radix mapping only - * @hwirq_max: Maximum number of interrupts supported by controller - * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no - * direct mapping - * @ops: domain callbacks - * @host_data: Controller private data pointer - * - * Allocates and initializes an irq_domain structure. - * Returns pointer to IRQ domain, or NULL on failure. - */ -struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size, - irq_hw_number_t hwirq_max, int direct_max, - const struct irq_domain_ops *ops, - void *host_data) +static struct irq_domain *__irq_domain_create(struct fwnode_handle *fwnode, + unsigned int size, + irq_hw_number_t hwirq_max, + int direct_max, + const struct irq_domain_ops *ops, + void *host_data) { struct irqchip_fwid *fwid; struct irq_domain *domain; @@ -214,25 +206,66 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int s /* Fill structure */ INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); - mutex_init(&domain->revmap_mutex); domain->ops = ops; domain->host_data = host_data; domain->hwirq_max = hwirq_max; - if (direct_max) { + if (direct_max) domain->flags |= IRQ_DOMAIN_FLAG_NO_MAP; - } domain->revmap_size = size; + /* + * Hierarchical domains use the domain lock of the root domain + * (innermost domain). + * + * For non-hierarchical domains (as for root domains), the root + * pointer is set to the domain itself so that &domain->root->mutex + * always points to the right lock. + */ + mutex_init(&domain->mutex); + domain->root = domain; + irq_domain_check_hierarchy(domain); + return domain; +} + +static void __irq_domain_publish(struct irq_domain *domain) +{ mutex_lock(&irq_domain_mutex); debugfs_add_domain_dir(domain); list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); pr_debug("Added domain %s\n", domain->name); +} + +/** + * __irq_domain_add() - Allocate a new irq_domain data structure + * @fwnode: firmware node for the interrupt controller + * @size: Size of linear map; 0 for radix mapping only + * @hwirq_max: Maximum number of interrupts supported by controller + * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no + * direct mapping + * @ops: domain callbacks + * @host_data: Controller private data pointer + * + * Allocates and initializes an irq_domain structure. + * Returns pointer to IRQ domain, or NULL on failure. + */ +struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size, + irq_hw_number_t hwirq_max, int direct_max, + const struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain; + + domain = __irq_domain_create(fwnode, size, hwirq_max, direct_max, + ops, host_data); + if (domain) + __irq_domain_publish(domain); + return domain; } EXPORT_SYMBOL_GPL(__irq_domain_add); @@ -502,30 +535,34 @@ static bool irq_domain_is_nomap(struct irq_domain *domain) static void irq_domain_clear_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { + lockdep_assert_held(&domain->root->mutex); + if (irq_domain_is_nomap(domain)) return; - mutex_lock(&domain->revmap_mutex); if (hwirq < domain->revmap_size) rcu_assign_pointer(domain->revmap[hwirq], NULL); else radix_tree_delete(&domain->revmap_tree, hwirq); - mutex_unlock(&domain->revmap_mutex); } static void irq_domain_set_mapping(struct irq_domain *domain, irq_hw_number_t hwirq, struct irq_data *irq_data) { + /* + * This also makes sure that all domains point to the same root when + * called from irq_domain_insert_irq() for each domain in a hierarchy. + */ + lockdep_assert_held(&domain->root->mutex); + if (irq_domain_is_nomap(domain)) return; - mutex_lock(&domain->revmap_mutex); if (hwirq < domain->revmap_size) rcu_assign_pointer(domain->revmap[hwirq], irq_data); else radix_tree_insert(&domain->revmap_tree, hwirq, irq_data); - mutex_unlock(&domain->revmap_mutex); } static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) @@ -538,6 +575,9 @@ static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) return; hwirq = irq_data->hwirq; + + mutex_lock(&domain->root->mutex); + irq_set_status_flags(irq, IRQ_NOREQUEST); /* remove chip and handler */ @@ -557,10 +597,12 @@ static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) /* Clear reverse map for this hwirq */ irq_domain_clear_mapping(domain, hwirq); + + mutex_unlock(&domain->root->mutex); } -int irq_domain_associate(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq) +static int irq_domain_associate_locked(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq) { struct irq_data *irq_data = irq_get_irq_data(virq); int ret; @@ -573,7 +615,6 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq, if (WARN(irq_data->domain, "error: virq%i is already associated", virq)) return -EINVAL; - mutex_lock(&irq_domain_mutex); irq_data->hwirq = hwirq; irq_data->domain = domain; if (domain->ops->map) { @@ -590,23 +631,29 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq, } irq_data->domain = NULL; irq_data->hwirq = 0; - mutex_unlock(&irq_domain_mutex); return ret; } - - /* If not already assigned, give the domain the chip's name */ - if (!domain->name && irq_data->chip) - domain->name = irq_data->chip->name; } domain->mapcount++; irq_domain_set_mapping(domain, hwirq, irq_data); - mutex_unlock(&irq_domain_mutex); irq_clear_status_flags(virq, IRQ_NOREQUEST); return 0; } + +int irq_domain_associate(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq) +{ + int ret; + + mutex_lock(&domain->root->mutex); + ret = irq_domain_associate_locked(domain, virq, hwirq); + mutex_unlock(&domain->root->mutex); + + return ret; +} EXPORT_SYMBOL_GPL(irq_domain_associate); void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, @@ -619,9 +666,8 @@ void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, of_node_full_name(of_node), irq_base, (int)hwirq_base, count); - for (i = 0; i < count; i++) { + for (i = 0; i < count; i++) irq_domain_associate(domain, irq_base + i, hwirq_base + i); - } } EXPORT_SYMBOL_GPL(irq_domain_associate_many); @@ -668,6 +714,34 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) EXPORT_SYMBOL_GPL(irq_create_direct_mapping); #endif +static unsigned int irq_create_mapping_affinity_locked(struct irq_domain *domain, + irq_hw_number_t hwirq, + const struct irq_affinity_desc *affinity) +{ + struct device_node *of_node = irq_domain_get_of_node(domain); + int virq; + + pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); + + /* Allocate a virtual interrupt number */ + virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), + affinity); + if (virq <= 0) { + pr_debug("-> virq allocation failed\n"); + return 0; + } + + if (irq_domain_associate_locked(domain, virq, hwirq)) { + irq_free_desc(virq); + return 0; + } + + pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", + hwirq, of_node_full_name(of_node), virq); + + return virq; +} + /** * irq_create_mapping_affinity() - Map a hardware interrupt into linux irq space * @domain: domain owning this hardware interrupt or NULL for default domain @@ -680,14 +754,11 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping); * on the number returned from that call. */ unsigned int irq_create_mapping_affinity(struct irq_domain *domain, - irq_hw_number_t hwirq, - const struct irq_affinity_desc *affinity) + irq_hw_number_t hwirq, + const struct irq_affinity_desc *affinity) { - struct device_node *of_node; int virq; - pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); - /* Look for default domain if necessary */ if (domain == NULL) domain = irq_default_domain; @@ -695,32 +766,19 @@ unsigned int irq_create_mapping_affinity(struct irq_domain *domain, WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq); return 0; } - pr_debug("-> using domain @%p\n", domain); - of_node = irq_domain_get_of_node(domain); + mutex_lock(&domain->root->mutex); /* Check if mapping already exists */ virq = irq_find_mapping(domain, hwirq); if (virq) { - pr_debug("-> existing mapping on virq %d\n", virq); - return virq; - } - - /* Allocate a virtual interrupt number */ - virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), - affinity); - if (virq <= 0) { - pr_debug("-> virq allocation failed\n"); - return 0; - } - - if (irq_domain_associate(domain, virq, hwirq)) { - irq_free_desc(virq); - return 0; + pr_debug("existing mapping on virq %d\n", virq); + goto out; } - pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", - hwirq, of_node_full_name(of_node), virq); + virq = irq_create_mapping_affinity_locked(domain, hwirq, affinity); +out: + mutex_unlock(&domain->root->mutex); return virq; } @@ -789,6 +847,8 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK)) type &= IRQ_TYPE_SENSE_MASK; + mutex_lock(&domain->root->mutex); + /* * If we've already configured this interrupt, * don't do it again, or hell will break loose. @@ -801,7 +861,7 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) * interrupt number. */ if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq)) - return virq; + goto out; /* * If the trigger type has not been set yet, then set @@ -809,40 +869,45 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) */ if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) { irq_data = irq_get_irq_data(virq); - if (!irq_data) - return 0; + if (!irq_data) { + virq = 0; + goto out; + } irqd_set_trigger_type(irq_data, type); - return virq; + goto out; } pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n", hwirq, of_node_full_name(to_of_node(fwspec->fwnode))); - return 0; + virq = 0; + goto out; } if (irq_domain_is_hierarchy(domain)) { - virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec); - if (virq <= 0) - return 0; + virq = irq_domain_alloc_irqs_locked(domain, -1, 1, NUMA_NO_NODE, + fwspec, false, NULL); + if (virq <= 0) { + virq = 0; + goto out; + } } else { /* Create mapping */ - virq = irq_create_mapping(domain, hwirq); + virq = irq_create_mapping_affinity_locked(domain, hwirq, NULL); if (!virq) - return virq; + goto out; } irq_data = irq_get_irq_data(virq); - if (!irq_data) { - if (irq_domain_is_hierarchy(domain)) - irq_domain_free_irqs(virq, 1); - else - irq_dispose_mapping(virq); - return 0; + if (WARN_ON(!irq_data)) { + virq = 0; + goto out; } /* Store trigger type */ irqd_set_trigger_type(irq_data, type); +out: + mutex_unlock(&domain->root->mutex); return virq; } @@ -1102,12 +1167,16 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, struct irq_domain *domain; if (size) - domain = irq_domain_create_linear(fwnode, size, ops, host_data); + domain = __irq_domain_create(fwnode, size, size, 0, ops, host_data); else - domain = irq_domain_create_tree(fwnode, ops, host_data); + domain = __irq_domain_create(fwnode, 0, ~0, 0, ops, host_data); + if (domain) { + domain->root = parent->root; domain->parent = parent; domain->flags |= flags; + + __irq_domain_publish(domain); } return domain; @@ -1123,10 +1192,6 @@ static void irq_domain_insert_irq(int virq) domain->mapcount++; irq_domain_set_mapping(domain, data->hwirq, data); - - /* If not already assigned, give the domain the chip's name */ - if (!domain->name && data->chip) - domain->name = data->chip->name; } irq_clear_status_flags(virq, IRQ_NOREQUEST); @@ -1426,40 +1491,12 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, return domain->ops->alloc(domain, irq_base, nr_irqs, arg); } -/** - * __irq_domain_alloc_irqs - Allocate IRQs from domain - * @domain: domain to allocate from - * @irq_base: allocate specified IRQ number if irq_base >= 0 - * @nr_irqs: number of IRQs to allocate - * @node: NUMA node id for memory allocation - * @arg: domain specific argument - * @realloc: IRQ descriptors have already been allocated if true - * @affinity: Optional irq affinity mask for multiqueue devices - * - * Allocate IRQ numbers and initialized all data structures to support - * hierarchy IRQ domains. - * Parameter @realloc is mainly to support legacy IRQs. - * Returns error code or allocated IRQ number - * - * The whole process to setup an IRQ has been split into two steps. - * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ - * descriptor and required hardware resources. The second step, - * irq_domain_activate_irq(), is to program the hardware with preallocated - * resources. In this way, it's easier to rollback when failing to - * allocate resources. - */ -int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, - unsigned int nr_irqs, int node, void *arg, - bool realloc, const struct irq_affinity_desc *affinity) +static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base, + unsigned int nr_irqs, int node, void *arg, + bool realloc, const struct irq_affinity_desc *affinity) { int i, ret, virq; - if (domain == NULL) { - domain = irq_default_domain; - if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n")) - return -EINVAL; - } - if (realloc && irq_base >= 0) { virq = irq_base; } else { @@ -1478,24 +1515,18 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, goto out_free_desc; } - mutex_lock(&irq_domain_mutex); ret = irq_domain_alloc_irqs_hierarchy(domain, virq, nr_irqs, arg); - if (ret < 0) { - mutex_unlock(&irq_domain_mutex); + if (ret < 0) goto out_free_irq_data; - } for (i = 0; i < nr_irqs; i++) { ret = irq_domain_trim_hierarchy(virq + i); - if (ret) { - mutex_unlock(&irq_domain_mutex); + if (ret) goto out_free_irq_data; - } } - + for (i = 0; i < nr_irqs; i++) irq_domain_insert_irq(virq + i); - mutex_unlock(&irq_domain_mutex); return virq; @@ -1505,6 +1536,48 @@ out_free_desc: irq_free_descs(virq, nr_irqs); return ret; } + +/** + * __irq_domain_alloc_irqs - Allocate IRQs from domain + * @domain: domain to allocate from + * @irq_base: allocate specified IRQ number if irq_base >= 0 + * @nr_irqs: number of IRQs to allocate + * @node: NUMA node id for memory allocation + * @arg: domain specific argument + * @realloc: IRQ descriptors have already been allocated if true + * @affinity: Optional irq affinity mask for multiqueue devices + * + * Allocate IRQ numbers and initialized all data structures to support + * hierarchy IRQ domains. + * Parameter @realloc is mainly to support legacy IRQs. + * Returns error code or allocated IRQ number + * + * The whole process to setup an IRQ has been split into two steps. + * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ + * descriptor and required hardware resources. The second step, + * irq_domain_activate_irq(), is to program the hardware with preallocated + * resources. In this way, it's easier to rollback when failing to + * allocate resources. + */ +int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, + unsigned int nr_irqs, int node, void *arg, + bool realloc, const struct irq_affinity_desc *affinity) +{ + int ret; + + if (domain == NULL) { + domain = irq_default_domain; + if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n")) + return -EINVAL; + } + + mutex_lock(&domain->root->mutex); + ret = irq_domain_alloc_irqs_locked(domain, irq_base, nr_irqs, node, arg, + realloc, affinity); + mutex_unlock(&domain->root->mutex); + + return ret; +} EXPORT_SYMBOL_GPL(__irq_domain_alloc_irqs); /* The irq_data was moved, fix the revmap to refer to the new location */ @@ -1512,11 +1585,12 @@ static void irq_domain_fix_revmap(struct irq_data *d) { void __rcu **slot; + lockdep_assert_held(&d->domain->root->mutex); + if (irq_domain_is_nomap(d->domain)) return; /* Fix up the revmap. */ - mutex_lock(&d->domain->revmap_mutex); if (d->hwirq < d->domain->revmap_size) { /* Not using radix tree */ rcu_assign_pointer(d->domain->revmap[d->hwirq], d); @@ -1525,7 +1599,6 @@ static void irq_domain_fix_revmap(struct irq_data *d) if (slot) radix_tree_replace_slot(&d->domain->revmap_tree, slot, d); } - mutex_unlock(&d->domain->revmap_mutex); } /** @@ -1541,8 +1614,8 @@ static void irq_domain_fix_revmap(struct irq_data *d) */ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg) { - struct irq_data *child_irq_data; - struct irq_data *root_irq_data = irq_get_irq_data(virq); + struct irq_data *irq_data = irq_get_irq_data(virq); + struct irq_data *parent_irq_data; struct irq_desc *desc; int rv = 0; @@ -1567,47 +1640,46 @@ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg) if (WARN_ON(!irq_domain_is_hierarchy(domain))) return -EINVAL; - if (!root_irq_data) + if (!irq_data) return -EINVAL; - if (domain->parent != root_irq_data->domain) + if (domain->parent != irq_data->domain) return -EINVAL; - child_irq_data = kzalloc_node(sizeof(*child_irq_data), GFP_KERNEL, - irq_data_get_node(root_irq_data)); - if (!child_irq_data) + parent_irq_data = kzalloc_node(sizeof(*parent_irq_data), GFP_KERNEL, + irq_data_get_node(irq_data)); + if (!parent_irq_data) return -ENOMEM; - mutex_lock(&irq_domain_mutex); + mutex_lock(&domain->root->mutex); /* Copy the original irq_data. */ - *child_irq_data = *root_irq_data; + *parent_irq_data = *irq_data; /* - * Overwrite the root_irq_data, which is embedded in struct - * irq_desc, with values for this domain. + * Overwrite the irq_data, which is embedded in struct irq_desc, with + * values for this domain. */ - root_irq_data->parent_data = child_irq_data; - root_irq_data->domain = domain; - root_irq_data->mask = 0; - root_irq_data->hwirq = 0; - root_irq_data->chip = NULL; - root_irq_data->chip_data = NULL; + irq_data->parent_data = parent_irq_data; + irq_data->domain = domain; + irq_data->mask = 0; + irq_data->hwirq = 0; + irq_data->chip = NULL; + irq_data->chip_data = NULL; /* May (probably does) set hwirq, chip, etc. */ rv = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg); if (rv) { /* Restore the original irq_data. */ - *root_irq_data = *child_irq_data; - kfree(child_irq_data); + *irq_data = *parent_irq_data; + kfree(parent_irq_data); goto error; } - irq_domain_fix_revmap(child_irq_data); - irq_domain_set_mapping(domain, root_irq_data->hwirq, root_irq_data); - + irq_domain_fix_revmap(parent_irq_data); + irq_domain_set_mapping(domain, irq_data->hwirq, irq_data); error: - mutex_unlock(&irq_domain_mutex); + mutex_unlock(&domain->root->mutex); return rv; } @@ -1623,8 +1695,8 @@ EXPORT_SYMBOL_GPL(irq_domain_push_irq); */ int irq_domain_pop_irq(struct irq_domain *domain, int virq) { - struct irq_data *root_irq_data = irq_get_irq_data(virq); - struct irq_data *child_irq_data; + struct irq_data *irq_data = irq_get_irq_data(virq); + struct irq_data *parent_irq_data; struct irq_data *tmp_irq_data; struct irq_desc *desc; @@ -1646,37 +1718,37 @@ int irq_domain_pop_irq(struct irq_domain *domain, int virq) if (domain == NULL) return -EINVAL; - if (!root_irq_data) + if (!irq_data) return -EINVAL; tmp_irq_data = irq_domain_get_irq_data(domain, virq); /* We can only "pop" if this domain is at the top of the list */ - if (WARN_ON(root_irq_data != tmp_irq_data)) + if (WARN_ON(irq_data != tmp_irq_data)) return -EINVAL; - if (WARN_ON(root_irq_data->domain != domain)) + if (WARN_ON(irq_data->domain != domain)) return -EINVAL; - child_irq_data = root_irq_data->parent_data; - if (WARN_ON(!child_irq_data)) + parent_irq_data = irq_data->parent_data; + if (WARN_ON(!parent_irq_data)) return -EINVAL; - mutex_lock(&irq_domain_mutex); + mutex_lock(&domain->root->mutex); - root_irq_data->parent_data = NULL; + irq_data->parent_data = NULL; - irq_domain_clear_mapping(domain, root_irq_data->hwirq); + irq_domain_clear_mapping(domain, irq_data->hwirq); irq_domain_free_irqs_hierarchy(domain, virq, 1); /* Restore the original irq_data. */ - *root_irq_data = *child_irq_data; + *irq_data = *parent_irq_data; - irq_domain_fix_revmap(root_irq_data); + irq_domain_fix_revmap(irq_data); - mutex_unlock(&irq_domain_mutex); + mutex_unlock(&domain->root->mutex); - kfree(child_irq_data); + kfree(parent_irq_data); return 0; } @@ -1690,17 +1762,20 @@ EXPORT_SYMBOL_GPL(irq_domain_pop_irq); void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs) { struct irq_data *data = irq_get_irq_data(virq); + struct irq_domain *domain; int i; if (WARN(!data || !data->domain || !data->domain->ops->free, "NULL pointer, cannot free irq\n")) return; - mutex_lock(&irq_domain_mutex); + domain = data->domain; + + mutex_lock(&domain->root->mutex); for (i = 0; i < nr_irqs; i++) irq_domain_remove_irq(virq + i); - irq_domain_free_irqs_hierarchy(data->domain, virq, nr_irqs); - mutex_unlock(&irq_domain_mutex); + irq_domain_free_irqs_hierarchy(domain, virq, nr_irqs); + mutex_unlock(&domain->root->mutex); irq_domain_free_irq_data(virq, nr_irqs); irq_free_descs(virq, nr_irqs); @@ -1865,6 +1940,13 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, irq_set_handler_data(virq, handler_data); } +static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base, + unsigned int nr_irqs, int node, void *arg, + bool realloc, const struct irq_affinity_desc *affinity) +{ + return -EINVAL; +} + static void irq_domain_check_hierarchy(struct irq_domain *domain) { } @@ -1915,7 +1997,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d) static void debugfs_remove_domain_dir(struct irq_domain *d) { - debugfs_remove(debugfs_lookup(d->name, domain_dir)); + debugfs_lookup_and_remove(d->name, domain_dir); } void __init irq_domain_debugfs_init(struct dentry *root) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5b7cf28df290..8ce75495e04f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -723,10 +723,13 @@ EXPORT_SYMBOL(disable_irq_nosync); * to complete before returning. If you use this function while * holding a resource the IRQ handler may need you will deadlock. * - * This function may be called - with care - from IRQ context. + * Can only be called from preemptible code as it might sleep when + * an interrupt thread is associated to @irq. + * */ void disable_irq(unsigned int irq) { + might_sleep(); if (!__disable_irq_nosync(irq)) synchronize_irq(irq); } diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 955267bbc2be..783a3e6a0b10 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -1000,7 +1000,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, fail: msi_unlock_descs(dev); free_fwnode: - kfree(fwnode); + irq_domain_free_fwnode(fwnode); free_bundle: kfree(bundle); return false; @@ -1013,6 +1013,7 @@ free_bundle: */ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid) { + struct fwnode_handle *fwnode = NULL; struct msi_domain_info *info; struct irq_domain *domain; @@ -1025,7 +1026,10 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid) dev->msi.data->__domains[domid].domain = NULL; info = domain->host_data; + if (irq_domain_is_msi_device(domain)) + fwnode = domain->fwnode; irq_domain_remove(domain); + irq_domain_free_fwnode(fwnode); kfree(container_of(info, struct msi_domain_template, info)); unlock: diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index f35d9cc1aab1..bfbc12da3326 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -157,14 +157,11 @@ static void test_kallsyms_compression_ratio(void) static int lookup_name(void *data, const char *name, struct module *mod, unsigned long addr) { u64 t0, t1, t; - unsigned long flags; struct test_stat *stat = (struct test_stat *)data; - local_irq_save(flags); - t0 = sched_clock(); + t0 = ktime_get_ns(); (void)kallsyms_lookup_name(name); - t1 = sched_clock(); - local_irq_restore(flags); + t1 = ktime_get_ns(); t = t1 - t0; if (t < stat->min) @@ -234,18 +231,15 @@ static int find_symbol(void *data, const char *name, struct module *mod, unsigne static void test_perf_kallsyms_on_each_symbol(void) { u64 t0, t1; - unsigned long flags; struct test_stat stat; memset(&stat, 0, sizeof(stat)); stat.max = INT_MAX; stat.name = stub_name; stat.perf = 1; - local_irq_save(flags); - t0 = sched_clock(); + t0 = ktime_get_ns(); kallsyms_on_each_symbol(find_symbol, &stat); - t1 = sched_clock(); - local_irq_restore(flags); + t1 = ktime_get_ns(); pr_info("kallsyms_on_each_symbol() traverse all: %lld ns\n", t1 - t0); } @@ -270,17 +264,14 @@ static int match_symbol(void *data, unsigned long addr) static void test_perf_kallsyms_on_each_match_symbol(void) { u64 t0, t1; - unsigned long flags; struct test_stat stat; memset(&stat, 0, sizeof(stat)); stat.max = INT_MAX; stat.name = stub_name; - local_irq_save(flags); - t0 = sched_clock(); + t0 = ktime_get_ns(); kallsyms_on_each_match_symbol(match_symbol, stat.name, &stat); - t1 = sched_clock(); - local_irq_restore(flags); + t1 = ktime_get_ns(); pr_info("kallsyms_on_each_match_symbol() traverse all: %lld ns\n", t1 - t0); } diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index dcec1b743c69..a60c561724be 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -159,7 +159,7 @@ static bool __report_matches(const struct expect_report *r) const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT; bool ret = false; unsigned long flags; - typeof(observed.lines) expect; + typeof(*observed.lines) *expect; const char *end; char *cur; int i; @@ -168,6 +168,10 @@ static bool __report_matches(const struct expect_report *r) if (!report_available()) return false; + expect = kmalloc(sizeof(observed.lines), GFP_KERNEL); + if (WARN_ON(!expect)) + return false; + /* Generate expected report contents. */ /* Title */ @@ -253,6 +257,7 @@ static bool __report_matches(const struct expect_report *r) strstr(observed.lines[2], expect[1]))); out: spin_unlock_irqrestore(&observed.lock, flags); + kfree(expect); return ret; } diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index e3375bc40dad..50d4863974e7 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -55,6 +55,7 @@ #include <linux/rcupdate.h> #include <linux/kprobes.h> #include <linux/lockdep.h> +#include <linux/context_tracking.h> #include <asm/sections.h> @@ -6555,6 +6556,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) { struct task_struct *curr = current; int dl = READ_ONCE(debug_locks); + bool rcu = warn_rcu_enter(); /* Note: the following can be executed concurrently, so be careful. */ pr_warn("\n"); @@ -6595,5 +6597,6 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) lockdep_print_held_locks(curr); pr_warn("\nstack backtrace:\n"); dump_stack(); + warn_rcu_exit(rcu); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 2b23378775fe..ebe6b8ec7cb3 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -371,7 +371,7 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) /* * We're pending, wait for the owner to go away. * - * 0,1,1 -> 0,1,0 + * 0,1,1 -> *,1,0 * * this wait loop must be a load-acquire such that we match the * store-release that clears the locked bit and create lock @@ -380,7 +380,7 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * barriers. */ if (val & _Q_LOCKED_MASK) - atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK)); + smp_cond_load_acquire(&lock->locked, !VAL); /* * take ownership and clear the pending bit. diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 010cf4e6d0b8..728f434de2bb 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -901,8 +901,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * then we need to wake the new top waiter up to try * to get the lock. */ - if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) - wake_up_state(waiter->task, waiter->wake_state); + top_waiter = rt_mutex_top_waiter(lock); + if (prerequeue_top_waiter != top_waiter) + wake_up_state(top_waiter->task, top_waiter->wake_state); raw_spin_unlock_irq(&lock->wait_lock); return 0; } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 44873594de03..acb5a50309a1 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -256,16 +256,13 @@ static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp) static inline bool rwsem_write_trylock(struct rw_semaphore *sem) { long tmp = RWSEM_UNLOCKED_VALUE; - bool ret = false; - preempt_disable(); if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) { rwsem_set_owner(sem); - ret = true; + return true; } - preempt_enable(); - return ret; + return false; } /* @@ -624,18 +621,16 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, */ if (first->handoff_set && (waiter != first)) return false; - - /* - * First waiter can inherit a previously set handoff - * bit and spin on rwsem if lock acquisition fails. - */ - if (waiter == first) - waiter->handoff_set = true; } new = count; if (count & RWSEM_LOCK_MASK) { + /* + * A waiter (first or not) can set the handoff bit + * if it is an RT task or wait in the wait queue + * for too long. + */ if (has_handoff || (!rt_task(waiter->task) && !time_after(jiffies, waiter->timeout))) return false; @@ -651,11 +646,12 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)); /* - * We have either acquired the lock with handoff bit cleared or - * set the handoff bit. + * We have either acquired the lock with handoff bit cleared or set + * the handoff bit. Only the first waiter can have its handoff_set + * set here to enable optimistic spinning in slowpath loop. */ if (new & RWSEM_FLAG_HANDOFF) { - waiter->handoff_set = true; + first->handoff_set = true; lockevent_inc(rwsem_wlock_handoff); return false; } @@ -717,7 +713,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) return false; } - preempt_disable(); /* * Disable preemption is equal to the RCU read-side crital section, * thus the task_strcut structure won't go away. @@ -729,7 +724,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) if ((flags & RWSEM_NONSPINNABLE) || (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner))) ret = false; - preempt_enable(); lockevent_cond_inc(rwsem_opt_fail, !ret); return ret; @@ -829,8 +823,6 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) int loop = 0; u64 rspin_threshold = 0; - preempt_disable(); - /* sem->wait_lock should not be held when doing optimistic spinning */ if (!osq_lock(&sem->osq)) goto done; @@ -938,7 +930,6 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) } osq_unlock(&sem->osq); done: - preempt_enable(); lockevent_cond_inc(rwsem_opt_fail, !taken); return taken; } @@ -1092,7 +1083,7 @@ queue: /* Ordered by sem->wait_lock against rwsem_mark_wake(). */ break; } - schedule(); + schedule_preempt_disabled(); lockevent_inc(rwsem_sleep_reader); } @@ -1179,15 +1170,12 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) if (waiter.handoff_set) { enum owner_state owner_state; - preempt_disable(); owner_state = rwsem_spin_on_owner(sem); - preempt_enable(); - if (owner_state == OWNER_NULL) goto trylock_again; } - schedule(); + schedule_preempt_disabled(); lockevent_inc(rwsem_sleep_writer); set_current_state(state); trylock_again: @@ -1254,14 +1242,20 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) */ static inline int __down_read_common(struct rw_semaphore *sem, int state) { + int ret = 0; long count; + preempt_disable(); if (!rwsem_read_trylock(sem, &count)) { - if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) - return -EINTR; + if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) { + ret = -EINTR; + goto out; + } DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); } - return 0; +out: + preempt_enable(); + return ret; } static inline void __down_read(struct rw_semaphore *sem) @@ -1281,19 +1275,23 @@ static inline int __down_read_killable(struct rw_semaphore *sem) static inline int __down_read_trylock(struct rw_semaphore *sem) { + int ret = 0; long tmp; DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); + preempt_disable(); tmp = atomic_long_read(&sem->count); while (!(tmp & RWSEM_READ_FAILED_MASK)) { if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, tmp + RWSEM_READER_BIAS)) { rwsem_set_reader_owned(sem); - return 1; + ret = 1; + break; } } - return 0; + preempt_enable(); + return ret; } /* @@ -1301,12 +1299,15 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) */ static inline int __down_write_common(struct rw_semaphore *sem, int state) { + int ret = 0; + + preempt_disable(); if (unlikely(!rwsem_write_trylock(sem))) { if (IS_ERR(rwsem_down_write_slowpath(sem, state))) - return -EINTR; + ret = -EINTR; } - - return 0; + preempt_enable(); + return ret; } static inline void __down_write(struct rw_semaphore *sem) @@ -1321,8 +1322,14 @@ static inline int __down_write_killable(struct rw_semaphore *sem) static inline int __down_write_trylock(struct rw_semaphore *sem) { + int ret; + + preempt_disable(); DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); - return rwsem_write_trylock(sem); + ret = rwsem_write_trylock(sem); + preempt_enable(); + + return ret; } /* @@ -1335,6 +1342,7 @@ static inline void __up_read(struct rw_semaphore *sem) DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); + preempt_disable(); rwsem_clear_reader_owned(sem); tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); DEBUG_RWSEMS_WARN_ON(tmp < 0, sem); @@ -1343,6 +1351,7 @@ static inline void __up_read(struct rw_semaphore *sem) clear_nonspinnable(sem); rwsem_wake(sem); } + preempt_enable(); } /* @@ -1363,9 +1372,9 @@ static inline void __up_write(struct rw_semaphore *sem) preempt_disable(); rwsem_clear_owner(sem); tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); - preempt_enable(); if (unlikely(tmp & RWSEM_FLAG_WAITERS)) rwsem_wake(sem); + preempt_enable(); } /* @@ -1383,11 +1392,13 @@ static inline void __downgrade_write(struct rw_semaphore *sem) * write side. As such, rely on RELEASE semantics. */ DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem); + preempt_disable(); tmp = atomic_long_fetch_add_release( -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count); rwsem_set_reader_owned(sem); if (tmp & RWSEM_FLAG_WAITERS) rwsem_downgrade_wake(sem); + preempt_enable(); } #else /* !CONFIG_PREEMPT_RT */ @@ -1662,6 +1673,12 @@ void down_read_non_owner(struct rw_semaphore *sem) { might_sleep(); __down_read(sem); + /* + * The owner value for a reader-owned lock is mostly for debugging + * purpose only and is not critical to the correct functioning of + * rwsem. So it is perfectly fine to set it in a preempt-enabled + * context here. + */ __rwsem_set_reader_owned(sem, NULL); } EXPORT_SYMBOL(down_read_non_owner); diff --git a/kernel/module/main.c b/kernel/module/main.c index 48568a0f5651..4ac3fe43e6c8 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2393,7 +2393,8 @@ static bool finished_loading(const char *name) sched_annotate_sleep(); mutex_lock(&module_mutex); mod = find_module_all(name, strlen(name), true); - ret = !mod || mod->state == MODULE_STATE_LIVE; + ret = !mod || mod->state == MODULE_STATE_LIVE + || mod->state == MODULE_STATE_GOING; mutex_unlock(&module_mutex); return ret; @@ -2569,20 +2570,35 @@ static int add_unformed_module(struct module *mod) mod->state = MODULE_STATE_UNFORMED; -again: mutex_lock(&module_mutex); old = find_module_all(mod->name, strlen(mod->name), true); if (old != NULL) { - if (old->state != MODULE_STATE_LIVE) { + if (old->state == MODULE_STATE_COMING + || old->state == MODULE_STATE_UNFORMED) { /* Wait in case it fails to load. */ mutex_unlock(&module_mutex); err = wait_event_interruptible(module_wq, finished_loading(mod->name)); if (err) goto out_unlocked; - goto again; + + /* The module might have gone in the meantime. */ + mutex_lock(&module_mutex); + old = find_module_all(mod->name, strlen(mod->name), + true); } - err = -EEXIST; + + /* + * We are here only when the same module was being loaded. Do + * not try to load it again right now. It prevents long delays + * caused by serialized module load failures. It might happen + * when more devices of the same type trigger load of + * a particular module. + */ + if (old && old->state == MODULE_STATE_LIVE) + err = -EEXIST; + else + err = -EBUSY; goto out; } mod_update_bounds(mod); diff --git a/kernel/panic.c b/kernel/panic.c index 463c9295bc28..487f5b03bf83 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -34,6 +34,7 @@ #include <linux/ratelimit.h> #include <linux/debugfs.h> #include <linux/sysfs.h> +#include <linux/context_tracking.h> #include <trace/events/error_report.h> #include <asm/sections.h> @@ -679,6 +680,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, void warn_slowpath_fmt(const char *file, int line, unsigned taint, const char *fmt, ...) { + bool rcu = warn_rcu_enter(); struct warn_args args; pr_warn(CUT_HERE); @@ -693,11 +695,13 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint, va_start(args.args, fmt); __warn(file, line, __builtin_return_address(0), taint, NULL, &args); va_end(args.args); + warn_rcu_exit(rcu); } EXPORT_SYMBOL(warn_slowpath_fmt); #else void __warn_printk(const char *fmt, ...) { + bool rcu = warn_rcu_enter(); va_list args; pr_warn(CUT_HERE); @@ -705,6 +709,7 @@ void __warn_printk(const char *fmt, ...) va_start(args, fmt); vprintk(fmt, args); va_end(args); + warn_rcu_exit(rcu); } EXPORT_SYMBOL(__warn_printk); #endif diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 7decf1e9c486..94f136b25f6a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -123,6 +123,7 @@ bool console_srcu_read_lock_is_held(void) { return srcu_read_lock_held(&console_srcu); } +EXPORT_SYMBOL(console_srcu_read_lock_is_held); #endif enum devkmsg_log_bits { @@ -1891,6 +1892,7 @@ static void console_lock_spinning_enable(void) /** * console_lock_spinning_disable_and_check - mark end of code where another * thread was able to busy wait and check if there is a waiter + * @cookie: cookie returned from console_srcu_read_lock() * * This is called at the end of the section where spinning is allowed. * It has two functions. First, it is a signal that it is no longer @@ -2194,7 +2196,7 @@ static u16 printk_sprint(char *text, u16 size, int facility, } } - trace_console_rcuidle(text, text_len); + trace_console(text, text_len); return text_len; } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 54482193e1ed..0786450074c1 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -813,7 +813,7 @@ static long ptrace_get_rseq_configuration(struct task_struct *task, { struct ptrace_rseq_configuration conf = { .rseq_abi_pointer = (u64)(uintptr_t)task->rseq, - .rseq_abi_size = sizeof(*task->rseq), + .rseq_abi_size = task->rseq_len, .signature = task->rseq_sig, .flags = 0, }; diff --git a/kernel/rseq.c b/kernel/rseq.c index d38ab944105d..9de6e35fe679 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -18,6 +18,9 @@ #define CREATE_TRACE_POINTS #include <trace/events/rseq.h> +/* The original rseq structure size (including padding) is 32 bytes. */ +#define ORIG_RSEQ_SIZE 32 + #define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) @@ -82,15 +85,25 @@ * F1. <failure> */ -static int rseq_update_cpu_id(struct task_struct *t) +static int rseq_update_cpu_node_id(struct task_struct *t) { - u32 cpu_id = raw_smp_processor_id(); struct rseq __user *rseq = t->rseq; + u32 cpu_id = raw_smp_processor_id(); + u32 node_id = cpu_to_node(cpu_id); + u32 mm_cid = task_mm_cid(t); - if (!user_write_access_begin(rseq, sizeof(*rseq))) + WARN_ON_ONCE((int) mm_cid < 0); + if (!user_write_access_begin(rseq, t->rseq_len)) goto efault; unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end); unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end); + unsafe_put_user(node_id, &rseq->node_id, efault_end); + unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end); + /* + * Additional feature fields added after ORIG_RSEQ_SIZE + * need to be conditionally updated only if + * t->rseq_len != ORIG_RSEQ_SIZE. + */ user_write_access_end(); trace_rseq_update(t); return 0; @@ -101,9 +114,10 @@ efault: return -EFAULT; } -static int rseq_reset_rseq_cpu_id(struct task_struct *t) +static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) { - u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED; + u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, + mm_cid = 0; /* * Reset cpu_id_start to its initial state (0). @@ -117,6 +131,21 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t) */ if (put_user(cpu_id, &t->rseq->cpu_id)) return -EFAULT; + /* + * Reset node_id to its initial state (0). + */ + if (put_user(node_id, &t->rseq->node_id)) + return -EFAULT; + /* + * Reset mm_cid to its initial state (0). + */ + if (put_user(mm_cid, &t->rseq->mm_cid)) + return -EFAULT; + /* + * Additional feature fields added after ORIG_RSEQ_SIZE + * need to be conditionally reset only if + * t->rseq_len != ORIG_RSEQ_SIZE. + */ return 0; } @@ -301,7 +330,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) if (unlikely(ret < 0)) goto error; } - if (unlikely(rseq_update_cpu_id(t))) + if (unlikely(rseq_update_cpu_node_id(t))) goto error; return; @@ -344,15 +373,16 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, /* Unregister rseq for current thread. */ if (current->rseq != rseq || !current->rseq) return -EINVAL; - if (rseq_len != sizeof(*rseq)) + if (rseq_len != current->rseq_len) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; - ret = rseq_reset_rseq_cpu_id(current); + ret = rseq_reset_rseq_cpu_node_id(current); if (ret) return ret; current->rseq = NULL; current->rseq_sig = 0; + current->rseq_len = 0; return 0; } @@ -365,7 +395,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, * the provided address differs from the prior * one. */ - if (current->rseq != rseq || rseq_len != sizeof(*rseq)) + if (current->rseq != rseq || rseq_len != current->rseq_len) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; @@ -374,15 +404,24 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, } /* - * If there was no rseq previously registered, - * ensure the provided rseq is properly aligned and valid. + * If there was no rseq previously registered, ensure the provided rseq + * is properly aligned, as communcated to user-space through the ELF + * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq + * size, the required alignment is the original struct rseq alignment. + * + * In order to be valid, rseq_len is either the original rseq size, or + * large enough to contain all supported fields, as communicated to + * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. */ - if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || - rseq_len != sizeof(*rseq)) + if (rseq_len < ORIG_RSEQ_SIZE || + (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) || + (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || + rseq_len < offsetof(struct rseq, end)))) return -EINVAL; if (!access_ok(rseq, rseq_len)) return -EFAULT; current->rseq = rseq; + current->rseq_len = rseq_len; current->rseq_sig = sig; /* * If rseq was previously inactive, and has just been diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e374c0c923da..5732fa75ebab 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -93,7 +93,7 @@ struct sched_clock_data { static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); -notrace static inline struct sched_clock_data *this_scd(void) +static __always_inline struct sched_clock_data *this_scd(void) { return this_cpu_ptr(&sched_clock_data); } @@ -244,12 +244,12 @@ late_initcall(sched_clock_init_late); * min, max except they take wrapping into account */ -notrace static inline u64 wrap_min(u64 x, u64 y) +static __always_inline u64 wrap_min(u64 x, u64 y) { return (s64)(x - y) < 0 ? x : y; } -notrace static inline u64 wrap_max(u64 x, u64 y) +static __always_inline u64 wrap_max(u64 x, u64 y) { return (s64)(x - y) > 0 ? x : y; } @@ -260,7 +260,7 @@ notrace static inline u64 wrap_max(u64 x, u64 y) * - filter out backward motion * - use the GTOD tick value to create a window to filter crazy TSC values */ -notrace static u64 sched_clock_local(struct sched_clock_data *scd) +static __always_inline u64 sched_clock_local(struct sched_clock_data *scd) { u64 now, clock, old_clock, min_clock, max_clock, gtod; s64 delta; @@ -287,13 +287,28 @@ again: clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); - if (!try_cmpxchg64(&scd->clock, &old_clock, clock)) + if (!arch_try_cmpxchg64(&scd->clock, &old_clock, clock)) goto again; return clock; } -notrace static u64 sched_clock_remote(struct sched_clock_data *scd) +noinstr u64 local_clock(void) +{ + u64 clock; + + if (static_branch_likely(&__sched_clock_stable)) + return sched_clock() + __sched_clock_offset; + + preempt_disable_notrace(); + clock = sched_clock_local(this_scd()); + preempt_enable_notrace(); + + return clock; +} +EXPORT_SYMBOL_GPL(local_clock); + +static notrace u64 sched_clock_remote(struct sched_clock_data *scd) { struct sched_clock_data *my_scd = this_scd(); u64 this_clock, remote_clock; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 25b582b6ee5f..fb49dbf61273 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -152,7 +152,7 @@ __read_mostly int scheduler_running; DEFINE_STATIC_KEY_FALSE(__sched_core_enabled); /* kernel prio, less is more */ -static inline int __task_prio(struct task_struct *p) +static inline int __task_prio(const struct task_struct *p) { if (p->sched_class == &stop_sched_class) /* trumps deadline */ return -2; @@ -174,7 +174,8 @@ static inline int __task_prio(struct task_struct *p) */ /* real prio, less is less */ -static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi) +static inline bool prio_less(const struct task_struct *a, + const struct task_struct *b, bool in_fi) { int pa = __task_prio(a), pb = __task_prio(b); @@ -194,7 +195,8 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool return false; } -static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b) +static inline bool __sched_core_less(const struct task_struct *a, + const struct task_struct *b) { if (a->core_cookie < b->core_cookie) return true; @@ -2604,27 +2606,71 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) .user_mask = NULL, .flags = SCA_USER, /* clear the user requested mask */ }; + union cpumask_rcuhead { + cpumask_t cpumask; + struct rcu_head rcu; + }; __do_set_cpus_allowed(p, &ac); - kfree(ac.user_mask); + + /* + * Because this is called with p->pi_lock held, it is not possible + * to use kfree() here (when PREEMPT_RT=y), therefore punt to using + * kfree_rcu(). + */ + kfree_rcu((union cpumask_rcuhead *)ac.user_mask, rcu); +} + +static cpumask_t *alloc_user_cpus_ptr(int node) +{ + /* + * See do_set_cpus_allowed() above for the rcu_head usage. + */ + int size = max_t(int, cpumask_size(), sizeof(struct rcu_head)); + + return kmalloc_node(size, GFP_KERNEL, node); } int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node) { + cpumask_t *user_mask; unsigned long flags; - if (!src->user_cpus_ptr) + /* + * Always clear dst->user_cpus_ptr first as their user_cpus_ptr's + * may differ by now due to racing. + */ + dst->user_cpus_ptr = NULL; + + /* + * This check is racy and losing the race is a valid situation. + * It is not worth the extra overhead of taking the pi_lock on + * every fork/clone. + */ + if (data_race(!src->user_cpus_ptr)) return 0; - dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node); - if (!dst->user_cpus_ptr) + user_mask = alloc_user_cpus_ptr(node); + if (!user_mask) return -ENOMEM; - /* Use pi_lock to protect content of user_cpus_ptr */ + /* + * Use pi_lock to protect content of user_cpus_ptr + * + * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent + * do_set_cpus_allowed(). + */ raw_spin_lock_irqsave(&src->pi_lock, flags); - cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); + if (src->user_cpus_ptr) { + swap(dst->user_cpus_ptr, user_mask); + cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); + } raw_spin_unlock_irqrestore(&src->pi_lock, flags); + + if (unlikely(user_mask)) + kfree(user_mask); + return 0; } @@ -2907,8 +2953,11 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, } if (!(ctx->flags & SCA_MIGRATE_ENABLE)) { - if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) + if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) { + if (ctx->flags & SCA_USER) + swap(p->user_cpus_ptr, ctx->user_mask); goto out; + } if (WARN_ON_ONCE(p == current && is_migration_disabled(p) && @@ -3581,6 +3630,11 @@ static inline bool rq_has_pinned_tasks(struct rq *rq) return false; } +static inline cpumask_t *alloc_user_cpus_ptr(int node) +{ + return NULL; +} + #endif /* !CONFIG_SMP */ static void @@ -3623,14 +3677,39 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } /* - * Mark the task runnable and perform wakeup-preemption. + * Mark the task runnable. */ -static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, - struct rq_flags *rf) +static inline void ttwu_do_wakeup(struct task_struct *p) { - check_preempt_curr(rq, p, wake_flags); WRITE_ONCE(p->__state, TASK_RUNNING); trace_sched_wakeup(p); +} + +static void +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, + struct rq_flags *rf) +{ + int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; + + lockdep_assert_rq_held(rq); + + if (p->sched_contributes_to_load) + rq->nr_uninterruptible--; + +#ifdef CONFIG_SMP + if (wake_flags & WF_MIGRATED) + en_flags |= ENQUEUE_MIGRATED; + else +#endif + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } + + activate_task(rq, p, en_flags); + check_preempt_curr(rq, p, wake_flags); + + ttwu_do_wakeup(p); #ifdef CONFIG_SMP if (p->sched_class->task_woken) { @@ -3660,31 +3739,6 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, #endif } -static void -ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, - struct rq_flags *rf) -{ - int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; - - lockdep_assert_rq_held(rq); - - if (p->sched_contributes_to_load) - rq->nr_uninterruptible--; - -#ifdef CONFIG_SMP - if (wake_flags & WF_MIGRATED) - en_flags |= ENQUEUE_MIGRATED; - else -#endif - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } - - activate_task(rq, p, en_flags); - ttwu_do_wakeup(rq, p, wake_flags, rf); -} - /* * Consider @p being inside a wait loop: * @@ -3718,9 +3772,15 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) rq = __task_rq_lock(p, &rf); if (task_on_rq_queued(p)) { - /* check_preempt_curr() may use rq clock */ - update_rq_clock(rq); - ttwu_do_wakeup(rq, p, wake_flags, &rf); + if (!task_on_cpu(rq, p)) { + /* + * When on_rq && !on_cpu the task is preempted, see if + * it should preempt the task that is current now. + */ + update_rq_clock(rq); + check_preempt_curr(rq, p, wake_flags); + } + ttwu_do_wakeup(p); ret = 1; } __task_rq_unlock(rq, &rf); @@ -4086,8 +4146,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) goto out; trace_sched_waking(p); - WRITE_ONCE(p->__state, TASK_RUNNING); - trace_sched_wakeup(p); + ttwu_do_wakeup(p); goto out; } @@ -5052,6 +5111,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); rseq_preempt(prev); + switch_mm_cid(prev, next); fire_sched_out_preempt_notifiers(prev, next); kmap_local_sched_out(); prepare_task(next); @@ -5504,7 +5564,9 @@ void scheduler_tick(void) unsigned long thermal_pressure; u64 resched_latency; - arch_scale_freq_tick(); + if (housekeeping_cpu(cpu, HK_TYPE_TICK)) + arch_scale_freq_tick(); + sched_clock_tick(); rq_lock(rq, &rf); @@ -6206,7 +6268,7 @@ static bool steal_cookie_task(int cpu, struct sched_domain *sd) { int i; - for_each_cpu_wrap(i, sched_domain_span(sd), cpu) { + for_each_cpu_wrap(i, sched_domain_span(sd), cpu + 1) { if (i == cpu) continue; @@ -8239,12 +8301,18 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (retval) goto out_put_task; - user_mask = kmalloc(cpumask_size(), GFP_KERNEL); - if (!user_mask) { + /* + * With non-SMP configs, user_cpus_ptr/user_mask isn't used and + * alloc_user_cpus_ptr() returns NULL. + */ + user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE); + if (user_mask) { + cpumask_copy(user_mask, in_mask); + } else if (IS_ENABLED(CONFIG_SMP)) { retval = -ENOMEM; goto out_put_task; } - cpumask_copy(user_mask, in_mask); + ac = (struct affinity_context){ .new_mask = in_mask, .user_mask = user_mask, @@ -11305,3 +11373,53 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) { trace_sched_update_nr_running_tp(rq, count); } + +#ifdef CONFIG_SCHED_MM_CID +void sched_mm_cid_exit_signals(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + unsigned long flags; + + if (!mm) + return; + local_irq_save(flags); + mm_cid_put(mm, t->mm_cid); + t->mm_cid = -1; + t->mm_cid_active = 0; + local_irq_restore(flags); +} + +void sched_mm_cid_before_execve(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + unsigned long flags; + + if (!mm) + return; + local_irq_save(flags); + mm_cid_put(mm, t->mm_cid); + t->mm_cid = -1; + t->mm_cid_active = 0; + local_irq_restore(flags); +} + +void sched_mm_cid_after_execve(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + unsigned long flags; + + if (!mm) + return; + local_irq_save(flags); + t->mm_cid = mm_cid_get(mm); + t->mm_cid_active = 1; + local_irq_restore(flags); + rseq_set_notify_resume(t); +} + +void sched_mm_cid_fork(struct task_struct *t) +{ + WARN_ON_ONCE(!t->mm || t->mm_cid != -1); + t->mm_cid_active = 1; +} +#endif diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 1207c78f85c1..5c840151f3bb 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -48,7 +48,6 @@ struct sugov_cpu { unsigned long util; unsigned long bw_dl; - unsigned long max; /* The field below is for single-CPU policies only: */ #ifdef CONFIG_NO_HZ_COMMON @@ -158,7 +157,6 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); - sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); sg_cpu->bw_dl = cpu_bw_dl(rq); sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), FREQUENCY_UTIL, NULL); @@ -238,6 +236,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * sugov_iowait_apply() - Apply the IO boost to a CPU. * @sg_cpu: the sugov data for the cpu to boost * @time: the update time from the caller + * @max_cap: the max CPU capacity * * A CPU running a task which woken up after an IO operation can have its * utilization boosted to speed up the completion of those IO operations. @@ -251,7 +250,8 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * This mechanism is designed to boost high frequently IO waiting tasks, while * being more conservative on tasks which does sporadic IO operations. */ -static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) +static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, + unsigned long max_cap) { unsigned long boost; @@ -280,7 +280,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) * sg_cpu->util is already in capacity scale; convert iowait_boost * into the same scale so we can compare. */ - boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT; + boost = (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL); if (sg_cpu->util < boost) sg_cpu->util = boost; @@ -310,7 +310,8 @@ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) } static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, - u64 time, unsigned int flags) + u64 time, unsigned long max_cap, + unsigned int flags) { sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -321,7 +322,7 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, return false; sugov_get_util(sg_cpu); - sugov_iowait_apply(sg_cpu, time); + sugov_iowait_apply(sg_cpu, time, max_cap); return true; } @@ -332,12 +333,15 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time, struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; unsigned int cached_freq = sg_policy->cached_raw_freq; + unsigned long max_cap; unsigned int next_f; - if (!sugov_update_single_common(sg_cpu, time, flags)) + max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); + + if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) return; - next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max); + next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap); /* * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. @@ -374,6 +378,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); unsigned long prev_util = sg_cpu->util; + unsigned long max_cap; /* * Fall back to the "frequency" path if frequency invariance is not @@ -385,7 +390,9 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, return; } - if (!sugov_update_single_common(sg_cpu, time, flags)) + max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); + + if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) return; /* @@ -399,7 +406,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, sg_cpu->util = prev_util; cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), - map_util_perf(sg_cpu->util), sg_cpu->max); + map_util_perf(sg_cpu->util), max_cap); sg_cpu->sg_policy->last_freq_update_time = time; } @@ -408,25 +415,21 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - unsigned long util = 0, max = 1; + unsigned long util = 0, max_cap; unsigned int j; + max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); + for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); - unsigned long j_util, j_max; sugov_get_util(j_sg_cpu); - sugov_iowait_apply(j_sg_cpu, time); - j_util = j_sg_cpu->util; - j_max = j_sg_cpu->max; + sugov_iowait_apply(j_sg_cpu, time, max_cap); - if (j_util * max > j_max * util) { - util = j_util; - max = j_max; - } + util = max(j_sg_cpu->util, util); } - return get_next_freq(sg_policy, util, max); + return get_next_freq(sg_policy, util, max_cap); } static void diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 95fc77853743..af7952f12e6c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -3,6 +3,10 @@ * Simple CPU accounting cgroup controller */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + #include <asm/cputime.h> +#endif + #ifdef CONFIG_IRQ_TIME_ACCOUNTING /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0d97d54276cc..71b24371a6f7 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2663,17 +2663,20 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) static void prio_changed_dl(struct rq *rq, struct task_struct *p, int oldprio) { - if (task_on_rq_queued(p) || task_current(rq, p)) { + if (!task_on_rq_queued(p)) + return; + #ifdef CONFIG_SMP - /* - * This might be too much, but unfortunately - * we don't have the old deadline value, and - * we can't argue if the task is increasing - * or lowering its prio, so... - */ - if (!rq->dl.overloaded) - deadline_queue_pull_task(rq); + /* + * This might be too much, but unfortunately + * we don't have the old deadline value, and + * we can't argue if the task is increasing + * or lowering its prio, so... + */ + if (!rq->dl.overloaded) + deadline_queue_pull_task(rq); + if (task_current(rq, p)) { /* * If we now have a earlier deadline task than p, * then reschedule, provided p is still on this @@ -2681,15 +2684,24 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, */ if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline)) resched_curr(rq); -#else + } else { /* - * Again, we don't know if p has a earlier - * or later deadline, so let's blindly set a - * (maybe not needed) rescheduling point. + * Current may not be deadline in case p was throttled but we + * have just replenished it (e.g. rt_mutex_setprio()). + * + * Otherwise, if p was given an earlier deadline, reschedule. */ - resched_curr(rq); -#endif /* CONFIG_SMP */ + if (!dl_task(rq->curr) || + dl_time_before(p->dl.deadline, rq->curr->dl.deadline)) + resched_curr(rq); } +#else + /* + * We don't know if p has a earlier or later deadline, so let's blindly + * set a (maybe not needed) rescheduling point. + */ + resched_curr(rq); +#endif } DEFINE_SCHED_CLASS(dl) = { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c36aa54ae071..ff4dbbae3b10 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -468,7 +468,7 @@ is_same_group(struct sched_entity *se, struct sched_entity *pse) return NULL; } -static inline struct sched_entity *parent_entity(struct sched_entity *se) +static inline struct sched_entity *parent_entity(const struct sched_entity *se) { return se->parent; } @@ -595,8 +595,8 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } -static inline bool entity_before(struct sched_entity *a, - struct sched_entity *b) +static inline bool entity_before(const struct sched_entity *a, + const struct sched_entity *b) { return (s64)(a->vruntime - b->vruntime) < 0; } @@ -1804,7 +1804,7 @@ static void update_numa_stats(struct task_numa_env *env, ns->nr_running += rq->cfs.h_nr_running; ns->compute_capacity += capacity_of(cpu); - if (find_idle && !rq->nr_running && idle_cpu(cpu)) { + if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) { if (READ_ONCE(rq->numa_migrate_on) || !cpumask_test_cpu(cpu, env->p->cpus_ptr)) continue; @@ -1836,7 +1836,7 @@ static void task_numa_assign(struct task_numa_env *env, int start = env->dst_cpu; /* Find alternative idle CPU. */ - for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) { + for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) { if (cpu == env->best_cpu || !idle_cpu(cpu) || !cpumask_test_cpu(cpu, env->p->cpus_ptr)) { continue; @@ -4476,17 +4476,9 @@ static inline int util_fits_cpu(unsigned long util, * * For uclamp_max, we can tolerate a drop in performance level as the * goal is to cap the task. So it's okay if it's getting less. - * - * In case of capacity inversion we should honour the inverted capacity - * for both uclamp_min and uclamp_max all the time. */ - capacity_orig = cpu_in_capacity_inversion(cpu); - if (capacity_orig) { - capacity_orig_thermal = capacity_orig; - } else { - capacity_orig = capacity_orig_of(cpu); - capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); - } + capacity_orig = capacity_orig_of(cpu); + capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); /* * We want to force a task to fit a cpu as implied by uclamp_max. @@ -4561,8 +4553,8 @@ static inline int util_fits_cpu(unsigned long util, * handle the case uclamp_min > uclamp_max. */ uclamp_min = min(uclamp_min, uclamp_max); - if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE) - fits = fits && (uclamp_min <= capacity_orig_thermal); + if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal)) + return -1; return fits; } @@ -4572,7 +4564,11 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu) unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN); unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX); unsigned long util = task_util_est(p); - return util_fits_cpu(util, uclamp_min, uclamp_max, cpu); + /* + * Return true only if the cpu fully fits the task requirements, which + * include the utilization but also the performance hints. + */ + return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0); } static inline void update_misfit_status(struct task_struct *p, struct rq *rq) @@ -4656,6 +4652,7 @@ static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { u64 vruntime = cfs_rq->min_vruntime; + u64 sleep_time; /* * The 'current' period is already promised to the current tasks, @@ -4685,8 +4682,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime -= thresh; } - /* ensure we never gain time by being placed backwards. */ - se->vruntime = max_vruntime(se->vruntime, vruntime); + /* + * Pull vruntime of the entity being placed to the base level of + * cfs_rq, to prevent boosting it if placed backwards. If the entity + * slept for a long time, don't even try to compare its vruntime with + * the base as it may be too far off and the comparison may get + * inversed due to s64 overflow. + */ + sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start; + if ((s64)sleep_time > 60LL * NSEC_PER_SEC) + se->vruntime = vruntime; + else + se->vruntime = max_vruntime(se->vruntime, vruntime); } static void check_enqueue_throttle(struct cfs_rq *cfs_rq); @@ -4896,7 +4903,13 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) struct sched_entity *se; s64 delta; - ideal_runtime = sched_slice(cfs_rq, curr); + /* + * When many tasks blow up the sched_period; it is possible that + * sched_slice() reports unusually large results (when many tasks are + * very light for example). Therefore impose a maximum. + */ + ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency); + delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { resched_curr(rq_of(cfs_rq)); @@ -5461,22 +5474,105 @@ unthrottle_throttle: resched_curr(rq); } -static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) +#ifdef CONFIG_SMP +static void __cfsb_csd_unthrottle(void *arg) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cursor, *tmp; + struct rq *rq = arg; + struct rq_flags rf; + + rq_lock(rq, &rf); + + /* + * Since we hold rq lock we're safe from concurrent manipulation of + * the CSD list. However, this RCU critical section annotates the + * fact that we pair with sched_free_group_rcu(), so that we cannot + * race with group being freed in the window between removing it + * from the list and advancing to the next entry in the list. + */ + rcu_read_lock(); + + list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list, + throttled_csd_list) { + list_del_init(&cursor->throttled_csd_list); + + if (cfs_rq_throttled(cursor)) + unthrottle_cfs_rq(cursor); + } + + rcu_read_unlock(); + + rq_unlock(rq, &rf); +} + +static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + bool first; + + if (rq == this_rq()) { + unthrottle_cfs_rq(cfs_rq); + return; + } + + /* Already enqueued */ + if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list))) + return; + + first = list_empty(&rq->cfsb_csd_list); + list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list); + if (first) + smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd); +} +#else +static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) +{ + unthrottle_cfs_rq(cfs_rq); +} +#endif + +static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) +{ + lockdep_assert_rq_held(rq_of(cfs_rq)); + + if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) || + cfs_rq->runtime_remaining <= 0)) + return; + + __unthrottle_cfs_rq_async(cfs_rq); +} + +static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) +{ + struct cfs_rq *local_unthrottle = NULL; + int this_cpu = smp_processor_id(); u64 runtime, remaining = 1; + bool throttled = false; + struct cfs_rq *cfs_rq; + struct rq_flags rf; + struct rq *rq; rcu_read_lock(); list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, throttled_list) { - struct rq *rq = rq_of(cfs_rq); - struct rq_flags rf; + rq = rq_of(cfs_rq); + + if (!remaining) { + throttled = true; + break; + } rq_lock_irqsave(rq, &rf); if (!cfs_rq_throttled(cfs_rq)) goto next; - /* By the above check, this should never be true */ +#ifdef CONFIG_SMP + /* Already queued for async unthrottle */ + if (!list_empty(&cfs_rq->throttled_csd_list)) + goto next; +#endif + + /* By the above checks, this should never be true */ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); raw_spin_lock(&cfs_b->lock); @@ -5490,16 +5586,30 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) cfs_rq->runtime_remaining += runtime; /* we check whether we're throttled above */ - if (cfs_rq->runtime_remaining > 0) - unthrottle_cfs_rq(cfs_rq); + if (cfs_rq->runtime_remaining > 0) { + if (cpu_of(rq) != this_cpu || + SCHED_WARN_ON(local_unthrottle)) + unthrottle_cfs_rq_async(cfs_rq); + else + local_unthrottle = cfs_rq; + } else { + throttled = true; + } next: rq_unlock_irqrestore(rq, &rf); - - if (!remaining) - break; } rcu_read_unlock(); + + if (local_unthrottle) { + rq = cpu_rq(this_cpu); + rq_lock_irqsave(rq, &rf); + if (cfs_rq_throttled(local_unthrottle)) + unthrottle_cfs_rq(local_unthrottle); + rq_unlock_irqrestore(rq, &rf); + } + + return throttled; } /* @@ -5544,10 +5654,8 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u while (throttled && cfs_b->runtime > 0) { raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ - distribute_cfs_runtime(cfs_b); + throttled = distribute_cfs_runtime(cfs_b); raw_spin_lock_irqsave(&cfs_b->lock, flags); - - throttled = !list_empty(&cfs_b->throttled_cfs_rq); } /* @@ -5824,6 +5932,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); +#ifdef CONFIG_SMP + INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); +#endif } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -5840,12 +5951,38 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { + int __maybe_unused i; + /* init_cfs_bandwidth() was not called */ if (!cfs_b->throttled_cfs_rq.next) return; hrtimer_cancel(&cfs_b->period_timer); hrtimer_cancel(&cfs_b->slack_timer); + + /* + * It is possible that we still have some cfs_rq's pending on a CSD + * list, though this race is very rare. In order for this to occur, we + * must have raced with the last task leaving the group while there + * exist throttled cfs_rq(s), and the period_timer must have queued the + * CSD item but the remote cpu has not yet processed it. To handle this, + * we can simply flush all pending CSD work inline here. We're + * guaranteed at this point that no additional cfs_rq of this group can + * join a CSD list. + */ +#ifdef CONFIG_SMP + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + unsigned long flags; + + if (list_empty(&rq->cfsb_csd_list)) + continue; + + local_irq_save(flags); + __cfsb_csd_unthrottle(rq); + local_irq_restore(flags); + } +#endif } /* @@ -6008,6 +6145,7 @@ static inline bool cpu_overutilized(int cpu) unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); + /* Return true only if the utilization doesn't fit CPU's capacity */ return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); } @@ -6801,6 +6939,7 @@ static int select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) { unsigned long task_util, util_min, util_max, best_cap = 0; + int fits, best_fits = 0; int cpu, best_cpu = -1; struct cpumask *cpus; @@ -6811,17 +6950,33 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) util_min = uclamp_eff_value(p, UCLAMP_MIN); util_max = uclamp_eff_value(p, UCLAMP_MAX); - for_each_cpu_wrap(cpu, cpus, target) { + for_each_cpu_wrap(cpu, cpus, target + 1) { unsigned long cpu_cap = capacity_of(cpu); if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) continue; - if (util_fits_cpu(task_util, util_min, util_max, cpu)) + + fits = util_fits_cpu(task_util, util_min, util_max, cpu); + + /* This CPU fits with all requirements */ + if (fits > 0) return cpu; + /* + * Only the min performance hint (i.e. uclamp_min) doesn't fit. + * Look for the CPU with best capacity. + */ + else if (fits < 0) + cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu)); - if (cpu_cap > best_cap) { + /* + * First, select CPU which fits better (-1 being better than 0). + * Then, select the one with best capacity at same level. + */ + if ((fits < best_fits) || + ((fits == best_fits) && (cpu_cap > best_cap))) { best_cap = cpu_cap; best_cpu = cpu; + best_fits = fits; } } @@ -6834,7 +6989,11 @@ static inline bool asym_fits_cpu(unsigned long util, int cpu) { if (sched_asym_cpucap_active()) - return util_fits_cpu(util, util_min, util_max, cpu); + /* + * Return true only if the cpu fully fits the task requirements + * which include the utilization and the performance hints. + */ + return (util_fits_cpu(util, util_min, util_max, cpu) > 0); return true; } @@ -7201,6 +7360,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024; struct root_domain *rd = this_rq()->rd; int cpu, best_energy_cpu, target = -1; + int prev_fits = -1, best_fits = -1; + unsigned long best_thermal_cap = 0; + unsigned long prev_thermal_cap = 0; struct sched_domain *sd; struct perf_domain *pd; struct energy_env eenv; @@ -7229,13 +7391,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) eenv_task_busy_time(&eenv, p, prev_cpu); for (; pd; pd = pd->next) { + unsigned long util_min = p_util_min, util_max = p_util_max; unsigned long cpu_cap, cpu_thermal_cap, util; unsigned long cur_delta, max_spare_cap = 0; unsigned long rq_util_min, rq_util_max; - unsigned long util_min, util_max; unsigned long prev_spare_cap = 0; int max_spare_cap_cpu = -1; unsigned long base_energy; + int fits, max_fits = -1; cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); @@ -7251,6 +7414,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) eenv.pd_cap = 0; for_each_cpu(cpu, cpus) { + struct rq *rq = cpu_rq(cpu); + eenv.pd_cap += cpu_thermal_cap; if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) @@ -7269,26 +7434,23 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * much capacity we can get out of the CPU; this is * aligned with sched_cpu_util(). */ - if (uclamp_is_used()) { - if (uclamp_rq_is_idle(cpu_rq(cpu))) { - util_min = p_util_min; - util_max = p_util_max; - } else { - /* - * Open code uclamp_rq_util_with() except for - * the clamp() part. Ie: apply max aggregation - * only. util_fits_cpu() logic requires to - * operate on non clamped util but must use the - * max-aggregated uclamp_{min, max}. - */ - rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); - rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); - - util_min = max(rq_util_min, p_util_min); - util_max = max(rq_util_max, p_util_max); - } + if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) { + /* + * Open code uclamp_rq_util_with() except for + * the clamp() part. Ie: apply max aggregation + * only. util_fits_cpu() logic requires to + * operate on non clamped util but must use the + * max-aggregated uclamp_{min, max}. + */ + rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN); + rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX); + + util_min = max(rq_util_min, p_util_min); + util_max = max(rq_util_max, p_util_max); } - if (!util_fits_cpu(util, util_min, util_max, cpu)) + + fits = util_fits_cpu(util, util_min, util_max, cpu); + if (!fits) continue; lsub_positive(&cpu_cap, util); @@ -7296,7 +7458,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (cpu == prev_cpu) { /* Always use prev_cpu as a candidate. */ prev_spare_cap = cpu_cap; - } else if (cpu_cap > max_spare_cap) { + prev_fits = fits; + } else if ((fits > max_fits) || + ((fits == max_fits) && (cpu_cap > max_spare_cap))) { /* * Find the CPU with the maximum spare capacity * among the remaining CPUs in the performance @@ -7304,6 +7468,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) */ max_spare_cap = cpu_cap; max_spare_cap_cpu = cpu; + max_fits = fits; } } @@ -7322,26 +7487,50 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (prev_delta < base_energy) goto unlock; prev_delta -= base_energy; + prev_thermal_cap = cpu_thermal_cap; best_delta = min(best_delta, prev_delta); } /* Evaluate the energy impact of using max_spare_cap_cpu. */ if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) { + /* Current best energy cpu fits better */ + if (max_fits < best_fits) + continue; + + /* + * Both don't fit performance hint (i.e. uclamp_min) + * but best energy cpu has better capacity. + */ + if ((max_fits < 0) && + (cpu_thermal_cap <= best_thermal_cap)) + continue; + cur_delta = compute_energy(&eenv, pd, cpus, p, max_spare_cap_cpu); /* CPU utilization has changed */ if (cur_delta < base_energy) goto unlock; cur_delta -= base_energy; - if (cur_delta < best_delta) { - best_delta = cur_delta; - best_energy_cpu = max_spare_cap_cpu; - } + + /* + * Both fit for the task but best energy cpu has lower + * energy impact. + */ + if ((max_fits > 0) && (best_fits > 0) && + (cur_delta >= best_delta)) + continue; + + best_delta = cur_delta; + best_energy_cpu = max_spare_cap_cpu; + best_fits = max_fits; + best_thermal_cap = cpu_thermal_cap; } } rcu_read_unlock(); - if (best_delta < prev_delta) + if ((best_fits > prev_fits) || + ((best_fits > 0) && (best_delta < prev_delta)) || + ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap))) target = best_energy_cpu; return target; @@ -8841,73 +9030,16 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity_orig = arch_scale_cpu_capacity(cpu); unsigned long capacity = scale_rt_capacity(cpu); struct sched_group *sdg = sd->groups; - struct rq *rq = cpu_rq(cpu); - rq->cpu_capacity_orig = capacity_orig; + cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); if (!capacity) capacity = 1; - rq->cpu_capacity = capacity; - - /* - * Detect if the performance domain is in capacity inversion state. - * - * Capacity inversion happens when another perf domain with equal or - * lower capacity_orig_of() ends up having higher capacity than this - * domain after subtracting thermal pressure. - * - * We only take into account thermal pressure in this detection as it's - * the only metric that actually results in *real* reduction of - * capacity due to performance points (OPPs) being dropped/become - * unreachable due to thermal throttling. - * - * We assume: - * * That all cpus in a perf domain have the same capacity_orig - * (same uArch). - * * Thermal pressure will impact all cpus in this perf domain - * equally. - */ - if (static_branch_unlikely(&sched_asym_cpucapacity)) { - unsigned long inv_cap = capacity_orig - thermal_load_avg(rq); - struct perf_domain *pd = rcu_dereference(rq->rd->pd); - - rq->cpu_capacity_inverted = 0; - - for (; pd; pd = pd->next) { - struct cpumask *pd_span = perf_domain_span(pd); - unsigned long pd_cap_orig, pd_cap; - - cpu = cpumask_any(pd_span); - pd_cap_orig = arch_scale_cpu_capacity(cpu); - - if (capacity_orig < pd_cap_orig) - continue; - - /* - * handle the case of multiple perf domains have the - * same capacity_orig but one of them is under higher - * thermal pressure. We record it as capacity - * inversion. - */ - if (capacity_orig == pd_cap_orig) { - pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu)); - - if (pd_cap > inv_cap) { - rq->cpu_capacity_inverted = inv_cap; - break; - } - } else if (pd_cap_orig > inv_cap) { - rq->cpu_capacity_inverted = inv_cap; - break; - } - } - } - - trace_sched_cpu_capacity_tp(rq); + cpu_rq(cpu)->cpu_capacity = capacity; + trace_sched_cpu_capacity_tp(cpu_rq(cpu)); sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = capacity; @@ -10135,24 +10267,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds); - if (sched_energy_enabled()) { - struct root_domain *rd = env->dst_rq->rd; - - if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) - goto out_balanced; - } - - local = &sds.local_stat; - busiest = &sds.busiest_stat; - /* There is no busy sibling group to pull tasks from */ if (!sds.busiest) goto out_balanced; + busiest = &sds.busiest_stat; + /* Misfit tasks should be dealt with regardless of the avg load */ if (busiest->group_type == group_misfit_task) goto force_balance; + if (sched_energy_enabled()) { + struct root_domain *rd = env->dst_rq->rd; + + if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) + goto out_balanced; + } + /* ASYM feature bypasses nice load balance check */ if (busiest->group_type == group_asym_packing) goto force_balance; @@ -10165,6 +10296,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (busiest->group_type == group_imbalanced) goto force_balance; + local = &sds.local_stat; /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. @@ -11728,7 +11860,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) /* * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed. */ -static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle) +static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq, + bool forceidle) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -11753,11 +11886,12 @@ void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi) se_fi_update(se, rq->core->core_forceidle_seq, in_fi); } -bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi) +bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, + bool in_fi) { struct rq *rq = task_rq(a); - struct sched_entity *sea = &a->se; - struct sched_entity *seb = &b->se; + const struct sched_entity *sea = &a->se; + const struct sched_entity *seb = &b->se; struct cfs_rq *cfs_rqa; struct cfs_rq *cfs_rqb; s64 delta; @@ -12474,6 +12608,11 @@ __init void init_sched_fair_class(void) for_each_possible_cpu(i) { zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i)); zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i)); + +#ifdef CONFIG_CFS_BANDWIDTH + INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i)); + INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list); +#endif } open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index f26ab2675f7d..e9ef66be2870 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -51,18 +51,22 @@ __setup("hlt", cpu_idle_nopoll_setup); static noinline int __cpuidle cpu_idle_poll(void) { + instrumentation_begin(); trace_cpu_idle(0, smp_processor_id()); stop_critical_timings(); - ct_idle_enter(); - local_irq_enable(); + ct_cpuidle_enter(); + raw_local_irq_enable(); while (!tif_need_resched() && (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); + raw_local_irq_disable(); - ct_idle_exit(); + ct_cpuidle_exit(); start_critical_timings(); trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); + local_irq_enable(); + instrumentation_end(); return 1; } @@ -75,7 +79,6 @@ void __weak arch_cpu_idle_dead(void) { } void __weak arch_cpu_idle(void) { cpu_idle_force_poll = 1; - raw_local_irq_enable(); } /** @@ -85,44 +88,20 @@ void __weak arch_cpu_idle(void) */ void __cpuidle default_idle_call(void) { - if (current_clr_polling_and_test()) { - local_irq_enable(); - } else { - + instrumentation_begin(); + if (!current_clr_polling_and_test()) { trace_cpu_idle(1, smp_processor_id()); stop_critical_timings(); - /* - * arch_cpu_idle() is supposed to enable IRQs, however - * we can't do that because of RCU and tracing. - * - * Trace IRQs enable here, then switch off RCU, and have - * arch_cpu_idle() use raw_local_irq_enable(). Note that - * ct_idle_enter() relies on lockdep IRQ state, so switch that - * last -- this is very similar to the entry code. - */ - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - ct_idle_enter(); - lockdep_hardirqs_on(_THIS_IP_); - + ct_cpuidle_enter(); arch_cpu_idle(); - - /* - * OK, so IRQs are enabled here, but RCU needs them disabled to - * turn itself back on.. funny thing is that disabling IRQs - * will cause tracing, which needs RCU. Jump through hoops to - * make it 'work'. - */ - raw_local_irq_disable(); - lockdep_hardirqs_off(_THIS_IP_); - ct_idle_exit(); - lockdep_hardirqs_on(_THIS_IP_); - raw_local_irq_enable(); + ct_cpuidle_exit(); start_critical_timings(); trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } + local_irq_enable(); + instrumentation_end(); } static int call_cpuidle_s2idle(struct cpuidle_driver *drv, diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 0c5be7ebb1dc..2ad881d07752 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -159,7 +159,8 @@ | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ - | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK) + | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \ + | MEMBARRIER_CMD_GET_REGISTRATIONS) static void ipi_mb(void *info) { @@ -540,6 +541,40 @@ static int membarrier_register_private_expedited(int flags) return 0; } +static int membarrier_get_registrations(void) +{ + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + int registrations_mask = 0, membarrier_state, i; + static const int states[] = { + MEMBARRIER_STATE_GLOBAL_EXPEDITED | + MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, + MEMBARRIER_STATE_PRIVATE_EXPEDITED | + MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE | + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY, + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ | + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY + }; + static const int registration_cmds[] = { + MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ + }; + BUILD_BUG_ON(ARRAY_SIZE(states) != ARRAY_SIZE(registration_cmds)); + + membarrier_state = atomic_read(&mm->membarrier_state); + for (i = 0; i < ARRAY_SIZE(states); ++i) { + if (membarrier_state & states[i]) { + registrations_mask |= registration_cmds[i]; + membarrier_state &= ~states[i]; + } + } + WARN_ON_ONCE(membarrier_state != 0); + return registrations_mask; +} + /** * sys_membarrier - issue memory barriers on a set of threads * @cmd: Takes command values defined in enum membarrier_cmd. @@ -623,6 +658,8 @@ SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id) return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id); case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ: return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ); + case MEMBARRIER_CMD_GET_REGISTRATIONS: + return membarrier_get_registrations(); default: return -EINVAL; } diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 8ac8b81bfee6..02e011cabe91 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1343,10 +1343,11 @@ void psi_trigger_destroy(struct psi_trigger *t) group = t->group; /* - * Wakeup waiters to stop polling. Can happen if cgroup is deleted - * from under a polling process. + * Wakeup waiters to stop polling and clear the queue to prevent it from + * being accessed later. Can happen if cgroup is deleted from under a + * polling process. */ - wake_up_interruptible(&t->event_wait); + wake_up_pollfree(&t->event_wait); mutex_lock(&group->trigger_lock); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ed2a47e4ddae..0a11f44adee5 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1777,6 +1777,8 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq) BUG_ON(idx >= MAX_RT_PRIO); queue = array->queue + idx; + if (SCHED_WARN_ON(list_empty(queue))) + return NULL; next = list_entry(queue->next, struct sched_rt_entity, run_list); return next; @@ -1789,7 +1791,8 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) do { rt_se = pick_next_rt_entity(rt_rq); - BUG_ON(!rt_se); + if (unlikely(!rt_se)) + return NULL; rt_rq = group_rt_rq(rt_se); } while (rt_rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 771f8ddb7053..3e8df6d31c1e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -248,7 +248,7 @@ static inline void update_avg(u64 *avg, u64 sample) #define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV) -static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) +static inline bool dl_entity_is_special(const struct sched_dl_entity *dl_se) { #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL return unlikely(dl_se->flags & SCHED_FLAG_SUGOV); @@ -260,8 +260,8 @@ static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) /* * Tells if entity @a should preempt entity @b. */ -static inline bool -dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) +static inline bool dl_entity_preempt(const struct sched_dl_entity *a, + const struct sched_dl_entity *b) { return dl_entity_is_special(a) || dl_time_before(a->deadline, b->deadline); @@ -645,6 +645,9 @@ struct cfs_rq { int throttled; int throttle_count; struct list_head throttled_list; +#ifdef CONFIG_SMP + struct list_head throttled_csd_list; +#endif #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -1041,7 +1044,6 @@ struct rq { unsigned long cpu_capacity; unsigned long cpu_capacity_orig; - unsigned long cpu_capacity_inverted; struct balance_callback *balance_callback; @@ -1154,6 +1156,11 @@ struct rq { /* Scratch cpumask to be temporarily used under rq_lock */ cpumask_var_t scratch_mask; + +#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP) + call_single_data_t cfsb_csd; + struct list_head cfsb_csd_list; +#endif }; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1236,7 +1243,8 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq) return &rq->__lock; } -bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi); +bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, + bool fi); /* * Helpers to check if the CPU's core cookie matches with the task's cookie @@ -1415,7 +1423,7 @@ static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) } /* runqueue on which this entity is (to be) queued */ -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se) { return se->cfs_rq; } @@ -1428,19 +1436,16 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) #else -static inline struct task_struct *task_of(struct sched_entity *se) -{ - return container_of(se, struct task_struct, se); -} +#define task_of(_se) container_of(_se, struct task_struct, se) -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +static inline struct cfs_rq *task_cfs_rq(const struct task_struct *p) { return &task_rq(p)->cfs; } -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se) { - struct task_struct *p = task_of(se); + const struct task_struct *p = task_of(se); struct rq *rq = task_rq(p); return &rq->cfs; @@ -2893,24 +2898,6 @@ static inline unsigned long capacity_orig_of(int cpu) return cpu_rq(cpu)->cpu_capacity_orig; } -/* - * Returns inverted capacity if the CPU is in capacity inversion state. - * 0 otherwise. - * - * Capacity inversion detection only considers thermal impact where actual - * performance points (OPPs) gets dropped. - * - * Capacity inversion state happens when another performance domain that has - * equal or lower capacity_orig_of() becomes effectively larger than the perf - * domain this CPU belongs to due to thermal pressure throttling it hard. - * - * See comment in update_cpu_capacity(). - */ -static inline unsigned long cpu_in_capacity_inversion(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_inverted; -} - /** * enum cpu_util_type - CPU utilization type * @FREQUENCY_UTIL: Utilization used to select frequency @@ -3261,4 +3248,62 @@ static inline void update_current_exec_runtime(struct task_struct *curr, cgroup_account_cputime(curr, delta_exec); } +#ifdef CONFIG_SCHED_MM_CID +static inline int __mm_cid_get(struct mm_struct *mm) +{ + struct cpumask *cpumask; + int cid; + + cpumask = mm_cidmask(mm); + cid = cpumask_first_zero(cpumask); + if (cid >= nr_cpu_ids) + return -1; + __cpumask_set_cpu(cid, cpumask); + return cid; +} + +static inline void mm_cid_put(struct mm_struct *mm, int cid) +{ + lockdep_assert_irqs_disabled(); + if (cid < 0) + return; + raw_spin_lock(&mm->cid_lock); + __cpumask_clear_cpu(cid, mm_cidmask(mm)); + raw_spin_unlock(&mm->cid_lock); +} + +static inline int mm_cid_get(struct mm_struct *mm) +{ + int ret; + + lockdep_assert_irqs_disabled(); + raw_spin_lock(&mm->cid_lock); + ret = __mm_cid_get(mm); + raw_spin_unlock(&mm->cid_lock); + return ret; +} + +static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) +{ + if (prev->mm_cid_active) { + if (next->mm_cid_active && next->mm == prev->mm) { + /* + * Context switch between threads in same mm, hand over + * the mm_cid from prev to next. + */ + next->mm_cid = prev->mm_cid; + prev->mm_cid = -1; + return; + } + mm_cid_put(prev->mm, prev->mm_cid); + prev->mm_cid = -1; + } + if (next->mm_cid_active) + next->mm_cid = mm_cid_get(next->mm); +} + +#else +static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { } +#endif + #endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8739c2a5a54e..d93c3379e901 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -578,7 +578,7 @@ out: */ struct root_domain def_root_domain; -void init_defrootdomain(void) +void __init init_defrootdomain(void) { init_rootdomain(&def_root_domain); @@ -2451,7 +2451,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) * Set up scheduler domains and groups. For now this just excludes isolated * CPUs, but could be used to exclude other special cases in the future. */ -int sched_init_domains(const struct cpumask *cpu_map) +int __init sched_init_domains(const struct cpumask *cpu_map) { int err; diff --git a/kernel/signal.c b/kernel/signal.c index ae26da61c4d9..8cb28f1df294 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2951,6 +2951,7 @@ void exit_signals(struct task_struct *tsk) cgroup_threadgroup_change_begin(tsk); if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) { + sched_mm_cid_exit_signals(tsk); tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); return; @@ -2961,6 +2962,7 @@ void exit_signals(struct task_struct *tsk) * From now this task is not visible for group-wide signals, * see wants_signal(), do_signal_stop(). */ + sched_mm_cid_exit_signals(tsk); tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); diff --git a/kernel/sys.c b/kernel/sys.c index 5fd54bf0e886..88b31f096fb2 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1442,6 +1442,8 @@ static int do_prlimit(struct task_struct *tsk, unsigned int resource, if (resource >= RLIM_NLIMITS) return -EINVAL; + resource = array_index_nospec(resource, RLIM_NLIMITS); + if (new_rlim) { if (new_rlim->rlim_cur > new_rlim->rlim_max) return -EINVAL; diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index a41753be1a2b..bae8f11070be 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -200,10 +200,14 @@ config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US int "Clocksource watchdog maximum allowable skew (in μs)" depends on CLOCKSOURCE_WATCHDOG range 50 1000 - default 100 + default 125 help Specify the maximum amount of allowable watchdog skew in microseconds before reporting the clocksource to be unstable. + The default is based on a half-second clocksource watchdog + interval and NTP's maximum frequency drift of 500 parts + per million. If the clocksource is good enough for NTP, + it is good enough for the clocksource watchdog! endmenu endif diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 5897828b9d7e..7e5dff602585 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -470,11 +470,35 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) } EXPORT_SYMBOL_GPL(alarm_forward); -u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) +static u64 __alarm_forward_now(struct alarm *alarm, ktime_t interval, bool throttle) { struct alarm_base *base = &alarm_bases[alarm->type]; + ktime_t now = base->get_ktime(); + + if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && throttle) { + /* + * Same issue as with posix_timer_fn(). Timers which are + * periodic but the signal is ignored can starve the system + * with a very small interval. The real fix which was + * promised in the context of posix_timer_fn() never + * materialized, but someone should really work on it. + * + * To prevent DOS fake @now to be 1 jiffie out which keeps + * the overrun accounting correct but creates an + * inconsistency vs. timer_gettime(2). + */ + ktime_t kj = NSEC_PER_SEC / HZ; + + if (interval < kj) + now = ktime_add(now, kj); + } + + return alarm_forward(alarm, now, interval); +} - return alarm_forward(alarm, base->get_ktime(), interval); +u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) +{ + return __alarm_forward_now(alarm, interval, false); } EXPORT_SYMBOL_GPL(alarm_forward_now); @@ -551,9 +575,10 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, if (posix_timer_event(ptr, si_private) && ptr->it_interval) { /* * Handle ignored signals and rearm the timer. This will go - * away once we handle ignored signals proper. + * away once we handle ignored signals proper. Ensure that + * small intervals cannot starve the system. */ - ptr->it_overrun += alarm_forward_now(alarm, ptr->it_interval); + ptr->it_overrun += __alarm_forward_now(alarm, ptr->it_interval, true); ++ptr->it_requeue_pending; ptr->it_active = 1; result = ALARMTIMER_RESTART; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 9cf32ccda715..91836b727cef 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -96,6 +96,11 @@ static int finished_booting; static u64 suspend_start; /* + * Interval: 0.5sec. + */ +#define WATCHDOG_INTERVAL (HZ >> 1) + +/* * Threshold: 0.0312s, when doubled: 0.0625s. * Also a default for cs->uncertainty_margin when registering clocks. */ @@ -106,11 +111,14 @@ static u64 suspend_start; * clocksource surrounding a read of the clocksource being validated. * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as * a lower bound for cs->uncertainty_margin values when registering clocks. + * + * The default of 500 parts per million is based on NTP's limits. + * If a clocksource is good enough for NTP, it is good enough for us! */ #ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US #define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US #else -#define MAX_SKEW_USEC 100 +#define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ) #endif #define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC) @@ -140,11 +148,6 @@ static inline void clocksource_watchdog_unlock(unsigned long *flags) static int clocksource_watchdog_kthread(void *data); static void __clocksource_change_rating(struct clocksource *cs, int rating); -/* - * Interval: 0.5sec. - */ -#define WATCHDOG_INTERVAL (HZ >> 1) - static void clocksource_watchdog_work(struct work_struct *work) { /* @@ -257,8 +260,8 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, goto skip_test; } - pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n", - smp_processor_id(), watchdog->name, wd_delay, nretries); + pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n", + smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name); return WD_READ_UNSTABLE; skip_test: @@ -384,6 +387,15 @@ void clocksource_verify_percpu(struct clocksource *cs) } EXPORT_SYMBOL_GPL(clocksource_verify_percpu); +static inline void clocksource_reset_watchdog(void) +{ + struct clocksource *cs; + + list_for_each_entry(cs, &watchdog_list, wd_list) + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; +} + + static void clocksource_watchdog(struct timer_list *unused) { u64 csnow, wdnow, cslast, wdlast, delta; @@ -391,6 +403,7 @@ static void clocksource_watchdog(struct timer_list *unused) int64_t wd_nsec, cs_nsec; struct clocksource *cs; enum wd_read_status read_ret; + unsigned long extra_wait = 0; u32 md; spin_lock(&watchdog_lock); @@ -410,13 +423,30 @@ static void clocksource_watchdog(struct timer_list *unused) read_ret = cs_watchdog_read(cs, &csnow, &wdnow); - if (read_ret != WD_READ_SUCCESS) { - if (read_ret == WD_READ_UNSTABLE) - /* Clock readout unreliable, so give it up. */ - __clocksource_unstable(cs); + if (read_ret == WD_READ_UNSTABLE) { + /* Clock readout unreliable, so give it up. */ + __clocksource_unstable(cs); continue; } + /* + * When WD_READ_SKIP is returned, it means the system is likely + * under very heavy load, where the latency of reading + * watchdog/clocksource is very big, and affect the accuracy of + * watchdog check. So give system some space and suspend the + * watchdog check for 5 minutes. + */ + if (read_ret == WD_READ_SKIP) { + /* + * As the watchdog timer will be suspended, and + * cs->last could keep unchanged for 5 minutes, reset + * the counters. + */ + clocksource_reset_watchdog(); + extra_wait = HZ * 300; + break; + } + /* Clocksource initialized ? */ if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || atomic_read(&watchdog_reset_pending)) { @@ -443,12 +473,20 @@ static void clocksource_watchdog(struct timer_list *unused) /* Check the deviation from the watchdog clocksource. */ md = cs->uncertainty_margin + watchdog->uncertainty_margin; if (abs(cs_nsec - wd_nsec) > md) { + u64 cs_wd_msec; + u64 wd_msec; + u32 wd_rem; + pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n", smp_processor_id(), cs->name); pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n", watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask); pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n", cs->name, cs_nsec, csnow, cslast, cs->mask); + cs_wd_msec = div_u64_rem(cs_nsec - wd_nsec, 1000U * 1000U, &wd_rem); + wd_msec = div_u64_rem(wd_nsec, 1000U * 1000U, &wd_rem); + pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n", + cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec); if (curr_clocksource == cs) pr_warn(" '%s' is current clocksource.\n", cs->name); else if (curr_clocksource) @@ -512,7 +550,7 @@ static void clocksource_watchdog(struct timer_list *unused) * pair clocksource_stop_watchdog() clocksource_start_watchdog(). */ if (!timer_pending(&watchdog_timer)) { - watchdog_timer.expires += WATCHDOG_INTERVAL; + watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait; add_timer_on(&watchdog_timer, next_cpu); } out: @@ -537,14 +575,6 @@ static inline void clocksource_stop_watchdog(void) watchdog_running = 0; } -static inline void clocksource_reset_watchdog(void) -{ - struct clocksource *cs; - - list_for_each_entry(cs, &watchdog_list, wd_list) - cs->flags &= ~CLOCK_SOURCE_WATCHDOG; -} - static void clocksource_resume_watchdog(void) { atomic_inc(&watchdog_reset_pending); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3ae661ab6260..e8c08292defc 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2089,7 +2089,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, u64 slack; slack = current->timer_slack_ns; - if (dl_task(current) || rt_task(current)) + if (rt_task(current)) slack = 0; hrtimer_init_sleeper_on_stack(&t, clockid, mode); @@ -2126,6 +2126,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, if (!timespec64_valid(&tu)) return -EINVAL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, @@ -2147,6 +2148,7 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, if (!timespec64_valid(&tu)) return -EINVAL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, @@ -2270,7 +2272,7 @@ void __init hrtimers_init(void) /** * schedule_hrtimeout_range_clock - sleep until timeout * @expires: timeout value (ktime_t) - * @delta: slack in expires timeout (ktime_t) + * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks * @mode: timer mode * @clock_id: timer clock to be used */ @@ -2297,6 +2299,13 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, return -EINTR; } + /* + * Override any slack passed by the user if under + * rt contraints. + */ + if (rt_task(current)) + delta = 0; + hrtimer_init_sleeper_on_stack(&t, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_sleeper_start_expires(&t, mode); @@ -2316,7 +2325,7 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); /** * schedule_hrtimeout_range - sleep until timeout * @expires: timeout value (ktime_t) - * @delta: slack in expires timeout (ktime_t) + * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks * @mode: timer mode * * Make the current task sleep until the given expiry time has @@ -2324,7 +2333,8 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); * the current task state has been set (see set_current_state()). * * The @delta argument gives the kernel the freedom to schedule the - * actual wakeup to a time that is both power and performance friendly. + * actual wakeup to a time that is both power and performance friendly + * for regular (non RT/DL) tasks. * The kernel give the normal best effort behavior for "@expires+@delta", * but may decide to fire the timer earlier, but no earlier than @expires. * diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index cb925e8ef9a8..2f5e9b34022c 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -243,13 +243,12 @@ static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, */ static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime) { - u64 curr_cputime; -retry: - curr_cputime = atomic64_read(cputime); - if (sum_cputime > curr_cputime) { - if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime) - goto retry; - } + u64 curr_cputime = atomic64_read(cputime); + + do { + if (sum_cputime <= curr_cputime) + return; + } while (!atomic64_try_cmpxchg(cputime, &curr_cputime, sum_cputime)); } static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 90ea5f373e50..828aeecbd1e8 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -147,6 +147,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; texp = timespec64_to_ktime(t); @@ -240,6 +241,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; texp = timespec64_to_ktime(t); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 5dead89308b7..0c8a87a11b39 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1270,6 +1270,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; @@ -1297,6 +1298,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; + current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c index 13b11eb62685..20d5df631570 100644 --- a/kernel/time/test_udelay.c +++ b/kernel/time/test_udelay.c @@ -149,7 +149,7 @@ module_init(udelay_test_init); static void __exit udelay_test_exit(void) { mutex_lock(&udelay_test_lock); - debugfs_remove(debugfs_lookup(DEBUGFS_FILENAME, NULL)); + debugfs_lookup_and_remove(DEBUGFS_FILENAME, NULL); mutex_unlock(&udelay_test_lock); } diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 797eb93103ad..e28f9210f8a1 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -56,25 +56,20 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) * hrtimer callback function is currently running, then * hrtimer_start() cannot move it and the timer stays on the CPU on * which it is assigned at the moment. + */ + hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD); + /* + * The core tick broadcast mode expects bc->bound_on to be set + * correctly to prevent a CPU which has the broadcast hrtimer + * armed from going deep idle. * - * As this can be called from idle code, the hrtimer_start() - * invocation has to be wrapped with RCU_NONIDLE() as - * hrtimer_start() can call into tracing. + * As tick_broadcast_lock is held, nothing can change the cpu + * base which was just established in hrtimer_start() above. So + * the below access is safe even without holding the hrtimer + * base lock. */ - RCU_NONIDLE( { - hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD); - /* - * The core tick broadcast mode expects bc->bound_on to be set - * correctly to prevent a CPU which has the broadcast hrtimer - * armed from going deep idle. - * - * As tick_broadcast_lock is held, nothing can change the cpu - * base which was just established in hrtimer_start() above. So - * the below access is safe even without holding the hrtimer - * base lock. - */ - bc->bound_on = bctimer.base->cpu_base->cpu; - } ); + bc->bound_on = bctimer.base->cpu_base->cpu; + return 0; } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f7fe6fe36173..93bf2b4e47e5 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -622,9 +622,13 @@ struct cpumask *tick_get_broadcast_oneshot_mask(void) * to avoid a deep idle transition as we are about to get the * broadcast IPI right away. */ -int tick_check_broadcast_expired(void) +noinstr int tick_check_broadcast_expired(void) { +#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H + return arch_test_bit(smp_processor_id(), cpumask_bits(tick_broadcast_force_mask)); +#else return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask); +#endif } /* diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 475ecceda768..5e2c2c26b3cc 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -18,7 +18,7 @@ #include "tick-internal.h" /** - * tick_program_event + * tick_program_event - program the CPU local timer device for the next event */ int tick_program_event(ktime_t expires, int force) { @@ -99,7 +99,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) } /** - * tick_check_oneshot_mode - check whether the system is in oneshot mode + * tick_oneshot_mode_active - check whether the system is in oneshot mode * * returns 1 when either nohz or highres are enabled. otherwise 0. */ diff --git a/kernel/time/time.c b/kernel/time/time.c index 526257b3727c..f4198af60fee 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -462,7 +462,7 @@ struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec) EXPORT_SYMBOL(ns_to_kernel_old_timeval); /** - * set_normalized_timespec - set timespec sec and nsec parts and normalize + * set_normalized_timespec64 - set timespec sec and nsec parts and normalize * * @ts: pointer to timespec variable to be set * @sec: seconds to set @@ -526,7 +526,7 @@ struct timespec64 ns_to_timespec64(s64 nsec) EXPORT_SYMBOL(ns_to_timespec64); /** - * msecs_to_jiffies: - convert milliseconds to jiffies + * __msecs_to_jiffies: - convert milliseconds to jiffies * @m: time in milliseconds * * conversion is done as follows: @@ -541,12 +541,12 @@ EXPORT_SYMBOL(ns_to_timespec64); * handling any 32-bit overflows. * for the details see __msecs_to_jiffies() * - * msecs_to_jiffies() checks for the passed in value being a constant + * __msecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the * code, __msecs_to_jiffies() is called if the value passed does not * allow constant folding and the actual conversion must be done at * runtime. - * the _msecs_to_jiffies helpers are the HZ dependent conversion + * The _msecs_to_jiffies helpers are the HZ dependent conversion * routines found in include/linux/jiffies.h */ unsigned long __msecs_to_jiffies(const unsigned int m) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f72b9f1de178..5579ead449f2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1590,10 +1590,10 @@ void __weak read_persistent_clock64(struct timespec64 *ts) /** * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset * from the boot. + * @wall_time: current time as returned by persistent clock + * @boot_offset: offset that is defined as wall_time - boot_time * * Weak dummy function for arches that do not yet support it. - * @wall_time: - current time as returned by persistent clock - * @boot_offset: - offset that is defined as wall_time - boot_time * * The default function calculates offset based on the current value of * local_clock(). This way architectures that support sched_clock() but don't @@ -1701,7 +1701,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, } #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE) -/** +/* * We have three kinds of time sources to use for sleep time * injection, the preference order is: * 1) non-stop clocksource @@ -1722,7 +1722,7 @@ bool timekeeping_rtc_skipresume(void) return !suspend_timing_needed; } -/** +/* * 1) can be determined whether to use or not only when doing * timekeeping_resume() which is invoked after rtc_suspend(), * so we can't skip rtc_suspend() surely if system has 1). diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 197545241ab8..d7043043f59c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -933,8 +933,8 @@ config RING_BUFFER_RECORD_RECURSION default y help The ring buffer has its own internal recursion. Although when - recursion happens it wont cause harm because of the protection, - but it does cause an unwanted overhead. Enabling this option will + recursion happens it won't cause harm because of the protection, + but it does cause unwanted overhead. Enabling this option will place where recursion was detected into the ftrace "recursed_functions" file. @@ -1017,8 +1017,8 @@ config RING_BUFFER_STARTUP_TEST The test runs for 10 seconds. This will slow your boot time by at least 10 more seconds. - At the end of the test, statics and more checks are done. - It will output the stats of each per cpu buffer. What + At the end of the test, statistics and more checks are done. + It will output the stats of each per cpu buffer: What was written, the sizes, what was read, what was lost, and other similar details. diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 918a7d12df8f..5743be559415 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -320,8 +320,8 @@ static void blk_trace_free(struct request_queue *q, struct blk_trace *bt) * under 'q->debugfs_dir', thus lookup and remove them. */ if (!bt->dir) { - debugfs_remove(debugfs_lookup("dropped", q->debugfs_dir)); - debugfs_remove(debugfs_lookup("msg", q->debugfs_dir)); + debugfs_lookup_and_remove("dropped", q->debugfs_dir); + debugfs_lookup_and_remove("msg", q->debugfs_dir); } else { debugfs_remove(bt->dir); } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3bbd3f0c810c..b8ac8b09c86f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -687,8 +687,7 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, } perf_sample_data_init(sd, 0, 0); - sd->raw = &raw; - sd->sample_flags |= PERF_SAMPLE_RAW; + perf_sample_save_raw_data(sd, &raw); err = __bpf_perf_event_output(regs, map, flags, sd); @@ -746,8 +745,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, perf_fetch_caller_regs(regs); perf_sample_data_init(sd, 0, 0); - sd->raw = &raw; - sd->sample_flags |= PERF_SAMPLE_RAW; + perf_sample_save_raw_data(sd, &raw); ret = __bpf_perf_event_output(regs, map, flags, sd); out: @@ -833,6 +831,7 @@ static void do_bpf_send_signal(struct irq_work *entry) work = container_of(entry, struct send_signal_irq_work, irq_work); group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type); + put_task_struct(work->task); } static int bpf_send_signal_common(u32 sig, enum pid_type type) @@ -848,6 +847,9 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type) return -EPERM; if (unlikely(!nmi_uaccess_okay())) return -EPERM; + /* Task should not be pid=1 to avoid kernel panic. */ + if (unlikely(is_global_init(current))) + return -EPERM; if (irqs_disabled()) { /* Do an early check on signal validity. Otherwise, @@ -864,7 +866,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type) * to the irq_work. The current task may change when queued * irq works get executed. */ - work->task = current; + work->task = get_task_struct(current); work->sig = sig; work->type = type; irq_work_queue(&work->irq_work); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 442438b93fe9..750aa3f08b25 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1248,12 +1248,17 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) call_rcu(&hash->rcu, __free_ftrace_hash_rcu); } +/** + * ftrace_free_filter - remove all filters for an ftrace_ops + * @ops - the ops to remove the filters from + */ void ftrace_free_filter(struct ftrace_ops *ops) { ftrace_ops_init(ops); free_ftrace_hash(ops->func_hash->filter_hash); free_ftrace_hash(ops->func_hash->notrace_hash); } +EXPORT_SYMBOL_GPL(ftrace_free_filter); static struct ftrace_hash *alloc_ftrace_hash(int size_bits) { @@ -5839,6 +5844,10 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi); * * Filters denote which functions should be enabled when tracing is enabled * If @ip is NULL, it fails to update filter. + * + * This can allocate memory which must be freed before @ops can be freed, + * either by removing each filtered addr or by using + * ftrace_free_filter(@ops). */ int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, int remove, int reset) @@ -5858,7 +5867,11 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); * * Filters denote which functions should be enabled when tracing is enabled * If @ips array or any ip specified within is NULL , it fails to update filter. - */ + * + * This can allocate memory which must be freed before @ops can be freed, + * either by removing each filtered addr or by using + * ftrace_free_filter(@ops). +*/ int ftrace_set_filter_ips(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt, int remove, int reset) { @@ -5900,6 +5913,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, * * Filters denote which functions should be enabled when tracing is enabled. * If @buf is NULL and reset is set, all functions will be enabled for tracing. + * + * This can allocate memory which must be freed before @ops can be freed, + * either by removing each filtered addr or by using + * ftrace_free_filter(@ops). */ int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, int len, int reset) @@ -5919,6 +5936,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter); * Notrace Filters denote which functions should not be enabled when tracing * is enabled. If @buf is NULL and reset is set, all functions will be enabled * for tracing. + * + * This can allocate memory which must be freed before @ops can be freed, + * either by removing each filtered addr or by using + * ftrace_free_filter(@ops). */ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, int len, int reset) diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 6c97cc2d754a..7e9061828c24 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -516,7 +516,7 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user struct rv_monitor_def *mdef; int retval = -EINVAL; bool enable = true; - char *ptr = buff; + char *ptr; int len; if (count < 1 || count > MAX_RV_MONITOR_NAME_SIZE + 1) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a555a861b978..54a163ae4815 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3128,6 +3128,9 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, return; } + if (WARN_ON_ONCE(IS_ENABLED(CONFIG_GENERIC_ENTRY))) + return; + /* * When an NMI triggers, RCU is enabled via ct_nmi_enter(), * but if the above rcu_is_watching() failed, then the NMI @@ -9148,9 +9151,6 @@ buffer_percent_write(struct file *filp, const char __user *ubuf, if (val > 100) return -EINVAL; - if (!val) - val = 1; - tr->buffer_percent = val; (*ppos)++; @@ -10295,6 +10295,8 @@ void __init early_trace_init(void) static_key_enable(&tracepoint_printk_key.key); } tracer_alloc_buffers(); + + init_events(); } void __init trace_init(void) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e46a49269be2..085a31b978a5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1282,6 +1282,7 @@ struct ftrace_event_field { int offset; int size; int is_signed; + int len; }; struct prog_entry; @@ -1490,6 +1491,7 @@ extern void trace_event_enable_cmd_record(bool enable); extern void trace_event_enable_tgid_record(bool enable); extern int event_trace_init(void); +extern int init_events(void); extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); extern int event_trace_del_tracer(struct trace_array *tr); extern void __trace_early_add_events(struct trace_array *tr); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 33e0b4f8ebe6..6a942fa275c7 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -114,7 +114,7 @@ trace_find_event_field(struct trace_event_call *call, char *name) static int __trace_define_field(struct list_head *head, const char *type, const char *name, int offset, int size, - int is_signed, int filter_type) + int is_signed, int filter_type, int len) { struct ftrace_event_field *field; @@ -133,6 +133,7 @@ static int __trace_define_field(struct list_head *head, const char *type, field->offset = offset; field->size = size; field->is_signed = is_signed; + field->len = len; list_add(&field->link, head); @@ -150,14 +151,28 @@ int trace_define_field(struct trace_event_call *call, const char *type, head = trace_get_fields(call); return __trace_define_field(head, type, name, offset, size, - is_signed, filter_type); + is_signed, filter_type, 0); } EXPORT_SYMBOL_GPL(trace_define_field); +static int trace_define_field_ext(struct trace_event_call *call, const char *type, + const char *name, int offset, int size, int is_signed, + int filter_type, int len) +{ + struct list_head *head; + + if (WARN_ON(!call->class)) + return 0; + + head = trace_get_fields(call); + return __trace_define_field(head, type, name, offset, size, + is_signed, filter_type, len); +} + #define __generic_field(type, item, filter_type) \ ret = __trace_define_field(&ftrace_generic_fields, #type, \ #item, 0, 0, is_signed_type(type), \ - filter_type); \ + filter_type, 0); \ if (ret) \ return ret; @@ -166,7 +181,7 @@ EXPORT_SYMBOL_GPL(trace_define_field); "common_" #item, \ offsetof(typeof(ent), item), \ sizeof(ent.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), FILTER_OTHER, 0); \ if (ret) \ return ret; @@ -1588,12 +1603,17 @@ static int f_show(struct seq_file *m, void *v) seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", field->type, field->name, field->offset, field->size, !!field->is_signed); - else - seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", + else if (field->len) + seq_printf(m, "\tfield:%.*s %s[%d];\toffset:%u;\tsize:%u;\tsigned:%d;\n", (int)(array_descriptor - field->type), field->type, field->name, - array_descriptor, field->offset, + field->len, field->offset, field->size, !!field->is_signed); + else + seq_printf(m, "\tfield:%.*s %s[];\toffset:%u;\tsize:%u;\tsigned:%d;\n", + (int)(array_descriptor - field->type), + field->type, field->name, + field->offset, field->size, !!field->is_signed); return 0; } @@ -2379,9 +2399,10 @@ event_define_fields(struct trace_event_call *call) } offset = ALIGN(offset, field->align); - ret = trace_define_field(call, field->type, field->name, + ret = trace_define_field_ext(call, field->type, field->name, offset, field->size, - field->is_signed, field->filter_type); + field->is_signed, field->filter_type, + field->len); if (WARN_ON_ONCE(ret)) { pr_err("error code is %d\n", ret); break; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 96acc2b71ac7..e095c3b3a50d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -128,7 +128,7 @@ static bool is_not(const char *str) } /** - * prog_entry - a singe entry in the filter program + * struct prog_entry - a singe entry in the filter program * @target: Index to jump to on a branch (actually one minus the index) * @when_to_branch: The value of the result of the predicate to do a branch * @pred: The predicate to execute. @@ -140,16 +140,16 @@ struct prog_entry { }; /** - * update_preds- assign a program entry a label target + * update_preds - assign a program entry a label target * @prog: The program array * @N: The index of the current entry in @prog - * @when_to_branch: What to assign a program entry for its branch condition + * @invert: What to assign a program entry for its branch condition * * The program entry at @N has a target that points to the index of a program * entry that can have its target and when_to_branch fields updated. * Update the current program entry denoted by index @N target field to be * that of the updated entry. This will denote the entry to update if - * we are processing an "||" after an "&&" + * we are processing an "||" after an "&&". */ static void update_preds(struct prog_entry *prog, int N, int invert) { diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index fcaf226b7744..5edbf6b1da3f 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1988,6 +1988,8 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, hist_field->fn_num = flags & HIST_FIELD_FL_LOG2 ? HIST_FIELD_FN_LOG2 : HIST_FIELD_FN_BUCKET; hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL); + if (!hist_field->operands[0]) + goto free; hist_field->size = hist_field->operands[0]->size; hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL); if (!hist_field->type) diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d960f6b11b5e..58f3946081e2 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -111,7 +111,8 @@ static void __always_unused ____ftrace_check_##name(void) \ #define __array(_type, _item, _len) { \ .type = #_type"["__stringify(_len)"]", .name = #_item, \ .size = sizeof(_type[_len]), .align = __alignof__(_type), \ - is_signed_type(_type), .filter_type = FILTER_OTHER }, + is_signed_type(_type), .filter_type = FILTER_OTHER, \ + .len = _len }, #undef __array_desc #define __array_desc(_type, _container, _item, _len) __array(_type, _item, _len) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 94c1b5eb1dc0..210e1f168392 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -147,9 +147,8 @@ static void osnoise_unregister_instance(struct trace_array *tr) * register/unregister serialization is provided by trace's * trace_types_lock. */ - lockdep_assert_held(&trace_types_lock); - - list_for_each_entry_rcu(inst, &osnoise_instances, list) { + list_for_each_entry_rcu(inst, &osnoise_instances, list, + lockdep_is_held(&trace_types_lock)) { if (inst->tr == tr) { list_del_rcu(&inst->list); found = 1; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 57a13b61f186..bd475a00f96d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1535,7 +1535,7 @@ static struct trace_event *events[] __initdata = { NULL }; -__init static int init_events(void) +__init int init_events(void) { struct trace_event *event; int i, ret; @@ -1548,4 +1548,3 @@ __init static int init_events(void) return 0; } -early_initcall(init_events); diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 1e130da1b742..e37446f7916e 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -15,6 +15,20 @@ #define CREATE_TRACE_POINTS #include <trace/events/preemptirq.h> +/* + * Use regular trace points on architectures that implement noinstr + * tooling: these calls will only happen with RCU enabled, which can + * use a regular tracepoint. + * + * On older architectures, use the rcuidle tracing methods (which + * aren't NMI-safe - so exclude NMI contexts): + */ +#ifdef CONFIG_ARCH_WANTS_NO_INSTR +#define trace(point) trace_##point +#else +#define trace(point) if (!in_nmi()) trace_##point##_rcuidle +#endif + #ifdef CONFIG_TRACE_IRQFLAGS /* Per-cpu variable to prevent redundant calls when IRQs already off */ static DEFINE_PER_CPU(int, tracing_irq_cpu); @@ -28,8 +42,7 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu); void trace_hardirqs_on_prepare(void) { if (this_cpu_read(tracing_irq_cpu)) { - if (!in_nmi()) - trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1); + trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1); tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); this_cpu_write(tracing_irq_cpu, 0); } @@ -40,8 +53,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_prepare); void trace_hardirqs_on(void) { if (this_cpu_read(tracing_irq_cpu)) { - if (!in_nmi()) - trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); + trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1); tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); this_cpu_write(tracing_irq_cpu, 0); } @@ -63,8 +75,7 @@ void trace_hardirqs_off_finish(void) if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); - if (!in_nmi()) - trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1); + trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1); } } @@ -78,56 +89,24 @@ void trace_hardirqs_off(void) if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); - if (!in_nmi()) - trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); + trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1); } } EXPORT_SYMBOL(trace_hardirqs_off); NOKPROBE_SYMBOL(trace_hardirqs_off); - -__visible void trace_hardirqs_on_caller(unsigned long caller_addr) -{ - if (this_cpu_read(tracing_irq_cpu)) { - if (!in_nmi()) - trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr); - tracer_hardirqs_on(CALLER_ADDR0, caller_addr); - this_cpu_write(tracing_irq_cpu, 0); - } - - lockdep_hardirqs_on_prepare(); - lockdep_hardirqs_on(caller_addr); -} -EXPORT_SYMBOL(trace_hardirqs_on_caller); -NOKPROBE_SYMBOL(trace_hardirqs_on_caller); - -__visible void trace_hardirqs_off_caller(unsigned long caller_addr) -{ - lockdep_hardirqs_off(caller_addr); - - if (!this_cpu_read(tracing_irq_cpu)) { - this_cpu_write(tracing_irq_cpu, 1); - tracer_hardirqs_off(CALLER_ADDR0, caller_addr); - if (!in_nmi()) - trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); - } -} -EXPORT_SYMBOL(trace_hardirqs_off_caller); -NOKPROBE_SYMBOL(trace_hardirqs_off_caller); #endif /* CONFIG_TRACE_IRQFLAGS */ #ifdef CONFIG_TRACE_PREEMPT_TOGGLE void trace_preempt_on(unsigned long a0, unsigned long a1) { - if (!in_nmi()) - trace_preempt_enable_rcuidle(a0, a1); + trace(preempt_enable)(a0, a1); tracer_preempt_on(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { - if (!in_nmi()) - trace_preempt_disable_rcuidle(a0, a1); + trace(preempt_disable)(a0, a1); tracer_preempt_off(a0, a1); } #endif diff --git a/kernel/umh.c b/kernel/umh.c index 850631518665..fbf872c624cb 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -438,21 +438,27 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) if (wait == UMH_NO_WAIT) /* task has freed sub_info */ goto unlock; - if (wait & UMH_KILLABLE) - state |= TASK_KILLABLE; - if (wait & UMH_FREEZABLE) state |= TASK_FREEZABLE; - retval = wait_for_completion_state(&done, state); - if (!retval) - goto wait_done; - if (wait & UMH_KILLABLE) { + retval = wait_for_completion_state(&done, state | TASK_KILLABLE); + if (!retval) + goto wait_done; + /* umh_complete() will see NULL and free sub_info */ if (xchg(&sub_info->complete, NULL)) goto unlock; + + /* + * fallthrough; in case of -ERESTARTSYS now do uninterruptible + * wait_for_completion_state(). Since umh_complete() shall call + * complete() in a moment if xchg() above returned NULL, this + * uninterruptible wait_for_completion_state() will not block + * SIGKILL'ed processes for long. + */ } + wait_for_completion_state(&done, state); wait_done: retval = sub_info->retval; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 07895deca271..b8b541caed48 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -169,7 +169,9 @@ struct worker_pool { struct list_head idle_list; /* L: list of idle workers */ struct timer_list idle_timer; /* L: worker idle timeout */ - struct timer_list mayday_timer; /* L: SOS timer for workers */ + struct work_struct idle_cull_work; /* L: worker idle cleanup */ + + struct timer_list mayday_timer; /* L: SOS timer for workers */ /* a workers is either on busy_hash or idle_list, or the manager */ DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); @@ -177,6 +179,7 @@ struct worker_pool { struct worker *manager; /* L: purely informational */ struct list_head workers; /* A: attached workers */ + struct list_head dying_workers; /* A: workers about to die */ struct completion *detach_completion; /* all workers detached */ struct ida worker_ida; /* worker IDs for task name */ @@ -326,7 +329,7 @@ static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait); static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ -/* PL: allowable cpus for unbound wqs and work items */ +/* PL&A: allowable cpus for unbound wqs and work items */ static cpumask_var_t wq_unbound_cpumask; /* CPU where unbound work was last round robin scheduled from this CPU */ @@ -1433,9 +1436,13 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, lockdep_assert_irqs_disabled(); - /* if draining, only works from the same workqueue are allowed */ - if (unlikely(wq->flags & __WQ_DRAINING) && - WARN_ON_ONCE(!is_chained_work(wq))) + /* + * For a draining wq, only works from the same workqueue are + * allowed. The __WQ_DESTROYING helps to spot the issue that + * queues a new work item to a wq after destroy_workqueue(wq). + */ + if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) && + WARN_ON_ONCE(!is_chained_work(wq)))) return; rcu_read_lock(); retry: @@ -1900,7 +1907,7 @@ static void worker_detach_from_pool(struct worker *worker) list_del(&worker->node); worker->pool = NULL; - if (list_empty(&pool->workers)) + if (list_empty(&pool->workers) && list_empty(&pool->dying_workers)) detach_completion = pool->detach_completion; mutex_unlock(&wq_pool_attach_mutex); @@ -1972,21 +1979,55 @@ fail: return NULL; } +static void unbind_worker(struct worker *worker) +{ + lockdep_assert_held(&wq_pool_attach_mutex); + + kthread_set_per_cpu(worker->task, -1); + if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); + else + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); +} + +static void wake_dying_workers(struct list_head *cull_list) +{ + struct worker *worker, *tmp; + + list_for_each_entry_safe(worker, tmp, cull_list, entry) { + list_del_init(&worker->entry); + unbind_worker(worker); + /* + * If the worker was somehow already running, then it had to be + * in pool->idle_list when set_worker_dying() happened or we + * wouldn't have gotten here. + * + * Thus, the worker must either have observed the WORKER_DIE + * flag, or have set its state to TASK_IDLE. Either way, the + * below will be observed by the worker and is safe to do + * outside of pool->lock. + */ + wake_up_process(worker->task); + } +} + /** - * destroy_worker - destroy a workqueue worker + * set_worker_dying - Tag a worker for destruction * @worker: worker to be destroyed + * @list: transfer worker away from its pool->idle_list and into list * - * Destroy @worker and adjust @pool stats accordingly. The worker should - * be idle. + * Tag @worker for destruction and adjust @pool stats accordingly. The worker + * should be idle. * * CONTEXT: * raw_spin_lock_irq(pool->lock). */ -static void destroy_worker(struct worker *worker) +static void set_worker_dying(struct worker *worker, struct list_head *list) { struct worker_pool *pool = worker->pool; lockdep_assert_held(&pool->lock); + lockdep_assert_held(&wq_pool_attach_mutex); /* sanity check frenzy */ if (WARN_ON(worker->current_work) || @@ -1997,34 +2038,94 @@ static void destroy_worker(struct worker *worker) pool->nr_workers--; pool->nr_idle--; - list_del_init(&worker->entry); worker->flags |= WORKER_DIE; - wake_up_process(worker->task); + + list_move(&worker->entry, list); + list_move(&worker->node, &pool->dying_workers); } +/** + * idle_worker_timeout - check if some idle workers can now be deleted. + * @t: The pool's idle_timer that just expired + * + * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in + * worker_leave_idle(), as a worker flicking between idle and active while its + * pool is at the too_many_workers() tipping point would cause too much timer + * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let + * it expire and re-evaluate things from there. + */ static void idle_worker_timeout(struct timer_list *t) { struct worker_pool *pool = from_timer(pool, t, idle_timer); + bool do_cull = false; + + if (work_pending(&pool->idle_cull_work)) + return; raw_spin_lock_irq(&pool->lock); - while (too_many_workers(pool)) { + if (too_many_workers(pool)) { struct worker *worker; unsigned long expires; /* idle_list is kept in LIFO order, check the last one */ worker = list_entry(pool->idle_list.prev, struct worker, entry); expires = worker->last_active + IDLE_WORKER_TIMEOUT; + do_cull = !time_before(jiffies, expires); + + if (!do_cull) + mod_timer(&pool->idle_timer, expires); + } + raw_spin_unlock_irq(&pool->lock); + + if (do_cull) + queue_work(system_unbound_wq, &pool->idle_cull_work); +} + +/** + * idle_cull_fn - cull workers that have been idle for too long. + * @work: the pool's work for handling these idle workers + * + * This goes through a pool's idle workers and gets rid of those that have been + * idle for at least IDLE_WORKER_TIMEOUT seconds. + * + * We don't want to disturb isolated CPUs because of a pcpu kworker being + * culled, so this also resets worker affinity. This requires a sleepable + * context, hence the split between timer callback and work item. + */ +static void idle_cull_fn(struct work_struct *work) +{ + struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work); + struct list_head cull_list; + + INIT_LIST_HEAD(&cull_list); + /* + * Grabbing wq_pool_attach_mutex here ensures an already-running worker + * cannot proceed beyong worker_detach_from_pool() in its self-destruct + * path. This is required as a previously-preempted worker could run after + * set_worker_dying() has happened but before wake_dying_workers() did. + */ + mutex_lock(&wq_pool_attach_mutex); + raw_spin_lock_irq(&pool->lock); + + while (too_many_workers(pool)) { + struct worker *worker; + unsigned long expires; + + worker = list_entry(pool->idle_list.prev, struct worker, entry); + expires = worker->last_active + IDLE_WORKER_TIMEOUT; if (time_before(jiffies, expires)) { mod_timer(&pool->idle_timer, expires); break; } - destroy_worker(worker); + set_worker_dying(worker, &cull_list); } raw_spin_unlock_irq(&pool->lock); + wake_dying_workers(&cull_list); + mutex_unlock(&wq_pool_attach_mutex); } static void send_mayday(struct work_struct *work) @@ -2388,12 +2489,12 @@ woke_up: /* am I supposed to die? */ if (unlikely(worker->flags & WORKER_DIE)) { raw_spin_unlock_irq(&pool->lock); - WARN_ON_ONCE(!list_empty(&worker->entry)); set_pf_worker(false); set_task_comm(worker->task, "kworker/dying"); ida_free(&pool->worker_ida, worker->id); worker_detach_from_pool(worker); + WARN_ON_ONCE(!list_empty(&worker->entry)); kfree(worker); return 0; } @@ -3462,10 +3563,12 @@ static int init_worker_pool(struct worker_pool *pool) hash_init(pool->busy_hash); timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE); + INIT_WORK(&pool->idle_cull_work, idle_cull_fn); timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0); INIT_LIST_HEAD(&pool->workers); + INIT_LIST_HEAD(&pool->dying_workers); ida_init(&pool->worker_ida); INIT_HLIST_NODE(&pool->hash_node); @@ -3540,18 +3643,6 @@ static void rcu_free_pool(struct rcu_head *rcu) kfree(pool); } -/* This returns with the lock held on success (pool manager is inactive). */ -static bool wq_manager_inactive(struct worker_pool *pool) -{ - raw_spin_lock_irq(&pool->lock); - - if (pool->flags & POOL_MANAGER_ACTIVE) { - raw_spin_unlock_irq(&pool->lock); - return false; - } - return true; -} - /** * put_unbound_pool - put a worker_pool * @pool: worker_pool to put @@ -3566,8 +3657,11 @@ static bool wq_manager_inactive(struct worker_pool *pool) static void put_unbound_pool(struct worker_pool *pool) { DECLARE_COMPLETION_ONSTACK(detach_completion); + struct list_head cull_list; struct worker *worker; + INIT_LIST_HEAD(&cull_list); + lockdep_assert_held(&wq_pool_mutex); if (--pool->refcnt) @@ -3587,20 +3681,38 @@ static void put_unbound_pool(struct worker_pool *pool) * Become the manager and destroy all workers. This prevents * @pool's workers from blocking on attach_mutex. We're the last * manager and @pool gets freed with the flag set. - * Because of how wq_manager_inactive() works, we will hold the - * spinlock after a successful wait. + * + * Having a concurrent manager is quite unlikely to happen as we can + * only get here with + * pwq->refcnt == pool->refcnt == 0 + * which implies no work queued to the pool, which implies no worker can + * become the manager. However a worker could have taken the role of + * manager before the refcnts dropped to 0, since maybe_create_worker() + * drops pool->lock */ - rcuwait_wait_event(&manager_wait, wq_manager_inactive(pool), - TASK_UNINTERRUPTIBLE); - pool->flags |= POOL_MANAGER_ACTIVE; + while (true) { + rcuwait_wait_event(&manager_wait, + !(pool->flags & POOL_MANAGER_ACTIVE), + TASK_UNINTERRUPTIBLE); + + mutex_lock(&wq_pool_attach_mutex); + raw_spin_lock_irq(&pool->lock); + if (!(pool->flags & POOL_MANAGER_ACTIVE)) { + pool->flags |= POOL_MANAGER_ACTIVE; + break; + } + raw_spin_unlock_irq(&pool->lock); + mutex_unlock(&wq_pool_attach_mutex); + } while ((worker = first_idle_worker(pool))) - destroy_worker(worker); + set_worker_dying(worker, &cull_list); WARN_ON(pool->nr_workers || pool->nr_idle); raw_spin_unlock_irq(&pool->lock); - mutex_lock(&wq_pool_attach_mutex); - if (!list_empty(&pool->workers)) + wake_dying_workers(&cull_list); + + if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers)) pool->detach_completion = &detach_completion; mutex_unlock(&wq_pool_attach_mutex); @@ -3609,6 +3721,7 @@ static void put_unbound_pool(struct worker_pool *pool) /* shut down the timers */ del_timer_sync(&pool->idle_timer); + cancel_work_sync(&pool->idle_cull_work); del_timer_sync(&pool->mayday_timer); /* RCU protected to allow dereferences from get_work_pool() */ @@ -3952,7 +4065,8 @@ static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx) /* allocate the attrs and pwqs for later installation */ static struct apply_wqattrs_ctx * apply_wqattrs_prepare(struct workqueue_struct *wq, - const struct workqueue_attrs *attrs) + const struct workqueue_attrs *attrs, + const cpumask_var_t unbound_cpumask) { struct apply_wqattrs_ctx *ctx; struct workqueue_attrs *new_attrs, *tmp_attrs; @@ -3968,14 +4082,15 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, goto out_free; /* - * Calculate the attrs of the default pwq. + * Calculate the attrs of the default pwq with unbound_cpumask + * which is wq_unbound_cpumask or to set to wq_unbound_cpumask. * If the user configured cpumask doesn't overlap with the * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask. */ copy_workqueue_attrs(new_attrs, attrs); - cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask); + cpumask_and(new_attrs->cpumask, new_attrs->cpumask, unbound_cpumask); if (unlikely(cpumask_empty(new_attrs->cpumask))) - cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask); + cpumask_copy(new_attrs->cpumask, unbound_cpumask); /* * We may create multiple pwqs with differing cpumasks. Make a @@ -4072,7 +4187,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, wq->flags &= ~__WQ_ORDERED; } - ctx = apply_wqattrs_prepare(wq, attrs); + ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask); if (!ctx) return -ENOMEM; @@ -4414,6 +4529,11 @@ void destroy_workqueue(struct workqueue_struct *wq) */ workqueue_sysfs_unregister(wq); + /* mark the workqueue destruction is in progress */ + mutex_lock(&wq->mutex); + wq->flags |= __WQ_DESTROYING; + mutex_unlock(&wq->mutex); + /* drain it before proceeding with destruction */ drain_workqueue(wq); @@ -4709,22 +4829,53 @@ static void pr_cont_pool_info(struct worker_pool *pool) pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice); } -static void pr_cont_work(bool comma, struct work_struct *work) +struct pr_cont_work_struct { + bool comma; + work_func_t func; + long ctr; +}; + +static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp) +{ + if (!pcwsp->ctr) + goto out_record; + if (func == pcwsp->func) { + pcwsp->ctr++; + return; + } + if (pcwsp->ctr == 1) + pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func); + else + pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func); + pcwsp->ctr = 0; +out_record: + if ((long)func == -1L) + return; + pcwsp->comma = comma; + pcwsp->func = func; + pcwsp->ctr = 1; +} + +static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp) { if (work->func == wq_barrier_func) { struct wq_barrier *barr; barr = container_of(work, struct wq_barrier, work); + pr_cont_work_flush(comma, (work_func_t)-1, pcwsp); pr_cont("%s BAR(%d)", comma ? "," : "", task_pid_nr(barr->task)); } else { - pr_cont("%s %ps", comma ? "," : "", work->func); + if (!comma) + pr_cont_work_flush(comma, (work_func_t)-1, pcwsp); + pr_cont_work_flush(comma, work->func, pcwsp); } } static void show_pwq(struct pool_workqueue *pwq) { + struct pr_cont_work_struct pcws = { .ctr = 0, }; struct worker_pool *pool = pwq->pool; struct work_struct *work; struct worker *worker; @@ -4757,7 +4908,8 @@ static void show_pwq(struct pool_workqueue *pwq) worker->rescue_wq ? "(RESCUER)" : "", worker->current_func); list_for_each_entry(work, &worker->scheduled, entry) - pr_cont_work(false, work); + pr_cont_work(false, work, &pcws); + pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); comma = true; } pr_cont("\n"); @@ -4777,9 +4929,10 @@ static void show_pwq(struct pool_workqueue *pwq) if (get_work_pwq(work) != pwq) continue; - pr_cont_work(comma, work); + pr_cont_work(comma, work, &pcws); comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); } + pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); pr_cont("\n"); } @@ -4788,9 +4941,10 @@ static void show_pwq(struct pool_workqueue *pwq) pr_info(" inactive:"); list_for_each_entry(work, &pwq->inactive_works, entry) { - pr_cont_work(comma, work); + pr_cont_work(comma, work, &pcws); comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); } + pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); pr_cont("\n"); } } @@ -5006,13 +5160,8 @@ static void unbind_workers(int cpu) raw_spin_unlock_irq(&pool->lock); - for_each_pool_worker(worker, pool) { - kthread_set_per_cpu(worker->task, -1); - if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); - else - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); - } + for_each_pool_worker(worker, pool) + unbind_worker(worker); mutex_unlock(&wq_pool_attach_mutex); } @@ -5334,7 +5483,7 @@ out_unlock: } #endif /* CONFIG_FREEZER */ -static int workqueue_apply_unbound_cpumask(void) +static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) { LIST_HEAD(ctxs); int ret = 0; @@ -5350,7 +5499,7 @@ static int workqueue_apply_unbound_cpumask(void) if (wq->flags & __WQ_ORDERED) continue; - ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs); + ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask); if (!ctx) { ret = -ENOMEM; break; @@ -5365,6 +5514,11 @@ static int workqueue_apply_unbound_cpumask(void) apply_wqattrs_cleanup(ctx); } + if (!ret) { + mutex_lock(&wq_pool_attach_mutex); + cpumask_copy(wq_unbound_cpumask, unbound_cpumask); + mutex_unlock(&wq_pool_attach_mutex); + } return ret; } @@ -5383,7 +5537,6 @@ static int workqueue_apply_unbound_cpumask(void) int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) { int ret = -EINVAL; - cpumask_var_t saved_cpumask; /* * Not excluding isolated cpus on purpose. @@ -5397,23 +5550,8 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) goto out_unlock; } - if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) { - ret = -ENOMEM; - goto out_unlock; - } - - /* save the old wq_unbound_cpumask. */ - cpumask_copy(saved_cpumask, wq_unbound_cpumask); - - /* update wq_unbound_cpumask at first and apply it to wqs. */ - cpumask_copy(wq_unbound_cpumask, cpumask); - ret = workqueue_apply_unbound_cpumask(); - - /* restore the wq_unbound_cpumask when failed. */ - if (ret < 0) - cpumask_copy(wq_unbound_cpumask, saved_cpumask); + ret = workqueue_apply_unbound_cpumask(cpumask); - free_cpumask_var(saved_cpumask); out_unlock: apply_wqattrs_unlock(); } |