diff options
Diffstat (limited to 'kernel')
36 files changed, 596 insertions, 414 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 17b0d523afb3..9ddfe2aa6671 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1101,13 +1101,11 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature audit_log_end(ab); } -static int audit_set_feature(struct sk_buff *skb) +static int audit_set_feature(struct audit_features *uaf) { - struct audit_features *uaf; int i; BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names)); - uaf = nlmsg_data(nlmsg_hdr(skb)); /* if there is ever a version 2 we should handle that here */ @@ -1175,6 +1173,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { u32 seq; void *data; + int data_len; int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; @@ -1188,6 +1187,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) seq = nlh->nlmsg_seq; data = nlmsg_data(nlh); + data_len = nlmsg_len(nlh); switch (msg_type) { case AUDIT_GET: { @@ -1211,7 +1211,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct audit_status s; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ - memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); + memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); if (s.mask & AUDIT_STATUS_ENABLED) { err = audit_set_enabled(s.enabled); if (err < 0) @@ -1315,7 +1315,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; break; case AUDIT_SET_FEATURE: - err = audit_set_feature(skb); + if (data_len < sizeof(struct audit_features)) + return -EINVAL; + err = audit_set_feature(data); if (err) return err; break; @@ -1327,6 +1329,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) err = audit_filter(msg_type, AUDIT_FILTER_USER); if (err == 1) { /* match or error */ + char *str = data; + err = 0; if (msg_type == AUDIT_USER_TTY) { err = tty_audit_push(); @@ -1334,26 +1338,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; } audit_log_user_recv_msg(&ab, msg_type); - if (msg_type != AUDIT_USER_TTY) + if (msg_type != AUDIT_USER_TTY) { + /* ensure NULL termination */ + str[data_len - 1] = '\0'; audit_log_format(ab, " msg='%.*s'", AUDIT_MESSAGE_TEXT_MAX, - (char *)data); - else { - int size; - + str); + } else { audit_log_format(ab, " data="); - size = nlmsg_len(nlh); - if (size > 0 && - ((unsigned char *)data)[size - 1] == '\0') - size--; - audit_log_n_untrustedstring(ab, data, size); + if (data_len > 0 && str[data_len - 1] == '\0') + data_len--; + audit_log_n_untrustedstring(ab, str, data_len); } audit_log_end(ab); } break; case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: - if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) + if (data_len < sizeof(struct audit_rule_data)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { audit_log_common_recv_msg(audit_context(), &ab, @@ -1365,7 +1367,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_log_end(ab); return -EPERM; } - err = audit_rule_change(msg_type, seq, data, nlmsg_len(nlh)); + err = audit_rule_change(msg_type, seq, data, data_len); break; case AUDIT_LIST_RULES: err = audit_list_rules_send(skb, seq); @@ -1380,7 +1382,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_MAKE_EQUIV: { void *bufp = data; u32 sizes[2]; - size_t msglen = nlmsg_len(nlh); + size_t msglen = data_len; char *old, *new; err = -EINVAL; @@ -1456,7 +1458,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ - memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); + memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); /* check if new data is valid */ if ((s.enabled != 0 && s.enabled != 1) || (s.log_passwd != 0 && s.log_passwd != 1)) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b0126e9c0743..026e34da4ace 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -456,6 +456,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, bufp = data->buf; for (i = 0; i < data->field_count; i++) { struct audit_field *f = &entry->rule.fields[i]; + u32 f_val; err = -EINVAL; @@ -464,12 +465,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; f->type = data->fields[i]; - f->val = data->values[i]; + f_val = data->values[i]; /* Support legacy tests for a valid loginuid */ - if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { + if ((f->type == AUDIT_LOGINUID) && (f_val == AUDIT_UID_UNSET)) { f->type = AUDIT_LOGINUID_SET; - f->val = 0; + f_val = 0; entry->rule.pflags |= AUDIT_LOGINUID_LEGACY; } @@ -485,7 +486,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_SUID: case AUDIT_FSUID: case AUDIT_OBJ_UID: - f->uid = make_kuid(current_user_ns(), f->val); + f->uid = make_kuid(current_user_ns(), f_val); if (!uid_valid(f->uid)) goto exit_free; break; @@ -494,11 +495,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_SGID: case AUDIT_FSGID: case AUDIT_OBJ_GID: - f->gid = make_kgid(current_user_ns(), f->val); + f->gid = make_kgid(current_user_ns(), f_val); if (!gid_valid(f->gid)) goto exit_free; break; case AUDIT_ARCH: + f->val = f_val; entry->rule.arch_f = f; break; case AUDIT_SUBJ_USER: @@ -511,11 +513,13 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; - + } + entry->rule.buflen += f_val; + f->lsm_str = str; err = security_audit_rule_init(f->type, f->op, str, (void **)&f->lsm_rule); /* Keep currently invalid fields around in case they @@ -524,68 +528,71 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, pr_warn("audit rule for LSM \'%s\' is invalid\n", str); err = 0; - } - if (err) { - kfree(str); + } else if (err) goto exit_free; - } else - f->lsm_str = str; break; case AUDIT_WATCH: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; - - err = audit_to_watch(&entry->rule, str, f->val, f->op); + } + err = audit_to_watch(&entry->rule, str, f_val, f->op); if (err) { kfree(str); goto exit_free; } + entry->rule.buflen += f_val; break; case AUDIT_DIR: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; - + } err = audit_make_tree(&entry->rule, str, f->op); kfree(str); if (err) goto exit_free; + entry->rule.buflen += f_val; break; case AUDIT_INODE: + f->val = f_val; err = audit_to_inode(&entry->rule, f); if (err) goto exit_free; break; case AUDIT_FILTERKEY: - if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) + if (entry->rule.filterkey || f_val > AUDIT_MAX_KEY_LEN) goto exit_free; - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; + } + entry->rule.buflen += f_val; entry->rule.filterkey = str; break; case AUDIT_EXE: - if (entry->rule.exe || f->val > PATH_MAX) + if (entry->rule.exe || f_val > PATH_MAX) goto exit_free; - str = audit_unpack_string(&bufp, &remain, f->val); + str = audit_unpack_string(&bufp, &remain, f_val); if (IS_ERR(str)) { err = PTR_ERR(str); goto exit_free; } - entry->rule.buflen += f->val; - - audit_mark = audit_alloc_mark(&entry->rule, str, f->val); + audit_mark = audit_alloc_mark(&entry->rule, str, f_val); if (IS_ERR(audit_mark)) { kfree(str); err = PTR_ERR(audit_mark); goto exit_free; } + entry->rule.buflen += f_val; entry->rule.exe = audit_mark; break; + default: + f->val = f_val; + break; } } diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 042f95534f86..68a89a9f7ccd 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -482,13 +482,21 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) prev_state = cmpxchg(&st_map->kvalue.state, BPF_STRUCT_OPS_STATE_INUSE, BPF_STRUCT_OPS_STATE_TOBEFREE); - if (prev_state == BPF_STRUCT_OPS_STATE_INUSE) { + switch (prev_state) { + case BPF_STRUCT_OPS_STATE_INUSE: st_map->st_ops->unreg(&st_map->kvalue.data); if (refcount_dec_and_test(&st_map->kvalue.refcnt)) bpf_map_put(map); + return 0; + case BPF_STRUCT_OPS_STATE_TOBEFREE: + return -EINPROGRESS; + case BPF_STRUCT_OPS_STATE_INIT: + return -ENOENT; + default: + WARN_ON_ONCE(1); + /* Should never happen. Treat it as not found. */ + return -ENOENT; } - - return 0; } static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 805c43b083e9..7787bdcb5d68 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2418,7 +2418,7 @@ static int btf_enum_check_member(struct btf_verifier_env *env, struct_size = struct_type->size; bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); - if (struct_size - bytes_offset < sizeof(int)) { + if (struct_size - bytes_offset < member_type->size) { btf_verifier_log_member(env, struct_type, member, "Member exceeds struct_size"); return -EINVAL; @@ -4142,9 +4142,9 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, * EFAULT - verifier bug * 0 - 99% match. The last 1% is validated by the verifier. */ -int btf_check_func_type_match(struct bpf_verifier_log *log, - struct btf *btf1, const struct btf_type *t1, - struct btf *btf2, const struct btf_type *t2) +static int btf_check_func_type_match(struct bpf_verifier_log *log, + struct btf *btf1, const struct btf_type *t1, + struct btf *btf2, const struct btf_type *t2) { const struct btf_param *args1, *args2; const char *fn1, *fn2, *s1, *s2; @@ -4564,7 +4564,7 @@ int btf_get_info_by_fd(const struct btf *btf, union bpf_attr __user *uattr) { struct bpf_btf_info __user *uinfo; - struct bpf_btf_info info = {}; + struct bpf_btf_info info; u32 info_copy, btf_copy; void __user *ubtf; u32 uinfo_len; @@ -4573,6 +4573,7 @@ int btf_get_info_by_fd(const struct btf *btf, uinfo_len = attr->info.info_len; info_copy = min_t(u32, uinfo_len, sizeof(info)); + memset(&info, 0, sizeof(info)); if (copy_from_user(&info, uinfo, info_copy)) return -EFAULT; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9a500fadbef5..4f1472409ef8 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -227,6 +227,9 @@ cleanup: for (i = 0; i < NR; i++) bpf_prog_array_free(arrays[i]); + for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) + cgroup_bpf_put(p); + percpu_ref_exit(&cgrp->bpf.refcnt); return -ENOMEM; @@ -302,8 +305,8 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], - *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_prog_list *pl, *replace_pl = NULL; enum bpf_cgroup_storage_type stype; int err; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 2d182c4ee9d9..a1468e3f5af2 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -56,6 +56,7 @@ struct htab_elem { union { struct bpf_htab *htab; struct pcpu_freelist_node fnode; + struct htab_elem *batch_flink; }; }; }; @@ -126,6 +127,17 @@ free_elems: bpf_map_area_free(htab->elems); } +/* The LRU list has a lock (lru_lock). Each htab bucket has a lock + * (bucket_lock). If both locks need to be acquired together, the lock + * order is always lru_lock -> bucket_lock and this only happens in + * bpf_lru_list.c logic. For example, certain code path of + * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(), + * will acquire lru_lock first followed by acquiring bucket_lock. + * + * In hashtab.c, to avoid deadlock, lock acquisition of + * bucket_lock followed by lru_lock is not allowed. In such cases, + * bucket_lock needs to be released first before acquiring lru_lock. + */ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, u32 hash) { @@ -1256,10 +1268,12 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, void __user *ukeys = u64_to_user_ptr(attr->batch.keys); void *ubatch = u64_to_user_ptr(attr->batch.in_batch); u32 batch, max_count, size, bucket_size; + struct htab_elem *node_to_free = NULL; u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; struct hlist_nulls_node *n; - unsigned long flags; + unsigned long flags = 0; + bool locked = false; struct htab_elem *l; struct bucket *b; int ret = 0; @@ -1319,15 +1333,25 @@ again_nocopy: dst_val = values; b = &htab->buckets[batch]; head = &b->head; - raw_spin_lock_irqsave(&b->lock, flags); + /* do not grab the lock unless need it (bucket_cnt > 0). */ + if (locked) + raw_spin_lock_irqsave(&b->lock, flags); bucket_cnt = 0; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) bucket_cnt++; + if (bucket_cnt && !locked) { + locked = true; + goto again_nocopy; + } + if (bucket_cnt > (max_count - total)) { if (total == 0) ret = -ENOSPC; + /* Note that since bucket_cnt > 0 here, it is implicit + * that the locked was grabbed, so release it. + */ raw_spin_unlock_irqrestore(&b->lock, flags); rcu_read_unlock(); this_cpu_dec(bpf_prog_active); @@ -1337,6 +1361,9 @@ again_nocopy: if (bucket_cnt > bucket_size) { bucket_size = bucket_cnt; + /* Note that since bucket_cnt > 0 here, it is implicit + * that the locked was grabbed, so release it. + */ raw_spin_unlock_irqrestore(&b->lock, flags); rcu_read_unlock(); this_cpu_dec(bpf_prog_active); @@ -1346,6 +1373,10 @@ again_nocopy: goto alloc; } + /* Next block is only safe to run if you have grabbed the lock */ + if (!locked) + goto next_batch; + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { memcpy(dst_key, l->key, key_size); @@ -1370,16 +1401,33 @@ again_nocopy: } if (do_delete) { hlist_nulls_del_rcu(&l->hash_node); - if (is_lru_map) - bpf_lru_push_free(&htab->lru, &l->lru_node); - else + + /* bpf_lru_push_free() will acquire lru_lock, which + * may cause deadlock. See comments in function + * prealloc_lru_pop(). Let us do bpf_lru_push_free() + * after releasing the bucket lock. + */ + if (is_lru_map) { + l->batch_flink = node_to_free; + node_to_free = l; + } else { free_htab_elem(htab, l); + } } dst_key += key_size; dst_val += value_size; } raw_spin_unlock_irqrestore(&b->lock, flags); + locked = false; + + while (node_to_free) { + l = node_to_free; + node_to_free = node_to_free->batch_flink; + bpf_lru_push_free(&htab->lru, &l->lru_node); + } + +next_batch: /* If we are not copying data, we can go to next bucket and avoid * unlocking the rcu. */ diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 2c5dc6541ece..bd09290e3648 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -321,7 +321,7 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, ulen = info->jited_prog_len; info->jited_prog_len = aux->offload->jited_len; - if (info->jited_prog_len & ulen) { + if (info->jited_prog_len && ulen) { uinsns = u64_to_user_ptr(info->jited_prog_insns); ulen = min_t(u32, info->jited_prog_len, ulen); if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a91ad518c050..966b7b34cde0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -696,14 +696,15 @@ int bpf_get_file_flag(int flags) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes. - * Return 0 on success and < 0 on error. +/* dst and src must have at least "size" number of bytes. + * Return strlen on success and < 0 on error. */ -static int bpf_obj_name_cpy(char *dst, const char *src) +int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) { - const char *end = src + BPF_OBJ_NAME_LEN; + const char *end = src + size; + const char *orig_src = src; - memset(dst, 0, BPF_OBJ_NAME_LEN); + memset(dst, 0, size); /* Copy all isalnum(), '_' and '.' chars. */ while (src < end && *src) { if (!isalnum(*src) && @@ -712,11 +713,11 @@ static int bpf_obj_name_cpy(char *dst, const char *src) *dst++ = *src++; } - /* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */ + /* No '\0' found in "size" number of bytes */ if (src == end) return -EINVAL; - return 0; + return src - orig_src; } int map_check_no_btf(const struct bpf_map *map, @@ -810,8 +811,9 @@ static int map_create(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - err = bpf_obj_name_cpy(map->name, attr->map_name); - if (err) + err = bpf_obj_name_cpy(map->name, attr->map_name, + sizeof(attr->map_name)); + if (err < 0) goto free_map; atomic64_set(&map->refcnt, 1); @@ -1510,6 +1512,11 @@ static int map_freeze(const union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { + fdput(f); + return -ENOTSUPP; + } + mutex_lock(&map->freeze_mutex); if (map->writecnt) { @@ -2093,8 +2100,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) goto free_prog; prog->aux->load_time = ktime_get_boottime_ns(); - err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); - if (err) + err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, + sizeof(attr->prog_name)); + if (err < 0) goto free_prog; /* run eBPF verifier */ @@ -2787,7 +2795,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, union bpf_attr __user *uattr) { struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); - struct bpf_prog_info info = {}; + struct bpf_prog_info info; u32 info_len = attr->info.info_len; struct bpf_prog_stats stats; char __user *uinsns; @@ -2799,6 +2807,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return err; info_len = min_t(u32, sizeof(info), info_len); + memset(&info, 0, sizeof(info)); if (copy_from_user(&info, uinfo, info_len)) return -EFAULT; @@ -3062,7 +3071,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, union bpf_attr __user *uattr) { struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); - struct bpf_map_info info = {}; + struct bpf_map_info info; u32 info_len = attr->info.info_len; int err; @@ -3071,6 +3080,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, return err; info_len = min_t(u32, sizeof(info), info_len); + memset(&info, 0, sizeof(info)); info.type = map->map_type; info.id = map->id; info.key_size = map->key_size; @@ -3354,7 +3364,7 @@ err_put: SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { - union bpf_attr attr = {}; + union bpf_attr attr; int err; if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) @@ -3366,6 +3376,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz size = min_t(u32, size, sizeof(attr)); /* copy attributes from user space, may be less than sizeof(bpf_attr) */ + memset(&attr, 0, sizeof(attr)); if (copy_from_user(&attr, uattr, size) != 0) return -EFAULT; diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index be1a1c83cdd1..f2d7cea86ffe 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -471,6 +471,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) */ p++; if (p >= end) { + (*pos)++; return NULL; } else { *pos = *p; @@ -782,7 +783,7 @@ void cgroup1_release_agent(struct work_struct *work) pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); - if (!pathbuf || !agentbuf) + if (!pathbuf || !agentbuf || !strlen(agentbuf)) goto out; spin_lock_irq(&css_set_lock); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 75f687301bbf..3dead0416b91 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3542,21 +3542,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v) static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_CPU); } @@ -4400,12 +4400,16 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) } } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); - if (!list_empty(&cset->tasks)) + if (!list_empty(&cset->tasks)) { it->task_pos = cset->tasks.next; - else if (!list_empty(&cset->mg_tasks)) + it->cur_tasks_head = &cset->tasks; + } else if (!list_empty(&cset->mg_tasks)) { it->task_pos = cset->mg_tasks.next; - else + it->cur_tasks_head = &cset->mg_tasks; + } else { it->task_pos = cset->dying_tasks.next; + it->cur_tasks_head = &cset->dying_tasks; + } it->tasks_head = &cset->tasks; it->mg_tasks_head = &cset->mg_tasks; @@ -4463,10 +4467,14 @@ repeat: else it->task_pos = it->task_pos->next; - if (it->task_pos == it->tasks_head) + if (it->task_pos == it->tasks_head) { it->task_pos = it->mg_tasks_head->next; - if (it->task_pos == it->mg_tasks_head) + it->cur_tasks_head = it->mg_tasks_head; + } + if (it->task_pos == it->mg_tasks_head) { it->task_pos = it->dying_tasks_head->next; + it->cur_tasks_head = it->dying_tasks_head; + } if (it->task_pos == it->dying_tasks_head) css_task_iter_advance_css_set(it); } else { @@ -4485,11 +4493,12 @@ repeat: goto repeat; /* and dying leaders w/o live member threads */ - if (!atomic_read(&task->signal->live)) + if (it->cur_tasks_head == it->dying_tasks_head && + !atomic_read(&task->signal->live)) goto repeat; } else { /* skip all dying ones */ - if (task->flags & PF_EXITING) + if (it->cur_tasks_head == it->dying_tasks_head) goto repeat; } } @@ -4595,6 +4604,9 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) struct kernfs_open_file *of = s->private; struct css_task_iter *it = of->priv; + if (pos) + (*pos)++; + return css_task_iter_next(it); } @@ -4610,7 +4622,7 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, * from position 0, so we can simply keep iterating on !0 *pos. */ if (!it) { - if (WARN_ON_ONCE((*pos)++)) + if (WARN_ON_ONCE((*pos))) return ERR_PTR(-EINVAL); it = kzalloc(sizeof(*it), GFP_KERNEL); @@ -4618,10 +4630,11 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, return ERR_PTR(-ENOMEM); of->priv = it; css_task_iter_start(&cgrp->self, iter_flags, it); - } else if (!(*pos)++) { + } else if (!(*pos)) { css_task_iter_end(it); css_task_iter_start(&cgrp->self, iter_flags, it); - } + } else + return it->cur_task; return cgroup_procs_next(s, NULL, NULL); } @@ -6258,6 +6271,10 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) return; } + /* Don't associate the sock with unrelated interrupted task's cgroup. */ + if (in_interrupt()) + return; + rcu_read_lock(); while (true) { diff --git a/kernel/compat.c b/kernel/compat.c index 95005f849c68..843dd17e6078 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -26,70 +26,6 @@ #include <linux/uaccess.h> -static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv) -{ - return (!access_ok(ctv, sizeof(*ctv)) || - __get_user(tv->tv_sec, &ctv->tv_sec) || - __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; -} - -static int __compat_put_timeval(const struct timeval *tv, struct old_timeval32 __user *ctv) -{ - return (!access_ok(ctv, sizeof(*ctv)) || - __put_user(tv->tv_sec, &ctv->tv_sec) || - __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; -} - -static int __compat_get_timespec(struct timespec *ts, const struct old_timespec32 __user *cts) -{ - return (!access_ok(cts, sizeof(*cts)) || - __get_user(ts->tv_sec, &cts->tv_sec) || - __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; -} - -static int __compat_put_timespec(const struct timespec *ts, struct old_timespec32 __user *cts) -{ - return (!access_ok(cts, sizeof(*cts)) || - __put_user(ts->tv_sec, &cts->tv_sec) || - __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; -} - -int compat_get_timeval(struct timeval *tv, const void __user *utv) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0; - else - return __compat_get_timeval(tv, utv); -} -EXPORT_SYMBOL_GPL(compat_get_timeval); - -int compat_put_timeval(const struct timeval *tv, void __user *utv) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0; - else - return __compat_put_timeval(tv, utv); -} -EXPORT_SYMBOL_GPL(compat_put_timeval); - -int compat_get_timespec(struct timespec *ts, const void __user *uts) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; - else - return __compat_get_timespec(ts, uts); -} -EXPORT_SYMBOL_GPL(compat_get_timespec); - -int compat_put_timespec(const struct timespec *ts, void __user *uts) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; - else - return __compat_put_timespec(ts, uts); -} -EXPORT_SYMBOL_GPL(compat_put_timespec); - #ifdef __ARCH_WANT_SYS_SIGPROCMASK /* diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index daa4e6eefdde..8bc6f2d670f9 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -302,9 +302,16 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); phys_addr_t mask = align - 1; unsigned long node = rmem->fdt_node; + bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL); struct cma *cma; int err; + if (size_cmdline != -1 && default_cma) { + pr_info("Reserved memory: bypass %s node, using cmdline CMA params instead\n", + rmem->name); + return -EBUSY; + } + if (!of_get_flat_dt_prop(node, "reusable", NULL) || of_get_flat_dt_prop(node, "no-map", NULL)) return -EINVAL; @@ -322,7 +329,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) /* Architecture specific contiguous memory fixup. */ dma_contiguous_early_fixup(rmem->base, rmem->size); - if (of_get_flat_dt_prop(node, "linux,cma-default", NULL)) + if (default_cma) dma_contiguous_set_default(cma); rmem->ops = &rmem_cma_ops; diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 6af7ae83c4ad..ac7956c38f69 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -23,18 +23,6 @@ */ unsigned int zone_dma_bits __ro_after_init = 24; -static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size) -{ - if (!dev->dma_mask) { - dev_err_once(dev, "DMA map on device without dma_mask\n"); - } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_limit) { - dev_err_once(dev, - "overflow %pad+%zu of DMA mask %llx bus limit %llx\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - } - WARN_ON_ONCE(1); -} - static inline dma_addr_t phys_to_dma_direct(struct device *dev, phys_addr_t phys) { @@ -357,13 +345,6 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, EXPORT_SYMBOL(dma_direct_unmap_sg); #endif -static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr, - size_t size) -{ - return swiotlb_force != SWIOTLB_FORCE && - dma_capable(dev, dma_addr, size, true); -} - dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -371,9 +352,16 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dma_addr = phys_to_dma(dev, phys); - if (unlikely(!dma_direct_possible(dev, dma_addr, size)) && - !swiotlb_map(dev, &phys, &dma_addr, size, dir, attrs)) { - report_addr(dev, dma_addr, size); + if (unlikely(swiotlb_force == SWIOTLB_FORCE)) + return swiotlb_map(dev, phys, size, dir, attrs); + + if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + if (swiotlb_force != SWIOTLB_NO_FORCE) + return swiotlb_map(dev, phys, size, dir, attrs); + + dev_WARN_ONCE(dev, 1, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); return DMA_MAPPING_ERROR; } @@ -411,7 +399,10 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, dma_addr_t dma_addr = paddr; if (unlikely(!dma_capable(dev, dma_addr, size, false))) { - report_addr(dev, dma_addr, size); + dev_err_once(dev, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + WARN_ON_ONCE(1); return DMA_MAPPING_ERROR; } @@ -472,28 +463,26 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, } #endif /* CONFIG_MMU */ -/* - * Because 32-bit DMA masks are so common we expect every architecture to be - * able to satisfy them - either by not supporting more physical memory, or by - * providing a ZONE_DMA32. If neither is the case, the architecture needs to - * use an IOMMU instead of the direct mapping. - */ int dma_direct_supported(struct device *dev, u64 mask) { - u64 min_mask; + u64 min_mask = (max_pfn - 1) << PAGE_SHIFT; - if (IS_ENABLED(CONFIG_ZONE_DMA)) - min_mask = DMA_BIT_MASK(zone_dma_bits); - else - min_mask = DMA_BIT_MASK(32); - - min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT); + /* + * Because 32-bit DMA masks are so common we expect every architecture + * to be able to satisfy them - either by not supporting more physical + * memory, or by providing a ZONE_DMA32. If neither is the case, the + * architecture needs to use an IOMMU instead of the direct mapping. + */ + if (mask >= DMA_BIT_MASK(32)) + return 1; /* * This check needs to be against the actual bit mask value, so * use __phys_to_dma() here so that the SME encryption mask isn't * part of the check. */ + if (IS_ENABLED(CONFIG_ZONE_DMA)) + min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits)); return mask >= __phys_to_dma(dev, min_mask); } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 9280d6f8271e..c19379fabd20 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -22,6 +22,7 @@ #include <linux/cache.h> #include <linux/dma-direct.h> +#include <linux/dma-noncoherent.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/spinlock.h> @@ -656,35 +657,38 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, } /* - * Create a swiotlb mapping for the buffer at @phys, and in case of DMAing + * Create a swiotlb mapping for the buffer at @paddr, and in case of DMAing * to the device copy the data into it as well. */ -bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) +dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, + enum dma_data_direction dir, unsigned long attrs) { - trace_swiotlb_bounced(dev, *dma_addr, size, swiotlb_force); + phys_addr_t swiotlb_addr; + dma_addr_t dma_addr; - if (unlikely(swiotlb_force == SWIOTLB_NO_FORCE)) { - dev_warn_ratelimited(dev, - "Cannot do DMA to address %pa\n", phys); - return false; - } + trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size, + swiotlb_force); - /* Oh well, have to allocate and map a bounce buffer. */ - *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start), - *phys, size, size, dir, attrs); - if (*phys == (phys_addr_t)DMA_MAPPING_ERROR) - return false; + swiotlb_addr = swiotlb_tbl_map_single(dev, + __phys_to_dma(dev, io_tlb_start), + paddr, size, size, dir, attrs); + if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) + return DMA_MAPPING_ERROR; /* Ensure that the address returned is DMA'ble */ - *dma_addr = __phys_to_dma(dev, *phys); - if (unlikely(!dma_capable(dev, *dma_addr, size, true))) { - swiotlb_tbl_unmap_single(dev, *phys, size, size, dir, + dma_addr = __phys_to_dma(dev, swiotlb_addr); + if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); - return false; + dev_WARN_ONCE(dev, 1, + "swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + return DMA_MAPPING_ERROR; } - return true; + if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arch_sync_dma_for_device(swiotlb_addr, size, dir); + return dma_addr; } size_t swiotlb_max_mapping_size(struct device *dev) diff --git a/kernel/exit.c b/kernel/exit.c index 2833ffb0c211..0b81b26a872a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -619,8 +619,8 @@ static void forget_original_parent(struct task_struct *father, reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { - t->real_parent = reaper; - BUG_ON((!t->ptrace) != (t->parent == father)); + RCU_INIT_POINTER(t->real_parent, reaper); + BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father)); if (likely(!t->ptrace)) t->parent = t->real_parent; if (t->pdeath_signal) diff --git a/kernel/fork.c b/kernel/fork.c index 60a1295f4384..d90af13431c7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -397,8 +397,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, THREAD_SIZE / 1024 * account); - mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } } @@ -1508,7 +1508,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) return 0; } sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); - rcu_assign_pointer(tsk->sighand, sig); + RCU_INIT_POINTER(tsk->sighand, sig); if (!sig) return -ENOMEM; diff --git a/kernel/futex.c b/kernel/futex.c index 0cf84c8664f2..82dfacb3250e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -385,9 +385,9 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb) */ static struct futex_hash_bucket *hash_futex(union futex_key *key) { - u32 hash = jhash2((u32*)&key->both.word, - (sizeof(key->both.word)+sizeof(key->both.ptr))/4, + u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, key->both.offset); + return &futex_queues[hash & (futex_hashsize - 1)]; } @@ -429,7 +429,7 @@ static void get_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - ihold(key->shared.inode); /* implies smp_mb(); (B) */ + smp_mb(); /* explicit smp_mb(); (B) */ break; case FUT_OFF_MMSHARED: futex_get_mm(key); /* implies smp_mb(); (B) */ @@ -463,7 +463,6 @@ static void drop_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - iput(key->shared.inode); break; case FUT_OFF_MMSHARED: mmdrop(key->private.mm); @@ -505,6 +504,46 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, return timeout; } +/* + * Generate a machine wide unique identifier for this inode. + * + * This relies on u64 not wrapping in the life-time of the machine; which with + * 1ns resolution means almost 585 years. + * + * This further relies on the fact that a well formed program will not unmap + * the file while it has a (shared) futex waiting on it. This mapping will have + * a file reference which pins the mount and inode. + * + * If for some reason an inode gets evicted and read back in again, it will get + * a new sequence number and will _NOT_ match, even though it is the exact same + * file. + * + * It is important that match_futex() will never have a false-positive, esp. + * for PI futexes that can mess up the state. The above argues that false-negatives + * are only possible for malformed programs. + */ +static u64 get_inode_sequence_number(struct inode *inode) +{ + static atomic64_t i_seq; + u64 old; + + /* Does the inode already have a sequence number? */ + old = atomic64_read(&inode->i_sequence); + if (likely(old)) + return old; + + for (;;) { + u64 new = atomic64_add_return(1, &i_seq); + if (WARN_ON_ONCE(!new)) + continue; + + old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); + if (old) + return old; + return new; + } +} + /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex @@ -517,9 +556,15 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, * * The key words are stored in @key on success. * - * For shared mappings, it's (page->index, file_inode(vma->vm_file), - * offset_within_page). For private mappings, it's (uaddr, current->mm). - * We can usually work out the index without swapping in the page. + * For shared mappings (when @fshared), the key is: + * ( inode->i_sequence, page->index, offset_within_page ) + * [ also see get_inode_sequence_number() ] + * + * For private mappings (or when !@fshared), the key is: + * ( current->mm, address, 0 ) + * + * This allows (cross process, where applicable) identification of the futex + * without keeping the page pinned for the duration of the FUTEX_WAIT. * * lock_page() might sleep, the caller should not hold a spinlock. */ @@ -659,8 +704,6 @@ again: key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); /* implies smp_mb(); (B) */ - } else { struct inode *inode; @@ -692,40 +735,14 @@ again: goto again; } - /* - * Take a reference unless it is about to be freed. Previously - * this reference was taken by ihold under the page lock - * pinning the inode in place so i_lock was unnecessary. The - * only way for this check to fail is if the inode was - * truncated in parallel which is almost certainly an - * application bug. In such a case, just retry. - * - * We are not calling into get_futex_key_refs() in file-backed - * cases, therefore a successful atomic_inc return below will - * guarantee that get_futex_key() will still imply smp_mb(); (B). - */ - if (!atomic_inc_not_zero(&inode->i_count)) { - rcu_read_unlock(); - put_page(page); - - goto again; - } - - /* Should be impossible but lets be paranoid for now */ - if (WARN_ON_ONCE(inode->i_mapping != mapping)) { - err = -EFAULT; - rcu_read_unlock(); - iput(inode); - - goto out; - } - key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = inode; + key->shared.i_seq = get_inode_sequence_number(inode); key->shared.pgoff = basepage_index(tail); rcu_read_unlock(); } + get_futex_key_refs(key); /* implies smp_mb(); (B) */ + out: put_page(page); return err; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 3924fbe829d4..c9d8eb7f5c02 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -128,8 +128,6 @@ static inline void unregister_handler_proc(unsigned int irq, extern bool irq_can_set_affinity_usr(unsigned int irq); -extern int irq_select_affinity_usr(unsigned int irq); - extern void irq_set_thread_affinity(struct irq_desc *desc); extern int irq_do_set_affinity(struct irq_data *data, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3089a60ea8f9..fe40c658f86f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -323,7 +323,11 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, if (desc->affinity_notify) { kref_get(&desc->affinity_notify->kref); - schedule_work(&desc->affinity_notify->work); + if (!schedule_work(&desc->affinity_notify->work)) { + /* Work was already scheduled, drop our extra ref */ + kref_put(&desc->affinity_notify->kref, + desc->affinity_notify->release); + } } irqd_set(data, IRQD_AFFINITY_SET); @@ -423,7 +427,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) raw_spin_unlock_irqrestore(&desc->lock, flags); if (old_notify) { - cancel_work_sync(&old_notify->work); + if (cancel_work_sync(&old_notify->work)) { + /* Pending work had a ref, put that one too */ + kref_put(&old_notify->kref, old_notify->release); + } kref_put(&old_notify->kref, old_notify->release); } @@ -481,23 +488,9 @@ int irq_setup_affinity(struct irq_desc *desc) { return irq_select_affinity(irq_desc_get_irq(desc)); } -#endif +#endif /* CONFIG_AUTO_IRQ_AFFINITY */ +#endif /* CONFIG_SMP */ -/* - * Called when a bogus affinity is set via /proc/irq - */ -int irq_select_affinity_usr(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&desc->lock, flags); - ret = irq_setup_affinity(desc); - raw_spin_unlock_irqrestore(&desc->lock, flags); - return ret; -} -#endif /** * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 9e5783d98033..32c071d7bc03 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -111,6 +111,28 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v) return show_irq_affinity(AFFINITY_LIST, m); } +#ifndef CONFIG_AUTO_IRQ_AFFINITY +static inline int irq_select_affinity_usr(unsigned int irq) +{ + /* + * If the interrupt is started up already then this fails. The + * interrupt is assigned to an online CPU already. There is no + * point to move it around randomly. Tell user space that the + * selected mask is bogus. + * + * If not then any change to the affinity is pointless because the + * startup code invokes irq_setup_affinity() which will select + * a online CPU anyway. + */ + return -EINVAL; +} +#else +/* ALPHA magic affinity auto selector. Keep it for historical reasons. */ +static inline int irq_select_affinity_usr(unsigned int irq) +{ + return irq_select_affinity(irq); +} +#endif static ssize_t write_irq_affinity(int type, struct file *file, const char __user *buffer, size_t count, loff_t *pos) diff --git a/kernel/notifier.c b/kernel/notifier.c index 63d7501ac638..5989bbb93039 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -519,7 +519,7 @@ NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { - vmalloc_sync_all(); + vmalloc_sync_mappings(); return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); diff --git a/kernel/pid.c b/kernel/pid.c index 0f4ecb57214c..647b4bb457b5 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -247,6 +247,16 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, tmp = tmp->parent; } + /* + * ENOMEM is not the most obvious choice especially for the case + * where the child subreaper has already exited and the pid + * namespace denies the creation of any new processes. But ENOMEM + * is what we have exposed to userspace for a long time and it is + * documented behavior for pid namespaces. So we can't easily + * change it even if there were an error code better suited. + */ + retval = -ENOMEM; + if (unlikely(is_child_reaper(pid))) { if (pid_ns_prepare_proc(ns)) goto out_free; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index ddade80ad276..d82b7b88d616 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1681,7 +1681,7 @@ static unsigned long minimum_image_size(unsigned long saveable) * hibernation for allocations made while saving the image and for device * drivers, in case they need to allocate memory from their hibernation * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough - * estimate) and reserverd_size divided by PAGE_SIZE (which is tunable through + * estimate) and reserved_size divided by PAGE_SIZE (which is tunable through * /sys/power/reserved_size, respectively). To make this happen, we compute the * total number of available page frames and allocate at least * diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3c8a379c357e..c1217bfe5e81 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8337,6 +8337,8 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, sgs->group_capacity = group->sgc->capacity; + sgs->group_weight = group->group_weight; + sgs->group_type = group_classify(sd->imbalance_pct, group, sgs); /* diff --git a/kernel/signal.c b/kernel/signal.c index 9ad8dea93dbb..5b2396350dd1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -413,27 +413,32 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi { struct sigqueue *q = NULL; struct user_struct *user; + int sigpending; /* * Protect access to @t credentials. This can go away when all * callers hold rcu read lock. + * + * NOTE! A pending signal will hold on to the user refcount, + * and we get/put the refcount only when the sigpending count + * changes from/to zero. */ rcu_read_lock(); - user = get_uid(__task_cred(t)->user); - atomic_inc(&user->sigpending); + user = __task_cred(t)->user; + sigpending = atomic_inc_return(&user->sigpending); + if (sigpending == 1) + get_uid(user); rcu_read_unlock(); - if (override_rlimit || - atomic_read(&user->sigpending) <= - task_rlimit(t, RLIMIT_SIGPENDING)) { + if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { q = kmem_cache_alloc(sigqueue_cachep, flags); } else { print_dropped_signal(sig); } if (unlikely(q == NULL)) { - atomic_dec(&user->sigpending); - free_uid(user); + if (atomic_dec_and_test(&user->sigpending)) + free_uid(user); } else { INIT_LIST_HEAD(&q->list); q->flags = 0; @@ -447,8 +452,8 @@ static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) return; - atomic_dec(&q->user->sigpending); - free_uid(q->user); + if (atomic_dec_and_test(&q->user->sigpending)) + free_uid(q->user); kmem_cache_free(sigqueue_cachep, q); } diff --git a/kernel/sys.c b/kernel/sys.c index f9bc5c303e3f..d325f3ab624a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -47,6 +47,7 @@ #include <linux/syscalls.h> #include <linux/kprobes.h> #include <linux/user_namespace.h> +#include <linux/time_namespace.h> #include <linux/binfmts.h> #include <linux/sched.h> @@ -2546,6 +2547,7 @@ static int do_sysinfo(struct sysinfo *info) memset(info, 0, sizeof(struct sysinfo)); ktime_get_boottime_ts64(&tp); + timens_add_boottime(&tp); info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d396aaaf19a3..ad5b88a53c5a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -805,15 +805,6 @@ static struct ctl_table kern_table[] = { .extra2 = &maxolduid, }, #ifdef CONFIG_S390 -#ifdef CONFIG_MATHEMU - { - .procname = "ieee_emulation_warnings", - .data = &sysctl_ieee_emulation_warnings, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif { .procname = "userprocess_debug", .data = &show_unhandled_signals, diff --git a/kernel/time/time.c b/kernel/time/time.c index cdd7386115ff..3985b2b32d08 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -449,49 +449,6 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, } EXPORT_SYMBOL(mktime64); -/** - * ns_to_timespec - Convert nanoseconds to timespec - * @nsec: the nanoseconds value to be converted - * - * Returns the timespec representation of the nsec parameter. - */ -struct timespec ns_to_timespec(const s64 nsec) -{ - struct timespec ts; - s32 rem; - - if (!nsec) - return (struct timespec) {0, 0}; - - ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); - if (unlikely(rem < 0)) { - ts.tv_sec--; - rem += NSEC_PER_SEC; - } - ts.tv_nsec = rem; - - return ts; -} -EXPORT_SYMBOL(ns_to_timespec); - -/** - * ns_to_timeval - Convert nanoseconds to timeval - * @nsec: the nanoseconds value to be converted - * - * Returns the timeval representation of the nsec parameter. - */ -struct timeval ns_to_timeval(const s64 nsec) -{ - struct timespec ts = ns_to_timespec(nsec); - struct timeval tv; - - tv.tv_sec = ts.tv_sec; - tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000; - - return tv; -} -EXPORT_SYMBOL(ns_to_timeval); - struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec) { struct timespec64 ts = ns_to_timespec64(nsec); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 91e885194dbc..402eef84c859 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -143,8 +143,8 @@ if FTRACE config BOOTTIME_TRACING bool "Boot-time Tracing support" - depends on BOOT_CONFIG && TRACING - default y + depends on TRACING + select BOOT_CONFIG help Enable developer to setup ftrace subsystem via supplemental kernel cmdline at boot time for debugging (tracing) driver diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 0735ae8545d8..ca39dc3230cb 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -335,6 +335,7 @@ static void put_probe_ref(void) static void blk_trace_cleanup(struct blk_trace *bt) { + synchronize_rcu(); blk_trace_free(bt); put_probe_ref(); } @@ -629,8 +630,10 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, static int __blk_trace_startstop(struct request_queue *q, int start) { int ret; - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (bt == NULL) return -EINVAL; @@ -740,8 +743,8 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) void blk_trace_shutdown(struct request_queue *q) { mutex_lock(&q->blk_trace_mutex); - - if (q->blk_trace) { + if (rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex))) { __blk_trace_startstop(q, 0); __blk_trace_remove(q); } @@ -752,8 +755,10 @@ void blk_trace_shutdown(struct request_queue *q) #ifdef CONFIG_BLK_CGROUP static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + /* We don't use the 'bt' value here except as an optimization... */ + bt = rcu_dereference_protected(q->blk_trace, 1); if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return 0; @@ -796,10 +801,14 @@ blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) static void blk_add_trace_rq(struct request *rq, int error, unsigned int nr_bytes, u32 what, u64 cgid) { - struct blk_trace *bt = rq->q->blk_trace; + struct blk_trace *bt; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(rq->q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } if (blk_rq_is_passthrough(rq)) what |= BLK_TC_ACT(BLK_TC_PC); @@ -808,6 +817,7 @@ static void blk_add_trace_rq(struct request *rq, int error, __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), rq->cmd_flags, what, error, 0, NULL, cgid); + rcu_read_unlock(); } static void blk_add_trace_rq_insert(void *ignore, @@ -853,14 +863,19 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, u32 what, int error) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, what, error, 0, NULL, blk_trace_bio_get_cgid(q, bio)); + rcu_read_unlock(); } static void blk_add_trace_bio_bounce(void *ignore, @@ -905,11 +920,14 @@ static void blk_add_trace_getrq(void *ignore, if (bio) blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); else { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, NULL, 0); + rcu_read_unlock(); } } @@ -921,27 +939,35 @@ static void blk_add_trace_sleeprq(void *ignore, if (bio) blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); else { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, 0, 0, NULL, 0); + rcu_read_unlock(); } } static void blk_add_trace_plug(void *ignore, struct request_queue *q) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); + rcu_read_unlock(); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, unsigned int depth, bool explicit) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(depth); u32 what; @@ -953,14 +979,17 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); } + rcu_read_unlock(); } static void blk_add_trace_split(void *ignore, struct request_queue *q, struct bio *bio, unsigned int pdu) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(pdu); @@ -969,6 +998,7 @@ static void blk_add_trace_split(void *ignore, BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), &rpdu, blk_trace_bio_get_cgid(q, bio)); } + rcu_read_unlock(); } /** @@ -988,11 +1018,15 @@ static void blk_add_trace_bio_remap(void *ignore, struct request_queue *q, struct bio *bio, dev_t dev, sector_t from) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; struct blk_io_trace_remap r; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } r.device_from = cpu_to_be32(dev); r.device_to = cpu_to_be32(bio_dev(bio)); @@ -1001,6 +1035,7 @@ static void blk_add_trace_bio_remap(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); + rcu_read_unlock(); } /** @@ -1021,11 +1056,15 @@ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, sector_t from) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; struct blk_io_trace_remap r; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } r.device_from = cpu_to_be32(dev); r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); @@ -1034,6 +1073,7 @@ static void blk_add_trace_rq_remap(void *ignore, __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rq_data_dir(rq), 0, BLK_TA_REMAP, 0, sizeof(r), &r, blk_trace_request_get_cgid(q, rq)); + rcu_read_unlock(); } /** @@ -1051,14 +1091,19 @@ void blk_add_driver_data(struct request_queue *q, struct request *rq, void *data, size_t len) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, BLK_TA_DRV_DATA, 0, len, data, blk_trace_request_get_cgid(q, rq)); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -1597,6 +1642,7 @@ static int blk_trace_remove_queue(struct request_queue *q) return -EINVAL; put_probe_ref(); + synchronize_rcu(); blk_trace_free(bt); return 0; } @@ -1758,6 +1804,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct hd_struct *p = dev_to_part(dev); struct request_queue *q; struct block_device *bdev; + struct blk_trace *bt; ssize_t ret = -ENXIO; bdev = bdget(part_devt(p)); @@ -1770,21 +1817,23 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, mutex_lock(&q->blk_trace_mutex); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (attr == &dev_attr_enable) { - ret = sprintf(buf, "%u\n", !!q->blk_trace); + ret = sprintf(buf, "%u\n", !!bt); goto out_unlock_bdev; } - if (q->blk_trace == NULL) + if (bt == NULL) ret = sprintf(buf, "disabled\n"); else if (attr == &dev_attr_act_mask) - ret = blk_trace_mask2str(buf, q->blk_trace->act_mask); + ret = blk_trace_mask2str(buf, bt->act_mask); else if (attr == &dev_attr_pid) - ret = sprintf(buf, "%u\n", q->blk_trace->pid); + ret = sprintf(buf, "%u\n", bt->pid); else if (attr == &dev_attr_start_lba) - ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); + ret = sprintf(buf, "%llu\n", bt->start_lba); else if (attr == &dev_attr_end_lba) - ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); + ret = sprintf(buf, "%llu\n", bt->end_lba); out_unlock_bdev: mutex_unlock(&q->blk_trace_mutex); @@ -1801,6 +1850,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct block_device *bdev; struct request_queue *q; struct hd_struct *p; + struct blk_trace *bt; u64 value; ssize_t ret = -EINVAL; @@ -1831,8 +1881,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, mutex_lock(&q->blk_trace_mutex); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (attr == &dev_attr_enable) { - if (!!value == !!q->blk_trace) { + if (!!value == !!bt) { ret = 0; goto out_unlock_bdev; } @@ -1844,18 +1896,21 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } ret = 0; - if (q->blk_trace == NULL) + if (bt == NULL) { ret = blk_trace_setup_queue(q, bdev); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); + } if (ret == 0) { if (attr == &dev_attr_act_mask) - q->blk_trace->act_mask = value; + bt->act_mask = value; else if (attr == &dev_attr_pid) - q->blk_trace->pid = value; + bt->pid = value; else if (attr == &dev_attr_start_lba) - q->blk_trace->start_lba = value; + bt->start_lba = value; else if (attr == &dev_attr_end_lba) - q->blk_trace->end_lba = value; + bt->end_lba = value; } out_unlock_bdev: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 19e793aa441a..68250d433bd7 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -732,7 +732,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type) if (unlikely(!nmi_uaccess_okay())) return -EPERM; - if (in_nmi()) { + if (irqs_disabled()) { /* Do an early check on signal validity. Otherwise, * the error is lost in deferred irq_work. */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3f7ee102868a..fd81c7de77a7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1547,6 +1547,8 @@ static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end) rec = bsearch(&key, pg->records, pg->index, sizeof(struct dyn_ftrace), ftrace_cmp_recs); + if (rec) + break; } return rec; } diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index 4aefe003cb7c..7d56d621ffea 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -111,11 +111,11 @@ static int __init test_gen_synth_cmd(void) /* Create some bogus values just for testing */ vals[0] = 777; /* next_pid_field */ - vals[1] = (u64)"hula hoops"; /* next_comm_field */ + vals[1] = (u64)(long)"hula hoops"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ - vals[4] = smp_processor_id(); /* cpu */ - vals[5] = (u64)"thneed"; /* my_string_field */ + vals[4] = raw_smp_processor_id(); /* cpu */ + vals[5] = (u64)(long)"thneed"; /* my_string_field */ vals[6] = 598; /* my_int_field */ /* Now generate a gen_synth_test event */ @@ -218,11 +218,11 @@ static int __init test_empty_synth_event(void) /* Create some bogus values just for testing */ vals[0] = 777; /* next_pid_field */ - vals[1] = (u64)"tiddlywinks"; /* next_comm_field */ + vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ - vals[4] = smp_processor_id(); /* cpu */ - vals[5] = (u64)"thneed_2.0"; /* my_string_field */ + vals[4] = raw_smp_processor_id(); /* cpu */ + vals[5] = (u64)(long)"thneed_2.0"; /* my_string_field */ vals[6] = 399; /* my_int_field */ /* Now trace an empty_synth_test event */ @@ -290,11 +290,11 @@ static int __init test_create_synth_event(void) /* Create some bogus values just for testing */ vals[0] = 777; /* next_pid_field */ - vals[1] = (u64)"tiddlywinks"; /* next_comm_field */ + vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ - vals[4] = smp_processor_id(); /* cpu */ - vals[5] = (u64)"thneed"; /* my_string_field */ + vals[4] = raw_smp_processor_id(); /* cpu */ + vals[5] = (u64)(long)"thneed"; /* my_string_field */ vals[6] = 398; /* my_int_field */ /* Now generate a create_synth_test event */ @@ -330,7 +330,7 @@ static int __init test_add_next_synth_val(void) goto out; /* next_comm_field */ - ret = synth_event_add_next_val((u64)"slinky", &trace_state); + ret = synth_event_add_next_val((u64)(long)"slinky", &trace_state); if (ret) goto out; @@ -345,12 +345,12 @@ static int __init test_add_next_synth_val(void) goto out; /* cpu */ - ret = synth_event_add_next_val(smp_processor_id(), &trace_state); + ret = synth_event_add_next_val(raw_smp_processor_id(), &trace_state); if (ret) goto out; /* my_string_field */ - ret = synth_event_add_next_val((u64)"thneed_2.01", &trace_state); + ret = synth_event_add_next_val((u64)(long)"thneed_2.01", &trace_state); if (ret) goto out; @@ -388,7 +388,7 @@ static int __init test_add_synth_val(void) if (ret) goto out; - ret = synth_event_add_val("cpu", smp_processor_id(), &trace_state); + ret = synth_event_add_val("cpu", raw_smp_processor_id(), &trace_state); if (ret) goto out; @@ -396,12 +396,12 @@ static int __init test_add_synth_val(void) if (ret) goto out; - ret = synth_event_add_val("next_comm_field", (u64)"silly putty", + ret = synth_event_add_val("next_comm_field", (u64)(long)"silly putty", &trace_state); if (ret) goto out; - ret = synth_event_add_val("my_string_field", (u64)"thneed_9", + ret = synth_event_add_val("my_string_field", (u64)(long)"thneed_9", &trace_state); if (ret) goto out; @@ -423,13 +423,13 @@ static int __init test_trace_synth_event(void) /* Trace some bogus values just for testing */ ret = synth_event_trace(create_synth_test, 7, /* number of values */ - 444, /* next_pid_field */ - (u64)"clackers", /* next_comm_field */ - 1000000, /* ts_ns */ - 1000, /* ts_ms */ - smp_processor_id(), /* cpu */ - (u64)"Thneed", /* my_string_field */ - 999); /* my_int_field */ + (u64)444, /* next_pid_field */ + (u64)(long)"clackers", /* next_comm_field */ + (u64)1000000, /* ts_ns */ + (u64)1000, /* ts_ms */ + (u64)raw_smp_processor_id(), /* cpu */ + (u64)(long)"Thneed", /* my_string_field */ + (u64)999); /* my_int_field */ return ret; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c797a15a1fc7..6b11e4e2150c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1837,6 +1837,7 @@ static __init int init_trace_selftests(void) pr_info("Running postponed tracer tests:\n"); + tracing_selftest_running = true; list_for_each_entry_safe(p, n, &postponed_selftests, list) { /* This loop can take minutes when sanitizers are enabled, so * lets make sure we allow RCU processing. @@ -1859,6 +1860,7 @@ static __init int init_trace_selftests(void) list_del(&p->list); kfree(p); } + tracing_selftest_running = false; out: mutex_unlock(&trace_types_lock); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 483b3fd1094f..5f6834a2bf41 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -821,6 +821,29 @@ static const char *synth_field_fmt(char *type) return fmt; } +static void print_synth_event_num_val(struct trace_seq *s, + char *print_fmt, char *name, + int size, u64 val, char *space) +{ + switch (size) { + case 1: + trace_seq_printf(s, print_fmt, name, (u8)val, space); + break; + + case 2: + trace_seq_printf(s, print_fmt, name, (u16)val, space); + break; + + case 4: + trace_seq_printf(s, print_fmt, name, (u32)val, space); + break; + + default: + trace_seq_printf(s, print_fmt, name, val, space); + break; + } +} + static enum print_line_t print_synth_event(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -859,10 +882,13 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, } else { struct trace_print_flags __flags[] = { __def_gfpflag_names, {-1, NULL} }; + char *space = (i == se->n_fields - 1 ? "" : " "); - trace_seq_printf(s, print_fmt, se->fields[i]->name, - entry->fields[n_u64], - i == se->n_fields - 1 ? "" : " "); + print_synth_event_num_val(s, print_fmt, + se->fields[i]->name, + se->fields[i]->size, + entry->fields[n_u64], + space); if (strcmp(se->fields[i]->type, "gfp_t") == 0) { trace_seq_puts(s, " ("); @@ -1805,6 +1831,8 @@ __synth_event_trace_start(struct trace_event_file *file, int entry_size, fields_size = 0; int ret = 0; + memset(trace_state, '\0', sizeof(*trace_state)); + /* * Normal event tracing doesn't get called at all unless the * ENABLED bit is set (which attaches the probe thus allowing @@ -1885,6 +1913,11 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) return ret; } + if (n_vals != state.event->n_fields) { + ret = -EINVAL; + goto out; + } + va_start(args, n_vals); for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { u64 val; @@ -1898,12 +1931,30 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) strscpy(str_field, str_val, STR_VAR_LEN_MAX); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { - state.entry->fields[n_u64] = val; + struct synth_field *field = state.event->fields[i]; + + switch (field->size) { + case 1: + *(u8 *)&state.entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&state.entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&state.entry->fields[n_u64] = (u32)val; + break; + + default: + state.entry->fields[n_u64] = val; + break; + } n_u64++; } } va_end(args); - +out: __synth_event_trace_end(&state); return ret; @@ -1942,6 +1993,11 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, return ret; } + if (n_vals != state.event->n_fields) { + ret = -EINVAL; + goto out; + } + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { if (state.event->fields[i]->is_string) { char *str_val = (char *)(long)vals[i]; @@ -1950,11 +2006,30 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, strscpy(str_field, str_val, STR_VAR_LEN_MAX); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { - state.entry->fields[n_u64] = vals[i]; + struct synth_field *field = state.event->fields[i]; + u64 val = vals[i]; + + switch (field->size) { + case 1: + *(u8 *)&state.entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&state.entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&state.entry->fields[n_u64] = (u32)val; + break; + + default: + state.entry->fields[n_u64] = val; + break; + } n_u64++; } } - +out: __synth_event_trace_end(&state); return ret; @@ -1997,8 +2072,6 @@ int synth_event_trace_start(struct trace_event_file *file, if (!trace_state) return -EINVAL; - memset(trace_state, '\0', sizeof(*trace_state)); - ret = __synth_event_trace_start(file, trace_state); if (ret == -ENOENT) ret = 0; /* just disabled, not really an error */ @@ -2069,8 +2142,25 @@ static int __synth_event_add_val(const char *field_name, u64 val, str_field = (char *)&entry->fields[field->offset]; strscpy(str_field, str_val, STR_VAR_LEN_MAX); - } else - entry->fields[field->offset] = val; + } else { + switch (field->size) { + case 1: + *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val; + break; + + case 2: + *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val; + break; + + case 4: + *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val; + break; + + default: + trace_state->entry->fields[field->offset] = val; + break; + } + } out: return ret; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 301db4406bc3..4e01c448b4b4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1411,14 +1411,16 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, return; rcu_read_lock(); retry: - if (req_cpu == WORK_CPU_UNBOUND) - cpu = wq_select_unbound_cpu(raw_smp_processor_id()); - /* pwq which will be used unless @work is executing elsewhere */ - if (!(wq->flags & WQ_UNBOUND)) - pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); - else + if (wq->flags & WQ_UNBOUND) { + if (req_cpu == WORK_CPU_UNBOUND) + cpu = wq_select_unbound_cpu(raw_smp_processor_id()); pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); + } else { + if (req_cpu == WORK_CPU_UNBOUND) + cpu = raw_smp_processor_id(); + pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); + } /* * If @work was previously on a different pool, it might still be |