diff options
Diffstat (limited to 'kernel')
53 files changed, 1514 insertions, 911 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 17b0d523afb3..9ddfe2aa6671 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1101,13 +1101,11 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature audit_log_end(ab); } -static int audit_set_feature(struct sk_buff *skb) +static int audit_set_feature(struct audit_features *uaf) { - struct audit_features *uaf; int i; BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names)); - uaf = nlmsg_data(nlmsg_hdr(skb)); /* if there is ever a version 2 we should handle that here */ @@ -1175,6 +1173,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { u32 seq; void *data; + int data_len; int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; @@ -1188,6 +1187,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) seq = nlh->nlmsg_seq; data = nlmsg_data(nlh); + data_len = nlmsg_len(nlh); switch (msg_type) { case AUDIT_GET: { @@ -1211,7 +1211,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct audit_status s; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ - memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); + memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); if (s.mask & AUDIT_STATUS_ENABLED) { err = audit_set_enabled(s.enabled); if (err < 0) @@ -1315,7 +1315,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; break; case AUDIT_SET_FEATURE: - err = audit_set_feature(skb); + if (data_len < sizeof(struct audit_features)) + return -EINVAL; + err = audit_set_feature(data); if (err) return err; break; @@ -1327,6 +1329,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) err = audit_filter(msg_type, AUDIT_FILTER_USER); if (err == 1) { /* match or error */ + char *str = data; + err = 0; if (msg_type == AUDIT_USER_TTY) { err = tty_audit_push(); @@ -1334,26 +1338,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; } audit_log_user_recv_msg(&ab, msg_type); - if (msg_type != AUDIT_USER_TTY) + if (msg_type != AUDIT_USER_TTY) { + /* ensure NULL termination */ + str[data_len - 1] = '\0'; audit_log_format(ab, " msg='%.*s'", AUDIT_MESSAGE_TEXT_MAX, - (char *)data); - else { - int size; - + str); + } else { audit_log_format(ab, " data="); - size = nlmsg_len(nlh); - if (size > 0 && - ((unsigned char *)data)[size - 1] == '\0') - size--; - audit_log_n_untrustedstring(ab, data, size); + if (data_len > 0 && str[data_len - 1] == '\0') + data_len--; + audit_log_n_untrustedstring(ab, str, data_len); } audit_log_end(ab); } break; case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: - if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) + if (data_len < sizeof(struct audit_rule_data)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { audit_log_common_recv_msg(audit_context(), &ab, @@ -1365,7 +1367,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_log_end(ab); return -EPERM; } - err = audit_rule_change(msg_type, seq, data, nlmsg_len(nlh)); + err = audit_rule_change(msg_type, seq, data, data_len); break; case AUDIT_LIST_RULES: err = audit_list_rules_send(skb, seq); @@ -1380,7 +1382,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_MAKE_EQUIV: { void *bufp = data; u32 sizes[2]; - size_t msglen = nlmsg_len(nlh); + size_t msglen = data_len; char *old, *new; err = -EINVAL; @@ -1456,7 +1458,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ - memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); + memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); /* check if new data is valid */ if ((s.enabled != 0 && s.enabled != 1) || (s.log_passwd != 0 && s.log_passwd != 1)) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b0126e9c0743..026e34da4ace 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -456,6 +456,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, bufp = data->buf; for (i = 0; i < data->field_count; i++) { struct audit_field *f = &entry->rule.fields[i]; + u32 f_val; err = -EINVAL; @@ -464,12 +465,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; f->type = data->fields[i]; - f->val = data->values[i]; + f_val = data->values[i]; /* Support legacy tests for a valid loginuid */ - if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { + if ((f->type == AUDIT_LOGINUID) && (f_val == AUDIT_UID_UNSET)) { f->type = AUDIT_LOGINUID_SET; - f->val = 0; + f_val = 0; entry->rule.pflags |= AUDIT_LOGINUID_LEGACY; } @@ -485,7 +486,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_SUID: case AUDIT_FSUID: case AUDIT_OBJ_UID: - f->uid = make_kuid(current_user_ns(), f->val); + f->uid = make_kuid(current_user_ns(), f_val); if (!uid_valid(f->uid)) goto exit_free; break; @@ -494,11 +495,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_SGID: case AUDIT_FSGID: case AUDIT_OBJ_GID: - f->gid = make_kgid(current_user_ns(), f->val); + f->gid = make_kgid(current_user_ns(), f_val); if (!gid_valid(f->gid)) goto exit_free; break; case AUDIT_ARCH: + f->val = f_val; entry->rule.arch_f = f; break; case AUDIT_SUBJ_USER: @@ -511,11 +513,13 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; - + } + entry->rule.buflen += f_val; + f->lsm_str = str; err = security_audit_rule_init(f->type, f->op, str, (void **)&f->lsm_rule); /* Keep currently invalid fields around in case they @@ -524,68 +528,71 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, pr_warn("audit rule for LSM \'%s\' is invalid\n", str); err = 0; - } - if (err) { - kfree(str); + } else if (err) goto exit_free; - } else - f->lsm_str = str; break; case AUDIT_WATCH: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; - - err = audit_to_watch(&entry->rule, str, f->val, f->op); + } + err = audit_to_watch(&entry->rule, str, f_val, f->op); if (err) { kfree(str); goto exit_free; } + entry->rule.buflen += f_val; break; case AUDIT_DIR: - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; - + } err = audit_make_tree(&entry->rule, str, f->op); kfree(str); if (err) goto exit_free; + entry->rule.buflen += f_val; break; case AUDIT_INODE: + f->val = f_val; err = audit_to_inode(&entry->rule, f); if (err) goto exit_free; break; case AUDIT_FILTERKEY: - if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) + if (entry->rule.filterkey || f_val > AUDIT_MAX_KEY_LEN) goto exit_free; - str = audit_unpack_string(&bufp, &remain, f->val); - if (IS_ERR(str)) + str = audit_unpack_string(&bufp, &remain, f_val); + if (IS_ERR(str)) { + err = PTR_ERR(str); goto exit_free; - entry->rule.buflen += f->val; + } + entry->rule.buflen += f_val; entry->rule.filterkey = str; break; case AUDIT_EXE: - if (entry->rule.exe || f->val > PATH_MAX) + if (entry->rule.exe || f_val > PATH_MAX) goto exit_free; - str = audit_unpack_string(&bufp, &remain, f->val); + str = audit_unpack_string(&bufp, &remain, f_val); if (IS_ERR(str)) { err = PTR_ERR(str); goto exit_free; } - entry->rule.buflen += f->val; - - audit_mark = audit_alloc_mark(&entry->rule, str, f->val); + audit_mark = audit_alloc_mark(&entry->rule, str, f_val); if (IS_ERR(audit_mark)) { kfree(str); err = PTR_ERR(audit_mark); goto exit_free; } + entry->rule.buflen += f_val; entry->rule.exe = audit_mark; break; + default: + f->val = f_val; + break; } } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 805c43b083e9..787140095e58 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4142,9 +4142,9 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, * EFAULT - verifier bug * 0 - 99% match. The last 1% is validated by the verifier. */ -int btf_check_func_type_match(struct bpf_verifier_log *log, - struct btf *btf1, const struct btf_type *t1, - struct btf *btf2, const struct btf_type *t2) +static int btf_check_func_type_match(struct bpf_verifier_log *log, + struct btf *btf1, const struct btf_type *t1, + struct btf *btf2, const struct btf_type *t2) { const struct btf_param *args1, *args2; const char *fn1, *fn2, *s1, *s2; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 2d182c4ee9d9..a1468e3f5af2 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -56,6 +56,7 @@ struct htab_elem { union { struct bpf_htab *htab; struct pcpu_freelist_node fnode; + struct htab_elem *batch_flink; }; }; }; @@ -126,6 +127,17 @@ free_elems: bpf_map_area_free(htab->elems); } +/* The LRU list has a lock (lru_lock). Each htab bucket has a lock + * (bucket_lock). If both locks need to be acquired together, the lock + * order is always lru_lock -> bucket_lock and this only happens in + * bpf_lru_list.c logic. For example, certain code path of + * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(), + * will acquire lru_lock first followed by acquiring bucket_lock. + * + * In hashtab.c, to avoid deadlock, lock acquisition of + * bucket_lock followed by lru_lock is not allowed. In such cases, + * bucket_lock needs to be released first before acquiring lru_lock. + */ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, u32 hash) { @@ -1256,10 +1268,12 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, void __user *ukeys = u64_to_user_ptr(attr->batch.keys); void *ubatch = u64_to_user_ptr(attr->batch.in_batch); u32 batch, max_count, size, bucket_size; + struct htab_elem *node_to_free = NULL; u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; struct hlist_nulls_node *n; - unsigned long flags; + unsigned long flags = 0; + bool locked = false; struct htab_elem *l; struct bucket *b; int ret = 0; @@ -1319,15 +1333,25 @@ again_nocopy: dst_val = values; b = &htab->buckets[batch]; head = &b->head; - raw_spin_lock_irqsave(&b->lock, flags); + /* do not grab the lock unless need it (bucket_cnt > 0). */ + if (locked) + raw_spin_lock_irqsave(&b->lock, flags); bucket_cnt = 0; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) bucket_cnt++; + if (bucket_cnt && !locked) { + locked = true; + goto again_nocopy; + } + if (bucket_cnt > (max_count - total)) { if (total == 0) ret = -ENOSPC; + /* Note that since bucket_cnt > 0 here, it is implicit + * that the locked was grabbed, so release it. + */ raw_spin_unlock_irqrestore(&b->lock, flags); rcu_read_unlock(); this_cpu_dec(bpf_prog_active); @@ -1337,6 +1361,9 @@ again_nocopy: if (bucket_cnt > bucket_size) { bucket_size = bucket_cnt; + /* Note that since bucket_cnt > 0 here, it is implicit + * that the locked was grabbed, so release it. + */ raw_spin_unlock_irqrestore(&b->lock, flags); rcu_read_unlock(); this_cpu_dec(bpf_prog_active); @@ -1346,6 +1373,10 @@ again_nocopy: goto alloc; } + /* Next block is only safe to run if you have grabbed the lock */ + if (!locked) + goto next_batch; + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { memcpy(dst_key, l->key, key_size); @@ -1370,16 +1401,33 @@ again_nocopy: } if (do_delete) { hlist_nulls_del_rcu(&l->hash_node); - if (is_lru_map) - bpf_lru_push_free(&htab->lru, &l->lru_node); - else + + /* bpf_lru_push_free() will acquire lru_lock, which + * may cause deadlock. See comments in function + * prealloc_lru_pop(). Let us do bpf_lru_push_free() + * after releasing the bucket lock. + */ + if (is_lru_map) { + l->batch_flink = node_to_free; + node_to_free = l; + } else { free_htab_elem(htab, l); + } } dst_key += key_size; dst_val += value_size; } raw_spin_unlock_irqrestore(&b->lock, flags); + locked = false; + + while (node_to_free) { + l = node_to_free; + node_to_free = node_to_free->batch_flink; + bpf_lru_push_free(&htab->lru, &l->lru_node); + } + +next_batch: /* If we are not copying data, we can go to next bucket and avoid * unlocking the rcu. */ diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 2c5dc6541ece..bd09290e3648 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -321,7 +321,7 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, ulen = info->jited_prog_len; info->jited_prog_len = aux->offload->jited_len; - if (info->jited_prog_len & ulen) { + if (info->jited_prog_len && ulen) { uinsns = u64_to_user_ptr(info->jited_prog_insns); ulen = min_t(u32, info->jited_prog_len, ulen); if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index db552b9f9377..75f687301bbf 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5927,11 +5927,14 @@ void cgroup_post_fork(struct task_struct *child) spin_lock_irq(&css_set_lock); - WARN_ON_ONCE(!list_empty(&child->cg_list)); - cset = task_css_set(current); /* current is @child's parent */ - get_css_set(cset); - cset->nr_tasks++; - css_set_move_task(child, NULL, cset, false); + /* init tasks are special, only link regular threads */ + if (likely(child->pid)) { + WARN_ON_ONCE(!list_empty(&child->cg_list)); + cset = task_css_set(current); /* current is @child's parent */ + get_css_set(cset); + cset->nr_tasks++; + css_set_move_task(child, NULL, cset, false); + } /* * If the cgroup has to be frozen, the new task has too. Let's set diff --git a/kernel/compat.c b/kernel/compat.c index 95005f849c68..843dd17e6078 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -26,70 +26,6 @@ #include <linux/uaccess.h> -static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv) -{ - return (!access_ok(ctv, sizeof(*ctv)) || - __get_user(tv->tv_sec, &ctv->tv_sec) || - __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; -} - -static int __compat_put_timeval(const struct timeval *tv, struct old_timeval32 __user *ctv) -{ - return (!access_ok(ctv, sizeof(*ctv)) || - __put_user(tv->tv_sec, &ctv->tv_sec) || - __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; -} - -static int __compat_get_timespec(struct timespec *ts, const struct old_timespec32 __user *cts) -{ - return (!access_ok(cts, sizeof(*cts)) || - __get_user(ts->tv_sec, &cts->tv_sec) || - __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; -} - -static int __compat_put_timespec(const struct timespec *ts, struct old_timespec32 __user *cts) -{ - return (!access_ok(cts, sizeof(*cts)) || - __put_user(ts->tv_sec, &cts->tv_sec) || - __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; -} - -int compat_get_timeval(struct timeval *tv, const void __user *utv) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0; - else - return __compat_get_timeval(tv, utv); -} -EXPORT_SYMBOL_GPL(compat_get_timeval); - -int compat_put_timeval(const struct timeval *tv, void __user *utv) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0; - else - return __compat_put_timeval(tv, utv); -} -EXPORT_SYMBOL_GPL(compat_put_timeval); - -int compat_get_timespec(struct timespec *ts, const void __user *uts) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; - else - return __compat_get_timespec(ts, uts); -} -EXPORT_SYMBOL_GPL(compat_get_timespec); - -int compat_put_timespec(const struct timespec *ts, void __user *uts) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; - else - return __compat_put_timespec(ts, uts); -} -EXPORT_SYMBOL_GPL(compat_put_timespec); - #ifdef __ARCH_WANT_SYS_SIGPROCMASK /* diff --git a/kernel/cpu.c b/kernel/cpu.c index 9c706af713fb..221bf6a9e98a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -331,12 +331,12 @@ void lockdep_assert_cpus_held(void) static void lockdep_acquire_cpus_lock(void) { - rwsem_acquire(&cpu_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_); + rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_); } static void lockdep_release_cpus_lock(void) { - rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, _THIS_IP_); + rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_); } /* diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index daa4e6eefdde..8bc6f2d670f9 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -302,9 +302,16 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); phys_addr_t mask = align - 1; unsigned long node = rmem->fdt_node; + bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL); struct cma *cma; int err; + if (size_cmdline != -1 && default_cma) { + pr_info("Reserved memory: bypass %s node, using cmdline CMA params instead\n", + rmem->name); + return -EBUSY; + } + if (!of_get_flat_dt_prop(node, "reusable", NULL) || of_get_flat_dt_prop(node, "no-map", NULL)) return -EINVAL; @@ -322,7 +329,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) /* Architecture specific contiguous memory fixup. */ dma_contiguous_early_fixup(rmem->base, rmem->size); - if (of_get_flat_dt_prop(node, "linux,cma-default", NULL)) + if (default_cma) dma_contiguous_set_default(cma); rmem->ops = &rmem_cma_ops; diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 6af7ae83c4ad..ac7956c38f69 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -23,18 +23,6 @@ */ unsigned int zone_dma_bits __ro_after_init = 24; -static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size) -{ - if (!dev->dma_mask) { - dev_err_once(dev, "DMA map on device without dma_mask\n"); - } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_limit) { - dev_err_once(dev, - "overflow %pad+%zu of DMA mask %llx bus limit %llx\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - } - WARN_ON_ONCE(1); -} - static inline dma_addr_t phys_to_dma_direct(struct device *dev, phys_addr_t phys) { @@ -357,13 +345,6 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, EXPORT_SYMBOL(dma_direct_unmap_sg); #endif -static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr, - size_t size) -{ - return swiotlb_force != SWIOTLB_FORCE && - dma_capable(dev, dma_addr, size, true); -} - dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -371,9 +352,16 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dma_addr = phys_to_dma(dev, phys); - if (unlikely(!dma_direct_possible(dev, dma_addr, size)) && - !swiotlb_map(dev, &phys, &dma_addr, size, dir, attrs)) { - report_addr(dev, dma_addr, size); + if (unlikely(swiotlb_force == SWIOTLB_FORCE)) + return swiotlb_map(dev, phys, size, dir, attrs); + + if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + if (swiotlb_force != SWIOTLB_NO_FORCE) + return swiotlb_map(dev, phys, size, dir, attrs); + + dev_WARN_ONCE(dev, 1, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); return DMA_MAPPING_ERROR; } @@ -411,7 +399,10 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, dma_addr_t dma_addr = paddr; if (unlikely(!dma_capable(dev, dma_addr, size, false))) { - report_addr(dev, dma_addr, size); + dev_err_once(dev, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + WARN_ON_ONCE(1); return DMA_MAPPING_ERROR; } @@ -472,28 +463,26 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, } #endif /* CONFIG_MMU */ -/* - * Because 32-bit DMA masks are so common we expect every architecture to be - * able to satisfy them - either by not supporting more physical memory, or by - * providing a ZONE_DMA32. If neither is the case, the architecture needs to - * use an IOMMU instead of the direct mapping. - */ int dma_direct_supported(struct device *dev, u64 mask) { - u64 min_mask; + u64 min_mask = (max_pfn - 1) << PAGE_SHIFT; - if (IS_ENABLED(CONFIG_ZONE_DMA)) - min_mask = DMA_BIT_MASK(zone_dma_bits); - else - min_mask = DMA_BIT_MASK(32); - - min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT); + /* + * Because 32-bit DMA masks are so common we expect every architecture + * to be able to satisfy them - either by not supporting more physical + * memory, or by providing a ZONE_DMA32. If neither is the case, the + * architecture needs to use an IOMMU instead of the direct mapping. + */ + if (mask >= DMA_BIT_MASK(32)) + return 1; /* * This check needs to be against the actual bit mask value, so * use __phys_to_dma() here so that the SME encryption mask isn't * part of the check. */ + if (IS_ENABLED(CONFIG_ZONE_DMA)) + min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits)); return mask >= __phys_to_dma(dev, min_mask); } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 9280d6f8271e..c19379fabd20 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -22,6 +22,7 @@ #include <linux/cache.h> #include <linux/dma-direct.h> +#include <linux/dma-noncoherent.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/spinlock.h> @@ -656,35 +657,38 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, } /* - * Create a swiotlb mapping for the buffer at @phys, and in case of DMAing + * Create a swiotlb mapping for the buffer at @paddr, and in case of DMAing * to the device copy the data into it as well. */ -bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) +dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, + enum dma_data_direction dir, unsigned long attrs) { - trace_swiotlb_bounced(dev, *dma_addr, size, swiotlb_force); + phys_addr_t swiotlb_addr; + dma_addr_t dma_addr; - if (unlikely(swiotlb_force == SWIOTLB_NO_FORCE)) { - dev_warn_ratelimited(dev, - "Cannot do DMA to address %pa\n", phys); - return false; - } + trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size, + swiotlb_force); - /* Oh well, have to allocate and map a bounce buffer. */ - *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start), - *phys, size, size, dir, attrs); - if (*phys == (phys_addr_t)DMA_MAPPING_ERROR) - return false; + swiotlb_addr = swiotlb_tbl_map_single(dev, + __phys_to_dma(dev, io_tlb_start), + paddr, size, size, dir, attrs); + if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) + return DMA_MAPPING_ERROR; /* Ensure that the address returned is DMA'ble */ - *dma_addr = __phys_to_dma(dev, *phys); - if (unlikely(!dma_capable(dev, *dma_addr, size, true))) { - swiotlb_tbl_unmap_single(dev, *phys, size, size, dir, + dma_addr = __phys_to_dma(dev, swiotlb_addr); + if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); - return false; + dev_WARN_ONCE(dev, 1, + "swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + return DMA_MAPPING_ERROR; } - return true; + if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arch_sync_dma_for_device(swiotlb_addr, size, dir); + return dma_addr; } size_t swiotlb_max_mapping_size(struct device *dev) diff --git a/kernel/exit.c b/kernel/exit.c index 2833ffb0c211..f64a8f9d412a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -258,6 +258,7 @@ void rcuwait_wake_up(struct rcuwait *w) wake_up_process(task); rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(rcuwait_wake_up); /* * Determine if a process group is "orphaned", according to the POSIX diff --git a/kernel/futex.c b/kernel/futex.c index 7fdd2c949487..23bc277ef6f6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -135,8 +135,7 @@ * * Where (A) orders the waiters increment and the futex value read through * atomic operations (see hb_waiters_inc) and where (B) orders the write - * to futex and the waiters read -- this is done by the barriers for both - * shared and private futexes in get_futex_key_refs(). + * to futex and the waiters read (see hb_waiters_pending()). * * This yields the following case (where X:=waiters, Y:=futex): * @@ -331,17 +330,6 @@ static void compat_exit_robust_list(struct task_struct *curr); static inline void compat_exit_robust_list(struct task_struct *curr) { } #endif -static inline void futex_get_mm(union futex_key *key) -{ - mmgrab(key->private.mm); - /* - * Ensure futex_get_mm() implies a full barrier such that - * get_futex_key() implies a full barrier. This is relied upon - * as smp_mb(); (B), see the ordering comment above. - */ - smp_mb__after_atomic(); -} - /* * Reflects a new waiter being added to the waitqueue. */ @@ -370,6 +358,10 @@ static inline void hb_waiters_dec(struct futex_hash_bucket *hb) static inline int hb_waiters_pending(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP + /* + * Full barrier (B), see the ordering comment above. + */ + smp_mb(); return atomic_read(&hb->waiters); #else return 1; @@ -407,70 +399,6 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) && key1->both.offset == key2->both.offset); } -/* - * Take a reference to the resource addressed by a key. - * Can be called while holding spinlocks. - * - */ -static void get_futex_key_refs(union futex_key *key) -{ - if (!key->both.ptr) - return; - - /* - * On MMU less systems futexes are always "private" as there is no per - * process address space. We need the smp wmb nevertheless - yes, - * arch/blackfin has MMU less SMP ... - */ - if (!IS_ENABLED(CONFIG_MMU)) { - smp_mb(); /* explicit smp_mb(); (B) */ - return; - } - - switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { - case FUT_OFF_INODE: - ihold(key->shared.inode); /* implies smp_mb(); (B) */ - break; - case FUT_OFF_MMSHARED: - futex_get_mm(key); /* implies smp_mb(); (B) */ - break; - default: - /* - * Private futexes do not hold reference on an inode or - * mm, therefore the only purpose of calling get_futex_key_refs - * is because we need the barrier for the lockless waiter check. - */ - smp_mb(); /* explicit smp_mb(); (B) */ - } -} - -/* - * Drop a reference to the resource addressed by a key. - * The hash bucket spinlock must not be held. This is - * a no-op for private futexes, see comment in the get - * counterpart. - */ -static void drop_futex_key_refs(union futex_key *key) -{ - if (!key->both.ptr) { - /* If we're here then we tried to put a key we failed to get */ - WARN_ON_ONCE(1); - return; - } - - if (!IS_ENABLED(CONFIG_MMU)) - return; - - switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { - case FUT_OFF_INODE: - iput(key->shared.inode); - break; - case FUT_OFF_MMSHARED: - mmdrop(key->private.mm); - break; - } -} - enum futex_access { FUTEX_READ, FUTEX_WRITE @@ -505,6 +433,46 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, return timeout; } +/* + * Generate a machine wide unique identifier for this inode. + * + * This relies on u64 not wrapping in the life-time of the machine; which with + * 1ns resolution means almost 585 years. + * + * This further relies on the fact that a well formed program will not unmap + * the file while it has a (shared) futex waiting on it. This mapping will have + * a file reference which pins the mount and inode. + * + * If for some reason an inode gets evicted and read back in again, it will get + * a new sequence number and will _NOT_ match, even though it is the exact same + * file. + * + * It is important that match_futex() will never have a false-positive, esp. + * for PI futexes that can mess up the state. The above argues that false-negatives + * are only possible for malformed programs. + */ +static u64 get_inode_sequence_number(struct inode *inode) +{ + static atomic64_t i_seq; + u64 old; + + /* Does the inode already have a sequence number? */ + old = atomic64_read(&inode->i_sequence); + if (likely(old)) + return old; + + for (;;) { + u64 new = atomic64_add_return(1, &i_seq); + if (WARN_ON_ONCE(!new)) + continue; + + old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); + if (old) + return old; + return new; + } +} + /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex @@ -517,9 +485,15 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, * * The key words are stored in @key on success. * - * For shared mappings, it's (page->index, file_inode(vma->vm_file), - * offset_within_page). For private mappings, it's (uaddr, current->mm). - * We can usually work out the index without swapping in the page. + * For shared mappings (when @fshared), the key is: + * ( inode->i_sequence, page->index, offset_within_page ) + * [ also see get_inode_sequence_number() ] + * + * For private mappings (or when !@fshared), the key is: + * ( current->mm, address, 0 ) + * + * This allows (cross process, where applicable) identification of the futex + * without keeping the page pinned for the duration of the FUTEX_WAIT. * * lock_page() might sleep, the caller should not hold a spinlock. */ @@ -556,7 +530,6 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a if (!fshared) { key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); /* implies smp_mb(); (B) */ return 0; } @@ -659,8 +632,6 @@ again: key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); /* implies smp_mb(); (B) */ - } else { struct inode *inode; @@ -692,36 +663,8 @@ again: goto again; } - /* - * Take a reference unless it is about to be freed. Previously - * this reference was taken by ihold under the page lock - * pinning the inode in place so i_lock was unnecessary. The - * only way for this check to fail is if the inode was - * truncated in parallel which is almost certainly an - * application bug. In such a case, just retry. - * - * We are not calling into get_futex_key_refs() in file-backed - * cases, therefore a successful atomic_inc return below will - * guarantee that get_futex_key() will still imply smp_mb(); (B). - */ - if (!atomic_inc_not_zero(&inode->i_count)) { - rcu_read_unlock(); - put_page(page); - - goto again; - } - - /* Should be impossible but lets be paranoid for now */ - if (WARN_ON_ONCE(inode->i_mapping != mapping)) { - err = -EFAULT; - rcu_read_unlock(); - iput(inode); - - goto out; - } - key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = inode; + key->shared.i_seq = get_inode_sequence_number(inode); key->shared.pgoff = basepage_index(tail); rcu_read_unlock(); } @@ -733,7 +676,6 @@ out: static inline void put_futex_key(union futex_key *key) { - drop_futex_key_refs(key); } /** @@ -1867,7 +1809,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; } - get_futex_key_refs(key2); q->key = *key2; } @@ -1889,7 +1830,6 @@ static inline void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, struct futex_hash_bucket *hb) { - get_futex_key_refs(key); q->key = *key; __unqueue_futex(q); @@ -2000,7 +1940,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 *cmpval, int requeue_pi) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; - int drop_count = 0, task_count = 0, ret; + int task_count = 0, ret; struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; @@ -2121,7 +2061,6 @@ retry_private: */ if (ret > 0) { WARN_ON(pi_state); - drop_count++; task_count++; /* * If we acquired the lock, then the user space value @@ -2241,7 +2180,6 @@ retry_private: * doing so. */ requeue_pi_wake_futex(this, &key2, hb2); - drop_count++; continue; } else if (ret) { /* @@ -2262,7 +2200,6 @@ retry_private: } } requeue_futex(this, hb1, hb2, &key2); - drop_count++; } /* @@ -2277,15 +2214,6 @@ out_unlock: wake_up_q(&wake_q); hb_waiters_dec(hb2); - /* - * drop_futex_key_refs() must be called outside the spinlocks. During - * the requeue we moved futex_q's from the hash bucket at key1 to the - * one at key2 and updated their key pointer. We no longer need to - * hold the references to key1. - */ - while (--drop_count >= 0) - drop_futex_key_refs(&key1); - out_put_keys: put_futex_key(&key2); out_put_key1: @@ -2415,7 +2343,6 @@ retry: ret = 1; } - drop_futex_key_refs(&q->key); return ret; } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a4ace611f47f..16ee716e8291 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -145,6 +145,13 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags for_each_action_of_desc(desc, action) { irqreturn_t res; + /* + * If this IRQ would be threaded under force_irqthreads, mark it so. + */ + if (irq_settings_can_thread(desc) && + !(action->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))) + trace_hardirq_threaded(); + trace_irq_handler_entry(irq, action); res = action->handler(irq, action->dev_id); trace_irq_handler_exit(irq, action, res); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 3924fbe829d4..c9d8eb7f5c02 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -128,8 +128,6 @@ static inline void unregister_handler_proc(unsigned int irq, extern bool irq_can_set_affinity_usr(unsigned int irq); -extern int irq_select_affinity_usr(unsigned int irq); - extern void irq_set_thread_affinity(struct irq_desc *desc); extern int irq_do_set_affinity(struct irq_data *data, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3089a60ea8f9..7eee98c38f25 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -481,23 +481,9 @@ int irq_setup_affinity(struct irq_desc *desc) { return irq_select_affinity(irq_desc_get_irq(desc)); } -#endif +#endif /* CONFIG_AUTO_IRQ_AFFINITY */ +#endif /* CONFIG_SMP */ -/* - * Called when a bogus affinity is set via /proc/irq - */ -int irq_select_affinity_usr(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&desc->lock, flags); - ret = irq_setup_affinity(desc); - raw_spin_unlock_irqrestore(&desc->lock, flags); - return ret; -} -#endif /** * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 9e5783d98033..32c071d7bc03 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -111,6 +111,28 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v) return show_irq_affinity(AFFINITY_LIST, m); } +#ifndef CONFIG_AUTO_IRQ_AFFINITY +static inline int irq_select_affinity_usr(unsigned int irq) +{ + /* + * If the interrupt is started up already then this fails. The + * interrupt is assigned to an online CPU already. There is no + * point to move it around randomly. Tell user space that the + * selected mask is bogus. + * + * If not then any change to the affinity is pointless because the + * startup code invokes irq_setup_affinity() which will select + * a online CPU anyway. + */ + return -EINVAL; +} +#else +/* ALPHA magic affinity auto selector. Keep it for historical reasons. */ +static inline int irq_select_affinity_usr(unsigned int irq) +{ + return irq_select_affinity(irq); +} +#endif static ssize_t write_irq_affinity(int type, struct file *file, const char __user *buffer, size_t count, loff_t *pos) diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 828cc30774bc..48b5d1b6af4d 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -153,7 +153,9 @@ static void irq_work_run_list(struct llist_head *list) */ flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); + lockdep_irq_work_enter(work); work->func(work); + lockdep_irq_work_exit(work); /* * Clear the BUSY bit and return to the free state if * no-one else claimed it meanwhile. diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 32406ef0d6a2..0ebf9807d971 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -84,12 +84,39 @@ module_param(lock_stat, int, 0644); * to use a raw spinlock - we really dont want the spinlock * code to recurse back into the lockdep code... */ -static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t __lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +static struct task_struct *__owner; + +static inline void lockdep_lock(void) +{ + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + + arch_spin_lock(&__lock); + __owner = current; + current->lockdep_recursion++; +} + +static inline void lockdep_unlock(void) +{ + if (debug_locks && DEBUG_LOCKS_WARN_ON(__owner != current)) + return; + + current->lockdep_recursion--; + __owner = NULL; + arch_spin_unlock(&__lock); +} + +static inline bool lockdep_assert_locked(void) +{ + return DEBUG_LOCKS_WARN_ON(__owner != current); +} + static struct task_struct *lockdep_selftest_task_struct; + static int graph_lock(void) { - arch_spin_lock(&lockdep_lock); + lockdep_lock(); /* * Make sure that if another CPU detected a bug while * walking the graph we dont change it (while the other @@ -97,27 +124,15 @@ static int graph_lock(void) * dropped already) */ if (!debug_locks) { - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); return 0; } - /* prevent any recursions within lockdep from causing deadlocks */ - current->lockdep_recursion++; return 1; } -static inline int graph_unlock(void) +static inline void graph_unlock(void) { - if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) { - /* - * The lockdep graph lock isn't locked while we expect it to - * be, we're confused now, bye! - */ - return DEBUG_LOCKS_WARN_ON(1); - } - - current->lockdep_recursion--; - arch_spin_unlock(&lockdep_lock); - return 0; + lockdep_unlock(); } /* @@ -128,7 +143,7 @@ static inline int debug_locks_off_graph_unlock(void) { int ret = debug_locks_off(); - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); return ret; } @@ -147,6 +162,7 @@ static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES); #define KEYHASH_SIZE (1UL << KEYHASH_BITS) static struct hlist_head lock_keys_hash[KEYHASH_SIZE]; unsigned long nr_lock_classes; +unsigned long nr_zapped_classes; #ifndef CONFIG_DEBUG_LOCKDEP static #endif @@ -377,18 +393,31 @@ void lockdep_init_task(struct task_struct *task) task->lockdep_recursion = 0; } +/* + * Split the recrursion counter in two to readily detect 'off' vs recursion. + */ +#define LOCKDEP_RECURSION_BITS 16 +#define LOCKDEP_OFF (1U << LOCKDEP_RECURSION_BITS) +#define LOCKDEP_RECURSION_MASK (LOCKDEP_OFF - 1) + void lockdep_off(void) { - current->lockdep_recursion++; + current->lockdep_recursion += LOCKDEP_OFF; } EXPORT_SYMBOL(lockdep_off); void lockdep_on(void) { - current->lockdep_recursion--; + current->lockdep_recursion -= LOCKDEP_OFF; } EXPORT_SYMBOL(lockdep_on); +static inline void lockdep_recursion_finish(void) +{ + if (WARN_ON_ONCE(--current->lockdep_recursion)) + current->lockdep_recursion = 0; +} + void lockdep_set_selftest_task(struct task_struct *task) { lockdep_selftest_task_struct = task; @@ -575,6 +604,7 @@ static const char *usage_str[] = #include "lockdep_states.h" #undef LOCKDEP_STATE [LOCK_USED] = "INITIAL USE", + [LOCK_USAGE_STATES] = "IN-NMI", }; #endif @@ -653,7 +683,9 @@ static void print_lock_name(struct lock_class *class) printk(KERN_CONT " ("); __print_lock_name(class); - printk(KERN_CONT "){%s}", usage); + printk(KERN_CONT "){%s}-{%hd:%hd}", usage, + class->wait_type_outer ?: class->wait_type_inner, + class->wait_type_inner); } static void print_lockdep_cache(struct lockdep_map *lock) @@ -787,6 +819,7 @@ static int count_matching_names(struct lock_class *new_class) return count + 1; } +/* used from NMI context -- must be lockless */ static inline struct lock_class * look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) { @@ -1070,13 +1103,15 @@ static inline void check_data_structures(void) { } #endif /* CONFIG_DEBUG_LOCKDEP */ +static void init_chain_block_buckets(void); + /* * Initialize the lock_classes[] array elements, the free_lock_classes list * and also the delayed_free structure. */ static void init_data_structures_once(void) { - static bool ds_initialized, rcu_head_initialized; + static bool __read_mostly ds_initialized, rcu_head_initialized; int i; if (likely(rcu_head_initialized)) @@ -1100,6 +1135,7 @@ static void init_data_structures_once(void) INIT_LIST_HEAD(&lock_classes[i].locks_after); INIT_LIST_HEAD(&lock_classes[i].locks_before); } + init_chain_block_buckets(); } static inline struct hlist_head *keyhashentry(const struct lock_class_key *key) @@ -1230,6 +1266,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) WARN_ON_ONCE(!list_empty(&class->locks_before)); WARN_ON_ONCE(!list_empty(&class->locks_after)); class->name_version = count_matching_names(class); + class->wait_type_inner = lock->wait_type_inner; + class->wait_type_outer = lock->wait_type_outer; /* * We use RCU's safe list-add method to make * parallel walking of the hash-list safe: @@ -1469,6 +1507,8 @@ static int __bfs(struct lock_list *source_entry, struct circular_queue *cq = &lock_cq; int ret = 1; + lockdep_assert_locked(); + if (match(source_entry, data)) { *target_entry = source_entry; ret = 0; @@ -1491,8 +1531,6 @@ static int __bfs(struct lock_list *source_entry, head = get_dep_list(lock, offset); - DEBUG_LOCKS_WARN_ON(!irqs_disabled()); - list_for_each_entry_rcu(entry, head, entry) { if (!lock_accessed(entry)) { unsigned int cq_depth; @@ -1719,9 +1757,9 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) this.class = class; raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); + lockdep_lock(); ret = __lockdep_count_forward_deps(&this); - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); raw_local_irq_restore(flags); return ret; @@ -1746,9 +1784,9 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) this.class = class; raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); + lockdep_lock(); ret = __lockdep_count_backward_deps(&this); - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); raw_local_irq_restore(flags); return ret; @@ -2298,18 +2336,6 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, return 0; } -static void inc_chains(void) -{ - if (current->hardirq_context) - nr_hardirq_chains++; - else { - if (current->softirq_context) - nr_softirq_chains++; - else - nr_process_chains++; - } -} - #else static inline int check_irq_usage(struct task_struct *curr, @@ -2317,13 +2343,27 @@ static inline int check_irq_usage(struct task_struct *curr, { return 1; } +#endif /* CONFIG_TRACE_IRQFLAGS */ -static inline void inc_chains(void) +static void inc_chains(int irq_context) { - nr_process_chains++; + if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT) + nr_hardirq_chains++; + else if (irq_context & LOCK_CHAIN_SOFTIRQ_CONTEXT) + nr_softirq_chains++; + else + nr_process_chains++; } -#endif /* CONFIG_TRACE_IRQFLAGS */ +static void dec_chains(int irq_context) +{ + if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT) + nr_hardirq_chains--; + else if (irq_context & LOCK_CHAIN_SOFTIRQ_CONTEXT) + nr_softirq_chains--; + else + nr_process_chains--; +} static void print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv) @@ -2622,8 +2662,235 @@ out_bug: struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; static DECLARE_BITMAP(lock_chains_in_use, MAX_LOCKDEP_CHAINS); -int nr_chain_hlocks; static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; +unsigned long nr_zapped_lock_chains; +unsigned int nr_free_chain_hlocks; /* Free chain_hlocks in buckets */ +unsigned int nr_lost_chain_hlocks; /* Lost chain_hlocks */ +unsigned int nr_large_chain_blocks; /* size > MAX_CHAIN_BUCKETS */ + +/* + * The first 2 chain_hlocks entries in the chain block in the bucket + * list contains the following meta data: + * + * entry[0]: + * Bit 15 - always set to 1 (it is not a class index) + * Bits 0-14 - upper 15 bits of the next block index + * entry[1] - lower 16 bits of next block index + * + * A next block index of all 1 bits means it is the end of the list. + * + * On the unsized bucket (bucket-0), the 3rd and 4th entries contain + * the chain block size: + * + * entry[2] - upper 16 bits of the chain block size + * entry[3] - lower 16 bits of the chain block size + */ +#define MAX_CHAIN_BUCKETS 16 +#define CHAIN_BLK_FLAG (1U << 15) +#define CHAIN_BLK_LIST_END 0xFFFFU + +static int chain_block_buckets[MAX_CHAIN_BUCKETS]; + +static inline int size_to_bucket(int size) +{ + if (size > MAX_CHAIN_BUCKETS) + return 0; + + return size - 1; +} + +/* + * Iterate all the chain blocks in a bucket. + */ +#define for_each_chain_block(bucket, prev, curr) \ + for ((prev) = -1, (curr) = chain_block_buckets[bucket]; \ + (curr) >= 0; \ + (prev) = (curr), (curr) = chain_block_next(curr)) + +/* + * next block or -1 + */ +static inline int chain_block_next(int offset) +{ + int next = chain_hlocks[offset]; + + WARN_ON_ONCE(!(next & CHAIN_BLK_FLAG)); + + if (next == CHAIN_BLK_LIST_END) + return -1; + + next &= ~CHAIN_BLK_FLAG; + next <<= 16; + next |= chain_hlocks[offset + 1]; + + return next; +} + +/* + * bucket-0 only + */ +static inline int chain_block_size(int offset) +{ + return (chain_hlocks[offset + 2] << 16) | chain_hlocks[offset + 3]; +} + +static inline void init_chain_block(int offset, int next, int bucket, int size) +{ + chain_hlocks[offset] = (next >> 16) | CHAIN_BLK_FLAG; + chain_hlocks[offset + 1] = (u16)next; + + if (size && !bucket) { + chain_hlocks[offset + 2] = size >> 16; + chain_hlocks[offset + 3] = (u16)size; + } +} + +static inline void add_chain_block(int offset, int size) +{ + int bucket = size_to_bucket(size); + int next = chain_block_buckets[bucket]; + int prev, curr; + + if (unlikely(size < 2)) { + /* + * We can't store single entries on the freelist. Leak them. + * + * One possible way out would be to uniquely mark them, other + * than with CHAIN_BLK_FLAG, such that we can recover them when + * the block before it is re-added. + */ + if (size) + nr_lost_chain_hlocks++; + return; + } + + nr_free_chain_hlocks += size; + if (!bucket) { + nr_large_chain_blocks++; + + /* + * Variable sized, sort large to small. + */ + for_each_chain_block(0, prev, curr) { + if (size >= chain_block_size(curr)) + break; + } + init_chain_block(offset, curr, 0, size); + if (prev < 0) + chain_block_buckets[0] = offset; + else + init_chain_block(prev, offset, 0, 0); + return; + } + /* + * Fixed size, add to head. + */ + init_chain_block(offset, next, bucket, size); + chain_block_buckets[bucket] = offset; +} + +/* + * Only the first block in the list can be deleted. + * + * For the variable size bucket[0], the first block (the largest one) is + * returned, broken up and put back into the pool. So if a chain block of + * length > MAX_CHAIN_BUCKETS is ever used and zapped, it will just be + * queued up after the primordial chain block and never be used until the + * hlock entries in the primordial chain block is almost used up. That + * causes fragmentation and reduce allocation efficiency. That can be + * monitored by looking at the "large chain blocks" number in lockdep_stats. + */ +static inline void del_chain_block(int bucket, int size, int next) +{ + nr_free_chain_hlocks -= size; + chain_block_buckets[bucket] = next; + + if (!bucket) + nr_large_chain_blocks--; +} + +static void init_chain_block_buckets(void) +{ + int i; + + for (i = 0; i < MAX_CHAIN_BUCKETS; i++) + chain_block_buckets[i] = -1; + + add_chain_block(0, ARRAY_SIZE(chain_hlocks)); +} + +/* + * Return offset of a chain block of the right size or -1 if not found. + * + * Fairly simple worst-fit allocator with the addition of a number of size + * specific free lists. + */ +static int alloc_chain_hlocks(int req) +{ + int bucket, curr, size; + + /* + * We rely on the MSB to act as an escape bit to denote freelist + * pointers. Make sure this bit isn't set in 'normal' class_idx usage. + */ + BUILD_BUG_ON((MAX_LOCKDEP_KEYS-1) & CHAIN_BLK_FLAG); + + init_data_structures_once(); + + if (nr_free_chain_hlocks < req) + return -1; + + /* + * We require a minimum of 2 (u16) entries to encode a freelist + * 'pointer'. + */ + req = max(req, 2); + bucket = size_to_bucket(req); + curr = chain_block_buckets[bucket]; + + if (bucket) { + if (curr >= 0) { + del_chain_block(bucket, req, chain_block_next(curr)); + return curr; + } + /* Try bucket 0 */ + curr = chain_block_buckets[0]; + } + + /* + * The variable sized freelist is sorted by size; the first entry is + * the largest. Use it if it fits. + */ + if (curr >= 0) { + size = chain_block_size(curr); + if (likely(size >= req)) { + del_chain_block(0, size, chain_block_next(curr)); + add_chain_block(curr + req, size - req); + return curr; + } + } + + /* + * Last resort, split a block in a larger sized bucket. + */ + for (size = MAX_CHAIN_BUCKETS; size > req; size--) { + bucket = size_to_bucket(size); + curr = chain_block_buckets[bucket]; + if (curr < 0) + continue; + + del_chain_block(bucket, size, chain_block_next(curr)); + add_chain_block(curr + req, size - req); + return curr; + } + + return -1; +} + +static inline void free_chain_hlocks(int base, int size) +{ + add_chain_block(base, max(size, 2)); +} struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) { @@ -2803,7 +3070,7 @@ static inline int add_chain_cache(struct task_struct *curr, * disabled to make this an IRQ-safe lock.. for recursion reasons * lockdep won't complain about its own locking errors. */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + if (lockdep_assert_locked()) return 0; chain = alloc_lock_chain(); @@ -2824,15 +3091,8 @@ static inline int add_chain_cache(struct task_struct *curr, BUILD_BUG_ON((1UL << 6) <= ARRAY_SIZE(curr->held_locks)); BUILD_BUG_ON((1UL << 8*sizeof(chain_hlocks[0])) <= ARRAY_SIZE(lock_classes)); - if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { - chain->base = nr_chain_hlocks; - for (j = 0; j < chain->depth - 1; j++, i++) { - int lock_id = curr->held_locks[i].class_idx; - chain_hlocks[chain->base + j] = lock_id; - } - chain_hlocks[chain->base + j] = class - lock_classes; - nr_chain_hlocks += chain->depth; - } else { + j = alloc_chain_hlocks(chain->depth); + if (j < 0) { if (!debug_locks_off_graph_unlock()) return 0; @@ -2841,9 +3101,16 @@ static inline int add_chain_cache(struct task_struct *curr, return 0; } + chain->base = j; + for (j = 0; j < chain->depth - 1; j++, i++) { + int lock_id = curr->held_locks[i].class_idx; + + chain_hlocks[chain->base + j] = lock_id; + } + chain_hlocks[chain->base + j] = class - lock_classes; hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); - inc_chains(); + inc_chains(chain->irq_context); return 1; } @@ -2987,6 +3254,8 @@ static inline int validate_chain(struct task_struct *curr, { return 1; } + +static void init_chain_block_buckets(void) { } #endif /* CONFIG_PROVE_LOCKING */ /* @@ -3429,9 +3698,9 @@ void lockdep_hardirqs_on(unsigned long ip) if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) return; - current->lockdep_recursion = 1; + current->lockdep_recursion++; __trace_hardirqs_on_caller(ip); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); } NOKPROBE_SYMBOL(lockdep_hardirqs_on); @@ -3487,7 +3756,7 @@ void trace_softirqs_on(unsigned long ip) return; } - current->lockdep_recursion = 1; + current->lockdep_recursion++; /* * We'll do an OFF -> ON transition: */ @@ -3502,7 +3771,7 @@ void trace_softirqs_on(unsigned long ip) */ if (curr->hardirqs_enabled) mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); } /* @@ -3596,7 +3865,8 @@ lock_used: static inline unsigned int task_irq_context(struct task_struct *task) { - return 2 * !!task->hardirq_context + !!task->softirq_context; + return LOCK_CHAIN_HARDIRQ_CONTEXT * !!task->hardirq_context + + LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context; } static int separate_irq_context(struct task_struct *curr, @@ -3682,6 +3952,113 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return ret; } +static int +print_lock_invalid_wait_context(struct task_struct *curr, + struct held_lock *hlock) +{ + if (!debug_locks_off()) + return 0; + if (debug_locks_silent) + return 0; + + pr_warn("\n"); + pr_warn("=============================\n"); + pr_warn("[ BUG: Invalid wait context ]\n"); + print_kernel_ident(); + pr_warn("-----------------------------\n"); + + pr_warn("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); + print_lock(hlock); + + pr_warn("other info that might help us debug this:\n"); + lockdep_print_held_locks(curr); + + pr_warn("stack backtrace:\n"); + dump_stack(); + + return 0; +} + +/* + * Verify the wait_type context. + * + * This check validates we takes locks in the right wait-type order; that is it + * ensures that we do not take mutexes inside spinlocks and do not attempt to + * acquire spinlocks inside raw_spinlocks and the sort. + * + * The entire thing is slightly more complex because of RCU, RCU is a lock that + * can be taken from (pretty much) any context but also has constraints. + * However when taken in a stricter environment the RCU lock does not loosen + * the constraints. + * + * Therefore we must look for the strictest environment in the lock stack and + * compare that to the lock we're trying to acquire. + */ +static int check_wait_context(struct task_struct *curr, struct held_lock *next) +{ + short next_inner = hlock_class(next)->wait_type_inner; + short next_outer = hlock_class(next)->wait_type_outer; + short curr_inner; + int depth; + + if (!curr->lockdep_depth || !next_inner || next->trylock) + return 0; + + if (!next_outer) + next_outer = next_inner; + + /* + * Find start of current irq_context.. + */ + for (depth = curr->lockdep_depth - 1; depth >= 0; depth--) { + struct held_lock *prev = curr->held_locks + depth; + if (prev->irq_context != next->irq_context) + break; + } + depth++; + + /* + * Set appropriate wait type for the context; for IRQs we have to take + * into account force_irqthread as that is implied by PREEMPT_RT. + */ + if (curr->hardirq_context) { + /* + * Check if force_irqthreads will run us threaded. + */ + if (curr->hardirq_threaded || curr->irq_config) + curr_inner = LD_WAIT_CONFIG; + else + curr_inner = LD_WAIT_SPIN; + } else if (curr->softirq_context) { + /* + * Softirqs are always threaded. + */ + curr_inner = LD_WAIT_CONFIG; + } else { + curr_inner = LD_WAIT_MAX; + } + + for (; depth < curr->lockdep_depth; depth++) { + struct held_lock *prev = curr->held_locks + depth; + short prev_inner = hlock_class(prev)->wait_type_inner; + + if (prev_inner) { + /* + * We can have a bigger inner than a previous one + * when outer is smaller than inner, as with RCU. + * + * Also due to trylocks. + */ + curr_inner = min(curr_inner, prev_inner); + } + } + + if (next_outer > curr_inner) + return print_lock_invalid_wait_context(curr, next); + + return 0; +} + #else /* CONFIG_PROVE_LOCKING */ static inline int @@ -3701,13 +4078,20 @@ static inline int separate_irq_context(struct task_struct *curr, return 0; } +static inline int check_wait_context(struct task_struct *curr, + struct held_lock *next) +{ + return 0; +} + #endif /* CONFIG_PROVE_LOCKING */ /* * Initialize a lock instance's lock-class mapping info: */ -void lockdep_init_map(struct lockdep_map *lock, const char *name, - struct lock_class_key *key, int subclass) +void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, int subclass, + short inner, short outer) { int i; @@ -3728,6 +4112,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, lock->name = name; + lock->wait_type_outer = outer; + lock->wait_type_inner = inner; + /* * No key, no joy, we need to hash something. */ @@ -3755,13 +4142,13 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, return; raw_local_irq_save(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; register_lock_class(lock, subclass, 1); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } } -EXPORT_SYMBOL_GPL(lockdep_init_map); +EXPORT_SYMBOL_GPL(lockdep_init_map_waits); struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); @@ -3862,7 +4249,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, class_idx = class - lock_classes; - if (depth) { + if (depth) { /* we're holding locks */ hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { if (!references) @@ -3904,6 +4291,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, #endif hlock->pin_count = pin_count; + if (check_wait_context(curr, hlock)) + return 0; + /* Initialize the lock usage bit */ if (!mark_usage(curr, hlock, check)) return 0; @@ -4139,7 +4529,9 @@ __lock_set_class(struct lockdep_map *lock, const char *name, return 0; } - lockdep_init_map(lock, name, key, 0); + lockdep_init_map_waits(lock, name, key, 0, + lock->wait_type_inner, + lock->wait_type_outer); class = register_lock_class(lock, subclass, 0); hlock->class_idx = class - lock_classes; @@ -4437,11 +4829,11 @@ void lock_set_class(struct lockdep_map *lock, const char *name, return; raw_local_irq_save(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; check_flags(flags); if (__lock_set_class(lock, name, key, subclass, ip)) check_chain_key(current); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_set_class); @@ -4454,15 +4846,45 @@ void lock_downgrade(struct lockdep_map *lock, unsigned long ip) return; raw_local_irq_save(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; check_flags(flags); if (__lock_downgrade(lock, ip)) check_chain_key(current); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_downgrade); +/* NMI context !!! */ +static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock, int subclass) +{ +#ifdef CONFIG_PROVE_LOCKING + struct lock_class *class = look_up_lock_class(lock, subclass); + + /* if it doesn't have a class (yet), it certainly hasn't been used yet */ + if (!class) + return; + + if (!(class->usage_mask & LOCK_USED)) + return; + + hlock->class_idx = class - lock_classes; + + print_usage_bug(current, hlock, LOCK_USED, LOCK_USAGE_STATES); +#endif +} + +static bool lockdep_nmi(void) +{ + if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK) + return false; + + if (!in_nmi()) + return false; + + return true; +} + /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: @@ -4473,17 +4895,34 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, { unsigned long flags; - if (unlikely(current->lockdep_recursion)) + if (unlikely(current->lockdep_recursion)) { + /* XXX allow trylock from NMI ?!? */ + if (lockdep_nmi() && !trylock) { + struct held_lock hlock; + + hlock.acquire_ip = ip; + hlock.instance = lock; + hlock.nest_lock = nest_lock; + hlock.irq_context = 2; // XXX + hlock.trylock = trylock; + hlock.read = read; + hlock.check = check; + hlock.hardirqs_off = true; + hlock.references = 0; + + verify_lock_unused(lock, &hlock, subclass); + } return; + } raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); __lock_acquire(lock, subclass, trylock, read, check, irqs_disabled_flags(flags), nest_lock, ip, 0, 0); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_acquire); @@ -4497,11 +4936,11 @@ void lock_release(struct lockdep_map *lock, unsigned long ip) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; trace_lock_release(lock, ip); if (__lock_release(lock, ip)) check_chain_key(current); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_release); @@ -4517,9 +4956,9 @@ int lock_is_held_type(const struct lockdep_map *lock, int read) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; ret = __lock_is_held(lock, read); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); return ret; @@ -4538,9 +4977,9 @@ struct pin_cookie lock_pin_lock(struct lockdep_map *lock) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; cookie = __lock_pin_lock(lock); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); return cookie; @@ -4557,9 +4996,9 @@ void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; __lock_repin_lock(lock, cookie); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_repin_lock); @@ -4574,9 +5013,9 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; __lock_unpin_lock(lock, cookie); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_unpin_lock); @@ -4712,10 +5151,10 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; trace_lock_contended(lock, ip); __lock_contended(lock, ip); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_contended); @@ -4732,9 +5171,9 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip) raw_local_irq_save(flags); check_flags(flags); - current->lockdep_recursion = 1; + current->lockdep_recursion++; __lock_acquired(lock, ip); - current->lockdep_recursion = 0; + lockdep_recursion_finish(); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(lock_acquired); @@ -4768,57 +5207,33 @@ static void remove_class_from_lock_chain(struct pending_free *pf, struct lock_class *class) { #ifdef CONFIG_PROVE_LOCKING - struct lock_chain *new_chain; - u64 chain_key; int i; for (i = chain->base; i < chain->base + chain->depth; i++) { if (chain_hlocks[i] != class - lock_classes) continue; - /* The code below leaks one chain_hlock[] entry. */ - if (--chain->depth > 0) { - memmove(&chain_hlocks[i], &chain_hlocks[i + 1], - (chain->base + chain->depth - i) * - sizeof(chain_hlocks[0])); - } /* * Each lock class occurs at most once in a lock chain so once * we found a match we can break out of this loop. */ - goto recalc; + goto free_lock_chain; } /* Since the chain has not been modified, return. */ return; -recalc: - chain_key = INITIAL_CHAIN_KEY; - for (i = chain->base; i < chain->base + chain->depth; i++) - chain_key = iterate_chain_key(chain_key, chain_hlocks[i]); - if (chain->depth && chain->chain_key == chain_key) - return; +free_lock_chain: + free_chain_hlocks(chain->base, chain->depth); /* Overwrite the chain key for concurrent RCU readers. */ - WRITE_ONCE(chain->chain_key, chain_key); + WRITE_ONCE(chain->chain_key, INITIAL_CHAIN_KEY); + dec_chains(chain->irq_context); + /* * Note: calling hlist_del_rcu() from inside a * hlist_for_each_entry_rcu() loop is safe. */ hlist_del_rcu(&chain->entry); __set_bit(chain - lock_chains, pf->lock_chains_being_freed); - if (chain->depth == 0) - return; - /* - * If the modified lock chain matches an existing lock chain, drop - * the modified lock chain. - */ - if (lookup_chain_cache(chain_key)) - return; - new_chain = alloc_lock_chain(); - if (WARN_ON_ONCE(!new_chain)) { - debug_locks_off(); - return; - } - *new_chain = *chain; - hlist_add_head_rcu(&new_chain->entry, chainhashentry(chain_key)); + nr_zapped_lock_chains++; #endif } @@ -4874,6 +5289,7 @@ static void zap_class(struct pending_free *pf, struct lock_class *class) } remove_class_from_lock_chains(pf, class); + nr_zapped_classes++; } static void reinit_class(struct lock_class *class) @@ -4958,8 +5374,7 @@ static void free_zapped_rcu(struct rcu_head *ch) return; raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); - current->lockdep_recursion = 1; + lockdep_lock(); /* closed head */ pf = delayed_free.pf + (delayed_free.index ^ 1); @@ -4971,8 +5386,7 @@ static void free_zapped_rcu(struct rcu_head *ch) */ call_rcu_zapped(delayed_free.pf + delayed_free.index); - current->lockdep_recursion = 0; - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); raw_local_irq_restore(flags); } @@ -5017,13 +5431,11 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size) init_data_structures_once(); raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); - current->lockdep_recursion = 1; + lockdep_lock(); pf = get_pending_free(); __lockdep_free_key_range(pf, start, size); call_rcu_zapped(pf); - current->lockdep_recursion = 0; - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); raw_local_irq_restore(flags); /* @@ -5045,10 +5457,10 @@ static void lockdep_free_key_range_imm(void *start, unsigned long size) init_data_structures_once(); raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); + lockdep_lock(); __lockdep_free_key_range(pf, start, size); __free_zapped_classes(pf); - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); raw_local_irq_restore(flags); } @@ -5144,10 +5556,10 @@ static void lockdep_reset_lock_imm(struct lockdep_map *lock) unsigned long flags; raw_local_irq_save(flags); - arch_spin_lock(&lockdep_lock); + lockdep_lock(); __lockdep_reset_lock(pf, lock); __free_zapped_classes(pf); - arch_spin_unlock(&lockdep_lock); + lockdep_unlock(); raw_local_irq_restore(flags); } diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 18d85aebbb57..baca699b94e9 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -106,6 +106,12 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = #define STACK_TRACE_HASH_SIZE 16384 #endif +/* + * Bit definitions for lock_chain.irq_context + */ +#define LOCK_CHAIN_SOFTIRQ_CONTEXT (1 << 0) +#define LOCK_CHAIN_HARDIRQ_CONTEXT (1 << 1) + #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) @@ -124,17 +130,21 @@ extern const char *__get_key_name(const struct lockdep_subclass_key *key, struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); extern unsigned long nr_lock_classes; +extern unsigned long nr_zapped_classes; +extern unsigned long nr_zapped_lock_chains; extern unsigned long nr_list_entries; long lockdep_next_lockchain(long i); unsigned long lock_chain_count(void); -extern int nr_chain_hlocks; extern unsigned long nr_stack_trace_entries; extern unsigned int nr_hardirq_chains; extern unsigned int nr_softirq_chains; extern unsigned int nr_process_chains; -extern unsigned int max_lockdep_depth; +extern unsigned int nr_free_chain_hlocks; +extern unsigned int nr_lost_chain_hlocks; +extern unsigned int nr_large_chain_blocks; +extern unsigned int max_lockdep_depth; extern unsigned int max_bfs_queue_depth; #ifdef CONFIG_PROVE_LOCKING diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 231684cfc5ae..5525cd3ba0c8 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -128,15 +128,22 @@ static int lc_show(struct seq_file *m, void *v) struct lock_chain *chain = v; struct lock_class *class; int i; + static const char * const irq_strs[] = { + [0] = "0", + [LOCK_CHAIN_HARDIRQ_CONTEXT] = "hardirq", + [LOCK_CHAIN_SOFTIRQ_CONTEXT] = "softirq", + [LOCK_CHAIN_SOFTIRQ_CONTEXT| + LOCK_CHAIN_HARDIRQ_CONTEXT] = "hardirq|softirq", + }; if (v == SEQ_START_TOKEN) { - if (nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS) + if (!nr_free_chain_hlocks) seq_printf(m, "(buggered) "); seq_printf(m, "all lock chains:\n"); return 0; } - seq_printf(m, "irq_context: %d\n", chain->irq_context); + seq_printf(m, "irq_context: %s\n", irq_strs[chain->irq_context]); for (i = 0; i < chain->depth; i++) { class = lock_chain_get_class(chain, i); @@ -271,8 +278,12 @@ static int lockdep_stats_show(struct seq_file *m, void *v) #ifdef CONFIG_PROVE_LOCKING seq_printf(m, " dependency chains: %11lu [max: %lu]\n", lock_chain_count(), MAX_LOCKDEP_CHAINS); - seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n", - nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS); + seq_printf(m, " dependency chain hlocks used: %11lu [max: %lu]\n", + MAX_LOCKDEP_CHAIN_HLOCKS - + (nr_free_chain_hlocks + nr_lost_chain_hlocks), + MAX_LOCKDEP_CHAIN_HLOCKS); + seq_printf(m, " dependency chain hlocks lost: %11u\n", + nr_lost_chain_hlocks); #endif #ifdef CONFIG_TRACE_IRQFLAGS @@ -336,6 +347,18 @@ static int lockdep_stats_show(struct seq_file *m, void *v) seq_printf(m, " debug_locks: %11u\n", debug_locks); + /* + * Zappped classes and lockdep data buffers reuse statistics. + */ + seq_puts(m, "\n"); + seq_printf(m, " zapped classes: %11lu\n", + nr_zapped_classes); +#ifdef CONFIG_PROVE_LOCKING + seq_printf(m, " zapped lock chains: %11lu\n", + nr_zapped_lock_chains); + seq_printf(m, " large chain blocks: %11u\n", + nr_large_chain_blocks); +#endif return 0; } diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 771d4ca96dda..a7276aaf2abc 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -85,7 +85,7 @@ void debug_mutex_init(struct mutex *lock, const char *name, * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); #endif lock->magic = lock; } diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 364d38a0c444..a008a1ba21a7 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -1,27 +1,29 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/atomic.h> -#include <linux/rwsem.h> #include <linux/percpu.h> +#include <linux/wait.h> #include <linux/lockdep.h> #include <linux/percpu-rwsem.h> #include <linux/rcupdate.h> #include <linux/sched.h> +#include <linux/sched/task.h> #include <linux/errno.h> -#include "rwsem.h" - int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, - const char *name, struct lock_class_key *rwsem_key) + const char *name, struct lock_class_key *key) { sem->read_count = alloc_percpu(int); if (unlikely(!sem->read_count)) return -ENOMEM; - /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ rcu_sync_init(&sem->rss); - __init_rwsem(&sem->rw_sem, name, rwsem_key); rcuwait_init(&sem->writer); - sem->readers_block = 0; + init_waitqueue_head(&sem->waiters); + atomic_set(&sem->block, 0); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map(&sem->dep_map, name, key, 0); +#endif return 0; } EXPORT_SYMBOL_GPL(__percpu_init_rwsem); @@ -41,73 +43,139 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *sem) } EXPORT_SYMBOL_GPL(percpu_free_rwsem); -int __percpu_down_read(struct percpu_rw_semaphore *sem, int try) +static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem) { + __this_cpu_inc(*sem->read_count); + /* * Due to having preemption disabled the decrement happens on * the same CPU as the increment, avoiding the * increment-on-one-CPU-and-decrement-on-another problem. * - * If the reader misses the writer's assignment of readers_block, then - * the writer is guaranteed to see the reader's increment. + * If the reader misses the writer's assignment of sem->block, then the + * writer is guaranteed to see the reader's increment. * * Conversely, any readers that increment their sem->read_count after - * the writer looks are guaranteed to see the readers_block value, - * which in turn means that they are guaranteed to immediately - * decrement their sem->read_count, so that it doesn't matter that the - * writer missed them. + * the writer looks are guaranteed to see the sem->block value, which + * in turn means that they are guaranteed to immediately decrement + * their sem->read_count, so that it doesn't matter that the writer + * missed them. */ smp_mb(); /* A matches D */ /* - * If !readers_block the critical section starts here, matched by the + * If !sem->block the critical section starts here, matched by the * release in percpu_up_write(). */ - if (likely(!smp_load_acquire(&sem->readers_block))) + if (likely(!atomic_read_acquire(&sem->block))) + return true; + + __this_cpu_dec(*sem->read_count); + + /* Prod writer to re-evaluate readers_active_check() */ + rcuwait_wake_up(&sem->writer); + + return false; +} + +static inline bool __percpu_down_write_trylock(struct percpu_rw_semaphore *sem) +{ + if (atomic_read(&sem->block)) + return false; + + return atomic_xchg(&sem->block, 1) == 0; +} + +static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader) +{ + if (reader) { + bool ret; + + preempt_disable(); + ret = __percpu_down_read_trylock(sem); + preempt_enable(); + + return ret; + } + return __percpu_down_write_trylock(sem); +} + +/* + * The return value of wait_queue_entry::func means: + * + * <0 - error, wakeup is terminated and the error is returned + * 0 - no wakeup, a next waiter is tried + * >0 - woken, if EXCLUSIVE, counted towards @nr_exclusive. + * + * We use EXCLUSIVE for both readers and writers to preserve FIFO order, + * and play games with the return value to allow waking multiple readers. + * + * Specifically, we wake readers until we've woken a single writer, or until a + * trylock fails. + */ +static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry, + unsigned int mode, int wake_flags, + void *key) +{ + struct task_struct *p = get_task_struct(wq_entry->private); + bool reader = wq_entry->flags & WQ_FLAG_CUSTOM; + struct percpu_rw_semaphore *sem = key; + + /* concurrent against percpu_down_write(), can get stolen */ + if (!__percpu_rwsem_trylock(sem, reader)) return 1; - /* - * Per the above comment; we still have preemption disabled and - * will thus decrement on the same CPU as we incremented. - */ - __percpu_up_read(sem); + list_del_init(&wq_entry->entry); + smp_store_release(&wq_entry->private, NULL); - if (try) - return 0; + wake_up_process(p); + put_task_struct(p); - /* - * We either call schedule() in the wait, or we'll fall through - * and reschedule on the preempt_enable() in percpu_down_read(). - */ - preempt_enable_no_resched(); + return !reader; /* wake (readers until) 1 writer */ +} + +static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) +{ + DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function); + bool wait; + spin_lock_irq(&sem->waiters.lock); /* - * Avoid lockdep for the down/up_read() we already have them. + * Serialize against the wakeup in percpu_up_write(), if we fail + * the trylock, the wakeup must see us on the list. */ - __down_read(&sem->rw_sem); - this_cpu_inc(*sem->read_count); - __up_read(&sem->rw_sem); + wait = !__percpu_rwsem_trylock(sem, reader); + if (wait) { + wq_entry.flags |= WQ_FLAG_EXCLUSIVE | reader * WQ_FLAG_CUSTOM; + __add_wait_queue_entry_tail(&sem->waiters, &wq_entry); + } + spin_unlock_irq(&sem->waiters.lock); - preempt_disable(); - return 1; + while (wait) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!smp_load_acquire(&wq_entry.private)) + break; + schedule(); + } + __set_current_state(TASK_RUNNING); } -EXPORT_SYMBOL_GPL(__percpu_down_read); -void __percpu_up_read(struct percpu_rw_semaphore *sem) +bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) { - smp_mb(); /* B matches C */ - /* - * In other words, if they see our decrement (presumably to aggregate - * zero, as that is the only time it matters) they will also see our - * critical section. - */ - __this_cpu_dec(*sem->read_count); + if (__percpu_down_read_trylock(sem)) + return true; - /* Prod writer to recheck readers_active */ - rcuwait_wake_up(&sem->writer); + if (try) + return false; + + preempt_enable(); + percpu_rwsem_wait(sem, /* .reader = */ true); + preempt_disable(); + + return true; } -EXPORT_SYMBOL_GPL(__percpu_up_read); +EXPORT_SYMBOL_GPL(__percpu_down_read); #define per_cpu_sum(var) \ ({ \ @@ -124,6 +192,8 @@ EXPORT_SYMBOL_GPL(__percpu_up_read); * zero. If this sum is zero, then it is stable due to the fact that if any * newly arriving readers increment a given counter, they will immediately * decrement that same counter. + * + * Assumes sem->block is set. */ static bool readers_active_check(struct percpu_rw_semaphore *sem) { @@ -142,32 +212,36 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem) void percpu_down_write(struct percpu_rw_semaphore *sem) { + might_sleep(); + rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + /* Notify readers to take the slow path. */ rcu_sync_enter(&sem->rss); - down_write(&sem->rw_sem); - /* - * Notify new readers to block; up until now, and thus throughout the - * longish rcu_sync_enter() above, new readers could still come in. + * Try set sem->block; this provides writer-writer exclusion. + * Having sem->block set makes new readers block. */ - WRITE_ONCE(sem->readers_block, 1); + if (!__percpu_down_write_trylock(sem)) + percpu_rwsem_wait(sem, /* .reader = */ false); - smp_mb(); /* D matches A */ + /* smp_mb() implied by __percpu_down_write_trylock() on success -- D matches A */ /* - * If they don't see our writer of readers_block, then we are - * guaranteed to see their sem->read_count increment, and therefore - * will wait for them. + * If they don't see our store of sem->block, then we are guaranteed to + * see their sem->read_count increment, and therefore will wait for + * them. */ - /* Wait for all now active readers to complete. */ - rcuwait_wait_event(&sem->writer, readers_active_check(sem)); + /* Wait for all active readers to complete. */ + rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL_GPL(percpu_down_write); void percpu_up_write(struct percpu_rw_semaphore *sem) { + rwsem_release(&sem->dep_map, _RET_IP_); + /* * Signal the writer is done, no fast path yet. * @@ -178,12 +252,12 @@ void percpu_up_write(struct percpu_rw_semaphore *sem) * Therefore we force it through the slow path which guarantees an * acquire and thereby guarantees the critical section's consistency. */ - smp_store_release(&sem->readers_block, 0); + atomic_set_release(&sem->block, 0); /* - * Release the write lock, this will allow readers back in the game. + * Prod any pending reader/writer to make progress. */ - up_write(&sem->rw_sem); + __wake_up(&sem->waiters, TASK_NORMAL, 1, sem); /* * Once this completes (at least one RCU-sched grace period hence) the diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 0d9b6be9ecc8..f11b9bd3431d 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -28,7 +28,6 @@ #include <linux/rwsem.h> #include <linux/atomic.h> -#include "rwsem.h" #include "lock_events.h" /* @@ -329,7 +328,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, * Make sure we are not reinitializing a held semaphore: */ debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); + lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP); #endif #ifdef CONFIG_DEBUG_RWSEMS sem->magic = sem; @@ -660,8 +659,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, unsigned long flags; bool ret = true; - BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE)); - if (need_resched()) { lockevent_inc(rwsem_opt_fail); return false; @@ -1338,7 +1335,7 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) /* * lock for reading */ -inline void __down_read(struct rw_semaphore *sem) +static inline void __down_read(struct rw_semaphore *sem) { if (!rwsem_read_trylock(sem)) { rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE); @@ -1426,7 +1423,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) /* * unlock after reading */ -inline void __up_read(struct rw_semaphore *sem) +static inline void __up_read(struct rw_semaphore *sem) { long tmp; diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 2534ce49f648..e69de29bb2d1 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef __INTERNAL_RWSEM_H -#define __INTERNAL_RWSEM_H -#include <linux/rwsem.h> - -extern void __down_read(struct rw_semaphore *sem); -extern void __up_read(struct rw_semaphore *sem); - -#endif /* __INTERNAL_RWSEM_H */ diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 472dd462a40c..b9d93087ee66 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -14,14 +14,14 @@ #include <linux/export.h> void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, - struct lock_class_key *key) + struct lock_class_key *key, short inner) { #ifdef CONFIG_DEBUG_LOCK_ALLOC /* * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, inner); #endif lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; lock->magic = SPINLOCK_MAGIC; @@ -39,7 +39,7 @@ void __rwlock_init(rwlock_t *lock, const char *name, * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_CONFIG); #endif lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED; lock->magic = RWLOCK_MAGIC; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index ddade80ad276..d82b7b88d616 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1681,7 +1681,7 @@ static unsigned long minimum_image_size(unsigned long saveable) * hibernation for allocations made while saving the image and for device * drivers, in case they need to allocate memory from their hibernation * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough - * estimate) and reserverd_size divided by PAGE_SIZE (which is tunable through + * estimate) and reserved_size divided by PAGE_SIZE (which is tunable through * /sys/power/reserved_size, respectively). To make this happen, we compute the * total number of available page frames and allocate at least * diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 2c47280fbfc7..8b1bb5ee7e5d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -131,11 +131,12 @@ static void s2idle_loop(void) * to avoid them upfront. */ for (;;) { - if (s2idle_ops && s2idle_ops->wake) - s2idle_ops->wake(); - - if (pm_wakeup_pending()) + if (s2idle_ops && s2idle_ops->wake) { + if (s2idle_ops->wake()) + break; + } else if (pm_wakeup_pending()) { break; + } pm_wakeup_clear(false); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d91c9156fab2..5066d1dd3077 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1113,6 +1113,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && (rnp->ffmask & rdp->grpmask)) { init_irq_work(&rdp->rcu_iw, rcu_iw_handler); + atomic_set(&rdp->rcu_iw.flags, IRQ_WORK_HARD_IRQ); rdp->rcu_iw_pending = true; rdp->rcu_iw_gp_seq = rnp->gp_seq; irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 6c4b862f57d6..8d3eb2fe20ae 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -227,18 +227,30 @@ core_initcall(rcu_set_runtime_mode); #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; -struct lockdep_map rcu_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); +struct lockdep_map rcu_lock_map = { + .name = "rcu_read_lock", + .key = &rcu_lock_key, + .wait_type_outer = LD_WAIT_FREE, + .wait_type_inner = LD_WAIT_CONFIG, /* XXX PREEMPT_RCU ? */ +}; EXPORT_SYMBOL_GPL(rcu_lock_map); static struct lock_class_key rcu_bh_lock_key; -struct lockdep_map rcu_bh_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key); +struct lockdep_map rcu_bh_lock_map = { + .name = "rcu_read_lock_bh", + .key = &rcu_bh_lock_key, + .wait_type_outer = LD_WAIT_FREE, + .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_LOCK also makes BH preemptible */ +}; EXPORT_SYMBOL_GPL(rcu_bh_lock_map); static struct lock_class_key rcu_sched_lock_key; -struct lockdep_map rcu_sched_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); +struct lockdep_map rcu_sched_lock_map = { + .name = "rcu_read_lock_sched", + .key = &rcu_sched_lock_key, + .wait_type_outer = LD_WAIT_FREE, + .wait_type_inner = LD_WAIT_SPIN, +}; EXPORT_SYMBOL_GPL(rcu_sched_lock_map); static struct lock_class_key rcu_callback_key; diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index a1ad5b7d5521..a778554f9dad 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -29,12 +29,12 @@ void complete(struct completion *x) { unsigned long flags; - spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); if (x->done != UINT_MAX) x->done++; - __wake_up_locked(&x->wait, TASK_NORMAL, 1); - spin_unlock_irqrestore(&x->wait.lock, flags); + swake_up_locked(&x->wait); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete); @@ -58,10 +58,12 @@ void complete_all(struct completion *x) { unsigned long flags; - spin_lock_irqsave(&x->wait.lock, flags); + lockdep_assert_RT_in_threaded_ctx(); + + raw_spin_lock_irqsave(&x->wait.lock, flags); x->done = UINT_MAX; - __wake_up_locked(&x->wait, TASK_NORMAL, 0); - spin_unlock_irqrestore(&x->wait.lock, flags); + swake_up_all_locked(&x->wait); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete_all); @@ -70,20 +72,20 @@ do_wait_for_common(struct completion *x, long (*action)(long), long timeout, int state) { if (!x->done) { - DECLARE_WAITQUEUE(wait, current); + DECLARE_SWAITQUEUE(wait); - __add_wait_queue_entry_tail_exclusive(&x->wait, &wait); do { if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } + __prepare_to_swait(&x->wait, &wait); __set_current_state(state); - spin_unlock_irq(&x->wait.lock); + raw_spin_unlock_irq(&x->wait.lock); timeout = action(timeout); - spin_lock_irq(&x->wait.lock); + raw_spin_lock_irq(&x->wait.lock); } while (!x->done && timeout); - __remove_wait_queue(&x->wait, &wait); + __finish_swait(&x->wait, &wait); if (!x->done) return timeout; } @@ -100,9 +102,9 @@ __wait_for_common(struct completion *x, complete_acquire(x); - spin_lock_irq(&x->wait.lock); + raw_spin_lock_irq(&x->wait.lock); timeout = do_wait_for_common(x, action, timeout, state); - spin_unlock_irq(&x->wait.lock); + raw_spin_unlock_irq(&x->wait.lock); complete_release(x); @@ -291,12 +293,12 @@ bool try_wait_for_completion(struct completion *x) if (!READ_ONCE(x->done)) return false; - spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = false; else if (x->done != UINT_MAX) x->done--; - spin_unlock_irqrestore(&x->wait.lock, flags); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(try_wait_for_completion); @@ -322,8 +324,8 @@ bool completion_done(struct completion *x) * otherwise we can end up freeing the completion before complete() * is done referencing it. */ - spin_lock_irqsave(&x->wait.lock, flags); - spin_unlock_irqrestore(&x->wait.lock, flags); + raw_spin_lock_irqsave(&x->wait.lock, flags); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); return true; } EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fc1dfc007604..1a9983da4408 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -552,27 +552,32 @@ void resched_cpu(int cpu) */ int get_nohz_timer_target(void) { - int i, cpu = smp_processor_id(); + int i, cpu = smp_processor_id(), default_cpu = -1; struct sched_domain *sd; - if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) - return cpu; + if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { + if (!idle_cpu(cpu)) + return cpu; + default_cpu = cpu; + } rcu_read_lock(); for_each_domain(cpu, sd) { - for_each_cpu(i, sched_domain_span(sd)) { + for_each_cpu_and(i, sched_domain_span(sd), + housekeeping_cpumask(HK_FLAG_TIMER)) { if (cpu == i) continue; - if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { + if (!idle_cpu(i)) { cpu = i; goto unlock; } } } - if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) - cpu = housekeeping_any_cpu(HK_FLAG_TIMER); + if (default_cpu == -1) + default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); + cpu = default_cpu; unlock: rcu_read_unlock(); return cpu; @@ -1442,17 +1447,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP -static inline bool is_per_cpu_kthread(struct task_struct *p) -{ - if (!(p->flags & PF_KTHREAD)) - return false; - - if (p->nr_cpus_allowed != 1) - return false; - - return true; -} - /* * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq(). @@ -3669,28 +3663,32 @@ static void sched_tick_remote(struct work_struct *work) * statistics and checks timeslices in a time-independent way, regardless * of when exactly it is running. */ - if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) + if (!tick_nohz_tick_stopped_cpu(cpu)) goto out_requeue; rq_lock_irq(rq, &rf); curr = rq->curr; - if (is_idle_task(curr) || cpu_is_offline(cpu)) + if (cpu_is_offline(cpu)) goto out_unlock; + curr = rq->curr; update_rq_clock(rq); - delta = rq_clock_task(rq) - curr->se.exec_start; - /* - * Make sure the next tick runs within a reasonable - * amount of time. - */ - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + if (!is_idle_task(curr)) { + /* + * Make sure the next tick runs within a reasonable + * amount of time. + */ + delta = rq_clock_task(rq) - curr->se.exec_start; + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + } curr->sched_class->task_tick(rq, curr, 0); + calc_load_nohz_remote(rq); out_unlock: rq_unlock_irq(rq, &rf); - out_requeue: + /* * Run the remote tick once per second (1Hz). This arbitrary * frequency is large enough to avoid overload but short enough @@ -7063,8 +7061,15 @@ void sched_move_task(struct task_struct *tsk) if (queued) enqueue_task(rq, tsk, queue_flags); - if (running) + if (running) { set_next_task(rq, tsk); + /* + * After changing group, the running task may have joined a + * throttled one but it's still the running task. Trigger a + * resched to make sure that task can still run. + */ + resched_curr(rq); + } task_rq_unlock(rq, tsk, &rf); } @@ -7260,7 +7265,7 @@ capacity_from_percent(char *buf) &req.percent); if (req.ret) return req; - if (req.percent > UCLAMP_PERCENT_SCALE) { + if ((u64)req.percent > UCLAMP_PERCENT_SCALE) { req.ret = -ERANGE; return req; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe4e0d775375..3c8a379c357e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3516,7 +3516,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) * attach_entity_load_avg - attach this entity to its cfs_rq load avg * @cfs_rq: cfs_rq to attach to * @se: sched_entity to attach - * @flags: migration hints * * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current. @@ -5912,6 +5911,20 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) (available_idle_cpu(prev) || sched_idle_cpu(prev))) return prev; + /* + * Allow a per-cpu kthread to stack with the wakee if the + * kworker thread and the tasks previous CPUs are the same. + * The assumption is that the wakee queued work for the + * per-cpu kthread that is now complete and the wakeup is + * essentially a sync wakeup. An obvious example of this + * pattern is IO completions. + */ + if (is_per_cpu_kthread(current) && + prev == smp_processor_id() && + this_rq()->nr_running <= 1) { + return prev; + } + /* Check a recently used CPU as a potential idle candidate: */ recent_used_cpu = p->recent_used_cpu; if (recent_used_cpu != prev && @@ -8658,10 +8671,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* * Try to use spare capacity of local group without overloading it or * emptying busiest. - * XXX Spreading tasks across NUMA nodes is not always the best policy - * and special care should be taken for SD_NUMA domain level before - * spreading the tasks. For now, load_balance() fully relies on - * NUMA_BALANCING and fbq_classify_group/rq to override the decision. */ if (local->group_type == group_has_spare) { if (busiest->group_type > group_fully_busy) { @@ -8701,16 +8710,37 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s env->migration_type = migrate_task; lsub_positive(&nr_diff, local->sum_nr_running); env->imbalance = nr_diff >> 1; - return; - } + } else { - /* - * If there is no overload, we just want to even the number of - * idle cpus. - */ - env->migration_type = migrate_task; - env->imbalance = max_t(long, 0, (local->idle_cpus - + /* + * If there is no overload, we just want to even the number of + * idle cpus. + */ + env->migration_type = migrate_task; + env->imbalance = max_t(long, 0, (local->idle_cpus - busiest->idle_cpus) >> 1); + } + + /* Consider allowing a small imbalance between NUMA groups */ + if (env->sd->flags & SD_NUMA) { + unsigned int imbalance_min; + + /* + * Compute an allowed imbalance based on a simple + * pair of communicating tasks that should remain + * local and ignore them. + * + * NOTE: Generally this would have been based on + * the domain size and this was evaluated. However, + * the benefit is similar across a range of workloads + * and machines but scaling by the domain size adds + * the risk that lower domains have to be rebalanced. + */ + imbalance_min = 2; + if (busiest->sum_nr_running <= imbalance_min) + env->imbalance = 0; + } + return; } diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 28a516575c18..de22da666ac7 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -231,16 +231,11 @@ static inline int calc_load_read_idx(void) return calc_load_idx & 1; } -void calc_load_nohz_start(void) +static void calc_load_nohz_fold(struct rq *rq) { - struct rq *this_rq = this_rq(); long delta; - /* - * We're going into NO_HZ mode, if there's any pending delta, fold it - * into the pending NO_HZ delta. - */ - delta = calc_load_fold_active(this_rq, 0); + delta = calc_load_fold_active(rq, 0); if (delta) { int idx = calc_load_write_idx(); @@ -248,6 +243,24 @@ void calc_load_nohz_start(void) } } +void calc_load_nohz_start(void) +{ + /* + * We're going into NO_HZ mode, if there's any pending delta, fold it + * into the pending NO_HZ delta. + */ + calc_load_nohz_fold(this_rq()); +} + +/* + * Keep track of the load for NOHZ_FULL, must be called between + * calc_load_nohz_{start,stop}(). + */ +void calc_load_nohz_remote(struct rq *rq) +{ + calc_load_nohz_fold(rq); +} + void calc_load_nohz_stop(void) { struct rq *this_rq = this_rq(); @@ -268,7 +281,7 @@ void calc_load_nohz_stop(void) this_rq->calc_load_update += LOAD_FREQ; } -static long calc_load_nohz_fold(void) +static long calc_load_nohz_read(void) { int idx = calc_load_read_idx(); long delta = 0; @@ -323,7 +336,7 @@ static void calc_global_nohz(void) } #else /* !CONFIG_NO_HZ_COMMON */ -static inline long calc_load_nohz_fold(void) { return 0; } +static inline long calc_load_nohz_read(void) { return 0; } static inline void calc_global_nohz(void) { } #endif /* CONFIG_NO_HZ_COMMON */ @@ -346,7 +359,7 @@ void calc_global_load(unsigned long ticks) /* * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs. */ - delta = calc_load_nohz_fold(); + delta = calc_load_nohz_read(); if (delta) atomic_long_add(delta, &calc_load_tasks); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ac4bd0ca11cc..028520702717 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1199,6 +1199,9 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; + if (!nbytes) + return -EINVAL; + buf_size = min(nbytes, sizeof(buf)); if (copy_from_user(buf, user_buf, buf_size)) return -EFAULT; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1a88dc8ad11b..fdc77e796324 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -896,7 +896,7 @@ struct rq { */ unsigned long nr_uninterruptible; - struct task_struct *curr; + struct task_struct __rcu *curr; struct task_struct *idle; struct task_struct *stop; unsigned long next_balance; @@ -2479,3 +2479,19 @@ static inline void membarrier_switch_mm(struct rq *rq, { } #endif + +#ifdef CONFIG_SMP +static inline bool is_per_cpu_kthread(struct task_struct *p) +{ + if (!(p->flags & PF_KTHREAD)) + return false; + + if (p->nr_cpus_allowed != 1) + return false; + + return true; +} +#endif + +void swake_up_all_locked(struct swait_queue_head *q); +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index e83a3f8449f6..e1c655f928c7 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -32,6 +32,19 @@ void swake_up_locked(struct swait_queue_head *q) } EXPORT_SYMBOL(swake_up_locked); +/* + * Wake up all waiters. This is an interface which is solely exposed for + * completions and not for general usage. + * + * It is intentionally different from swake_up_all() to allow usage from + * hard interrupt context and interrupt disabled regions. + */ +void swake_up_all_locked(struct swait_queue_head *q) +{ + while (!list_empty(&q->task_list)) + swake_up_locked(q); +} + void swake_up_one(struct swait_queue_head *q) { unsigned long flags; @@ -69,7 +82,7 @@ void swake_up_all(struct swait_queue_head *q) } EXPORT_SYMBOL(swake_up_all); -static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) { wait->task = current; if (list_empty(&wait->task_list)) diff --git a/kernel/signal.c b/kernel/signal.c index 9ad8dea93dbb..5b2396350dd1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -413,27 +413,32 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi { struct sigqueue *q = NULL; struct user_struct *user; + int sigpending; /* * Protect access to @t credentials. This can go away when all * callers hold rcu read lock. + * + * NOTE! A pending signal will hold on to the user refcount, + * and we get/put the refcount only when the sigpending count + * changes from/to zero. */ rcu_read_lock(); - user = get_uid(__task_cred(t)->user); - atomic_inc(&user->sigpending); + user = __task_cred(t)->user; + sigpending = atomic_inc_return(&user->sigpending); + if (sigpending == 1) + get_uid(user); rcu_read_unlock(); - if (override_rlimit || - atomic_read(&user->sigpending) <= - task_rlimit(t, RLIMIT_SIGPENDING)) { + if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { q = kmem_cache_alloc(sigqueue_cachep, flags); } else { print_dropped_signal(sig); } if (unlikely(q == NULL)) { - atomic_dec(&user->sigpending); - free_uid(user); + if (atomic_dec_and_test(&user->sigpending)) + free_uid(user); } else { INIT_LIST_HEAD(&q->list); q->flags = 0; @@ -447,8 +452,8 @@ static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) return; - atomic_dec(&q->user->sigpending); - free_uid(q->user); + if (atomic_dec_and_test(&q->user->sigpending)) + free_uid(q->user); kmem_cache_free(sigqueue_cachep, q); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d396aaaf19a3..ad5b88a53c5a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -805,15 +805,6 @@ static struct ctl_table kern_table[] = { .extra2 = &maxolduid, }, #ifdef CONFIG_S390 -#ifdef CONFIG_MATHEMU - { - .procname = "ieee_emulation_warnings", - .data = &sysctl_ieee_emulation_warnings, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif { .procname = "userprocess_debug", .data = &show_unhandled_signals, diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3a609e7344f3..8cce72501aea 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1404,7 +1404,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; base += hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; - timer->is_hard = !softtimer; + timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); } @@ -1514,7 +1514,11 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, */ raw_spin_unlock_irqrestore(&cpu_base->lock, flags); trace_hrtimer_expire_entry(timer, now); + lockdep_hrtimer_enter(timer); + restart = fn(timer); + + lockdep_hrtimer_exit(timer); trace_hrtimer_expire_exit(timer); raw_spin_lock_irq(&cpu_base->lock); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index d23b434c2ca7..eddcf4970444 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -58,7 +58,8 @@ static struct clocksource clocksource_jiffies = { .max_cycles = 10, }; -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock); +__cacheline_aligned_in_smp seqcount_t jiffies_seq; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) @@ -67,9 +68,9 @@ u64 get_jiffies_64(void) u64 ret; do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); ret = jiffies_64; - } while (read_seqretry(&jiffies_lock, seq)); + } while (read_seqcount_retry(&jiffies_seq, seq)); return ret; } EXPORT_SYMBOL(get_jiffies_64); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 8ff6da77a01f..2c48a7233b19 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1126,8 +1126,11 @@ void run_posix_cpu_timers(void) if (!fastpath_timer_check(tsk)) return; - if (!lock_task_sighand(tsk, &flags)) + lockdep_posixtimer_enter(); + if (!lock_task_sighand(tsk, &flags)) { + lockdep_posixtimer_exit(); return; + } /* * Here we take off tsk->signal->cpu_timers[N] and * tsk->cpu_timers[N] all the timers that are firing, and @@ -1169,6 +1172,7 @@ void run_posix_cpu_timers(void) cpu_timer_fire(timer); spin_unlock(&timer->it_lock); } + lockdep_posixtimer_exit(); } /* diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 7e5d3524e924..6c9c342dd0e5 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -84,13 +84,15 @@ int tick_is_oneshot_available(void) static void tick_periodic(int cpu) { if (tick_do_timer_cpu == cpu) { - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); /* Keep track of the next tick event */ tick_next_period = ktime_add(tick_next_period, tick_period); do_timer(1); - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); update_wall_time(); } @@ -162,9 +164,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) ktime_t next; do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); next = tick_next_period; - } while (read_seqretry(&jiffies_lock, seq)); + } while (read_seqcount_retry(&jiffies_seq, seq)); clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a792d21cac64..3e2dc9b8858c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -65,7 +65,8 @@ static void tick_do_update_jiffies64(ktime_t now) return; /* Reevaluate with jiffies_lock held */ - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); delta = ktime_sub(now, last_jiffies_update); if (delta >= tick_period) { @@ -91,10 +92,12 @@ static void tick_do_update_jiffies64(ktime_t now) /* Keep the tick_next_period variable up to date */ tick_next_period = ktime_add(last_jiffies_update, tick_period); } else { - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); return; } - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); update_wall_time(); } @@ -105,12 +108,14 @@ static ktime_t tick_init_jiffy_update(void) { ktime_t period; - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); /* Did we start the jiffies update yet ? */ if (last_jiffies_update == 0) last_jiffies_update = tick_next_period; period = last_jiffies_update; - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); return period; } @@ -240,6 +245,7 @@ static void nohz_full_kick_func(struct irq_work *work) static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { .func = nohz_full_kick_func, + .flags = ATOMIC_INIT(IRQ_WORK_HARD_IRQ), }; /* @@ -676,10 +682,10 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) /* Read jiffies and the time when jiffies were updated last */ do { - seq = read_seqbegin(&jiffies_lock); + seq = read_seqcount_begin(&jiffies_seq); basemono = last_jiffies_update; basejiff = jiffies; - } while (read_seqretry(&jiffies_lock, seq)); + } while (read_seqcount_retry(&jiffies_seq, seq)); ts->last_jiffies = basejiff; ts->timer_expires_base = basemono; diff --git a/kernel/time/time.c b/kernel/time/time.c index cdd7386115ff..3985b2b32d08 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -449,49 +449,6 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, } EXPORT_SYMBOL(mktime64); -/** - * ns_to_timespec - Convert nanoseconds to timespec - * @nsec: the nanoseconds value to be converted - * - * Returns the timespec representation of the nsec parameter. - */ -struct timespec ns_to_timespec(const s64 nsec) -{ - struct timespec ts; - s32 rem; - - if (!nsec) - return (struct timespec) {0, 0}; - - ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); - if (unlikely(rem < 0)) { - ts.tv_sec--; - rem += NSEC_PER_SEC; - } - ts.tv_nsec = rem; - - return ts; -} -EXPORT_SYMBOL(ns_to_timespec); - -/** - * ns_to_timeval - Convert nanoseconds to timeval - * @nsec: the nanoseconds value to be converted - * - * Returns the timeval representation of the nsec parameter. - */ -struct timeval ns_to_timeval(const s64 nsec) -{ - struct timespec ts = ns_to_timespec(nsec); - struct timeval tv; - - tv.tv_sec = ts.tv_sec; - tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000; - - return tv; -} -EXPORT_SYMBOL(ns_to_timeval); - struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec) { struct timespec64 ts = ns_to_timespec64(nsec); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ca69290bee2a..856280d2cbd4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2397,8 +2397,10 @@ EXPORT_SYMBOL(hardpps); */ void xtime_update(unsigned long ticks) { - write_seqlock(&jiffies_lock); + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); do_timer(ticks); - write_sequnlock(&jiffies_lock); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); update_wall_time(); } diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 141ab3ab0354..099737f6f10c 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -25,7 +25,8 @@ static inline void sched_clock_resume(void) { } extern void do_timer(unsigned long ticks); extern void update_wall_time(void); -extern seqlock_t jiffies_lock; +extern raw_spinlock_t jiffies_lock; +extern seqcount_t jiffies_seq; #define CS_NAME_LEN 32 diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 91e885194dbc..402eef84c859 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -143,8 +143,8 @@ if FTRACE config BOOTTIME_TRACING bool "Boot-time Tracing support" - depends on BOOT_CONFIG && TRACING - default y + depends on TRACING + select BOOT_CONFIG help Enable developer to setup ftrace subsystem via supplemental kernel cmdline at boot time for debugging (tracing) driver diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 0735ae8545d8..4560878f0bac 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -335,6 +335,7 @@ static void put_probe_ref(void) static void blk_trace_cleanup(struct blk_trace *bt) { + synchronize_rcu(); blk_trace_free(bt); put_probe_ref(); } @@ -629,8 +630,10 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, static int __blk_trace_startstop(struct request_queue *q, int start) { int ret; - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (bt == NULL) return -EINVAL; @@ -740,8 +743,8 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) void blk_trace_shutdown(struct request_queue *q) { mutex_lock(&q->blk_trace_mutex); - - if (q->blk_trace) { + if (rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex))) { __blk_trace_startstop(q, 0); __blk_trace_remove(q); } @@ -752,8 +755,10 @@ void blk_trace_shutdown(struct request_queue *q) #ifdef CONFIG_BLK_CGROUP static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + /* We don't use the 'bt' value here except as an optimization... */ + bt = rcu_dereference_protected(q->blk_trace, 1); if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return 0; @@ -796,10 +801,14 @@ blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) static void blk_add_trace_rq(struct request *rq, int error, unsigned int nr_bytes, u32 what, u64 cgid) { - struct blk_trace *bt = rq->q->blk_trace; + struct blk_trace *bt; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(rq->q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } if (blk_rq_is_passthrough(rq)) what |= BLK_TC_ACT(BLK_TC_PC); @@ -808,6 +817,7 @@ static void blk_add_trace_rq(struct request *rq, int error, __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), rq->cmd_flags, what, error, 0, NULL, cgid); + rcu_read_unlock(); } static void blk_add_trace_rq_insert(void *ignore, @@ -853,14 +863,19 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, u32 what, int error) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, what, error, 0, NULL, blk_trace_bio_get_cgid(q, bio)); + rcu_read_unlock(); } static void blk_add_trace_bio_bounce(void *ignore, @@ -905,11 +920,14 @@ static void blk_add_trace_getrq(void *ignore, if (bio) blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); else { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, NULL, 0); + rcu_read_unlock(); } } @@ -921,27 +939,35 @@ static void blk_add_trace_sleeprq(void *ignore, if (bio) blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); else { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, 0, 0, NULL, 0); + rcu_read_unlock(); } } static void blk_add_trace_plug(void *ignore, struct request_queue *q) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); + rcu_read_unlock(); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, unsigned int depth, bool explicit) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(depth); u32 what; @@ -953,14 +979,17 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); } + rcu_read_unlock(); } static void blk_add_trace_split(void *ignore, struct request_queue *q, struct bio *bio, unsigned int pdu) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(pdu); @@ -969,6 +998,7 @@ static void blk_add_trace_split(void *ignore, BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), &rpdu, blk_trace_bio_get_cgid(q, bio)); } + rcu_read_unlock(); } /** @@ -988,11 +1018,15 @@ static void blk_add_trace_bio_remap(void *ignore, struct request_queue *q, struct bio *bio, dev_t dev, sector_t from) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; struct blk_io_trace_remap r; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } r.device_from = cpu_to_be32(dev); r.device_to = cpu_to_be32(bio_dev(bio)); @@ -1001,6 +1035,7 @@ static void blk_add_trace_bio_remap(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); + rcu_read_unlock(); } /** @@ -1021,11 +1056,15 @@ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, sector_t from) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; struct blk_io_trace_remap r; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } r.device_from = cpu_to_be32(dev); r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); @@ -1034,6 +1073,7 @@ static void blk_add_trace_rq_remap(void *ignore, __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rq_data_dir(rq), 0, BLK_TA_REMAP, 0, sizeof(r), &r, blk_trace_request_get_cgid(q, rq)); + rcu_read_unlock(); } /** @@ -1051,14 +1091,19 @@ void blk_add_driver_data(struct request_queue *q, struct request *rq, void *data, size_t len) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, BLK_TA_DRV_DATA, 0, len, data, blk_trace_request_get_cgid(q, rq)); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -1597,6 +1642,7 @@ static int blk_trace_remove_queue(struct request_queue *q) return -EINVAL; put_probe_ref(); + synchronize_rcu(); blk_trace_free(bt); return 0; } @@ -1758,6 +1804,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct hd_struct *p = dev_to_part(dev); struct request_queue *q; struct block_device *bdev; + struct blk_trace *bt; ssize_t ret = -ENXIO; bdev = bdget(part_devt(p)); @@ -1770,21 +1817,23 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, mutex_lock(&q->blk_trace_mutex); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (attr == &dev_attr_enable) { - ret = sprintf(buf, "%u\n", !!q->blk_trace); + ret = sprintf(buf, "%u\n", !!bt); goto out_unlock_bdev; } - if (q->blk_trace == NULL) + if (bt == NULL) ret = sprintf(buf, "disabled\n"); else if (attr == &dev_attr_act_mask) - ret = blk_trace_mask2str(buf, q->blk_trace->act_mask); + ret = blk_trace_mask2str(buf, bt->act_mask); else if (attr == &dev_attr_pid) - ret = sprintf(buf, "%u\n", q->blk_trace->pid); + ret = sprintf(buf, "%u\n", bt->pid); else if (attr == &dev_attr_start_lba) - ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); + ret = sprintf(buf, "%llu\n", bt->start_lba); else if (attr == &dev_attr_end_lba) - ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); + ret = sprintf(buf, "%llu\n", bt->end_lba); out_unlock_bdev: mutex_unlock(&q->blk_trace_mutex); @@ -1801,6 +1850,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct block_device *bdev; struct request_queue *q; struct hd_struct *p; + struct blk_trace *bt; u64 value; ssize_t ret = -EINVAL; @@ -1831,8 +1881,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, mutex_lock(&q->blk_trace_mutex); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (attr == &dev_attr_enable) { - if (!!value == !!q->blk_trace) { + if (!!value == !!bt) { ret = 0; goto out_unlock_bdev; } @@ -1844,18 +1896,18 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } ret = 0; - if (q->blk_trace == NULL) + if (bt == NULL) ret = blk_trace_setup_queue(q, bdev); if (ret == 0) { if (attr == &dev_attr_act_mask) - q->blk_trace->act_mask = value; + bt->act_mask = value; else if (attr == &dev_attr_pid) - q->blk_trace->pid = value; + bt->pid = value; else if (attr == &dev_attr_start_lba) - q->blk_trace->start_lba = value; + bt->start_lba = value; else if (attr == &dev_attr_end_lba) - q->blk_trace->end_lba = value; + bt->end_lba = value; } out_unlock_bdev: diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index 4aefe003cb7c..7d56d621ffea 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -111,11 +111,11 @@ static int __init test_gen_synth_cmd(void) /* Create some bogus values just for testing */ vals[0] = 777; /* next_pid_field */ - vals[1] = (u64)"hula hoops"; /* next_comm_field */ + vals[1] = (u64)(long)"hula hoops"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ - vals[4] = smp_processor_id(); /* cpu */ - vals[5] = (u64)"thneed"; /* my_string_field */ + vals[4] = raw_smp_processor_id(); /* cpu */ + vals[5] = (u64)(long)"thneed"; /* my_string_field */ vals[6] = 598; /* my_int_field */ /* Now generate a gen_synth_test event */ @@ -218,11 +218,11 @@ static int __init test_empty_synth_event(void) /* Create some bogus values just for testing */ vals[0] = 777; /* next_pid_field */ - vals[1] = (u64)"tiddlywinks"; /* next_comm_field */ + vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ - vals[4] = smp_processor_id(); /* cpu */ - vals[5] = (u64)"thneed_2.0"; /* my_string_field */ + vals[4] = raw_smp_processor_id(); /* cpu */ + vals[5] = (u64)(long)"thneed_2.0"; /* my_string_field */ vals[6] = 399; /* my_int_field */ /* Now trace an empty_synth_test event */ @@ -290,11 +290,11 @@ static int __init test_create_synth_event(void) /* Create some bogus values just for testing */ vals[0] = 777; /* next_pid_field */ - vals[1] = (u64)"tiddlywinks"; /* next_comm_field */ + vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ vals[3] = 1000; /* ts_ms */ - vals[4] = smp_processor_id(); /* cpu */ - vals[5] = (u64)"thneed"; /* my_string_field */ + vals[4] = raw_smp_processor_id(); /* cpu */ + vals[5] = (u64)(long)"thneed"; /* my_string_field */ vals[6] = 398; /* my_int_field */ /* Now generate a create_synth_test event */ @@ -330,7 +330,7 @@ static int __init test_add_next_synth_val(void) goto out; /* next_comm_field */ - ret = synth_event_add_next_val((u64)"slinky", &trace_state); + ret = synth_event_add_next_val((u64)(long)"slinky", &trace_state); if (ret) goto out; @@ -345,12 +345,12 @@ static int __init test_add_next_synth_val(void) goto out; /* cpu */ - ret = synth_event_add_next_val(smp_processor_id(), &trace_state); + ret = synth_event_add_next_val(raw_smp_processor_id(), &trace_state); if (ret) goto out; /* my_string_field */ - ret = synth_event_add_next_val((u64)"thneed_2.01", &trace_state); + ret = synth_event_add_next_val((u64)(long)"thneed_2.01", &trace_state); if (ret) goto out; @@ -388,7 +388,7 @@ static int __init test_add_synth_val(void) if (ret) goto out; - ret = synth_event_add_val("cpu", smp_processor_id(), &trace_state); + ret = synth_event_add_val("cpu", raw_smp_processor_id(), &trace_state); if (ret) goto out; @@ -396,12 +396,12 @@ static int __init test_add_synth_val(void) if (ret) goto out; - ret = synth_event_add_val("next_comm_field", (u64)"silly putty", + ret = synth_event_add_val("next_comm_field", (u64)(long)"silly putty", &trace_state); if (ret) goto out; - ret = synth_event_add_val("my_string_field", (u64)"thneed_9", + ret = synth_event_add_val("my_string_field", (u64)(long)"thneed_9", &trace_state); if (ret) goto out; @@ -423,13 +423,13 @@ static int __init test_trace_synth_event(void) /* Trace some bogus values just for testing */ ret = synth_event_trace(create_synth_test, 7, /* number of values */ - 444, /* next_pid_field */ - (u64)"clackers", /* next_comm_field */ - 1000000, /* ts_ns */ - 1000, /* ts_ms */ - smp_processor_id(), /* cpu */ - (u64)"Thneed", /* my_string_field */ - 999); /* my_int_field */ + (u64)444, /* next_pid_field */ + (u64)(long)"clackers", /* next_comm_field */ + (u64)1000000, /* ts_ns */ + (u64)1000, /* ts_ms */ + (u64)raw_smp_processor_id(), /* cpu */ + (u64)(long)"Thneed", /* my_string_field */ + (u64)999); /* my_int_field */ return ret; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c797a15a1fc7..6b11e4e2150c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1837,6 +1837,7 @@ static __init int init_trace_selftests(void) pr_info("Running postponed tracer tests:\n"); + tracing_selftest_running = true; list_for_each_entry_safe(p, n, &postponed_selftests, list) { /* This loop can take minutes when sanitizers are enabled, so * lets make sure we allow RCU processing. @@ -1859,6 +1860,7 @@ static __init int init_trace_selftests(void) list_del(&p->list); kfree(p); } + tracing_selftest_running = false; out: mutex_unlock(&trace_types_lock); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index e7ce7cdac62f..5f6834a2bf41 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -821,6 +821,29 @@ static const char *synth_field_fmt(char *type) return fmt; } +static void print_synth_event_num_val(struct trace_seq *s, + char *print_fmt, char *name, + int size, u64 val, char *space) +{ + switch (size) { + case 1: + trace_seq_printf(s, print_fmt, name, (u8)val, space); + break; + + case 2: + trace_seq_printf(s, print_fmt, name, (u16)val, space); + break; + + case 4: + trace_seq_printf(s, print_fmt, name, (u32)val, space); + break; + + default: + trace_seq_printf(s, print_fmt, name, val, space); + break; + } +} + static enum print_line_t print_synth_event(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -859,10 +882,13 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, } else { struct trace_print_flags __flags[] = { __def_gfpflag_names, {-1, NULL} }; + char *space = (i == se->n_fields - 1 ? "" : " "); - trace_seq_printf(s, print_fmt, se->fields[i]->name, - entry->fields[n_u64], - i == se->n_fields - 1 ? "" : " "); + print_synth_event_num_val(s, print_fmt, + se->fields[i]->name, + se->fields[i]->size, + entry->fields[n_u64], + space); if (strcmp(se->fields[i]->type, "gfp_t") == 0) { trace_seq_puts(s, " ("); @@ -1798,6 +1824,62 @@ void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen) } EXPORT_SYMBOL_GPL(synth_event_cmd_init); +static inline int +__synth_event_trace_start(struct trace_event_file *file, + struct synth_event_trace_state *trace_state) +{ + int entry_size, fields_size = 0; + int ret = 0; + + memset(trace_state, '\0', sizeof(*trace_state)); + + /* + * Normal event tracing doesn't get called at all unless the + * ENABLED bit is set (which attaches the probe thus allowing + * this code to be called, etc). Because this is called + * directly by the user, we don't have that but we still need + * to honor not logging when disabled. For the the iterated + * trace case, we save the enabed state upon start and just + * ignore the following data calls. + */ + if (!(file->flags & EVENT_FILE_FL_ENABLED) || + trace_trigger_soft_disabled(file)) { + trace_state->disabled = true; + ret = -ENOENT; + goto out; + } + + trace_state->event = file->event_call->data; + + fields_size = trace_state->event->n_u64 * sizeof(u64); + + /* + * Avoid ring buffer recursion detection, as this event + * is being performed within another event. + */ + trace_state->buffer = file->tr->array_buffer.buffer; + ring_buffer_nest_start(trace_state->buffer); + + entry_size = sizeof(*trace_state->entry) + fields_size; + trace_state->entry = trace_event_buffer_reserve(&trace_state->fbuffer, + file, + entry_size); + if (!trace_state->entry) { + ring_buffer_nest_end(trace_state->buffer); + ret = -EINVAL; + } +out: + return ret; +} + +static inline void +__synth_event_trace_end(struct synth_event_trace_state *trace_state) +{ + trace_event_buffer_commit(&trace_state->fbuffer); + + ring_buffer_nest_end(trace_state->buffer); +} + /** * synth_event_trace - Trace a synthetic event * @file: The trace_event_file representing the synthetic event @@ -1819,71 +1901,61 @@ EXPORT_SYMBOL_GPL(synth_event_cmd_init); */ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) { - struct trace_event_buffer fbuffer; - struct synth_trace_event *entry; - struct trace_buffer *buffer; - struct synth_event *event; + struct synth_event_trace_state state; unsigned int i, n_u64; - int fields_size = 0; va_list args; - int ret = 0; - - /* - * Normal event generation doesn't get called at all unless - * the ENABLED bit is set (which attaches the probe thus - * allowing this code to be called, etc). Because this is - * called directly by the user, we don't have that but we - * still need to honor not logging when disabled. - */ - if (!(file->flags & EVENT_FILE_FL_ENABLED)) - return 0; - - event = file->event_call->data; - - if (n_vals != event->n_fields) - return -EINVAL; - - if (trace_trigger_soft_disabled(file)) - return -EINVAL; - - fields_size = event->n_u64 * sizeof(u64); + int ret; - /* - * Avoid ring buffer recursion detection, as this event - * is being performed within another event. - */ - buffer = file->tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + ret = __synth_event_trace_start(file, &state); + if (ret) { + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + return ret; + } - entry = trace_event_buffer_reserve(&fbuffer, file, - sizeof(*entry) + fields_size); - if (!entry) { + if (n_vals != state.event->n_fields) { ret = -EINVAL; goto out; } va_start(args, n_vals); - for (i = 0, n_u64 = 0; i < event->n_fields; i++) { + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { u64 val; val = va_arg(args, u64); - if (event->fields[i]->is_string) { + if (state.event->fields[i]->is_string) { char *str_val = (char *)(long)val; - char *str_field = (char *)&entry->fields[n_u64]; + char *str_field = (char *)&state.entry->fields[n_u64]; strscpy(str_field, str_val, STR_VAR_LEN_MAX); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { - entry->fields[n_u64] = val; + struct synth_field *field = state.event->fields[i]; + + switch (field->size) { + case 1: + *(u8 *)&state.entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&state.entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&state.entry->fields[n_u64] = (u32)val; + break; + + default: + state.entry->fields[n_u64] = val; + break; + } n_u64++; } } va_end(args); - - trace_event_buffer_commit(&fbuffer); out: - ring_buffer_nest_end(buffer); + __synth_event_trace_end(&state); return ret; } @@ -1910,64 +1982,55 @@ EXPORT_SYMBOL_GPL(synth_event_trace); int synth_event_trace_array(struct trace_event_file *file, u64 *vals, unsigned int n_vals) { - struct trace_event_buffer fbuffer; - struct synth_trace_event *entry; - struct trace_buffer *buffer; - struct synth_event *event; + struct synth_event_trace_state state; unsigned int i, n_u64; - int fields_size = 0; - int ret = 0; - - /* - * Normal event generation doesn't get called at all unless - * the ENABLED bit is set (which attaches the probe thus - * allowing this code to be called, etc). Because this is - * called directly by the user, we don't have that but we - * still need to honor not logging when disabled. - */ - if (!(file->flags & EVENT_FILE_FL_ENABLED)) - return 0; - - event = file->event_call->data; - - if (n_vals != event->n_fields) - return -EINVAL; - - if (trace_trigger_soft_disabled(file)) - return -EINVAL; - - fields_size = event->n_u64 * sizeof(u64); + int ret; - /* - * Avoid ring buffer recursion detection, as this event - * is being performed within another event. - */ - buffer = file->tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + ret = __synth_event_trace_start(file, &state); + if (ret) { + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + return ret; + } - entry = trace_event_buffer_reserve(&fbuffer, file, - sizeof(*entry) + fields_size); - if (!entry) { + if (n_vals != state.event->n_fields) { ret = -EINVAL; goto out; } - for (i = 0, n_u64 = 0; i < event->n_fields; i++) { - if (event->fields[i]->is_string) { + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { + if (state.event->fields[i]->is_string) { char *str_val = (char *)(long)vals[i]; - char *str_field = (char *)&entry->fields[n_u64]; + char *str_field = (char *)&state.entry->fields[n_u64]; strscpy(str_field, str_val, STR_VAR_LEN_MAX); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { - entry->fields[n_u64] = vals[i]; + struct synth_field *field = state.event->fields[i]; + u64 val = vals[i]; + + switch (field->size) { + case 1: + *(u8 *)&state.entry->fields[n_u64] = (u8)val; + break; + + case 2: + *(u16 *)&state.entry->fields[n_u64] = (u16)val; + break; + + case 4: + *(u32 *)&state.entry->fields[n_u64] = (u32)val; + break; + + default: + state.entry->fields[n_u64] = val; + break; + } n_u64++; } } - - trace_event_buffer_commit(&fbuffer); out: - ring_buffer_nest_end(buffer); + __synth_event_trace_end(&state); return ret; } @@ -2004,58 +2067,15 @@ EXPORT_SYMBOL_GPL(synth_event_trace_array); int synth_event_trace_start(struct trace_event_file *file, struct synth_event_trace_state *trace_state) { - struct synth_trace_event *entry; - int fields_size = 0; - int ret = 0; - - if (!trace_state) { - ret = -EINVAL; - goto out; - } - - memset(trace_state, '\0', sizeof(*trace_state)); - - /* - * Normal event tracing doesn't get called at all unless the - * ENABLED bit is set (which attaches the probe thus allowing - * this code to be called, etc). Because this is called - * directly by the user, we don't have that but we still need - * to honor not logging when disabled. For the the iterated - * trace case, we save the enabed state upon start and just - * ignore the following data calls. - */ - if (!(file->flags & EVENT_FILE_FL_ENABLED)) { - trace_state->enabled = false; - goto out; - } - - trace_state->enabled = true; - - trace_state->event = file->event_call->data; - - if (trace_trigger_soft_disabled(file)) { - ret = -EINVAL; - goto out; - } - - fields_size = trace_state->event->n_u64 * sizeof(u64); + int ret; - /* - * Avoid ring buffer recursion detection, as this event - * is being performed within another event. - */ - trace_state->buffer = file->tr->array_buffer.buffer; - ring_buffer_nest_start(trace_state->buffer); + if (!trace_state) + return -EINVAL; - entry = trace_event_buffer_reserve(&trace_state->fbuffer, file, - sizeof(*entry) + fields_size); - if (!entry) { - ret = -EINVAL; - goto out; - } + ret = __synth_event_trace_start(file, trace_state); + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ - trace_state->entry = entry; -out: return ret; } EXPORT_SYMBOL_GPL(synth_event_trace_start); @@ -2088,7 +2108,7 @@ static int __synth_event_add_val(const char *field_name, u64 val, trace_state->add_next = true; } - if (!trace_state->enabled) + if (trace_state->disabled) goto out; event = trace_state->event; @@ -2122,8 +2142,25 @@ static int __synth_event_add_val(const char *field_name, u64 val, str_field = (char *)&entry->fields[field->offset]; strscpy(str_field, str_val, STR_VAR_LEN_MAX); - } else - entry->fields[field->offset] = val; + } else { + switch (field->size) { + case 1: + *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val; + break; + + case 2: + *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val; + break; + + case 4: + *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val; + break; + + default: + trace_state->entry->fields[field->offset] = val; + break; + } + } out: return ret; } @@ -2223,9 +2260,7 @@ int synth_event_trace_end(struct synth_event_trace_state *trace_state) if (!trace_state) return -EINVAL; - trace_event_buffer_commit(&trace_state->fbuffer); - - ring_buffer_nest_end(trace_state->buffer); + __synth_event_trace_end(trace_state); return 0; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d8264ebb9581..362cca52f5de 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1012,7 +1012,7 @@ int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...) { struct dynevent_arg arg; va_list args; - int ret; + int ret = 0; if (cmd->type != DYNEVENT_TYPE_KPROBE) return -EINVAL; |