diff options
Diffstat (limited to 'kernel')
38 files changed, 701 insertions, 379 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ceb1c4596c51..80d672a11088 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1279,8 +1279,12 @@ static void show_special(struct audit_context *context, int *call_panic) break; case AUDIT_KERN_MODULE: audit_log_format(ab, "name="); - audit_log_untrustedstring(ab, context->module.name); - kfree(context->module.name); + if (context->module.name) { + audit_log_untrustedstring(ab, context->module.name); + kfree(context->module.name); + } else + audit_log_format(ab, "(null)"); + break; } audit_log_end(ab); @@ -2411,8 +2415,9 @@ void __audit_log_kern_module(char *name) { struct audit_context *context = audit_context(); - context->module.name = kmalloc(strlen(name) + 1, GFP_KERNEL); - strcpy(context->module.name, name); + context->module.name = kstrdup(name, GFP_KERNEL); + if (!context->module.name) + audit_log_lost("out of memory in __audit_log_kern_module"); context->type = AUDIT_KERN_MODULE; } diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 544e58f5f642..2aa55d030c77 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -378,7 +378,7 @@ static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, return -EINVAL; value_type = btf_type_id_size(btf, &btf_value_id, &value_size); - if (!value_type || value_size > map->value_size) + if (!value_type || value_size != map->value_size) return -EINVAL; return 0; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2d49d18b793a..2590700237c1 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -450,7 +450,7 @@ static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) */ static bool btf_type_int_is_regular(const struct btf_type *t) { - u16 nr_bits, nr_bytes; + u8 nr_bits, nr_bytes; u32 int_data; int_data = btf_type_int(t); @@ -991,38 +991,38 @@ static void btf_int_bits_seq_show(const struct btf *btf, void *data, u8 bits_offset, struct seq_file *m) { + u16 left_shift_bits, right_shift_bits; u32 int_data = btf_type_int(t); - u16 nr_bits = BTF_INT_BITS(int_data); - u16 total_bits_offset; - u16 nr_copy_bytes; - u16 nr_copy_bits; - u8 nr_upper_bits; - union { - u64 u64_num; - u8 u8_nums[8]; - } print_num; + u8 nr_bits = BTF_INT_BITS(int_data); + u8 total_bits_offset; + u8 nr_copy_bytes; + u8 nr_copy_bits; + u64 print_num; + /* + * bits_offset is at most 7. + * BTF_INT_OFFSET() cannot exceed 64 bits. + */ total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); data += BITS_ROUNDDOWN_BYTES(total_bits_offset); bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); nr_copy_bits = nr_bits + bits_offset; nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); - print_num.u64_num = 0; - memcpy(&print_num.u64_num, data, nr_copy_bytes); - - /* Ditch the higher order bits */ - nr_upper_bits = BITS_PER_BYTE_MASKED(nr_copy_bits); - if (nr_upper_bits) { - /* We need to mask out some bits of the upper byte. */ - u8 mask = (1 << nr_upper_bits) - 1; + print_num = 0; + memcpy(&print_num, data, nr_copy_bytes); - print_num.u8_nums[nr_copy_bytes - 1] &= mask; - } +#ifdef __BIG_ENDIAN_BITFIELD + left_shift_bits = bits_offset; +#else + left_shift_bits = BITS_PER_U64 - nr_copy_bits; +#endif + right_shift_bits = BITS_PER_U64 - nr_bits; - print_num.u64_num >>= bits_offset; + print_num <<= left_shift_bits; + print_num >>= right_shift_bits; - seq_printf(m, "0x%llx", print_num.u64_num); + seq_printf(m, "0x%llx", print_num); } static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, @@ -1032,7 +1032,7 @@ static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, u32 int_data = btf_type_int(t); u8 encoding = BTF_INT_ENCODING(int_data); bool sign = encoding & BTF_INT_SIGNED; - u32 nr_bits = BTF_INT_BITS(int_data); + u8 nr_bits = BTF_INT_BITS(int_data); if (bits_offset || BTF_INT_OFFSET(int_data) || BITS_PER_BYTE_MASKED(nr_bits)) { @@ -1519,9 +1519,9 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, { bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION; const struct btf_member *member; + u32 meta_needed, last_offset; struct btf *btf = env->btf; u32 struct_size = t->size; - u32 meta_needed; u16 i; meta_needed = btf_type_vlen(t) * sizeof(*member); @@ -1534,6 +1534,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, btf_verifier_log_type(env, t, NULL); + last_offset = 0; for_each_member(i, t, member) { if (!btf_name_offset_valid(btf, member->name_off)) { btf_verifier_log_member(env, t, member, @@ -1555,6 +1556,16 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return -EINVAL; } + /* + * ">" instead of ">=" because the last member could be + * "char a[0];" + */ + if (last_offset > member->offset) { + btf_verifier_log_member(env, t, member, + "Invalid member bits_offset"); + return -EINVAL; + } + if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) { btf_verifier_log_member(env, t, member, "Memmber bits_offset exceeds its struct size"); @@ -1562,6 +1573,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, } btf_verifier_log_member(env, t, member, NULL); + last_offset = member->offset; } return meta_needed; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index f7c00bd6f8e4..3d83ee7df381 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -428,6 +428,60 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, return ret; } +int cgroup_bpf_prog_attach(const union bpf_attr *attr, + enum bpf_prog_type ptype, struct bpf_prog *prog) +{ + struct cgroup *cgrp; + int ret; + + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, + attr->attach_flags); + cgroup_put(cgrp); + return ret; +} + +int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) +{ + struct bpf_prog *prog; + struct cgroup *cgrp; + int ret; + + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + prog = NULL; + + ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); + if (prog) + bpf_prog_put(prog); + + cgroup_put(cgrp); + return ret; +} + +int cgroup_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct cgroup *cgrp; + int ret; + + cgrp = cgroup_get_from_fd(attr->query.target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + ret = cgroup_bpf_query(cgrp, attr, uattr); + + cgroup_put(cgrp); + return ret; +} + /** * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a9e6c04d0f4a..1e5625d46414 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -598,8 +598,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, bpf_fill_ill_insns(hdr, size); hdr->pages = size / PAGE_SIZE; - hdr->locked = 0; - hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = (get_random_int() % hole) & ~(alignment - 1); @@ -1450,22 +1448,6 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) return 0; } -static int bpf_prog_check_pages_ro_locked(const struct bpf_prog *fp) -{ -#ifdef CONFIG_ARCH_HAS_SET_MEMORY - int i, err; - - for (i = 0; i < fp->aux->func_cnt; i++) { - err = bpf_prog_check_pages_ro_single(fp->aux->func[i]); - if (err) - return err; - } - - return bpf_prog_check_pages_ro_single(fp); -#endif - return 0; -} - static void bpf_prog_select_func(struct bpf_prog *fp) { #ifndef CONFIG_BPF_JIT_ALWAYS_ON @@ -1524,17 +1506,7 @@ finalize: * all eBPF JITs might immediately support all features. */ *err = bpf_check_tail_call(fp); - if (*err) - return fp; - - /* Checkpoint: at this point onwards any cBPF -> eBPF or - * native eBPF program is read-only. If we failed to change - * the page attributes (e.g. allocation failure from - * splitting large pages), then reject the whole program - * in order to guarantee not ending up with any W+X pages - * from BPF side in kernel. - */ - *err = bpf_prog_check_pages_ro_locked(fp); + return fp; } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 642c97f6d1b8..d361fc1e3bf3 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -334,10 +334,15 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, { struct net_device *dev = dst->dev; struct xdp_frame *xdpf; + int err; if (!dev->netdev_ops->ndo_xdp_xmit) return -EOPNOTSUPP; + err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); + if (unlikely(err)) + return err; + xdpf = convert_to_xdp_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; @@ -350,7 +355,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, { int err; - err = __xdp_generic_ok_fwd_dev(skb, dst->dev); + err = xdp_ok_fwd_dev(dst->dev, skb->len); if (unlikely(err)) return err; skb->dev = dst->dev; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3ca2198a6d22..513d9dfcf4ee 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -747,13 +747,15 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, * old element will be freed immediately. * Otherwise return an error */ - atomic_dec(&htab->count); - return ERR_PTR(-E2BIG); + l_new = ERR_PTR(-E2BIG); + goto dec_count; } l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, htab->map.numa_node); - if (!l_new) - return ERR_PTR(-ENOMEM); + if (!l_new) { + l_new = ERR_PTR(-ENOMEM); + goto dec_count; + } } memcpy(l_new->key, key, key_size); @@ -766,7 +768,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, GFP_ATOMIC | __GFP_NOWARN); if (!pptr) { kfree(l_new); - return ERR_PTR(-ENOMEM); + l_new = ERR_PTR(-ENOMEM); + goto dec_count; } } @@ -780,6 +783,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new->hash = hash; return l_new; +dec_count: + atomic_dec(&htab->count); + return l_new; } static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 52a91d816c0e..98fb7938beea 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -72,6 +72,7 @@ struct bpf_htab { u32 n_buckets; u32 elem_size; struct bpf_sock_progs progs; + struct rcu_head rcu; }; struct htab_elem { @@ -89,8 +90,8 @@ enum smap_psock_state { struct smap_psock_map_entry { struct list_head list; struct sock **entry; - struct htab_elem *hash_link; - struct bpf_htab *htab; + struct htab_elem __rcu *hash_link; + struct bpf_htab __rcu *htab; }; struct smap_psock { @@ -120,6 +121,7 @@ struct smap_psock { struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; struct list_head maps; + spinlock_t maps_lock; /* Back reference used when sock callback trigger sockmap operations */ struct sock *sock; @@ -140,6 +142,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); static int bpf_tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); +static void bpf_tcp_close(struct sock *sk, long timeout); static inline struct smap_psock *smap_psock_sk(const struct sock *sk) { @@ -161,7 +164,42 @@ out: return !empty; } -static struct proto tcp_bpf_proto; +enum { + SOCKMAP_IPV4, + SOCKMAP_IPV6, + SOCKMAP_NUM_PROTS, +}; + +enum { + SOCKMAP_BASE, + SOCKMAP_TX, + SOCKMAP_NUM_CONFIGS, +}; + +static struct proto *saved_tcpv6_prot __read_mostly; +static DEFINE_SPINLOCK(tcpv6_prot_lock); +static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS]; +static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS], + struct proto *base) +{ + prot[SOCKMAP_BASE] = *base; + prot[SOCKMAP_BASE].close = bpf_tcp_close; + prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg; + prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read; + + prot[SOCKMAP_TX] = prot[SOCKMAP_BASE]; + prot[SOCKMAP_TX].sendmsg = bpf_tcp_sendmsg; + prot[SOCKMAP_TX].sendpage = bpf_tcp_sendpage; +} + +static void update_sk_prot(struct sock *sk, struct smap_psock *psock) +{ + int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4; + int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE; + + sk->sk_prot = &bpf_tcp_prots[family][conf]; +} + static int bpf_tcp_init(struct sock *sk) { struct smap_psock *psock; @@ -181,14 +219,17 @@ static int bpf_tcp_init(struct sock *sk) psock->save_close = sk->sk_prot->close; psock->sk_proto = sk->sk_prot; - if (psock->bpf_tx_msg) { - tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg; - tcp_bpf_proto.sendpage = bpf_tcp_sendpage; - tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg; - tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read; + /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */ + if (sk->sk_family == AF_INET6 && + unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) { + spin_lock_bh(&tcpv6_prot_lock); + if (likely(sk->sk_prot != saved_tcpv6_prot)) { + build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot); + smp_store_release(&saved_tcpv6_prot, sk->sk_prot); + } + spin_unlock_bh(&tcpv6_prot_lock); } - - sk->sk_prot = &tcp_bpf_proto; + update_sk_prot(sk, psock); rcu_read_unlock(); return 0; } @@ -219,24 +260,64 @@ out: rcu_read_unlock(); } +static struct htab_elem *lookup_elem_raw(struct hlist_head *head, + u32 hash, void *key, u32 key_size) +{ + struct htab_elem *l; + + hlist_for_each_entry_rcu(l, head, hash_node) { + if (l->hash == hash && !memcmp(&l->key, key, key_size)) + return l; + } + + return NULL; +} + +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &htab->buckets[hash & (htab->n_buckets - 1)]; +} + +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &__select_bucket(htab, hash)->head; +} + static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { atomic_dec(&htab->count); kfree_rcu(l, rcu); } +static struct smap_psock_map_entry *psock_map_pop(struct sock *sk, + struct smap_psock *psock) +{ + struct smap_psock_map_entry *e; + + spin_lock_bh(&psock->maps_lock); + e = list_first_entry_or_null(&psock->maps, + struct smap_psock_map_entry, + list); + if (e) + list_del(&e->list); + spin_unlock_bh(&psock->maps_lock); + return e; +} + static void bpf_tcp_close(struct sock *sk, long timeout) { void (*close_fun)(struct sock *sk, long timeout); - struct smap_psock_map_entry *e, *tmp; + struct smap_psock_map_entry *e; struct sk_msg_buff *md, *mtmp; struct smap_psock *psock; struct sock *osk; + lock_sock(sk); rcu_read_lock(); psock = smap_psock_sk(sk); if (unlikely(!psock)) { rcu_read_unlock(); + release_sock(sk); return sk->sk_prot->close(sk, timeout); } @@ -247,7 +328,6 @@ static void bpf_tcp_close(struct sock *sk, long timeout) */ close_fun = psock->save_close; - write_lock_bh(&sk->sk_callback_lock); if (psock->cork) { free_start_sg(psock->sock, psock->cork); kfree(psock->cork); @@ -260,21 +340,40 @@ static void bpf_tcp_close(struct sock *sk, long timeout) kfree(md); } - list_for_each_entry_safe(e, tmp, &psock->maps, list) { + e = psock_map_pop(sk, psock); + while (e) { if (e->entry) { osk = cmpxchg(e->entry, sk, NULL); if (osk == sk) { - list_del(&e->list); smap_release_sock(psock, sk); } } else { - hlist_del_rcu(&e->hash_link->hash_node); - smap_release_sock(psock, e->hash_link->sk); - free_htab_elem(e->htab, e->hash_link); + struct htab_elem *link = rcu_dereference(e->hash_link); + struct bpf_htab *htab = rcu_dereference(e->htab); + struct hlist_head *head; + struct htab_elem *l; + struct bucket *b; + + b = __select_bucket(htab, link->hash); + head = &b->head; + raw_spin_lock_bh(&b->lock); + l = lookup_elem_raw(head, + link->hash, link->key, + htab->map.key_size); + /* If another thread deleted this object skip deletion. + * The refcnt on psock may or may not be zero. + */ + if (l) { + hlist_del_rcu(&link->hash_node); + smap_release_sock(psock, link->sk); + free_htab_elem(htab, link); + } + raw_spin_unlock_bh(&b->lock); } + e = psock_map_pop(sk, psock); } - write_unlock_bh(&sk->sk_callback_lock); rcu_read_unlock(); + release_sock(sk); close_fun(sk, timeout); } @@ -472,7 +571,8 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) while (sg[i].length) { free += sg[i].length; sk_mem_uncharge(sk, sg[i].length); - put_page(sg_page(&sg[i])); + if (!md->skb) + put_page(sg_page(&sg[i])); sg[i].length = 0; sg[i].page_link = 0; sg[i].offset = 0; @@ -481,6 +581,8 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) if (i == MAX_SKB_FRAGS) i = 0; } + if (md->skb) + consume_skb(md->skb); return free; } @@ -1111,8 +1213,7 @@ static void bpf_tcp_msg_add(struct smap_psock *psock, static int bpf_tcp_ulp_register(void) { - tcp_bpf_proto = tcp_prot; - tcp_bpf_proto.close = bpf_tcp_close; + build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot); /* Once BPF TX ULP is registered it is never unregistered. It * will be in the ULP list for the lifetime of the system. Doing * duplicate registers is not a problem. @@ -1135,7 +1236,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) */ TCP_SKB_CB(skb)->bpf.sk_redir = NULL; skb->sk = psock->sock; - bpf_compute_data_pointers(skb); + bpf_compute_data_end_sk_skb(skb); preempt_disable(); rc = (*prog->bpf_func)(skb, prog->insnsi); preempt_enable(); @@ -1357,7 +1458,9 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock) { if (refcount_dec_and_test(&psock->refcnt)) { tcp_cleanup_ulp(sock); + write_lock_bh(&sock->sk_callback_lock); smap_stop_sock(psock, sock); + write_unlock_bh(&sock->sk_callback_lock); clear_bit(SMAP_TX_RUNNING, &psock->state); rcu_assign_sk_user_data(sock, NULL); call_rcu_sched(&psock->rcu, smap_destroy_psock); @@ -1388,7 +1491,7 @@ static int smap_parse_func_strparser(struct strparser *strp, * any socket yet. */ skb->sk = psock->sock; - bpf_compute_data_pointers(skb); + bpf_compute_data_end_sk_skb(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; rcu_read_unlock(); @@ -1508,6 +1611,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, int node) INIT_LIST_HEAD(&psock->maps); INIT_LIST_HEAD(&psock->ingress); refcount_set(&psock->refcnt, 1); + spin_lock_init(&psock->maps_lock); rcu_assign_sk_user_data(sock, psock); sock_hold(sock); @@ -1564,18 +1668,32 @@ free_stab: return ERR_PTR(err); } -static void smap_list_remove(struct smap_psock *psock, - struct sock **entry, - struct htab_elem *hash_link) +static void smap_list_map_remove(struct smap_psock *psock, + struct sock **entry) { struct smap_psock_map_entry *e, *tmp; + spin_lock_bh(&psock->maps_lock); list_for_each_entry_safe(e, tmp, &psock->maps, list) { - if (e->entry == entry || e->hash_link == hash_link) { + if (e->entry == entry) + list_del(&e->list); + } + spin_unlock_bh(&psock->maps_lock); +} + +static void smap_list_hash_remove(struct smap_psock *psock, + struct htab_elem *hash_link) +{ + struct smap_psock_map_entry *e, *tmp; + + spin_lock_bh(&psock->maps_lock); + list_for_each_entry_safe(e, tmp, &psock->maps, list) { + struct htab_elem *c = rcu_dereference(e->hash_link); + + if (c == hash_link) list_del(&e->list); - break; - } } + spin_unlock_bh(&psock->maps_lock); } static void sock_map_free(struct bpf_map *map) @@ -1601,7 +1719,6 @@ static void sock_map_free(struct bpf_map *map) if (!sock) continue; - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); /* This check handles a racing sock event that can get the * sk_callback_lock before this case but after xchg happens @@ -1609,10 +1726,9 @@ static void sock_map_free(struct bpf_map *map) * to be null and queued for garbage collection. */ if (likely(psock)) { - smap_list_remove(psock, &stab->sock_map[i], NULL); + smap_list_map_remove(psock, &stab->sock_map[i]); smap_release_sock(psock, sock); } - write_unlock_bh(&sock->sk_callback_lock); } rcu_read_unlock(); @@ -1661,17 +1777,15 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) if (!sock) return -EINVAL; - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); if (!psock) goto out; if (psock->bpf_parse) smap_stop_sock(psock, sock); - smap_list_remove(psock, &stab->sock_map[k], NULL); + smap_list_map_remove(psock, &stab->sock_map[k]); smap_release_sock(psock, sock); out: - write_unlock_bh(&sock->sk_callback_lock); return 0; } @@ -1752,7 +1866,6 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, } } - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); /* 2. Do not allow inheriting programs if psock exists and has @@ -1789,7 +1902,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); if (!e) { err = -ENOMEM; - goto out_progs; + goto out_free; } } @@ -1809,7 +1922,9 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, if (err) goto out_free; smap_init_progs(psock, verdict, parse); + write_lock_bh(&sock->sk_callback_lock); smap_start_sock(psock, sock); + write_unlock_bh(&sock->sk_callback_lock); } /* 4. Place psock in sockmap for use and stop any programs on @@ -1819,9 +1934,10 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, */ if (map_link) { e->entry = map_link; + spin_lock_bh(&psock->maps_lock); list_add_tail(&e->list, &psock->maps); + spin_unlock_bh(&psock->maps_lock); } - write_unlock_bh(&sock->sk_callback_lock); return err; out_free: smap_release_sock(psock, sock); @@ -1832,7 +1948,6 @@ out_progs: } if (tx_msg) bpf_prog_put(tx_msg); - write_unlock_bh(&sock->sk_callback_lock); kfree(e); return err; } @@ -1869,10 +1984,8 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, if (osock) { struct smap_psock *opsock = smap_psock_sk(osock); - write_lock_bh(&osock->sk_callback_lock); - smap_list_remove(opsock, &stab->sock_map[i], NULL); + smap_list_map_remove(opsock, &stab->sock_map[i]); smap_release_sock(opsock, osock); - write_unlock_bh(&osock->sk_callback_lock); } out: return err; @@ -1915,6 +2028,24 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) return 0; } +int sockmap_get_from_fd(const union bpf_attr *attr, int type, + struct bpf_prog *prog) +{ + int ufd = attr->target_fd; + struct bpf_map *map; + struct fd f; + int err; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = sock_map_prog(map, prog, attr->attach_type); + fdput(f); + return err; +} + static void *sock_map_lookup(struct bpf_map *map, void *key) { return NULL; @@ -1944,7 +2075,13 @@ static int sock_map_update_elem(struct bpf_map *map, return -EOPNOTSUPP; } + lock_sock(skops.sk); + preempt_disable(); + rcu_read_lock(); err = sock_map_ctx_update_elem(&skops, map, key, flags); + rcu_read_unlock(); + preempt_enable(); + release_sock(skops.sk); fput(socket->file); return err; } @@ -2043,14 +2180,13 @@ free_htab: return ERR_PTR(err); } -static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) +static void __bpf_htab_free(struct rcu_head *rcu) { - return &htab->buckets[hash & (htab->n_buckets - 1)]; -} + struct bpf_htab *htab; -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) -{ - return &__select_bucket(htab, hash)->head; + htab = container_of(rcu, struct bpf_htab, rcu); + bpf_map_area_free(htab->buckets); + kfree(htab); } static void sock_hash_free(struct bpf_map *map) @@ -2069,16 +2205,18 @@ static void sock_hash_free(struct bpf_map *map) */ rcu_read_lock(); for (i = 0; i < htab->n_buckets; i++) { - struct hlist_head *head = select_bucket(htab, i); + struct bucket *b = __select_bucket(htab, i); + struct hlist_head *head; struct hlist_node *n; struct htab_elem *l; + raw_spin_lock_bh(&b->lock); + head = &b->head; hlist_for_each_entry_safe(l, n, head, hash_node) { struct sock *sock = l->sk; struct smap_psock *psock; hlist_del_rcu(&l->hash_node); - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); /* This check handles a racing sock event that can get * the sk_callback_lock before this case but after xchg @@ -2086,16 +2224,15 @@ static void sock_hash_free(struct bpf_map *map) * (psock) to be null and queued for garbage collection. */ if (likely(psock)) { - smap_list_remove(psock, NULL, l); + smap_list_hash_remove(psock, l); smap_release_sock(psock, sock); } - write_unlock_bh(&sock->sk_callback_lock); - kfree(l); + free_htab_elem(htab, l); } + raw_spin_unlock_bh(&b->lock); } rcu_read_unlock(); - bpf_map_area_free(htab->buckets); - kfree(htab); + call_rcu(&htab->rcu, __bpf_htab_free); } static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, @@ -2122,19 +2259,6 @@ static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, return l_new; } -static struct htab_elem *lookup_elem_raw(struct hlist_head *head, - u32 hash, void *key, u32 key_size) -{ - struct htab_elem *l; - - hlist_for_each_entry_rcu(l, head, hash_node) { - if (l->hash == hash && !memcmp(&l->key, key, key_size)) - return l; - } - - return NULL; -} - static inline u32 htab_map_hash(const void *key, u32 key_len) { return jhash(key, key_len, 0); @@ -2230,7 +2354,10 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, if (err) goto err; - /* bpf_map_update_elem() can be called in_irq() */ + /* psock is valid here because otherwise above *ctx_update_elem would + * have thrown an error. It is safe to skip error check. + */ + psock = smap_psock_sk(sock); raw_spin_lock_bh(&b->lock); l_old = lookup_elem_raw(head, hash, key, key_size); if (l_old && map_flags == BPF_NOEXIST) { @@ -2248,15 +2375,12 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, goto bucket_err; } - psock = smap_psock_sk(sock); - if (unlikely(!psock)) { - err = -EINVAL; - goto bucket_err; - } - - e->hash_link = l_new; - e->htab = container_of(map, struct bpf_htab, map); + rcu_assign_pointer(e->hash_link, l_new); + rcu_assign_pointer(e->htab, + container_of(map, struct bpf_htab, map)); + spin_lock_bh(&psock->maps_lock); list_add_tail(&e->list, &psock->maps); + spin_unlock_bh(&psock->maps_lock); /* add new element to the head of the list, so that * concurrent search will find it before old elem @@ -2266,19 +2390,17 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, psock = smap_psock_sk(l_old->sk); hlist_del_rcu(&l_old->hash_node); - smap_list_remove(psock, NULL, l_old); + smap_list_hash_remove(psock, l_old); smap_release_sock(psock, l_old->sk); free_htab_elem(htab, l_old); } raw_spin_unlock_bh(&b->lock); return 0; bucket_err: + smap_release_sock(psock, sock); raw_spin_unlock_bh(&b->lock); err: kfree(e); - psock = smap_psock_sk(sock); - if (psock) - smap_release_sock(psock, sock); return err; } @@ -2300,7 +2422,13 @@ static int sock_hash_update_elem(struct bpf_map *map, return -EINVAL; } + lock_sock(skops.sk); + preempt_disable(); + rcu_read_lock(); err = sock_hash_ctx_update_elem(&skops, map, key, flags); + rcu_read_unlock(); + preempt_enable(); + release_sock(skops.sk); fput(socket->file); return err; } @@ -2326,7 +2454,6 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key) struct smap_psock *psock; hlist_del_rcu(&l->hash_node); - write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); /* This check handles a racing sock event that can get the * sk_callback_lock before this case but after xchg happens @@ -2334,10 +2461,9 @@ static int sock_hash_delete_elem(struct bpf_map *map, void *key) * to be null and queued for garbage collection. */ if (likely(psock)) { - smap_list_remove(psock, NULL, l); + smap_list_hash_remove(psock, l); smap_release_sock(psock, sock); } - write_unlock_bh(&sock->sk_callback_lock); free_htab_elem(htab, l); ret = 0; } @@ -2359,10 +2485,8 @@ struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - raw_spin_lock_bh(&b->lock); l = lookup_elem_raw(head, hash, key, key_size); sk = l ? l->sk : NULL; - raw_spin_unlock_bh(&b->lock); return sk; } @@ -2383,6 +2507,7 @@ const struct bpf_map_ops sock_hash_ops = { .map_get_next_key = sock_hash_get_next_key, .map_update_elem = sock_hash_update_elem, .map_delete_elem = sock_hash_delete_elem, + .map_release_uref = sock_map_release, }; BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 35dc466641f2..a31a1ba0f8ea 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -735,7 +735,9 @@ static int map_update_elem(union bpf_attr *attr) if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_update_elem(map, key, value, attr->flags); goto out; - } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH || + map->map_type == BPF_MAP_TYPE_SOCKMAP) { err = map->ops->map_update_elem(map, key, value, attr->flags); goto out; } @@ -1483,8 +1485,6 @@ out_free_tp: return err; } -#ifdef CONFIG_CGROUP_BPF - static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, enum bpf_attach_type attach_type) { @@ -1499,40 +1499,6 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, #define BPF_PROG_ATTACH_LAST_FIELD attach_flags -static int sockmap_get_from_fd(const union bpf_attr *attr, - int type, bool attach) -{ - struct bpf_prog *prog = NULL; - int ufd = attr->target_fd; - struct bpf_map *map; - struct fd f; - int err; - - f = fdget(ufd); - map = __bpf_map_get(f); - if (IS_ERR(map)) - return PTR_ERR(map); - - if (attach) { - prog = bpf_prog_get_type(attr->attach_bpf_fd, type); - if (IS_ERR(prog)) { - fdput(f); - return PTR_ERR(prog); - } - } - - err = sock_map_prog(map, prog, attr->attach_type); - if (err) { - fdput(f); - if (prog) - bpf_prog_put(prog); - return err; - } - - fdput(f); - return 0; -} - #define BPF_F_ATTACH_MASK \ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI) @@ -1540,7 +1506,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) { enum bpf_prog_type ptype; struct bpf_prog *prog; - struct cgroup *cgrp; int ret; if (!capable(CAP_NET_ADMIN)) @@ -1577,12 +1542,15 @@ static int bpf_prog_attach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; case BPF_SK_MSG_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true); + ptype = BPF_PROG_TYPE_SK_MSG; + break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true); + ptype = BPF_PROG_TYPE_SK_SKB; + break; case BPF_LIRC_MODE2: - return lirc_prog_attach(attr); + ptype = BPF_PROG_TYPE_LIRC_MODE2; + break; default: return -EINVAL; } @@ -1596,18 +1564,20 @@ static int bpf_prog_attach(const union bpf_attr *attr) return -EINVAL; } - cgrp = cgroup_get_from_fd(attr->target_fd); - if (IS_ERR(cgrp)) { - bpf_prog_put(prog); - return PTR_ERR(cgrp); + switch (ptype) { + case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: + ret = sockmap_get_from_fd(attr, ptype, prog); + break; + case BPF_PROG_TYPE_LIRC_MODE2: + ret = lirc_prog_attach(attr, prog); + break; + default: + ret = cgroup_bpf_prog_attach(attr, ptype, prog); } - ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, - attr->attach_flags); if (ret) bpf_prog_put(prog); - cgroup_put(cgrp); - return ret; } @@ -1616,9 +1586,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) static int bpf_prog_detach(const union bpf_attr *attr) { enum bpf_prog_type ptype; - struct bpf_prog *prog; - struct cgroup *cgrp; - int ret; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -1651,29 +1618,17 @@ static int bpf_prog_detach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; case BPF_SK_MSG_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL); case BPF_LIRC_MODE2: return lirc_prog_detach(attr); default: return -EINVAL; } - cgrp = cgroup_get_from_fd(attr->target_fd); - if (IS_ERR(cgrp)) - return PTR_ERR(cgrp); - - prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); - if (IS_ERR(prog)) - prog = NULL; - - ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); - if (prog) - bpf_prog_put(prog); - cgroup_put(cgrp); - return ret; + return cgroup_bpf_prog_detach(attr, ptype); } #define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt @@ -1681,9 +1636,6 @@ static int bpf_prog_detach(const union bpf_attr *attr) static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { - struct cgroup *cgrp; - int ret; - if (!capable(CAP_NET_ADMIN)) return -EPERM; if (CHECK_ATTR(BPF_PROG_QUERY)) @@ -1711,14 +1663,9 @@ static int bpf_prog_query(const union bpf_attr *attr, default: return -EINVAL; } - cgrp = cgroup_get_from_fd(attr->query.target_fd); - if (IS_ERR(cgrp)) - return PTR_ERR(cgrp); - ret = cgroup_bpf_query(cgrp, attr, uattr); - cgroup_put(cgrp); - return ret; + + return cgroup_bpf_prog_query(attr, uattr); } -#endif /* CONFIG_CGROUP_BPF */ #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration @@ -2365,7 +2312,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET: err = bpf_obj_get(&attr); break; -#ifdef CONFIG_CGROUP_BPF case BPF_PROG_ATTACH: err = bpf_prog_attach(&attr); break; @@ -2375,7 +2321,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_QUERY: err = bpf_prog_query(&attr, uattr); break; -#endif case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr); break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9e2bf834f13a..63aaac52a265 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5430,6 +5430,10 @@ static int jit_subprogs(struct bpf_verifier_env *env) if (insn->code != (BPF_JMP | BPF_CALL) || insn->src_reg != BPF_PSEUDO_CALL) continue; + /* Upon error here we cannot fall back to interpreter but + * need a hard reject of the program. Thus -EFAULT is + * propagated in any case. + */ subprog = find_subprog(env, i + insn->imm + 1); if (subprog < 0) { WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", @@ -5450,7 +5454,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL); if (!func) - return -ENOMEM; + goto out_undo_insn; for (i = 0; i < env->subprog_cnt; i++) { subprog_start = subprog_end; @@ -5515,7 +5519,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) tmp = bpf_int_jit_compile(func[i]); if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { verbose(env, "JIT doesn't support bpf-to-bpf calls\n"); - err = -EFAULT; + err = -ENOTSUPP; goto out_free; } cond_resched(); @@ -5552,6 +5556,7 @@ out_free: if (func[i]) bpf_jit_free(func[i]); kfree(func); +out_undo_insn: /* cleanup main prog to be interpreted */ prog->jit_requested = 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { @@ -5578,6 +5583,8 @@ static int fixup_call_args(struct bpf_verifier_env *env) err = jit_subprogs(env); if (err == 0) return 0; + if (err == -EFAULT) + return err; } #ifndef CONFIG_BPF_JIT_ALWAYS_ON for (i = 0; i < prog->len; i++, insn++) { diff --git a/kernel/events/core.c b/kernel/events/core.c index 8f0434a9951a..eec2d5fb676b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6343,7 +6343,7 @@ static u64 perf_virt_to_phys(u64 virt) static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; -static struct perf_callchain_entry * +struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { bool kernel = !event->attr.exclude_callchain_kernel; @@ -6382,7 +6382,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; - data->callchain = perf_callchain(event, regs); + if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + data->callchain = perf_callchain(event, regs); + size += data->callchain->nr; header->size += size * sizeof(u64); @@ -7335,6 +7337,10 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter, struct file *file, unsigned long offset, unsigned long size) { + /* d_inode(NULL) won't be equal to any mapped user-space file */ + if (!filter->path.dentry) + return false; + if (d_inode(filter->path.dentry) != file_inode(file)) return false; diff --git a/kernel/fork.c b/kernel/fork.c index 9440d61b925c..1b27babc4c78 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -303,11 +303,36 @@ struct kmem_cache *files_cachep; struct kmem_cache *fs_cachep; /* SLAB cache for vm_area_struct structures */ -struct kmem_cache *vm_area_cachep; +static struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) +{ + struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + + if (vma) + vma_init(vma, mm); + return vma; +} + +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) +{ + struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + + if (new) { + *new = *orig; + INIT_LIST_HEAD(&new->anon_vma_chain); + } + return new; +} + +void vm_area_free(struct vm_area_struct *vma) +{ + kmem_cache_free(vm_area_cachep, vma); +} + static void account_kernel_stack(struct task_struct *tsk, int account) { void *stack = task_stack_page(tsk); @@ -455,11 +480,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto fail_nomem; charge = len; } - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + tmp = vm_area_dup(mpnt); if (!tmp) goto fail_nomem; - *tmp = *mpnt; - INIT_LIST_HEAD(&tmp->anon_vma_chain); retval = vma_dup_policy(mpnt, tmp); if (retval) goto fail_nomem_policy; @@ -539,7 +562,7 @@ fail_uprobe_end: fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); fail_nomem_policy: - kmem_cache_free(vm_area_cachep, tmp); + vm_area_free(tmp); fail_nomem: retval = -ENOMEM; vm_unacct_memory(charge); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index daeabd791d58..9a8b7ba9aa88 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1068,6 +1068,13 @@ static int irq_setup_forced_threading(struct irqaction *new) if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) return 0; + /* + * No further action required for interrupts which are requested as + * threaded interrupts already + */ + if (new->handler == irq_default_primary_handler) + return 0; + new->flags |= IRQF_ONESHOT; /* @@ -1075,7 +1082,7 @@ static int irq_setup_forced_threading(struct irqaction *new) * thread handler. We force thread them as well by creating a * secondary action. */ - if (new->handler != irq_default_primary_handler && new->thread_fn) { + if (new->handler && new->thread_fn) { /* Allocate the secondary action */ new->secondary = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!new->secondary) diff --git a/kernel/kthread.c b/kernel/kthread.c index 481951bf091d..486dedbd9af5 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -177,9 +177,20 @@ void *kthread_probe_data(struct task_struct *task) static void __kthread_parkme(struct kthread *self) { for (;;) { - set_current_state(TASK_PARKED); + /* + * TASK_PARKED is a special state; we must serialize against + * possible pending wakeups to avoid store-store collisions on + * task->state. + * + * Such a collision might possibly result in the task state + * changin from TASK_PARKED and us failing the + * wait_task_inactive() in kthread_park(). + */ + set_special_state(TASK_PARKED); if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; + + complete_all(&self->parked); schedule(); } __set_current_state(TASK_RUNNING); @@ -191,11 +202,6 @@ void kthread_parkme(void) } EXPORT_SYMBOL_GPL(kthread_parkme); -void kthread_park_complete(struct task_struct *k) -{ - complete_all(&to_kthread(k)->parked); -} - static int kthread(void *_create) { /* Copy data: it's on kthread's stack */ @@ -319,8 +325,14 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), task = create->result; if (!IS_ERR(task)) { static const struct sched_param param = { .sched_priority = 0 }; + char name[TASK_COMM_LEN]; - vsnprintf(task->comm, sizeof(task->comm), namefmt, args); + /* + * task is already visible to other tasks, so updating + * COMM must be protected. + */ + vsnprintf(name, sizeof(name), namefmt, args); + set_task_comm(task, name); /* * root may have changed our (kthreadd's) priority or CPU mask. * The kernel thread should not inherit these properties. @@ -461,6 +473,9 @@ void kthread_unpark(struct task_struct *k) reinit_completion(&kthread->parked); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + /* + * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. + */ wake_up_state(k, TASK_PARKED); } EXPORT_SYMBOL_GPL(kthread_unpark); @@ -487,7 +502,16 @@ int kthread_park(struct task_struct *k) set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); + /* + * Wait for __kthread_parkme() to complete(), this means we + * _will_ have TASK_PARKED and are about to call schedule(). + */ wait_for_completion(&kthread->parked); + /* + * Now wait for that schedule() to complete and the task to + * get scheduled out. + */ + WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED)); } return 0; diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 4f014be7a4b8..2823d4163a37 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1465,6 +1465,29 @@ rt_mutex_fastunlock(struct rt_mutex *lock, rt_mutex_postunlock(&wake_q); } +static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass) +{ + might_sleep(); + + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +/** + * rt_mutex_lock_nested - lock a rt_mutex + * + * @lock: the rt_mutex to be locked + * @subclass: the lockdep subclass + */ +void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass) +{ + __rt_mutex_lock(lock, subclass); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); +#endif + +#ifndef CONFIG_DEBUG_LOCK_ALLOC /** * rt_mutex_lock - lock a rt_mutex * @@ -1472,12 +1495,10 @@ rt_mutex_fastunlock(struct rt_mutex *lock, */ void __sched rt_mutex_lock(struct rt_mutex *lock) { - might_sleep(); - - mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); - rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); + __rt_mutex_lock(lock, 0); } EXPORT_SYMBOL_GPL(rt_mutex_lock); +#endif /** * rt_mutex_lock_interruptible - lock a rt_mutex interruptible diff --git a/kernel/memremap.c b/kernel/memremap.c index 5857267a4af5..38283363da06 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -176,10 +176,27 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) unsigned long pfn, pgoff, order; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; + struct dev_pagemap *conflict_pgmap; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) - align_start; + align_end = align_start + align_size - 1; + + conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_start), NULL); + if (conflict_pgmap) { + dev_WARN(dev, "Conflicting mapping in same section\n"); + put_dev_pagemap(conflict_pgmap); + return ERR_PTR(-ENOMEM); + } + + conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL); + if (conflict_pgmap) { + dev_WARN(dev, "Conflicting mapping in same section\n"); + put_dev_pagemap(conflict_pgmap); + return ERR_PTR(-ENOMEM); + } + is_ram = region_intersects(align_start, align_size, IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); @@ -199,7 +216,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) mutex_lock(&pgmap_lock); error = 0; - align_end = align_start + align_size - 1; foreach_order_pgoff(res, order, pgoff) { error = __radix_tree_insert(&pgmap_radix, @@ -305,7 +321,7 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap); #ifdef CONFIG_DEV_PAGEMAP_OPS DEFINE_STATIC_KEY_FALSE(devmap_managed_key); -EXPORT_SYMBOL_GPL(devmap_managed_key); +EXPORT_SYMBOL(devmap_managed_key); static atomic_t devmap_enable; /* @@ -346,5 +362,5 @@ void __put_devmap_managed_page(struct page *page) } else if (!count) __put_page(page); } -EXPORT_SYMBOL_GPL(__put_devmap_managed_page); +EXPORT_SYMBOL(__put_devmap_managed_page); #endif /* CONFIG_DEV_PAGEMAP_OPS */ diff --git a/kernel/rseq.c b/kernel/rseq.c index 22b6acf1ad63..c6242d8594dc 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -85,9 +85,9 @@ static int rseq_update_cpu_id(struct task_struct *t) { u32 cpu_id = raw_smp_processor_id(); - if (__put_user(cpu_id, &t->rseq->cpu_id_start)) + if (put_user(cpu_id, &t->rseq->cpu_id_start)) return -EFAULT; - if (__put_user(cpu_id, &t->rseq->cpu_id)) + if (put_user(cpu_id, &t->rseq->cpu_id)) return -EFAULT; trace_rseq_update(t); return 0; @@ -100,14 +100,14 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t) /* * Reset cpu_id_start to its initial state (0). */ - if (__put_user(cpu_id_start, &t->rseq->cpu_id_start)) + if (put_user(cpu_id_start, &t->rseq->cpu_id_start)) return -EFAULT; /* * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming * in after unregistration can figure out that rseq needs to be * registered again. */ - if (__put_user(cpu_id, &t->rseq->cpu_id)) + if (put_user(cpu_id, &t->rseq->cpu_id)) return -EFAULT; return 0; } @@ -115,29 +115,36 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t) static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) { struct rseq_cs __user *urseq_cs; - unsigned long ptr; + u64 ptr; u32 __user *usig; u32 sig; int ret; - ret = __get_user(ptr, &t->rseq->rseq_cs); - if (ret) - return ret; + if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr))) + return -EFAULT; if (!ptr) { memset(rseq_cs, 0, sizeof(*rseq_cs)); return 0; } - urseq_cs = (struct rseq_cs __user *)ptr; + if (ptr >= TASK_SIZE) + return -EINVAL; + urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) return -EFAULT; - if (rseq_cs->version > 0) - return -EINVAL; + if (rseq_cs->start_ip >= TASK_SIZE || + rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || + rseq_cs->abort_ip >= TASK_SIZE || + rseq_cs->version > 0) + return -EINVAL; + /* Check for overflow. */ + if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) + return -EINVAL; /* Ensure that abort_ip is not in the critical section. */ if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) return -EINVAL; - usig = (u32 __user *)(rseq_cs->abort_ip - sizeof(u32)); + usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); ret = get_user(sig, usig); if (ret) return ret; @@ -146,7 +153,7 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) printk_ratelimited(KERN_WARNING "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", sig, current->rseq_sig, current->pid, usig); - return -EPERM; + return -EINVAL; } return 0; } @@ -157,7 +164,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags) int ret; /* Get thread flags. */ - ret = __get_user(flags, &t->rseq->flags); + ret = get_user(flags, &t->rseq->flags); if (ret) return ret; @@ -195,9 +202,11 @@ static int clear_rseq_cs(struct task_struct *t) * of code outside of the rseq assembly block. This performs * a lazy clear of the rseq_cs field. * - * Set rseq_cs to NULL with single-copy atomicity. + * Set rseq_cs to NULL. */ - return __put_user(0UL, &t->rseq->rseq_cs); + if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64))) + return -EFAULT; + return 0; } /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 78d8facba456..fe365c9a08e9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7,7 +7,6 @@ */ #include "sched.h" -#include <linux/kthread.h> #include <linux/nospec.h> #include <linux/kcov.h> @@ -2724,28 +2723,20 @@ static struct rq *finish_task_switch(struct task_struct *prev) membarrier_mm_sync_core_before_usermode(mm); mmdrop(mm); } - if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) { - switch (prev_state) { - case TASK_DEAD: - if (prev->sched_class->task_dead) - prev->sched_class->task_dead(prev); + if (unlikely(prev_state == TASK_DEAD)) { + if (prev->sched_class->task_dead) + prev->sched_class->task_dead(prev); - /* - * Remove function-return probe instances associated with this - * task and put them back on the free list. - */ - kprobe_flush_task(prev); - - /* Task is done with its stack. */ - put_task_stack(prev); + /* + * Remove function-return probe instances associated with this + * task and put them back on the free list. + */ + kprobe_flush_task(prev); - put_task_struct(prev); - break; + /* Task is done with its stack. */ + put_task_stack(prev); - case TASK_PARKED: - kthread_park_complete(prev); - break; - } + put_task_struct(prev); } tick_nohz_task_switch(); @@ -3113,7 +3104,9 @@ static void sched_tick_remote(struct work_struct *work) struct tick_work *twork = container_of(dwork, struct tick_work, work); int cpu = twork->cpu; struct rq *rq = cpu_rq(cpu); + struct task_struct *curr; struct rq_flags rf; + u64 delta; /* * Handle the tick only if it appears the remote CPU is running in full @@ -3122,24 +3115,28 @@ static void sched_tick_remote(struct work_struct *work) * statistics and checks timeslices in a time-independent way, regardless * of when exactly it is running. */ - if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) { - struct task_struct *curr; - u64 delta; + if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) + goto out_requeue; - rq_lock_irq(rq, &rf); - update_rq_clock(rq); - curr = rq->curr; - delta = rq_clock_task(rq) - curr->se.exec_start; + rq_lock_irq(rq, &rf); + curr = rq->curr; + if (is_idle_task(curr)) + goto out_unlock; - /* - * Make sure the next tick runs within a reasonable - * amount of time. - */ - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); - curr->sched_class->task_tick(rq, curr, 0); - rq_unlock_irq(rq, &rf); - } + update_rq_clock(rq); + delta = rq_clock_task(rq) - curr->se.exec_start; + + /* + * Make sure the next tick runs within a reasonable + * amount of time. + */ + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + curr->sched_class->task_tick(rq, curr, 0); + +out_unlock: + rq_unlock_irq(rq, &rf); +out_requeue: /* * Run the remote tick once per second (1Hz). This arbitrary * frequency is large enough to avoid overload but short enough diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 3cde46483f0a..c907fde01eaa 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -192,7 +192,7 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); - if (rq->rt.rt_nr_running) + if (rt_rq_is_runnable(&rq->rt)) return sg_cpu->max; /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fbfc3f1d368a..b5fbdde6afa9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2090,8 +2090,14 @@ retry: sub_rq_bw(&next_task->dl, &rq->dl); set_task_cpu(next_task, later_rq->cpu); add_rq_bw(&next_task->dl, &later_rq->dl); + + /* + * Update the later_rq clock here, because the clock is used + * by the cpufreq_update_util() inside __add_running_bw(). + */ + update_rq_clock(later_rq); add_running_bw(&next_task->dl, &later_rq->dl); - activate_task(later_rq, next_task, 0); + activate_task(later_rq, next_task, ENQUEUE_NOCLOCK); ret = 1; resched_curr(later_rq); @@ -2290,8 +2296,17 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) if (task_on_rq_queued(p) && p->dl.dl_runtime) task_non_contending(p); - if (!task_on_rq_queued(p)) + if (!task_on_rq_queued(p)) { + /* + * Inactive timer is armed. However, p is leaving DEADLINE and + * might migrate away from this rq while continuing to run on + * some other class. We need to remove its contribution from + * this rq running_bw now, or sub_rq_bw (below) will complain. + */ + if (p->dl.dl_non_contending) + sub_running_bw(&p->dl, &rq->dl); sub_rq_bw(&p->dl, &rq->dl); + } /* * We cannot use inactive_task_timer() to invoke sub_running_bw() diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1866e64792a7..2f0a0be4d344 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3982,18 +3982,10 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) if (!sched_feat(UTIL_EST)) return; - /* - * Update root cfs_rq's estimated utilization - * - * If *p is the last task then the root cfs_rq's estimated utilization - * of a CPU is 0 by definition. - */ - ue.enqueued = 0; - if (cfs_rq->nr_running) { - ue.enqueued = cfs_rq->avg.util_est.enqueued; - ue.enqueued -= min_t(unsigned int, ue.enqueued, - (_task_util_est(p) | UTIL_AVG_UNCHANGED)); - } + /* Update root cfs_rq's estimated utilization */ + ue.enqueued = cfs_rq->avg.util_est.enqueued; + ue.enqueued -= min_t(unsigned int, ue.enqueued, + (_task_util_est(p) | UTIL_AVG_UNCHANGED)); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); /* @@ -4590,6 +4582,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) now = sched_clock_cpu(smp_processor_id()); cfs_b->runtime = cfs_b->quota; cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); + cfs_b->expires_seq++; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -4612,6 +4605,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) struct task_group *tg = cfs_rq->tg; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); u64 amount = 0, min_amount, expires; + int expires_seq; /* note: this is a positive sum as runtime_remaining <= 0 */ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; @@ -4628,6 +4622,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } + expires_seq = cfs_b->expires_seq; expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); @@ -4637,8 +4632,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) * spread between our sched_clock and the one on which runtime was * issued. */ - if ((s64)(expires - cfs_rq->runtime_expires) > 0) + if (cfs_rq->expires_seq != expires_seq) { + cfs_rq->expires_seq = expires_seq; cfs_rq->runtime_expires = expires; + } return cfs_rq->runtime_remaining > 0; } @@ -4664,12 +4661,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) * has not truly expired. * * Fortunately we can check determine whether this the case by checking - * whether the global deadline has advanced. It is valid to compare - * cfs_b->runtime_expires without any locks since we only care about - * exact equality, so a partial write will still work. + * whether the global deadline(cfs_b->expires_seq) has advanced. */ - - if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { + if (cfs_rq->expires_seq == cfs_b->expires_seq) { /* extend local deadline, drift is bounded above by 2 ticks */ cfs_rq->runtime_expires += TICK_NSEC; } else { @@ -5202,13 +5196,18 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { + u64 overrun; + lockdep_assert_held(&cfs_b->lock); - if (!cfs_b->period_active) { - cfs_b->period_active = 1; - hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); - hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); - } + if (cfs_b->period_active) + return; + + cfs_b->period_active = 1; + overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); + cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period); + cfs_b->expires_seq++; + hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); } static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 47556b0c9a95..eaaec8364f96 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -508,8 +508,11 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) rt_se = rt_rq->tg->rt_se[cpu]; - if (!rt_se) + if (!rt_se) { dequeue_top_rt_rq(rt_rq); + /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_util(rq_of_rt_rq(rt_rq), 0); + } else if (on_rt_rq(rt_se)) dequeue_rt_entity(rt_se, 0); } @@ -833,6 +836,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) * can be time-consuming. Try to avoid it when possible. */ raw_spin_lock(&rt_rq->rt_runtime_lock); + if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF) + rt_rq->rt_runtime = rt_b->rt_runtime; skip = !rt_rq->rt_time && !rt_rq->rt_nr_running; raw_spin_unlock(&rt_rq->rt_runtime_lock); if (skip) @@ -1001,8 +1006,6 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq) sub_nr_running(rq, rt_rq->rt_nr_running); rt_rq->rt_queued = 0; - /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ - cpufreq_update_util(rq, 0); } static void @@ -1014,11 +1017,14 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq) if (rt_rq->rt_queued) return; - if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running) + + if (rt_rq_throttled(rt_rq)) return; - add_nr_running(rq, rt_rq->rt_nr_running); - rt_rq->rt_queued = 1; + if (rt_rq->rt_nr_running) { + add_nr_running(rq, rt_rq->rt_nr_running); + rt_rq->rt_queued = 1; + } /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ cpufreq_update_util(rq, 0); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6601baf2361c..c7742dcc136c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -334,9 +334,10 @@ struct cfs_bandwidth { u64 runtime; s64 hierarchical_quota; u64 runtime_expires; + int expires_seq; - int idle; - int period_active; + short idle; + short period_active; struct hrtimer period_timer; struct hrtimer slack_timer; struct list_head throttled_cfs_rq; @@ -551,6 +552,7 @@ struct cfs_rq { #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; + int expires_seq; u64 runtime_expires; s64 runtime_remaining; @@ -609,6 +611,11 @@ struct rt_rq { #endif }; +static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq) +{ + return rt_rq->rt_queued && rt_rq->rt_nr_running; +} + /* Deadline class' related fields in a runqueue */ struct dl_rq { /* runqueue is an rbtree, ordered by deadline */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 05a831427bc7..56a0fed30c0a 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -47,7 +47,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); } - if (!cpumask_test_cpu(cpu, sched_group_span(group))) { + if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) { printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); } diff --git a/kernel/softirq.c b/kernel/softirq.c index 900dcfee542c..6f584861d329 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -79,12 +79,16 @@ static void wakeup_softirqd(void) /* * If ksoftirqd is scheduled, we do not want to process pending softirqs - * right now. Let ksoftirqd handle this at its own rate, to get fairness. + * right now. Let ksoftirqd handle this at its own rate, to get fairness, + * unless we're doing some of the synchronous softirqs. */ -static bool ksoftirqd_running(void) +#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ)) +static bool ksoftirqd_running(unsigned long pending) { struct task_struct *tsk = __this_cpu_read(ksoftirqd); + if (pending & SOFTIRQ_NOW_MASK) + return false; return tsk && (tsk->state == TASK_RUNNING); } @@ -328,7 +332,7 @@ asmlinkage __visible void do_softirq(void) pending = local_softirq_pending(); - if (pending && !ksoftirqd_running()) + if (pending && !ksoftirqd_running(pending)) do_softirq_own_stack(); local_irq_restore(flags); @@ -355,7 +359,7 @@ void irq_enter(void) static inline void invoke_softirq(void) { - if (ksoftirqd_running()) + if (ksoftirqd_running(local_softirq_pending())) return; if (!force_irqthreads) { @@ -386,7 +390,7 @@ static inline void tick_irq_exit(void) /* Make sure that timer wheel updates are propagated */ if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) { - if (!in_interrupt()) + if (!in_irq()) tick_nohz_irq_exit(); } #endif diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index f89014a2c238..e190d1ef3a23 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -260,6 +260,15 @@ retry: err = 0; __cpu_stop_queue_work(stopper1, work1, &wakeq); __cpu_stop_queue_work(stopper2, work2, &wakeq); + /* + * The waking up of stopper threads has to happen + * in the same scheduling context as the queueing. + * Otherwise, there is a possibility of one of the + * above stoppers being woken up by another CPU, + * and preempting us. This will cause us to n ot + * wake up the other stopper forever. + */ + preempt_disable(); unlock: raw_spin_unlock(&stopper2->lock); raw_spin_unlock_irq(&stopper1->lock); @@ -270,7 +279,10 @@ unlock: goto retry; } - wake_up_q(&wakeq); + if (!err) { + wake_up_q(&wakeq); + preempt_enable(); + } return err; } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b7005dd21ec1..14de3727b18e 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -277,8 +277,7 @@ static bool tick_check_preferred(struct clock_event_device *curdev, */ return !curdev || newdev->rating > curdev->rating || - (!cpumask_equal(curdev->cpumask, newdev->cpumask) && - !tick_check_percpu(curdev, newdev, smp_processor_id())); + !cpumask_equal(curdev->cpumask, newdev->cpumask); } /* diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index da9455a6b42b..5b33e2f5c0ed 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -642,7 +642,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) static inline bool local_timer_softirq_pending(void) { - return local_softirq_pending() & TIMER_SOFTIRQ; + return local_softirq_pending() & BIT(TIMER_SOFTIRQ); } static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index efed9c1cfb7e..caf9cbf35816 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -192,17 +192,6 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, op->saved_func(ip, parent_ip, op, regs); } -/** - * clear_ftrace_function - reset the ftrace function - * - * This NULLs the ftrace function and in essence stops - * tracing. There may be lag - */ -void clear_ftrace_function(void) -{ - ftrace_trace_function = ftrace_stub; -} - static void ftrace_sync(struct work_struct *work) { /* @@ -6689,7 +6678,7 @@ void ftrace_kill(void) { ftrace_disabled = 1; ftrace_enabled = 0; - clear_ftrace_function(); + ftrace_trace_function = ftrace_stub; } /** diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6a46af21765c..0b0b688ea166 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3227,6 +3227,22 @@ int ring_buffer_record_is_on(struct ring_buffer *buffer) } /** + * ring_buffer_record_is_set_on - return true if the ring buffer is set writable + * @buffer: The ring buffer to see if write is set enabled + * + * Returns true if the ring buffer is set writable by ring_buffer_record_on(). + * Note that this does NOT mean it is in a writable state. + * + * It may return true when the ring buffer has been disabled by + * ring_buffer_record_disable(), as that is a temporary disabling of + * the ring buffer. + */ +int ring_buffer_record_is_set_on(struct ring_buffer *buffer) +{ + return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF); +} + +/** * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer * @buffer: The ring buffer to stop writes to. * @cpu: The CPU buffer to stop diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a0079b4c7a49..823687997b01 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1373,6 +1373,12 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) arch_spin_lock(&tr->max_lock); + /* Inherit the recordable setting from trace_buffer */ + if (ring_buffer_record_is_set_on(tr->trace_buffer.buffer)) + ring_buffer_record_on(tr->max_buffer.buffer); + else + ring_buffer_record_off(tr->max_buffer.buffer); + swap(tr->trace_buffer.buffer, tr->max_buffer.buffer); __update_max_tr(tr, tsk, cpu); @@ -2953,6 +2959,7 @@ out_nobuffer: } EXPORT_SYMBOL_GPL(trace_vbprintk); +__printf(3, 0) static int __trace_array_vprintk(struct ring_buffer *buffer, unsigned long ip, const char *fmt, va_list args) @@ -3007,12 +3014,14 @@ out_nobuffer: return len; } +__printf(3, 0) int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args); } +__printf(3, 0) int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...) { @@ -3028,6 +3037,7 @@ int trace_array_printk(struct trace_array *tr, return ret; } +__printf(3, 4) int trace_array_printk_buf(struct ring_buffer *buffer, unsigned long ip, const char *fmt, ...) { @@ -3043,6 +3053,7 @@ int trace_array_printk_buf(struct ring_buffer *buffer, return ret; } +__printf(2, 0) int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { return trace_array_vprintk(&global_trace, ip, fmt, args); @@ -3360,8 +3371,8 @@ static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m, print_event_info(buf, m); - seq_printf(m, "# TASK-PID CPU# %s TIMESTAMP FUNCTION\n", tgid ? "TGID " : ""); - seq_printf(m, "# | | | %s | |\n", tgid ? " | " : ""); + seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? "TGID " : ""); + seq_printf(m, "# | | %s | | |\n", tgid ? " | " : ""); } static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m, @@ -3381,9 +3392,9 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file tgid ? tgid_space : space); seq_printf(m, "# %s||| / delay\n", tgid ? tgid_space : space); - seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n", + seq_printf(m, "# TASK-PID %sCPU# |||| TIMESTAMP FUNCTION\n", tgid ? " TGID " : space); - seq_printf(m, "# | | | %s|||| | |\n", + seq_printf(m, "# | | %s | |||| | |\n", tgid ? " | " : space); } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 630c5a24b2b2..f8f86231ad90 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -583,9 +583,7 @@ static __always_inline void trace_clear_recursion(int bit) static inline struct ring_buffer_iter * trace_buffer_iter(struct trace_iterator *iter, int cpu) { - if (iter->buffer_iter && iter->buffer_iter[cpu]) - return iter->buffer_iter[cpu]; - return NULL; + return iter->buffer_iter ? iter->buffer_iter[cpu] : NULL; } int tracer_init(struct tracer *t, struct trace_array *tr); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0dceb77d1d42..893a206bcba4 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1701,6 +1701,7 @@ static void create_filter_finish(struct filter_parse_error *pe) * @filter_str: filter string * @set_str: remember @filter_str and enable detailed error in filter * @filterp: out param for created filter (always updated on return) + * Must be a pointer that references a NULL pointer. * * Creates a filter for @call with @filter_str. If @set_str is %true, * @filter_str is copied and recorded in the new filter. @@ -1718,6 +1719,10 @@ static int create_filter(struct trace_event_call *call, struct filter_parse_error *pe = NULL; int err; + /* filterp must point to NULL */ + if (WARN_ON(*filterp)) + *filterp = NULL; + err = create_filter_start(filter_string, set_str, &pe, filterp); if (err) return err; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 046c716a6536..aae18af94c94 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -393,7 +393,7 @@ static void hist_err_event(char *str, char *system, char *event, char *var) else if (system) snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event); else - strncpy(err, var, MAX_FILTER_STR_VAL); + strscpy(err, var, MAX_FILTER_STR_VAL); hist_err(str, err); } diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d18249683682..5dea177cef53 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -679,6 +679,8 @@ event_trigger_callback(struct event_command *cmd_ops, goto out_free; out_reg: + /* Up the trigger_data count to make sure reg doesn't free it on failure */ + event_trigger_init(trigger_ops, trigger_data); ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); /* * The above returns on success the # of functions enabled, @@ -686,11 +688,13 @@ event_trigger_callback(struct event_command *cmd_ops, * Consider no functions a failure too. */ if (!ret) { + cmd_ops->unreg(glob, trigger_ops, trigger_data, file); ret = -ENOENT; - goto out_free; - } else if (ret < 0) - goto out_free; - ret = 0; + } else if (ret > 0) + ret = 0; + + /* Down the counter of trigger_data or free it if not used anymore */ + event_trigger_free(trigger_ops, trigger_data); out: return ret; @@ -1416,6 +1420,9 @@ int event_enable_trigger_func(struct event_command *cmd_ops, goto out; } + /* Up the trigger_data count to make sure nothing frees it on failure */ + event_trigger_init(trigger_ops, trigger_data); + if (trigger) { number = strsep(&trigger, ":"); @@ -1466,6 +1473,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops, goto out_disable; /* Just return zero, not the number of enabled functions */ ret = 0; + event_trigger_free(trigger_ops, trigger_data); out: return ret; @@ -1476,7 +1484,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops, out_free: if (cmd_ops->set_filter) cmd_ops->set_filter(NULL, trigger_data, NULL); - kfree(trigger_data); + event_trigger_free(trigger_ops, trigger_data); kfree(enable_data); goto out; } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 23c0b0cb5fb9..169b3c44ee97 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -831,6 +831,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, struct ftrace_graph_ret *graph_ret; struct ftrace_graph_ent *call; unsigned long long duration; + int cpu = iter->cpu; int i; graph_ret = &ret_entry->ret; @@ -839,7 +840,6 @@ print_graph_entry_leaf(struct trace_iterator *iter, if (data) { struct fgraph_cpu_data *cpu_data; - int cpu = iter->cpu; cpu_data = per_cpu_ptr(data->cpu_data, cpu); @@ -869,6 +869,9 @@ print_graph_entry_leaf(struct trace_iterator *iter, trace_seq_printf(s, "%ps();\n", (void *)call->func); + print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET, + cpu, iter->ent->pid, flags); + return trace_handle_return(s); } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index daa81571b22a..6b71860f3998 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -400,11 +400,10 @@ static struct trace_kprobe *find_trace_kprobe(const char *event, static int enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) { + struct event_file_link *link = NULL; int ret = 0; if (file) { - struct event_file_link *link; - link = kmalloc(sizeof(*link), GFP_KERNEL); if (!link) { ret = -ENOMEM; @@ -424,6 +423,18 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file) else ret = enable_kprobe(&tk->rp.kp); } + + if (ret) { + if (file) { + /* Notice the if is true on not WARN() */ + if (!WARN_ON_ONCE(!link)) + list_del_rcu(&link->list); + kfree(link); + tk->tp.flags &= ~TP_FLAG_TRACE; + } else { + tk->tp.flags &= ~TP_FLAG_PROFILE; + } + } out: return ret; } @@ -1480,8 +1491,10 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, } ret = __register_trace_kprobe(tk); - if (ret < 0) + if (ret < 0) { + kfree(tk->tp.call.print_fmt); goto error; + } return &tk->tp.call; error: @@ -1501,6 +1514,8 @@ void destroy_local_trace_kprobe(struct trace_event_call *event_call) } __unregister_trace_kprobe(tk); + + kfree(tk->tp.call.print_fmt); free_trace_kprobe(tk); } #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 90db994ac900..1c8e30fda46a 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -594,8 +594,7 @@ int trace_print_context(struct trace_iterator *iter) trace_find_cmdline(entry->pid, comm); - trace_seq_printf(s, "%16s-%-5d [%03d] ", - comm, entry->pid, iter->cpu); + trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { unsigned int tgid = trace_find_tgid(entry->pid); @@ -606,6 +605,8 @@ int trace_print_context(struct trace_iterator *iter) trace_seq_printf(s, "(%5d) ", tgid); } + trace_seq_printf(s, "[%03d] ", iter->cpu); + if (tr->trace_flags & TRACE_ITER_IRQ_INFO) trace_print_lat_fmt(s, entry); |