diff options
Diffstat (limited to 'kernel')
54 files changed, 2422 insertions, 681 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 227db99b0f19..d97e8f0f73ca 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -443,15 +443,15 @@ static int audit_set_failure(u32 state) * Drop any references inside the auditd connection tracking struct and free * the memory. */ - static void auditd_conn_free(struct rcu_head *rcu) - { +static void auditd_conn_free(struct rcu_head *rcu) +{ struct auditd_connection *ac; ac = container_of(rcu, struct auditd_connection, rcu); put_pid(ac->pid); put_net(ac->net); kfree(ac); - } +} /** * auditd_set - Set/Reset the auditd connection state diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index c1c0b60d3f2f..43171a0bb02b 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -495,6 +495,42 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); /** + * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and + * provided by user sockaddr + * @sk: sock struct that will use sockaddr + * @uaddr: sockaddr struct provided by user + * @type: The type of program to be executed + * + * socket is expected to be of type INET or INET6. + * + * This function will return %-EPERM if an attached program is found and + * returned value != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, + struct sockaddr *uaddr, + enum bpf_attach_type type) +{ + struct bpf_sock_addr_kern ctx = { + .sk = sk, + .uaddr = uaddr, + }; + struct cgroup *cgrp; + int ret; + + /* Check socket family since not all sockets represent network + * endpoint (e.g. AF_UNIX). + */ + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + + return ret == 1 ? 
0 : -EPERM; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); + +/** * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock * @sk: socket to get cgroup from * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains @@ -545,7 +581,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); static const struct bpf_func_proto * -cgroup_dev_func_proto(enum bpf_func_id func_id) +cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -566,6 +602,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id) static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 8740406df2cd..d6b76377cb6e 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -113,16 +113,16 @@ static const char *const bpf_jmp_string[16] = { }; static void print_bpf_end_insn(bpf_insn_print_t verbose, - struct bpf_verifier_env *env, + void *private_data, const struct bpf_insn *insn) { - verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, + verbose(private_data, "(%02x) r%d = %s%d r%d\n", + insn->code, insn->dst_reg, BPF_SRC(insn->code) == BPF_TO_BE ? 
"be" : "le", insn->imm, insn->dst_reg); } void print_bpf_insn(const struct bpf_insn_cbs *cbs, - struct bpf_verifier_env *env, const struct bpf_insn *insn, bool allow_ptr_leaks) { @@ -132,23 +132,23 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, if (class == BPF_ALU || class == BPF_ALU64) { if (BPF_OP(insn->code) == BPF_END) { if (class == BPF_ALU64) - verbose(env, "BUG_alu64_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_alu64_%02x\n", insn->code); else - print_bpf_end_insn(verbose, env, insn); + print_bpf_end_insn(verbose, cbs->private_data, insn); } else if (BPF_OP(insn->code) == BPF_NEG) { - verbose(env, "(%02x) r%d = %s-r%d\n", + verbose(cbs->private_data, "(%02x) r%d = %s-r%d\n", insn->code, insn->dst_reg, class == BPF_ALU ? "(u32) " : "", insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(env, "(%02x) %sr%d %s %sr%d\n", + verbose(cbs->private_data, "(%02x) %sr%d %s %sr%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], class == BPF_ALU ? "(u32) " : "", insn->src_reg); } else { - verbose(env, "(%02x) %sr%d %s %s%d\n", + verbose(cbs->private_data, "(%02x) %sr%d %s %s%d\n", insn->code, class == BPF_ALU ? 
"(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], @@ -157,46 +157,46 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, } } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_MEM) - verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n", + verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else if (BPF_MODE(insn->code) == BPF_XADD) - verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", + verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else - verbose(env, "BUG_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_%02x\n", insn->code); } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM) { - verbose(env, "BUG_st_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); return; } - verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n", + verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->imm); } else if (class == BPF_LDX) { if (BPF_MODE(insn->code) != BPF_MEM) { - verbose(env, "BUG_ldx_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code); return; } - verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n", + verbose(cbs->private_data, "(%02x) r%d = *(%s *)(r%d %+d)\n", insn->code, insn->dst_reg, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->off); } else if (class == BPF_LD) { if (BPF_MODE(insn->code) == BPF_ABS) { - verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n", + verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[%d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->imm); } else if (BPF_MODE(insn->code) == BPF_IND) { - verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", + verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", 
insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->imm); @@ -212,12 +212,12 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, if (map_ptr && !allow_ptr_leaks) imm = 0; - verbose(env, "(%02x) r%d = %s\n", + verbose(cbs->private_data, "(%02x) r%d = %s\n", insn->code, insn->dst_reg, __func_imm_name(cbs, insn, imm, tmp, sizeof(tmp))); } else { - verbose(env, "BUG_ld_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code); return; } } else if (class == BPF_JMP) { @@ -227,35 +227,35 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, char tmp[64]; if (insn->src_reg == BPF_PSEUDO_CALL) { - verbose(env, "(%02x) call pc%s\n", + verbose(cbs->private_data, "(%02x) call pc%s\n", insn->code, __func_get_name(cbs, insn, tmp, sizeof(tmp))); } else { strcpy(tmp, "unknown"); - verbose(env, "(%02x) call %s#%d\n", insn->code, + verbose(cbs->private_data, "(%02x) call %s#%d\n", insn->code, __func_get_name(cbs, insn, tmp, sizeof(tmp)), insn->imm); } } else if (insn->code == (BPF_JMP | BPF_JA)) { - verbose(env, "(%02x) goto pc%+d\n", + verbose(cbs->private_data, "(%02x) goto pc%+d\n", insn->code, insn->off); } else if (insn->code == (BPF_JMP | BPF_EXIT)) { - verbose(env, "(%02x) exit\n", insn->code); + verbose(cbs->private_data, "(%02x) exit\n", insn->code); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n", + verbose(cbs->private_data, "(%02x) if r%d %s r%d goto pc%+d\n", insn->code, insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->src_reg, insn->off); } else { - verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n", + verbose(cbs->private_data, "(%02x) if r%d %s 0x%x goto pc%+d\n", insn->code, insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->imm, insn->off); } } else { - verbose(env, "(%02x) %s\n", + verbose(cbs->private_data, "(%02x) %s\n", insn->code, bpf_class_string[class]); } } diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index 
266fe8ee542b..e1324a834a24 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -22,14 +22,12 @@ #include <string.h> #endif -struct bpf_verifier_env; - extern const char *const bpf_alu_string[16]; extern const char *const bpf_class_string[8]; const char *func_id_name(int id); -typedef __printf(2, 3) void (*bpf_insn_print_t)(struct bpf_verifier_env *env, +typedef __printf(2, 3) void (*bpf_insn_print_t)(void *private_data, const char *, ...); typedef const char *(*bpf_insn_revmap_call_t)(void *private_data, const struct bpf_insn *insn); @@ -45,7 +43,6 @@ struct bpf_insn_cbs { }; void print_bpf_insn(const struct bpf_insn_cbs *cbs, - struct bpf_verifier_env *env, const struct bpf_insn *insn, bool allow_ptr_leaks); #endif diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 81e2f6995adb..bf6da59ae0d0 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -178,6 +178,9 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) static struct dentry * bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { + /* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future + * extensions. 
+ */ if (strchr(dentry->d_name.name, '.')) return ERR_PTR(-EPERM); diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index a927e89dad6e..d2bda5aa25d7 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -38,8 +38,11 @@ #include <linux/skbuff.h> #include <linux/workqueue.h> #include <linux/list.h> +#include <linux/mm.h> #include <net/strparser.h> #include <net/tcp.h> +#include <linux/ptr_ring.h> +#include <net/inet_common.h> #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) @@ -47,6 +50,7 @@ struct bpf_stab { struct bpf_map map; struct sock **sock_map; + struct bpf_prog *bpf_tx_msg; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; }; @@ -62,8 +66,7 @@ struct smap_psock_map_entry { struct smap_psock { struct rcu_head rcu; - /* refcnt is used inside sk_callback_lock */ - u32 refcnt; + refcount_t refcnt; /* datapath variables */ struct sk_buff_head rxqueue; @@ -74,7 +77,17 @@ struct smap_psock { int save_off; struct sk_buff *save_skb; + /* datapath variables for tx_msg ULP */ + struct sock *sk_redir; + int apply_bytes; + int cork_bytes; + int sg_size; + int eval; + struct sk_msg_buff *cork; + struct list_head ingress; + struct strparser strp; + struct bpf_prog *bpf_tx_msg; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; struct list_head maps; @@ -92,11 +105,33 @@ struct smap_psock { void (*save_write_space)(struct sock *sk); }; +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); +static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len); +static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); +static int bpf_tcp_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags); + static inline struct smap_psock *smap_psock_sk(const struct sock *sk) { return rcu_dereference_sk_user_data(sk); } +static bool bpf_tcp_stream_read(const struct sock *sk) +{ + struct smap_psock *psock; + 
bool empty = true; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out; + empty = list_empty(&psock->ingress); +out: + rcu_read_unlock(); + return !empty; +} + static struct proto tcp_bpf_proto; static int bpf_tcp_init(struct sock *sk) { @@ -116,31 +151,48 @@ static int bpf_tcp_init(struct sock *sk) psock->save_close = sk->sk_prot->close; psock->sk_proto = sk->sk_prot; + + if (psock->bpf_tx_msg) { + tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg; + tcp_bpf_proto.sendpage = bpf_tcp_sendpage; + tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg; + tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read; + } + sk->sk_prot = &tcp_bpf_proto; rcu_read_unlock(); return 0; } +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md); + static void bpf_tcp_release(struct sock *sk) { struct smap_psock *psock; rcu_read_lock(); psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out; - if (likely(psock)) { - sk->sk_prot = psock->sk_proto; - psock->sk_proto = NULL; + if (psock->cork) { + free_start_sg(psock->sock, psock->cork); + kfree(psock->cork); + psock->cork = NULL; } + + sk->sk_prot = psock->sk_proto; + psock->sk_proto = NULL; +out: rcu_read_unlock(); } -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); - static void bpf_tcp_close(struct sock *sk, long timeout) { void (*close_fun)(struct sock *sk, long timeout); struct smap_psock_map_entry *e, *tmp; + struct sk_msg_buff *md, *mtmp; struct smap_psock *psock; struct sock *osk; @@ -159,6 +211,12 @@ static void bpf_tcp_close(struct sock *sk, long timeout) close_fun = psock->save_close; write_lock_bh(&sk->sk_callback_lock); + list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { + list_del(&md->list); + free_start_sg(psock->sock, md); + kfree(md); + } + list_for_each_entry_safe(e, tmp, &psock->maps, list) { osk = cmpxchg(e->entry, sk, NULL); if (osk == sk) { @@ -175,6 +233,7 @@ enum 
__sk_action { __SK_DROP = 0, __SK_PASS, __SK_REDIRECT, + __SK_NONE, }; static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { @@ -186,10 +245,782 @@ static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { .release = bpf_tcp_release, }; +static int memcopy_from_iter(struct sock *sk, + struct sk_msg_buff *md, + struct iov_iter *from, int bytes) +{ + struct scatterlist *sg = md->sg_data; + int i = md->sg_curr, rc = -ENOSPC; + + do { + int copy; + char *to; + + if (md->sg_copybreak >= sg[i].length) { + md->sg_copybreak = 0; + + if (++i == MAX_SKB_FRAGS) + i = 0; + + if (i == md->sg_end) + break; + } + + copy = sg[i].length - md->sg_copybreak; + to = sg_virt(&sg[i]) + md->sg_copybreak; + md->sg_copybreak += copy; + + if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) + rc = copy_from_iter_nocache(to, copy, from); + else + rc = copy_from_iter(to, copy, from); + + if (rc != copy) { + rc = -EFAULT; + goto out; + } + + bytes -= copy; + if (!bytes) + break; + + md->sg_copybreak = 0; + if (++i == MAX_SKB_FRAGS) + i = 0; + } while (i != md->sg_end); +out: + md->sg_curr = i; + return rc; +} + +static int bpf_tcp_push(struct sock *sk, int apply_bytes, + struct sk_msg_buff *md, + int flags, bool uncharge) +{ + bool apply = apply_bytes; + struct scatterlist *sg; + int offset, ret = 0; + struct page *p; + size_t size; + + while (1) { + sg = md->sg_data + md->sg_start; + size = (apply && apply_bytes < sg->length) ? 
+ apply_bytes : sg->length; + offset = sg->offset; + + tcp_rate_check_app_limited(sk); + p = sg_page(sg); +retry: + ret = do_tcp_sendpages(sk, p, offset, size, flags); + if (ret != size) { + if (ret > 0) { + if (apply) + apply_bytes -= ret; + size -= ret; + offset += ret; + if (uncharge) + sk_mem_uncharge(sk, ret); + goto retry; + } + + sg->length = size; + sg->offset = offset; + return ret; + } + + if (apply) + apply_bytes -= ret; + sg->offset += ret; + sg->length -= ret; + if (uncharge) + sk_mem_uncharge(sk, ret); + + if (!sg->length) { + put_page(p); + md->sg_start++; + if (md->sg_start == MAX_SKB_FRAGS) + md->sg_start = 0; + sg_init_table(sg, 1); + + if (md->sg_start == md->sg_end) + break; + } + + if (apply && !apply_bytes) + break; + } + return 0; +} + +static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data + md->sg_start; + + if (md->sg_copy[md->sg_start]) { + md->data = md->data_end = 0; + } else { + md->data = sg_virt(sg); + md->data_end = md->data + sg->length; + } +} + +static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = md->sg_start; + + do { + int uncharge = (bytes < sg[i].length) ? 
bytes : sg[i].length; + + sk_mem_uncharge(sk, uncharge); + bytes -= uncharge; + if (!bytes) + break; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } while (i != md->sg_end); +} + +static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = md->sg_start, free; + + while (bytes && sg[i].length) { + free = sg[i].length; + if (bytes < free) { + sg[i].length -= bytes; + sg[i].offset += bytes; + sk_mem_uncharge(sk, bytes); + break; + } + + sk_mem_uncharge(sk, sg[i].length); + put_page(sg_page(&sg[i])); + bytes -= sg[i].length; + sg[i].length = 0; + sg[i].page_link = 0; + sg[i].offset = 0; + i++; + + if (i == MAX_SKB_FRAGS) + i = 0; + } +} + +static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = start, free = 0; + + while (sg[i].length) { + free += sg[i].length; + sk_mem_uncharge(sk, sg[i].length); + put_page(sg_page(&sg[i])); + sg[i].length = 0; + sg[i].page_link = 0; + sg[i].offset = 0; + i++; + + if (i == MAX_SKB_FRAGS) + i = 0; + } + + return free; +} + +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md) +{ + int free = free_sg(sk, md->sg_start, md); + + md->sg_start = md->sg_end; + return free; +} + +static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) +{ + return free_sg(sk, md->sg_curr, md); +} + +static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) +{ + return ((_rc == SK_PASS) ? + (md->map ? 
__SK_REDIRECT : __SK_PASS) : + __SK_DROP); +} + +static unsigned int smap_do_tx_msg(struct sock *sk, + struct smap_psock *psock, + struct sk_msg_buff *md) +{ + struct bpf_prog *prog; + unsigned int rc, _rc; + + preempt_disable(); + rcu_read_lock(); + + /* If the policy was removed mid-send then default to 'accept' */ + prog = READ_ONCE(psock->bpf_tx_msg); + if (unlikely(!prog)) { + _rc = SK_PASS; + goto verdict; + } + + bpf_compute_data_pointers_sg(md); + rc = (*prog->bpf_func)(md, prog->insnsi); + psock->apply_bytes = md->apply_bytes; + + /* Moving return codes from UAPI namespace into internal namespace */ + _rc = bpf_map_msg_verdict(rc, md); + + /* The psock has a refcount on the sock but not on the map and because + * we need to drop rcu read lock here its possible the map could be + * removed between here and when we need it to execute the sock + * redirect. So do the map lookup now for future use. + */ + if (_rc == __SK_REDIRECT) { + if (psock->sk_redir) + sock_put(psock->sk_redir); + psock->sk_redir = do_msg_redirect_map(md); + if (!psock->sk_redir) { + _rc = __SK_DROP; + goto verdict; + } + sock_hold(psock->sk_redir); + } +verdict: + rcu_read_unlock(); + preempt_enable(); + + return _rc; +} + +static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, + struct smap_psock *psock, + struct sk_msg_buff *md, int flags) +{ + bool apply = apply_bytes; + size_t size, copied = 0; + struct sk_msg_buff *r; + int err = 0, i; + + r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_KERNEL); + if (unlikely(!r)) + return -ENOMEM; + + lock_sock(sk); + r->sg_start = md->sg_start; + i = md->sg_start; + + do { + r->sg_data[i] = md->sg_data[i]; + + size = (apply && apply_bytes < md->sg_data[i].length) ? 
+ apply_bytes : md->sg_data[i].length; + + if (!sk_wmem_schedule(sk, size)) { + if (!copied) + err = -ENOMEM; + break; + } + + sk_mem_charge(sk, size); + r->sg_data[i].length = size; + md->sg_data[i].length -= size; + md->sg_data[i].offset += size; + copied += size; + + if (md->sg_data[i].length) { + get_page(sg_page(&r->sg_data[i])); + r->sg_end = (i + 1) == MAX_SKB_FRAGS ? 0 : i + 1; + } else { + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + r->sg_end = i; + } + + if (apply) { + apply_bytes -= size; + if (!apply_bytes) + break; + } + } while (i != md->sg_end); + + md->sg_start = i; + + if (!err) { + list_add_tail(&r->list, &psock->ingress); + sk->sk_data_ready(sk); + } else { + free_start_sg(sk, r); + kfree(r); + } + + release_sock(sk); + return err; +} + +static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, + struct sk_msg_buff *md, + int flags) +{ + struct smap_psock *psock; + struct scatterlist *sg; + int i, err, free = 0; + bool ingress = !!(md->flags & BPF_F_INGRESS); + + sg = md->sg_data; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out_rcu; + + if (!refcount_inc_not_zero(&psock->refcnt)) + goto out_rcu; + + rcu_read_unlock(); + + if (ingress) { + err = bpf_tcp_ingress(sk, send, psock, md, flags); + } else { + lock_sock(sk); + err = bpf_tcp_push(sk, send, md, flags, false); + release_sock(sk); + } + smap_release_sock(psock, sk); + if (unlikely(err)) + goto out; + return 0; +out_rcu: + rcu_read_unlock(); +out: + i = md->sg_start; + while (sg[i].length) { + free += sg[i].length; + put_page(sg_page(&sg[i])); + sg[i].length = 0; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } + return free; +} + +static inline void bpf_md_init(struct smap_psock *psock) +{ + if (!psock->apply_bytes) { + psock->eval = __SK_NONE; + if (psock->sk_redir) { + sock_put(psock->sk_redir); + psock->sk_redir = NULL; + } + } +} + +static void apply_bytes_dec(struct smap_psock *psock, int i) +{ + if (psock->apply_bytes) { + if (psock->apply_bytes < 
i) + psock->apply_bytes = 0; + else + psock->apply_bytes -= i; + } +} + +static int bpf_exec_tx_verdict(struct smap_psock *psock, + struct sk_msg_buff *m, + struct sock *sk, + int *copied, int flags) +{ + bool cork = false, enospc = (m->sg_start == m->sg_end); + struct sock *redir; + int err = 0; + int send; + +more_data: + if (psock->eval == __SK_NONE) + psock->eval = smap_do_tx_msg(sk, psock, m); + + if (m->cork_bytes && + m->cork_bytes > psock->sg_size && !enospc) { + psock->cork_bytes = m->cork_bytes - psock->sg_size; + if (!psock->cork) { + psock->cork = kcalloc(1, + sizeof(struct sk_msg_buff), + GFP_ATOMIC | __GFP_NOWARN); + + if (!psock->cork) { + err = -ENOMEM; + goto out_err; + } + } + memcpy(psock->cork, m, sizeof(*m)); + goto out_err; + } + + send = psock->sg_size; + if (psock->apply_bytes && psock->apply_bytes < send) + send = psock->apply_bytes; + + switch (psock->eval) { + case __SK_PASS: + err = bpf_tcp_push(sk, send, m, flags, true); + if (unlikely(err)) { + *copied -= free_start_sg(sk, m); + break; + } + + apply_bytes_dec(psock, send); + psock->sg_size -= send; + break; + case __SK_REDIRECT: + redir = psock->sk_redir; + apply_bytes_dec(psock, send); + + if (psock->cork) { + cork = true; + psock->cork = NULL; + } + + return_mem_sg(sk, send, m); + release_sock(sk); + + err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags); + lock_sock(sk); + + if (cork) { + free_start_sg(sk, m); + kfree(m); + m = NULL; + } + if (unlikely(err)) + *copied -= err; + else + psock->sg_size -= send; + break; + case __SK_DROP: + default: + free_bytes_sg(sk, send, m); + apply_bytes_dec(psock, send); + *copied -= send; + psock->sg_size -= send; + err = -EACCES; + break; + } + + if (likely(!err)) { + bpf_md_init(psock); + if (m && + m->sg_data[m->sg_start].page_link && + m->sg_data[m->sg_start].length) + goto more_data; + } + +out_err: + return err; +} + +static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len) +{ 
+ struct iov_iter *iter = &msg->msg_iter; + struct smap_psock *psock; + int copied = 0; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out; + + if (unlikely(!refcount_inc_not_zero(&psock->refcnt))) + goto out; + rcu_read_unlock(); + + if (!skb_queue_empty(&sk->sk_receive_queue)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + + lock_sock(sk); + while (copied != len) { + struct scatterlist *sg; + struct sk_msg_buff *md; + int i; + + md = list_first_entry_or_null(&psock->ingress, + struct sk_msg_buff, list); + if (unlikely(!md)) + break; + i = md->sg_start; + do { + struct page *page; + int n, copy; + + sg = &md->sg_data[i]; + copy = sg->length; + page = sg_page(sg); + + if (copied + copy > len) + copy = len - copied; + + n = copy_page_to_iter(page, sg->offset, copy, iter); + if (n != copy) { + md->sg_start = i; + release_sock(sk); + smap_release_sock(psock, sk); + return -EFAULT; + } + + copied += copy; + sg->offset += copy; + sg->length -= copy; + sk_mem_uncharge(sk, copy); + + if (!sg->length) { + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + if (!md->skb) + put_page(page); + } + if (copied == len) + break; + } while (i != md->sg_end); + md->sg_start = i; + + if (!sg->length && md->sg_start == md->sg_end) { + list_del(&md->list); + if (md->skb) + consume_skb(md->skb); + kfree(md); + } + } + + release_sock(sk); + smap_release_sock(psock, sk); + return copied; +out: + rcu_read_unlock(); + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); +} + + +static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; + struct sk_msg_buff md = {0}; + unsigned int sg_copy = 0; + struct smap_psock *psock; + int copied = 0, err = 0; + struct scatterlist *sg; + long timeo; + + /* Its possible a sock event or user removed the psock _but_ the ops + * have not been 
reprogrammed yet so we get here. In this case fall back + * to tcp_sendmsg. Note this only works because we _only_ ever allow + * a single ULP; there is no hierarchy here. + */ + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + return tcp_sendmsg(sk, msg, size); + } + + /* Increment the psock refcnt to ensure it's not released while sending a + * message. Required because sk lookup and bpf programs are used in + * separate rcu critical sections. It's OK if we lose the map entry + * but we can't lose the sock reference. + */ + if (!refcount_inc_not_zero(&psock->refcnt)) { + rcu_read_unlock(); + return tcp_sendmsg(sk, msg, size); + } + + sg = md.sg_data; + sg_init_marker(sg, MAX_SKB_FRAGS); + rcu_read_unlock(); + + lock_sock(sk); + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + + while (msg_data_left(msg)) { + struct sk_msg_buff *m; + bool enospc = false; + int copy; + + if (sk->sk_err) { + err = sk->sk_err; + goto out_err; + } + + copy = msg_data_left(msg); + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + + m = psock->cork_bytes ? psock->cork : &md; + m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end; + err = sk_alloc_sg(sk, copy, m->sg_data, + m->sg_start, &m->sg_end, &sg_copy, + m->sg_end - 1); + if (err) { + if (err != -ENOSPC) + goto wait_for_memory; + enospc = true; + copy = sg_copy; + } + + err = memcopy_from_iter(sk, m, &msg->msg_iter, copy); + if (err < 0) { + free_curr_sg(sk, m); + goto out_err; + } + + psock->sg_size += copy; + copied += copy; + sg_copy = 0; + + /* When bytes are being corked skip running BPF program and + * applying verdict unless there is no more buffer space. In + * the ENOSPC case simply run BPF program with currently + * accumulated data. We don't have much choice at this point + * we could try extending the page frags or chaining complex + * frags but even in these cases _eventually_ we will hit an + * OOM scenario. 
More complex recovery schemes may be + * implemented in the future, but BPF programs must handle + * the case where apply_cork requests are not honored. The + * canonical method to verify this is to check data length. + */ + if (psock->cork_bytes) { + if (copy > psock->cork_bytes) + psock->cork_bytes = 0; + else + psock->cork_bytes -= copy; + + if (psock->cork_bytes && !enospc) + goto out_cork; + + /* All cork bytes accounted for re-run filter */ + psock->eval = __SK_NONE; + psock->cork_bytes = 0; + } + + err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); + if (unlikely(err < 0)) + goto out_err; + continue; +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +wait_for_memory: + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_err; + } +out_err: + if (err < 0) + err = sk_stream_error(sk, msg->msg_flags, err); +out_cork: + release_sock(sk); + smap_release_sock(psock, sk); + return copied ? copied : err; +} + +static int bpf_tcp_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + struct sk_msg_buff md = {0}, *m = NULL; + int err = 0, copied = 0; + struct smap_psock *psock; + struct scatterlist *sg; + bool enospc = false; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto accept; + + if (!refcount_inc_not_zero(&psock->refcnt)) + goto accept; + rcu_read_unlock(); + + lock_sock(sk); + + if (psock->cork_bytes) { + m = psock->cork; + sg = &m->sg_data[m->sg_end]; + } else { + m = &md; + sg = m->sg_data; + sg_init_marker(sg, MAX_SKB_FRAGS); + } + + /* Catch case where ring is full and sendpage is stalled. 
*/ + if (unlikely(m->sg_end == m->sg_start && + m->sg_data[m->sg_end].length)) + goto out_err; + + psock->sg_size += size; + sg_set_page(sg, page, size, offset); + get_page(page); + m->sg_copy[m->sg_end] = true; + sk_mem_charge(sk, size); + m->sg_end++; + copied = size; + + if (m->sg_end == MAX_SKB_FRAGS) + m->sg_end = 0; + + if (m->sg_end == m->sg_start) + enospc = true; + + if (psock->cork_bytes) { + if (size > psock->cork_bytes) + psock->cork_bytes = 0; + else + psock->cork_bytes -= size; + + if (psock->cork_bytes && !enospc) + goto out_err; + + /* All cork bytes accounted for re-run filter */ + psock->eval = __SK_NONE; + psock->cork_bytes = 0; + } + + err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); +out_err: + release_sock(sk); + smap_release_sock(psock, sk); + return copied ? copied : err; +accept: + rcu_read_unlock(); + return tcp_sendpage(sk, page, offset, size, flags); +} + +static void bpf_tcp_msg_add(struct smap_psock *psock, + struct sock *sk, + struct bpf_prog *tx_msg) +{ + struct bpf_prog *orig_tx_msg; + + orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg); + if (orig_tx_msg) + bpf_prog_put(orig_tx_msg); +} + static int bpf_tcp_ulp_register(void) { tcp_bpf_proto = tcp_prot; tcp_bpf_proto.close = bpf_tcp_close; + /* Once BPF TX ULP is registered it is never unregistered. It + * will be in the ULP list for the lifetime of the system. Doing + * duplicate registers is not a problem. 
+ */ return tcp_register_ulp(&bpf_tcp_ulp_ops); } @@ -220,27 +1051,72 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) __SK_DROP; } +static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb) +{ + struct sock *sk = psock->sock; + int copied = 0, num_sg; + struct sk_msg_buff *r; + + r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC); + if (unlikely(!r)) + return -EAGAIN; + + if (!sk_rmem_schedule(sk, skb, skb->len)) { + kfree(r); + return -EAGAIN; + } + + sg_init_table(r->sg_data, MAX_SKB_FRAGS); + num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len); + if (unlikely(num_sg < 0)) { + kfree(r); + return num_sg; + } + sk_mem_charge(sk, skb->len); + copied = skb->len; + r->sg_start = 0; + r->sg_end = num_sg == MAX_SKB_FRAGS ? 0 : num_sg; + r->skb = skb; + list_add_tail(&r->list, &psock->ingress); + sk->sk_data_ready(sk); + return copied; +} + static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) { + struct smap_psock *peer; struct sock *sk; + __u32 in; int rc; rc = smap_verdict_func(psock, skb); switch (rc) { case __SK_REDIRECT: sk = do_sk_redirect_map(skb); - if (likely(sk)) { - struct smap_psock *peer = smap_psock_sk(sk); - - if (likely(peer && - test_bit(SMAP_TX_RUNNING, &peer->state) && - !sock_flag(sk, SOCK_DEAD) && - sock_writeable(sk))) { - skb_set_owner_w(skb, sk); - skb_queue_tail(&peer->rxqueue, skb); - schedule_work(&peer->tx_work); - break; - } + if (!sk) { + kfree_skb(skb); + break; + } + + peer = smap_psock_sk(sk); + in = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; + + if (unlikely(!peer || sock_flag(sk, SOCK_DEAD) || + !test_bit(SMAP_TX_RUNNING, &peer->state))) { + kfree_skb(skb); + break; + } + + if (!in && sock_writeable(sk)) { + skb_set_owner_w(skb, sk); + skb_queue_tail(&peer->rxqueue, skb); + schedule_work(&peer->tx_work); + break; + } else if (in && + atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) { + skb_queue_tail(&peer->rxqueue, skb); + 
schedule_work(&peer->tx_work); + break; } /* Fall through and free skb otherwise */ case __SK_DROP: @@ -302,15 +1178,23 @@ static void smap_tx_work(struct work_struct *w) } while ((skb = skb_dequeue(&psock->rxqueue))) { + __u32 flags; + rem = skb->len; off = 0; start: + flags = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; do { - if (likely(psock->sock->sk_socket)) - n = skb_send_sock_locked(psock->sock, - skb, off, rem); - else + if (likely(psock->sock->sk_socket)) { + if (flags) + n = smap_do_ingress(psock, skb); + else + n = skb_send_sock_locked(psock->sock, + skb, off, rem); + } else { n = -EINVAL; + } + if (n <= 0) { if (n == -EAGAIN) { /* Retry when space is available */ @@ -328,7 +1212,9 @@ start: rem -= n; off += n; } while (rem); - kfree_skb(skb); + + if (!flags) + kfree_skb(skb); } out: release_sock(psock->sock); @@ -373,15 +1259,13 @@ static void smap_destroy_psock(struct rcu_head *rcu) static void smap_release_sock(struct smap_psock *psock, struct sock *sock) { - psock->refcnt--; - if (psock->refcnt) - return; - - tcp_cleanup_ulp(sock); - smap_stop_sock(psock, sock); - clear_bit(SMAP_TX_RUNNING, &psock->state); - rcu_assign_sk_user_data(sock, NULL); - call_rcu_sched(&psock->rcu, smap_destroy_psock); + if (refcount_dec_and_test(&psock->refcnt)) { + tcp_cleanup_ulp(sock); + smap_stop_sock(psock, sock); + clear_bit(SMAP_TX_RUNNING, &psock->state); + rcu_assign_sk_user_data(sock, NULL); + call_rcu_sched(&psock->rcu, smap_destroy_psock); + } } static int smap_parse_func_strparser(struct strparser *strp, @@ -415,7 +1299,6 @@ static int smap_parse_func_strparser(struct strparser *strp, return rc; } - static int smap_read_sock_done(struct strparser *strp, int err) { return err; @@ -469,6 +1352,7 @@ static void sock_map_remove_complete(struct bpf_stab *stab) static void smap_gc_work(struct work_struct *w) { struct smap_psock_map_entry *e, *tmp; + struct sk_msg_buff *md, *mtmp; struct smap_psock *psock; psock = container_of(w, struct smap_psock, gc_work); @@ 
-485,12 +1369,28 @@ static void smap_gc_work(struct work_struct *w) bpf_prog_put(psock->bpf_parse); if (psock->bpf_verdict) bpf_prog_put(psock->bpf_verdict); + if (psock->bpf_tx_msg) + bpf_prog_put(psock->bpf_tx_msg); + + if (psock->cork) { + free_start_sg(psock->sock, psock->cork); + kfree(psock->cork); + } + + list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { + list_del(&md->list); + free_start_sg(psock->sock, md); + kfree(md); + } list_for_each_entry_safe(e, tmp, &psock->maps, list) { list_del(&e->list); kfree(e); } + if (psock->sk_redir) + sock_put(psock->sk_redir); + sock_put(psock->sock); kfree(psock); } @@ -506,12 +1406,14 @@ static struct smap_psock *smap_init_psock(struct sock *sock, if (!psock) return ERR_PTR(-ENOMEM); + psock->eval = __SK_NONE; psock->sock = sock; skb_queue_head_init(&psock->rxqueue); INIT_WORK(&psock->tx_work, smap_tx_work); INIT_WORK(&psock->gc_work, smap_gc_work); INIT_LIST_HEAD(&psock->maps); - psock->refcnt = 1; + INIT_LIST_HEAD(&psock->ingress); + refcount_set(&psock->refcnt, 1); rcu_assign_sk_user_data(sock, psock); sock_hold(sock); @@ -714,10 +1616,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct smap_psock_map_entry *e = NULL; - struct bpf_prog *verdict, *parse; + struct bpf_prog *verdict, *parse, *tx_msg; struct sock *osock, *sock; struct smap_psock *psock; u32 i = *(u32 *)key; + bool new = false; int err; if (unlikely(flags > BPF_EXIST)) @@ -740,6 +1643,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, */ verdict = READ_ONCE(stab->bpf_verdict); parse = READ_ONCE(stab->bpf_parse); + tx_msg = READ_ONCE(stab->bpf_tx_msg); if (parse && verdict) { /* bpf prog refcnt may be zero if a concurrent attach operation @@ -758,6 +1662,17 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, } } + if (tx_msg) { + tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg); + if (IS_ERR(tx_msg)) { + if 
(verdict) + bpf_prog_put(verdict); + if (parse) + bpf_prog_put(parse); + return PTR_ERR(tx_msg); + } + } + write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); @@ -772,7 +1687,14 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, err = -EBUSY; goto out_progs; } - psock->refcnt++; + if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) { + err = -EBUSY; + goto out_progs; + } + if (!refcount_inc_not_zero(&psock->refcnt)) { + err = -EAGAIN; + goto out_progs; + } } else { psock = smap_init_psock(sock, stab); if (IS_ERR(psock)) { @@ -780,11 +1702,8 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, goto out_progs; } - err = tcp_set_ulp_id(sock, TCP_ULP_BPF); - if (err) - goto out_progs; - set_bit(SMAP_TX_RUNNING, &psock->state); + new = true; } e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); @@ -797,6 +1716,14 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, /* 3. At this point we have a reference to a valid psock that is * running. Attach any BPF programs needed. 
*/ + if (tx_msg) + bpf_tcp_msg_add(psock, sock, tx_msg); + if (new) { + err = tcp_set_ulp_id(sock, TCP_ULP_BPF); + if (err) + goto out_free; + } + if (parse && verdict && !psock->strp_enabled) { err = smap_init_sock(psock, sock); if (err) @@ -818,8 +1745,6 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, struct smap_psock *opsock = smap_psock_sk(osock); write_lock_bh(&osock->sk_callback_lock); - if (osock != sock && parse) - smap_stop_sock(opsock, osock); smap_list_remove(opsock, &stab->sock_map[i]); smap_release_sock(opsock, osock); write_unlock_bh(&osock->sk_callback_lock); @@ -832,6 +1757,8 @@ out_progs: bpf_prog_put(verdict); if (parse) bpf_prog_put(parse); + if (tx_msg) + bpf_prog_put(tx_msg); write_unlock_bh(&sock->sk_callback_lock); kfree(e); return err; @@ -846,6 +1773,9 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) return -EINVAL; switch (type) { + case BPF_SK_MSG_VERDICT: + orig = xchg(&stab->bpf_tx_msg, prog); + break; case BPF_SK_SKB_STREAM_PARSER: orig = xchg(&stab->bpf_parse, prog); break; @@ -907,6 +1837,10 @@ static void sock_map_release(struct bpf_map *map, struct file *map_file) orig = xchg(&stab->bpf_verdict, NULL); if (orig) bpf_prog_put(orig); + + orig = xchg(&stab->bpf_tx_msg, NULL); + if (orig) + bpf_prog_put(orig); } const struct bpf_map_ops sock_map_ops = { diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index b0ecf43f5894..57eeb1234b67 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -9,16 +9,19 @@ #include <linux/filter.h> #include <linux/stacktrace.h> #include <linux/perf_event.h> +#include <linux/elf.h> +#include <linux/pagemap.h> #include "percpu_freelist.h" -#define STACK_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +#define STACK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \ + BPF_F_STACK_BUILD_ID) struct stack_map_bucket { struct pcpu_freelist_node fnode; u32 hash; u32 nr; - u64 ip[]; + u64 data[]; }; 
struct bpf_stack_map { @@ -29,6 +32,17 @@ struct bpf_stack_map { struct stack_map_bucket *buckets[]; }; +static inline bool stack_map_use_build_id(struct bpf_map *map) +{ + return (map->map_flags & BPF_F_STACK_BUILD_ID); +} + +static inline int stack_map_data_size(struct bpf_map *map) +{ + return stack_map_use_build_id(map) ? + sizeof(struct bpf_stack_build_id) : sizeof(u64); +} + static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) { u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; @@ -68,8 +82,16 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - value_size < 8 || value_size % 8 || - value_size / 8 > sysctl_perf_event_max_stack) + value_size < 8 || value_size % 8) + return ERR_PTR(-EINVAL); + + BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64)); + if (attr->map_flags & BPF_F_STACK_BUILD_ID) { + if (value_size % sizeof(struct bpf_stack_build_id) || + value_size / sizeof(struct bpf_stack_build_id) + > sysctl_perf_event_max_stack) + return ERR_PTR(-EINVAL); + } else if (value_size / 8 > sysctl_perf_event_max_stack) return ERR_PTR(-EINVAL); /* hash table size must be power of 2 */ @@ -114,13 +136,184 @@ free_smap: return ERR_PTR(err); } +#define BPF_BUILD_ID 3 +/* + * Parse build id from the note segment. This logic can be shared between + * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are + * identical. 
+ */ +static inline int stack_map_parse_build_id(void *page_addr, + unsigned char *build_id, + void *note_start, + Elf32_Word note_size) +{ + Elf32_Word note_offs = 0, new_offs; + + /* check for overflow */ + if (note_start < page_addr || note_start + note_size < note_start) + return -EINVAL; + + /* only supports note that fits in the first page */ + if (note_start + note_size > page_addr + PAGE_SIZE) + return -EINVAL; + + while (note_offs + sizeof(Elf32_Nhdr) < note_size) { + Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs); + + if (nhdr->n_type == BPF_BUILD_ID && + nhdr->n_namesz == sizeof("GNU") && + nhdr->n_descsz == BPF_BUILD_ID_SIZE) { + memcpy(build_id, + note_start + note_offs + + ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), + BPF_BUILD_ID_SIZE); + return 0; + } + new_offs = note_offs + sizeof(Elf32_Nhdr) + + ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4); + if (new_offs <= note_offs) /* overflow */ + break; + note_offs = new_offs; + } + return -EINVAL; +} + +/* Parse build ID from 32-bit ELF */ +static int stack_map_get_build_id_32(void *page_addr, + unsigned char *build_id) +{ + Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; + Elf32_Phdr *phdr; + int i; + + /* only supports phdr that fits in one page */ + if (ehdr->e_phnum > + (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) + return -EINVAL; + + phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); + + for (i = 0; i < ehdr->e_phnum; ++i) + if (phdr[i].p_type == PT_NOTE) + return stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz); + return -EINVAL; +} + +/* Parse build ID from 64-bit ELF */ +static int stack_map_get_build_id_64(void *page_addr, + unsigned char *build_id) +{ + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; + Elf64_Phdr *phdr; + int i; + + /* only supports phdr that fits in one page */ + if (ehdr->e_phnum > + (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) + return -EINVAL; + + phdr = (Elf64_Phdr *)(page_addr + 
sizeof(Elf64_Ehdr)); + + for (i = 0; i < ehdr->e_phnum; ++i) + if (phdr[i].p_type == PT_NOTE) + return stack_map_parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz); + return -EINVAL; +} + +/* Parse build ID of ELF file mapped to vma */ +static int stack_map_get_build_id(struct vm_area_struct *vma, + unsigned char *build_id) +{ + Elf32_Ehdr *ehdr; + struct page *page; + void *page_addr; + int ret; + + /* only works for page backed storage */ + if (!vma->vm_file) + return -EINVAL; + + page = find_get_page(vma->vm_file->f_mapping, 0); + if (!page) + return -EFAULT; /* page not mapped */ + + ret = -EINVAL; + page_addr = page_address(page); + ehdr = (Elf32_Ehdr *)page_addr; + + /* compare magic x7f "ELF" */ + if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) + goto out; + + /* only support executable file and shared object file */ + if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) + goto out; + + if (ehdr->e_ident[EI_CLASS] == ELFCLASS32) + ret = stack_map_get_build_id_32(page_addr, build_id); + else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) + ret = stack_map_get_build_id_64(page_addr, build_id); +out: + put_page(page); + return ret; +} + +static void stack_map_get_build_id_offset(struct bpf_map *map, + struct stack_map_bucket *bucket, + u64 *ips, u32 trace_nr, bool user) +{ + int i; + struct vm_area_struct *vma; + struct bpf_stack_build_id *id_offs; + + bucket->nr = trace_nr; + id_offs = (struct bpf_stack_build_id *)bucket->data; + + /* + * We cannot do up_read() in nmi context, so build_id lookup is + * only supported for non-nmi events. If at some point, it is + * possible to run find_vma() without taking the semaphore, we + * would like to allow build_id lookup in nmi context. + * + * Same fallback is used for kernel stack (!user) on a stackmap + * with build_id. 
+ */ + if (!user || !current || !current->mm || in_nmi() || + down_read_trylock(&current->mm->mmap_sem) == 0) { + /* cannot access current->mm, fall back to ips */ + for (i = 0; i < trace_nr; i++) { + id_offs[i].status = BPF_STACK_BUILD_ID_IP; + id_offs[i].ip = ips[i]; + } + return; + } + + for (i = 0; i < trace_nr; i++) { + vma = find_vma(current->mm, ips[i]); + if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) { + /* per entry fall back to ips */ + id_offs[i].status = BPF_STACK_BUILD_ID_IP; + id_offs[i].ip = ips[i]; + continue; + } + id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] + - vma->vm_start; + id_offs[i].status = BPF_STACK_BUILD_ID_VALID; + } + up_read(&current->mm->mmap_sem); +} + BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; - u32 max_depth = map->value_size / 8; + u32 max_depth = map->value_size / stack_map_data_size(map); /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ u32 init_nr = sysctl_perf_event_max_stack - max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; @@ -128,6 +321,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, bool user = flags & BPF_F_USER_STACK; bool kernel = !user; u64 *ips; + bool hash_matches; if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) @@ -156,24 +350,43 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, id = hash & (smap->n_buckets - 1); bucket = READ_ONCE(smap->buckets[id]); - if (bucket && bucket->hash == hash) { - if (flags & BPF_F_FAST_STACK_CMP) + hash_matches = bucket && bucket->hash == hash; + /* fast cmp */ + if (hash_matches && flags & BPF_F_FAST_STACK_CMP) + return id; + + if (stack_map_use_build_id(map)) { + /* for build_id+offset, pop a bucket 
before slow cmp */ + new_bucket = (struct stack_map_bucket *) + pcpu_freelist_pop(&smap->freelist); + if (unlikely(!new_bucket)) + return -ENOMEM; + stack_map_get_build_id_offset(map, new_bucket, ips, + trace_nr, user); + trace_len = trace_nr * sizeof(struct bpf_stack_build_id); + if (hash_matches && bucket->nr == trace_nr && + memcmp(bucket->data, new_bucket->data, trace_len) == 0) { + pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); return id; - if (bucket->nr == trace_nr && - memcmp(bucket->ip, ips, trace_len) == 0) + } + if (bucket && !(flags & BPF_F_REUSE_STACKID)) { + pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); + return -EEXIST; + } + } else { + if (hash_matches && bucket->nr == trace_nr && + memcmp(bucket->data, ips, trace_len) == 0) return id; + if (bucket && !(flags & BPF_F_REUSE_STACKID)) + return -EEXIST; + + new_bucket = (struct stack_map_bucket *) + pcpu_freelist_pop(&smap->freelist); + if (unlikely(!new_bucket)) + return -ENOMEM; + memcpy(new_bucket->data, ips, trace_len); } - /* this call stack is not in the map, try to add it */ - if (bucket && !(flags & BPF_F_REUSE_STACKID)) - return -EEXIST; - - new_bucket = (struct stack_map_bucket *) - pcpu_freelist_pop(&smap->freelist); - if (unlikely(!new_bucket)) - return -ENOMEM; - - memcpy(new_bucket->ip, ips, trace_len); new_bucket->hash = hash; new_bucket->nr = trace_nr; @@ -212,8 +425,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) if (!bucket) return -ENOENT; - trace_len = bucket->nr * sizeof(u64); - memcpy(value, bucket->ip, trace_len); + trace_len = bucket->nr * stack_map_data_size(map); + memcpy(value, bucket->data, trace_len); memset(value + trace_len, 0, map->value_size - trace_len); old_bucket = xchg(&smap->buckets[id], bucket); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 43f95d190eea..0244973ee544 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -203,11 +203,13 @@ static int bpf_map_alloc_id(struct bpf_map *map) { int 
id; + idr_preload(GFP_KERNEL); spin_lock_bh(&map_idr_lock); id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC); if (id > 0) map->id = id; spin_unlock_bh(&map_idr_lock); + idr_preload_end(); if (WARN_ON_ONCE(!id)) return -ENOSPC; @@ -940,11 +942,13 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog) { int id; + idr_preload(GFP_KERNEL); spin_lock_bh(&prog_idr_lock); id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC); if (id > 0) prog->aux->id = id; spin_unlock_bh(&prog_idr_lock); + idr_preload_end(); /* id is in [1, INT_MAX) */ if (WARN_ON_ONCE(!id)) @@ -1167,8 +1171,75 @@ struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, } EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); +/* Initially all BPF programs could be loaded w/o specifying + * expected_attach_type. Later for some of them specifying expected_attach_type + * at load time became required so that program could be validated properly. + * Programs of types that are allowed to be loaded both w/ and w/o (for + * backward compatibility) expected_attach_type, should have the default attach + * type assigned to expected_attach_type for the latter case, so that it can be + * validated later at attach time. + * + * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if + * prog type requires it but has some attach types that have to be backward + * compatible. + */ +static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) +{ + switch (attr->prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK: + /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't + * exist so checking for non-zero is the way to go here. 
+ */ + if (!attr->expected_attach_type) + attr->expected_attach_type = + BPF_CGROUP_INET_SOCK_CREATE; + break; + } +} + +static int +bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, + enum bpf_attach_type expected_attach_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK: + switch (expected_attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: + return 0; + default: + return -EINVAL; + } + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + switch (expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return 0; + default: + return -EINVAL; + } + default: + return 0; + } +} + +static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, + enum bpf_attach_type attach_type) +{ + switch (prog->type) { + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + return attach_type == prog->expected_attach_type ? 
0 : -EINVAL; + default: + return 0; + } +} + /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_ifindex +#define BPF_PROG_LOAD_LAST_FIELD expected_attach_type static int bpf_prog_load(union bpf_attr *attr) { @@ -1205,11 +1276,17 @@ static int bpf_prog_load(union bpf_attr *attr) !capable(CAP_SYS_ADMIN)) return -EPERM; + bpf_prog_load_fixup_attach_type(attr); + if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type)) + return -EINVAL; + /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); if (!prog) return -ENOMEM; + prog->expected_attach_type = attr->expected_attach_type; + prog->aux->offload_requested = !!attr->prog_ifindex; err = security_bpf_prog_alloc(prog->aux); @@ -1311,11 +1388,87 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } +struct bpf_raw_tracepoint { + struct bpf_raw_event_map *btp; + struct bpf_prog *prog; +}; + +static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp) +{ + struct bpf_raw_tracepoint *raw_tp = filp->private_data; + + if (raw_tp->prog) { + bpf_probe_unregister(raw_tp->btp, raw_tp->prog); + bpf_prog_put(raw_tp->prog); + } + kfree(raw_tp); + return 0; +} + +static const struct file_operations bpf_raw_tp_fops = { + .release = bpf_raw_tracepoint_release, + .read = bpf_dummy_read, + .write = bpf_dummy_write, +}; + +#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd + +static int bpf_raw_tracepoint_open(const union bpf_attr *attr) +{ + struct bpf_raw_tracepoint *raw_tp; + struct bpf_raw_event_map *btp; + struct bpf_prog *prog; + char tp_name[128]; + int tp_fd, err; + + if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name), + sizeof(tp_name) - 1) < 0) + return -EFAULT; + tp_name[sizeof(tp_name) - 1] = 0; + + btp = bpf_find_raw_tracepoint(tp_name); + if (!btp) + return -ENOENT; + + raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER); + if (!raw_tp) + return -ENOMEM; + 
raw_tp->btp = btp; + + prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, + BPF_PROG_TYPE_RAW_TRACEPOINT); + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto out_free_tp; + } + + err = bpf_probe_register(raw_tp->btp, prog); + if (err) + goto out_put_prog; + + raw_tp->prog = prog; + tp_fd = anon_inode_getfd("bpf-raw-tracepoint", &bpf_raw_tp_fops, raw_tp, + O_CLOEXEC); + if (tp_fd < 0) { + bpf_probe_unregister(raw_tp->btp, prog); + err = tp_fd; + goto out_put_prog; + } + return tp_fd; + +out_put_prog: + bpf_prog_put(prog); +out_free_tp: + kfree(raw_tp); + return err; +} + #ifdef CONFIG_CGROUP_BPF #define BPF_PROG_ATTACH_LAST_FIELD attach_flags -static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) +static int sockmap_get_from_fd(const union bpf_attr *attr, + int type, bool attach) { struct bpf_prog *prog = NULL; int ufd = attr->target_fd; @@ -1329,8 +1482,7 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) return PTR_ERR(map); if (attach) { - prog = bpf_prog_get_type(attr->attach_bpf_fd, - BPF_PROG_TYPE_SK_SKB); + prog = bpf_prog_get_type(attr->attach_bpf_fd, type); if (IS_ERR(prog)) { fdput(f); return PTR_ERR(prog); @@ -1374,17 +1526,27 @@ static int bpf_prog_attach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; + break; case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; case BPF_CGROUP_DEVICE: ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; + case BPF_SK_MSG_VERDICT: + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, true); + return sockmap_get_from_fd(attr, 
BPF_PROG_TYPE_SK_SKB, true); default: return -EINVAL; } @@ -1393,6 +1555,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (IS_ERR(prog)) return PTR_ERR(prog); + if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) { + bpf_prog_put(prog); + return -EINVAL; + } + cgrp = cgroup_get_from_fd(attr->target_fd); if (IS_ERR(cgrp)) { bpf_prog_put(prog); @@ -1429,17 +1596,27 @@ static int bpf_prog_detach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; + break; case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; case BPF_CGROUP_DEVICE: ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; + case BPF_SK_MSG_VERDICT: + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, false); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false); default: return -EINVAL; } @@ -1478,6 +1655,12 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: break; @@ -1917,6 +2100,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET_INFO_BY_FD: err = bpf_obj_get_info_by_fd(&attr, uattr); break; + case BPF_RAW_TRACEPOINT_OPEN: + err = bpf_raw_tracepoint_open(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/verifier.c 
b/kernel/bpf/verifier.c index c6eff108aa99..5dd1dcb902bf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -168,23 +168,12 @@ struct bpf_call_arg_meta { static DEFINE_MUTEX(bpf_verifier_lock); -/* log_level controls verbosity level of eBPF verifier. - * bpf_verifier_log_write() is used to dump the verification trace to the log, - * so the user can figure out what's wrong with the program - */ -__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, - const char *fmt, ...) +void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, + va_list args) { - struct bpf_verifer_log *log = &env->log; unsigned int n; - va_list args; - if (!log->level || !log->ubuf || bpf_verifier_log_full(log)) - return; - - va_start(args, fmt); n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); - va_end(args); WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, "verifier log line truncated - local buffer too short\n"); @@ -197,14 +186,37 @@ __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, else log->ubuf = NULL; } -EXPORT_SYMBOL_GPL(bpf_verifier_log_write); -/* Historically bpf_verifier_log_write was called verbose, but the name was too - * generic for symbol export. The function was renamed, but not the calls in - * the verifier to avoid complicating backports. Hence the alias below. + +/* log_level controls verbosity level of eBPF verifier. + * bpf_verifier_log_write() is used to dump the verification trace to the log, + * so the user can figure out what's wrong with the program */ -static __printf(2, 3) void verbose(struct bpf_verifier_env *env, - const char *fmt, ...) - __attribute__((alias("bpf_verifier_log_write"))); +__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, + const char *fmt, ...) 
+{ + va_list args; + + if (!bpf_verifier_log_needed(&env->log)) + return; + + va_start(args, fmt); + bpf_verifier_vlog(&env->log, fmt, args); + va_end(args); +} +EXPORT_SYMBOL_GPL(bpf_verifier_log_write); + +__printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) +{ + struct bpf_verifier_env *env = private_data; + va_list args; + + if (!bpf_verifier_log_needed(&env->log)) + return; + + va_start(args, fmt); + bpf_verifier_vlog(&env->log, fmt, args); + va_end(args); +} static bool type_is_pkt_pointer(enum bpf_reg_type type) { @@ -508,10 +520,6 @@ err: static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -#define CALLEE_SAVED_REGS 5 -static const int callee_saved[CALLEE_SAVED_REGS] = { - BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9 -}; static void __mark_reg_not_init(struct bpf_reg_state *reg); @@ -1252,6 +1260,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_XDP: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: if (meta) return meta->pkt_access; @@ -1314,7 +1323,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, }; if (env->ops->is_valid_access && - env->ops->is_valid_access(off, size, t, &info)) { + env->ops->is_valid_access(off, size, t, env->prog, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -2075,7 +2084,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_SOCKMAP: if (func_id != BPF_FUNC_sk_redirect_map && func_id != BPF_FUNC_sock_map_update && - func_id != BPF_FUNC_map_delete_elem) + func_id != BPF_FUNC_map_delete_elem && + func_id != BPF_FUNC_msg_redirect_map) goto error; break; default: @@ -2113,6 +2123,7 @@ static int check_map_func_compatibility(struct 
bpf_verifier_env *env, goto error; break; case BPF_FUNC_sk_redirect_map: + case BPF_FUNC_msg_redirect_map: if (map->map_type != BPF_MAP_TYPE_SOCKMAP) goto error; break; @@ -2338,7 +2349,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } if (env->ops->get_func_proto) - fn = env->ops->get_func_proto(func_id); + fn = env->ops->get_func_proto(func_id, env->prog); if (!fn) { verbose(env, "unknown func %s#%d\n", func_id_name(func_id), func_id); @@ -3876,6 +3887,7 @@ static int check_return_code(struct bpf_verifier_env *env) switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: break; @@ -4601,10 +4613,11 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level) { const struct bpf_insn_cbs cbs = { .cb_print = verbose, + .private_data = env, }; verbose(env, "%d: ", insn_idx); - print_bpf_insn(&cbs, env, insn, env->allow_ptr_leaks); + print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } if (bpf_prog_is_dev_bound(env->prog->aux)) { @@ -5560,7 +5573,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn = new_prog->insnsi + i + delta; } patch_call_imm: - fn = env->ops->get_func_proto(insn->imm); + fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions */ @@ -5602,7 +5615,7 @@ static void free_states(struct bpf_verifier_env *env) int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { struct bpf_verifier_env *env; - struct bpf_verifer_log *log; + struct bpf_verifier_log *log; int ret = -EINVAL; /* no program is valid */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4bfb2908ec15..a662bfcbea0e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4524,10 +4524,10 @@ static struct cftype cgroup_base_files[] = { * and thus involve 
punting to css->destroy_work adding two additional * steps to the already complex sequence. */ -static void css_free_work_fn(struct work_struct *work) +static void css_free_rwork_fn(struct work_struct *work) { - struct cgroup_subsys_state *css = - container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup_subsys_state *css = container_of(to_rcu_work(work), + struct cgroup_subsys_state, destroy_rwork); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; @@ -4573,15 +4573,6 @@ static void css_free_work_fn(struct work_struct *work) } } -static void css_free_rcu_fn(struct rcu_head *rcu_head) -{ - struct cgroup_subsys_state *css = - container_of(rcu_head, struct cgroup_subsys_state, rcu_head); - - INIT_WORK(&css->destroy_work, css_free_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); -} - static void css_release_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = @@ -4631,7 +4622,8 @@ static void css_release_work_fn(struct work_struct *work) mutex_unlock(&cgroup_mutex); - call_rcu(&css->rcu_head, css_free_rcu_fn); + INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); + queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); } static void css_release(struct percpu_ref *ref) @@ -4765,7 +4757,8 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, err_list_del: list_del_rcu(&css->sibling); err_free_css: - call_rcu(&css->rcu_head, css_free_rcu_fn); + INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); + queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); return ERR_PTR(err); } diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 6fc87ccda1d7..c6766f326072 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -132,3 +132,9 @@ config GENERIC_IRQ_DEBUGFS If you don't know what to do here, say N. endmenu + +config GENERIC_IRQ_MULTI_HANDLER + depends on !MULTI_IRQ_HANDLER + bool + help + Allow to specify the low level IRQ handler at run time. 
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 8c82ea26e837..16cbf6beb276 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/irq/autoprobe.c - * * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar * * This file contains the interrupt probing code and driver APIs. diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c69357a43849..a2b3d9de999c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1,13 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/irq/chip.c - * * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner, Russell King * - * This file contains the core interrupt handling code, for irq-chip - * based architectures. - * - * Detailed information is available in Documentation/core-api/genericirq.rst + * This file contains the core interrupt handling code, for irq-chip based + * architectures. Detailed information is available in + * Documentation/core-api/genericirq.rst */ #include <linux/irq.h> diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 9eb09aef0313..5b1072e394b2 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Generic cpu hotunplug interrupt migration code copied from the * arch/arm implementation diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index acfaaef8672a..4dadeb3d6666 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -1,8 +1,6 @@ -/* - * Copyright 2017 Thomas Gleixner <tglx@linutronix.de> - * - * This file is licensed under the GPL V2. 
- */ +// SPDX-License-Identifier: GPL-2.0 +// Copyright 2017 Thomas Gleixner <tglx@linutronix.de> + #include <linux/irqdomain.h> #include <linux/irq.h> #include <linux/uaccess.h> diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 194c506d9d20..6a682c229e10 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/module.h> #include <linux/interrupt.h> #include <linux/device.h> diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c index 326a67f2410b..0b0cdf206dc4 100644 --- a/kernel/irq/dummychip.c +++ b/kernel/irq/dummychip.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner, Russell King diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 508c03dfef25..e2999a070a99 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Library implementing the most common irq chip callback functions * diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 79f987b942b8..38554bc35375 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/irq/handle.c - * * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner, Russell King * - * This file contains the core interrupt handling code. - * - * Detailed information is available in Documentation/core-api/genericirq.rst + * This file contains the core interrupt handling code. 
Detailed + * information is available in Documentation/core-api/genericirq.rst * */ @@ -20,6 +18,10 @@ #include "internals.h" +#ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER +void (*handle_arch_irq)(struct pt_regs *) __ro_after_init; +#endif + /** * handle_bad_irq - handle spurious and unhandled irqs * @desc: description of the interrupt @@ -207,3 +209,14 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); return ret; } + +#ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER +int __init set_handle_irq(void (*handle_irq)(struct pt_regs *)) +{ + if (handle_arch_irq) + return -EBUSY; + + handle_arch_irq = handle_irq; + return 0; +} +#endif diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 259a22aa9934..8b778e37dc6d 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -1,6 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/irq/ipi.c - * * Copyright (C) 2015 Imagination Technologies Ltd * Author: Qais Yousef <qais.yousef@imgtec.com> * diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 24caabf1a0f7..fc4f361a86bb 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2017 Bartosz Golaszewski <brgl@bgdev.pl> * @@ -7,6 +8,7 @@ * option) any later version. */ +#include <linux/slab.h> #include <linux/irq_sim.h> #include <linux/irq.h> @@ -49,7 +51,8 @@ static void irq_sim_handle_irq(struct irq_work *work) * @sim: The interrupt simulator object to initialize. * @num_irqs: Number of interrupts to allocate * - * Returns 0 on success and a negative error number on failure. + * On success: return the base of the allocated interrupt range. + * On failure: a negative errno. 
*/ int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs) { @@ -78,7 +81,7 @@ int irq_sim_init(struct irq_sim *sim, unsigned int num_irqs) init_irq_work(&sim->work_ctx.work, irq_sim_handle_irq); sim->irq_count = num_irqs; - return 0; + return sim->irq_base; } EXPORT_SYMBOL_GPL(irq_sim_init); @@ -110,7 +113,8 @@ static void devm_irq_sim_release(struct device *dev, void *res) * @sim: The interrupt simulator object to initialize. * @num_irqs: Number of interrupts to allocate * - * Returns 0 on success and a negative error number on failure. + * On success: return the base of the allocated interrupt range. + * On failure: a negative errno. */ int devm_irq_sim_init(struct device *dev, struct irq_sim *sim, unsigned int num_irqs) @@ -123,7 +127,7 @@ int devm_irq_sim_init(struct device *dev, struct irq_sim *sim, return -ENOMEM; rv = irq_sim_init(sim, num_irqs); - if (rv) { + if (rv < 0) { devres_free(dr); return rv; } @@ -131,7 +135,7 @@ int devm_irq_sim_init(struct device *dev, struct irq_sim *sim, dr->sim = sim; devres_add(dev, dr); - return 0; + return rv; } EXPORT_SYMBOL_GPL(devm_irq_sim_init); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 49b54e9979cc..afc7f902d74a 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -1,10 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner, Russell King * - * This file contains the interrupt descriptor management code - * - * Detailed information is available in Documentation/core-api/genericirq.rst + * This file contains the interrupt descriptor management code. 
Detailed + * information is available in Documentation/core-api/genericirq.rst * */ #include <linux/irq.h> @@ -210,6 +210,22 @@ static ssize_t type_show(struct kobject *kobj, } IRQ_ATTR_RO(type); +static ssize_t wakeup_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); + ssize_t ret = 0; + + raw_spin_lock_irq(&desc->lock); + ret = sprintf(buf, "%s\n", + irqd_is_wakeup_set(&desc->irq_data) ? "enabled" : "disabled"); + raw_spin_unlock_irq(&desc->lock); + + return ret; + +} +IRQ_ATTR_RO(wakeup); + static ssize_t name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -253,6 +269,7 @@ static struct attribute *irq_attrs[] = { &chip_name_attr.attr, &hwirq_attr.attr, &type_attr.attr, + &wakeup_attr.attr, &name_attr.attr, &actions_attr.attr, NULL diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 82b8b18ee1eb..5d9fc01b60a6 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 + #define pr_fmt(fmt) "irq: " fmt #include <linux/acpi.h> diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0f922729bab9..e3336d904f64 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1,6 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/irq/manage.c - * * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006 Thomas Gleixner * @@ -855,10 +854,14 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) * This code is triggered unconditionally. Check the affinity * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. 
*/ - if (cpumask_available(desc->irq_common_data.affinity)) - cpumask_copy(mask, desc->irq_common_data.affinity); - else + if (cpumask_available(desc->irq_common_data.affinity)) { + const struct cpumask *m; + + m = irq_data_get_effective_affinity_mask(&desc->irq_data); + cpumask_copy(mask, m); + } else { valid = false; + } raw_spin_unlock_irq(&desc->lock); if (valid) @@ -1519,9 +1522,9 @@ EXPORT_SYMBOL_GPL(setup_irq); * Internal function to unregister an irqaction - used to free * regular and special interrupts that are part of the architecture. */ -static struct irqaction *__free_irq(unsigned int irq, void *dev_id) +static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) { - struct irq_desc *desc = irq_to_desc(irq); + unsigned irq = desc->irq_data.irq; struct irqaction *action, **action_ptr; unsigned long flags; @@ -1651,7 +1654,7 @@ void remove_irq(unsigned int irq, struct irqaction *act) struct irq_desc *desc = irq_to_desc(irq); if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc))) - __free_irq(irq, act->dev_id); + __free_irq(desc, act->dev_id); } EXPORT_SYMBOL_GPL(remove_irq); @@ -1685,7 +1688,7 @@ const void *free_irq(unsigned int irq, void *dev_id) desc->affinity_notify = NULL; #endif - action = __free_irq(irq, dev_id); + action = __free_irq(desc, dev_id); if (!action) return NULL; diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 4c5770407031..5092494bf261 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -1,8 +1,6 @@ -/* - * Copyright (C) 2017 Thomas Gleixner <tglx@linutronix.de> - * - * SPDX-License-Identifier: GPL-2.0 - */ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2017 Thomas Gleixner <tglx@linutronix.de> + #include <linux/spinlock.h> #include <linux/seq_file.h> #include <linux/bitmap.h> diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 2f3c4f5382cc..2a8571f72b17 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -1,6 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * 
linux/kernel/irq/msi.c - * * Copyright (C) 2014 Intel Corp. * Author: Jiang Liu <jiang.liu@linux.intel.com> * diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 6bd9b58429cc..d6961d3c6f9e 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -1,6 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/irq/pm.c - * * Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. * * This file contains power management functions related to interrupts. diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index e8f374971e37..7cb091d81d91 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/irq/proc.c - * * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar * * This file contains the /proc/irq/ handling code. diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 1d08f45135c2..95414ad3506a 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/irq/resend.c - * * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner * diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 6cdecc6f4c53..d867d6ddafdd 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/kernel/irq/spurious.c - * * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar * * This file contains spurious interrupt handling. diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index e0923fa4927a..1e4cb63a5c82 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -1,13 +1,6 @@ -/* - * linux/kernel/irq/timings.c - * - * Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - */ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org> + #include <linux/kernel.h> #include <linux/percpu.h> #include <linux/slab.h> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 93b57f026688..2a2ac53d8b8b 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -23,55 +23,39 @@ #include <linux/sched/signal.h> #include <linux/idr.h> -struct pid_cache { - int nr_ids; - char name[16]; - struct kmem_cache *cachep; - struct list_head list; -}; - -static LIST_HEAD(pid_caches_lh); static DEFINE_MUTEX(pid_caches_mutex); static struct kmem_cache *pid_ns_cachep; +/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ +#define MAX_PID_NS_LEVEL 32 +/* Write once array, filled from the beginning. */ +static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL]; /* * creates the kmem cache to allocate pids from. - * @nr_ids: the number of numerical ids this pid will have to carry + * @level: pid namespace level */ -static struct kmem_cache *create_pid_cachep(int nr_ids) +static struct kmem_cache *create_pid_cachep(unsigned int level) { - struct pid_cache *pcache; - struct kmem_cache *cachep; - + /* Level 0 is init_pid_ns.pid_cachep */ + struct kmem_cache **pkc = &pid_cache[level - 1]; + struct kmem_cache *kc; + char name[4 + 10 + 1]; + unsigned int len; + + kc = READ_ONCE(*pkc); + if (kc) + return kc; + + snprintf(name, sizeof(name), "pid_%u", level + 1); + len = sizeof(struct pid) + level * sizeof(struct upid); mutex_lock(&pid_caches_mutex); - list_for_each_entry(pcache, &pid_caches_lh, list) - if (pcache->nr_ids == nr_ids) - goto out; - - pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL); - if (pcache == NULL) - goto err_alloc; - - snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids); - cachep = kmem_cache_create(pcache->name, - sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (cachep == NULL) - goto 
err_cachep; - - pcache->nr_ids = nr_ids; - pcache->cachep = cachep; - list_add(&pcache->list, &pid_caches_lh); -out: + /* Name collision forces to do allocation under mutex. */ + if (!*pkc) + *pkc = kmem_cache_create(name, len, 0, SLAB_HWCACHE_ALIGN, 0); mutex_unlock(&pid_caches_mutex); - return pcache->cachep; - -err_cachep: - kfree(pcache); -err_alloc: - mutex_unlock(&pid_caches_mutex); - return NULL; + /* current can fail, but someone else can succeed. */ + return READ_ONCE(*pkc); } static void proc_cleanup_work(struct work_struct *work) @@ -80,9 +64,6 @@ static void proc_cleanup_work(struct work_struct *work) pid_ns_release_proc(ns); } -/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ -#define MAX_PID_NS_LEVEL 32 - static struct ucounts *inc_pid_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES); @@ -119,7 +100,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns idr_init(&ns->idr); - ns->pid_cachep = create_pid_cachep(level + 1); + ns->pid_cachep = create_pid_cachep(level); if (ns->pid_cachep == NULL) goto out_free_idr; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f274fbef821d..704e55129c3a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -42,7 +42,6 @@ #include <linux/rculist.h> #include <linux/poll.h> #include <linux/irq_work.h> -#include <linux/utsname.h> #include <linux/ctype.h> #include <linux/uio.h> #include <linux/sched/clock.h> @@ -2162,7 +2161,7 @@ void suspend_console(void) { if (!console_suspend_enabled) return; - printk("Suspending console(s) (use no_console_suspend to debug)\n"); + pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); console_lock(); console_suspended = 1; up_console_sem(); @@ -3257,60 +3256,4 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper) } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); -static char dump_stack_arch_desc_str[128]; - -/** - * dump_stack_set_arch_desc - set 
arch-specific str to show with task dumps - * @fmt: printf-style format string - * @...: arguments for the format string - * - * The configured string will be printed right after utsname during task - * dumps. Usually used to add arch-specific system identifiers. If an - * arch wants to make use of such an ID string, it should initialize this - * as soon as possible during boot. - */ -void __init dump_stack_set_arch_desc(const char *fmt, ...) -{ - va_list args; - - va_start(args, fmt); - vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str), - fmt, args); - va_end(args); -} - -/** - * dump_stack_print_info - print generic debug info for dump_stack() - * @log_lvl: log level - * - * Arch-specific dump_stack() implementations can use this function to - * print out the same debug information as the generic dump_stack(). - */ -void dump_stack_print_info(const char *log_lvl) -{ - printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n", - log_lvl, raw_smp_processor_id(), current->pid, current->comm, - print_tainted(), init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - - if (dump_stack_arch_desc_str[0] != '\0') - printk("%sHardware name: %s\n", - log_lvl, dump_stack_arch_desc_str); - - print_worker_info(log_lvl, current); -} - -/** - * show_regs_print_info - print generic debug info for show_regs() - * @log_lvl: log level - * - * show_regs() implementations can use this function to print out generic - * debug information. 
- */ -void show_regs_print_info(const char *log_lvl) -{ - dump_stack_print_info(log_lvl); -} - #endif diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 2975f195e1c4..1a3e9bddd17b 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -141,13 +141,15 @@ static void cpuidle_idle_call(void) } /* - * Tell the RCU framework we are entering an idle section, - * so no more rcu read side critical sections and one more + * The RCU framework needs to be told that we are entering an idle + * section, so no more rcu read side critical sections and one more * step to the grace period */ - rcu_idle_enter(); if (cpuidle_not_available(drv, dev)) { + tick_nohz_idle_stop_tick(); + rcu_idle_enter(); + default_idle_call(); goto exit_idle; } @@ -164,20 +166,37 @@ static void cpuidle_idle_call(void) if (idle_should_enter_s2idle() || dev->use_deepest_state) { if (idle_should_enter_s2idle()) { + rcu_idle_enter(); + entered_state = cpuidle_enter_s2idle(drv, dev); if (entered_state > 0) { local_irq_enable(); goto exit_idle; } + + rcu_idle_exit(); } + tick_nohz_idle_stop_tick(); + rcu_idle_enter(); + next_state = cpuidle_find_deepest_state(drv, dev); call_cpuidle(drv, dev, next_state); } else { + bool stop_tick = true; + /* * Ask the cpuidle framework to choose a convenient idle state. */ - next_state = cpuidle_select(drv, dev); + next_state = cpuidle_select(drv, dev, &stop_tick); + + if (stop_tick) + tick_nohz_idle_stop_tick(); + else + tick_nohz_idle_retain_tick(); + + rcu_idle_enter(); + entered_state = call_cpuidle(drv, dev, next_state); /* * Give the governor an opportunity to reflect on the outcome @@ -222,6 +241,7 @@ static void do_idle(void) rmb(); if (cpu_is_offline(cpu)) { + tick_nohz_idle_stop_tick_protected(); cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } @@ -235,10 +255,12 @@ static void do_idle(void) * broadcast device expired for us, we don't want to go deep * idle as we know that the IPI is going to arrive right away. 
*/ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) + if (cpu_idle_force_poll || tick_check_broadcast_expired()) { + tick_nohz_idle_restart_tick(); cpu_idle_poll(); - else + } else { cpuidle_idle_call(); + } arch_cpu_idle_exit(); } diff --git a/kernel/signal.c b/kernel/signal.c index f04466655238..47491aa3e790 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2844,10 +2844,6 @@ enum siginfo_layout siginfo_layout(int sig, int si_code) if ((sig == SIGFPE) && (si_code == FPE_FIXME)) layout = SIL_FAULT; #endif -#ifdef BUS_FIXME - if ((sig == SIGBUS) && (si_code == BUS_FIXME)) - layout = SIL_FAULT; -#endif } return layout; } diff --git a/kernel/softirq.c b/kernel/softirq.c index 24d243ef8e71..177de3640c78 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -460,40 +460,46 @@ struct tasklet_head { static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec); static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec); -void __tasklet_schedule(struct tasklet_struct *t) +static void __tasklet_schedule_common(struct tasklet_struct *t, + struct tasklet_head __percpu *headp, + unsigned int softirq_nr) { + struct tasklet_head *head; unsigned long flags; local_irq_save(flags); + head = this_cpu_ptr(headp); t->next = NULL; - *__this_cpu_read(tasklet_vec.tail) = t; - __this_cpu_write(tasklet_vec.tail, &(t->next)); - raise_softirq_irqoff(TASKLET_SOFTIRQ); + *head->tail = t; + head->tail = &(t->next); + raise_softirq_irqoff(softirq_nr); local_irq_restore(flags); } + +void __tasklet_schedule(struct tasklet_struct *t) +{ + __tasklet_schedule_common(t, &tasklet_vec, + TASKLET_SOFTIRQ); +} EXPORT_SYMBOL(__tasklet_schedule); void __tasklet_hi_schedule(struct tasklet_struct *t) { - unsigned long flags; - - local_irq_save(flags); - t->next = NULL; - *__this_cpu_read(tasklet_hi_vec.tail) = t; - __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); - raise_softirq_irqoff(HI_SOFTIRQ); - local_irq_restore(flags); + __tasklet_schedule_common(t, &tasklet_hi_vec, + HI_SOFTIRQ); } 
EXPORT_SYMBOL(__tasklet_hi_schedule); -static __latent_entropy void tasklet_action(struct softirq_action *a) +static void tasklet_action_common(struct softirq_action *a, + struct tasklet_head *tl_head, + unsigned int softirq_nr) { struct tasklet_struct *list; local_irq_disable(); - list = __this_cpu_read(tasklet_vec.head); - __this_cpu_write(tasklet_vec.head, NULL); - __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head)); + list = tl_head->head; + tl_head->head = NULL; + tl_head->tail = &tl_head->head; local_irq_enable(); while (list) { @@ -515,47 +521,21 @@ static __latent_entropy void tasklet_action(struct softirq_action *a) local_irq_disable(); t->next = NULL; - *__this_cpu_read(tasklet_vec.tail) = t; - __this_cpu_write(tasklet_vec.tail, &(t->next)); - __raise_softirq_irqoff(TASKLET_SOFTIRQ); + *tl_head->tail = t; + tl_head->tail = &t->next; + __raise_softirq_irqoff(softirq_nr); local_irq_enable(); } } -static __latent_entropy void tasklet_hi_action(struct softirq_action *a) +static __latent_entropy void tasklet_action(struct softirq_action *a) { - struct tasklet_struct *list; - - local_irq_disable(); - list = __this_cpu_read(tasklet_hi_vec.head); - __this_cpu_write(tasklet_hi_vec.head, NULL); - __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head)); - local_irq_enable(); - - while (list) { - struct tasklet_struct *t = list; - - list = list->next; - - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, - &t->state)) - BUG(); - t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); - } + tasklet_action_common(a, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); +} - local_irq_disable(); - t->next = NULL; - *__this_cpu_read(tasklet_hi_vec.tail) = t; - __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); - __raise_softirq_irqoff(HI_SOFTIRQ); - local_irq_enable(); - } +static __latent_entropy void tasklet_hi_action(struct softirq_action *a) +{ + 
tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); } void tasklet_init(struct tasklet_struct *t, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f98f28c12020..bdf7090b106d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -253,6 +253,10 @@ extern struct ctl_table random_table[]; extern struct ctl_table epoll_table[]; #endif +#ifdef CONFIG_FW_LOADER_USER_HELPER +extern struct ctl_table firmware_config_table[]; +#endif + #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT int sysctl_legacy_va_layout; #endif @@ -748,6 +752,13 @@ static struct ctl_table kern_table[] = { .mode = 0555, .child = usermodehelper_table, }, +#ifdef CONFIG_FW_LOADER_USER_HELPER + { + .procname = "firmware_config", + .mode = 0555, + .child = firmware_config_table, + }, +#endif { .procname = "overflowuid", .data = &overflowuid, diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index ec09ce9a6012..639321bf2e39 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -326,6 +326,17 @@ static int alarmtimer_resume(struct device *dev) } #endif +static void +__alarm_init(struct alarm *alarm, enum alarmtimer_type type, + enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) +{ + timerqueue_init(&alarm->node); + alarm->timer.function = alarmtimer_fired; + alarm->function = function; + alarm->type = type; + alarm->state = ALARMTIMER_STATE_INACTIVE; +} + /** * alarm_init - Initialize an alarm structure * @alarm: ptr to alarm to be initialized @@ -335,13 +346,9 @@ static int alarmtimer_resume(struct device *dev) void alarm_init(struct alarm *alarm, enum alarmtimer_type type, enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) { - timerqueue_init(&alarm->node); hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid, - HRTIMER_MODE_ABS); - alarm->timer.function = alarmtimer_fired; - alarm->function = function; - alarm->type = type; - alarm->state = ALARMTIMER_STATE_INACTIVE; + HRTIMER_MODE_ABS); + __alarm_init(alarm, type, function); } 
EXPORT_SYMBOL_GPL(alarm_init); @@ -719,6 +726,8 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, __set_current_state(TASK_RUNNING); + destroy_hrtimer_on_stack(&alarm->timer); + if (!alarm->data) return 0; @@ -740,6 +749,15 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, return -ERESTART_RESTARTBLOCK; } +static void +alarm_init_on_stack(struct alarm *alarm, enum alarmtimer_type type, + enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) +{ + hrtimer_init_on_stack(&alarm->timer, alarm_bases[type].base_clockid, + HRTIMER_MODE_ABS); + __alarm_init(alarm, type, function); +} + /** * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep * @restart: ptr to restart block @@ -752,7 +770,7 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) ktime_t exp = restart->nanosleep.expires; struct alarm alarm; - alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); + alarm_init_on_stack(&alarm, type, alarmtimer_nsleep_wakeup); return alarmtimer_do_nsleep(&alarm, exp, type); } @@ -784,7 +802,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, if (!capable(CAP_WAKE_ALARM)) return -EPERM; - alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); + alarm_init_on_stack(&alarm, type, alarmtimer_nsleep_wakeup); exp = timespec64_to_ktime(*tsreq); /* Convert (if necessary) to absolute time */ diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 65f9e3f24dde..0e974cface0b 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -594,6 +594,9 @@ static void __clocksource_select(bool skipcur) if (!best) return; + if (!strlen(override_name)) + goto found; + /* Check for the override clocksource. 
*/ list_for_each_entry(cs, &clocksource_list, list) { if (skipcur && cs == curr_clocksource) @@ -625,6 +628,7 @@ static void __clocksource_select(bool skipcur) break; } +found: if (curr_clocksource != best && !timekeeping_notify(best)) { pr_info("Switched to clocksource %s\n", best->name); curr_clocksource = best; @@ -853,16 +857,16 @@ EXPORT_SYMBOL(clocksource_unregister); #ifdef CONFIG_SYSFS /** - * sysfs_show_current_clocksources - sysfs interface for current clocksource + * current_clocksource_show - sysfs interface for current clocksource * @dev: unused * @attr: unused * @buf: char buffer to be filled with clocksource list * * Provides sysfs interface for listing current clocksource. */ -static ssize_t -sysfs_show_current_clocksources(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t current_clocksource_show(struct device *dev, + struct device_attribute *attr, + char *buf) { ssize_t count = 0; @@ -891,7 +895,7 @@ ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) } /** - * sysfs_override_clocksource - interface for manually overriding clocksource + * current_clocksource_store - interface for manually overriding clocksource * @dev: unused * @attr: unused * @buf: name of override clocksource @@ -900,9 +904,9 @@ ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) * Takes input from sysfs interface for manually overriding the default * clocksource selection. 
*/ -static ssize_t sysfs_override_clocksource(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t current_clocksource_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { ssize_t ret; @@ -916,9 +920,10 @@ static ssize_t sysfs_override_clocksource(struct device *dev, return ret; } +static DEVICE_ATTR_RW(current_clocksource); /** - * sysfs_unbind_current_clocksource - interface for manually unbinding clocksource + * unbind_clocksource_store - interface for manually unbinding clocksource * @dev: unused * @attr: unused * @buf: unused @@ -926,7 +931,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev, * * Takes input from sysfs interface for manually unbinding a clocksource. */ -static ssize_t sysfs_unbind_clocksource(struct device *dev, +static ssize_t unbind_clocksource_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { @@ -950,19 +955,19 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev, return ret ? 
ret : count; } +static DEVICE_ATTR_WO(unbind_clocksource); /** - * sysfs_show_available_clocksources - sysfs interface for listing clocksource + * available_clocksource_show - sysfs interface for listing clocksource * @dev: unused * @attr: unused * @buf: char buffer to be filled with clocksource list * * Provides sysfs interface for listing registered clocksources */ -static ssize_t -sysfs_show_available_clocksources(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t available_clocksource_show(struct device *dev, + struct device_attribute *attr, + char *buf) { struct clocksource *src; ssize_t count = 0; @@ -986,17 +991,15 @@ sysfs_show_available_clocksources(struct device *dev, return count; } +static DEVICE_ATTR_RO(available_clocksource); -/* - * Sysfs setup bits: - */ -static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, - sysfs_override_clocksource); - -static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource); - -static DEVICE_ATTR(available_clocksource, 0444, - sysfs_show_available_clocksources, NULL); +static struct attribute *clocksource_attrs[] = { + &dev_attr_current_clocksource.attr, + &dev_attr_unbind_clocksource.attr, + &dev_attr_available_clocksource.attr, + NULL +}; +ATTRIBUTE_GROUPS(clocksource); static struct bus_type clocksource_subsys = { .name = "clocksource", @@ -1006,6 +1009,7 @@ static struct bus_type clocksource_subsys = { static struct device device_clocksource = { .id = 0, .bus = &clocksource_subsys, + .groups = clocksource_groups, }; static int __init init_clocksource_sysfs(void) @@ -1014,17 +1018,7 @@ static int __init init_clocksource_sysfs(void) if (!error) error = device_register(&device_clocksource); - if (!error) - error = device_create_file( - &device_clocksource, - &dev_attr_current_clocksource); - if (!error) - error = device_create_file(&device_clocksource, - &dev_attr_unbind_clocksource); - if (!error) - error = device_create_file( - &device_clocksource, 
- &dev_attr_available_clocksource); + return error; } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 23788100e214..eda1210ce50f 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -91,11 +91,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .get_time = &ktime_get_real, }, { - .index = HRTIMER_BASE_BOOTTIME, - .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, - }, - { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, @@ -111,11 +106,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .get_time = &ktime_get_real, }, { - .index = HRTIMER_BASE_BOOTTIME_SOFT, - .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, - }, - { .index = HRTIMER_BASE_TAI_SOFT, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, @@ -129,7 +119,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, - [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, + [CLOCK_BOOTTIME] = HRTIMER_BASE_MONOTONIC, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; @@ -490,6 +480,7 @@ __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) while ((base = __next_base((cpu_base), &(active)))) static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, + const struct hrtimer *exclude, unsigned int active, ktime_t expires_next) { @@ -502,9 +493,22 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); + if (timer == exclude) { + /* Get to the next timer in the queue. */ + next = timerqueue_iterate_next(next); + if (!next) + continue; + + timer = container_of(next, struct hrtimer, node); + } expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires < expires_next) { expires_next = expires; + + /* Skip cpu_base update if a timer is being excluded. 
*/ + if (exclude) + continue; + if (timer->is_soft) cpu_base->softirq_next_timer = timer; else @@ -548,7 +552,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_ if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; cpu_base->softirq_next_timer = NULL; - expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX); + expires_next = __hrtimer_next_event_base(cpu_base, NULL, + active, KTIME_MAX); next_timer = cpu_base->softirq_next_timer; } @@ -556,7 +561,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_ if (active_mask & HRTIMER_ACTIVE_HARD) { active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; cpu_base->next_timer = next_timer; - expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next); + expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, + expires_next); } return expires_next; @@ -565,14 +571,12 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; - ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, - offs_real, offs_boot, offs_tai); + offs_real, offs_tai); base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; - base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; return now; @@ -1202,6 +1206,39 @@ u64 hrtimer_get_next_event(void) return expires; } + +/** + * hrtimer_next_event_without - time until next expiry event w/o one timer + * @exclude: timer to exclude + * + * Returns the next expiry time over all timers except for the @exclude one or + * KTIME_MAX if none of them is 
pending. + */ +u64 hrtimer_next_event_without(const struct hrtimer *exclude) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + u64 expires = KTIME_MAX; + unsigned long flags; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + + if (__hrtimer_hres_active(cpu_base)) { + unsigned int active; + + if (!cpu_base->softirq_activated) { + active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; + expires = __hrtimer_next_event_base(cpu_base, exclude, + active, KTIME_MAX); + } + active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; + expires = __hrtimer_next_event_base(cpu_base, exclude, active, + expires); + } + + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + + return expires; +} #endif static inline int hrtimer_clockid_to_base(clockid_t clock_id) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 8d70da1b9a0d..a09ded765f6c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -31,7 +31,7 @@ /* USER_HZ period (usecs): */ -unsigned long tick_usec = TICK_USEC; +unsigned long tick_usec = USER_TICK_USEC; /* SHIFTED_HZ period (nsecs): */ unsigned long tick_nsec; diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index b258bee13b02..6259dbc0191a 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -73,6 +73,8 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) case CLOCK_BOOTTIME: get_monotonic_boottime64(tp); break; + case CLOCK_MONOTONIC_ACTIVE: + ktime_get_active_ts64(tp); default: return -EINVAL; } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 10b7186d0638..b6899b5060bd 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -252,15 +252,16 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 * return 0; } -static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp) +static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) { - get_monotonic_boottime64(tp); + 
timekeeping_clocktai64(tp); return 0; } -static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) +static int posix_get_monotonic_active(clockid_t which_clock, + struct timespec64 *tp) { - timekeeping_clocktai64(tp); + ktime_get_active_ts64(tp); return 0; } @@ -1316,19 +1317,9 @@ static const struct k_clock clock_tai = { .timer_arm = common_hrtimer_arm, }; -static const struct k_clock clock_boottime = { +static const struct k_clock clock_monotonic_active = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_boottime, - .nsleep = common_nsleep, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - .timer_rearm = common_hrtimer_rearm, - .timer_forward = common_hrtimer_forward, - .timer_remaining = common_hrtimer_remaining, - .timer_try_to_cancel = common_hrtimer_try_to_cancel, - .timer_arm = common_hrtimer_arm, + .clock_get = posix_get_monotonic_active, }; static const struct k_clock * const posix_clocks[] = { @@ -1339,10 +1330,11 @@ static const struct k_clock * const posix_clocks[] = { [CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw, [CLOCK_REALTIME_COARSE] = &clock_realtime_coarse, [CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse, - [CLOCK_BOOTTIME] = &clock_boottime, + [CLOCK_BOOTTIME] = &clock_monotonic, [CLOCK_REALTIME_ALARM] = &alarm_clock, [CLOCK_BOOTTIME_ALARM] = &alarm_clock, [CLOCK_TAI] = &clock_tai, + [CLOCK_MONOTONIC_ACTIVE] = &clock_monotonic_active, }; static const struct k_clock *clockid_to_kclock(const clockid_t id) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 49edc1c4f3e6..099572ca4a8f 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -419,6 +419,19 @@ void tick_suspend_local(void) clockevents_shutdown(td->evtdev); } +static void tick_forward_next_period(void) +{ + ktime_t delta, now = ktime_get(); + u64 n; + + delta = ktime_sub(now, tick_next_period); + n = ktime_divns(delta, 
tick_period); + tick_next_period += n * tick_period; + if (tick_next_period < now) + tick_next_period += tick_period; + tick_sched_forward_next_period(); +} + /** * tick_resume_local - Resume the local tick device * @@ -431,6 +444,8 @@ void tick_resume_local(void) struct tick_device *td = this_cpu_ptr(&tick_cpu_device); bool broadcast = tick_resume_check_broadcast(); + tick_forward_next_period(); + clockevents_tick_resume(td->evtdev); if (!broadcast) { if (td->mode == TICKDEV_MODE_PERIODIC) diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index e277284c2831..21efab7485ca 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -141,6 +141,12 @@ static inline void tick_check_oneshot_broadcast_this_cpu(void) { } static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } #endif /* !(BROADCAST && ONESHOT) */ +#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +extern void tick_sched_forward_next_period(void); +#else +static inline void tick_sched_forward_next_period(void) { } +#endif + /* NO_HZ_FULL internal */ #ifdef CONFIG_NO_HZ_FULL extern void tick_nohz_init(void); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 5d4a0342f934..646645e981f9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -52,6 +52,15 @@ struct tick_sched *tick_get_tick_sched(int cpu) static ktime_t last_jiffies_update; /* + * Called after resume. Make sure that jiffies are not fast forwarded due to + * clock monotonic being forwarded by the suspended time. + */ +void tick_sched_forward_next_period(void) +{ + last_jiffies_update = tick_next_period; +} + +/* * Must be called with interrupts disabled ! 
*/ static void tick_do_update_jiffies64(ktime_t now) @@ -113,8 +122,7 @@ static ktime_t tick_init_jiffy_update(void) return period; } - -static void tick_sched_do_timer(ktime_t now) +static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { int cpu = smp_processor_id(); @@ -134,6 +142,9 @@ static void tick_sched_do_timer(ktime_t now) /* Check, if the jiffies need an update */ if (tick_do_timer_cpu == cpu) tick_do_update_jiffies64(now); + + if (ts->inidle) + ts->got_idle_tick = 1; } static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) @@ -465,7 +476,9 @@ __setup("nohz=", setup_tick_nohz); bool tick_nohz_tick_stopped(void) { - return __this_cpu_read(tick_cpu_sched.tick_stopped); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + return ts->tick_stopped; } bool tick_nohz_tick_stopped_cpu(int cpu) @@ -528,14 +541,11 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) sched_clock_idle_wakeup_event(); } -static ktime_t tick_nohz_start_idle(struct tick_sched *ts) +static void tick_nohz_start_idle(struct tick_sched *ts) { - ktime_t now = ktime_get(); - - ts->idle_entrytime = now; + ts->idle_entrytime = ktime_get(); ts->idle_active = 1; sched_clock_idle_sleep_event(); - return now; } /** @@ -644,13 +654,10 @@ static inline bool local_timer_softirq_pending(void) return local_softirq_pending() & TIMER_SOFTIRQ; } -static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, - ktime_t now, int cpu) +static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { - struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; unsigned long seq, basejiff; - ktime_t tick; /* Read jiffies and the time when jiffies were updated last */ do { @@ -659,6 +666,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, basejiff = jiffies; } while (read_seqretry(&jiffies_lock, seq)); ts->last_jiffies = basejiff; + ts->timer_expires_base 
= basemono; /* * Keep the periodic tick, when RCU, architecture or irq_work @@ -703,47 +711,63 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, * next period, so no point in stopping it either, bail. */ if (!ts->tick_stopped) { - tick = 0; + ts->timer_expires = 0; goto out; } } /* + * If this CPU is the one which had the do_timer() duty last, we limit + * the sleep time to the timekeeping max_deferment value. + * Otherwise we can sleep as long as we want. + */ + delta = timekeeping_max_deferment(); + if (cpu != tick_do_timer_cpu && + (tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last)) + delta = KTIME_MAX; + + /* Calculate the next expiry time */ + if (delta < (KTIME_MAX - basemono)) + expires = basemono + delta; + else + expires = KTIME_MAX; + + ts->timer_expires = min_t(u64, expires, next_tick); + +out: + return ts->timer_expires; +} + +static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) +{ + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + u64 basemono = ts->timer_expires_base; + u64 expires = ts->timer_expires; + ktime_t tick = expires; + + /* Make sure we won't be trying to stop it twice in a row. */ + ts->timer_expires_base = 0; + + /* * If this CPU is the one which updates jiffies, then give up * the assignment and let it be taken by the CPU which runs * the tick timer next, which might be this CPU as well. If we * don't drop this here the jiffies might be stale and * do_timer() never invoked. Keep track of the fact that it - * was the one which had the do_timer() duty last. If this CPU - * is the one which had the do_timer() duty last, we limit the - * sleep time to the timekeeping max_deferment value. - * Otherwise we can sleep as long as we want. + * was the one which had the do_timer() duty last. 
*/ - delta = timekeeping_max_deferment(); if (cpu == tick_do_timer_cpu) { tick_do_timer_cpu = TICK_DO_TIMER_NONE; ts->do_timer_last = 1; } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { - delta = KTIME_MAX; ts->do_timer_last = 0; - } else if (!ts->do_timer_last) { - delta = KTIME_MAX; } - /* Calculate the next expiry time */ - if (delta < (KTIME_MAX - basemono)) - expires = basemono + delta; - else - expires = KTIME_MAX; - - expires = min_t(u64, expires, next_tick); - tick = expires; - /* Skip reprogram of event if its not changed */ if (ts->tick_stopped && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) - goto out; + return; WARN_ON_ONCE(1); printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", @@ -777,7 +801,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, if (unlikely(expires == KTIME_MAX)) { if (ts->nohz_mode == NOHZ_MODE_HIGHRES) hrtimer_cancel(&ts->sched_timer); - goto out; + return; } hrtimer_set_expires(&ts->sched_timer, tick); @@ -786,15 +810,23 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); else tick_program_event(tick, 1); -out: - /* - * Update the estimated sleep length until the next timer - * (not only the tick). 
- */ - ts->sleep_length = ktime_sub(dev->next_event, now); - return tick; } +static void tick_nohz_retain_tick(struct tick_sched *ts) +{ + ts->timer_expires_base = 0; +} + +#ifdef CONFIG_NO_HZ_FULL +static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu) +{ + if (tick_nohz_next_event(ts, cpu)) + tick_nohz_stop_tick(ts, cpu); + else + tick_nohz_retain_tick(ts); +} +#endif /* CONFIG_NO_HZ_FULL */ + static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ @@ -830,7 +862,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) return; if (can_stop_full_tick(cpu, ts)) - tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); + tick_nohz_stop_sched_tick(ts, cpu); else if (ts->tick_stopped) tick_nohz_restart_sched_tick(ts, ktime_get()); #endif @@ -856,10 +888,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; } - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) { - ts->sleep_length = NSEC_PER_SEC / HZ; + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) return false; - } if (need_resched()) return false; @@ -894,42 +924,65 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return true; } -static void __tick_nohz_idle_enter(struct tick_sched *ts) +static void __tick_nohz_idle_stop_tick(struct tick_sched *ts) { - ktime_t now, expires; + ktime_t expires; int cpu = smp_processor_id(); - now = tick_nohz_start_idle(ts); + /* + * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the + * tick timer expiration time is known already. 
+ */ + if (ts->timer_expires_base) + expires = ts->timer_expires; + else if (can_stop_idle_tick(cpu, ts)) + expires = tick_nohz_next_event(ts, cpu); + else + return; + + ts->idle_calls++; - if (can_stop_idle_tick(cpu, ts)) { + if (expires > 0LL) { int was_stopped = ts->tick_stopped; - ts->idle_calls++; + tick_nohz_stop_tick(ts, cpu); - expires = tick_nohz_stop_sched_tick(ts, now, cpu); - if (expires > 0LL) { - ts->idle_sleeps++; - ts->idle_expires = expires; - } + ts->idle_sleeps++; + ts->idle_expires = expires; if (!was_stopped && ts->tick_stopped) { ts->idle_jiffies = ts->last_jiffies; nohz_balance_enter_idle(cpu); } + } else { + tick_nohz_retain_tick(ts); } } /** - * tick_nohz_idle_enter - stop the idle tick from the idle task + * tick_nohz_idle_stop_tick - stop the idle tick from the idle task * * When the next event is more than a tick into the future, stop the idle tick - * Called when we start the idle loop. - * - * The arch is responsible of calling: + */ +void tick_nohz_idle_stop_tick(void) +{ + __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched)); +} + +void tick_nohz_idle_retain_tick(void) +{ + tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); + /* + * Undo the effect of get_next_timer_interrupt() called from + * tick_nohz_next_event(). + */ + timer_clear_idle(); +} + +/** + * tick_nohz_idle_enter - prepare for entering idle on the current CPU * - * - rcu_idle_enter() after its last use of RCU before the CPU is put - * to sleep. - * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. + * Called when we start the idle loop. 
*/ void tick_nohz_idle_enter(void) { @@ -940,8 +993,11 @@ void tick_nohz_idle_enter(void) local_irq_disable(); ts = this_cpu_ptr(&tick_cpu_sched); + + WARN_ON_ONCE(ts->timer_expires_base); + ts->inidle = 1; - __tick_nohz_idle_enter(ts); + tick_nohz_start_idle(ts); local_irq_enable(); } @@ -959,21 +1015,62 @@ void tick_nohz_irq_exit(void) struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->inidle) - __tick_nohz_idle_enter(ts); + tick_nohz_start_idle(ts); else tick_nohz_full_update_tick(ts); } /** - * tick_nohz_get_sleep_length - return the length of the current sleep + * tick_nohz_idle_got_tick - Check whether or not the tick handler has run + */ +bool tick_nohz_idle_got_tick(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (ts->got_idle_tick) { + ts->got_idle_tick = 0; + return true; + } + return false; +} + +/** + * tick_nohz_get_sleep_length - return the expected length of the current sleep + * @delta_next: duration until the next event if the tick cannot be stopped * * Called from power state control code with interrupts disabled */ -ktime_t tick_nohz_get_sleep_length(void) +ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + int cpu = smp_processor_id(); + /* + * The idle entry time is expected to be a sufficient approximation of + * the current time at this point. + */ + ktime_t now = ts->idle_entrytime; + ktime_t next_event; + + WARN_ON_ONCE(!ts->inidle); + + *delta_next = ktime_sub(dev->next_event, now); - return ts->sleep_length; + if (!can_stop_idle_tick(cpu, ts)) + return *delta_next; + + next_event = tick_nohz_next_event(ts, cpu); + if (!next_event) + return *delta_next; + + /* + * If the next highres timer to expire is earlier than next_event, the + * idle governor needs to know that. 
+ */ + next_event = min_t(u64, next_event, + hrtimer_next_event_without(&ts->sched_timer)); + + return ktime_sub(next_event, now); } /** @@ -1022,6 +1119,20 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) #endif } +static void __tick_nohz_idle_restart_tick(struct tick_sched *ts, ktime_t now) +{ + tick_nohz_restart_sched_tick(ts, now); + tick_nohz_account_idle_ticks(ts); +} + +void tick_nohz_idle_restart_tick(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (ts->tick_stopped) + __tick_nohz_idle_restart_tick(ts, ktime_get()); +} + /** * tick_nohz_idle_exit - restart the idle tick from the idle task * @@ -1032,24 +1143,26 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) void tick_nohz_idle_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + bool idle_active, tick_stopped; ktime_t now; local_irq_disable(); WARN_ON_ONCE(!ts->inidle); + WARN_ON_ONCE(ts->timer_expires_base); ts->inidle = 0; + idle_active = ts->idle_active; + tick_stopped = ts->tick_stopped; - if (ts->idle_active || ts->tick_stopped) + if (idle_active || tick_stopped) now = ktime_get(); - if (ts->idle_active) + if (idle_active) tick_nohz_stop_idle(ts, now); - if (ts->tick_stopped) { - tick_nohz_restart_sched_tick(ts, now); - tick_nohz_account_idle_ticks(ts); - } + if (tick_stopped) + __tick_nohz_idle_restart_tick(ts, now); local_irq_enable(); } @@ -1065,7 +1178,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) dev->next_event = KTIME_MAX; - tick_sched_do_timer(now); + tick_sched_do_timer(ts, now); tick_sched_handle(ts, regs); /* No need to reprogram if we are running tickless */ @@ -1160,7 +1273,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); - tick_sched_do_timer(now); + tick_sched_do_timer(ts, now); /* * Do not call, when we are not in irq context and have diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h 
index 954b43dbf21c..6de959a854b2 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -38,31 +38,37 @@ enum tick_nohz_mode { * @idle_exittime: Time when the idle state was left * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding - * @sleep_length: Duration of the current idle sleep + * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) + * @timer_expires_base: Base time clock monotonic for @timer_expires * @do_timer_lst: CPU was the last one doing do_timer before going idle + * @got_idle_tick: Tick timer function has run with @inidle set */ struct tick_sched { struct hrtimer sched_timer; unsigned long check_clocks; enum tick_nohz_mode nohz_mode; + + unsigned int inidle : 1; + unsigned int tick_stopped : 1; + unsigned int idle_active : 1; + unsigned int do_timer_last : 1; + unsigned int got_idle_tick : 1; + ktime_t last_tick; ktime_t next_tick; - int inidle; - int tick_stopped; unsigned long idle_jiffies; unsigned long idle_calls; unsigned long idle_sleeps; - int idle_active; ktime_t idle_entrytime; ktime_t idle_waketime; ktime_t idle_exittime; ktime_t idle_sleeptime; ktime_t iowait_sleeptime; - ktime_t sleep_length; unsigned long last_jiffies; + u64 timer_expires; + u64 timer_expires_base; u64 next_timer; ktime_t idle_expires; - int do_timer_last; atomic_t tick_dep_mask; }; diff --git a/kernel/time/time.c b/kernel/time/time.c index bd4e6c7dd689..3044d48ebe56 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -488,6 +488,18 @@ struct timeval ns_to_timeval(const s64 nsec) } EXPORT_SYMBOL(ns_to_timeval); +struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec) +{ + struct timespec64 ts = ns_to_timespec64(nsec); + struct __kernel_old_timeval tv; + + tv.tv_sec = ts.tv_sec; + tv.tv_usec = (suseconds_t)ts.tv_nsec / 1000; + + return tv; +} +EXPORT_SYMBOL(ns_to_kernel_old_timeval); + /** 
* set_normalized_timespec - set timespec sec and nsec parts and normalize * diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cd03317e7b57..ca90219a1e73 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -138,7 +138,12 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { - tk->offs_boot = ktime_add(tk->offs_boot, delta); + /* Update both bases so mono and raw stay coupled. */ + tk->tkr_mono.base += delta; + tk->tkr_raw.base += delta; + + /* Accumulate time spent in suspend */ + tk->time_suspended += delta; } /* @@ -332,6 +337,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) tk->tkr_mono.mult = clock->mult; tk->tkr_raw.mult = clock->mult; tk->ntp_err_mult = 0; + tk->skip_second_overflow = 0; } /* Timekeeper helper functions. */ @@ -467,36 +473,6 @@ u64 ktime_get_raw_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); -/** - * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock. - * - * To keep it NMI safe since we're accessing from tracing, we're not using a - * separate timekeeper with updates to monotonic clock and boot offset - * protected with seqlocks. This has the following minor side effects: - * - * (1) Its possible that a timestamp be taken after the boot offset is updated - * but before the timekeeper is updated. If this happens, the new boot offset - * is added to the old timekeeping making the clock appear to update slightly - * earlier: - * CPU 0 CPU 1 - * timekeeping_inject_sleeptime64() - * __timekeeping_inject_sleeptime(tk, delta); - * timestamp(); - * timekeeping_update(tk, TK_CLEAR_NTP...); - * - * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be - * partially updated. Since the tk->offs_boot update is a rare event, this - * should be a rare occurrence which postprocessing should be able to handle. 
- */ -u64 notrace ktime_get_boot_fast_ns(void) -{ - struct timekeeper *tk = &tk_core.timekeeper; - - return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot)); -} -EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); - - /* * See comment for __ktime_get_fast_ns() vs. timestamp ordering */ @@ -788,7 +764,6 @@ EXPORT_SYMBOL_GPL(ktime_get_resolution_ns); static ktime_t *offsets[TK_OFFS_MAX] = { [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, - [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, }; @@ -886,6 +861,39 @@ void ktime_get_ts64(struct timespec64 *ts) EXPORT_SYMBOL_GPL(ktime_get_ts64); /** + * ktime_get_active_ts64 - Get the active non-suspended monotonic clock + * @ts: pointer to timespec variable + * + * The function calculates the monotonic clock from the realtime clock and + * the wall_to_monotonic offset, subtracts the accumulated suspend time and + * stores the result in normalized timespec64 format in the variable + * pointed to by @ts. + */ +void ktime_get_active_ts64(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 tomono, tsusp; + u64 nsec, nssusp; + unsigned int seq; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + ts->tv_sec = tk->xtime_sec; + nsec = timekeeping_get_ns(&tk->tkr_mono); + tomono = tk->wall_to_monotonic; + nssusp = tk->time_suspended; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + ts->tv_sec += tomono.tv_sec; + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsec + tomono.tv_nsec); + tsusp = ns_to_timespec64(nssusp); + *ts = timespec64_sub(*ts, tsusp); +} + +/** * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC * * Returns the seconds portion of CLOCK_MONOTONIC with a single non @@ -1585,7 +1593,6 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, return; } tk_xtime_add(tk, delta); - tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); tk_update_sleep_time(tk, 
timespec64_to_ktime(*delta)); tk_debug_account_sleep_time(delta); } @@ -1799,20 +1806,19 @@ device_initcall(timekeeping_init_ops); */ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, s64 offset, - bool negative, - int adj_scale) + s32 mult_adj) { s64 interval = tk->cycle_interval; - s32 mult_adj = 1; - if (negative) { - mult_adj = -mult_adj; + if (mult_adj == 0) { + return; + } else if (mult_adj == -1) { interval = -interval; - offset = -offset; + offset = -offset; + } else if (mult_adj != 1) { + interval *= mult_adj; + offset *= mult_adj; } - mult_adj <<= adj_scale; - interval <<= adj_scale; - offset <<= adj_scale; /* * So the following can be confusing. @@ -1860,8 +1866,6 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, * xtime_nsec_2 = xtime_nsec_1 - offset * Which simplfies to: * xtime_nsec -= offset - * - * XXX - TODO: Doc ntp_error calculation. */ if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { /* NTP adjustment caused clocksource mult overflow */ @@ -1872,89 +1876,38 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, tk->tkr_mono.mult += mult_adj; tk->xtime_interval += interval; tk->tkr_mono.xtime_nsec -= offset; - tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; } /* - * Calculate the multiplier adjustment needed to match the frequency - * specified by NTP + * Adjust the timekeeper's multiplier to the correct frequency + * and also to reduce the accumulated error value. 
*/ -static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, - s64 offset) +static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { - s64 interval = tk->cycle_interval; - s64 xinterval = tk->xtime_interval; - u32 base = tk->tkr_mono.clock->mult; - u32 max = tk->tkr_mono.clock->maxadj; - u32 cur_adj = tk->tkr_mono.mult; - s64 tick_error; - bool negative; - u32 adj_scale; - - /* Remove any current error adj from freq calculation */ - if (tk->ntp_err_mult) - xinterval -= tk->cycle_interval; - - tk->ntp_tick = ntp_tick_length(); - - /* Calculate current error per tick */ - tick_error = ntp_tick_length() >> tk->ntp_error_shift; - tick_error -= (xinterval + tk->xtime_remainder); - - /* Don't worry about correcting it if its small */ - if (likely((tick_error >= 0) && (tick_error <= interval))) - return; - - /* preserve the direction of correction */ - negative = (tick_error < 0); + u32 mult; - /* If any adjustment would pass the max, just return */ - if (negative && (cur_adj - 1) <= (base - max)) - return; - if (!negative && (cur_adj + 1) >= (base + max)) - return; /* - * Sort out the magnitude of the correction, but - * avoid making so large a correction that we go - * over the max adjustment. + * Determine the multiplier from the current NTP tick length. + * Avoid expensive division when the tick length doesn't change. 
*/ - adj_scale = 0; - tick_error = abs(tick_error); - while (tick_error > interval) { - u32 adj = 1 << (adj_scale + 1); - - /* Check if adjustment gets us within 1 unit from the max */ - if (negative && (cur_adj - adj) <= (base - max)) - break; - if (!negative && (cur_adj + adj) >= (base + max)) - break; - - adj_scale++; - tick_error >>= 1; + if (likely(tk->ntp_tick == ntp_tick_length())) { + mult = tk->tkr_mono.mult - tk->ntp_err_mult; + } else { + tk->ntp_tick = ntp_tick_length(); + mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) - + tk->xtime_remainder, tk->cycle_interval); } - /* scale the corrections */ - timekeeping_apply_adjustment(tk, offset, negative, adj_scale); -} + /* + * If the clock is behind the NTP time, increase the multiplier by 1 + * to catch up with it. If it's ahead and there was a remainder in the + * tick division, the clock will slow down. Otherwise it will stay + * ahead until the tick length changes to a non-divisible value. + */ + tk->ntp_err_mult = tk->ntp_error > 0 ? 1 : 0; + mult += tk->ntp_err_mult; -/* - * Adjust the timekeeper's multiplier to the correct frequency - * and also to reduce the accumulated error value. 
- */ -static void timekeeping_adjust(struct timekeeper *tk, s64 offset) -{ - /* Correct for the current frequency error */ - timekeeping_freqadjust(tk, offset); - - /* Next make a small adjustment to fix any cumulative error */ - if (!tk->ntp_err_mult && (tk->ntp_error > 0)) { - tk->ntp_err_mult = 1; - timekeeping_apply_adjustment(tk, offset, 0, 0); - } else if (tk->ntp_err_mult && (tk->ntp_error <= 0)) { - /* Undo any existing error adjustment */ - timekeeping_apply_adjustment(tk, offset, 1, 0); - tk->ntp_err_mult = 0; - } + timekeeping_apply_adjustment(tk, offset, mult - tk->tkr_mono.mult); if (unlikely(tk->tkr_mono.clock->maxadj && (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult) @@ -1971,18 +1924,15 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) * in the code above, its possible the required corrective factor to * xtime_nsec could cause it to underflow. * - * Now, since we already accumulated the second, cannot simply roll - * the accumulated second back, since the NTP subsystem has been - * notified via second_overflow. So instead we push xtime_nsec forward - * by the amount we underflowed, and add that amount into the error. - * - * We'll correct this error next time through this function, when - * xtime_nsec is not as small. + * Now, since we have already accumulated the second and the NTP + * subsystem has been notified via second_overflow(), we need to skip + * the next update. */ if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) { - s64 neg = -(s64)tk->tkr_mono.xtime_nsec; - tk->tkr_mono.xtime_nsec = 0; - tk->ntp_error += neg << tk->ntp_error_shift; + tk->tkr_mono.xtime_nsec += (u64)NSEC_PER_SEC << + tk->tkr_mono.shift; + tk->xtime_sec--; + tk->skip_second_overflow = 1; } } @@ -2005,6 +1955,15 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) tk->tkr_mono.xtime_nsec -= nsecps; tk->xtime_sec++; + /* + * Skip NTP update if this second was accumulated before, + * i.e. 
xtime_nsec underflowed in timekeeping_adjust() + */ + if (unlikely(tk->skip_second_overflow)) { + tk->skip_second_overflow = 0; + continue; + } + /* Figure out if its a leap sec and apply if needed */ leap = second_overflow(tk->xtime_sec); if (unlikely(leap)) { @@ -2121,7 +2080,7 @@ void update_wall_time(void) shift--; } - /* correct the clock when NTP error is too big */ + /* Adjust the multiplier to correct NTP error */ timekeeping_adjust(tk, offset); /* @@ -2166,7 +2125,7 @@ out: void getboottime64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); + ktime_t t = ktime_sub(tk->offs_real, tk->time_suspended); *ts = ktime_to_timespec64(t); } @@ -2236,7 +2195,6 @@ void do_timer(unsigned long ticks) * ktime_get_update_offsets_now - hrtimer helper * @cwsseq: pointer to check and store the clock was set sequence number * @offs_real: pointer to storage for monotonic -> realtime offset - * @offs_boot: pointer to storage for monotonic -> boottime offset * @offs_tai: pointer to storage for monotonic -> clock tai offset * * Returns current monotonic time and updates the offsets if the @@ -2246,7 +2204,7 @@ void do_timer(unsigned long ticks) * Called from hrtimer_interrupt() or retrigger_next_event() */ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, - ktime_t *offs_boot, ktime_t *offs_tai) + ktime_t *offs_tai) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; @@ -2263,7 +2221,6 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, if (*cwsseq != tk->clock_was_set_seq) { *cwsseq = tk->clock_was_set_seq; *offs_real = tk->offs_real; - *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; } diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 7a9b4eb7a1d5..79b67f5e0343 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -6,7 +6,6 @@ */ extern ktime_t ktime_get_update_offsets_now(unsigned int 
*cwsseq, ktime_t *offs_real, - ktime_t *offs_boot, ktime_t *offs_tai); extern int timekeeping_valid_for_hres(void); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 01e6b3a38871..d88e96d4e12c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -524,7 +524,8 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = { .arg3_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -568,7 +569,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) } } -static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -582,12 +584,13 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_override_return_proto; #endif default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } /* bpf+kprobe programs can access fields of 'struct pt_regs' */ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct pt_regs)) @@ -661,7 +664,8 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -669,11 +673,12 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) case 
BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) @@ -721,7 +726,8 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto = { .arg3_type = ARG_CONST_SIZE, }; -static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -731,15 +737,97 @@ static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id) case BPF_FUNC_perf_prog_read_value: return &bpf_perf_prog_read_value_proto; default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } +/* + * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp + * to avoid potential recursive reuse issue when/if tracepoints are added + * inside bpf_*_event_output and/or bpf_get_stack_id + */ +static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); +BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, + struct bpf_map *, map, u64, flags, void *, data, u64, size) +{ + struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + + perf_fetch_caller_regs(regs); + return ____bpf_perf_event_output(regs, map, flags, data, size); +} + +static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { + .func = bpf_perf_event_output_raw_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, + struct bpf_map 
*, map, u64, flags) +{ + struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + + perf_fetch_caller_regs(regs); + /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ + return bpf_get_stackid((unsigned long) regs, (unsigned long) map, + flags, 0, 0); +} + +static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { + .func = bpf_get_stackid_raw_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_perf_event_output_proto_raw_tp; + case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto_raw_tp; + default: + return tracing_func_proto(func_id, prog); + } +} + +static bool raw_tp_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + /* largest tracepoint in the kernel has 12 args */ + if (off < 0 || off >= sizeof(__u64) * 12) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + return true; +} + +const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { + .get_func_proto = raw_tp_prog_func_proto, + .is_valid_access = raw_tp_prog_is_valid_access, +}; + +const struct bpf_prog_ops raw_tracepoint_prog_ops = { +}; + static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { - const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data, - sample_period); + const int size_u64 = sizeof(u64); if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) return false; @@ -750,8 +838,13 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type switch (off) { case bpf_ctx_range(struct bpf_perf_event_data, sample_period): - 
bpf_ctx_record_field_size(info, size_sp); - if (!bpf_ctx_narrow_access_ok(off, size, size_sp)) + bpf_ctx_record_field_size(info, size_u64); + if (!bpf_ctx_narrow_access_ok(off, size, size_u64)) + return false; + break; + case bpf_ctx_range(struct bpf_perf_event_data, addr): + bpf_ctx_record_field_size(info, size_u64); + if (!bpf_ctx_narrow_access_ok(off, size, size_u64)) return false; break; default: @@ -778,6 +871,14 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, bpf_target_off(struct perf_sample_data, period, 8, target_size)); break; + case offsetof(struct bpf_perf_event_data, addr): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + data), si->dst_reg, si->src_reg, + offsetof(struct bpf_perf_event_data_kern, data)); + *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, + bpf_target_off(struct perf_sample_data, addr, 8, + target_size)); + break; default: *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, regs), si->dst_reg, si->src_reg, @@ -896,3 +997,106 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) return ret; } + +extern struct bpf_raw_event_map __start__bpf_raw_tp[]; +extern struct bpf_raw_event_map __stop__bpf_raw_tp[]; + +struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) +{ + struct bpf_raw_event_map *btp = __start__bpf_raw_tp; + + for (; btp < __stop__bpf_raw_tp; btp++) { + if (!strcmp(btp->tp->name, name)) + return btp; + } + return NULL; +} + +static __always_inline +void __bpf_trace_run(struct bpf_prog *prog, u64 *args) +{ + rcu_read_lock(); + preempt_disable(); + (void) BPF_PROG_RUN(prog, args); + preempt_enable(); + rcu_read_unlock(); +} + +#define UNPACK(...) __VA_ARGS__ +#define REPEAT_1(FN, DL, X, ...) FN(X) +#define REPEAT_2(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_1(FN, DL, __VA_ARGS__) +#define REPEAT_3(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_2(FN, DL, __VA_ARGS__) +#define REPEAT_4(FN, DL, X, ...) 
FN(X) UNPACK DL REPEAT_3(FN, DL, __VA_ARGS__) +#define REPEAT_5(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_4(FN, DL, __VA_ARGS__) +#define REPEAT_6(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_5(FN, DL, __VA_ARGS__) +#define REPEAT_7(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_6(FN, DL, __VA_ARGS__) +#define REPEAT_8(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_7(FN, DL, __VA_ARGS__) +#define REPEAT_9(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_8(FN, DL, __VA_ARGS__) +#define REPEAT_10(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_9(FN, DL, __VA_ARGS__) +#define REPEAT_11(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_10(FN, DL, __VA_ARGS__) +#define REPEAT_12(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_11(FN, DL, __VA_ARGS__) +#define REPEAT(X, FN, DL, ...) REPEAT_##X(FN, DL, __VA_ARGS__) + +#define SARG(X) u64 arg##X +#define COPY(X) args[X] = arg##X + +#define __DL_COM (,) +#define __DL_SEM (;) + +#define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 + +#define BPF_TRACE_DEFN_x(x) \ + void bpf_trace_run##x(struct bpf_prog *prog, \ + REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \ + { \ + u64 args[x]; \ + REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \ + __bpf_trace_run(prog, args); \ + } \ + EXPORT_SYMBOL_GPL(bpf_trace_run##x) +BPF_TRACE_DEFN_x(1); +BPF_TRACE_DEFN_x(2); +BPF_TRACE_DEFN_x(3); +BPF_TRACE_DEFN_x(4); +BPF_TRACE_DEFN_x(5); +BPF_TRACE_DEFN_x(6); +BPF_TRACE_DEFN_x(7); +BPF_TRACE_DEFN_x(8); +BPF_TRACE_DEFN_x(9); +BPF_TRACE_DEFN_x(10); +BPF_TRACE_DEFN_x(11); +BPF_TRACE_DEFN_x(12); + +static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +{ + struct tracepoint *tp = btp->tp; + + /* + * check that program doesn't access arguments beyond what's + * available in this tracepoint + */ + if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) + return -EINVAL; + + return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog); +} + +int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +{ + int err; + + mutex_lock(&bpf_event_mutex); + err = 
__bpf_probe_register(btp, prog); + mutex_unlock(&bpf_event_mutex); + return err; +} + +int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +{ + int err; + + mutex_lock(&bpf_event_mutex); + err = tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog); + mutex_unlock(&bpf_event_mutex); + return err; +} diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 20a2300ae4e8..5071931eb943 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1164,7 +1164,7 @@ static struct { { trace_clock, "perf", 1 }, { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, - { ktime_get_boot_fast_ns, "boot", 1 }, + { ktime_get_mono_fast_ns, "boot", 1 }, ARCH_TRACE_CLOCKS }; @@ -2380,7 +2380,7 @@ EXPORT_SYMBOL_GPL(trace_event_buffer_commit); * trace_buffer_unlock_commit_regs() * trace_event_buffer_commit() * trace_event_raw_event_xxx() -*/ + */ # define STACK_SKIP 3 void trace_buffer_unlock_commit_regs(struct trace_array *tr, diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index ad1d6164e946..50f44b7b2b32 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -196,7 +196,7 @@ struct notifier_block module_trace_bprintk_format_nb = { }; int __trace_bprintk(unsigned long ip, const char *fmt, ...) - { +{ int ret; va_list ap; @@ -214,7 +214,7 @@ int __trace_bprintk(unsigned long ip, const char *fmt, ...) 
EXPORT_SYMBOL_GPL(__trace_bprintk); int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap) - { +{ if (unlikely(!fmt)) return 0; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 254e636a3d6b..ca7959be8aaa 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -153,10 +153,9 @@ struct worker_pool { unsigned long watchdog_ts; /* L: watchdog timestamp */ struct list_head worklist; /* L: list of pending works */ - int nr_workers; /* L: total number of workers */ - /* nr_idle includes the ones off idle_list for rebinding */ - int nr_idle; /* L: currently idle ones */ + int nr_workers; /* L: total number of workers */ + int nr_idle; /* L: currently idle workers */ struct list_head idle_list; /* X: list of idle workers */ struct timer_list idle_timer; /* L: worker idle timeout */ @@ -166,7 +165,6 @@ struct worker_pool { DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); /* L: hash of busy workers */ - /* see manage_workers() for details on the two manager mutexes */ struct worker *manager; /* L: purely informational */ struct mutex attach_mutex; /* attach/detach exclusion */ struct list_head workers; /* A: attached workers */ @@ -1604,6 +1602,40 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(mod_delayed_work_on); +static void rcu_work_rcufn(struct rcu_head *rcu) +{ + struct rcu_work *rwork = container_of(rcu, struct rcu_work, rcu); + + /* read the comment in __queue_work() */ + local_irq_disable(); + __queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work); + local_irq_enable(); +} + +/** + * queue_rcu_work - queue work after a RCU grace period + * @wq: workqueue to use + * @rwork: work to queue + * + * Return: %false if @rwork was already pending, %true otherwise. Note + * that a full RCU grace period is guaranteed only after a %true return. + * While @rwork is guaranteed to be executed after a %false return, the + * execution may happen before a full RCU grace period has passed. 
+ */ +bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) +{ + struct work_struct *work = &rwork->work; + + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + rwork->wq = wq; + call_rcu(&rwork->rcu, rcu_work_rcufn); + return true; + } + + return false; +} +EXPORT_SYMBOL(queue_rcu_work); + /** * worker_enter_idle - enter idle state * @worker: worker which is entering idle state @@ -3001,6 +3033,26 @@ bool flush_delayed_work(struct delayed_work *dwork) } EXPORT_SYMBOL(flush_delayed_work); +/** + * flush_rcu_work - wait for a rwork to finish executing the last queueing + * @rwork: the rcu work to flush + * + * Return: + * %true if flush_rcu_work() waited for the work to finish execution, + * %false if it was already idle. + */ +bool flush_rcu_work(struct rcu_work *rwork) +{ + if (test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&rwork->work))) { + rcu_barrier(); + flush_work(&rwork->work); + return true; + } else { + return flush_work(&rwork->work); + } +} +EXPORT_SYMBOL(flush_rcu_work); + static bool __cancel_work(struct work_struct *work, bool is_dwork) { unsigned long flags; |