author     Ingo Molnar <mingo@kernel.org>    2017-04-11 10:05:36 +0300
committer  Ingo Molnar <mingo@kernel.org>    2017-04-11 10:05:36 +0300
commit     36a4dfc37856cae40e4d864950f768d8b7d4f9f5 (patch)
tree       3aa6a1459a699b986632f64b740e8e98e3ce6500 /kernel
parent     a481db34b9beb7a9647c23f2320dd38a2b1d681f (diff)
parent     39da7c509acff13fc8cb12ec1bb20337c988ed36 (diff)
download   linux-36a4dfc37856cae40e4d864950f768d8b7d4f9f5.tar.xz
Merge tag 'v4.11-rc6' into sched/core, to pick up fixes
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c                    639
-rw-r--r--  kernel/audit.h                     15
-rw-r--r--  kernel/auditsc.c                   29
-rw-r--r--  kernel/bpf/hashtab.c              144
-rw-r--r--  kernel/bpf/verifier.c              64
-rw-r--r--  kernel/cpu.c                       28
-rw-r--r--  kernel/events/core.c               64
-rw-r--r--  kernel/futex.c                     22
-rw-r--r--  kernel/locking/rwsem-spinlock.c    16
-rw-r--r--  kernel/memremap.c                   4
-rw-r--r--  kernel/padata.c                     5
-rw-r--r--  kernel/ptrace.c                    14
-rw-r--r--  kernel/sched/clock.c               46
-rw-r--r--  kernel/sched/cpufreq_schedutil.c   20
-rw-r--r--  kernel/sysctl.c                     3
-rw-r--r--  kernel/trace/ring_buffer.c          8
16 files changed, 664 insertions(+), 457 deletions(-)
diff --git a/kernel/audit.c b/kernel/audit.c index e794544f5e63..2f4964cfde0b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -54,6 +54,10 @@ #include <linux/kthread.h> #include <linux/kernel.h> #include <linux/syscalls.h> +#include <linux/spinlock.h> +#include <linux/rcupdate.h> +#include <linux/mutex.h> +#include <linux/gfp.h> #include <linux/audit.h> @@ -90,13 +94,34 @@ static u32 audit_default; /* If auditing cannot proceed, audit_failure selects what happens. */ static u32 audit_failure = AUDIT_FAIL_PRINTK; -/* - * If audit records are to be written to the netlink socket, audit_pid - * contains the pid of the auditd process and audit_nlk_portid contains - * the portid to use to send netlink messages to that process. +/* private audit network namespace index */ +static unsigned int audit_net_id; + +/** + * struct audit_net - audit private network namespace data + * @sk: communication socket + */ +struct audit_net { + struct sock *sk; +}; + +/** + * struct auditd_connection - kernel/auditd connection state + * @pid: auditd PID + * @portid: netlink portid + * @net: the associated network namespace + * @lock: spinlock to protect write access + * + * Description: + * This struct is RCU protected; you must either hold the RCU lock for reading + * or the included spinlock for writing. */ -int audit_pid; -static __u32 audit_nlk_portid; +static struct auditd_connection { + int pid; + u32 portid; + struct net *net; + spinlock_t lock; +} auditd_conn; /* If audit_rate_limit is non-zero, limit the rate of sending audit records * to that number per second. This prevents DoS attacks, but results in @@ -123,10 +148,6 @@ u32 audit_sig_sid = 0; */ static atomic_t audit_lost = ATOMIC_INIT(0); -/* The netlink socket. */ -static struct sock *audit_sock; -static unsigned int audit_net_id; - /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; @@ -139,6 +160,7 @@ static LIST_HEAD(audit_freelist); /* queue msgs to send via kauditd_task */ static struct sk_buff_head audit_queue; +static void kauditd_hold_skb(struct sk_buff *skb); /* queue msgs due to temporary unicast send problems */ static struct sk_buff_head audit_retry_queue; /* queue msgs waiting for new auditd connection */ @@ -192,6 +214,43 @@ struct audit_reply { struct sk_buff *skb; }; +/** + * auditd_test_task - Check to see if a given task is an audit daemon + * @task: the task to check + * + * Description: + * Return 1 if the task is a registered audit daemon, 0 otherwise. + */ +int auditd_test_task(const struct task_struct *task) +{ + int rc; + + rcu_read_lock(); + rc = (auditd_conn.pid && task->tgid == auditd_conn.pid ? 1 : 0); + rcu_read_unlock(); + + return rc; +} + +/** + * audit_get_sk - Return the audit socket for the given network namespace + * @net: the destination network namespace + * + * Description: + * Returns the sock pointer if valid, NULL otherwise. The caller must ensure + * that a reference is held for the network namespace while the sock is in use. + */ +static struct sock *audit_get_sk(const struct net *net) +{ + struct audit_net *aunet; + + if (!net) + return NULL; + + aunet = net_generic(net, audit_net_id); + return aunet->sk; +} + static void audit_set_portid(struct audit_buffer *ab, __u32 portid) { if (ab) { @@ -210,9 +269,7 @@ void audit_panic(const char *message) pr_err("%s\n", message); break; case AUDIT_FAIL_PANIC: - /* test audit_pid since printk is always losey, why bother? 
*/ - if (audit_pid) - panic("audit: %s\n", message); + panic("audit: %s\n", message); break; } } @@ -370,21 +427,87 @@ static int audit_set_failure(u32 state) return audit_do_config_change("audit_failure", &audit_failure, state); } -/* - * For one reason or another this nlh isn't getting delivered to the userspace - * audit daemon, just send it to printk. +/** + * auditd_set - Set/Reset the auditd connection state + * @pid: auditd PID + * @portid: auditd netlink portid + * @net: auditd network namespace pointer + * + * Description: + * This function will obtain and drop network namespace references as + * necessary. + */ +static void auditd_set(int pid, u32 portid, struct net *net) +{ + unsigned long flags; + + spin_lock_irqsave(&auditd_conn.lock, flags); + auditd_conn.pid = pid; + auditd_conn.portid = portid; + if (auditd_conn.net) + put_net(auditd_conn.net); + if (net) + auditd_conn.net = get_net(net); + else + auditd_conn.net = NULL; + spin_unlock_irqrestore(&auditd_conn.lock, flags); +} + +/** + * auditd_reset - Disconnect the auditd connection + * + * Description: + * Break the auditd/kauditd connection and move all the queued records into the + * hold queue in case auditd reconnects. + */ +static void auditd_reset(void) +{ + struct sk_buff *skb; + + /* if it isn't already broken, break the connection */ + rcu_read_lock(); + if (auditd_conn.pid) + auditd_set(0, 0, NULL); + rcu_read_unlock(); + + /* flush all of the main and retry queues to the hold queue */ + while ((skb = skb_dequeue(&audit_retry_queue))) + kauditd_hold_skb(skb); + while ((skb = skb_dequeue(&audit_queue))) + kauditd_hold_skb(skb); +} + +/** + * kauditd_print_skb - Print the audit record to the ring buffer + * @skb: audit record + * + * Whatever the reason, this packet may not make it to the auditd connection + * so write it via printk so the information isn't completely lost. */ static void kauditd_printk_skb(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); char *data = nlmsg_data(nlh); - if (nlh->nlmsg_type != AUDIT_EOE) { - if (printk_ratelimit()) - pr_notice("type=%d %s\n", nlh->nlmsg_type, data); - else - audit_log_lost("printk limit exceeded"); - } + if (nlh->nlmsg_type != AUDIT_EOE && printk_ratelimit()) + pr_notice("type=%d %s\n", nlh->nlmsg_type, data); +} + +/** + * kauditd_rehold_skb - Handle a audit record send failure in the hold queue + * @skb: audit record + * + * Description: + * This should only be used by the kauditd_thread when it fails to flush the + * hold queue. + */ +static void kauditd_rehold_skb(struct sk_buff *skb) +{ + /* put the record back in the queue at the same place */ + skb_queue_head(&audit_hold_queue, skb); + + /* fail the auditd connection */ + auditd_reset(); } /** @@ -421,6 +544,9 @@ static void kauditd_hold_skb(struct sk_buff *skb) /* we have no other options - drop the message */ audit_log_lost("kauditd hold queue overflow"); kfree_skb(skb); + + /* fail the auditd connection */ + auditd_reset(); } /** @@ -441,51 +567,122 @@ static void kauditd_retry_skb(struct sk_buff *skb) } /** - * auditd_reset - Disconnect the auditd connection + * auditd_send_unicast_skb - Send a record via unicast to auditd + * @skb: audit record * * Description: - * Break the auditd/kauditd connection and move all the records in the retry - * queue into the hold queue in case auditd reconnects. The audit_cmd_mutex - * must be held when calling this function. 
+ * Send a skb to the audit daemon, returns positive/zero values on success and + * negative values on failure; in all cases the skb will be consumed by this + * function. If the send results in -ECONNREFUSED the connection with auditd + * will be reset. This function may sleep so callers should not hold any locks + * where this would cause a problem. */ -static void auditd_reset(void) +static int auditd_send_unicast_skb(struct sk_buff *skb) { - struct sk_buff *skb; - - /* break the connection */ - if (audit_sock) { - sock_put(audit_sock); - audit_sock = NULL; + int rc; + u32 portid; + struct net *net; + struct sock *sk; + + /* NOTE: we can't call netlink_unicast while in the RCU section so + * take a reference to the network namespace and grab local + * copies of the namespace, the sock, and the portid; the + * namespace and sock aren't going to go away while we hold a + * reference and if the portid does become invalid after the RCU + * section netlink_unicast() should safely return an error */ + + rcu_read_lock(); + if (!auditd_conn.pid) { + rcu_read_unlock(); + rc = -ECONNREFUSED; + goto err; } - audit_pid = 0; - audit_nlk_portid = 0; + net = auditd_conn.net; + get_net(net); + sk = audit_get_sk(net); + portid = auditd_conn.portid; + rcu_read_unlock(); - /* flush all of the retry queue to the hold queue */ - while ((skb = skb_dequeue(&audit_retry_queue))) - kauditd_hold_skb(skb); + rc = netlink_unicast(sk, skb, portid, 0); + put_net(net); + if (rc < 0) + goto err; + + return rc; + +err: + if (rc == -ECONNREFUSED) + auditd_reset(); + return rc; } /** - * kauditd_send_unicast_skb - Send a record via unicast to auditd - * @skb: audit record + * kauditd_send_queue - Helper for kauditd_thread to flush skb queues + * @sk: the sending sock + * @portid: the netlink destination + * @queue: the skb queue to process + * @retry_limit: limit on number of netlink unicast failures + * @skb_hook: per-skb hook for additional processing + * @err_hook: hook called if the skb fails the netlink unicast send + * + * Description: + * Run through the given queue and attempt to send the audit records to auditd, + * returns zero on success, negative values on failure. It is up to the caller + * to ensure that the @sk is valid for the duration of this function. + * */ -static int kauditd_send_unicast_skb(struct sk_buff *skb) +static int kauditd_send_queue(struct sock *sk, u32 portid, + struct sk_buff_head *queue, + unsigned int retry_limit, + void (*skb_hook)(struct sk_buff *skb), + void (*err_hook)(struct sk_buff *skb)) { - int rc; + int rc = 0; + struct sk_buff *skb; + static unsigned int failed = 0; - /* if we know nothing is connected, don't even try the netlink call */ - if (!audit_pid) - return -ECONNREFUSED; + /* NOTE: kauditd_thread takes care of all our locking, we just use + * the netlink info passed to us (e.g. sk and portid) */ + + while ((skb = skb_dequeue(queue))) { + /* call the skb_hook for each skb we touch */ + if (skb_hook) + (*skb_hook)(skb); + + /* can we send to anyone via unicast? */ + if (!sk) { + if (err_hook) + (*err_hook)(skb); + continue; + } - /* get an extra skb reference in case we fail to send */ - skb_get(skb); - rc = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); - if (rc >= 0) { - consume_skb(skb); - rc = 0; + /* grab an extra skb reference in case of error */ + skb_get(skb); + rc = netlink_unicast(sk, skb, portid, 0); + if (rc < 0) { + /* fatal failure for our queue flush attempt? 
*/ + if (++failed >= retry_limit || + rc == -ECONNREFUSED || rc == -EPERM) { + /* yes - error processing for the queue */ + sk = NULL; + if (err_hook) + (*err_hook)(skb); + if (!skb_hook) + goto out; + /* keep processing with the skb_hook */ + continue; + } else + /* no - requeue to preserve ordering */ + skb_queue_head(queue, skb); + } else { + /* it worked - drop the extra reference and continue */ + consume_skb(skb); + failed = 0; + } } - return rc; +out: + return (rc >= 0 ? 0 : rc); } /* @@ -493,16 +690,19 @@ static int kauditd_send_unicast_skb(struct sk_buff *skb) * @skb: audit record * * Description: - * This function doesn't consume an skb as might be expected since it has to - * copy it anyways. + * Write a multicast message to anyone listening in the initial network + * namespace. This function doesn't consume an skb as might be expected since + * it has to copy it anyways. */ static void kauditd_send_multicast_skb(struct sk_buff *skb) { struct sk_buff *copy; - struct audit_net *aunet = net_generic(&init_net, audit_net_id); - struct sock *sock = aunet->nlsk; + struct sock *sock = audit_get_sk(&init_net); struct nlmsghdr *nlh; + /* NOTE: we are not taking an additional reference for init_net since + * we don't have to worry about it going away */ + if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG)) return; @@ -526,149 +726,75 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb) } /** - * kauditd_wake_condition - Return true when it is time to wake kauditd_thread - * - * Description: - * This function is for use by the wait_event_freezable() call in - * kauditd_thread(). + * kauditd_thread - Worker thread to send audit records to userspace + * @dummy: unused */ -static int kauditd_wake_condition(void) -{ - static int pid_last = 0; - int rc; - int pid = audit_pid; - - /* wake on new messages or a change in the connected auditd */ - rc = skb_queue_len(&audit_queue) || (pid && pid != pid_last); - if (rc) - pid_last = pid; - - return rc; -} - static int kauditd_thread(void *dummy) { int rc; - int auditd = 0; - int reschedule = 0; - struct sk_buff *skb; - struct nlmsghdr *nlh; + u32 portid = 0; + struct net *net = NULL; + struct sock *sk = NULL; #define UNICAST_RETRIES 5 -#define AUDITD_BAD(x,y) \ - ((x) == -ECONNREFUSED || (x) == -EPERM || ++(y) >= UNICAST_RETRIES) - - /* NOTE: we do invalidate the auditd connection flag on any sending - * errors, but we only "restore" the connection flag at specific places - * in the loop in order to help ensure proper ordering of audit - * records */ set_freezable(); while (!kthread_should_stop()) { - /* NOTE: possible area for future improvement is to look at - * the hold and retry queues, since only this thread - * has access to these queues we might be able to do - * our own queuing and skip some/all of the locking */ - - /* NOTE: it might be a fun experiment to split the hold and - * retry queue handling to another thread, but the - * synchronization issues and other overhead might kill - * any performance gains */ + /* NOTE: see the lock comments in auditd_send_unicast_skb() */ + rcu_read_lock(); + if (!auditd_conn.pid) { + rcu_read_unlock(); + goto main_queue; + } + net = auditd_conn.net; + get_net(net); + sk = audit_get_sk(net); + portid = auditd_conn.portid; + rcu_read_unlock(); /* attempt to flush the hold queue */ - while (auditd && (skb = skb_dequeue(&audit_hold_queue))) { - rc = kauditd_send_unicast_skb(skb); - if (rc) { - /* requeue to the same spot */ - skb_queue_head(&audit_hold_queue, skb); - - auditd = 0; - if 
(AUDITD_BAD(rc, reschedule)) { - mutex_lock(&audit_cmd_mutex); - auditd_reset(); - mutex_unlock(&audit_cmd_mutex); - reschedule = 0; - } - } else - /* we were able to send successfully */ - reschedule = 0; + rc = kauditd_send_queue(sk, portid, + &audit_hold_queue, UNICAST_RETRIES, + NULL, kauditd_rehold_skb); + if (rc < 0) { + sk = NULL; + goto main_queue; } /* attempt to flush the retry queue */ - while (auditd && (skb = skb_dequeue(&audit_retry_queue))) { - rc = kauditd_send_unicast_skb(skb); - if (rc) { - auditd = 0; - if (AUDITD_BAD(rc, reschedule)) { - kauditd_hold_skb(skb); - mutex_lock(&audit_cmd_mutex); - auditd_reset(); - mutex_unlock(&audit_cmd_mutex); - reschedule = 0; - } else - /* temporary problem (we hope), queue - * to the same spot and retry */ - skb_queue_head(&audit_retry_queue, skb); - } else - /* we were able to send successfully */ - reschedule = 0; + rc = kauditd_send_queue(sk, portid, + &audit_retry_queue, UNICAST_RETRIES, + NULL, kauditd_hold_skb); + if (rc < 0) { + sk = NULL; + goto main_queue; } - /* standard queue processing, try to be as quick as possible */ -quick_loop: - skb = skb_dequeue(&audit_queue); - if (skb) { - /* setup the netlink header, see the comments in - * kauditd_send_multicast_skb() for length quirks */ - nlh = nlmsg_hdr(skb); - nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; - - /* attempt to send to any multicast listeners */ - kauditd_send_multicast_skb(skb); - - /* attempt to send to auditd, queue on failure */ - if (auditd) { - rc = kauditd_send_unicast_skb(skb); - if (rc) { - auditd = 0; - if (AUDITD_BAD(rc, reschedule)) { - mutex_lock(&audit_cmd_mutex); - auditd_reset(); - mutex_unlock(&audit_cmd_mutex); - reschedule = 0; - } - - /* move to the retry queue */ - kauditd_retry_skb(skb); - } else - /* everything is working so go fast! */ - goto quick_loop; - } else if (reschedule) - /* we are currently having problems, move to - * the retry queue */ - kauditd_retry_skb(skb); - else - /* dump the message via printk and hold it */ - kauditd_hold_skb(skb); - } else { - /* we have flushed the backlog so wake everyone */ - wake_up(&audit_backlog_wait); - - /* if everything is okay with auditd (if present), go - * to sleep until there is something new in the queue - * or we have a change in the connected auditd; - * otherwise simply reschedule to give things a chance - * to recover */ - if (reschedule) { - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - } else - wait_event_freezable(kauditd_wait, - kauditd_wake_condition()); - - /* update the auditd connection status */ - auditd = (audit_pid ? 1 : 0); +main_queue: + /* process the main queue - do the multicast send and attempt + * unicast, dump failed record sends to the retry queue; if + * sk == NULL due to previous failures we will just do the + * multicast send and move the record to the retry queue */ + kauditd_send_queue(sk, portid, &audit_queue, 1, + kauditd_send_multicast_skb, + kauditd_retry_skb); + + /* drop our netns reference, no auditd sends past this line */ + if (net) { + put_net(net); + net = NULL; } + sk = NULL; + + /* we have processed all the queues so wake everyone */ + wake_up(&audit_backlog_wait); + + /* NOTE: we want to wake up if there is anything on the queue, + * regardless of if an auditd is connected, as we need to + * do the multicast send and rotate records from the + * main queue to the retry/hold queues */ + wait_event_freezable(kauditd_wait, + (skb_queue_len(&audit_queue) ? 
1 : 0)); } return 0; @@ -678,17 +804,16 @@ int audit_send_list(void *_dest) { struct audit_netlink_list *dest = _dest; struct sk_buff *skb; - struct net *net = dest->net; - struct audit_net *aunet = net_generic(net, audit_net_id); + struct sock *sk = audit_get_sk(dest->net); /* wait for parent to finish and send an ACK */ mutex_lock(&audit_cmd_mutex); mutex_unlock(&audit_cmd_mutex); while ((skb = __skb_dequeue(&dest->q)) != NULL) - netlink_unicast(aunet->nlsk, skb, dest->portid, 0); + netlink_unicast(sk, skb, dest->portid, 0); - put_net(net); + put_net(dest->net); kfree(dest); return 0; @@ -722,16 +847,15 @@ out_kfree_skb: static int audit_send_reply_thread(void *arg) { struct audit_reply *reply = (struct audit_reply *)arg; - struct net *net = reply->net; - struct audit_net *aunet = net_generic(net, audit_net_id); + struct sock *sk = audit_get_sk(reply->net); mutex_lock(&audit_cmd_mutex); mutex_unlock(&audit_cmd_mutex); /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. */ - netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0); - put_net(net); + netlink_unicast(sk, reply->skb, reply->portid, 0); + put_net(reply->net); kfree(reply); return 0; } @@ -949,12 +1073,12 @@ static int audit_set_feature(struct sk_buff *skb) static int audit_replace(pid_t pid) { - struct sk_buff *skb = audit_make_reply(0, 0, AUDIT_REPLACE, 0, 0, - &pid, sizeof(pid)); + struct sk_buff *skb; + skb = audit_make_reply(0, 0, AUDIT_REPLACE, 0, 0, &pid, sizeof(pid)); if (!skb) return -ENOMEM; - return netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); + return auditd_send_unicast_skb(skb); } static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) @@ -981,7 +1105,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) memset(&s, 0, sizeof(s)); s.enabled = audit_enabled; s.failure = audit_failure; - s.pid = audit_pid; + rcu_read_lock(); + s.pid = auditd_conn.pid; + rcu_read_unlock(); s.rate_limit = audit_rate_limit; s.backlog_limit = audit_backlog_limit; s.lost = atomic_read(&audit_lost); @@ -1014,30 +1140,44 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) * from the initial pid namespace, but something * to keep in mind if this changes */ int new_pid = s.pid; + pid_t auditd_pid; pid_t requesting_pid = task_tgid_vnr(current); - if ((!new_pid) && (requesting_pid != audit_pid)) { - audit_log_config_change("audit_pid", new_pid, audit_pid, 0); + /* test the auditd connection */ + audit_replace(requesting_pid); + + rcu_read_lock(); + auditd_pid = auditd_conn.pid; + /* only the current auditd can unregister itself */ + if ((!new_pid) && (requesting_pid != auditd_pid)) { + rcu_read_unlock(); + audit_log_config_change("audit_pid", new_pid, + auditd_pid, 0); return -EACCES; } - if (audit_pid && new_pid && - audit_replace(requesting_pid) != -ECONNREFUSED) { - audit_log_config_change("audit_pid", new_pid, audit_pid, 0); + /* replacing a healthy auditd is not allowed */ + if (auditd_pid && new_pid) { + rcu_read_unlock(); + audit_log_config_change("audit_pid", new_pid, + auditd_pid, 0); return -EEXIST; } + rcu_read_unlock(); + if (audit_enabled != AUDIT_OFF) - audit_log_config_change("audit_pid", new_pid, audit_pid, 1); + audit_log_config_change("audit_pid", new_pid, + auditd_pid, 1); + if (new_pid) { - if (audit_sock) - sock_put(audit_sock); - audit_pid = new_pid; - audit_nlk_portid = NETLINK_CB(skb).portid; - sock_hold(skb->sk); - audit_sock = skb->sk; - } else { + /* register a new auditd connection */ + 
auditd_set(new_pid, + NETLINK_CB(skb).portid, + sock_net(NETLINK_CB(skb).sk)); + /* try to process any backlog */ + wake_up_interruptible(&kauditd_wait); + } else + /* unregister the auditd connection */ auditd_reset(); - } - wake_up_interruptible(&kauditd_wait); } if (s.mask & AUDIT_STATUS_RATE_LIMIT) { err = audit_set_rate_limit(s.rate_limit); @@ -1090,7 +1230,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err) break; } - mutex_unlock(&audit_cmd_mutex); audit_log_common_recv_msg(&ab, msg_type); if (msg_type != AUDIT_USER_TTY) audit_log_format(ab, " msg='%.*s'", @@ -1108,7 +1247,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } audit_set_portid(ab, NETLINK_CB(skb).portid); audit_log_end(ab); - mutex_lock(&audit_cmd_mutex); } break; case AUDIT_ADD_RULE: @@ -1298,26 +1436,26 @@ static int __net_init audit_net_init(struct net *net) struct audit_net *aunet = net_generic(net, audit_net_id); - aunet->nlsk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg); - if (aunet->nlsk == NULL) { + aunet->sk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg); + if (aunet->sk == NULL) { audit_panic("cannot initialize netlink socket in namespace"); return -ENOMEM; } - aunet->nlsk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + aunet->sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + return 0; } static void __net_exit audit_net_exit(struct net *net) { struct audit_net *aunet = net_generic(net, audit_net_id); - struct sock *sock = aunet->nlsk; - mutex_lock(&audit_cmd_mutex); - if (sock == audit_sock) + + rcu_read_lock(); + if (net == auditd_conn.net) auditd_reset(); - mutex_unlock(&audit_cmd_mutex); + rcu_read_unlock(); - netlink_kernel_release(sock); - aunet->nlsk = NULL; + netlink_kernel_release(aunet->sk); } static struct pernet_operations audit_net_ops __net_initdata = { @@ -1335,20 +1473,24 @@ static int __init audit_init(void) if (audit_initialized == AUDIT_DISABLED) return 0; - pr_info("initializing netlink subsys (%s)\n", - audit_default ? "enabled" : "disabled"); - register_pernet_subsys(&audit_net_ops); + memset(&auditd_conn, 0, sizeof(auditd_conn)); + spin_lock_init(&auditd_conn.lock); skb_queue_head_init(&audit_queue); skb_queue_head_init(&audit_retry_queue); skb_queue_head_init(&audit_hold_queue); - audit_initialized = AUDIT_INITIALIZED; - audit_enabled = audit_default; - audit_ever_enabled |= !!audit_default; for (i = 0; i < AUDIT_INODE_BUCKETS; i++) INIT_LIST_HEAD(&audit_inode_hash[i]); + pr_info("initializing netlink subsys (%s)\n", + audit_default ? "enabled" : "disabled"); + register_pernet_subsys(&audit_net_ops); + + audit_initialized = AUDIT_INITIALIZED; + audit_enabled = audit_default; + audit_ever_enabled |= !!audit_default; + kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); if (IS_ERR(kauditd_task)) { int err = PTR_ERR(kauditd_task); @@ -1519,20 +1661,16 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE))) return NULL; - /* don't ever fail/sleep on these two conditions: + /* NOTE: don't ever fail/sleep on these two conditions: * 1. auditd generated record - since we need auditd to drain the * queue; also, when we are checking for auditd, compare PIDs using * task_tgid_vnr() since auditd_pid is set in audit_receive_msg() * using a PID anchored in the caller's namespace - * 2. 
audit command message - record types 1000 through 1099 inclusive - * are command messages/records used to manage the kernel subsystem - * and the audit userspace, blocking on these messages could cause - * problems under load so don't do it (note: not all of these - * command types are valid as record types, but it is quicker to - * just check two ints than a series of ints in a if/switch stmt) */ - if (!((audit_pid && audit_pid == task_tgid_vnr(current)) || - (type >= 1000 && type <= 1099))) { - long sleep_time = audit_backlog_wait_time; + * 2. generator holding the audit_cmd_mutex - we don't want to block + * while holding the mutex */ + if (!(auditd_test_task(current) || + (current == __mutex_owner(&audit_cmd_mutex)))) { + long stime = audit_backlog_wait_time; while (audit_backlog_limit && (skb_queue_len(&audit_queue) > audit_backlog_limit)) { @@ -1541,14 +1679,13 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, /* sleep if we are allowed and we haven't exhausted our * backlog wait limit */ - if ((gfp_mask & __GFP_DIRECT_RECLAIM) && - (sleep_time > 0)) { + if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) { DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&audit_backlog_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); - sleep_time = schedule_timeout(sleep_time); + stime = schedule_timeout(stime); remove_wait_queue(&audit_backlog_wait, &wait); } else { if (audit_rate_check() && printk_ratelimit()) @@ -2127,15 +2264,27 @@ out: */ void audit_log_end(struct audit_buffer *ab) { + struct sk_buff *skb; + struct nlmsghdr *nlh; + if (!ab) return; - if (!audit_rate_check()) { - audit_log_lost("rate limit exceeded"); - } else { - skb_queue_tail(&audit_queue, ab->skb); - wake_up_interruptible(&kauditd_wait); + + if (audit_rate_check()) { + skb = ab->skb; ab->skb = NULL; - } + + /* setup the netlink header, see the comments in + * kauditd_send_multicast_skb() for length quirks */ + nlh = nlmsg_hdr(skb); + nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; + + /* queue the netlink packet and poke the kauditd thread */ + skb_queue_tail(&audit_queue, skb); + wake_up_interruptible(&kauditd_wait); + } else + audit_log_lost("rate limit exceeded"); + audit_buffer_free(ab); } diff --git a/kernel/audit.h b/kernel/audit.h index ca579880303a..0d87f8ab8778 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -218,7 +218,7 @@ extern void audit_log_name(struct audit_context *context, struct audit_names *n, const struct path *path, int record_num, int *call_panic); -extern int audit_pid; +extern int auditd_test_task(const struct task_struct *task); #define AUDIT_INODE_BUCKETS 32 extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; @@ -250,10 +250,6 @@ struct audit_netlink_list { int audit_send_list(void *); -struct audit_net { - struct sock *nlsk; -}; - extern int selinux_audit_rule_update(void); extern struct mutex audit_filter_mutex; @@ -337,14 +333,7 @@ extern u32 audit_sig_sid; extern int audit_filter(int msgtype, unsigned int listtype); #ifdef CONFIG_AUDITSYSCALL -extern int __audit_signal_info(int sig, struct task_struct *t); -static inline int audit_signal_info(int sig, struct task_struct *t) -{ - if (unlikely((audit_pid && t->tgid == audit_pid) || - (audit_signals && !audit_dummy_context()))) - return __audit_signal_info(sig, t); - return 0; -} +extern int audit_signal_info(int sig, struct task_struct *t); extern void audit_filter_inodes(struct task_struct *, struct audit_context *); extern struct list_head *audit_killed_trees(void); #else diff --git 
a/kernel/auditsc.c b/kernel/auditsc.c index d6a8de5f8fa3..1c2333155893 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -762,7 +762,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, struct audit_entry *e; enum audit_state state; - if (audit_pid && tsk->tgid == audit_pid) + if (auditd_test_task(tsk)) return AUDIT_DISABLED; rcu_read_lock(); @@ -816,7 +816,7 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) { struct audit_names *n; - if (audit_pid && tsk->tgid == audit_pid) + if (auditd_test_task(tsk)) return; rcu_read_lock(); @@ -2249,26 +2249,27 @@ void __audit_ptrace(struct task_struct *t) * If the audit subsystem is being terminated, record the task (pid) * and uid that is doing that. */ -int __audit_signal_info(int sig, struct task_struct *t) +int audit_signal_info(int sig, struct task_struct *t) { struct audit_aux_data_pids *axp; struct task_struct *tsk = current; struct audit_context *ctx = tsk->audit_context; kuid_t uid = current_uid(), t_uid = task_uid(t); - if (audit_pid && t->tgid == audit_pid) { - if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { - audit_sig_pid = task_tgid_nr(tsk); - if (uid_valid(tsk->loginuid)) - audit_sig_uid = tsk->loginuid; - else - audit_sig_uid = uid; - security_task_getsecid(tsk, &audit_sig_sid); - } - if (!audit_signals || audit_dummy_context()) - return 0; + if (auditd_test_task(t) && + (sig == SIGTERM || sig == SIGHUP || + sig == SIGUSR1 || sig == SIGUSR2)) { + audit_sig_pid = task_tgid_nr(tsk); + if (uid_valid(tsk->loginuid)) + audit_sig_uid = tsk->loginuid; + else + audit_sig_uid = uid; + security_task_getsecid(tsk, &audit_sig_sid); } + if (!audit_signals || audit_dummy_context()) + return 0; + /* optimize the common case by putting first signal recipient directly * in audit_context */ if (!ctx->target_pid) { diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index afe5bab376c9..361a69dfe543 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -30,18 +30,12 @@ struct bpf_htab { struct pcpu_freelist freelist; struct bpf_lru lru; }; - void __percpu *extra_elems; + struct htab_elem *__percpu *extra_elems; atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ }; -enum extra_elem_state { - HTAB_NOT_AN_EXTRA_ELEM = 0, - HTAB_EXTRA_ELEM_FREE, - HTAB_EXTRA_ELEM_USED -}; - /* each htab element is struct htab_elem + key + value */ struct htab_elem { union { @@ -56,7 +50,6 @@ struct htab_elem { }; union { struct rcu_head rcu; - enum extra_elem_state state; struct bpf_lru_node lru_node; }; u32 hash; @@ -77,6 +70,11 @@ static bool htab_is_percpu(const struct bpf_htab *htab) htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } +static bool htab_is_prealloc(const struct bpf_htab *htab) +{ + return !(htab->map.map_flags & BPF_F_NO_PREALLOC); +} + static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { @@ -128,17 +126,20 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, static int prealloc_init(struct bpf_htab *htab) { + u32 num_entries = htab->map.max_entries; int err = -ENOMEM, i; - htab->elems = bpf_map_area_alloc(htab->elem_size * - htab->map.max_entries); + if (!htab_is_percpu(htab) && !htab_is_lru(htab)) + num_entries += num_possible_cpus(); + + htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries); if (!htab->elems) return -ENOMEM; if (!htab_is_percpu(htab)) goto skip_percpu_elems; 
- for (i = 0; i < htab->map.max_entries; i++) { + for (i = 0; i < num_entries; i++) { u32 size = round_up(htab->map.value_size, 8); void __percpu *pptr; @@ -166,11 +167,11 @@ skip_percpu_elems: if (htab_is_lru(htab)) bpf_lru_populate(&htab->lru, htab->elems, offsetof(struct htab_elem, lru_node), - htab->elem_size, htab->map.max_entries); + htab->elem_size, num_entries); else pcpu_freelist_populate(&htab->freelist, htab->elems + offsetof(struct htab_elem, fnode), - htab->elem_size, htab->map.max_entries); + htab->elem_size, num_entries); return 0; @@ -191,16 +192,22 @@ static void prealloc_destroy(struct bpf_htab *htab) static int alloc_extra_elems(struct bpf_htab *htab) { - void __percpu *pptr; + struct htab_elem *__percpu *pptr, *l_new; + struct pcpu_freelist_node *l; int cpu; - pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN); + pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8, + GFP_USER | __GFP_NOWARN); if (!pptr) return -ENOMEM; for_each_possible_cpu(cpu) { - ((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state = - HTAB_EXTRA_ELEM_FREE; + l = pcpu_freelist_pop(&htab->freelist); + /* pop will succeed, since prealloc_init() + * preallocated extra num_possible_cpus elements + */ + l_new = container_of(l, struct htab_elem, fnode); + *per_cpu_ptr(pptr, cpu) = l_new; } htab->extra_elems = pptr; return 0; @@ -342,25 +349,25 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) raw_spin_lock_init(&htab->buckets[i].lock); } - if (!percpu && !lru) { - /* lru itself can remove the least used element, so - * there is no need for an extra elem during map_update. - */ - err = alloc_extra_elems(htab); - if (err) - goto free_buckets; - } - if (prealloc) { err = prealloc_init(htab); if (err) - goto free_extra_elems; + goto free_buckets; + + if (!percpu && !lru) { + /* lru itself can remove the least used element, so + * there is no need for an extra elem during map_update. 
+ */ + err = alloc_extra_elems(htab); + if (err) + goto free_prealloc; + } } return &htab->map; -free_extra_elems: - free_percpu(htab->extra_elems); +free_prealloc: + prealloc_destroy(htab); free_buckets: bpf_map_area_free(htab->buckets); free_htab: @@ -575,12 +582,7 @@ static void htab_elem_free_rcu(struct rcu_head *head) static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { - if (l->state == HTAB_EXTRA_ELEM_USED) { - l->state = HTAB_EXTRA_ELEM_FREE; - return; - } - - if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) { + if (htab_is_prealloc(htab)) { pcpu_freelist_push(&htab->freelist, &l->fnode); } else { atomic_dec(&htab->count); @@ -610,47 +612,43 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, - bool old_elem_exists) + struct htab_elem *old_elem) { u32 size = htab->map.value_size; - bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC); - struct htab_elem *l_new; + bool prealloc = htab_is_prealloc(htab); + struct htab_elem *l_new, **pl_new; void __percpu *pptr; - int err = 0; if (prealloc) { - struct pcpu_freelist_node *l; + if (old_elem) { + /* if we're updating the existing element, + * use per-cpu extra elems to avoid freelist_pop/push + */ + pl_new = this_cpu_ptr(htab->extra_elems); + l_new = *pl_new; + *pl_new = old_elem; + } else { + struct pcpu_freelist_node *l; - l = pcpu_freelist_pop(&htab->freelist); - if (!l) - err = -E2BIG; - else + l = pcpu_freelist_pop(&htab->freelist); + if (!l) + return ERR_PTR(-E2BIG); l_new = container_of(l, struct htab_elem, fnode); - } else { - if (atomic_inc_return(&htab->count) > htab->map.max_entries) { - atomic_dec(&htab->count); - err = -E2BIG; - } else { - l_new = kmalloc(htab->elem_size, - GFP_ATOMIC | __GFP_NOWARN); - if (!l_new) - return ERR_PTR(-ENOMEM); } - } - - if (err) { - if (!old_elem_exists) - return ERR_PTR(err); - - /* if we're updating the existing element and the hash table - * is full, use per-cpu extra elems - */ - l_new = this_cpu_ptr(htab->extra_elems); - if (l_new->state != HTAB_EXTRA_ELEM_FREE) - return ERR_PTR(-E2BIG); - l_new->state = HTAB_EXTRA_ELEM_USED; } else { - l_new->state = HTAB_NOT_AN_EXTRA_ELEM; + if (atomic_inc_return(&htab->count) > htab->map.max_entries) + if (!old_elem) { + /* when map is full and update() is replacing + * old element, it's ok to allocate, since + * old element will be freed immediately. 
+ * Otherwise return an error + */ + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); + if (!l_new) + return ERR_PTR(-ENOMEM); } memcpy(l_new->key, key, key_size); @@ -731,7 +729,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, goto err; l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, - !!l_old); + l_old); if (IS_ERR(l_new)) { /* all pre-allocated elements are in use or memory exhausted */ ret = PTR_ERR(l_new); @@ -744,7 +742,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, hlist_nulls_add_head_rcu(&l_new->hash_node, head); if (l_old) { hlist_nulls_del_rcu(&l_old->hash_node); - free_htab_elem(htab, l_old); + if (!htab_is_prealloc(htab)) + free_htab_elem(htab, l_old); } ret = 0; err: @@ -856,7 +855,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, value, onallcpus); } else { l_new = alloc_htab_elem(htab, key, value, key_size, - hash, true, onallcpus, false); + hash, true, onallcpus, NULL); if (IS_ERR(l_new)) { ret = PTR_ERR(l_new); goto err; @@ -1024,8 +1023,7 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { hlist_nulls_del_rcu(&l->hash_node); - if (l->state != HTAB_EXTRA_ELEM_USED) - htab_elem_free(htab, l); + htab_elem_free(htab, l); } } } @@ -1045,7 +1043,7 @@ static void htab_map_free(struct bpf_map *map) * not have executed. Wait for them. */ rcu_barrier(); - if (htab->map.map_flags & BPF_F_NO_PREALLOC) + if (!htab_is_prealloc(htab)) delete_all_elements(htab); else prealloc_destroy(htab); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 796b68d00119..a834068a400e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -765,38 +765,56 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) } } -static int check_ptr_alignment(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, int off, int size) +static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, + int off, int size) { - if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) { - if (off % size != 0) { - verbose("misaligned access off %d size %d\n", - off, size); - return -EACCES; - } else { - return 0; - } - } - - if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) - /* misaligned access to packet is ok on x86,arm,arm64 */ - return 0; - if (reg->id && size != 1) { - verbose("Unknown packet alignment. Only byte-sized access allowed\n"); + verbose("Unknown alignment. Only byte-sized access allowed in packet access.\n"); return -EACCES; } /* skb->data is NET_IP_ALIGN-ed */ - if (reg->type == PTR_TO_PACKET && - (NET_IP_ALIGN + reg->off + off) % size != 0) { + if ((NET_IP_ALIGN + reg->off + off) % size != 0) { verbose("misaligned packet access off %d+%d+%d size %d\n", NET_IP_ALIGN, reg->off, off, size); return -EACCES; } + return 0; } +static int check_val_ptr_alignment(const struct bpf_reg_state *reg, + int size) +{ + if (size != 1) { + verbose("Unknown alignment. Only byte-sized access allowed in value access.\n"); + return -EACCES; + } + + return 0; +} + +static int check_ptr_alignment(const struct bpf_reg_state *reg, + int off, int size) +{ + switch (reg->type) { + case PTR_TO_PACKET: + return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 : + check_pkt_ptr_alignment(reg, off, size); + case PTR_TO_MAP_VALUE_ADJ: + return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 
0 : + check_val_ptr_alignment(reg, size); + default: + if (off % size != 0) { + verbose("misaligned access off %d size %d\n", + off, size); + return -EACCES; + } + + return 0; + } +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -818,7 +836,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, if (size < 0) return size; - err = check_ptr_alignment(env, reg, off, size); + err = check_ptr_alignment(reg, off, size); if (err) return err; @@ -1925,6 +1943,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) * register as unknown. */ if (env->allow_ptr_leaks && + BPF_CLASS(insn->code) == BPF_ALU64 && opcode == BPF_ADD && (dst_reg->type == PTR_TO_MAP_VALUE || dst_reg->type == PTR_TO_MAP_VALUE_ADJ)) dst_reg->type = PTR_TO_MAP_VALUE_ADJ; @@ -1973,14 +1992,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, for (i = 0; i < MAX_BPF_REG; i++) if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) - regs[i].range = dst_reg->off; + /* keep the maximum range already checked */ + regs[i].range = max(regs[i].range, dst_reg->off); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) continue; reg = &state->spilled_regs[i / BPF_REG_SIZE]; if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) - reg->range = dst_reg->off; + reg->range = max(reg->range, dst_reg->off); } } diff --git a/kernel/cpu.c b/kernel/cpu.c index f7c063239fa5..37b223e4fc05 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1335,26 +1335,21 @@ static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name, struct cpuhp_step *sp; int ret = 0; - mutex_lock(&cpuhp_state_mutex); - if (state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN) { ret = cpuhp_reserve_state(state); if (ret < 0) - goto out; + return ret; state = ret; } sp = cpuhp_get_step(state); - if (name && sp->name) { - ret = -EBUSY; - goto out; - } + if (name && sp->name) + return -EBUSY; + sp->startup.single = startup; sp->teardown.single = teardown; sp->name = name; sp->multi_instance = multi_instance; INIT_HLIST_HEAD(&sp->list); -out: - mutex_unlock(&cpuhp_state_mutex); return ret; } @@ -1428,6 +1423,7 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, return -EINVAL; get_online_cpus(); + mutex_lock(&cpuhp_state_mutex); if (!invoke || !sp->startup.multi) goto add_node; @@ -1447,16 +1443,14 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, if (ret) { if (sp->teardown.multi) cpuhp_rollback_install(cpu, state, node); - goto err; + goto unlock; } } add_node: ret = 0; - mutex_lock(&cpuhp_state_mutex); hlist_add_head(node, &sp->list); +unlock: mutex_unlock(&cpuhp_state_mutex); - -err: put_online_cpus(); return ret; } @@ -1491,6 +1485,7 @@ int __cpuhp_setup_state(enum cpuhp_state state, return -EINVAL; get_online_cpus(); + mutex_lock(&cpuhp_state_mutex); ret = cpuhp_store_callbacks(state, name, startup, teardown, multi_instance); @@ -1524,6 +1519,7 @@ int __cpuhp_setup_state(enum cpuhp_state state, } } out: + mutex_unlock(&cpuhp_state_mutex); put_online_cpus(); /* * If the requested state is CPUHP_AP_ONLINE_DYN, return the @@ -1547,6 +1543,8 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state, return -EINVAL; get_online_cpus(); + mutex_lock(&cpuhp_state_mutex); + if 
(!invoke || !cpuhp_get_teardown_cb(state)) goto remove; /* @@ -1563,7 +1561,6 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state, } remove: - mutex_lock(&cpuhp_state_mutex); hlist_del(node); mutex_unlock(&cpuhp_state_mutex); put_online_cpus(); @@ -1571,6 +1568,7 @@ remove: return 0; } EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance); + /** * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state * @state: The state to remove @@ -1589,6 +1587,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) get_online_cpus(); + mutex_lock(&cpuhp_state_mutex); if (sp->multi_instance) { WARN(!hlist_empty(&sp->list), "Error: Removing state %d which has instances left.\n", @@ -1613,6 +1612,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) } remove: cpuhp_store_callbacks(state, NULL, NULL, NULL, false); + mutex_unlock(&cpuhp_state_mutex); put_online_cpus(); } EXPORT_SYMBOL(__cpuhp_remove_state); diff --git a/kernel/events/core.c b/kernel/events/core.c index a17ed56c8ce1..ff01cba86f43 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4256,7 +4256,7 @@ int perf_event_release_kernel(struct perf_event *event) raw_spin_lock_irq(&ctx->lock); /* - * Mark this even as STATE_DEAD, there is no external reference to it + * Mark this event as STATE_DEAD, there is no external reference to it * anymore. * * Anybody acquiring event->child_mutex after the below loop _must_ @@ -10417,21 +10417,22 @@ void perf_event_free_task(struct task_struct *task) continue; mutex_lock(&ctx->mutex); -again: - list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, - group_entry) - perf_free_event(event, ctx); + raw_spin_lock_irq(&ctx->lock); + /* + * Destroy the task <-> ctx relation and mark the context dead. + * + * This is important because even though the task hasn't been + * exposed yet the context has been (through child_list). + */ + RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL); + WRITE_ONCE(ctx->task, TASK_TOMBSTONE); + put_task_struct(task); /* cannot be last */ + raw_spin_unlock_irq(&ctx->lock); - list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, - group_entry) + list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) perf_free_event(event, ctx); - if (!list_empty(&ctx->pinned_groups) || - !list_empty(&ctx->flexible_groups)) - goto again; - mutex_unlock(&ctx->mutex); - put_ctx(ctx); } } @@ -10469,7 +10470,12 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event) } /* - * inherit a event from parent task to child task: + * Inherit a event from parent task to child task. + * + * Returns: + * - valid pointer on success + * - NULL for orphaned events + * - IS_ERR() on error */ static struct perf_event * inherit_event(struct perf_event *parent_event, @@ -10563,6 +10569,16 @@ inherit_event(struct perf_event *parent_event, return child_event; } +/* + * Inherits an event group. + * + * This will quietly suppress orphaned events; !inherit_event() is not an error. + * This matches with perf_event_release_kernel() removing all child events. + * + * Returns: + * - 0 on success + * - <0 on error + */ static int inherit_group(struct perf_event *parent_event, struct task_struct *parent, struct perf_event_context *parent_ctx, @@ -10577,6 +10593,11 @@ static int inherit_group(struct perf_event *parent_event, child, NULL, child_ctx); if (IS_ERR(leader)) return PTR_ERR(leader); + /* + * @leader can be NULL here because of is_orphaned_event(). 
In this + * case inherit_event() will create individual events, similar to what + * perf_group_detach() would do anyway. + */ list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { child_ctr = inherit_event(sub, parent, parent_ctx, child, leader, child_ctx); @@ -10586,6 +10607,17 @@ static int inherit_group(struct perf_event *parent_event, return 0; } +/* + * Creates the child task context and tries to inherit the event-group. + * + * Clears @inherited_all on !attr.inherited or error. Note that we'll leave + * inherited_all set when we 'fail' to inherit an orphaned event; this is + * consistent with perf_event_release_kernel() removing all child events. + * + * Returns: + * - 0 on success + * - <0 on error + */ static int inherit_task_group(struct perf_event *event, struct task_struct *parent, struct perf_event_context *parent_ctx, @@ -10608,7 +10640,6 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, * First allocate and initialize a context for the * child. */ - child_ctx = alloc_perf_context(parent_ctx->pmu, child); if (!child_ctx) return -ENOMEM; @@ -10670,7 +10701,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); if (ret) - break; + goto out_unlock; } /* @@ -10686,7 +10717,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); if (ret) - break; + goto out_unlock; } raw_spin_lock_irqsave(&parent_ctx->lock, flags); @@ -10714,6 +10745,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) } raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); +out_unlock: mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); diff --git a/kernel/futex.c b/kernel/futex.c index 229a744b1781..45858ec73941 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2815,7 +2815,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, { struct hrtimer_sleeper timeout, *to = NULL; struct rt_mutex_waiter rt_waiter; - struct rt_mutex *pi_mutex = NULL; struct futex_hash_bucket *hb; union futex_key key2 = FUTEX_KEY_INIT; struct futex_q q = futex_q_init; @@ -2899,6 +2898,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); + if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) + rt_mutex_unlock(&q.pi_state->pi_mutex); /* * Drop the reference to the pi state which * the requeue_pi() code acquired for us. @@ -2907,6 +2908,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, spin_unlock(q.lock_ptr); } } else { + struct rt_mutex *pi_mutex; + /* * We have been woken up by futex_unlock_pi(), a timeout, or a * signal. futex_unlock_pi() will not destroy the lock_ptr nor @@ -2930,18 +2933,19 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (res) ret = (res < 0) ? res : 0; + /* + * If fixup_pi_state_owner() faulted and was unable to handle + * the fault, unlock the rt_mutex and return the fault to + * userspace. + */ + if (ret && rt_mutex_owner(pi_mutex) == current) + rt_mutex_unlock(pi_mutex); + /* Unqueue and drop the lock. */ unqueue_me_pi(&q); } - /* - * If fixup_pi_state_owner() faulted and was unable to handle the - * fault, unlock the rt_mutex and return the fault to userspace. 
- */ - if (ret == -EFAULT) { - if (pi_mutex && rt_mutex_owner(pi_mutex) == current) - rt_mutex_unlock(pi_mutex); - } else if (ret == -EINTR) { + if (ret == -EINTR) { /* * We've already been requeued, but cannot restart by calling * futex_lock_pi() directly. We could restart this syscall, but diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 7bc24d477805..c65f7989f850 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -213,10 +213,9 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state) */ if (sem->count == 0) break; - if (signal_pending_state(state, current)) { - ret = -EINTR; - goto out; - } + if (signal_pending_state(state, current)) + goto out_nolock; + set_current_state(state); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); schedule(); @@ -224,12 +223,19 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state) } /* got the lock */ sem->count = -1; -out: list_del(&waiter.list); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); return ret; + +out_nolock: + list_del(&waiter.list); + if (!list_empty(&sem->wait_list)) + __rwsem_do_wake(sem, 1); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return -EINTR; } void __sched __down_write(struct rw_semaphore *sem) diff --git a/kernel/memremap.c b/kernel/memremap.c index 06123234f118..07e85e5229da 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -247,11 +247,9 @@ static void devm_memremap_pages_release(struct device *dev, void *data) align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); - lock_device_hotplug(); mem_hotplug_begin(); arch_remove_memory(align_start, align_size); mem_hotplug_done(); - unlock_device_hotplug(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); pgmap_radix_release(res); @@ -364,11 +362,9 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (error) goto err_pfn_remap; - lock_device_hotplug(); mem_hotplug_begin(); error = arch_add_memory(nid, align_start, align_size, true); mem_hotplug_done(); - unlock_device_hotplug(); if (error) goto err_add_memory; diff --git a/kernel/padata.c b/kernel/padata.c index 05316c9f32da..3202aa17492c 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -186,19 +186,20 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) reorder = &next_queue->reorder; + spin_lock(&reorder->lock); if (!list_empty(&reorder->list)) { padata = list_entry(reorder->list.next, struct padata_priv, list); - spin_lock(&reorder->lock); list_del_init(&padata->list); atomic_dec(&pd->reorder_objects); - spin_unlock(&reorder->lock); pd->processed++; + spin_unlock(&reorder->lock); goto out; } + spin_unlock(&reorder->lock); if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) { padata = ERR_PTR(-ENODATA); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 0af928712174..266ddcc1d8bb 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -184,11 +184,17 @@ static void ptrace_unfreeze_traced(struct task_struct *task) WARN_ON(!task->ptrace || task->parent != current); + /* + * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up remotely. + * Recheck state under the lock to close this race. 
+ */ spin_lock_irq(&task->sighand->siglock); - if (__fatal_signal_pending(task)) - wake_up_state(task, __TASK_TRACED); - else - task->state = TASK_TRACED; + if (task->state == __TASK_TRACED) { + if (__fatal_signal_pending(task)) + wake_up_state(task, __TASK_TRACED); + else + task->state = TASK_TRACED; + } spin_unlock_irq(&task->sighand->siglock); } diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index a08795e21628..00a45c45beca 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -96,10 +96,10 @@ static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable); static int __sched_clock_stable_early = 1; /* - * We want: ktime_get_ns() + gtod_offset == sched_clock() + raw_offset + * We want: ktime_get_ns() + __gtod_offset == sched_clock() + __sched_clock_offset */ -static __read_mostly u64 raw_offset; -static __read_mostly u64 gtod_offset; +__read_mostly u64 __sched_clock_offset; +static __read_mostly u64 __gtod_offset; struct sched_clock_data { u64 tick_raw; @@ -131,17 +131,24 @@ static void __set_sched_clock_stable(void) /* * Attempt to make the (initial) unstable->stable transition continuous. */ - raw_offset = (scd->tick_gtod + gtod_offset) - (scd->tick_raw); + __sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw); printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n", - scd->tick_gtod, gtod_offset, - scd->tick_raw, raw_offset); + scd->tick_gtod, __gtod_offset, + scd->tick_raw, __sched_clock_offset); static_branch_enable(&__sched_clock_stable); tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); } -static void __clear_sched_clock_stable(struct work_struct *work) +static void __sched_clock_work(struct work_struct *work) +{ + static_branch_disable(&__sched_clock_stable); +} + +static DECLARE_WORK(sched_clock_work, __sched_clock_work); + +static void __clear_sched_clock_stable(void) { struct sched_clock_data *scd = this_scd(); @@ -154,17 +161,17 @@ static void __clear_sched_clock_stable(struct work_struct *work) * * Still do what we can. 
*/ - gtod_offset = (scd->tick_raw + raw_offset) - (scd->tick_gtod); + __gtod_offset = (scd->tick_raw + __sched_clock_offset) - (scd->tick_gtod); printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", - scd->tick_gtod, gtod_offset, - scd->tick_raw, raw_offset); + scd->tick_gtod, __gtod_offset, + scd->tick_raw, __sched_clock_offset); - static_branch_disable(&__sched_clock_stable); tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); -} -static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable); + if (sched_clock_stable()) + schedule_work(&sched_clock_work); +} void clear_sched_clock_stable(void) { @@ -173,7 +180,7 @@ void clear_sched_clock_stable(void) smp_mb(); /* matches sched_clock_init_late() */ if (sched_clock_running == 2) - schedule_work(&sched_clock_work); + __clear_sched_clock_stable(); } void sched_clock_init_late(void) @@ -214,7 +221,7 @@ static inline u64 wrap_max(u64 x, u64 y) */ static u64 sched_clock_local(struct sched_clock_data *scd) { - u64 now, clock, old_clock, min_clock, max_clock; + u64 now, clock, old_clock, min_clock, max_clock, gtod; s64 delta; again: @@ -231,9 +238,10 @@ again: * scd->tick_gtod + TICK_NSEC); */ - clock = scd->tick_gtod + gtod_offset + delta; - min_clock = wrap_max(scd->tick_gtod, old_clock); - max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC); + gtod = scd->tick_gtod + __gtod_offset; + clock = gtod + delta; + min_clock = wrap_max(gtod, old_clock); + max_clock = wrap_max(old_clock, gtod + TICK_NSEC); clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); @@ -317,7 +325,7 @@ u64 sched_clock_cpu(int cpu) u64 clock; if (sched_clock_stable()) - return sched_clock() + raw_offset; + return sched_clock() + __sched_clock_offset; if (unlikely(!sched_clock_running)) return 0ull; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index cd7cd489f739..54c577578da6 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -584,20 +584,14 @@ static int sugov_start(struct cpufreq_policy *policy) for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); + memset(sg_cpu, 0, sizeof(*sg_cpu)); sg_cpu->sg_policy = sg_policy; - if (policy_is_shared(policy)) { - sg_cpu->util = 0; - sg_cpu->max = 0; - sg_cpu->flags = SCHED_CPUFREQ_RT; - sg_cpu->last_update = 0; - sg_cpu->iowait_boost = 0; - sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; - cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, - sugov_update_shared); - } else { - cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, - sugov_update_single); - } + sg_cpu->flags = SCHED_CPUFREQ_RT; + sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, + policy_is_shared(policy) ? 
+ sugov_update_shared : + sugov_update_single); } return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index acf0a5a06da7..8c8714fcb53c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2133,9 +2133,12 @@ static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp, if (write) { if (*negp) return -EINVAL; + if (*lvalp > UINT_MAX) + return -EINVAL; *valp = *lvalp; } else { unsigned int val = *valp; + *negp = false; *lvalp = (unsigned long)val; } return 0; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 96fc3c043ad6..54e7a90db848 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4826,9 +4826,9 @@ static __init int test_ringbuffer(void) rb_data[cpu].cnt = cpu; rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu], "rbtester/%d", cpu); - if (WARN_ON(!rb_threads[cpu])) { + if (WARN_ON(IS_ERR(rb_threads[cpu]))) { pr_cont("FAILED\n"); - ret = -1; + ret = PTR_ERR(rb_threads[cpu]); goto out_free; } @@ -4838,9 +4838,9 @@ static __init int test_ringbuffer(void) /* Now create the rb hammer! */ rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer"); - if (WARN_ON(!rb_hammer)) { + if (WARN_ON(IS_ERR(rb_hammer))) { pr_cont("FAILED\n"); - ret = -1; + ret = PTR_ERR(rb_hammer); goto out_free; } |
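
A few standalone sketches of the patterns in this merge, for reference. The kernel/audit.c changes above replace the loose audit_pid/audit_nlk_portid/audit_sock globals with a single RCU-protected auditd_connection: readers peek at the connection state inside an RCU read-side section, while writers serialize on the embedded spinlock. A minimal sketch of that locking pattern follows; conn, conn_read_pid() and conn_update() are illustrative names, not from the patch.

    #include <linux/types.h>
    #include <linux/spinlock.h>
    #include <linux/rcupdate.h>

    struct conn_state {
            int pid;
            u32 portid;
            spinlock_t lock;        /* serializes writers */
    };

    static struct conn_state conn = {
            .lock = __SPIN_LOCK_UNLOCKED(conn.lock),
    };

    /* Readers: an RCU read-side section is enough for a consistent peek. */
    static int conn_read_pid(void)
    {
            int pid;

            rcu_read_lock();
            pid = conn.pid;
            rcu_read_unlock();
            return pid;
    }

    /* Writers: take the embedded spinlock so updates do not interleave. */
    static void conn_update(int pid, u32 portid)
    {
            unsigned long flags;

            spin_lock_irqsave(&conn.lock, flags);
            conn.pid = pid;
            conn.portid = portid;
            spin_unlock_irqrestore(&conn.lock, flags);
    }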
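
In kernel/bpf/hashtab.c, updating an existing key in a preallocated table no longer touches the freelist: each CPU keeps one spare htab_elem, the update takes the spare as the new element, and the displaced old element is parked as the next spare. A compact model of that exchange, with illustrative struct names rather than the patch's own:

    struct elem {
            int key;
            long val;
    };

    /* One spare element per CPU; shown here as a plain pointer slot. */
    struct spare_slot {
            struct elem *spare;
    };

    /* Take the spare as the new element; the element being replaced
     * becomes the next spare, so no allocator or freelist is touched. */
    static struct elem *swap_spare(struct spare_slot *slot, struct elem *old)
    {
            struct elem *fresh = slot->spare;

            slot->spare = old;
            return fresh;
    }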
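
The kernel/padata.c hunk widens reorder->lock so it covers the list_empty() check as well as the removal; testing emptiness outside the lock lets another CPU dequeue the entry in between. A generic before/after sketch of that race, assuming illustrative struct item and struct queue types:

    #include <linux/list.h>
    #include <linux/spinlock.h>

    struct item {
            struct list_head node;
    };

    struct queue {
            struct list_head list;
            spinlock_t lock;
    };

    /* Racy: the list can be emptied between the check and the removal. */
    static struct item *pop_racy(struct queue *q)
    {
            struct item *it = NULL;

            if (!list_empty(&q->list)) {
                    spin_lock(&q->lock);
                    it = list_first_entry(&q->list, struct item, node);
                    list_del_init(&it->node);
                    spin_unlock(&q->lock);
            }
            return it;
    }

    /* Fixed: the emptiness test and the removal form one critical section. */
    static struct item *pop_safe(struct queue *q)
    {
            struct item *it = NULL;

            spin_lock(&q->lock);
            if (!list_empty(&q->list)) {
                    it = list_first_entry(&q->list, struct item, node);
                    list_del_init(&it->node);
            }
            spin_unlock(&q->lock);
            return it;
    }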
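
The kernel/sysctl.c hunk adds the range check that was missing when a user-supplied unsigned long is stored into an unsigned int: on 64-bit kernels the long is wider, so values above UINT_MAX would otherwise truncate silently. The core of the check in isolation; store_uint() is an illustrative helper, not from the patch:

    #include <linux/kernel.h>
    #include <linux/errno.h>

    static int store_uint(unsigned long lval, unsigned int *valp)
    {
            if (lval > UINT_MAX)
                    return -EINVAL; /* reject rather than truncate */
            *valp = lval;
            return 0;
    }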
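
Finally, the kernel/trace/ring_buffer.c fix corrects a common error-handling slip: kthread_create() and kthread_run() return an ERR_PTR()-encoded error on failure, never NULL, so a NULL test can never fire. A short sketch of the corrected pattern; start_test_thread() and its arguments are illustrative:

    #include <linux/kthread.h>
    #include <linux/err.h>

    static int start_test_thread(int (*fn)(void *), void *data)
    {
            struct task_struct *tsk;

            tsk = kthread_run(fn, data, "rbtester/%d", 0);
            if (IS_ERR(tsk))                /* not: if (!tsk) */
                    return PTR_ERR(tsk);    /* propagate the real errno */
            return 0;
    }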