diff options
Diffstat (limited to 'kernel')
71 files changed, 2215 insertions, 1407 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index eb26e12c6c2a..12c679f769c6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -84,6 +84,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o @@ -115,8 +116,6 @@ obj-$(CONFIG_HAS_IOMEM) += memremap.o $(obj)/configs.o: $(obj)/config_data.h -# config_data.h contains the same information as ikconfig.h but gzipped. -# Info from config_data can be extracted from /proc/config* targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) diff --git a/kernel/audit.c b/kernel/audit.c index 67b9fbd871be..6e399bb69d7c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -107,7 +107,6 @@ static u32 audit_rate_limit; * When set to zero, this means unlimited. */ static u32 audit_backlog_limit = 64; #define AUDIT_BACKLOG_WAIT_TIME (60 * HZ) -static u32 audit_backlog_wait_time_master = AUDIT_BACKLOG_WAIT_TIME; static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; /* The identity of the user shutting down the audit system. */ @@ -138,11 +137,18 @@ static DEFINE_SPINLOCK(audit_freelist_lock); static int audit_freelist_count; static LIST_HEAD(audit_freelist); -static struct sk_buff_head audit_skb_queue; -/* queue of skbs to send to auditd when/if it comes back */ -static struct sk_buff_head audit_skb_hold_queue; +/* queue msgs to send via kauditd_task */ +static struct sk_buff_head audit_queue; +/* queue msgs due to temporary unicast send problems */ +static struct sk_buff_head audit_retry_queue; +/* queue msgs waiting for new auditd connection */ +static struct sk_buff_head audit_hold_queue; + +/* queue servicing thread */ static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); + +/* waitqueue for callers who are blocked on the audit backlog */ static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION, @@ -338,7 +344,7 @@ static int audit_set_backlog_limit(u32 limit) static int audit_set_backlog_wait_time(u32 timeout) { return audit_do_config_change("audit_backlog_wait_time", - &audit_backlog_wait_time_master, timeout); + &audit_backlog_wait_time, timeout); } static int audit_set_enabled(u32 state) @@ -365,29 +371,10 @@ static int audit_set_failure(u32 state) } /* - * Queue skbs to be sent to auditd when/if it comes back. These skbs should - * already have been sent via prink/syslog and so if these messages are dropped - * it is not a huge concern since we already passed the audit_log_lost() - * notification and stuff. This is just nice to get audit messages during - * boot before auditd is running or messages generated while auditd is stopped. - * This only holds messages is audit_default is set, aka booting with audit=1 - * or building your kernel that way. - */ -static void audit_hold_skb(struct sk_buff *skb) -{ - if (audit_default && - (!audit_backlog_limit || - skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit)) - skb_queue_tail(&audit_skb_hold_queue, skb); - else - kfree_skb(skb); -} - -/* * For one reason or another this nlh isn't getting delivered to the userspace * audit daemon, just send it to printk. */ -static void audit_printk_skb(struct sk_buff *skb) +static void kauditd_printk_skb(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); char *data = nlmsg_data(nlh); @@ -398,58 +385,123 @@ static void audit_printk_skb(struct sk_buff *skb) else audit_log_lost("printk limit exceeded"); } +} + +/** + * kauditd_hold_skb - Queue an audit record, waiting for auditd + * @skb: audit record + * + * Description: + * Queue the audit record, waiting for an instance of auditd. When this + * function is called we haven't given up yet on sending the record, but things + * are not looking good. The first thing we want to do is try to write the + * record via printk and then see if we want to try and hold on to the record + * and queue it, if we have room. If we want to hold on to the record, but we + * don't have room, record a record lost message. + */ +static void kauditd_hold_skb(struct sk_buff *skb) +{ + /* at this point it is uncertain if we will ever send this to auditd so + * try to send the message via printk before we go any further */ + kauditd_printk_skb(skb); + + /* can we just silently drop the message? */ + if (!audit_default) { + kfree_skb(skb); + return; + } + + /* if we have room, queue the message */ + if (!audit_backlog_limit || + skb_queue_len(&audit_hold_queue) < audit_backlog_limit) { + skb_queue_tail(&audit_hold_queue, skb); + return; + } - audit_hold_skb(skb); + /* we have no other options - drop the message */ + audit_log_lost("kauditd hold queue overflow"); + kfree_skb(skb); } -static void kauditd_send_skb(struct sk_buff *skb) +/** + * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd + * @skb: audit record + * + * Description: + * Not as serious as kauditd_hold_skb() as we still have a connected auditd, + * but for some reason we are having problems sending it audit records so + * queue the given record and attempt to resend. + */ +static void kauditd_retry_skb(struct sk_buff *skb) { - int err; - int attempts = 0; -#define AUDITD_RETRIES 5 + /* NOTE: because records should only live in the retry queue for a + * short period of time, before either being sent or moved to the hold + * queue, we don't currently enforce a limit on this queue */ + skb_queue_tail(&audit_retry_queue, skb); +} + +/** + * auditd_reset - Disconnect the auditd connection + * + * Description: + * Break the auditd/kauditd connection and move all the records in the retry + * queue into the hold queue in case auditd reconnects. The audit_cmd_mutex + * must be held when calling this function. + */ +static void auditd_reset(void) +{ + struct sk_buff *skb; + + /* break the connection */ + if (audit_sock) { + sock_put(audit_sock); + audit_sock = NULL; + } + audit_pid = 0; + audit_nlk_portid = 0; + + /* flush all of the retry queue to the hold queue */ + while ((skb = skb_dequeue(&audit_retry_queue))) + kauditd_hold_skb(skb); +} + +/** + * kauditd_send_unicast_skb - Send a record via unicast to auditd + * @skb: audit record + */ +static int kauditd_send_unicast_skb(struct sk_buff *skb) +{ + int rc; -restart: - /* take a reference in case we can't send it and we want to hold it */ + /* if we know nothing is connected, don't even try the netlink call */ + if (!audit_pid) + return -ECONNREFUSED; + + /* get an extra skb reference in case we fail to send */ skb_get(skb); - err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); - if (err < 0) { - pr_err("netlink_unicast sending to audit_pid=%d returned error: %d\n", - audit_pid, err); - if (audit_pid) { - if (err == -ECONNREFUSED || err == -EPERM - || ++attempts >= AUDITD_RETRIES) { - char s[32]; - - snprintf(s, sizeof(s), "audit_pid=%d reset", audit_pid); - audit_log_lost(s); - audit_pid = 0; - audit_sock = NULL; - } else { - pr_warn("re-scheduling(#%d) write to audit_pid=%d\n", - attempts, audit_pid); - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - goto restart; - } - } - /* we might get lucky and get this in the next auditd */ - audit_hold_skb(skb); - } else - /* drop the extra reference if sent ok */ + rc = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); + if (rc >= 0) { consume_skb(skb); + rc = 0; + } + + return rc; } /* - * kauditd_send_multicast_skb - send the skb to multicast userspace listeners + * kauditd_send_multicast_skb - Send a record to any multicast listeners + * @skb: audit record * + * Description: * This function doesn't consume an skb as might be expected since it has to * copy it anyways. */ -static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask) +static void kauditd_send_multicast_skb(struct sk_buff *skb) { - struct sk_buff *copy; - struct audit_net *aunet = net_generic(&init_net, audit_net_id); - struct sock *sock = aunet->nlsk; + struct sk_buff *copy; + struct audit_net *aunet = net_generic(&init_net, audit_net_id); + struct sock *sock = aunet->nlsk; + struct nlmsghdr *nlh; if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG)) return; @@ -464,74 +516,161 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask) * no reason for new multicast clients to continue with this * non-compliance. */ - copy = skb_copy(skb, gfp_mask); + copy = skb_copy(skb, GFP_KERNEL); if (!copy) return; + nlh = nlmsg_hdr(copy); + nlh->nlmsg_len = skb->len; - nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, gfp_mask); + nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); } -/* - * flush_hold_queue - empty the hold queue if auditd appears - * - * If auditd just started, drain the queue of messages already - * sent to syslog/printk. Remember loss here is ok. We already - * called audit_log_lost() if it didn't go out normally. so the - * race between the skb_dequeue and the next check for audit_pid - * doesn't matter. +/** + * kauditd_wake_condition - Return true when it is time to wake kauditd_thread * - * If you ever find kauditd to be too slow we can get a perf win - * by doing our own locking and keeping better track if there - * are messages in this queue. I don't see the need now, but - * in 5 years when I want to play with this again I'll see this - * note and still have no friggin idea what i'm thinking today. + * Description: + * This function is for use by the wait_event_freezable() call in + * kauditd_thread(). */ -static void flush_hold_queue(void) +static int kauditd_wake_condition(void) { - struct sk_buff *skb; - - if (!audit_default || !audit_pid) - return; - - skb = skb_dequeue(&audit_skb_hold_queue); - if (likely(!skb)) - return; + static int pid_last = 0; + int rc; + int pid = audit_pid; - while (skb && audit_pid) { - kauditd_send_skb(skb); - skb = skb_dequeue(&audit_skb_hold_queue); - } + /* wake on new messages or a change in the connected auditd */ + rc = skb_queue_len(&audit_queue) || (pid && pid != pid_last); + if (rc) + pid_last = pid; - /* - * if auditd just disappeared but we - * dequeued an skb we need to drop ref - */ - consume_skb(skb); + return rc; } static int kauditd_thread(void *dummy) { + int rc; + int auditd = 0; + int reschedule = 0; + struct sk_buff *skb; + struct nlmsghdr *nlh; + +#define UNICAST_RETRIES 5 +#define AUDITD_BAD(x,y) \ + ((x) == -ECONNREFUSED || (x) == -EPERM || ++(y) >= UNICAST_RETRIES) + + /* NOTE: we do invalidate the auditd connection flag on any sending + * errors, but we only "restore" the connection flag at specific places + * in the loop in order to help ensure proper ordering of audit + * records */ + set_freezable(); while (!kthread_should_stop()) { - struct sk_buff *skb; - - flush_hold_queue(); + /* NOTE: possible area for future improvement is to look at + * the hold and retry queues, since only this thread + * has access to these queues we might be able to do + * our own queuing and skip some/all of the locking */ + + /* NOTE: it might be a fun experiment to split the hold and + * retry queue handling to another thread, but the + * synchronization issues and other overhead might kill + * any performance gains */ + + /* attempt to flush the hold queue */ + while (auditd && (skb = skb_dequeue(&audit_hold_queue))) { + rc = kauditd_send_unicast_skb(skb); + if (rc) { + /* requeue to the same spot */ + skb_queue_head(&audit_hold_queue, skb); + + auditd = 0; + if (AUDITD_BAD(rc, reschedule)) { + mutex_lock(&audit_cmd_mutex); + auditd_reset(); + mutex_unlock(&audit_cmd_mutex); + reschedule = 0; + } + } else + /* we were able to send successfully */ + reschedule = 0; + } - skb = skb_dequeue(&audit_skb_queue); + /* attempt to flush the retry queue */ + while (auditd && (skb = skb_dequeue(&audit_retry_queue))) { + rc = kauditd_send_unicast_skb(skb); + if (rc) { + auditd = 0; + if (AUDITD_BAD(rc, reschedule)) { + kauditd_hold_skb(skb); + mutex_lock(&audit_cmd_mutex); + auditd_reset(); + mutex_unlock(&audit_cmd_mutex); + reschedule = 0; + } else + /* temporary problem (we hope), queue + * to the same spot and retry */ + skb_queue_head(&audit_retry_queue, skb); + } else + /* we were able to send successfully */ + reschedule = 0; + } + /* standard queue processing, try to be as quick as possible */ +quick_loop: + skb = skb_dequeue(&audit_queue); if (skb) { - if (!audit_backlog_limit || - (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit)) - wake_up(&audit_backlog_wait); - if (audit_pid) - kauditd_send_skb(skb); + /* setup the netlink header, see the comments in + * kauditd_send_multicast_skb() for length quirks */ + nlh = nlmsg_hdr(skb); + nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; + + /* attempt to send to any multicast listeners */ + kauditd_send_multicast_skb(skb); + + /* attempt to send to auditd, queue on failure */ + if (auditd) { + rc = kauditd_send_unicast_skb(skb); + if (rc) { + auditd = 0; + if (AUDITD_BAD(rc, reschedule)) { + mutex_lock(&audit_cmd_mutex); + auditd_reset(); + mutex_unlock(&audit_cmd_mutex); + reschedule = 0; + } + + /* move to the retry queue */ + kauditd_retry_skb(skb); + } else + /* everything is working so go fast! */ + goto quick_loop; + } else if (reschedule) + /* we are currently having problems, move to + * the retry queue */ + kauditd_retry_skb(skb); else - audit_printk_skb(skb); - continue; - } + /* dump the message via printk and hold it */ + kauditd_hold_skb(skb); + } else { + /* we have flushed the backlog so wake everyone */ + wake_up(&audit_backlog_wait); + + /* if everything is okay with auditd (if present), go + * to sleep until there is something new in the queue + * or we have a change in the connected auditd; + * otherwise simply reschedule to give things a chance + * to recover */ + if (reschedule) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } else + wait_event_freezable(kauditd_wait, + kauditd_wake_condition()); - wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue)); + /* update the auditd connection status */ + auditd = (audit_pid ? 1 : 0); + } } + return 0; } @@ -596,6 +735,7 @@ static int audit_send_reply_thread(void *arg) kfree(reply); return 0; } + /** * audit_send_reply - send an audit reply message via netlink * @request_skb: skb of request we are replying to (used to target the reply) @@ -832,16 +972,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err) return err; - /* As soon as there's any sign of userspace auditd, - * start kauditd to talk to it */ - if (!kauditd_task) { - kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); - if (IS_ERR(kauditd_task)) { - err = PTR_ERR(kauditd_task); - kauditd_task = NULL; - return err; - } - } seq = nlh->nlmsg_seq; data = nlmsg_data(nlh); @@ -855,9 +985,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.rate_limit = audit_rate_limit; s.backlog_limit = audit_backlog_limit; s.lost = atomic_read(&audit_lost); - s.backlog = skb_queue_len(&audit_skb_queue); + s.backlog = skb_queue_len(&audit_queue); s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; - s.backlog_wait_time = audit_backlog_wait_time_master; + s.backlog_wait_time = audit_backlog_wait_time; audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } @@ -897,9 +1027,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, audit_pid, 1); - audit_pid = new_pid; - audit_nlk_portid = NETLINK_CB(skb).portid; - audit_sock = skb->sk; + if (new_pid) { + if (audit_sock) + sock_put(audit_sock); + audit_pid = new_pid; + audit_nlk_portid = NETLINK_CB(skb).portid; + sock_hold(skb->sk); + audit_sock = skb->sk; + } else { + auditd_reset(); + } + wake_up_interruptible(&kauditd_wait); } if (s.mask & AUDIT_STATUS_RATE_LIMIT) { err = audit_set_rate_limit(s.rate_limit); @@ -1167,10 +1305,10 @@ static void __net_exit audit_net_exit(struct net *net) { struct audit_net *aunet = net_generic(net, audit_net_id); struct sock *sock = aunet->nlsk; - if (sock == audit_sock) { - audit_pid = 0; - audit_sock = NULL; - } + mutex_lock(&audit_cmd_mutex); + if (sock == audit_sock) + auditd_reset(); + mutex_unlock(&audit_cmd_mutex); netlink_kernel_release(sock); aunet->nlsk = NULL; @@ -1195,17 +1333,24 @@ static int __init audit_init(void) audit_default ? "enabled" : "disabled"); register_pernet_subsys(&audit_net_ops); - skb_queue_head_init(&audit_skb_queue); - skb_queue_head_init(&audit_skb_hold_queue); + skb_queue_head_init(&audit_queue); + skb_queue_head_init(&audit_retry_queue); + skb_queue_head_init(&audit_hold_queue); audit_initialized = AUDIT_INITIALIZED; audit_enabled = audit_default; audit_ever_enabled |= !!audit_default; - audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); - for (i = 0; i < AUDIT_INODE_BUCKETS; i++) INIT_LIST_HEAD(&audit_inode_hash[i]); + kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); + if (IS_ERR(kauditd_task)) { + int err = PTR_ERR(kauditd_task); + panic("audit: failed to start the kauditd thread (%d)\n", err); + } + + audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); + return 0; } __initcall(audit_init); @@ -1338,24 +1483,6 @@ static inline void audit_get_stamp(struct audit_context *ctx, } } -/* - * Wait for auditd to drain the queue a little - */ -static long wait_for_auditd(long sleep_time) -{ - DECLARE_WAITQUEUE(wait, current); - - if (audit_backlog_limit && - skb_queue_len(&audit_skb_queue) > audit_backlog_limit) { - add_wait_queue_exclusive(&audit_backlog_wait, &wait); - set_current_state(TASK_UNINTERRUPTIBLE); - sleep_time = schedule_timeout(sleep_time); - remove_wait_queue(&audit_backlog_wait, &wait); - } - - return sleep_time; -} - /** * audit_log_start - obtain an audit buffer * @ctx: audit_context (may be NULL) @@ -1374,12 +1501,9 @@ static long wait_for_auditd(long sleep_time) struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { - struct audit_buffer *ab = NULL; - struct timespec t; - unsigned int uninitialized_var(serial); - int reserve = 5; /* Allow atomic callers to go up to five - entries over the normal backlog limit */ - unsigned long timeout_start = jiffies; + struct audit_buffer *ab; + struct timespec t; + unsigned int uninitialized_var(serial); if (audit_initialized != AUDIT_INITIALIZED) return NULL; @@ -1387,38 +1511,48 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE))) return NULL; - if (gfp_mask & __GFP_DIRECT_RECLAIM) { - if (audit_pid && audit_pid == current->tgid) - gfp_mask &= ~__GFP_DIRECT_RECLAIM; - else - reserve = 0; - } - - while (audit_backlog_limit - && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { - if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) { - long sleep_time; + /* don't ever fail/sleep on these two conditions: + * 1. auditd generated record - since we need auditd to drain the + * queue; also, when we are checking for auditd, compare PIDs using + * task_tgid_vnr() since auditd_pid is set in audit_receive_msg() + * using a PID anchored in the caller's namespace + * 2. audit command message - record types 1000 through 1099 inclusive + * are command messages/records used to manage the kernel subsystem + * and the audit userspace, blocking on these messages could cause + * problems under load so don't do it (note: not all of these + * command types are valid as record types, but it is quicker to + * just check two ints than a series of ints in a if/switch stmt) */ + if (!((audit_pid && audit_pid == task_tgid_vnr(current)) || + (type >= 1000 && type <= 1099))) { + long sleep_time = audit_backlog_wait_time; + + while (audit_backlog_limit && + (skb_queue_len(&audit_queue) > audit_backlog_limit)) { + /* wake kauditd to try and flush the queue */ + wake_up_interruptible(&kauditd_wait); - sleep_time = timeout_start + audit_backlog_wait_time - jiffies; - if (sleep_time > 0) { - sleep_time = wait_for_auditd(sleep_time); - if (sleep_time > 0) - continue; + /* sleep if we are allowed and we haven't exhausted our + * backlog wait limit */ + if ((gfp_mask & __GFP_DIRECT_RECLAIM) && + (sleep_time > 0)) { + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(&audit_backlog_wait, + &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + sleep_time = schedule_timeout(sleep_time); + remove_wait_queue(&audit_backlog_wait, &wait); + } else { + if (audit_rate_check() && printk_ratelimit()) + pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n", + skb_queue_len(&audit_queue), + audit_backlog_limit); + audit_log_lost("backlog limit exceeded"); + return NULL; } } - if (audit_rate_check() && printk_ratelimit()) - pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n", - skb_queue_len(&audit_skb_queue), - audit_backlog_limit); - audit_log_lost("backlog limit exceeded"); - audit_backlog_wait_time = 0; - wake_up(&audit_backlog_wait); - return NULL; } - if (!reserve && !audit_backlog_wait_time) - audit_backlog_wait_time = audit_backlog_wait_time_master; - ab = audit_buffer_alloc(ctx, gfp_mask, type); if (!ab) { audit_log_lost("out of memory in audit_log_start"); @@ -1426,9 +1560,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, } audit_get_stamp(ab->ctx, &t, &serial); - audit_log_format(ab, "audit(%lu.%03lu:%u): ", t.tv_sec, t.tv_nsec/1000000, serial); + return ab; } @@ -1759,7 +1893,7 @@ void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, * @call_panic: optional pointer to int that will be updated if secid fails */ void audit_log_name(struct audit_context *context, struct audit_names *n, - struct path *path, int record_num, int *call_panic) + const struct path *path, int record_num, int *call_panic) { struct audit_buffer *ab; ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); @@ -1947,7 +2081,7 @@ EXPORT_SYMBOL(audit_log_task_info); * @operation: specific link operation * @link: the path that triggered the restriction */ -void audit_log_link_denied(const char *operation, struct path *link) +void audit_log_link_denied(const char *operation, const struct path *link) { struct audit_buffer *ab; struct audit_names *name; @@ -1978,10 +2112,10 @@ out: * audit_log_end - end one audit record * @ab: the audit_buffer * - * netlink_unicast() cannot be called inside an irq context because it blocks - * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed - * on a queue and a tasklet is scheduled to remove them from the queue outside - * the irq context. May be called in any context. + * We can not do a netlink send inside an irq context because it blocks (last + * arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed on a + * queue and a tasklet is scheduled to remove them from the queue outside the + * irq context. May be called in any context. */ void audit_log_end(struct audit_buffer *ab) { @@ -1990,28 +2124,8 @@ void audit_log_end(struct audit_buffer *ab) if (!audit_rate_check()) { audit_log_lost("rate limit exceeded"); } else { - struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); - - nlh->nlmsg_len = ab->skb->len; - kauditd_send_multicast_skb(ab->skb, ab->gfp_mask); - - /* - * The original kaudit unicast socket sends up messages with - * nlmsg_len set to the payload length rather than the entire - * message length. This breaks the standard set by netlink. - * The existing auditd daemon assumes this breakage. Fixing - * this would require co-ordinating a change in the established - * protocol between the kaudit kernel subsystem and the auditd - * userspace code. - */ - nlh->nlmsg_len -= NLMSG_HDRLEN; - - if (audit_pid) { - skb_queue_tail(&audit_skb_queue, ab->skb); - wake_up_interruptible(&kauditd_wait); - } else { - audit_printk_skb(ab->skb); - } + skb_queue_tail(&audit_queue, ab->skb); + wake_up_interruptible(&kauditd_wait); ab->skb = NULL; } audit_buffer_free(ab); diff --git a/kernel/audit.h b/kernel/audit.h index 431444c3708b..960d49c9db5e 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -212,7 +212,7 @@ extern void audit_copy_inode(struct audit_names *name, extern void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap); extern void audit_log_name(struct audit_context *context, - struct audit_names *n, struct path *path, + struct audit_names *n, const struct path *path, int record_num, int *call_panic); extern int audit_pid; diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index f84f8d06e1f6..7ea57e516029 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -74,7 +74,7 @@ int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_ } static void audit_update_mark(struct audit_fsnotify_mark *audit_mark, - struct inode *inode) + const struct inode *inode) { audit_mark->dev = inode ? inode->i_sb->s_dev : AUDIT_DEV_UNSET; audit_mark->ino = inode ? inode->i_ino : AUDIT_INO_UNSET; @@ -130,10 +130,9 @@ static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, c ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; - audit_log_format(ab, "auid=%u ses=%u op=", + audit_log_format(ab, "auid=%u ses=%u op=%s", from_kuid(&init_user_ns, audit_get_loginuid(current)), - audit_get_sessionid(current)); - audit_log_string(ab, op); + audit_get_sessionid(current), op); audit_log_format(ab, " path="); audit_log_untrustedstring(ab, audit_mark->path); audit_log_key(ab, rule->filterkey); @@ -168,11 +167,11 @@ static int audit_mark_handle_event(struct fsnotify_group *group, struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *dname, u32 cookie) { struct audit_fsnotify_mark *audit_mark; - struct inode *inode = NULL; + const struct inode *inode = NULL; audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark); @@ -180,10 +179,10 @@ static int audit_mark_handle_event(struct fsnotify_group *group, switch (data_type) { case (FSNOTIFY_EVENT_PATH): - inode = ((struct path *)data)->dentry->d_inode; + inode = ((const struct path *)data)->dentry->d_inode; break; case (FSNOTIFY_EVENT_INODE): - inode = (struct inode *)data; + inode = (const struct inode *)data; break; default: BUG(); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 25772476fa4a..8b1dde96a0fa 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -458,8 +458,7 @@ static void audit_tree_log_remove_rule(struct audit_krule *rule) ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; - audit_log_format(ab, "op="); - audit_log_string(ab, "remove_rule"); + audit_log_format(ab, "op=remove_rule"); audit_log_format(ab, " dir="); audit_log_untrustedstring(ab, rule->tree->pathname); audit_log_key(ab, rule->filterkey); @@ -948,7 +947,7 @@ static int audit_tree_handle_event(struct fsnotify_group *group, struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie) { return 0; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 0d302a87f21b..f79e4658433d 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -242,10 +242,9 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; - audit_log_format(ab, "auid=%u ses=%u op=", + audit_log_format(ab, "auid=%u ses=%u op=%s", from_kuid(&init_user_ns, audit_get_loginuid(current)), - audit_get_sessionid(current)); - audit_log_string(ab, op); + audit_get_sessionid(current), op); audit_log_format(ab, " path="); audit_log_untrustedstring(ab, w->path); audit_log_key(ab, r->filterkey); @@ -472,10 +471,10 @@ static int audit_watch_handle_event(struct fsnotify_group *group, struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *dname, u32 cookie) { - struct inode *inode; + const struct inode *inode; struct audit_parent *parent; parent = container_of(inode_mark, struct audit_parent, mark); @@ -484,10 +483,10 @@ static int audit_watch_handle_event(struct fsnotify_group *group, switch (data_type) { case (FSNOTIFY_EVENT_PATH): - inode = d_backing_inode(((struct path *)data)->dentry); + inode = d_backing_inode(((const struct path *)data)->dentry); break; case (FSNOTIFY_EVENT_INODE): - inode = (struct inode *)data; + inode = (const struct inode *)data; break; default: BUG(); @@ -548,8 +547,8 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) exe_file = get_task_exe_file(tsk); if (!exe_file) return 0; - ino = exe_file->f_inode->i_ino; - dev = exe_file->f_inode->i_sb->s_dev; + ino = file_inode(exe_file)->i_ino; + dev = file_inode(exe_file)->i_sb->s_dev; fput(exe_file); return audit_mark_compare(mark, ino, dev); } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 85d9cac497e4..880519d6cf2a 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -363,6 +363,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) case AUDIT_EXIT: case AUDIT_SUCCESS: case AUDIT_INODE: + case AUDIT_SESSIONID: /* bit ops are only useful on syscall args */ if (f->op == Audit_bitmask || f->op == Audit_bittest) return -EINVAL; @@ -476,6 +477,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, if (!gid_valid(f->gid)) goto exit_free; break; + case AUDIT_SESSIONID: case AUDIT_ARCH: entry->rule.arch_f = f; break; @@ -1074,8 +1076,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re return; audit_log_format(ab, "auid=%u ses=%u" ,loginuid, sessionid); audit_log_task_context(ab); - audit_log_format(ab, " op="); - audit_log_string(ab, action); + audit_log_format(ab, " op=%s", action); audit_log_key(ab, rule->filterkey); audit_log_format(ab, " list=%d res=%d", rule->listnr, res); audit_log_end(ab); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 2cd5256dbff7..cf1fa43512c1 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -446,6 +446,7 @@ static int audit_filter_rules(struct task_struct *tsk, const struct cred *cred; int i, need_sid = 1; u32 sid; + unsigned int sessionid; cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); @@ -508,6 +509,10 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_FSGID: result = audit_gid_comparator(cred->fsgid, f->op, f->gid); break; + case AUDIT_SESSIONID: + sessionid = audit_get_sessionid(current); + result = audit_comparator(sessionid, f->op, f->val); + break; case AUDIT_PERS: result = audit_comparator(tsk->personality, f->op, f->val); break; @@ -1000,7 +1005,7 @@ static void audit_log_execve_info(struct audit_context *context, long len_rem; long len_full; long len_buf; - long len_abuf; + long len_abuf = 0; long len_tmp; bool require_data; bool encode; @@ -2025,8 +2030,11 @@ int audit_set_loginuid(kuid_t loginuid) goto out; /* are we setting or clearing? */ - if (uid_valid(loginuid)) + if (uid_valid(loginuid)) { sessionid = (unsigned int)atomic_inc_return(&session_id); + if (unlikely(sessionid == (unsigned int)-1)) + sessionid = (unsigned int)atomic_inc_return(&session_id); + } task->sessionid = sessionid; task->loginuid = loginuid; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 83e0d153b0b4..1eb4f1303756 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -105,19 +105,29 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; + u32 pages, delta; + int ret; BUG_ON(fp_old == NULL); size = round_up(size, PAGE_SIZE); - if (size <= fp_old->pages * PAGE_SIZE) + pages = size / PAGE_SIZE; + if (pages <= fp_old->pages) return fp_old; + delta = pages - fp_old->pages; + ret = __bpf_prog_charge(fp_old->aux->user, delta); + if (ret) + return NULL; + fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); - if (fp != NULL) { + if (fp == NULL) { + __bpf_prog_uncharge(fp_old->aux->user, delta); + } else { kmemcheck_annotate_bitfield(fp, meta); memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); - fp->pages = size / PAGE_SIZE; + fp->pages = pages; fp->aux->prog = fp; /* We keep fp->aux from fp_old around in the new @@ -136,28 +146,29 @@ void __bpf_prog_free(struct bpf_prog *fp) vfree(fp); } -#define SHA_BPF_RAW_SIZE \ - round_up(MAX_BPF_SIZE + sizeof(__be64) + 1, SHA_MESSAGE_BYTES) - -/* Called under verifier mutex. */ -void bpf_prog_calc_digest(struct bpf_prog *fp) +int bpf_prog_calc_digest(struct bpf_prog *fp) { const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64); - static u32 ws[SHA_WORKSPACE_WORDS]; - static u8 raw[SHA_BPF_RAW_SIZE]; - struct bpf_insn *dst = (void *)raw; + u32 raw_size = bpf_prog_digest_scratch_size(fp); + u32 ws[SHA_WORKSPACE_WORDS]; u32 i, bsize, psize, blocks; + struct bpf_insn *dst; bool was_ld_map; - u8 *todo = raw; + u8 *raw, *todo; __be32 *result; __be64 *bits; + raw = vmalloc(raw_size); + if (!raw) + return -ENOMEM; + sha_init(fp->digest); memset(ws, 0, sizeof(ws)); /* We need to take out the map fd for the digest calculation * since they are unstable from user space side. */ + dst = (void *)raw; for (i = 0, was_ld_map = false; i < fp->len; i++) { dst[i] = fp->insnsi[i]; if (!was_ld_map && @@ -177,12 +188,13 @@ void bpf_prog_calc_digest(struct bpf_prog *fp) } } - psize = fp->len * sizeof(struct bpf_insn); - memset(&raw[psize], 0, sizeof(raw) - psize); + psize = bpf_prog_insn_size(fp); + memset(&raw[psize], 0, raw_size - psize); raw[psize++] = 0x80; bsize = round_up(psize, SHA_MESSAGE_BYTES); blocks = bsize / SHA_MESSAGE_BYTES; + todo = raw; if (bsize - psize >= sizeof(__be64)) { bits = (__be64 *)(todo + bsize - sizeof(__be64)); } else { @@ -199,6 +211,9 @@ void bpf_prog_calc_digest(struct bpf_prog *fp) result = (__force __be32 *)fp->digest; for (i = 0; i < SHA_DIGEST_WORDS; i++) result[i] = cpu_to_be32(fp->digest[i]); + + vfree(raw); + return 0; } static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4819ec9d95f6..e89acea22ecf 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -615,19 +615,39 @@ static void free_used_maps(struct bpf_prog_aux *aux) kfree(aux->used_maps); } +int __bpf_prog_charge(struct user_struct *user, u32 pages) +{ + unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + unsigned long user_bufs; + + if (user) { + user_bufs = atomic_long_add_return(pages, &user->locked_vm); + if (user_bufs > memlock_limit) { + atomic_long_sub(pages, &user->locked_vm); + return -EPERM; + } + } + + return 0; +} + +void __bpf_prog_uncharge(struct user_struct *user, u32 pages) +{ + if (user) + atomic_long_sub(pages, &user->locked_vm); +} + static int bpf_prog_charge_memlock(struct bpf_prog *prog) { struct user_struct *user = get_current_user(); - unsigned long memlock_limit; - - memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + int ret; - atomic_long_add(prog->pages, &user->locked_vm); - if (atomic_long_read(&user->locked_vm) > memlock_limit) { - atomic_long_sub(prog->pages, &user->locked_vm); + ret = __bpf_prog_charge(user, prog->pages); + if (ret) { free_uid(user); - return -EPERM; + return ret; } + prog->aux->user = user; return 0; } @@ -636,7 +656,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) { struct user_struct *user = prog->aux->user; - atomic_long_sub(prog->pages, &user->locked_vm); + __bpf_prog_uncharge(user, prog->pages); free_uid(user); } @@ -811,7 +831,7 @@ static int bpf_prog_load(union bpf_attr *attr) err = -EFAULT; if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns), - prog->len * sizeof(struct bpf_insn)) != 0) + bpf_prog_insn_size(prog)) != 0) goto free_prog; prog->orig_prog = NULL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d28f9a3380a9..83ed2f8f6f22 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -462,14 +462,19 @@ static void init_reg_state(struct bpf_reg_state *regs) regs[BPF_REG_1].type = PTR_TO_CTX; } -static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) +static void __mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) { - BUG_ON(regno >= MAX_BPF_REG); regs[regno].type = UNKNOWN_VALUE; regs[regno].id = 0; regs[regno].imm = 0; } +static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) +{ + BUG_ON(regno >= MAX_BPF_REG); + __mark_reg_unknown_value(regs, regno); +} + static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) { regs[regno].min_value = BPF_REGISTER_MIN_RANGE; @@ -1970,8 +1975,13 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { reg->type = type; + /* We don't need id from this point onwards anymore, thus we + * should better reset it, so that state pruning has chances + * to take effect. + */ + reg->id = 0; if (type == UNKNOWN_VALUE) - mark_reg_unknown_value(regs, regno); + __mark_reg_unknown_value(regs, regno); } } @@ -1982,16 +1992,16 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, enum bpf_reg_type type) { struct bpf_reg_state *regs = state->regs; + u32 id = regs[regno].id; int i; for (i = 0; i < MAX_BPF_REG; i++) - mark_map_reg(regs, i, regs[regno].id, type); + mark_map_reg(regs, i, id, type); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) continue; - mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, - regs[regno].id, type); + mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, type); } } @@ -2926,6 +2936,10 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) int insn_cnt = env->prog->len; int i, j, err; + err = bpf_prog_calc_digest(env->prog); + if (err) + return err; + for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) { @@ -3173,8 +3187,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) log_level = 0; } - bpf_prog_calc_digest(env->prog); - ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; diff --git a/kernel/capability.c b/kernel/capability.c index 00411c82dac5..4984e1f552eb 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -457,6 +457,19 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns, EXPORT_SYMBOL(file_ns_capable); /** + * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode? + * @ns: The user namespace in question + * @inode: The inode in question + * + * Return true if the inode uid and gid are within the namespace. + */ +bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode) +{ + return kuid_has_mapping(ns, inode->i_uid) && + kgid_has_mapping(ns, inode->i_gid); +} + +/** * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @inode: The inode in question * @cap: The capability in question @@ -469,7 +482,26 @@ bool capable_wrt_inode_uidgid(const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); - return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) && - kgid_has_mapping(ns, inode->i_gid); + return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode); } EXPORT_SYMBOL(capable_wrt_inode_uidgid); + +/** + * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace + * @tsk: The task that may be ptraced + * @ns: The user namespace to search for CAP_SYS_PTRACE in + * + * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE + * in the specified user namespace. + */ +bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) +{ + int ret = 0; /* An absent tracer adds no restrictions */ + const struct cred *cred; + rcu_read_lock(); + cred = rcu_dereference(tsk->ptracer_cred); + if (cred) + ret = security_capable_noaudit(cred, ns, CAP_SYS_PTRACE); + rcu_read_unlock(); + return (ret == 0); +} diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0874e2edd275..79517e5549f1 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -598,11 +598,11 @@ return_normal: /* * Wait for the other CPUs to be notified and be waiting for us: */ - time_left = loops_per_jiffy * HZ; + time_left = MSEC_PER_SEC; while (kgdb_do_roundup && --time_left && (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) != online_cpus) - cpu_relax(); + udelay(1000); if (!time_left) pr_crit("Timed out waiting for secondary CPUs.\n"); diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index fc1ef736253c..e74be38245ad 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -30,6 +30,7 @@ char kdb_prompt_str[CMD_BUFLEN]; int kdb_trap_printk; +int kdb_printf_cpu = -1; static int kgdb_transition_check(char *buffer) { @@ -554,31 +555,26 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) int linecount; int colcount; int logging, saved_loglevel = 0; - int saved_trap_printk; - int got_printf_lock = 0; int retlen = 0; int fnd, len; + int this_cpu, old_cpu; char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; char *moreprompt = "more> "; struct console *c = console_drivers; - static DEFINE_SPINLOCK(kdb_printf_lock); unsigned long uninitialized_var(flags); - preempt_disable(); - saved_trap_printk = kdb_trap_printk; - kdb_trap_printk = 0; - /* Serialize kdb_printf if multiple cpus try to write at once. * But if any cpu goes recursive in kdb, just print the output, * even if it is interleaved with any other text. */ - if (!KDB_STATE(PRINTF_LOCK)) { - KDB_STATE_SET(PRINTF_LOCK); - spin_lock_irqsave(&kdb_printf_lock, flags); - got_printf_lock = 1; - atomic_inc(&kdb_event); - } else { - __acquire(kdb_printf_lock); + local_irq_save(flags); + this_cpu = smp_processor_id(); + for (;;) { + old_cpu = cmpxchg(&kdb_printf_cpu, -1, this_cpu); + if (old_cpu == -1 || old_cpu == this_cpu) + break; + + cpu_relax(); } diag = kdbgetintenv("LINES", &linecount); @@ -697,7 +693,7 @@ kdb_printit: * Write to all consoles. */ retlen = strlen(kdb_buffer); - cp = (char *) printk_skip_level(kdb_buffer); + cp = (char *) printk_skip_headers(kdb_buffer); if (!dbg_kdb_mode && kgdb_connected) { gdbstub_msg_write(cp, retlen - (cp - kdb_buffer)); } else { @@ -847,16 +843,9 @@ kdb_print_out: suspend_grep = 0; /* end of what may have been a recursive call */ if (logging) console_loglevel = saved_loglevel; - if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) { - got_printf_lock = 0; - spin_unlock_irqrestore(&kdb_printf_lock, flags); - KDB_STATE_CLEAR(PRINTF_LOCK); - atomic_dec(&kdb_event); - } else { - __release(kdb_printf_lock); - } - kdb_trap_printk = saved_trap_printk; - preempt_enable(); + /* kdb_printf_cpu locked the code above. */ + smp_store_release(&kdb_printf_cpu, old_cpu); + local_irq_restore(flags); return retlen; } diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 2a20c0dfdafc..ca183919d302 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -60,7 +60,6 @@ int kdb_grep_trailing; * Kernel debugger state flags */ int kdb_flags; -atomic_t kdb_event; /* * kdb_lock protects updates to kdb_initial_cpu. Used to diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 75014d7f4568..fc224fbcf954 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -132,7 +132,6 @@ extern int kdb_state; #define KDB_STATE_PAGER 0x00000400 /* pager is available */ #define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching * back to initial cpu */ -#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */ #define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */ #define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */ #define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been diff --git a/kernel/events/core.c b/kernel/events/core.c index faf073d0287f..ab15509fab8c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6698,7 +6698,7 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter, struct file *file, unsigned long offset, unsigned long size) { - if (filter->inode != file->f_inode) + if (filter->inode != file_inode(file)) return false; if (filter->offset > offset + size) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f9ec9add2164..215871bda3a2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -301,7 +301,7 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, - &vma); + &vma, NULL); if (ret <= 0) return ret; @@ -1712,7 +1712,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * essentially a kernel access to the memory. */ result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, - NULL); + NULL, NULL); if (result < 0) return result; diff --git a/kernel/fork.c b/kernel/fork.c index 5957cf8b4c4b..869b8ccc00bf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -229,7 +229,7 @@ static inline void free_thread_stack(struct task_struct *tsk) } local_irq_restore(flags); - vfree(tsk->stack); + vfree_atomic(tsk->stack); return; } #endif @@ -747,7 +747,8 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) #endif } -static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) +static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + struct user_namespace *user_ns) { mm->mmap = NULL; mm->mm_rb = RB_ROOT; @@ -787,6 +788,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) if (init_new_context(p, mm)) goto fail_nocontext; + mm->user_ns = get_user_ns(user_ns); return mm; fail_nocontext: @@ -832,7 +834,7 @@ struct mm_struct *mm_alloc(void) return NULL; memset(mm, 0, sizeof(*mm)); - return mm_init(mm, current); + return mm_init(mm, current, current_user_ns()); } /* @@ -847,6 +849,7 @@ void __mmdrop(struct mm_struct *mm) destroy_context(mm); mmu_notifier_mm_destroy(mm); check_mm(mm); + put_user_ns(mm->user_ns); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -1128,7 +1131,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) memcpy(mm, oldmm, sizeof(*mm)); - if (!mm_init(mm, tsk)) + if (!mm_init(mm, tsk, mm->user_ns)) goto fail_nomem; err = dup_mmap(mm, oldmm); @@ -1544,7 +1547,7 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ - p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); + p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE); p->flags |= PF_FORKNOEXEC; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 2b59c82cc3e1..40c07e4fa116 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -106,7 +106,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * complain: */ if (sysctl_hung_task_warnings) { - sysctl_hung_task_warnings--; + if (sysctl_hung_task_warnings > 0) + sysctl_hung_task_warnings--; pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", t->comm, t->pid, timeout); pr_err(" %s %s %.*s\n", diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 9be9bda7c1f9..4544b115f5eb 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -37,10 +37,10 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk) { - int n, nodes; + int n, nodes = 0; /* Calculate the number of nodes in the supplied affinity mask */ - for (n = 0, nodes = 0; n < num_online_nodes(); n++) { + for_each_online_node(n) { if (cpumask_intersects(mask, cpumask_of_node(n))) { node_set(n, *nodemsk); nodes++; @@ -82,7 +82,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) nodes = get_nodes_in_cpumask(cpu_online_mask, &nodemsk); /* - * If the number of nodes in the mask is less than or equal the + * If the number of nodes in the mask is greater than or equal the * number of vectors we just spread the vectors across the nodes. */ if (affv <= nodes) { diff --git a/kernel/kcov.c b/kernel/kcov.c index 3cbb0c879705..cc2fa35ca480 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -1,11 +1,16 @@ #define pr_fmt(fmt) "kcov: " fmt #define DISABLE_BRANCH_PROFILING +#include <linux/atomic.h> #include <linux/compiler.h> +#include <linux/errno.h> +#include <linux/export.h> #include <linux/types.h> #include <linux/file.h> #include <linux/fs.h> +#include <linux/init.h> #include <linux/mm.h> +#include <linux/preempt.h> #include <linux/printk.h> #include <linux/sched.h> #include <linux/slab.h> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 561675589511..5617cc412444 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -441,6 +441,8 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, while (hole_end <= crashk_res.end) { unsigned long i; + cond_resched(); + if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) break; /* See if I overlap any of the segments */ @@ -1467,9 +1469,6 @@ static int __init crash_save_vmcoreinfo_init(void) #endif VMCOREINFO_NUMBER(PG_head_mask); VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); -#ifdef CONFIG_X86 - VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); -#endif #ifdef CONFIG_HUGETLB_PAGE VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); #endif diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 037c321c5618..0c2df7f73792 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -428,25 +428,65 @@ static int locate_mem_hole_callback(u64 start, u64 end, void *arg) return locate_mem_hole_bottom_up(start, end, kbuf); } -/* - * Helper function for placing a buffer in a kexec segment. This assumes - * that kexec_mutex is held. +/** + * arch_kexec_walk_mem - call func(data) on free memory regions + * @kbuf: Context info for the search. Also passed to @func. + * @func: Function to call for each memory region. + * + * Return: The memory walk will stop when func returns a non-zero value + * and that value will be returned. If all free regions are visited without + * func returning non-zero, then zero will be returned. + */ +int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, + int (*func)(u64, u64, void *)) +{ + if (kbuf->image->type == KEXEC_TYPE_CRASH) + return walk_iomem_res_desc(crashk_res.desc, + IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, + crashk_res.start, crashk_res.end, + kbuf, func); + else + return walk_system_ram_res(0, ULONG_MAX, kbuf, func); +} + +/** + * kexec_locate_mem_hole - find free memory for the purgatory or the next kernel + * @kbuf: Parameters for the memory search. + * + * On success, kbuf->mem will have the start address of the memory region found. + * + * Return: 0 on success, negative errno on error. */ -int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, - unsigned long memsz, unsigned long buf_align, - unsigned long buf_min, unsigned long buf_max, - bool top_down, unsigned long *load_addr) +int kexec_locate_mem_hole(struct kexec_buf *kbuf) +{ + int ret; + + ret = arch_kexec_walk_mem(kbuf, locate_mem_hole_callback); + + return ret == 1 ? 0 : -EADDRNOTAVAIL; +} + +/** + * kexec_add_buffer - place a buffer in a kexec segment + * @kbuf: Buffer contents and memory parameters. + * + * This function assumes that kexec_mutex is held. + * On successful return, @kbuf->mem will have the physical address of + * the buffer in memory. + * + * Return: 0 on success, negative errno on error. + */ +int kexec_add_buffer(struct kexec_buf *kbuf) { struct kexec_segment *ksegment; - struct kexec_buf buf, *kbuf; int ret; /* Currently adding segment this way is allowed only in file mode */ - if (!image->file_mode) + if (!kbuf->image->file_mode) return -EINVAL; - if (image->nr_segments >= KEXEC_SEGMENT_MAX) + if (kbuf->image->nr_segments >= KEXEC_SEGMENT_MAX) return -EINVAL; /* @@ -456,45 +496,27 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, * logic goes through list of segments to make sure there are * no destination overlaps. */ - if (!list_empty(&image->control_pages)) { + if (!list_empty(&kbuf->image->control_pages)) { WARN_ON(1); return -EINVAL; } - memset(&buf, 0, sizeof(struct kexec_buf)); - kbuf = &buf; - kbuf->image = image; - kbuf->buffer = buffer; - kbuf->bufsz = bufsz; - - kbuf->memsz = ALIGN(memsz, PAGE_SIZE); - kbuf->buf_align = max(buf_align, PAGE_SIZE); - kbuf->buf_min = buf_min; - kbuf->buf_max = buf_max; - kbuf->top_down = top_down; + /* Ensure minimum alignment needed for segments. */ + kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE); + kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE); /* Walk the RAM ranges and allocate a suitable range for the buffer */ - if (image->type == KEXEC_TYPE_CRASH) - ret = walk_iomem_res_desc(crashk_res.desc, - IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, - crashk_res.start, crashk_res.end, kbuf, - locate_mem_hole_callback); - else - ret = walk_system_ram_res(0, -1, kbuf, - locate_mem_hole_callback); - if (ret != 1) { - /* A suitable memory range could not be found for buffer */ - return -EADDRNOTAVAIL; - } + ret = kexec_locate_mem_hole(kbuf); + if (ret) + return ret; /* Found a suitable memory range */ - ksegment = &image->segment[image->nr_segments]; + ksegment = &kbuf->image->segment[kbuf->image->nr_segments]; ksegment->kbuf = kbuf->buffer; ksegment->bufsz = kbuf->bufsz; ksegment->mem = kbuf->mem; ksegment->memsz = kbuf->memsz; - image->nr_segments++; - *load_addr = ksegment->mem; + kbuf->image->nr_segments++; return 0; } @@ -616,13 +638,15 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, unsigned long max, int top_down) { struct purgatory_info *pi = &image->purgatory_info; - unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; - unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; + unsigned long align, bss_align, bss_sz, bss_pad; + unsigned long entry, load_addr, curr_load_addr, bss_addr, offset; unsigned char *buf_addr, *src; int i, ret = 0, entry_sidx = -1; const Elf_Shdr *sechdrs_c; Elf_Shdr *sechdrs = NULL; - void *purgatory_buf = NULL; + struct kexec_buf kbuf = { .image = image, .bufsz = 0, .buf_align = 1, + .buf_min = min, .buf_max = max, + .top_down = top_down }; /* * sechdrs_c points to section headers in purgatory and are read @@ -688,9 +712,7 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, } /* Determine how much memory is needed to load relocatable object. */ - buf_align = 1; bss_align = 1; - buf_sz = 0; bss_sz = 0; for (i = 0; i < pi->ehdr->e_shnum; i++) { @@ -699,10 +721,10 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, align = sechdrs[i].sh_addralign; if (sechdrs[i].sh_type != SHT_NOBITS) { - if (buf_align < align) - buf_align = align; - buf_sz = ALIGN(buf_sz, align); - buf_sz += sechdrs[i].sh_size; + if (kbuf.buf_align < align) + kbuf.buf_align = align; + kbuf.bufsz = ALIGN(kbuf.bufsz, align); + kbuf.bufsz += sechdrs[i].sh_size; } else { /* bss section */ if (bss_align < align) @@ -714,32 +736,31 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, /* Determine the bss padding required to align bss properly */ bss_pad = 0; - if (buf_sz & (bss_align - 1)) - bss_pad = bss_align - (buf_sz & (bss_align - 1)); + if (kbuf.bufsz & (bss_align - 1)) + bss_pad = bss_align - (kbuf.bufsz & (bss_align - 1)); - memsz = buf_sz + bss_pad + bss_sz; + kbuf.memsz = kbuf.bufsz + bss_pad + bss_sz; /* Allocate buffer for purgatory */ - purgatory_buf = vzalloc(buf_sz); - if (!purgatory_buf) { + kbuf.buffer = vzalloc(kbuf.bufsz); + if (!kbuf.buffer) { ret = -ENOMEM; goto out; } - if (buf_align < bss_align) - buf_align = bss_align; + if (kbuf.buf_align < bss_align) + kbuf.buf_align = bss_align; /* Add buffer to segment list */ - ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, - buf_align, min, max, top_down, - &pi->purgatory_load_addr); + ret = kexec_add_buffer(&kbuf); if (ret) goto out; + pi->purgatory_load_addr = kbuf.mem; /* Load SHF_ALLOC sections */ - buf_addr = purgatory_buf; + buf_addr = kbuf.buffer; load_addr = curr_load_addr = pi->purgatory_load_addr; - bss_addr = load_addr + buf_sz + bss_pad; + bss_addr = load_addr + kbuf.bufsz + bss_pad; for (i = 0; i < pi->ehdr->e_shnum; i++) { if (!(sechdrs[i].sh_flags & SHF_ALLOC)) @@ -785,11 +806,11 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, * Used later to identify which section is purgatory and skip it * from checksumming. */ - pi->purgatory_buf = purgatory_buf; + pi->purgatory_buf = kbuf.buffer; return ret; out: vfree(sechdrs); - vfree(purgatory_buf); + vfree(kbuf.buffer); return ret; } diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 0a52315d9c62..4cef7e4706b0 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -20,22 +20,6 @@ struct kexec_sha_region { unsigned long len; }; -/* - * Keeps track of buffer parameters as provided by caller for requesting - * memory placement of buffer. - */ -struct kexec_buf { - struct kimage *image; - char *buffer; - unsigned long bufsz; - unsigned long mem; - unsigned long memsz; - unsigned long buf_align; - unsigned long buf_min; - unsigned long buf_max; - bool top_down; /* allocate from top of memory hole */ -}; - void kimage_file_post_load_cleanup(struct kimage *image); #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } diff --git a/kernel/kthread.c b/kernel/kthread.c index 956495f0efaf..2318fba86277 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -261,7 +261,8 @@ static void create_kthread(struct kthread_create_info *create) } } -static struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), +static __printf(4, 0) +struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], va_list args) @@ -635,7 +636,7 @@ repeat: } EXPORT_SYMBOL_GPL(kthread_worker_fn); -static struct kthread_worker * +static __printf(3, 0) struct kthread_worker * __kthread_create_worker(int cpu, unsigned int flags, const char namefmt[], va_list args) { diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7bd265f6b098..7c38f8f3d97b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3191,7 +3191,7 @@ print_lock_nested_lock_not_held(struct task_struct *curr, return 0; } -static int __lock_is_held(struct lockdep_map *lock); +static int __lock_is_held(struct lockdep_map *lock, int read); /* * This gets called for every mutex_lock*()/spin_lock*() operation. @@ -3332,7 +3332,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, } chain_key = iterate_chain_key(chain_key, class_idx); - if (nest_lock && !__lock_is_held(nest_lock)) + if (nest_lock && !__lock_is_held(nest_lock, -1)) return print_lock_nested_lock_not_held(curr, hlock, ip); if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) @@ -3579,7 +3579,7 @@ found_it: return 1; } -static int __lock_is_held(struct lockdep_map *lock) +static int __lock_is_held(struct lockdep_map *lock, int read) { struct task_struct *curr = current; int i; @@ -3587,8 +3587,12 @@ static int __lock_is_held(struct lockdep_map *lock) for (i = 0; i < curr->lockdep_depth; i++) { struct held_lock *hlock = curr->held_locks + i; - if (match_held_lock(hlock, lock)) - return 1; + if (match_held_lock(hlock, lock)) { + if (read == -1 || hlock->read == read) + return 1; + + return 0; + } } return 0; @@ -3772,7 +3776,7 @@ void lock_release(struct lockdep_map *lock, int nested, } EXPORT_SYMBOL_GPL(lock_release); -int lock_is_held(struct lockdep_map *lock) +int lock_is_held_type(struct lockdep_map *lock, int read) { unsigned long flags; int ret = 0; @@ -3784,13 +3788,13 @@ int lock_is_held(struct lockdep_map *lock) check_flags(flags); current->lockdep_recursion = 1; - ret = __lock_is_held(lock); + ret = __lock_is_held(lock, read); current->lockdep_recursion = 0; raw_local_irq_restore(flags); return ret; } -EXPORT_SYMBOL_GPL(lock_is_held); +EXPORT_SYMBOL_GPL(lock_is_held_type); struct pin_cookie lock_pin_lock(struct lockdep_map *lock) { diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index eb0a599fcf58..e852be4851fc 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -108,11 +108,7 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf, /* * Get the counter ID stored in file->f_inode->i_private */ - if (!file->f_inode) { - WARN_ON_ONCE(1); - return -EBADF; - } - counter = (long)(file->f_inode->i_private); + counter = (long)file_inode(file)->i_private; if (counter >= qstat_num) return -EBADF; @@ -177,11 +173,7 @@ static ssize_t qstat_write(struct file *file, const char __user *user_buf, /* * Get the counter ID stored in file->f_inode->i_private */ - if (!file->f_inode) { - WARN_ON_ONCE(1); - return -EBADF; - } - if ((long)(file->f_inode->i_private) != qstat_reset_cnts) + if ((long)file_inode(file)->i_private != qstat_reset_cnts) return count; for_each_possible_cpu(cpu) { diff --git a/kernel/module.c b/kernel/module.c index 0e54d5bf0097..f7482db0f843 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -313,8 +313,11 @@ struct load_info { } index; }; -/* We require a truly strong try_module_get(): 0 means failure due to - ongoing or failed initialization etc. */ +/* + * We require a truly strong try_module_get(): 0 means success. + * Otherwise an error is returned due to ongoing or failed + * initialization etc. + */ static inline int strong_try_module_get(struct module *mod) { BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED); @@ -330,7 +333,7 @@ static inline void add_taint_module(struct module *mod, unsigned flag, enum lockdep_ok lockdep_ok) { add_taint(flag, lockdep_ok); - mod->taints |= (1U << flag); + set_bit(flag, &mod->taints); } /* @@ -1138,24 +1141,13 @@ static inline int module_unload_init(struct module *mod) static size_t module_flags_taint(struct module *mod, char *buf) { size_t l = 0; + int i; + + for (i = 0; i < TAINT_FLAGS_COUNT; i++) { + if (taint_flags[i].module && test_bit(i, &mod->taints)) + buf[l++] = taint_flags[i].true; + } - if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) - buf[l++] = 'P'; - if (mod->taints & (1 << TAINT_OOT_MODULE)) - buf[l++] = 'O'; - if (mod->taints & (1 << TAINT_FORCED_MODULE)) - buf[l++] = 'F'; - if (mod->taints & (1 << TAINT_CRAP)) - buf[l++] = 'C'; - if (mod->taints & (1 << TAINT_UNSIGNED_MODULE)) - buf[l++] = 'E'; - if (mod->taints & (1 << TAINT_LIVEPATCH)) - buf[l++] = 'K'; - /* - * TAINT_FORCED_RMMOD: could be added. - * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't - * apply to modules. - */ return l; } @@ -1911,6 +1903,9 @@ static void frob_writable_data(const struct module_layout *layout, /* livepatching wants to disable read-only so it can frob module. */ void module_disable_ro(const struct module *mod) { + if (!rodata_enabled) + return; + frob_text(&mod->core_layout, set_memory_rw); frob_rodata(&mod->core_layout, set_memory_rw); frob_ro_after_init(&mod->core_layout, set_memory_rw); @@ -1920,6 +1915,9 @@ void module_disable_ro(const struct module *mod) void module_enable_ro(const struct module *mod, bool after_init) { + if (!rodata_enabled) + return; + frob_text(&mod->core_layout, set_memory_ro); frob_rodata(&mod->core_layout, set_memory_ro); frob_text(&mod->init_layout, set_memory_ro); @@ -1952,6 +1950,9 @@ void set_all_modules_text_rw(void) { struct module *mod; + if (!rodata_enabled) + return; + mutex_lock(&module_mutex); list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) @@ -1968,9 +1969,18 @@ void set_all_modules_text_ro(void) { struct module *mod; + if (!rodata_enabled) + return; + mutex_lock(&module_mutex); list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) + /* + * Ignore going modules since it's possible that ro + * protection has already been disabled, otherwise we'll + * run into protection faults at module deallocation. + */ + if (mod->state == MODULE_STATE_UNFORMED || + mod->state == MODULE_STATE_GOING) continue; frob_text(&mod->core_layout, set_memory_ro); @@ -1981,10 +1991,12 @@ void set_all_modules_text_ro(void) static void disable_ro_nx(const struct module_layout *layout) { - frob_text(layout, set_memory_rw); - frob_rodata(layout, set_memory_rw); + if (rodata_enabled) { + frob_text(layout, set_memory_rw); + frob_rodata(layout, set_memory_rw); + frob_ro_after_init(layout, set_memory_rw); + } frob_rodata(layout, set_memory_x); - frob_ro_after_init(layout, set_memory_rw); frob_ro_after_init(layout, set_memory_x); frob_writable_data(layout, set_memory_x); } @@ -3709,6 +3721,7 @@ static int load_module(struct load_info *info, const char __user *uargs, sysfs_cleanup: mod_sysfs_teardown(mod); coming_cleanup: + mod->state = MODULE_STATE_GOING; blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); klp_module_going(mod); @@ -4042,6 +4055,10 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, } #endif /* CONFIG_KALLSYMS */ +/* Maximum number of characters written by module_flags() */ +#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4) + +/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */ static char *module_flags(struct module *mod, char *buf) { int bx = 0; @@ -4086,7 +4103,7 @@ static void m_stop(struct seq_file *m, void *p) static int m_show(struct seq_file *m, void *p) { struct module *mod = list_entry(p, struct module, list); - char buf[8]; + char buf[MODULE_FLAGS_BUF_SIZE]; /* We always ignore unformed modules. */ if (mod->state == MODULE_STATE_UNFORMED) @@ -4257,7 +4274,7 @@ EXPORT_SYMBOL_GPL(__module_text_address); void print_modules(void) { struct module *mod; - char buf[8]; + char buf[MODULE_FLAGS_BUF_SIZE]; printk(KERN_DEFAULT "Modules linked in:"); /* Most callers should already have preempt disabled, but make sure */ diff --git a/kernel/padata.c b/kernel/padata.c index 7848f0566403..05316c9f32da 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -64,15 +64,11 @@ static int padata_cpu_hash(struct parallel_data *pd) static void padata_parallel_worker(struct work_struct *parallel_work) { struct padata_parallel_queue *pqueue; - struct parallel_data *pd; - struct padata_instance *pinst; LIST_HEAD(local_list); local_bh_disable(); pqueue = container_of(parallel_work, struct padata_parallel_queue, work); - pd = pqueue->pd; - pinst = pd->pinst; spin_lock(&pqueue->parallel.lock); list_replace_init(&pqueue->parallel.list, &local_list); diff --git a/kernel/panic.c b/kernel/panic.c index e6480e20379e..c51edaa04fce 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -298,30 +298,27 @@ void panic(const char *fmt, ...) EXPORT_SYMBOL(panic); - -struct tnt { - u8 bit; - char true; - char false; -}; - -static const struct tnt tnts[] = { - { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, - { TAINT_FORCED_MODULE, 'F', ' ' }, - { TAINT_CPU_OUT_OF_SPEC, 'S', ' ' }, - { TAINT_FORCED_RMMOD, 'R', ' ' }, - { TAINT_MACHINE_CHECK, 'M', ' ' }, - { TAINT_BAD_PAGE, 'B', ' ' }, - { TAINT_USER, 'U', ' ' }, - { TAINT_DIE, 'D', ' ' }, - { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, - { TAINT_WARN, 'W', ' ' }, - { TAINT_CRAP, 'C', ' ' }, - { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, - { TAINT_OOT_MODULE, 'O', ' ' }, - { TAINT_UNSIGNED_MODULE, 'E', ' ' }, - { TAINT_SOFTLOCKUP, 'L', ' ' }, - { TAINT_LIVEPATCH, 'K', ' ' }, +/* + * TAINT_FORCED_RMMOD could be a per-module flag but the module + * is being removed anyway. + */ +const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { + { 'P', 'G', true }, /* TAINT_PROPRIETARY_MODULE */ + { 'F', ' ', true }, /* TAINT_FORCED_MODULE */ + { 'S', ' ', false }, /* TAINT_CPU_OUT_OF_SPEC */ + { 'R', ' ', false }, /* TAINT_FORCED_RMMOD */ + { 'M', ' ', false }, /* TAINT_MACHINE_CHECK */ + { 'B', ' ', false }, /* TAINT_BAD_PAGE */ + { 'U', ' ', false }, /* TAINT_USER */ + { 'D', ' ', false }, /* TAINT_DIE */ + { 'A', ' ', false }, /* TAINT_OVERRIDDEN_ACPI_TABLE */ + { 'W', ' ', false }, /* TAINT_WARN */ + { 'C', ' ', true }, /* TAINT_CRAP */ + { 'I', ' ', false }, /* TAINT_FIRMWARE_WORKAROUND */ + { 'O', ' ', true }, /* TAINT_OOT_MODULE */ + { 'E', ' ', true }, /* TAINT_UNSIGNED_MODULE */ + { 'L', ' ', false }, /* TAINT_SOFTLOCKUP */ + { 'K', ' ', true }, /* TAINT_LIVEPATCH */ }; /** @@ -348,16 +345,16 @@ static const struct tnt tnts[] = { */ const char *print_tainted(void) { - static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ")]; + static char buf[TAINT_FLAGS_COUNT + sizeof("Tainted: ")]; if (tainted_mask) { char *s; int i; s = buf + sprintf(buf, "Tainted: "); - for (i = 0; i < ARRAY_SIZE(tnts); i++) { - const struct tnt *t = &tnts[i]; - *s++ = test_bit(t->bit, &tainted_mask) ? + for (i = 0; i < TAINT_FLAGS_COUNT; i++) { + const struct taint_flag *t = &taint_flags[i]; + *s++ = test_bit(i, &tainted_mask) ? t->true : t->false; } *s = 0; diff --git a/kernel/power/main.c b/kernel/power/main.c index 281a697fd458..d401c21136d1 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -78,6 +78,78 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, power_attr(pm_async); +#ifdef CONFIG_SUSPEND +static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + char *s = buf; + suspend_state_t i; + + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) + if (mem_sleep_states[i]) { + const char *label = mem_sleep_states[i]; + + if (mem_sleep_current == i) + s += sprintf(s, "[%s] ", label); + else + s += sprintf(s, "%s ", label); + } + + /* Convert the last space to a newline if needed. */ + if (s != buf) + *(s-1) = '\n'; + + return (s - buf); +} + +static suspend_state_t decode_suspend_state(const char *buf, size_t n) +{ + suspend_state_t state; + char *p; + int len; + + p = memchr(buf, '\n', n); + len = p ? p - buf : n; + + for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { + const char *label = mem_sleep_states[state]; + + if (label && len == strlen(label) && !strncmp(buf, label, len)) + return state; + } + + return PM_SUSPEND_ON; +} + +static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + suspend_state_t state; + int error; + + error = pm_autosleep_lock(); + if (error) + return error; + + if (pm_autosleep_state() > PM_SUSPEND_ON) { + error = -EBUSY; + goto out; + } + + state = decode_suspend_state(buf, n); + if (state < PM_SUSPEND_MAX && state > PM_SUSPEND_ON) + mem_sleep_current = state; + else + error = -EINVAL; + + out: + pm_autosleep_unlock(); + return error ? error : n; +} + +power_attr(mem_sleep); +#endif /* CONFIG_SUSPEND */ + #ifdef CONFIG_PM_DEBUG int pm_test_level = TEST_NONE; @@ -368,12 +440,16 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, } state = decode_state(buf, n); - if (state < PM_SUSPEND_MAX) + if (state < PM_SUSPEND_MAX) { + if (state == PM_SUSPEND_MEM) + state = mem_sleep_current; + error = pm_suspend(state); - else if (state == PM_SUSPEND_MAX) + } else if (state == PM_SUSPEND_MAX) { error = hibernate(); - else + } else { error = -EINVAL; + } out: pm_autosleep_unlock(); @@ -485,6 +561,9 @@ static ssize_t autosleep_store(struct kobject *kobj, && strcmp(buf, "off") && strcmp(buf, "off\n")) return -EINVAL; + if (state == PM_SUSPEND_MEM) + state = mem_sleep_current; + error = pm_autosleep_set_state(state); return error ? error : n; } @@ -602,6 +681,9 @@ static struct attribute * g[] = { #ifdef CONFIG_PM_SLEEP &pm_async_attr.attr, &wakeup_count_attr.attr, +#ifdef CONFIG_SUSPEND + &mem_sleep_attr.attr, +#endif #ifdef CONFIG_PM_AUTOSLEEP &autosleep_attr.attr, #endif diff --git a/kernel/power/power.h b/kernel/power/power.h index 56d1d0dedf76..1dfa0da827d3 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -189,11 +189,15 @@ extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *); #ifdef CONFIG_SUSPEND /* kernel/power/suspend.c */ -extern const char *pm_labels[]; +extern const char * const pm_labels[]; extern const char *pm_states[]; +extern const char *mem_sleep_states[]; +extern suspend_state_t mem_sleep_current; extern int suspend_devices_and_enter(suspend_state_t state); #else /* !CONFIG_SUSPEND */ +#define mem_sleep_current PM_SUSPEND_ON + static inline int suspend_devices_and_enter(suspend_state_t state) { return -ENOSYS; diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 168ff442ebde..97b0df71303e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -482,16 +482,7 @@ void pm_qos_update_request(struct pm_qos_request *req, return; } - /* - * This function may be called very early during boot, for example, - * from of_clk_init(), where irq needs to stay disabled. - * cancel_delayed_work_sync() assumes that irq is enabled on - * invocation and re-enables it on return. Avoid calling it until - * workqueue is initialized. - */ - if (keventd_up()) - cancel_delayed_work_sync(&req->work); - + cancel_delayed_work_sync(&req->work); __pm_qos_update_request(req, new_value); } EXPORT_SYMBOL_GPL(pm_qos_update_request); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6ccb08f57fcb..f67ceb7768b8 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -32,8 +32,21 @@ #include "power.h" -const char *pm_labels[] = { "mem", "standby", "freeze", NULL }; +const char * const pm_labels[] = { + [PM_SUSPEND_FREEZE] = "freeze", + [PM_SUSPEND_STANDBY] = "standby", + [PM_SUSPEND_MEM] = "mem", +}; const char *pm_states[PM_SUSPEND_MAX]; +static const char * const mem_sleep_labels[] = { + [PM_SUSPEND_FREEZE] = "s2idle", + [PM_SUSPEND_STANDBY] = "shallow", + [PM_SUSPEND_MEM] = "deep", +}; +const char *mem_sleep_states[PM_SUSPEND_MAX]; + +suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE; +suspend_state_t mem_sleep_default = PM_SUSPEND_MAX; unsigned int pm_suspend_global_flags; EXPORT_SYMBOL_GPL(pm_suspend_global_flags); @@ -110,30 +123,32 @@ static bool valid_state(suspend_state_t state) return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); } -/* - * If this is set, the "mem" label always corresponds to the deepest sleep state - * available, the "standby" label corresponds to the second deepest sleep state - * available (if any), and the "freeze" label corresponds to the remaining - * available sleep state (if there is one). - */ -static bool relative_states; - void __init pm_states_init(void) { + /* "mem" and "freeze" are always present in /sys/power/state. */ + pm_states[PM_SUSPEND_MEM] = pm_labels[PM_SUSPEND_MEM]; + pm_states[PM_SUSPEND_FREEZE] = pm_labels[PM_SUSPEND_FREEZE]; /* - * freeze state should be supported even without any suspend_ops, - * initialize pm_states accordingly here + * Suspend-to-idle should be supported even without any suspend_ops, + * initialize mem_sleep_states[] accordingly here. */ - pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2]; + mem_sleep_states[PM_SUSPEND_FREEZE] = mem_sleep_labels[PM_SUSPEND_FREEZE]; } -static int __init sleep_states_setup(char *str) +static int __init mem_sleep_default_setup(char *str) { - relative_states = !strncmp(str, "1", 1); + suspend_state_t state; + + for (state = PM_SUSPEND_FREEZE; state <= PM_SUSPEND_MEM; state++) + if (mem_sleep_labels[state] && + !strcmp(str, mem_sleep_labels[state])) { + mem_sleep_default = state; + break; + } + return 1; } - -__setup("relative_sleep_states=", sleep_states_setup); +__setup("mem_sleep_default=", mem_sleep_default_setup); /** * suspend_set_ops - Set the global suspend method table. @@ -141,21 +156,21 @@ __setup("relative_sleep_states=", sleep_states_setup); */ void suspend_set_ops(const struct platform_suspend_ops *ops) { - suspend_state_t i; - int j = 0; - lock_system_sleep(); suspend_ops = ops; - for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) - if (valid_state(i)) { - pm_states[i] = pm_labels[j++]; - } else if (!relative_states) { - pm_states[i] = NULL; - j++; - } - pm_states[PM_SUSPEND_FREEZE] = pm_labels[j]; + if (valid_state(PM_SUSPEND_STANDBY)) { + mem_sleep_states[PM_SUSPEND_STANDBY] = mem_sleep_labels[PM_SUSPEND_STANDBY]; + pm_states[PM_SUSPEND_STANDBY] = pm_labels[PM_SUSPEND_STANDBY]; + if (mem_sleep_default == PM_SUSPEND_STANDBY) + mem_sleep_current = PM_SUSPEND_STANDBY; + } + if (valid_state(PM_SUSPEND_MEM)) { + mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM]; + if (mem_sleep_default >= PM_SUSPEND_MEM) + mem_sleep_current = PM_SUSPEND_MEM; + } unlock_system_sleep(); } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a3b1e617bcdc..32e0c232efba 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -307,7 +307,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; - hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block, + hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block, swsusp_header, NULL); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { @@ -317,7 +317,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) swsusp_header->flags = flags; if (flags & SF_CRC32_MODE) swsusp_header->crc32 = handle->crc32; - error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, + error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { printk(KERN_ERR "PM: Swap header not found!\n"); @@ -397,7 +397,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) } else { src = buf; } - return hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, offset, src, hb); + return hib_submit_io(REQ_OP_WRITE, REQ_SYNC, offset, src, hb); } static void release_swap_writer(struct swap_map_handle *handle) @@ -1000,8 +1000,7 @@ static int get_swap_reader(struct swap_map_handle *handle, return -ENOMEM; } - error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, - tmp->map, NULL); + error = hib_submit_io(REQ_OP_READ, 0, offset, tmp->map, NULL); if (error) { release_swap_reader(handle); return error; @@ -1025,7 +1024,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, buf, hb); + error = hib_submit_io(REQ_OP_READ, 0, offset, buf, hb); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { @@ -1534,7 +1533,7 @@ int swsusp_check(void) if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); - error = hib_submit_io(REQ_OP_READ, READ_SYNC, + error = hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block, swsusp_header, NULL); if (error) @@ -1543,7 +1542,7 @@ int swsusp_check(void) if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); /* Reset swap signature now */ - error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, + error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { @@ -1588,11 +1587,11 @@ int swsusp_unmark(void) { int error; - hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block, + hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block, swsusp_header, NULL); if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); - error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, + error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index 16bab471c7e2..f011aaef583c 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -67,7 +67,8 @@ static int vprintk_nmi(const char *fmt, va_list args) again: len = atomic_read(&s->len); - if (len >= sizeof(s->buffer)) { + /* The trailing '\0' is not counted into len. */ + if (len >= sizeof(s->buffer) - 1) { atomic_inc(&nmi_message_lost); return 0; } @@ -79,7 +80,7 @@ again: if (!len) smp_rmb(); - add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); /* * Do it once again if the buffer has been flushed in the meantime. @@ -113,16 +114,51 @@ static void printk_nmi_flush_line(const char *text, int len) } -/* - * printk one line from the temporary buffer from @start index until - * and including the @end index. - */ -static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s, - int start, int end) +/* printk part of the temporary buffer line by line */ +static int printk_nmi_flush_buffer(const char *start, size_t len) { - const char *buf = s->buffer + start; + const char *c, *end; + bool header; + + c = start; + end = start + len; + header = true; + + /* Print line by line. */ + while (c < end) { + if (*c == '\n') { + printk_nmi_flush_line(start, c - start + 1); + start = ++c; + header = true; + continue; + } + + /* Handle continuous lines or missing new line. */ + if ((c + 1 < end) && printk_get_level(c)) { + if (header) { + c = printk_skip_level(c); + continue; + } + + printk_nmi_flush_line(start, c - start); + start = c++; + header = true; + continue; + } + + header = false; + c++; + } - printk_nmi_flush_line(buf, (end - start) + 1); + /* Check if there was a partial line. Ignore pure header. */ + if (start < end && !header) { + static const char newline[] = KERN_CONT "\n"; + + printk_nmi_flush_line(start, end - start); + printk_nmi_flush_line(newline, strlen(newline)); + } + + return len; } /* @@ -135,8 +171,8 @@ static void __printk_nmi_flush(struct irq_work *work) __RAW_SPIN_LOCK_INITIALIZER(read_lock); struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work); unsigned long flags; - size_t len, size; - int i, last_i; + size_t len; + int i; /* * The lock has two functions. First, one reader has to flush all @@ -154,12 +190,14 @@ more: /* * This is just a paranoid check that nobody has manipulated * the buffer an unexpected way. If we printed something then - * @len must only increase. + * @len must only increase. Also it should never overflow the + * buffer size. */ - if (i && i >= len) { + if ((i && i >= len) || len > sizeof(s->buffer)) { const char *msg = "printk_nmi_flush: internal error\n"; printk_nmi_flush_line(msg, strlen(msg)); + len = 0; } if (!len) @@ -167,22 +205,7 @@ more: /* Make sure that data has been written up to the @len */ smp_rmb(); - - size = min(len, sizeof(s->buffer)); - last_i = i; - - /* Print line by line. */ - for (; i < size; i++) { - if (s->buffer[i] == '\n') { - printk_nmi_flush_seq_line(s, last_i, i); - last_i = i + 1; - } - } - /* Check if there was a partial line. */ - if (last_i < size) { - printk_nmi_flush_seq_line(s, last_i, size - 1); - printk_nmi_flush_line("\n", strlen("\n")); - } + i += printk_nmi_flush_buffer(s->buffer + i, len - i); /* * Check that nothing has got added in the meantime and truncate diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 577f2288d19f..e2cdd87e7a63 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -356,7 +356,6 @@ DECLARE_WAIT_QUEUE_HEAD(log_wait); /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; -static enum log_flags syslog_prev; static size_t syslog_partial; /* index and sequence number of the first record stored in the buffer */ @@ -370,7 +369,6 @@ static u32 log_next_idx; /* the next printk record to write to the console */ static u64 console_seq; static u32 console_idx; -static enum log_flags console_prev; /* the next printk record to read after the last 'clear' command */ static u64 clear_seq; @@ -639,27 +637,15 @@ static void append_char(char **pp, char *e, char c) } static ssize_t msg_print_ext_header(char *buf, size_t size, - struct printk_log *msg, u64 seq, - enum log_flags prev_flags) + struct printk_log *msg, u64 seq) { u64 ts_usec = msg->ts_nsec; - char cont = '-'; do_div(ts_usec, 1000); - /* - * If we couldn't merge continuation line fragments during the print, - * export the stored flags to allow an optional external merge of the - * records. Merging the records isn't always neccessarily correct, like - * when we hit a race during printing. In most cases though, it produces - * better readable output. 'c' in the record flags mark the first - * fragment of a line, '+' the following. - */ - if (msg->flags & LOG_CONT) - cont = (prev_flags & LOG_CONT) ? '+' : 'c'; - return scnprintf(buf, size, "%u,%llu,%llu,%c;", - (msg->facility << 3) | msg->level, seq, ts_usec, cont); + (msg->facility << 3) | msg->level, seq, ts_usec, + msg->flags & LOG_CONT ? 'c' : '-'); } static ssize_t msg_print_ext_body(char *buf, size_t size, @@ -714,7 +700,6 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, struct devkmsg_user { u64 seq; u32 idx; - enum log_flags prev; struct ratelimit_state rs; struct mutex lock; char buf[CONSOLE_EXT_LOG_MAX]; @@ -748,7 +733,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) return -ENOMEM; buf[len] = '\0'; - if (copy_from_iter(buf, len, from) != len) { + if (!copy_from_iter_full(buf, len, from)) { kfree(buf); return -EFAULT; } @@ -824,12 +809,11 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, msg = log_from_idx(user->idx); len = msg_print_ext_header(user->buf, sizeof(user->buf), - msg, user->seq, user->prev); + msg, user->seq); len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len, log_dict(msg), msg->dict_len, log_text(msg), msg->text_len); - user->prev = msg->flags; user->idx = log_next(user->idx); user->seq++; raw_spin_unlock_irq(&logbuf_lock); @@ -1210,26 +1194,12 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog, char *buf) return len; } -static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, - bool syslog, char *buf, size_t size) +static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) { const char *text = log_text(msg); size_t text_size = msg->text_len; - bool prefix = true; - bool newline = true; size_t len = 0; - if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)) - prefix = false; - - if (msg->flags & LOG_CONT) { - if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE)) - prefix = false; - - if (!(msg->flags & LOG_NEWLINE)) - newline = false; - } - do { const char *next = memchr(text, '\n', text_size); size_t text_len; @@ -1247,22 +1217,17 @@ static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, text_len + 1 >= size - len) break; - if (prefix) - len += print_prefix(msg, syslog, buf + len); + len += print_prefix(msg, syslog, buf + len); memcpy(buf + len, text, text_len); len += text_len; - if (next || newline) - buf[len++] = '\n'; + buf[len++] = '\n'; } else { /* SYSLOG_ACTION_* buffer size only calculation */ - if (prefix) - len += print_prefix(msg, syslog, NULL); + len += print_prefix(msg, syslog, NULL); len += text_len; - if (next || newline) - len++; + len++; } - prefix = true; text = next; } while (text); @@ -1288,7 +1253,6 @@ static int syslog_print(char __user *buf, int size) /* messages are gone, move to first one */ syslog_seq = log_first_seq; syslog_idx = log_first_idx; - syslog_prev = 0; syslog_partial = 0; } if (syslog_seq == log_next_seq) { @@ -1298,13 +1262,11 @@ static int syslog_print(char __user *buf, int size) skip = syslog_partial; msg = log_from_idx(syslog_idx); - n = msg_print_text(msg, syslog_prev, true, text, - LOG_LINE_MAX + PREFIX_MAX); + n = msg_print_text(msg, true, text, LOG_LINE_MAX + PREFIX_MAX); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ syslog_idx = log_next(syslog_idx); syslog_seq++; - syslog_prev = msg->flags; n -= syslog_partial; syslog_partial = 0; } else if (!len){ @@ -1347,7 +1309,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) u64 next_seq; u64 seq; u32 idx; - enum log_flags prev; /* * Find first record that fits, including all following records, @@ -1355,12 +1316,10 @@ static int syslog_print_all(char __user *buf, int size, bool clear) */ seq = clear_seq; idx = clear_idx; - prev = 0; while (seq < log_next_seq) { struct printk_log *msg = log_from_idx(idx); - len += msg_print_text(msg, prev, true, NULL, 0); - prev = msg->flags; + len += msg_print_text(msg, true, NULL, 0); idx = log_next(idx); seq++; } @@ -1368,12 +1327,10 @@ static int syslog_print_all(char __user *buf, int size, bool clear) /* move first record forward until length fits into the buffer */ seq = clear_seq; idx = clear_idx; - prev = 0; while (len > size && seq < log_next_seq) { struct printk_log *msg = log_from_idx(idx); - len -= msg_print_text(msg, prev, true, NULL, 0); - prev = msg->flags; + len -= msg_print_text(msg, true, NULL, 0); idx = log_next(idx); seq++; } @@ -1386,7 +1343,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) struct printk_log *msg = log_from_idx(idx); int textlen; - textlen = msg_print_text(msg, prev, true, text, + textlen = msg_print_text(msg, true, text, LOG_LINE_MAX + PREFIX_MAX); if (textlen < 0) { len = textlen; @@ -1394,7 +1351,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) } idx = log_next(idx); seq++; - prev = msg->flags; raw_spin_unlock_irq(&logbuf_lock); if (copy_to_user(buf + len, text, textlen)) @@ -1407,7 +1363,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) /* messages are gone, move to next one */ seq = log_first_seq; idx = log_first_idx; - prev = 0; } } } @@ -1508,7 +1463,6 @@ int do_syslog(int type, char __user *buf, int len, int source) /* messages are gone, move to first one */ syslog_seq = log_first_seq; syslog_idx = log_first_idx; - syslog_prev = 0; syslog_partial = 0; } if (source == SYSLOG_FROM_PROC) { @@ -1521,16 +1475,14 @@ int do_syslog(int type, char __user *buf, int len, int source) } else { u64 seq = syslog_seq; u32 idx = syslog_idx; - enum log_flags prev = syslog_prev; error = 0; while (seq < log_next_seq) { struct printk_log *msg = log_from_idx(idx); - error += msg_print_text(msg, prev, true, NULL, 0); + error += msg_print_text(msg, true, NULL, 0); idx = log_next(idx); seq++; - prev = msg->flags; } error -= syslog_partial; } @@ -1631,46 +1583,25 @@ static inline void printk_delay(void) static struct cont { char buf[LOG_LINE_MAX]; size_t len; /* length == 0 means unused buffer */ - size_t cons; /* bytes written to console */ struct task_struct *owner; /* task of first print*/ u64 ts_nsec; /* time of first print */ u8 level; /* log level of first message */ u8 facility; /* log facility of first message */ enum log_flags flags; /* prefix, newline flags */ - bool flushed:1; /* buffer sealed and committed */ } cont; static void cont_flush(void) { - if (cont.flushed) - return; if (cont.len == 0) return; - if (cont.cons) { - /* - * If a fragment of this line was directly flushed to the - * console; wait for the console to pick up the rest of the - * line. LOG_NOCONS suppresses a duplicated output. - */ - log_store(cont.facility, cont.level, cont.flags | LOG_NOCONS, - cont.ts_nsec, NULL, 0, cont.buf, cont.len); - cont.flushed = true; - } else { - /* - * If no fragment of this line ever reached the console, - * just submit it to the store and free the buffer. - */ - log_store(cont.facility, cont.level, cont.flags, 0, - NULL, 0, cont.buf, cont.len); - cont.len = 0; - } + + log_store(cont.facility, cont.level, cont.flags, cont.ts_nsec, + NULL, 0, cont.buf, cont.len); + cont.len = 0; } static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len) { - if (cont.len && cont.flushed) - return false; - /* * If ext consoles are present, flush and skip in-kernel * continuation. See nr_ext_console_drivers definition. Also, if @@ -1687,8 +1618,6 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char * cont.owner = current; cont.ts_nsec = local_clock(); cont.flags = flags; - cont.cons = 0; - cont.flushed = false; } memcpy(cont.buf + cont.len, text, len); @@ -1707,34 +1636,6 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char * return true; } -static size_t cont_print_text(char *text, size_t size) -{ - size_t textlen = 0; - size_t len; - - if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) { - textlen += print_time(cont.ts_nsec, text); - size -= textlen; - } - - len = cont.len - cont.cons; - if (len > 0) { - if (len+1 > size) - len = size-1; - memcpy(text + textlen, cont.buf + cont.cons, len); - textlen += len; - cont.cons = cont.len; - } - - if (cont.flushed) { - if (cont.flags & LOG_NEWLINE) - text[textlen++] = '\n'; - /* got everything, release buffer */ - cont.len = 0; - } - return textlen; -} - static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) { /* @@ -1926,7 +1827,8 @@ int vprintk_default(const char *fmt, va_list args) int r; #ifdef CONFIG_KGDB_KDB - if (unlikely(kdb_trap_printk)) { + /* Allow to pass printk() to kdb but avoid a recursion. */ + if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) { r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); return r; } @@ -1980,33 +1882,24 @@ static u64 syslog_seq; static u32 syslog_idx; static u64 console_seq; static u32 console_idx; -static enum log_flags syslog_prev; static u64 log_first_seq; static u32 log_first_idx; static u64 log_next_seq; -static enum log_flags console_prev; -static struct cont { - size_t len; - size_t cons; - u8 level; - bool flushed:1; -} cont; static char *log_text(const struct printk_log *msg) { return NULL; } static char *log_dict(const struct printk_log *msg) { return NULL; } static struct printk_log *log_from_idx(u32 idx) { return NULL; } static u32 log_next(u32 idx) { return 0; } static ssize_t msg_print_ext_header(char *buf, size_t size, - struct printk_log *msg, u64 seq, - enum log_flags prev_flags) { return 0; } + struct printk_log *msg, + u64 seq) { return 0; } static ssize_t msg_print_ext_body(char *buf, size_t size, char *dict, size_t dict_len, char *text, size_t text_len) { return 0; } static void call_console_drivers(int level, const char *ext_text, size_t ext_len, const char *text, size_t len) {} -static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, +static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) { return 0; } -static size_t cont_print_text(char *text, size_t size) { return 0; } static bool suppress_message_printing(int level) { return false; } /* Still needs to be defined for users */ @@ -2270,42 +2163,6 @@ static inline int can_use_console(void) return cpu_online(raw_smp_processor_id()) || have_callable_console(); } -static void console_cont_flush(char *text, size_t size) -{ - unsigned long flags; - size_t len; - - raw_spin_lock_irqsave(&logbuf_lock, flags); - - if (!cont.len) - goto out; - - if (suppress_message_printing(cont.level)) { - cont.cons = cont.len; - if (cont.flushed) - cont.len = 0; - goto out; - } - - /* - * We still queue earlier records, likely because the console was - * busy. The earlier ones need to be printed before this one, we - * did not flush any fragment so far, so just let it queue up. - */ - if (console_seq < log_next_seq && !cont.cons) - goto out; - - len = cont_print_text(text, size); - raw_spin_unlock(&logbuf_lock); - stop_critical_timings(); - call_console_drivers(cont.level, NULL, 0, text, len); - start_critical_timings(); - local_irq_restore(flags); - return; -out: - raw_spin_unlock_irqrestore(&logbuf_lock, flags); -} - /** * console_unlock - unlock the console system * @@ -2359,9 +2216,6 @@ again: return; } - /* flush buffered message fragment immediately to console */ - console_cont_flush(text, sizeof(text)); - for (;;) { struct printk_log *msg; size_t ext_len = 0; @@ -2381,7 +2235,6 @@ again: /* messages are gone, move to first one */ console_seq = log_first_seq; console_idx = log_first_idx; - console_prev = 0; } else { len = 0; } @@ -2391,8 +2244,7 @@ skip: msg = log_from_idx(console_idx); level = msg->level; - if ((msg->flags & LOG_NOCONS) || - suppress_message_printing(level)) { + if (suppress_message_printing(level)) { /* * Skip record we have buffered and already printed * directly to the console when we received it, and @@ -2400,22 +2252,14 @@ skip: */ console_idx = log_next(console_idx); console_seq++; - /* - * We will get here again when we register a new - * CON_PRINTBUFFER console. Clear the flag so we - * will properly dump everything later. - */ - msg->flags &= ~LOG_NOCONS; - console_prev = msg->flags; goto skip; } - len += msg_print_text(msg, console_prev, false, - text + len, sizeof(text) - len); + len += msg_print_text(msg, false, text + len, sizeof(text) - len); if (nr_ext_console_drivers) { ext_len = msg_print_ext_header(ext_text, sizeof(ext_text), - msg, console_seq, console_prev); + msg, console_seq); ext_len += msg_print_ext_body(ext_text + ext_len, sizeof(ext_text) - ext_len, log_dict(msg), msg->dict_len, @@ -2423,7 +2267,6 @@ skip: } console_idx = log_next(console_idx); console_seq++; - console_prev = msg->flags; raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ @@ -2718,7 +2561,6 @@ void register_console(struct console *newcon) raw_spin_lock_irqsave(&logbuf_lock, flags); console_seq = syslog_seq; console_idx = syslog_idx; - console_prev = syslog_prev; raw_spin_unlock_irqrestore(&logbuf_lock, flags); /* * We're about to replay the log buffer. Only do this to the @@ -3074,7 +2916,7 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, goto out; msg = log_from_idx(dumper->cur_idx); - l = msg_print_text(msg, 0, syslog, line, size); + l = msg_print_text(msg, syslog, line, size); dumper->cur_idx = log_next(dumper->cur_idx); dumper->cur_seq++; @@ -3143,7 +2985,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, u32 idx; u64 next_seq; u32 next_idx; - enum log_flags prev; size_t l = 0; bool ret = false; @@ -3166,27 +3007,23 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, /* calculate length of entire buffer */ seq = dumper->cur_seq; idx = dumper->cur_idx; - prev = 0; while (seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); - l += msg_print_text(msg, prev, true, NULL, 0); + l += msg_print_text(msg, true, NULL, 0); idx = log_next(idx); seq++; - prev = msg->flags; } /* move first record forward until length fits into the buffer */ seq = dumper->cur_seq; idx = dumper->cur_idx; - prev = 0; while (l > size && seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); - l -= msg_print_text(msg, prev, true, NULL, 0); + l -= msg_print_text(msg, true, NULL, 0); idx = log_next(idx); seq++; - prev = msg->flags; } /* last message in next interation */ @@ -3197,10 +3034,9 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, while (seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); - l += msg_print_text(msg, prev, syslog, buf + l, size - l); + l += msg_print_text(msg, syslog, buf + l, size - l); idx = log_next(idx); seq++; - prev = msg->flags; } dumper->next_seq = next_seq; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index e6474f7272ec..49ba7c1ade9d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -27,6 +27,35 @@ #include <linux/cn_proc.h> #include <linux/compat.h> +/* + * Access another process' address space via ptrace. + * Source/target buffer must be kernel space, + * Do not walk the page table directly, use get_user_pages + */ +int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, + void *buf, int len, unsigned int gup_flags) +{ + struct mm_struct *mm; + int ret; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + if (!tsk->ptrace || + (current != tsk->parent) || + ((get_dumpable(mm) != SUID_DUMP_USER) && + !ptracer_capable(tsk, mm->user_ns))) { + mmput(mm); + return 0; + } + + ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); + mmput(mm); + + return ret; +} + /* * ptrace a task: make the debugger its new parent and @@ -39,6 +68,9 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) BUG_ON(!list_empty(&child->ptrace_entry)); list_add(&child->ptrace_entry, &new_parent->ptraced); child->parent = new_parent; + rcu_read_lock(); + child->ptracer_cred = get_cred(__task_cred(new_parent)); + rcu_read_unlock(); } /** @@ -71,12 +103,16 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) */ void __ptrace_unlink(struct task_struct *child) { + const struct cred *old_cred; BUG_ON(!child->ptrace); clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); child->parent = child->real_parent; list_del_init(&child->ptrace_entry); + old_cred = child->ptracer_cred; + child->ptracer_cred = NULL; + put_cred(old_cred); spin_lock(&child->sighand->siglock); child->ptrace = 0; @@ -220,7 +256,7 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) static int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; - int dumpable = 0; + struct mm_struct *mm; kuid_t caller_uid; kgid_t caller_gid; @@ -271,16 +307,11 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) return -EPERM; ok: rcu_read_unlock(); - smp_rmb(); - if (task->mm) - dumpable = get_dumpable(task->mm); - rcu_read_lock(); - if (dumpable != SUID_DUMP_USER && - !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { - rcu_read_unlock(); - return -EPERM; - } - rcu_read_unlock(); + mm = task->mm; + if (mm && + ((get_dumpable(mm) != SUID_DUMP_USER) && + !ptrace_has_cap(mm->user_ns, mode))) + return -EPERM; return security_ptrace_access_check(task, mode); } @@ -344,10 +375,6 @@ static int ptrace_attach(struct task_struct *task, long request, if (seize) flags |= PT_SEIZED; - rcu_read_lock(); - if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE)) - flags |= PT_PTRACE_CAP; - rcu_read_unlock(); task->ptrace = flags; __ptrace_link(task, current); @@ -537,7 +564,8 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst int this_len, retval; this_len = (len > sizeof(buf)) ? sizeof(buf) : len; - retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE); + retval = ptrace_access_vm(tsk, src, buf, this_len, FOLL_FORCE); + if (!retval) { if (copied) break; @@ -564,7 +592,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds this_len = (len > sizeof(buf)) ? sizeof(buf) : len; if (copy_from_user(buf, src, this_len)) return -EFAULT; - retval = access_process_vm(tsk, dst, buf, this_len, + retval = ptrace_access_vm(tsk, dst, buf, this_len, FOLL_FORCE | FOLL_WRITE); if (!retval) { if (copied) @@ -1128,7 +1156,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, unsigned long tmp; int copied; - copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE); + copied = ptrace_access_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) return -EIO; return put_user(tmp, (unsigned long __user *)data); @@ -1139,7 +1167,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, { int copied; - copied = access_process_vm(tsk, addr, &data, sizeof(data), + copied = ptrace_access_vm(tsk, addr, &data, sizeof(data), FOLL_FORCE | FOLL_WRITE); return (copied == sizeof(data)) ? 0 : -EIO; } @@ -1157,7 +1185,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, switch (request) { case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: - ret = access_process_vm(child, addr, &word, sizeof(word), + ret = ptrace_access_vm(child, addr, &word, sizeof(word), FOLL_FORCE); if (ret != sizeof(word)) ret = -EIO; @@ -1167,7 +1195,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, case PTRACE_POKETEXT: case PTRACE_POKEDATA: - ret = access_process_vm(child, addr, &data, sizeof(data), + ret = ptrace_access_vm(child, addr, &data, sizeof(data), FOLL_FORCE | FOLL_WRITE); ret = (ret != sizeof(data) ? -EIO : 0); break; diff --git a/kernel/relay.c b/kernel/relay.c index da79a109dbeb..8f18d314a96a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -809,11 +809,11 @@ void relay_subbufs_consumed(struct rchan *chan, { struct rchan_buf *buf; - if (!chan) + if (!chan || cpu >= NR_CPUS) return; buf = *per_cpu_ptr(chan->buf, cpu); - if (cpu >= NR_CPUS || !buf || subbufs_consumed > chan->n_subbufs) + if (!buf || subbufs_consumed > chan->n_subbufs) return; if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d18804491d9f..966556ebdbb3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5280,6 +5280,7 @@ void init_idle(struct task_struct *idle, int cpu) __sched_fork(0, idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); + idle->flags |= PF_IDLE; kasan_unpoison_task_stack(idle); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 69e06898997d..fd4659313640 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -12,11 +12,14 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cpufreq.h> +#include <linux/kthread.h> #include <linux/slab.h> #include <trace/events/power.h> #include "sched.h" +#define SUGOV_KTHREAD_PRIORITY 50 + struct sugov_tunables { struct gov_attr_set attr_set; unsigned int rate_limit_us; @@ -35,8 +38,10 @@ struct sugov_policy { /* The next fields are only needed if fast switch cannot be used. */ struct irq_work irq_work; - struct work_struct work; + struct kthread_work work; struct mutex work_lock; + struct kthread_worker worker; + struct task_struct *thread; bool work_in_progress; bool need_freq_update; @@ -291,7 +296,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, raw_spin_unlock(&sg_policy->update_lock); } -static void sugov_work(struct work_struct *work) +static void sugov_work(struct kthread_work *work) { struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); @@ -308,7 +313,21 @@ static void sugov_irq_work(struct irq_work *irq_work) struct sugov_policy *sg_policy; sg_policy = container_of(irq_work, struct sugov_policy, irq_work); - schedule_work_on(smp_processor_id(), &sg_policy->work); + + /* + * For RT and deadline tasks, the schedutil governor shoots the + * frequency to maximum. Special care must be taken to ensure that this + * kthread doesn't result in the same behavior. + * + * This is (mostly) guaranteed by the work_in_progress flag. The flag is + * updated only at the end of the sugov_work() function and before that + * the schedutil governor rejects all other frequency scaling requests. + * + * There is a very rare case though, where the RT thread yields right + * after the work_in_progress flag is cleared. The effects of that are + * neglected for now. + */ + kthread_queue_work(&sg_policy->worker, &sg_policy->work); } /************************** sysfs interface ************************/ @@ -371,19 +390,64 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) return NULL; sg_policy->policy = policy; - init_irq_work(&sg_policy->irq_work, sugov_irq_work); - INIT_WORK(&sg_policy->work, sugov_work); - mutex_init(&sg_policy->work_lock); raw_spin_lock_init(&sg_policy->update_lock); return sg_policy; } static void sugov_policy_free(struct sugov_policy *sg_policy) { - mutex_destroy(&sg_policy->work_lock); kfree(sg_policy); } +static int sugov_kthread_create(struct sugov_policy *sg_policy) +{ + struct task_struct *thread; + struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 }; + struct cpufreq_policy *policy = sg_policy->policy; + int ret; + + /* kthread only required for slow path */ + if (policy->fast_switch_enabled) + return 0; + + kthread_init_work(&sg_policy->work, sugov_work); + kthread_init_worker(&sg_policy->worker); + thread = kthread_create(kthread_worker_fn, &sg_policy->worker, + "sugov:%d", + cpumask_first(policy->related_cpus)); + if (IS_ERR(thread)) { + pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread)); + return PTR_ERR(thread); + } + + ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, ¶m); + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + return ret; + } + + sg_policy->thread = thread; + kthread_bind_mask(thread, policy->related_cpus); + init_irq_work(&sg_policy->irq_work, sugov_irq_work); + mutex_init(&sg_policy->work_lock); + + wake_up_process(thread); + + return 0; +} + +static void sugov_kthread_stop(struct sugov_policy *sg_policy) +{ + /* kthread only required for slow path */ + if (sg_policy->policy->fast_switch_enabled) + return; + + kthread_flush_worker(&sg_policy->worker); + kthread_stop(sg_policy->thread); + mutex_destroy(&sg_policy->work_lock); +} + static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy) { struct sugov_tunables *tunables; @@ -416,16 +480,24 @@ static int sugov_init(struct cpufreq_policy *policy) if (policy->governor_data) return -EBUSY; + cpufreq_enable_fast_switch(policy); + sg_policy = sugov_policy_alloc(policy); - if (!sg_policy) - return -ENOMEM; + if (!sg_policy) { + ret = -ENOMEM; + goto disable_fast_switch; + } + + ret = sugov_kthread_create(sg_policy); + if (ret) + goto free_sg_policy; mutex_lock(&global_tunables_lock); if (global_tunables) { if (WARN_ON(have_governor_per_policy())) { ret = -EINVAL; - goto free_sg_policy; + goto stop_kthread; } policy->governor_data = sg_policy; sg_policy->tunables = global_tunables; @@ -437,7 +509,7 @@ static int sugov_init(struct cpufreq_policy *policy) tunables = sugov_tunables_alloc(sg_policy); if (!tunables) { ret = -ENOMEM; - goto free_sg_policy; + goto stop_kthread; } tunables->rate_limit_us = LATENCY_MULTIPLIER; @@ -454,20 +526,25 @@ static int sugov_init(struct cpufreq_policy *policy) if (ret) goto fail; - out: +out: mutex_unlock(&global_tunables_lock); - - cpufreq_enable_fast_switch(policy); return 0; - fail: +fail: policy->governor_data = NULL; sugov_tunables_free(tunables); - free_sg_policy: +stop_kthread: + sugov_kthread_stop(sg_policy); + +free_sg_policy: mutex_unlock(&global_tunables_lock); sugov_policy_free(sg_policy); + +disable_fast_switch: + cpufreq_disable_fast_switch(policy); + pr_err("initialization failed (error %d)\n", ret); return ret; } @@ -478,8 +555,6 @@ static void sugov_exit(struct cpufreq_policy *policy) struct sugov_tunables *tunables = sg_policy->tunables; unsigned int count; - cpufreq_disable_fast_switch(policy); - mutex_lock(&global_tunables_lock); count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); @@ -489,7 +564,9 @@ static void sugov_exit(struct cpufreq_policy *policy) mutex_unlock(&global_tunables_lock); + sugov_kthread_stop(sg_policy); sugov_policy_free(sg_policy); + cpufreq_disable_fast_switch(policy); } static int sugov_start(struct cpufreq_policy *policy) @@ -535,8 +612,10 @@ static void sugov_stop(struct cpufreq_policy *policy) synchronize_sched(); - irq_work_sync(&sg_policy->irq_work); - cancel_work_sync(&sg_policy->work); + if (!policy->fast_switch_enabled) { + irq_work_sync(&sg_policy->irq_work); + kthread_cancel_work_sync(&sg_policy->work); + } } static void sugov_limits(struct cpufreq_policy *policy) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 1d8718d5300d..6a4bae0a649d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -164,11 +164,14 @@ static void cpuidle_idle_call(void) * timekeeping to prevent timer interrupts from kicking us out of idle * until a proper wakeup interrupt happens. */ - if (idle_should_freeze()) { - entered_state = cpuidle_enter_freeze(drv, dev); - if (entered_state > 0) { - local_irq_enable(); - goto exit_idle; + + if (idle_should_freeze() || dev->use_deepest_state) { + if (idle_should_freeze()) { + entered_state = cpuidle_enter_freeze(drv, dev); + if (entered_state > 0) { + local_irq_enable(); + goto exit_idle; + } } next_state = cpuidle_find_deepest_state(drv, dev); @@ -202,76 +205,65 @@ exit_idle: * * Called with polling cleared. */ -static void cpu_idle_loop(void) +static void do_idle(void) { - int cpu = smp_processor_id(); - - while (1) { - /* - * If the arch has a polling bit, we maintain an invariant: - * - * Our polling bit is clear if we're not scheduled (i.e. if - * rq->curr != rq->idle). This means that, if rq->idle has - * the polling bit set, then setting need_resched is - * guaranteed to cause the cpu to reschedule. - */ - - __current_set_polling(); - quiet_vmstat(); - tick_nohz_idle_enter(); + /* + * If the arch has a polling bit, we maintain an invariant: + * + * Our polling bit is clear if we're not scheduled (i.e. if rq->curr != + * rq->idle). This means that, if rq->idle has the polling bit set, + * then setting need_resched is guaranteed to cause the CPU to + * reschedule. + */ - while (!need_resched()) { - check_pgt_cache(); - rmb(); + __current_set_polling(); + tick_nohz_idle_enter(); - if (cpu_is_offline(cpu)) { - cpuhp_report_idle_dead(); - arch_cpu_idle_dead(); - } + while (!need_resched()) { + check_pgt_cache(); + rmb(); - local_irq_disable(); - arch_cpu_idle_enter(); - - /* - * In poll mode we reenable interrupts and spin. - * - * Also if we detected in the wakeup from idle - * path that the tick broadcast device expired - * for us, we don't want to go deep idle as we - * know that the IPI is going to arrive right - * away - */ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) - cpu_idle_poll(); - else - cpuidle_idle_call(); - - arch_cpu_idle_exit(); + if (cpu_is_offline(smp_processor_id())) { + cpuhp_report_idle_dead(); + arch_cpu_idle_dead(); } - /* - * Since we fell out of the loop above, we know - * TIF_NEED_RESCHED must be set, propagate it into - * PREEMPT_NEED_RESCHED. - * - * This is required because for polling idle loops we will - * not have had an IPI to fold the state for us. - */ - preempt_set_need_resched(); - tick_nohz_idle_exit(); - __current_clr_polling(); + local_irq_disable(); + arch_cpu_idle_enter(); /* - * We promise to call sched_ttwu_pending and reschedule - * if need_resched is set while polling is set. That - * means that clearing polling needs to be visible - * before doing these things. + * In poll mode we reenable interrupts and spin. Also if we + * detected in the wakeup from idle path that the tick + * broadcast device expired for us, we don't want to go deep + * idle as we know that the IPI is going to arrive right away. */ - smp_mb__after_atomic(); - - sched_ttwu_pending(); - schedule_preempt_disabled(); + if (cpu_idle_force_poll || tick_check_broadcast_expired()) + cpu_idle_poll(); + else + cpuidle_idle_call(); + arch_cpu_idle_exit(); } + + /* + * Since we fell out of the loop above, we know TIF_NEED_RESCHED must + * be set, propagate it into PREEMPT_NEED_RESCHED. + * + * This is required because for polling idle loops we will not have had + * an IPI to fold the state for us. + */ + preempt_set_need_resched(); + tick_nohz_idle_exit(); + __current_clr_polling(); + + /* + * We promise to call sched_ttwu_pending() and reschedule if + * need_resched() is set while polling is set. That means that clearing + * polling needs to be visible before doing these things. + */ + smp_mb__after_atomic(); + + sched_ttwu_pending(); + schedule_preempt_disabled(); } bool cpu_in_idle(unsigned long pc) @@ -280,6 +272,56 @@ bool cpu_in_idle(unsigned long pc) pc < (unsigned long)__cpuidle_text_end; } +struct idle_timer { + struct hrtimer timer; + int done; +}; + +static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer) +{ + struct idle_timer *it = container_of(timer, struct idle_timer, timer); + + WRITE_ONCE(it->done, 1); + set_tsk_need_resched(current); + + return HRTIMER_NORESTART; +} + +void play_idle(unsigned long duration_ms) +{ + struct idle_timer it; + + /* + * Only FIFO tasks can disable the tick since they don't need the forced + * preemption. + */ + WARN_ON_ONCE(current->policy != SCHED_FIFO); + WARN_ON_ONCE(current->nr_cpus_allowed != 1); + WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); + WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); + WARN_ON_ONCE(!duration_ms); + + rcu_sleep_check(); + preempt_disable(); + current->flags |= PF_IDLE; + cpuidle_use_deepest_state(true); + + it.done = 0; + hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + it.timer.function = idle_inject_timer_fn; + hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED); + + while (!READ_ONCE(it.done)) + do_idle(); + + cpuidle_use_deepest_state(false); + current->flags &= ~PF_IDLE; + + preempt_fold_need_resched(); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(play_idle); + void cpu_startup_entry(enum cpuhp_state state) { /* @@ -299,5 +341,6 @@ void cpu_startup_entry(enum cpuhp_state state) #endif arch_cpu_idle_prepare(); cpuhp_online_idle(state); - cpu_idle_loop(); + while (1) + do_idle(); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index bff9c774987a..f7ce79a46050 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -41,8 +41,7 @@ * outside of a lifetime-guarded section. In general, this * is only needed for handling filters shared across tasks. * @prev: points to a previously installed, or inherited, filter - * @len: the number of instructions in the program - * @insnsi: the BPF program instructions to evaluate + * @prog: the BPF program to evaluate * * seccomp_filter objects are organized in a tree linked via the @prev * pointer. For any task, it appears to be a singly-linked list starting @@ -168,8 +167,8 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) } /** - * seccomp_run_filters - evaluates all seccomp filters against @syscall - * @syscall: number of the current system call + * seccomp_run_filters - evaluates all seccomp filters against @sd + * @sd: optional seccomp data to be passed to filters * * Returns valid seccomp BPF response codes. */ diff --git a/kernel/signal.c b/kernel/signal.c index 29a410780aa9..ae60996fedff 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2491,6 +2491,13 @@ void __set_current_blocked(const sigset_t *newset) { struct task_struct *tsk = current; + /* + * In case the signal mask hasn't changed, there is nothing we need + * to do. The current->blocked shouldn't be modified by other task. + */ + if (sigequalsets(&tsk->blocked, newset)) + return; + spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, newset); spin_unlock_irq(&tsk->sighand->siglock); diff --git a/kernel/sys.c b/kernel/sys.c index 78c9fb7dd680..9758892a2d09 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1697,16 +1697,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) fput(exe_file); } - /* - * The symlink can be changed only once, just to disallow arbitrary - * transitions malicious software might bring in. This means one - * could make a snapshot over all processes running and monitor - * /proc/pid/exe changes to notice unusual activity if needed. - */ - err = -EPERM; - if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) - goto exit; - err = 0; /* set the new file, lockless */ get_file(exe.file); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 39b3368f6de6..1a292ebcbbb6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -627,7 +627,7 @@ static struct ctl_table kern_table[] = { .data = &tracepoint_printk, .maxlen = sizeof(tracepoint_printk), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = tracepoint_printk_sysctl, }, #endif #ifdef CONFIG_KEXEC_CORE @@ -2389,9 +2389,11 @@ static void validate_coredump_safety(void) #ifdef CONFIG_COREDUMP if (suid_dumpable == SUID_DUMP_ROOT && core_pattern[0] != '/' && core_pattern[0] != '|') { - printk(KERN_WARNING "Unsafe core_pattern used with "\ - "suid_dumpable=2. Pipe handler or fully qualified "\ - "core dump path required.\n"); + printk(KERN_WARNING +"Unsafe core_pattern used with fs.suid_dumpable=2.\n" +"Pipe handler or fully qualified core dump path required.\n" +"Set kernel.core_pattern before fs.suid_dumpable.\n" + ); } #endif } diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 6eb99c17dbd8..ece4b177052b 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1354,8 +1354,8 @@ static void deprecated_sysctl_warning(const int *name, int nlen) "warning: process `%s' used the deprecated sysctl " "system call with ", current->comm); for (i = 0; i < nlen; i++) - printk("%d.", name[i]); - printk("\n"); + printk(KERN_CONT "%d.", name[i]); + printk(KERN_CONT "\n"); } return; } diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 9b08ca391aed..3921cf7fea8e 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -516,7 +516,8 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, spin_lock_irqsave(&ptr->it_lock, flags); if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { - if (posix_timer_event(ptr, 0) != 0) + if (IS_ENABLED(CONFIG_POSIX_TIMERS) && + posix_timer_event(ptr, 0) != 0) ptr->it_overrun++; } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2a96b063d659..d5038005eb5d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -70,6 +70,7 @@ config FTRACE_NMI_ENTER config EVENT_TRACING select CONTEXT_SWITCH_TRACER + select GLOB bool config CONTEXT_SWITCH_TRACER @@ -133,6 +134,7 @@ config FUNCTION_TRACER select KALLSYMS select GENERIC_TRACER select CONTEXT_SWITCH_TRACER + select GLOB help Enable the kernel to trace every kernel function. This is done by using a compiler feature to insert a small, 5-byte No-Operation diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index dbafc5df03f3..95cecbf67f5c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1777,14 +1777,14 @@ void blk_dump_cmd(char *buf, struct request *rq) } } -void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes) +void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes) { int i = 0; - if (rw & REQ_PREFLUSH) + if (op & REQ_PREFLUSH) rwbs[i++] = 'F'; - switch (op) { + switch (op & REQ_OP_MASK) { case REQ_OP_WRITE: case REQ_OP_WRITE_SAME: rwbs[i++] = 'W'; @@ -1806,13 +1806,13 @@ void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes) rwbs[i++] = 'N'; } - if (rw & REQ_FUA) + if (op & REQ_FUA) rwbs[i++] = 'F'; - if (rw & REQ_RAHEAD) + if (op & REQ_RAHEAD) rwbs[i++] = 'A'; - if (rw & REQ_SYNC) + if (op & REQ_SYNC) rwbs[i++] = 'S'; - if (rw & REQ_META) + if (op & REQ_META) rwbs[i++] = 'M'; rwbs[i] = '\0'; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index da87b3cba5b3..1f0f547c54da 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3511,6 +3511,10 @@ static int ftrace_match(char *str, struct ftrace_glob *g) memcmp(str + slen - g->len, g->search, g->len) == 0) matched = 1; break; + case MATCH_GLOB: + if (glob_match(g->search, str)) + matched = 1; + break; } return matched; @@ -4258,6 +4262,23 @@ int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, } EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); +/** + * ftrace_ops_set_global_filter - setup ops to use global filters + * @ops - the ops which will use the global filters + * + * ftrace users who need global function trace filtering should call this. + * It can set the global filter only if ops were not initialized before. + */ +void ftrace_ops_set_global_filter(struct ftrace_ops *ops) +{ + if (ops->flags & FTRACE_OPS_FL_INITIALIZED) + return; + + ftrace_ops_init(ops); + ops->func_hash = &global_ops.local_hash; +} +EXPORT_SYMBOL_GPL(ftrace_ops_set_global_filter); + static int ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, int reset, int enable) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 89a2611a1635..a85739efcc30 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -245,7 +245,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event) EXPORT_SYMBOL_GPL(ring_buffer_event_length); /* inline for ring buffer fast paths */ -static void * +static __always_inline void * rb_event_data(struct ring_buffer_event *event) { if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) @@ -1798,48 +1798,48 @@ void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) } EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); -static inline void * +static __always_inline void * __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) { return bpage->data + index; } -static inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) +static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) { return bpage->page->data + index; } -static inline struct ring_buffer_event * +static __always_inline struct ring_buffer_event * rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) { return __rb_page_index(cpu_buffer->reader_page, cpu_buffer->reader_page->read); } -static inline struct ring_buffer_event * +static __always_inline struct ring_buffer_event * rb_iter_head_event(struct ring_buffer_iter *iter) { return __rb_page_index(iter->head_page, iter->head); } -static inline unsigned rb_page_commit(struct buffer_page *bpage) +static __always_inline unsigned rb_page_commit(struct buffer_page *bpage) { return local_read(&bpage->page->commit); } /* Size is determined by what has been committed */ -static inline unsigned rb_page_size(struct buffer_page *bpage) +static __always_inline unsigned rb_page_size(struct buffer_page *bpage) { return rb_page_commit(bpage); } -static inline unsigned +static __always_inline unsigned rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) { return rb_page_commit(cpu_buffer->commit_page); } -static inline unsigned +static __always_inline unsigned rb_event_index(struct ring_buffer_event *event) { unsigned long addr = (unsigned long)event; @@ -2355,7 +2355,7 @@ static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) local_inc(&cpu_buffer->commits); } -static void +static __always_inline void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long max_count; @@ -2410,7 +2410,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) goto again; } -static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) +static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long commits; @@ -2455,7 +2455,7 @@ static inline void rb_event_discard(struct ring_buffer_event *event) event->time_delta = 1; } -static inline bool +static __always_inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { @@ -2469,7 +2469,7 @@ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, rb_commit_index(cpu_buffer) == index; } -static void +static __always_inline void rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { @@ -2702,7 +2702,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return event; } -static struct ring_buffer_event * +static __always_inline struct ring_buffer_event * rb_reserve_next_event(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer, unsigned long length) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 54d5270a5042..66f829c47bec 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -40,6 +40,7 @@ #include <linux/poll.h> #include <linux/nmi.h> #include <linux/fs.h> +#include <linux/trace.h> #include <linux/sched/rt.h> #include "trace.h" @@ -68,6 +69,7 @@ bool __read_mostly tracing_selftest_disabled; /* Pipe tracepoints to printk */ struct trace_iterator *tracepoint_print_iter; int tracepoint_printk; +static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key); /* For tracers that don't implement custom flags */ static struct tracer_opt dummy_tracer_opt[] = { @@ -738,6 +740,31 @@ static inline void ftrace_trace_stack(struct trace_array *tr, #endif +static __always_inline void +trace_event_setup(struct ring_buffer_event *event, + int type, unsigned long flags, int pc) +{ + struct trace_entry *ent = ring_buffer_event_data(event); + + tracing_generic_entry_update(ent, flags, pc); + ent->type = type; +} + +static __always_inline struct ring_buffer_event * +__trace_buffer_lock_reserve(struct ring_buffer *buffer, + int type, + unsigned long len, + unsigned long flags, int pc) +{ + struct ring_buffer_event *event; + + event = ring_buffer_lock_reserve(buffer, len); + if (event != NULL) + trace_event_setup(event, type, flags, pc); + + return event; +} + static void tracer_tracing_on(struct trace_array *tr) { if (tr->trace_buffer.buffer) @@ -767,6 +794,22 @@ void tracing_on(void) } EXPORT_SYMBOL_GPL(tracing_on); + +static __always_inline void +__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) +{ + __this_cpu_write(trace_cmdline_save, true); + + /* If this is the temp buffer, we need to commit fully */ + if (this_cpu_read(trace_buffered_event) == event) { + /* Length is in event->array[0] */ + ring_buffer_write(buffer, event->array[0], &event->array[1]); + /* Release the temp buffer */ + this_cpu_dec(trace_buffered_event_cnt); + } else + ring_buffer_unlock_commit(buffer, event); +} + /** * __trace_puts - write a constant string into the trace buffer. * @ip: The address of the caller @@ -794,8 +837,8 @@ int __trace_puts(unsigned long ip, const char *str, int size) local_save_flags(irq_flags); buffer = global_trace.trace_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, - irq_flags, pc); + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, + irq_flags, pc); if (!event) return 0; @@ -842,8 +885,8 @@ int __trace_bputs(unsigned long ip, const char *str) local_save_flags(irq_flags); buffer = global_trace.trace_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, - irq_flags, pc); + event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, + irq_flags, pc); if (!event) return 0; @@ -1907,35 +1950,19 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, #endif ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | - ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | + ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) | (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0); } EXPORT_SYMBOL_GPL(tracing_generic_entry_update); -static __always_inline void -trace_event_setup(struct ring_buffer_event *event, - int type, unsigned long flags, int pc) -{ - struct trace_entry *ent = ring_buffer_event_data(event); - - tracing_generic_entry_update(ent, flags, pc); - ent->type = type; -} - struct ring_buffer_event * trace_buffer_lock_reserve(struct ring_buffer *buffer, int type, unsigned long len, unsigned long flags, int pc) { - struct ring_buffer_event *event; - - event = ring_buffer_lock_reserve(buffer, len); - if (event != NULL) - trace_event_setup(event, type, flags, pc); - - return event; + return __trace_buffer_lock_reserve(buffer, type, len, flags, pc); } DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); @@ -2049,21 +2076,6 @@ void trace_buffered_event_disable(void) preempt_enable(); } -void -__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) -{ - __this_cpu_write(trace_cmdline_save, true); - - /* If this is the temp buffer, we need to commit fully */ - if (this_cpu_read(trace_buffered_event) == event) { - /* Length is in event->array[0] */ - ring_buffer_write(buffer, event->array[0], &event->array[1]); - /* Release the temp buffer */ - this_cpu_dec(trace_buffered_event_cnt); - } else - ring_buffer_unlock_commit(buffer, event); -} - static struct ring_buffer *temp_buffer; struct ring_buffer_event * @@ -2090,8 +2102,8 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, this_cpu_dec(trace_buffered_event_cnt); } - entry = trace_buffer_lock_reserve(*current_rb, - type, len, flags, pc); + entry = __trace_buffer_lock_reserve(*current_rb, + type, len, flags, pc); /* * If tracing is off, but we have triggers enabled * we still need to look at the event data. Use the temp_buffer @@ -2100,13 +2112,88 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, */ if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) { *current_rb = temp_buffer; - entry = trace_buffer_lock_reserve(*current_rb, - type, len, flags, pc); + entry = __trace_buffer_lock_reserve(*current_rb, + type, len, flags, pc); } return entry; } EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); +static DEFINE_SPINLOCK(tracepoint_iter_lock); +static DEFINE_MUTEX(tracepoint_printk_mutex); + +static void output_printk(struct trace_event_buffer *fbuffer) +{ + struct trace_event_call *event_call; + struct trace_event *event; + unsigned long flags; + struct trace_iterator *iter = tracepoint_print_iter; + + /* We should never get here if iter is NULL */ + if (WARN_ON_ONCE(!iter)) + return; + + event_call = fbuffer->trace_file->event_call; + if (!event_call || !event_call->event.funcs || + !event_call->event.funcs->trace) + return; + + event = &fbuffer->trace_file->event_call->event; + + spin_lock_irqsave(&tracepoint_iter_lock, flags); + trace_seq_init(&iter->seq); + iter->ent = fbuffer->entry; + event_call->event.funcs->trace(iter, 0, event); + trace_seq_putc(&iter->seq, 0); + printk("%s", iter->seq.buffer); + + spin_unlock_irqrestore(&tracepoint_iter_lock, flags); +} + +int tracepoint_printk_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int save_tracepoint_printk; + int ret; + + mutex_lock(&tracepoint_printk_mutex); + save_tracepoint_printk = tracepoint_printk; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + /* + * This will force exiting early, as tracepoint_printk + * is always zero when tracepoint_printk_iter is not allocated + */ + if (!tracepoint_print_iter) + tracepoint_printk = 0; + + if (save_tracepoint_printk == tracepoint_printk) + goto out; + + if (tracepoint_printk) + static_key_enable(&tracepoint_printk_key.key); + else + static_key_disable(&tracepoint_printk_key.key); + + out: + mutex_unlock(&tracepoint_printk_mutex); + + return ret; +} + +void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) +{ + if (static_key_false(&tracepoint_printk_key.key)) + output_printk(fbuffer); + + event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer, + fbuffer->event, fbuffer->entry, + fbuffer->flags, fbuffer->pc); +} +EXPORT_SYMBOL_GPL(trace_event_buffer_commit); + void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct ring_buffer *buffer, struct ring_buffer_event *event, @@ -2129,6 +2216,139 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, ftrace_trace_userstack(buffer, flags, pc); } +/* + * Similar to trace_buffer_unlock_commit_regs() but do not dump stack. + */ +void +trace_buffer_unlock_commit_nostack(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + __buffer_unlock_commit(buffer, event); +} + +static void +trace_process_export(struct trace_export *export, + struct ring_buffer_event *event) +{ + struct trace_entry *entry; + unsigned int size = 0; + + entry = ring_buffer_event_data(event); + size = ring_buffer_event_length(event); + export->write(entry, size); +} + +static DEFINE_MUTEX(ftrace_export_lock); + +static struct trace_export __rcu *ftrace_exports_list __read_mostly; + +static DEFINE_STATIC_KEY_FALSE(ftrace_exports_enabled); + +static inline void ftrace_exports_enable(void) +{ + static_branch_enable(&ftrace_exports_enabled); +} + +static inline void ftrace_exports_disable(void) +{ + static_branch_disable(&ftrace_exports_enabled); +} + +void ftrace_exports(struct ring_buffer_event *event) +{ + struct trace_export *export; + + preempt_disable_notrace(); + + export = rcu_dereference_raw_notrace(ftrace_exports_list); + while (export) { + trace_process_export(export, event); + export = rcu_dereference_raw_notrace(export->next); + } + + preempt_enable_notrace(); +} + +static inline void +add_trace_export(struct trace_export **list, struct trace_export *export) +{ + rcu_assign_pointer(export->next, *list); + /* + * We are entering export into the list but another + * CPU might be walking that list. We need to make sure + * the export->next pointer is valid before another CPU sees + * the export pointer included into the list. + */ + rcu_assign_pointer(*list, export); +} + +static inline int +rm_trace_export(struct trace_export **list, struct trace_export *export) +{ + struct trace_export **p; + + for (p = list; *p != NULL; p = &(*p)->next) + if (*p == export) + break; + + if (*p != export) + return -1; + + rcu_assign_pointer(*p, (*p)->next); + + return 0; +} + +static inline void +add_ftrace_export(struct trace_export **list, struct trace_export *export) +{ + if (*list == NULL) + ftrace_exports_enable(); + + add_trace_export(list, export); +} + +static inline int +rm_ftrace_export(struct trace_export **list, struct trace_export *export) +{ + int ret; + + ret = rm_trace_export(list, export); + if (*list == NULL) + ftrace_exports_disable(); + + return ret; +} + +int register_ftrace_export(struct trace_export *export) +{ + if (WARN_ON_ONCE(!export->write)) + return -1; + + mutex_lock(&ftrace_export_lock); + + add_ftrace_export(&ftrace_exports_list, export); + + mutex_unlock(&ftrace_export_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(register_ftrace_export); + +int unregister_ftrace_export(struct trace_export *export) +{ + int ret; + + mutex_lock(&ftrace_export_lock); + + ret = rm_ftrace_export(&ftrace_exports_list, export); + + mutex_unlock(&ftrace_export_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(unregister_ftrace_export); + void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, @@ -2139,16 +2359,19 @@ trace_function(struct trace_array *tr, struct ring_buffer_event *event; struct ftrace_entry *entry; - event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), - flags, pc); + event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), + flags, pc); if (!event) return; entry = ring_buffer_event_data(event); entry->ip = ip; entry->parent_ip = parent_ip; - if (!call_filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) { + if (static_branch_unlikely(&ftrace_exports_enabled)) + ftrace_exports(event); __buffer_unlock_commit(buffer, event); + } } #ifdef CONFIG_STACKTRACE @@ -2216,8 +2439,8 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, size *= sizeof(unsigned long); - event = trace_buffer_lock_reserve(buffer, TRACE_STACK, - sizeof(*entry) + size, flags, pc); + event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, + sizeof(*entry) + size, flags, pc); if (!event) goto out; entry = ring_buffer_event_data(event); @@ -2318,8 +2541,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) __this_cpu_inc(user_stack_count); - event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, - sizeof(*entry), flags, pc); + event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, + sizeof(*entry), flags, pc); if (!event) goto out_drop_count; entry = ring_buffer_event_data(event); @@ -2489,8 +2712,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) local_save_flags(flags); size = sizeof(*entry) + sizeof(u32) * len; buffer = tr->trace_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, - flags, pc); + event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, + flags, pc); if (!event) goto out; entry = ring_buffer_event_data(event); @@ -2545,8 +2768,8 @@ __trace_array_vprintk(struct ring_buffer *buffer, local_save_flags(flags); size = sizeof(*entry) + len + 1; - event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - flags, pc); + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + flags, pc); if (!event) goto out; entry = ring_buffer_event_data(event); @@ -4055,6 +4278,7 @@ static const char readme_msg[] = " x86-tsc: TSC cycle counter\n" #endif "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n" + "\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n" " tracing_cpumask\t- Limit which CPUs to trace\n" " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" "\t\t\t Remove sub-buffer with rmdir\n" @@ -4066,7 +4290,7 @@ static const char readme_msg[] = "\n available_filter_functions - list of functions that can be filtered on\n" " set_ftrace_filter\t- echo function name in here to only trace these\n" "\t\t\t functions\n" - "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" + "\t accepts: func_full_name or glob-matching-pattern\n" "\t modules: Can select a group via module\n" "\t Format: :mod:<module-name>\n" "\t example: echo :mod:ext3 > set_ftrace_filter\n" @@ -5519,21 +5743,18 @@ static ssize_t tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { - unsigned long addr = (unsigned long)ubuf; struct trace_array *tr = filp->private_data; struct ring_buffer_event *event; struct ring_buffer *buffer; struct print_entry *entry; unsigned long irq_flags; - struct page *pages[2]; - void *map_page[2]; - int nr_pages = 1; + const char faulted[] = "<faulted>"; ssize_t written; - int offset; int size; int len; - int ret; - int i; + +/* Used in tracing_mark_raw_write() as well */ +#define FAULTED_SIZE (sizeof(faulted) - 1) /* '\0' is already accounted for */ if (tracing_disabled) return -EINVAL; @@ -5544,60 +5765,33 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (cnt > TRACE_BUF_SIZE) cnt = TRACE_BUF_SIZE; - /* - * Userspace is injecting traces into the kernel trace buffer. - * We want to be as non intrusive as possible. - * To do so, we do not want to allocate any special buffers - * or take any locks, but instead write the userspace data - * straight into the ring buffer. - * - * First we need to pin the userspace buffer into memory, - * which, most likely it is, because it just referenced it. - * But there's no guarantee that it is. By using get_user_pages_fast() - * and kmap_atomic/kunmap_atomic() we can get access to the - * pages directly. We then write the data directly into the - * ring buffer. - */ BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); - /* check if we cross pages */ - if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK)) - nr_pages = 2; - - offset = addr & (PAGE_SIZE - 1); - addr &= PAGE_MASK; - - ret = get_user_pages_fast(addr, nr_pages, 0, pages); - if (ret < nr_pages) { - while (--ret >= 0) - put_page(pages[ret]); - written = -EFAULT; - goto out; - } + local_save_flags(irq_flags); + size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */ - for (i = 0; i < nr_pages; i++) - map_page[i] = kmap_atomic(pages[i]); + /* If less than "<faulted>", then make sure we can still add that */ + if (cnt < FAULTED_SIZE) + size += FAULTED_SIZE - cnt; - local_save_flags(irq_flags); - size = sizeof(*entry) + cnt + 2; /* possible \n added */ buffer = tr->trace_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - irq_flags, preempt_count()); - if (!event) { + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + irq_flags, preempt_count()); + if (unlikely(!event)) /* Ring buffer disabled, return as if not open for write */ - written = -EBADF; - goto out_unlock; - } + return -EBADF; entry = ring_buffer_event_data(event); entry->ip = _THIS_IP_; - if (nr_pages == 2) { - len = PAGE_SIZE - offset; - memcpy(&entry->buf, map_page[0] + offset, len); - memcpy(&entry->buf[len], map_page[1], cnt - len); + len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); + if (len) { + memcpy(&entry->buf, faulted, FAULTED_SIZE); + cnt = FAULTED_SIZE; + written = -EFAULT; } else - memcpy(&entry->buf, map_page[0] + offset, cnt); + written = cnt; + len = cnt; if (entry->buf[cnt - 1] != '\n') { entry->buf[cnt] = '\n'; @@ -5607,16 +5801,73 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, __buffer_unlock_commit(buffer, event); - written = cnt; + if (written > 0) + *fpos += written; - *fpos += written; + return written; +} + +/* Limit it for now to 3K (including tag) */ +#define RAW_DATA_MAX_SIZE (1024*3) + +static ssize_t +tracing_mark_raw_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + struct trace_array *tr = filp->private_data; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct raw_data_entry *entry; + const char faulted[] = "<faulted>"; + unsigned long irq_flags; + ssize_t written; + int size; + int len; + +#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int)) + + if (tracing_disabled) + return -EINVAL; + + if (!(tr->trace_flags & TRACE_ITER_MARKERS)) + return -EINVAL; + + /* The marker must at least have a tag id */ + if (cnt < sizeof(unsigned int) || cnt > RAW_DATA_MAX_SIZE) + return -EINVAL; + + if (cnt > TRACE_BUF_SIZE) + cnt = TRACE_BUF_SIZE; + + BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); + + local_save_flags(irq_flags); + size = sizeof(*entry) + cnt; + if (cnt < FAULT_SIZE_ID) + size += FAULT_SIZE_ID - cnt; + + buffer = tr->trace_buffer.buffer; + event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size, + irq_flags, preempt_count()); + if (!event) + /* Ring buffer disabled, return as if not open for write */ + return -EBADF; + + entry = ring_buffer_event_data(event); + + len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); + if (len) { + entry->id = -1; + memcpy(&entry->buf, faulted, FAULTED_SIZE); + written = -EFAULT; + } else + written = cnt; + + __buffer_unlock_commit(buffer, event); + + if (written > 0) + *fpos += written; - out_unlock: - for (i = nr_pages - 1; i >= 0; i--) { - kunmap_atomic(map_page[i]); - put_page(pages[i]); - } - out: return written; } @@ -5946,6 +6197,13 @@ static const struct file_operations tracing_mark_fops = { .release = tracing_release_generic_tr, }; +static const struct file_operations tracing_mark_raw_fops = { + .open = tracing_open_generic_tr, + .write = tracing_mark_raw_write, + .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, +}; + static const struct file_operations trace_clock_fops = { .open = tracing_clock_open, .read = seq_read, @@ -7215,6 +7473,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("trace_marker", 0220, d_tracer, tr, &tracing_mark_fops); + trace_create_file("trace_marker_raw", 0220, d_tracer, + tr, &tracing_mark_raw_fops); + trace_create_file("trace_clock", 0644, d_tracer, tr, &trace_clock_fops); @@ -7752,6 +8013,8 @@ void __init trace_init(void) kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL); if (WARN_ON(!tracepoint_print_iter)) tracepoint_printk = 0; + else + static_key_enable(&tracepoint_printk_key.key); } tracer_alloc_buffers(); trace_event_init(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fd24b1f9ac43..c2234494f40c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -15,6 +15,7 @@ #include <linux/trace_events.h> #include <linux/compiler.h> #include <linux/trace_seq.h> +#include <linux/glob.h> #ifdef CONFIG_FTRACE_SYSCALLS #include <asm/unistd.h> /* For NR_SYSCALLS */ @@ -39,6 +40,7 @@ enum trace_type { TRACE_BLK, TRACE_BPUTS, TRACE_HWLAT, + TRACE_RAW_DATA, __TRACE_LAST_TYPE, }; @@ -330,6 +332,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \ + IF_ASSIGN(var, ent, struct raw_data_entry, TRACE_RAW_DATA);\ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ @@ -599,8 +602,8 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts); -void __buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event); +void trace_buffer_unlock_commit_nostack(struct ring_buffer *buffer, + struct ring_buffer_event *event); int trace_empty(struct trace_iterator *iter); @@ -843,6 +846,17 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr) return 0; } #endif /* CONFIG_DYNAMIC_FTRACE */ + +extern unsigned int fgraph_max_depth; + +static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace) +{ + /* trace it when it is-nested-in or is a function enabled. */ + return !(trace->depth || ftrace_graph_addr(trace->func)) || + (trace->depth < 0) || + (fgraph_max_depth && trace->depth >= fgraph_max_depth); +} + #else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t print_graph_function_flags(struct trace_iterator *iter, u32 flags) @@ -1257,6 +1271,7 @@ enum regex_type { MATCH_FRONT_ONLY, MATCH_MIDDLE_ONLY, MATCH_END_ONLY, + MATCH_GLOB, }; struct regex { diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 0f109c4130d3..e3b488825ae3 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -21,6 +21,8 @@ static u64 bm_stddev; static unsigned int bm_avg; static unsigned int bm_std; +static bool ok_to_run; + /* * This gets called in a loop recording the time it took to write * the tracepoint. What it writes is the time statistics of the last @@ -164,11 +166,21 @@ static int benchmark_event_kthread(void *arg) * When the benchmark tracepoint is enabled, it calls this * function and the thread that calls the tracepoint is created. */ -void trace_benchmark_reg(void) +int trace_benchmark_reg(void) { + if (!ok_to_run) { + pr_warning("trace benchmark cannot be started via kernel command line\n"); + return -EBUSY; + } + bm_event_thread = kthread_run(benchmark_event_kthread, NULL, "event_benchmark"); - WARN_ON(!bm_event_thread); + if (!bm_event_thread) { + pr_warning("trace benchmark failed to create kernel thread\n"); + return -ENOMEM; + } + + return 0; } /* @@ -182,6 +194,7 @@ void trace_benchmark_unreg(void) return; kthread_stop(bm_event_thread); + bm_event_thread = NULL; strcpy(bm_str, "START"); bm_total = 0; @@ -196,3 +209,12 @@ void trace_benchmark_unreg(void) bm_avg = 0; bm_stddev = 0; } + +static __init int ok_to_run_trace_benchmark(void) +{ + ok_to_run = true; + + return 0; +} + +early_initcall(ok_to_run_trace_benchmark); diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h index 3c1df1df4e29..ebdbfc2f2a64 100644 --- a/kernel/trace/trace_benchmark.h +++ b/kernel/trace/trace_benchmark.h @@ -6,7 +6,7 @@ #include <linux/tracepoint.h> -extern void trace_benchmark_reg(void); +extern int trace_benchmark_reg(void); extern void trace_benchmark_unreg(void); #define BENCHMARK_EVENT_STRLEN 128 diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 3a2a73716a5b..75489de546b6 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -81,7 +81,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->correct = val == expect; if (!call_filter_check_discard(call, entry, buffer, event)) - __buffer_unlock_commit(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); out: current->trace_recursion &= ~TRACE_BRANCH_BIT; diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index d1cc37e78f99..eb7396b7e7c3 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -244,6 +244,21 @@ FTRACE_ENTRY(print, print_entry, FILTER_OTHER ); +FTRACE_ENTRY(raw_data, raw_data_entry, + + TRACE_RAW_DATA, + + F_STRUCT( + __field( unsigned int, id ) + __dynamic_array( char, buf ) + ), + + F_printk("id:%04x %08x", + __entry->id, (int)__entry->buf[0]), + + FILTER_OTHER +); + FTRACE_ENTRY(bputs, bputs_entry, TRACE_BPUTS, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 03c0a48c3ac4..93116549a284 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -283,46 +283,6 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, } EXPORT_SYMBOL_GPL(trace_event_buffer_reserve); -static DEFINE_SPINLOCK(tracepoint_iter_lock); - -static void output_printk(struct trace_event_buffer *fbuffer) -{ - struct trace_event_call *event_call; - struct trace_event *event; - unsigned long flags; - struct trace_iterator *iter = tracepoint_print_iter; - - if (!iter) - return; - - event_call = fbuffer->trace_file->event_call; - if (!event_call || !event_call->event.funcs || - !event_call->event.funcs->trace) - return; - - event = &fbuffer->trace_file->event_call->event; - - spin_lock_irqsave(&tracepoint_iter_lock, flags); - trace_seq_init(&iter->seq); - iter->ent = fbuffer->entry; - event_call->event.funcs->trace(iter, 0, event); - trace_seq_putc(&iter->seq, 0); - printk("%s", iter->seq.buffer); - - spin_unlock_irqrestore(&tracepoint_iter_lock, flags); -} - -void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) -{ - if (tracepoint_printk) - output_printk(fbuffer); - - event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer, - fbuffer->event, fbuffer->entry, - fbuffer->flags, fbuffer->pc); -} -EXPORT_SYMBOL_GPL(trace_event_buffer_commit); - int trace_event_reg(struct trace_event_call *call, enum trace_reg type, void *data) { @@ -742,6 +702,7 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, struct trace_event_call *call; const char *name; int ret = -EINVAL; + int eret = 0; list_for_each_entry(file, &tr->events, list) { @@ -765,9 +726,17 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, if (event && strcmp(event, name) != 0) continue; - ftrace_event_enable_disable(file, set); + ret = ftrace_event_enable_disable(file, set); - ret = 0; + /* + * Save the first error and return that. Some events + * may still have been enabled, but let the user + * know that something went wrong. + */ + if (ret && !eret) + eret = ret; + + ret = eret; } return ret; @@ -2843,20 +2812,32 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) return -ENOMEM; } + entry = trace_create_file("enable", 0644, d_events, + tr, &ftrace_tr_enable_fops); + if (!entry) { + pr_warn("Could not create tracefs 'enable' entry\n"); + return -ENOMEM; + } + + /* There are not as crucial, just warn if they are not created */ + entry = tracefs_create_file("set_event_pid", 0644, parent, tr, &ftrace_set_event_pid_fops); + if (!entry) + pr_warn("Could not create tracefs 'set_event_pid' entry\n"); /* ring buffer internal formats */ - trace_create_file("header_page", 0444, d_events, - ring_buffer_print_page_header, - &ftrace_show_header_fops); - - trace_create_file("header_event", 0444, d_events, - ring_buffer_print_entry_header, - &ftrace_show_header_fops); + entry = trace_create_file("header_page", 0444, d_events, + ring_buffer_print_page_header, + &ftrace_show_header_fops); + if (!entry) + pr_warn("Could not create tracefs 'header_page' entry\n"); - trace_create_file("enable", 0644, d_events, - tr, &ftrace_tr_enable_fops); + entry = trace_create_file("header_event", 0444, d_events, + ring_buffer_print_entry_header, + &ftrace_show_header_fops); + if (!entry) + pr_warn("Could not create tracefs 'header_event' entry\n"); tr->event_dir = d_events; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 9daa9b3bc6d9..59a411ff60c7 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -108,12 +108,12 @@ static char *err_text[] = { }; struct opstack_op { - int op; + enum filter_op_ids op; struct list_head list; }; struct postfix_elt { - int op; + enum filter_op_ids op; char *operand; struct list_head list; }; @@ -145,34 +145,50 @@ struct pred_stack { /* If not of not match is equal to not of not, then it is a match */ #define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_##type(struct filter_pred *pred, void *event) \ +static int filter_pred_LT_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ - int match = 0; \ - \ - switch (pred->op) { \ - case OP_LT: \ - match = (*addr < val); \ - break; \ - case OP_LE: \ - match = (*addr <= val); \ - break; \ - case OP_GT: \ - match = (*addr > val); \ - break; \ - case OP_GE: \ - match = (*addr >= val); \ - break; \ - case OP_BAND: \ - match = (*addr & val); \ - break; \ - default: \ - break; \ - } \ - \ + int match = (*addr < val); \ return !!match == !pred->not; \ -} +} \ +static int filter_pred_LE_##type(struct filter_pred *pred, void *event) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = (*addr <= val); \ + return !!match == !pred->not; \ +} \ +static int filter_pred_GT_##type(struct filter_pred *pred, void *event) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = (*addr > val); \ + return !!match == !pred->not; \ +} \ +static int filter_pred_GE_##type(struct filter_pred *pred, void *event) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = (*addr >= val); \ + return !!match == !pred->not; \ +} \ +static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = !!(*addr & val); \ + return match == !pred->not; \ +} \ +static const filter_pred_fn_t pred_funcs_##type[] = { \ + filter_pred_LT_##type, \ + filter_pred_LE_##type, \ + filter_pred_GT_##type, \ + filter_pred_GE_##type, \ + filter_pred_BAND_##type, \ +}; + +#define PRED_FUNC_START OP_LT #define DEFINE_EQUALITY_PRED(size) \ static int filter_pred_##size(struct filter_pred *pred, void *event) \ @@ -344,6 +360,12 @@ static int regex_match_end(char *str, struct regex *r, int len) return 0; } +static int regex_match_glob(char *str, struct regex *r, int len __maybe_unused) +{ + if (glob_match(r->pattern, str)) + return 1; + return 0; +} /** * filter_parse_regex - parse a basic regex * @buff: the raw regex @@ -380,14 +402,20 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) if (!i) { *search = buff + 1; type = MATCH_END_ONLY; - } else { + } else if (i == len - 1) { if (type == MATCH_END_ONLY) type = MATCH_MIDDLE_ONLY; else type = MATCH_FRONT_ONLY; buff[i] = 0; break; + } else { /* pattern continues, use full glob */ + type = MATCH_GLOB; + break; } + } else if (strchr("[?\\", buff[i])) { + type = MATCH_GLOB; + break; } } @@ -420,6 +448,9 @@ static void filter_build_regex(struct filter_pred *pred) case MATCH_END_ONLY: r->match = regex_match_end; break; + case MATCH_GLOB: + r->match = regex_match_glob; + break; } pred->not ^= not; @@ -946,7 +977,7 @@ int filter_assign_type(const char *type) return FILTER_OTHER; } -static bool is_legal_op(struct ftrace_event_field *field, int op) +static bool is_legal_op(struct ftrace_event_field *field, enum filter_op_ids op) { if (is_string_field(field) && (op != OP_EQ && op != OP_NE && op != OP_GLOB)) @@ -957,8 +988,8 @@ static bool is_legal_op(struct ftrace_event_field *field, int op) return true; } -static filter_pred_fn_t select_comparison_fn(int op, int field_size, - int field_is_signed) +static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op, + int field_size, int field_is_signed) { filter_pred_fn_t fn = NULL; @@ -967,33 +998,33 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size, if (op == OP_EQ || op == OP_NE) fn = filter_pred_64; else if (field_is_signed) - fn = filter_pred_s64; + fn = pred_funcs_s64[op - PRED_FUNC_START]; else - fn = filter_pred_u64; + fn = pred_funcs_u64[op - PRED_FUNC_START]; break; case 4: if (op == OP_EQ || op == OP_NE) fn = filter_pred_32; else if (field_is_signed) - fn = filter_pred_s32; + fn = pred_funcs_s32[op - PRED_FUNC_START]; else - fn = filter_pred_u32; + fn = pred_funcs_u32[op - PRED_FUNC_START]; break; case 2: if (op == OP_EQ || op == OP_NE) fn = filter_pred_16; else if (field_is_signed) - fn = filter_pred_s16; + fn = pred_funcs_s16[op - PRED_FUNC_START]; else - fn = filter_pred_u16; + fn = pred_funcs_u16[op - PRED_FUNC_START]; break; case 1: if (op == OP_EQ || op == OP_NE) fn = filter_pred_8; else if (field_is_signed) - fn = filter_pred_s8; + fn = pred_funcs_s8[op - PRED_FUNC_START]; else - fn = filter_pred_u8; + fn = pred_funcs_u8[op - PRED_FUNC_START]; break; } @@ -1166,7 +1197,8 @@ static inline int append_operand_char(struct filter_parse_state *ps, char c) return 0; } -static int filter_opstack_push(struct filter_parse_state *ps, int op) +static int filter_opstack_push(struct filter_parse_state *ps, + enum filter_op_ids op) { struct opstack_op *opstack_op; @@ -1200,7 +1232,7 @@ static int filter_opstack_top(struct filter_parse_state *ps) static int filter_opstack_pop(struct filter_parse_state *ps) { struct opstack_op *opstack_op; - int op; + enum filter_op_ids op; if (filter_opstack_empty(ps)) return OP_NONE; @@ -1245,7 +1277,7 @@ static int postfix_append_operand(struct filter_parse_state *ps, char *operand) return 0; } -static int postfix_append_op(struct filter_parse_state *ps, int op) +static int postfix_append_op(struct filter_parse_state *ps, enum filter_op_ids op) { struct postfix_elt *elt; @@ -1275,8 +1307,8 @@ static void postfix_clear(struct filter_parse_state *ps) static int filter_parse(struct filter_parse_state *ps) { + enum filter_op_ids op, top_op; int in_string = 0; - int op, top_op; char ch; while ((ch = infix_next(ps))) { @@ -1367,7 +1399,8 @@ parse_operand: static struct filter_pred *create_pred(struct filter_parse_state *ps, struct trace_event_call *call, - int op, char *operand1, char *operand2) + enum filter_op_ids op, + char *operand1, char *operand2) { struct ftrace_event_field *field; static struct filter_pred pred; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4e480e870474..d56123cdcc89 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -65,7 +65,7 @@ struct fgraph_data { #define TRACE_GRAPH_INDENT 2 -static unsigned int max_depth; +unsigned int fgraph_max_depth; static struct tracer_opt trace_opts[] = { /* Display overruns? (for self-debug purpose) */ @@ -358,7 +358,7 @@ int __trace_graph_entry(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->graph_ent = *trace; if (!call_filter_check_discard(call, entry, buffer, event)) - __buffer_unlock_commit(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); return 1; } @@ -384,10 +384,10 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) if (!ftrace_trace_task(tr)) return 0; - /* trace it when it is-nested-in or is a function enabled. */ - if ((!(trace->depth || ftrace_graph_addr(trace->func)) || - ftrace_graph_ignore_irqs()) || (trace->depth < 0) || - (max_depth && trace->depth >= max_depth)) + if (ftrace_graph_ignore_func(trace)) + return 0; + + if (ftrace_graph_ignore_irqs()) return 0; /* @@ -469,7 +469,7 @@ void __trace_graph_return(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->ret = *trace; if (!call_filter_check_discard(call, entry, buffer, event)) - __buffer_unlock_commit(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } void trace_graph_return(struct ftrace_graph_ret *trace) @@ -842,6 +842,10 @@ print_graph_entry_leaf(struct trace_iterator *iter, cpu_data = per_cpu_ptr(data->cpu_data, cpu); + /* If a graph tracer ignored set_graph_notrace */ + if (call->depth < -1) + call->depth += FTRACE_NOTRACE_DEPTH; + /* * Comments display at + 1 to depth. Since * this is a leaf function, keep the comments @@ -850,7 +854,8 @@ print_graph_entry_leaf(struct trace_iterator *iter, cpu_data->depth = call->depth - 1; /* No need to keep this function around for this depth */ - if (call->depth < FTRACE_RETFUNC_DEPTH) + if (call->depth < FTRACE_RETFUNC_DEPTH && + !WARN_ON_ONCE(call->depth < 0)) cpu_data->enter_funcs[call->depth] = 0; } @@ -880,11 +885,16 @@ print_graph_entry_nested(struct trace_iterator *iter, struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; + /* If a graph tracer ignored set_graph_notrace */ + if (call->depth < -1) + call->depth += FTRACE_NOTRACE_DEPTH; + cpu_data = per_cpu_ptr(data->cpu_data, cpu); cpu_data->depth = call->depth; /* Save this function pointer to see if the exit matches */ - if (call->depth < FTRACE_RETFUNC_DEPTH) + if (call->depth < FTRACE_RETFUNC_DEPTH && + !WARN_ON_ONCE(call->depth < 0)) cpu_data->enter_funcs[call->depth] = call->func; } @@ -1114,7 +1124,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, */ cpu_data->depth = trace->depth - 1; - if (trace->depth < FTRACE_RETFUNC_DEPTH) { + if (trace->depth < FTRACE_RETFUNC_DEPTH && + !WARN_ON_ONCE(trace->depth < 0)) { if (cpu_data->enter_funcs[trace->depth] != trace->func) func_match = 0; cpu_data->enter_funcs[trace->depth] = 0; @@ -1489,7 +1500,7 @@ graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret) return ret; - max_depth = val; + fgraph_max_depth = val; *ppos += cnt; @@ -1503,7 +1514,7 @@ graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt, char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/ int n; - n = sprintf(buf, "%d\n", max_depth); + n = sprintf(buf, "%d\n", fgraph_max_depth); return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); } diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index b97286c48735..775569ec50d0 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -127,7 +127,7 @@ static void trace_hwlat_sample(struct hwlat_sample *sample) entry->nmi_count = sample->nmi_count; if (!call_filter_check_discard(call, entry, buffer, event)) - __buffer_unlock_commit(buffer, event); + trace_buffer_unlock_commit_nostack(buffer, event); } /* Macros to encapsulate the time capturing infrastructure */ diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 03cdff84d026..86654d7e1afe 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -175,6 +175,18 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) int ret; int pc; + if (ftrace_graph_ignore_func(trace)) + return 0; + /* + * Do not trace a function if it's filtered by set_graph_notrace. + * Make the index of ret stack negative to indicate that it should + * ignore further functions. But it needs its own ret stack entry + * to recover the original index in order to continue tracing after + * returning from the function. + */ + if (ftrace_graph_notrace_addr(trace->func)) + return 1; + if (!func_prolog_dec(tr, &data, &flags)) return 0; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index eb6c9f1d3a93..a133ecd741e4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -73,6 +73,17 @@ static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk) return !!strchr(trace_kprobe_symbol(tk), ':'); } +static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) +{ + unsigned long nhit = 0; + int cpu; + + for_each_possible_cpu(cpu) + nhit += *per_cpu_ptr(tk->nhit, cpu); + + return nhit; +} + static int register_kprobe_event(struct trace_kprobe *tk); static int unregister_kprobe_event(struct trace_kprobe *tk); @@ -882,14 +893,10 @@ static const struct file_operations kprobe_events_ops = { static int probes_profile_seq_show(struct seq_file *m, void *v) { struct trace_kprobe *tk = v; - unsigned long nhit = 0; - int cpu; - - for_each_possible_cpu(cpu) - nhit += *per_cpu_ptr(tk->nhit, cpu); seq_printf(m, " %-44s %15lu %15lu\n", - trace_event_name(&tk->tp.call), nhit, + trace_event_name(&tk->tp.call), + trace_kprobe_nhit(tk), tk->rp.kp.nmissed); return 0; @@ -1354,18 +1361,18 @@ fs_initcall(init_kprobe_trace); #ifdef CONFIG_FTRACE_STARTUP_TEST - /* * The "__used" keeps gcc from removing the function symbol - * from the kallsyms table. + * from the kallsyms table. 'noinline' makes sure that there + * isn't an inlined version used by the test method below */ -static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, - int a4, int a5, int a6) +static __used __init noinline int +kprobe_trace_selftest_target(int a1, int a2, int a3, int a4, int a5, int a6) { return a1 + a2 + a3 + a4 + a5 + a6; } -static struct trace_event_file * +static struct __init trace_event_file * find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr) { struct trace_event_file *file; @@ -1443,12 +1450,25 @@ static __init int kprobe_trace_self_tests_init(void) ret = target(1, 2, 3, 4, 5, 6); + /* + * Not expecting an error here, the check is only to prevent the + * optimizer from removing the call to target() as otherwise there + * are no side-effects and the call is never performed. + */ + if (ret != 21) + warn++; + /* Disable trace points before removing it */ tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); if (WARN_ON_ONCE(tk == NULL)) { pr_warn("error on getting test probe.\n"); warn++; } else { + if (trace_kprobe_nhit(tk) != 1) { + pr_warn("incorrect number of testprobe hits\n"); + warn++; + } + file = find_trace_probe_file(tk, top_trace_array()); if (WARN_ON_ONCE(file == NULL)) { pr_warn("error on getting probe file.\n"); @@ -1462,6 +1482,11 @@ static __init int kprobe_trace_self_tests_init(void) pr_warn("error on getting 2nd test probe.\n"); warn++; } else { + if (trace_kprobe_nhit(tk) != 1) { + pr_warn("incorrect number of testprobe2 hits\n"); + warn++; + } + file = find_trace_probe_file(tk, top_trace_array()); if (WARN_ON_ONCE(file == NULL)) { pr_warn("error on getting probe file.\n"); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 3fc20422c166..5d33a7352919 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1288,6 +1288,35 @@ static struct trace_event trace_print_event = { .funcs = &trace_print_funcs, }; +static enum print_line_t trace_raw_data(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct raw_data_entry *field; + int i; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(&iter->seq, "# %x buf:", field->id); + + for (i = 0; i < iter->ent_size - offsetof(struct raw_data_entry, buf); i++) + trace_seq_printf(&iter->seq, " %02x", + (unsigned char)field->buf[i]); + + trace_seq_putc(&iter->seq, '\n'); + + return trace_handle_return(&iter->seq); +} + +static struct trace_event_functions trace_raw_data_funcs = { + .trace = trace_raw_data, + .raw = trace_raw_data, +}; + +static struct trace_event trace_raw_data_event = { + .type = TRACE_RAW_DATA, + .funcs = &trace_raw_data_funcs, +}; + static struct trace_event *events[] __initdata = { &trace_fn_event, @@ -1299,6 +1328,7 @@ static struct trace_event *events[] __initdata = { &trace_bprint_event, &trace_print_event, &trace_hwlat_event, + &trace_raw_data_event, NULL }; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 9d4399b553a3..5d0bb025bb21 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -239,6 +239,18 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace) unsigned long flags; int pc, ret = 0; + if (ftrace_graph_ignore_func(trace)) + return 0; + /* + * Do not trace a function if it's filtered by set_graph_notrace. + * Make the index of ret stack negative to indicate that it should + * ignore further functions. But it needs its own ret stack entry + * to recover the original index in order to continue tracing after + * returning from the function. + */ + if (ftrace_graph_notrace_addr(trace->func)) + return 1; + if (!func_prolog_preempt_disable(tr, &data, &pc)) return 0; @@ -790,6 +802,7 @@ static struct tracer wakeup_dl_tracer __read_mostly = #endif .open = wakeup_trace_open, .close = wakeup_trace_close, + .allow_instances = true, .use_max_tr = true, }; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index d0639d917899..1f9a31f934a4 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -194,9 +194,13 @@ static int tracepoint_add_func(struct tracepoint *tp, struct tracepoint_func *func, int prio) { struct tracepoint_func *old, *tp_funcs; + int ret; - if (tp->regfunc && !static_key_enabled(&tp->key)) - tp->regfunc(); + if (tp->regfunc && !static_key_enabled(&tp->key)) { + ret = tp->regfunc(); + if (ret < 0) + return ret; + } tp_funcs = rcu_dereference_protected(tp->funcs, lockdep_is_held(&tracepoints_mutex)); @@ -529,7 +533,7 @@ EXPORT_SYMBOL_GPL(for_each_kernel_tracepoint); /* NB: reg/unreg are called while guarded with the tracepoints_mutex */ static int sys_tracepoint_refcount; -void syscall_regfunc(void) +int syscall_regfunc(void) { struct task_struct *p, *t; @@ -541,6 +545,8 @@ void syscall_regfunc(void) read_unlock(&tasklist_lock); } sys_tracepoint_refcount++; + + return 0; } void syscall_unregfunc(void) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 9acb29f280ec..d4b0fa01cae3 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -24,32 +24,14 @@ #include <asm/irq_regs.h> #include <linux/kvm_para.h> -#include <linux/perf_event.h> #include <linux/kthread.h> -/* - * The run state of the lockup detectors is controlled by the content of the - * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - - * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. - * - * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' - * are variables that are only used as an 'interface' between the parameters - * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The - * 'watchdog_thresh' variable is handled differently because its value is not - * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' - * is equal zero. - */ -#define NMI_WATCHDOG_ENABLED_BIT 0 -#define SOFT_WATCHDOG_ENABLED_BIT 1 -#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) -#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) - static DEFINE_MUTEX(watchdog_proc_mutex); -#ifdef CONFIG_HARDLOCKUP_DETECTOR -static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; +#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) +unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; #else -static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; +unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; #endif int __read_mostly nmi_watchdog_enabled; int __read_mostly soft_watchdog_enabled; @@ -59,9 +41,6 @@ int __read_mostly watchdog_thresh = 10; #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; int __read_mostly sysctl_hardlockup_all_cpu_backtrace; -#else -#define sysctl_softlockup_all_cpu_backtrace 0 -#define sysctl_hardlockup_all_cpu_backtrace 0 #endif static struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -100,50 +79,9 @@ static DEFINE_PER_CPU(bool, soft_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); -#ifdef CONFIG_HARDLOCKUP_DETECTOR -static DEFINE_PER_CPU(bool, hard_watchdog_warn); -static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); -static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); -#endif static unsigned long soft_lockup_nmi_warn; -/* boot commands */ -/* - * Should we panic when a soft-lockup or hard-lockup occurs: - */ -#ifdef CONFIG_HARDLOCKUP_DETECTOR -unsigned int __read_mostly hardlockup_panic = - CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; -static unsigned long hardlockup_allcpu_dumped; -/* - * We may not want to enable hard lockup detection by default in all cases, - * for example when running the kernel as a guest on a hypervisor. In these - * cases this function can be called to disable hard lockup detection. This - * function should only be executed once by the boot processor before the - * kernel command line parameters are parsed, because otherwise it is not - * possible to override this in hardlockup_panic_setup(). - */ -void hardlockup_detector_disable(void) -{ - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; -} - -static int __init hardlockup_panic_setup(char *str) -{ - if (!strncmp(str, "panic", 5)) - hardlockup_panic = 1; - else if (!strncmp(str, "nopanic", 7)) - hardlockup_panic = 0; - else if (!strncmp(str, "0", 1)) - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; - else if (!strncmp(str, "1", 1)) - watchdog_enabled |= NMI_WATCHDOG_ENABLED; - return 1; -} -__setup("nmi_watchdog=", hardlockup_panic_setup); -#endif - unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; @@ -264,32 +202,14 @@ void touch_all_softlockup_watchdogs(void) wq_watchdog_touch(-1); } -#ifdef CONFIG_HARDLOCKUP_DETECTOR -void touch_nmi_watchdog(void) -{ - /* - * Using __raw here because some code paths have - * preemption enabled. If preemption is enabled - * then interrupts should be enabled too, in which - * case we shouldn't have to worry about the watchdog - * going off. - */ - raw_cpu_write(watchdog_nmi_touch, true); - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL(touch_nmi_watchdog); - -#endif - void touch_softlockup_watchdog_sync(void) { __this_cpu_write(softlockup_touch_sync, true); __this_cpu_write(watchdog_touch_ts, 0); } -#ifdef CONFIG_HARDLOCKUP_DETECTOR /* watchdog detector functions */ -static bool is_hardlockup(void) +bool is_hardlockup(void) { unsigned long hrint = __this_cpu_read(hrtimer_interrupts); @@ -299,7 +219,6 @@ static bool is_hardlockup(void) __this_cpu_write(hrtimer_interrupts_saved, hrint); return false; } -#endif static int is_softlockup(unsigned long touch_ts) { @@ -313,78 +232,22 @@ static int is_softlockup(unsigned long touch_ts) return 0; } -#ifdef CONFIG_HARDLOCKUP_DETECTOR - -static struct perf_event_attr wd_hw_attr = { - .type = PERF_TYPE_HARDWARE, - .config = PERF_COUNT_HW_CPU_CYCLES, - .size = sizeof(struct perf_event_attr), - .pinned = 1, - .disabled = 1, -}; - -/* Callback function for perf event subsystem */ -static void watchdog_overflow_callback(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - /* Ensure the watchdog never gets throttled */ - event->hw.interrupts = 0; - - if (__this_cpu_read(watchdog_nmi_touch) == true) { - __this_cpu_write(watchdog_nmi_touch, false); - return; - } - - /* check for a hardlockup - * This is done by making sure our timer interrupt - * is incrementing. The timer interrupt should have - * fired multiple times before we overflow'd. If it hasn't - * then this is a good indication the cpu is stuck - */ - if (is_hardlockup()) { - int this_cpu = smp_processor_id(); - struct pt_regs *regs = get_irq_regs(); - - /* only print hardlockups once */ - if (__this_cpu_read(hard_watchdog_warn) == true) - return; - - pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); - print_modules(); - print_irqtrace_events(current); - if (regs) - show_regs(regs); - else - dump_stack(); - - /* - * Perform all-CPU dump only once to avoid multiple hardlockups - * generating interleaving traces - */ - if (sysctl_hardlockup_all_cpu_backtrace && - !test_and_set_bit(0, &hardlockup_allcpu_dumped)) - trigger_allbutself_cpu_backtrace(); - - if (hardlockup_panic) - nmi_panic(regs, "Hard LOCKUP"); - - __this_cpu_write(hard_watchdog_warn, true); - return; - } - - __this_cpu_write(hard_watchdog_warn, false); - return; -} -#endif /* CONFIG_HARDLOCKUP_DETECTOR */ - static void watchdog_interrupt_count(void) { __this_cpu_inc(hrtimer_interrupts); } -static int watchdog_nmi_enable(unsigned int cpu); -static void watchdog_nmi_disable(unsigned int cpu); +/* + * These two functions are mostly architecture specific + * defining them as weak here. + */ +int __weak watchdog_nmi_enable(unsigned int cpu) +{ + return 0; +} +void __weak watchdog_nmi_disable(unsigned int cpu) +{ +} static int watchdog_enable_all_cpus(void); static void watchdog_disable_all_cpus(void); @@ -577,109 +440,6 @@ static void watchdog(unsigned int cpu) watchdog_nmi_disable(cpu); } -#ifdef CONFIG_HARDLOCKUP_DETECTOR -/* - * People like the simple clean cpu node info on boot. - * Reduce the watchdog noise by only printing messages - * that are different from what cpu0 displayed. - */ -static unsigned long cpu0_err; - -static int watchdog_nmi_enable(unsigned int cpu) -{ - struct perf_event_attr *wd_attr; - struct perf_event *event = per_cpu(watchdog_ev, cpu); - - /* nothing to do if the hard lockup detector is disabled */ - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - goto out; - - /* is it already setup and enabled? */ - if (event && event->state > PERF_EVENT_STATE_OFF) - goto out; - - /* it is setup but not enabled */ - if (event != NULL) - goto out_enable; - - wd_attr = &wd_hw_attr; - wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); - - /* Try to register using hardware perf events */ - event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); - - /* save cpu0 error for future comparision */ - if (cpu == 0 && IS_ERR(event)) - cpu0_err = PTR_ERR(event); - - if (!IS_ERR(event)) { - /* only print for cpu0 or different than cpu0 */ - if (cpu == 0 || cpu0_err) - pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); - goto out_save; - } - - /* - * Disable the hard lockup detector if _any_ CPU fails to set up - * set up the hardware perf event. The watchdog() function checks - * the NMI_WATCHDOG_ENABLED bit periodically. - * - * The barriers are for syncing up watchdog_enabled across all the - * cpus, as clear_bit() does not use barriers. - */ - smp_mb__before_atomic(); - clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); - smp_mb__after_atomic(); - - /* skip displaying the same error again */ - if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) - return PTR_ERR(event); - - /* vary the KERN level based on the returned errno */ - if (PTR_ERR(event) == -EOPNOTSUPP) - pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); - else if (PTR_ERR(event) == -ENOENT) - pr_warn("disabled (cpu%i): hardware events not enabled\n", - cpu); - else - pr_err("disabled (cpu%i): unable to create perf event: %ld\n", - cpu, PTR_ERR(event)); - - pr_info("Shutting down hard lockup detector on all cpus\n"); - - return PTR_ERR(event); - - /* success path */ -out_save: - per_cpu(watchdog_ev, cpu) = event; -out_enable: - perf_event_enable(per_cpu(watchdog_ev, cpu)); -out: - return 0; -} - -static void watchdog_nmi_disable(unsigned int cpu) -{ - struct perf_event *event = per_cpu(watchdog_ev, cpu); - - if (event) { - perf_event_disable(event); - per_cpu(watchdog_ev, cpu) = NULL; - - /* should be in cleanup, but blocks oprofile */ - perf_event_release_kernel(event); - } - if (cpu == 0) { - /* watchdog_nmi_enable() expects this to be zero initially. */ - cpu0_err = 0; - } -} - -#else -static int watchdog_nmi_enable(unsigned int cpu) { return 0; } -static void watchdog_nmi_disable(unsigned int cpu) { return; } -#endif /* CONFIG_HARDLOCKUP_DETECTOR */ - static struct smp_hotplug_thread watchdog_threads = { .store = &softlockup_watchdog, .thread_should_run = watchdog_should_run, diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c new file mode 100644 index 000000000000..84016c8aee6b --- /dev/null +++ b/kernel/watchdog_hld.c @@ -0,0 +1,227 @@ +/* + * Detect hard lockups on a system + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * Note: Most of this code is borrowed heavily from the original softlockup + * detector, so thanks to Ingo for the initial implementation. + * Some chunks also taken from the old x86-specific nmi watchdog code, thanks + * to those contributors as well. + */ + +#define pr_fmt(fmt) "NMI watchdog: " fmt + +#include <linux/nmi.h> +#include <linux/module.h> +#include <asm/irq_regs.h> +#include <linux/perf_event.h> + +static DEFINE_PER_CPU(bool, hard_watchdog_warn); +static DEFINE_PER_CPU(bool, watchdog_nmi_touch); +static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); + +/* boot commands */ +/* + * Should we panic when a soft-lockup or hard-lockup occurs: + */ +unsigned int __read_mostly hardlockup_panic = + CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; +static unsigned long hardlockup_allcpu_dumped; +/* + * We may not want to enable hard lockup detection by default in all cases, + * for example when running the kernel as a guest on a hypervisor. In these + * cases this function can be called to disable hard lockup detection. This + * function should only be executed once by the boot processor before the + * kernel command line parameters are parsed, because otherwise it is not + * possible to override this in hardlockup_panic_setup(). + */ +void hardlockup_detector_disable(void) +{ + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; +} + +static int __init hardlockup_panic_setup(char *str) +{ + if (!strncmp(str, "panic", 5)) + hardlockup_panic = 1; + else if (!strncmp(str, "nopanic", 7)) + hardlockup_panic = 0; + else if (!strncmp(str, "0", 1)) + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + else if (!strncmp(str, "1", 1)) + watchdog_enabled |= NMI_WATCHDOG_ENABLED; + return 1; +} +__setup("nmi_watchdog=", hardlockup_panic_setup); + +void touch_nmi_watchdog(void) +{ + /* + * Using __raw here because some code paths have + * preemption enabled. If preemption is enabled + * then interrupts should be enabled too, in which + * case we shouldn't have to worry about the watchdog + * going off. + */ + raw_cpu_write(watchdog_nmi_touch, true); + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL(touch_nmi_watchdog); + +static struct perf_event_attr wd_hw_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, +}; + +/* Callback function for perf event subsystem */ +static void watchdog_overflow_callback(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + /* Ensure the watchdog never gets throttled */ + event->hw.interrupts = 0; + + if (__this_cpu_read(watchdog_nmi_touch) == true) { + __this_cpu_write(watchdog_nmi_touch, false); + return; + } + + /* check for a hardlockup + * This is done by making sure our timer interrupt + * is incrementing. The timer interrupt should have + * fired multiple times before we overflow'd. If it hasn't + * then this is a good indication the cpu is stuck + */ + if (is_hardlockup()) { + int this_cpu = smp_processor_id(); + + /* only print hardlockups once */ + if (__this_cpu_read(hard_watchdog_warn) == true) + return; + + pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + /* + * Perform all-CPU dump only once to avoid multiple hardlockups + * generating interleaving traces + */ + if (sysctl_hardlockup_all_cpu_backtrace && + !test_and_set_bit(0, &hardlockup_allcpu_dumped)) + trigger_allbutself_cpu_backtrace(); + + if (hardlockup_panic) + nmi_panic(regs, "Hard LOCKUP"); + + __this_cpu_write(hard_watchdog_warn, true); + return; + } + + __this_cpu_write(hard_watchdog_warn, false); + return; +} + +/* + * People like the simple clean cpu node info on boot. + * Reduce the watchdog noise by only printing messages + * that are different from what cpu0 displayed. + */ +static unsigned long cpu0_err; + +int watchdog_nmi_enable(unsigned int cpu) +{ + struct perf_event_attr *wd_attr; + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + /* nothing to do if the hard lockup detector is disabled */ + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + goto out; + + /* is it already setup and enabled? */ + if (event && event->state > PERF_EVENT_STATE_OFF) + goto out; + + /* it is setup but not enabled */ + if (event != NULL) + goto out_enable; + + wd_attr = &wd_hw_attr; + wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); + + /* Try to register using hardware perf events */ + event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); + + /* save cpu0 error for future comparision */ + if (cpu == 0 && IS_ERR(event)) + cpu0_err = PTR_ERR(event); + + if (!IS_ERR(event)) { + /* only print for cpu0 or different than cpu0 */ + if (cpu == 0 || cpu0_err) + pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); + goto out_save; + } + + /* + * Disable the hard lockup detector if _any_ CPU fails to set up + * set up the hardware perf event. The watchdog() function checks + * the NMI_WATCHDOG_ENABLED bit periodically. + * + * The barriers are for syncing up watchdog_enabled across all the + * cpus, as clear_bit() does not use barriers. + */ + smp_mb__before_atomic(); + clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); + smp_mb__after_atomic(); + + /* skip displaying the same error again */ + if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) + return PTR_ERR(event); + + /* vary the KERN level based on the returned errno */ + if (PTR_ERR(event) == -EOPNOTSUPP) + pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); + else if (PTR_ERR(event) == -ENOENT) + pr_warn("disabled (cpu%i): hardware events not enabled\n", + cpu); + else + pr_err("disabled (cpu%i): unable to create perf event: %ld\n", + cpu, PTR_ERR(event)); + + pr_info("Shutting down hard lockup detector on all cpus\n"); + + return PTR_ERR(event); + + /* success path */ +out_save: + per_cpu(watchdog_ev, cpu) = event; +out_enable: + perf_event_enable(per_cpu(watchdog_ev, cpu)); +out: + return 0; +} + +void watchdog_nmi_disable(unsigned int cpu) +{ + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) { + perf_event_disable(event); + per_cpu(watchdog_ev, cpu) = NULL; + + /* should be in cleanup, but blocks oprofile */ + perf_event_release_kernel(event); + } + if (cpu == 0) { + /* watchdog_nmi_enable() expects this to be zero initially. */ + cpu0_err = 0; + } +} diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 479d840db286..1d9fb6543a66 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -290,6 +290,8 @@ module_param_named(disable_numa, wq_disable_numa, bool, 0444); static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT); module_param_named(power_efficient, wq_power_efficient, bool, 0444); +static bool wq_online; /* can kworkers be created yet? */ + static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ @@ -2583,6 +2585,9 @@ void flush_workqueue(struct workqueue_struct *wq) }; int next_color; + if (WARN_ON(!wq_online)) + return; + lock_map_acquire(&wq->lockdep_map); lock_map_release(&wq->lockdep_map); @@ -2843,6 +2848,9 @@ bool flush_work(struct work_struct *work) { struct wq_barrier barr; + if (WARN_ON(!wq_online)) + return false; + lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); @@ -2913,7 +2921,13 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) mark_work_canceling(work); local_irq_restore(flags); - flush_work(work); + /* + * This allows canceling during early boot. We know that @work + * isn't executing. + */ + if (wq_online) + flush_work(work); + clear_work_data(work); /* @@ -3364,7 +3378,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) goto fail; /* create and start the initial worker */ - if (!create_worker(pool)) + if (wq_online && !create_worker(pool)) goto fail; /* install */ @@ -3429,6 +3443,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) { struct workqueue_struct *wq = pwq->wq; bool freezable = wq->flags & WQ_FREEZABLE; + unsigned long flags; /* for @wq->saved_max_active */ lockdep_assert_held(&wq->mutex); @@ -3437,7 +3452,8 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) if (!freezable && pwq->max_active == wq->saved_max_active) return; - spin_lock_irq(&pwq->pool->lock); + /* this function can be called during early boot w/ irq disabled */ + spin_lock_irqsave(&pwq->pool->lock, flags); /* * During [un]freezing, the caller is responsible for ensuring that @@ -3460,7 +3476,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) pwq->max_active = 0; } - spin_unlock_irq(&pwq->pool->lock); + spin_unlock_irqrestore(&pwq->pool->lock, flags); } /* initialize newly alloced @pwq which is associated with @wq and @pool */ @@ -4033,6 +4049,7 @@ void destroy_workqueue(struct workqueue_struct *wq) for (i = 0; i < WORK_NR_COLORS; i++) { if (WARN_ON(pwq->nr_in_flight[i])) { mutex_unlock(&wq->mutex); + show_workqueue_state(); return; } } @@ -4041,6 +4058,7 @@ void destroy_workqueue(struct workqueue_struct *wq) WARN_ON(pwq->nr_active) || WARN_ON(!list_empty(&pwq->delayed_works))) { mutex_unlock(&wq->mutex); + show_workqueue_state(); return; } } @@ -5467,7 +5485,17 @@ static void __init wq_numa_init(void) wq_numa_enabled = true; } -static int __init init_workqueues(void) +/** + * workqueue_init_early - early init for workqueue subsystem + * + * This is the first half of two-staged workqueue subsystem initialization + * and invoked as soon as the bare basics - memory allocation, cpumasks and + * idr are up. It sets up all the data structures and system workqueues + * and allows early boot code to create workqueues and queue/cancel work + * items. Actual work item execution starts only after kthreads can be + * created and scheduled right before early initcalls. + */ +int __init workqueue_init_early(void) { int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; int i, cpu; @@ -5479,8 +5507,6 @@ static int __init init_workqueues(void) pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); - wq_numa_init(); - /* initialize CPU pools */ for_each_possible_cpu(cpu) { struct worker_pool *pool; @@ -5500,16 +5526,6 @@ static int __init init_workqueues(void) } } - /* create the initial worker */ - for_each_online_cpu(cpu) { - struct worker_pool *pool; - - for_each_cpu_worker_pool(pool, cpu) { - pool->flags &= ~POOL_DISASSOCIATED; - BUG_ON(!create_worker(pool)); - } - } - /* create default unbound and ordered wq attrs */ for (i = 0; i < NR_STD_WORKER_POOLS; i++) { struct workqueue_attrs *attrs; @@ -5546,8 +5562,59 @@ static int __init init_workqueues(void) !system_power_efficient_wq || !system_freezable_power_efficient_wq); + return 0; +} + +/** + * workqueue_init - bring workqueue subsystem fully online + * + * This is the latter half of two-staged workqueue subsystem initialization + * and invoked as soon as kthreads can be created and scheduled. + * Workqueues have been created and work items queued on them, but there + * are no kworkers executing the work items yet. Populate the worker pools + * with the initial workers and enable future kworker creations. + */ +int __init workqueue_init(void) +{ + struct workqueue_struct *wq; + struct worker_pool *pool; + int cpu, bkt; + + /* + * It'd be simpler to initialize NUMA in workqueue_init_early() but + * CPU to node mapping may not be available that early on some + * archs such as power and arm64. As per-cpu pools created + * previously could be missing node hint and unbound pools NUMA + * affinity, fix them up. + */ + wq_numa_init(); + + mutex_lock(&wq_pool_mutex); + + for_each_possible_cpu(cpu) { + for_each_cpu_worker_pool(pool, cpu) { + pool->node = cpu_to_node(cpu); + } + } + + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, smp_processor_id(), true); + + mutex_unlock(&wq_pool_mutex); + + /* create the initial workers */ + for_each_online_cpu(cpu) { + for_each_cpu_worker_pool(pool, cpu) { + pool->flags &= ~POOL_DISASSOCIATED; + BUG_ON(!create_worker(pool)); + } + } + + hash_for_each(unbound_pool_hash, bkt, pool, hash_node) + BUG_ON(!create_worker(pool)); + + wq_online = true; wq_watchdog_init(); return 0; } -early_initcall(init_workqueues); |