148 files changed, 14354 insertions, 5256 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 670665c6e2a6..e7478cb58079 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1099,8 +1099,7 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature
 
 	if (audit_enabled == AUDIT_OFF)
 		return;
-
-	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE);
+	ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_FEATURE_CHANGE);
 	if (!ab)
 		return;
 	audit_log_task_info(ab, current);
@@ -2317,8 +2316,7 @@ void audit_log_link_denied(const char *operation)
 		return;
 
 	/* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */
-	ab = audit_log_start(current->audit_context, GFP_KERNEL,
-			     AUDIT_ANOM_LINK);
+	ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_ANOM_LINK);
 	if (!ab)
 		return;
 	audit_log_format(ab, "op=%s", operation);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 9eb8b3511636..f1ba88994508 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -274,7 +274,7 @@ static void audit_update_watch(struct audit_parent *parent,
 		/* If the update involves invalidating rules, do the inode-based
 		 * filtering now, so we don't omit records. */
 		if (invalidating && !audit_dummy_context())
-			audit_filter_inodes(current, current->audit_context);
+			audit_filter_inodes(current, audit_context());
 
 		/* updating ino will likely change which audit_hash_list we
 		 * are on so we need a new watch for the new list */
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index d7a807e81451..eaa320148d97 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -426,7 +426,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
 			return -EINVAL;
 		break;
 	case AUDIT_EXE:
-		if (f->op != Audit_equal)
+		if (f->op != Audit_not_equal && f->op != Audit_equal)
 			return -EINVAL;
 		if (entry->rule.listnr != AUDIT_FILTER_EXIT)
 			return -EINVAL;
@@ -1089,8 +1089,6 @@ static void audit_list_rules(int seq, struct sk_buff_head *q)
 static void audit_log_rule_change(char *action, struct audit_krule *rule, int res)
 {
 	struct audit_buffer *ab;
-	uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current));
-	unsigned int sessionid = audit_get_sessionid(current);
 
 	if (!audit_enabled)
 		return;
@@ -1098,7 +1096,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
 	if (!ab)
 		return;
-	audit_log_format(ab, "auid=%u ses=%u" ,loginuid, sessionid);
+	audit_log_session_info(ab);
 	audit_log_task_context(ab);
 	audit_log_format(ab, " op=%s", action);
 	audit_log_key(ab, rule->filterkey);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4e0a4ac803db..ceb1c4596c51 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -374,7 +374,7 @@ static int audit_field_compare(struct task_struct *tsk,
 	case AUDIT_COMPARE_EGID_TO_OBJ_GID:
 		return audit_compare_gid(cred->egid, name, f, ctx);
 	case AUDIT_COMPARE_AUID_TO_OBJ_UID:
-		return audit_compare_uid(tsk->loginuid, name, f, ctx);
+		return audit_compare_uid(audit_get_loginuid(tsk), name, f, ctx);
 	case AUDIT_COMPARE_SUID_TO_OBJ_UID:
 		return audit_compare_uid(cred->suid, name, f, ctx);
 	case AUDIT_COMPARE_SGID_TO_OBJ_GID:
@@ -385,7 +385,8 @@ static int audit_field_compare(struct task_struct *tsk,
 		return audit_compare_gid(cred->fsgid, name, f, ctx);
 	/* uid comparisons */
 	case AUDIT_COMPARE_UID_TO_AUID:
-		return audit_uid_comparator(cred->uid, f->op, tsk->loginuid);
+		return audit_uid_comparator(cred->uid, f->op,
+					    audit_get_loginuid(tsk));
 	case AUDIT_COMPARE_UID_TO_EUID:
 		return audit_uid_comparator(cred->uid, f->op, cred->euid);
 	case AUDIT_COMPARE_UID_TO_SUID:
@@ -394,11 +395,14 @@ static int audit_field_compare(struct task_struct *tsk,
 		return audit_uid_comparator(cred->uid, f->op, cred->fsuid);
 	/* auid comparisons */
 	case AUDIT_COMPARE_AUID_TO_EUID:
-		return audit_uid_comparator(tsk->loginuid, f->op, cred->euid);
+		return audit_uid_comparator(audit_get_loginuid(tsk), f->op,
+					    cred->euid);
 	case AUDIT_COMPARE_AUID_TO_SUID:
-		return audit_uid_comparator(tsk->loginuid, f->op, cred->suid);
+		return audit_uid_comparator(audit_get_loginuid(tsk), f->op,
+					    cred->suid);
 	case AUDIT_COMPARE_AUID_TO_FSUID:
-		return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid);
+		return audit_uid_comparator(audit_get_loginuid(tsk), f->op,
+					    cred->fsuid);
 	/* euid comparisons */
 	case AUDIT_COMPARE_EUID_TO_SUID:
 		return audit_uid_comparator(cred->euid, f->op, cred->suid);
@@ -471,6 +475,8 @@ static int audit_filter_rules(struct task_struct *tsk,
 			break;
 		case AUDIT_EXE:
 			result = audit_exe_compare(tsk, rule->exe);
+			if (f->op == Audit_not_equal)
+				result = !result;
 			break;
 		case AUDIT_UID:
 			result = audit_uid_comparator(cred->uid, f->op, f->uid);
@@ -511,7 +517,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 			result = audit_gid_comparator(cred->fsgid, f->op, f->gid);
 			break;
 		case AUDIT_SESSIONID:
-			sessionid = audit_get_sessionid(current);
+			sessionid = audit_get_sessionid(tsk);
 			result = audit_comparator(sessionid, f->op, f->val);
 			break;
 		case AUDIT_PERS:
@@ -609,7 +615,8 @@ static int audit_filter_rules(struct task_struct *tsk,
 				result = match_tree_refs(ctx, rule->tree);
 			break;
 		case AUDIT_LOGINUID:
-			result = audit_uid_comparator(tsk->loginuid, f->op, f->uid);
+			result = audit_uid_comparator(audit_get_loginuid(tsk),
+						      f->op, f->uid);
 			break;
 		case AUDIT_LOGINUID_SET:
 			result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
@@ -863,7 +870,7 @@ static inline struct audit_context *audit_take_context(struct task_struct *tsk,
 		audit_filter_inodes(tsk, context);
 	}
 
-	tsk->audit_context = NULL;
+	audit_set_context(tsk, NULL);
 	return context;
 }
 
@@ -950,7 +957,7 @@ int audit_alloc(struct task_struct *tsk)
 	}
 	context->filterkey = key;
 
-	tsk->audit_context  = context;
+	audit_set_context(tsk, context);
 	set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
 	return 0;
 }
@@ -1507,8 +1514,7 @@ void __audit_free(struct task_struct *tsk)
 void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
 			   unsigned long a3, unsigned long a4)
 {
-	struct task_struct *tsk = current;
-	struct audit_context *context = tsk->audit_context;
+	struct audit_context *context = audit_context();
 	enum audit_state     state;
 
 	if (!audit_enabled || !context)
@@ -1523,7 +1529,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
 	context->dummy = !audit_n_rules;
 	if (!context->dummy && state == AUDIT_BUILD_CONTEXT) {
 		context->prio = 0;
-		if (auditd_test_task(tsk))
+		if (auditd_test_task(current))
 			return;
 	}
 
@@ -1553,7 +1559,6 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
  */
 void __audit_syscall_exit(int success, long return_code)
 {
-	struct task_struct *tsk = current;
 	struct audit_context *context;
 
 	if (success)
@@ -1561,12 +1566,12 @@ void __audit_syscall_exit(int success, long return_code)
 	else
 		success = AUDITSC_FAILURE;
 
-	context = audit_take_context(tsk, success, return_code);
+	context = audit_take_context(current, success, return_code);
 	if (!context)
 		return;
 
 	if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
-		audit_log_exit(context, tsk);
+		audit_log_exit(context, current);
 
 	context->in_syscall = 0;
 	context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
@@ -1588,7 +1593,7 @@ void __audit_syscall_exit(int success, long return_code)
 		kfree(context->filterkey);
 		context->filterkey = NULL;
 	}
-	tsk->audit_context = context;
+	audit_set_context(current, context);
 }
 
 static inline void handle_one(const struct inode *inode)
@@ -1600,7 +1605,7 @@ static inline void handle_one(const struct inode *inode)
 	int count;
 	if (likely(!inode->i_fsnotify_marks))
 		return;
-	context = current->audit_context;
+	context = audit_context();
 	p = context->trees;
 	count = context->tree_count;
 	rcu_read_lock();
@@ -1631,7 +1636,7 @@ static void handle_path(const struct dentry *dentry)
 	unsigned long seq;
 	int count;
 
-	context = current->audit_context;
+	context = audit_context();
 	p = context->trees;
 	count = context->tree_count;
 retry:
@@ -1713,7 +1718,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
 struct filename *
 __audit_reusename(const __user char *uptr)
 {
-	struct audit_context *context = current->audit_context;
+	struct audit_context *context = audit_context();
 	struct audit_names *n;
 
 	list_for_each_entry(n, &context->names_list, list) {
@@ -1736,7 +1741,7 @@ __audit_reusename(const __user char *uptr)
  */
 void __audit_getname(struct filename *name)
 {
-	struct audit_context *context = current->audit_context;
+	struct audit_context *context = audit_context();
 	struct audit_names *n;
 
 	if (!context->in_syscall)
@@ -1764,7 +1769,7 @@ void __audit_getname(struct filename *name)
 void __audit_inode(struct filename *name, const struct dentry *dentry,
 		   unsigned int flags)
 {
-	struct audit_context *context = current->audit_context;
+	struct audit_context *context = audit_context();
 	struct inode *inode = d_backing_inode(dentry);
 	struct audit_names *n;
 	bool parent = flags & AUDIT_INODE_PARENT;
@@ -1863,7 +1868,7 @@ void __audit_inode_child(struct inode *parent,
 			 const struct dentry *dentry,
 			 const unsigned char type)
 {
-	struct audit_context *context = current->audit_context;
+	struct audit_context *context = audit_context();
 	struct inode *inode = d_backing_inode(dentry);
 	const char *dname = dentry->d_name.name;
 	struct audit_names *n, *found_parent = NULL, *found_child = NULL;
@@ -2048,7 +2053,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
 int audit_set_loginuid(kuid_t loginuid)
 {
 	struct task_struct *task = current;
-	unsigned int oldsessionid, sessionid = (unsigned int)-1;
+	unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET;
 	kuid_t oldloginuid;
 	int rc;
 
@@ -2062,7 +2067,7 @@ int audit_set_loginuid(kuid_t loginuid)
 	/* are we setting or clearing? */
 	if (uid_valid(loginuid)) {
 		sessionid = (unsigned int)atomic_inc_return(&session_id);
-		if (unlikely(sessionid == (unsigned int)-1))
+		if (unlikely(sessionid == AUDIT_SID_UNSET))
 			sessionid = (unsigned int)atomic_inc_return(&session_id);
 	}
 
@@ -2082,7 +2087,7 @@ out:
  */
 void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
 {
-	struct audit_context *context = current->audit_context;
+	struct audit_context *context = audit_context();
 
 	if (attr)
 		memcpy(&context->mq_open.attr, attr, sizeof(struct mq_attr));
@@ -2106,7 +2111,7 @@ void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
 void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
 			const struct timespec64 *abs_timeout)
 {
-	struct audit_context *context = current->audit_context;
+	struct audit_context *context = audit_context();
 	struct timespec64 *p = &context->mq_sendrecv.abs_timeout;
 
 	if (abs_timeout)
@@ -2130,7 +2135,7 @@ void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
 
 void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
 {
-	struct audit_context *context = current->audit_context;
+	struct audit_context *context = audit_context();
 
 	if (notification)
 		context->mq_notify.sigev_signo = notification->sigev_signo;
@@ -2149,7 +2154,7 @@ void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
  */
 void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
 {
-	struct audit_context *context = current->audit_context;
+	struct audit_context *context = audit_context();
 	context->mq_getsetattr.mqdes = mqdes;
 	context->mq_getsetattr.mqstat = *mqstat;
 	context->type = AUDIT_MQ_GETSETATTR;
@@ -2162,7 +2167,7 @@ void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
  */
 void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
 {
-	struct audit_context *context = current->audit_context;
+	struct audit_context *context = audit_context();
 	context->ipc.uid = ipcp->uid;
 	context->ipc.gid = ipcp->gid;
 	context->ipc.mode = ipcp->mode;
@@ -2182,7 +2187,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
  */
 void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
 {
-	struct audit_context *context = current->audit_context;
+	struct audit_context *context = audit_context();
 
 	context->ipc.qbytes = qbytes;
 	context->ipc.perm_uid = uid;
@@ -2193,7 +2198,7 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo
 
 void __audit_bprm(struct linux_binprm *bprm)
 {
-	struct audit_context *context = current->audit_context;
+	struct audit_context *context = audit_context();
 
 	context->type = AUDIT_EXECVE;
 	context->execve.argc = bprm->argc;
@@ -2208,7 +2213,7 @@ void __audit_bprm(struct linux_binprm *bprm)
  */
 int __audit_socketcall(int nargs, unsigned long *args)
 {
-	struct audit_context *context = current->audit_context;
+	struct audit_context *context = audit_context();
 
 	if (nargs <= 0 || nargs > AUDITSC_ARGS || !args)
 		return -EINVAL;
@@ -2226,7 +2231,7 @@ int __audit_socketcall(int nargs, unsigned long *args)
  */
 void __audit_fd_pair(int fd1, int fd2)
 {
-	struct audit_context *context = current->audit_context;
+	struct audit_context *context = audit_context();
 	context->fds[0] = fd1;
 	context->fds[1] = fd2;
 }
@@ -2240,7 +2245,7 @@ void __audit_fd_pair(int fd1, int fd2)
  */
 int __audit_sockaddr(int len, void *a)
 {
-	struct audit_context *context = current->audit_context;
+	struct audit_context *context = audit_context();
 
 	if (!context->sockaddr) {
 		void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
@@ -2256,7 +2261,7 @@ int __audit_sockaddr(int len, void *a)
 
 void __audit_ptrace(struct task_struct *t)
 {
-	struct audit_context *context = current->audit_context;
+	struct audit_context *context = audit_context();
 
 	context->target_pid = task_tgid_nr(t);
 	context->target_auid = audit_get_loginuid(t);
@@ -2277,19 +2282,19 @@ void __audit_ptrace(struct task_struct *t)
 int audit_signal_info(int sig, struct task_struct *t)
 {
 	struct audit_aux_data_pids *axp;
-	struct task_struct *tsk = current;
-	struct audit_context *ctx = tsk->audit_context;
-	kuid_t uid = current_uid(), t_uid = task_uid(t);
+	struct audit_context *ctx = audit_context();
+	kuid_t uid = current_uid(), auid, t_uid = task_uid(t);
 
 	if (auditd_test_task(t) &&
 	    (sig == SIGTERM || sig == SIGHUP ||
 	     sig == SIGUSR1 || sig == SIGUSR2)) {
-		audit_sig_pid = task_tgid_nr(tsk);
-		if (uid_valid(tsk->loginuid))
-			audit_sig_uid = tsk->loginuid;
+		audit_sig_pid = task_tgid_nr(current);
+		auid = audit_get_loginuid(current);
+		if (uid_valid(auid))
+			audit_sig_uid = auid;
 		else
 			audit_sig_uid = uid;
-		security_task_getsecid(tsk, &audit_sig_sid);
+		security_task_getsecid(current, &audit_sig_sid);
 	}
 
 	if (!audit_signals || audit_dummy_context())
@@ -2345,7 +2350,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
 			   const struct cred *new, const struct cred *old)
 {
 	struct audit_aux_data_bprm_fcaps *ax;
-	struct audit_context *context = current->audit_context;
+	struct audit_context *context = audit_context();
 	struct cpu_vfs_cap_data vcaps;
 
 	ax = kmalloc(sizeof(*ax), GFP_KERNEL);
@@ -2385,7 +2390,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
  */
 void __audit_log_capset(const struct cred *new, const struct cred *old)
 {
-	struct audit_context *context = current->audit_context;
+	struct audit_context *context = audit_context();
 	context->capset.pid = task_tgid_nr(current);
 	context->capset.cap.effective   = new->cap_effective;
 	context->capset.cap.inheritable = new->cap_effective;
@@ -2396,7 +2401,7 @@ void __audit_log_capset(const struct cred *new, const struct cred *old)
 
 void __audit_mmap_fd(int fd, int flags)
 {
-	struct audit_context *context = current->audit_context;
+	struct audit_context *context = audit_context();
 	context->mmap.fd = fd;
 	context->mmap.flags = flags;
 	context->type = AUDIT_MMAP;
@@ -2404,7 +2409,7 @@ void __audit_mmap_fd(int fd, int flags)
 
 void __audit_log_kern_module(char *name)
 {
-	struct audit_context *context = current->audit_context;
+	struct audit_context *context = audit_context();
 
 	context->module.name = kmalloc(strlen(name) + 1, GFP_KERNEL);
 	strcpy(context->module.name, name);
@@ -2413,7 +2418,7 @@ void __audit_log_kern_module(char *name)
 
 void __audit_fanotify(unsigned int response)
 {
-	audit_log(current->audit_context, GFP_KERNEL,
+	audit_log(audit_context(), GFP_KERNEL,
 		AUDIT_FANOTIFY,	"resp=%u", response);
 }
 
@@ -2464,7 +2469,19 @@ void audit_core_dumps(long signr)
 	audit_log_end(ab);
 }
 
-void __audit_seccomp(unsigned long syscall, long signr, int code)
+/**
+ * audit_seccomp - record information about a seccomp action
+ * @syscall: syscall number
+ * @signr: signal value
+ * @code: the seccomp action
+ *
+ * Record the information associated with a seccomp action. Event filtering for
+ * seccomp actions that are not to be logged is done in seccomp_log().
+ * Therefore, this function forces auditing independent of the audit_enabled
+ * and dummy context state because seccomp actions should be logged even when
+ * audit is not in use.
+ */
+void audit_seccomp(unsigned long syscall, long signr, int code)
 {
 	struct audit_buffer *ab;
 
@@ -2478,9 +2495,29 @@ void __audit_seccomp(unsigned long syscall, long signr, int code)
 	audit_log_end(ab);
 }
 
+void audit_seccomp_actions_logged(const char *names, const char *old_names,
+				  int res)
+{
+	struct audit_buffer *ab;
+
+	if (!audit_enabled)
+		return;
+
+	ab = audit_log_start(audit_context(), GFP_KERNEL,
+			     AUDIT_CONFIG_CHANGE);
+	if (unlikely(!ab))
+		return;
+
+	audit_log_format(ab, "op=seccomp-logging");
+	audit_log_format(ab, " actions=%s", names);
+	audit_log_format(ab, " old-actions=%s", old_names);
+	audit_log_format(ab, " res=%d", res);
+	audit_log_end(ab);
+}
+
 struct list_head *audit_killed_trees(void)
 {
-	struct audit_context *ctx = current->audit_context;
+	struct audit_context *ctx = audit_context();
 	if (likely(!ctx || !ctx->in_syscall))
 		return NULL;
 	return &ctx->killed_trees;
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index a713fd23ec88..f27f5496d6fe 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -4,9 +4,13 @@ obj-y := core.o
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
+obj-$(CONFIG_BPF_SYSCALL) += btf.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
+ifeq ($(CONFIG_XDP_SOCKETS),y)
+obj-$(CONFIG_BPF_SYSCALL) += xskmap.o
+endif
 obj-$(CONFIG_BPF_SYSCALL) += offload.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
 ifeq ($(CONFIG_INET),y)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 14750e7c5ee4..544e58f5f642 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -11,11 +11,13 @@
  * General Public License for more details.
  */
 #include <linux/bpf.h>
+#include <linux/btf.h>
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/filter.h>
 #include <linux/perf_event.h>
+#include <uapi/linux/btf.h>
 
 #include "map_in_map.h"
 
@@ -336,6 +338,52 @@ static void array_map_free(struct bpf_map *map)
 	bpf_map_area_free(array);
 }
 
+static void array_map_seq_show_elem(struct bpf_map *map, void *key,
+				    struct seq_file *m)
+{
+	void *value;
+
+	rcu_read_lock();
+
+	value = array_map_lookup_elem(map, key);
+	if (!value) {
+		rcu_read_unlock();
+		return;
+	}
+
+	seq_printf(m, "%u: ", *(u32 *)key);
+	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
+	seq_puts(m, "\n");
+
+	rcu_read_unlock();
+}
+
+static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf,
+			       u32 btf_key_id, u32 btf_value_id)
+{
+	const struct btf_type *key_type, *value_type;
+	u32 key_size, value_size;
+	u32 int_data;
+
+	key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
+	if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
+		return -EINVAL;
+
+	int_data = *(u32 *)(key_type + 1);
+	/* bpf array can only take a u32 key.  This check makes
+	 * sure that the btf matches the attr used during map_create.
+	 */
+	if (BTF_INT_BITS(int_data) != 32 || key_size != 4 ||
+	    BTF_INT_OFFSET(int_data))
+		return -EINVAL;
+
+	value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
+	if (!value_type || value_size > map->value_size)
+		return -EINVAL;
+
+	return 0;
+}
+
 const struct bpf_map_ops array_map_ops = {
 	.map_alloc_check = array_map_alloc_check,
 	.map_alloc = array_map_alloc,
@@ -345,6 +393,8 @@ const struct bpf_map_ops array_map_ops = {
 	.map_update_elem = array_map_update_elem,
 	.map_delete_elem = array_map_delete_elem,
 	.map_gen_lookup = array_map_gen_lookup,
+	.map_seq_show_elem = array_map_seq_show_elem,
+	.map_check_btf = array_map_check_btf,
 };
 
 const struct bpf_map_ops percpu_array_map_ops = {
@@ -476,7 +526,7 @@ static u32 prog_fd_array_sys_lookup_elem(void *ptr)
 }
 
 /* decrement refcnt of all bpf_progs that are stored in this map */
-void bpf_fd_array_map_clear(struct bpf_map *map)
+static void bpf_fd_array_map_clear(struct bpf_map *map)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	int i;
@@ -495,6 +545,7 @@ const struct bpf_map_ops prog_array_map_ops = {
 	.map_fd_get_ptr = prog_fd_array_get_ptr,
 	.map_fd_put_ptr = prog_fd_array_put_ptr,
 	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
+	.map_release_uref = bpf_fd_array_map_clear,
 };
 
 static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
new file mode 100644
index 000000000000..8653ab004c73
--- /dev/null
+++ b/kernel/bpf/btf.c
@@ -0,0 +1,2348 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+
+#include <uapi/linux/btf.h>
+#include <uapi/linux/types.h>
+#include <linux/seq_file.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/idr.h>
+#include <linux/sort.h>
+#include <linux/bpf_verifier.h>
+#include <linux/btf.h>
+
+/* BTF (BPF Type Format) is the meta data format which describes
+ * the data types of BPF program/map.  Hence, it basically focus
+ * on the C programming language which the modern BPF is primary
+ * using.
+ *
+ * ELF Section:
+ * ~~~~~~~~~~~
+ * The BTF data is stored under the ".BTF" ELF section
+ *
+ * struct btf_type:
+ * ~~~~~~~~~~~~~~~
+ * Each 'struct btf_type' object describes a C data type.
+ * Depending on the type it is describing, a 'struct btf_type'
+ * object may be followed by more data.  F.e.
+ * To describe an array, 'struct btf_type' is followed by
+ * 'struct btf_array'.
+ *
+ * 'struct btf_type' and any extra data following it are
+ * 4 bytes aligned.
+ *
+ * Type section:
+ * ~~~~~~~~~~~~~
+ * The BTF type section contains a list of 'struct btf_type' objects.
+ * Each one describes a C type.  Recall from the above section
+ * that a 'struct btf_type' object could be immediately followed by extra
+ * data in order to desribe some particular C types.
+ *
+ * type_id:
+ * ~~~~~~~
+ * Each btf_type object is identified by a type_id.  The type_id
+ * is implicitly implied by the location of the btf_type object in
+ * the BTF type section.  The first one has type_id 1.  The second
+ * one has type_id 2...etc.  Hence, an earlier btf_type has
+ * a smaller type_id.
+ *
+ * A btf_type object may refer to another btf_type object by using
+ * type_id (i.e. the "type" in the "struct btf_type").
+ *
+ * NOTE that we cannot assume any reference-order.
+ * A btf_type object can refer to an earlier btf_type object
+ * but it can also refer to a later btf_type object.
+ *
+ * For example, to describe "const void *".  A btf_type
+ * object describing "const" may refer to another btf_type
+ * object describing "void *".  This type-reference is done
+ * by specifying type_id:
+ *
+ * [1] CONST (anon) type_id=2
+ * [2] PTR (anon) type_id=0
+ *
+ * The above is the btf_verifier debug log:
+ *   - Each line started with "[?]" is a btf_type object
+ *   - [?] is the type_id of the btf_type object.
+ *   - CONST/PTR is the BTF_KIND_XXX
+ *   - "(anon)" is the name of the type.  It just
+ *     happens that CONST and PTR has no name.
+ *   - type_id=XXX is the 'u32 type' in btf_type
+ *
+ * NOTE: "void" has type_id 0
+ *
+ * String section:
+ * ~~~~~~~~~~~~~~
+ * The BTF string section contains the names used by the type section.
+ * Each string is referred by an "offset" from the beginning of the
+ * string section.
+ *
+ * Each string is '\0' terminated.
+ *
+ * The first character in the string section must be '\0'
+ * which is used to mean 'anonymous'. Some btf_type may not
+ * have a name.
+ */
+
+/* BTF verification:
+ *
+ * To verify BTF data, two passes are needed.
+ *
+ * Pass #1
+ * ~~~~~~~
+ * The first pass is to collect all btf_type objects to
+ * an array: "btf->types".
+ *
+ * Depending on the C type that a btf_type is describing,
+ * a btf_type may be followed by extra data.  We don't know
+ * how many btf_type is there, and more importantly we don't
+ * know where each btf_type is located in the type section.
+ *
+ * Without knowing the location of each type_id, most verifications
+ * cannot be done.  e.g. an earlier btf_type may refer to a later
+ * btf_type (recall the "const void *" above), so we cannot
+ * check this type-reference in the first pass.
+ *
+ * In the first pass, it still does some verifications (e.g.
+ * checking the name is a valid offset to the string section).
+ *
+ * Pass #2
+ * ~~~~~~~
+ * The main focus is to resolve a btf_type that is referring
+ * to another type.
+ *
+ * We have to ensure the referring type:
+ * 1) does exist in the BTF (i.e. in btf->types[])
+ * 2) does not cause a loop:
+ *	struct A {
+ *		struct B b;
+ *	};
+ *
+ *	struct B {
+ *		struct A a;
+ *	};
+ *
+ * btf_type_needs_resolve() decides if a btf_type needs
+ * to be resolved.
+ *
+ * The needs_resolve type implements the "resolve()" ops which
+ * essentially does a DFS and detects backedge.
+ *
+ * During resolve (or DFS), different C types have different
+ * "RESOLVED" conditions.
+ *
+ * When resolving a BTF_KIND_STRUCT, we need to resolve all its
+ * members because a member is always referring to another
+ * type.  A struct's member can be treated as "RESOLVED" if
+ * it is referring to a BTF_KIND_PTR.  Otherwise, the
+ * following valid C struct would be rejected:
+ *
+ *	struct A {
+ *		int m;
+ *		struct A *a;
+ *	};
+ *
+ * When resolving a BTF_KIND_PTR, it needs to keep resolving if
+ * it is referring to another BTF_KIND_PTR.  Otherwise, we cannot
+ * detect a pointer loop, e.g.:
+ * BTF_KIND_CONST -> BTF_KIND_PTR -> BTF_KIND_CONST -> BTF_KIND_PTR +
+ *                        ^                                         |
+ *                        +-----------------------------------------+
+ *
+ */
+
+#define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE)
+#define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1)
+#define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK)
+#define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3)
+#define BITS_ROUNDUP_BYTES(bits) \
+	(BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits))
+
+#define BTF_INFO_MASK 0x0f00ffff
+#define BTF_INT_MASK 0x0fffffff
+#define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE)
+#define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET)
+
+/* 16MB for 64k structs and each has 16 members and
+ * a few MB spaces for the string section.
+ * The hard limit is S32_MAX.
+ */
+#define BTF_MAX_SIZE (16 * 1024 * 1024)
+
+#define for_each_member(i, struct_type, member)			\
+	for (i = 0, member = btf_type_member(struct_type);	\
+	     i < btf_type_vlen(struct_type);			\
+	     i++, member++)
+
+#define for_each_member_from(i, from, struct_type, member)		\
+	for (i = from, member = btf_type_member(struct_type) + from;	\
+	     i < btf_type_vlen(struct_type);				\
+	     i++, member++)
+
+static DEFINE_IDR(btf_idr);
+static DEFINE_SPINLOCK(btf_idr_lock);
+
+struct btf {
+	void *data;
+	struct btf_type **types;
+	u32 *resolved_ids;
+	u32 *resolved_sizes;
+	const char *strings;
+	void *nohdr_data;
+	struct btf_header hdr;
+	u32 nr_types;
+	u32 types_size;
+	u32 data_size;
+	refcount_t refcnt;
+	u32 id;
+	struct rcu_head rcu;
+};
+
+enum verifier_phase {
+	CHECK_META,
+	CHECK_TYPE,
+};
+
+struct resolve_vertex {
+	const struct btf_type *t;
+	u32 type_id;
+	u16 next_member;
+};
+
+enum visit_state {
+	NOT_VISITED,
+	VISITED,
+	RESOLVED,
+};
+
+enum resolve_mode {
+	RESOLVE_TBD,	/* To Be Determined */
+	RESOLVE_PTR,	/* Resolving for Pointer */
+	RESOLVE_STRUCT_OR_ARRAY,	/* Resolving for struct/union
+					 * or array
+					 */
+};
+
+#define MAX_RESOLVE_DEPTH 32
+
+struct btf_sec_info {
+	u32 off;
+	u32 len;
+};
+
+struct btf_verifier_env {
+	struct btf *btf;
+	u8 *visit_states;
+	struct resolve_vertex stack[MAX_RESOLVE_DEPTH];
+	struct bpf_verifier_log log;
+	u32 log_type_id;
+	u32 top_stack;
+	enum verifier_phase phase;
+	enum resolve_mode resolve_mode;
+};
+
+static const char * const btf_kind_str[NR_BTF_KINDS] = {
+	[BTF_KIND_UNKN]		= "UNKNOWN",
+	[BTF_KIND_INT]		= "INT",
+	[BTF_KIND_PTR]		= "PTR",
+	[BTF_KIND_ARRAY]	= "ARRAY",
+	[BTF_KIND_STRUCT]	= "STRUCT",
+	[BTF_KIND_UNION]	= "UNION",
+	[BTF_KIND_ENUM]		= "ENUM",
+	[BTF_KIND_FWD]		= "FWD",
+	[BTF_KIND_TYPEDEF]	= "TYPEDEF",
+	[BTF_KIND_VOLATILE]	= "VOLATILE",
+	[BTF_KIND_CONST]	= "CONST",
+	[BTF_KIND_RESTRICT]	= "RESTRICT",
+};
+
+struct btf_kind_operations {
+	s32 (*check_meta)(struct btf_verifier_env *env,
+			  const struct btf_type *t,
+			  u32 meta_left);
+	int (*resolve)(struct btf_verifier_env *env,
+		       const struct resolve_vertex *v);
+	int (*check_member)(struct btf_verifier_env *env,
+			    const struct btf_type *struct_type,
+			    const struct btf_member *member,
+			    const struct btf_type *member_type);
+	void (*log_details)(struct btf_verifier_env *env,
+			    const struct btf_type *t);
+	void (*seq_show)(const struct btf *btf, const struct btf_type *t,
+			 u32 type_id, void *data, u8 bits_offsets,
+			 struct seq_file *m);
+};
+
+static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
+static struct btf_type btf_void;
+
+static bool btf_type_is_modifier(const struct btf_type *t)
+{
+	/* Some of them is not strictly a C modifier
+	 * but they are grouped into the same bucket
+	 * for BTF concern:
+	 *   A type (t) that refers to another
+	 *   type through t->type AND its size cannot
+	 *   be determined without following the t->type.
+	 *
+	 * ptr does not fall into this bucket
+	 * because its size is always sizeof(void *).
+	 */
+	switch (BTF_INFO_KIND(t->info)) {
+	case BTF_KIND_TYPEDEF:
+	case BTF_KIND_VOLATILE:
+	case BTF_KIND_CONST:
+	case BTF_KIND_RESTRICT:
+		return true;
+	}
+
+	return false;
+}
+
+static bool btf_type_is_void(const struct btf_type *t)
+{
+	/* void => no type and size info.
+	 * Hence, FWD is also treated as void.
+	 */
+	return t == &btf_void || BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
+}
+
+static bool btf_type_is_void_or_null(const struct btf_type *t)
+{
+	return !t || btf_type_is_void(t);
+}
+
+/* union is only a special case of struct:
+ * all its offsetof(member) == 0
+ */
+static bool btf_type_is_struct(const struct btf_type *t)
+{
+	u8 kind = BTF_INFO_KIND(t->info);
+
+	return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
+}
+
+static bool btf_type_is_array(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
+}
+
+static bool btf_type_is_ptr(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_PTR;
+}
+
+static bool btf_type_is_int(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_INT;
+}
+
+/* What types need to be resolved?
+ *
+ * btf_type_is_modifier() is an obvious one.
+ *
+ * btf_type_is_struct() because its member refers to
+ * another type (through member->type).
+
+ * btf_type_is_array() because its element (array->type)
+ * refers to another type.  Array can be thought of a
+ * special case of struct while array just has the same
+ * member-type repeated by array->nelems of times.
+ */
+static bool btf_type_needs_resolve(const struct btf_type *t)
+{
+	return btf_type_is_modifier(t) ||
+		btf_type_is_ptr(t) ||
+		btf_type_is_struct(t) ||
+		btf_type_is_array(t);
+}
+
+/* t->size can be used */
+static bool btf_type_has_size(const struct btf_type *t)
+{
+	switch (BTF_INFO_KIND(t->info)) {
+	case BTF_KIND_INT:
+	case BTF_KIND_STRUCT:
+	case BTF_KIND_UNION:
+	case BTF_KIND_ENUM:
+		return true;
+	}
+
+	return false;
+}
+
+static const char *btf_int_encoding_str(u8 encoding)
+{
+	if (encoding == 0)
+		return "(none)";
+	else if (encoding == BTF_INT_SIGNED)
+		return "SIGNED";
+	else if (encoding == BTF_INT_CHAR)
+		return "CHAR";
+	else if (encoding == BTF_INT_BOOL)
+		return "BOOL";
+	else
+		return "UNKN";
+}
+
+static u16 btf_type_vlen(const struct btf_type *t)
+{
+	return BTF_INFO_VLEN(t->info);
+}
+
+static u32 btf_type_int(const struct btf_type *t)
+{
+	return *(u32 *)(t + 1);
+}
+
+static const struct btf_array *btf_type_array(const struct btf_type *t)
+{
+	return (const struct btf_array *)(t + 1);
+}
+
+static const struct btf_member *btf_type_member(const struct btf_type *t)
+{
+	return (const struct btf_member *)(t + 1);
+}
+
+static const struct btf_enum *btf_type_enum(const struct btf_type *t)
+{
+	return (const struct btf_enum *)(t + 1);
+}
+
+static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
+{
+	return kind_ops[BTF_INFO_KIND(t->info)];
+}
+
+static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
+{
+	return BTF_STR_OFFSET_VALID(offset) &&
+		offset < btf->hdr.str_len;
+}
+
+static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
+{
+	if (!offset)
+		return "(anon)";
+	else if (offset < btf->hdr.str_len)
+		return &btf->strings[offset];
+	else
+		return "(invalid-name-offset)";
+}
+
+static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
+{
+	if (type_id > btf->nr_types)
+		return NULL;
+
+	return btf->types[type_id];
+}
+
+/*
+ * Regular int is not a bit field and it must be either
+ * u8/u16/u32/u64.
+ */
+static bool btf_type_int_is_regular(const struct btf_type *t)
+{
+	u16 nr_bits, nr_bytes;
+	u32 int_data;
+
+	int_data = btf_type_int(t);
+	nr_bits = BTF_INT_BITS(int_data);
+	nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
+	if (BITS_PER_BYTE_MASKED(nr_bits) ||
+	    BTF_INT_OFFSET(int_data) ||
+	    (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
+	     nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) {
+		return false;
+	}
+
+	return true;
+}
+
+__printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log,
+					      const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	bpf_verifier_vlog(log, fmt, args);
+	va_end(args);
+}
+
+__printf(2, 3) static void btf_verifier_log(struct btf_verifier_env *env,
+					    const char *fmt, ...)
+{
+	struct bpf_verifier_log *log = &env->log;
+	va_list args;
+
+	if (!bpf_verifier_log_needed(log))
+		return;
+
+	va_start(args, fmt);
+	bpf_verifier_vlog(log, fmt, args);
+	va_end(args);
+}
+
+__printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
+						   const struct btf_type *t,
+						   bool log_details,
+						   const char *fmt, ...)
+{
+	struct bpf_verifier_log *log = &env->log;
+	u8 kind = BTF_INFO_KIND(t->info);
+	struct btf *btf = env->btf;
+	va_list args;
+
+	if (!bpf_verifier_log_needed(log))
+		return;
+
+	__btf_verifier_log(log, "[%u] %s %s%s",
+			   env->log_type_id,
+			   btf_kind_str[kind],
+			   btf_name_by_offset(btf, t->name_off),
+			   log_details ? " " : "");
+
+	if (log_details)
+		btf_type_ops(t)->log_details(env, t);
+
+	if (fmt && *fmt) {
+		__btf_verifier_log(log, " ");
+		va_start(args, fmt);
+		bpf_verifier_vlog(log, fmt, args);
+		va_end(args);
+	}
+
+	__btf_verifier_log(log, "\n");
+}
+
+#define btf_verifier_log_type(env, t, ...) \
+	__btf_verifier_log_type((env), (t), true, __VA_ARGS__)
+#define btf_verifier_log_basic(env, t, ...) \
+	__btf_verifier_log_type((env), (t), false, __VA_ARGS__)
+
+__printf(4, 5)
+static void btf_verifier_log_member(struct btf_verifier_env *env,
+				    const struct btf_type *struct_type,
+				    const struct btf_member *member,
+				    const char *fmt, ...)
+{
+	struct bpf_verifier_log *log = &env->log;
+	struct btf *btf = env->btf;
+	va_list args;
+
+	if (!bpf_verifier_log_needed(log))
+		return;
+
+	/* The CHECK_META phase already did a btf dump.
+	 *
+	 * If member is logged again, it must hit an error in
+	 * parsing this member.  It is useful to print out which
+	 * struct this member belongs to.
+	 */
+	if (env->phase != CHECK_META)
+		btf_verifier_log_type(env, struct_type, NULL);
+
+	__btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u",
+			   btf_name_by_offset(btf, member->name_off),
+			   member->type, member->offset);
+
+	if (fmt && *fmt) {
+		__btf_verifier_log(log, " ");
+		va_start(args, fmt);
+		bpf_verifier_vlog(log, fmt, args);
+		va_end(args);
+	}
+
+	__btf_verifier_log(log, "\n");
+}
+
+static void btf_verifier_log_hdr(struct btf_verifier_env *env,
+				 u32 btf_data_size)
+{
+	struct bpf_verifier_log *log = &env->log;
+	const struct btf *btf = env->btf;
+	const struct btf_header *hdr;
+
+	if (!bpf_verifier_log_needed(log))
+		return;
+
+	hdr = &btf->hdr;
+	__btf_verifier_log(log, "magic: 0x%x\n", hdr->magic);
+	__btf_verifier_log(log, "version: %u\n", hdr->version);
+	__btf_verifier_log(log, "flags: 0x%x\n", hdr->flags);
+	__btf_verifier_log(log, "hdr_len: %u\n", hdr->hdr_len);
+	__btf_verifier_log(log, "type_off: %u\n", hdr->type_off);
+	__btf_verifier_log(log, "type_len: %u\n", hdr->type_len);
+	__btf_verifier_log(log, "str_off: %u\n", hdr->str_off);
+	__btf_verifier_log(log, "str_len: %u\n", hdr->str_len);
+	__btf_verifier_log(log, "btf_total_size: %u\n", btf_data_size);
+}
+
+static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
+{
+	struct btf *btf = env->btf;
+
+	/* < 2 because +1 for btf_void which is always in btf->types[0].
+	 * btf_void is not accounted in btf->nr_types because btf_void
+	 * does not come from the BTF file.
+	 */
+	if (btf->types_size - btf->nr_types < 2) {
+		/* Expand 'types' array */
+
+		struct btf_type **new_types;
+		u32 expand_by, new_size;
+
+		if (btf->types_size == BTF_MAX_TYPE) {
+			btf_verifier_log(env, "Exceeded max num of types");
+			return -E2BIG;
+		}
+
+		expand_by = max_t(u32, btf->types_size >> 2, 16);
+		new_size = min_t(u32, BTF_MAX_TYPE,
+				 btf->types_size + expand_by);
+
+		new_types = kvzalloc(new_size * sizeof(*new_types),
+				     GFP_KERNEL | __GFP_NOWARN);
+		if (!new_types)
+			return -ENOMEM;
+
+		if (btf->nr_types == 0)
+			new_types[0] = &btf_void;
+		else
+			memcpy(new_types, btf->types,
+			       sizeof(*btf->types) * (btf->nr_types + 1));
+
+		kvfree(btf->types);
+		btf->types = new_types;
+		btf->types_size = new_size;
+	}
+
+	btf->types[++(btf->nr_types)] = t;
+
+	return 0;
+}
+
+static int btf_alloc_id(struct btf *btf)
+{
+	int id;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock_bh(&btf_idr_lock);
+	id = idr_alloc_cyclic(&btf_idr, btf, 1, INT_MAX, GFP_ATOMIC);
+	if (id > 0)
+		btf->id = id;
+	spin_unlock_bh(&btf_idr_lock);
+	idr_preload_end();
+
+	if (WARN_ON_ONCE(!id))
+		return -ENOSPC;
+
+	return id > 0 ? 0 : id;
+}
+
+static void btf_free_id(struct btf *btf)
+{
+	unsigned long flags;
+
+	/*
+	 * In map-in-map, calling map_delete_elem() on outer
+	 * map will call bpf_map_put on the inner map.
+	 * It will then eventually call btf_free_id()
+	 * on the inner map.  Some of the map_delete_elem()
+	 * implementation may have irq disabled, so
+	 * we need to use the _irqsave() version instead
+	 * of the _bh() version.
+	 */
+	spin_lock_irqsave(&btf_idr_lock, flags);
+	idr_remove(&btf_idr, btf->id);
+	spin_unlock_irqrestore(&btf_idr_lock, flags);
+}
+
+static void btf_free(struct btf *btf)
+{
+	kvfree(btf->types);
+	kvfree(btf->resolved_sizes);
+	kvfree(btf->resolved_ids);
+	kvfree(btf->data);
+	kfree(btf);
+}
+
+static void btf_free_rcu(struct rcu_head *rcu)
+{
+	struct btf *btf = container_of(rcu, struct btf, rcu);
+
+	btf_free(btf);
+}
+
+void btf_put(struct btf *btf)
+{
+	if (btf && refcount_dec_and_test(&btf->refcnt)) {
+		btf_free_id(btf);
+		call_rcu(&btf->rcu, btf_free_rcu);
+	}
+}
+
+static int env_resolve_init(struct btf_verifier_env *env)
+{
+	struct btf *btf = env->btf;
+	u32 nr_types = btf->nr_types;
+	u32 *resolved_sizes = NULL;
+	u32 *resolved_ids = NULL;
+	u8 *visit_states = NULL;
+
+	/* +1 for btf_void */
+	resolved_sizes = kvzalloc((nr_types + 1) * sizeof(*resolved_sizes),
+				  GFP_KERNEL | __GFP_NOWARN);
+	if (!resolved_sizes)
+		goto nomem;
+
+	resolved_ids = kvzalloc((nr_types + 1) * sizeof(*resolved_ids),
+				GFP_KERNEL | __GFP_NOWARN);
+	if (!resolved_ids)
+		goto nomem;
+
+	visit_states = kvzalloc((nr_types + 1) * sizeof(*visit_states),
+				GFP_KERNEL | __GFP_NOWARN);
+	if (!visit_states)
+		goto nomem;
+
+	btf->resolved_sizes = resolved_sizes;
+	btf->resolved_ids = resolved_ids;
+	env->visit_states = visit_states;
+
+	return 0;
+
+nomem:
+	kvfree(resolved_sizes);
+	kvfree(resolved_ids);
+	kvfree(visit_states);
+	return -ENOMEM;
+}
+
+static void btf_verifier_env_free(struct btf_verifier_env *env)
+{
+	kvfree(env->visit_states);
+	kfree(env);
+}
+
+static bool env_type_is_resolve_sink(const struct btf_verifier_env *env,
+				     const struct btf_type *next_type)
+{
+	switch (env->resolve_mode) {
+	case RESOLVE_TBD:
+		/* int, enum or void is a sink */
+		return !btf_type_needs_resolve(next_type);
+	case RESOLVE_PTR:
+		/* int, enum, void, struct or array is a sink for ptr */
+		return !btf_type_is_modifier(next_type) &&
+			!btf_type_is_ptr(next_type);
+	case RESOLVE_STRUCT_OR_ARRAY:
+		/* int, enum, void or ptr is a sink for struct and array */
+		return !btf_type_is_modifier(next_type) &&
+			!btf_type_is_array(next_type) &&
+			!btf_type_is_struct(next_type);
+	default:
+		BUG();
+	}
+}
+
+static bool env_type_is_resolved(const struct btf_verifier_env *env,
+				 u32 type_id)
+{
+	return env->visit_states[type_id] == RESOLVED;
+}
+
+static int env_stack_push(struct btf_verifier_env *env,
+			  const struct btf_type *t, u32 type_id)
+{
+	struct resolve_vertex *v;
+
+	if (env->top_stack == MAX_RESOLVE_DEPTH)
+		return -E2BIG;
+
+	if (env->visit_states[type_id] != NOT_VISITED)
+		return -EEXIST;
+
+	env->visit_states[type_id] = VISITED;
+
+	v = &env->stack[env->top_stack++];
+	v->t = t;
+	v->type_id = type_id;
+	v->next_member = 0;
+
+	if (env->resolve_mode == RESOLVE_TBD) {
+		if (btf_type_is_ptr(t))
+			env->resolve_mode = RESOLVE_PTR;
+		else if (btf_type_is_struct(t) || btf_type_is_array(t))
+			env->resolve_mode = RESOLVE_STRUCT_OR_ARRAY;
+	}
+
+	return 0;
+}
+
+static void env_stack_set_next_member(struct btf_verifier_env *env,
+				      u16 next_member)
+{
+	env->stack[env->top_stack - 1].next_member = next_member;
+}
+
+static void env_stack_pop_resolved(struct btf_verifier_env *env,
+				   u32 resolved_type_id,
+				   u32 resolved_size)
+{
+	u32 type_id = env->stack[--(env->top_stack)].type_id;
+	struct btf *btf = env->btf;
+
+	btf->resolved_sizes[type_id] = resolved_size;
+	btf->resolved_ids[type_id] = resolved_type_id;
+	env->visit_states[type_id] = RESOLVED;
+}
+
+static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env)
+{
+	return env->top_stack ? &env->stack[env->top_stack - 1] : NULL;
+}
+
+/* The input param "type_id" must point to a needs_resolve type */
+static const struct btf_type *btf_type_id_resolve(const struct btf *btf,
+						  u32 *type_id)
+{
+	*type_id = btf->resolved_ids[*type_id];
+	return btf_type_by_id(btf, *type_id);
+}
+
+const struct btf_type *btf_type_id_size(const struct btf *btf,
+					u32 *type_id, u32 *ret_size)
+{
+	const struct btf_type *size_type;
+	u32 size_type_id = *type_id;
+	u32 size = 0;
+
+	size_type = btf_type_by_id(btf, size_type_id);
+	if (btf_type_is_void_or_null(size_type))
+		return NULL;
+
+	if (btf_type_has_size(size_type)) {
+		size = size_type->size;
+	} else if (btf_type_is_array(size_type)) {
+		size = btf->resolved_sizes[size_type_id];
+	} else if (btf_type_is_ptr(size_type)) {
+		size = sizeof(void *);
+	} else {
+		if (WARN_ON_ONCE(!btf_type_is_modifier(size_type)))
+			return NULL;
+
+		size = btf->resolved_sizes[size_type_id];
+		size_type_id = btf->resolved_ids[size_type_id];
+		size_type = btf_type_by_id(btf, size_type_id);
+		if (btf_type_is_void(size_type))
+			return NULL;
+	}
+
+	*type_id = size_type_id;
+	if (ret_size)
+		*ret_size = size;
+
+	return size_type;
+}
+
+static int btf_df_check_member(struct btf_verifier_env *env,
+			       const struct btf_type *struct_type,
+			       const struct btf_member *member,
+			       const struct btf_type *member_type)
+{
+	btf_verifier_log_basic(env, struct_type,
+			       "Unsupported check_member");
+	return -EINVAL;
+}
+
+static int btf_df_resolve(struct btf_verifier_env *env,
+			  const struct resolve_vertex *v)
+{
+	btf_verifier_log_basic(env, v->t, "Unsupported resolve");
+	return -EINVAL;
+}
+
+static void btf_df_seq_show(const struct btf *btf, const struct btf_type *t,
+			    u32 type_id, void *data, u8 bits_offsets,
+			    struct seq_file *m)
+{
+	seq_printf(m, "<unsupported kind:%u>", BTF_INFO_KIND(t->info));
+}
+
+static int btf_int_check_member(struct btf_verifier_env *env,
+				const struct btf_type *struct_type,
+				const struct btf_member *member,
+				const struct btf_type *member_type)
+{
+	u32 int_data = btf_type_int(member_type);
+	u32 struct_bits_off = member->offset;
+	u32 struct_size = struct_type->size;
+	u32 nr_copy_bits;
+	u32 bytes_offset;
+
+	if (U32_MAX - struct_bits_off < BTF_INT_OFFSET(int_data)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"bits_offset exceeds U32_MAX");
+		return -EINVAL;
+	}
+
+	struct_bits_off += BTF_INT_OFFSET(int_data);
+	bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+	nr_copy_bits = BTF_INT_BITS(int_data) +
+		BITS_PER_BYTE_MASKED(struct_bits_off);
+
+	if (nr_copy_bits > BITS_PER_U64) {
+		btf_verifier_log_member(env, struct_type, member,
+					"nr_copy_bits exceeds 64");
+		return -EINVAL;
+	}
+
+	if (struct_size < bytes_offset ||
+	    struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member exceeds struct_size");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static s32 btf_int_check_meta(struct btf_verifier_env *env,
+			      const struct btf_type *t,
+			      u32 meta_left)
+{
+	u32 int_data, nr_bits, meta_needed = sizeof(int_data);
+	u16 encoding;
+
+	if (meta_left < meta_needed) {
+		btf_verifier_log_basic(env, t,
+				       "meta_left:%u meta_needed:%u",
+				       meta_left, meta_needed);
+		return -EINVAL;
+	}
+
+	if (btf_type_vlen(t)) {
+		btf_verifier_log_type(env, t, "vlen != 0");
+		return -EINVAL;
+	}
+
+	int_data = btf_type_int(t);
+	if (int_data & ~BTF_INT_MASK) {
+		btf_verifier_log_basic(env, t, "Invalid int_data:%x",
+				       int_data);
+		return -EINVAL;
+	}
+
+	nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data);
+
+	if (nr_bits > BITS_PER_U64) {
+		btf_verifier_log_type(env, t, "nr_bits exceeds %zu",
+				      BITS_PER_U64);
+		return -EINVAL;
+	}
+
+	if (BITS_ROUNDUP_BYTES(nr_bits) > t->size) {
+		btf_verifier_log_type(env, t, "nr_bits exceeds type_size");
+		return -EINVAL;
+	}
+
+	/*
+	 * Only one of the encoding bits is allowed and it
+	 * should be sufficient for the pretty print purpose (i.e. decoding).
+	 * Multiple bits can be allowed later if it is found
+	 * to be insufficient.
+	 */
+	encoding = BTF_INT_ENCODING(int_data);
+	if (encoding &&
+	    encoding != BTF_INT_SIGNED &&
+	    encoding != BTF_INT_CHAR &&
+	    encoding != BTF_INT_BOOL) {
+		btf_verifier_log_type(env, t, "Unsupported encoding");
+		return -ENOTSUPP;
+	}
+
+	btf_verifier_log_type(env, t, NULL);
+
+	return meta_needed;
+}
+
+static void btf_int_log(struct btf_verifier_env *env,
+			const struct btf_type *t)
+{
+	int int_data = btf_type_int(t);
+
+	btf_verifier_log(env,
+			 "size=%u bits_offset=%u nr_bits=%u encoding=%s",
+			 t->size, BTF_INT_OFFSET(int_data),
+			 BTF_INT_BITS(int_data),
+			 btf_int_encoding_str(BTF_INT_ENCODING(int_data)));
+}
+
+static void btf_int_bits_seq_show(const struct btf *btf,
+				  const struct btf_type *t,
+				  void *data, u8 bits_offset,
+				  struct seq_file *m)
+{
+	u32 int_data = btf_type_int(t);
+	u16 nr_bits = BTF_INT_BITS(int_data);
+	u16 total_bits_offset;
+	u16 nr_copy_bytes;
+	u16 nr_copy_bits;
+	u8 nr_upper_bits;
+	union {
+		u64 u64_num;
+		u8  u8_nums[8];
+	} print_num;
+
+	total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
+	data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
+	bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset);
+	nr_copy_bits = nr_bits + bits_offset;
+	nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits);
+
+	print_num.u64_num = 0;
+	memcpy(&print_num.u64_num, data, nr_copy_bytes);
+
+	/* Ditch the higher order bits */
+	nr_upper_bits = BITS_PER_BYTE_MASKED(nr_copy_bits);
+	if (nr_upper_bits) {
+		/* We need to mask out some bits of the upper byte. */
+		u8 mask = (1 << nr_upper_bits) - 1;
+
+		print_num.u8_nums[nr_copy_bytes - 1] &= mask;
+	}
+
+	print_num.u64_num >>= bits_offset;
+
+	seq_printf(m, "0x%llx", print_num.u64_num);
+}
+
+static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
+			     u32 type_id, void *data, u8 bits_offset,
+			     struct seq_file *m)
+{
+	u32 int_data = btf_type_int(t);
+	u8 encoding = BTF_INT_ENCODING(int_data);
+	bool sign = encoding & BTF_INT_SIGNED;
+	u32 nr_bits = BTF_INT_BITS(int_data);
+
+	if (bits_offset || BTF_INT_OFFSET(int_data) ||
+	    BITS_PER_BYTE_MASKED(nr_bits)) {
+		btf_int_bits_seq_show(btf, t, data, bits_offset, m);
+		return;
+	}
+
+	switch (nr_bits) {
+	case 64:
+		if (sign)
+			seq_printf(m, "%lld", *(s64 *)data);
+		else
+			seq_printf(m, "%llu", *(u64 *)data);
+		break;
+	case 32:
+		if (sign)
+			seq_printf(m, "%d", *(s32 *)data);
+		else
+			seq_printf(m, "%u", *(u32 *)data);
+		break;
+	case 16:
+		if (sign)
+			seq_printf(m, "%d", *(s16 *)data);
+		else
+			seq_printf(m, "%u", *(u16 *)data);
+		break;
+	case 8:
+		if (sign)
+			seq_printf(m, "%d", *(s8 *)data);
+		else
+			seq_printf(m, "%u", *(u8 *)data);
+		break;
+	default:
+		btf_int_bits_seq_show(btf, t, data, bits_offset, m);
+	}
+}
+
+static const struct btf_kind_operations int_ops = {
+	.check_meta = btf_int_check_meta,
+	.resolve = btf_df_resolve,
+	.check_member = btf_int_check_member,
+	.log_details = btf_int_log,
+	.seq_show = btf_int_seq_show,
+};
+
+static int btf_modifier_check_member(struct btf_verifier_env *env,
+				     const struct btf_type *struct_type,
+				     const struct btf_member *member,
+				     const struct btf_type *member_type)
+{
+	const struct btf_type *resolved_type;
+	u32 resolved_type_id = member->type;
+	struct btf_member resolved_member;
+	struct btf *btf = env->btf;
+
+	resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL);
+	if (!resolved_type) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Invalid member");
+		return -EINVAL;
+	}
+
+	resolved_member = *member;
+	resolved_member.type = resolved_type_id;
+
+	return btf_type_ops(resolved_type)->check_member(env, struct_type,
+							 &resolved_member,
+							 resolved_type);
+}
+
+static int btf_ptr_check_member(struct btf_verifier_env *env,
+				const struct btf_type *struct_type,
+				const struct btf_member *member,
+				const struct btf_type *member_type)
+{
+	u32 struct_size, struct_bits_off, bytes_offset;
+
+	struct_size = struct_type->size;
+	struct_bits_off = member->offset;
+	bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+
+	if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member is not byte aligned");
+		return -EINVAL;
+	}
+
+	if (struct_size - bytes_offset < sizeof(void *)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member exceeds struct_size");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int btf_ref_type_check_meta(struct btf_verifier_env *env,
+				   const struct btf_type *t,
+				   u32 meta_left)
+{
+	if (btf_type_vlen(t)) {
+		btf_verifier_log_type(env, t, "vlen != 0");
+		return -EINVAL;
+	}
+
+	if (!BTF_TYPE_ID_VALID(t->type)) {
+		btf_verifier_log_type(env, t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+	btf_verifier_log_type(env, t, NULL);
+
+	return 0;
+}
+
+static int btf_modifier_resolve(struct btf_verifier_env *env,
+				const struct resolve_vertex *v)
+{
+	const struct btf_type *t = v->t;
+	const struct btf_type *next_type;
+	u32 next_type_id = t->type;
+	struct btf *btf = env->btf;
+	u32 next_type_size = 0;
+
+	next_type = btf_type_by_id(btf, next_type_id);
+	if (!next_type) {
+		btf_verifier_log_type(env, v->t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+	/* "typedef void new_void", "const void"...etc */
+	if (btf_type_is_void(next_type))
+		goto resolved;
+
+	if (!env_type_is_resolve_sink(env, next_type) &&
+	    !env_type_is_resolved(env, next_type_id))
+		return env_stack_push(env, next_type, next_type_id);
+
+	/* Figure out the resolved next_type_id with size.
+	 * They will be stored in the current modifier's
+	 * resolved_ids and resolved_sizes such that it can
+	 * save us a few type-following when we use it later (e.g. in
+	 * pretty print).
+	 */
+	if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
+	    !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
+		btf_verifier_log_type(env, v->t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+resolved:
+	env_stack_pop_resolved(env, next_type_id, next_type_size);
+
+	return 0;
+}
+
+static int btf_ptr_resolve(struct btf_verifier_env *env,
+			   const struct resolve_vertex *v)
+{
+	const struct btf_type *next_type;
+	const struct btf_type *t = v->t;
+	u32 next_type_id = t->type;
+	struct btf *btf = env->btf;
+	u32 next_type_size = 0;
+
+	next_type = btf_type_by_id(btf, next_type_id);
+	if (!next_type) {
+		btf_verifier_log_type(env, v->t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+	/* "void *" */
+	if (btf_type_is_void(next_type))
+		goto resolved;
+
+	if (!env_type_is_resolve_sink(env, next_type) &&
+	    !env_type_is_resolved(env, next_type_id))
+		return env_stack_push(env, next_type, next_type_id);
+
+	/* If the modifier was RESOLVED during RESOLVE_STRUCT_OR_ARRAY,
+	 * the modifier may have stopped resolving when it was resolved
+	 * to a ptr (last-resolved-ptr).
+	 *
+	 * We now need to continue from the last-resolved-ptr to
+	 * ensure the last-resolved-ptr will not referring back to
+	 * the currenct ptr (t).
+	 */
+	if (btf_type_is_modifier(next_type)) {
+		const struct btf_type *resolved_type;
+		u32 resolved_type_id;
+
+		resolved_type_id = next_type_id;
+		resolved_type = btf_type_id_resolve(btf, &resolved_type_id);
+
+		if (btf_type_is_ptr(resolved_type) &&
+		    !env_type_is_resolve_sink(env, resolved_type) &&
+		    !env_type_is_resolved(env, resolved_type_id))
+			return env_stack_push(env, resolved_type,
+					      resolved_type_id);
+	}
+
+	if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
+	    !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
+		btf_verifier_log_type(env, v->t, "Invalid type_id");
+		return -EINVAL;
+	}
+
+resolved:
+	env_stack_pop_resolved(env, next_type_id, 0);
+
+	return 0;
+}
+
+static void btf_modifier_seq_show(const struct btf *btf,
+				  const struct btf_type *t,
+				  u32 type_id, void *data,
+				  u8 bits_offset, struct seq_file *m)
+{
+	t = btf_type_id_resolve(btf, &type_id);
+
+	btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);
+}
+
+static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t,
+			     u32 type_id, void *data, u8 bits_offset,
+			     struct seq_file *m)
+{
+	/* It is a hashed value */
+	seq_printf(m, "%p", *(void **)data);
+}
+
+static void btf_ref_type_log(struct btf_verifier_env *env,
+			     const struct btf_type *t)
+{
+	btf_verifier_log(env, "type_id=%u", t->type);
+}
+
+static struct btf_kind_operations modifier_ops = {
+	.check_meta = btf_ref_type_check_meta,
+	.resolve = btf_modifier_resolve,
+	.check_member = btf_modifier_check_member,
+	.log_details = btf_ref_type_log,
+	.seq_show = btf_modifier_seq_show,
+};
+
+static struct btf_kind_operations ptr_ops = {
+	.check_meta = btf_ref_type_check_meta,
+	.resolve = btf_ptr_resolve,
+	.check_member = btf_ptr_check_member,
+	.log_details = btf_ref_type_log,
+	.seq_show = btf_ptr_seq_show,
+};
+
+static s32 btf_fwd_check_meta(struct btf_verifier_env *env,
+			      const struct btf_type *t,
+			      u32 meta_left)
+{
+	if (btf_type_vlen(t)) {
+		btf_verifier_log_type(env, t, "vlen != 0");
+		return -EINVAL;
+	}
+
+	if (t->type) {
+		btf_verifier_log_type(env, t, "type != 0");
+		return -EINVAL;
+	}
+
+	btf_verifier_log_type(env, t, NULL);
+
+	return 0;
+}
+
+static struct btf_kind_operations fwd_ops = {
+	.check_meta = btf_fwd_check_meta,
+	.resolve = btf_df_resolve,
+	.check_member = btf_df_check_member,
+	.log_details = btf_ref_type_log,
+	.seq_show = btf_df_seq_show,
+};
+
+static int btf_array_check_member(struct btf_verifier_env *env,
+				  const struct btf_type *struct_type,
+				  const struct btf_member *member,
+				  const struct btf_type *member_type)
+{
+	u32 struct_bits_off = member->offset;
+	u32 struct_size, bytes_offset;
+	u32 array_type_id, array_size;
+	struct btf *btf = env->btf;
+
+	if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member is not byte aligned");
+		return -EINVAL;
+	}
+
+	array_type_id = member->type;
+	btf_type_id_size(btf, &array_type_id, &array_size);
+	struct_size = struct_type->size;
+	bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+	if (struct_size - bytes_offset < array_size) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member exceeds struct_size");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static s32 btf_array_check_meta(struct btf_verifier_env *env,
+				const struct btf_type *t,
+				u32 meta_left)
+{
+	const struct btf_array *array = btf_type_array(t);
+	u32 meta_needed = sizeof(*array);
+
+	if (meta_left < meta_needed) {
+		btf_verifier_log_basic(env, t,
+				       "meta_left:%u meta_needed:%u",
+				       meta_left, meta_needed);
+		return -EINVAL;
+	}
+
+	if (btf_type_vlen(t)) {
+		btf_verifier_log_type(env, t, "vlen != 0");
+		return -EINVAL;
+	}
+
+	if (t->size) {
+		btf_verifier_log_type(env, t, "size != 0");
+		return -EINVAL;
+	}
+
+	/* Array elem type and index type cannot be in type void,
+	 * so !array->type and !array->index_type are not allowed.
+	 */
+	if (!array->type || !BTF_TYPE_ID_VALID(array->type)) {
+		btf_verifier_log_type(env, t, "Invalid elem");
+		return -EINVAL;
+	}
+
+	if (!array->index_type || !BTF_TYPE_ID_VALID(array->index_type)) {
+		btf_verifier_log_type(env, t, "Invalid index");
+		return -EINVAL;
+	}
+
+	btf_verifier_log_type(env, t, NULL);
+
+	return meta_needed;
+}
+
+static int btf_array_resolve(struct btf_verifier_env *env,
+			     const struct resolve_vertex *v)
+{
+	const struct btf_array *array = btf_type_array(v->t);
+	const struct btf_type *elem_type, *index_type;
+	u32 elem_type_id, index_type_id;
+	struct btf *btf = env->btf;
+	u32 elem_size;
+
+	/* Check array->index_type */
+	index_type_id = array->index_type;
+	index_type = btf_type_by_id(btf, index_type_id);
+	if (btf_type_is_void_or_null(index_type)) {
+		btf_verifier_log_type(env, v->t, "Invalid index");
+		return -EINVAL;
+	}
+
+	if (!env_type_is_resolve_sink(env, index_type) &&
+	    !env_type_is_resolved(env, index_type_id))
+		return env_stack_push(env, index_type, index_type_id);
+
+	index_type = btf_type_id_size(btf, &index_type_id, NULL);
+	if (!index_type || !btf_type_is_int(index_type) ||
+	    !btf_type_int_is_regular(index_type)) {
+		btf_verifier_log_type(env, v->t, "Invalid index");
+		return -EINVAL;
+	}
+
+	/* Check array->type */
+	elem_type_id = array->type;
+	elem_type = btf_type_by_id(btf, elem_type_id);
+	if (btf_type_is_void_or_null(elem_type)) {
+		btf_verifier_log_type(env, v->t,
+				      "Invalid elem");
+		return -EINVAL;
+	}
+
+	if (!env_type_is_resolve_sink(env, elem_type) &&
+	    !env_type_is_resolved(env, elem_type_id))
+		return env_stack_push(env, elem_type, elem_type_id);
+
+	elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+	if (!elem_type) {
+		btf_verifier_log_type(env, v->t, "Invalid elem");
+		return -EINVAL;
+	}
+
+	if (btf_type_is_int(elem_type) && !btf_type_int_is_regular(elem_type)) {
+		btf_verifier_log_type(env, v->t, "Invalid array of int");
+		return -EINVAL;
+	}
+
+	if (array->nelems && elem_size > U32_MAX / array->nelems) {
+		btf_verifier_log_type(env, v->t,
+				      "Array size overflows U32_MAX");
+		return -EINVAL;
+	}
+
+	env_stack_pop_resolved(env, elem_type_id, elem_size * array->nelems);
+
+	return 0;
+}
+
+static void btf_array_log(struct btf_verifier_env *env,
+			  const struct btf_type *t)
+{
+	const struct btf_array *array = btf_type_array(t);
+
+	btf_verifier_log(env, "type_id=%u index_type_id=%u nr_elems=%u",
+			 array->type, array->index_type, array->nelems);
+}
+
+static void btf_array_seq_show(const struct btf *btf, const struct btf_type *t,
+			       u32 type_id, void *data, u8 bits_offset,
+			       struct seq_file *m)
+{
+	const struct btf_array *array = btf_type_array(t);
+	const struct btf_kind_operations *elem_ops;
+	const struct btf_type *elem_type;
+	u32 i, elem_size, elem_type_id;
+
+	elem_type_id = array->type;
+	elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+	elem_ops = btf_type_ops(elem_type);
+	seq_puts(m, "[");
+	for (i = 0; i < array->nelems; i++) {
+		if (i)
+			seq_puts(m, ",");
+
+		elem_ops->seq_show(btf, elem_type, elem_type_id, data,
+				   bits_offset, m);
+		data += elem_size;
+	}
+	seq_puts(m, "]");
+}
+
+static struct btf_kind_operations array_ops = {
+	.check_meta = btf_array_check_meta,
+	.resolve = btf_array_resolve,
+	.check_member = btf_array_check_member,
+	.log_details = btf_array_log,
+	.seq_show = btf_array_seq_show,
+};
+
+static int btf_struct_check_member(struct btf_verifier_env *env,
+				   const struct btf_type *struct_type,
+				   const struct btf_member *member,
+				   const struct btf_type *member_type)
+{
+	u32 struct_bits_off = member->offset;
+	u32 struct_size, bytes_offset;
+
+	if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member is not byte aligned");
+		return -EINVAL;
+	}
+
+	struct_size = struct_type->size;
+	bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+	if (struct_size - bytes_offset < member_type->size) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member exceeds struct_size");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static s32 btf_struct_check_meta(struct btf_verifier_env *env,
+				 const struct btf_type *t,
+				 u32 meta_left)
+{
+	bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION;
+	const struct btf_member *member;
+	struct btf *btf = env->btf;
+	u32 struct_size = t->size;
+	u32 meta_needed;
+	u16 i;
+
+	meta_needed = btf_type_vlen(t) * sizeof(*member);
+	if (meta_left < meta_needed) {
+		btf_verifier_log_basic(env, t,
+				       "meta_left:%u meta_needed:%u",
+				       meta_left, meta_needed);
+		return -EINVAL;
+	}
+
+	btf_verifier_log_type(env, t, NULL);
+
+	for_each_member(i, t, member) {
+		if (!btf_name_offset_valid(btf, member->name_off)) {
+			btf_verifier_log_member(env, t, member,
+						"Invalid member name_offset:%u",
+						member->name_off);
+			return -EINVAL;
+		}
+
+		/* A member cannot be in type void */
+		if (!member->type || !BTF_TYPE_ID_VALID(member->type)) {
+			btf_verifier_log_member(env, t, member,
+						"Invalid type_id");
+			return -EINVAL;
+		}
+
+		if (is_union && member->offset) {
+			btf_verifier_log_member(env, t, member,
+						"Invalid member bits_offset");
+			return -EINVAL;
+		}
+
+		if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) {
+			btf_verifier_log_member(env, t, member,
+						"Memmber bits_offset exceeds its struct size");
+			return -EINVAL;
+		}
+
+		btf_verifier_log_member(env, t, member, NULL);
+	}
+
+	return meta_needed;
+}
+
+static int btf_struct_resolve(struct btf_verifier_env *env,
+			      const struct resolve_vertex *v)
+{
+	const struct btf_member *member;
+	int err;
+	u16 i;
+
+	/* Before continue resolving the next_member,
+	 * ensure the last member is indeed resolved to a
+	 * type with size info.
+	 */
+	if (v->next_member) {
+		const struct btf_type *last_member_type;
+		const struct btf_member *last_member;
+		u16 last_member_type_id;
+
+		last_member = btf_type_member(v->t) + v->next_member - 1;
+		last_member_type_id = last_member->type;
+		if (WARN_ON_ONCE(!env_type_is_resolved(env,
+						       last_member_type_id)))
+			return -EINVAL;
+
+		last_member_type = btf_type_by_id(env->btf,
+						  last_member_type_id);
+		err = btf_type_ops(last_member_type)->check_member(env, v->t,
+							last_member,
+							last_member_type);
+		if (err)
+			return err;
+	}
+
+	for_each_member_from(i, v->next_member, v->t, member) {
+		u32 member_type_id = member->type;
+		const struct btf_type *member_type = btf_type_by_id(env->btf,
+								member_type_id);
+
+		if (btf_type_is_void_or_null(member_type)) {
+			btf_verifier_log_member(env, v->t, member,
+						"Invalid member");
+			return -EINVAL;
+		}
+
+		if (!env_type_is_resolve_sink(env, member_type) &&
+		    !env_type_is_resolved(env, member_type_id)) {
+			env_stack_set_next_member(env, i + 1);
+			return env_stack_push(env, member_type, member_type_id);
+		}
+
+		err = btf_type_ops(member_type)->check_member(env, v->t,
+							      member,
+							      member_type);
+		if (err)
+			return err;
+	}
+
+	env_stack_pop_resolved(env, 0, 0);
+
+	return 0;
+}
+
+static void btf_struct_log(struct btf_verifier_env *env,
+			   const struct btf_type *t)
+{
+	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
+}
+
+static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
+				u32 type_id, void *data, u8 bits_offset,
+				struct seq_file *m)
+{
+	const char *seq = BTF_INFO_KIND(t->info) == BTF_KIND_UNION ? "|" : ",";
+	const struct btf_member *member;
+	u32 i;
+
+	seq_puts(m, "{");
+	for_each_member(i, t, member) {
+		const struct btf_type *member_type = btf_type_by_id(btf,
+								member->type);
+		u32 member_offset = member->offset;
+		u32 bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
+		u8 bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
+		const struct btf_kind_operations *ops;
+
+		if (i)
+			seq_puts(m, seq);
+
+		ops = btf_type_ops(member_type);
+		ops->seq_show(btf, member_type, member->type,
+			      data + bytes_offset, bits8_offset, m);
+	}
+	seq_puts(m, "}");
+}
+
+static struct btf_kind_operations struct_ops = {
+	.check_meta = btf_struct_check_meta,
+	.resolve = btf_struct_resolve,
+	.check_member = btf_struct_check_member,
+	.log_details = btf_struct_log,
+	.seq_show = btf_struct_seq_show,
+};
+
+static int btf_enum_check_member(struct btf_verifier_env *env,
+				 const struct btf_type *struct_type,
+				 const struct btf_member *member,
+				 const struct btf_type *member_type)
+{
+	u32 struct_bits_off = member->offset;
+	u32 struct_size, bytes_offset;
+
+	if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member is not byte aligned");
+		return -EINVAL;
+	}
+
+	struct_size = struct_type->size;
+	bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+	if (struct_size - bytes_offset < sizeof(int)) {
+		btf_verifier_log_member(env, struct_type, member,
+					"Member exceeds struct_size");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static s32 btf_enum_check_meta(struct btf_verifier_env *env,
+			       const struct btf_type *t,
+			       u32 meta_left)
+{
+	const struct btf_enum *enums = btf_type_enum(t);
+	struct btf *btf = env->btf;
+	u16 i, nr_enums;
+	u32 meta_needed;
+
+	nr_enums = btf_type_vlen(t);
+	meta_needed = nr_enums * sizeof(*enums);
+
+	if (meta_left < meta_needed) {
+		btf_verifier_log_basic(env, t,
+				       "meta_left:%u meta_needed:%u",
+				       meta_left, meta_needed);
+		return -EINVAL;
+	}
+
+	if (t->size != sizeof(int)) {
+		btf_verifier_log_type(env, t, "Expected size:%zu",
+				      sizeof(int));
+		return -EINVAL;
+	}
+
+	btf_verifier_log_type(env, t, NULL);
+
+	for (i = 0; i < nr_enums; i++) {
+		if (!btf_name_offset_valid(btf, enums[i].name_off)) {
+			btf_verifier_log(env, "\tInvalid name_offset:%u",
+					 enums[i].name_off);
+			return -EINVAL;
+		}
+
+		btf_verifier_log(env, "\t%s val=%d\n",
+				 btf_name_by_offset(btf, enums[i].name_off),
+				 enums[i].val);
+	}
+
+	return meta_needed;
+}
+
+static void btf_enum_log(struct btf_verifier_env *env,
+			 const struct btf_type *t)
+{
+	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
+}
+
+static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t,
+			      u32 type_id, void *data, u8 bits_offset,
+			      struct seq_file *m)
+{
+	const struct btf_enum *enums = btf_type_enum(t);
+	u32 i, nr_enums = btf_type_vlen(t);
+	int v = *(int *)data;
+
+	for (i = 0; i < nr_enums; i++) {
+		if (v == enums[i].val) {
+			seq_printf(m, "%s",
+				   btf_name_by_offset(btf, enums[i].name_off));
+			return;
+		}
+	}
+
+	seq_printf(m, "%d", v);
+}
+
+static struct btf_kind_operations enum_ops = {
+	.check_meta = btf_enum_check_meta,
+	.resolve = btf_df_resolve,
+	.check_member = btf_enum_check_member,
+	.log_details = btf_enum_log,
+	.seq_show = btf_enum_seq_show,
+};
+
+static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
+	[BTF_KIND_INT] = &int_ops,
+	[BTF_KIND_PTR] = &ptr_ops,
+	[BTF_KIND_ARRAY] = &array_ops,
+	[BTF_KIND_STRUCT] = &struct_ops,
+	[BTF_KIND_UNION] = &struct_ops,
+	[BTF_KIND_ENUM] = &enum_ops,
+	[BTF_KIND_FWD] = &fwd_ops,
+	[BTF_KIND_TYPEDEF] = &modifier_ops,
+	[BTF_KIND_VOLATILE] = &modifier_ops,
+	[BTF_KIND_CONST] = &modifier_ops,
+	[BTF_KIND_RESTRICT] = &modifier_ops,
+};
+
+static s32 btf_check_meta(struct btf_verifier_env *env,
+			  const struct btf_type *t,
+			  u32 meta_left)
+{
+	u32 saved_meta_left = meta_left;
+	s32 var_meta_size;
+
+	if (meta_left < sizeof(*t)) {
+		btf_verifier_log(env, "[%u] meta_left:%u meta_needed:%zu",
+				 env->log_type_id, meta_left, sizeof(*t));
+		return -EINVAL;
+	}
+	meta_left -= sizeof(*t);
+
+	if (t->info & ~BTF_INFO_MASK) {
+		btf_verifier_log(env, "[%u] Invalid btf_info:%x",
+				 env->log_type_id, t->info);
+		return -EINVAL;
+	}
+
+	if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX ||
+	    BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) {
+		btf_verifier_log(env, "[%u] Invalid kind:%u",
+				 env->log_type_id, BTF_INFO_KIND(t->info));
+		return -EINVAL;
+	}
+
+	if (!btf_name_offset_valid(env->btf, t->name_off)) {
+		btf_verifier_log(env, "[%u] Invalid name_offset:%u",
+				 env->log_type_id, t->name_off);
+		return -EINVAL;
+	}
+
+	var_meta_size = btf_type_ops(t)->check_meta(env, t, meta_left);
+	if (var_meta_size < 0)
+		return var_meta_size;
+
+	meta_left -= var_meta_size;
+
+	return saved_meta_left - meta_left;
+}
+
+static int btf_check_all_metas(struct btf_verifier_env *env)
+{
+	struct btf *btf = env->btf;
+	struct btf_header *hdr;
+	void *cur, *end;
+
+	hdr = &btf->hdr;
+	cur = btf->nohdr_data + hdr->type_off;
+	end = btf->nohdr_data + hdr->type_len;
+
+	env->log_type_id = 1;
+	while (cur < end) {
+		struct btf_type *t = cur;
+		s32 meta_size;
+
+		meta_size = btf_check_meta(env, t, end - cur);
+		if (meta_size < 0)
+			return meta_size;
+
+		btf_add_type(env, t);
+		cur += meta_size;
+		env->log_type_id++;
+	}
+
+	return 0;
+}
+
+static int btf_resolve(struct btf_verifier_env *env,
+		       const struct btf_type *t, u32 type_id)
+{
+	const struct resolve_vertex *v;
+	int err = 0;
+
+	env->resolve_mode = RESOLVE_TBD;
+	env_stack_push(env, t, type_id);
+	while (!err && (v = env_stack_peak(env))) {
+		env->log_type_id = v->type_id;
+		err = btf_type_ops(v->t)->resolve(env, v);
+	}
+
+	env->log_type_id = type_id;
+	if (err == -E2BIG)
+		btf_verifier_log_type(env, t,
+				      "Exceeded max resolving depth:%u",
+				      MAX_RESOLVE_DEPTH);
+	else if (err == -EEXIST)
+		btf_verifier_log_type(env, t, "Loop detected");
+
+	return err;
+}
+
+static bool btf_resolve_valid(struct btf_verifier_env *env,
+			      const struct btf_type *t,
+			      u32 type_id)
+{
+	struct btf *btf = env->btf;
+
+	if (!env_type_is_resolved(env, type_id))
+		return false;
+
+	if (btf_type_is_struct(t))
+		return !btf->resolved_ids[type_id] &&
+			!btf->resolved_sizes[type_id];
+
+	if (btf_type_is_modifier(t) || btf_type_is_ptr(t)) {
+		t = btf_type_id_resolve(btf, &type_id);
+		return t && !btf_type_is_modifier(t);
+	}
+
+	if (btf_type_is_array(t)) {
+		const struct btf_array *array = btf_type_array(t);
+		const struct btf_type *elem_type;
+		u32 elem_type_id = array->type;
+		u32 elem_size;
+
+		elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+		return elem_type && !btf_type_is_modifier(elem_type) &&
+			(array->nelems * elem_size ==
+			 btf->resolved_sizes[type_id]);
+	}
+
+	return false;
+}
+
+static int btf_check_all_types(struct btf_verifier_env *env)
+{
+	struct btf *btf = env->btf;
+	u32 type_id;
+	int err;
+
+	err = env_resolve_init(env);
+	if (err)
+		return err;
+
+	env->phase++;
+	for (type_id = 1; type_id <= btf->nr_types; type_id++) {
+		const struct btf_type *t = btf_type_by_id(btf, type_id);
+
+		env->log_type_id = type_id;
+		if (btf_type_needs_resolve(t) &&
+		    !env_type_is_resolved(env, type_id)) {
+			err = btf_resolve(env, t, type_id);
+			if (err)
+				return err;
+		}
+
+		if (btf_type_needs_resolve(t) &&
+		    !btf_resolve_valid(env, t, type_id)) {
+			btf_verifier_log_type(env, t, "Invalid resolve state");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static int btf_parse_type_sec(struct btf_verifier_env *env)
+{
+	const struct btf_header *hdr = &env->btf->hdr;
+	int err;
+
+	/* Type section must align to 4 bytes */
+	if (hdr->type_off & (sizeof(u32) - 1)) {
+		btf_verifier_log(env, "Unaligned type_off");
+		return -EINVAL;
+	}
+
+	if (!hdr->type_len) {
+		btf_verifier_log(env, "No type found");
+		return -EINVAL;
+	}
+
+	err = btf_check_all_metas(env);
+	if (err)
+		return err;
+
+	return btf_check_all_types(env);
+}
+
+static int btf_parse_str_sec(struct btf_verifier_env *env)
+{
+	const struct btf_header *hdr;
+	struct btf *btf = env->btf;
+	const char *start, *end;
+
+	hdr = &btf->hdr;
+	start = btf->nohdr_data + hdr->str_off;
+	end = start + hdr->str_len;
+
+	if (end != btf->data + btf->data_size) {
+		btf_verifier_log(env, "String section is not at the end");
+		return -EINVAL;
+	}
+
+	if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET ||
+	    start[0] || end[-1]) {
+		btf_verifier_log(env, "Invalid string section");
+		return -EINVAL;
+	}
+
+	btf->strings = start;
+
+	return 0;
+}
+
+static const size_t btf_sec_info_offset[] = {
+	offsetof(struct btf_header, type_off),
+	offsetof(struct btf_header, str_off),
+};
+
+static int btf_sec_info_cmp(const void *a, const void *b)
+{
+	const struct btf_sec_info *x = a;
+	const struct btf_sec_info *y = b;
+
+	return (int)(x->off - y->off) ? : (int)(x->len - y->len);
+}
+
+static int btf_check_sec_info(struct btf_verifier_env *env,
+			      u32 btf_data_size)
+{
+	struct btf_sec_info secs[ARRAY_SIZE(btf_sec_info_offset)];
+	u32 total, expected_total, i;
+	const struct btf_header *hdr;
+	const struct btf *btf;
+
+	btf = env->btf;
+	hdr = &btf->hdr;
+
+	/* Populate the secs from hdr */
+	for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++)
+		secs[i] = *(struct btf_sec_info *)((void *)hdr +
+						   btf_sec_info_offset[i]);
+
+	sort(secs, ARRAY_SIZE(btf_sec_info_offset),
+	     sizeof(struct btf_sec_info), btf_sec_info_cmp, NULL);
+
+	/* Check for gaps and overlap among sections */
+	total = 0;
+	expected_total = btf_data_size - hdr->hdr_len;
+	for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) {
+		if (expected_total < secs[i].off) {
+			btf_verifier_log(env, "Invalid section offset");
+			return -EINVAL;
+		}
+		if (total < secs[i].off) {
+			/* gap */
+			btf_verifier_log(env, "Unsupported section found");
+			return -EINVAL;
+		}
+		if (total > secs[i].off) {
+			btf_verifier_log(env, "Section overlap found");
+			return -EINVAL;
+		}
+		if (expected_total - total < secs[i].len) {
+			btf_verifier_log(env,
+					 "Total section length too long");
+			return -EINVAL;
+		}
+		total += secs[i].len;
+	}
+
+	/* There is data other than hdr and known sections */
+	if (expected_total != total) {
+		btf_verifier_log(env, "Unsupported section found");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data,
+			 u32 btf_data_size)
+{
+	const struct btf_header *hdr;
+	u32 hdr_len, hdr_copy;
+	/*
+	 * Minimal part of the "struct btf_header" that
+	 * contains the hdr_len.
+	 */
+	struct btf_min_header {
+		u16	magic;
+		u8	version;
+		u8	flags;
+		u32	hdr_len;
+	} __user *min_hdr;
+	struct btf *btf;
+	int err;
+
+	btf = env->btf;
+	min_hdr = btf_data;
+
+	if (btf_data_size < sizeof(*min_hdr)) {
+		btf_verifier_log(env, "hdr_len not found");
+		return -EINVAL;
+	}
+
+	if (get_user(hdr_len, &min_hdr->hdr_len))
+		return -EFAULT;
+
+	if (btf_data_size < hdr_len) {
+		btf_verifier_log(env, "btf_header not found");
+		return -EINVAL;
+	}
+
+	err = bpf_check_uarg_tail_zero(btf_data, sizeof(btf->hdr), hdr_len);
+	if (err) {
+		if (err == -E2BIG)
+			btf_verifier_log(env, "Unsupported btf_header");
+		return err;
+	}
+
+	hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr));
+	if (copy_from_user(&btf->hdr, btf_data, hdr_copy))
+		return -EFAULT;
+
+	hdr = &btf->hdr;
+
+	btf_verifier_log_hdr(env, btf_data_size);
+
+	if (hdr->magic != BTF_MAGIC) {
+		btf_verifier_log(env, "Invalid magic");
+		return -EINVAL;
+	}
+
+	if (hdr->version != BTF_VERSION) {
+		btf_verifier_log(env, "Unsupported version");
+		return -ENOTSUPP;
+	}
+
+	if (hdr->flags) {
+		btf_verifier_log(env, "Unsupported flags");
+		return -ENOTSUPP;
+	}
+
+	if (btf_data_size == hdr->hdr_len) {
+		btf_verifier_log(env, "No data");
+		return -EINVAL;
+	}
+
+	err = btf_check_sec_info(env, btf_data_size);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
+			     u32 log_level, char __user *log_ubuf, u32 log_size)
+{
+	struct btf_verifier_env *env = NULL;
+	struct bpf_verifier_log *log;
+	struct btf *btf = NULL;
+	u8 *data;
+	int err;
+
+	if (btf_data_size > BTF_MAX_SIZE)
+		return ERR_PTR(-E2BIG);
+
+	env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
+	if (!env)
+		return ERR_PTR(-ENOMEM);
+
+	log = &env->log;
+	if (log_level || log_ubuf || log_size) {
+		/* user requested verbose verifier output
+		 * and supplied buffer to store the verification trace
+		 */
+		log->level = log_level;
+		log->ubuf = log_ubuf;
+		log->len_total = log_size;
+
+		/* log attributes have to be sane */
+		if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
+		    !log->level || !log->ubuf) {
+			err = -EINVAL;
+			goto errout;
+		}
+	}
+
+	btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
+	if (!btf) {
+		err = -ENOMEM;
+		goto errout;
+	}
+	env->btf = btf;
+
+	err = btf_parse_hdr(env, btf_data, btf_data_size);
+	if (err)
+		goto errout;
+
+	data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN);
+	if (!data) {
+		err = -ENOMEM;
+		goto errout;
+	}
+
+	btf->data = data;
+	btf->data_size = btf_data_size;
+	btf->nohdr_data = btf->data + btf->hdr.hdr_len;
+
+	if (copy_from_user(data, btf_data, btf_data_size)) {
+		err = -EFAULT;
+		goto errout;
+	}
+
+	err = btf_parse_str_sec(env);
+	if (err)
+		goto errout;
+
+	err = btf_parse_type_sec(env);
+	if (err)
+		goto errout;
+
+	if (log->level && bpf_verifier_log_full(log)) {
+		err = -ENOSPC;
+		goto errout;
+	}
+
+	btf_verifier_env_free(env);
+	refcount_set(&btf->refcnt, 1);
+	return btf;
+
+errout:
+	btf_verifier_env_free(env);
+	if (btf)
+		btf_free(btf);
+	return ERR_PTR(err);
+}
+
+void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
+		       struct seq_file *m)
+{
+	const struct btf_type *t = btf_type_by_id(btf, type_id);
+
+	btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m);
+}
+
+static int btf_release(struct inode *inode, struct file *filp)
+{
+	btf_put(filp->private_data);
+	return 0;
+}
+
+const struct file_operations btf_fops = {
+	.release	= btf_release,
+};
+
+static int __btf_new_fd(struct btf *btf)
+{
+	return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
+}
+
+int btf_new_fd(const union bpf_attr *attr)
+{
+	struct btf *btf;
+	int ret;
+
+	btf = btf_parse(u64_to_user_ptr(attr->btf),
+			attr->btf_size, attr->btf_log_level,
+			u64_to_user_ptr(attr->btf_log_buf),
+			attr->btf_log_size);
+	if (IS_ERR(btf))
+		return PTR_ERR(btf);
+
+	ret = btf_alloc_id(btf);
+	if (ret) {
+		btf_free(btf);
+		return ret;
+	}
+
+	/*
+	 * The BTF ID is published to the userspace.
+	 * All BTF free must go through call_rcu() from
+	 * now on (i.e. free by calling btf_put()).
+	 */
+
+	ret = __btf_new_fd(btf);
+	if (ret < 0)
+		btf_put(btf);
+
+	return ret;
+}
+
+struct btf *btf_get_by_fd(int fd)
+{
+	struct btf *btf;
+	struct fd f;
+
+	f = fdget(fd);
+
+	if (!f.file)
+		return ERR_PTR(-EBADF);
+
+	if (f.file->f_op != &btf_fops) {
+		fdput(f);
+		return ERR_PTR(-EINVAL);
+	}
+
+	btf = f.file->private_data;
+	refcount_inc(&btf->refcnt);
+	fdput(f);
+
+	return btf;
+}
+
+int btf_get_info_by_fd(const struct btf *btf,
+		       const union bpf_attr *attr,
+		       union bpf_attr __user *uattr)
+{
+	struct bpf_btf_info __user *uinfo;
+	struct bpf_btf_info info = {};
+	u32 info_copy, btf_copy;
+	void __user *ubtf;
+	u32 uinfo_len;
+
+	uinfo = u64_to_user_ptr(attr->info.info);
+	uinfo_len = attr->info.info_len;
+
+	info_copy = min_t(u32, uinfo_len, sizeof(info));
+	if (copy_from_user(&info, uinfo, info_copy))
+		return -EFAULT;
+
+	info.id = btf->id;
+	ubtf = u64_to_user_ptr(info.btf);
+	btf_copy = min_t(u32, btf->data_size, info.btf_size);
+	if (copy_to_user(ubtf, btf->data, btf_copy))
+		return -EFAULT;
+	info.btf_size = btf->data_size;
+
+	if (copy_to_user(uinfo, &info, info_copy) ||
+	    put_user(info_copy, &uattr->info.info_len))
+		return -EFAULT;
+
+	return 0;
+}
+
+int btf_get_fd_by_id(u32 id)
+{
+	struct btf *btf;
+	int fd;
+
+	rcu_read_lock();
+	btf = idr_find(&btf_idr, id);
+	if (!btf || !refcount_inc_not_zero(&btf->refcnt))
+		btf = ERR_PTR(-ENOENT);
+	rcu_read_unlock();
+
+	if (IS_ERR(btf))
+		return PTR_ERR(btf);
+
+	fd = __btf_new_fd(btf);
+	if (fd < 0)
+		btf_put(btf);
+
+	return fd;
+}
+
+u32 btf_id(const struct btf *btf)
+{
+	return btf->id;
+}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 43171a0bb02b..f7c00bd6f8e4 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -500,6 +500,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
  * @sk: sock struct that will use sockaddr
  * @uaddr: sockaddr struct provided by user
  * @type: The type of program to be exectuted
+ * @t_ctx: Pointer to attach type specific context
  *
  * socket is expected to be of type INET or INET6.
  *
@@ -508,12 +509,15 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
  */
 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 				      struct sockaddr *uaddr,
-				      enum bpf_attach_type type)
+				      enum bpf_attach_type type,
+				      void *t_ctx)
 {
 	struct bpf_sock_addr_kern ctx = {
 		.sk = sk,
 		.uaddr = uaddr,
+		.t_ctx = t_ctx,
 	};
+	struct sockaddr_storage unspec;
 	struct cgroup *cgrp;
 	int ret;
 
@@ -523,6 +527,11 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
 	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
 		return 0;
 
+	if (!ctx.uaddr) {
+		memset(&unspec, 0, sizeof(unspec));
+		ctx.uaddr = (struct sockaddr *)&unspec;
+	}
+
 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d315b393abdd..9f1493705f40 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -31,6 +31,7 @@
 #include <linux/rbtree_latch.h>
 #include <linux/kallsyms.h>
 #include <linux/rcupdate.h>
+#include <linux/perf_event.h>
 
 #include <asm/unaligned.h>
 
@@ -218,47 +219,84 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
 	return 0;
 }
 
-static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta)
+static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta,
+				u32 curr, const bool probe_pass)
 {
+	const s64 imm_min = S32_MIN, imm_max = S32_MAX;
+	s64 imm = insn->imm;
+
+	if (curr < pos && curr + imm + 1 > pos)
+		imm += delta;
+	else if (curr > pos + delta && curr + imm + 1 <= pos + delta)
+		imm -= delta;
+	if (imm < imm_min || imm > imm_max)
+		return -ERANGE;
+	if (!probe_pass)
+		insn->imm = imm;
+	return 0;
+}
+
+static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta,
+				u32 curr, const bool probe_pass)
+{
+	const s32 off_min = S16_MIN, off_max = S16_MAX;
+	s32 off = insn->off;
+
+	if (curr < pos && curr + off + 1 > pos)
+		off += delta;
+	else if (curr > pos + delta && curr + off + 1 <= pos + delta)
+		off -= delta;
+	if (off < off_min || off > off_max)
+		return -ERANGE;
+	if (!probe_pass)
+		insn->off = off;
+	return 0;
+}
+
+static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta,
+			    const bool probe_pass)
+{
+	u32 i, insn_cnt = prog->len + (probe_pass ? delta : 0);
 	struct bpf_insn *insn = prog->insnsi;
-	u32 i, insn_cnt = prog->len;
-	bool pseudo_call;
-	u8 code;
-	int off;
+	int ret = 0;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
+		u8 code;
+
+		/* In the probing pass we still operate on the original,
+		 * unpatched image in order to check overflows before we
+		 * do any other adjustments. Therefore skip the patchlet.
+		 */
+		if (probe_pass && i == pos) {
+			i += delta + 1;
+			insn++;
+		}
 		code = insn->code;
-		if (BPF_CLASS(code) != BPF_JMP)
-			continue;
-		if (BPF_OP(code) == BPF_EXIT)
+		if (BPF_CLASS(code) != BPF_JMP ||
+		    BPF_OP(code) == BPF_EXIT)
 			continue;
+		/* Adjust offset of jmps if we cross patch boundaries. */
 		if (BPF_OP(code) == BPF_CALL) {
-			if (insn->src_reg == BPF_PSEUDO_CALL)
-				pseudo_call = true;
-			else
+			if (insn->src_reg != BPF_PSEUDO_CALL)
 				continue;
+			ret = bpf_adj_delta_to_imm(insn, pos, delta, i,
+						   probe_pass);
 		} else {
-			pseudo_call = false;
+			ret = bpf_adj_delta_to_off(insn, pos, delta, i,
+						   probe_pass);
 		}
-		off = pseudo_call ? insn->imm : insn->off;
-
-		/* Adjust offset of jmps if we cross boundaries. */
-		if (i < pos && i + off + 1 > pos)
-			off += delta;
-		else if (i > pos + delta && i + off + 1 <= pos + delta)
-			off -= delta;
-
-		if (pseudo_call)
-			insn->imm = off;
-		else
-			insn->off = off;
+		if (ret)
+			break;
 	}
+
+	return ret;
 }
 
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 				       const struct bpf_insn *patch, u32 len)
 {
 	u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
+	const u32 cnt_max = S16_MAX;
 	struct bpf_prog *prog_adj;
 
 	/* Since our patchlet doesn't expand the image, we're done. */
@@ -269,6 +307,15 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 
 	insn_adj_cnt = prog->len + insn_delta;
 
+	/* Reject anything that would potentially let the insn->off
+	 * target overflow when we have excessive program expansions.
+	 * We need to probe here before we do any reallocation where
+	 * we afterwards may not fail anymore.
+	 */
+	if (insn_adj_cnt > cnt_max &&
+	    bpf_adj_branches(prog, off, insn_delta, true))
+		return NULL;
+
 	/* Several new instructions need to be inserted. Make room
 	 * for them. Likely, there's no need for a new allocation as
 	 * last page could have large enough tailroom.
@@ -294,7 +341,11 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 		sizeof(*patch) * insn_rest);
 	memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);
 
-	bpf_adj_branches(prog_adj, off, insn_delta);
+	/* We are guaranteed to not fail at this point, otherwise
+	 * the ship has sailed to reverse to the original state. An
+	 * overflow cannot happen at this point.
+	 */
+	BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false));
 
 	return prog_adj;
 }
@@ -633,23 +684,6 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
 		*to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
 		break;
 
-	case BPF_LD | BPF_ABS | BPF_W:
-	case BPF_LD | BPF_ABS | BPF_H:
-	case BPF_LD | BPF_ABS | BPF_B:
-		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
-		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
-		*to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
-		break;
-
-	case BPF_LD | BPF_IND | BPF_W:
-	case BPF_LD | BPF_IND | BPF_H:
-	case BPF_LD | BPF_IND | BPF_B:
-		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
-		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
-		*to++ = BPF_ALU32_REG(BPF_ADD, BPF_REG_AX, from->src_reg);
-		*to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
-		break;
-
 	case BPF_LD | BPF_IMM | BPF_DW:
 		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
 		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
@@ -890,14 +924,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
 	INSN_3(LDX, MEM, W),			\
 	INSN_3(LDX, MEM, DW),			\
 	/*   Immediate based. */		\
-	INSN_3(LD, IMM, DW),			\
-	/*   Misc (old cBPF carry-over). */	\
-	INSN_3(LD, ABS, B),			\
-	INSN_3(LD, ABS, H),			\
-	INSN_3(LD, ABS, W),			\
-	INSN_3(LD, IND, B),			\
-	INSN_3(LD, IND, H),			\
-	INSN_3(LD, IND, W)
+	INSN_3(LD, IMM, DW)
 
 bool bpf_opcode_in_insntable(u8 code)
 {
@@ -907,6 +934,13 @@ bool bpf_opcode_in_insntable(u8 code)
 		[0 ... 255] = false,
 		/* Now overwrite non-defaults ... */
 		BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL),
+		/* UAPI exposed, but rewritten opcodes. cBPF carry-over. */
+		[BPF_LD | BPF_ABS | BPF_B] = true,
+		[BPF_LD | BPF_ABS | BPF_H] = true,
+		[BPF_LD | BPF_ABS | BPF_W] = true,
+		[BPF_LD | BPF_IND | BPF_B] = true,
+		[BPF_LD | BPF_IND | BPF_H] = true,
+		[BPF_LD | BPF_IND | BPF_W] = true,
 	};
 #undef BPF_INSN_3_TBL
 #undef BPF_INSN_2_TBL
@@ -937,8 +971,6 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
 #undef BPF_INSN_3_LBL
 #undef BPF_INSN_2_LBL
 	u32 tail_call_cnt = 0;
-	void *ptr;
-	int off;
 
 #define CONT	 ({ insn++; goto select_insn; })
 #define CONT_JMP ({ insn++; goto select_insn; })
@@ -1265,67 +1297,6 @@ out:
 		atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
 			     (DST + insn->off));
 		CONT;
-	LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
-		off = IMM;
-load_word:
-		/* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only
-		 * appearing in the programs where ctx == skb
-		 * (see may_access_skb() in the verifier). All programs
-		 * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6,
-		 * bpf_convert_filter() saves it in BPF_R6, internal BPF
-		 * verifier will check that BPF_R6 == ctx.
-		 *
-		 * BPF_ABS and BPF_IND are wrappers of function calls,
-		 * so they scratch BPF_R1-BPF_R5 registers, preserve
-		 * BPF_R6-BPF_R9, and store return value into BPF_R0.
-		 *
-		 * Implicit input:
-		 *   ctx == skb == BPF_R6 == CTX
-		 *
-		 * Explicit input:
-		 *   SRC == any register
-		 *   IMM == 32-bit immediate
-		 *
-		 * Output:
-		 *   BPF_R0 - 8/16/32-bit skb data converted to cpu endianness
-		 */
-
-		ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp);
-		if (likely(ptr != NULL)) {
-			BPF_R0 = get_unaligned_be32(ptr);
-			CONT;
-		}
-
-		return 0;
-	LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */
-		off = IMM;
-load_half:
-		ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp);
-		if (likely(ptr != NULL)) {
-			BPF_R0 = get_unaligned_be16(ptr);
-			CONT;
-		}
-
-		return 0;
-	LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */
-		off = IMM;
-load_byte:
-		ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp);
-		if (likely(ptr != NULL)) {
-			BPF_R0 = *(u8 *)ptr;
-			CONT;
-		}
-
-		return 0;
-	LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */
-		off = IMM + SRC;
-		goto load_word;
-	LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */
-		off = IMM + SRC;
-		goto load_half;
-	LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */
-		off = IMM + SRC;
-		goto load_byte;
 
 	default_label:
 		/* If we ever reach this, we have a bug somewhere. Die hard here
@@ -1572,13 +1543,32 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs)
 	return cnt;
 }
 
+static bool bpf_prog_array_copy_core(struct bpf_prog **prog,
+				     u32 *prog_ids,
+				     u32 request_cnt)
+{
+	int i = 0;
+
+	for (; *prog; prog++) {
+		if (*prog == &dummy_bpf_prog.prog)
+			continue;
+		prog_ids[i] = (*prog)->aux->id;
+		if (++i == request_cnt) {
+			prog++;
+			break;
+		}
+	}
+
+	return !!(*prog);
+}
+
 int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 				__u32 __user *prog_ids, u32 cnt)
 {
 	struct bpf_prog **prog;
 	unsigned long err = 0;
-	u32 i = 0, *ids;
 	bool nospc;
+	u32 *ids;
 
 	/* users of this function are doing:
 	 * cnt = bpf_prog_array_length();
@@ -1595,16 +1585,7 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 		return -ENOMEM;
 	rcu_read_lock();
 	prog = rcu_dereference(progs)->progs;
-	for (; *prog; prog++) {
-		if (*prog == &dummy_bpf_prog.prog)
-			continue;
-		ids[i] = (*prog)->aux->id;
-		if (++i == cnt) {
-			prog++;
-			break;
-		}
-	}
-	nospc = !!(*prog);
+	nospc = bpf_prog_array_copy_core(prog, ids, cnt);
 	rcu_read_unlock();
 	err = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
 	kfree(ids);
@@ -1635,6 +1616,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 	int new_prog_cnt, carry_prog_cnt = 0;
 	struct bpf_prog **existing_prog;
 	struct bpf_prog_array *array;
+	bool found_exclude = false;
 	int new_prog_idx = 0;
 
 	/* Figure out how many existing progs we need to carry over to
@@ -1643,14 +1625,20 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 	if (old_array) {
 		existing_prog = old_array->progs;
 		for (; *existing_prog; existing_prog++) {
-			if (*existing_prog != exclude_prog &&
-			    *existing_prog != &dummy_bpf_prog.prog)
+			if (*existing_prog == exclude_prog) {
+				found_exclude = true;
+				continue;
+			}
+			if (*existing_prog != &dummy_bpf_prog.prog)
 				carry_prog_cnt++;
 			if (*existing_prog == include_prog)
 				return -EEXIST;
 		}
 	}
 
+	if (exclude_prog && !found_exclude)
+		return -ENOENT;
+
 	/* How many progs (not NULL) will be in the new array? */
 	new_prog_cnt = carry_prog_cnt;
 	if (include_prog)
@@ -1683,22 +1671,25 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 }
 
 int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
-			     __u32 __user *prog_ids, u32 request_cnt,
-			     __u32 __user *prog_cnt)
+			     u32 *prog_ids, u32 request_cnt,
+			     u32 *prog_cnt)
 {
+	struct bpf_prog **prog;
 	u32 cnt = 0;
 
 	if (array)
 		cnt = bpf_prog_array_length(array);
 
-	if (copy_to_user(prog_cnt, &cnt, sizeof(cnt)))
-		return -EFAULT;
+	*prog_cnt = cnt;
 
 	/* return early if user requested only program count or nothing to copy */
 	if (!request_cnt || !cnt)
 		return 0;
 
-	return bpf_prog_array_copy_to_user(array, prog_ids, request_cnt);
+	/* this function is called under trace/bpf_trace.c: bpf_event_mutex */
+	prog = rcu_dereference_check(array, 1)->progs;
+	return bpf_prog_array_copy_core(prog, prog_ids, request_cnt) ? -ENOSPC
+								     : 0;
 }
 
 static void bpf_prog_free_deferred(struct work_struct *work)
@@ -1709,6 +1700,10 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 	aux = container_of(work, struct bpf_prog_aux, work);
 	if (bpf_prog_is_dev_bound(aux))
 		bpf_prog_offload_destroy(aux->prog);
+#ifdef CONFIG_PERF_EVENTS
+	if (aux->prog->has_callchain_buf)
+		put_callchain_buffers();
+#endif
 	for (i = 0; i < aux->func_cnt; i++)
 		bpf_jit_free(aux->func[i]);
 	if (aux->func_cnt) {
@@ -1769,6 +1764,8 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
 const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
 const struct bpf_func_proto bpf_get_current_comm_proto __weak;
 const struct bpf_func_proto bpf_sock_map_update_proto __weak;
+const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
+const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
 
 const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
 {
@@ -1781,6 +1778,7 @@ bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 {
 	return -ENOTSUPP;
 }
+EXPORT_SYMBOL_GPL(bpf_event_output);
 
 /* Always built-in helper functions. */
 const struct bpf_func_proto bpf_tail_call_proto = {
@@ -1827,9 +1825,3 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
 #include <linux/bpf_trace.h>
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
-
-/* These are only used within the BPF_SYSCALL code */
-#ifdef CONFIG_BPF_SYSCALL
-EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type);
-EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu);
-#endif
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index a4bb0b34375a..e0918d180f08 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -19,6 +19,7 @@
 #include <linux/bpf.h>
 #include <linux/filter.h>
 #include <linux/ptr_ring.h>
+#include <net/xdp.h>
 
 #include <linux/sched.h>
 #include <linux/workqueue.h>
@@ -137,27 +138,6 @@ free_cmap:
 	return ERR_PTR(err);
 }
 
-static void __cpu_map_queue_destructor(void *ptr)
-{
-	/* The tear-down procedure should have made sure that queue is
-	 * empty.  See __cpu_map_entry_replace() and work-queue
-	 * invoked cpu_map_kthread_stop(). Catch any broken behaviour
-	 * gracefully and warn once.
-	 */
-	if (WARN_ON_ONCE(ptr))
-		page_frag_free(ptr);
-}
-
-static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
-{
-	if (atomic_dec_and_test(&rcpu->refcnt)) {
-		/* The queue should be empty at this point */
-		ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor);
-		kfree(rcpu->queue);
-		kfree(rcpu);
-	}
-}
-
 static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
 {
 	atomic_inc(&rcpu->refcnt);
@@ -179,45 +159,8 @@ static void cpu_map_kthread_stop(struct work_struct *work)
 	kthread_stop(rcpu->kthread);
 }
 
-/* For now, xdp_pkt is a cpumap internal data structure, with info
- * carried between enqueue to dequeue. It is mapped into the top
- * headroom of the packet, to avoid allocating separate mem.
- */
-struct xdp_pkt {
-	void *data;
-	u16 len;
-	u16 headroom;
-	u16 metasize;
-	struct net_device *dev_rx;
-};
-
-/* Convert xdp_buff to xdp_pkt */
-static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
-{
-	struct xdp_pkt *xdp_pkt;
-	int metasize;
-	int headroom;
-
-	/* Assure headroom is available for storing info */
-	headroom = xdp->data - xdp->data_hard_start;
-	metasize = xdp->data - xdp->data_meta;
-	metasize = metasize > 0 ? metasize : 0;
-	if (unlikely((headroom - metasize) < sizeof(*xdp_pkt)))
-		return NULL;
-
-	/* Store info in top of packet */
-	xdp_pkt = xdp->data_hard_start;
-
-	xdp_pkt->data = xdp->data;
-	xdp_pkt->len  = xdp->data_end - xdp->data;
-	xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
-	xdp_pkt->metasize = metasize;
-
-	return xdp_pkt;
-}
-
 static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
-					 struct xdp_pkt *xdp_pkt)
+					 struct xdp_frame *xdpf)
 {
 	unsigned int frame_size;
 	void *pkt_data_start;
@@ -232,7 +175,7 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
 	 * would be preferred to set frame_size to 2048 or 4096
 	 * depending on the driver.
 	 *   frame_size = 2048;
-	 *   frame_len  = frame_size - sizeof(*xdp_pkt);
+	 *   frame_len  = frame_size - sizeof(*xdp_frame);
 	 *
 	 * Instead, with info avail, skb_shared_info in placed after
 	 * packet len.  This, unfortunately fakes the truesize.
@@ -240,21 +183,21 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
 	 * is not at a fixed memory location, with mixed length
 	 * packets, which is bad for cache-line hotness.
 	 */
-	frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom +
+	frame_size = SKB_DATA_ALIGN(xdpf->len) + xdpf->headroom +
 		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
-	pkt_data_start = xdp_pkt->data - xdp_pkt->headroom;
+	pkt_data_start = xdpf->data - xdpf->headroom;
 	skb = build_skb(pkt_data_start, frame_size);
 	if (!skb)
 		return NULL;
 
-	skb_reserve(skb, xdp_pkt->headroom);
-	__skb_put(skb, xdp_pkt->len);
-	if (xdp_pkt->metasize)
-		skb_metadata_set(skb, xdp_pkt->metasize);
+	skb_reserve(skb, xdpf->headroom);
+	__skb_put(skb, xdpf->len);
+	if (xdpf->metasize)
+		skb_metadata_set(skb, xdpf->metasize);
 
 	/* Essential SKB info: protocol and skb->dev */
-	skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx);
+	skb->protocol = eth_type_trans(skb, xdpf->dev_rx);
 
 	/* Optional SKB info, currently missing:
 	 * - HW checksum info		(skb->ip_summed)
@@ -265,6 +208,31 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
 	return skb;
 }
 
+static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
+{
+	/* The tear-down procedure should have made sure that queue is
+	 * empty.  See __cpu_map_entry_replace() and work-queue
+	 * invoked cpu_map_kthread_stop(). Catch any broken behaviour
+	 * gracefully and warn once.
+	 */
+	struct xdp_frame *xdpf;
+
+	while ((xdpf = ptr_ring_consume(ring)))
+		if (WARN_ON_ONCE(xdpf))
+			xdp_return_frame(xdpf);
+}
+
+static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+	if (atomic_dec_and_test(&rcpu->refcnt)) {
+		/* The queue should be empty at this point */
+		__cpu_map_ring_cleanup(rcpu->queue);
+		ptr_ring_cleanup(rcpu->queue, NULL);
+		kfree(rcpu->queue);
+		kfree(rcpu);
+	}
+}
+
 static int cpu_map_kthread_run(void *data)
 {
 	struct bpf_cpu_map_entry *rcpu = data;
@@ -278,7 +246,7 @@ static int cpu_map_kthread_run(void *data)
 	 */
 	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
 		unsigned int processed = 0, drops = 0, sched = 0;
-		struct xdp_pkt *xdp_pkt;
+		struct xdp_frame *xdpf;
 
 		/* Release CPU reschedule checks */
 		if (__ptr_ring_empty(rcpu->queue)) {
@@ -301,13 +269,13 @@ static int cpu_map_kthread_run(void *data)
 		 * kthread CPU pinned. Lockless access to ptr_ring
 		 * consume side valid as no-resize allowed of queue.
 		 */
-		while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) {
+		while ((xdpf = __ptr_ring_consume(rcpu->queue))) {
 			struct sk_buff *skb;
 			int ret;
 
-			skb = cpu_map_build_skb(rcpu, xdp_pkt);
+			skb = cpu_map_build_skb(rcpu, xdpf);
 			if (!skb) {
-				page_frag_free(xdp_pkt);
+				xdp_return_frame(xdpf);
 				continue;
 			}
 
@@ -604,13 +572,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 	spin_lock(&q->producer_lock);
 
 	for (i = 0; i < bq->count; i++) {
-		void *xdp_pkt = bq->q[i];
+		struct xdp_frame *xdpf = bq->q[i];
 		int err;
 
-		err = __ptr_ring_produce(q, xdp_pkt);
+		err = __ptr_ring_produce(q, xdpf);
 		if (err) {
 			drops++;
-			page_frag_free(xdp_pkt); /* Free xdp_pkt */
+			xdp_return_frame_rx_napi(xdpf);
 		}
 		processed++;
 	}
@@ -625,7 +593,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
  * Thus, safe percpu variable access.
  */
-static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
+static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
 {
 	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
 
@@ -636,28 +604,28 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
 	 * driver to code invoking us to finished, due to driver
 	 * (e.g. ixgbe) recycle tricks based on page-refcnt.
 	 *
-	 * Thus, incoming xdp_pkt is always queued here (else we race
+	 * Thus, incoming xdp_frame is always queued here (else we race
 	 * with another CPU on page-refcnt and remaining driver code).
 	 * Queue time is very short, as driver will invoke flush
 	 * operation, when completing napi->poll call.
 	 */
-	bq->q[bq->count++] = xdp_pkt;
+	bq->q[bq->count++] = xdpf;
 	return 0;
 }
 
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 		    struct net_device *dev_rx)
 {
-	struct xdp_pkt *xdp_pkt;
+	struct xdp_frame *xdpf;
 
-	xdp_pkt = convert_to_xdp_pkt(xdp);
-	if (unlikely(!xdp_pkt))
+	xdpf = convert_to_xdp_frame(xdp);
+	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
 	/* Info needed when constructing SKB on remote CPU */
-	xdp_pkt->dev_rx = dev_rx;
+	xdpf->dev_rx = dev_rx;
 
-	bq_enqueue(rcpu, xdp_pkt);
+	bq_enqueue(rcpu, xdpf);
 	return 0;
 }
 
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 565f9ece9115..a7cc7b3494a9 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -48,15 +48,25 @@
  * calls will fail at this point.
  */
 #include <linux/bpf.h>
+#include <net/xdp.h>
 #include <linux/filter.h>
+#include <trace/events/xdp.h>
 
 #define DEV_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
+#define DEV_MAP_BULK_SIZE 16
+struct xdp_bulk_queue {
+	struct xdp_frame *q[DEV_MAP_BULK_SIZE];
+	struct net_device *dev_rx;
+	unsigned int count;
+};
+
 struct bpf_dtab_netdev {
-	struct net_device *dev;
+	struct net_device *dev; /* must be first member, due to tracepoint */
 	struct bpf_dtab *dtab;
 	unsigned int bit;
+	struct xdp_bulk_queue __percpu *bulkq;
 	struct rcu_head rcu;
 };
 
@@ -206,6 +216,50 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
 	__set_bit(bit, bitmap);
 }
 
+static int bq_xmit_all(struct bpf_dtab_netdev *obj,
+		       struct xdp_bulk_queue *bq, u32 flags)
+{
+	struct net_device *dev = obj->dev;
+	int sent = 0, drops = 0, err = 0;
+	int i;
+
+	if (unlikely(!bq->count))
+		return 0;
+
+	for (i = 0; i < bq->count; i++) {
+		struct xdp_frame *xdpf = bq->q[i];
+
+		prefetch(xdpf);
+	}
+
+	sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
+	if (sent < 0) {
+		err = sent;
+		sent = 0;
+		goto error;
+	}
+	drops = bq->count - sent;
+out:
+	bq->count = 0;
+
+	trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
+			      sent, drops, bq->dev_rx, dev, err);
+	bq->dev_rx = NULL;
+	return 0;
+error:
+	/* If ndo_xdp_xmit fails with an errno, no frames have been
+	 * xmit'ed and it's our responsibility to them free all.
+	 */
+	for (i = 0; i < bq->count; i++) {
+		struct xdp_frame *xdpf = bq->q[i];
+
+		/* RX path under NAPI protection, can return frames faster */
+		xdp_return_frame_rx_napi(xdpf);
+		drops++;
+	}
+	goto out;
+}
+
 /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
  * from the driver before returning from its napi->poll() routine. The poll()
  * routine is called either from busy_poll context or net_rx_action signaled
@@ -221,7 +275,7 @@ void __dev_map_flush(struct bpf_map *map)
 
 	for_each_set_bit(bit, bitmap, map->max_entries) {
 		struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
-		struct net_device *netdev;
+		struct xdp_bulk_queue *bq;
 
 		/* This is possible if the dev entry is removed by user space
 		 * between xdp redirect and flush op.
@@ -230,9 +284,9 @@ void __dev_map_flush(struct bpf_map *map)
 			continue;
 
 		__clear_bit(bit, bitmap);
-		netdev = dev->dev;
-		if (likely(netdev->netdev_ops->ndo_xdp_flush))
-			netdev->netdev_ops->ndo_xdp_flush(netdev);
+
+		bq = this_cpu_ptr(dev->bulkq);
+		bq_xmit_all(dev, bq, XDP_XMIT_FLUSH);
 	}
 }
 
@@ -240,37 +294,79 @@ void __dev_map_flush(struct bpf_map *map)
  * update happens in parallel here a dev_put wont happen until after reading the
  * ifindex.
  */
-struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
+struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	struct bpf_dtab_netdev *dev;
+	struct bpf_dtab_netdev *obj;
 
 	if (key >= map->max_entries)
 		return NULL;
 
-	dev = READ_ONCE(dtab->netdev_map[key]);
-	return dev ? dev->dev : NULL;
+	obj = READ_ONCE(dtab->netdev_map[key]);
+	return obj;
+}
+
+/* Runs under RCU-read-side, plus in softirq under NAPI protection.
+ * Thus, safe percpu variable access.
+ */
+static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
+		      struct net_device *dev_rx)
+
+{
+	struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
+
+	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
+		bq_xmit_all(obj, bq, 0);
+
+	/* Ingress dev_rx will be the same for all xdp_frame's in
+	 * bulk_queue, because bq stored per-CPU and must be flushed
+	 * from net_device drivers NAPI func end.
+	 */
+	if (!bq->dev_rx)
+		bq->dev_rx = dev_rx;
+
+	bq->q[bq->count++] = xdpf;
+	return 0;
+}
+
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+		    struct net_device *dev_rx)
+{
+	struct net_device *dev = dst->dev;
+	struct xdp_frame *xdpf;
+
+	if (!dev->netdev_ops->ndo_xdp_xmit)
+		return -EOPNOTSUPP;
+
+	xdpf = convert_to_xdp_frame(xdp);
+	if (unlikely(!xdpf))
+		return -EOVERFLOW;
+
+	return bq_enqueue(dst, xdpf, dev_rx);
 }
 
 static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 {
-	struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key);
+	struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
+	struct net_device *dev = obj ? obj->dev : NULL;
 
 	return dev ? &dev->ifindex : NULL;
 }
 
 static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
 {
-	if (dev->dev->netdev_ops->ndo_xdp_flush) {
-		struct net_device *fl = dev->dev;
+	if (dev->dev->netdev_ops->ndo_xdp_xmit) {
+		struct xdp_bulk_queue *bq;
 		unsigned long *bitmap;
+
 		int cpu;
 
 		for_each_online_cpu(cpu) {
 			bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
 			__clear_bit(dev->bit, bitmap);
 
-			fl->netdev_ops->ndo_xdp_flush(dev->dev);
+			bq = per_cpu_ptr(dev->bulkq, cpu);
+			bq_xmit_all(dev, bq, XDP_XMIT_FLUSH);
 		}
 	}
 }
@@ -281,6 +377,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
 
 	dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
 	dev_map_flush_old(dev);
+	free_percpu(dev->bulkq);
 	dev_put(dev->dev);
 	kfree(dev);
 }
@@ -313,6 +410,7 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct net *net = current->nsproxy->net_ns;
+	gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
 	struct bpf_dtab_netdev *dev, *old_dev;
 	u32 i = *(u32 *)key;
 	u32 ifindex = *(u32 *)value;
@@ -327,13 +425,20 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (!ifindex) {
 		dev = NULL;
 	} else {
-		dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
-				   map->numa_node);
+		dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node);
 		if (!dev)
 			return -ENOMEM;
 
+		dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
+						sizeof(void *), gfp);
+		if (!dev->bulkq) {
+			kfree(dev);
+			return -ENOMEM;
+		}
+
 		dev->dev = dev_get_by_index(net, ifindex);
 		if (!dev->dev) {
+			free_percpu(dev->bulkq);
 			kfree(dev);
 			return -EINVAL;
 		}
@@ -405,6 +510,9 @@ static struct notifier_block dev_map_notifier = {
 
 static int __init dev_map_init(void)
 {
+	/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
+	BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
+		     offsetof(struct _bpf_dtab_netdev, dev));
 	register_netdevice_notifier(&dev_map_notifier);
 	return 0;
 }
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index b76828f23b49..3ca2198a6d22 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -503,7 +503,9 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
 	struct bpf_insn *insn = insn_buf;
 	const int ret = BPF_REG_0;
 
-	*insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
+		     (void *(*)(struct bpf_map *map, void *key))NULL));
+	*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
 	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
 	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
 				offsetof(struct htab_elem, key) +
@@ -530,7 +532,9 @@ static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
 	const int ret = BPF_REG_0;
 	const int ref_reg = BPF_REG_1;
 
-	*insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
+		     (void *(*)(struct bpf_map *map, void *key))NULL));
+	*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
 	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
 	*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
 			      offsetof(struct htab_elem, lru_node) +
@@ -1369,7 +1373,9 @@ static u32 htab_of_map_gen_lookup(struct bpf_map *map,
 	struct bpf_insn *insn = insn_buf;
 	const int ret = BPF_REG_0;
 
-	*insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
+		     (void *(*)(struct bpf_map *map, void *key))NULL));
+	*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
 	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
 	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
 				offsetof(struct htab_elem, key) +
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 3d24e238221e..73065e2d23c2 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -179,3 +179,18 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
 	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
 	.arg2_type	= ARG_CONST_SIZE,
 };
+
+#ifdef CONFIG_CGROUPS
+BPF_CALL_0(bpf_get_current_cgroup_id)
+{
+	struct cgroup *cgrp = task_dfl_cgroup(current);
+
+	return cgrp->kn->id.id;
+}
+
+const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
+	.func		= bpf_get_current_cgroup_id,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+};
+#endif
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index bf6da59ae0d0..ed13645bd80c 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -150,8 +150,154 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	return 0;
 }
 
+struct map_iter {
+	void *key;
+	bool done;
+};
+
+static struct map_iter *map_iter(struct seq_file *m)
+{
+	return m->private;
+}
+
+static struct bpf_map *seq_file_to_map(struct seq_file *m)
+{
+	return file_inode(m->file)->i_private;
+}
+
+static void map_iter_free(struct map_iter *iter)
+{
+	if (iter) {
+		kfree(iter->key);
+		kfree(iter);
+	}
+}
+
+static struct map_iter *map_iter_alloc(struct bpf_map *map)
+{
+	struct map_iter *iter;
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL | __GFP_NOWARN);
+	if (!iter)
+		goto error;
+
+	iter->key = kzalloc(map->key_size, GFP_KERNEL | __GFP_NOWARN);
+	if (!iter->key)
+		goto error;
+
+	return iter;
+
+error:
+	map_iter_free(iter);
+	return NULL;
+}
+
+static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct bpf_map *map = seq_file_to_map(m);
+	void *key = map_iter(m)->key;
+
+	if (map_iter(m)->done)
+		return NULL;
+
+	if (unlikely(v == SEQ_START_TOKEN))
+		goto done;
+
+	if (map->ops->map_get_next_key(map, key, key)) {
+		map_iter(m)->done = true;
+		return NULL;
+	}
+
+done:
+	++(*pos);
+	return key;
+}
+
+static void *map_seq_start(struct seq_file *m, loff_t *pos)
+{
+	if (map_iter(m)->done)
+		return NULL;
+
+	return *pos ? map_iter(m)->key : SEQ_START_TOKEN;
+}
+
+static void map_seq_stop(struct seq_file *m, void *v)
+{
+}
+
+static int map_seq_show(struct seq_file *m, void *v)
+{
+	struct bpf_map *map = seq_file_to_map(m);
+	void *key = map_iter(m)->key;
+
+	if (unlikely(v == SEQ_START_TOKEN)) {
+		seq_puts(m, "# WARNING!! The output is for debug purpose only\n");
+		seq_puts(m, "# WARNING!! The output format will change\n");
+	} else {
+		map->ops->map_seq_show_elem(map, key, m);
+	}
+
+	return 0;
+}
+
+static const struct seq_operations bpffs_map_seq_ops = {
+	.start	= map_seq_start,
+	.next	= map_seq_next,
+	.show	= map_seq_show,
+	.stop	= map_seq_stop,
+};
+
+static int bpffs_map_open(struct inode *inode, struct file *file)
+{
+	struct bpf_map *map = inode->i_private;
+	struct map_iter *iter;
+	struct seq_file *m;
+	int err;
+
+	iter = map_iter_alloc(map);
+	if (!iter)
+		return -ENOMEM;
+
+	err = seq_open(file, &bpffs_map_seq_ops);
+	if (err) {
+		map_iter_free(iter);
+		return err;
+	}
+
+	m = file->private_data;
+	m->private = iter;
+
+	return 0;
+}
+
+static int bpffs_map_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+
+	map_iter_free(map_iter(m));
+
+	return seq_release(inode, file);
+}
+
+/* bpffs_map_fops should only implement the basic
+ * read operation for a BPF map.  The purpose is to
+ * provide a simple user intuitive way to do
+ * "cat bpffs/pathto/a-pinned-map".
+ *
+ * Other operations (e.g. write, lookup...) should be realized by
+ * the userspace tools (e.g. bpftool) through the
+ * BPF_OBJ_GET_INFO_BY_FD and the map's lookup/update
+ * interface.
+ */
+static const struct file_operations bpffs_map_fops = {
+	.open		= bpffs_map_open,
+	.read		= seq_read,
+	.release	= bpffs_map_release,
+};
+
 static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
-			 const struct inode_operations *iops)
+			 const struct inode_operations *iops,
+			 const struct file_operations *fops)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
 	struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode);
@@ -159,6 +305,7 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
 		return PTR_ERR(inode);
 
 	inode->i_op = iops;
+	inode->i_fop = fops;
 	inode->i_private = raw;
 
 	bpf_dentry_finalize(dentry, inode, dir);
@@ -167,12 +314,15 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
 
 static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg)
 {
-	return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops);
+	return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops, NULL);
 }
 
 static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
 {
-	return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops);
+	struct bpf_map *map = arg;
+
+	return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops,
+			     map->btf ? &bpffs_map_fops : NULL);
 }
 
 static struct dentry *
@@ -279,13 +429,6 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
 	ret = bpf_obj_do_pin(pname, raw, type);
 	if (ret != 0)
 		bpf_any_put(raw, type);
-	if ((trace_bpf_obj_pin_prog_enabled() ||
-	     trace_bpf_obj_pin_map_enabled()) && !ret) {
-		if (type == BPF_TYPE_PROG)
-			trace_bpf_obj_pin_prog(raw, ufd, pname);
-		if (type == BPF_TYPE_MAP)
-			trace_bpf_obj_pin_map(raw, ufd, pname);
-	}
 out:
 	putname(pname);
 	return ret;
@@ -352,15 +495,8 @@ int bpf_obj_get_user(const char __user *pathname, int flags)
 	else
 		goto out;
 
-	if (ret < 0) {
+	if (ret < 0)
 		bpf_any_put(raw, type);
-	} else if (trace_bpf_obj_get_prog_enabled() ||
-		   trace_bpf_obj_get_map_enabled()) {
-		if (type == BPF_TYPE_PROG)
-			trace_bpf_obj_get_prog(raw, ret, pname);
-		if (type == BPF_TYPE_MAP)
-			trace_bpf_obj_get_map(raw, ret, pname);
-	}
 out:
 	putname(pname);
 	return ret;
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index c9401075b58c..ac747d5cf7c6 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017 Netronome Systems, Inc.
+ * Copyright (C) 2017-2018 Netronome Systems, Inc.
  *
  * This software is licensed under the GNU General License Version 2,
  * June 1991 as shown in the file COPYING in the top-level directory of this
@@ -474,8 +474,10 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map)
 	struct bpf_prog_offload *offload;
 	bool ret;
 
-	if (!bpf_prog_is_dev_bound(prog->aux) || !bpf_map_is_dev_bound(map))
+	if (!bpf_prog_is_dev_bound(prog->aux))
 		return false;
+	if (!bpf_map_is_dev_bound(map))
+		return bpf_map_offload_neutral(map);
 
 	down_read(&bpf_devs_lock);
 	offload = prog->aux->offload;
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index d2bda5aa25d7..52a91d816c0e 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -43,18 +43,45 @@
 #include <net/tcp.h>
 #include <linux/ptr_ring.h>
 #include <net/inet_common.h>
+#include <linux/sched/signal.h>
 
 #define SOCK_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
-struct bpf_stab {
-	struct bpf_map map;
-	struct sock **sock_map;
+struct bpf_sock_progs {
 	struct bpf_prog *bpf_tx_msg;
 	struct bpf_prog *bpf_parse;
 	struct bpf_prog *bpf_verdict;
 };
 
+struct bpf_stab {
+	struct bpf_map map;
+	struct sock **sock_map;
+	struct bpf_sock_progs progs;
+};
+
+struct bucket {
+	struct hlist_head head;
+	raw_spinlock_t lock;
+};
+
+struct bpf_htab {
+	struct bpf_map map;
+	struct bucket *buckets;
+	atomic_t count;
+	u32 n_buckets;
+	u32 elem_size;
+	struct bpf_sock_progs progs;
+};
+
+struct htab_elem {
+	struct rcu_head rcu;
+	struct hlist_node hash_node;
+	u32 hash;
+	struct sock *sk;
+	char key[0];
+};
+
 enum smap_psock_state {
 	SMAP_TX_RUNNING,
 };
@@ -62,6 +89,8 @@ enum smap_psock_state {
 struct smap_psock_map_entry {
 	struct list_head list;
 	struct sock **entry;
+	struct htab_elem *hash_link;
+	struct bpf_htab *htab;
 };
 
 struct smap_psock {
@@ -182,12 +211,20 @@ static void bpf_tcp_release(struct sock *sk)
 		psock->cork = NULL;
 	}
 
-	sk->sk_prot = psock->sk_proto;
-	psock->sk_proto = NULL;
+	if (psock->sk_proto) {
+		sk->sk_prot = psock->sk_proto;
+		psock->sk_proto = NULL;
+	}
 out:
 	rcu_read_unlock();
 }
 
+static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
+{
+	atomic_dec(&htab->count);
+	kfree_rcu(l, rcu);
+}
+
 static void bpf_tcp_close(struct sock *sk, long timeout)
 {
 	void (*close_fun)(struct sock *sk, long timeout);
@@ -211,6 +248,12 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
 	close_fun = psock->save_close;
 
 	write_lock_bh(&sk->sk_callback_lock);
+	if (psock->cork) {
+		free_start_sg(psock->sock, psock->cork);
+		kfree(psock->cork);
+		psock->cork = NULL;
+	}
+
 	list_for_each_entry_safe(md, mtmp, &psock->ingress, list) {
 		list_del(&md->list);
 		free_start_sg(psock->sock, md);
@@ -218,10 +261,16 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
 	}
 
 	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-		osk = cmpxchg(e->entry, sk, NULL);
-		if (osk == sk) {
-			list_del(&e->list);
-			smap_release_sock(psock, sk);
+		if (e->entry) {
+			osk = cmpxchg(e->entry, sk, NULL);
+			if (osk == sk) {
+				list_del(&e->list);
+				smap_release_sock(psock, sk);
+			}
+		} else {
+			hlist_del_rcu(&e->hash_link->hash_node);
+			smap_release_sock(psock, e->hash_link->sk);
+			free_htab_elem(e->htab, e->hash_link);
 		}
 	}
 	write_unlock_bh(&sk->sk_callback_lock);
@@ -317,6 +366,9 @@ retry:
 			if (ret > 0) {
 				if (apply)
 					apply_bytes -= ret;
+
+				sg->offset += ret;
+				sg->length -= ret;
 				size -= ret;
 				offset += ret;
 				if (uncharge)
@@ -324,8 +376,6 @@ retry:
 				goto retry;
 			}
 
-			sg->length = size;
-			sg->offset = offset;
 			return ret;
 		}
 
@@ -383,7 +433,8 @@ static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
 	} while (i != md->sg_end);
 }
 
-static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
+static void free_bytes_sg(struct sock *sk, int bytes,
+			  struct sk_msg_buff *md, bool charge)
 {
 	struct scatterlist *sg = md->sg_data;
 	int i = md->sg_start, free;
@@ -393,11 +444,13 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
 		if (bytes < free) {
 			sg[i].length -= bytes;
 			sg[i].offset += bytes;
-			sk_mem_uncharge(sk, bytes);
+			if (charge)
+				sk_mem_uncharge(sk, bytes);
 			break;
 		}
 
-		sk_mem_uncharge(sk, sg[i].length);
+		if (charge)
+			sk_mem_uncharge(sk, sg[i].length);
 		put_page(sg_page(&sg[i]));
 		bytes -= sg[i].length;
 		sg[i].length = 0;
@@ -408,6 +461,7 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
 		if (i == MAX_SKB_FRAGS)
 			i = 0;
 	}
+	md->sg_start = i;
 }
 
 static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
@@ -447,7 +501,7 @@ static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md)
 static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md)
 {
 	return ((_rc == SK_PASS) ?
-	       (md->map ? __SK_REDIRECT : __SK_PASS) :
+	       (md->sk_redir ? __SK_REDIRECT : __SK_PASS) :
 	       __SK_DROP);
 }
 
@@ -469,6 +523,7 @@ static unsigned int smap_do_tx_msg(struct sock *sk,
 	}
 
 	bpf_compute_data_pointers_sg(md);
+	md->sk = sk;
 	rc = (*prog->bpf_func)(md, prog->insnsi);
 	psock->apply_bytes = md->apply_bytes;
 
@@ -515,8 +570,6 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
 	i = md->sg_start;
 
 	do {
-		r->sg_data[i] = md->sg_data[i];
-
 		size = (apply && apply_bytes < md->sg_data[i].length) ?
 			apply_bytes : md->sg_data[i].length;
 
@@ -527,6 +580,7 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
 		}
 
 		sk_mem_charge(sk, size);
+		r->sg_data[i] = md->sg_data[i];
 		r->sg_data[i].length = size;
 		md->sg_data[i].length -= size;
 		md->sg_data[i].offset += size;
@@ -567,10 +621,10 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
 				       struct sk_msg_buff *md,
 				       int flags)
 {
+	bool ingress = !!(md->flags & BPF_F_INGRESS);
 	struct smap_psock *psock;
 	struct scatterlist *sg;
-	int i, err, free = 0;
-	bool ingress = !!(md->flags & BPF_F_INGRESS);
+	int err = 0;
 
 	sg = md->sg_data;
 
@@ -598,16 +652,8 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
 out_rcu:
 	rcu_read_unlock();
 out:
-	i = md->sg_start;
-	while (sg[i].length) {
-		free += sg[i].length;
-		put_page(sg_page(&sg[i]));
-		sg[i].length = 0;
-		i++;
-		if (i == MAX_SKB_FRAGS)
-			i = 0;
-	}
-	return free;
+	free_bytes_sg(NULL, send, md, false);
+	return err;
 }
 
 static inline void bpf_md_init(struct smap_psock *psock)
@@ -692,19 +738,26 @@ more_data:
 		err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags);
 		lock_sock(sk);
 
+		if (unlikely(err < 0)) {
+			free_start_sg(sk, m);
+			psock->sg_size = 0;
+			if (!cork)
+				*copied -= send;
+		} else {
+			psock->sg_size -= send;
+		}
+
 		if (cork) {
 			free_start_sg(sk, m);
+			psock->sg_size = 0;
 			kfree(m);
 			m = NULL;
+			err = 0;
 		}
-		if (unlikely(err))
-			*copied -= err;
-		else
-			psock->sg_size -= send;
 		break;
 	case __SK_DROP:
 	default:
-		free_bytes_sg(sk, send, m);
+		free_bytes_sg(sk, send, m, true);
 		apply_bytes_dec(psock, send);
 		*copied -= send;
 		psock->sg_size -= send;
@@ -724,6 +777,26 @@ out_err:
 	return err;
 }
 
+static int bpf_wait_data(struct sock *sk,
+			 struct smap_psock *psk, int flags,
+			 long timeo, int *err)
+{
+	int rc;
+
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	rc = sk_wait_event(sk, &timeo,
+			   !list_empty(&psk->ingress) ||
+			   !skb_queue_empty(&sk->sk_receive_queue),
+			   &wait);
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	remove_wait_queue(sk_sleep(sk), &wait);
+
+	return rc;
+}
+
 static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			   int nonblock, int flags, int *addr_len)
 {
@@ -747,6 +820,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
 
 	lock_sock(sk);
+bytes_ready:
 	while (copied != len) {
 		struct scatterlist *sg;
 		struct sk_msg_buff *md;
@@ -801,6 +875,28 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		}
 	}
 
+	if (!copied) {
+		long timeo;
+		int data;
+		int err = 0;
+
+		timeo = sock_rcvtimeo(sk, nonblock);
+		data = bpf_wait_data(sk, psock, flags, timeo, &err);
+
+		if (data) {
+			if (!skb_queue_empty(&sk->sk_receive_queue)) {
+				release_sock(sk);
+				smap_release_sock(psock, sk);
+				copied = tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+				return copied;
+			}
+			goto bytes_ready;
+		}
+
+		if (err)
+			copied = err;
+	}
+
 	release_sock(sk);
 	smap_release_sock(psock, sk);
 	return copied;
@@ -1037,7 +1133,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 	 * when we orphan the skb so that we don't have the possibility
 	 * to reference a stale map.
 	 */
-	TCP_SKB_CB(skb)->bpf.map = NULL;
+	TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
 	skb->sk = psock->sock;
 	bpf_compute_data_pointers(skb);
 	preempt_disable();
@@ -1047,7 +1143,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 
 	/* Moving return codes from UAPI namespace into internal namespace */
 	return rc == SK_PASS ?
-		(TCP_SKB_CB(skb)->bpf.map ? __SK_REDIRECT : __SK_PASS) :
+		(TCP_SKB_CB(skb)->bpf.sk_redir ? __SK_REDIRECT : __SK_PASS) :
 		__SK_DROP;
 }
 
@@ -1317,7 +1413,6 @@ static int smap_init_sock(struct smap_psock *psock,
 }
 
 static void smap_init_progs(struct smap_psock *psock,
-			    struct bpf_stab *stab,
 			    struct bpf_prog *verdict,
 			    struct bpf_prog *parse)
 {
@@ -1395,14 +1490,13 @@ static void smap_gc_work(struct work_struct *w)
 	kfree(psock);
 }
 
-static struct smap_psock *smap_init_psock(struct sock *sock,
-					  struct bpf_stab *stab)
+static struct smap_psock *smap_init_psock(struct sock *sock, int node)
 {
 	struct smap_psock *psock;
 
 	psock = kzalloc_node(sizeof(struct smap_psock),
 			     GFP_ATOMIC | __GFP_NOWARN,
-			     stab->map.numa_node);
+			     node);
 	if (!psock)
 		return ERR_PTR(-ENOMEM);
 
@@ -1434,9 +1528,6 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 	    attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
 		return ERR_PTR(-EINVAL);
 
-	if (attr->value_size > KMALLOC_MAX_SIZE)
-		return ERR_PTR(-E2BIG);
-
 	err = bpf_tcp_ulp_register();
 	if (err && err != -EEXIST)
 		return ERR_PTR(err);
@@ -1473,12 +1564,14 @@ free_stab:
 	return ERR_PTR(err);
 }
 
-static void smap_list_remove(struct smap_psock *psock, struct sock **entry)
+static void smap_list_remove(struct smap_psock *psock,
+			     struct sock **entry,
+			     struct htab_elem *hash_link)
 {
 	struct smap_psock_map_entry *e, *tmp;
 
 	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-		if (e->entry == entry) {
+		if (e->entry == entry || e->hash_link == hash_link) {
 			list_del(&e->list);
 			break;
 		}
@@ -1516,7 +1609,7 @@ static void sock_map_free(struct bpf_map *map)
 		 * to be null and queued for garbage collection.
 		 */
 		if (likely(psock)) {
-			smap_list_remove(psock, &stab->sock_map[i]);
+			smap_list_remove(psock, &stab->sock_map[i], NULL);
 			smap_release_sock(psock, sock);
 		}
 		write_unlock_bh(&sock->sk_callback_lock);
@@ -1575,7 +1668,7 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
 
 	if (psock->bpf_parse)
 		smap_stop_sock(psock, sock);
-	smap_list_remove(psock, &stab->sock_map[k]);
+	smap_list_remove(psock, &stab->sock_map[k], NULL);
 	smap_release_sock(psock, sock);
 out:
 	write_unlock_bh(&sock->sk_callback_lock);
@@ -1610,40 +1703,26 @@ out:
  *  - sock_map must use READ_ONCE and (cmp)xchg operations
  *  - BPF verdict/parse programs must use READ_ONCE and xchg operations
  */
-static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
-				    struct bpf_map *map,
-				    void *key, u64 flags)
+
+static int __sock_map_ctx_update_elem(struct bpf_map *map,
+				      struct bpf_sock_progs *progs,
+				      struct sock *sock,
+				      struct sock **map_link,
+				      void *key)
 {
-	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-	struct smap_psock_map_entry *e = NULL;
 	struct bpf_prog *verdict, *parse, *tx_msg;
-	struct sock *osock, *sock;
+	struct smap_psock_map_entry *e = NULL;
 	struct smap_psock *psock;
-	u32 i = *(u32 *)key;
 	bool new = false;
-	int err;
-
-	if (unlikely(flags > BPF_EXIST))
-		return -EINVAL;
-
-	if (unlikely(i >= stab->map.max_entries))
-		return -E2BIG;
-
-	sock = READ_ONCE(stab->sock_map[i]);
-	if (flags == BPF_EXIST && !sock)
-		return -ENOENT;
-	else if (flags == BPF_NOEXIST && sock)
-		return -EEXIST;
-
-	sock = skops->sk;
+	int err = 0;
 
 	/* 1. If sock map has BPF programs those will be inherited by the
 	 * sock being added. If the sock is already attached to BPF programs
 	 * this results in an error.
 	 */
-	verdict = READ_ONCE(stab->bpf_verdict);
-	parse = READ_ONCE(stab->bpf_parse);
-	tx_msg = READ_ONCE(stab->bpf_tx_msg);
+	verdict = READ_ONCE(progs->bpf_verdict);
+	parse = READ_ONCE(progs->bpf_parse);
+	tx_msg = READ_ONCE(progs->bpf_tx_msg);
 
 	if (parse && verdict) {
 		/* bpf prog refcnt may be zero if a concurrent attach operation
@@ -1651,11 +1730,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 		 * we increment the refcnt. If this is the case abort with an
 		 * error.
 		 */
-		verdict = bpf_prog_inc_not_zero(stab->bpf_verdict);
+		verdict = bpf_prog_inc_not_zero(verdict);
 		if (IS_ERR(verdict))
 			return PTR_ERR(verdict);
 
-		parse = bpf_prog_inc_not_zero(stab->bpf_parse);
+		parse = bpf_prog_inc_not_zero(parse);
 		if (IS_ERR(parse)) {
 			bpf_prog_put(verdict);
 			return PTR_ERR(parse);
@@ -1663,12 +1742,12 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 	}
 
 	if (tx_msg) {
-		tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg);
+		tx_msg = bpf_prog_inc_not_zero(tx_msg);
 		if (IS_ERR(tx_msg)) {
-			if (verdict)
-				bpf_prog_put(verdict);
-			if (parse)
+			if (parse && verdict) {
 				bpf_prog_put(parse);
+				bpf_prog_put(verdict);
+			}
 			return PTR_ERR(tx_msg);
 		}
 	}
@@ -1696,7 +1775,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 			goto out_progs;
 		}
 	} else {
-		psock = smap_init_psock(sock, stab);
+		psock = smap_init_psock(sock, map->numa_node);
 		if (IS_ERR(psock)) {
 			err = PTR_ERR(psock);
 			goto out_progs;
@@ -1706,12 +1785,13 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 		new = true;
 	}
 
-	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
-	if (!e) {
-		err = -ENOMEM;
-		goto out_progs;
+	if (map_link) {
+		e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+		if (!e) {
+			err = -ENOMEM;
+			goto out_progs;
+		}
 	}
-	e->entry = &stab->sock_map[i];
 
 	/* 3. At this point we have a reference to a valid psock that is
 	 * running. Attach any BPF programs needed.
@@ -1728,7 +1808,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 		err = smap_init_sock(psock, sock);
 		if (err)
 			goto out_free;
-		smap_init_progs(psock, stab, verdict, parse);
+		smap_init_progs(psock, verdict, parse);
 		smap_start_sock(psock, sock);
 	}
 
@@ -1737,50 +1817,93 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 	 * it with. Because we can only have a single set of programs if
 	 * old_sock has a strp we can stop it.
 	 */
-	list_add_tail(&e->list, &psock->maps);
+	if (map_link) {
+		e->entry = map_link;
+		list_add_tail(&e->list, &psock->maps);
+	}
 	write_unlock_bh(&sock->sk_callback_lock);
+	return err;
+out_free:
+	smap_release_sock(psock, sock);
+out_progs:
+	if (parse && verdict) {
+		bpf_prog_put(parse);
+		bpf_prog_put(verdict);
+	}
+	if (tx_msg)
+		bpf_prog_put(tx_msg);
+	write_unlock_bh(&sock->sk_callback_lock);
+	kfree(e);
+	return err;
+}
+
+static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
+				    struct bpf_map *map,
+				    void *key, u64 flags)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	struct bpf_sock_progs *progs = &stab->progs;
+	struct sock *osock, *sock;
+	u32 i = *(u32 *)key;
+	int err;
+
+	if (unlikely(flags > BPF_EXIST))
+		return -EINVAL;
+
+	if (unlikely(i >= stab->map.max_entries))
+		return -E2BIG;
+
+	sock = READ_ONCE(stab->sock_map[i]);
+	if (flags == BPF_EXIST && !sock)
+		return -ENOENT;
+	else if (flags == BPF_NOEXIST && sock)
+		return -EEXIST;
+
+	sock = skops->sk;
+	err = __sock_map_ctx_update_elem(map, progs, sock, &stab->sock_map[i],
+					 key);
+	if (err)
+		goto out;
 
 	osock = xchg(&stab->sock_map[i], sock);
 	if (osock) {
 		struct smap_psock *opsock = smap_psock_sk(osock);
 
 		write_lock_bh(&osock->sk_callback_lock);
-		smap_list_remove(opsock, &stab->sock_map[i]);
+		smap_list_remove(opsock, &stab->sock_map[i], NULL);
 		smap_release_sock(opsock, osock);
 		write_unlock_bh(&osock->sk_callback_lock);
 	}
-	return 0;
-out_free:
-	smap_release_sock(psock, sock);
-out_progs:
-	if (verdict)
-		bpf_prog_put(verdict);
-	if (parse)
-		bpf_prog_put(parse);
-	if (tx_msg)
-		bpf_prog_put(tx_msg);
-	write_unlock_bh(&sock->sk_callback_lock);
-	kfree(e);
+out:
 	return err;
 }
 
 int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
 {
-	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	struct bpf_sock_progs *progs;
 	struct bpf_prog *orig;
 
-	if (unlikely(map->map_type != BPF_MAP_TYPE_SOCKMAP))
+	if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
+		struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+		progs = &stab->progs;
+	} else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) {
+		struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+		progs = &htab->progs;
+	} else {
 		return -EINVAL;
+	}
 
 	switch (type) {
 	case BPF_SK_MSG_VERDICT:
-		orig = xchg(&stab->bpf_tx_msg, prog);
+		orig = xchg(&progs->bpf_tx_msg, prog);
 		break;
 	case BPF_SK_SKB_STREAM_PARSER:
-		orig = xchg(&stab->bpf_parse, prog);
+		orig = xchg(&progs->bpf_parse, prog);
 		break;
 	case BPF_SK_SKB_STREAM_VERDICT:
-		orig = xchg(&stab->bpf_verdict, prog);
+		orig = xchg(&progs->bpf_verdict, prog);
 		break;
 	default:
 		return -EOPNOTSUPP;
@@ -1826,23 +1949,423 @@ static int sock_map_update_elem(struct bpf_map *map,
 	return err;
 }
 
-static void sock_map_release(struct bpf_map *map, struct file *map_file)
+static void sock_map_release(struct bpf_map *map)
 {
-	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	struct bpf_sock_progs *progs;
 	struct bpf_prog *orig;
 
-	orig = xchg(&stab->bpf_parse, NULL);
+	if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
+		struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+		progs = &stab->progs;
+	} else {
+		struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+		progs = &htab->progs;
+	}
+
+	orig = xchg(&progs->bpf_parse, NULL);
 	if (orig)
 		bpf_prog_put(orig);
-	orig = xchg(&stab->bpf_verdict, NULL);
+	orig = xchg(&progs->bpf_verdict, NULL);
 	if (orig)
 		bpf_prog_put(orig);
 
-	orig = xchg(&stab->bpf_tx_msg, NULL);
+	orig = xchg(&progs->bpf_tx_msg, NULL);
 	if (orig)
 		bpf_prog_put(orig);
 }
 
+static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
+{
+	struct bpf_htab *htab;
+	int i, err;
+	u64 cost;
+
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->value_size != 4 ||
+	    attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+		return ERR_PTR(-EINVAL);
+
+	if (attr->key_size > MAX_BPF_STACK)
+		/* eBPF programs initialize keys on stack, so they cannot be
+		 * larger than max stack size
+		 */
+		return ERR_PTR(-E2BIG);
+
+	err = bpf_tcp_ulp_register();
+	if (err && err != -EEXIST)
+		return ERR_PTR(err);
+
+	htab = kzalloc(sizeof(*htab), GFP_USER);
+	if (!htab)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&htab->map, attr);
+
+	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
+	htab->elem_size = sizeof(struct htab_elem) +
+			  round_up(htab->map.key_size, 8);
+	err = -EINVAL;
+	if (htab->n_buckets == 0 ||
+	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
+		goto free_htab;
+
+	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
+	       (u64) htab->elem_size * htab->map.max_entries;
+
+	if (cost >= U32_MAX - PAGE_SIZE)
+		goto free_htab;
+
+	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+	err = bpf_map_precharge_memlock(htab->map.pages);
+	if (err)
+		goto free_htab;
+
+	err = -ENOMEM;
+	htab->buckets = bpf_map_area_alloc(
+				htab->n_buckets * sizeof(struct bucket),
+				htab->map.numa_node);
+	if (!htab->buckets)
+		goto free_htab;
+
+	for (i = 0; i < htab->n_buckets; i++) {
+		INIT_HLIST_HEAD(&htab->buckets[i].head);
+		raw_spin_lock_init(&htab->buckets[i].lock);
+	}
+
+	return &htab->map;
+free_htab:
+	kfree(htab);
+	return ERR_PTR(err);
+}
+
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+{
+	return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+	return &__select_bucket(htab, hash)->head;
+}
+
+static void sock_hash_free(struct bpf_map *map)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	int i;
+
+	synchronize_rcu();
+
+	/* At this point no update, lookup or delete operations can happen.
+	 * However, be aware we can still get a socket state event updates,
+	 * and data ready callabacks that reference the psock from sk_user_data
+	 * Also psock worker threads are still in-flight. So smap_release_sock
+	 * will only free the psock after cancel_sync on the worker threads
+	 * and a grace period expire to ensure psock is really safe to remove.
+	 */
+	rcu_read_lock();
+	for (i = 0; i < htab->n_buckets; i++) {
+		struct hlist_head *head = select_bucket(htab, i);
+		struct hlist_node *n;
+		struct htab_elem *l;
+
+		hlist_for_each_entry_safe(l, n, head, hash_node) {
+			struct sock *sock = l->sk;
+			struct smap_psock *psock;
+
+			hlist_del_rcu(&l->hash_node);
+			write_lock_bh(&sock->sk_callback_lock);
+			psock = smap_psock_sk(sock);
+			/* This check handles a racing sock event that can get
+			 * the sk_callback_lock before this case but after xchg
+			 * causing the refcnt to hit zero and sock user data
+			 * (psock) to be null and queued for garbage collection.
+			 */
+			if (likely(psock)) {
+				smap_list_remove(psock, NULL, l);
+				smap_release_sock(psock, sock);
+			}
+			write_unlock_bh(&sock->sk_callback_lock);
+			kfree(l);
+		}
+	}
+	rcu_read_unlock();
+	bpf_map_area_free(htab->buckets);
+	kfree(htab);
+}
+
+static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
+					      void *key, u32 key_size, u32 hash,
+					      struct sock *sk,
+					      struct htab_elem *old_elem)
+{
+	struct htab_elem *l_new;
+
+	if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
+		if (!old_elem) {
+			atomic_dec(&htab->count);
+			return ERR_PTR(-E2BIG);
+		}
+	}
+	l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
+			     htab->map.numa_node);
+	if (!l_new)
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(l_new->key, key, key_size);
+	l_new->sk = sk;
+	l_new->hash = hash;
+	return l_new;
+}
+
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
+					 u32 hash, void *key, u32 key_size)
+{
+	struct htab_elem *l;
+
+	hlist_for_each_entry_rcu(l, head, hash_node) {
+		if (l->hash == hash && !memcmp(&l->key, key, key_size))
+			return l;
+	}
+
+	return NULL;
+}
+
+static inline u32 htab_map_hash(const void *key, u32 key_len)
+{
+	return jhash(key, key_len, 0);
+}
+
+static int sock_hash_get_next_key(struct bpf_map *map,
+				  void *key, void *next_key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_elem *l, *next_l;
+	struct hlist_head *h;
+	u32 hash, key_size;
+	int i = 0;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+	if (!key)
+		goto find_first_elem;
+	hash = htab_map_hash(key, key_size);
+	h = select_bucket(htab, hash);
+
+	l = lookup_elem_raw(h, hash, key, key_size);
+	if (!l)
+		goto find_first_elem;
+	next_l = hlist_entry_safe(
+		     rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+		     struct htab_elem, hash_node);
+	if (next_l) {
+		memcpy(next_key, next_l->key, key_size);
+		return 0;
+	}
+
+	/* no more elements in this hash list, go to the next bucket */
+	i = hash & (htab->n_buckets - 1);
+	i++;
+
+find_first_elem:
+	/* iterate over buckets */
+	for (; i < htab->n_buckets; i++) {
+		h = select_bucket(htab, i);
+
+		/* pick first element in the bucket */
+		next_l = hlist_entry_safe(
+				rcu_dereference_raw(hlist_first_rcu(h)),
+				struct htab_elem, hash_node);
+		if (next_l) {
+			/* if it's not empty, just return it */
+			memcpy(next_key, next_l->key, key_size);
+			return 0;
+		}
+	}
+
+	/* iterated over all buckets and all elements */
+	return -ENOENT;
+}
+
+static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
+				     struct bpf_map *map,
+				     void *key, u64 map_flags)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct bpf_sock_progs *progs = &htab->progs;
+	struct htab_elem *l_new = NULL, *l_old;
+	struct smap_psock_map_entry *e = NULL;
+	struct hlist_head *head;
+	struct smap_psock *psock;
+	u32 key_size, hash;
+	struct sock *sock;
+	struct bucket *b;
+	int err;
+
+	sock = skops->sk;
+
+	if (sock->sk_type != SOCK_STREAM ||
+	    sock->sk_protocol != IPPROTO_TCP)
+		return -EOPNOTSUPP;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		return -EINVAL;
+
+	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+	if (!e)
+		return -ENOMEM;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	key_size = map->key_size;
+	hash = htab_map_hash(key, key_size);
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key);
+	if (err)
+		goto err;
+
+	/* bpf_map_update_elem() can be called in_irq() */
+	raw_spin_lock_bh(&b->lock);
+	l_old = lookup_elem_raw(head, hash, key, key_size);
+	if (l_old && map_flags == BPF_NOEXIST) {
+		err = -EEXIST;
+		goto bucket_err;
+	}
+	if (!l_old && map_flags == BPF_EXIST) {
+		err = -ENOENT;
+		goto bucket_err;
+	}
+
+	l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old);
+	if (IS_ERR(l_new)) {
+		err = PTR_ERR(l_new);
+		goto bucket_err;
+	}
+
+	psock = smap_psock_sk(sock);
+	if (unlikely(!psock)) {
+		err = -EINVAL;
+		goto bucket_err;
+	}
+
+	e->hash_link = l_new;
+	e->htab = container_of(map, struct bpf_htab, map);
+	list_add_tail(&e->list, &psock->maps);
+
+	/* add new element to the head of the list, so that
+	 * concurrent search will find it before old elem
+	 */
+	hlist_add_head_rcu(&l_new->hash_node, head);
+	if (l_old) {
+		psock = smap_psock_sk(l_old->sk);
+
+		hlist_del_rcu(&l_old->hash_node);
+		smap_list_remove(psock, NULL, l_old);
+		smap_release_sock(psock, l_old->sk);
+		free_htab_elem(htab, l_old);
+	}
+	raw_spin_unlock_bh(&b->lock);
+	return 0;
+bucket_err:
+	raw_spin_unlock_bh(&b->lock);
+err:
+	kfree(e);
+	psock = smap_psock_sk(sock);
+	if (psock)
+		smap_release_sock(psock, sock);
+	return err;
+}
+
+static int sock_hash_update_elem(struct bpf_map *map,
+				void *key, void *value, u64 flags)
+{
+	struct bpf_sock_ops_kern skops;
+	u32 fd = *(u32 *)value;
+	struct socket *socket;
+	int err;
+
+	socket = sockfd_lookup(fd, &err);
+	if (!socket)
+		return err;
+
+	skops.sk = socket->sk;
+	if (!skops.sk) {
+		fput(socket->file);
+		return -EINVAL;
+	}
+
+	err = sock_hash_ctx_update_elem(&skops, map, key, flags);
+	fput(socket->file);
+	return err;
+}
+
+static int sock_hash_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct bucket *b;
+	struct htab_elem *l;
+	u32 hash, key_size;
+	int ret = -ENOENT;
+
+	key_size = map->key_size;
+	hash = htab_map_hash(key, key_size);
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	raw_spin_lock_bh(&b->lock);
+	l = lookup_elem_raw(head, hash, key, key_size);
+	if (l) {
+		struct sock *sock = l->sk;
+		struct smap_psock *psock;
+
+		hlist_del_rcu(&l->hash_node);
+		write_lock_bh(&sock->sk_callback_lock);
+		psock = smap_psock_sk(sock);
+		/* This check handles a racing sock event that can get the
+		 * sk_callback_lock before this case but after xchg happens
+		 * causing the refcnt to hit zero and sock user data (psock)
+		 * to be null and queued for garbage collection.
+		 */
+		if (likely(psock)) {
+			smap_list_remove(psock, NULL, l);
+			smap_release_sock(psock, sock);
+		}
+		write_unlock_bh(&sock->sk_callback_lock);
+		free_htab_elem(htab, l);
+		ret = 0;
+	}
+	raw_spin_unlock_bh(&b->lock);
+	return ret;
+}
+
+struct sock  *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct htab_elem *l;
+	u32 key_size, hash;
+	struct bucket *b;
+	struct sock *sk;
+
+	key_size = map->key_size;
+	hash = htab_map_hash(key, key_size);
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	raw_spin_lock_bh(&b->lock);
+	l = lookup_elem_raw(head, hash, key, key_size);
+	sk = l ? l->sk : NULL;
+	raw_spin_unlock_bh(&b->lock);
+	return sk;
+}
+
 const struct bpf_map_ops sock_map_ops = {
 	.map_alloc = sock_map_alloc,
 	.map_free = sock_map_free,
@@ -1850,7 +2373,16 @@ const struct bpf_map_ops sock_map_ops = {
 	.map_get_next_key = sock_map_get_next_key,
 	.map_update_elem = sock_map_update_elem,
 	.map_delete_elem = sock_map_delete_elem,
-	.map_release = sock_map_release,
+	.map_release_uref = sock_map_release,
+};
+
+const struct bpf_map_ops sock_hash_ops = {
+	.map_alloc = sock_hash_alloc,
+	.map_free = sock_hash_free,
+	.map_lookup_elem = sock_map_lookup,
+	.map_get_next_key = sock_hash_get_next_key,
+	.map_update_elem = sock_hash_update_elem,
+	.map_delete_elem = sock_hash_delete_elem,
 };
 
 BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
@@ -1870,3 +2402,21 @@ const struct bpf_func_proto bpf_sock_map_update_proto = {
 	.arg3_type	= ARG_PTR_TO_MAP_KEY,
 	.arg4_type	= ARG_ANYTHING,
 };
+
+BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock,
+	   struct bpf_map *, map, void *, key, u64, flags)
+{
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	return sock_hash_ctx_update_elem(bpf_sock, map, key, flags);
+}
+
+const struct bpf_func_proto bpf_sock_hash_update_proto = {
+	.func		= bpf_sock_hash_update,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_PTR_TO_MAP_KEY,
+	.arg4_type	= ARG_ANYTHING,
+};
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 57eeb1234b67..b675a3f3d141 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -11,6 +11,7 @@
 #include <linux/perf_event.h>
 #include <linux/elf.h>
 #include <linux/pagemap.h>
+#include <linux/irq_work.h>
 #include "percpu_freelist.h"
 
 #define STACK_CREATE_FLAG_MASK					\
@@ -32,6 +33,23 @@ struct bpf_stack_map {
 	struct stack_map_bucket *buckets[];
 };
 
+/* irq_work to run up_read() for build_id lookup in nmi context */
+struct stack_map_irq_work {
+	struct irq_work irq_work;
+	struct rw_semaphore *sem;
+};
+
+static void do_up_read(struct irq_work *entry)
+{
+	struct stack_map_irq_work *work;
+
+	work = container_of(entry, struct stack_map_irq_work, irq_work);
+	up_read(work->sem);
+	work->sem = NULL;
+}
+
+static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
+
 static inline bool stack_map_use_build_id(struct bpf_map *map)
 {
 	return (map->map_flags & BPF_F_STACK_BUILD_ID);
@@ -262,27 +280,31 @@ out:
 	return ret;
 }
 
-static void stack_map_get_build_id_offset(struct bpf_map *map,
-					  struct stack_map_bucket *bucket,
+static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 					  u64 *ips, u32 trace_nr, bool user)
 {
 	int i;
 	struct vm_area_struct *vma;
-	struct bpf_stack_build_id *id_offs;
-
-	bucket->nr = trace_nr;
-	id_offs = (struct bpf_stack_build_id *)bucket->data;
+	bool irq_work_busy = false;
+	struct stack_map_irq_work *work = NULL;
+
+	if (in_nmi()) {
+		work = this_cpu_ptr(&up_read_work);
+		if (work->irq_work.flags & IRQ_WORK_BUSY)
+			/* cannot queue more up_read, fallback */
+			irq_work_busy = true;
+	}
 
 	/*
-	 * We cannot do up_read() in nmi context, so build_id lookup is
-	 * only supported for non-nmi events. If at some point, it is
-	 * possible to run find_vma() without taking the semaphore, we
-	 * would like to allow build_id lookup in nmi context.
+	 * We cannot do up_read() in nmi context. To do build_id lookup
+	 * in nmi context, we need to run up_read() in irq_work. We use
+	 * a percpu variable to do the irq_work. If the irq_work is
+	 * already used by another lookup, we fall back to report ips.
 	 *
 	 * Same fallback is used for kernel stack (!user) on a stackmap
 	 * with build_id.
 	 */
-	if (!user || !current || !current->mm || in_nmi() ||
+	if (!user || !current || !current->mm || irq_work_busy ||
 	    down_read_trylock(&current->mm->mmap_sem) == 0) {
 		/* cannot access current->mm, fall back to ips */
 		for (i = 0; i < trace_nr; i++) {
@@ -304,7 +326,13 @@ static void stack_map_get_build_id_offset(struct bpf_map *map,
 			- vma->vm_start;
 		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
 	}
-	up_read(&current->mm->mmap_sem);
+
+	if (!work) {
+		up_read(&current->mm->mmap_sem);
+	} else {
+		work->sem = &current->mm->mmap_sem;
+		irq_work_queue(&work->irq_work);
+	}
 }
 
 BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
@@ -361,8 +389,10 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 			pcpu_freelist_pop(&smap->freelist);
 		if (unlikely(!new_bucket))
 			return -ENOMEM;
-		stack_map_get_build_id_offset(map, new_bucket, ips,
-					      trace_nr, user);
+		new_bucket->nr = trace_nr;
+		stack_map_get_build_id_offset(
+			(struct bpf_stack_build_id *)new_bucket->data,
+			ips, trace_nr, user);
 		trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
 		if (hash_matches && bucket->nr == trace_nr &&
 		    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
@@ -405,6 +435,73 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
+	   u64, flags)
+{
+	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
+	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
+	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+	bool user = flags & BPF_F_USER_STACK;
+	struct perf_callchain_entry *trace;
+	bool kernel = !user;
+	int err = -EINVAL;
+	u64 *ips;
+
+	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+			       BPF_F_USER_BUILD_ID)))
+		goto clear;
+	if (kernel && user_build_id)
+		goto clear;
+
+	elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
+					    : sizeof(u64);
+	if (unlikely(size % elem_size))
+		goto clear;
+
+	num_elem = size / elem_size;
+	if (sysctl_perf_event_max_stack < num_elem)
+		init_nr = 0;
+	else
+		init_nr = sysctl_perf_event_max_stack - num_elem;
+	trace = get_perf_callchain(regs, init_nr, kernel, user,
+				   sysctl_perf_event_max_stack, false, false);
+	if (unlikely(!trace))
+		goto err_fault;
+
+	trace_nr = trace->nr - init_nr;
+	if (trace_nr < skip)
+		goto err_fault;
+
+	trace_nr -= skip;
+	trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
+	copy_len = trace_nr * elem_size;
+	ips = trace->ip + skip + init_nr;
+	if (user && user_build_id)
+		stack_map_get_build_id_offset(buf, ips, trace_nr, user);
+	else
+		memcpy(buf, ips, copy_len);
+
+	if (size > copy_len)
+		memset(buf + copy_len, 0, size - copy_len);
+	return copy_len;
+
+err_fault:
+	err = -EFAULT;
+clear:
+	memset(buf, 0, size);
+	return err;
+}
+
+const struct bpf_func_proto bpf_get_stack_proto = {
+	.func		= bpf_get_stack,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 /* Called from eBPF program */
 static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
 {
@@ -511,3 +608,16 @@ const struct bpf_map_ops stack_map_ops = {
 	.map_update_elem = stack_map_update_elem,
 	.map_delete_elem = stack_map_delete_elem,
 };
+
+static int __init stack_map_init(void)
+{
+	int cpu;
+	struct stack_map_irq_work *work;
+
+	for_each_possible_cpu(cpu) {
+		work = per_cpu_ptr(&up_read_work, cpu);
+		init_irq_work(&work->irq_work, do_up_read);
+	}
+	return 0;
+}
+subsys_initcall(stack_map_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0244973ee544..0fa20624707f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -11,13 +11,17 @@
  */
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
+#include <linux/bpf_lirc.h>
+#include <linux/btf.h>
 #include <linux/syscalls.h>
 #include <linux/slab.h>
 #include <linux/sched/signal.h>
 #include <linux/vmalloc.h>
 #include <linux/mmzone.h>
 #include <linux/anon_inodes.h>
+#include <linux/fdtable.h>
 #include <linux/file.h>
+#include <linux/fs.h>
 #include <linux/license.h>
 #include <linux/filter.h>
 #include <linux/version.h>
@@ -26,6 +30,8 @@
 #include <linux/cred.h>
 #include <linux/timekeeping.h>
 #include <linux/ctype.h>
+#include <linux/btf.h>
+#include <linux/nospec.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
 			   (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -62,9 +68,9 @@ static const struct bpf_map_ops * const bpf_map_types[] = {
  * copy_from_user() call. However, this is not a concern since this function is
  * meant to be a future-proofing of bits.
  */
-static int check_uarg_tail_zero(void __user *uaddr,
-				size_t expected_size,
-				size_t actual_size)
+int bpf_check_uarg_tail_zero(void __user *uaddr,
+			     size_t expected_size,
+			     size_t actual_size)
 {
 	unsigned char __user *addr;
 	unsigned char __user *end;
@@ -102,12 +108,14 @@ const struct bpf_map_ops bpf_map_offload_ops = {
 static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 {
 	const struct bpf_map_ops *ops;
+	u32 type = attr->map_type;
 	struct bpf_map *map;
 	int err;
 
-	if (attr->map_type >= ARRAY_SIZE(bpf_map_types))
+	if (type >= ARRAY_SIZE(bpf_map_types))
 		return ERR_PTR(-EINVAL);
-	ops = bpf_map_types[attr->map_type];
+	type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types));
+	ops = bpf_map_types[type];
 	if (!ops)
 		return ERR_PTR(-EINVAL);
 
@@ -122,7 +130,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return map;
 	map->ops = ops;
-	map->map_type = attr->map_type;
+	map->map_type = type;
 	return map;
 }
 
@@ -257,8 +265,8 @@ static void bpf_map_free_deferred(struct work_struct *work)
 static void bpf_map_put_uref(struct bpf_map *map)
 {
 	if (atomic_dec_and_test(&map->usercnt)) {
-		if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
-			bpf_fd_array_map_clear(map);
+		if (map->ops->map_release_uref)
+			map->ops->map_release_uref(map);
 	}
 }
 
@@ -270,6 +278,7 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
 	if (atomic_dec_and_test(&map->refcnt)) {
 		/* bpf_map_free_id() must be called first */
 		bpf_map_free_id(map, do_idr_lock);
+		btf_put(map->btf);
 		INIT_WORK(&map->work, bpf_map_free_deferred);
 		schedule_work(&map->work);
 	}
@@ -279,6 +288,7 @@ void bpf_map_put(struct bpf_map *map)
 {
 	__bpf_map_put(map, true);
 }
+EXPORT_SYMBOL_GPL(bpf_map_put);
 
 void bpf_map_put_with_uref(struct bpf_map *map)
 {
@@ -317,13 +327,15 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 		   "value_size:\t%u\n"
 		   "max_entries:\t%u\n"
 		   "map_flags:\t%#x\n"
-		   "memlock:\t%llu\n",
+		   "memlock:\t%llu\n"
+		   "map_id:\t%u\n",
 		   map->map_type,
 		   map->key_size,
 		   map->value_size,
 		   map->max_entries,
 		   map->map_flags,
-		   map->pages * 1ULL << PAGE_SHIFT);
+		   map->pages * 1ULL << PAGE_SHIFT,
+		   map->id);
 
 	if (owner_prog_type) {
 		seq_printf(m, "owner_prog_type:\t%u\n",
@@ -415,7 +427,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
 	return 0;
 }
 
-#define BPF_MAP_CREATE_LAST_FIELD map_ifindex
+#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
@@ -449,6 +461,33 @@ static int map_create(union bpf_attr *attr)
 	atomic_set(&map->refcnt, 1);
 	atomic_set(&map->usercnt, 1);
 
+	if (bpf_map_support_seq_show(map) &&
+	    (attr->btf_key_type_id || attr->btf_value_type_id)) {
+		struct btf *btf;
+
+		if (!attr->btf_key_type_id || !attr->btf_value_type_id) {
+			err = -EINVAL;
+			goto free_map_nouncharge;
+		}
+
+		btf = btf_get_by_fd(attr->btf_fd);
+		if (IS_ERR(btf)) {
+			err = PTR_ERR(btf);
+			goto free_map_nouncharge;
+		}
+
+		err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id,
+					      attr->btf_value_type_id);
+		if (err) {
+			btf_put(btf);
+			goto free_map_nouncharge;
+		}
+
+		map->btf = btf;
+		map->btf_key_type_id = attr->btf_key_type_id;
+		map->btf_value_type_id = attr->btf_value_type_id;
+	}
+
 	err = security_bpf_map_alloc(map);
 	if (err)
 		goto free_map_nouncharge;
@@ -473,7 +512,6 @@ static int map_create(union bpf_attr *attr)
 		return err;
 	}
 
-	trace_bpf_map_create(map, err);
 	return err;
 
 free_map:
@@ -481,6 +519,7 @@ free_map:
 free_map_sec:
 	security_bpf_map_free(map);
 free_map_nouncharge:
+	btf_put(map->btf);
 	map->ops->map_free(map);
 	return err;
 }
@@ -513,6 +552,7 @@ struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref)
 		atomic_inc(&map->usercnt);
 	return map;
 }
+EXPORT_SYMBOL_GPL(bpf_map_inc);
 
 struct bpf_map *bpf_map_get_with_uref(u32 ufd)
 {
@@ -632,7 +672,6 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (copy_to_user(uvalue, value, value_size) != 0)
 		goto free_value;
 
-	trace_bpf_map_lookup_elem(map, ufd, key, value);
 	err = 0;
 
 free_value:
@@ -729,8 +768,6 @@ static int map_update_elem(union bpf_attr *attr)
 	__this_cpu_dec(bpf_prog_active);
 	preempt_enable();
 out:
-	if (!err)
-		trace_bpf_map_update_elem(map, ufd, key, value);
 free_value:
 	kfree(value);
 free_key:
@@ -783,8 +820,6 @@ static int map_delete_elem(union bpf_attr *attr)
 	__this_cpu_dec(bpf_prog_active);
 	preempt_enable();
 out:
-	if (!err)
-		trace_bpf_map_delete_elem(map, ufd, key);
 	kfree(key);
 err_put:
 	fdput(f);
@@ -848,7 +883,6 @@ out:
 	if (copy_to_user(unext_key, next_key, map->key_size) != 0)
 		goto free_next_key;
 
-	trace_bpf_map_next_key(map, ufd, key, next_key);
 	err = 0;
 
 free_next_key:
@@ -871,11 +905,17 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = {
 
 static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
 {
-	if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type])
+	const struct bpf_prog_ops *ops;
+
+	if (type >= ARRAY_SIZE(bpf_prog_types))
+		return -EINVAL;
+	type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types));
+	ops = bpf_prog_types[type];
+	if (!ops)
 		return -EINVAL;
 
 	if (!bpf_prog_is_dev_bound(prog->aux))
-		prog->aux->ops = bpf_prog_types[type];
+		prog->aux->ops = ops;
 	else
 		prog->aux->ops = &bpf_offload_prog_ops;
 	prog->type = type;
@@ -996,7 +1036,6 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
 	if (atomic_dec_and_test(&prog->aux->refcnt)) {
 		int i;
 
-		trace_bpf_prog_put_rcu(prog);
 		/* bpf_prog_free_id() must be called first */
 		bpf_prog_free_id(prog, do_idr_lock);
 
@@ -1033,11 +1072,13 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
 		   "prog_type:\t%u\n"
 		   "prog_jited:\t%u\n"
 		   "prog_tag:\t%s\n"
-		   "memlock:\t%llu\n",
+		   "memlock:\t%llu\n"
+		   "prog_id:\t%u\n",
 		   prog->type,
 		   prog->jited,
 		   prog_tag,
-		   prog->pages * 1ULL << PAGE_SHIFT);
+		   prog->pages * 1ULL << PAGE_SHIFT,
+		   prog->aux->id);
 }
 #endif
 
@@ -1163,11 +1204,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
 struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
 				       bool attach_drv)
 {
-	struct bpf_prog *prog = __bpf_prog_get(ufd, &type, attach_drv);
-
-	if (!IS_ERR(prog))
-		trace_bpf_prog_get_type(prog);
-	return prog;
+	return __bpf_prog_get(ufd, &type, attach_drv);
 }
 EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
 
@@ -1217,6 +1254,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
 		case BPF_CGROUP_INET6_BIND:
 		case BPF_CGROUP_INET4_CONNECT:
 		case BPF_CGROUP_INET6_CONNECT:
+		case BPF_CGROUP_UDP4_SENDMSG:
+		case BPF_CGROUP_UDP6_SENDMSG:
 			return 0;
 		default:
 			return -EINVAL;
@@ -1226,18 +1265,6 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
 	}
 }
 
-static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
-					     enum bpf_attach_type attach_type)
-{
-	switch (prog->type) {
-	case BPF_PROG_TYPE_CGROUP_SOCK:
-	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
-		return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
-	default:
-		return 0;
-	}
-}
-
 /* last field in 'union bpf_attr' used by this command */
 #define	BPF_PROG_LOAD_LAST_FIELD expected_attach_type
 
@@ -1354,7 +1381,6 @@ static int bpf_prog_load(union bpf_attr *attr)
 	}
 
 	bpf_prog_kallsyms_add(prog);
-	trace_bpf_prog_load(prog, err);
 	return err;
 
 free_used_maps:
@@ -1465,6 +1491,18 @@ out_free_tp:
 
 #ifdef CONFIG_CGROUP_BPF
 
+static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
+					     enum bpf_attach_type attach_type)
+{
+	switch (prog->type) {
+	case BPF_PROG_TYPE_CGROUP_SOCK:
+	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+		return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
+	default:
+		return 0;
+	}
+}
+
 #define BPF_PROG_ATTACH_LAST_FIELD attach_flags
 
 static int sockmap_get_from_fd(const union bpf_attr *attr,
@@ -1534,6 +1572,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET6_BIND:
 	case BPF_CGROUP_INET4_CONNECT:
 	case BPF_CGROUP_INET6_CONNECT:
+	case BPF_CGROUP_UDP4_SENDMSG:
+	case BPF_CGROUP_UDP6_SENDMSG:
 		ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
 		break;
 	case BPF_CGROUP_SOCK_OPS:
@@ -1547,6 +1587,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
 		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true);
+	case BPF_LIRC_MODE2:
+		return lirc_prog_attach(attr);
 	default:
 		return -EINVAL;
 	}
@@ -1604,6 +1646,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET6_BIND:
 	case BPF_CGROUP_INET4_CONNECT:
 	case BPF_CGROUP_INET6_CONNECT:
+	case BPF_CGROUP_UDP4_SENDMSG:
+	case BPF_CGROUP_UDP6_SENDMSG:
 		ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
 		break;
 	case BPF_CGROUP_SOCK_OPS:
@@ -1617,6 +1661,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
 		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false);
+	case BPF_LIRC_MODE2:
+		return lirc_prog_detach(attr);
 	default:
 		return -EINVAL;
 	}
@@ -1661,9 +1707,13 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_INET6_POST_BIND:
 	case BPF_CGROUP_INET4_CONNECT:
 	case BPF_CGROUP_INET6_CONNECT:
+	case BPF_CGROUP_UDP4_SENDMSG:
+	case BPF_CGROUP_UDP6_SENDMSG:
 	case BPF_CGROUP_SOCK_OPS:
 	case BPF_CGROUP_DEVICE:
 		break;
+	case BPF_LIRC_MODE2:
+		return lirc_prog_query(attr, uattr);
 	default:
 		return -EINVAL;
 	}
@@ -1870,7 +1920,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	u32 ulen;
 	int err;
 
-	err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+	err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
 	if (err)
 		return err;
 	info_len = min_t(u32, sizeof(info), info_len);
@@ -1883,6 +1933,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	info.load_time = prog->aux->load_time;
 	info.created_by_uid = from_kuid_munged(current_user_ns(),
 					       prog->aux->user->uid);
+	info.gpl_compatible = prog->gpl_compatible;
 
 	memcpy(info.tag, prog->tag, sizeof(prog->tag));
 	memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
@@ -1903,6 +1954,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	if (!capable(CAP_SYS_ADMIN)) {
 		info.jited_prog_len = 0;
 		info.xlated_prog_len = 0;
+		info.nr_jited_ksyms = 0;
 		goto done;
 	}
 
@@ -1939,18 +1991,93 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	 * for offload.
 	 */
 	ulen = info.jited_prog_len;
-	info.jited_prog_len = prog->jited_len;
+	if (prog->aux->func_cnt) {
+		u32 i;
+
+		info.jited_prog_len = 0;
+		for (i = 0; i < prog->aux->func_cnt; i++)
+			info.jited_prog_len += prog->aux->func[i]->jited_len;
+	} else {
+		info.jited_prog_len = prog->jited_len;
+	}
+
 	if (info.jited_prog_len && ulen) {
 		if (bpf_dump_raw_ok()) {
 			uinsns = u64_to_user_ptr(info.jited_prog_insns);
 			ulen = min_t(u32, info.jited_prog_len, ulen);
-			if (copy_to_user(uinsns, prog->bpf_func, ulen))
-				return -EFAULT;
+
+			/* for multi-function programs, copy the JITed
+			 * instructions for all the functions
+			 */
+			if (prog->aux->func_cnt) {
+				u32 len, free, i;
+				u8 *img;
+
+				free = ulen;
+				for (i = 0; i < prog->aux->func_cnt; i++) {
+					len = prog->aux->func[i]->jited_len;
+					len = min_t(u32, len, free);
+					img = (u8 *) prog->aux->func[i]->bpf_func;
+					if (copy_to_user(uinsns, img, len))
+						return -EFAULT;
+					uinsns += len;
+					free -= len;
+					if (!free)
+						break;
+				}
+			} else {
+				if (copy_to_user(uinsns, prog->bpf_func, ulen))
+					return -EFAULT;
+			}
 		} else {
 			info.jited_prog_insns = 0;
 		}
 	}
 
+	ulen = info.nr_jited_ksyms;
+	info.nr_jited_ksyms = prog->aux->func_cnt;
+	if (info.nr_jited_ksyms && ulen) {
+		if (bpf_dump_raw_ok()) {
+			u64 __user *user_ksyms;
+			ulong ksym_addr;
+			u32 i;
+
+			/* copy the address of the kernel symbol
+			 * corresponding to each function
+			 */
+			ulen = min_t(u32, info.nr_jited_ksyms, ulen);
+			user_ksyms = u64_to_user_ptr(info.jited_ksyms);
+			for (i = 0; i < ulen; i++) {
+				ksym_addr = (ulong) prog->aux->func[i]->bpf_func;
+				ksym_addr &= PAGE_MASK;
+				if (put_user((u64) ksym_addr, &user_ksyms[i]))
+					return -EFAULT;
+			}
+		} else {
+			info.jited_ksyms = 0;
+		}
+	}
+
+	ulen = info.nr_jited_func_lens;
+	info.nr_jited_func_lens = prog->aux->func_cnt;
+	if (info.nr_jited_func_lens && ulen) {
+		if (bpf_dump_raw_ok()) {
+			u32 __user *user_lens;
+			u32 func_len, i;
+
+			/* copy the JITed image lengths for each function */
+			ulen = min_t(u32, info.nr_jited_func_lens, ulen);
+			user_lens = u64_to_user_ptr(info.jited_func_lens);
+			for (i = 0; i < ulen; i++) {
+				func_len = prog->aux->func[i]->jited_len;
+				if (put_user(func_len, &user_lens[i]))
+					return -EFAULT;
+			}
+		} else {
+			info.jited_func_lens = 0;
+		}
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
@@ -1968,7 +2095,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
 	u32 info_len = attr->info.info_len;
 	int err;
 
-	err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+	err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
 	if (err)
 		return err;
 	info_len = min_t(u32, sizeof(info), info_len);
@@ -1981,6 +2108,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
 	info.map_flags = map->map_flags;
 	memcpy(info.name, map->name, sizeof(map->name));
 
+	if (map->btf) {
+		info.btf_id = btf_id(map->btf);
+		info.btf_key_type_id = map->btf_key_type_id;
+		info.btf_value_type_id = map->btf_value_type_id;
+	}
+
 	if (bpf_map_is_dev_bound(map)) {
 		err = bpf_map_offload_info_fill(&info, map);
 		if (err)
@@ -1994,6 +2127,21 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
 	return 0;
 }
 
+static int bpf_btf_get_info_by_fd(struct btf *btf,
+				  const union bpf_attr *attr,
+				  union bpf_attr __user *uattr)
+{
+	struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+	u32 info_len = attr->info.info_len;
+	int err;
+
+	err = bpf_check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len);
+	if (err)
+		return err;
+
+	return btf_get_info_by_fd(btf, attr, uattr);
+}
+
 #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
 
 static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
@@ -2016,6 +2164,8 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 	else if (f.file->f_op == &bpf_map_fops)
 		err = bpf_map_get_info_by_fd(f.file->private_data, attr,
 					     uattr);
+	else if (f.file->f_op == &btf_fops)
+		err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr);
 	else
 		err = -EINVAL;
 
@@ -2023,6 +2173,158 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 	return err;
 }
 
+#define BPF_BTF_LOAD_LAST_FIELD btf_log_level
+
+static int bpf_btf_load(const union bpf_attr *attr)
+{
+	if (CHECK_ATTR(BPF_BTF_LOAD))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return btf_new_fd(attr);
+}
+
+#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
+
+static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
+{
+	if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return btf_get_fd_by_id(attr->btf_id);
+}
+
+static int bpf_task_fd_query_copy(const union bpf_attr *attr,
+				    union bpf_attr __user *uattr,
+				    u32 prog_id, u32 fd_type,
+				    const char *buf, u64 probe_offset,
+				    u64 probe_addr)
+{
+	char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
+	u32 len = buf ? strlen(buf) : 0, input_len;
+	int err = 0;
+
+	if (put_user(len, &uattr->task_fd_query.buf_len))
+		return -EFAULT;
+	input_len = attr->task_fd_query.buf_len;
+	if (input_len && ubuf) {
+		if (!len) {
+			/* nothing to copy, just make ubuf NULL terminated */
+			char zero = '\0';
+
+			if (put_user(zero, ubuf))
+				return -EFAULT;
+		} else if (input_len >= len + 1) {
+			/* ubuf can hold the string with NULL terminator */
+			if (copy_to_user(ubuf, buf, len + 1))
+				return -EFAULT;
+		} else {
+			/* ubuf cannot hold the string with NULL terminator,
+			 * do a partial copy with NULL terminator.
+			 */
+			char zero = '\0';
+
+			err = -ENOSPC;
+			if (copy_to_user(ubuf, buf, input_len - 1))
+				return -EFAULT;
+			if (put_user(zero, ubuf + input_len - 1))
+				return -EFAULT;
+		}
+	}
+
+	if (put_user(prog_id, &uattr->task_fd_query.prog_id) ||
+	    put_user(fd_type, &uattr->task_fd_query.fd_type) ||
+	    put_user(probe_offset, &uattr->task_fd_query.probe_offset) ||
+	    put_user(probe_addr, &uattr->task_fd_query.probe_addr))
+		return -EFAULT;
+
+	return err;
+}
+
+#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
+
+static int bpf_task_fd_query(const union bpf_attr *attr,
+			     union bpf_attr __user *uattr)
+{
+	pid_t pid = attr->task_fd_query.pid;
+	u32 fd = attr->task_fd_query.fd;
+	const struct perf_event *event;
+	struct files_struct *files;
+	struct task_struct *task;
+	struct file *file;
+	int err;
+
+	if (CHECK_ATTR(BPF_TASK_FD_QUERY))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (attr->task_fd_query.flags != 0)
+		return -EINVAL;
+
+	task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+	if (!task)
+		return -ENOENT;
+
+	files = get_files_struct(task);
+	put_task_struct(task);
+	if (!files)
+		return -ENOENT;
+
+	err = 0;
+	spin_lock(&files->file_lock);
+	file = fcheck_files(files, fd);
+	if (!file)
+		err = -EBADF;
+	else
+		get_file(file);
+	spin_unlock(&files->file_lock);
+	put_files_struct(files);
+
+	if (err)
+		goto out;
+
+	if (file->f_op == &bpf_raw_tp_fops) {
+		struct bpf_raw_tracepoint *raw_tp = file->private_data;
+		struct bpf_raw_event_map *btp = raw_tp->btp;
+
+		err = bpf_task_fd_query_copy(attr, uattr,
+					     raw_tp->prog->aux->id,
+					     BPF_FD_TYPE_RAW_TRACEPOINT,
+					     btp->tp->name, 0, 0);
+		goto put_file;
+	}
+
+	event = perf_get_event(file);
+	if (!IS_ERR(event)) {
+		u64 probe_offset, probe_addr;
+		u32 prog_id, fd_type;
+		const char *buf;
+
+		err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
+					      &buf, &probe_offset,
+					      &probe_addr);
+		if (!err)
+			err = bpf_task_fd_query_copy(attr, uattr, prog_id,
+						     fd_type, buf,
+						     probe_offset,
+						     probe_addr);
+		goto put_file;
+	}
+
+	err = -ENOTSUPP;
+put_file:
+	fput(file);
+out:
+	return err;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -2031,7 +2333,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	err = check_uarg_tail_zero(uattr, sizeof(attr), size);
+	err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
 	if (err)
 		return err;
 	size = min_t(u32, size, sizeof(attr));
@@ -2103,6 +2405,15 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_RAW_TRACEPOINT_OPEN:
 		err = bpf_raw_tracepoint_open(&attr);
 		break;
+	case BPF_BTF_LOAD:
+		err = bpf_btf_load(&attr);
+		break;
+	case BPF_BTF_GET_FD_BY_ID:
+		err = bpf_btf_get_fd_by_id(&attr);
+		break;
+	case BPF_TASK_FD_QUERY:
+		err = bpf_task_fd_query(&attr, uattr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index 1f4bf68c12db..938d41211be7 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -43,6 +43,16 @@ struct tnum tnum_rshift(struct tnum a, u8 shift)
 	return TNUM(a.value >> shift, a.mask >> shift);
 }
 
+struct tnum tnum_arshift(struct tnum a, u8 min_shift)
+{
+	/* if a.value is negative, arithmetic shifting by minimum shift
+	 * will have larger negative offset compared to more shifting.
+	 * If a.value is nonnegative, arithmetic shifting by minimum shift
+	 * will have larger positive offset compare to more shifting.
+	 */
+	return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift);
+}
+
 struct tnum tnum_add(struct tnum a, struct tnum b)
 {
 	u64 sm, sv, sigma, chi, mu;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5dd1dcb902bf..d6403b5166f4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -22,6 +22,7 @@
 #include <linux/stringify.h>
 #include <linux/bsearch.h>
 #include <linux/sort.h>
+#include <linux/perf_event.h>
 
 #include "disasm.h"
 
@@ -156,7 +157,29 @@ struct bpf_verifier_stack_elem {
 #define BPF_COMPLEXITY_LIMIT_INSNS	131072
 #define BPF_COMPLEXITY_LIMIT_STACK	1024
 
-#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA)
+#define BPF_MAP_PTR_UNPRIV	1UL
+#define BPF_MAP_PTR_POISON	((void *)((0xeB9FUL << 1) +	\
+					  POISON_POINTER_DELTA))
+#define BPF_MAP_PTR(X)		((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV))
+
+static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
+{
+	return BPF_MAP_PTR(aux->map_state) == BPF_MAP_PTR_POISON;
+}
+
+static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux)
+{
+	return aux->map_state & BPF_MAP_PTR_UNPRIV;
+}
+
+static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
+			      const struct bpf_map *map, bool unpriv)
+{
+	BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV);
+	unpriv |= bpf_map_ptr_unpriv(aux);
+	aux->map_state = (unsigned long)map |
+			 (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
+}
 
 struct bpf_call_arg_meta {
 	struct bpf_map *map_ptr;
@@ -164,6 +187,8 @@ struct bpf_call_arg_meta {
 	bool pkt_access;
 	int regno;
 	int access_size;
+	s64 msize_smax_value;
+	u64 msize_umax_value;
 };
 
 static DEFINE_MUTEX(bpf_verifier_lock);
@@ -738,18 +763,19 @@ enum reg_arg_type {
 
 static int cmp_subprogs(const void *a, const void *b)
 {
-	return *(int *)a - *(int *)b;
+	return ((struct bpf_subprog_info *)a)->start -
+	       ((struct bpf_subprog_info *)b)->start;
 }
 
 static int find_subprog(struct bpf_verifier_env *env, int off)
 {
-	u32 *p;
+	struct bpf_subprog_info *p;
 
-	p = bsearch(&off, env->subprog_starts, env->subprog_cnt,
-		    sizeof(env->subprog_starts[0]), cmp_subprogs);
+	p = bsearch(&off, env->subprog_info, env->subprog_cnt,
+		    sizeof(env->subprog_info[0]), cmp_subprogs);
 	if (!p)
 		return -ENOENT;
-	return p - env->subprog_starts;
+	return p - env->subprog_info;
 
 }
 
@@ -769,18 +795,24 @@ static int add_subprog(struct bpf_verifier_env *env, int off)
 		verbose(env, "too many subprograms\n");
 		return -E2BIG;
 	}
-	env->subprog_starts[env->subprog_cnt++] = off;
-	sort(env->subprog_starts, env->subprog_cnt,
-	     sizeof(env->subprog_starts[0]), cmp_subprogs, NULL);
+	env->subprog_info[env->subprog_cnt++].start = off;
+	sort(env->subprog_info, env->subprog_cnt,
+	     sizeof(env->subprog_info[0]), cmp_subprogs, NULL);
 	return 0;
 }
 
 static int check_subprogs(struct bpf_verifier_env *env)
 {
 	int i, ret, subprog_start, subprog_end, off, cur_subprog = 0;
+	struct bpf_subprog_info *subprog = env->subprog_info;
 	struct bpf_insn *insn = env->prog->insnsi;
 	int insn_cnt = env->prog->len;
 
+	/* Add entry function. */
+	ret = add_subprog(env, 0);
+	if (ret < 0)
+		return ret;
+
 	/* determine subprog starts. The end is one before the next starts */
 	for (i = 0; i < insn_cnt; i++) {
 		if (insn[i].code != (BPF_JMP | BPF_CALL))
@@ -800,16 +832,18 @@ static int check_subprogs(struct bpf_verifier_env *env)
 			return ret;
 	}
 
+	/* Add a fake 'exit' subprog which could simplify subprog iteration
+	 * logic. 'subprog_cnt' should not be increased.
+	 */
+	subprog[env->subprog_cnt].start = insn_cnt;
+
 	if (env->log.level > 1)
 		for (i = 0; i < env->subprog_cnt; i++)
-			verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]);
+			verbose(env, "func#%d @%d\n", i, subprog[i].start);
 
 	/* now check that all jumps are within the same subprog */
-	subprog_start = 0;
-	if (env->subprog_cnt == cur_subprog)
-		subprog_end = insn_cnt;
-	else
-		subprog_end = env->subprog_starts[cur_subprog++];
+	subprog_start = subprog[cur_subprog].start;
+	subprog_end = subprog[cur_subprog + 1].start;
 	for (i = 0; i < insn_cnt; i++) {
 		u8 code = insn[i].code;
 
@@ -834,10 +868,9 @@ next:
 				return -EINVAL;
 			}
 			subprog_start = subprog_end;
-			if (env->subprog_cnt == cur_subprog)
-				subprog_end = insn_cnt;
-			else
-				subprog_end = env->subprog_starts[cur_subprog++];
+			cur_subprog++;
+			if (cur_subprog < env->subprog_cnt)
+				subprog_end = subprog[cur_subprog + 1].start;
 		}
 	}
 	return 0;
@@ -978,7 +1011,7 @@ static bool register_is_null(struct bpf_reg_state *reg)
  */
 static int check_stack_write(struct bpf_verifier_env *env,
 			     struct bpf_func_state *state, /* func where register points to */
-			     int off, int size, int value_regno)
+			     int off, int size, int value_regno, int insn_idx)
 {
 	struct bpf_func_state *cur; /* state of the current function */
 	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
@@ -1017,8 +1050,33 @@ static int check_stack_write(struct bpf_verifier_env *env,
 		state->stack[spi].spilled_ptr = cur->regs[value_regno];
 		state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
 
-		for (i = 0; i < BPF_REG_SIZE; i++)
+		for (i = 0; i < BPF_REG_SIZE; i++) {
+			if (state->stack[spi].slot_type[i] == STACK_MISC &&
+			    !env->allow_ptr_leaks) {
+				int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off;
+				int soff = (-spi - 1) * BPF_REG_SIZE;
+
+				/* detected reuse of integer stack slot with a pointer
+				 * which means either llvm is reusing stack slot or
+				 * an attacker is trying to exploit CVE-2018-3639
+				 * (speculative store bypass)
+				 * Have to sanitize that slot with preemptive
+				 * store of zero.
+				 */
+				if (*poff && *poff != soff) {
+					/* disallow programs where single insn stores
+					 * into two different stack slots, since verifier
+					 * cannot sanitize them
+					 */
+					verbose(env,
+						"insn %d cannot access two stack slots fp%d and fp%d",
+						insn_idx, *poff, soff);
+					return -EINVAL;
+				}
+				*poff = soff;
+			}
 			state->stack[spi].slot_type[i] = STACK_SPILL;
+		}
 	} else {
 		u8 type = STACK_MISC;
 
@@ -1251,6 +1309,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 	switch (env->prog->type) {
 	case BPF_PROG_TYPE_LWT_IN:
 	case BPF_PROG_TYPE_LWT_OUT:
+	case BPF_PROG_TYPE_LWT_SEG6LOCAL:
 		/* dst_input() and dst_output() can't write for now */
 		if (t == BPF_WRITE)
 			return false;
@@ -1470,13 +1529,13 @@ static int update_stack_depth(struct bpf_verifier_env *env,
 			      const struct bpf_func_state *func,
 			      int off)
 {
-	u16 stack = env->subprog_stack_depth[func->subprogno];
+	u16 stack = env->subprog_info[func->subprogno].stack_depth;
 
 	if (stack >= -off)
 		return 0;
 
 	/* update known max for given subprogram */
-	env->subprog_stack_depth[func->subprogno] = -off;
+	env->subprog_info[func->subprogno].stack_depth = -off;
 	return 0;
 }
 
@@ -1488,9 +1547,9 @@ static int update_stack_depth(struct bpf_verifier_env *env,
  */
 static int check_max_stack_depth(struct bpf_verifier_env *env)
 {
-	int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end;
+	int depth = 0, frame = 0, idx = 0, i = 0, subprog_end;
+	struct bpf_subprog_info *subprog = env->subprog_info;
 	struct bpf_insn *insn = env->prog->insnsi;
-	int insn_cnt = env->prog->len;
 	int ret_insn[MAX_CALL_FRAMES];
 	int ret_prog[MAX_CALL_FRAMES];
 
@@ -1498,17 +1557,14 @@ process_func:
 	/* round up to 32-bytes, since this is granularity
 	 * of interpreter stack size
 	 */
-	depth += round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32);
+	depth += round_up(max_t(u32, subprog[idx].stack_depth, 1), 32);
 	if (depth > MAX_BPF_STACK) {
 		verbose(env, "combined stack size of %d calls is %d. Too large\n",
 			frame + 1, depth);
 		return -EACCES;
 	}
 continue_func:
-	if (env->subprog_cnt == subprog)
-		subprog_end = insn_cnt;
-	else
-		subprog_end = env->subprog_starts[subprog];
+	subprog_end = subprog[idx + 1].start;
 	for (; i < subprog_end; i++) {
 		if (insn[i].code != (BPF_JMP | BPF_CALL))
 			continue;
@@ -1516,17 +1572,16 @@ continue_func:
 			continue;
 		/* remember insn and function to return to */
 		ret_insn[frame] = i + 1;
-		ret_prog[frame] = subprog;
+		ret_prog[frame] = idx;
 
 		/* find the callee */
 		i = i + insn[i].imm + 1;
-		subprog = find_subprog(env, i);
-		if (subprog < 0) {
+		idx = find_subprog(env, i);
+		if (idx < 0) {
 			WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
 				  i);
 			return -EFAULT;
 		}
-		subprog++;
 		frame++;
 		if (frame >= MAX_CALL_FRAMES) {
 			WARN_ONCE(1, "verifier bug. Call stack is too deep\n");
@@ -1539,10 +1594,10 @@ continue_func:
 	 */
 	if (frame == 0)
 		return 0;
-	depth -= round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32);
+	depth -= round_up(max_t(u32, subprog[idx].stack_depth, 1), 32);
 	frame--;
 	i = ret_insn[frame];
-	subprog = ret_prog[frame];
+	idx = ret_prog[frame];
 	goto continue_func;
 }
 
@@ -1558,8 +1613,7 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
 			  start);
 		return -EFAULT;
 	}
-	subprog++;
-	return env->subprog_stack_depth[subprog];
+	return env->subprog_info[subprog].stack_depth;
 }
 #endif
 
@@ -1694,7 +1748,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 
 		if (t == BPF_WRITE)
 			err = check_stack_write(env, state, off, size,
-						value_regno);
+						value_regno, insn_idx);
 		else
 			err = check_stack_read(env, state, off, size,
 					       value_regno);
@@ -1914,7 +1968,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 	if (arg_type == ARG_PTR_TO_MAP_KEY ||
 	    arg_type == ARG_PTR_TO_MAP_VALUE) {
 		expected_type = PTR_TO_STACK;
-		if (!type_is_pkt_pointer(type) &&
+		if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
 		    type != expected_type)
 			goto err_type;
 	} else if (arg_type == ARG_CONST_SIZE ||
@@ -1966,14 +2020,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			verbose(env, "invalid map_ptr to access map->key\n");
 			return -EACCES;
 		}
-		if (type_is_pkt_pointer(type))
-			err = check_packet_access(env, regno, reg->off,
-						  meta->map_ptr->key_size,
-						  false);
-		else
-			err = check_stack_boundary(env, regno,
-						   meta->map_ptr->key_size,
-						   false, NULL);
+		err = check_helper_mem_access(env, regno,
+					      meta->map_ptr->key_size, false,
+					      NULL);
 	} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
 		/* bpf_map_xxx(..., map_ptr, ..., value) call:
 		 * check [value, value + map->value_size) validity
@@ -1983,17 +2032,18 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			verbose(env, "invalid map_ptr to access map->value\n");
 			return -EACCES;
 		}
-		if (type_is_pkt_pointer(type))
-			err = check_packet_access(env, regno, reg->off,
-						  meta->map_ptr->value_size,
-						  false);
-		else
-			err = check_stack_boundary(env, regno,
-						   meta->map_ptr->value_size,
-						   false, NULL);
+		err = check_helper_mem_access(env, regno,
+					      meta->map_ptr->value_size, false,
+					      NULL);
 	} else if (arg_type_is_mem_size(arg_type)) {
 		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
 
+		/* remember the mem_size which may be used later
+		 * to refine return values.
+		 */
+		meta->msize_smax_value = reg->smax_value;
+		meta->msize_umax_value = reg->umax_value;
+
 		/* The register is SCALAR_VALUE; the access check
 		 * happens using its boundaries.
 		 */
@@ -2071,8 +2121,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		if (func_id != BPF_FUNC_redirect_map)
 			goto error;
 		break;
-	/* Restrict bpf side of cpumap, open when use-cases appear */
+	/* Restrict bpf side of cpumap and xskmap, open when use-cases
+	 * appear.
+	 */
 	case BPF_MAP_TYPE_CPUMAP:
+	case BPF_MAP_TYPE_XSKMAP:
 		if (func_id != BPF_FUNC_redirect_map)
 			goto error;
 		break;
@@ -2088,6 +2141,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		    func_id != BPF_FUNC_msg_redirect_map)
 			goto error;
 		break;
+	case BPF_MAP_TYPE_SOCKHASH:
+		if (func_id != BPF_FUNC_sk_redirect_hash &&
+		    func_id != BPF_FUNC_sock_hash_update &&
+		    func_id != BPF_FUNC_map_delete_elem &&
+		    func_id != BPF_FUNC_msg_redirect_hash)
+			goto error;
+		break;
 	default:
 		break;
 	}
@@ -2097,7 +2157,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 	case BPF_FUNC_tail_call:
 		if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
 			goto error;
-		if (env->subprog_cnt) {
+		if (env->subprog_cnt > 1) {
 			verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n");
 			return -EINVAL;
 		}
@@ -2119,16 +2179,20 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		break;
 	case BPF_FUNC_redirect_map:
 		if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
-		    map->map_type != BPF_MAP_TYPE_CPUMAP)
+		    map->map_type != BPF_MAP_TYPE_CPUMAP &&
+		    map->map_type != BPF_MAP_TYPE_XSKMAP)
 			goto error;
 		break;
 	case BPF_FUNC_sk_redirect_map:
 	case BPF_FUNC_msg_redirect_map:
+	case BPF_FUNC_sock_map_update:
 		if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
 			goto error;
 		break;
-	case BPF_FUNC_sock_map_update:
-		if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
+	case BPF_FUNC_sk_redirect_hash:
+	case BPF_FUNC_msg_redirect_hash:
+	case BPF_FUNC_sock_hash_update:
+		if (map->map_type != BPF_MAP_TYPE_SOCKHASH)
 			goto error;
 		break;
 	default:
@@ -2269,7 +2333,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			/* remember the callsite, it will be used by bpf_exit */
 			*insn_idx /* callsite */,
 			state->curframe + 1 /* frameno within this callchain */,
-			subprog + 1 /* subprog number within this prog */);
+			subprog /* subprog number within this prog */);
 
 	/* copy r1 - r5 args that callee can access */
 	for (i = BPF_REG_1; i <= BPF_REG_5; i++)
@@ -2333,6 +2397,49 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 	return 0;
 }
 
+static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
+				   int func_id,
+				   struct bpf_call_arg_meta *meta)
+{
+	struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
+
+	if (ret_type != RET_INTEGER ||
+	    (func_id != BPF_FUNC_get_stack &&
+	     func_id != BPF_FUNC_probe_read_str))
+		return;
+
+	ret_reg->smax_value = meta->msize_smax_value;
+	ret_reg->umax_value = meta->msize_umax_value;
+	__reg_deduce_bounds(ret_reg);
+	__reg_bound_offset(ret_reg);
+}
+
+static int
+record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
+		int func_id, int insn_idx)
+{
+	struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
+
+	if (func_id != BPF_FUNC_tail_call &&
+	    func_id != BPF_FUNC_map_lookup_elem &&
+	    func_id != BPF_FUNC_map_update_elem &&
+	    func_id != BPF_FUNC_map_delete_elem)
+		return 0;
+
+	if (meta->map_ptr == NULL) {
+		verbose(env, "kernel subsystem misconfigured verifier\n");
+		return -EINVAL;
+	}
+
+	if (!BPF_MAP_PTR(aux->map_state))
+		bpf_map_ptr_store(aux, meta->map_ptr,
+				  meta->map_ptr->unpriv_array);
+	else if (BPF_MAP_PTR(aux->map_state) != meta->map_ptr)
+		bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON,
+				  meta->map_ptr->unpriv_array);
+	return 0;
+}
+
 static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 {
 	const struct bpf_func_proto *fn = NULL;
@@ -2358,7 +2465,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 
 	/* eBPF programs must be GPL compatible to use GPL-ed functions */
 	if (!env->prog->gpl_compatible && fn->gpl_only) {
-		verbose(env, "cannot call GPL only function from proprietary program\n");
+		verbose(env, "cannot call GPL-restricted function from non-GPL compatible program\n");
 		return -EINVAL;
 	}
 
@@ -2387,13 +2494,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
 	if (err)
 		return err;
-	if (func_id == BPF_FUNC_tail_call) {
-		if (meta.map_ptr == NULL) {
-			verbose(env, "verifier bug\n");
-			return -EINVAL;
-		}
-		env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr;
-	}
 	err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
 	if (err)
 		return err;
@@ -2404,6 +2504,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	if (err)
 		return err;
 
+	err = record_func_map(env, &meta, func_id, insn_idx);
+	if (err)
+		return err;
+
 	/* Mark slots with STACK_MISC in case of raw mode, stack offset
 	 * is inferred from register state.
 	 */
@@ -2428,8 +2532,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	} else if (fn->ret_type == RET_VOID) {
 		regs[BPF_REG_0].type = NOT_INIT;
 	} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
-		struct bpf_insn_aux_data *insn_aux;
-
 		regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
 		/* There is no offset yet applied, variable or fixed */
 		mark_reg_known_zero(env, regs, BPF_REG_0);
@@ -2445,21 +2547,36 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		}
 		regs[BPF_REG_0].map_ptr = meta.map_ptr;
 		regs[BPF_REG_0].id = ++env->id_gen;
-		insn_aux = &env->insn_aux_data[insn_idx];
-		if (!insn_aux->map_ptr)
-			insn_aux->map_ptr = meta.map_ptr;
-		else if (insn_aux->map_ptr != meta.map_ptr)
-			insn_aux->map_ptr = BPF_MAP_PTR_POISON;
 	} else {
 		verbose(env, "unknown return type %d of func %s#%d\n",
 			fn->ret_type, func_id_name(func_id), func_id);
 		return -EINVAL;
 	}
 
+	do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
+
 	err = check_map_func_compatibility(env, meta.map_ptr, func_id);
 	if (err)
 		return err;
 
+	if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) {
+		const char *err_str;
+
+#ifdef CONFIG_PERF_EVENTS
+		err = get_callchain_buffers(sysctl_perf_event_max_stack);
+		err_str = "cannot get callchain buffer for func %s#%d\n";
+#else
+		err = -ENOTSUPP;
+		err_str = "func %s#%d not supported without CONFIG_PERF_EVENTS\n";
+#endif
+		if (err) {
+			verbose(env, err_str, func_id_name(func_id), func_id);
+			return err;
+		}
+
+		env->prog->has_callchain_buf = true;
+	}
+
 	if (changes_data)
 		clear_all_pkt_pointers(env);
 	return 0;
@@ -2904,10 +3021,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 			dst_reg->umin_value <<= umin_val;
 			dst_reg->umax_value <<= umax_val;
 		}
-		if (src_known)
-			dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
-		else
-			dst_reg->var_off = tnum_lshift(tnum_unknown, umin_val);
+		dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
 		/* We may learn something more from the var_off */
 		__update_reg_bounds(dst_reg);
 		break;
@@ -2935,16 +3049,35 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 		 */
 		dst_reg->smin_value = S64_MIN;
 		dst_reg->smax_value = S64_MAX;
-		if (src_known)
-			dst_reg->var_off = tnum_rshift(dst_reg->var_off,
-						       umin_val);
-		else
-			dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val);
+		dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val);
 		dst_reg->umin_value >>= umax_val;
 		dst_reg->umax_value >>= umin_val;
 		/* We may learn something more from the var_off */
 		__update_reg_bounds(dst_reg);
 		break;
+	case BPF_ARSH:
+		if (umax_val >= insn_bitness) {
+			/* Shifts greater than 31 or 63 are undefined.
+			 * This includes shifts by a negative number.
+			 */
+			mark_reg_unknown(env, regs, insn->dst_reg);
+			break;
+		}
+
+		/* Upon reaching here, src_known is true and
+		 * umax_val is equal to umin_val.
+		 */
+		dst_reg->smin_value >>= umin_val;
+		dst_reg->smax_value >>= umin_val;
+		dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val);
+
+		/* blow away the dst_reg umin_value/umax_value and rely on
+		 * dst_reg var_off to refine the result.
+		 */
+		dst_reg->umin_value = 0;
+		dst_reg->umax_value = U64_MAX;
+		__update_reg_bounds(dst_reg);
+		break;
 	default:
 		mark_reg_unknown(env, regs, insn->dst_reg);
 		break;
@@ -3828,7 +3961,12 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		return -EINVAL;
 	}
 
-	if (env->subprog_cnt) {
+	if (!env->ops->gen_ld_abs) {
+		verbose(env, "bpf verifier is misconfigured\n");
+		return -EINVAL;
+	}
+
+	if (env->subprog_cnt > 1) {
 		/* when program has LD_ABS insn JITs and interpreter assume
 		 * that r1 == ctx == skb which is not the case for callees
 		 * that can have arbitrary arguments. It's problematic
@@ -4859,15 +4997,15 @@ process_bpf_exit:
 
 	verbose(env, "processed %d insns (limit %d), stack depth ",
 		insn_processed, BPF_COMPLEXITY_LIMIT_INSNS);
-	for (i = 0; i < env->subprog_cnt + 1; i++) {
-		u32 depth = env->subprog_stack_depth[i];
+	for (i = 0; i < env->subprog_cnt; i++) {
+		u32 depth = env->subprog_info[i].stack_depth;
 
 		verbose(env, "%d", depth);
-		if (i + 1 < env->subprog_cnt + 1)
+		if (i + 1 < env->subprog_cnt)
 			verbose(env, "+");
 	}
 	verbose(env, "\n");
-	env->prog->aux->stack_depth = env->subprog_stack_depth[0];
+	env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
 	return 0;
 }
 
@@ -4991,7 +5129,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
 			/* hold the map. If the program is rejected by verifier,
 			 * the map will be released by release_maps() or it
 			 * will be used by the valid program until it's unloaded
-			 * and all maps are released in free_bpf_prog_info()
+			 * and all maps are released in free_used_maps()
 			 */
 			map = bpf_map_inc(map, false);
 			if (IS_ERR(map)) {
@@ -5073,10 +5211,11 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len
 
 	if (len == 1)
 		return;
-	for (i = 0; i < env->subprog_cnt; i++) {
-		if (env->subprog_starts[i] < off)
+	/* NOTE: fake 'exit' subprog should be updated as well. */
+	for (i = 0; i <= env->subprog_cnt; i++) {
+		if (env->subprog_info[i].start < off)
 			continue;
-		env->subprog_starts[i] += len - 1;
+		env->subprog_info[i].start += len - 1;
 	}
 }
 
@@ -5150,7 +5289,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		}
 	}
 
-	if (!ops->convert_ctx_access)
+	if (!ops->convert_ctx_access || bpf_prog_is_dev_bound(env->prog->aux))
 		return 0;
 
 	insn = env->prog->insnsi + delta;
@@ -5169,6 +5308,34 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		else
 			continue;
 
+		if (type == BPF_WRITE &&
+		    env->insn_aux_data[i + delta].sanitize_stack_off) {
+			struct bpf_insn patch[] = {
+				/* Sanitize suspicious stack slot with zero.
+				 * There are no memory dependencies for this store,
+				 * since it's only using frame pointer and immediate
+				 * constant of zero
+				 */
+				BPF_ST_MEM(BPF_DW, BPF_REG_FP,
+					   env->insn_aux_data[i + delta].sanitize_stack_off,
+					   0),
+				/* the original STX instruction will immediately
+				 * overwrite the same stack slot with appropriate value
+				 */
+				*insn,
+			};
+
+			cnt = ARRAY_SIZE(patch);
+			new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta    += cnt - 1;
+			env->prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
+			continue;
+		}
+
 		if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
 			continue;
 
@@ -5182,6 +5349,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		 */
 		is_narrower_load = size < ctx_field_size;
 		if (is_narrower_load) {
+			u32 size_default = bpf_ctx_off_adjust_machine(ctx_field_size);
 			u32 off = insn->off;
 			u8 size_code;
 
@@ -5196,7 +5364,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 			else if (ctx_field_size == 8)
 				size_code = BPF_DW;
 
-			insn->off = off & ~(ctx_field_size - 1);
+			insn->off = off & ~(size_default - 1);
 			insn->code = BPF_LDX | BPF_MEM | size_code;
 		}
 
@@ -5240,7 +5408,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	void *old_bpf_func;
 	int err = -ENOMEM;
 
-	if (env->subprog_cnt == 0)
+	if (env->subprog_cnt <= 1)
 		return 0;
 
 	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
@@ -5256,7 +5424,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		/* temporarily remember subprog id inside insn instead of
 		 * aux_data, since next loop will split up all insns into funcs
 		 */
-		insn->off = subprog + 1;
+		insn->off = subprog;
 		/* remember original imm in case JIT fails and fallback
 		 * to interpreter will be needed
 		 */
@@ -5265,16 +5433,13 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		insn->imm = 1;
 	}
 
-	func = kzalloc(sizeof(prog) * (env->subprog_cnt + 1), GFP_KERNEL);
+	func = kzalloc(sizeof(prog) * env->subprog_cnt, GFP_KERNEL);
 	if (!func)
 		return -ENOMEM;
 
-	for (i = 0; i <= env->subprog_cnt; i++) {
+	for (i = 0; i < env->subprog_cnt; i++) {
 		subprog_start = subprog_end;
-		if (env->subprog_cnt == i)
-			subprog_end = prog->len;
-		else
-			subprog_end = env->subprog_starts[i];
+		subprog_end = env->subprog_info[i + 1].start;
 
 		len = subprog_end - subprog_start;
 		func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
@@ -5291,7 +5456,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		 * Long term would need debug info to populate names
 		 */
 		func[i]->aux->name[0] = 'F';
-		func[i]->aux->stack_depth = env->subprog_stack_depth[i];
+		func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
 		func[i]->jit_requested = 1;
 		func[i] = bpf_int_jit_compile(func[i]);
 		if (!func[i]->jited) {
@@ -5304,20 +5469,33 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	 * now populate all bpf_calls with correct addresses and
 	 * run last pass of JIT
 	 */
-	for (i = 0; i <= env->subprog_cnt; i++) {
+	for (i = 0; i < env->subprog_cnt; i++) {
 		insn = func[i]->insnsi;
 		for (j = 0; j < func[i]->len; j++, insn++) {
 			if (insn->code != (BPF_JMP | BPF_CALL) ||
 			    insn->src_reg != BPF_PSEUDO_CALL)
 				continue;
 			subprog = insn->off;
-			insn->off = 0;
 			insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
 				func[subprog]->bpf_func -
 				__bpf_call_base;
 		}
+
+		/* we use the aux data to keep a list of the start addresses
+		 * of the JITed images for each function in the program
+		 *
+		 * for some architectures, such as powerpc64, the imm field
+		 * might not be large enough to hold the offset of the start
+		 * address of the callee's JITed image from __bpf_call_base
+		 *
+		 * in such cases, we can lookup the start address of a callee
+		 * by using its subprog id, available from the off field of
+		 * the call instruction, as an index for this list
+		 */
+		func[i]->aux->func = func;
+		func[i]->aux->func_cnt = env->subprog_cnt;
 	}
-	for (i = 0; i <= env->subprog_cnt; i++) {
+	for (i = 0; i < env->subprog_cnt; i++) {
 		old_bpf_func = func[i]->bpf_func;
 		tmp = bpf_int_jit_compile(func[i]);
 		if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
@@ -5331,7 +5509,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	/* finally lock prog and jit images for all functions and
 	 * populate kallsysm
 	 */
-	for (i = 0; i <= env->subprog_cnt; i++) {
+	for (i = 0; i < env->subprog_cnt; i++) {
 		bpf_prog_lock_ro(func[i]);
 		bpf_prog_kallsyms_add(func[i]);
 	}
@@ -5341,26 +5519,21 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	 * later look the same as if they were interpreted only.
 	 */
 	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
-		unsigned long addr;
-
 		if (insn->code != (BPF_JMP | BPF_CALL) ||
 		    insn->src_reg != BPF_PSEUDO_CALL)
 			continue;
 		insn->off = env->insn_aux_data[i].call_imm;
 		subprog = find_subprog(env, i + insn->off + 1);
-		addr  = (unsigned long)func[subprog + 1]->bpf_func;
-		addr &= PAGE_MASK;
-		insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
-			    addr - __bpf_call_base;
+		insn->imm = subprog;
 	}
 
 	prog->jited = 1;
 	prog->bpf_func = func[0]->bpf_func;
 	prog->aux->func = func;
-	prog->aux->func_cnt = env->subprog_cnt + 1;
+	prog->aux->func_cnt = env->subprog_cnt;
 	return 0;
 out_free:
-	for (i = 0; i <= env->subprog_cnt; i++)
+	for (i = 0; i < env->subprog_cnt; i++)
 		if (func[i])
 			bpf_jit_free(func[i]);
 	kfree(func);
@@ -5417,6 +5590,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 	struct bpf_insn *insn = prog->insnsi;
 	const struct bpf_func_proto *fn;
 	const int insn_cnt = prog->len;
+	const struct bpf_map_ops *ops;
+	struct bpf_insn_aux_data *aux;
 	struct bpf_insn insn_buf[16];
 	struct bpf_prog *new_prog;
 	struct bpf_map *map_ptr;
@@ -5463,6 +5638,25 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			continue;
 		}
 
+		if (BPF_CLASS(insn->code) == BPF_LD &&
+		    (BPF_MODE(insn->code) == BPF_ABS ||
+		     BPF_MODE(insn->code) == BPF_IND)) {
+			cnt = env->ops->gen_ld_abs(insn, insn_buf);
+			if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+				verbose(env, "bpf verifier is misconfigured\n");
+				return -EINVAL;
+			}
+
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta    += cnt - 1;
+			env->prog = prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
+			continue;
+		}
+
 		if (insn->code != (BPF_JMP | BPF_CALL))
 			continue;
 		if (insn->src_reg == BPF_PSEUDO_CALL)
@@ -5491,19 +5685,22 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			insn->imm = 0;
 			insn->code = BPF_JMP | BPF_TAIL_CALL;
 
+			aux = &env->insn_aux_data[i + delta];
+			if (!bpf_map_ptr_unpriv(aux))
+				continue;
+
 			/* instead of changing every JIT dealing with tail_call
 			 * emit two extra insns:
 			 * if (index >= max_entries) goto out;
 			 * index &= array->index_mask;
 			 * to avoid out-of-bounds cpu speculation
 			 */
-			map_ptr = env->insn_aux_data[i + delta].map_ptr;
-			if (map_ptr == BPF_MAP_PTR_POISON) {
+			if (bpf_map_ptr_poisoned(aux)) {
 				verbose(env, "tail_call abusing map_ptr\n");
 				return -EINVAL;
 			}
-			if (!map_ptr->unpriv_array)
-				continue;
+
+			map_ptr = BPF_MAP_PTR(aux->map_state);
 			insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
 						  map_ptr->max_entries, 2);
 			insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
@@ -5523,32 +5720,61 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 		}
 
 		/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
-		 * handlers are currently limited to 64 bit only.
+		 * and other inlining handlers are currently limited to 64 bit
+		 * only.
 		 */
 		if (prog->jit_requested && BITS_PER_LONG == 64 &&
-		    insn->imm == BPF_FUNC_map_lookup_elem) {
-			map_ptr = env->insn_aux_data[i + delta].map_ptr;
-			if (map_ptr == BPF_MAP_PTR_POISON ||
-			    !map_ptr->ops->map_gen_lookup)
+		    (insn->imm == BPF_FUNC_map_lookup_elem ||
+		     insn->imm == BPF_FUNC_map_update_elem ||
+		     insn->imm == BPF_FUNC_map_delete_elem)) {
+			aux = &env->insn_aux_data[i + delta];
+			if (bpf_map_ptr_poisoned(aux))
 				goto patch_call_imm;
 
-			cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
-			if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
-				verbose(env, "bpf verifier is misconfigured\n");
-				return -EINVAL;
-			}
+			map_ptr = BPF_MAP_PTR(aux->map_state);
+			ops = map_ptr->ops;
+			if (insn->imm == BPF_FUNC_map_lookup_elem &&
+			    ops->map_gen_lookup) {
+				cnt = ops->map_gen_lookup(map_ptr, insn_buf);
+				if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+					verbose(env, "bpf verifier is misconfigured\n");
+					return -EINVAL;
+				}
 
-			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf,
-						       cnt);
-			if (!new_prog)
-				return -ENOMEM;
+				new_prog = bpf_patch_insn_data(env, i + delta,
+							       insn_buf, cnt);
+				if (!new_prog)
+					return -ENOMEM;
 
-			delta += cnt - 1;
+				delta    += cnt - 1;
+				env->prog = prog = new_prog;
+				insn      = new_prog->insnsi + i + delta;
+				continue;
+			}
 
-			/* keep walking new program and skip insns we just inserted */
-			env->prog = prog = new_prog;
-			insn      = new_prog->insnsi + i + delta;
-			continue;
+			BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
+				     (void *(*)(struct bpf_map *map, void *key))NULL));
+			BUILD_BUG_ON(!__same_type(ops->map_delete_elem,
+				     (int (*)(struct bpf_map *map, void *key))NULL));
+			BUILD_BUG_ON(!__same_type(ops->map_update_elem,
+				     (int (*)(struct bpf_map *map, void *key, void *value,
+					      u64 flags))NULL));
+			switch (insn->imm) {
+			case BPF_FUNC_map_lookup_elem:
+				insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) -
+					    __bpf_call_base;
+				continue;
+			case BPF_FUNC_map_update_elem:
+				insn->imm = BPF_CAST_CALL(ops->map_update_elem) -
+					    __bpf_call_base;
+				continue;
+			case BPF_FUNC_map_delete_elem:
+				insn->imm = BPF_CAST_CALL(ops->map_delete_elem) -
+					    __bpf_call_base;
+				continue;
+			}
+
+			goto patch_call_imm;
 		}
 
 		if (insn->imm == BPF_FUNC_redirect_map) {
@@ -5660,16 +5886,16 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
 		env->strict_alignment = true;
 
+	ret = replace_map_fd_with_map_ptr(env);
+	if (ret < 0)
+		goto skip_full_check;
+
 	if (bpf_prog_is_dev_bound(env->prog->aux)) {
 		ret = bpf_prog_offload_verifier_prep(env);
 		if (ret)
-			goto err_unlock;
+			goto skip_full_check;
 	}
 
-	ret = replace_map_fd_with_map_ptr(env);
-	if (ret < 0)
-		goto skip_full_check;
-
 	env->explored_states = kcalloc(env->prog->len,
 				       sizeof(struct bpf_verifier_state_list *),
 				       GFP_USER);
@@ -5740,7 +5966,7 @@ skip_full_check:
 err_release_maps:
 	if (!env->prog->aux->used_maps)
 		/* if we didn't copy map pointers into bpf_prog_info, release
-		 * them now. Otherwise free_bpf_prog_info() will release them.
+		 * them now. Otherwise free_used_maps() will release them.
 		 */
 		release_maps(env);
 	*prog = env->prog;
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
new file mode 100644
index 000000000000..b3c557476a8d
--- /dev/null
+++ b/kernel/bpf/xskmap.c
@@ -0,0 +1,232 @@
+// SPDX-License-Identifier: GPL-2.0
+/* XSKMAP used for AF_XDP sockets
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#include <linux/bpf.h>
+#include <linux/capability.h>
+#include <net/xdp_sock.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+struct xsk_map {
+	struct bpf_map map;
+	struct xdp_sock **xsk_map;
+	struct list_head __percpu *flush_list;
+};
+
+static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
+{
+	int cpu, err = -EINVAL;
+	struct xsk_map *m;
+	u64 cost;
+
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size != 4 ||
+	    attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY))
+		return ERR_PTR(-EINVAL);
+
+	m = kzalloc(sizeof(*m), GFP_USER);
+	if (!m)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&m->map, attr);
+
+	cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
+	cost += sizeof(struct list_head) * num_possible_cpus();
+	if (cost >= U32_MAX - PAGE_SIZE)
+		goto free_m;
+
+	m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	/* Notice returns -EPERM on if map size is larger than memlock limit */
+	err = bpf_map_precharge_memlock(m->map.pages);
+	if (err)
+		goto free_m;
+
+	err = -ENOMEM;
+
+	m->flush_list = alloc_percpu(struct list_head);
+	if (!m->flush_list)
+		goto free_m;
+
+	for_each_possible_cpu(cpu)
+		INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
+
+	m->xsk_map = bpf_map_area_alloc(m->map.max_entries *
+					sizeof(struct xdp_sock *),
+					m->map.numa_node);
+	if (!m->xsk_map)
+		goto free_percpu;
+	return &m->map;
+
+free_percpu:
+	free_percpu(m->flush_list);
+free_m:
+	kfree(m);
+	return ERR_PTR(err);
+}
+
+static void xsk_map_free(struct bpf_map *map)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	int i;
+
+	synchronize_net();
+
+	for (i = 0; i < map->max_entries; i++) {
+		struct xdp_sock *xs;
+
+		xs = m->xsk_map[i];
+		if (!xs)
+			continue;
+
+		sock_put((struct sock *)xs);
+	}
+
+	free_percpu(m->flush_list);
+	bpf_map_area_free(m->xsk_map);
+	kfree(m);
+}
+
+static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	u32 index = key ? *(u32 *)key : U32_MAX;
+	u32 *next = next_key;
+
+	if (index >= m->map.max_entries) {
+		*next = 0;
+		return 0;
+	}
+
+	if (index == m->map.max_entries - 1)
+		return -ENOENT;
+	*next = index + 1;
+	return 0;
+}
+
+struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	struct xdp_sock *xs;
+
+	if (key >= map->max_entries)
+		return NULL;
+
+	xs = READ_ONCE(m->xsk_map[key]);
+	return xs;
+}
+
+int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
+		       struct xdp_sock *xs)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	struct list_head *flush_list = this_cpu_ptr(m->flush_list);
+	int err;
+
+	err = xsk_rcv(xs, xdp);
+	if (err)
+		return err;
+
+	if (!xs->flush_node.prev)
+		list_add(&xs->flush_node, flush_list);
+
+	return 0;
+}
+
+void __xsk_map_flush(struct bpf_map *map)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	struct list_head *flush_list = this_cpu_ptr(m->flush_list);
+	struct xdp_sock *xs, *tmp;
+
+	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
+		xsk_flush(xs);
+		__list_del(xs->flush_node.prev, xs->flush_node.next);
+		xs->flush_node.prev = NULL;
+	}
+}
+
+static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	return NULL;
+}
+
+static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
+			       u64 map_flags)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	u32 i = *(u32 *)key, fd = *(u32 *)value;
+	struct xdp_sock *xs, *old_xs;
+	struct socket *sock;
+	int err;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		return -EINVAL;
+	if (unlikely(i >= m->map.max_entries))
+		return -E2BIG;
+	if (unlikely(map_flags == BPF_NOEXIST))
+		return -EEXIST;
+
+	sock = sockfd_lookup(fd, &err);
+	if (!sock)
+		return err;
+
+	if (sock->sk->sk_family != PF_XDP) {
+		sockfd_put(sock);
+		return -EOPNOTSUPP;
+	}
+
+	xs = (struct xdp_sock *)sock->sk;
+
+	if (!xsk_is_setup_for_bpf_map(xs)) {
+		sockfd_put(sock);
+		return -EOPNOTSUPP;
+	}
+
+	sock_hold(sock->sk);
+
+	old_xs = xchg(&m->xsk_map[i], xs);
+	if (old_xs) {
+		/* Make sure we've flushed everything. */
+		synchronize_net();
+		sock_put((struct sock *)old_xs);
+	}
+
+	sockfd_put(sock);
+	return 0;
+}
+
+static int xsk_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+	struct xdp_sock *old_xs;
+	int k = *(u32 *)key;
+
+	if (k >= map->max_entries)
+		return -EINVAL;
+
+	old_xs = xchg(&m->xsk_map[k], NULL);
+	if (old_xs) {
+		/* Make sure we've flushed everything. */
+		synchronize_net();
+		sock_put((struct sock *)old_xs);
+	}
+
+	return 0;
+}
+
+const struct bpf_map_ops xsk_map_ops = {
+	.map_alloc = xsk_map_alloc,
+	.map_free = xsk_map_free,
+	.map_get_next_key = xsk_map_get_next_key,
+	.map_lookup_elem = xsk_map_lookup_elem,
+	.map_update_elem = xsk_map_update_elem,
+	.map_delete_elem = xsk_map_delete_elem,
+};
+
+
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 2be89a003185..bfcdae896122 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-obj-y := cgroup.o stat.o namespace.o cgroup-v1.o
+obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o
 
 obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index b928b27050c6..77ff1cd6a252 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -201,13 +201,12 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 int cgroup_task_count(const struct cgroup *cgrp);
 
 /*
- * stat.c
+ * rstat.c
  */
-void cgroup_stat_flush(struct cgroup *cgrp);
-int cgroup_stat_init(struct cgroup *cgrp);
-void cgroup_stat_exit(struct cgroup *cgrp);
-void cgroup_stat_show_cputime(struct seq_file *seq);
-void cgroup_stat_boot(void);
+int cgroup_rstat_init(struct cgroup *cgrp);
+void cgroup_rstat_exit(struct cgroup *cgrp);
+void cgroup_rstat_boot(void);
+void cgroup_base_stat_cputime_show(struct seq_file *seq);
 
 /*
  * namespace.c
@@ -218,9 +217,9 @@ extern const struct proc_ns_operations cgroupns_operations;
  * cgroup-v1.c
  */
 extern struct cftype cgroup1_base_files[];
-extern const struct file_operations proc_cgroupstats_operations;
 extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
 
+int proc_cgroupstats_show(struct seq_file *m, void *v);
 bool cgroup1_ssid_disabled(int ssid);
 void cgroup1_pidlist_destroy_all(struct cgroup *cgrp);
 void cgroup1_release_agent(struct work_struct *work);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index a2c05d2476ac..e06c97f3ed1a 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -682,7 +682,7 @@ struct cftype cgroup1_base_files[] = {
 };
 
 /* Display information about each subsystem and each hierarchy */
-static int proc_cgroupstats_show(struct seq_file *m, void *v)
+int proc_cgroupstats_show(struct seq_file *m, void *v)
 {
 	struct cgroup_subsys *ss;
 	int i;
@@ -705,18 +705,6 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int cgroupstats_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_cgroupstats_show, NULL);
-}
-
-const struct file_operations proc_cgroupstats_operations = {
-	.open = cgroupstats_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
 /**
  * cgroupstats_build - build and fill cgroupstats
  * @stats: cgroupstats to fill information into
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index a662bfcbea0e..077370bf8964 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -54,6 +54,7 @@
 #include <linux/proc_ns.h>
 #include <linux/nsproxy.h>
 #include <linux/file.h>
+#include <linux/sched/cputime.h>
 #include <net/sock.h>
 
 #define CREATE_TRACE_POINTS
@@ -61,6 +62,8 @@
 
 #define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
 					 MAX_CFTYPE_NAME + 2)
+/* let's not notify more than 100 times per second */
+#define CGROUP_FILE_NOTIFY_MIN_INTV	DIV_ROUND_UP(HZ, 100)
 
 /*
  * cgroup_mutex is the master lock.  Any modification to cgroup or its
@@ -142,14 +145,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
 };
 #undef SUBSYS
 
-static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat);
+static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
 
 /*
  * The default hierarchy, reserved for the subsystems that are otherwise
  * unattached - it never has more than a single cgroup, and all tasks are
  * part of that cgroup.
  */
-struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat };
+struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
 EXPORT_SYMBOL_GPL(cgrp_dfl_root);
 
 /*
@@ -1554,6 +1557,8 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 		spin_lock_irq(&cgroup_file_kn_lock);
 		cfile->kn = NULL;
 		spin_unlock_irq(&cgroup_file_kn_lock);
+
+		del_timer_sync(&cfile->notify_timer);
 	}
 
 	kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
@@ -1573,8 +1578,17 @@ static void css_clear_dir(struct cgroup_subsys_state *css)
 
 	css->flags &= ~CSS_VISIBLE;
 
-	list_for_each_entry(cfts, &css->ss->cfts, node)
+	if (!css->ss) {
+		if (cgroup_on_dfl(cgrp))
+			cfts = cgroup_base_files;
+		else
+			cfts = cgroup1_base_files;
+
 		cgroup_addrm_files(css, cgrp, cfts, false);
+	} else {
+		list_for_each_entry(cfts, &css->ss->cfts, node)
+			cgroup_addrm_files(css, cgrp, cfts, false);
+	}
 }
 
 /**
@@ -1598,14 +1612,16 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
 		else
 			cfts = cgroup1_base_files;
 
-		return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
-	}
-
-	list_for_each_entry(cfts, &css->ss->cfts, node) {
-		ret = cgroup_addrm_files(css, cgrp, cfts, true);
-		if (ret < 0) {
-			failed_cfts = cfts;
-			goto err;
+		ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
+		if (ret < 0)
+			return ret;
+	} else {
+		list_for_each_entry(cfts, &css->ss->cfts, node) {
+			ret = cgroup_addrm_files(css, cgrp, cfts, true);
+			if (ret < 0) {
+				failed_cfts = cfts;
+				goto err;
+			}
 		}
 	}
 
@@ -1782,13 +1798,6 @@ static void cgroup_enable_task_cg_lists(void)
 {
 	struct task_struct *p, *g;
 
-	spin_lock_irq(&css_set_lock);
-
-	if (use_task_css_set_links)
-		goto out_unlock;
-
-	use_task_css_set_links = true;
-
 	/*
 	 * We need tasklist_lock because RCU is not safe against
 	 * while_each_thread(). Besides, a forking task that has passed
@@ -1797,6 +1806,13 @@ static void cgroup_enable_task_cg_lists(void)
 	 * tasklist if we walk through it with RCU.
 	 */
 	read_lock(&tasklist_lock);
+	spin_lock_irq(&css_set_lock);
+
+	if (use_task_css_set_links)
+		goto out_unlock;
+
+	use_task_css_set_links = true;
+
 	do_each_thread(g, p) {
 		WARN_ON_ONCE(!list_empty(&p->cg_list) ||
 			     task_css_set(p) != &init_css_set);
@@ -1824,9 +1840,9 @@ static void cgroup_enable_task_cg_lists(void)
 		}
 		spin_unlock(&p->sighand->siglock);
 	} while_each_thread(g, p);
-	read_unlock(&tasklist_lock);
 out_unlock:
 	spin_unlock_irq(&css_set_lock);
+	read_unlock(&tasklist_lock);
 }
 
 static void init_cgroup_housekeeping(struct cgroup *cgrp)
@@ -1844,6 +1860,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	cgrp->dom_cgrp = cgrp;
 	cgrp->max_descendants = INT_MAX;
 	cgrp->max_depth = INT_MAX;
+	INIT_LIST_HEAD(&cgrp->rstat_css_list);
+	prev_cputime_init(&cgrp->prev_cputime);
 
 	for_each_subsys(ss, ssid)
 		INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
@@ -3381,7 +3399,7 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
 	struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
 	int ret = 0;
 
-	cgroup_stat_show_cputime(seq);
+	cgroup_base_stat_cputime_show(seq);
 #ifdef CONFIG_CGROUP_SCHED
 	ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
 #endif
@@ -3521,6 +3539,12 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn)
 	return kernfs_setattr(kn, &iattr);
 }
 
+static void cgroup_file_notify_timer(struct timer_list *timer)
+{
+	cgroup_file_notify(container_of(timer, struct cgroup_file,
+					notify_timer));
+}
+
 static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
 			   struct cftype *cft)
 {
@@ -3547,6 +3571,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
 	if (cft->file_offset) {
 		struct cgroup_file *cfile = (void *)css + cft->file_offset;
 
+		timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
+
 		spin_lock_irq(&cgroup_file_kn_lock);
 		cfile->kn = kn;
 		spin_unlock_irq(&cgroup_file_kn_lock);
@@ -3796,8 +3822,17 @@ void cgroup_file_notify(struct cgroup_file *cfile)
 	unsigned long flags;
 
 	spin_lock_irqsave(&cgroup_file_kn_lock, flags);
-	if (cfile->kn)
-		kernfs_notify(cfile->kn);
+	if (cfile->kn) {
+		unsigned long last = cfile->notified_at;
+		unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
+
+		if (time_in_range(jiffies, last, next)) {
+			timer_reduce(&cfile->notify_timer, next);
+		} else {
+			kernfs_notify(cfile->kn);
+			cfile->notified_at = jiffies;
+		}
+	}
 	spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
 }
 
@@ -4560,7 +4595,7 @@ static void css_free_rwork_fn(struct work_struct *work)
 			cgroup_put(cgroup_parent(cgrp));
 			kernfs_put(cgrp->kn);
 			if (cgroup_on_dfl(cgrp))
-				cgroup_stat_exit(cgrp);
+				cgroup_rstat_exit(cgrp);
 			kfree(cgrp);
 		} else {
 			/*
@@ -4587,6 +4622,11 @@ static void css_release_work_fn(struct work_struct *work)
 
 	if (ss) {
 		/* css release path */
+		if (!list_empty(&css->rstat_css_node)) {
+			cgroup_rstat_flush(cgrp);
+			list_del_rcu(&css->rstat_css_node);
+		}
+
 		cgroup_idr_replace(&ss->css_idr, NULL, css->id);
 		if (ss->css_released)
 			ss->css_released(css);
@@ -4597,7 +4637,7 @@ static void css_release_work_fn(struct work_struct *work)
 		trace_cgroup_release(cgrp);
 
 		if (cgroup_on_dfl(cgrp))
-			cgroup_stat_flush(cgrp);
+			cgroup_rstat_flush(cgrp);
 
 		for (tcgrp = cgroup_parent(cgrp); tcgrp;
 		     tcgrp = cgroup_parent(tcgrp))
@@ -4648,6 +4688,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
 	css->id = -1;
 	INIT_LIST_HEAD(&css->sibling);
 	INIT_LIST_HEAD(&css->children);
+	INIT_LIST_HEAD(&css->rstat_css_node);
 	css->serial_nr = css_serial_nr_next++;
 	atomic_set(&css->online_cnt, 0);
 
@@ -4656,6 +4697,9 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
 		css_get(css->parent);
 	}
 
+	if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush)
+		list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
+
 	BUG_ON(cgroup_css(cgrp, ss));
 }
 
@@ -4757,6 +4801,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
 err_list_del:
 	list_del_rcu(&css->sibling);
 err_free_css:
+	list_del_rcu(&css->rstat_css_node);
 	INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
 	queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
 	return ERR_PTR(err);
@@ -4775,8 +4820,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	int ret;
 
 	/* allocate the cgroup and its ID, 0 is reserved for the root */
-	cgrp = kzalloc(sizeof(*cgrp) +
-		       sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL);
+	cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
+		       GFP_KERNEL);
 	if (!cgrp)
 		return ERR_PTR(-ENOMEM);
 
@@ -4785,7 +4830,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 		goto out_free_cgrp;
 
 	if (cgroup_on_dfl(parent)) {
-		ret = cgroup_stat_init(cgrp);
+		ret = cgroup_rstat_init(cgrp);
 		if (ret)
 			goto out_cancel_ref;
 	}
@@ -4850,7 +4895,7 @@ out_idr_free:
 	cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
 out_stat_exit:
 	if (cgroup_on_dfl(parent))
-		cgroup_stat_exit(cgrp);
+		cgroup_rstat_exit(cgrp);
 out_cancel_ref:
 	percpu_ref_exit(&cgrp->self.refcnt);
 out_free_cgrp:
@@ -5090,10 +5135,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	for_each_css(css, ssid, cgrp)
 		kill_css(css);
 
-	/*
-	 * Remove @cgrp directory along with the base files.  @cgrp has an
-	 * extra ref on its kn.
-	 */
+	/* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
+	css_clear_dir(&cgrp->self);
 	kernfs_remove(cgrp->kn);
 
 	if (parent && cgroup_is_threaded(cgrp))
@@ -5245,7 +5288,7 @@ int __init cgroup_init(void)
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
 
-	cgroup_stat_boot();
+	cgroup_rstat_boot();
 
 	/*
 	 * The latency of the synchronize_sched() is too high for cgroups,
@@ -5335,7 +5378,7 @@ int __init cgroup_init(void)
 	WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
 	WARN_ON(register_filesystem(&cgroup_fs_type));
 	WARN_ON(register_filesystem(&cgroup2_fs_type));
-	WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
+	WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
 
 	return 0;
 }
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index defad3c5e7dc..d3bbb757ee49 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -362,35 +362,32 @@ EXPORT_SYMBOL(rdmacg_unregister_device);
 static int parse_resource(char *c, int *intval)
 {
 	substring_t argstr;
-	const char **table = &rdmacg_resource_names[0];
 	char *name, *value = c;
 	size_t len;
-	int ret, i = 0;
+	int ret, i;
 
 	name = strsep(&value, "=");
 	if (!name || !value)
 		return -EINVAL;
 
-	len = strlen(value);
+	i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name);
+	if (i < 0)
+		return i;
 
-	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
-		if (strcmp(table[i], name))
-			continue;
+	len = strlen(value);
 
-		argstr.from = value;
-		argstr.to = value + len;
+	argstr.from = value;
+	argstr.to = value + len;
 
-		ret = match_int(&argstr, intval);
-		if (ret >= 0) {
-			if (*intval < 0)
-				break;
-			return i;
-		}
-		if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
-			*intval = S32_MAX;
-			return i;
-		}
-		break;
+	ret = match_int(&argstr, intval);
+	if (ret >= 0) {
+		if (*intval < 0)
+			return -EINVAL;
+		return i;
+	}
+	if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
+		*intval = S32_MAX;
+		return i;
 	}
 	return -EINVAL;
 }
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
new file mode 100644
index 000000000000..d503d1a9007c
--- /dev/null
+++ b/kernel/cgroup/rstat.c
@@ -0,0 +1,416 @@
+#include "cgroup-internal.h"
+
+#include <linux/sched/cputime.h>
+
+static DEFINE_SPINLOCK(cgroup_rstat_lock);
+static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
+
+static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
+
+static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
+{
+	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
+}
+
+/**
+ * cgroup_rstat_updated - keep track of updated rstat_cpu
+ * @cgrp: target cgroup
+ * @cpu: cpu on which rstat_cpu was updated
+ *
+ * @cgrp's rstat_cpu on @cpu was updated.  Put it on the parent's matching
+ * rstat_cpu->updated_children list.  See the comment on top of
+ * cgroup_rstat_cpu definition for details.
+ */
+void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
+{
+	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
+	struct cgroup *parent;
+	unsigned long flags;
+
+	/* nothing to do for root */
+	if (!cgroup_parent(cgrp))
+		return;
+
+	/*
+	 * Paired with the one in cgroup_rstat_cpu_pop_upated().  Either we
+	 * see NULL updated_next or they see our updated stat.
+	 */
+	smp_mb();
+
+	/*
+	 * Because @parent's updated_children is terminated with @parent
+	 * instead of NULL, we can tell whether @cgrp is on the list by
+	 * testing the next pointer for NULL.
+	 */
+	if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
+		return;
+
+	raw_spin_lock_irqsave(cpu_lock, flags);
+
+	/* put @cgrp and all ancestors on the corresponding updated lists */
+	for (parent = cgroup_parent(cgrp); parent;
+	     cgrp = parent, parent = cgroup_parent(cgrp)) {
+		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+		struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
+
+		/*
+		 * Both additions and removals are bottom-up.  If a cgroup
+		 * is already in the tree, all ancestors are.
+		 */
+		if (rstatc->updated_next)
+			break;
+
+		rstatc->updated_next = prstatc->updated_children;
+		prstatc->updated_children = cgrp;
+	}
+
+	raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+EXPORT_SYMBOL_GPL(cgroup_rstat_updated);
+
+/**
+ * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
+ * @pos: current position
+ * @root: root of the tree to traversal
+ * @cpu: target cpu
+ *
+ * Walks the udpated rstat_cpu tree on @cpu from @root.  %NULL @pos starts
+ * the traversal and %NULL return indicates the end.  During traversal,
+ * each returned cgroup is unlinked from the tree.  Must be called with the
+ * matching cgroup_rstat_cpu_lock held.
+ *
+ * The only ordering guarantee is that, for a parent and a child pair
+ * covered by a given traversal, if a child is visited, its parent is
+ * guaranteed to be visited afterwards.
+ */
+static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
+						   struct cgroup *root, int cpu)
+{
+	struct cgroup_rstat_cpu *rstatc;
+	struct cgroup *parent;
+
+	if (pos == root)
+		return NULL;
+
+	/*
+	 * We're gonna walk down to the first leaf and visit/remove it.  We
+	 * can pick whatever unvisited node as the starting point.
+	 */
+	if (!pos)
+		pos = root;
+	else
+		pos = cgroup_parent(pos);
+
+	/* walk down to the first leaf */
+	while (true) {
+		rstatc = cgroup_rstat_cpu(pos, cpu);
+		if (rstatc->updated_children == pos)
+			break;
+		pos = rstatc->updated_children;
+	}
+
+	/*
+	 * Unlink @pos from the tree.  As the updated_children list is
+	 * singly linked, we have to walk it to find the removal point.
+	 * However, due to the way we traverse, @pos will be the first
+	 * child in most cases. The only exception is @root.
+	 */
+	parent = cgroup_parent(pos);
+	if (parent && rstatc->updated_next) {
+		struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
+		struct cgroup_rstat_cpu *nrstatc;
+		struct cgroup **nextp;
+
+		nextp = &prstatc->updated_children;
+		while (true) {
+			nrstatc = cgroup_rstat_cpu(*nextp, cpu);
+			if (*nextp == pos)
+				break;
+
+			WARN_ON_ONCE(*nextp == parent);
+			nextp = &nrstatc->updated_next;
+		}
+
+		*nextp = rstatc->updated_next;
+		rstatc->updated_next = NULL;
+
+		/*
+		 * Paired with the one in cgroup_rstat_cpu_updated().
+		 * Either they see NULL updated_next or we see their
+		 * updated stat.
+		 */
+		smp_mb();
+	}
+
+	return pos;
+}
+
+/* see cgroup_rstat_flush() */
+static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
+	__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
+{
+	int cpu;
+
+	lockdep_assert_held(&cgroup_rstat_lock);
+
+	for_each_possible_cpu(cpu) {
+		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
+						       cpu);
+		struct cgroup *pos = NULL;
+
+		raw_spin_lock(cpu_lock);
+		while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
+			struct cgroup_subsys_state *css;
+
+			cgroup_base_stat_flush(pos, cpu);
+
+			rcu_read_lock();
+			list_for_each_entry_rcu(css, &pos->rstat_css_list,
+						rstat_css_node)
+				css->ss->css_rstat_flush(css, cpu);
+			rcu_read_unlock();
+		}
+		raw_spin_unlock(cpu_lock);
+
+		/* if @may_sleep, play nice and yield if necessary */
+		if (may_sleep && (need_resched() ||
+				  spin_needbreak(&cgroup_rstat_lock))) {
+			spin_unlock_irq(&cgroup_rstat_lock);
+			if (!cond_resched())
+				cpu_relax();
+			spin_lock_irq(&cgroup_rstat_lock);
+		}
+	}
+}
+
+/**
+ * cgroup_rstat_flush - flush stats in @cgrp's subtree
+ * @cgrp: target cgroup
+ *
+ * Collect all per-cpu stats in @cgrp's subtree into the global counters
+ * and propagate them upwards.  After this function returns, all cgroups in
+ * the subtree have up-to-date ->stat.
+ *
+ * This also gets all cgroups in the subtree including @cgrp off the
+ * ->updated_children lists.
+ *
+ * This function may block.
+ */
+void cgroup_rstat_flush(struct cgroup *cgrp)
+{
+	might_sleep();
+
+	spin_lock_irq(&cgroup_rstat_lock);
+	cgroup_rstat_flush_locked(cgrp, true);
+	spin_unlock_irq(&cgroup_rstat_lock);
+}
+
+/**
+ * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
+ * @cgrp: target cgroup
+ *
+ * This function can be called from any context.
+ */
+void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cgroup_rstat_lock, flags);
+	cgroup_rstat_flush_locked(cgrp, false);
+	spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
+}
+
+/**
+ * cgroup_rstat_flush_begin - flush stats in @cgrp's subtree and hold
+ * @cgrp: target cgroup
+ *
+ * Flush stats in @cgrp's subtree and prevent further flushes.  Must be
+ * paired with cgroup_rstat_flush_release().
+ *
+ * This function may block.
+ */
+void cgroup_rstat_flush_hold(struct cgroup *cgrp)
+	__acquires(&cgroup_rstat_lock)
+{
+	might_sleep();
+	spin_lock_irq(&cgroup_rstat_lock);
+	cgroup_rstat_flush_locked(cgrp, true);
+}
+
+/**
+ * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
+ */
+void cgroup_rstat_flush_release(void)
+	__releases(&cgroup_rstat_lock)
+{
+	spin_unlock_irq(&cgroup_rstat_lock);
+}
+
+int cgroup_rstat_init(struct cgroup *cgrp)
+{
+	int cpu;
+
+	/* the root cgrp has rstat_cpu preallocated */
+	if (!cgrp->rstat_cpu) {
+		cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
+		if (!cgrp->rstat_cpu)
+			return -ENOMEM;
+	}
+
+	/* ->updated_children list is self terminated */
+	for_each_possible_cpu(cpu) {
+		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+
+		rstatc->updated_children = cgrp;
+		u64_stats_init(&rstatc->bsync);
+	}
+
+	return 0;
+}
+
+void cgroup_rstat_exit(struct cgroup *cgrp)
+{
+	int cpu;
+
+	cgroup_rstat_flush(cgrp);
+
+	/* sanity check */
+	for_each_possible_cpu(cpu) {
+		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+
+		if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
+		    WARN_ON_ONCE(rstatc->updated_next))
+			return;
+	}
+
+	free_percpu(cgrp->rstat_cpu);
+	cgrp->rstat_cpu = NULL;
+}
+
+void __init cgroup_rstat_boot(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
+
+	BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp));
+}
+
+/*
+ * Functions for cgroup basic resource statistics implemented on top of
+ * rstat.
+ */
+static void cgroup_base_stat_accumulate(struct cgroup_base_stat *dst_bstat,
+					struct cgroup_base_stat *src_bstat)
+{
+	dst_bstat->cputime.utime += src_bstat->cputime.utime;
+	dst_bstat->cputime.stime += src_bstat->cputime.stime;
+	dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
+}
+
+static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
+{
+	struct cgroup *parent = cgroup_parent(cgrp);
+	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+	struct task_cputime *last_cputime = &rstatc->last_bstat.cputime;
+	struct task_cputime cputime;
+	struct cgroup_base_stat delta;
+	unsigned seq;
+
+	/* fetch the current per-cpu values */
+	do {
+		seq = __u64_stats_fetch_begin(&rstatc->bsync);
+		cputime = rstatc->bstat.cputime;
+	} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
+
+	/* calculate the delta to propgate */
+	delta.cputime.utime = cputime.utime - last_cputime->utime;
+	delta.cputime.stime = cputime.stime - last_cputime->stime;
+	delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
+					 last_cputime->sum_exec_runtime;
+	*last_cputime = cputime;
+
+	/* transfer the pending stat into delta */
+	cgroup_base_stat_accumulate(&delta, &cgrp->pending_bstat);
+	memset(&cgrp->pending_bstat, 0, sizeof(cgrp->pending_bstat));
+
+	/* propagate delta into the global stat and the parent's pending */
+	cgroup_base_stat_accumulate(&cgrp->bstat, &delta);
+	if (parent)
+		cgroup_base_stat_accumulate(&parent->pending_bstat, &delta);
+}
+
+static struct cgroup_rstat_cpu *
+cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp)
+{
+	struct cgroup_rstat_cpu *rstatc;
+
+	rstatc = get_cpu_ptr(cgrp->rstat_cpu);
+	u64_stats_update_begin(&rstatc->bsync);
+	return rstatc;
+}
+
+static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
+						 struct cgroup_rstat_cpu *rstatc)
+{
+	u64_stats_update_end(&rstatc->bsync);
+	cgroup_rstat_updated(cgrp, smp_processor_id());
+	put_cpu_ptr(rstatc);
+}
+
+void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
+{
+	struct cgroup_rstat_cpu *rstatc;
+
+	rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
+	rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
+	cgroup_base_stat_cputime_account_end(cgrp, rstatc);
+}
+
+void __cgroup_account_cputime_field(struct cgroup *cgrp,
+				    enum cpu_usage_stat index, u64 delta_exec)
+{
+	struct cgroup_rstat_cpu *rstatc;
+
+	rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
+
+	switch (index) {
+	case CPUTIME_USER:
+	case CPUTIME_NICE:
+		rstatc->bstat.cputime.utime += delta_exec;
+		break;
+	case CPUTIME_SYSTEM:
+	case CPUTIME_IRQ:
+	case CPUTIME_SOFTIRQ:
+		rstatc->bstat.cputime.stime += delta_exec;
+		break;
+	default:
+		break;
+	}
+
+	cgroup_base_stat_cputime_account_end(cgrp, rstatc);
+}
+
+void cgroup_base_stat_cputime_show(struct seq_file *seq)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	u64 usage, utime, stime;
+
+	if (!cgroup_parent(cgrp))
+		return;
+
+	cgroup_rstat_flush_hold(cgrp);
+	usage = cgrp->bstat.cputime.sum_exec_runtime;
+	cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime);
+	cgroup_rstat_flush_release();
+
+	do_div(usage, NSEC_PER_USEC);
+	do_div(utime, NSEC_PER_USEC);
+	do_div(stime, NSEC_PER_USEC);
+
+	seq_printf(seq, "usage_usec %llu\n"
+		   "user_usec %llu\n"
+		   "system_usec %llu\n",
+		   usage, utime, stime);
+}
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
deleted file mode 100644
index 1e111dd455c4..000000000000
--- a/kernel/cgroup/stat.c
+++ /dev/null
@@ -1,338 +0,0 @@
-#include "cgroup-internal.h"
-
-#include <linux/sched/cputime.h>
-
-static DEFINE_MUTEX(cgroup_stat_mutex);
-static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock);
-
-static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu)
-{
-	return per_cpu_ptr(cgrp->cpu_stat, cpu);
-}
-
-/**
- * cgroup_cpu_stat_updated - keep track of updated cpu_stat
- * @cgrp: target cgroup
- * @cpu: cpu on which cpu_stat was updated
- *
- * @cgrp's cpu_stat on @cpu was updated.  Put it on the parent's matching
- * cpu_stat->updated_children list.  See the comment on top of
- * cgroup_cpu_stat definition for details.
- */
-static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu)
-{
-	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
-	struct cgroup *parent;
-	unsigned long flags;
-
-	/*
-	 * Speculative already-on-list test.  This may race leading to
-	 * temporary inaccuracies, which is fine.
-	 *
-	 * Because @parent's updated_children is terminated with @parent
-	 * instead of NULL, we can tell whether @cgrp is on the list by
-	 * testing the next pointer for NULL.
-	 */
-	if (cgroup_cpu_stat(cgrp, cpu)->updated_next)
-		return;
-
-	raw_spin_lock_irqsave(cpu_lock, flags);
-
-	/* put @cgrp and all ancestors on the corresponding updated lists */
-	for (parent = cgroup_parent(cgrp); parent;
-	     cgrp = parent, parent = cgroup_parent(cgrp)) {
-		struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
-		struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
-
-		/*
-		 * Both additions and removals are bottom-up.  If a cgroup
-		 * is already in the tree, all ancestors are.
-		 */
-		if (cstat->updated_next)
-			break;
-
-		cstat->updated_next = pcstat->updated_children;
-		pcstat->updated_children = cgrp;
-	}
-
-	raw_spin_unlock_irqrestore(cpu_lock, flags);
-}
-
-/**
- * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree
- * @pos: current position
- * @root: root of the tree to traversal
- * @cpu: target cpu
- *
- * Walks the udpated cpu_stat tree on @cpu from @root.  %NULL @pos starts
- * the traversal and %NULL return indicates the end.  During traversal,
- * each returned cgroup is unlinked from the tree.  Must be called with the
- * matching cgroup_cpu_stat_lock held.
- *
- * The only ordering guarantee is that, for a parent and a child pair
- * covered by a given traversal, if a child is visited, its parent is
- * guaranteed to be visited afterwards.
- */
-static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos,
-						  struct cgroup *root, int cpu)
-{
-	struct cgroup_cpu_stat *cstat;
-	struct cgroup *parent;
-
-	if (pos == root)
-		return NULL;
-
-	/*
-	 * We're gonna walk down to the first leaf and visit/remove it.  We
-	 * can pick whatever unvisited node as the starting point.
-	 */
-	if (!pos)
-		pos = root;
-	else
-		pos = cgroup_parent(pos);
-
-	/* walk down to the first leaf */
-	while (true) {
-		cstat = cgroup_cpu_stat(pos, cpu);
-		if (cstat->updated_children == pos)
-			break;
-		pos = cstat->updated_children;
-	}
-
-	/*
-	 * Unlink @pos from the tree.  As the updated_children list is
-	 * singly linked, we have to walk it to find the removal point.
-	 * However, due to the way we traverse, @pos will be the first
-	 * child in most cases. The only exception is @root.
-	 */
-	parent = cgroup_parent(pos);
-	if (parent && cstat->updated_next) {
-		struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
-		struct cgroup_cpu_stat *ncstat;
-		struct cgroup **nextp;
-
-		nextp = &pcstat->updated_children;
-		while (true) {
-			ncstat = cgroup_cpu_stat(*nextp, cpu);
-			if (*nextp == pos)
-				break;
-
-			WARN_ON_ONCE(*nextp == parent);
-			nextp = &ncstat->updated_next;
-		}
-
-		*nextp = cstat->updated_next;
-		cstat->updated_next = NULL;
-	}
-
-	return pos;
-}
-
-static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat,
-				   struct cgroup_stat *src_stat)
-{
-	dst_stat->cputime.utime += src_stat->cputime.utime;
-	dst_stat->cputime.stime += src_stat->cputime.stime;
-	dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime;
-}
-
-static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu)
-{
-	struct cgroup *parent = cgroup_parent(cgrp);
-	struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
-	struct task_cputime *last_cputime = &cstat->last_cputime;
-	struct task_cputime cputime;
-	struct cgroup_stat delta;
-	unsigned seq;
-
-	lockdep_assert_held(&cgroup_stat_mutex);
-
-	/* fetch the current per-cpu values */
-	do {
-		seq = __u64_stats_fetch_begin(&cstat->sync);
-		cputime = cstat->cputime;
-	} while (__u64_stats_fetch_retry(&cstat->sync, seq));
-
-	/* accumulate the deltas to propgate */
-	delta.cputime.utime = cputime.utime - last_cputime->utime;
-	delta.cputime.stime = cputime.stime - last_cputime->stime;
-	delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
-					 last_cputime->sum_exec_runtime;
-	*last_cputime = cputime;
-
-	/* transfer the pending stat into delta */
-	cgroup_stat_accumulate(&delta, &cgrp->pending_stat);
-	memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat));
-
-	/* propagate delta into the global stat and the parent's pending */
-	cgroup_stat_accumulate(&cgrp->stat, &delta);
-	if (parent)
-		cgroup_stat_accumulate(&parent->pending_stat, &delta);
-}
-
-/* see cgroup_stat_flush() */
-static void cgroup_stat_flush_locked(struct cgroup *cgrp)
-{
-	int cpu;
-
-	lockdep_assert_held(&cgroup_stat_mutex);
-
-	for_each_possible_cpu(cpu) {
-		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
-		struct cgroup *pos = NULL;
-
-		raw_spin_lock_irq(cpu_lock);
-		while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu)))
-			cgroup_cpu_stat_flush_one(pos, cpu);
-		raw_spin_unlock_irq(cpu_lock);
-	}
-}
-
-/**
- * cgroup_stat_flush - flush stats in @cgrp's subtree
- * @cgrp: target cgroup
- *
- * Collect all per-cpu stats in @cgrp's subtree into the global counters
- * and propagate them upwards.  After this function returns, all cgroups in
- * the subtree have up-to-date ->stat.
- *
- * This also gets all cgroups in the subtree including @cgrp off the
- * ->updated_children lists.
- */
-void cgroup_stat_flush(struct cgroup *cgrp)
-{
-	mutex_lock(&cgroup_stat_mutex);
-	cgroup_stat_flush_locked(cgrp);
-	mutex_unlock(&cgroup_stat_mutex);
-}
-
-static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp)
-{
-	struct cgroup_cpu_stat *cstat;
-
-	cstat = get_cpu_ptr(cgrp->cpu_stat);
-	u64_stats_update_begin(&cstat->sync);
-	return cstat;
-}
-
-static void cgroup_cpu_stat_account_end(struct cgroup *cgrp,
-					struct cgroup_cpu_stat *cstat)
-{
-	u64_stats_update_end(&cstat->sync);
-	cgroup_cpu_stat_updated(cgrp, smp_processor_id());
-	put_cpu_ptr(cstat);
-}
-
-void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
-{
-	struct cgroup_cpu_stat *cstat;
-
-	cstat = cgroup_cpu_stat_account_begin(cgrp);
-	cstat->cputime.sum_exec_runtime += delta_exec;
-	cgroup_cpu_stat_account_end(cgrp, cstat);
-}
-
-void __cgroup_account_cputime_field(struct cgroup *cgrp,
-				    enum cpu_usage_stat index, u64 delta_exec)
-{
-	struct cgroup_cpu_stat *cstat;
-
-	cstat = cgroup_cpu_stat_account_begin(cgrp);
-
-	switch (index) {
-	case CPUTIME_USER:
-	case CPUTIME_NICE:
-		cstat->cputime.utime += delta_exec;
-		break;
-	case CPUTIME_SYSTEM:
-	case CPUTIME_IRQ:
-	case CPUTIME_SOFTIRQ:
-		cstat->cputime.stime += delta_exec;
-		break;
-	default:
-		break;
-	}
-
-	cgroup_cpu_stat_account_end(cgrp, cstat);
-}
-
-void cgroup_stat_show_cputime(struct seq_file *seq)
-{
-	struct cgroup *cgrp = seq_css(seq)->cgroup;
-	u64 usage, utime, stime;
-
-	if (!cgroup_parent(cgrp))
-		return;
-
-	mutex_lock(&cgroup_stat_mutex);
-
-	cgroup_stat_flush_locked(cgrp);
-
-	usage = cgrp->stat.cputime.sum_exec_runtime;
-	cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime,
-		       &utime, &stime);
-
-	mutex_unlock(&cgroup_stat_mutex);
-
-	do_div(usage, NSEC_PER_USEC);
-	do_div(utime, NSEC_PER_USEC);
-	do_div(stime, NSEC_PER_USEC);
-
-	seq_printf(seq, "usage_usec %llu\n"
-		   "user_usec %llu\n"
-		   "system_usec %llu\n",
-		   usage, utime, stime);
-}
-
-int cgroup_stat_init(struct cgroup *cgrp)
-{
-	int cpu;
-
-	/* the root cgrp has cpu_stat preallocated */
-	if (!cgrp->cpu_stat) {
-		cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat);
-		if (!cgrp->cpu_stat)
-			return -ENOMEM;
-	}
-
-	/* ->updated_children list is self terminated */
-	for_each_possible_cpu(cpu) {
-		struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
-
-		cstat->updated_children = cgrp;
-		u64_stats_init(&cstat->sync);
-	}
-
-	prev_cputime_init(&cgrp->stat.prev_cputime);
-
-	return 0;
-}
-
-void cgroup_stat_exit(struct cgroup *cgrp)
-{
-	int cpu;
-
-	cgroup_stat_flush(cgrp);
-
-	/* sanity check */
-	for_each_possible_cpu(cpu) {
-		struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
-
-		if (WARN_ON_ONCE(cstat->updated_children != cgrp) ||
-		    WARN_ON_ONCE(cstat->updated_next))
-			return;
-	}
-
-	free_percpu(cgrp->cpu_stat);
-	cgrp->cpu_stat = NULL;
-}
-
-void __init cgroup_stat_boot(void)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu));
-
-	BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp));
-}
diff --git a/kernel/compat.c b/kernel/compat.c
index 6d21894806b4..702aa846ddac 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -34,6 +34,7 @@ int compat_get_timex(struct timex *txc, const struct compat_timex __user *utp)
 {
 	struct compat_timex tx32;
 
+	memset(txc, 0, sizeof(struct timex));
 	if (copy_from_user(&tx32, utp, sizeof(struct compat_timex)))
 		return -EFAULT;
 
@@ -120,50 +121,6 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp
 			__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
 }
 
-static int __compat_get_timespec64(struct timespec64 *ts64,
-				   const struct compat_timespec __user *cts)
-{
-	struct compat_timespec ts;
-	int ret;
-
-	ret = copy_from_user(&ts, cts, sizeof(ts));
-	if (ret)
-		return -EFAULT;
-
-	ts64->tv_sec = ts.tv_sec;
-	ts64->tv_nsec = ts.tv_nsec;
-
-	return 0;
-}
-
-static int __compat_put_timespec64(const struct timespec64 *ts64,
-				   struct compat_timespec __user *cts)
-{
-	struct compat_timespec ts = {
-		.tv_sec = ts64->tv_sec,
-		.tv_nsec = ts64->tv_nsec
-	};
-	return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0;
-}
-
-int compat_get_timespec64(struct timespec64 *ts, const void __user *uts)
-{
-	if (COMPAT_USE_64BIT_TIME)
-		return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
-	else
-		return __compat_get_timespec64(ts, uts);
-}
-EXPORT_SYMBOL_GPL(compat_get_timespec64);
-
-int compat_put_timespec64(const struct timespec64 *ts, void __user *uts)
-{
-	if (COMPAT_USE_64BIT_TIME)
-		return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
-	else
-		return __compat_put_timespec64(ts, uts);
-}
-EXPORT_SYMBOL_GPL(compat_put_timespec64);
-
 int compat_get_timeval(struct timeval *tv, const void __user *utv)
 {
 	if (COMPAT_USE_64BIT_TIME)
@@ -367,6 +324,14 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t,  pid, unsigned int, len,
 	return ret;
 }
 
+/* Todo: Delete these extern declarations when get/put_compat_itimerspec64()
+ * are moved to kernel/time/time.c .
+ */
+extern int __compat_get_timespec64(struct timespec64 *ts64,
+				   const struct compat_timespec __user *cts);
+extern int __compat_put_timespec64(const struct timespec64 *ts64,
+				   struct compat_timespec __user *cts);
+
 int get_compat_itimerspec64(struct itimerspec64 *its,
 			const struct compat_itimerspec __user *uits)
 {
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index a93590cdd9e1..f7674d676889 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -454,6 +454,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_NUMBER(PG_lru);
 	VMCOREINFO_NUMBER(PG_private);
 	VMCOREINFO_NUMBER(PG_swapcache);
+	VMCOREINFO_NUMBER(PG_swapbacked);
 	VMCOREINFO_NUMBER(PG_slab);
 #ifdef CONFIG_MEMORY_FAILURE
 	VMCOREINFO_NUMBER(PG_hwpoison);
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 90ff129c88a2..62c301ad0773 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -242,11 +242,11 @@ static void kdb_printbp(kdb_bp_t *bp, int i)
 	kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT);
 
 	if (bp->bp_enabled)
-		kdb_printf("\n    is enabled");
+		kdb_printf("\n    is enabled ");
 	else
 		kdb_printf("\n    is disabled");
 
-	kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n",
+	kdb_printf("  addr at %016lx, hardtype=%d installed=%d\n",
 		   bp->bp_addr, bp->bp_type, bp->bp_installed);
 
 	kdb_printf("\n");
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index dbb0781a0533..e405677ee08d 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1150,6 +1150,16 @@ void kdb_set_current_task(struct task_struct *p)
 	kdb_current_regs = NULL;
 }
 
+static void drop_newline(char *buf)
+{
+	size_t len = strlen(buf);
+
+	if (len == 0)
+		return;
+	if (*(buf + len - 1) == '\n')
+		*(buf + len - 1) = '\0';
+}
+
 /*
  * kdb_local - The main code for kdb.  This routine is invoked on a
  *	specific processor, it is not global.  The main kdb() routine
@@ -1327,6 +1337,7 @@ do_full_getstr:
 		cmdptr = cmd_head;
 		diag = kdb_parse(cmdbuf);
 		if (diag == KDB_NOTFOUND) {
+			drop_newline(cmdbuf);
 			kdb_printf("Unknown kdb command: '%s'\n", cmdbuf);
 			diag = 0;
 		}
@@ -1566,6 +1577,7 @@ static int kdb_md(int argc, const char **argv)
 	int symbolic = 0;
 	int valid = 0;
 	int phys = 0;
+	int raw = 0;
 
 	kdbgetintenv("MDCOUNT", &mdcount);
 	kdbgetintenv("RADIX", &radix);
@@ -1575,9 +1587,10 @@ static int kdb_md(int argc, const char **argv)
 	repeat = mdcount * 16 / bytesperword;
 
 	if (strcmp(argv[0], "mdr") == 0) {
-		if (argc != 2)
+		if (argc == 2 || (argc == 0 && last_addr != 0))
+			valid = raw = 1;
+		else
 			return KDB_ARGCOUNT;
-		valid = 1;
 	} else if (isdigit(argv[0][2])) {
 		bytesperword = (int)(argv[0][2] - '0');
 		if (bytesperword == 0) {
@@ -1613,7 +1626,10 @@ static int kdb_md(int argc, const char **argv)
 		radix = last_radix;
 		bytesperword = last_bytesperword;
 		repeat = last_repeat;
-		mdcount = ((repeat * bytesperword) + 15) / 16;
+		if (raw)
+			mdcount = repeat;
+		else
+			mdcount = ((repeat * bytesperword) + 15) / 16;
 	}
 
 	if (argc) {
@@ -1630,7 +1646,10 @@ static int kdb_md(int argc, const char **argv)
 			diag = kdbgetularg(argv[nextarg], &val);
 			if (!diag) {
 				mdcount = (int) val;
-				repeat = mdcount * 16 / bytesperword;
+				if (raw)
+					repeat = mdcount;
+				else
+					repeat = mdcount * 16 / bytesperword;
 			}
 		}
 		if (argc >= nextarg+1) {
@@ -1640,8 +1659,15 @@ static int kdb_md(int argc, const char **argv)
 		}
 	}
 
-	if (strcmp(argv[0], "mdr") == 0)
-		return kdb_mdr(addr, mdcount);
+	if (strcmp(argv[0], "mdr") == 0) {
+		int ret;
+		last_addr = addr;
+		ret = kdb_mdr(addr, mdcount);
+		last_addr += mdcount;
+		last_repeat = mdcount;
+		last_bytesperword = bytesperword; // to make REPEAT happy
+		return ret;
+	}
 
 	switch (radix) {
 	case 10:
@@ -2473,41 +2499,6 @@ static int kdb_kill(int argc, const char **argv)
 	return 0;
 }
 
-struct kdb_tm {
-	int tm_sec;	/* seconds */
-	int tm_min;	/* minutes */
-	int tm_hour;	/* hours */
-	int tm_mday;	/* day of the month */
-	int tm_mon;	/* month */
-	int tm_year;	/* year */
-};
-
-static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
-{
-	/* This will work from 1970-2099, 2100 is not a leap year */
-	static int mon_day[] = { 31, 29, 31, 30, 31, 30, 31,
-				 31, 30, 31, 30, 31 };
-	memset(tm, 0, sizeof(*tm));
-	tm->tm_sec  = tv->tv_sec % (24 * 60 * 60);
-	tm->tm_mday = tv->tv_sec / (24 * 60 * 60) +
-		(2 * 365 + 1); /* shift base from 1970 to 1968 */
-	tm->tm_min =  tm->tm_sec / 60 % 60;
-	tm->tm_hour = tm->tm_sec / 60 / 60;
-	tm->tm_sec =  tm->tm_sec % 60;
-	tm->tm_year = 68 + 4*(tm->tm_mday / (4*365+1));
-	tm->tm_mday %= (4*365+1);
-	mon_day[1] = 29;
-	while (tm->tm_mday >= mon_day[tm->tm_mon]) {
-		tm->tm_mday -= mon_day[tm->tm_mon];
-		if (++tm->tm_mon == 12) {
-			tm->tm_mon = 0;
-			++tm->tm_year;
-			mon_day[1] = 28;
-		}
-	}
-	++tm->tm_mday;
-}
-
 /*
  * Most of this code has been lifted from kernel/timer.c::sys_sysinfo().
  * I cannot call that code directly from kdb, it has an unconditional
@@ -2515,10 +2506,10 @@ static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
  */
 static void kdb_sysinfo(struct sysinfo *val)
 {
-	struct timespec uptime;
-	ktime_get_ts(&uptime);
+	u64 uptime = ktime_get_mono_fast_ns();
+
 	memset(val, 0, sizeof(*val));
-	val->uptime = uptime.tv_sec;
+	val->uptime = div_u64(uptime, NSEC_PER_SEC);
 	val->loads[0] = avenrun[0];
 	val->loads[1] = avenrun[1];
 	val->loads[2] = avenrun[2];
@@ -2533,8 +2524,8 @@ static void kdb_sysinfo(struct sysinfo *val)
  */
 static int kdb_summary(int argc, const char **argv)
 {
-	struct timespec now;
-	struct kdb_tm tm;
+	time64_t now;
+	struct tm tm;
 	struct sysinfo val;
 
 	if (argc)
@@ -2548,9 +2539,9 @@ static int kdb_summary(int argc, const char **argv)
 	kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
 	kdb_printf("ccversion  %s\n", __stringify(CCVERSION));
 
-	now = __current_kernel_time();
-	kdb_gmtime(&now, &tm);
-	kdb_printf("date       %04d-%02d-%02d %02d:%02d:%02d "
+	now = __ktime_get_real_seconds();
+	time64_to_tm(now, 0, &tm);
+	kdb_printf("date       %04ld-%02d-%02d %02d:%02d:%02d "
 		   "tz_minuteswest %d\n",
 		1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday,
 		tm.tm_hour, tm.tm_min, tm.tm_sec,
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index d35cc2d3a4cc..990b3cc526c8 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -129,13 +129,13 @@ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
 		}
 		if (i >= ARRAY_SIZE(kdb_name_table)) {
 			debug_kfree(kdb_name_table[0]);
-			memcpy(kdb_name_table, kdb_name_table+1,
+			memmove(kdb_name_table, kdb_name_table+1,
 			       sizeof(kdb_name_table[0]) *
 			       (ARRAY_SIZE(kdb_name_table)-1));
 		} else {
 			debug_kfree(knt1);
 			knt1 = kdb_name_table[i];
-			memcpy(kdb_name_table+i, kdb_name_table+i+1,
+			memmove(kdb_name_table+i, kdb_name_table+i+1,
 			       sizeof(kdb_name_table[0]) *
 			       (ARRAY_SIZE(kdb_name_table)-i-1));
 		}
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index e2764d767f18..ca8ac2824f0b 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -44,23 +44,24 @@ void __delayacct_tsk_init(struct task_struct *tsk)
 {
 	tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
 	if (tsk->delays)
-		spin_lock_init(&tsk->delays->lock);
+		raw_spin_lock_init(&tsk->delays->lock);
 }
 
 /*
  * Finish delay accounting for a statistic using its timestamps (@start),
  * accumalator (@total) and @count
  */
-static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count)
+static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total,
+			  u32 *count)
 {
 	s64 ns = ktime_get_ns() - *start;
 	unsigned long flags;
 
 	if (ns > 0) {
-		spin_lock_irqsave(lock, flags);
+		raw_spin_lock_irqsave(lock, flags);
 		*total += ns;
 		(*count)++;
-		spin_unlock_irqrestore(lock, flags);
+		raw_spin_unlock_irqrestore(lock, flags);
 	}
 }
 
@@ -127,7 +128,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 
 	/* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
 
-	spin_lock_irqsave(&tsk->delays->lock, flags);
+	raw_spin_lock_irqsave(&tsk->delays->lock, flags);
 	tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
 	d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
 	tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
@@ -137,7 +138,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	d->blkio_count += tsk->delays->blkio_count;
 	d->swapin_count += tsk->delays->swapin_count;
 	d->freepages_count += tsk->delays->freepages_count;
-	spin_unlock_irqrestore(&tsk->delays->lock, flags);
+	raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
 
 	return 0;
 }
@@ -147,10 +148,10 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
 	__u64 ret;
 	unsigned long flags;
 
-	spin_lock_irqsave(&tsk->delays->lock, flags);
+	raw_spin_lock_irqsave(&tsk->delays->lock, flags);
 	ret = nsec_to_clock_t(tsk->delays->blkio_delay +
 				tsk->delays->swapin_delay);
-	spin_unlock_irqrestore(&tsk->delays->lock, flags);
+	raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
 	return ret;
 }
 
diff --git a/kernel/dma.c b/kernel/dma.c
index 3506fc34a712..40f152936316 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -135,21 +135,9 @@ static int proc_dma_show(struct seq_file *m, void *v)
 }
 #endif /* MAX_DMA_CHANNELS */
 
-static int proc_dma_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, proc_dma_show, NULL);
-}
-
-static const struct file_operations proc_dma_operations = {
-	.open		= proc_dma_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int __init proc_dma_init(void)
 {
-	proc_create("dma", 0, NULL, &proc_dma_operations);
+	proc_create_single("dma", 0, NULL, proc_dma_show);
 	return 0;
 }
 
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 772a43fea825..c187aa3df3c8 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -119,23 +119,20 @@ int get_callchain_buffers(int event_max_stack)
 		goto exit;
 	}
 
-	if (count > 1) {
-		/* If the allocation failed, give up */
-		if (!callchain_cpus_entries)
-			err = -ENOMEM;
-		/*
-		 * If requesting per event more than the global cap,
-		 * return a different error to help userspace figure
-		 * this out.
-		 *
-		 * And also do it here so that we have &callchain_mutex held.
-		 */
-		if (event_max_stack > sysctl_perf_event_max_stack)
-			err = -EOVERFLOW;
+	/*
+	 * If requesting per event more than the global cap,
+	 * return a different error to help userspace figure
+	 * this out.
+	 *
+	 * And also do it here so that we have &callchain_mutex held.
+	 */
+	if (event_max_stack > sysctl_perf_event_max_stack) {
+		err = -EOVERFLOW;
 		goto exit;
 	}
 
-	err = alloc_callchain_buffers();
+	if (count == 1)
+		err = alloc_callchain_buffers();
 exit:
 	if (err)
 		atomic_dec(&nr_callchain_events);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fc1c330c6bd6..80cca2b30c4f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4447,6 +4447,9 @@ static void _free_event(struct perf_event *event)
 	if (event->ctx)
 		put_ctx(event->ctx);
 
+	if (event->hw.target)
+		put_task_struct(event->hw.target);
+
 	exclusive_event_destroy(event);
 	module_put(event->pmu->module);
 
@@ -5117,6 +5120,8 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd,
 	switch (_IOC_NR(cmd)) {
 	case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
 	case _IOC_NR(PERF_EVENT_IOC_ID):
+	case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
+	case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
 		/* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
 		if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
 			cmd &= ~IOCSIZE_MASK;
@@ -6665,7 +6670,7 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
 
 	raw_spin_lock_irqsave(&ifh->lock, flags);
 	list_for_each_entry(filter, &ifh->list, entry) {
-		if (filter->inode) {
+		if (filter->path.dentry) {
 			event->addr_filters_offs[count] = 0;
 			restart++;
 		}
@@ -7330,7 +7335,7 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter,
 				     struct file *file, unsigned long offset,
 				     unsigned long size)
 {
-	if (filter->inode != file_inode(file))
+	if (d_inode(filter->path.dentry) != file_inode(file))
 		return false;
 
 	if (filter->offset > offset + size)
@@ -7584,6 +7589,10 @@ static void perf_event_switch(struct task_struct *task,
 		},
 	};
 
+	if (!sched_in && task->state == TASK_RUNNING)
+		switch_event.event_id.header.misc |=
+				PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
+
 	perf_iterate_sb(perf_event_switch_output,
 		       &switch_event,
 		       NULL);
@@ -8397,6 +8406,10 @@ static int perf_kprobe_event_init(struct perf_event *event)
 
 	if (event->attr.type != perf_kprobe.type)
 		return -ENOENT;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
 	/*
 	 * no branch sampling for probe events
 	 */
@@ -8434,6 +8447,10 @@ static int perf_uprobe_event_init(struct perf_event *event)
 
 	if (event->attr.type != perf_uprobe.type)
 		return -ENOENT;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
 	/*
 	 * no branch sampling for probe events
 	 */
@@ -8671,8 +8688,7 @@ static void free_filters_list(struct list_head *filters)
 	struct perf_addr_filter *filter, *iter;
 
 	list_for_each_entry_safe(filter, iter, filters, entry) {
-		if (filter->inode)
-			iput(filter->inode);
+		path_put(&filter->path);
 		list_del(&filter->entry);
 		kfree(filter);
 	}
@@ -8769,7 +8785,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
 		 * Adjust base offset if the filter is associated to a binary
 		 * that needs to be mapped:
 		 */
-		if (filter->inode)
+		if (filter->path.dentry)
 			event->addr_filters_offs[count] =
 				perf_addr_filter_apply(filter, mm);
 
@@ -8843,7 +8859,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
 {
 	struct perf_addr_filter *filter = NULL;
 	char *start, *orig, *filename = NULL;
-	struct path path;
 	substring_t args[MAX_OPT_ARGS];
 	int state = IF_STATE_ACTION, token;
 	unsigned int kernel = 0;
@@ -8956,19 +8971,18 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
 					goto fail_free_name;
 
 				/* look up the path and grab its inode */
-				ret = kern_path(filename, LOOKUP_FOLLOW, &path);
+				ret = kern_path(filename, LOOKUP_FOLLOW,
+						&filter->path);
 				if (ret)
 					goto fail_free_name;
 
-				filter->inode = igrab(d_inode(path.dentry));
-				path_put(&path);
 				kfree(filename);
 				filename = NULL;
 
 				ret = -EINVAL;
-				if (!filter->inode ||
-				    !S_ISREG(filter->inode->i_mode))
-					/* free_filters_list() will iput() */
+				if (!filter->path.dentry ||
+				    !S_ISREG(d_inode(filter->path.dentry)
+					     ->i_mode))
 					goto fail;
 
 				event->addr_filters.nr_file_filters++;
@@ -9955,6 +9969,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		 * and we cannot use the ctx information because we need the
 		 * pmu before we get a ctx.
 		 */
+		get_task_struct(task);
 		event->hw.target = task;
 	}
 
@@ -10070,6 +10085,8 @@ err_ns:
 		perf_detach_cgroup(event);
 	if (event->ns)
 		put_pid_ns(event->ns);
+	if (event->hw.target)
+		put_task_struct(event->hw.target);
 	kfree(event);
 
 	return ERR_PTR(err);
@@ -10191,9 +10208,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 		 * __u16 sample size limit.
 		 */
 		if (attr->sample_stack_user >= USHRT_MAX)
-			ret = -EINVAL;
+			return -EINVAL;
 		else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
-			ret = -EINVAL;
+			return -EINVAL;
 	}
 
 	if (!attr->sample_max_stack)
@@ -10503,19 +10520,20 @@ SYSCALL_DEFINE5(perf_event_open,
 	if (pmu->task_ctx_nr == perf_sw_context)
 		event->event_caps |= PERF_EV_CAP_SOFTWARE;
 
-	if (group_leader &&
-	    (is_software_event(event) != is_software_event(group_leader))) {
-		if (is_software_event(event)) {
+	if (group_leader) {
+		if (is_software_event(event) &&
+		    !in_software_context(group_leader)) {
 			/*
-			 * If event and group_leader are not both a software
-			 * event, and event is, then group leader is not.
+			 * If the event is a sw event, but the group_leader
+			 * is on hw context.
 			 *
-			 * Allow the addition of software events to !software
-			 * groups, this is safe because software events never
-			 * fail to schedule.
+			 * Allow the addition of software events to hw
+			 * groups, this is safe because software events
+			 * never fail to schedule.
 			 */
-			pmu = group_leader->pmu;
-		} else if (is_software_event(group_leader) &&
+			pmu = group_leader->ctx->pmu;
+		} else if (!is_software_event(event) &&
+			   is_software_event(group_leader) &&
 			   (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
 			/*
 			 * In case the group is a pure software group, and we
@@ -11194,6 +11212,14 @@ struct file *perf_event_get(unsigned int fd)
 	return file;
 }
 
+const struct perf_event *perf_get_event(struct file *file)
+{
+	if (file->f_op != &perf_fops)
+		return ERR_PTR(-EINVAL);
+
+	return file->private_data;
+}
+
 const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
 {
 	if (!event)
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 6c6b3c48db71..1d8ca9ea9979 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/circ_buf.h>
 #include <linux/poll.h>
+#include <linux/nospec.h>
 
 #include "internal.h"
 
@@ -867,8 +868,10 @@ perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
 			return NULL;
 
 		/* AUX space */
-		if (pgoff >= rb->aux_pgoff)
-			return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]);
+		if (pgoff >= rb->aux_pgoff) {
+			int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff, rb->aux_nr_pages);
+			return virt_to_page(rb->aux_pages[aux_pgoff]);
+		}
 	}
 
 	return __perf_mmap_to_page(rb, pgoff);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ce6848e46e94..1725b902983f 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -491,7 +491,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
 	if (!uprobe)
 		return NULL;
 
-	uprobe->inode = igrab(inode);
+	uprobe->inode = inode;
 	uprobe->offset = offset;
 	init_rwsem(&uprobe->register_rwsem);
 	init_rwsem(&uprobe->consumer_rwsem);
@@ -502,7 +502,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
 	if (cur_uprobe) {
 		kfree(uprobe);
 		uprobe = cur_uprobe;
-		iput(inode);
 	}
 
 	return uprobe;
@@ -701,7 +700,6 @@ static void delete_uprobe(struct uprobe *uprobe)
 	rb_erase(&uprobe->rb_node, &uprobes_tree);
 	spin_unlock(&uprobes_treelock);
 	RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
-	iput(uprobe->inode);
 	put_uprobe(uprobe);
 }
 
@@ -873,7 +871,8 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u
  * tuple).  Creation refcount stops uprobe_unregister from freeing the
  * @uprobe even before the register operation is complete. Creation
  * refcount is released when the last @uc for the @uprobe
- * unregisters.
+ * unregisters. Caller of uprobe_register() is required to keep @inode
+ * (and the containing mount) referenced.
  *
  * Return errno if it cannot successully install probes
  * else return 0 (success)
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index a5697119290e..33f07c5f2515 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -27,21 +27,9 @@ static int execdomains_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int execdomains_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, execdomains_proc_show, NULL);
-}
-
-static const struct file_operations execdomains_proc_fops = {
-	.open		= execdomains_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int __init proc_execdomains_init(void)
 {
-	proc_create("execdomains", 0, NULL, &execdomains_proc_fops);
+	proc_create_single("execdomains", 0, NULL, execdomains_proc_show);
 	return 0;
 }
 module_init(proc_execdomains_init);
diff --git a/kernel/fork.c b/kernel/fork.c
index 242c8c93d285..80b48a8fb47b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -216,10 +216,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 		if (!s)
 			continue;
 
-#ifdef CONFIG_DEBUG_KMEMLEAK
 		/* Clear stale pointers from reused stack. */
 		memset(s->addr, 0, THREAD_SIZE);
-#endif
+
 		tsk->stack_vm_area = s;
 		return s->addr;
 	}
@@ -1713,7 +1712,7 @@ static __latent_entropy struct task_struct *copy_process(
 	p->start_time = ktime_get_ns();
 	p->real_start_time = ktime_get_boot_ns();
 	p->io_context = NULL;
-	p->audit_context = NULL;
+	audit_set_context(p, NULL);
 	cgroup_fork(p);
 #ifdef CONFIG_NUMA
 	p->mempolicy = mpol_dup(p->mempolicy);
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index a37a3b4b6342..f4f29b9d90ee 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
 	}
 }
 
-static cpumask_var_t *alloc_node_to_possible_cpumask(void)
+static cpumask_var_t *alloc_node_to_cpumask(void)
 {
 	cpumask_var_t *masks;
 	int node;
@@ -62,7 +62,7 @@ out_unwind:
 	return NULL;
 }
 
-static void free_node_to_possible_cpumask(cpumask_var_t *masks)
+static void free_node_to_cpumask(cpumask_var_t *masks)
 {
 	int node;
 
@@ -71,7 +71,7 @@ static void free_node_to_possible_cpumask(cpumask_var_t *masks)
 	kfree(masks);
 }
 
-static void build_node_to_possible_cpumask(cpumask_var_t *masks)
+static void build_node_to_cpumask(cpumask_var_t *masks)
 {
 	int cpu;
 
@@ -79,14 +79,14 @@ static void build_node_to_possible_cpumask(cpumask_var_t *masks)
 		cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
 }
 
-static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask,
+static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
 				const struct cpumask *mask, nodemask_t *nodemsk)
 {
 	int n, nodes = 0;
 
 	/* Calculate the number of nodes in the supplied affinity mask */
 	for_each_node(n) {
-		if (cpumask_intersects(mask, node_to_possible_cpumask[n])) {
+		if (cpumask_intersects(mask, node_to_cpumask[n])) {
 			node_set(n, *nodemsk);
 			nodes++;
 		}
@@ -94,73 +94,46 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask,
 	return nodes;
 }
 
-/**
- * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
- * @nvecs:	The total number of vectors
- * @affd:	Description of the affinity requirements
- *
- * Returns the masks pointer or NULL if allocation failed.
- */
-struct cpumask *
-irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
+static int irq_build_affinity_masks(const struct irq_affinity *affd,
+				    int startvec, int numvecs,
+				    cpumask_var_t *node_to_cpumask,
+				    const struct cpumask *cpu_mask,
+				    struct cpumask *nmsk,
+				    struct cpumask *masks)
 {
-	int n, nodes, cpus_per_vec, extra_vecs, curvec;
-	int affv = nvecs - affd->pre_vectors - affd->post_vectors;
-	int last_affv = affv + affd->pre_vectors;
+	int n, nodes, cpus_per_vec, extra_vecs, done = 0;
+	int last_affv = affd->pre_vectors + numvecs;
+	int curvec = startvec;
 	nodemask_t nodemsk = NODE_MASK_NONE;
-	struct cpumask *masks;
-	cpumask_var_t nmsk, *node_to_possible_cpumask;
-
-	/*
-	 * If there aren't any vectors left after applying the pre/post
-	 * vectors don't bother with assigning affinity.
-	 */
-	if (!affv)
-		return NULL;
-
-	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
-		return NULL;
-
-	masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
-	if (!masks)
-		goto out;
 
-	node_to_possible_cpumask = alloc_node_to_possible_cpumask();
-	if (!node_to_possible_cpumask)
-		goto out;
+	if (!cpumask_weight(cpu_mask))
+		return 0;
 
-	/* Fill out vectors at the beginning that don't need affinity */
-	for (curvec = 0; curvec < affd->pre_vectors; curvec++)
-		cpumask_copy(masks + curvec, irq_default_affinity);
-
-	/* Stabilize the cpumasks */
-	get_online_cpus();
-	build_node_to_possible_cpumask(node_to_possible_cpumask);
-	nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask,
-				     &nodemsk);
+	nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
 
 	/*
 	 * If the number of nodes in the mask is greater than or equal the
 	 * number of vectors we just spread the vectors across the nodes.
 	 */
-	if (affv <= nodes) {
+	if (numvecs <= nodes) {
 		for_each_node_mask(n, nodemsk) {
-			cpumask_copy(masks + curvec,
-				     node_to_possible_cpumask[n]);
-			if (++curvec == last_affv)
+			cpumask_copy(masks + curvec, node_to_cpumask[n]);
+			if (++done == numvecs)
 				break;
+			if (++curvec == last_affv)
+				curvec = affd->pre_vectors;
 		}
-		goto done;
+		goto out;
 	}
 
 	for_each_node_mask(n, nodemsk) {
 		int ncpus, v, vecs_to_assign, vecs_per_node;
 
 		/* Spread the vectors per node */
-		vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes;
+		vecs_per_node = (numvecs - (curvec - affd->pre_vectors)) / nodes;
 
 		/* Get the cpus on this node which are in the mask */
-		cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]);
+		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
 
 		/* Calculate the number of cpus per vector */
 		ncpus = cpumask_weight(nmsk);
@@ -181,19 +154,96 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 			irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec);
 		}
 
-		if (curvec >= last_affv)
+		done += v;
+		if (done >= numvecs)
 			break;
+		if (curvec >= last_affv)
+			curvec = affd->pre_vectors;
 		--nodes;
 	}
 
-done:
+out:
+	return done;
+}
+
+/**
+ * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
+ * @nvecs:	The total number of vectors
+ * @affd:	Description of the affinity requirements
+ *
+ * Returns the masks pointer or NULL if allocation failed.
+ */
+struct cpumask *
+irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
+{
+	int affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
+	int curvec, usedvecs;
+	cpumask_var_t nmsk, npresmsk, *node_to_cpumask;
+	struct cpumask *masks = NULL;
+
+	/*
+	 * If there aren't any vectors left after applying the pre/post
+	 * vectors don't bother with assigning affinity.
+	 */
+	if (nvecs == affd->pre_vectors + affd->post_vectors)
+		return NULL;
+
+	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
+		return NULL;
+
+	if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
+		goto outcpumsk;
+
+	node_to_cpumask = alloc_node_to_cpumask();
+	if (!node_to_cpumask)
+		goto outnpresmsk;
+
+	masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
+	if (!masks)
+		goto outnodemsk;
+
+	/* Fill out vectors at the beginning that don't need affinity */
+	for (curvec = 0; curvec < affd->pre_vectors; curvec++)
+		cpumask_copy(masks + curvec, irq_default_affinity);
+
+	/* Stabilize the cpumasks */
+	get_online_cpus();
+	build_node_to_cpumask(node_to_cpumask);
+
+	/* Spread on present CPUs starting from affd->pre_vectors */
+	usedvecs = irq_build_affinity_masks(affd, curvec, affvecs,
+					    node_to_cpumask, cpu_present_mask,
+					    nmsk, masks);
+
+	/*
+	 * Spread on non present CPUs starting from the next vector to be
+	 * handled. If the spreading of present CPUs already exhausted the
+	 * vector space, assign the non present CPUs to the already spread
+	 * out vectors.
+	 */
+	if (usedvecs >= affvecs)
+		curvec = affd->pre_vectors;
+	else
+		curvec = affd->pre_vectors + usedvecs;
+	cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
+	usedvecs += irq_build_affinity_masks(affd, curvec, affvecs,
+					     node_to_cpumask, npresmsk,
+					     nmsk, masks);
 	put_online_cpus();
 
 	/* Fill out vectors at the end that don't need affinity */
+	if (usedvecs >= affvecs)
+		curvec = affd->pre_vectors + affvecs;
+	else
+		curvec = affd->pre_vectors + usedvecs;
 	for (; curvec < nvecs; curvec++)
 		cpumask_copy(masks + curvec, irq_default_affinity);
-	free_node_to_possible_cpumask(node_to_possible_cpumask);
-out:
+
+outnodemsk:
+	free_node_to_cpumask(node_to_cpumask);
+outnpresmsk:
+	free_cpumask_var(npresmsk);
+outcpumsk:
 	free_cpumask_var(nmsk);
 	return masks;
 }
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index fc4f361a86bb..dd20d0d528d4 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -1,11 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0+
 /*
- * Copyright (C) 2017 Bartosz Golaszewski <brgl@bgdev.pl>
- *
- * This program is free software; you can redistribute  it and/or modify it
- * under  the terms of  the GNU General  Public License as published by the
- * Free Software Foundation;  either version 2 of the  License, or (at your
- * option) any later version.
+ * Copyright (C) 2017-2018 Bartosz Golaszewski <brgl@bgdev.pl>
  */
 
 #include <linux/slab.h>
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 2a8571f72b17..4ca2fd46645d 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -76,6 +76,19 @@ static inline void irq_chip_write_msi_msg(struct irq_data *data,
 	data->chip->irq_write_msi_msg(data, msg);
 }
 
+static void msi_check_level(struct irq_domain *domain, struct msi_msg *msg)
+{
+	struct msi_domain_info *info = domain->host_data;
+
+	/*
+	 * If the MSI provider has messed with the second message and
+	 * not advertized that it is level-capable, signal the breakage.
+	 */
+	WARN_ON(!((info->flags & MSI_FLAG_LEVEL_CAPABLE) &&
+		  (info->chip->flags & IRQCHIP_SUPPORTS_LEVEL_MSI)) &&
+		(msg[1].address_lo || msg[1].address_hi || msg[1].data));
+}
+
 /**
  * msi_domain_set_affinity - Generic affinity setter function for MSI domains
  * @irq_data:	The irq data associated to the interrupt
@@ -89,13 +102,14 @@ int msi_domain_set_affinity(struct irq_data *irq_data,
 			    const struct cpumask *mask, bool force)
 {
 	struct irq_data *parent = irq_data->parent_data;
-	struct msi_msg msg;
+	struct msi_msg msg[2] = { [1] = { }, };
 	int ret;
 
 	ret = parent->chip->irq_set_affinity(parent, mask, force);
 	if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) {
-		BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
-		irq_chip_write_msi_msg(irq_data, &msg);
+		BUG_ON(irq_chip_compose_msi_msg(irq_data, msg));
+		msi_check_level(irq_data->domain, msg);
+		irq_chip_write_msi_msg(irq_data, msg);
 	}
 
 	return ret;
@@ -104,20 +118,21 @@ int msi_domain_set_affinity(struct irq_data *irq_data,
 static int msi_domain_activate(struct irq_domain *domain,
 			       struct irq_data *irq_data, bool early)
 {
-	struct msi_msg msg;
+	struct msi_msg msg[2] = { [1] = { }, };
 
-	BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
-	irq_chip_write_msi_msg(irq_data, &msg);
+	BUG_ON(irq_chip_compose_msi_msg(irq_data, msg));
+	msi_check_level(irq_data->domain, msg);
+	irq_chip_write_msi_msg(irq_data, msg);
 	return 0;
 }
 
 static void msi_domain_deactivate(struct irq_domain *domain,
 				  struct irq_data *irq_data)
 {
-	struct msi_msg msg;
+	struct msi_msg msg[2];
 
-	memset(&msg, 0, sizeof(msg));
-	irq_chip_write_msi_msg(irq_data, &msg);
+	memset(msg, 0, sizeof(msg));
+	irq_chip_write_msi_msg(irq_data, msg);
 }
 
 static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 7cb091d81d91..37eda10f5c36 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -185,11 +185,6 @@ static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
 	return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode));
 }
 
-static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, irq_affinity_hint_proc_show, PDE_DATA(inode));
-}
-
 static const struct file_operations irq_affinity_proc_fops = {
 	.open		= irq_affinity_proc_open,
 	.read		= seq_read,
@@ -198,13 +193,6 @@ static const struct file_operations irq_affinity_proc_fops = {
 	.write		= irq_affinity_proc_write,
 };
 
-static const struct file_operations irq_affinity_hint_proc_fops = {
-	.open		= irq_affinity_hint_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static const struct file_operations irq_affinity_list_proc_fops = {
 	.open		= irq_affinity_list_proc_open,
 	.read		= seq_read,
@@ -223,32 +211,6 @@ static int irq_effective_aff_list_proc_show(struct seq_file *m, void *v)
 {
 	return show_irq_affinity(EFFECTIVE_LIST, m);
 }
-
-static int irq_effective_aff_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, irq_effective_aff_proc_show, PDE_DATA(inode));
-}
-
-static int irq_effective_aff_list_proc_open(struct inode *inode,
-					    struct file *file)
-{
-	return single_open(file, irq_effective_aff_list_proc_show,
-			   PDE_DATA(inode));
-}
-
-static const struct file_operations irq_effective_aff_proc_fops = {
-	.open		= irq_effective_aff_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static const struct file_operations irq_effective_aff_list_proc_fops = {
-	.open		= irq_effective_aff_list_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif
 
 static int default_affinity_show(struct seq_file *m, void *v)
@@ -313,18 +275,6 @@ static int irq_node_proc_show(struct seq_file *m, void *v)
 	seq_printf(m, "%d\n", irq_desc_get_node(desc));
 	return 0;
 }
-
-static int irq_node_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, irq_node_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations irq_node_proc_fops = {
-	.open		= irq_node_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
 #endif
 
 static int irq_spurious_proc_show(struct seq_file *m, void *v)
@@ -337,18 +287,6 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int irq_spurious_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, irq_spurious_proc_show, PDE_DATA(inode));
-}
-
-static const struct file_operations irq_spurious_proc_fops = {
-	.open		= irq_spurious_proc_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 #define MAX_NAMELEN 128
 
 static int name_unique(unsigned int irq, struct irqaction *new_action)
@@ -421,24 +359,24 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 			 &irq_affinity_proc_fops, irqp);
 
 	/* create /proc/irq/<irq>/affinity_hint */
-	proc_create_data("affinity_hint", 0444, desc->dir,
-			 &irq_affinity_hint_proc_fops, irqp);
+	proc_create_single_data("affinity_hint", 0444, desc->dir,
+			irq_affinity_hint_proc_show, irqp);
 
 	/* create /proc/irq/<irq>/smp_affinity_list */
 	proc_create_data("smp_affinity_list", 0644, desc->dir,
 			 &irq_affinity_list_proc_fops, irqp);
 
-	proc_create_data("node", 0444, desc->dir,
-			 &irq_node_proc_fops, irqp);
+	proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show,
+			irqp);
 # ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
-	proc_create_data("effective_affinity", 0444, desc->dir,
-			 &irq_effective_aff_proc_fops, irqp);
-	proc_create_data("effective_affinity_list", 0444, desc->dir,
-			 &irq_effective_aff_list_proc_fops, irqp);
+	proc_create_single_data("effective_affinity", 0444, desc->dir,
+			irq_effective_aff_proc_show, irqp);
+	proc_create_single_data("effective_affinity_list", 0444, desc->dir,
+			irq_effective_aff_list_proc_show, irqp);
 # endif
 #endif
-	proc_create_data("spurious", 0444, desc->dir,
-			 &irq_spurious_proc_fops, (void *)(long)irq);
+	proc_create_single_data("spurious", 0444, desc->dir,
+			irq_spurious_proc_show, (void *)(long)irq);
 
 out_unlock:
 	mutex_unlock(&register_lock);
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index e5bcd94c1efb..75d8e7cf040e 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -22,50 +22,123 @@
 #include <linux/ima.h>
 #include <crypto/hash.h>
 #include <crypto/sha.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/kernel.h>
+#include <linux/kexec.h>
+#include <linux/slab.h>
 #include <linux/syscalls.h>
 #include <linux/vmalloc.h>
 #include "kexec_internal.h"
 
 static int kexec_calculate_store_digests(struct kimage *image);
 
+/*
+ * Currently this is the only default function that is exported as some
+ * architectures need it to do additional handlings.
+ * In the future, other default functions may be exported too if required.
+ */
+int kexec_image_probe_default(struct kimage *image, void *buf,
+			      unsigned long buf_len)
+{
+	const struct kexec_file_ops * const *fops;
+	int ret = -ENOEXEC;
+
+	for (fops = &kexec_file_loaders[0]; *fops && (*fops)->probe; ++fops) {
+		ret = (*fops)->probe(buf, buf_len);
+		if (!ret) {
+			image->fops = *fops;
+			return ret;
+		}
+	}
+
+	return ret;
+}
+
 /* Architectures can provide this probe function */
 int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
 					 unsigned long buf_len)
 {
-	return -ENOEXEC;
+	return kexec_image_probe_default(image, buf, buf_len);
+}
+
+static void *kexec_image_load_default(struct kimage *image)
+{
+	if (!image->fops || !image->fops->load)
+		return ERR_PTR(-ENOEXEC);
+
+	return image->fops->load(image, image->kernel_buf,
+				 image->kernel_buf_len, image->initrd_buf,
+				 image->initrd_buf_len, image->cmdline_buf,
+				 image->cmdline_buf_len);
 }
 
 void * __weak arch_kexec_kernel_image_load(struct kimage *image)
 {
-	return ERR_PTR(-ENOEXEC);
+	return kexec_image_load_default(image);
+}
+
+static int kexec_image_post_load_cleanup_default(struct kimage *image)
+{
+	if (!image->fops || !image->fops->cleanup)
+		return 0;
+
+	return image->fops->cleanup(image->image_loader_data);
 }
 
 int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
 {
-	return -EINVAL;
+	return kexec_image_post_load_cleanup_default(image);
 }
 
 #ifdef CONFIG_KEXEC_VERIFY_SIG
+static int kexec_image_verify_sig_default(struct kimage *image, void *buf,
+					  unsigned long buf_len)
+{
+	if (!image->fops || !image->fops->verify_sig) {
+		pr_debug("kernel loader does not support signature verification.\n");
+		return -EKEYREJECTED;
+	}
+
+	return image->fops->verify_sig(buf, buf_len);
+}
+
 int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
 					unsigned long buf_len)
 {
-	return -EKEYREJECTED;
+	return kexec_image_verify_sig_default(image, buf, buf_len);
 }
 #endif
 
-/* Apply relocations of type RELA */
+/*
+ * arch_kexec_apply_relocations_add - apply relocations of type RELA
+ * @pi:		Purgatory to be relocated.
+ * @section:	Section relocations applying to.
+ * @relsec:	Section containing RELAs.
+ * @symtab:	Corresponding symtab.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
 int __weak
-arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
-				 unsigned int relsec)
+arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section,
+				 const Elf_Shdr *relsec, const Elf_Shdr *symtab)
 {
 	pr_err("RELA relocation unsupported.\n");
 	return -ENOEXEC;
 }
 
-/* Apply relocations of type REL */
+/*
+ * arch_kexec_apply_relocations - apply relocations of type REL
+ * @pi:		Purgatory to be relocated.
+ * @section:	Section relocations applying to.
+ * @relsec:	Section containing RELs.
+ * @symtab:	Corresponding symtab.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
 int __weak
-arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
-			     unsigned int relsec)
+arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section,
+			     const Elf_Shdr *relsec, const Elf_Shdr *symtab)
 {
 	pr_err("REL relocation unsupported.\n");
 	return -ENOEXEC;
@@ -532,6 +605,9 @@ static int kexec_calculate_store_digests(struct kimage *image)
 	struct kexec_sha_region *sha_regions;
 	struct purgatory_info *pi = &image->purgatory_info;
 
+	if (!IS_ENABLED(CONFIG_ARCH_HAS_KEXEC_PURGATORY))
+		return 0;
+
 	zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
 	zero_buf_sz = PAGE_SIZE;
 
@@ -633,87 +709,29 @@ out:
 	return ret;
 }
 
-/* Actually load purgatory. Lot of code taken from kexec-tools */
-static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
-				  unsigned long max, int top_down)
+#ifdef CONFIG_ARCH_HAS_KEXEC_PURGATORY
+/*
+ * kexec_purgatory_setup_kbuf - prepare buffer to load purgatory.
+ * @pi:		Purgatory to be loaded.
+ * @kbuf:	Buffer to setup.
+ *
+ * Allocates the memory needed for the buffer. Caller is responsible to free
+ * the memory after use.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+static int kexec_purgatory_setup_kbuf(struct purgatory_info *pi,
+				      struct kexec_buf *kbuf)
 {
-	struct purgatory_info *pi = &image->purgatory_info;
-	unsigned long align, bss_align, bss_sz, bss_pad;
-	unsigned long entry, load_addr, curr_load_addr, bss_addr, offset;
-	unsigned char *buf_addr, *src;
-	int i, ret = 0, entry_sidx = -1;
-	const Elf_Shdr *sechdrs_c;
-	Elf_Shdr *sechdrs = NULL;
-	struct kexec_buf kbuf = { .image = image, .bufsz = 0, .buf_align = 1,
-				  .buf_min = min, .buf_max = max,
-				  .top_down = top_down };
-
-	/*
-	 * sechdrs_c points to section headers in purgatory and are read
-	 * only. No modifications allowed.
-	 */
-	sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
-
-	/*
-	 * We can not modify sechdrs_c[] and its fields. It is read only.
-	 * Copy it over to a local copy where one can store some temporary
-	 * data and free it at the end. We need to modify ->sh_addr and
-	 * ->sh_offset fields to keep track of permanent and temporary
-	 * locations of sections.
-	 */
-	sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
-	if (!sechdrs)
-		return -ENOMEM;
-
-	memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
-
-	/*
-	 * We seem to have multiple copies of sections. First copy is which
-	 * is embedded in kernel in read only section. Some of these sections
-	 * will be copied to a temporary buffer and relocated. And these
-	 * sections will finally be copied to their final destination at
-	 * segment load time.
-	 *
-	 * Use ->sh_offset to reflect section address in memory. It will
-	 * point to original read only copy if section is not allocatable.
-	 * Otherwise it will point to temporary copy which will be relocated.
-	 *
-	 * Use ->sh_addr to contain final address of the section where it
-	 * will go during execution time.
-	 */
-	for (i = 0; i < pi->ehdr->e_shnum; i++) {
-		if (sechdrs[i].sh_type == SHT_NOBITS)
-			continue;
-
-		sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
-						sechdrs[i].sh_offset;
-	}
-
-	/*
-	 * Identify entry point section and make entry relative to section
-	 * start.
-	 */
-	entry = pi->ehdr->e_entry;
-	for (i = 0; i < pi->ehdr->e_shnum; i++) {
-		if (!(sechdrs[i].sh_flags & SHF_ALLOC))
-			continue;
-
-		if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
-			continue;
-
-		/* Make entry section relative */
-		if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
-		    ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
-		     pi->ehdr->e_entry)) {
-			entry_sidx = i;
-			entry -= sechdrs[i].sh_addr;
-			break;
-		}
-	}
+	const Elf_Shdr *sechdrs;
+	unsigned long bss_align;
+	unsigned long bss_sz;
+	unsigned long align;
+	int i, ret;
 
-	/* Determine how much memory is needed to load relocatable object. */
-	bss_align = 1;
-	bss_sz = 0;
+	sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
+	kbuf->buf_align = bss_align = 1;
+	kbuf->bufsz = bss_sz = 0;
 
 	for (i = 0; i < pi->ehdr->e_shnum; i++) {
 		if (!(sechdrs[i].sh_flags & SHF_ALLOC))
@@ -721,111 +739,124 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
 
 		align = sechdrs[i].sh_addralign;
 		if (sechdrs[i].sh_type != SHT_NOBITS) {
-			if (kbuf.buf_align < align)
-				kbuf.buf_align = align;
-			kbuf.bufsz = ALIGN(kbuf.bufsz, align);
-			kbuf.bufsz += sechdrs[i].sh_size;
+			if (kbuf->buf_align < align)
+				kbuf->buf_align = align;
+			kbuf->bufsz = ALIGN(kbuf->bufsz, align);
+			kbuf->bufsz += sechdrs[i].sh_size;
 		} else {
-			/* bss section */
 			if (bss_align < align)
 				bss_align = align;
 			bss_sz = ALIGN(bss_sz, align);
 			bss_sz += sechdrs[i].sh_size;
 		}
 	}
+	kbuf->bufsz = ALIGN(kbuf->bufsz, bss_align);
+	kbuf->memsz = kbuf->bufsz + bss_sz;
+	if (kbuf->buf_align < bss_align)
+		kbuf->buf_align = bss_align;
 
-	/* Determine the bss padding required to align bss properly */
-	bss_pad = 0;
-	if (kbuf.bufsz & (bss_align - 1))
-		bss_pad = bss_align - (kbuf.bufsz & (bss_align - 1));
-
-	kbuf.memsz = kbuf.bufsz + bss_pad + bss_sz;
+	kbuf->buffer = vzalloc(kbuf->bufsz);
+	if (!kbuf->buffer)
+		return -ENOMEM;
+	pi->purgatory_buf = kbuf->buffer;
 
-	/* Allocate buffer for purgatory */
-	kbuf.buffer = vzalloc(kbuf.bufsz);
-	if (!kbuf.buffer) {
-		ret = -ENOMEM;
+	ret = kexec_add_buffer(kbuf);
+	if (ret)
 		goto out;
-	}
 
-	if (kbuf.buf_align < bss_align)
-		kbuf.buf_align = bss_align;
+	return 0;
+out:
+	vfree(pi->purgatory_buf);
+	pi->purgatory_buf = NULL;
+	return ret;
+}
 
-	/* Add buffer to segment list */
-	ret = kexec_add_buffer(&kbuf);
-	if (ret)
-		goto out;
-	pi->purgatory_load_addr = kbuf.mem;
+/*
+ * kexec_purgatory_setup_sechdrs - prepares the pi->sechdrs buffer.
+ * @pi:		Purgatory to be loaded.
+ * @kbuf:	Buffer prepared to store purgatory.
+ *
+ * Allocates the memory needed for the buffer. Caller is responsible to free
+ * the memory after use.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi,
+					 struct kexec_buf *kbuf)
+{
+	unsigned long bss_addr;
+	unsigned long offset;
+	Elf_Shdr *sechdrs;
+	int i;
+
+	/*
+	 * The section headers in kexec_purgatory are read-only. In order to
+	 * have them modifiable make a temporary copy.
+	 */
+	sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+	if (!sechdrs)
+		return -ENOMEM;
+	memcpy(sechdrs, (void *)pi->ehdr + pi->ehdr->e_shoff,
+	       pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+	pi->sechdrs = sechdrs;
 
-	/* Load SHF_ALLOC sections */
-	buf_addr = kbuf.buffer;
-	load_addr = curr_load_addr = pi->purgatory_load_addr;
-	bss_addr = load_addr + kbuf.bufsz + bss_pad;
+	offset = 0;
+	bss_addr = kbuf->mem + kbuf->bufsz;
+	kbuf->image->start = pi->ehdr->e_entry;
 
 	for (i = 0; i < pi->ehdr->e_shnum; i++) {
+		unsigned long align;
+		void *src, *dst;
+
 		if (!(sechdrs[i].sh_flags & SHF_ALLOC))
 			continue;
 
 		align = sechdrs[i].sh_addralign;
-		if (sechdrs[i].sh_type != SHT_NOBITS) {
-			curr_load_addr = ALIGN(curr_load_addr, align);
-			offset = curr_load_addr - load_addr;
-			/* We already modifed ->sh_offset to keep src addr */
-			src = (char *) sechdrs[i].sh_offset;
-			memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
-
-			/* Store load address and source address of section */
-			sechdrs[i].sh_addr = curr_load_addr;
-
-			/*
-			 * This section got copied to temporary buffer. Update
-			 * ->sh_offset accordingly.
-			 */
-			sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
-
-			/* Advance to the next address */
-			curr_load_addr += sechdrs[i].sh_size;
-		} else {
+		if (sechdrs[i].sh_type == SHT_NOBITS) {
 			bss_addr = ALIGN(bss_addr, align);
 			sechdrs[i].sh_addr = bss_addr;
 			bss_addr += sechdrs[i].sh_size;
+			continue;
 		}
-	}
 
-	/* Update entry point based on load address of text section */
-	if (entry_sidx >= 0)
-		entry += sechdrs[entry_sidx].sh_addr;
+		offset = ALIGN(offset, align);
+		if (sechdrs[i].sh_flags & SHF_EXECINSTR &&
+		    pi->ehdr->e_entry >= sechdrs[i].sh_addr &&
+		    pi->ehdr->e_entry < (sechdrs[i].sh_addr
+					 + sechdrs[i].sh_size)) {
+			kbuf->image->start -= sechdrs[i].sh_addr;
+			kbuf->image->start += kbuf->mem + offset;
+		}
 
-	/* Make kernel jump to purgatory after shutdown */
-	image->start = entry;
+		src = (void *)pi->ehdr + sechdrs[i].sh_offset;
+		dst = pi->purgatory_buf + offset;
+		memcpy(dst, src, sechdrs[i].sh_size);
 
-	/* Used later to get/set symbol values */
-	pi->sechdrs = sechdrs;
+		sechdrs[i].sh_addr = kbuf->mem + offset;
+		sechdrs[i].sh_offset = offset;
+		offset += sechdrs[i].sh_size;
+	}
 
-	/*
-	 * Used later to identify which section is purgatory and skip it
-	 * from checksumming.
-	 */
-	pi->purgatory_buf = kbuf.buffer;
-	return ret;
-out:
-	vfree(sechdrs);
-	vfree(kbuf.buffer);
-	return ret;
+	return 0;
 }
 
 static int kexec_apply_relocations(struct kimage *image)
 {
 	int i, ret;
 	struct purgatory_info *pi = &image->purgatory_info;
-	Elf_Shdr *sechdrs = pi->sechdrs;
+	const Elf_Shdr *sechdrs;
+
+	sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
 
-	/* Apply relocations */
 	for (i = 0; i < pi->ehdr->e_shnum; i++) {
-		Elf_Shdr *section, *symtab;
+		const Elf_Shdr *relsec;
+		const Elf_Shdr *symtab;
+		Elf_Shdr *section;
+
+		relsec = sechdrs + i;
 
-		if (sechdrs[i].sh_type != SHT_RELA &&
-		    sechdrs[i].sh_type != SHT_REL)
+		if (relsec->sh_type != SHT_RELA &&
+		    relsec->sh_type != SHT_REL)
 			continue;
 
 		/*
@@ -834,12 +865,12 @@ static int kexec_apply_relocations(struct kimage *image)
 		 * symbol table. And ->sh_info contains section header
 		 * index of section to which relocations apply.
 		 */
-		if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
-		    sechdrs[i].sh_link >= pi->ehdr->e_shnum)
+		if (relsec->sh_info >= pi->ehdr->e_shnum ||
+		    relsec->sh_link >= pi->ehdr->e_shnum)
 			return -ENOEXEC;
 
-		section = &sechdrs[sechdrs[i].sh_info];
-		symtab = &sechdrs[sechdrs[i].sh_link];
+		section = pi->sechdrs + relsec->sh_info;
+		symtab = sechdrs + relsec->sh_link;
 
 		if (!(section->sh_flags & SHF_ALLOC))
 			continue;
@@ -856,12 +887,12 @@ static int kexec_apply_relocations(struct kimage *image)
 		 * Respective architecture needs to provide support for applying
 		 * relocations of type SHT_RELA/SHT_REL.
 		 */
-		if (sechdrs[i].sh_type == SHT_RELA)
-			ret = arch_kexec_apply_relocations_add(pi->ehdr,
-							       sechdrs, i);
-		else if (sechdrs[i].sh_type == SHT_REL)
-			ret = arch_kexec_apply_relocations(pi->ehdr,
-							   sechdrs, i);
+		if (relsec->sh_type == SHT_RELA)
+			ret = arch_kexec_apply_relocations_add(pi, section,
+							       relsec, symtab);
+		else if (relsec->sh_type == SHT_REL)
+			ret = arch_kexec_apply_relocations(pi, section,
+							   relsec, symtab);
 		if (ret)
 			return ret;
 	}
@@ -869,10 +900,18 @@ static int kexec_apply_relocations(struct kimage *image)
 	return 0;
 }
 
-/* Load relocatable purgatory object and relocate it appropriately */
-int kexec_load_purgatory(struct kimage *image, unsigned long min,
-			 unsigned long max, int top_down,
-			 unsigned long *load_addr)
+/*
+ * kexec_load_purgatory - Load and relocate the purgatory object.
+ * @image:	Image to add the purgatory to.
+ * @kbuf:	Memory parameters to use.
+ *
+ * Allocates the memory needed for image->purgatory_info.sechdrs and
+ * image->purgatory_info.purgatory_buf/kbuf->buffer. Caller is responsible
+ * to free the memory after use.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int kexec_load_purgatory(struct kimage *image, struct kexec_buf *kbuf)
 {
 	struct purgatory_info *pi = &image->purgatory_info;
 	int ret;
@@ -880,55 +919,51 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min,
 	if (kexec_purgatory_size <= 0)
 		return -EINVAL;
 
-	if (kexec_purgatory_size < sizeof(Elf_Ehdr))
-		return -ENOEXEC;
-
-	pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
-
-	if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
-	    || pi->ehdr->e_type != ET_REL
-	    || !elf_check_arch(pi->ehdr)
-	    || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
-		return -ENOEXEC;
-
-	if (pi->ehdr->e_shoff >= kexec_purgatory_size
-	    || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
-	    kexec_purgatory_size - pi->ehdr->e_shoff))
-		return -ENOEXEC;
+	pi->ehdr = (const Elf_Ehdr *)kexec_purgatory;
 
-	ret = __kexec_load_purgatory(image, min, max, top_down);
+	ret = kexec_purgatory_setup_kbuf(pi, kbuf);
 	if (ret)
 		return ret;
 
+	ret = kexec_purgatory_setup_sechdrs(pi, kbuf);
+	if (ret)
+		goto out_free_kbuf;
+
 	ret = kexec_apply_relocations(image);
 	if (ret)
 		goto out;
 
-	*load_addr = pi->purgatory_load_addr;
 	return 0;
 out:
 	vfree(pi->sechdrs);
 	pi->sechdrs = NULL;
-
+out_free_kbuf:
 	vfree(pi->purgatory_buf);
 	pi->purgatory_buf = NULL;
 	return ret;
 }
 
-static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
-					    const char *name)
+/*
+ * kexec_purgatory_find_symbol - find a symbol in the purgatory
+ * @pi:		Purgatory to search in.
+ * @name:	Name of the symbol.
+ *
+ * Return: pointer to symbol in read-only symtab on success, NULL on error.
+ */
+static const Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
+						  const char *name)
 {
-	Elf_Sym *syms;
-	Elf_Shdr *sechdrs;
-	Elf_Ehdr *ehdr;
-	int i, k;
+	const Elf_Shdr *sechdrs;
+	const Elf_Ehdr *ehdr;
+	const Elf_Sym *syms;
 	const char *strtab;
+	int i, k;
 
-	if (!pi->sechdrs || !pi->ehdr)
+	if (!pi->ehdr)
 		return NULL;
 
-	sechdrs = pi->sechdrs;
 	ehdr = pi->ehdr;
+	sechdrs = (void *)ehdr + ehdr->e_shoff;
 
 	for (i = 0; i < ehdr->e_shnum; i++) {
 		if (sechdrs[i].sh_type != SHT_SYMTAB)
@@ -937,8 +972,8 @@ static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
 		if (sechdrs[i].sh_link >= ehdr->e_shnum)
 			/* Invalid strtab section number */
 			continue;
-		strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
-		syms = (Elf_Sym *)sechdrs[i].sh_offset;
+		strtab = (void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset;
+		syms = (void *)ehdr + sechdrs[i].sh_offset;
 
 		/* Go through symbols for a match */
 		for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
@@ -966,7 +1001,7 @@ static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
 void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
 {
 	struct purgatory_info *pi = &image->purgatory_info;
-	Elf_Sym *sym;
+	const Elf_Sym *sym;
 	Elf_Shdr *sechdr;
 
 	sym = kexec_purgatory_find_symbol(pi, name);
@@ -989,9 +1024,9 @@ void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
 int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
 				   void *buf, unsigned int size, bool get_value)
 {
-	Elf_Sym *sym;
-	Elf_Shdr *sechdrs;
 	struct purgatory_info *pi = &image->purgatory_info;
+	const Elf_Sym *sym;
+	Elf_Shdr *sec;
 	char *sym_buf;
 
 	sym = kexec_purgatory_find_symbol(pi, name);
@@ -1004,16 +1039,15 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
 		return -EINVAL;
 	}
 
-	sechdrs = pi->sechdrs;
+	sec = pi->sechdrs + sym->st_shndx;
 
-	if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+	if (sec->sh_type == SHT_NOBITS) {
 		pr_err("symbol %s is in a bss section. Cannot %s\n", name,
 		       get_value ? "get" : "set");
 		return -EINVAL;
 	}
 
-	sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
-					sym->st_value;
+	sym_buf = (char *)pi->purgatory_buf + sec->sh_offset + sym->st_value;
 
 	if (get_value)
 		memcpy((void *)buf, sym_buf, size);
@@ -1022,3 +1056,174 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
 
 	return 0;
 }
+#endif /* CONFIG_ARCH_HAS_KEXEC_PURGATORY */
+
+int crash_exclude_mem_range(struct crash_mem *mem,
+			    unsigned long long mstart, unsigned long long mend)
+{
+	int i, j;
+	unsigned long long start, end;
+	struct crash_mem_range temp_range = {0, 0};
+
+	for (i = 0; i < mem->nr_ranges; i++) {
+		start = mem->ranges[i].start;
+		end = mem->ranges[i].end;
+
+		if (mstart > end || mend < start)
+			continue;
+
+		/* Truncate any area outside of range */
+		if (mstart < start)
+			mstart = start;
+		if (mend > end)
+			mend = end;
+
+		/* Found completely overlapping range */
+		if (mstart == start && mend == end) {
+			mem->ranges[i].start = 0;
+			mem->ranges[i].end = 0;
+			if (i < mem->nr_ranges - 1) {
+				/* Shift rest of the ranges to left */
+				for (j = i; j < mem->nr_ranges - 1; j++) {
+					mem->ranges[j].start =
+						mem->ranges[j+1].start;
+					mem->ranges[j].end =
+							mem->ranges[j+1].end;
+				}
+			}
+			mem->nr_ranges--;
+			return 0;
+		}
+
+		if (mstart > start && mend < end) {
+			/* Split original range */
+			mem->ranges[i].end = mstart - 1;
+			temp_range.start = mend + 1;
+			temp_range.end = end;
+		} else if (mstart != start)
+			mem->ranges[i].end = mstart - 1;
+		else
+			mem->ranges[i].start = mend + 1;
+		break;
+	}
+
+	/* If a split happened, add the split to array */
+	if (!temp_range.end)
+		return 0;
+
+	/* Split happened */
+	if (i == mem->max_nr_ranges - 1)
+		return -ENOMEM;
+
+	/* Location where new range should go */
+	j = i + 1;
+	if (j < mem->nr_ranges) {
+		/* Move over all ranges one slot towards the end */
+		for (i = mem->nr_ranges - 1; i >= j; i--)
+			mem->ranges[i + 1] = mem->ranges[i];
+	}
+
+	mem->ranges[j].start = temp_range.start;
+	mem->ranges[j].end = temp_range.end;
+	mem->nr_ranges++;
+	return 0;
+}
+
+int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map,
+			  void **addr, unsigned long *sz)
+{
+	Elf64_Ehdr *ehdr;
+	Elf64_Phdr *phdr;
+	unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
+	unsigned char *buf;
+	unsigned int cpu, i;
+	unsigned long long notes_addr;
+	unsigned long mstart, mend;
+
+	/* extra phdr for vmcoreinfo elf note */
+	nr_phdr = nr_cpus + 1;
+	nr_phdr += mem->nr_ranges;
+
+	/*
+	 * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
+	 * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64).
+	 * I think this is required by tools like gdb. So same physical
+	 * memory will be mapped in two elf headers. One will contain kernel
+	 * text virtual addresses and other will have __va(physical) addresses.
+	 */
+
+	nr_phdr++;
+	elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
+	elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);
+
+	buf = vzalloc(elf_sz);
+	if (!buf)
+		return -ENOMEM;
+
+	ehdr = (Elf64_Ehdr *)buf;
+	phdr = (Elf64_Phdr *)(ehdr + 1);
+	memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
+	ehdr->e_ident[EI_CLASS] = ELFCLASS64;
+	ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
+	ehdr->e_ident[EI_VERSION] = EV_CURRENT;
+	ehdr->e_ident[EI_OSABI] = ELF_OSABI;
+	memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
+	ehdr->e_type = ET_CORE;
+	ehdr->e_machine = ELF_ARCH;
+	ehdr->e_version = EV_CURRENT;
+	ehdr->e_phoff = sizeof(Elf64_Ehdr);
+	ehdr->e_ehsize = sizeof(Elf64_Ehdr);
+	ehdr->e_phentsize = sizeof(Elf64_Phdr);
+
+	/* Prepare one phdr of type PT_NOTE for each present cpu */
+	for_each_present_cpu(cpu) {
+		phdr->p_type = PT_NOTE;
+		notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
+		phdr->p_offset = phdr->p_paddr = notes_addr;
+		phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
+		(ehdr->e_phnum)++;
+		phdr++;
+	}
+
+	/* Prepare one PT_NOTE header for vmcoreinfo */
+	phdr->p_type = PT_NOTE;
+	phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
+	phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
+	(ehdr->e_phnum)++;
+	phdr++;
+
+	/* Prepare PT_LOAD type program header for kernel text region */
+	if (kernel_map) {
+		phdr->p_type = PT_LOAD;
+		phdr->p_flags = PF_R|PF_W|PF_X;
+		phdr->p_vaddr = (Elf64_Addr)_text;
+		phdr->p_filesz = phdr->p_memsz = _end - _text;
+		phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
+		ehdr->e_phnum++;
+		phdr++;
+	}
+
+	/* Go through all the ranges in mem->ranges[] and prepare phdr */
+	for (i = 0; i < mem->nr_ranges; i++) {
+		mstart = mem->ranges[i].start;
+		mend = mem->ranges[i].end;
+
+		phdr->p_type = PT_LOAD;
+		phdr->p_flags = PF_R|PF_W|PF_X;
+		phdr->p_offset  = mstart;
+
+		phdr->p_paddr = mstart;
+		phdr->p_vaddr = (unsigned long long) __va(mstart);
+		phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
+		phdr->p_align = 0;
+		ehdr->e_phnum++;
+		phdr++;
+		pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+			phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
+			ehdr->e_phnum, phdr->p_offset);
+	}
+
+	*addr = buf;
+	*sz = elf_sz;
+	return 0;
+}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 102160ff5c66..ea619021d901 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2428,7 +2428,7 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v)
 	struct kprobe_blacklist_entry *ent =
 		list_entry(v, struct kprobe_blacklist_entry, list);
 
-	seq_printf(m, "0x%p-0x%p\t%ps\n", (void *)ent->start_addr,
+	seq_printf(m, "0x%px-0x%px\t%ps\n", (void *)ent->start_addr,
 		   (void *)ent->end_addr, (void *)ent->start_addr);
 	return 0;
 }
diff --git a/kernel/kthread.c b/kernel/kthread.c
index cd50e99202b0..481951bf091d 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -55,7 +55,6 @@ enum KTHREAD_BITS {
 	KTHREAD_IS_PER_CPU = 0,
 	KTHREAD_SHOULD_STOP,
 	KTHREAD_SHOULD_PARK,
-	KTHREAD_IS_PARKED,
 };
 
 static inline void set_kthread_struct(void *kthread)
@@ -177,14 +176,12 @@ void *kthread_probe_data(struct task_struct *task)
 
 static void __kthread_parkme(struct kthread *self)
 {
-	__set_current_state(TASK_PARKED);
-	while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
-		if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
-			complete(&self->parked);
+	for (;;) {
+		set_current_state(TASK_PARKED);
+		if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
+			break;
 		schedule();
-		__set_current_state(TASK_PARKED);
 	}
-	clear_bit(KTHREAD_IS_PARKED, &self->flags);
 	__set_current_state(TASK_RUNNING);
 }
 
@@ -194,6 +191,11 @@ void kthread_parkme(void)
 }
 EXPORT_SYMBOL_GPL(kthread_parkme);
 
+void kthread_park_complete(struct task_struct *k)
+{
+	complete_all(&to_kthread(k)->parked);
+}
+
 static int kthread(void *_create)
 {
 	/* Copy data: it's on kthread's stack */
@@ -450,22 +452,16 @@ void kthread_unpark(struct task_struct *k)
 {
 	struct kthread *kthread = to_kthread(k);
 
-	clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
 	/*
-	 * We clear the IS_PARKED bit here as we don't wait
-	 * until the task has left the park code. So if we'd
-	 * park before that happens we'd see the IS_PARKED bit
-	 * which might be about to be cleared.
+	 * Newly created kthread was parked when the CPU was offline.
+	 * The binding was lost and we need to set it again.
 	 */
-	if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
-		/*
-		 * Newly created kthread was parked when the CPU was offline.
-		 * The binding was lost and we need to set it again.
-		 */
-		if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
-			__kthread_bind(k, kthread->cpu, TASK_PARKED);
-		wake_up_state(k, TASK_PARKED);
-	}
+	if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
+		__kthread_bind(k, kthread->cpu, TASK_PARKED);
+
+	reinit_completion(&kthread->parked);
+	clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+	wake_up_state(k, TASK_PARKED);
 }
 EXPORT_SYMBOL_GPL(kthread_unpark);
 
@@ -488,12 +484,10 @@ int kthread_park(struct task_struct *k)
 	if (WARN_ON(k->flags & PF_EXITING))
 		return -ENOSYS;
 
-	if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
-		set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
-		if (k != current) {
-			wake_up_process(k);
-			wait_for_completion(&kthread->parked);
-		}
+	set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+	if (k != current) {
+		wake_up_process(k);
+		wait_for_completion(&kthread->parked);
 	}
 
 	return 0;
diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c
index fdac27588d60..83958c814439 100644
--- a/kernel/livepatch/shadow.c
+++ b/kernel/livepatch/shadow.c
@@ -113,8 +113,10 @@ void *klp_shadow_get(void *obj, unsigned long id)
 }
 EXPORT_SYMBOL_GPL(klp_shadow_get);
 
-static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
-		       size_t size, gfp_t gfp_flags, bool warn_on_exist)
+static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id,
+				       size_t size, gfp_t gfp_flags,
+				       klp_shadow_ctor_t ctor, void *ctor_data,
+				       bool warn_on_exist)
 {
 	struct klp_shadow *new_shadow;
 	void *shadow_data;
@@ -125,18 +127,15 @@ static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
 	if (shadow_data)
 		goto exists;
 
-	/* Allocate a new shadow variable for use inside the lock below */
+	/*
+	 * Allocate a new shadow variable.  Fill it with zeroes by default.
+	 * More complex setting can be done by @ctor function.  But it is
+	 * called only when the buffer is really used (under klp_shadow_lock).
+	 */
 	new_shadow = kzalloc(size + sizeof(*new_shadow), gfp_flags);
 	if (!new_shadow)
 		return NULL;
 
-	new_shadow->obj = obj;
-	new_shadow->id = id;
-
-	/* Initialize the shadow variable if data provided */
-	if (data)
-		memcpy(new_shadow->data, data, size);
-
 	/* Look for <obj, id> again under the lock */
 	spin_lock_irqsave(&klp_shadow_lock, flags);
 	shadow_data = klp_shadow_get(obj, id);
@@ -150,6 +149,22 @@ static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
 		goto exists;
 	}
 
+	new_shadow->obj = obj;
+	new_shadow->id = id;
+
+	if (ctor) {
+		int err;
+
+		err = ctor(obj, new_shadow->data, ctor_data);
+		if (err) {
+			spin_unlock_irqrestore(&klp_shadow_lock, flags);
+			kfree(new_shadow);
+			pr_err("Failed to construct shadow variable <%p, %lx> (%d)\n",
+			       obj, id, err);
+			return NULL;
+		}
+	}
+
 	/* No <obj, id> found, so attach the newly allocated one */
 	hash_add_rcu(klp_shadow_hash, &new_shadow->node,
 		     (unsigned long)new_shadow->obj);
@@ -170,26 +185,32 @@ exists:
  * klp_shadow_alloc() - allocate and add a new shadow variable
  * @obj:	pointer to parent object
  * @id:		data identifier
- * @data:	pointer to data to attach to parent
  * @size:	size of attached data
  * @gfp_flags:	GFP mask for allocation
+ * @ctor:	custom constructor to initialize the shadow data (optional)
+ * @ctor_data:	pointer to any data needed by @ctor (optional)
+ *
+ * Allocates @size bytes for new shadow variable data using @gfp_flags.
+ * The data are zeroed by default.  They are further initialized by @ctor
+ * function if it is not NULL.  The new shadow variable is then added
+ * to the global hashtable.
  *
- * Allocates @size bytes for new shadow variable data using @gfp_flags
- * and copies @size bytes from @data into the new shadow variable's own
- * data space.  If @data is NULL, @size bytes are still allocated, but
- * no copy is performed.  The new shadow variable is then added to the
- * global hashtable.
+ * If an existing <obj, id> shadow variable can be found, this routine will
+ * issue a WARN, exit early and return NULL.
  *
- * If an existing <obj, id> shadow variable can be found, this routine
- * will issue a WARN, exit early and return NULL.
+ * This function guarantees that the constructor function is called only when
+ * the variable did not exist before.  The cost is that @ctor is called
+ * in atomic context under a spin lock.
  *
  * Return: the shadow variable data element, NULL on duplicate or
  * failure.
  */
-void *klp_shadow_alloc(void *obj, unsigned long id, void *data,
-		       size_t size, gfp_t gfp_flags)
+void *klp_shadow_alloc(void *obj, unsigned long id,
+		       size_t size, gfp_t gfp_flags,
+		       klp_shadow_ctor_t ctor, void *ctor_data)
 {
-	return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, true);
+	return __klp_shadow_get_or_alloc(obj, id, size, gfp_flags,
+					 ctor, ctor_data, true);
 }
 EXPORT_SYMBOL_GPL(klp_shadow_alloc);
 
@@ -197,37 +218,51 @@ EXPORT_SYMBOL_GPL(klp_shadow_alloc);
  * klp_shadow_get_or_alloc() - get existing or allocate a new shadow variable
  * @obj:	pointer to parent object
  * @id:		data identifier
- * @data:	pointer to data to attach to parent
  * @size:	size of attached data
  * @gfp_flags:	GFP mask for allocation
+ * @ctor:	custom constructor to initialize the shadow data (optional)
+ * @ctor_data:	pointer to any data needed by @ctor (optional)
  *
  * Returns a pointer to existing shadow data if an <obj, id> shadow
  * variable is already present.  Otherwise, it creates a new shadow
  * variable like klp_shadow_alloc().
  *
- * This function guarantees that only one shadow variable exists with
- * the given @id for the given @obj.  It also guarantees that the shadow
- * variable will be initialized by the given @data only when it did not
- * exist before.
+ * This function guarantees that only one shadow variable exists with the given
+ * @id for the given @obj.  It also guarantees that the constructor function
+ * will be called only when the variable did not exist before.  The cost is
+ * that @ctor is called in atomic context under a spin lock.
  *
  * Return: the shadow variable data element, NULL on failure.
  */
-void *klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
-			       size_t size, gfp_t gfp_flags)
+void *klp_shadow_get_or_alloc(void *obj, unsigned long id,
+			      size_t size, gfp_t gfp_flags,
+			      klp_shadow_ctor_t ctor, void *ctor_data)
 {
-	return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, false);
+	return __klp_shadow_get_or_alloc(obj, id, size, gfp_flags,
+					 ctor, ctor_data, false);
 }
 EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc);
 
+static void klp_shadow_free_struct(struct klp_shadow *shadow,
+				   klp_shadow_dtor_t dtor)
+{
+	hash_del_rcu(&shadow->node);
+	if (dtor)
+		dtor(shadow->obj, shadow->data);
+	kfree_rcu(shadow, rcu_head);
+}
+
 /**
  * klp_shadow_free() - detach and free a <obj, id> shadow variable
  * @obj:	pointer to parent object
  * @id:		data identifier
+ * @dtor:	custom callback that can be used to unregister the variable
+ *		and/or free data that the shadow variable points to (optional)
  *
  * This function releases the memory for this <obj, id> shadow variable
  * instance, callers should stop referencing it accordingly.
  */
-void klp_shadow_free(void *obj, unsigned long id)
+void klp_shadow_free(void *obj, unsigned long id, klp_shadow_dtor_t dtor)
 {
 	struct klp_shadow *shadow;
 	unsigned long flags;
@@ -239,8 +274,7 @@ void klp_shadow_free(void *obj, unsigned long id)
 			       (unsigned long)obj) {
 
 		if (klp_shadow_match(shadow, obj, id)) {
-			hash_del_rcu(&shadow->node);
-			kfree_rcu(shadow, rcu_head);
+			klp_shadow_free_struct(shadow, dtor);
 			break;
 		}
 	}
@@ -252,11 +286,13 @@ EXPORT_SYMBOL_GPL(klp_shadow_free);
 /**
  * klp_shadow_free_all() - detach and free all <*, id> shadow variables
  * @id:		data identifier
+ * @dtor:	custom callback that can be used to unregister the variable
+ *		and/or free data that the shadow variable points to (optional)
  *
  * This function releases the memory for all <*, id> shadow variable
  * instances, callers should stop referencing them accordingly.
  */
-void klp_shadow_free_all(unsigned long id)
+void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor)
 {
 	struct klp_shadow *shadow;
 	unsigned long flags;
@@ -266,10 +302,8 @@ void klp_shadow_free_all(unsigned long id)
 
 	/* Delete all <*, id> from hash */
 	hash_for_each(klp_shadow_hash, i, shadow, node) {
-		if (klp_shadow_match(shadow, shadow->obj, id)) {
-			hash_del_rcu(&shadow->node);
-			kfree_rcu(shadow, rcu_head);
-		}
+		if (klp_shadow_match(shadow, shadow->obj, id))
+			klp_shadow_free_struct(shadow, dtor);
 	}
 
 	spin_unlock_irqrestore(&klp_shadow_lock, flags);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 023386338269..edcac5de7ebc 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -561,20 +561,24 @@ static void print_lock(struct held_lock *hlock)
 	printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
 }
 
-static void lockdep_print_held_locks(struct task_struct *curr)
+static void lockdep_print_held_locks(struct task_struct *p)
 {
-	int i, depth = curr->lockdep_depth;
+	int i, depth = READ_ONCE(p->lockdep_depth);
 
-	if (!depth) {
-		printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr));
+	if (!depth)
+		printk("no locks held by %s/%d.\n", p->comm, task_pid_nr(p));
+	else
+		printk("%d lock%s held by %s/%d:\n", depth,
+		       depth > 1 ? "s" : "", p->comm, task_pid_nr(p));
+	/*
+	 * It's not reliable to print a task's held locks if it's not sleeping
+	 * and it's not the current task.
+	 */
+	if (p->state == TASK_RUNNING && p != current)
 		return;
-	}
-	printk("%d lock%s held by %s/%d:\n",
-		depth, depth > 1 ? "s" : "", curr->comm, task_pid_nr(curr));
-
 	for (i = 0; i < depth; i++) {
 		printk(" #%d: ", i);
-		print_lock(curr->held_locks + i);
+		print_lock(p->held_locks + i);
 	}
 }
 
@@ -4451,8 +4455,6 @@ EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
 void debug_show_all_locks(void)
 {
 	struct task_struct *g, *p;
-	int count = 10;
-	int unlock = 1;
 
 	if (unlikely(!debug_locks)) {
 		pr_warn("INFO: lockdep is turned off.\n");
@@ -4460,50 +4462,18 @@ void debug_show_all_locks(void)
 	}
 	pr_warn("\nShowing all locks held in the system:\n");
 
-	/*
-	 * Here we try to get the tasklist_lock as hard as possible,
-	 * if not successful after 2 seconds we ignore it (but keep
-	 * trying). This is to enable a debug printout even if a
-	 * tasklist_lock-holding task deadlocks or crashes.
-	 */
-retry:
-	if (!read_trylock(&tasklist_lock)) {
-		if (count == 10)
-			pr_warn("hm, tasklist_lock locked, retrying... ");
-		if (count) {
-			count--;
-			pr_cont(" #%d", 10-count);
-			mdelay(200);
-			goto retry;
-		}
-		pr_cont(" ignoring it.\n");
-		unlock = 0;
-	} else {
-		if (count != 10)
-			pr_cont(" locked it.\n");
-	}
-
-	do_each_thread(g, p) {
-		/*
-		 * It's not reliable to print a task's held locks
-		 * if it's not sleeping (or if it's not the current
-		 * task):
-		 */
-		if (p->state == TASK_RUNNING && p != current)
+	rcu_read_lock();
+	for_each_process_thread(g, p) {
+		if (!p->lockdep_depth)
 			continue;
-		if (p->lockdep_depth)
-			lockdep_print_held_locks(p);
-		if (!unlock)
-			if (read_trylock(&tasklist_lock))
-				unlock = 1;
+		lockdep_print_held_locks(p);
 		touch_nmi_watchdog();
-	} while_each_thread(g, p);
+		touch_all_softlockup_watchdogs();
+	}
+	rcu_read_unlock();
 
 	pr_warn("\n");
 	pr_warn("=============================================\n\n");
-
-	if (unlock)
-		read_unlock(&tasklist_lock);
 }
 EXPORT_SYMBOL_GPL(debug_show_all_locks);
 #endif
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index ad69bbc9bd28..3dd980dfba2d 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -101,18 +101,6 @@ static const struct seq_operations lockdep_ops = {
 	.show	= l_show,
 };
 
-static int lockdep_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &lockdep_ops);
-}
-
-static const struct file_operations proc_lockdep_operations = {
-	.open		= lockdep_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 #ifdef CONFIG_PROVE_LOCKING
 static void *lc_start(struct seq_file *m, loff_t *pos)
 {
@@ -170,18 +158,6 @@ static const struct seq_operations lockdep_chains_ops = {
 	.stop	= lc_stop,
 	.show	= lc_show,
 };
-
-static int lockdep_chains_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &lockdep_chains_ops);
-}
-
-static const struct file_operations proc_lockdep_chains_operations = {
-	.open		= lockdep_chains_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
 #endif /* CONFIG_PROVE_LOCKING */
 
 static void lockdep_stats_debug_show(struct seq_file *m)
@@ -355,18 +331,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int lockdep_stats_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, lockdep_stats_show, NULL);
-}
-
-static const struct file_operations proc_lockdep_stats_operations = {
-	.open		= lockdep_stats_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 #ifdef CONFIG_LOCK_STAT
 
 struct lock_stat_data {
@@ -682,14 +646,11 @@ static const struct file_operations proc_lock_stat_operations = {
 
 static int __init lockdep_proc_init(void)
 {
-	proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations);
+	proc_create_seq("lockdep", S_IRUSR, NULL, &lockdep_ops);
 #ifdef CONFIG_PROVE_LOCKING
-	proc_create("lockdep_chains", S_IRUSR, NULL,
-		    &proc_lockdep_chains_operations);
+	proc_create_seq("lockdep_chains", S_IRUSR, NULL, &lockdep_chains_ops);
 #endif
-	proc_create("lockdep_stats", S_IRUSR, NULL,
-		    &proc_lockdep_stats_operations);
-
+	proc_create_single("lockdep_stats", S_IRUSR, NULL, lockdep_stats_show);
 #ifdef CONFIG_LOCK_STAT
 	proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL,
 		    &proc_lock_stat_operations);
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index f046b7ce9dd6..5e10153b4d3c 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -23,13 +23,15 @@ struct mcs_spinlock {
 
 #ifndef arch_mcs_spin_lock_contended
 /*
- * Using smp_load_acquire() provides a memory barrier that ensures
- * subsequent operations happen after the lock is acquired.
+ * Using smp_cond_load_acquire() provides the acquire semantics
+ * required so that subsequent operations happen after the
+ * lock is acquired. Additionally, some architectures such as
+ * ARM64 would like to do spin-waiting instead of purely
+ * spinning, and smp_cond_load_acquire() provides that behavior.
  */
 #define arch_mcs_spin_lock_contended(l)					\
 do {									\
-	while (!(smp_load_acquire(l)))					\
-		cpu_relax();						\
+	smp_cond_load_acquire(l, VAL);					\
 } while (0)
 #endif
 
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 2048359f33d2..f44f658ae629 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -139,8 +139,9 @@ static inline bool __mutex_trylock(struct mutex *lock)
 static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
 {
 	unsigned long curr = (unsigned long)current;
+	unsigned long zero = 0UL;
 
-	if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr))
+	if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr))
 		return true;
 
 	return false;
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index d880296245c5..bfaeb05123ff 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -12,11 +12,11 @@
  * GNU General Public License for more details.
  *
  * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
- * (C) Copyright 2013-2014 Red Hat, Inc.
+ * (C) Copyright 2013-2014,2018 Red Hat, Inc.
  * (C) Copyright 2015 Intel Corp.
  * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
  *
- * Authors: Waiman Long <waiman.long@hpe.com>
+ * Authors: Waiman Long <longman@redhat.com>
  *          Peter Zijlstra <peterz@infradead.org>
  */
 
@@ -33,6 +33,11 @@
 #include <asm/qspinlock.h>
 
 /*
+ * Include queued spinlock statistics code
+ */
+#include "qspinlock_stat.h"
+
+/*
  * The basic principle of a queue-based spinlock can best be understood
  * by studying a classic queue-based spinlock implementation called the
  * MCS lock. The paper below provides a good description for this kind
@@ -77,6 +82,18 @@
 #endif
 
 /*
+ * The pending bit spinning loop count.
+ * This heuristic is used to limit the number of lockword accesses
+ * made by atomic_cond_read_relaxed when waiting for the lock to
+ * transition out of the "== _Q_PENDING_VAL" state. We don't spin
+ * indefinitely because there's no guarantee that we'll make forward
+ * progress.
+ */
+#ifndef _Q_PENDING_LOOPS
+#define _Q_PENDING_LOOPS	1
+#endif
+
+/*
  * Per-CPU queue node structures; we can never have more than 4 nested
  * contexts: task, softirq, hardirq, nmi.
  *
@@ -114,41 +131,18 @@ static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
 
 #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
 
-/*
- * By using the whole 2nd least significant byte for the pending bit, we
- * can allow better optimization of the lock acquisition for the pending
- * bit holder.
+#if _Q_PENDING_BITS == 8
+/**
+ * clear_pending - clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
  *
- * This internal structure is also used by the set_locked function which
- * is not restricted to _Q_PENDING_BITS == 8.
+ * *,1,* -> *,0,*
  */
-struct __qspinlock {
-	union {
-		atomic_t val;
-#ifdef __LITTLE_ENDIAN
-		struct {
-			u8	locked;
-			u8	pending;
-		};
-		struct {
-			u16	locked_pending;
-			u16	tail;
-		};
-#else
-		struct {
-			u16	tail;
-			u16	locked_pending;
-		};
-		struct {
-			u8	reserved[2];
-			u8	pending;
-			u8	locked;
-		};
-#endif
-	};
-};
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+	WRITE_ONCE(lock->pending, 0);
+}
 
-#if _Q_PENDING_BITS == 8
 /**
  * clear_pending_set_locked - take ownership and clear the pending bit.
  * @lock: Pointer to queued spinlock structure
@@ -159,9 +153,7 @@ struct __qspinlock {
  */
 static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
 {
-	struct __qspinlock *l = (void *)lock;
-
-	WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL);
+	WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
 }
 
 /*
@@ -176,19 +168,28 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
  */
 static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
 {
-	struct __qspinlock *l = (void *)lock;
-
 	/*
-	 * Use release semantics to make sure that the MCS node is properly
-	 * initialized before changing the tail code.
+	 * We can use relaxed semantics since the caller ensures that the
+	 * MCS node is properly initialized before updating the tail.
 	 */
-	return (u32)xchg_release(&l->tail,
+	return (u32)xchg_relaxed(&lock->tail,
 				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
 }
 
 #else /* _Q_PENDING_BITS == 8 */
 
 /**
+ * clear_pending - clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,* -> *,0,*
+ */
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+	atomic_andnot(_Q_PENDING_VAL, &lock->val);
+}
+
+/**
  * clear_pending_set_locked - take ownership and clear the pending bit.
  * @lock: Pointer to queued spinlock structure
  *
@@ -216,10 +217,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
 	for (;;) {
 		new = (val & _Q_LOCKED_PENDING_MASK) | tail;
 		/*
-		 * Use release semantics to make sure that the MCS node is
-		 * properly initialized before changing the tail code.
+		 * We can use relaxed semantics since the caller ensures that
+		 * the MCS node is properly initialized before updating the
+		 * tail.
 		 */
-		old = atomic_cmpxchg_release(&lock->val, val, new);
+		old = atomic_cmpxchg_relaxed(&lock->val, val, new);
 		if (old == val)
 			break;
 
@@ -237,9 +239,7 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
  */
 static __always_inline void set_locked(struct qspinlock *lock)
 {
-	struct __qspinlock *l = (void *)lock;
-
-	WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
+	WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
 }
 
 
@@ -294,86 +294,83 @@ static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
 void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 {
 	struct mcs_spinlock *prev, *next, *node;
-	u32 new, old, tail;
+	u32 old, tail;
 	int idx;
 
 	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
 
 	if (pv_enabled())
-		goto queue;
+		goto pv_queue;
 
 	if (virt_spin_lock(lock))
 		return;
 
 	/*
-	 * wait for in-progress pending->locked hand-overs
+	 * Wait for in-progress pending->locked hand-overs with a bounded
+	 * number of spins so that we guarantee forward progress.
 	 *
 	 * 0,1,0 -> 0,0,1
 	 */
 	if (val == _Q_PENDING_VAL) {
-		while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL)
-			cpu_relax();
+		int cnt = _Q_PENDING_LOOPS;
+		val = atomic_cond_read_relaxed(&lock->val,
+					       (VAL != _Q_PENDING_VAL) || !cnt--);
 	}
 
 	/*
+	 * If we observe any contention; queue.
+	 */
+	if (val & ~_Q_LOCKED_MASK)
+		goto queue;
+
+	/*
 	 * trylock || pending
 	 *
 	 * 0,0,0 -> 0,0,1 ; trylock
 	 * 0,0,1 -> 0,1,1 ; pending
 	 */
-	for (;;) {
+	val = atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
+	if (!(val & ~_Q_LOCKED_MASK)) {
 		/*
-		 * If we observe any contention; queue.
+		 * We're pending, wait for the owner to go away.
+		 *
+		 * *,1,1 -> *,1,0
+		 *
+		 * this wait loop must be a load-acquire such that we match the
+		 * store-release that clears the locked bit and create lock
+		 * sequentiality; this is because not all
+		 * clear_pending_set_locked() implementations imply full
+		 * barriers.
 		 */
-		if (val & ~_Q_LOCKED_MASK)
-			goto queue;
-
-		new = _Q_LOCKED_VAL;
-		if (val == new)
-			new |= _Q_PENDING_VAL;
+		if (val & _Q_LOCKED_MASK) {
+			atomic_cond_read_acquire(&lock->val,
+						 !(VAL & _Q_LOCKED_MASK));
+		}
 
 		/*
-		 * Acquire semantic is required here as the function may
-		 * return immediately if the lock was free.
+		 * take ownership and clear the pending bit.
+		 *
+		 * *,1,0 -> *,0,1
 		 */
-		old = atomic_cmpxchg_acquire(&lock->val, val, new);
-		if (old == val)
-			break;
-
-		val = old;
-	}
-
-	/*
-	 * we won the trylock
-	 */
-	if (new == _Q_LOCKED_VAL)
+		clear_pending_set_locked(lock);
+		qstat_inc(qstat_lock_pending, true);
 		return;
+	}
 
 	/*
-	 * we're pending, wait for the owner to go away.
-	 *
-	 * *,1,1 -> *,1,0
-	 *
-	 * this wait loop must be a load-acquire such that we match the
-	 * store-release that clears the locked bit and create lock
-	 * sequentiality; this is because not all clear_pending_set_locked()
-	 * implementations imply full barriers.
-	 */
-	smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK));
-
-	/*
-	 * take ownership and clear the pending bit.
-	 *
-	 * *,1,0 -> *,0,1
+	 * If pending was clear but there are waiters in the queue, then
+	 * we need to undo our setting of pending before we queue ourselves.
 	 */
-	clear_pending_set_locked(lock);
-	return;
+	if (!(val & _Q_PENDING_MASK))
+		clear_pending(lock);
 
 	/*
 	 * End of pending bit optimistic spinning and beginning of MCS
 	 * queuing.
 	 */
 queue:
+	qstat_inc(qstat_lock_slowpath, true);
+pv_queue:
 	node = this_cpu_ptr(&mcs_nodes[0]);
 	idx = node->count++;
 	tail = encode_tail(smp_processor_id(), idx);
@@ -400,12 +397,18 @@ queue:
 		goto release;
 
 	/*
+	 * Ensure that the initialisation of @node is complete before we
+	 * publish the updated tail via xchg_tail() and potentially link
+	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
+	 */
+	smp_wmb();
+
+	/*
+	 * Publish the updated tail.
 	 * We have already touched the queueing cacheline; don't bother with
 	 * pending stuff.
 	 *
 	 * p,*,* -> n,*,*
-	 *
-	 * RELEASE, such that the stores to @node must be complete.
 	 */
 	old = xchg_tail(lock, tail);
 	next = NULL;
@@ -417,14 +420,8 @@ queue:
 	if (old & _Q_TAIL_MASK) {
 		prev = decode_tail(old);
 
-		/*
-		 * We must ensure that the stores to @node are observed before
-		 * the write to prev->next. The address dependency from
-		 * xchg_tail is not sufficient to ensure this because the read
-		 * component of xchg_tail is unordered with respect to the
-		 * initialisation of @node.
-		 */
-		smp_store_release(&prev->next, node);
+		/* Link @node into the waitqueue. */
+		WRITE_ONCE(prev->next, node);
 
 		pv_wait_node(node, prev);
 		arch_mcs_spin_lock_contended(&node->locked);
@@ -453,8 +450,8 @@ queue:
 	 *
 	 * The PV pv_wait_head_or_lock function, if active, will acquire
 	 * the lock and return a non-zero value. So we have to skip the
-	 * smp_cond_load_acquire() call. As the next PV queue head hasn't been
-	 * designated yet, there is no way for the locked value to become
+	 * atomic_cond_read_acquire() call. As the next PV queue head hasn't
+	 * been designated yet, there is no way for the locked value to become
 	 * _Q_SLOW_VAL. So both the set_locked() and the
 	 * atomic_cmpxchg_relaxed() calls will be safe.
 	 *
@@ -464,44 +461,38 @@ queue:
 	if ((val = pv_wait_head_or_lock(lock, node)))
 		goto locked;
 
-	val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK));
+	val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK));
 
 locked:
 	/*
 	 * claim the lock:
 	 *
 	 * n,0,0 -> 0,0,1 : lock, uncontended
-	 * *,0,0 -> *,0,1 : lock, contended
+	 * *,*,0 -> *,*,1 : lock, contended
 	 *
-	 * If the queue head is the only one in the queue (lock value == tail),
-	 * clear the tail code and grab the lock. Otherwise, we only need
-	 * to grab the lock.
+	 * If the queue head is the only one in the queue (lock value == tail)
+	 * and nobody is pending, clear the tail code and grab the lock.
+	 * Otherwise, we only need to grab the lock.
 	 */
-	for (;;) {
-		/* In the PV case we might already have _Q_LOCKED_VAL set */
-		if ((val & _Q_TAIL_MASK) != tail) {
-			set_locked(lock);
-			break;
-		}
-		/*
-		 * The smp_cond_load_acquire() call above has provided the
-		 * necessary acquire semantics required for locking. At most
-		 * two iterations of this loop may be ran.
-		 */
-		old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
-		if (old == val)
-			goto release;	/* No contention */
 
-		val = old;
-	}
+	/*
+	 * In the PV case we might already have _Q_LOCKED_VAL set.
+	 *
+	 * The atomic_cond_read_acquire() call above has provided the
+	 * necessary acquire semantics required for locking.
+	 */
+	if (((val & _Q_TAIL_MASK) == tail) &&
+	    atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
+		goto release; /* No contention */
+
+	/* Either somebody is queued behind us or _Q_PENDING_VAL is set */
+	set_locked(lock);
 
 	/*
 	 * contended path; wait for next if not observed yet, release.
 	 */
-	if (!next) {
-		while (!(next = READ_ONCE(node->next)))
-			cpu_relax();
-	}
+	if (!next)
+		next = smp_cond_load_relaxed(&node->next, (VAL));
 
 	arch_mcs_spin_unlock_contended(&next->locked);
 	pv_kick_node(lock, next);
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 6ee477765e6c..5a0cf5f9008c 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -56,11 +56,6 @@ struct pv_node {
 };
 
 /*
- * Include queued spinlock statistics code
- */
-#include "qspinlock_stat.h"
-
-/*
  * Hybrid PV queued/unfair lock
  *
  * By replacing the regular queued_spin_trylock() with the function below,
@@ -87,8 +82,6 @@ struct pv_node {
 #define queued_spin_trylock(l)	pv_hybrid_queued_unfair_trylock(l)
 static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
 {
-	struct __qspinlock *l = (void *)lock;
-
 	/*
 	 * Stay in unfair lock mode as long as queued mode waiters are
 	 * present in the MCS wait queue but the pending bit isn't set.
@@ -97,7 +90,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
 		int val = atomic_read(&lock->val);
 
 		if (!(val & _Q_LOCKED_PENDING_MASK) &&
-		   (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
+		   (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) {
 			qstat_inc(qstat_pv_lock_stealing, true);
 			return true;
 		}
@@ -117,16 +110,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
 #if _Q_PENDING_BITS == 8
 static __always_inline void set_pending(struct qspinlock *lock)
 {
-	struct __qspinlock *l = (void *)lock;
-
-	WRITE_ONCE(l->pending, 1);
-}
-
-static __always_inline void clear_pending(struct qspinlock *lock)
-{
-	struct __qspinlock *l = (void *)lock;
-
-	WRITE_ONCE(l->pending, 0);
+	WRITE_ONCE(lock->pending, 1);
 }
 
 /*
@@ -136,10 +120,8 @@ static __always_inline void clear_pending(struct qspinlock *lock)
  */
 static __always_inline int trylock_clear_pending(struct qspinlock *lock)
 {
-	struct __qspinlock *l = (void *)lock;
-
-	return !READ_ONCE(l->locked) &&
-	       (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL,
+	return !READ_ONCE(lock->locked) &&
+	       (cmpxchg_acquire(&lock->locked_pending, _Q_PENDING_VAL,
 				_Q_LOCKED_VAL) == _Q_PENDING_VAL);
 }
 #else /* _Q_PENDING_BITS == 8 */
@@ -148,11 +130,6 @@ static __always_inline void set_pending(struct qspinlock *lock)
 	atomic_or(_Q_PENDING_VAL, &lock->val);
 }
 
-static __always_inline void clear_pending(struct qspinlock *lock)
-{
-	atomic_andnot(_Q_PENDING_VAL, &lock->val);
-}
-
 static __always_inline int trylock_clear_pending(struct qspinlock *lock)
 {
 	int val = atomic_read(&lock->val);
@@ -384,7 +361,6 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
 static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 {
 	struct pv_node *pn = (struct pv_node *)node;
-	struct __qspinlock *l = (void *)lock;
 
 	/*
 	 * If the vCPU is indeed halted, advance its state to match that of
@@ -413,7 +389,7 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 	 * the hash table later on at unlock time, no atomic instruction is
 	 * needed.
 	 */
-	WRITE_ONCE(l->locked, _Q_SLOW_VAL);
+	WRITE_ONCE(lock->locked, _Q_SLOW_VAL);
 	(void)pv_hash(lock, pn);
 }
 
@@ -428,7 +404,6 @@ static u32
 pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 {
 	struct pv_node *pn = (struct pv_node *)node;
-	struct __qspinlock *l = (void *)lock;
 	struct qspinlock **lp = NULL;
 	int waitcnt = 0;
 	int loop;
@@ -443,7 +418,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 	/*
 	 * Tracking # of slowpath locking operations
 	 */
-	qstat_inc(qstat_pv_lock_slowpath, true);
+	qstat_inc(qstat_lock_slowpath, true);
 
 	for (;; waitcnt++) {
 		/*
@@ -479,13 +454,13 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 			 *
 			 * Matches the smp_rmb() in __pv_queued_spin_unlock().
 			 */
-			if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
+			if (xchg(&lock->locked, _Q_SLOW_VAL) == 0) {
 				/*
 				 * The lock was free and now we own the lock.
 				 * Change the lock value back to _Q_LOCKED_VAL
 				 * and unhash the table.
 				 */
-				WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
+				WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
 				WRITE_ONCE(*lp, NULL);
 				goto gotlock;
 			}
@@ -493,7 +468,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 		WRITE_ONCE(pn->state, vcpu_hashed);
 		qstat_inc(qstat_pv_wait_head, true);
 		qstat_inc(qstat_pv_wait_again, waitcnt);
-		pv_wait(&l->locked, _Q_SLOW_VAL);
+		pv_wait(&lock->locked, _Q_SLOW_VAL);
 
 		/*
 		 * Because of lock stealing, the queue head vCPU may not be
@@ -518,7 +493,6 @@ gotlock:
 __visible void
 __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
 {
-	struct __qspinlock *l = (void *)lock;
 	struct pv_node *node;
 
 	if (unlikely(locked != _Q_SLOW_VAL)) {
@@ -547,7 +521,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
 	 * Now that we have a reference to the (likely) blocked pv_node,
 	 * release the lock.
 	 */
-	smp_store_release(&l->locked, 0);
+	smp_store_release(&lock->locked, 0);
 
 	/*
 	 * At this point the memory pointed at by lock can be freed/reused,
@@ -573,7 +547,6 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
 #ifndef __pv_queued_spin_unlock
 __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 {
-	struct __qspinlock *l = (void *)lock;
 	u8 locked;
 
 	/*
@@ -581,7 +554,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 	 * unhash. Otherwise it would be possible to have multiple @lock
 	 * entries, which would be BAD.
 	 */
-	locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0);
+	locked = cmpxchg_release(&lock->locked, _Q_LOCKED_VAL, 0);
 	if (likely(locked == _Q_LOCKED_VAL))
 		return;
 
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 4a30ef63c607..6bd78c0740fc 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -22,13 +22,14 @@
  *   pv_kick_wake	- # of vCPU kicks used for computing pv_latency_wake
  *   pv_latency_kick	- average latency (ns) of vCPU kick operation
  *   pv_latency_wake	- average latency (ns) from vCPU kick to wakeup
- *   pv_lock_slowpath	- # of locking operations via the slowpath
  *   pv_lock_stealing	- # of lock stealing operations
  *   pv_spurious_wakeup	- # of spurious wakeups in non-head vCPUs
  *   pv_wait_again	- # of wait's after a queue head vCPU kick
  *   pv_wait_early	- # of early vCPU wait's
  *   pv_wait_head	- # of vCPU wait's at the queue head
  *   pv_wait_node	- # of vCPU wait's at a non-head queue node
+ *   lock_pending	- # of locking operations via pending code
+ *   lock_slowpath	- # of locking operations via MCS lock queue
  *
  * Writing to the "reset_counters" file will reset all the above counter
  * values.
@@ -46,13 +47,14 @@ enum qlock_stats {
 	qstat_pv_kick_wake,
 	qstat_pv_latency_kick,
 	qstat_pv_latency_wake,
-	qstat_pv_lock_slowpath,
 	qstat_pv_lock_stealing,
 	qstat_pv_spurious_wakeup,
 	qstat_pv_wait_again,
 	qstat_pv_wait_early,
 	qstat_pv_wait_head,
 	qstat_pv_wait_node,
+	qstat_lock_pending,
+	qstat_lock_slowpath,
 	qstat_num,	/* Total number of statistical counters */
 	qstat_reset_cnts = qstat_num,
 };
@@ -73,12 +75,13 @@ static const char * const qstat_names[qstat_num + 1] = {
 	[qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
 	[qstat_pv_latency_kick]	   = "pv_latency_kick",
 	[qstat_pv_latency_wake]    = "pv_latency_wake",
-	[qstat_pv_lock_slowpath]   = "pv_lock_slowpath",
 	[qstat_pv_lock_stealing]   = "pv_lock_stealing",
 	[qstat_pv_wait_again]      = "pv_wait_again",
 	[qstat_pv_wait_early]      = "pv_wait_early",
 	[qstat_pv_wait_head]       = "pv_wait_head",
 	[qstat_pv_wait_node]       = "pv_wait_node",
+	[qstat_lock_pending]       = "lock_pending",
+	[qstat_lock_slowpath]      = "lock_slowpath",
 	[qstat_reset_cnts]         = "reset_counters",
 };
 
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index e795908f3607..3064c50e181e 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -347,30 +347,31 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
 	}
 }
 
+static inline bool owner_on_cpu(struct task_struct *owner)
+{
+	/*
+	 * As lock holder preemption issue, we both skip spinning if
+	 * task is not on cpu or its cpu is preempted
+	 */
+	return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
+}
+
 static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
 {
 	struct task_struct *owner;
 	bool ret = true;
 
+	BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN));
+
 	if (need_resched())
 		return false;
 
 	rcu_read_lock();
 	owner = READ_ONCE(sem->owner);
-	if (!rwsem_owner_is_writer(owner)) {
-		/*
-		 * Don't spin if the rwsem is readers owned.
-		 */
-		ret = !rwsem_owner_is_reader(owner);
-		goto done;
+	if (owner) {
+		ret = is_rwsem_owner_spinnable(owner) &&
+		      owner_on_cpu(owner);
 	}
-
-	/*
-	 * As lock holder preemption issue, we both skip spinning if task is not
-	 * on cpu or its cpu is preempted
-	 */
-	ret = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
-done:
 	rcu_read_unlock();
 	return ret;
 }
@@ -382,11 +383,11 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
 {
 	struct task_struct *owner = READ_ONCE(sem->owner);
 
-	if (!rwsem_owner_is_writer(owner))
-		goto out;
+	if (!is_rwsem_owner_spinnable(owner))
+		return false;
 
 	rcu_read_lock();
-	while (sem->owner == owner) {
+	while (owner && (READ_ONCE(sem->owner) == owner)) {
 		/*
 		 * Ensure we emit the owner->on_cpu, dereference _after_
 		 * checking sem->owner still matches owner, if that fails,
@@ -399,8 +400,7 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
 		 * abort spinning when need_resched or owner is not running or
 		 * owner's cpu is preempted.
 		 */
-		if (!owner->on_cpu || need_resched() ||
-				vcpu_is_preempted(task_cpu(owner))) {
+		if (need_resched() || !owner_on_cpu(owner)) {
 			rcu_read_unlock();
 			return false;
 		}
@@ -408,12 +408,12 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
 		cpu_relax();
 	}
 	rcu_read_unlock();
-out:
+
 	/*
 	 * If there is a new owner or the owner is not set, we continue
 	 * spinning.
 	 */
-	return !rwsem_owner_is_reader(READ_ONCE(sem->owner));
+	return is_rwsem_owner_spinnable(READ_ONCE(sem->owner));
 }
 
 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 30465a2f2b6c..bc1e507be9ff 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -221,5 +221,3 @@ void up_read_non_owner(struct rw_semaphore *sem)
 EXPORT_SYMBOL(up_read_non_owner);
 
 #endif
-
-
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index a17cba8d94bb..b9d0e72aa80f 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,20 +1,24 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
  * The owner field of the rw_semaphore structure will be set to
- * RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear
+ * RWSEM_READER_OWNED when a reader grabs the lock. A writer will clear
  * the owner field when it unlocks. A reader, on the other hand, will
  * not touch the owner field when it unlocks.
  *
- * In essence, the owner field now has the following 3 states:
+ * In essence, the owner field now has the following 4 states:
  *  1) 0
  *     - lock is free or the owner hasn't set the field yet
  *  2) RWSEM_READER_OWNED
  *     - lock is currently or previously owned by readers (lock is free
  *       or not set by owner yet)
- *  3) Other non-zero value
- *     - a writer owns the lock
+ *  3) RWSEM_ANONYMOUSLY_OWNED bit set with some other bits set as well
+ *     - lock is owned by an anonymous writer, so spinning on the lock
+ *       owner should be disabled.
+ *  4) Other non-zero value
+ *     - a writer owns the lock and other writers can spin on the lock owner.
  */
-#define RWSEM_READER_OWNED	((struct task_struct *)1UL)
+#define RWSEM_ANONYMOUSLY_OWNED	(1UL << 0)
+#define RWSEM_READER_OWNED	((struct task_struct *)RWSEM_ANONYMOUSLY_OWNED)
 
 #ifdef CONFIG_DEBUG_RWSEMS
 # define DEBUG_RWSEMS_WARN_ON(c)	DEBUG_LOCKS_WARN_ON(c)
@@ -51,14 +55,22 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
 		WRITE_ONCE(sem->owner, RWSEM_READER_OWNED);
 }
 
-static inline bool rwsem_owner_is_writer(struct task_struct *owner)
+/*
+ * Return true if the a rwsem waiter can spin on the rwsem's owner
+ * and steal the lock, i.e. the lock is not anonymously owned.
+ * N.B. !owner is considered spinnable.
+ */
+static inline bool is_rwsem_owner_spinnable(struct task_struct *owner)
 {
-	return owner && owner != RWSEM_READER_OWNED;
+	return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED);
 }
 
-static inline bool rwsem_owner_is_reader(struct task_struct *owner)
+/*
+ * Return true if rwsem is owned by an anonymous writer or readers.
+ */
+static inline bool rwsem_has_anonymous_owner(struct task_struct *owner)
 {
-	return owner == RWSEM_READER_OWNED;
+	return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED;
 }
 #else
 static inline void rwsem_set_owner(struct rw_semaphore *sem)
diff --git a/kernel/module.c b/kernel/module.c
index a6e43a5806a1..68469b37d61a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1472,7 +1472,8 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
 {
 	struct module_sect_attr *sattr =
 		container_of(mattr, struct module_sect_attr, mattr);
-	return sprintf(buf, "0x%pK\n", (void *)sattr->address);
+	return sprintf(buf, "0x%px\n", kptr_restrict < 2 ?
+		       (void *)sattr->address : NULL);
 }
 
 static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
@@ -1603,8 +1604,7 @@ static void add_notes_attrs(struct module *mod, const struct load_info *info)
 	if (notes == 0)
 		return;
 
-	notes_attrs = kzalloc(sizeof(*notes_attrs)
-			      + notes * sizeof(notes_attrs->attrs[0]),
+	notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes),
 			      GFP_KERNEL);
 	if (notes_attrs == NULL)
 		return;
@@ -3516,6 +3516,11 @@ static noinline int do_init_module(struct module *mod)
 	 * walking this with preempt disabled.  In all the failure paths, we
 	 * call synchronize_sched(), but we don't want to slow down the success
 	 * path, so use actual RCU here.
+	 * Note that module_alloc() on most architectures creates W+X page
+	 * mappings which won't be cleaned up until do_free_init() runs.  Any
+	 * code such as mark_rodata_ro() which depends on those mappings to
+	 * be cleaned up needs to sync with the queued work - ie
+	 * rcu_barrier_sched()
 	 */
 	call_rcu_sched(&freeinit->rcu, do_free_init);
 	mutex_unlock(&module_mutex);
diff --git a/kernel/panic.c b/kernel/panic.c
index 9d833d913c84..42e487488554 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,7 +34,8 @@
 #define PANIC_BLINK_SPD 18
 
 int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
-static unsigned long tainted_mask;
+static unsigned long tainted_mask =
+	IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0;
 static int pause_on_oops;
 static int pause_on_oops_flag;
 static DEFINE_SPINLOCK(pause_on_oops_lock);
@@ -308,52 +309,40 @@ EXPORT_SYMBOL(panic);
  * is being removed anyway.
  */
 const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
-	{ 'P', 'G', true },	/* TAINT_PROPRIETARY_MODULE */
-	{ 'F', ' ', true },	/* TAINT_FORCED_MODULE */
-	{ 'S', ' ', false },	/* TAINT_CPU_OUT_OF_SPEC */
-	{ 'R', ' ', false },	/* TAINT_FORCED_RMMOD */
-	{ 'M', ' ', false },	/* TAINT_MACHINE_CHECK */
-	{ 'B', ' ', false },	/* TAINT_BAD_PAGE */
-	{ 'U', ' ', false },	/* TAINT_USER */
-	{ 'D', ' ', false },	/* TAINT_DIE */
-	{ 'A', ' ', false },	/* TAINT_OVERRIDDEN_ACPI_TABLE */
-	{ 'W', ' ', false },	/* TAINT_WARN */
-	{ 'C', ' ', true },	/* TAINT_CRAP */
-	{ 'I', ' ', false },	/* TAINT_FIRMWARE_WORKAROUND */
-	{ 'O', ' ', true },	/* TAINT_OOT_MODULE */
-	{ 'E', ' ', true },	/* TAINT_UNSIGNED_MODULE */
-	{ 'L', ' ', false },	/* TAINT_SOFTLOCKUP */
-	{ 'K', ' ', true },	/* TAINT_LIVEPATCH */
-	{ 'X', ' ', true },	/* TAINT_AUX */
+	[ TAINT_PROPRIETARY_MODULE ]	= { 'P', 'G', true },
+	[ TAINT_FORCED_MODULE ]		= { 'F', ' ', true },
+	[ TAINT_CPU_OUT_OF_SPEC ]	= { 'S', ' ', false },
+	[ TAINT_FORCED_RMMOD ]		= { 'R', ' ', false },
+	[ TAINT_MACHINE_CHECK ]		= { 'M', ' ', false },
+	[ TAINT_BAD_PAGE ]		= { 'B', ' ', false },
+	[ TAINT_USER ]			= { 'U', ' ', false },
+	[ TAINT_DIE ]			= { 'D', ' ', false },
+	[ TAINT_OVERRIDDEN_ACPI_TABLE ]	= { 'A', ' ', false },
+	[ TAINT_WARN ]			= { 'W', ' ', false },
+	[ TAINT_CRAP ]			= { 'C', ' ', true },
+	[ TAINT_FIRMWARE_WORKAROUND ]	= { 'I', ' ', false },
+	[ TAINT_OOT_MODULE ]		= { 'O', ' ', true },
+	[ TAINT_UNSIGNED_MODULE ]	= { 'E', ' ', true },
+	[ TAINT_SOFTLOCKUP ]		= { 'L', ' ', false },
+	[ TAINT_LIVEPATCH ]		= { 'K', ' ', true },
+	[ TAINT_AUX ]			= { 'X', ' ', true },
+	[ TAINT_RANDSTRUCT ]		= { 'T', ' ', true },
 };
 
 /**
- *	print_tainted - return a string to represent the kernel taint state.
+ * print_tainted - return a string to represent the kernel taint state.
  *
- *  'P' - Proprietary module has been loaded.
- *  'F' - Module has been forcibly loaded.
- *  'S' - SMP with CPUs not designed for SMP.
- *  'R' - User forced a module unload.
- *  'M' - System experienced a machine check exception.
- *  'B' - System has hit bad_page.
- *  'U' - Userspace-defined naughtiness.
- *  'D' - Kernel has oopsed before
- *  'A' - ACPI table overridden.
- *  'W' - Taint on warning.
- *  'C' - modules from drivers/staging are loaded.
- *  'I' - Working around severe firmware bug.
- *  'O' - Out-of-tree module has been loaded.
- *  'E' - Unsigned module has been loaded.
- *  'L' - A soft lockup has previously occurred.
- *  'K' - Kernel has been live patched.
- *  'X' - Auxiliary taint, for distros' use.
+ * For individual taint flag meanings, see Documentation/sysctl/kernel.txt
  *
- *	The string is overwritten by the next call to print_tainted().
+ * The string is overwritten by the next call to print_tainted(),
+ * but is always NULL terminated.
  */
 const char *print_tainted(void)
 {
 	static char buf[TAINT_FLAGS_COUNT + sizeof("Tainted: ")];
 
+	BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT);
+
 	if (tainted_mask) {
 		char *s;
 		int i;
@@ -554,6 +543,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 	else
 		dump_stack();
 
+	print_irqtrace_events(current);
+
 	print_oops_end_marker();
 
 	/* Just a warning, don't kill lockdep. */
diff --git a/kernel/params.c b/kernel/params.c
index cc9108c2a1fd..ce89f757e6da 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -111,8 +111,8 @@ bool parameq(const char *a, const char *b)
 static void param_check_unsafe(const struct kernel_param *kp)
 {
 	if (kp->flags & KERNEL_PARAM_FL_UNSAFE) {
-		pr_warn("Setting dangerous option %s - tainting kernel\n",
-			kp->name);
+		pr_notice("Setting dangerous option %s - tainting kernel\n",
+			  kp->name);
 		add_taint(TAINT_USER, LOCKDEP_STILL_OK);
 	}
 }
diff --git a/kernel/pid.c b/kernel/pid.c
index ed6c343fe50d..157fe4b19971 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -70,7 +70,7 @@ int pid_max_max = PID_MAX_LIMIT;
  */
 struct pid_namespace init_pid_ns = {
 	.kref = KREF_INIT(2),
-	.idr = IDR_INIT,
+	.idr = IDR_INIT(init_pid_ns.idr),
 	.pid_allocated = PIDNS_ADDING,
 	.level = 0,
 	.child_reaper = &init_task,
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 5454cc639a8d..9c85c7822383 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -287,6 +287,8 @@ static int create_image(int platform_mode)
 
 	local_irq_disable();
 
+	system_state = SYSTEM_SUSPEND;
+
 	error = syscore_suspend();
 	if (error) {
 		pr_err("Some system devices failed to power down, aborting hibernation\n");
@@ -317,6 +319,7 @@ static int create_image(int platform_mode)
 	syscore_resume();
 
  Enable_irqs:
+	system_state = SYSTEM_RUNNING;
 	local_irq_enable();
 
  Enable_cpus:
@@ -445,6 +448,7 @@ static int resume_target_kernel(bool platform_mode)
 		goto Enable_cpus;
 
 	local_irq_disable();
+	system_state = SYSTEM_SUSPEND;
 
 	error = syscore_suspend();
 	if (error)
@@ -478,6 +482,7 @@ static int resume_target_kernel(bool platform_mode)
 	syscore_resume();
 
  Enable_irqs:
+	system_state = SYSTEM_RUNNING;
 	local_irq_enable();
 
  Enable_cpus:
@@ -563,6 +568,7 @@ int hibernation_platform_enter(void)
 		goto Enable_cpus;
 
 	local_irq_disable();
+	system_state = SYSTEM_SUSPEND;
 	syscore_suspend();
 	if (pm_wakeup_pending()) {
 		error = -EAGAIN;
@@ -575,6 +581,7 @@ int hibernation_platform_enter(void)
 
  Power_up:
 	syscore_resume();
+	system_state = SYSTEM_RUNNING;
 	local_irq_enable();
 
  Enable_cpus:
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 9d7503910ce2..86d72ffb811b 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -184,7 +184,6 @@ static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
 	c->target_value = value;
 }
 
-static inline int pm_qos_get_value(struct pm_qos_constraints *c);
 static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused)
 {
 	struct pm_qos_object *qos = (struct pm_qos_object *)s->private;
@@ -295,6 +294,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
 		 * changed
 		 */
 		plist_del(node, &c->list);
+		/* fall through */
 	case PM_QOS_ADD_REQ:
 		plist_node_init(node, new_value);
 		plist_add(node, &c->list);
@@ -367,6 +367,7 @@ bool pm_qos_update_flags(struct pm_qos_flags *pqf,
 		break;
 	case PM_QOS_UPDATE_REQ:
 		pm_qos_flags_remove_req(pqf, req);
+		/* fall through */
 	case PM_QOS_ADD_REQ:
 		req->flags = val;
 		INIT_LIST_HEAD(&req->node);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4c10be0f4843..87331565e505 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -27,6 +27,7 @@
 #include <linux/export.h>
 #include <linux/suspend.h>
 #include <linux/syscore_ops.h>
+#include <linux/swait.h>
 #include <linux/ftrace.h>
 #include <trace/events/power.h>
 #include <linux/compiler.h>
@@ -57,10 +58,10 @@ EXPORT_SYMBOL_GPL(pm_suspend_global_flags);
 
 static const struct platform_suspend_ops *suspend_ops;
 static const struct platform_s2idle_ops *s2idle_ops;
-static DECLARE_WAIT_QUEUE_HEAD(s2idle_wait_head);
+static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head);
 
 enum s2idle_states __read_mostly s2idle_state;
-static DEFINE_SPINLOCK(s2idle_lock);
+static DEFINE_RAW_SPINLOCK(s2idle_lock);
 
 void s2idle_set_ops(const struct platform_s2idle_ops *ops)
 {
@@ -78,12 +79,12 @@ static void s2idle_enter(void)
 {
 	trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true);
 
-	spin_lock_irq(&s2idle_lock);
+	raw_spin_lock_irq(&s2idle_lock);
 	if (pm_wakeup_pending())
 		goto out;
 
 	s2idle_state = S2IDLE_STATE_ENTER;
-	spin_unlock_irq(&s2idle_lock);
+	raw_spin_unlock_irq(&s2idle_lock);
 
 	get_online_cpus();
 	cpuidle_resume();
@@ -91,17 +92,17 @@ static void s2idle_enter(void)
 	/* Push all the CPUs into the idle loop. */
 	wake_up_all_idle_cpus();
 	/* Make the current CPU wait so it can enter the idle loop too. */
-	wait_event(s2idle_wait_head,
-		   s2idle_state == S2IDLE_STATE_WAKE);
+	swait_event(s2idle_wait_head,
+		    s2idle_state == S2IDLE_STATE_WAKE);
 
 	cpuidle_pause();
 	put_online_cpus();
 
-	spin_lock_irq(&s2idle_lock);
+	raw_spin_lock_irq(&s2idle_lock);
 
  out:
 	s2idle_state = S2IDLE_STATE_NONE;
-	spin_unlock_irq(&s2idle_lock);
+	raw_spin_unlock_irq(&s2idle_lock);
 
 	trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, false);
 }
@@ -156,12 +157,12 @@ void s2idle_wake(void)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&s2idle_lock, flags);
+	raw_spin_lock_irqsave(&s2idle_lock, flags);
 	if (s2idle_state > S2IDLE_STATE_NONE) {
 		s2idle_state = S2IDLE_STATE_WAKE;
-		wake_up(&s2idle_wait_head);
+		swake_up(&s2idle_wait_head);
 	}
-	spin_unlock_irqrestore(&s2idle_lock, flags);
+	raw_spin_unlock_irqrestore(&s2idle_lock, flags);
 }
 EXPORT_SYMBOL_GPL(s2idle_wake);
 
@@ -428,6 +429,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 	arch_suspend_disable_irqs();
 	BUG_ON(!irqs_disabled());
 
+	system_state = SYSTEM_SUSPEND;
+
 	error = syscore_suspend();
 	if (!error) {
 		*wakeup = pm_wakeup_pending();
@@ -443,6 +446,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 		syscore_resume();
 	}
 
+	system_state = SYSTEM_RUNNING;
+
 	arch_suspend_enable_irqs();
 	BUG_ON(irqs_disabled());
 
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 11b4282c2d20..1efcb5b0c3ed 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -269,7 +269,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
 	struct bio *bio;
 	int error = 0;
 
-	bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
+	bio = bio_alloc(GFP_NOIO | __GFP_HIGH, 1);
 	bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
 	bio_set_dev(bio, hib_resume_bdev);
 	bio_set_op_attrs(bio, op, op_flags);
@@ -376,7 +376,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
 		return -ENOSPC;
 
 	if (hb) {
-		src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN |
+		src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN |
 		                              __GFP_NORETRY);
 		if (src) {
 			copy_page(src, buf);
@@ -384,7 +384,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
 			ret = hib_wait_io(hb); /* Free pages */
 			if (ret)
 				return ret;
-			src = (void *)__get_free_page(__GFP_RECLAIM |
+			src = (void *)__get_free_page(GFP_NOIO |
 			                              __GFP_NOWARN |
 			                              __GFP_NORETRY);
 			if (src) {
@@ -691,7 +691,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
 	nr_threads = num_online_cpus() - 1;
 	nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
 
-	page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH);
+	page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH);
 	if (!page) {
 		pr_err("Failed to allocate LZO page\n");
 		ret = -ENOMEM;
@@ -989,7 +989,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
 		last = tmp;
 
 		tmp->map = (struct swap_map_page *)
-			   __get_free_page(__GFP_RECLAIM | __GFP_HIGH);
+			   __get_free_page(GFP_NOIO | __GFP_HIGH);
 		if (!tmp->map) {
 			release_swap_reader(handle);
 			return -ENOMEM;
@@ -1261,8 +1261,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
 
 	for (i = 0; i < read_pages; i++) {
 		page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
-						  __GFP_RECLAIM | __GFP_HIGH :
-						  __GFP_RECLAIM | __GFP_NOWARN |
+						  GFP_NOIO | __GFP_HIGH :
+						  GFP_NOIO | __GFP_NOWARN |
 						  __GFP_NORETRY);
 
 		if (!page[i]) {
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 75c959de4b29..abd225550271 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -186,6 +186,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
 		res = PAGE_SIZE - pg_offp;
 	}
 
+	if (!data_of(data->handle)) {
+		res = -EINVAL;
+		goto unlock;
+	}
+
 	res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp,
 			buf, count);
 	if (res > 0)
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index dfba59be190b..4210152e56f0 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -188,6 +188,7 @@ static struct wakelock *wakelock_lookup_add(const char *name, size_t len,
 		return ERR_PTR(-ENOMEM);
 	}
 	wl->ws.name = wl->name;
+	wl->ws.last_time = ktime_get();
 	wakeup_source_add(&wl->ws);
 	rb_link_node(&wl->node, parent, node);
 	rb_insert_color(&wl->node, &wakelocks_tree);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 704e55129c3a..247808333ba4 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -51,6 +51,7 @@
 #include <linux/uaccess.h>
 #include <asm/sections.h>
 
+#include <trace/events/initcall.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/printk.h>
 
@@ -1907,6 +1908,7 @@ asmlinkage int vprintk_emit(int facility, int level,
 		preempt_enable();
 	}
 
+	wake_up_klogd();
 	return printed_len;
 }
 EXPORT_SYMBOL(vprintk_emit);
@@ -2288,9 +2290,7 @@ void console_unlock(void)
 {
 	static char ext_text[CONSOLE_EXT_LOG_MAX];
 	static char text[LOG_LINE_MAX + PREFIX_MAX];
-	static u64 seen_seq;
 	unsigned long flags;
-	bool wake_klogd = false;
 	bool do_cond_resched, retry;
 
 	if (console_suspended) {
@@ -2334,11 +2334,6 @@ again:
 
 		printk_safe_enter_irqsave(flags);
 		raw_spin_lock(&logbuf_lock);
-		if (seen_seq != log_next_seq) {
-			wake_klogd = true;
-			seen_seq = log_next_seq;
-		}
-
 		if (console_seq < log_first_seq) {
 			len = sprintf(text, "** %u printk messages dropped **\n",
 				      (unsigned)(log_first_seq - console_seq));
@@ -2396,7 +2391,7 @@ skip:
 
 		if (console_lock_spinning_disable_and_check()) {
 			printk_safe_exit_irqrestore(flags);
-			goto out;
+			return;
 		}
 
 		printk_safe_exit_irqrestore(flags);
@@ -2428,10 +2423,6 @@ skip:
 
 	if (retry && console_trylock())
 		goto again;
-
-out:
-	if (wake_klogd)
-		wake_up_klogd();
 }
 EXPORT_SYMBOL(console_unlock);
 
@@ -2780,6 +2771,7 @@ EXPORT_SYMBOL(unregister_console);
  */
 void __init console_init(void)
 {
+	int ret;
 	initcall_t *call;
 
 	/* Setup the default TTY line discipline. */
@@ -2790,8 +2782,11 @@ void __init console_init(void)
 	 * inform about problems etc..
 	 */
 	call = __con_initcall_start;
+	trace_initcall_level("console");
 	while (call < __con_initcall_end) {
-		(*call)();
+		trace_initcall_start((*call));
+		ret = (*call)();
+		trace_initcall_finish((*call), ret);
 		call++;
 	}
 }
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 3e3c2004bb23..d7d091309054 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -82,6 +82,7 @@ static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s,
 {
 	int add;
 	size_t len;
+	va_list ap;
 
 again:
 	len = atomic_read(&s->len);
@@ -100,7 +101,9 @@ again:
 	if (!len)
 		smp_rmb();
 
-	add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
+	va_copy(ap, args);
+	add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, ap);
+	va_end(ap);
 	if (!add)
 		return 0;
 
@@ -278,7 +281,7 @@ void printk_safe_flush_on_panic(void)
 	 * Make sure that we could access the main ring buffer.
 	 * Do not risk a double release when more CPUs are up.
 	 */
-	if (in_nmi() && raw_spin_is_locked(&logbuf_lock)) {
+	if (raw_spin_is_locked(&logbuf_lock)) {
 		if (num_online_cpus() > 1)
 			return;
 
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 7a693e31184a..40cea6735c2d 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -270,6 +270,12 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
 	}
 }
 
+/* Returns first leaf rcu_node of the specified RCU flavor. */
+#define rcu_first_leaf_node(rsp) ((rsp)->level[rcu_num_lvls - 1])
+
+/* Is this rcu_node a leaf? */
+#define rcu_is_leaf_node(rnp) ((rnp)->level == rcu_num_lvls - 1)
+
 /*
  * Do a full breadth-first scan of the rcu_node structures for the
  * specified rcu_state structure.
@@ -284,8 +290,7 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
  * rcu_node tree with but one rcu_node structure, this loop is a no-op.
  */
 #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
-	for ((rnp) = &(rsp)->node[0]; \
-	     (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
+	for ((rnp) = &(rsp)->node[0]; !rcu_is_leaf_node(rsp, rnp); (rnp)++)
 
 /*
  * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
@@ -294,7 +299,7 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
  * It is still a leaf node, even if it is also the root node.
  */
 #define rcu_for_each_leaf_node(rsp, rnp) \
-	for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
+	for ((rnp) = rcu_first_leaf_node(rsp); \
 	     (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
 
 /*
@@ -486,6 +491,7 @@ void rcu_force_quiescent_state(void);
 void rcu_bh_force_quiescent_state(void);
 void rcu_sched_force_quiescent_state(void);
 extern struct workqueue_struct *rcu_gp_wq;
+extern struct workqueue_struct *rcu_par_gp_wq;
 #endif /* #else #ifdef CONFIG_TINY_RCU */
 
 #ifdef CONFIG_RCU_NOCB_CPU
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 88cba7c2956c..5aff271adf1e 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -404,24 +404,6 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
 }
 
 /*
- * Scan the specified rcu_segcblist structure for callbacks that need
- * a grace period later than the one specified by "seq".  We don't look
- * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't
- * have a grace-period sequence number.
- */
-bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
-				    unsigned long seq)
-{
-	int i;
-
-	for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
-		if (rsclp->tails[i - 1] != rsclp->tails[i] &&
-		    ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
-			return true;
-	return false;
-}
-
-/*
  * Merge the source rcu_segcblist structure into the destination
  * rcu_segcblist structure, then initialize the source.  Any pending
  * callbacks from the source get to start over.  It is best to
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 581c12b63544..948470cef385 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -134,7 +134,5 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
 				   struct rcu_cblist *rclp);
 void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
 bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
-bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
-				    unsigned long seq);
 void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
 			 struct rcu_segcblist *src_rsclp);
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 777e7a6a0292..e232846516b3 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -369,7 +369,7 @@ static bool __maybe_unused torturing_tasks(void)
  */
 static void rcu_perf_wait_shutdown(void)
 {
-	cond_resched_rcu_qs();
+	cond_resched_tasks_rcu_qs();
 	if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters)
 		return;
 	while (!torture_must_stop())
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 680c96d8c00f..e628fcfd1bde 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -593,7 +593,12 @@ static void srcu_torture_init(void)
 
 static void srcu_torture_cleanup(void)
 {
-	cleanup_srcu_struct(&srcu_ctld);
+	static DEFINE_TORTURE_RANDOM(rand);
+
+	if (torture_random(&rand) & 0x800)
+		cleanup_srcu_struct(&srcu_ctld);
+	else
+		cleanup_srcu_struct_quiesced(&srcu_ctld);
 	srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */
 }
 
@@ -1609,6 +1614,9 @@ static enum cpuhp_state rcutor_hp;
 static void
 rcu_torture_cleanup(void)
 {
+	int flags = 0;
+	unsigned long gpnum = 0;
+	unsigned long completed = 0;
 	int i;
 
 	rcutorture_record_test_transition();
@@ -1639,6 +1647,11 @@ rcu_torture_cleanup(void)
 		fakewriter_tasks = NULL;
 	}
 
+	rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed);
+	srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
+				&flags, &gpnum, &completed);
+	pr_alert("%s:  End-test grace-period state: g%lu c%lu f%#x\n",
+		 cur_ops->name, gpnum, completed, flags);
 	torture_stop_kthread(rcu_torture_stats, stats_task);
 	torture_stop_kthread(rcu_torture_fqs, fqs_task);
 	for (i = 0; i < ncbflooders; i++)
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 76ac5f50b2c7..622792abe41a 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -86,16 +86,19 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
  * Must invoke this after you are finished using a given srcu_struct that
  * was initialized via init_srcu_struct(), else you leak memory.
  */
-void cleanup_srcu_struct(struct srcu_struct *sp)
+void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
 {
 	WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
-	flush_work(&sp->srcu_work);
+	if (quiesced)
+		WARN_ON(work_pending(&sp->srcu_work));
+	else
+		flush_work(&sp->srcu_work);
 	WARN_ON(sp->srcu_gp_running);
 	WARN_ON(sp->srcu_gp_waiting);
 	WARN_ON(sp->srcu_cb_head);
 	WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail);
 }
-EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
 
 /*
  * Removes the count for the old reader from the appropriate element of
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index fb560fca9ef4..b4123d7a2cec 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -366,24 +366,28 @@ static unsigned long srcu_get_delay(struct srcu_struct *sp)
 	return SRCU_INTERVAL;
 }
 
-/**
- * cleanup_srcu_struct - deconstruct a sleep-RCU structure
- * @sp: structure to clean up.
- *
- * Must invoke this after you are finished using a given srcu_struct that
- * was initialized via init_srcu_struct(), else you leak memory.
- */
-void cleanup_srcu_struct(struct srcu_struct *sp)
+/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */
+void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced)
 {
 	int cpu;
 
 	if (WARN_ON(!srcu_get_delay(sp)))
-		return; /* Leakage unless caller handles error. */
+		return; /* Just leak it! */
 	if (WARN_ON(srcu_readers_active(sp)))
-		return; /* Leakage unless caller handles error. */
-	flush_delayed_work(&sp->work);
+		return; /* Just leak it! */
+	if (quiesced) {
+		if (WARN_ON(delayed_work_pending(&sp->work)))
+			return; /* Just leak it! */
+	} else {
+		flush_delayed_work(&sp->work);
+	}
 	for_each_possible_cpu(cpu)
-		flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
+		if (quiesced) {
+			if (WARN_ON(delayed_work_pending(&per_cpu_ptr(sp->sda, cpu)->work)))
+				return; /* Just leak it! */
+		} else {
+			flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
+		}
 	if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
 	    WARN_ON(srcu_readers_active(sp))) {
 		pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
@@ -392,7 +396,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
 	free_percpu(sp->sda);
 	sp->sda = NULL;
 }
-EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
 
 /*
  * Counts the new reader in the appropriate per-CPU element of the
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 2a734692a581..aa7cade1b9f3 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -524,8 +524,6 @@ module_param(rcu_kick_kthreads, bool, 0644);
 static ulong jiffies_till_sched_qs = HZ / 10;
 module_param(jiffies_till_sched_qs, ulong, 0444);
 
-static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
-				  struct rcu_data *rdp);
 static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp));
 static void force_quiescent_state(struct rcu_state *rsp);
 static int rcu_pending(void);
@@ -711,44 +709,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
 }
 
 /*
- * Is there any need for future grace periods?
- * Interrupts must be disabled.  If the caller does not hold the root
- * rnp_node structure's ->lock, the results are advisory only.
- */
-static int rcu_future_needs_gp(struct rcu_state *rsp)
-{
-	struct rcu_node *rnp = rcu_get_root(rsp);
-	int idx = (READ_ONCE(rnp->completed) + 1) & 0x1;
-	int *fp = &rnp->need_future_gp[idx];
-
-	lockdep_assert_irqs_disabled();
-	return READ_ONCE(*fp);
-}
-
-/*
- * Does the current CPU require a not-yet-started grace period?
- * The caller must have disabled interrupts to prevent races with
- * normal callback registry.
- */
-static bool
-cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
-{
-	lockdep_assert_irqs_disabled();
-	if (rcu_gp_in_progress(rsp))
-		return false;  /* No, a grace period is already in progress. */
-	if (rcu_future_needs_gp(rsp))
-		return true;  /* Yes, a no-CBs CPU needs one. */
-	if (!rcu_segcblist_is_enabled(&rdp->cblist))
-		return false;  /* No, this is a no-CBs (or offline) CPU. */
-	if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
-		return true;  /* Yes, CPU has newly registered callbacks. */
-	if (rcu_segcblist_future_gp_needed(&rdp->cblist,
-					   READ_ONCE(rsp->completed)))
-		return true;  /* Yes, CBs for future grace period. */
-	return false; /* No grace period needed. */
-}
-
-/*
  * Enter an RCU extended quiescent state, which can be either the
  * idle loop or adaptive-tickless usermode execution.
  *
@@ -1234,10 +1194,10 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 	}
 
 	/*
-	 * Has this CPU encountered a cond_resched_rcu_qs() since the
-	 * beginning of the grace period?  For this to be the case,
-	 * the CPU has to have noticed the current grace period.  This
-	 * might not be the case for nohz_full CPUs looping in the kernel.
+	 * Has this CPU encountered a cond_resched() since the beginning
+	 * of the grace period?  For this to be the case, the CPU has to
+	 * have noticed the current grace period.  This might not be the
+	 * case for nohz_full CPUs looping in the kernel.
 	 */
 	jtsq = jiffies_till_sched_qs;
 	ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu);
@@ -1642,18 +1602,30 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
 		return rnp->completed + 1;
 
 	/*
+	 * If the current rcu_node structure believes that RCU is
+	 * idle, and if the rcu_state structure does not yet reflect
+	 * the start of a new grace period, then the next grace period
+	 * will suffice.  The memory barrier is needed to accurately
+	 * sample the rsp->gpnum, and pairs with the second lock
+	 * acquisition in rcu_gp_init(), which is augmented with
+	 * smp_mb__after_unlock_lock() for this purpose.
+	 */
+	if (rnp->gpnum == rnp->completed) {
+		smp_mb(); /* See above block comment. */
+		if (READ_ONCE(rsp->gpnum) == rnp->completed)
+			return rnp->completed + 1;
+	}
+
+	/*
 	 * Otherwise, wait for a possible partial grace period and
 	 * then the subsequent full grace period.
 	 */
 	return rnp->completed + 2;
 }
 
-/*
- * Trace-event helper function for rcu_start_future_gp() and
- * rcu_nocb_wait_gp().
- */
-static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
-				unsigned long c, const char *s)
+/* Trace-event wrapper function for trace_rcu_future_grace_period.  */
+static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+			      unsigned long c, const char *s)
 {
 	trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
 				      rnp->completed, c, rnp->level,
@@ -1661,96 +1633,67 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
 }
 
 /*
- * Start some future grace period, as needed to handle newly arrived
+ * Start the specified grace period, as needed to handle newly arrived
  * callbacks.  The required future grace periods are recorded in each
- * rcu_node structure's ->need_future_gp field.  Returns true if there
+ * rcu_node structure's ->need_future_gp[] field.  Returns true if there
  * is reason to awaken the grace-period kthread.
  *
- * The caller must hold the specified rcu_node structure's ->lock.
+ * The caller must hold the specified rcu_node structure's ->lock, which
+ * is why the caller is responsible for waking the grace-period kthread.
  */
-static bool __maybe_unused
-rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
-		    unsigned long *c_out)
+static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+			      unsigned long c)
 {
-	unsigned long c;
 	bool ret = false;
-	struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
-
-	raw_lockdep_assert_held_rcu_node(rnp);
-
-	/*
-	 * Pick up grace-period number for new callbacks.  If this
-	 * grace period is already marked as needed, return to the caller.
-	 */
-	c = rcu_cbs_completed(rdp->rsp, rnp);
-	trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));
-	if (rnp->need_future_gp[c & 0x1]) {
-		trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf"));
-		goto out;
-	}
+	struct rcu_state *rsp = rdp->rsp;
+	struct rcu_node *rnp_root;
 
 	/*
-	 * If either this rcu_node structure or the root rcu_node structure
-	 * believe that a grace period is in progress, then we must wait
-	 * for the one following, which is in "c".  Because our request
-	 * will be noticed at the end of the current grace period, we don't
-	 * need to explicitly start one.  We only do the lockless check
-	 * of rnp_root's fields if the current rcu_node structure thinks
-	 * there is no grace period in flight, and because we hold rnp->lock,
-	 * the only possible change is when rnp_root's two fields are
-	 * equal, in which case rnp_root->gpnum might be concurrently
-	 * incremented.  But that is OK, as it will just result in our
-	 * doing some extra useless work.
+	 * Use funnel locking to either acquire the root rcu_node
+	 * structure's lock or bail out if the need for this grace period
+	 * has already been recorded -- or has already started.  If there
+	 * is already a grace period in progress in a non-leaf node, no
+	 * recording is needed because the end of the grace period will
+	 * scan the leaf rcu_node structures.  Note that rnp->lock must
+	 * not be released.
 	 */
-	if (rnp->gpnum != rnp->completed ||
-	    READ_ONCE(rnp_root->gpnum) != READ_ONCE(rnp_root->completed)) {
-		rnp->need_future_gp[c & 0x1]++;
-		trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
-		goto out;
+	raw_lockdep_assert_held_rcu_node(rnp);
+	trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf"));
+	for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) {
+		if (rnp_root != rnp)
+			raw_spin_lock_rcu_node(rnp_root);
+		WARN_ON_ONCE(ULONG_CMP_LT(rnp_root->gpnum +
+					  need_future_gp_mask(), c));
+		if (need_future_gp_element(rnp_root, c) ||
+		    ULONG_CMP_GE(rnp_root->gpnum, c) ||
+		    (rnp != rnp_root &&
+		     rnp_root->gpnum != rnp_root->completed)) {
+			trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted"));
+			goto unlock_out;
+		}
+		need_future_gp_element(rnp_root, c) = true;
+		if (rnp_root != rnp && rnp_root->parent != NULL)
+			raw_spin_unlock_rcu_node(rnp_root);
+		if (!rnp_root->parent)
+			break;  /* At root, and perhaps also leaf. */
 	}
 
-	/*
-	 * There might be no grace period in progress.  If we don't already
-	 * hold it, acquire the root rcu_node structure's lock in order to
-	 * start one (if needed).
-	 */
-	if (rnp != rnp_root)
-		raw_spin_lock_rcu_node(rnp_root);
-
-	/*
-	 * Get a new grace-period number.  If there really is no grace
-	 * period in progress, it will be smaller than the one we obtained
-	 * earlier.  Adjust callbacks as needed.
-	 */
-	c = rcu_cbs_completed(rdp->rsp, rnp_root);
-	if (!rcu_is_nocb_cpu(rdp->cpu))
-		(void)rcu_segcblist_accelerate(&rdp->cblist, c);
-
-	/*
-	 * If the needed for the required grace period is already
-	 * recorded, trace and leave.
-	 */
-	if (rnp_root->need_future_gp[c & 0x1]) {
-		trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot"));
+	/* If GP already in progress, just leave, otherwise start one. */
+	if (rnp_root->gpnum != rnp_root->completed) {
+		trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedleafroot"));
 		goto unlock_out;
 	}
-
-	/* Record the need for the future grace period. */
-	rnp_root->need_future_gp[c & 0x1]++;
-
-	/* If a grace period is not already in progress, start one. */
-	if (rnp_root->gpnum != rnp_root->completed) {
-		trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));
-	} else {
-		trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot"));
-		ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
+	trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedroot"));
+	WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT);
+	if (!rsp->gp_kthread) {
+		trace_rcu_this_gp(rnp_root, rdp, c, TPS("NoGPkthread"));
+		goto unlock_out;
 	}
+	trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq"));
+	ret = true;  /* Caller must wake GP kthread. */
 unlock_out:
 	if (rnp != rnp_root)
 		raw_spin_unlock_rcu_node(rnp_root);
-out:
-	if (c_out != NULL)
-		*c_out = c;
 	return ret;
 }
 
@@ -1758,16 +1701,16 @@ out:
  * Clean up any old requests for the just-ended grace period.  Also return
  * whether any additional grace periods have been requested.
  */
-static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
 {
-	int c = rnp->completed;
-	int needmore;
+	unsigned long c = rnp->completed;
+	bool needmore;
 	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
 
-	rnp->need_future_gp[c & 0x1] = 0;
-	needmore = rnp->need_future_gp[(c + 1) & 0x1];
-	trace_rcu_future_gp(rnp, rdp, c,
-			    needmore ? TPS("CleanupMore") : TPS("Cleanup"));
+	need_future_gp_element(rnp, c) = false;
+	needmore = need_any_future_gp(rnp);
+	trace_rcu_this_gp(rnp, rdp, c,
+			  needmore ? TPS("CleanupMore") : TPS("Cleanup"));
 	return needmore;
 }
 
@@ -1802,6 +1745,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
 static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
 			       struct rcu_data *rdp)
 {
+	unsigned long c;
 	bool ret = false;
 
 	raw_lockdep_assert_held_rcu_node(rnp);
@@ -1820,8 +1764,9 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
 	 * accelerating callback invocation to an earlier grace-period
 	 * number.
 	 */
-	if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp)))
-		ret = rcu_start_future_gp(rnp, rdp, NULL);
+	c = rcu_cbs_completed(rsp, rnp);
+	if (rcu_segcblist_accelerate(&rdp->cblist, c))
+		ret = rcu_start_this_gp(rnp, rdp, c);
 
 	/* Trace depending on how much we were able to accelerate. */
 	if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
@@ -2049,7 +1994,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
 					    rnp->level, rnp->grplo,
 					    rnp->grphi, rnp->qsmask);
 		raw_spin_unlock_irq_rcu_node(rnp);
-		cond_resched_rcu_qs();
+		cond_resched_tasks_rcu_qs();
 		WRITE_ONCE(rsp->gp_activity, jiffies);
 	}
 
@@ -2108,7 +2053,6 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 {
 	unsigned long gp_duration;
 	bool needgp = false;
-	int nocb = 0;
 	struct rcu_data *rdp;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 	struct swait_queue_head *sq;
@@ -2147,31 +2091,35 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 		if (rnp == rdp->mynode)
 			needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
 		/* smp_mb() provided by prior unlock-lock pair. */
-		nocb += rcu_future_gp_cleanup(rsp, rnp);
+		needgp = rcu_future_gp_cleanup(rsp, rnp) || needgp;
 		sq = rcu_nocb_gp_get(rnp);
 		raw_spin_unlock_irq_rcu_node(rnp);
 		rcu_nocb_gp_cleanup(sq);
-		cond_resched_rcu_qs();
+		cond_resched_tasks_rcu_qs();
 		WRITE_ONCE(rsp->gp_activity, jiffies);
 		rcu_gp_slow(rsp, gp_cleanup_delay);
 	}
 	rnp = rcu_get_root(rsp);
 	raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */
-	rcu_nocb_gp_set(rnp, nocb);
 
 	/* Declare grace period done. */
 	WRITE_ONCE(rsp->completed, rsp->gpnum);
 	trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
 	rsp->gp_state = RCU_GP_IDLE;
+	/* Check for GP requests since above loop. */
 	rdp = this_cpu_ptr(rsp->rda);
+	if (need_any_future_gp(rnp)) {
+		trace_rcu_this_gp(rnp, rdp, rsp->completed - 1,
+				  TPS("CleanupMore"));
+		needgp = true;
+	}
 	/* Advance CBs to reduce false positives below. */
-	needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
-	if (needgp || cpu_needs_another_gp(rsp, rdp)) {
+	if (!rcu_accelerate_cbs(rsp, rnp, rdp) && needgp) {
 		WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
-		trace_rcu_grace_period(rsp->name,
-				       READ_ONCE(rsp->gpnum),
+		trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum),
 				       TPS("newreq"));
 	}
+	WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT);
 	raw_spin_unlock_irq_rcu_node(rnp);
 }
 
@@ -2202,7 +2150,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
 			/* Locking provides needed memory barrier. */
 			if (rcu_gp_init(rsp))
 				break;
-			cond_resched_rcu_qs();
+			cond_resched_tasks_rcu_qs();
 			WRITE_ONCE(rsp->gp_activity, jiffies);
 			WARN_ON(signal_pending(current));
 			trace_rcu_grace_period(rsp->name,
@@ -2247,7 +2195,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
 				trace_rcu_grace_period(rsp->name,
 						       READ_ONCE(rsp->gpnum),
 						       TPS("fqsend"));
-				cond_resched_rcu_qs();
+				cond_resched_tasks_rcu_qs();
 				WRITE_ONCE(rsp->gp_activity, jiffies);
 				ret = 0; /* Force full wait till next FQS. */
 				j = jiffies_till_next_fqs;
@@ -2260,7 +2208,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
 				}
 			} else {
 				/* Deal with stray signal. */
-				cond_resched_rcu_qs();
+				cond_resched_tasks_rcu_qs();
 				WRITE_ONCE(rsp->gp_activity, jiffies);
 				WARN_ON(signal_pending(current));
 				trace_rcu_grace_period(rsp->name,
@@ -2283,71 +2231,6 @@ static int __noreturn rcu_gp_kthread(void *arg)
 }
 
 /*
- * Start a new RCU grace period if warranted, re-initializing the hierarchy
- * in preparation for detecting the next grace period.  The caller must hold
- * the root node's ->lock and hard irqs must be disabled.
- *
- * Note that it is legal for a dying CPU (which is marked as offline) to
- * invoke this function.  This can happen when the dying CPU reports its
- * quiescent state.
- *
- * Returns true if the grace-period kthread must be awakened.
- */
-static bool
-rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
-		      struct rcu_data *rdp)
-{
-	raw_lockdep_assert_held_rcu_node(rnp);
-	if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
-		/*
-		 * Either we have not yet spawned the grace-period
-		 * task, this CPU does not need another grace period,
-		 * or a grace period is already in progress.
-		 * Either way, don't start a new grace period.
-		 */
-		return false;
-	}
-	WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT);
-	trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum),
-			       TPS("newreq"));
-
-	/*
-	 * We can't do wakeups while holding the rnp->lock, as that
-	 * could cause possible deadlocks with the rq->lock. Defer
-	 * the wakeup to our caller.
-	 */
-	return true;
-}
-
-/*
- * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's
- * callbacks.  Note that rcu_start_gp_advanced() cannot do this because it
- * is invoked indirectly from rcu_advance_cbs(), which would result in
- * endless recursion -- or would do so if it wasn't for the self-deadlock
- * that is encountered beforehand.
- *
- * Returns true if the grace-period kthread needs to be awakened.
- */
-static bool rcu_start_gp(struct rcu_state *rsp)
-{
-	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
-	struct rcu_node *rnp = rcu_get_root(rsp);
-	bool ret = false;
-
-	/*
-	 * If there is no grace period in progress right now, any
-	 * callbacks we have up to this point will be satisfied by the
-	 * next grace period.  Also, advancing the callbacks reduces the
-	 * probability of false positives from cpu_needs_another_gp()
-	 * resulting in pointless grace periods.  So, advance callbacks
-	 * then start the grace period!
-	 */
-	ret = rcu_advance_cbs(rsp, rnp, rdp) || ret;
-	ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret;
-	return ret;
-}
-
-/*
  * Report a full set of quiescent states to the specified rcu_state data
  * structure.  Invoke rcu_gp_kthread_wake() to awaken the grace-period
  * kthread if another grace period is required.  Whether we wake
@@ -2398,7 +2281,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
 			return;
 		}
 		WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
-		WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 &&
+		WARN_ON_ONCE(!rcu_is_leaf_node(rnp) &&
 			     rcu_preempt_blocked_readers_cgp(rnp));
 		rnp->qsmask &= ~mask;
 		trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
@@ -2782,7 +2665,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp))
 	struct rcu_node *rnp;
 
 	rcu_for_each_leaf_node(rsp, rnp) {
-		cond_resched_rcu_qs();
+		cond_resched_tasks_rcu_qs();
 		mask = 0;
 		raw_spin_lock_irqsave_rcu_node(rnp, flags);
 		if (rnp->qsmask == 0) {
@@ -2874,22 +2757,27 @@ __rcu_process_callbacks(struct rcu_state *rsp)
 	unsigned long flags;
 	bool needwake;
 	struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
+	struct rcu_node *rnp;
 
 	WARN_ON_ONCE(!rdp->beenonline);
 
 	/* Update RCU state based on any recent quiescent states. */
 	rcu_check_quiescent_state(rsp, rdp);
 
-	/* Does this CPU require a not-yet-started grace period? */
-	local_irq_save(flags);
-	if (cpu_needs_another_gp(rsp, rdp)) {
-		raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */
-		needwake = rcu_start_gp(rsp);
-		raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
-		if (needwake)
-			rcu_gp_kthread_wake(rsp);
-	} else {
-		local_irq_restore(flags);
+	/* No grace period and unregistered callbacks? */
+	if (!rcu_gp_in_progress(rsp) &&
+	    rcu_segcblist_is_enabled(&rdp->cblist)) {
+		local_irq_save(flags);
+		if (rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) {
+			local_irq_restore(flags);
+		} else {
+			rnp = rdp->mynode;
+			raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
+			needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
+			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+			if (needwake)
+				rcu_gp_kthread_wake(rsp);
+		}
 	}
 
 	/* If there are callbacks ready, invoke them. */
@@ -2973,11 +2861,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
 
 		/* Start a new grace period if one not already started. */
 		if (!rcu_gp_in_progress(rsp)) {
-			struct rcu_node *rnp_root = rcu_get_root(rsp);
+			struct rcu_node *rnp = rdp->mynode;
 
-			raw_spin_lock_rcu_node(rnp_root);
-			needwake = rcu_start_gp(rsp);
-			raw_spin_unlock_rcu_node(rnp_root);
+			raw_spin_lock_rcu_node(rnp);
+			needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
+			raw_spin_unlock_rcu_node(rnp);
 			if (needwake)
 				rcu_gp_kthread_wake(rsp);
 		} else {
@@ -3368,7 +3256,9 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 		return 1;
 
 	/* Has RCU gone idle with this CPU needing another grace period? */
-	if (cpu_needs_another_gp(rsp, rdp))
+	if (!rcu_gp_in_progress(rsp) &&
+	    rcu_segcblist_is_enabled(&rdp->cblist) &&
+	    !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
 		return 1;
 
 	/* Has another RCU grace period completed?  */
@@ -3775,6 +3665,8 @@ int rcutree_dead_cpu(unsigned int cpu)
 	return 0;
 }
 
+static DEFINE_PER_CPU(int, rcu_cpu_started);
+
 /*
  * Mark the specified CPU as being online so that subsequent grace periods
  * (both expedited and normal) will wait on it.  Note that this means that
@@ -3796,6 +3688,11 @@ void rcu_cpu_starting(unsigned int cpu)
 	struct rcu_node *rnp;
 	struct rcu_state *rsp;
 
+	if (per_cpu(rcu_cpu_started, cpu))
+		return;
+
+	per_cpu(rcu_cpu_started, cpu) = 1;
+
 	for_each_rcu_flavor(rsp) {
 		rdp = per_cpu_ptr(rsp->rda, cpu);
 		rnp = rdp->mynode;
@@ -3852,6 +3749,8 @@ void rcu_report_dead(unsigned int cpu)
 	preempt_enable();
 	for_each_rcu_flavor(rsp)
 		rcu_cleanup_dying_idle_cpu(cpu, rsp);
+
+	per_cpu(rcu_cpu_started, cpu) = 0;
 }
 
 /* Migrate the dead CPU's callbacks to the current CPU. */
@@ -3861,6 +3760,7 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp)
 	struct rcu_data *my_rdp;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
+	bool needwake;
 
 	if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist))
 		return;  /* No callbacks to migrate. */
@@ -3872,12 +3772,15 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp)
 		return;
 	}
 	raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
-	rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */
-	rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */
+	/* Leverage recent GPs and set GP for new callbacks. */
+	needwake = rcu_advance_cbs(rsp, rnp_root, rdp) ||
+		   rcu_advance_cbs(rsp, rnp_root, my_rdp);
 	rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
 	WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
 		     !rcu_segcblist_n_cbs(&my_rdp->cblist));
 	raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags);
+	if (needwake)
+		rcu_gp_kthread_wake(rsp);
 	WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
 		  !rcu_segcblist_empty(&rdp->cblist),
 		  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
@@ -4056,7 +3959,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
 
 	init_swait_queue_head(&rsp->gp_wq);
 	init_swait_queue_head(&rsp->expedited_wq);
-	rnp = rsp->level[rcu_num_lvls - 1];
+	rnp = rcu_first_leaf_node(rsp);
 	for_each_possible_cpu(i) {
 		while (i > rnp->grphi)
 			rnp++;
@@ -4168,6 +4071,7 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp)
 }
 
 struct workqueue_struct *rcu_gp_wq;
+struct workqueue_struct *rcu_par_gp_wq;
 
 void __init rcu_init(void)
 {
@@ -4199,6 +4103,8 @@ void __init rcu_init(void)
 	/* Create workqueue for expedited GPs and for Tree SRCU. */
 	rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
 	WARN_ON(!rcu_gp_wq);
+	rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
+	WARN_ON(!rcu_par_gp_wq);
 }
 
 #include "tree_exp.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index f491ab4f2e8e..78e051dffc5b 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -58,6 +58,14 @@ struct rcu_dynticks {
 #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
 };
 
+/* Communicate arguments to a workqueue handler. */
+struct rcu_exp_work {
+	smp_call_func_t rew_func;
+	struct rcu_state *rew_rsp;
+	unsigned long rew_s;
+	struct work_struct rew_work;
+};
+
 /* RCU's kthread states for tracing. */
 #define RCU_KTHREAD_STOPPED  0
 #define RCU_KTHREAD_RUNNING  1
@@ -150,15 +158,32 @@ struct rcu_node {
 	struct swait_queue_head nocb_gp_wq[2];
 				/* Place for rcu_nocb_kthread() to wait GP. */
 #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
-	int need_future_gp[2];
-				/* Counts of upcoming no-CB GP requests. */
+	u8 need_future_gp[4];	/* Counts of upcoming GP requests. */
 	raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
 
 	spinlock_t exp_lock ____cacheline_internodealigned_in_smp;
 	unsigned long exp_seq_rq;
 	wait_queue_head_t exp_wq[4];
+	struct rcu_exp_work rew;
+	bool exp_need_flush;	/* Need to flush workitem? */
 } ____cacheline_internodealigned_in_smp;
 
+/* Accessors for ->need_future_gp[] array. */
+#define need_future_gp_mask() \
+	(ARRAY_SIZE(((struct rcu_node *)NULL)->need_future_gp) - 1)
+#define need_future_gp_element(rnp, c) \
+	((rnp)->need_future_gp[(c) & need_future_gp_mask()])
+#define need_any_future_gp(rnp)						\
+({									\
+	int __i;							\
+	bool __nonzero = false;						\
+									\
+	for (__i = 0; __i < ARRAY_SIZE((rnp)->need_future_gp); __i++)	\
+		__nonzero = __nonzero ||				\
+			    READ_ONCE((rnp)->need_future_gp[__i]);	\
+	__nonzero;							\
+})
+
 /*
  * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and
  * are indexed relative to this interval rather than the global CPU ID space.
@@ -224,10 +249,6 @@ struct rcu_data {
 #ifdef CONFIG_RCU_FAST_NO_HZ
 	struct rcu_head oom_head;
 #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
-	atomic_long_t exp_workdone0;	/* # done by workqueue. */
-	atomic_long_t exp_workdone1;	/* # done by others #1. */
-	atomic_long_t exp_workdone2;	/* # done by others #2. */
-	atomic_long_t exp_workdone3;	/* # done by others #3. */
 	int exp_dynticks_snap;		/* Double-check need for IPI. */
 
 	/* 6) Callback offloading. */
@@ -408,7 +429,6 @@ extern struct rcu_state rcu_preempt_state;
 #endif /* #ifdef CONFIG_PREEMPT_RCU */
 
 int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
-bool rcu_eqs_special_set(int cpu);
 
 #ifdef CONFIG_RCU_BOOST
 DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
@@ -438,7 +458,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
 static void invoke_rcu_callbacks_kthread(void);
 static bool rcu_is_callbacks_kthread(void);
 #ifdef CONFIG_RCU_BOOST
-static void rcu_preempt_do_callbacks(void);
 static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
 						 struct rcu_node *rnp);
 #endif /* #ifdef CONFIG_RCU_BOOST */
@@ -454,7 +473,6 @@ static void print_cpu_stall_info_end(void);
 static void zero_cpu_stall_ticks(struct rcu_data *rdp);
 static void increment_cpu_stall_ticks(void);
 static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
-static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
 static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
 static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
 static void rcu_init_one_nocb(struct rcu_node *rnp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index f72eefab8543..d40708e8c5d6 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -20,6 +20,8 @@
  * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
  */
 
+#include <linux/lockdep.h>
+
 /*
  * Record the start of an expedited grace period.
  */
@@ -154,15 +156,35 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
  * for the current expedited grace period.  Works only for preemptible
  * RCU -- other RCU implementation use other means.
  *
- * Caller must hold the rcu_state's exp_mutex.
+ * Caller must hold the specificed rcu_node structure's ->lock
  */
 static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp)
 {
+	raw_lockdep_assert_held_rcu_node(rnp);
+
 	return rnp->exp_tasks == NULL &&
 	       READ_ONCE(rnp->expmask) == 0;
 }
 
 /*
+ * Like sync_rcu_preempt_exp_done(), but this function assumes the caller
+ * doesn't hold the rcu_node's ->lock, and will acquire and release the lock
+ * itself
+ */
+static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp)
+{
+	unsigned long flags;
+	bool ret;
+
+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
+	ret = sync_rcu_preempt_exp_done(rnp);
+	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+	return ret;
+}
+
+
+/*
  * Report the exit from RCU read-side critical section for the last task
  * that queued itself during or before the current expedited preemptible-RCU
  * grace period.  This event is reported either to the rcu_node structure on
@@ -170,8 +192,7 @@ static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp)
  * recursively up the tree.  (Calm down, calm down, we do the recursion
  * iteratively!)
  *
- * Caller must hold the rcu_state's exp_mutex and the specified rcu_node
- * structure's ->lock.
+ * Caller must hold the specified rcu_node structure's ->lock.
  */
 static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
 				 bool wake, unsigned long flags)
@@ -207,8 +228,6 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
 /*
  * Report expedited quiescent state for specified node.  This is a
  * lock-acquisition wrapper function for __rcu_report_exp_rnp().
- *
- * Caller must hold the rcu_state's exp_mutex.
  */
 static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
 					      struct rcu_node *rnp, bool wake)
@@ -221,8 +240,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
 
 /*
  * Report expedited quiescent state for multiple CPUs, all covered by the
- * specified leaf rcu_node structure.  Caller must hold the rcu_state's
- * exp_mutex.
+ * specified leaf rcu_node structure.
  */
 static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
 				    unsigned long mask, bool wake)
@@ -248,14 +266,12 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
 }
 
 /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
-static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat,
-			       unsigned long s)
+static bool sync_exp_work_done(struct rcu_state *rsp, unsigned long s)
 {
 	if (rcu_exp_gp_seq_done(rsp, s)) {
 		trace_rcu_exp_grace_period(rsp->name, s, TPS("done"));
 		/* Ensure test happens before caller kfree(). */
 		smp_mb__before_atomic(); /* ^^^ */
-		atomic_long_inc(stat);
 		return true;
 	}
 	return false;
@@ -289,7 +305,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 	 * promoting locality and is not strictly needed for correctness.
 	 */
 	for (; rnp != NULL; rnp = rnp->parent) {
-		if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s))
+		if (sync_exp_work_done(rsp, s))
 			return true;
 
 		/* Work not done, either wait here or go up. */
@@ -302,8 +318,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 						  rnp->grplo, rnp->grphi,
 						  TPS("wait"));
 			wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
-				   sync_exp_work_done(rsp,
-						      &rdp->exp_workdone2, s));
+				   sync_exp_work_done(rsp, s));
 			return true;
 		}
 		rnp->exp_seq_rq = s; /* Followers can wait on us. */
@@ -313,7 +328,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 	}
 	mutex_lock(&rsp->exp_mutex);
 fastpath:
-	if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) {
+	if (sync_exp_work_done(rsp, s)) {
 		mutex_unlock(&rsp->exp_mutex);
 		return true;
 	}
@@ -362,93 +377,129 @@ static void sync_sched_exp_online_cleanup(int cpu)
 }
 
 /*
- * Select the nodes that the upcoming expedited grace period needs
- * to wait for.
+ * Select the CPUs within the specified rcu_node that the upcoming
+ * expedited grace period needs to wait for.
  */
-static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
-				     smp_call_func_t func)
+static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
 {
 	int cpu;
 	unsigned long flags;
+	smp_call_func_t func;
 	unsigned long mask_ofl_test;
 	unsigned long mask_ofl_ipi;
 	int ret;
-	struct rcu_node *rnp;
-
-	trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset"));
-	sync_exp_reset_tree(rsp);
-	trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select"));
-	rcu_for_each_leaf_node(rsp, rnp) {
-		raw_spin_lock_irqsave_rcu_node(rnp, flags);
+	struct rcu_exp_work *rewp =
+		container_of(wp, struct rcu_exp_work, rew_work);
+	struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew);
+	struct rcu_state *rsp = rewp->rew_rsp;
 
-		/* Each pass checks a CPU for identity, offline, and idle. */
-		mask_ofl_test = 0;
-		for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
-			unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
-			struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
-			struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu);
-			int snap;
+	func = rewp->rew_func;
+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
 
-			if (raw_smp_processor_id() == cpu ||
-			    !(rnp->qsmaskinitnext & mask)) {
+	/* Each pass checks a CPU for identity, offline, and idle. */
+	mask_ofl_test = 0;
+	for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+		unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+		struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+		struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu);
+		int snap;
+
+		if (raw_smp_processor_id() == cpu ||
+		    !(rnp->qsmaskinitnext & mask)) {
+			mask_ofl_test |= mask;
+		} else {
+			snap = rcu_dynticks_snap(rdtp);
+			if (rcu_dynticks_in_eqs(snap))
 				mask_ofl_test |= mask;
-			} else {
-				snap = rcu_dynticks_snap(rdtp);
-				if (rcu_dynticks_in_eqs(snap))
-					mask_ofl_test |= mask;
-				else
-					rdp->exp_dynticks_snap = snap;
-			}
+			else
+				rdp->exp_dynticks_snap = snap;
 		}
-		mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
-
-		/*
-		 * Need to wait for any blocked tasks as well.  Note that
-		 * additional blocking tasks will also block the expedited
-		 * GP until such time as the ->expmask bits are cleared.
-		 */
-		if (rcu_preempt_has_tasks(rnp))
-			rnp->exp_tasks = rnp->blkd_tasks.next;
-		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+	}
+	mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
 
-		/* IPI the remaining CPUs for expedited quiescent state. */
-		for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
-			unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
-			struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+	/*
+	 * Need to wait for any blocked tasks as well.	Note that
+	 * additional blocking tasks will also block the expedited GP
+	 * until such time as the ->expmask bits are cleared.
+	 */
+	if (rcu_preempt_has_tasks(rnp))
+		rnp->exp_tasks = rnp->blkd_tasks.next;
+	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 
-			if (!(mask_ofl_ipi & mask))
-				continue;
+	/* IPI the remaining CPUs for expedited quiescent state. */
+	for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+		unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+		struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+
+		if (!(mask_ofl_ipi & mask))
+			continue;
 retry_ipi:
-			if (rcu_dynticks_in_eqs_since(rdp->dynticks,
-						      rdp->exp_dynticks_snap)) {
-				mask_ofl_test |= mask;
-				continue;
-			}
-			ret = smp_call_function_single(cpu, func, rsp, 0);
-			if (!ret) {
-				mask_ofl_ipi &= ~mask;
-				continue;
-			}
-			/* Failed, raced with CPU hotplug operation. */
-			raw_spin_lock_irqsave_rcu_node(rnp, flags);
-			if ((rnp->qsmaskinitnext & mask) &&
-			    (rnp->expmask & mask)) {
-				/* Online, so delay for a bit and try again. */
-				raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-				trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl"));
-				schedule_timeout_uninterruptible(1);
-				goto retry_ipi;
-			}
-			/* CPU really is offline, so we can ignore it. */
-			if (!(rnp->expmask & mask))
-				mask_ofl_ipi &= ~mask;
+		if (rcu_dynticks_in_eqs_since(rdp->dynticks,
+					      rdp->exp_dynticks_snap)) {
+			mask_ofl_test |= mask;
+			continue;
+		}
+		ret = smp_call_function_single(cpu, func, rsp, 0);
+		if (!ret) {
+			mask_ofl_ipi &= ~mask;
+			continue;
+		}
+		/* Failed, raced with CPU hotplug operation. */
+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
+		if ((rnp->qsmaskinitnext & mask) &&
+		    (rnp->expmask & mask)) {
+			/* Online, so delay for a bit and try again. */
 			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+			trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl"));
+			schedule_timeout_uninterruptible(1);
+			goto retry_ipi;
+		}
+		/* CPU really is offline, so we can ignore it. */
+		if (!(rnp->expmask & mask))
+			mask_ofl_ipi &= ~mask;
+		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+	}
+	/* Report quiescent states for those that went offline. */
+	mask_ofl_test |= mask_ofl_ipi;
+	if (mask_ofl_test)
+		rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
+}
+
+/*
+ * Select the nodes that the upcoming expedited grace period needs
+ * to wait for.
+ */
+static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
+				     smp_call_func_t func)
+{
+	struct rcu_node *rnp;
+
+	trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset"));
+	sync_exp_reset_tree(rsp);
+	trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select"));
+
+	/* Schedule work for each leaf rcu_node structure. */
+	rcu_for_each_leaf_node(rsp, rnp) {
+		rnp->exp_need_flush = false;
+		if (!READ_ONCE(rnp->expmask))
+			continue; /* Avoid early boot non-existent wq. */
+		rnp->rew.rew_func = func;
+		rnp->rew.rew_rsp = rsp;
+		if (!READ_ONCE(rcu_par_gp_wq) ||
+		    rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
+			/* No workqueues yet. */
+			sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work);
+			continue;
 		}
-		/* Report quiescent states for those that went offline. */
-		mask_ofl_test |= mask_ofl_ipi;
-		if (mask_ofl_test)
-			rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
+		INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+		queue_work_on(rnp->grplo, rcu_par_gp_wq, &rnp->rew.rew_work);
+		rnp->exp_need_flush = true;
 	}
+
+	/* Wait for workqueue jobs (if any) to complete. */
+	rcu_for_each_leaf_node(rsp, rnp)
+		if (rnp->exp_need_flush)
+			flush_work(&rnp->rew.rew_work);
 }
 
 static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
@@ -469,9 +520,9 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 	for (;;) {
 		ret = swait_event_timeout(
 				rsp->expedited_wq,
-				sync_rcu_preempt_exp_done(rnp_root),
+				sync_rcu_preempt_exp_done_unlocked(rnp_root),
 				jiffies_stall);
-		if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root))
+		if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root))
 			return;
 		WARN_ON(ret < 0);  /* workqueues should not be signaled. */
 		if (rcu_cpu_stall_suppress)
@@ -504,7 +555,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 			rcu_for_each_node_breadth_first(rsp, rnp) {
 				if (rnp == rnp_root)
 					continue; /* printed unconditionally */
-				if (sync_rcu_preempt_exp_done(rnp))
+				if (sync_rcu_preempt_exp_done_unlocked(rnp))
 					continue;
 				pr_cont(" l=%u:%d-%d:%#lx/%c",
 					rnp->level, rnp->grplo, rnp->grphi,
@@ -560,14 +611,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
 	mutex_unlock(&rsp->exp_wake_mutex);
 }
 
-/* Let the workqueue handler know what it is supposed to do. */
-struct rcu_exp_work {
-	smp_call_func_t rew_func;
-	struct rcu_state *rew_rsp;
-	unsigned long rew_s;
-	struct work_struct rew_work;
-};
-
 /*
  * Common code to drive an expedited grace period forward, used by
  * workqueues and mid-boot-time tasks.
@@ -633,7 +676,7 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
 	rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
 	rnp = rcu_get_root(rsp);
 	wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
-		   sync_exp_work_done(rsp, &rdp->exp_workdone0, s));
+		   sync_exp_work_done(rsp, s));
 	smp_mb(); /* Workqueue actions happen before return. */
 
 	/* Let the next expedited grace period start. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 84fbee4686d3..7fd12039e512 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -182,7 +182,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
 
 	raw_lockdep_assert_held_rcu_node(rnp);
 	WARN_ON_ONCE(rdp->mynode != rnp);
-	WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
+	WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
 
 	/*
 	 * Decide where to queue the newly blocked task.  In theory,
@@ -384,6 +384,50 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
 }
 
 /*
+ * Preemptible RCU implementation for rcu_read_lock().
+ * Just increment ->rcu_read_lock_nesting, shared state will be updated
+ * if we block.
+ */
+void __rcu_read_lock(void)
+{
+	current->rcu_read_lock_nesting++;
+	barrier();  /* critical section after entry code. */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_lock);
+
+/*
+ * Preemptible RCU implementation for rcu_read_unlock().
+ * Decrement ->rcu_read_lock_nesting.  If the result is zero (outermost
+ * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
+ * invoke rcu_read_unlock_special() to clean up after a context switch
+ * in an RCU read-side critical section and other special cases.
+ */
+void __rcu_read_unlock(void)
+{
+	struct task_struct *t = current;
+
+	if (t->rcu_read_lock_nesting != 1) {
+		--t->rcu_read_lock_nesting;
+	} else {
+		barrier();  /* critical section before exit code. */
+		t->rcu_read_lock_nesting = INT_MIN;
+		barrier();  /* assign before ->rcu_read_unlock_special load */
+		if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
+			rcu_read_unlock_special(t);
+		barrier();  /* ->rcu_read_unlock_special load before assign */
+		t->rcu_read_lock_nesting = 0;
+	}
+#ifdef CONFIG_PROVE_LOCKING
+	{
+		int rrln = READ_ONCE(t->rcu_read_lock_nesting);
+
+		WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
+	}
+#endif /* #ifdef CONFIG_PROVE_LOCKING */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
+
+/*
  * Advance a ->blkd_tasks-list pointer to the next entry, instead
  * returning NULL if at the end of the list.
  */
@@ -489,7 +533,7 @@ void rcu_read_unlock_special(struct task_struct *t)
 		rnp = t->rcu_blocked_node;
 		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
 		WARN_ON_ONCE(rnp != t->rcu_blocked_node);
-		WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
+		WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
 		empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
 		empty_exp = sync_rcu_preempt_exp_done(rnp);
 		smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
@@ -685,15 +729,6 @@ static void rcu_preempt_check_callbacks(void)
 		t->rcu_read_unlock_special.b.need_qs = true;
 }
 
-#ifdef CONFIG_RCU_BOOST
-
-static void rcu_preempt_do_callbacks(void)
-{
-	rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
-}
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
 /**
  * call_rcu() - Queue an RCU callback for invocation after a grace period.
  * @head: structure to be used for queueing the RCU updates.
@@ -1140,7 +1175,7 @@ static void rcu_kthread_do_work(void)
 {
 	rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
 	rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
-	rcu_preempt_do_callbacks();
+	rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
 }
 
 static void rcu_cpu_kthread_setup(unsigned int cpu)
@@ -1607,7 +1642,7 @@ static int rcu_oom_notify(struct notifier_block *self,
 
 	for_each_online_cpu(cpu) {
 		smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
-		cond_resched_rcu_qs();
+		cond_resched_tasks_rcu_qs();
 	}
 
 	/* Unconditionally decrement: no need to wake ourselves up. */
@@ -1780,19 +1815,6 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
 	swake_up_all(sq);
 }
 
-/*
- * Set the root rcu_node structure's ->need_future_gp field
- * based on the sum of those of all rcu_node structures.  This does
- * double-count the root rcu_node structure's requests, but this
- * is necessary to handle the possibility of a rcu_nocb_kthread()
- * having awakened during the time that the rcu_node structures
- * were being updated for the end of the previous grace period.
- */
-static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
-{
-	rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
-}
-
 static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
 {
 	return &rnp->nocb_gp_wq[rnp->completed & 0x1];
@@ -1966,7 +1988,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
 			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
 					    TPS("WakeOvf"));
 		} else {
-			wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
+			wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE,
 					       TPS("WakeOvfIsDeferred"));
 		}
 		rdp->qlen_last_fqs_check = LONG_MAX / 2;
@@ -2048,7 +2070,8 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
 	struct rcu_node *rnp = rdp->mynode;
 
 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
-	needwake = rcu_start_future_gp(rnp, rdp, &c);
+	c = rcu_cbs_completed(rdp->rsp, rnp);
+	needwake = rcu_start_this_gp(rnp, rdp, c);
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	if (needwake)
 		rcu_gp_kthread_wake(rdp->rsp);
@@ -2057,7 +2080,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
 	 * Wait for the grace period.  Do so interruptibly to avoid messing
 	 * up the load average.
 	 */
-	trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
+	trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait"));
 	for (;;) {
 		swait_event_interruptible(
 			rnp->nocb_gp_wq[c & 0x1],
@@ -2065,9 +2088,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
 		if (likely(d))
 			break;
 		WARN_ON(signal_pending(current));
-		trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
+		trace_rcu_this_gp(rnp, rdp, c, TPS("ResumeWait"));
 	}
-	trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
+	trace_rcu_this_gp(rnp, rdp, c, TPS("EndWait"));
 	smp_mb(); /* Ensure that CB invocation happens after GP end. */
 }
 
@@ -2236,7 +2259,7 @@ static int rcu_nocb_kthread(void *arg)
 				cl++;
 			c++;
 			local_bh_enable();
-			cond_resched_rcu_qs();
+			cond_resched_tasks_rcu_qs();
 			list = next;
 		}
 		trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
@@ -2292,7 +2315,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
 void __init rcu_init_nohz(void)
 {
 	int cpu;
-	bool need_rcu_nocb_mask = true;
+	bool need_rcu_nocb_mask = false;
 	struct rcu_state *rsp;
 
 #if defined(CONFIG_NO_HZ_FULL)
@@ -2315,7 +2338,7 @@ void __init rcu_init_nohz(void)
 #endif /* #if defined(CONFIG_NO_HZ_FULL) */
 
 	if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
-		pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
+		pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
 		cpumask_and(rcu_nocb_mask, cpu_possible_mask,
 			    rcu_nocb_mask);
 	}
@@ -2495,10 +2518,6 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
 {
 }
 
-static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
-{
-}
-
 static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
 {
 	return NULL;
@@ -2587,8 +2606,7 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
 }
 
 /*
- * Bind the grace-period kthread for the sysidle flavor of RCU to the
- * timekeeping CPU.
+ * Bind the RCU grace-period kthreads to the housekeeping CPU.
  */
 static void rcu_bind_gp_kthread(void)
 {
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 68fa19a5e7bd..4c230a60ece4 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -226,54 +226,6 @@ core_initcall(rcu_set_runtime_mode);
 
 #endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */
 
-#ifdef CONFIG_PREEMPT_RCU
-
-/*
- * Preemptible RCU implementation for rcu_read_lock().
- * Just increment ->rcu_read_lock_nesting, shared state will be updated
- * if we block.
- */
-void __rcu_read_lock(void)
-{
-	current->rcu_read_lock_nesting++;
-	barrier();  /* critical section after entry code. */
-}
-EXPORT_SYMBOL_GPL(__rcu_read_lock);
-
-/*
- * Preemptible RCU implementation for rcu_read_unlock().
- * Decrement ->rcu_read_lock_nesting.  If the result is zero (outermost
- * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
- * invoke rcu_read_unlock_special() to clean up after a context switch
- * in an RCU read-side critical section and other special cases.
- */
-void __rcu_read_unlock(void)
-{
-	struct task_struct *t = current;
-
-	if (t->rcu_read_lock_nesting != 1) {
-		--t->rcu_read_lock_nesting;
-	} else {
-		barrier();  /* critical section before exit code. */
-		t->rcu_read_lock_nesting = INT_MIN;
-		barrier();  /* assign before ->rcu_read_unlock_special load */
-		if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
-			rcu_read_unlock_special(t);
-		barrier();  /* ->rcu_read_unlock_special load before assign */
-		t->rcu_read_lock_nesting = 0;
-	}
-#ifdef CONFIG_PROVE_LOCKING
-	{
-		int rrln = READ_ONCE(t->rcu_read_lock_nesting);
-
-		WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
-	}
-#endif /* #ifdef CONFIG_PROVE_LOCKING */
-}
-EXPORT_SYMBOL_GPL(__rcu_read_unlock);
-
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
-
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key rcu_lock_key;
 struct lockdep_map rcu_lock_map =
@@ -624,7 +576,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks);
  * grace period has elapsed, in other words after all currently
  * executing rcu-tasks read-side critical sections have elapsed.  These
  * read-side critical sections are delimited by calls to schedule(),
- * cond_resched_rcu_qs(), idle execution, userspace execution, calls
+ * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls
  * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched().
  *
  * This is a very specialized primitive, intended only for a few uses in
diff --git a/kernel/resource.c b/kernel/resource.c
index e270b5048988..b589dda910b3 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -87,7 +87,7 @@ enum { MAX_IORES_LEVEL = 5 };
 static void *r_start(struct seq_file *m, loff_t *pos)
 	__acquires(resource_lock)
 {
-	struct resource *p = m->private;
+	struct resource *p = PDE_DATA(file_inode(m->file));
 	loff_t l = 0;
 	read_lock(&resource_lock);
 	for (p = p->child; p && l < *pos; p = r_next(m, p, &l))
@@ -103,7 +103,7 @@ static void r_stop(struct seq_file *m, void *v)
 
 static int r_show(struct seq_file *m, void *v)
 {
-	struct resource *root = m->private;
+	struct resource *root = PDE_DATA(file_inode(m->file));
 	struct resource *r = v, *p;
 	unsigned long long start, end;
 	int width = root->end < 0x10000 ? 4 : 8;
@@ -135,44 +135,11 @@ static const struct seq_operations resource_op = {
 	.show	= r_show,
 };
 
-static int ioports_open(struct inode *inode, struct file *file)
-{
-	int res = seq_open(file, &resource_op);
-	if (!res) {
-		struct seq_file *m = file->private_data;
-		m->private = &ioport_resource;
-	}
-	return res;
-}
-
-static int iomem_open(struct inode *inode, struct file *file)
-{
-	int res = seq_open(file, &resource_op);
-	if (!res) {
-		struct seq_file *m = file->private_data;
-		m->private = &iomem_resource;
-	}
-	return res;
-}
-
-static const struct file_operations proc_ioports_operations = {
-	.open		= ioports_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-static const struct file_operations proc_iomem_operations = {
-	.open		= iomem_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
 static int __init ioresources_init(void)
 {
-	proc_create("ioports", 0, NULL, &proc_ioports_operations);
-	proc_create("iomem", 0, NULL, &proc_iomem_operations);
+	proc_create_seq_data("ioports", 0, NULL, &resource_op,
+			&ioport_resource);
+	proc_create_seq_data("iomem", 0, NULL, &resource_op, &iomem_resource);
 	return 0;
 }
 __initcall(ioresources_init);
@@ -651,7 +618,8 @@ static int __find_resource(struct resource *root, struct resource *old,
 			alloc.start = constraint->alignf(constraint->alignf_data, &avail,
 					size, constraint->align);
 			alloc.end = alloc.start + size - 1;
-			if (resource_contains(&avail, &alloc)) {
+			if (alloc.start <= alloc.end &&
+			    resource_contains(&avail, &alloc)) {
 				new->start = alloc.start;
 				new->end = alloc.end;
 				return 0;
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 6be6c575b6cd..2d4ff5353ded 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -2,6 +2,7 @@
 /*
  * Auto-group scheduling implementation:
  */
+#include <linux/nospec.h>
 #include "sched.h"
 
 unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
@@ -209,7 +210,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
 	static unsigned long next = INITIAL_JIFFIES;
 	struct autogroup *ag;
 	unsigned long shares;
-	int err;
+	int err, idx;
 
 	if (nice < MIN_NICE || nice > MAX_NICE)
 		return -EINVAL;
@@ -227,7 +228,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
 
 	next = HZ / 10 + jiffies;
 	ag = autogroup_task_get(p);
-	shares = scale_load(sched_prio_to_weight[nice + 20]);
+
+	idx = array_index_nospec(nice + 20, 40);
+	shares = scale_load(sched_prio_to_weight[idx]);
 
 	down_write(&ag->lock);
 	err = sched_group_set_shares(ag->tg, shares);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e8afd6086f23..e9866f86f304 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7,6 +7,9 @@
  */
 #include "sched.h"
 
+#include <linux/kthread.h>
+#include <linux/nospec.h>
+
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
 
@@ -874,10 +877,37 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 	 * this case, we can save a useless back to back clock update.
 	 */
 	if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
-		rq_clock_skip_update(rq, true);
+		rq_clock_skip_update(rq);
 }
 
 #ifdef CONFIG_SMP
+
+static inline bool is_per_cpu_kthread(struct task_struct *p)
+{
+	if (!(p->flags & PF_KTHREAD))
+		return false;
+
+	if (p->nr_cpus_allowed != 1)
+		return false;
+
+	return true;
+}
+
+/*
+ * Per-CPU kthreads are allowed to run on !actie && online CPUs, see
+ * __set_cpus_allowed_ptr() and select_fallback_rq().
+ */
+static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
+{
+	if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+		return false;
+
+	if (is_per_cpu_kthread(p))
+		return cpu_online(cpu);
+
+	return cpu_active(cpu);
+}
+
 /*
  * This is how migration works:
  *
@@ -935,16 +965,8 @@ struct migration_arg {
 static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
 				 struct task_struct *p, int dest_cpu)
 {
-	if (p->flags & PF_KTHREAD) {
-		if (unlikely(!cpu_online(dest_cpu)))
-			return rq;
-	} else {
-		if (unlikely(!cpu_active(dest_cpu)))
-			return rq;
-	}
-
 	/* Affinity changed (again). */
-	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+	if (!is_cpu_allowed(p, dest_cpu))
 		return rq;
 
 	update_rq_clock(rq);
@@ -1473,10 +1495,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 	for (;;) {
 		/* Any allowed, online CPU? */
 		for_each_cpu(dest_cpu, &p->cpus_allowed) {
-			if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
-				continue;
-			if (!cpu_online(dest_cpu))
+			if (!is_cpu_allowed(p, dest_cpu))
 				continue;
+
 			goto out;
 		}
 
@@ -1539,8 +1560,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 	 * [ this allows ->select_task() to simply return task_cpu(p) and
 	 *   not worry about this generic constraint ]
 	 */
-	if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
-		     !cpu_online(cpu)))
+	if (unlikely(!is_cpu_allowed(p, cpu)))
 		cpu = select_fallback_rq(task_cpu(p), p);
 
 	return cpu;
@@ -2174,27 +2194,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
 
-#ifdef CONFIG_NUMA_BALANCING
-	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
-		p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
-		p->mm->numa_scan_seq = 0;
-	}
-
-	if (clone_flags & CLONE_VM)
-		p->numa_preferred_nid = current->numa_preferred_nid;
-	else
-		p->numa_preferred_nid = -1;
-
-	p->node_stamp = 0ULL;
-	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
-	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
-	p->numa_work.next = &p->numa_work;
-	p->numa_faults = NULL;
-	p->last_task_numa_placement = 0;
-	p->last_sum_exec_runtime = 0;
-
-	p->numa_group = NULL;
-#endif /* CONFIG_NUMA_BALANCING */
+	init_numa_balancing(clone_flags, p);
 }
 
 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -2718,20 +2718,28 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 		membarrier_mm_sync_core_before_usermode(mm);
 		mmdrop(mm);
 	}
-	if (unlikely(prev_state == TASK_DEAD)) {
-		if (prev->sched_class->task_dead)
-			prev->sched_class->task_dead(prev);
+	if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) {
+		switch (prev_state) {
+		case TASK_DEAD:
+			if (prev->sched_class->task_dead)
+				prev->sched_class->task_dead(prev);
 
-		/*
-		 * Remove function-return probe instances associated with this
-		 * task and put them back on the free list.
-		 */
-		kprobe_flush_task(prev);
+			/*
+			 * Remove function-return probe instances associated with this
+			 * task and put them back on the free list.
+			 */
+			kprobe_flush_task(prev);
 
-		/* Task is done with its stack. */
-		put_task_stack(prev);
+			/* Task is done with its stack. */
+			put_task_stack(prev);
 
-		put_task_struct(prev);
+			put_task_struct(prev);
+			break;
+
+		case TASK_PARKED:
+			kthread_park_complete(prev);
+			break;
+		}
 	}
 
 	tick_nohz_task_switch();
@@ -3498,23 +3506,8 @@ static void __sched notrace __schedule(bool preempt)
 
 void __noreturn do_task_dead(void)
 {
-	/*
-	 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
-	 * when the following two conditions become true.
-	 *   - There is race condition of mmap_sem (It is acquired by
-	 *     exit_mm()), and
-	 *   - SMI occurs before setting TASK_RUNINNG.
-	 *     (or hypervisor of virtual machine switches to other guest)
-	 *  As a result, we may become TASK_RUNNING after becoming TASK_DEAD
-	 *
-	 * To avoid it, we have to wait for releasing tsk->pi_lock which
-	 * is held by try_to_wake_up()
-	 */
-	raw_spin_lock_irq(&current->pi_lock);
-	raw_spin_unlock_irq(&current->pi_lock);
-
 	/* Causes final put_task_struct in finish_task_switch(): */
-	__set_current_state(TASK_DEAD);
+	set_special_state(TASK_DEAD);
 
 	/* Tell freezer to ignore us: */
 	current->flags |= PF_NOFREEZE;
@@ -4037,6 +4030,23 @@ int idle_cpu(int cpu)
 }
 
 /**
+ * available_idle_cpu - is a given CPU idle for enqueuing work.
+ * @cpu: the CPU in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int available_idle_cpu(int cpu)
+{
+	if (!idle_cpu(cpu))
+		return 0;
+
+	if (vcpu_is_preempted(cpu))
+		return 0;
+
+	return 1;
+}
+
+/**
  * idle_task - return the idle task for a given CPU.
  * @cpu: the processor in question.
  *
@@ -5012,20 +5022,6 @@ int __cond_resched_lock(spinlock_t *lock)
 }
 EXPORT_SYMBOL(__cond_resched_lock);
 
-int __sched __cond_resched_softirq(void)
-{
-	BUG_ON(!in_softirq());
-
-	if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
-		local_bh_enable();
-		preempt_schedule_common();
-		local_bh_disable();
-		return 1;
-	}
-	return 0;
-}
-EXPORT_SYMBOL(__cond_resched_softirq);
-
 /**
  * yield - yield the current processor to other threads.
  *
@@ -6928,11 +6924,15 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
 				     struct cftype *cft, s64 nice)
 {
 	unsigned long weight;
+	int idx;
 
 	if (nice < MIN_NICE || nice > MAX_NICE)
 		return -ERANGE;
 
-	weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO];
+	idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
+	idx = array_index_nospec(idx, 40);
+	weight = sched_prio_to_weight[idx];
+
 	return sched_group_set_shares(css_tg(css), scale_load(weight));
 }
 #endif
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 2b124811947d..3cde46483f0a 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -51,7 +51,7 @@ struct sugov_cpu {
 	bool			iowait_boost_pending;
 	unsigned int		iowait_boost;
 	unsigned int		iowait_boost_max;
-	u64 last_update;
+	u64			last_update;
 
 	/* The fields below are only needed when sharing a policy: */
 	unsigned long		util_cfs;
@@ -89,46 +89,52 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
 	 * schedule the kthread.
 	 */
 	if (sg_policy->policy->fast_switch_enabled &&
-	    !cpufreq_can_do_remote_dvfs(sg_policy->policy))
+	    !cpufreq_this_cpu_can_update(sg_policy->policy))
 		return false;
 
-	if (sg_policy->work_in_progress)
-		return false;
-
-	if (unlikely(sg_policy->need_freq_update)) {
-		sg_policy->need_freq_update = false;
-		/*
-		 * This happens when limits change, so forget the previous
-		 * next_freq value and force an update.
-		 */
-		sg_policy->next_freq = UINT_MAX;
+	if (unlikely(sg_policy->need_freq_update))
 		return true;
-	}
 
 	delta_ns = time - sg_policy->last_freq_update_time;
 
 	return delta_ns >= sg_policy->freq_update_delay_ns;
 }
 
-static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
-				unsigned int next_freq)
+static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
+				   unsigned int next_freq)
 {
-	struct cpufreq_policy *policy = sg_policy->policy;
-
 	if (sg_policy->next_freq == next_freq)
-		return;
+		return false;
 
 	sg_policy->next_freq = next_freq;
 	sg_policy->last_freq_update_time = time;
 
-	if (policy->fast_switch_enabled) {
-		next_freq = cpufreq_driver_fast_switch(policy, next_freq);
-		if (!next_freq)
-			return;
+	return true;
+}
 
-		policy->cur = next_freq;
-		trace_cpu_frequency(next_freq, smp_processor_id());
-	} else {
+static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
+			      unsigned int next_freq)
+{
+	struct cpufreq_policy *policy = sg_policy->policy;
+
+	if (!sugov_update_next_freq(sg_policy, time, next_freq))
+		return;
+
+	next_freq = cpufreq_driver_fast_switch(policy, next_freq);
+	if (!next_freq)
+		return;
+
+	policy->cur = next_freq;
+	trace_cpu_frequency(next_freq, smp_processor_id());
+}
+
+static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
+				  unsigned int next_freq)
+{
+	if (!sugov_update_next_freq(sg_policy, time, next_freq))
+		return;
+
+	if (!sg_policy->work_in_progress) {
 		sg_policy->work_in_progress = true;
 		irq_work_queue(&sg_policy->irq_work);
 	}
@@ -165,8 +171,10 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
 
 	freq = (freq + (freq >> 2)) * util / max;
 
-	if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
+	if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
 		return sg_policy->next_freq;
+
+	sg_policy->need_freq_update = false;
 	sg_policy->cached_raw_freq = freq;
 	return cpufreq_driver_resolve_freq(policy, freq);
 }
@@ -183,61 +191,137 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
 static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
 {
 	struct rq *rq = cpu_rq(sg_cpu->cpu);
-	unsigned long util;
 
-	if (rq->rt.rt_nr_running) {
-		util = sg_cpu->max;
-	} else {
-		util = sg_cpu->util_dl;
-		if (rq->cfs.h_nr_running)
-			util += sg_cpu->util_cfs;
-	}
+	if (rq->rt.rt_nr_running)
+		return sg_cpu->max;
 
 	/*
+	 * Utilization required by DEADLINE must always be granted while, for
+	 * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to
+	 * gracefully reduce the frequency when no tasks show up for longer
+	 * periods of time.
+	 *
 	 * Ideally we would like to set util_dl as min/guaranteed freq and
 	 * util_cfs + util_dl as requested freq. However, cpufreq is not yet
 	 * ready for such an interface. So, we only do the latter for now.
 	 */
-	return min(util, sg_cpu->max);
+	return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs));
 }
 
-static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags)
+/**
+ * sugov_iowait_reset() - Reset the IO boost status of a CPU.
+ * @sg_cpu: the sugov data for the CPU to boost
+ * @time: the update time from the caller
+ * @set_iowait_boost: true if an IO boost has been requested
+ *
+ * The IO wait boost of a task is disabled after a tick since the last update
+ * of a CPU. If a new IO wait boost is requested after more then a tick, then
+ * we enable the boost starting from the minimum frequency, which improves
+ * energy efficiency by ignoring sporadic wakeups from IO.
+ */
+static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
+			       bool set_iowait_boost)
 {
-	if (flags & SCHED_CPUFREQ_IOWAIT) {
-		if (sg_cpu->iowait_boost_pending)
-			return;
+	s64 delta_ns = time - sg_cpu->last_update;
 
-		sg_cpu->iowait_boost_pending = true;
+	/* Reset boost only if a tick has elapsed since last request */
+	if (delta_ns <= TICK_NSEC)
+		return false;
 
-		if (sg_cpu->iowait_boost) {
-			sg_cpu->iowait_boost <<= 1;
-			if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
-				sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
-		} else {
-			sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
-		}
-	} else if (sg_cpu->iowait_boost) {
-		s64 delta_ns = time - sg_cpu->last_update;
+	sg_cpu->iowait_boost = set_iowait_boost
+		? sg_cpu->sg_policy->policy->min : 0;
+	sg_cpu->iowait_boost_pending = set_iowait_boost;
 
-		/* Clear iowait_boost if the CPU apprears to have been idle. */
-		if (delta_ns > TICK_NSEC) {
-			sg_cpu->iowait_boost = 0;
-			sg_cpu->iowait_boost_pending = false;
-		}
+	return true;
+}
+
+/**
+ * sugov_iowait_boost() - Updates the IO boost status of a CPU.
+ * @sg_cpu: the sugov data for the CPU to boost
+ * @time: the update time from the caller
+ * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
+ *
+ * Each time a task wakes up after an IO operation, the CPU utilization can be
+ * boosted to a certain utilization which doubles at each "frequent and
+ * successive" wakeup from IO, ranging from the utilization of the minimum
+ * OPP to the utilization of the maximum OPP.
+ * To keep doubling, an IO boost has to be requested at least once per tick,
+ * otherwise we restart from the utilization of the minimum OPP.
+ */
+static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
+			       unsigned int flags)
+{
+	bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;
+
+	/* Reset boost if the CPU appears to have been idle enough */
+	if (sg_cpu->iowait_boost &&
+	    sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
+		return;
+
+	/* Boost only tasks waking up after IO */
+	if (!set_iowait_boost)
+		return;
+
+	/* Ensure boost doubles only one time at each request */
+	if (sg_cpu->iowait_boost_pending)
+		return;
+	sg_cpu->iowait_boost_pending = true;
+
+	/* Double the boost at each request */
+	if (sg_cpu->iowait_boost) {
+		sg_cpu->iowait_boost <<= 1;
+		if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
+			sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+		return;
 	}
+
+	/* First wakeup after IO: start with minimum boost */
+	sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
 }
 
-static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
-			       unsigned long *max)
+/**
+ * sugov_iowait_apply() - Apply the IO boost to a CPU.
+ * @sg_cpu: the sugov data for the cpu to boost
+ * @time: the update time from the caller
+ * @util: the utilization to (eventually) boost
+ * @max: the maximum value the utilization can be boosted to
+ *
+ * A CPU running a task which woken up after an IO operation can have its
+ * utilization boosted to speed up the completion of those IO operations.
+ * The IO boost value is increased each time a task wakes up from IO, in
+ * sugov_iowait_apply(), and it's instead decreased by this function,
+ * each time an increase has not been requested (!iowait_boost_pending).
+ *
+ * A CPU which also appears to have been idle for at least one tick has also
+ * its IO boost utilization reset.
+ *
+ * This mechanism is designed to boost high frequently IO waiting tasks, while
+ * being more conservative on tasks which does sporadic IO operations.
+ */
+static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
+			       unsigned long *util, unsigned long *max)
 {
 	unsigned int boost_util, boost_max;
 
+	/* No boost currently required */
 	if (!sg_cpu->iowait_boost)
 		return;
 
+	/* Reset boost if the CPU appears to have been idle enough */
+	if (sugov_iowait_reset(sg_cpu, time, false))
+		return;
+
+	/*
+	 * An IO waiting task has just woken up:
+	 * allow to further double the boost value
+	 */
 	if (sg_cpu->iowait_boost_pending) {
 		sg_cpu->iowait_boost_pending = false;
 	} else {
+		/*
+		 * Otherwise: reduce the boost value and disable it when we
+		 * reach the minimum.
+		 */
 		sg_cpu->iowait_boost >>= 1;
 		if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) {
 			sg_cpu->iowait_boost = 0;
@@ -245,9 +329,12 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
 		}
 	}
 
+	/*
+	 * Apply the current boost value: a CPU is boosted only if its current
+	 * utilization is smaller then the current IO boost level.
+	 */
 	boost_util = sg_cpu->iowait_boost;
 	boost_max = sg_cpu->iowait_boost_max;
-
 	if (*util * boost_max < *max * boost_util) {
 		*util = boost_util;
 		*max = boost_max;
@@ -286,7 +373,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
 	unsigned int next_f;
 	bool busy;
 
-	sugov_set_iowait_boost(sg_cpu, time, flags);
+	sugov_iowait_boost(sg_cpu, time, flags);
 	sg_cpu->last_update = time;
 
 	ignore_dl_rate_limit(sg_cpu, sg_policy);
@@ -299,7 +386,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
 	sugov_get_util(sg_cpu);
 	max = sg_cpu->max;
 	util = sugov_aggregate_util(sg_cpu);
-	sugov_iowait_boost(sg_cpu, &util, &max);
+	sugov_iowait_apply(sg_cpu, time, &util, &max);
 	next_f = get_next_freq(sg_policy, util, max);
 	/*
 	 * Do not reduce the frequency if the CPU has not been idle
@@ -312,7 +399,18 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
 		sg_policy->cached_raw_freq = 0;
 	}
 
-	sugov_update_commit(sg_policy, time, next_f);
+	/*
+	 * This code runs under rq->lock for the target CPU, so it won't run
+	 * concurrently on two different CPUs for the same target and it is not
+	 * necessary to acquire the lock in the fast switch case.
+	 */
+	if (sg_policy->policy->fast_switch_enabled) {
+		sugov_fast_switch(sg_policy, time, next_f);
+	} else {
+		raw_spin_lock(&sg_policy->update_lock);
+		sugov_deferred_update(sg_policy, time, next_f);
+		raw_spin_unlock(&sg_policy->update_lock);
+	}
 }
 
 static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
@@ -325,28 +423,12 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
 	for_each_cpu(j, policy->cpus) {
 		struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
 		unsigned long j_util, j_max;
-		s64 delta_ns;
 
 		sugov_get_util(j_sg_cpu);
-
-		/*
-		 * If the CFS CPU utilization was last updated before the
-		 * previous frequency update and the time elapsed between the
-		 * last update of the CPU utilization and the last frequency
-		 * update is long enough, reset iowait_boost and util_cfs, as
-		 * they are now probably stale. However, still consider the
-		 * CPU contribution if it has some DEADLINE utilization
-		 * (util_dl).
-		 */
-		delta_ns = time - j_sg_cpu->last_update;
-		if (delta_ns > TICK_NSEC) {
-			j_sg_cpu->iowait_boost = 0;
-			j_sg_cpu->iowait_boost_pending = false;
-		}
-
 		j_max = j_sg_cpu->max;
 		j_util = sugov_aggregate_util(j_sg_cpu);
-		sugov_iowait_boost(j_sg_cpu, &j_util, &j_max);
+		sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max);
+
 		if (j_util * max > j_max * util) {
 			util = j_util;
 			max = j_max;
@@ -365,14 +447,18 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
 
 	raw_spin_lock(&sg_policy->update_lock);
 
-	sugov_set_iowait_boost(sg_cpu, time, flags);
+	sugov_iowait_boost(sg_cpu, time, flags);
 	sg_cpu->last_update = time;
 
 	ignore_dl_rate_limit(sg_cpu, sg_policy);
 
 	if (sugov_should_update_freq(sg_policy, time)) {
 		next_f = sugov_next_freq_shared(sg_cpu, time);
-		sugov_update_commit(sg_policy, time, next_f);
+
+		if (sg_policy->policy->fast_switch_enabled)
+			sugov_fast_switch(sg_policy, time, next_f);
+		else
+			sugov_deferred_update(sg_policy, time, next_f);
 	}
 
 	raw_spin_unlock(&sg_policy->update_lock);
@@ -381,13 +467,27 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
 static void sugov_work(struct kthread_work *work)
 {
 	struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
+	unsigned int freq;
+	unsigned long flags;
+
+	/*
+	 * Hold sg_policy->update_lock shortly to handle the case where:
+	 * incase sg_policy->next_freq is read here, and then updated by
+	 * sugov_deferred_update() just before work_in_progress is set to false
+	 * here, we may miss queueing the new update.
+	 *
+	 * Note: If a work was queued after the update_lock is released,
+	 * sugov_work() will just be called again by kthread_work code; and the
+	 * request will be proceed before the sugov thread sleeps.
+	 */
+	raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
+	freq = sg_policy->next_freq;
+	sg_policy->work_in_progress = false;
+	raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);
 
 	mutex_lock(&sg_policy->work_lock);
-	__cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq,
-				CPUFREQ_RELATION_L);
+	__cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
 	mutex_unlock(&sg_policy->work_lock);
-
-	sg_policy->work_in_progress = false;
 }
 
 static void sugov_irq_work(struct irq_work *irq_work)
@@ -396,19 +496,6 @@ static void sugov_irq_work(struct irq_work *irq_work)
 
 	sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
 
-	/*
-	 * For RT tasks, the schedutil governor shoots the frequency to maximum.
-	 * Special care must be taken to ensure that this kthread doesn't result
-	 * in the same behavior.
-	 *
-	 * This is (mostly) guaranteed by the work_in_progress flag. The flag is
-	 * updated only at the end of the sugov_work() function and before that
-	 * the schedutil governor rejects all other frequency scaling requests.
-	 *
-	 * There is a very rare case though, where the RT thread yields right
-	 * after the work_in_progress flag is cleared. The effects of that are
-	 * neglected for now.
-	 */
 	kthread_queue_work(&sg_policy->worker, &sg_policy->work);
 }
 
@@ -523,11 +610,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
 	}
 
 	sg_policy->thread = thread;
-
-	/* Kthread is bound to all CPUs by default */
-	if (!policy->dvfs_possible_from_any_cpu)
-		kthread_bind_mask(thread, policy->related_cpus);
-
+	kthread_bind_mask(thread, policy->related_cpus);
 	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
 	mutex_init(&sg_policy->work_lock);
 
@@ -631,10 +714,9 @@ fail:
 
 stop_kthread:
 	sugov_kthread_stop(sg_policy);
-
-free_sg_policy:
 	mutex_unlock(&global_tunables_lock);
 
+free_sg_policy:
 	sugov_policy_free(sg_policy);
 
 disable_fast_switch:
@@ -671,7 +753,7 @@ static int sugov_start(struct cpufreq_policy *policy)
 
 	sg_policy->freq_update_delay_ns	= sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
 	sg_policy->last_freq_update_time	= 0;
-	sg_policy->next_freq			= UINT_MAX;
+	sg_policy->next_freq			= 0;
 	sg_policy->work_in_progress		= false;
 	sg_policy->need_freq_update		= false;
 	sg_policy->cached_raw_freq		= 0;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d1c7bf7c7e5b..fbfc3f1d368a 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1117,7 +1117,7 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
  * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
  * So, overflow is not an issue here.
  */
-u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
+static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
 {
 	u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
 	u64 u_act;
@@ -1259,6 +1259,9 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
 
 	rq = task_rq_lock(p, &rf);
 
+	sched_clock_tick();
+	update_rq_clock(rq);
+
 	if (!dl_task(p) || p->state == TASK_DEAD) {
 		struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 
@@ -1278,9 +1281,6 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
 	if (dl_se->dl_non_contending == 0)
 		goto unlock;
 
-	sched_clock_tick();
-	update_rq_clock(rq);
-
 	sub_running_bw(dl_se, &rq->dl);
 	dl_se->dl_non_contending = 0;
 unlock:
@@ -1560,7 +1560,7 @@ static void yield_task_dl(struct rq *rq)
 	 * so we don't do microscopic update in schedule()
 	 * and double the fastpath cost.
 	 */
-	rq_clock_skip_update(rq, true);
+	rq_clock_skip_update(rq);
 }
 
 #ifdef CONFIG_SMP
@@ -2731,8 +2731,6 @@ bool dl_cpu_busy(unsigned int cpu)
 #endif
 
 #ifdef CONFIG_SCHED_DEBUG
-extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
-
 void print_dl_stats(struct seq_file *m, int cpu)
 {
 	print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 15b10e210a6b..e593b4118578 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -823,35 +823,9 @@ static const struct seq_operations sched_debug_sops = {
 	.show		= sched_debug_show,
 };
 
-static int sched_debug_release(struct inode *inode, struct file *file)
-{
-	seq_release(inode, file);
-
-	return 0;
-}
-
-static int sched_debug_open(struct inode *inode, struct file *filp)
-{
-	int ret = 0;
-
-	ret = seq_open(filp, &sched_debug_sops);
-
-	return ret;
-}
-
-static const struct file_operations sched_debug_fops = {
-	.open		= sched_debug_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= sched_debug_release,
-};
-
 static int __init init_sched_debug_procfs(void)
 {
-	struct proc_dir_entry *pe;
-
-	pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops);
-	if (!pe)
+	if (!proc_create_seq("sched_debug", 0444, NULL, &sched_debug_sops))
 		return -ENOMEM;
 	return 0;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0951d1c58d2f..e497c05aab7f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1139,6 +1139,47 @@ static unsigned int task_scan_max(struct task_struct *p)
 	return max(smin, smax);
 }
 
+void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+	int mm_users = 0;
+	struct mm_struct *mm = p->mm;
+
+	if (mm) {
+		mm_users = atomic_read(&mm->mm_users);
+		if (mm_users == 1) {
+			mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+			mm->numa_scan_seq = 0;
+		}
+	}
+	p->node_stamp			= 0;
+	p->numa_scan_seq		= mm ? mm->numa_scan_seq : 0;
+	p->numa_scan_period		= sysctl_numa_balancing_scan_delay;
+	p->numa_work.next		= &p->numa_work;
+	p->numa_faults			= NULL;
+	p->numa_group			= NULL;
+	p->last_task_numa_placement	= 0;
+	p->last_sum_exec_runtime	= 0;
+
+	/* New address space, reset the preferred nid */
+	if (!(clone_flags & CLONE_VM)) {
+		p->numa_preferred_nid = -1;
+		return;
+	}
+
+	/*
+	 * New thread, keep existing numa_preferred_nid which should be copied
+	 * already by arch_dup_task_struct but stagger when scans start.
+	 */
+	if (mm) {
+		unsigned int delay;
+
+		delay = min_t(unsigned int, task_scan_max(current),
+			current->numa_scan_period * mm_users * NSEC_PER_MSEC);
+		delay += 2 * TICK_NSEC;
+		p->node_stamp = delay;
+	}
+}
+
 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
 {
 	rq->nr_numa_running += (p->numa_preferred_nid != -1);
@@ -1854,7 +1895,6 @@ static int task_numa_migrate(struct task_struct *p)
 static void numa_migrate_preferred(struct task_struct *p)
 {
 	unsigned long interval = HZ;
-	unsigned long numa_migrate_retry;
 
 	/* This task has no NUMA fault statistics yet */
 	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
@@ -1862,18 +1902,7 @@ static void numa_migrate_preferred(struct task_struct *p)
 
 	/* Periodically retry migrating the task to the preferred node */
 	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
-	numa_migrate_retry = jiffies + interval;
-
-	/*
-	 * Check that the new retry threshold is after the current one. If
-	 * the retry is in the future, it implies that wake_affine has
-	 * temporarily asked NUMA balancing to backoff from placement.
-	 */
-	if (numa_migrate_retry > p->numa_migrate_retry)
-		return;
-
-	/* Safe to try placing the task on the preferred node */
-	p->numa_migrate_retry = numa_migrate_retry;
+	p->numa_migrate_retry = jiffies + interval;
 
 	/* Success if task is already running on preferred CPU */
 	if (task_node(p) == p->numa_preferred_nid)
@@ -5357,6 +5386,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	struct sched_entity *se = &p->se;
 
 	/*
+	 * The code below (indirectly) updates schedutil which looks at
+	 * the cfs_rq utilization to select a frequency.
+	 * Let's add the task's estimated utilization to the cfs_rq's
+	 * estimated utilization, before we update schedutil.
+	 */
+	util_est_enqueue(&rq->cfs, p);
+
+	/*
 	 * If in_iowait is set, the code below may not trigger any cpufreq
 	 * utilization updates, so do it here explicitly with the IOWAIT flag
 	 * passed.
@@ -5397,7 +5434,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!se)
 		add_nr_running(rq, 1);
 
-	util_est_enqueue(&rq->cfs, p);
 	hrtick_update(rq);
 }
 
@@ -5870,8 +5906,8 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
 	 * a cpufreq perspective, it's better to have higher utilisation
 	 * on one CPU.
 	 */
-	if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
-		return idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
+	if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
+		return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
 
 	if (sync && cpu_rq(this_cpu)->nr_running == 1)
 		return this_cpu;
@@ -5922,48 +5958,6 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
 	return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
 }
 
-#ifdef CONFIG_NUMA_BALANCING
-static void
-update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
-{
-	unsigned long interval;
-
-	if (!static_branch_likely(&sched_numa_balancing))
-		return;
-
-	/* If balancing has no preference then continue gathering data */
-	if (p->numa_preferred_nid == -1)
-		return;
-
-	/*
-	 * If the wakeup is not affecting locality then it is neutral from
-	 * the perspective of NUMA balacing so continue gathering data.
-	 */
-	if (cpu_to_node(prev_cpu) == cpu_to_node(target))
-		return;
-
-	/*
-	 * Temporarily prevent NUMA balancing trying to place waker/wakee after
-	 * wakee has been moved by wake_affine. This will potentially allow
-	 * related tasks to converge and update their data placement. The
-	 * 4 * numa_scan_period is to allow the two-pass filter to migrate
-	 * hot data to the wakers node.
-	 */
-	interval = max(sysctl_numa_balancing_scan_delay,
-			 p->numa_scan_period << 2);
-	p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
-
-	interval = max(sysctl_numa_balancing_scan_delay,
-			 current->numa_scan_period << 2);
-	current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
-}
-#else
-static void
-update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
-{
-}
-#endif
-
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int this_cpu, int prev_cpu, int sync)
 {
@@ -5979,7 +5973,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 	if (target == nr_cpumask_bits)
 		return prev_cpu;
 
-	update_wa_numa_placement(p, prev_cpu, target);
 	schedstat_inc(sd->ttwu_move_affine);
 	schedstat_inc(p->se.statistics.nr_wakeups_affine);
 	return target;
@@ -6157,7 +6150,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
 
 	/* Traverse only the allowed CPUs */
 	for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
-		if (idle_cpu(i)) {
+		if (available_idle_cpu(i)) {
 			struct rq *rq = cpu_rq(i);
 			struct cpuidle_state *idle = idle_get_state(rq);
 			if (idle && idle->exit_latency < min_exit_latency) {
@@ -6199,6 +6192,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 	if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
 		return prev_cpu;
 
+	/*
+	 * We need task's util for capacity_spare_wake, sync it up to prev_cpu's
+	 * last_update_time.
+	 */
+	if (!(sd_flag & SD_BALANCE_FORK))
+		sync_entity_load_avg(&p->se);
+
 	while (sd) {
 		struct sched_group *group;
 		struct sched_domain *tmp;
@@ -6279,7 +6279,7 @@ void __update_idle_core(struct rq *rq)
 		if (cpu == core)
 			continue;
 
-		if (!idle_cpu(cpu))
+		if (!available_idle_cpu(cpu))
 			goto unlock;
 	}
 
@@ -6311,7 +6311,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
 
 		for_each_cpu(cpu, cpu_smt_mask(core)) {
 			cpumask_clear_cpu(cpu, cpus);
-			if (!idle_cpu(cpu))
+			if (!available_idle_cpu(cpu))
 				idle = false;
 		}
 
@@ -6340,7 +6340,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
 	for_each_cpu(cpu, cpu_smt_mask(target)) {
 		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
 			continue;
-		if (idle_cpu(cpu))
+		if (available_idle_cpu(cpu))
 			return cpu;
 	}
 
@@ -6403,7 +6403,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 			return -1;
 		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
 			continue;
-		if (idle_cpu(cpu))
+		if (available_idle_cpu(cpu))
 			break;
 	}
 
@@ -6423,13 +6423,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	struct sched_domain *sd;
 	int i, recent_used_cpu;
 
-	if (idle_cpu(target))
+	if (available_idle_cpu(target))
 		return target;
 
 	/*
 	 * If the previous CPU is cache affine and idle, don't be stupid:
 	 */
-	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+	if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
 		return prev;
 
 	/* Check a recently used CPU as a potential idle candidate: */
@@ -6437,7 +6437,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	if (recent_used_cpu != prev &&
 	    recent_used_cpu != target &&
 	    cpus_share_cache(recent_used_cpu, target) &&
-	    idle_cpu(recent_used_cpu) &&
+	    available_idle_cpu(recent_used_cpu) &&
 	    cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
 		/*
 		 * Replace recent_used_cpu with prev as it is a potential
@@ -6613,7 +6613,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 static int
 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
 {
-	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
+	struct sched_domain *tmp, *sd = NULL;
 	int cpu = smp_processor_id();
 	int new_cpu = prev_cpu;
 	int want_affine = 0;
@@ -6636,7 +6636,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 		 */
 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
-			affine_sd = tmp;
+			if (cpu != prev_cpu)
+				new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
+
+			sd = NULL; /* Prefer wake_affine over balance flags */
 			break;
 		}
 
@@ -6646,33 +6649,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 			break;
 	}
 
-	if (affine_sd) {
-		sd = NULL; /* Prefer wake_affine over balance flags */
-		if (cpu == prev_cpu)
-			goto pick_cpu;
-
-		new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync);
-	}
-
-	if (sd && !(sd_flag & SD_BALANCE_FORK)) {
-		/*
-		 * We're going to need the task's util for capacity_spare_wake
-		 * in find_idlest_group. Sync it up to prev_cpu's
-		 * last_update_time.
-		 */
-		sync_entity_load_avg(&p->se);
-	}
+	if (unlikely(sd)) {
+		/* Slow path */
+		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
+	} else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
+		/* Fast path */
 
-	if (!sd) {
-pick_cpu:
-		if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
-			new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
+		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 
-			if (want_affine)
-				current->recent_used_cpu = cpu;
-		}
-	} else {
-		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
+		if (want_affine)
+			current->recent_used_cpu = cpu;
 	}
 	rcu_read_unlock();
 
@@ -7089,7 +7075,7 @@ static void yield_task_fair(struct rq *rq)
 		 * so we don't do microscopic update in schedule()
 		 * and double the fastpath cost.
 		 */
-		rq_clock_skip_update(rq, true);
+		rq_clock_skip_update(rq);
 	}
 
 	set_skip_buddy(se);
@@ -9847,6 +9833,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
 	if (curr_cost > this_rq->max_idle_balance_cost)
 		this_rq->max_idle_balance_cost = curr_cost;
 
+out:
 	/*
 	 * While browsing the domains, we released the rq lock, a task could
 	 * have been enqueued in the meantime. Since we're not going idle,
@@ -9855,7 +9842,6 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
 	if (this_rq->cfs.h_nr_running && !pulled_task)
 		pulled_task = 1;
 
-out:
 	/* Move the next balance forward */
 	if (time_after(this_rq->next_balance, next_balance))
 		this_rq->next_balance = next_balance;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 2975f195e1c4..1a3e9bddd17b 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -141,13 +141,15 @@ static void cpuidle_idle_call(void)
 	}
 
 	/*
-	 * Tell the RCU framework we are entering an idle section,
-	 * so no more rcu read side critical sections and one more
+	 * The RCU framework needs to be told that we are entering an idle
+	 * section, so no more rcu read side critical sections and one more
 	 * step to the grace period
 	 */
-	rcu_idle_enter();
 
 	if (cpuidle_not_available(drv, dev)) {
+		tick_nohz_idle_stop_tick();
+		rcu_idle_enter();
+
 		default_idle_call();
 		goto exit_idle;
 	}
@@ -164,20 +166,37 @@ static void cpuidle_idle_call(void)
 
 	if (idle_should_enter_s2idle() || dev->use_deepest_state) {
 		if (idle_should_enter_s2idle()) {
+			rcu_idle_enter();
+
 			entered_state = cpuidle_enter_s2idle(drv, dev);
 			if (entered_state > 0) {
 				local_irq_enable();
 				goto exit_idle;
 			}
+
+			rcu_idle_exit();
 		}
 
+		tick_nohz_idle_stop_tick();
+		rcu_idle_enter();
+
 		next_state = cpuidle_find_deepest_state(drv, dev);
 		call_cpuidle(drv, dev, next_state);
 	} else {
+		bool stop_tick = true;
+
 		/*
 		 * Ask the cpuidle framework to choose a convenient idle state.
 		 */
-		next_state = cpuidle_select(drv, dev);
+		next_state = cpuidle_select(drv, dev, &stop_tick);
+
+		if (stop_tick)
+			tick_nohz_idle_stop_tick();
+		else
+			tick_nohz_idle_retain_tick();
+
+		rcu_idle_enter();
+
 		entered_state = call_cpuidle(drv, dev, next_state);
 		/*
 		 * Give the governor an opportunity to reflect on the outcome
@@ -222,6 +241,7 @@ static void do_idle(void)
 		rmb();
 
 		if (cpu_is_offline(cpu)) {
+			tick_nohz_idle_stop_tick_protected();
 			cpuhp_report_idle_dead();
 			arch_cpu_idle_dead();
 		}
@@ -235,10 +255,12 @@ static void do_idle(void)
 		 * broadcast device expired for us, we don't want to go deep
 		 * idle as we know that the IPI is going to arrive right away.
 		 */
-		if (cpu_idle_force_poll || tick_check_broadcast_expired())
+		if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
+			tick_nohz_idle_restart_tick();
 			cpu_idle_poll();
-		else
+		} else {
 			cpuidle_idle_call();
+		}
 		arch_cpu_idle_exit();
 	}
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 86b77987435e..ef3c4e6f5345 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -839,6 +839,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 			continue;
 
 		raw_spin_lock(&rq->lock);
+		update_rq_clock(rq);
+
 		if (rt_rq->rt_time) {
 			u64 runtime;
 
@@ -859,7 +861,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 				 * 'runtime'.
 				 */
 				if (rt_rq->rt_nr_running && rq->curr == rq->idle)
-					rq_clock_skip_update(rq, false);
+					rq_clock_cancel_skipupdate(rq);
 			}
 			if (rt_rq->rt_time || rt_rq->rt_nr_running)
 				idle = 0;
@@ -2699,8 +2701,6 @@ int sched_rr_handler(struct ctl_table *table, int write,
 }
 
 #ifdef CONFIG_SCHED_DEBUG
-extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
-
 void print_rt_stats(struct seq_file *m, int cpu)
 {
 	rt_rq_iter_t iter;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c3deaee7a7a2..6601baf2361c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -976,13 +976,20 @@ static inline u64 rq_clock_task(struct rq *rq)
 	return rq->clock_task;
 }
 
-static inline void rq_clock_skip_update(struct rq *rq, bool skip)
+static inline void rq_clock_skip_update(struct rq *rq)
 {
 	lockdep_assert_held(&rq->lock);
-	if (skip)
-		rq->clock_update_flags |= RQCF_REQ_SKIP;
-	else
-		rq->clock_update_flags &= ~RQCF_REQ_SKIP;
+	rq->clock_update_flags |= RQCF_REQ_SKIP;
+}
+
+/*
+ * See rt task throttling, which is the only time a skip
+ * request is cancelled.
+ */
+static inline void rq_clock_cancel_skipupdate(struct rq *rq)
+{
+	lockdep_assert_held(&rq->lock);
+	rq->clock_update_flags &= ~RQCF_REQ_SKIP;
 }
 
 struct rq_flags {
@@ -1062,6 +1069,12 @@ enum numa_faults_stats {
 extern void sched_setnuma(struct task_struct *p, int node);
 extern int migrate_task_to(struct task_struct *p, int cpu);
 extern int migrate_swap(struct task_struct *, struct task_struct *);
+extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
+#else
+static inline void
+init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #ifdef CONFIG_SMP
@@ -2018,8 +2031,9 @@ extern bool sched_debug_enabled;
 extern void print_cfs_stats(struct seq_file *m, int cpu);
 extern void print_rt_stats(struct seq_file *m, int cpu);
 extern void print_dl_stats(struct seq_file *m, int cpu);
-extern void
-print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
+extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
 #ifdef CONFIG_NUMA_BALANCING
 extern void
 show_numa_stats(struct task_struct *p, struct seq_file *m);
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index ab112cbfd7c8..750fb3c67eed 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -120,22 +120,9 @@ static const struct seq_operations schedstat_sops = {
 	.show  = show_schedstat,
 };
 
-static int schedstat_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &schedstat_sops);
-}
-
-static const struct file_operations proc_schedstat_operations = {
-	.open    = schedstat_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release,
-};
-
 static int __init proc_schedstat_init(void)
 {
-	proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
-
+	proc_create_seq("schedstat", 0, NULL, &schedstat_sops);
 	return 0;
 }
 subsys_initcall(proc_schedstat_init);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 64cc564f5255..61a1125c1ae4 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1708,7 +1708,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	rcu_read_unlock();
 
 	if (rq && sched_debug_enabled) {
-		pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
+		pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
 			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
 	}
 
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index dc77548167ef..fd023ac24e10 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -19,6 +19,8 @@
 #include <linux/compat.h>
 #include <linux/coredump.h>
 #include <linux/kmemleak.h>
+#include <linux/nospec.h>
+#include <linux/prctl.h>
 #include <linux/sched.h>
 #include <linux/sched/task_stack.h>
 #include <linux/seccomp.h>
@@ -227,8 +229,11 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
 	return true;
 }
 
+void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { }
+
 static inline void seccomp_assign_mode(struct task_struct *task,
-				       unsigned long seccomp_mode)
+				       unsigned long seccomp_mode,
+				       unsigned long flags)
 {
 	assert_spin_locked(&task->sighand->siglock);
 
@@ -238,6 +243,9 @@ static inline void seccomp_assign_mode(struct task_struct *task,
 	 * filter) is set.
 	 */
 	smp_mb__before_atomic();
+	/* Assume default seccomp processes want spec flaw mitigation. */
+	if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0)
+		arch_seccomp_spec_mitigate(task);
 	set_tsk_thread_flag(task, TIF_SECCOMP);
 }
 
@@ -305,7 +313,7 @@ static inline pid_t seccomp_can_sync_threads(void)
  * without dropping the locks.
  *
  */
-static inline void seccomp_sync_threads(void)
+static inline void seccomp_sync_threads(unsigned long flags)
 {
 	struct task_struct *thread, *caller;
 
@@ -346,7 +354,8 @@ static inline void seccomp_sync_threads(void)
 		 * allow one thread to transition the other.
 		 */
 		if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
-			seccomp_assign_mode(thread, SECCOMP_MODE_FILTER);
+			seccomp_assign_mode(thread, SECCOMP_MODE_FILTER,
+					    flags);
 	}
 }
 
@@ -469,7 +478,7 @@ static long seccomp_attach_filter(unsigned int flags,
 
 	/* Now that the new filter is in place, synchronize to all threads. */
 	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
-		seccomp_sync_threads();
+		seccomp_sync_threads(flags);
 
 	return 0;
 }
@@ -584,18 +593,15 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
 	}
 
 	/*
-	 * Force an audit message to be emitted when the action is RET_KILL_*,
-	 * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is
-	 * allowed to be logged by the admin.
+	 * Emit an audit message when the action is RET_KILL_*, RET_LOG, or the
+	 * FILTER_FLAG_LOG bit was set. The admin has the ability to silence
+	 * any action from being logged by removing the action name from the
+	 * seccomp_actions_logged sysctl.
 	 */
-	if (log)
-		return __audit_seccomp(syscall, signr, action);
+	if (!log)
+		return;
 
-	/*
-	 * Let the audit subsystem decide if the action should be audited based
-	 * on whether the current task itself is being audited.
-	 */
-	return audit_seccomp(syscall, signr, action);
+	audit_seccomp(syscall, signr, action);
 }
 
 /*
@@ -818,7 +824,7 @@ static long seccomp_set_mode_strict(void)
 #ifdef TIF_NOTSC
 	disable_TSC();
 #endif
-	seccomp_assign_mode(current, seccomp_mode);
+	seccomp_assign_mode(current, seccomp_mode, 0);
 	ret = 0;
 
 out:
@@ -876,7 +882,7 @@ static long seccomp_set_mode_filter(unsigned int flags,
 	/* Do not free the successfully attached filter. */
 	prepared = NULL;
 
-	seccomp_assign_mode(current, seccomp_mode);
+	seccomp_assign_mode(current, seccomp_mode, flags);
 out:
 	spin_unlock_irq(&current->sighand->siglock);
 	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
@@ -1135,10 +1141,11 @@ static const struct seccomp_log_name seccomp_log_names[] = {
 };
 
 static bool seccomp_names_from_actions_logged(char *names, size_t size,
-					      u32 actions_logged)
+					      u32 actions_logged,
+					      const char *sep)
 {
 	const struct seccomp_log_name *cur;
-	bool append_space = false;
+	bool append_sep = false;
 
 	for (cur = seccomp_log_names; cur->name && size; cur++) {
 		ssize_t ret;
@@ -1146,15 +1153,15 @@ static bool seccomp_names_from_actions_logged(char *names, size_t size,
 		if (!(actions_logged & cur->log))
 			continue;
 
-		if (append_space) {
-			ret = strscpy(names, " ", size);
+		if (append_sep) {
+			ret = strscpy(names, sep, size);
 			if (ret < 0)
 				return false;
 
 			names += ret;
 			size -= ret;
 		} else
-			append_space = true;
+			append_sep = true;
 
 		ret = strscpy(names, cur->name, size);
 		if (ret < 0)
@@ -1199,46 +1206,102 @@ static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
 	return true;
 }
 
-static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
-					  void __user *buffer, size_t *lenp,
-					  loff_t *ppos)
+static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
+			       size_t *lenp, loff_t *ppos)
+{
+	char names[sizeof(seccomp_actions_avail)];
+	struct ctl_table table;
+
+	memset(names, 0, sizeof(names));
+
+	if (!seccomp_names_from_actions_logged(names, sizeof(names),
+					       seccomp_actions_logged, " "))
+		return -EINVAL;
+
+	table = *ro_table;
+	table.data = names;
+	table.maxlen = sizeof(names);
+	return proc_dostring(&table, 0, buffer, lenp, ppos);
+}
+
+static int write_actions_logged(struct ctl_table *ro_table, void __user *buffer,
+				size_t *lenp, loff_t *ppos, u32 *actions_logged)
 {
 	char names[sizeof(seccomp_actions_avail)];
 	struct ctl_table table;
 	int ret;
 
-	if (write && !capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	memset(names, 0, sizeof(names));
 
-	if (!write) {
-		if (!seccomp_names_from_actions_logged(names, sizeof(names),
-						       seccomp_actions_logged))
-			return -EINVAL;
-	}
-
 	table = *ro_table;
 	table.data = names;
 	table.maxlen = sizeof(names);
-	ret = proc_dostring(&table, write, buffer, lenp, ppos);
+	ret = proc_dostring(&table, 1, buffer, lenp, ppos);
 	if (ret)
 		return ret;
 
-	if (write) {
-		u32 actions_logged;
+	if (!seccomp_actions_logged_from_names(actions_logged, table.data))
+		return -EINVAL;
 
-		if (!seccomp_actions_logged_from_names(&actions_logged,
-						       table.data))
-			return -EINVAL;
+	if (*actions_logged & SECCOMP_LOG_ALLOW)
+		return -EINVAL;
 
-		if (actions_logged & SECCOMP_LOG_ALLOW)
-			return -EINVAL;
+	seccomp_actions_logged = *actions_logged;
+	return 0;
+}
 
-		seccomp_actions_logged = actions_logged;
-	}
+static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged,
+				 int ret)
+{
+	char names[sizeof(seccomp_actions_avail)];
+	char old_names[sizeof(seccomp_actions_avail)];
+	const char *new = names;
+	const char *old = old_names;
 
-	return 0;
+	if (!audit_enabled)
+		return;
+
+	memset(names, 0, sizeof(names));
+	memset(old_names, 0, sizeof(old_names));
+
+	if (ret)
+		new = "?";
+	else if (!actions_logged)
+		new = "(none)";
+	else if (!seccomp_names_from_actions_logged(names, sizeof(names),
+						    actions_logged, ","))
+		new = "?";
+
+	if (!old_actions_logged)
+		old = "(none)";
+	else if (!seccomp_names_from_actions_logged(old_names,
+						    sizeof(old_names),
+						    old_actions_logged, ","))
+		old = "?";
+
+	return audit_seccomp_actions_logged(new, old, !ret);
+}
+
+static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
+					  void __user *buffer, size_t *lenp,
+					  loff_t *ppos)
+{
+	int ret;
+
+	if (write) {
+		u32 actions_logged = 0;
+		u32 old_actions_logged = seccomp_actions_logged;
+
+		ret = write_actions_logged(ro_table, buffer, lenp, ppos,
+					   &actions_logged);
+		audit_actions_logged(actions_logged, old_actions_logged, ret);
+	} else
+		ret = read_actions_logged(ro_table, buffer, lenp, ppos);
+
+	return ret;
 }
 
 static struct ctl_path seccomp_sysctl_path[] = {
diff --git a/kernel/signal.c b/kernel/signal.c
index d4ccea599692..0f865d67415d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1539,7 +1539,6 @@ int send_sig_fault(int sig, int code, void __user *addr
 	return send_sig_info(info.si_signo, &info, t);
 }
 
-#if defined(BUS_MCEERR_AO) && defined(BUS_MCEERR_AR)
 int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
 {
 	struct siginfo info;
@@ -1568,9 +1567,7 @@ int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *
 	return send_sig_info(info.si_signo, &info, t);
 }
 EXPORT_SYMBOL(send_sig_mceerr);
-#endif
 
-#ifdef SEGV_BNDERR
 int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper)
 {
 	struct siginfo info;
@@ -1584,7 +1581,6 @@ int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper)
 	info.si_upper = upper;
 	return force_sig_info(info.si_signo, &info, current);
 }
-#endif
 
 #ifdef SEGV_PKUERR
 int force_sig_pkuerr(void __user *addr, u32 pkey)
@@ -1961,14 +1957,27 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 			return;
 	}
 
+	set_special_state(TASK_TRACED);
+
 	/*
 	 * We're committing to trapping.  TRACED should be visible before
 	 * TRAPPING is cleared; otherwise, the tracer might fail do_wait().
 	 * Also, transition to TRACED and updates to ->jobctl should be
 	 * atomic with respect to siglock and should be done after the arch
 	 * hook as siglock is released and regrabbed across it.
+	 *
+	 *     TRACER				    TRACEE
+	 *
+	 *     ptrace_attach()
+	 * [L]   wait_on_bit(JOBCTL_TRAPPING)	[S] set_special_state(TRACED)
+	 *     do_wait()
+	 *       set_current_state()                smp_wmb();
+	 *       ptrace_do_wait()
+	 *         wait_task_stopped()
+	 *           task_stopped_code()
+	 * [L]         task_is_traced()		[S] task_clear_jobctl_trapping();
 	 */
-	set_current_state(TASK_TRACED);
+	smp_wmb();
 
 	current->last_siginfo = info;
 	current->exit_code = exit_code;
@@ -2176,7 +2185,7 @@ static bool do_signal_stop(int signr)
 		if (task_participate_group_stop(current))
 			notify = CLD_STOPPED;
 
-		__set_current_state(TASK_STOPPED);
+		set_special_state(TASK_STOPPED);
 		spin_unlock_irq(&current->sighand->siglock);
 
 		/*
@@ -2824,8 +2833,19 @@ enum siginfo_layout siginfo_layout(int sig, int si_code)
 			[SIGPOLL] = { NSIGPOLL, SIL_POLL },
 			[SIGSYS]  = { NSIGSYS,  SIL_SYS },
 		};
-		if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit))
+		if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit)) {
 			layout = filter[sig].layout;
+			/* Handle the exceptions */
+			if ((sig == SIGBUS) &&
+			    (si_code >= BUS_MCEERR_AR) && (si_code <= BUS_MCEERR_AO))
+				layout = SIL_FAULT_MCEERR;
+			else if ((sig == SIGSEGV) && (si_code == SEGV_BNDERR))
+				layout = SIL_FAULT_BNDERR;
+#ifdef SEGV_PKUERR
+			else if ((sig == SIGSEGV) && (si_code == SEGV_PKUERR))
+				layout = SIL_FAULT_PKUERR;
+#endif
+		}
 		else if (si_code <= NSIGPOLL)
 			layout = SIL_POLL;
 	} else {
@@ -2835,104 +2855,15 @@ enum siginfo_layout siginfo_layout(int sig, int si_code)
 			layout = SIL_POLL;
 		else if (si_code < 0)
 			layout = SIL_RT;
-		/* Tests to support buggy kernel ABIs */
-#ifdef TRAP_FIXME
-		if ((sig == SIGTRAP) && (si_code == TRAP_FIXME))
-			layout = SIL_FAULT;
-#endif
-#ifdef FPE_FIXME
-		if ((sig == SIGFPE) && (si_code == FPE_FIXME))
-			layout = SIL_FAULT;
-#endif
 	}
 	return layout;
 }
 
 int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
 {
-	int err;
-
-	if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t)))
+	if (copy_to_user(to, from , sizeof(struct siginfo)))
 		return -EFAULT;
-	if (from->si_code < 0)
-		return __copy_to_user(to, from, sizeof(siginfo_t))
-			? -EFAULT : 0;
-	/*
-	 * If you change siginfo_t structure, please be sure
-	 * this code is fixed accordingly.
-	 * Please remember to update the signalfd_copyinfo() function
-	 * inside fs/signalfd.c too, in case siginfo_t changes.
-	 * It should never copy any pad contained in the structure
-	 * to avoid security leaks, but must copy the generic
-	 * 3 ints plus the relevant union member.
-	 */
-	err = __put_user(from->si_signo, &to->si_signo);
-	err |= __put_user(from->si_errno, &to->si_errno);
-	err |= __put_user(from->si_code, &to->si_code);
-	switch (siginfo_layout(from->si_signo, from->si_code)) {
-	case SIL_KILL:
-		err |= __put_user(from->si_pid, &to->si_pid);
-		err |= __put_user(from->si_uid, &to->si_uid);
-		break;
-	case SIL_TIMER:
-		/* Unreached SI_TIMER is negative */
-		break;
-	case SIL_POLL:
-		err |= __put_user(from->si_band, &to->si_band);
-		err |= __put_user(from->si_fd, &to->si_fd);
-		break;
-	case SIL_FAULT:
-		err |= __put_user(from->si_addr, &to->si_addr);
-#ifdef __ARCH_SI_TRAPNO
-		err |= __put_user(from->si_trapno, &to->si_trapno);
-#endif
-#ifdef __ia64__
-		err |= __put_user(from->si_imm, &to->si_imm);
-		err |= __put_user(from->si_flags, &to->si_flags);
-		err |= __put_user(from->si_isr, &to->si_isr);
-#endif
-		/*
-		 * Other callers might not initialize the si_lsb field,
-		 * so check explicitly for the right codes here.
-		 */
-#ifdef BUS_MCEERR_AR
-		if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AR)
-			err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
-#endif
-#ifdef BUS_MCEERR_AO
-		if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AO)
-			err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
-#endif
-#ifdef SEGV_BNDERR
-		if (from->si_signo == SIGSEGV && from->si_code == SEGV_BNDERR) {
-			err |= __put_user(from->si_lower, &to->si_lower);
-			err |= __put_user(from->si_upper, &to->si_upper);
-		}
-#endif
-#ifdef SEGV_PKUERR
-		if (from->si_signo == SIGSEGV && from->si_code == SEGV_PKUERR)
-			err |= __put_user(from->si_pkey, &to->si_pkey);
-#endif
-		break;
-	case SIL_CHLD:
-		err |= __put_user(from->si_pid, &to->si_pid);
-		err |= __put_user(from->si_uid, &to->si_uid);
-		err |= __put_user(from->si_status, &to->si_status);
-		err |= __put_user(from->si_utime, &to->si_utime);
-		err |= __put_user(from->si_stime, &to->si_stime);
-		break;
-	case SIL_RT:
-		err |= __put_user(from->si_pid, &to->si_pid);
-		err |= __put_user(from->si_uid, &to->si_uid);
-		err |= __put_user(from->si_ptr, &to->si_ptr);
-		break;
-	case SIL_SYS:
-		err |= __put_user(from->si_call_addr, &to->si_call_addr);
-		err |= __put_user(from->si_syscall, &to->si_syscall);
-		err |= __put_user(from->si_arch, &to->si_arch);
-		break;
-	}
-	return err;
+	return 0;
 }
 
 #ifdef CONFIG_COMPAT
@@ -2971,27 +2902,28 @@ int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
 #ifdef __ARCH_SI_TRAPNO
 		new.si_trapno = from->si_trapno;
 #endif
-#ifdef BUS_MCEERR_AR
-		if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AR))
-			new.si_addr_lsb = from->si_addr_lsb;
-#endif
-#ifdef BUS_MCEERR_AO
-		if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AO))
-			new.si_addr_lsb = from->si_addr_lsb;
+		break;
+	case SIL_FAULT_MCEERR:
+		new.si_addr = ptr_to_compat(from->si_addr);
+#ifdef __ARCH_SI_TRAPNO
+		new.si_trapno = from->si_trapno;
 #endif
-#ifdef SEGV_BNDERR
-		if ((from->si_signo == SIGSEGV) &&
-		    (from->si_code == SEGV_BNDERR)) {
-			new.si_lower = ptr_to_compat(from->si_lower);
-			new.si_upper = ptr_to_compat(from->si_upper);
-		}
+		new.si_addr_lsb = from->si_addr_lsb;
+		break;
+	case SIL_FAULT_BNDERR:
+		new.si_addr = ptr_to_compat(from->si_addr);
+#ifdef __ARCH_SI_TRAPNO
+		new.si_trapno = from->si_trapno;
 #endif
-#ifdef SEGV_PKUERR
-		if ((from->si_signo == SIGSEGV) &&
-		    (from->si_code == SEGV_PKUERR))
-			new.si_pkey = from->si_pkey;
+		new.si_lower = ptr_to_compat(from->si_lower);
+		new.si_upper = ptr_to_compat(from->si_upper);
+		break;
+	case SIL_FAULT_PKUERR:
+		new.si_addr = ptr_to_compat(from->si_addr);
+#ifdef __ARCH_SI_TRAPNO
+		new.si_trapno = from->si_trapno;
 #endif
-
+		new.si_pkey = from->si_pkey;
 		break;
 	case SIL_CHLD:
 		new.si_pid    = from->si_pid;
@@ -3057,24 +2989,28 @@ int copy_siginfo_from_user32(struct siginfo *to,
 #ifdef __ARCH_SI_TRAPNO
 		to->si_trapno = from.si_trapno;
 #endif
-#ifdef BUS_MCEERR_AR
-		if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AR))
-			to->si_addr_lsb = from.si_addr_lsb;
-#endif
-#ifdef BUS_MCEER_AO
-		if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AO))
-			to->si_addr_lsb = from.si_addr_lsb;
+		break;
+	case SIL_FAULT_MCEERR:
+		to->si_addr = compat_ptr(from.si_addr);
+#ifdef __ARCH_SI_TRAPNO
+		to->si_trapno = from.si_trapno;
 #endif
-#ifdef SEGV_BNDERR
-		if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_BNDERR)) {
-			to->si_lower = compat_ptr(from.si_lower);
-			to->si_upper = compat_ptr(from.si_upper);
-		}
+		to->si_addr_lsb = from.si_addr_lsb;
+		break;
+	case SIL_FAULT_BNDERR:
+		to->si_addr = compat_ptr(from.si_addr);
+#ifdef __ARCH_SI_TRAPNO
+		to->si_trapno = from.si_trapno;
 #endif
-#ifdef SEGV_PKUERR
-		if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_PKUERR))
-			to->si_pkey = from.si_pkey;
+		to->si_lower = compat_ptr(from.si_lower);
+		to->si_upper = compat_ptr(from.si_upper);
+		break;
+	case SIL_FAULT_PKUERR:
+		to->si_addr = compat_ptr(from.si_addr);
+#ifdef __ARCH_SI_TRAPNO
+		to->si_trapno = from.si_trapno;
 #endif
+		to->si_pkey = from.si_pkey;
 		break;
 	case SIL_CHLD:
 		to->si_pid    = from.si_pid;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 177de3640c78..de2f57fddc04 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -49,8 +49,8 @@
  */
 
 #ifndef __ARCH_IRQ_STAT
-irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned;
-EXPORT_SYMBOL(irq_stat);
+DEFINE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
 #endif
 
 static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
@@ -145,8 +145,7 @@ static void __local_bh_enable(unsigned int cnt)
 }
 
 /*
- * Special-case - softirqs can safely be enabled in
- * cond_resched_softirq(), or by __do_softirq(),
+ * Special-case - softirqs can safely be enabled by __do_softirq(),
  * without processing still-pending softirqs:
  */
 void _local_bh_enable(void)
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index b7591261652d..f89014a2c238 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -21,6 +21,7 @@
 #include <linux/smpboot.h>
 #include <linux/atomic.h>
 #include <linux/nmi.h>
+#include <linux/sched/wake_q.h>
 
 /*
  * Structure to determine completion condition and record errors.  May
@@ -36,7 +37,7 @@ struct cpu_stop_done {
 struct cpu_stopper {
 	struct task_struct	*thread;
 
-	spinlock_t		lock;
+	raw_spinlock_t		lock;
 	bool			enabled;	/* is this stopper enabled? */
 	struct list_head	works;		/* list of pending works */
 
@@ -65,26 +66,30 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done)
 }
 
 static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
-					struct cpu_stop_work *work)
+					struct cpu_stop_work *work,
+					struct wake_q_head *wakeq)
 {
 	list_add_tail(&work->list, &stopper->works);
-	wake_up_process(stopper->thread);
+	wake_q_add(wakeq, stopper->thread);
 }
 
 /* queue @work to @stopper.  if offline, @work is completed immediately */
 static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
 {
 	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+	DEFINE_WAKE_Q(wakeq);
 	unsigned long flags;
 	bool enabled;
 
-	spin_lock_irqsave(&stopper->lock, flags);
+	raw_spin_lock_irqsave(&stopper->lock, flags);
 	enabled = stopper->enabled;
 	if (enabled)
-		__cpu_stop_queue_work(stopper, work);
+		__cpu_stop_queue_work(stopper, work, &wakeq);
 	else if (work->done)
 		cpu_stop_signal_done(work->done);
-	spin_unlock_irqrestore(&stopper->lock, flags);
+	raw_spin_unlock_irqrestore(&stopper->lock, flags);
+
+	wake_up_q(&wakeq);
 
 	return enabled;
 }
@@ -229,10 +234,11 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
 {
 	struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
 	struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
+	DEFINE_WAKE_Q(wakeq);
 	int err;
 retry:
-	spin_lock_irq(&stopper1->lock);
-	spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
+	raw_spin_lock_irq(&stopper1->lock);
+	raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
 
 	err = -ENOENT;
 	if (!stopper1->enabled || !stopper2->enabled)
@@ -252,17 +258,20 @@ retry:
 			goto unlock;
 
 	err = 0;
-	__cpu_stop_queue_work(stopper1, work1);
-	__cpu_stop_queue_work(stopper2, work2);
+	__cpu_stop_queue_work(stopper1, work1, &wakeq);
+	__cpu_stop_queue_work(stopper2, work2, &wakeq);
 unlock:
-	spin_unlock(&stopper2->lock);
-	spin_unlock_irq(&stopper1->lock);
+	raw_spin_unlock(&stopper2->lock);
+	raw_spin_unlock_irq(&stopper1->lock);
 
 	if (unlikely(err == -EDEADLK)) {
 		while (stop_cpus_in_progress)
 			cpu_relax();
 		goto retry;
 	}
+
+	wake_up_q(&wakeq);
+
 	return err;
 }
 /**
@@ -448,9 +457,9 @@ static int cpu_stop_should_run(unsigned int cpu)
 	unsigned long flags;
 	int run;
 
-	spin_lock_irqsave(&stopper->lock, flags);
+	raw_spin_lock_irqsave(&stopper->lock, flags);
 	run = !list_empty(&stopper->works);
-	spin_unlock_irqrestore(&stopper->lock, flags);
+	raw_spin_unlock_irqrestore(&stopper->lock, flags);
 	return run;
 }
 
@@ -461,13 +470,13 @@ static void cpu_stopper_thread(unsigned int cpu)
 
 repeat:
 	work = NULL;
-	spin_lock_irq(&stopper->lock);
+	raw_spin_lock_irq(&stopper->lock);
 	if (!list_empty(&stopper->works)) {
 		work = list_first_entry(&stopper->works,
 					struct cpu_stop_work, list);
 		list_del_init(&work->list);
 	}
-	spin_unlock_irq(&stopper->lock);
+	raw_spin_unlock_irq(&stopper->lock);
 
 	if (work) {
 		cpu_stop_fn_t fn = work->fn;
@@ -541,7 +550,7 @@ static int __init cpu_stop_init(void)
 	for_each_possible_cpu(cpu) {
 		struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
 
-		spin_lock_init(&stopper->lock);
+		raw_spin_lock_init(&stopper->lock);
 		INIT_LIST_HEAD(&stopper->works);
 	}
 
diff --git a/kernel/sys.c b/kernel/sys.c
index ad692183dfe9..d1b2b8d934bb 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -61,6 +61,8 @@
 #include <linux/uidgid.h>
 #include <linux/cred.h>
 
+#include <linux/nospec.h>
+
 #include <linux/kmsg_dump.h>
 /* Move somewhere else to avoid recompiling? */
 #include <generated/utsrelease.h>
@@ -69,6 +71,9 @@
 #include <asm/io.h>
 #include <asm/unistd.h>
 
+/* Hardening for Spectre-v1 */
+#include <linux/nospec.h>
+
 #include "uid16.h"
 
 #ifndef SET_UNALIGN_CTL
@@ -1451,6 +1456,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
 	if (resource >= RLIM_NLIMITS)
 		return -EINVAL;
 
+	resource = array_index_nospec(resource, RLIM_NLIMITS);
 	task_lock(current->group_leader);
 	x = current->signal->rlim[resource];
 	task_unlock(current->group_leader);
@@ -1470,6 +1476,7 @@ COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
 	if (resource >= RLIM_NLIMITS)
 		return -EINVAL;
 
+	resource = array_index_nospec(resource, RLIM_NLIMITS);
 	task_lock(current->group_leader);
 	r = current->signal->rlim[resource];
 	task_unlock(current->group_leader);
@@ -2242,6 +2249,17 @@ static int propagate_has_child_subreaper(struct task_struct *p, void *data)
 	return 1;
 }
 
+int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which)
+{
+	return -EINVAL;
+}
+
+int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
+				    unsigned long ctrl)
+{
+	return -EINVAL;
+}
+
 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		unsigned long, arg4, unsigned long, arg5)
 {
@@ -2450,6 +2468,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_SVE_GET_VL:
 		error = SVE_GET_VL();
 		break;
+	case PR_GET_SPECULATION_CTRL:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = arch_prctl_spec_ctrl_get(me, arg2);
+		break;
+	case PR_SET_SPECULATION_CTRL:
+		if (arg4 || arg5)
+			return -EINVAL;
+		error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
+		break;
 	default:
 		error = -EINVAL;
 		break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 6cafc008f6db..183169c2a75b 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -5,6 +5,11 @@
 
 #include <asm/unistd.h>
 
+#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
+/* Architectures may override COND_SYSCALL and COND_SYSCALL_COMPAT */
+#include <asm/syscall_wrapper.h>
+#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */
+
 /*  we can't #include <linux/syscalls.h> here,
     but tell gcc to not warn with -Wmissing-prototypes  */
 asmlinkage long sys_ni_syscall(void);
@@ -17,8 +22,13 @@ asmlinkage long sys_ni_syscall(void)
 	return -ENOSYS;
 }
 
+#ifndef COND_SYSCALL
 #define COND_SYSCALL(name) cond_syscall(sys_##name)
+#endif /* COND_SYSCALL */
+
+#ifndef COND_SYSCALL_COMPAT
 #define COND_SYSCALL_COMPAT(name) cond_syscall(compat_sys_##name)
+#endif /* COND_SYSCALL_COMPAT */
 
 /*
  * This list is kept in the same order as include/uapi/asm-generic/unistd.h.
@@ -33,7 +43,9 @@ COND_SYSCALL(io_submit);
 COND_SYSCALL_COMPAT(io_submit);
 COND_SYSCALL(io_cancel);
 COND_SYSCALL(io_getevents);
+COND_SYSCALL(io_pgetevents);
 COND_SYSCALL_COMPAT(io_getevents);
+COND_SYSCALL_COMPAT(io_pgetevents);
 
 /* fs/xattr.c */
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index bdf7090b106d..6a78cf70761d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1340,7 +1340,7 @@ static struct ctl_table vm_table[] = {
 	{
 		.procname	= "dirtytime_expire_seconds",
 		.data		= &dirtytime_expire_interval,
-		.maxlen		= sizeof(dirty_expire_interval),
+		.maxlen		= sizeof(dirtytime_expire_interval),
 		.mode		= 0644,
 		.proc_handler	= dirtytime_interval_handler,
 		.extra1		= &zero,
@@ -2511,6 +2511,15 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
 }
 #endif
 
+/**
+ * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure
+ * @min: pointer to minimum allowable value
+ * @max: pointer to maximum allowable value
+ *
+ * The do_proc_dointvec_minmax_conv_param structure provides the
+ * minimum and maximum values for doing range checking for those sysctl
+ * parameters that use the proc_dointvec_minmax() handler.
+ */
 struct do_proc_dointvec_minmax_conv_param {
 	int *min;
 	int *max;
@@ -2554,7 +2563,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
  * This routine will ensure the values are within the range specified by
  * table->extra1 (min) and table->extra2 (max).
  *
- * Returns 0 on success.
+ * Returns 0 on success or -EINVAL on write when the range check fails.
  */
 int proc_dointvec_minmax(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -2567,6 +2576,15 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
 				do_proc_dointvec_minmax_conv, &param);
 }
 
+/**
+ * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure
+ * @min: pointer to minimum allowable value
+ * @max: pointer to maximum allowable value
+ *
+ * The do_proc_douintvec_minmax_conv_param structure provides the
+ * minimum and maximum values for doing range checking for those sysctl
+ * parameters that use the proc_douintvec_minmax() handler.
+ */
 struct do_proc_douintvec_minmax_conv_param {
 	unsigned int *min;
 	unsigned int *max;
@@ -2614,7 +2632,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
  * check for UINT_MAX to avoid having to support wrap around uses from
  * userspace.
  *
- * Returns 0 on success.
+ * Returns 0 on success or -ERANGE on write when the range check fails.
  */
 int proc_douintvec_minmax(struct ctl_table *table, int write,
 			  void __user *buffer, size_t *lenp, loff_t *ppos)
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index e8c0dab4fd65..07148b497451 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -704,24 +704,6 @@ static const struct bin_table bin_net_netfilter_table[] = {
 	{}
 };
 
-static const struct bin_table bin_net_irda_table[] = {
-	{ CTL_INT,	NET_IRDA_DISCOVERY,		"discovery" },
-	{ CTL_STR,	NET_IRDA_DEVNAME,		"devname" },
-	{ CTL_INT,	NET_IRDA_DEBUG,			"debug" },
-	{ CTL_INT,	NET_IRDA_FAST_POLL,		"fast_poll_increase" },
-	{ CTL_INT,	NET_IRDA_DISCOVERY_SLOTS,	"discovery_slots" },
-	{ CTL_INT,	NET_IRDA_DISCOVERY_TIMEOUT,	"discovery_timeout" },
-	{ CTL_INT,	NET_IRDA_SLOT_TIMEOUT,		"slot_timeout" },
-	{ CTL_INT,	NET_IRDA_MAX_BAUD_RATE,		"max_baud_rate" },
-	{ CTL_INT,	NET_IRDA_MIN_TX_TURN_TIME,	"min_tx_turn_time" },
-	{ CTL_INT,	NET_IRDA_MAX_TX_DATA_SIZE,	"max_tx_data_size" },
-	{ CTL_INT,	NET_IRDA_MAX_TX_WINDOW,		"max_tx_window" },
-	{ CTL_INT,	NET_IRDA_MAX_NOREPLY_TIME,	"max_noreply_time" },
-	{ CTL_INT,	NET_IRDA_WARN_NOREPLY_TIME,	"warn_noreply_time" },
-	{ CTL_INT,	NET_IRDA_LAP_KEEPALIVE_TIME,	"lap_keepalive_time" },
-	{}
-};
-
 static const struct bin_table bin_net_table[] = {
 	{ CTL_DIR,	NET_CORE,		"core",		bin_net_core_table },
 	/* NET_ETHER not used */
@@ -743,7 +725,7 @@ static const struct bin_table bin_net_table[] = {
 	{ CTL_DIR,	NET_LLC,		"llc",		bin_net_llc_table },
 	{ CTL_DIR,	NET_NETFILTER,		"netfilter",	bin_net_netfilter_table },
 	/* NET_DCCP "dccp" no longer used */
-	{ CTL_DIR,	NET_IRDA,		"irda",		bin_net_irda_table },
+	/* NET_IRDA "irda" no longer used */
 	{ CTL_INT,	2089,			"nf_conntrack_max" },
 	{}
 };
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 0e974cface0b..f89a78e2792b 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -119,8 +119,15 @@ static DEFINE_SPINLOCK(watchdog_lock);
 static int watchdog_running;
 static atomic_t watchdog_reset_pending;
 
-static int clocksource_watchdog_kthread(void *data);
-static void __clocksource_change_rating(struct clocksource *cs, int rating);
+static void inline clocksource_watchdog_lock(unsigned long *flags)
+{
+	spin_lock_irqsave(&watchdog_lock, *flags);
+}
+
+static void inline clocksource_watchdog_unlock(unsigned long *flags)
+{
+	spin_unlock_irqrestore(&watchdog_lock, *flags);
+}
 
 /*
  * Interval: 0.5sec Threshold: 0.0625s
@@ -128,23 +135,24 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating);
 #define WATCHDOG_INTERVAL (HZ >> 1)
 #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
 
-static void clocksource_watchdog_work(struct work_struct *work)
-{
-	/*
-	 * If kthread_run fails the next watchdog scan over the
-	 * watchdog_list will find the unstable clock again.
-	 */
-	kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
-}
-
 static void __clocksource_unstable(struct clocksource *cs)
 {
 	cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
 	cs->flags |= CLOCK_SOURCE_UNSTABLE;
 
+	/*
+	 * If the clocksource is registered clocksource_watchdog_work() will
+	 * re-rate and re-select.
+	 */
+	if (list_empty(&cs->list)) {
+		cs->rating = 0;
+		return;
+	}
+
 	if (cs->mark_unstable)
 		cs->mark_unstable(cs);
 
+	/* kick clocksource_watchdog_work() */
 	if (finished_booting)
 		schedule_work(&watchdog_work);
 }
@@ -153,10 +161,8 @@ static void __clocksource_unstable(struct clocksource *cs)
  * clocksource_mark_unstable - mark clocksource unstable via watchdog
  * @cs:		clocksource to be marked unstable
  *
- * This function is called instead of clocksource_change_rating from
- * cpu hotplug code to avoid a deadlock between the clocksource mutex
- * and the cpu hotplug mutex. It defers the update of the clocksource
- * to the watchdog thread.
+ * This function is called by the x86 TSC code to mark clocksources as unstable;
+ * it defers demotion and re-selection to a work.
  */
 void clocksource_mark_unstable(struct clocksource *cs)
 {
@@ -164,7 +170,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
 
 	spin_lock_irqsave(&watchdog_lock, flags);
 	if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
-		if (list_empty(&cs->wd_list))
+		if (!list_empty(&cs->list) && list_empty(&cs->wd_list))
 			list_add(&cs->wd_list, &watchdog_list);
 		__clocksource_unstable(cs);
 	}
@@ -319,9 +325,8 @@ static void clocksource_resume_watchdog(void)
 
 static void clocksource_enqueue_watchdog(struct clocksource *cs)
 {
-	unsigned long flags;
+	INIT_LIST_HEAD(&cs->wd_list);
 
-	spin_lock_irqsave(&watchdog_lock, flags);
 	if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
 		/* cs is a clocksource to be watched. */
 		list_add(&cs->wd_list, &watchdog_list);
@@ -331,7 +336,6 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
 		if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
 			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
 	}
-	spin_unlock_irqrestore(&watchdog_lock, flags);
 }
 
 static void clocksource_select_watchdog(bool fallback)
@@ -373,9 +377,6 @@ static void clocksource_select_watchdog(bool fallback)
 
 static void clocksource_dequeue_watchdog(struct clocksource *cs)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&watchdog_lock, flags);
 	if (cs != watchdog) {
 		if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
 			/* cs is a watched clocksource. */
@@ -384,21 +385,21 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs)
 			clocksource_stop_watchdog();
 		}
 	}
-	spin_unlock_irqrestore(&watchdog_lock, flags);
 }
 
-static int __clocksource_watchdog_kthread(void)
+static void __clocksource_change_rating(struct clocksource *cs, int rating);
+
+static int __clocksource_watchdog_work(void)
 {
 	struct clocksource *cs, *tmp;
 	unsigned long flags;
-	LIST_HEAD(unstable);
 	int select = 0;
 
 	spin_lock_irqsave(&watchdog_lock, flags);
 	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
 		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
 			list_del_init(&cs->wd_list);
-			list_add(&cs->wd_list, &unstable);
+			__clocksource_change_rating(cs, 0);
 			select = 1;
 		}
 		if (cs->flags & CLOCK_SOURCE_RESELECT) {
@@ -410,21 +411,15 @@ static int __clocksource_watchdog_kthread(void)
 	clocksource_stop_watchdog();
 	spin_unlock_irqrestore(&watchdog_lock, flags);
 
-	/* Needs to be done outside of watchdog lock */
-	list_for_each_entry_safe(cs, tmp, &unstable, wd_list) {
-		list_del_init(&cs->wd_list);
-		__clocksource_change_rating(cs, 0);
-	}
 	return select;
 }
 
-static int clocksource_watchdog_kthread(void *data)
+static void clocksource_watchdog_work(struct work_struct *work)
 {
 	mutex_lock(&clocksource_mutex);
-	if (__clocksource_watchdog_kthread())
+	if (__clocksource_watchdog_work())
 		clocksource_select();
 	mutex_unlock(&clocksource_mutex);
-	return 0;
 }
 
 static bool clocksource_is_watchdog(struct clocksource *cs)
@@ -443,10 +438,13 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
 static void clocksource_select_watchdog(bool fallback) { }
 static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
 static inline void clocksource_resume_watchdog(void) { }
-static inline int __clocksource_watchdog_kthread(void) { return 0; }
+static inline int __clocksource_watchdog_work(void) { return 0; }
 static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
 void clocksource_mark_unstable(struct clocksource *cs) { }
 
+static inline void clocksource_watchdog_lock(unsigned long *flags) { }
+static inline void clocksource_watchdog_unlock(unsigned long *flags) { }
+
 #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
 
 /**
@@ -674,7 +672,7 @@ static int __init clocksource_done_booting(void)
 	/*
 	 * Run the watchdog first to eliminate unstable clock sources
 	 */
-	__clocksource_watchdog_kthread();
+	__clocksource_watchdog_work();
 	clocksource_select();
 	mutex_unlock(&clocksource_mutex);
 	return 0;
@@ -779,14 +777,19 @@ EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
  */
 int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
 {
+	unsigned long flags;
 
 	/* Initialize mult/shift and max_idle_ns */
 	__clocksource_update_freq_scale(cs, scale, freq);
 
 	/* Add clocksource to the clocksource list */
 	mutex_lock(&clocksource_mutex);
+
+	clocksource_watchdog_lock(&flags);
 	clocksource_enqueue(cs);
 	clocksource_enqueue_watchdog(cs);
+	clocksource_watchdog_unlock(&flags);
+
 	clocksource_select();
 	clocksource_select_watchdog(false);
 	mutex_unlock(&clocksource_mutex);
@@ -808,8 +811,13 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating)
  */
 void clocksource_change_rating(struct clocksource *cs, int rating)
 {
+	unsigned long flags;
+
 	mutex_lock(&clocksource_mutex);
+	clocksource_watchdog_lock(&flags);
 	__clocksource_change_rating(cs, rating);
+	clocksource_watchdog_unlock(&flags);
+
 	clocksource_select();
 	clocksource_select_watchdog(false);
 	mutex_unlock(&clocksource_mutex);
@@ -821,6 +829,8 @@ EXPORT_SYMBOL(clocksource_change_rating);
  */
 static int clocksource_unbind(struct clocksource *cs)
 {
+	unsigned long flags;
+
 	if (clocksource_is_watchdog(cs)) {
 		/* Select and try to install a replacement watchdog. */
 		clocksource_select_watchdog(true);
@@ -834,8 +844,12 @@ static int clocksource_unbind(struct clocksource *cs)
 		if (curr_clocksource == cs)
 			return -EBUSY;
 	}
+
+	clocksource_watchdog_lock(&flags);
 	clocksource_dequeue_watchdog(cs);
 	list_del_init(&cs->list);
+	clocksource_watchdog_unlock(&flags);
+
 	return 0;
 }
 
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 9b082ce86325..055a4a728c00 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -91,6 +91,11 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 			.get_time = &ktime_get_real,
 		},
 		{
+			.index = HRTIMER_BASE_BOOTTIME,
+			.clockid = CLOCK_BOOTTIME,
+			.get_time = &ktime_get_boottime,
+		},
+		{
 			.index = HRTIMER_BASE_TAI,
 			.clockid = CLOCK_TAI,
 			.get_time = &ktime_get_clocktai,
@@ -106,6 +111,11 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 			.get_time = &ktime_get_real,
 		},
 		{
+			.index = HRTIMER_BASE_BOOTTIME_SOFT,
+			.clockid = CLOCK_BOOTTIME,
+			.get_time = &ktime_get_boottime,
+		},
+		{
 			.index = HRTIMER_BASE_TAI_SOFT,
 			.clockid = CLOCK_TAI,
 			.get_time = &ktime_get_clocktai,
@@ -119,7 +129,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
 
 	[CLOCK_REALTIME]	= HRTIMER_BASE_REALTIME,
 	[CLOCK_MONOTONIC]	= HRTIMER_BASE_MONOTONIC,
-	[CLOCK_BOOTTIME]	= HRTIMER_BASE_MONOTONIC,
+	[CLOCK_BOOTTIME]	= HRTIMER_BASE_BOOTTIME,
 	[CLOCK_TAI]		= HRTIMER_BASE_TAI,
 };
 
@@ -480,6 +490,7 @@ __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
 	while ((base = __next_base((cpu_base), &(active))))
 
 static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
+					 const struct hrtimer *exclude,
 					 unsigned int active,
 					 ktime_t expires_next)
 {
@@ -492,9 +503,22 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
 
 		next = timerqueue_getnext(&base->active);
 		timer = container_of(next, struct hrtimer, node);
+		if (timer == exclude) {
+			/* Get to the next timer in the queue. */
+			next = timerqueue_iterate_next(next);
+			if (!next)
+				continue;
+
+			timer = container_of(next, struct hrtimer, node);
+		}
 		expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
 		if (expires < expires_next) {
 			expires_next = expires;
+
+			/* Skip cpu_base update if a timer is being excluded. */
+			if (exclude)
+				continue;
+
 			if (timer->is_soft)
 				cpu_base->softirq_next_timer = timer;
 			else
@@ -538,7 +562,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_
 	if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
 		active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
 		cpu_base->softirq_next_timer = NULL;
-		expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX);
+		expires_next = __hrtimer_next_event_base(cpu_base, NULL,
+							 active, KTIME_MAX);
 
 		next_timer = cpu_base->softirq_next_timer;
 	}
@@ -546,7 +571,8 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_
 	if (active_mask & HRTIMER_ACTIVE_HARD) {
 		active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
 		cpu_base->next_timer = next_timer;
-		expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next);
+		expires_next = __hrtimer_next_event_base(cpu_base, NULL, active,
+							 expires_next);
 	}
 
 	return expires_next;
@@ -555,12 +581,14 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_
 static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
 {
 	ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
+	ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
 	ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
 
 	ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
-						   offs_real, offs_tai);
+					    offs_real, offs_boot, offs_tai);
 
 	base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real;
+	base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot;
 	base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai;
 
 	return now;
@@ -1190,6 +1218,39 @@ u64 hrtimer_get_next_event(void)
 
 	return expires;
 }
+
+/**
+ * hrtimer_next_event_without - time until next expiry event w/o one timer
+ * @exclude:	timer to exclude
+ *
+ * Returns the next expiry time over all timers except for the @exclude one or
+ * KTIME_MAX if none of them is pending.
+ */
+u64 hrtimer_next_event_without(const struct hrtimer *exclude)
+{
+	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+	u64 expires = KTIME_MAX;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&cpu_base->lock, flags);
+
+	if (__hrtimer_hres_active(cpu_base)) {
+		unsigned int active;
+
+		if (!cpu_base->softirq_activated) {
+			active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
+			expires = __hrtimer_next_event_base(cpu_base, exclude,
+							    active, KTIME_MAX);
+		}
+		active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
+		expires = __hrtimer_next_event_base(cpu_base, exclude, active,
+						    expires);
+	}
+
+	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+
+	return expires;
+}
 #endif
 
 static inline int hrtimer_clockid_to_base(clockid_t clock_id)
@@ -1698,8 +1759,10 @@ out:
 	return ret;
 }
 
-SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
-		struct timespec __user *, rmtp)
+#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT)
+
+SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
+		struct __kernel_timespec __user *, rmtp)
 {
 	struct timespec64 tu;
 
@@ -1714,7 +1777,9 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
 	return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
 }
 
-#ifdef CONFIG_COMPAT
+#endif
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
 
 COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
 		       struct compat_timespec __user *, rmtp)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 8d70da1b9a0d..a09ded765f6c 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -31,7 +31,7 @@
 
 
 /* USER_HZ period (usecs): */
-unsigned long			tick_usec = TICK_USEC;
+unsigned long			tick_usec = USER_TICK_USEC;
 
 /* SHIFTED_HZ period (nsecs): */
 unsigned long			tick_nsec;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 2541bd89f20e..5a6251ac6f7a 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1205,10 +1205,12 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 			   u64 *newval, u64 *oldval)
 {
 	u64 now;
+	int ret;
 
 	WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED);
+	ret = cpu_timer_sample_group(clock_idx, tsk, &now);
 
-	if (oldval && cpu_timer_sample_group(clock_idx, tsk, &now) != -EINVAL) {
+	if (oldval && ret != -EINVAL) {
 		/*
 		 * We are setting itimer. The *oldval is absolute and we update
 		 * it to be relative, *newval argument is relative and we update
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 6259dbc0191a..26aa9569e24a 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -19,6 +19,11 @@
 #include <linux/posix-timers.h>
 #include <linux/compat.h>
 
+#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
+/* Architectures may override SYS_NI and COMPAT_SYS_NI */
+#include <asm/syscall_wrapper.h>
+#endif
+
 asmlinkage long sys_ni_posix_timers(void)
 {
 	pr_err_once("process %d (%s) attempted a POSIX timer syscall "
@@ -27,8 +32,13 @@ asmlinkage long sys_ni_posix_timers(void)
 	return -ENOSYS;
 }
 
+#ifndef SYS_NI
 #define SYS_NI(name)  SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers)
+#endif
+
+#ifndef COMPAT_SYS_NI
 #define COMPAT_SYS_NI(name)  SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers)
+#endif
 
 SYS_NI(timer_create);
 SYS_NI(timer_gettime);
@@ -49,7 +59,7 @@ SYS_NI(alarm);
  */
 
 SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
-		const struct timespec __user *, tp)
+		const struct __kernel_timespec __user *, tp)
 {
 	struct timespec64 new_tp;
 
@@ -73,8 +83,6 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp)
 	case CLOCK_BOOTTIME:
 		get_monotonic_boottime64(tp);
 		break;
-	case CLOCK_MONOTONIC_ACTIVE:
-		ktime_get_active_ts64(tp);
 	default:
 		return -EINVAL;
 	}
@@ -82,7 +90,7 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp)
 	return 0;
 }
 SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
-		struct timespec __user *, tp)
+		struct __kernel_timespec __user *, tp)
 {
 	int ret;
 	struct timespec64 kernel_tp;
@@ -96,7 +104,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
 	return 0;
 }
 
-SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp)
+SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct __kernel_timespec __user *, tp)
 {
 	struct timespec64 rtn_tp = {
 		.tv_sec = 0,
@@ -116,8 +124,8 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __us
 }
 
 SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
-		const struct timespec __user *, rqtp,
-		struct timespec __user *, rmtp)
+		const struct __kernel_timespec __user *, rqtp,
+		struct __kernel_timespec __user *, rmtp)
 {
 	struct timespec64 t;
 
@@ -150,7 +158,9 @@ COMPAT_SYS_NI(timer_settime);
 COMPAT_SYS_NI(timer_gettime);
 COMPAT_SYS_NI(getitimer);
 COMPAT_SYS_NI(setitimer);
+#endif
 
+#ifdef CONFIG_COMPAT_32BIT_TIME
 COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
 		       struct compat_timespec __user *, tp)
 {
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index b6899b5060bd..e08ce3f27447 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -252,16 +252,15 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *
 	return 0;
 }
 
-static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp)
+static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp)
 {
-	timekeeping_clocktai64(tp);
+	get_monotonic_boottime64(tp);
 	return 0;
 }
 
-static int posix_get_monotonic_active(clockid_t which_clock,
-				      struct timespec64 *tp)
+static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp)
 {
-	ktime_get_active_ts64(tp);
+	timekeeping_clocktai64(tp);
 	return 0;
 }
 
@@ -1041,7 +1040,7 @@ void exit_itimers(struct signal_struct *sig)
 }
 
 SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
-		const struct timespec __user *, tp)
+		const struct __kernel_timespec __user *, tp)
 {
 	const struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct timespec64 new_tp;
@@ -1056,7 +1055,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
 }
 
 SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
-		struct timespec __user *,tp)
+		struct __kernel_timespec __user *, tp)
 {
 	const struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct timespec64 kernel_tp;
@@ -1097,7 +1096,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
 }
 
 SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
-		struct timespec __user *, tp)
+		struct __kernel_timespec __user *, tp)
 {
 	const struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct timespec64 rtn_tp;
@@ -1114,7 +1113,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
 	return error;
 }
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_COMPAT_32BIT_TIME
 
 COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
 		       struct compat_timespec __user *, tp)
@@ -1149,6 +1148,10 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
 	return err;
 }
 
+#endif
+
+#ifdef CONFIG_COMPAT
+
 COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
 		       struct compat_timex __user *, utp)
 {
@@ -1173,6 +1176,10 @@ COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
 	return err;
 }
 
+#endif
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
+
 COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
 		       struct compat_timespec __user *, tp)
 {
@@ -1204,8 +1211,8 @@ static int common_nsleep(const clockid_t which_clock, int flags,
 }
 
 SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
-		const struct timespec __user *, rqtp,
-		struct timespec __user *, rmtp)
+		const struct __kernel_timespec __user *, rqtp,
+		struct __kernel_timespec __user *, rmtp)
 {
 	const struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct timespec64 t;
@@ -1228,7 +1235,8 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 	return kc->nsleep(which_clock, flags, &t);
 }
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_COMPAT_32BIT_TIME
+
 COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
 		       struct compat_timespec __user *, rqtp,
 		       struct compat_timespec __user *, rmtp)
@@ -1253,6 +1261,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
 
 	return kc->nsleep(which_clock, flags, &t);
 }
+
 #endif
 
 static const struct k_clock clock_realtime = {
@@ -1317,9 +1326,19 @@ static const struct k_clock clock_tai = {
 	.timer_arm		= common_hrtimer_arm,
 };
 
-static const struct k_clock clock_monotonic_active = {
+static const struct k_clock clock_boottime = {
 	.clock_getres		= posix_get_hrtimer_res,
-	.clock_get		= posix_get_monotonic_active,
+	.clock_get		= posix_get_boottime,
+	.nsleep			= common_nsleep,
+	.timer_create		= common_timer_create,
+	.timer_set		= common_timer_set,
+	.timer_get		= common_timer_get,
+	.timer_del		= common_timer_del,
+	.timer_rearm		= common_hrtimer_rearm,
+	.timer_forward		= common_hrtimer_forward,
+	.timer_remaining	= common_hrtimer_remaining,
+	.timer_try_to_cancel	= common_hrtimer_try_to_cancel,
+	.timer_arm		= common_hrtimer_arm,
 };
 
 static const struct k_clock * const posix_clocks[] = {
@@ -1330,11 +1349,10 @@ static const struct k_clock * const posix_clocks[] = {
 	[CLOCK_MONOTONIC_RAW]		= &clock_monotonic_raw,
 	[CLOCK_REALTIME_COARSE]		= &clock_realtime_coarse,
 	[CLOCK_MONOTONIC_COARSE]	= &clock_monotonic_coarse,
-	[CLOCK_BOOTTIME]		= &clock_monotonic,
+	[CLOCK_BOOTTIME]		= &clock_boottime,
 	[CLOCK_REALTIME_ALARM]		= &alarm_clock,
 	[CLOCK_BOOTTIME_ALARM]		= &alarm_clock,
 	[CLOCK_TAI]			= &clock_tai,
-	[CLOCK_MONOTONIC_ACTIVE]	= &clock_monotonic_active,
 };
 
 static const struct k_clock *clockid_to_kclock(const clockid_t id)
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index b398c2ea69b2..aa2094d5dd27 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -612,6 +612,14 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
 	now = ktime_get();
 	/* Find all expired events */
 	for_each_cpu(cpu, tick_broadcast_oneshot_mask) {
+		/*
+		 * Required for !SMP because for_each_cpu() reports
+		 * unconditionally CPU0 as set on UP kernels.
+		 */
+		if (!IS_ENABLED(CONFIG_SMP) &&
+		    cpumask_empty(tick_broadcast_oneshot_mask))
+			break;
+
 		td = &per_cpu(tick_cpu_device, cpu);
 		if (td->evtdev->next_event <= now) {
 			cpumask_set_cpu(cpu, tmpmask);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 099572ca4a8f..b7005dd21ec1 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -277,7 +277,8 @@ static bool tick_check_preferred(struct clock_event_device *curdev,
 	 */
 	return !curdev ||
 		newdev->rating > curdev->rating ||
-	       !cpumask_equal(curdev->cpumask, newdev->cpumask);
+	       (!cpumask_equal(curdev->cpumask, newdev->cpumask) &&
+	        !tick_check_percpu(curdev, newdev, smp_processor_id()));
 }
 
 /*
@@ -419,19 +420,6 @@ void tick_suspend_local(void)
 	clockevents_shutdown(td->evtdev);
 }
 
-static void tick_forward_next_period(void)
-{
-	ktime_t delta, now = ktime_get();
-	u64 n;
-
-	delta = ktime_sub(now, tick_next_period);
-	n = ktime_divns(delta, tick_period);
-	tick_next_period += n * tick_period;
-	if (tick_next_period < now)
-		tick_next_period += tick_period;
-	tick_sched_forward_next_period();
-}
-
 /**
  * tick_resume_local - Resume the local tick device
  *
@@ -444,8 +432,6 @@ void tick_resume_local(void)
 	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
 	bool broadcast = tick_resume_check_broadcast();
 
-	tick_forward_next_period();
-
 	clockevents_tick_resume(td->evtdev);
 	if (!broadcast) {
 		if (td->mode == TICKDEV_MODE_PERIODIC)
@@ -505,6 +491,7 @@ void tick_freeze(void)
 	if (tick_freeze_depth == num_online_cpus()) {
 		trace_suspend_resume(TPS("timekeeping_freeze"),
 				     smp_processor_id(), true);
+		system_state = SYSTEM_SUSPEND;
 		timekeeping_suspend();
 	} else {
 		tick_suspend_local();
@@ -528,6 +515,7 @@ void tick_unfreeze(void)
 
 	if (tick_freeze_depth == num_online_cpus()) {
 		timekeeping_resume();
+		system_state = SYSTEM_RUNNING;
 		trace_suspend_resume(TPS("timekeeping_freeze"),
 				     smp_processor_id(), false);
 	} else {
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 21efab7485ca..e277284c2831 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -141,12 +141,6 @@ static inline void tick_check_oneshot_broadcast_this_cpu(void) { }
 static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); }
 #endif /* !(BROADCAST && ONESHOT) */
 
-#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
-extern void tick_sched_forward_next_period(void);
-#else
-static inline void tick_sched_forward_next_period(void) { }
-#endif
-
 /* NO_HZ_FULL internal */
 #ifdef CONFIG_NO_HZ_FULL
 extern void tick_nohz_init(void);
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index c1f518e7aa80..6fe615d57ebb 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -82,16 +82,15 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
 	if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
 		    !tick_device_is_functional(dev)) {
 
-		printk(KERN_INFO "Clockevents: "
-		       "could not switch to one-shot mode:");
+		pr_info("Clockevents: could not switch to one-shot mode:");
 		if (!dev) {
-			printk(" no tick device\n");
+			pr_cont(" no tick device\n");
 		} else {
 			if (!tick_device_is_functional(dev))
-				printk(" %s is not functional.\n", dev->name);
+				pr_cont(" %s is not functional.\n", dev->name);
 			else
-				printk(" %s does not support one-shot mode.\n",
-				       dev->name);
+				pr_cont(" %s does not support one-shot mode.\n",
+					dev->name);
 		}
 		return -EINVAL;
 	}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f3ab08caa2c3..da9455a6b42b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -52,15 +52,6 @@ struct tick_sched *tick_get_tick_sched(int cpu)
 static ktime_t last_jiffies_update;
 
 /*
- * Called after resume. Make sure that jiffies are not fast forwarded due to
- * clock monotonic being forwarded by the suspended time.
- */
-void tick_sched_forward_next_period(void)
-{
-	last_jiffies_update = tick_next_period;
-}
-
-/*
  * Must be called with interrupts disabled !
  */
 static void tick_do_update_jiffies64(ktime_t now)
@@ -122,8 +113,7 @@ static ktime_t tick_init_jiffy_update(void)
 	return period;
 }
 
-
-static void tick_sched_do_timer(ktime_t now)
+static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
 {
 	int cpu = smp_processor_id();
 
@@ -143,6 +133,9 @@ static void tick_sched_do_timer(ktime_t now)
 	/* Check, if the jiffies need an update */
 	if (tick_do_timer_cpu == cpu)
 		tick_do_update_jiffies64(now);
+
+	if (ts->inidle)
+		ts->got_idle_tick = 1;
 }
 
 static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
@@ -474,7 +467,9 @@ __setup("nohz=", setup_tick_nohz);
 
 bool tick_nohz_tick_stopped(void)
 {
-	return __this_cpu_read(tick_cpu_sched.tick_stopped);
+	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+	return ts->tick_stopped;
 }
 
 bool tick_nohz_tick_stopped_cpu(int cpu)
@@ -537,14 +532,11 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
 	sched_clock_idle_wakeup_event();
 }
 
-static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
+static void tick_nohz_start_idle(struct tick_sched *ts)
 {
-	ktime_t now = ktime_get();
-
-	ts->idle_entrytime = now;
+	ts->idle_entrytime = ktime_get();
 	ts->idle_active = 1;
 	sched_clock_idle_sleep_event();
-	return now;
 }
 
 /**
@@ -653,13 +645,10 @@ static inline bool local_timer_softirq_pending(void)
 	return local_softirq_pending() & TIMER_SOFTIRQ;
 }
 
-static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
-					 ktime_t now, int cpu)
+static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
 {
-	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
 	u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
 	unsigned long seq, basejiff;
-	ktime_t	tick;
 
 	/* Read jiffies and the time when jiffies were updated last */
 	do {
@@ -668,6 +657,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 		basejiff = jiffies;
 	} while (read_seqretry(&jiffies_lock, seq));
 	ts->last_jiffies = basejiff;
+	ts->timer_expires_base = basemono;
 
 	/*
 	 * Keep the periodic tick, when RCU, architecture or irq_work
@@ -712,47 +702,63 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 		 * next period, so no point in stopping it either, bail.
 		 */
 		if (!ts->tick_stopped) {
-			tick = 0;
+			ts->timer_expires = 0;
 			goto out;
 		}
 	}
 
 	/*
+	 * If this CPU is the one which had the do_timer() duty last, we limit
+	 * the sleep time to the timekeeping max_deferment value.
+	 * Otherwise we can sleep as long as we want.
+	 */
+	delta = timekeeping_max_deferment();
+	if (cpu != tick_do_timer_cpu &&
+	    (tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last))
+		delta = KTIME_MAX;
+
+	/* Calculate the next expiry time */
+	if (delta < (KTIME_MAX - basemono))
+		expires = basemono + delta;
+	else
+		expires = KTIME_MAX;
+
+	ts->timer_expires = min_t(u64, expires, next_tick);
+
+out:
+	return ts->timer_expires;
+}
+
+static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
+{
+	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
+	u64 basemono = ts->timer_expires_base;
+	u64 expires = ts->timer_expires;
+	ktime_t tick = expires;
+
+	/* Make sure we won't be trying to stop it twice in a row. */
+	ts->timer_expires_base = 0;
+
+	/*
 	 * If this CPU is the one which updates jiffies, then give up
 	 * the assignment and let it be taken by the CPU which runs
 	 * the tick timer next, which might be this CPU as well. If we
 	 * don't drop this here the jiffies might be stale and
 	 * do_timer() never invoked. Keep track of the fact that it
-	 * was the one which had the do_timer() duty last. If this CPU
-	 * is the one which had the do_timer() duty last, we limit the
-	 * sleep time to the timekeeping max_deferment value.
-	 * Otherwise we can sleep as long as we want.
+	 * was the one which had the do_timer() duty last.
 	 */
-	delta = timekeeping_max_deferment();
 	if (cpu == tick_do_timer_cpu) {
 		tick_do_timer_cpu = TICK_DO_TIMER_NONE;
 		ts->do_timer_last = 1;
 	} else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
-		delta = KTIME_MAX;
 		ts->do_timer_last = 0;
-	} else if (!ts->do_timer_last) {
-		delta = KTIME_MAX;
 	}
 
-	/* Calculate the next expiry time */
-	if (delta < (KTIME_MAX - basemono))
-		expires = basemono + delta;
-	else
-		expires = KTIME_MAX;
-
-	expires = min_t(u64, expires, next_tick);
-	tick = expires;
-
 	/* Skip reprogram of event if its not changed */
 	if (ts->tick_stopped && (expires == ts->next_tick)) {
 		/* Sanity check: make sure clockevent is actually programmed */
 		if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
-			goto out;
+			return;
 
 		WARN_ON_ONCE(1);
 		printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n",
@@ -786,23 +792,31 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	if (unlikely(expires == KTIME_MAX)) {
 		if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
 			hrtimer_cancel(&ts->sched_timer);
-		goto out;
+		return;
 	}
 
-	hrtimer_set_expires(&ts->sched_timer, tick);
+	if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
+		hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED);
+	} else {
+		hrtimer_set_expires(&ts->sched_timer, tick);
+		tick_program_event(tick, 1);
+	}
+}
 
-	if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
-		hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
+static void tick_nohz_retain_tick(struct tick_sched *ts)
+{
+	ts->timer_expires_base = 0;
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu)
+{
+	if (tick_nohz_next_event(ts, cpu))
+		tick_nohz_stop_tick(ts, cpu);
 	else
-		tick_program_event(tick, 1);
-out:
-	/*
-	 * Update the estimated sleep length until the next timer
-	 * (not only the tick).
-	 */
-	ts->sleep_length = ktime_sub(dev->next_event, now);
-	return tick;
+		tick_nohz_retain_tick(ts);
 }
+#endif /* CONFIG_NO_HZ_FULL */
 
 static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
 {
@@ -839,7 +853,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
 		return;
 
 	if (can_stop_full_tick(cpu, ts))
-		tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
+		tick_nohz_stop_sched_tick(ts, cpu);
 	else if (ts->tick_stopped)
 		tick_nohz_restart_sched_tick(ts, ktime_get());
 #endif
@@ -865,10 +879,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 		return false;
 	}
 
-	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
-		ts->sleep_length = NSEC_PER_SEC / HZ;
+	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
 		return false;
-	}
 
 	if (need_resched())
 		return false;
@@ -903,42 +915,65 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 	return true;
 }
 
-static void __tick_nohz_idle_enter(struct tick_sched *ts)
+static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
 {
-	ktime_t now, expires;
+	ktime_t expires;
 	int cpu = smp_processor_id();
 
-	now = tick_nohz_start_idle(ts);
+	/*
+	 * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
+	 * tick timer expiration time is known already.
+	 */
+	if (ts->timer_expires_base)
+		expires = ts->timer_expires;
+	else if (can_stop_idle_tick(cpu, ts))
+		expires = tick_nohz_next_event(ts, cpu);
+	else
+		return;
+
+	ts->idle_calls++;
 
-	if (can_stop_idle_tick(cpu, ts)) {
+	if (expires > 0LL) {
 		int was_stopped = ts->tick_stopped;
 
-		ts->idle_calls++;
+		tick_nohz_stop_tick(ts, cpu);
 
-		expires = tick_nohz_stop_sched_tick(ts, now, cpu);
-		if (expires > 0LL) {
-			ts->idle_sleeps++;
-			ts->idle_expires = expires;
-		}
+		ts->idle_sleeps++;
+		ts->idle_expires = expires;
 
 		if (!was_stopped && ts->tick_stopped) {
 			ts->idle_jiffies = ts->last_jiffies;
 			nohz_balance_enter_idle(cpu);
 		}
+	} else {
+		tick_nohz_retain_tick(ts);
 	}
 }
 
 /**
- * tick_nohz_idle_enter - stop the idle tick from the idle task
+ * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
  *
  * When the next event is more than a tick into the future, stop the idle tick
- * Called when we start the idle loop.
- *
- * The arch is responsible of calling:
+ */
+void tick_nohz_idle_stop_tick(void)
+{
+	__tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched));
+}
+
+void tick_nohz_idle_retain_tick(void)
+{
+	tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
+	/*
+	 * Undo the effect of get_next_timer_interrupt() called from
+	 * tick_nohz_next_event().
+	 */
+	timer_clear_idle();
+}
+
+/**
+ * tick_nohz_idle_enter - prepare for entering idle on the current CPU
  *
- * - rcu_idle_enter() after its last use of RCU before the CPU is put
- *  to sleep.
- * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
+ * Called when we start the idle loop.
  */
 void tick_nohz_idle_enter(void)
 {
@@ -949,8 +984,11 @@ void tick_nohz_idle_enter(void)
 	local_irq_disable();
 
 	ts = this_cpu_ptr(&tick_cpu_sched);
+
+	WARN_ON_ONCE(ts->timer_expires_base);
+
 	ts->inidle = 1;
-	__tick_nohz_idle_enter(ts);
+	tick_nohz_start_idle(ts);
 
 	local_irq_enable();
 }
@@ -968,21 +1006,62 @@ void tick_nohz_irq_exit(void)
 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
 
 	if (ts->inidle)
-		__tick_nohz_idle_enter(ts);
+		tick_nohz_start_idle(ts);
 	else
 		tick_nohz_full_update_tick(ts);
 }
 
 /**
- * tick_nohz_get_sleep_length - return the length of the current sleep
+ * tick_nohz_idle_got_tick - Check whether or not the tick handler has run
+ */
+bool tick_nohz_idle_got_tick(void)
+{
+	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+	if (ts->got_idle_tick) {
+		ts->got_idle_tick = 0;
+		return true;
+	}
+	return false;
+}
+
+/**
+ * tick_nohz_get_sleep_length - return the expected length of the current sleep
+ * @delta_next: duration until the next event if the tick cannot be stopped
  *
  * Called from power state control code with interrupts disabled
  */
-ktime_t tick_nohz_get_sleep_length(void)
+ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
 {
+	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+	int cpu = smp_processor_id();
+	/*
+	 * The idle entry time is expected to be a sufficient approximation of
+	 * the current time at this point.
+	 */
+	ktime_t now = ts->idle_entrytime;
+	ktime_t next_event;
+
+	WARN_ON_ONCE(!ts->inidle);
+
+	*delta_next = ktime_sub(dev->next_event, now);
+
+	if (!can_stop_idle_tick(cpu, ts))
+		return *delta_next;
+
+	next_event = tick_nohz_next_event(ts, cpu);
+	if (!next_event)
+		return *delta_next;
 
-	return ts->sleep_length;
+	/*
+	 * If the next highres timer to expire is earlier than next_event, the
+	 * idle governor needs to know that.
+	 */
+	next_event = min_t(u64, next_event,
+			   hrtimer_next_event_without(&ts->sched_timer));
+
+	return ktime_sub(next_event, now);
 }
 
 /**
@@ -1031,6 +1110,20 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
 #endif
 }
 
+static void __tick_nohz_idle_restart_tick(struct tick_sched *ts, ktime_t now)
+{
+	tick_nohz_restart_sched_tick(ts, now);
+	tick_nohz_account_idle_ticks(ts);
+}
+
+void tick_nohz_idle_restart_tick(void)
+{
+	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+	if (ts->tick_stopped)
+		__tick_nohz_idle_restart_tick(ts, ktime_get());
+}
+
 /**
  * tick_nohz_idle_exit - restart the idle tick from the idle task
  *
@@ -1041,24 +1134,26 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
 void tick_nohz_idle_exit(void)
 {
 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+	bool idle_active, tick_stopped;
 	ktime_t now;
 
 	local_irq_disable();
 
 	WARN_ON_ONCE(!ts->inidle);
+	WARN_ON_ONCE(ts->timer_expires_base);
 
 	ts->inidle = 0;
+	idle_active = ts->idle_active;
+	tick_stopped = ts->tick_stopped;
 
-	if (ts->idle_active || ts->tick_stopped)
+	if (idle_active || tick_stopped)
 		now = ktime_get();
 
-	if (ts->idle_active)
+	if (idle_active)
 		tick_nohz_stop_idle(ts, now);
 
-	if (ts->tick_stopped) {
-		tick_nohz_restart_sched_tick(ts, now);
-		tick_nohz_account_idle_ticks(ts);
-	}
+	if (tick_stopped)
+		__tick_nohz_idle_restart_tick(ts, now);
 
 	local_irq_enable();
 }
@@ -1074,7 +1169,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
 
 	dev->next_event = KTIME_MAX;
 
-	tick_sched_do_timer(now);
+	tick_sched_do_timer(ts, now);
 	tick_sched_handle(ts, regs);
 
 	/* No need to reprogram if we are running tickless  */
@@ -1169,7 +1264,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
 	struct pt_regs *regs = get_irq_regs();
 	ktime_t now = ktime_get();
 
-	tick_sched_do_timer(now);
+	tick_sched_do_timer(ts, now);
 
 	/*
 	 * Do not call, when we are not in irq context and have
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 954b43dbf21c..6de959a854b2 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -38,31 +38,37 @@ enum tick_nohz_mode {
  * @idle_exittime:	Time when the idle state was left
  * @idle_sleeptime:	Sum of the time slept in idle with sched tick stopped
  * @iowait_sleeptime:	Sum of the time slept in idle with sched tick stopped, with IO outstanding
- * @sleep_length:	Duration of the current idle sleep
+ * @timer_expires:	Anticipated timer expiration time (in case sched tick is stopped)
+ * @timer_expires_base:	Base time clock monotonic for @timer_expires
  * @do_timer_lst:	CPU was the last one doing do_timer before going idle
+ * @got_idle_tick:	Tick timer function has run with @inidle set
  */
 struct tick_sched {
 	struct hrtimer			sched_timer;
 	unsigned long			check_clocks;
 	enum tick_nohz_mode		nohz_mode;
+
+	unsigned int			inidle		: 1;
+	unsigned int			tick_stopped	: 1;
+	unsigned int			idle_active	: 1;
+	unsigned int			do_timer_last	: 1;
+	unsigned int			got_idle_tick	: 1;
+
 	ktime_t				last_tick;
 	ktime_t				next_tick;
-	int				inidle;
-	int				tick_stopped;
 	unsigned long			idle_jiffies;
 	unsigned long			idle_calls;
 	unsigned long			idle_sleeps;
-	int				idle_active;
 	ktime_t				idle_entrytime;
 	ktime_t				idle_waketime;
 	ktime_t				idle_exittime;
 	ktime_t				idle_sleeptime;
 	ktime_t				iowait_sleeptime;
-	ktime_t				sleep_length;
 	unsigned long			last_jiffies;
+	u64				timer_expires;
+	u64				timer_expires_base;
 	u64				next_timer;
 	ktime_t				idle_expires;
-	int				do_timer_last;
 	atomic_t			tick_dep_mask;
 };
 
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 3044d48ebe56..6fa99213fc72 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -407,7 +407,6 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0,
 }
 EXPORT_SYMBOL(mktime64);
 
-#if __BITS_PER_LONG == 32
 /**
  * set_normalized_timespec - set timespec sec and nsec parts and normalize
  *
@@ -468,7 +467,6 @@ struct timespec ns_to_timespec(const s64 nsec)
 	return ts;
 }
 EXPORT_SYMBOL(ns_to_timespec);
-#endif
 
 /**
  * ns_to_timeval - Convert nanoseconds to timeval
@@ -853,9 +851,9 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
 }
 
 int get_timespec64(struct timespec64 *ts,
-		   const struct timespec __user *uts)
+		   const struct __kernel_timespec __user *uts)
 {
-	struct timespec kts;
+	struct __kernel_timespec kts;
 	int ret;
 
 	ret = copy_from_user(&kts, uts, sizeof(kts));
@@ -863,6 +861,11 @@ int get_timespec64(struct timespec64 *ts,
 		return -EFAULT;
 
 	ts->tv_sec = kts.tv_sec;
+
+	/* Zero out the padding for 32 bit systems or in compat mode */
+	if (IS_ENABLED(CONFIG_64BIT_TIME) && (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()))
+		kts.tv_nsec &= 0xFFFFFFFFUL;
+
 	ts->tv_nsec = kts.tv_nsec;
 
 	return 0;
@@ -870,16 +873,61 @@ int get_timespec64(struct timespec64 *ts,
 EXPORT_SYMBOL_GPL(get_timespec64);
 
 int put_timespec64(const struct timespec64 *ts,
-		   struct timespec __user *uts)
+		   struct __kernel_timespec __user *uts)
 {
-	struct timespec kts = {
+	struct __kernel_timespec kts = {
 		.tv_sec = ts->tv_sec,
 		.tv_nsec = ts->tv_nsec
 	};
+
 	return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0;
 }
 EXPORT_SYMBOL_GPL(put_timespec64);
 
+int __compat_get_timespec64(struct timespec64 *ts64,
+				   const struct compat_timespec __user *cts)
+{
+	struct compat_timespec ts;
+	int ret;
+
+	ret = copy_from_user(&ts, cts, sizeof(ts));
+	if (ret)
+		return -EFAULT;
+
+	ts64->tv_sec = ts.tv_sec;
+	ts64->tv_nsec = ts.tv_nsec;
+
+	return 0;
+}
+
+int __compat_put_timespec64(const struct timespec64 *ts64,
+				   struct compat_timespec __user *cts)
+{
+	struct compat_timespec ts = {
+		.tv_sec = ts64->tv_sec,
+		.tv_nsec = ts64->tv_nsec
+	};
+	return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0;
+}
+
+int compat_get_timespec64(struct timespec64 *ts, const void __user *uts)
+{
+	if (COMPAT_USE_64BIT_TIME)
+		return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
+	else
+		return __compat_get_timespec64(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_get_timespec64);
+
+int compat_put_timespec64(const struct timespec64 *ts, void __user *uts)
+{
+	if (COMPAT_USE_64BIT_TIME)
+		return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
+	else
+		return __compat_put_timespec64(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_put_timespec64);
+
 int get_itimerspec64(struct itimerspec64 *it,
 			const struct itimerspec __user *uit)
 {
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index ca90219a1e73..4786df904c22 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -138,12 +138,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
 
 static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
 {
-	/* Update both bases so mono and raw stay coupled. */
-	tk->tkr_mono.base += delta;
-	tk->tkr_raw.base += delta;
-
-	/* Accumulate time spent in suspend */
-	tk->time_suspended += delta;
+	tk->offs_boot = ktime_add(tk->offs_boot, delta);
 }
 
 /*
@@ -473,6 +468,36 @@ u64 ktime_get_raw_fast_ns(void)
 }
 EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
 
+/**
+ * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock.
+ *
+ * To keep it NMI safe since we're accessing from tracing, we're not using a
+ * separate timekeeper with updates to monotonic clock and boot offset
+ * protected with seqlocks. This has the following minor side effects:
+ *
+ * (1) Its possible that a timestamp be taken after the boot offset is updated
+ * but before the timekeeper is updated. If this happens, the new boot offset
+ * is added to the old timekeeping making the clock appear to update slightly
+ * earlier:
+ *    CPU 0                                        CPU 1
+ *    timekeeping_inject_sleeptime64()
+ *    __timekeeping_inject_sleeptime(tk, delta);
+ *                                                 timestamp();
+ *    timekeeping_update(tk, TK_CLEAR_NTP...);
+ *
+ * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be
+ * partially updated.  Since the tk->offs_boot update is a rare event, this
+ * should be a rare occurrence which postprocessing should be able to handle.
+ */
+u64 notrace ktime_get_boot_fast_ns(void)
+{
+	struct timekeeper *tk = &tk_core.timekeeper;
+
+	return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot));
+}
+EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
+
+
 /*
  * See comment for __ktime_get_fast_ns() vs. timestamp ordering
  */
@@ -680,18 +705,19 @@ static void timekeeping_forward_now(struct timekeeper *tk)
 }
 
 /**
- * __getnstimeofday64 - Returns the time of day in a timespec64.
+ * ktime_get_real_ts64 - Returns the time of day in a timespec64.
  * @ts:		pointer to the timespec to be set
  *
- * Updates the time of day in the timespec.
- * Returns 0 on success, or -ve when suspended (timespec will be undefined).
+ * Returns the time of day in a timespec64 (WARN if suspended).
  */
-int __getnstimeofday64(struct timespec64 *ts)
+void ktime_get_real_ts64(struct timespec64 *ts)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	unsigned long seq;
 	u64 nsecs;
 
+	WARN_ON(timekeeping_suspended);
+
 	do {
 		seq = read_seqcount_begin(&tk_core.seq);
 
@@ -702,28 +728,8 @@ int __getnstimeofday64(struct timespec64 *ts)
 
 	ts->tv_nsec = 0;
 	timespec64_add_ns(ts, nsecs);
-
-	/*
-	 * Do not bail out early, in case there were callers still using
-	 * the value, even in the face of the WARN_ON.
-	 */
-	if (unlikely(timekeeping_suspended))
-		return -EAGAIN;
-	return 0;
 }
-EXPORT_SYMBOL(__getnstimeofday64);
-
-/**
- * getnstimeofday64 - Returns the time of day in a timespec64.
- * @ts:		pointer to the timespec64 to be set
- *
- * Returns the time of day in a timespec64 (WARN if suspended).
- */
-void getnstimeofday64(struct timespec64 *ts)
-{
-	WARN_ON(__getnstimeofday64(ts));
-}
-EXPORT_SYMBOL(getnstimeofday64);
+EXPORT_SYMBOL(ktime_get_real_ts64);
 
 ktime_t ktime_get(void)
 {
@@ -764,6 +770,7 @@ EXPORT_SYMBOL_GPL(ktime_get_resolution_ns);
 
 static ktime_t *offsets[TK_OFFS_MAX] = {
 	[TK_OFFS_REAL]	= &tk_core.timekeeper.offs_real,
+	[TK_OFFS_BOOT]	= &tk_core.timekeeper.offs_boot,
 	[TK_OFFS_TAI]	= &tk_core.timekeeper.offs_tai,
 };
 
@@ -788,6 +795,25 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs)
 }
 EXPORT_SYMBOL_GPL(ktime_get_with_offset);
 
+ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
+{
+	struct timekeeper *tk = &tk_core.timekeeper;
+	unsigned int seq;
+	ktime_t base, *offset = offsets[offs];
+
+	WARN_ON(timekeeping_suspended);
+
+	do {
+		seq = read_seqcount_begin(&tk_core.seq);
+		base = ktime_add(tk->tkr_mono.base, *offset);
+
+	} while (read_seqcount_retry(&tk_core.seq, seq));
+
+	return base;
+
+}
+EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
+
 /**
  * ktime_mono_to_any() - convert mononotic time to any other time
  * @tmono:	time to convert.
@@ -861,39 +887,6 @@ void ktime_get_ts64(struct timespec64 *ts)
 EXPORT_SYMBOL_GPL(ktime_get_ts64);
 
 /**
- * ktime_get_active_ts64 - Get the active non-suspended monotonic clock
- * @ts:		pointer to timespec variable
- *
- * The function calculates the monotonic clock from the realtime clock and
- * the wall_to_monotonic offset, subtracts the accumulated suspend time and
- * stores the result in normalized timespec64 format in the variable
- * pointed to by @ts.
- */
-void ktime_get_active_ts64(struct timespec64 *ts)
-{
-	struct timekeeper *tk = &tk_core.timekeeper;
-	struct timespec64 tomono, tsusp;
-	u64 nsec, nssusp;
-	unsigned int seq;
-
-	WARN_ON(timekeeping_suspended);
-
-	do {
-		seq = read_seqcount_begin(&tk_core.seq);
-		ts->tv_sec = tk->xtime_sec;
-		nsec = timekeeping_get_ns(&tk->tkr_mono);
-		tomono = tk->wall_to_monotonic;
-		nssusp = tk->time_suspended;
-	} while (read_seqcount_retry(&tk_core.seq, seq));
-
-	ts->tv_sec += tomono.tv_sec;
-	ts->tv_nsec = 0;
-	timespec64_add_ns(ts, nsec + tomono.tv_nsec);
-	tsusp = ns_to_timespec64(nssusp);
-	*ts = timespec64_sub(*ts, tsusp);
-}
-
-/**
  * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC
  *
  * Returns the seconds portion of CLOCK_MONOTONIC with a single non
@@ -1417,12 +1410,12 @@ int timekeeping_notify(struct clocksource *clock)
 }
 
 /**
- * getrawmonotonic64 - Returns the raw monotonic time in a timespec
+ * ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec
  * @ts:		pointer to the timespec64 to be set
  *
  * Returns the raw monotonic time (completely un-modified by ntp)
  */
-void getrawmonotonic64(struct timespec64 *ts)
+void ktime_get_raw_ts64(struct timespec64 *ts)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	unsigned long seq;
@@ -1438,7 +1431,7 @@ void getrawmonotonic64(struct timespec64 *ts)
 	ts->tv_nsec = 0;
 	timespec64_add_ns(ts, nsecs);
 }
-EXPORT_SYMBOL(getrawmonotonic64);
+EXPORT_SYMBOL(ktime_get_raw_ts64);
 
 
 /**
@@ -1593,6 +1586,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
 		return;
 	}
 	tk_xtime_add(tk, delta);
+	tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta));
 	tk_update_sleep_time(tk, timespec64_to_ktime(*delta));
 	tk_debug_account_sleep_time(delta);
 }
@@ -2125,7 +2119,7 @@ out:
 void getboottime64(struct timespec64 *ts)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
-	ktime_t t = ktime_sub(tk->offs_real, tk->time_suspended);
+	ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot);
 
 	*ts = ktime_to_timespec64(t);
 }
@@ -2139,30 +2133,20 @@ unsigned long get_seconds(void)
 }
 EXPORT_SYMBOL(get_seconds);
 
-struct timespec __current_kernel_time(void)
+void ktime_get_coarse_real_ts64(struct timespec64 *ts)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
-
-	return timespec64_to_timespec(tk_xtime(tk));
-}
-
-struct timespec64 current_kernel_time64(void)
-{
-	struct timekeeper *tk = &tk_core.timekeeper;
-	struct timespec64 now;
 	unsigned long seq;
 
 	do {
 		seq = read_seqcount_begin(&tk_core.seq);
 
-		now = tk_xtime(tk);
+		*ts = tk_xtime(tk);
 	} while (read_seqcount_retry(&tk_core.seq, seq));
-
-	return now;
 }
-EXPORT_SYMBOL(current_kernel_time64);
+EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
 
-struct timespec64 get_monotonic_coarse64(void)
+void ktime_get_coarse_ts64(struct timespec64 *ts)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	struct timespec64 now, mono;
@@ -2175,12 +2159,10 @@ struct timespec64 get_monotonic_coarse64(void)
 		mono = tk->wall_to_monotonic;
 	} while (read_seqcount_retry(&tk_core.seq, seq));
 
-	set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec,
+	set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec,
 				now.tv_nsec + mono.tv_nsec);
-
-	return now;
 }
-EXPORT_SYMBOL(get_monotonic_coarse64);
+EXPORT_SYMBOL(ktime_get_coarse_ts64);
 
 /*
  * Must hold jiffies_lock
@@ -2195,6 +2177,7 @@ void do_timer(unsigned long ticks)
  * ktime_get_update_offsets_now - hrtimer helper
  * @cwsseq:	pointer to check and store the clock was set sequence number
  * @offs_real:	pointer to storage for monotonic -> realtime offset
+ * @offs_boot:	pointer to storage for monotonic -> boottime offset
  * @offs_tai:	pointer to storage for monotonic -> clock tai offset
  *
  * Returns current monotonic time and updates the offsets if the
@@ -2204,7 +2187,7 @@ void do_timer(unsigned long ticks)
  * Called from hrtimer_interrupt() or retrigger_next_event()
  */
 ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
-				     ktime_t *offs_tai)
+				     ktime_t *offs_boot, ktime_t *offs_tai)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	unsigned int seq;
@@ -2221,6 +2204,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
 		if (*cwsseq != tk->clock_was_set_seq) {
 			*cwsseq = tk->clock_was_set_seq;
 			*offs_real = tk->offs_real;
+			*offs_boot = tk->offs_boot;
 			*offs_tai = tk->offs_tai;
 		}
 
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 79b67f5e0343..7a9b4eb7a1d5 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -6,6 +6,7 @@
  */
 extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq,
 					    ktime_t *offs_real,
+					    ktime_t *offs_boot,
 					    ktime_t *offs_tai);
 
 extern int timekeeping_valid_for_hres(void);
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index fdbeeb02dde9..cf5c0828ee31 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -31,6 +31,4 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask)
 }
 #endif
 
-extern time64_t __ktime_get_real_seconds(void);
-
 #endif /* _TIMEKEEPING_INTERNAL_H */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 4a4fd567fb26..cc2d23e6ff61 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1251,18 +1251,18 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
  *
  * Note: For !irqsafe timers, you must not hold locks that are held in
  *   interrupt context while calling this function. Even if the lock has
- *   nothing to do with the timer in question.  Here's why:
+ *   nothing to do with the timer in question.  Here's why::
  *
  *    CPU0                             CPU1
  *    ----                             ----
- *                                   <SOFTIRQ>
- *                                   call_timer_fn();
- *                                     base->running_timer = mytimer;
- *  spin_lock_irq(somelock);
+ *                                     <SOFTIRQ>
+ *                                       call_timer_fn();
+ *                                       base->running_timer = mytimer;
+ *    spin_lock_irq(somelock);
  *                                     <IRQ>
  *                                        spin_lock(somelock);
- *  del_timer_sync(mytimer);
- *   while (base->running_timer == mytimer);
+ *    del_timer_sync(mytimer);
+ *    while (base->running_timer == mytimer);
  *
  * Now del_timer_sync() will never return and never release somelock.
  * The interrupt on the other CPU is waiting to grab somelock but
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 0ed768b56c60..d647dabdac97 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -28,8 +28,6 @@ struct timer_list_iter {
 	u64 now;
 };
 
-typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
-
 /*
  * This allows printing both to /proc/timer_list and
  * to the console (on SysRq-Q):
@@ -372,24 +370,12 @@ static const struct seq_operations timer_list_sops = {
 	.show = timer_list_show,
 };
 
-static int timer_list_open(struct inode *inode, struct file *filp)
-{
-	return seq_open_private(filp, &timer_list_sops,
-			sizeof(struct timer_list_iter));
-}
-
-static const struct file_operations timer_list_fops = {
-	.open		= timer_list_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release_private,
-};
-
 static int __init init_timer_list_procfs(void)
 {
 	struct proc_dir_entry *pe;
 
-	pe = proc_create("timer_list", 0400, NULL, &timer_list_fops);
+	pe = proc_create_seq_private("timer_list", 0400, NULL, &timer_list_sops,
+			sizeof(struct timer_list_iter), NULL);
 	if (!pe)
 		return -ENOMEM;
 	return 0;
diff --git a/kernel/torture.c b/kernel/torture.c
index 37b94012a3f8..3de1efbecd6a 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -574,7 +574,7 @@ void stutter_wait(const char *title)
 {
 	int spt;
 
-	cond_resched_rcu_qs();
+	cond_resched_tasks_rcu_qs();
 	spt = READ_ONCE(stutter_pause_test);
 	for (; spt; spt = READ_ONCE(stutter_pause_test)) {
 		if (spt == 1) {
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 0b249e2f0c3c..c4f0f2e4126e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -606,7 +606,10 @@ config HIST_TRIGGERS
 	  event activity as an initial guide for further investigation
 	  using more advanced tools.
 
-	  See Documentation/trace/events.txt.
+	  Inter-event tracing of quantities such as latencies is also
+	  supported using hist triggers under this option.
+
+	  See Documentation/trace/histogram.txt.
 	  If in doubt, say N.
 
 config MMIOTRACE_TEST
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d88e96d4e12c..0ae6829804bc 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -14,12 +14,14 @@
 #include <linux/uaccess.h>
 #include <linux/ctype.h>
 #include <linux/kprobes.h>
+#include <linux/syscalls.h>
 #include <linux/error-injection.h>
 
 #include "trace_probe.h"
 #include "trace.h"
 
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 /**
  * trace_call_bpf - invoke BPF program
@@ -474,8 +476,6 @@ BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	struct cgroup *cgrp;
 
-	if (unlikely(in_interrupt()))
-		return -EINVAL;
 	if (unlikely(idx >= array->map.max_entries))
 		return -E2BIG;
 
@@ -564,6 +564,10 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_prandom_u32_proto;
 	case BPF_FUNC_probe_read_str:
 		return &bpf_probe_read_str_proto;
+#ifdef CONFIG_CGROUPS
+	case BPF_FUNC_get_current_cgroup_id:
+		return &bpf_get_current_cgroup_id_proto;
+#endif
 	default:
 		return NULL;
 	}
@@ -577,6 +581,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto;
 	case BPF_FUNC_perf_event_read_value:
 		return &bpf_perf_event_read_value_proto;
 #ifdef CONFIG_BPF_KPROBE_OVERRIDE
@@ -664,6 +670,25 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size,
+	   u64, flags)
+{
+	struct pt_regs *regs = *(struct pt_regs **)tp_buff;
+
+	return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+			     (unsigned long) size, flags, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stack_proto_tp = {
+	.func		= bpf_get_stack_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -672,6 +697,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_tp;
 	default:
 		return tracing_func_proto(func_id, prog);
 	}
@@ -734,6 +761,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_tp;
 	case BPF_FUNC_perf_prog_read_value:
 		return &bpf_perf_prog_read_value_proto;
 	default:
@@ -744,7 +773,7 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 /*
  * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
  * to avoid potential recursive reuse issue when/if tracepoints are added
- * inside bpf_*_event_output and/or bpf_get_stack_id
+ * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
  */
 static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
 BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
@@ -787,6 +816,26 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
+	   void *, buf, u32, size, u64, flags)
+{
+	struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+
+	perf_fetch_caller_regs(regs);
+	return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+			     (unsigned long) size, flags, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
+	.func		= bpf_get_stack_raw_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -795,6 +844,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_raw_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_raw_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_raw_tp;
 	default:
 		return tracing_func_proto(func_id, prog);
 	}
@@ -833,8 +884,14 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type
 		return false;
 	if (type != BPF_READ)
 		return false;
-	if (off % size != 0)
-		return false;
+	if (off % size != 0) {
+		if (sizeof(unsigned long) != 4)
+			return false;
+		if (size != 8)
+			return false;
+		if (off % size != 4)
+			return false;
+	}
 
 	switch (off) {
 	case bpf_ctx_range(struct bpf_perf_event_data, sample_period):
@@ -959,6 +1016,8 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
 
 	old_array = event->tp_event->prog_array;
 	ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
+	if (ret == -ENOENT)
+		goto unlock;
 	if (ret < 0) {
 		bpf_prog_array_delete_safe(old_array, event->prog);
 	} else {
@@ -977,6 +1036,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
 {
 	struct perf_event_query_bpf __user *uquery = info;
 	struct perf_event_query_bpf query = {};
+	u32 *ids, prog_cnt, ids_len;
 	int ret;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -985,16 +1045,32 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
 		return -EINVAL;
 	if (copy_from_user(&query, uquery, sizeof(query)))
 		return -EFAULT;
-	if (query.ids_len > BPF_TRACE_MAX_PROGS)
+
+	ids_len = query.ids_len;
+	if (ids_len > BPF_TRACE_MAX_PROGS)
 		return -E2BIG;
+	ids = kcalloc(ids_len, sizeof(u32), GFP_USER | __GFP_NOWARN);
+	if (!ids)
+		return -ENOMEM;
+	/*
+	 * The above kcalloc returns ZERO_SIZE_PTR when ids_len = 0, which
+	 * is required when user only wants to check for uquery->prog_cnt.
+	 * There is no need to check for it since the case is handled
+	 * gracefully in bpf_prog_array_copy_info.
+	 */
 
 	mutex_lock(&bpf_event_mutex);
 	ret = bpf_prog_array_copy_info(event->tp_event->prog_array,
-				       uquery->ids,
-				       query.ids_len,
-				       &uquery->prog_cnt);
+				       ids,
+				       ids_len,
+				       &prog_cnt);
 	mutex_unlock(&bpf_event_mutex);
 
+	if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) ||
+	    copy_to_user(uquery->ids, ids, ids_len * sizeof(u32)))
+		ret = -EFAULT;
+
+	kfree(ids);
 	return ret;
 }
 
@@ -1100,3 +1176,50 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
 	mutex_unlock(&bpf_event_mutex);
 	return err;
 }
+
+int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
+			    u32 *fd_type, const char **buf,
+			    u64 *probe_offset, u64 *probe_addr)
+{
+	bool is_tracepoint, is_syscall_tp;
+	struct bpf_prog *prog;
+	int flags, err = 0;
+
+	prog = event->prog;
+	if (!prog)
+		return -ENOENT;
+
+	/* not supporting BPF_PROG_TYPE_PERF_EVENT yet */
+	if (prog->type == BPF_PROG_TYPE_PERF_EVENT)
+		return -EOPNOTSUPP;
+
+	*prog_id = prog->aux->id;
+	flags = event->tp_event->flags;
+	is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
+	is_syscall_tp = is_syscall_trace_event(event->tp_event);
+
+	if (is_tracepoint || is_syscall_tp) {
+		*buf = is_tracepoint ? event->tp_event->tp->name
+				     : event->tp_event->name;
+		*fd_type = BPF_FD_TYPE_TRACEPOINT;
+		*probe_offset = 0x0;
+		*probe_addr = 0x0;
+	} else {
+		/* kprobe/uprobe */
+		err = -EOPNOTSUPP;
+#ifdef CONFIG_KPROBE_EVENTS
+		if (flags & TRACE_EVENT_FL_KPROBE)
+			err = bpf_get_kprobe_info(event, fd_type, buf,
+						  probe_offset, probe_addr,
+						  event->attr.type == PERF_TYPE_TRACEPOINT);
+#endif
+#ifdef CONFIG_UPROBE_EVENTS
+		if (flags & TRACE_EVENT_FL_UPROBE)
+			err = bpf_get_uprobe_info(event, fd_type, buf,
+						  probe_offset,
+						  event->attr.type == PERF_TYPE_TRACEPOINT);
+#endif
+	}
+
+	return err;
+}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index eac9ce2c57a2..8d83bcf9ef69 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3902,14 +3902,13 @@ static bool module_exists(const char *module)
 {
 	/* All modules have the symbol __this_module */
 	const char this_mod[] = "__this_module";
-	const int modname_size = MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 1;
-	char modname[modname_size + 1];
+	char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2];
 	unsigned long val;
 	int n;
 
-	n = snprintf(modname, modname_size + 1, "%s:%s", module, this_mod);
+	n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod);
 
-	if (n > modname_size)
+	if (n > sizeof(modname) - 1)
 		return false;
 
 	val = module_kallsyms_lookup_name(modname);
@@ -5515,10 +5514,10 @@ static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
 	ftrace_create_filter_files(&global_ops, d_tracer);
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	trace_create_file("set_graph_function", 0444, d_tracer,
+	trace_create_file("set_graph_function", 0644, d_tracer,
 				    NULL,
 				    &ftrace_graph_fops);
-	trace_create_file("set_graph_notrace", 0444, d_tracer,
+	trace_create_file("set_graph_notrace", 0644, d_tracer,
 				    NULL,
 				    &ftrace_graph_notrace_fops);
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index dcf1c4dd3efe..6a46af21765c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -22,6 +22,7 @@
 #include <linux/hash.h>
 #include <linux/list.h>
 #include <linux/cpu.h>
+#include <linux/oom.h>
 
 #include <asm/local.h>
 
@@ -41,6 +42,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
 			 RINGBUF_TYPE_PADDING);
 	trace_seq_printf(s, "\ttime_extend : type == %d\n",
 			 RINGBUF_TYPE_TIME_EXTEND);
+	trace_seq_printf(s, "\ttime_stamp : type == %d\n",
+			 RINGBUF_TYPE_TIME_STAMP);
 	trace_seq_printf(s, "\tdata max type_len  == %d\n",
 			 RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
 
@@ -140,12 +143,15 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
 
 enum {
 	RB_LEN_TIME_EXTEND = 8,
-	RB_LEN_TIME_STAMP = 16,
+	RB_LEN_TIME_STAMP =  8,
 };
 
 #define skip_time_extend(event) \
 	((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
 
+#define extended_time(event) \
+	(event->type_len >= RINGBUF_TYPE_TIME_EXTEND)
+
 static inline int rb_null_event(struct ring_buffer_event *event)
 {
 	return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@ -209,7 +215,7 @@ rb_event_ts_length(struct ring_buffer_event *event)
 {
 	unsigned len = 0;
 
-	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+	if (extended_time(event)) {
 		/* time extends include the data event after it */
 		len = RB_LEN_TIME_EXTEND;
 		event = skip_time_extend(event);
@@ -231,7 +237,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
 {
 	unsigned length;
 
-	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+	if (extended_time(event))
 		event = skip_time_extend(event);
 
 	length = rb_event_length(event);
@@ -248,7 +254,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
 static __always_inline void *
 rb_event_data(struct ring_buffer_event *event)
 {
-	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+	if (extended_time(event))
 		event = skip_time_extend(event);
 	BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
 	/* If length is in len field, then array[0] has the data */
@@ -275,6 +281,27 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
 #define TS_MASK		((1ULL << TS_SHIFT) - 1)
 #define TS_DELTA_TEST	(~TS_MASK)
 
+/**
+ * ring_buffer_event_time_stamp - return the event's extended timestamp
+ * @event: the event to get the timestamp of
+ *
+ * Returns the extended timestamp associated with a data event.
+ * An extended time_stamp is a 64-bit timestamp represented
+ * internally in a special way that makes the best use of space
+ * contained within a ring buffer event.  This function decodes
+ * it and maps it to a straight u64 value.
+ */
+u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event)
+{
+	u64 ts;
+
+	ts = event->array[0];
+	ts <<= TS_SHIFT;
+	ts += event->time_delta;
+
+	return ts;
+}
+
 /* Flag when events were overwritten */
 #define RB_MISSED_EVENTS	(1 << 31)
 /* Missed count stored at end */
@@ -451,6 +478,7 @@ struct ring_buffer_per_cpu {
 	struct buffer_page		*reader_page;
 	unsigned long			lost_events;
 	unsigned long			last_overrun;
+	unsigned long			nest;
 	local_t				entries_bytes;
 	local_t				entries;
 	local_t				overrun;
@@ -488,6 +516,7 @@ struct ring_buffer {
 	u64				(*clock)(void);
 
 	struct rb_irq_work		irq_work;
+	bool				time_stamp_abs;
 };
 
 struct ring_buffer_iter {
@@ -780,7 +809,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
  *
  *  You can see, it is legitimate for the previous pointer of
  *  the head (or any page) not to point back to itself. But only
- *  temporarially.
+ *  temporarily.
  */
 
 #define RB_PAGE_NORMAL		0UL
@@ -877,7 +906,7 @@ static void rb_list_head_clear(struct list_head *list)
 }
 
 /*
- * rb_head_page_dactivate - clears head page ptr (for free list)
+ * rb_head_page_deactivate - clears head page ptr (for free list)
  */
 static void
 rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer)
@@ -1134,30 +1163,60 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
 static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
 {
 	struct buffer_page *bpage, *tmp;
+	bool user_thread = current->mm != NULL;
+	gfp_t mflags;
 	long i;
 
+	/*
+	 * Check if the available memory is there first.
+	 * Note, si_mem_available() only gives us a rough estimate of available
+	 * memory. It may not be accurate. But we don't care, we just want
+	 * to prevent doing any allocation when it is obvious that it is
+	 * not going to succeed.
+	 */
+	i = si_mem_available();
+	if (i < nr_pages)
+		return -ENOMEM;
+
+	/*
+	 * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
+	 * gracefully without invoking oom-killer and the system is not
+	 * destabilized.
+	 */
+	mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
+
+	/*
+	 * If a user thread allocates too much, and si_mem_available()
+	 * reports there's enough memory, even though there is not.
+	 * Make sure the OOM killer kills this thread. This can happen
+	 * even with RETRY_MAYFAIL because another task may be doing
+	 * an allocation after this task has taken all memory.
+	 * This is the task the OOM killer needs to take out during this
+	 * loop, even if it was triggered by an allocation somewhere else.
+	 */
+	if (user_thread)
+		set_current_oom_origin();
 	for (i = 0; i < nr_pages; i++) {
 		struct page *page;
-		/*
-		 * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
-		 * gracefully without invoking oom-killer and the system is not
-		 * destabilized.
-		 */
+
 		bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
-				    GFP_KERNEL | __GFP_RETRY_MAYFAIL,
-				    cpu_to_node(cpu));
+				    mflags, cpu_to_node(cpu));
 		if (!bpage)
 			goto free_pages;
 
 		list_add(&bpage->list, pages);
 
-		page = alloc_pages_node(cpu_to_node(cpu),
-					GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0);
+		page = alloc_pages_node(cpu_to_node(cpu), mflags, 0);
 		if (!page)
 			goto free_pages;
 		bpage->page = page_address(page);
 		rb_init_page(bpage->page);
+
+		if (user_thread && fatal_signal_pending(current))
+			goto free_pages;
 	}
+	if (user_thread)
+		clear_current_oom_origin();
 
 	return 0;
 
@@ -1166,6 +1225,8 @@ free_pages:
 		list_del_init(&bpage->list);
 		free_buffer_page(bpage);
 	}
+	if (user_thread)
+		clear_current_oom_origin();
 
 	return -ENOMEM;
 }
@@ -1382,6 +1443,16 @@ void ring_buffer_set_clock(struct ring_buffer *buffer,
 	buffer->clock = clock;
 }
 
+void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs)
+{
+	buffer->time_stamp_abs = abs;
+}
+
+bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer)
+{
+	return buffer->time_stamp_abs;
+}
+
 static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
 
 static inline unsigned long rb_page_entries(struct buffer_page *bpage)
@@ -1709,7 +1780,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
 
 		put_online_cpus();
 	} else {
-		/* Make sure this CPU has been intitialized */
+		/* Make sure this CPU has been initialized */
 		if (!cpumask_test_cpu(cpu_id, buffer->cpumask))
 			goto out;
 
@@ -2206,12 +2277,15 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 
 /* Slow path, do not inline */
 static noinline struct ring_buffer_event *
-rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
+rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
 {
-	event->type_len = RINGBUF_TYPE_TIME_EXTEND;
+	if (abs)
+		event->type_len = RINGBUF_TYPE_TIME_STAMP;
+	else
+		event->type_len = RINGBUF_TYPE_TIME_EXTEND;
 
-	/* Not the first event on the page? */
-	if (rb_event_index(event)) {
+	/* Not the first event on the page, or not delta? */
+	if (abs || rb_event_index(event)) {
 		event->time_delta = delta & TS_MASK;
 		event->array[0] = delta >> TS_SHIFT;
 	} else {
@@ -2251,10 +2325,12 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
 
 	/*
 	 * If we need to add a timestamp, then we
-	 * add it to the start of the resevered space.
+	 * add it to the start of the reserved space.
 	 */
 	if (unlikely(info->add_timestamp)) {
-		event = rb_add_time_stamp(event, delta);
+		bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
+
+		event = rb_add_time_stamp(event, info->delta, abs);
 		length -= RB_LEN_TIME_EXTEND;
 		delta = 0;
 	}
@@ -2442,7 +2518,7 @@ static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer
 
 static inline void rb_event_discard(struct ring_buffer_event *event)
 {
-	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+	if (extended_time(event))
 		event = skip_time_extend(event);
 
 	/* array[0] holds the actual length for the discarded event */
@@ -2486,10 +2562,11 @@ rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 			cpu_buffer->write_stamp =
 				cpu_buffer->commit_page->page->time_stamp;
 		else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
-			delta = event->array[0];
-			delta <<= TS_SHIFT;
-			delta += event->time_delta;
+			delta = ring_buffer_event_time_stamp(event);
 			cpu_buffer->write_stamp += delta;
+		} else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
+			delta = ring_buffer_event_time_stamp(event);
+			cpu_buffer->write_stamp = delta;
 		} else
 			cpu_buffer->write_stamp += event->time_delta;
 	}
@@ -2581,10 +2658,10 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
 		bit = pc & NMI_MASK ? RB_CTX_NMI :
 			pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
 
-	if (unlikely(val & (1 << bit)))
+	if (unlikely(val & (1 << (bit + cpu_buffer->nest))))
 		return 1;
 
-	val |= (1 << bit);
+	val |= (1 << (bit + cpu_buffer->nest));
 	cpu_buffer->current_context = val;
 
 	return 0;
@@ -2593,7 +2670,57 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
 static __always_inline void
 trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	cpu_buffer->current_context &= cpu_buffer->current_context - 1;
+	cpu_buffer->current_context &=
+		cpu_buffer->current_context - (1 << cpu_buffer->nest);
+}
+
+/* The recursive locking above uses 4 bits */
+#define NESTED_BITS 4
+
+/**
+ * ring_buffer_nest_start - Allow to trace while nested
+ * @buffer: The ring buffer to modify
+ *
+ * The ring buffer has a safety mechanism to prevent recursion.
+ * But there may be a case where a trace needs to be done while
+ * tracing something else. In this case, calling this function
+ * will allow this function to nest within a currently active
+ * ring_buffer_lock_reserve().
+ *
+ * Call this function before calling another ring_buffer_lock_reserve() and
+ * call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit().
+ */
+void ring_buffer_nest_start(struct ring_buffer *buffer)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	int cpu;
+
+	/* Enabled by ring_buffer_nest_end() */
+	preempt_disable_notrace();
+	cpu = raw_smp_processor_id();
+	cpu_buffer = buffer->buffers[cpu];
+	/* This is the shift value for the above recursive locking */
+	cpu_buffer->nest += NESTED_BITS;
+}
+
+/**
+ * ring_buffer_nest_end - Allow to trace while nested
+ * @buffer: The ring buffer to modify
+ *
+ * Must be called after ring_buffer_nest_start() and after the
+ * ring_buffer_unlock_commit().
+ */
+void ring_buffer_nest_end(struct ring_buffer *buffer)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	int cpu;
+
+	/* disabled by ring_buffer_nest_start() */
+	cpu = raw_smp_processor_id();
+	cpu_buffer = buffer->buffers[cpu];
+	/* This is the shift value for the above recursive locking */
+	cpu_buffer->nest -= NESTED_BITS;
+	preempt_enable_notrace();
 }
 
 /**
@@ -2637,7 +2764,8 @@ rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
 		  sched_clock_stable() ? "" :
 		  "If you just came from a suspend/resume,\n"
 		  "please switch to the trace global clock:\n"
-		  "  echo global > /sys/kernel/debug/tracing/trace_clock\n");
+		  "  echo global > /sys/kernel/debug/tracing/trace_clock\n"
+		  "or add trace_clock=global to the kernel command line\n");
 	info->add_timestamp = 1;
 }
 
@@ -2669,7 +2797,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	 * If this is the first commit on the page, then it has the same
 	 * timestamp as the page itself.
 	 */
-	if (!tail)
+	if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer))
 		info->delta = 0;
 
 	/* See if we shot pass the end of this buffer page */
@@ -2746,8 +2874,11 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 	/* make sure this diff is calculated here */
 	barrier();
 
-	/* Did the write stamp get updated already? */
-	if (likely(info.ts >= cpu_buffer->write_stamp)) {
+	if (ring_buffer_time_stamp_abs(buffer)) {
+		info.delta = info.ts;
+		rb_handle_timestamp(cpu_buffer, &info);
+	} else /* Did the write stamp get updated already? */
+		if (likely(info.ts >= cpu_buffer->write_stamp)) {
 		info.delta = diff;
 		if (unlikely(test_time_stamp(info.delta)))
 			rb_handle_timestamp(cpu_buffer, &info);
@@ -2776,7 +2907,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
  * @buffer: the ring buffer to reserve from
  * @length: the length of the data to reserve (excluding event header)
  *
- * Returns a reseverd event on the ring buffer to copy directly to.
+ * Returns a reserved event on the ring buffer to copy directly to.
  * The user of this interface will need to get the body to write into
  * and can use the ring_buffer_event_data() interface.
  *
@@ -2878,7 +3009,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
  * This function lets the user discard an event in the ring buffer
  * and then that event will not be read later.
  *
- * This function only works if it is called before the the item has been
+ * This function only works if it is called before the item has been
  * committed. It will try to free the event from the ring buffer
  * if another event has not been added behind it.
  *
@@ -3429,14 +3560,13 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 		return;
 
 	case RINGBUF_TYPE_TIME_EXTEND:
-		delta = event->array[0];
-		delta <<= TS_SHIFT;
-		delta += event->time_delta;
+		delta = ring_buffer_event_time_stamp(event);
 		cpu_buffer->read_stamp += delta;
 		return;
 
 	case RINGBUF_TYPE_TIME_STAMP:
-		/* FIXME: not implemented */
+		delta = ring_buffer_event_time_stamp(event);
+		cpu_buffer->read_stamp = delta;
 		return;
 
 	case RINGBUF_TYPE_DATA:
@@ -3460,14 +3590,13 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
 		return;
 
 	case RINGBUF_TYPE_TIME_EXTEND:
-		delta = event->array[0];
-		delta <<= TS_SHIFT;
-		delta += event->time_delta;
+		delta = ring_buffer_event_time_stamp(event);
 		iter->read_stamp += delta;
 		return;
 
 	case RINGBUF_TYPE_TIME_STAMP:
-		/* FIXME: not implemented */
+		delta = ring_buffer_event_time_stamp(event);
+		iter->read_stamp = delta;
 		return;
 
 	case RINGBUF_TYPE_DATA:
@@ -3691,6 +3820,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
 	struct buffer_page *reader;
 	int nr_loops = 0;
 
+	if (ts)
+		*ts = 0;
  again:
 	/*
 	 * We repeat when a time extend is encountered.
@@ -3727,12 +3858,17 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
 		goto again;
 
 	case RINGBUF_TYPE_TIME_STAMP:
-		/* FIXME: not implemented */
+		if (ts) {
+			*ts = ring_buffer_event_time_stamp(event);
+			ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
+							 cpu_buffer->cpu, ts);
+		}
+		/* Internal data, OK to advance */
 		rb_advance_reader(cpu_buffer);
 		goto again;
 
 	case RINGBUF_TYPE_DATA:
-		if (ts) {
+		if (ts && !(*ts)) {
 			*ts = cpu_buffer->read_stamp + event->time_delta;
 			ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
 							 cpu_buffer->cpu, ts);
@@ -3757,6 +3893,9 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 	struct ring_buffer_event *event;
 	int nr_loops = 0;
 
+	if (ts)
+		*ts = 0;
+
 	cpu_buffer = iter->cpu_buffer;
 	buffer = cpu_buffer->buffer;
 
@@ -3809,12 +3948,17 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 		goto again;
 
 	case RINGBUF_TYPE_TIME_STAMP:
-		/* FIXME: not implemented */
+		if (ts) {
+			*ts = ring_buffer_event_time_stamp(event);
+			ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
+							 cpu_buffer->cpu, ts);
+		}
+		/* Internal data, OK to advance */
 		rb_advance_iter(iter);
 		goto again;
 
 	case RINGBUF_TYPE_DATA:
-		if (ts) {
+		if (ts && !(*ts)) {
 			*ts = iter->read_stamp + event->time_delta;
 			ring_buffer_normalize_time_stamp(buffer,
 							 cpu_buffer->cpu, ts);
@@ -3983,7 +4127,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume);
  * through the buffer.  Memory is allocated, buffer recording
  * is disabled, and the iterator pointer is returned to the caller.
  *
- * Disabling buffer recordng prevents the reading from being
+ * Disabling buffer recording prevents the reading from being
  * corrupted. This is not a consuming read, so a producer is not
  * expected.
  *
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5071931eb943..108ce3e1dc13 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -41,6 +41,7 @@
 #include <linux/nmi.h>
 #include <linux/fs.h>
 #include <linux/trace.h>
+#include <linux/sched/clock.h>
 #include <linux/sched/rt.h>
 
 #include "trace.h"
@@ -892,7 +893,7 @@ int __trace_bputs(unsigned long ip, const char *str)
 EXPORT_SYMBOL_GPL(__trace_bputs);
 
 #ifdef CONFIG_TRACER_SNAPSHOT
-static void tracing_snapshot_instance(struct trace_array *tr)
+void tracing_snapshot_instance(struct trace_array *tr)
 {
 	struct tracer *tracer = tr->current_trace;
 	unsigned long flags;
@@ -948,7 +949,7 @@ static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
 					struct trace_buffer *size_buf, int cpu_id);
 static void set_buffer_entries(struct trace_buffer *buf, unsigned long val);
 
-static int alloc_snapshot(struct trace_array *tr)
+int tracing_alloc_snapshot_instance(struct trace_array *tr)
 {
 	int ret;
 
@@ -994,7 +995,7 @@ int tracing_alloc_snapshot(void)
 	struct trace_array *tr = &global_trace;
 	int ret;
 
-	ret = alloc_snapshot(tr);
+	ret = tracing_alloc_snapshot_instance(tr);
 	WARN_ON(ret < 0);
 
 	return ret;
@@ -1164,10 +1165,18 @@ static struct {
 	{ trace_clock,			"perf",		1 },
 	{ ktime_get_mono_fast_ns,	"mono",		1 },
 	{ ktime_get_raw_fast_ns,	"mono_raw",	1 },
-	{ ktime_get_mono_fast_ns,	"boot",		1 },
+	{ ktime_get_boot_fast_ns,	"boot",		1 },
 	ARCH_TRACE_CLOCKS
 };
 
+bool trace_clock_in_ns(struct trace_array *tr)
+{
+	if (trace_clocks[tr->clock_id].in_ns)
+		return true;
+
+	return false;
+}
+
 /*
  * trace_parser_get_init - gets the buffer for trace parser
  */
@@ -2269,7 +2278,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
 
 	*current_rb = trace_file->tr->trace_buffer.buffer;
 
-	if ((trace_file->flags &
+	if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags &
 	     (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
 	    (entry = this_cpu_read(trace_buffered_event))) {
 		/* Try to use the per cpu buffer first */
@@ -4386,8 +4395,7 @@ static int trace_set_options(struct trace_array *tr, char *option)
 {
 	char *cmp;
 	int neg = 0;
-	int ret = -ENODEV;
-	int i;
+	int ret;
 	size_t orig_len = strlen(option);
 
 	cmp = strstrip(option);
@@ -4399,16 +4407,12 @@ static int trace_set_options(struct trace_array *tr, char *option)
 
 	mutex_lock(&trace_types_lock);
 
-	for (i = 0; trace_options[i]; i++) {
-		if (strcmp(cmp, trace_options[i]) == 0) {
-			ret = set_tracer_flag(tr, 1 << i, !neg);
-			break;
-		}
-	}
-
+	ret = match_string(trace_options, -1, cmp);
 	/* If no option could be set, test the specific tracer options */
-	if (!trace_options[i])
+	if (ret < 0)
 		ret = set_tracer_option(tr, cmp, neg);
+	else
+		ret = set_tracer_flag(tr, 1 << ret, !neg);
 
 	mutex_unlock(&trace_types_lock);
 
@@ -4515,6 +4519,9 @@ static const char readme_msg[] =
 #ifdef CONFIG_X86_64
 	"     x86-tsc:   TSC cycle counter\n"
 #endif
+	"\n  timestamp_mode\t-view the mode used to timestamp events\n"
+	"       delta:   Delta difference against a buffer-wide timestamp\n"
+	"    absolute:   Absolute (standalone) timestamp\n"
 	"\n  trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
 	"\n  trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n"
 	"  tracing_cpumask\t- Limit which CPUs to trace\n"
@@ -4691,8 +4698,9 @@ static const char readme_msg[] =
 	"\t            .sym        display an address as a symbol\n"
 	"\t            .sym-offset display an address as a symbol and offset\n"
 	"\t            .execname   display a common_pid as a program name\n"
-	"\t            .syscall    display a syscall id as a syscall name\n\n"
-	"\t            .log2       display log2 value rather than raw number\n\n"
+	"\t            .syscall    display a syscall id as a syscall name\n"
+	"\t            .log2       display log2 value rather than raw number\n"
+	"\t            .usecs      display a common_timestamp in microseconds\n\n"
 	"\t    The 'pause' parameter can be used to pause an existing hist\n"
 	"\t    trigger or to start a hist trigger but not log any events\n"
 	"\t    until told to do so.  'continue' can be used to start or\n"
@@ -5395,7 +5403,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
 
 #ifdef CONFIG_TRACER_MAX_TRACE
 	if (t->use_max_tr && !had_max_tr) {
-		ret = alloc_snapshot(tr);
+		ret = tracing_alloc_snapshot_instance(tr);
 		if (ret < 0)
 			goto out;
 	}
@@ -6061,6 +6069,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 {
 	struct trace_array *tr = filp->private_data;
 	struct ring_buffer_event *event;
+	enum event_trigger_type tt = ETT_NONE;
 	struct ring_buffer *buffer;
 	struct print_entry *entry;
 	unsigned long irq_flags;
@@ -6109,6 +6118,12 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 		written = cnt;
 	len = cnt;
 
+	if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) {
+		/* do not add \n before testing triggers, but add \0 */
+		entry->buf[cnt] = '\0';
+		tt = event_triggers_call(tr->trace_marker_file, entry, event);
+	}
+
 	if (entry->buf[cnt - 1] != '\n') {
 		entry->buf[cnt] = '\n';
 		entry->buf[cnt + 1] = '\0';
@@ -6117,6 +6132,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 
 	__buffer_unlock_commit(buffer, event);
 
+	if (tt)
+		event_triggers_post_call(tr->trace_marker_file, tt);
+
 	if (written > 0)
 		*fpos += written;
 
@@ -6202,7 +6220,7 @@ static int tracing_clock_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int tracing_set_clock(struct trace_array *tr, const char *clockstr)
+int tracing_set_clock(struct trace_array *tr, const char *clockstr)
 {
 	int i;
 
@@ -6282,6 +6300,71 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
+static int tracing_time_stamp_mode_show(struct seq_file *m, void *v)
+{
+	struct trace_array *tr = m->private;
+
+	mutex_lock(&trace_types_lock);
+
+	if (ring_buffer_time_stamp_abs(tr->trace_buffer.buffer))
+		seq_puts(m, "delta [absolute]\n");
+	else
+		seq_puts(m, "[delta] absolute\n");
+
+	mutex_unlock(&trace_types_lock);
+
+	return 0;
+}
+
+static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file)
+{
+	struct trace_array *tr = inode->i_private;
+	int ret;
+
+	if (tracing_disabled)
+		return -ENODEV;
+
+	if (trace_array_get(tr))
+		return -ENODEV;
+
+	ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private);
+	if (ret < 0)
+		trace_array_put(tr);
+
+	return ret;
+}
+
+int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs)
+{
+	int ret = 0;
+
+	mutex_lock(&trace_types_lock);
+
+	if (abs && tr->time_stamp_abs_ref++)
+		goto out;
+
+	if (!abs) {
+		if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		if (--tr->time_stamp_abs_ref)
+			goto out;
+	}
+
+	ring_buffer_set_time_stamp_abs(tr->trace_buffer.buffer, abs);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+	if (tr->max_buffer.buffer)
+		ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs);
+#endif
+ out:
+	mutex_unlock(&trace_types_lock);
+
+	return ret;
+}
+
 struct ftrace_buffer_info {
 	struct trace_iterator	iter;
 	void			*spare;
@@ -6373,7 +6456,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		}
 #endif
 		if (!tr->allocated_snapshot) {
-			ret = alloc_snapshot(tr);
+			ret = tracing_alloc_snapshot_instance(tr);
 			if (ret < 0)
 				break;
 		}
@@ -6529,6 +6612,13 @@ static const struct file_operations trace_clock_fops = {
 	.write		= tracing_clock_write,
 };
 
+static const struct file_operations trace_time_stamp_mode_fops = {
+	.open		= tracing_time_stamp_mode_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= tracing_single_release_tr,
+};
+
 #ifdef CONFIG_TRACER_SNAPSHOT
 static const struct file_operations snapshot_fops = {
 	.open		= tracing_snapshot_open,
@@ -7094,7 +7184,7 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash,
 		return ret;
 
  out_reg:
-	ret = alloc_snapshot(tr);
+	ret = tracing_alloc_snapshot_instance(tr);
 	if (ret < 0)
 		goto out;
 
@@ -7699,6 +7789,7 @@ static int instance_mkdir(const char *name)
 
 	INIT_LIST_HEAD(&tr->systems);
 	INIT_LIST_HEAD(&tr->events);
+	INIT_LIST_HEAD(&tr->hist_vars);
 
 	if (allocate_trace_buffers(tr, trace_buf_size) < 0)
 		goto out_free_tr;
@@ -7810,6 +7901,7 @@ static __init void create_trace_instances(struct dentry *d_tracer)
 static void
 init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 {
+	struct trace_event_file *file;
 	int cpu;
 
 	trace_create_file("available_tracers", 0444, d_tracer,
@@ -7842,6 +7934,12 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 	trace_create_file("trace_marker", 0220, d_tracer,
 			  tr, &tracing_mark_fops);
 
+	file = __find_event_file(tr, "ftrace", "print");
+	if (file && file->dir)
+		trace_create_file("trigger", 0644, file->dir, file,
+				  &event_trigger_fops);
+	tr->trace_marker_file = file;
+
 	trace_create_file("trace_marker_raw", 0220, d_tracer,
 			  tr, &tracing_mark_raw_fops);
 
@@ -7851,6 +7949,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 	trace_create_file("tracing_on", 0644, d_tracer,
 			  tr, &rb_simple_fops);
 
+	trace_create_file("timestamp_mode", 0444, d_tracer, tr,
+			  &trace_time_stamp_mode_fops);
+
 	create_trace_options_dir(tr);
 
 #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
@@ -8022,6 +8123,8 @@ static __init int tracer_init_tracefs(void)
 	if (IS_ERR(d_tracer))
 		return 0;
 
+	event_trace_init();
+
 	init_tracer_tracefs(&global_trace, d_tracer);
 	ftrace_init_tracefs_toplevel(&global_trace, d_tracer);
 
@@ -8446,6 +8549,7 @@ __init static int tracer_alloc_buffers(void)
 
 	INIT_LIST_HEAD(&global_trace.systems);
 	INIT_LIST_HEAD(&global_trace.events);
+	INIT_LIST_HEAD(&global_trace.hist_vars);
 	list_add(&global_trace.list, &ftrace_trace_arrays);
 
 	apply_trace_boot_options();
@@ -8507,3 +8611,21 @@ __init static int clear_boot_tracer(void)
 
 fs_initcall(tracer_init_tracefs);
 late_initcall_sync(clear_boot_tracer);
+
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+__init static int tracing_set_default_clock(void)
+{
+	/* sched_clock_stable() is determined in late_initcall */
+	if (!trace_boot_clock && !sched_clock_stable()) {
+		printk(KERN_WARNING
+		       "Unstable clock detected, switching default tracing clock to \"global\"\n"
+		       "If you want to keep using the local clock, then add:\n"
+		       "  \"trace_clock=local\"\n"
+		       "on the kernel command line\n");
+		tracing_set_clock(&global_trace, "global");
+	}
+
+	return 0;
+}
+late_initcall_sync(tracing_set_default_clock);
+#endif
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2a6d0325a761..630c5a24b2b2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -259,6 +259,7 @@ struct trace_array {
 	struct trace_options	*topts;
 	struct list_head	systems;
 	struct list_head	events;
+	struct trace_event_file *trace_marker_file;
 	cpumask_var_t		tracing_cpumask; /* only trace on set CPUs */
 	int			ref;
 #ifdef CONFIG_FUNCTION_TRACER
@@ -273,6 +274,8 @@ struct trace_array {
 	/* function tracing enabled */
 	int			function_enabled;
 #endif
+	int			time_stamp_abs_ref;
+	struct list_head	hist_vars;
 };
 
 enum {
@@ -286,6 +289,11 @@ extern struct mutex trace_types_lock;
 extern int trace_array_get(struct trace_array *tr);
 extern void trace_array_put(struct trace_array *tr);
 
+extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs);
+extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
+
+extern bool trace_clock_in_ns(struct trace_array *tr);
+
 /*
  * The global tracer (top) should be the first trace array added,
  * but we check the flag anyway.
@@ -1209,12 +1217,11 @@ struct ftrace_event_field {
 	int			is_signed;
 };
 
+struct prog_entry;
+
 struct event_filter {
-	int			n_preds;	/* Number assigned */
-	int			a_preds;	/* allocated */
-	struct filter_pred __rcu	*preds;
-	struct filter_pred __rcu	*root;
-	char				*filter_string;
+	struct prog_entry __rcu	*prog;
+	char			*filter_string;
 };
 
 struct event_subsystem {
@@ -1291,7 +1298,7 @@ __event_trigger_test_discard(struct trace_event_file *file,
 	unsigned long eflags = file->flags;
 
 	if (eflags & EVENT_FILE_FL_TRIGGER_COND)
-		*tt = event_triggers_call(file, entry);
+		*tt = event_triggers_call(file, entry, event);
 
 	if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
 	    (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
@@ -1328,7 +1335,7 @@ event_trigger_unlock_commit(struct trace_event_file *file,
 		trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
 
 	if (tt)
-		event_triggers_post_call(file, tt, entry);
+		event_triggers_post_call(file, tt);
 }
 
 /**
@@ -1361,7 +1368,7 @@ event_trigger_unlock_commit_regs(struct trace_event_file *file,
 						irq_flags, pc, regs);
 
 	if (tt)
-		event_triggers_post_call(file, tt, entry);
+		event_triggers_post_call(file, tt);
 }
 
 #define FILTER_PRED_INVALID	((unsigned short)-1)
@@ -1406,12 +1413,8 @@ struct filter_pred {
 	unsigned short		*ops;
 	struct ftrace_event_field *field;
 	int 			offset;
-	int 			not;
+	int			not;
 	int 			op;
-	unsigned short		index;
-	unsigned short		parent;
-	unsigned short		left;
-	unsigned short		right;
 };
 
 static inline bool is_string_field(struct ftrace_event_field *field)
@@ -1449,9 +1452,13 @@ trace_find_event_field(struct trace_event_call *call, char *name);
 extern void trace_event_enable_cmd_record(bool enable);
 extern void trace_event_enable_tgid_record(bool enable);
 
+extern int event_trace_init(void);
 extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
 extern int event_trace_del_tracer(struct trace_array *tr);
 
+extern struct trace_event_file *__find_event_file(struct trace_array *tr,
+						  const char *system,
+						  const char *event);
 extern struct trace_event_file *find_event_file(struct trace_array *tr,
 						const char *system,
 						const char *event);
@@ -1543,6 +1550,8 @@ extern void pause_named_trigger(struct event_trigger_data *data);
 extern void unpause_named_trigger(struct event_trigger_data *data);
 extern void set_named_trigger_data(struct event_trigger_data *data,
 				   struct event_trigger_data *named_data);
+extern struct event_trigger_data *
+get_named_trigger_data(struct event_trigger_data *data);
 extern int register_event_command(struct event_command *cmd);
 extern int unregister_event_command(struct event_command *cmd);
 extern int register_trigger_hist_enable_disable_cmds(void);
@@ -1586,7 +1595,8 @@ extern int register_trigger_hist_enable_disable_cmds(void);
  */
 struct event_trigger_ops {
 	void			(*func)(struct event_trigger_data *data,
-					void *rec);
+					void *rec,
+					struct ring_buffer_event *rbe);
 	int			(*init)(struct event_trigger_ops *ops,
 					struct event_trigger_data *data);
 	void			(*free)(struct event_trigger_ops *ops,
@@ -1812,6 +1822,17 @@ static inline void __init trace_event_init(void) { }
 static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { }
 #endif
 
+#ifdef CONFIG_TRACER_SNAPSHOT
+void tracing_snapshot_instance(struct trace_array *tr);
+int tracing_alloc_snapshot_instance(struct trace_array *tr);
+#else
+static inline void tracing_snapshot_instance(struct trace_array *tr) { }
+static inline int tracing_alloc_snapshot_instance(struct trace_array *tr)
+{
+	return 0;
+}
+#endif
+
 extern struct trace_iterator *tracepoint_print_iter;
 
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 22fee766081b..80e0b2aca703 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -159,13 +159,13 @@ static int benchmark_event_kthread(void *arg)
 		 * wants to run, schedule in, but if the CPU is idle,
 		 * we'll keep burning cycles.
 		 *
-		 * Note the _rcu_qs() version of cond_resched() will
+		 * Note the tasks_rcu_qs() version of cond_resched() will
 		 * notify synchronize_rcu_tasks() that this thread has
 		 * passed a quiescent state for rcu_tasks. Otherwise
 		 * this thread will never voluntarily schedule which would
 		 * block synchronize_rcu_tasks() indefinitely.
 		 */
-		cond_resched();
+		cond_resched_tasks_rcu_qs();
 	}
 
 	return 0;
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 5fdc779f411d..d8a188e0418a 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -96,7 +96,7 @@ u64 notrace trace_clock_global(void)
 	int this_cpu;
 	u64 now;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	this_cpu = raw_smp_processor_id();
 	now = sched_clock_cpu(this_cpu);
@@ -122,7 +122,7 @@ u64 notrace trace_clock_global(void)
 	arch_spin_unlock(&trace_clock_struct.lock);
 
  out:
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	return now;
 }
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e954ae3d82c0..1d67464ed95e 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -230,7 +230,7 @@ FTRACE_ENTRY(bprint, bprint_entry,
 	FILTER_OTHER
 );
 
-FTRACE_ENTRY(print, print_entry,
+FTRACE_ENTRY_REG(print, print_entry,
 
 	TRACE_PRINT,
 
@@ -242,7 +242,9 @@ FTRACE_ENTRY(print, print_entry,
 	F_printk("%ps: %s",
 		 (void *)__entry->ip, __entry->buf),
 
-	FILTER_OTHER
+	FILTER_OTHER,
+
+	ftrace_event_register
 );
 
 FTRACE_ENTRY(raw_data, raw_data_entry,
@@ -356,7 +358,7 @@ FTRACE_ENTRY(hwlat, hwlat_entry,
 		__field(	unsigned int,		seqnum		)
 	),
 
-	F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n",
+	F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llu\tnmi-ts:%llu\tnmi-count:%u\n",
 		 __entry->seqnum,
 		 __entry->tv_sec,
 		 __entry->tv_nsec,
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 2c416509b834..c79193e598f5 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -252,6 +252,8 @@ int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe)
 		ret = strncpy_from_user(
 			func, u64_to_user_ptr(p_event->attr.kprobe_func),
 			KSYM_NAME_LEN);
+		if (ret == KSYM_NAME_LEN)
+			ret = -E2BIG;
 		if (ret < 0)
 			goto out;
 
@@ -300,6 +302,8 @@ int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe)
 		return -ENOMEM;
 	ret = strncpy_from_user(
 		path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX);
+	if (ret == PATH_MAX)
+		return -E2BIG;
 	if (ret < 0)
 		goto out;
 	if (path[0] == '\0') {
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 05c7172c6667..14ff4ff3caab 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2007,16 +2007,18 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
 			return -1;
 		}
 	}
-	trace_create_file("filter", 0644, file->dir, file,
-			  &ftrace_event_filter_fops);
 
 	/*
 	 * Only event directories that can be enabled should have
-	 * triggers.
+	 * triggers or filters.
 	 */
-	if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
+	if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) {
+		trace_create_file("filter", 0644, file->dir, file,
+				  &ftrace_event_filter_fops);
+
 		trace_create_file("trigger", 0644, file->dir, file,
 				  &event_trigger_fops);
+	}
 
 #ifdef CONFIG_HIST_TRIGGERS
 	trace_create_file("hist", 0444, file->dir, file,
@@ -2473,8 +2475,9 @@ __trace_add_event_dirs(struct trace_array *tr)
 	}
 }
 
+/* Returns any file that matches the system and event */
 struct trace_event_file *
-find_event_file(struct trace_array *tr, const char *system,  const char *event)
+__find_event_file(struct trace_array *tr, const char *system, const char *event)
 {
 	struct trace_event_file *file;
 	struct trace_event_call *call;
@@ -2485,10 +2488,7 @@ find_event_file(struct trace_array *tr, const char *system,  const char *event)
 		call = file->event_call;
 		name = trace_event_name(call);
 
-		if (!name || !call->class || !call->class->reg)
-			continue;
-
-		if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
+		if (!name || !call->class)
 			continue;
 
 		if (strcmp(event, name) == 0 &&
@@ -2498,6 +2498,20 @@ find_event_file(struct trace_array *tr, const char *system,  const char *event)
 	return NULL;
 }
 
+/* Returns valid trace event files that match system and event */
+struct trace_event_file *
+find_event_file(struct trace_array *tr, const char *system, const char *event)
+{
+	struct trace_event_file *file;
+
+	file = __find_event_file(tr, system, event);
+	if (!file || !file->event_call->class->reg ||
+	    file->event_call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
+		return NULL;
+
+	return file;
+}
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 /* Avoid typos */
@@ -3132,7 +3146,7 @@ static __init int event_trace_enable_again(void)
 
 early_initcall(event_trace_enable_again);
 
-static __init int event_trace_init(void)
+__init int event_trace_init(void)
 {
 	struct trace_array *tr;
 	struct dentry *d_tracer;
@@ -3177,8 +3191,6 @@ void __init trace_event_init(void)
 	event_trace_enable();
 }
 
-fs_initcall(event_trace_init);
-
 #ifdef CONFIG_FTRACE_STARTUP_TEST
 
 static DEFINE_SPINLOCK(test_spinlock);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index a764aec3c9a1..0171407d231f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -33,163 +33,595 @@
 	"# Only events with the given fields will be affected.\n"	\
 	"# If no events are modified, an error message will be displayed here"
 
-enum filter_op_ids
-{
-	OP_OR,
-	OP_AND,
-	OP_GLOB,
-	OP_NE,
-	OP_EQ,
-	OP_LT,
-	OP_LE,
-	OP_GT,
-	OP_GE,
-	OP_BAND,
-	OP_NOT,
-	OP_NONE,
-	OP_OPEN_PAREN,
-};
+/* Due to token parsing '<=' must be before '<' and '>=' must be before '>' */
+#define OPS					\
+	C( OP_GLOB,	"~"  ),			\
+	C( OP_NE,	"!=" ),			\
+	C( OP_EQ,	"==" ),			\
+	C( OP_LE,	"<=" ),			\
+	C( OP_LT,	"<"  ),			\
+	C( OP_GE,	">=" ),			\
+	C( OP_GT,	">"  ),			\
+	C( OP_BAND,	"&"  ),			\
+	C( OP_MAX,	NULL )
 
-struct filter_op {
-	int id;
-	char *string;
-	int precedence;
-};
+#undef C
+#define C(a, b)	a
 
-/* Order must be the same as enum filter_op_ids above */
-static struct filter_op filter_ops[] = {
-	{ OP_OR,	"||",		1 },
-	{ OP_AND,	"&&",		2 },
-	{ OP_GLOB,	"~",		4 },
-	{ OP_NE,	"!=",		4 },
-	{ OP_EQ,	"==",		4 },
-	{ OP_LT,	"<",		5 },
-	{ OP_LE,	"<=",		5 },
-	{ OP_GT,	">",		5 },
-	{ OP_GE,	">=",		5 },
-	{ OP_BAND,	"&",		6 },
-	{ OP_NOT,	"!",		6 },
-	{ OP_NONE,	"OP_NONE",	0 },
-	{ OP_OPEN_PAREN, "(",		0 },
-};
+enum filter_op_ids { OPS };
 
-enum {
-	FILT_ERR_NONE,
-	FILT_ERR_INVALID_OP,
-	FILT_ERR_UNBALANCED_PAREN,
-	FILT_ERR_TOO_MANY_OPERANDS,
-	FILT_ERR_OPERAND_TOO_LONG,
-	FILT_ERR_FIELD_NOT_FOUND,
-	FILT_ERR_ILLEGAL_FIELD_OP,
-	FILT_ERR_ILLEGAL_INTVAL,
-	FILT_ERR_BAD_SUBSYS_FILTER,
-	FILT_ERR_TOO_MANY_PREDS,
-	FILT_ERR_MISSING_FIELD,
-	FILT_ERR_INVALID_FILTER,
-	FILT_ERR_IP_FIELD_ONLY,
-	FILT_ERR_ILLEGAL_NOT_OP,
-};
+#undef C
+#define C(a, b)	b
 
-static char *err_text[] = {
-	"No error",
-	"Invalid operator",
-	"Unbalanced parens",
-	"Too many operands",
-	"Operand too long",
-	"Field not found",
-	"Illegal operation for field type",
-	"Illegal integer value",
-	"Couldn't find or set field in one of a subsystem's events",
-	"Too many terms in predicate expression",
-	"Missing field name and/or value",
-	"Meaningless filter expression",
-	"Only 'ip' field is supported for function trace",
-	"Illegal use of '!'",
-};
+static const char * ops[] = { OPS };
 
-struct opstack_op {
-	enum filter_op_ids op;
-	struct list_head list;
-};
+/*
+ * pred functions are OP_LE, OP_LT, OP_GE, OP_GT, and OP_BAND
+ * pred_funcs_##type below must match the order of them above.
+ */
+#define PRED_FUNC_START			OP_LE
+#define PRED_FUNC_MAX			(OP_BAND - PRED_FUNC_START)
+
+#define ERRORS								\
+	C(NONE,			"No error"),				\
+	C(INVALID_OP,		"Invalid operator"),			\
+	C(TOO_MANY_OPEN,	"Too many '('"),			\
+	C(TOO_MANY_CLOSE,	"Too few '('"),				\
+	C(MISSING_QUOTE,	"Missing matching quote"),		\
+	C(OPERAND_TOO_LONG,	"Operand too long"),			\
+	C(EXPECT_STRING,	"Expecting string field"),		\
+	C(EXPECT_DIGIT,		"Expecting numeric field"),		\
+	C(ILLEGAL_FIELD_OP,	"Illegal operation for field type"),	\
+	C(FIELD_NOT_FOUND,	"Field not found"),			\
+	C(ILLEGAL_INTVAL,	"Illegal integer value"),		\
+	C(BAD_SUBSYS_FILTER,	"Couldn't find or set field in one of a subsystem's events"), \
+	C(TOO_MANY_PREDS,	"Too many terms in predicate expression"), \
+	C(INVALID_FILTER,	"Meaningless filter expression"),	\
+	C(IP_FIELD_ONLY,	"Only 'ip' field is supported for function trace"), \
+	C(INVALID_VALUE,	"Invalid value (did you forget quotes)?"),
+
+#undef C
+#define C(a, b)		FILT_ERR_##a
+
+enum { ERRORS };
+
+#undef C
+#define C(a, b)		b
+
+static char *err_text[] = { ERRORS };
+
+/* Called after a '!' character but "!=" and "!~" are not "not"s */
+static bool is_not(const char *str)
+{
+	switch (str[1]) {
+	case '=':
+	case '~':
+		return false;
+	}
+	return true;
+}
 
-struct postfix_elt {
-	enum filter_op_ids op;
-	char *operand;
-	struct list_head list;
+/**
+ * prog_entry - a singe entry in the filter program
+ * @target:	     Index to jump to on a branch (actually one minus the index)
+ * @when_to_branch:  The value of the result of the predicate to do a branch
+ * @pred:	     The predicate to execute.
+ */
+struct prog_entry {
+	int			target;
+	int			when_to_branch;
+	struct filter_pred	*pred;
 };
 
-struct filter_parse_state {
-	struct filter_op *ops;
-	struct list_head opstack;
-	struct list_head postfix;
+/**
+ * update_preds- assign a program entry a label target
+ * @prog: The program array
+ * @N: The index of the current entry in @prog
+ * @when_to_branch: What to assign a program entry for its branch condition
+ *
+ * The program entry at @N has a target that points to the index of a program
+ * entry that can have its target and when_to_branch fields updated.
+ * Update the current program entry denoted by index @N target field to be
+ * that of the updated entry. This will denote the entry to update if
+ * we are processing an "||" after an "&&"
+ */
+static void update_preds(struct prog_entry *prog, int N, int invert)
+{
+	int t, s;
+
+	t = prog[N].target;
+	s = prog[t].target;
+	prog[t].when_to_branch = invert;
+	prog[t].target = N;
+	prog[N].target = s;
+}
+
+struct filter_parse_error {
 	int lasterr;
 	int lasterr_pos;
-
-	struct {
-		char *string;
-		unsigned int cnt;
-		unsigned int tail;
-	} infix;
-
-	struct {
-		char string[MAX_FILTER_STR_VAL];
-		int pos;
-		unsigned int tail;
-	} operand;
 };
 
-struct pred_stack {
-	struct filter_pred	**preds;
-	int			index;
+static void parse_error(struct filter_parse_error *pe, int err, int pos)
+{
+	pe->lasterr = err;
+	pe->lasterr_pos = pos;
+}
+
+typedef int (*parse_pred_fn)(const char *str, void *data, int pos,
+			     struct filter_parse_error *pe,
+			     struct filter_pred **pred);
+
+enum {
+	INVERT		= 1,
+	PROCESS_AND	= 2,
+	PROCESS_OR	= 4,
 };
 
-/* If not of not match is equal to not of not, then it is a match */
+/*
+ * Without going into a formal proof, this explains the method that is used in
+ * parsing the logical expressions.
+ *
+ * For example, if we have: "a && !(!b || (c && g)) || d || e && !f"
+ * The first pass will convert it into the following program:
+ *
+ * n1: r=a;       l1: if (!r) goto l4;
+ * n2: r=b;       l2: if (!r) goto l4;
+ * n3: r=c; r=!r; l3: if (r) goto l4;
+ * n4: r=g; r=!r; l4: if (r) goto l5;
+ * n5: r=d;       l5: if (r) goto T
+ * n6: r=e;       l6: if (!r) goto l7;
+ * n7: r=f; r=!r; l7: if (!r) goto F
+ * T: return TRUE
+ * F: return FALSE
+ *
+ * To do this, we use a data structure to represent each of the above
+ * predicate and conditions that has:
+ *
+ *  predicate, when_to_branch, invert, target
+ *
+ * The "predicate" will hold the function to determine the result "r".
+ * The "when_to_branch" denotes what "r" should be if a branch is to be taken
+ * "&&" would contain "!r" or (0) and "||" would contain "r" or (1).
+ * The "invert" holds whether the value should be reversed before testing.
+ * The "target" contains the label "l#" to jump to.
+ *
+ * A stack is created to hold values when parentheses are used.
+ *
+ * To simplify the logic, the labels will start at 0 and not 1.
+ *
+ * The possible invert values are 1 and 0. The number of "!"s that are in scope
+ * before the predicate determines the invert value, if the number is odd then
+ * the invert value is 1 and 0 otherwise. This means the invert value only
+ * needs to be toggled when a new "!" is introduced compared to what is stored
+ * on the stack, where parentheses were used.
+ *
+ * The top of the stack and "invert" are initialized to zero.
+ *
+ * ** FIRST PASS **
+ *
+ * #1 A loop through all the tokens is done:
+ *
+ * #2 If the token is an "(", the stack is push, and the current stack value
+ *    gets the current invert value, and the loop continues to the next token.
+ *    The top of the stack saves the "invert" value to keep track of what
+ *    the current inversion is. As "!(a && !b || c)" would require all
+ *    predicates being affected separately by the "!" before the parentheses.
+ *    And that would end up being equivalent to "(!a || b) && !c"
+ *
+ * #3 If the token is an "!", the current "invert" value gets inverted, and
+ *    the loop continues. Note, if the next token is a predicate, then
+ *    this "invert" value is only valid for the current program entry,
+ *    and does not affect other predicates later on.
+ *
+ * The only other acceptable token is the predicate string.
+ *
+ * #4 A new entry into the program is added saving: the predicate and the
+ *    current value of "invert". The target is currently assigned to the
+ *    previous program index (this will not be its final value).
+ *
+ * #5 We now enter another loop and look at the next token. The only valid
+ *    tokens are ")", "&&", "||" or end of the input string "\0".
+ *
+ * #6 The invert variable is reset to the current value saved on the top of
+ *    the stack.
+ *
+ * #7 The top of the stack holds not only the current invert value, but also
+ *    if a "&&" or "||" needs to be processed. Note, the "&&" takes higher
+ *    precedence than "||". That is "a && b || c && d" is equivalent to
+ *    "(a && b) || (c && d)". Thus the first thing to do is to see if "&&" needs
+ *    to be processed. This is the case if an "&&" was the last token. If it was
+ *    then we call update_preds(). This takes the program, the current index in
+ *    the program, and the current value of "invert".  More will be described
+ *    below about this function.
+ *
+ * #8 If the next token is "&&" then we set a flag in the top of the stack
+ *    that denotes that "&&" needs to be processed, break out of this loop
+ *    and continue with the outer loop.
+ *
+ * #9 Otherwise, if a "||" needs to be processed then update_preds() is called.
+ *    This is called with the program, the current index in the program, but
+ *    this time with an inverted value of "invert" (that is !invert). This is
+ *    because the value taken will become the "when_to_branch" value of the
+ *    program.
+ *    Note, this is called when the next token is not an "&&". As stated before,
+ *    "&&" takes higher precedence, and "||" should not be processed yet if the
+ *    next logical operation is "&&".
+ *
+ * #10 If the next token is "||" then we set a flag in the top of the stack
+ *     that denotes that "||" needs to be processed, break out of this loop
+ *     and continue with the outer loop.
+ *
+ * #11 If this is the end of the input string "\0" then we break out of both
+ *     loops.
+ *
+ * #12 Otherwise, the next token is ")", where we pop the stack and continue
+ *     this inner loop.
+ *
+ * Now to discuss the update_pred() function, as that is key to the setting up
+ * of the program. Remember the "target" of the program is initialized to the
+ * previous index and not the "l" label. The target holds the index into the
+ * program that gets affected by the operand. Thus if we have something like
+ *  "a || b && c", when we process "a" the target will be "-1" (undefined).
+ * When we process "b", its target is "0", which is the index of "a", as that's
+ * the predicate that is affected by "||". But because the next token after "b"
+ * is "&&" we don't call update_preds(). Instead continue to "c". As the
+ * next token after "c" is not "&&" but the end of input, we first process the
+ * "&&" by calling update_preds() for the "&&" then we process the "||" by
+ * callin updates_preds() with the values for processing "||".
+ *
+ * What does that mean? What update_preds() does is to first save the "target"
+ * of the program entry indexed by the current program entry's "target"
+ * (remember the "target" is initialized to previous program entry), and then
+ * sets that "target" to the current index which represents the label "l#".
+ * That entry's "when_to_branch" is set to the value passed in (the "invert"
+ * or "!invert"). Then it sets the current program entry's target to the saved
+ * "target" value (the old value of the program that had its "target" updated
+ * to the label).
+ *
+ * Looking back at "a || b && c", we have the following steps:
+ *  "a"  - prog[0] = { "a", X, -1 } // pred, when_to_branch, target
+ *  "||" - flag that we need to process "||"; continue outer loop
+ *  "b"  - prog[1] = { "b", X, 0 }
+ *  "&&" - flag that we need to process "&&"; continue outer loop
+ * (Notice we did not process "||")
+ *  "c"  - prog[2] = { "c", X, 1 }
+ *  update_preds(prog, 2, 0); // invert = 0 as we are processing "&&"
+ *    t = prog[2].target; // t = 1
+ *    s = prog[t].target; // s = 0
+ *    prog[t].target = 2; // Set target to "l2"
+ *    prog[t].when_to_branch = 0;
+ *    prog[2].target = s;
+ * update_preds(prog, 2, 1); // invert = 1 as we are now processing "||"
+ *    t = prog[2].target; // t = 0
+ *    s = prog[t].target; // s = -1
+ *    prog[t].target = 2; // Set target to "l2"
+ *    prog[t].when_to_branch = 1;
+ *    prog[2].target = s;
+ *
+ * #13 Which brings us to the final step of the first pass, which is to set
+ *     the last program entry's when_to_branch and target, which will be
+ *     when_to_branch = 0; target = N; ( the label after the program entry after
+ *     the last program entry processed above).
+ *
+ * If we denote "TRUE" to be the entry after the last program entry processed,
+ * and "FALSE" the program entry after that, we are now done with the first
+ * pass.
+ *
+ * Making the above "a || b && c" have a progam of:
+ *  prog[0] = { "a", 1, 2 }
+ *  prog[1] = { "b", 0, 2 }
+ *  prog[2] = { "c", 0, 3 }
+ *
+ * Which translates into:
+ * n0: r = a; l0: if (r) goto l2;
+ * n1: r = b; l1: if (!r) goto l2;
+ * n2: r = c; l2: if (!r) goto l3;  // Which is the same as "goto F;"
+ * T: return TRUE; l3:
+ * F: return FALSE
+ *
+ * Although, after the first pass, the program is correct, it is
+ * inefficient. The simple sample of "a || b && c" could be easily been
+ * converted into:
+ * n0: r = a; if (r) goto T
+ * n1: r = b; if (!r) goto F
+ * n2: r = c; if (!r) goto F
+ * T: return TRUE;
+ * F: return FALSE;
+ *
+ * The First Pass is over the input string. The next too passes are over
+ * the program itself.
+ *
+ * ** SECOND PASS **
+ *
+ * Which brings us to the second pass. If a jump to a label has the
+ * same condition as that label, it can instead jump to its target.
+ * The original example of "a && !(!b || (c && g)) || d || e && !f"
+ * where the first pass gives us:
+ *
+ * n1: r=a;       l1: if (!r) goto l4;
+ * n2: r=b;       l2: if (!r) goto l4;
+ * n3: r=c; r=!r; l3: if (r) goto l4;
+ * n4: r=g; r=!r; l4: if (r) goto l5;
+ * n5: r=d;       l5: if (r) goto T
+ * n6: r=e;       l6: if (!r) goto l7;
+ * n7: r=f; r=!r; l7: if (!r) goto F:
+ * T: return TRUE;
+ * F: return FALSE
+ *
+ * We can see that "l3: if (r) goto l4;" and at l4, we have "if (r) goto l5;".
+ * And "l5: if (r) goto T", we could optimize this by converting l3 and l4
+ * to go directly to T. To accomplish this, we start from the last
+ * entry in the program and work our way back. If the target of the entry
+ * has the same "when_to_branch" then we could use that entry's target.
+ * Doing this, the above would end up as:
+ *
+ * n1: r=a;       l1: if (!r) goto l4;
+ * n2: r=b;       l2: if (!r) goto l4;
+ * n3: r=c; r=!r; l3: if (r) goto T;
+ * n4: r=g; r=!r; l4: if (r) goto T;
+ * n5: r=d;       l5: if (r) goto T;
+ * n6: r=e;       l6: if (!r) goto F;
+ * n7: r=f; r=!r; l7: if (!r) goto F;
+ * T: return TRUE
+ * F: return FALSE
+ *
+ * In that same pass, if the "when_to_branch" doesn't match, we can simply
+ * go to the program entry after the label. That is, "l2: if (!r) goto l4;"
+ * where "l4: if (r) goto T;", then we can convert l2 to be:
+ * "l2: if (!r) goto n5;".
+ *
+ * This will have the second pass give us:
+ * n1: r=a;       l1: if (!r) goto n5;
+ * n2: r=b;       l2: if (!r) goto n5;
+ * n3: r=c; r=!r; l3: if (r) goto T;
+ * n4: r=g; r=!r; l4: if (r) goto T;
+ * n5: r=d;       l5: if (r) goto T
+ * n6: r=e;       l6: if (!r) goto F;
+ * n7: r=f; r=!r; l7: if (!r) goto F
+ * T: return TRUE
+ * F: return FALSE
+ *
+ * Notice, all the "l#" labels are no longer used, and they can now
+ * be discarded.
+ *
+ * ** THIRD PASS **
+ *
+ * For the third pass we deal with the inverts. As they simply just
+ * make the "when_to_branch" get inverted, a simple loop over the
+ * program to that does: "when_to_branch ^= invert;" will do the
+ * job, leaving us with:
+ * n1: r=a; if (!r) goto n5;
+ * n2: r=b; if (!r) goto n5;
+ * n3: r=c: if (!r) goto T;
+ * n4: r=g; if (!r) goto T;
+ * n5: r=d; if (r) goto T
+ * n6: r=e; if (!r) goto F;
+ * n7: r=f; if (r) goto F
+ * T: return TRUE
+ * F: return FALSE
+ *
+ * As "r = a; if (!r) goto n5;" is obviously the same as
+ * "if (!a) goto n5;" without doing anything we can interperate the
+ * program as:
+ * n1: if (!a) goto n5;
+ * n2: if (!b) goto n5;
+ * n3: if (!c) goto T;
+ * n4: if (!g) goto T;
+ * n5: if (d) goto T
+ * n6: if (!e) goto F;
+ * n7: if (f) goto F
+ * T: return TRUE
+ * F: return FALSE
+ *
+ * Since the inverts are discarded at the end, there's no reason to store
+ * them in the program array (and waste memory). A separate array to hold
+ * the inverts is used and freed at the end.
+ */
+static struct prog_entry *
+predicate_parse(const char *str, int nr_parens, int nr_preds,
+		parse_pred_fn parse_pred, void *data,
+		struct filter_parse_error *pe)
+{
+	struct prog_entry *prog_stack;
+	struct prog_entry *prog;
+	const char *ptr = str;
+	char *inverts = NULL;
+	int *op_stack;
+	int *top;
+	int invert = 0;
+	int ret = -ENOMEM;
+	int len;
+	int N = 0;
+	int i;
+
+	nr_preds += 2; /* For TRUE and FALSE */
+
+	op_stack = kmalloc(sizeof(*op_stack) * nr_parens, GFP_KERNEL);
+	if (!op_stack)
+		return ERR_PTR(-ENOMEM);
+	prog_stack = kmalloc(sizeof(*prog_stack) * nr_preds, GFP_KERNEL);
+	if (!prog_stack) {
+		parse_error(pe, -ENOMEM, 0);
+		goto out_free;
+	}
+	inverts = kmalloc(sizeof(*inverts) * nr_preds, GFP_KERNEL);
+	if (!inverts) {
+		parse_error(pe, -ENOMEM, 0);
+		goto out_free;
+	}
+
+	top = op_stack;
+	prog = prog_stack;
+	*top = 0;
+
+	/* First pass */
+	while (*ptr) {						/* #1 */
+		const char *next = ptr++;
+
+		if (isspace(*next))
+			continue;
+
+		switch (*next) {
+		case '(':					/* #2 */
+			if (top - op_stack > nr_parens)
+				return ERR_PTR(-EINVAL);
+			*(++top) = invert;
+			continue;
+		case '!':					/* #3 */
+			if (!is_not(next))
+				break;
+			invert = !invert;
+			continue;
+		}
+
+		if (N >= nr_preds) {
+			parse_error(pe, FILT_ERR_TOO_MANY_PREDS, next - str);
+			goto out_free;
+		}
+
+		inverts[N] = invert;				/* #4 */
+		prog[N].target = N-1;
+
+		len = parse_pred(next, data, ptr - str, pe, &prog[N].pred);
+		if (len < 0) {
+			ret = len;
+			goto out_free;
+		}
+		ptr = next + len;
+
+		N++;
+
+		ret = -1;
+		while (1) {					/* #5 */
+			next = ptr++;
+			if (isspace(*next))
+				continue;
+
+			switch (*next) {
+			case ')':
+			case '\0':
+				break;
+			case '&':
+			case '|':
+				if (next[1] == next[0]) {
+					ptr++;
+					break;
+				}
+			default:
+				parse_error(pe, FILT_ERR_TOO_MANY_PREDS,
+					    next - str);
+				goto out_free;
+			}
+
+			invert = *top & INVERT;
+
+			if (*top & PROCESS_AND) {		/* #7 */
+				update_preds(prog, N - 1, invert);
+				*top &= ~PROCESS_AND;
+			}
+			if (*next == '&') {			/* #8 */
+				*top |= PROCESS_AND;
+				break;
+			}
+			if (*top & PROCESS_OR) {		/* #9 */
+				update_preds(prog, N - 1, !invert);
+				*top &= ~PROCESS_OR;
+			}
+			if (*next == '|') {			/* #10 */
+				*top |= PROCESS_OR;
+				break;
+			}
+			if (!*next)				/* #11 */
+				goto out;
+
+			if (top == op_stack) {
+				ret = -1;
+				/* Too few '(' */
+				parse_error(pe, FILT_ERR_TOO_MANY_CLOSE, ptr - str);
+				goto out_free;
+			}
+			top--;					/* #12 */
+		}
+	}
+ out:
+	if (top != op_stack) {
+		/* Too many '(' */
+		parse_error(pe, FILT_ERR_TOO_MANY_OPEN, ptr - str);
+		goto out_free;
+	}
+
+	prog[N].pred = NULL;					/* #13 */
+	prog[N].target = 1;		/* TRUE */
+	prog[N+1].pred = NULL;
+	prog[N+1].target = 0;		/* FALSE */
+	prog[N-1].target = N;
+	prog[N-1].when_to_branch = false;
+
+	/* Second Pass */
+	for (i = N-1 ; i--; ) {
+		int target = prog[i].target;
+		if (prog[i].when_to_branch == prog[target].when_to_branch)
+			prog[i].target = prog[target].target;
+	}
+
+	/* Third Pass */
+	for (i = 0; i < N; i++) {
+		invert = inverts[i] ^ prog[i].when_to_branch;
+		prog[i].when_to_branch = invert;
+		/* Make sure the program always moves forward */
+		if (WARN_ON(prog[i].target <= i)) {
+			ret = -EINVAL;
+			goto out_free;
+		}
+	}
+
+	return prog;
+out_free:
+	kfree(op_stack);
+	kfree(prog_stack);
+	kfree(inverts);
+	return ERR_PTR(ret);
+}
+
 #define DEFINE_COMPARISON_PRED(type)					\
 static int filter_pred_LT_##type(struct filter_pred *pred, void *event)	\
 {									\
 	type *addr = (type *)(event + pred->offset);			\
 	type val = (type)pred->val;					\
-	int match = (*addr < val);					\
-	return !!match == !pred->not;					\
+	return *addr < val;						\
 }									\
 static int filter_pred_LE_##type(struct filter_pred *pred, void *event)	\
 {									\
 	type *addr = (type *)(event + pred->offset);			\
 	type val = (type)pred->val;					\
-	int match = (*addr <= val);					\
-	return !!match == !pred->not;					\
+	return *addr <= val;						\
 }									\
 static int filter_pred_GT_##type(struct filter_pred *pred, void *event)	\
 {									\
 	type *addr = (type *)(event + pred->offset);			\
 	type val = (type)pred->val;					\
-	int match = (*addr > val);					\
-	return !!match == !pred->not;					\
+	return *addr > val;					\
 }									\
 static int filter_pred_GE_##type(struct filter_pred *pred, void *event)	\
 {									\
 	type *addr = (type *)(event + pred->offset);			\
 	type val = (type)pred->val;					\
-	int match = (*addr >= val);					\
-	return !!match == !pred->not;					\
+	return *addr >= val;						\
 }									\
 static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \
 {									\
 	type *addr = (type *)(event + pred->offset);			\
 	type val = (type)pred->val;					\
-	int match = !!(*addr & val);					\
-	return match == !pred->not;					\
+	return !!(*addr & val);						\
 }									\
 static const filter_pred_fn_t pred_funcs_##type[] = {			\
-	filter_pred_LT_##type,						\
 	filter_pred_LE_##type,						\
-	filter_pred_GT_##type,						\
+	filter_pred_LT_##type,						\
 	filter_pred_GE_##type,						\
+	filter_pred_GT_##type,						\
 	filter_pred_BAND_##type,					\
 };
 
-#define PRED_FUNC_START			OP_LT
-
 #define DEFINE_EQUALITY_PRED(size)					\
 static int filter_pred_##size(struct filter_pred *pred, void *event)	\
 {									\
@@ -272,44 +704,36 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event)
 static int filter_pred_cpu(struct filter_pred *pred, void *event)
 {
 	int cpu, cmp;
-	int match = 0;
 
 	cpu = raw_smp_processor_id();
 	cmp = pred->val;
 
 	switch (pred->op) {
 	case OP_EQ:
-		match = cpu == cmp;
-		break;
+		return cpu == cmp;
+	case OP_NE:
+		return cpu != cmp;
 	case OP_LT:
-		match = cpu < cmp;
-		break;
+		return cpu < cmp;
 	case OP_LE:
-		match = cpu <= cmp;
-		break;
+		return cpu <= cmp;
 	case OP_GT:
-		match = cpu > cmp;
-		break;
+		return cpu > cmp;
 	case OP_GE:
-		match = cpu >= cmp;
-		break;
+		return cpu >= cmp;
 	default:
-		break;
+		return 0;
 	}
-
-	return !!match == !pred->not;
 }
 
 /* Filter predicate for COMM. */
 static int filter_pred_comm(struct filter_pred *pred, void *event)
 {
-	int cmp, match;
+	int cmp;
 
 	cmp = pred->regex.match(current->comm, &pred->regex,
-				pred->regex.field_len);
-	match = cmp ^ pred->not;
-
-	return match;
+				TASK_COMM_LEN);
+	return cmp ^ pred->not;
 }
 
 static int filter_pred_none(struct filter_pred *pred, void *event)
@@ -326,28 +750,32 @@ static int filter_pred_none(struct filter_pred *pred, void *event)
  *
  * Note:
  * - @str might not be NULL-terminated if it's of type DYN_STRING
- *   or STATIC_STRING
+ *   or STATIC_STRING, unless @len is zero.
  */
 
 static int regex_match_full(char *str, struct regex *r, int len)
 {
-	if (strncmp(str, r->pattern, len) == 0)
-		return 1;
-	return 0;
+	/* len of zero means str is dynamic and ends with '\0' */
+	if (!len)
+		return strcmp(str, r->pattern) == 0;
+
+	return strncmp(str, r->pattern, len) == 0;
 }
 
 static int regex_match_front(char *str, struct regex *r, int len)
 {
-	if (strncmp(str, r->pattern, r->len) == 0)
-		return 1;
-	return 0;
+	if (len && len < r->len)
+		return 0;
+
+	return strncmp(str, r->pattern, r->len) == 0;
 }
 
 static int regex_match_middle(char *str, struct regex *r, int len)
 {
-	if (strnstr(str, r->pattern, len))
-		return 1;
-	return 0;
+	if (!len)
+		return strstr(str, r->pattern) != NULL;
+
+	return strnstr(str, r->pattern, len) != NULL;
 }
 
 static int regex_match_end(char *str, struct regex *r, int len)
@@ -366,6 +794,7 @@ static int regex_match_glob(char *str, struct regex *r, int len __maybe_unused)
 		return 1;
 	return 0;
 }
+
 /**
  * filter_parse_regex - parse a basic regex
  * @buff:   the raw regex
@@ -426,10 +855,9 @@ static void filter_build_regex(struct filter_pred *pred)
 	struct regex *r = &pred->regex;
 	char *search;
 	enum regex_type type = MATCH_FULL;
-	int not = 0;
 
 	if (pred->op == OP_GLOB) {
-		type = filter_parse_regex(r->pattern, r->len, &search, &not);
+		type = filter_parse_regex(r->pattern, r->len, &search, &pred->not);
 		r->len = strlen(search);
 		memmove(r->pattern, search, r->len+1);
 	}
@@ -451,210 +879,32 @@ static void filter_build_regex(struct filter_pred *pred)
 		r->match = regex_match_glob;
 		break;
 	}
-
-	pred->not ^= not;
-}
-
-enum move_type {
-	MOVE_DOWN,
-	MOVE_UP_FROM_LEFT,
-	MOVE_UP_FROM_RIGHT
-};
-
-static struct filter_pred *
-get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
-		int index, enum move_type *move)
-{
-	if (pred->parent & FILTER_PRED_IS_RIGHT)
-		*move = MOVE_UP_FROM_RIGHT;
-	else
-		*move = MOVE_UP_FROM_LEFT;
-	pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT];
-
-	return pred;
-}
-
-enum walk_return {
-	WALK_PRED_ABORT,
-	WALK_PRED_PARENT,
-	WALK_PRED_DEFAULT,
-};
-
-typedef int (*filter_pred_walkcb_t) (enum move_type move,
-				     struct filter_pred *pred,
-				     int *err, void *data);
-
-static int walk_pred_tree(struct filter_pred *preds,
-			  struct filter_pred *root,
-			  filter_pred_walkcb_t cb, void *data)
-{
-	struct filter_pred *pred = root;
-	enum move_type move = MOVE_DOWN;
-	int done = 0;
-
-	if  (!preds)
-		return -EINVAL;
-
-	do {
-		int err = 0, ret;
-
-		ret = cb(move, pred, &err, data);
-		if (ret == WALK_PRED_ABORT)
-			return err;
-		if (ret == WALK_PRED_PARENT)
-			goto get_parent;
-
-		switch (move) {
-		case MOVE_DOWN:
-			if (pred->left != FILTER_PRED_INVALID) {
-				pred = &preds[pred->left];
-				continue;
-			}
-			goto get_parent;
-		case MOVE_UP_FROM_LEFT:
-			pred = &preds[pred->right];
-			move = MOVE_DOWN;
-			continue;
-		case MOVE_UP_FROM_RIGHT:
- get_parent:
-			if (pred == root)
-				break;
-			pred = get_pred_parent(pred, preds,
-					       pred->parent,
-					       &move);
-			continue;
-		}
-		done = 1;
-	} while (!done);
-
-	/* We are fine. */
-	return 0;
-}
-
-/*
- * A series of AND or ORs where found together. Instead of
- * climbing up and down the tree branches, an array of the
- * ops were made in order of checks. We can just move across
- * the array and short circuit if needed.
- */
-static int process_ops(struct filter_pred *preds,
-		       struct filter_pred *op, void *rec)
-{
-	struct filter_pred *pred;
-	int match = 0;
-	int type;
-	int i;
-
-	/*
-	 * Micro-optimization: We set type to true if op
-	 * is an OR and false otherwise (AND). Then we
-	 * just need to test if the match is equal to
-	 * the type, and if it is, we can short circuit the
-	 * rest of the checks:
-	 *
-	 * if ((match && op->op == OP_OR) ||
-	 *     (!match && op->op == OP_AND))
-	 *	  return match;
-	 */
-	type = op->op == OP_OR;
-
-	for (i = 0; i < op->val; i++) {
-		pred = &preds[op->ops[i]];
-		if (!WARN_ON_ONCE(!pred->fn))
-			match = pred->fn(pred, rec);
-		if (!!match == type)
-			break;
-	}
-	/* If not of not match is equal to not of not, then it is a match */
-	return !!match == !op->not;
-}
-
-struct filter_match_preds_data {
-	struct filter_pred *preds;
-	int match;
-	void *rec;
-};
-
-static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred,
-				 int *err, void *data)
-{
-	struct filter_match_preds_data *d = data;
-
-	*err = 0;
-	switch (move) {
-	case MOVE_DOWN:
-		/* only AND and OR have children */
-		if (pred->left != FILTER_PRED_INVALID) {
-			/* If ops is set, then it was folded. */
-			if (!pred->ops)
-				return WALK_PRED_DEFAULT;
-			/* We can treat folded ops as a leaf node */
-			d->match = process_ops(d->preds, pred, d->rec);
-		} else {
-			if (!WARN_ON_ONCE(!pred->fn))
-				d->match = pred->fn(pred, d->rec);
-		}
-
-		return WALK_PRED_PARENT;
-	case MOVE_UP_FROM_LEFT:
-		/*
-		 * Check for short circuits.
-		 *
-		 * Optimization: !!match == (pred->op == OP_OR)
-		 *   is the same as:
-		 * if ((match && pred->op == OP_OR) ||
-		 *     (!match && pred->op == OP_AND))
-		 */
-		if (!!d->match == (pred->op == OP_OR))
-			return WALK_PRED_PARENT;
-		break;
-	case MOVE_UP_FROM_RIGHT:
-		break;
-	}
-
-	return WALK_PRED_DEFAULT;
 }
 
 /* return 1 if event matches, 0 otherwise (discard) */
 int filter_match_preds(struct event_filter *filter, void *rec)
 {
-	struct filter_pred *preds;
-	struct filter_pred *root;
-	struct filter_match_preds_data data = {
-		/* match is currently meaningless */
-		.match = -1,
-		.rec   = rec,
-	};
-	int n_preds, ret;
+	struct prog_entry *prog;
+	int i;
 
 	/* no filter is considered a match */
 	if (!filter)
 		return 1;
 
-	n_preds = filter->n_preds;
-	if (!n_preds)
-		return 1;
-
-	/*
-	 * n_preds, root and filter->preds are protect with preemption disabled.
-	 */
-	root = rcu_dereference_sched(filter->root);
-	if (!root)
+	prog = rcu_dereference_sched(filter->prog);
+	if (!prog)
 		return 1;
 
-	data.preds = preds = rcu_dereference_sched(filter->preds);
-	ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data);
-	WARN_ON(ret);
-	return data.match;
+	for (i = 0; prog[i].pred; i++) {
+		struct filter_pred *pred = prog[i].pred;
+		int match = pred->fn(pred, rec);
+		if (match == prog[i].when_to_branch)
+			i = prog[i].target;
+	}
+	return prog[i].target;
 }
 EXPORT_SYMBOL_GPL(filter_match_preds);
 
-static void parse_error(struct filter_parse_state *ps, int err, int pos)
-{
-	ps->lasterr = err;
-	ps->lasterr_pos = pos;
-}
-
 static void remove_filter_string(struct event_filter *filter)
 {
 	if (!filter)
@@ -664,57 +914,44 @@ static void remove_filter_string(struct event_filter *filter)
 	filter->filter_string = NULL;
 }
 
-static int replace_filter_string(struct event_filter *filter,
-				 char *filter_string)
-{
-	kfree(filter->filter_string);
-	filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
-	if (!filter->filter_string)
-		return -ENOMEM;
-
-	return 0;
-}
-
-static int append_filter_string(struct event_filter *filter,
-				char *string)
-{
-	int newlen;
-	char *new_filter_string;
-
-	BUG_ON(!filter->filter_string);
-	newlen = strlen(filter->filter_string) + strlen(string) + 1;
-	new_filter_string = kmalloc(newlen, GFP_KERNEL);
-	if (!new_filter_string)
-		return -ENOMEM;
-
-	strcpy(new_filter_string, filter->filter_string);
-	strcat(new_filter_string, string);
-	kfree(filter->filter_string);
-	filter->filter_string = new_filter_string;
-
-	return 0;
-}
-
-static void append_filter_err(struct filter_parse_state *ps,
+static void append_filter_err(struct filter_parse_error *pe,
 			      struct event_filter *filter)
 {
-	int pos = ps->lasterr_pos;
-	char *buf, *pbuf;
+	struct trace_seq *s;
+	int pos = pe->lasterr_pos;
+	char *buf;
+	int len;
+
+	if (WARN_ON(!filter->filter_string))
+		return;
 
-	buf = (char *)__get_free_page(GFP_KERNEL);
-	if (!buf)
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
 		return;
+	trace_seq_init(s);
+
+	len = strlen(filter->filter_string);
+	if (pos > len)
+		pos = len;
 
-	append_filter_string(filter, "\n");
-	memset(buf, ' ', PAGE_SIZE);
-	if (pos > PAGE_SIZE - 128)
-		pos = 0;
-	buf[pos] = '^';
-	pbuf = &buf[pos] + 1;
+	/* indexing is off by one */
+	if (pos)
+		pos++;
 
-	sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]);
-	append_filter_string(filter, buf);
-	free_page((unsigned long) buf);
+	trace_seq_puts(s, filter->filter_string);
+	if (pe->lasterr > 0) {
+		trace_seq_printf(s, "\n%*s", pos, "^");
+		trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]);
+	} else {
+		trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr);
+	}
+	trace_seq_putc(s, 0);
+	buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL);
+	if (buf) {
+		kfree(filter->filter_string);
+		filter->filter_string = buf;
+	}
+	kfree(s);
 }
 
 static inline struct event_filter *event_filter(struct trace_event_file *file)
@@ -747,108 +984,18 @@ void print_subsystem_event_filter(struct event_subsystem *system,
 	mutex_unlock(&event_mutex);
 }
 
-static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
-{
-	stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL);
-	if (!stack->preds)
-		return -ENOMEM;
-	stack->index = n_preds;
-	return 0;
-}
-
-static void __free_pred_stack(struct pred_stack *stack)
-{
-	kfree(stack->preds);
-	stack->index = 0;
-}
-
-static int __push_pred_stack(struct pred_stack *stack,
-			     struct filter_pred *pred)
-{
-	int index = stack->index;
-
-	if (WARN_ON(index == 0))
-		return -ENOSPC;
-
-	stack->preds[--index] = pred;
-	stack->index = index;
-	return 0;
-}
-
-static struct filter_pred *
-__pop_pred_stack(struct pred_stack *stack)
-{
-	struct filter_pred *pred;
-	int index = stack->index;
-
-	pred = stack->preds[index++];
-	if (!pred)
-		return NULL;
-
-	stack->index = index;
-	return pred;
-}
-
-static int filter_set_pred(struct event_filter *filter,
-			   int idx,
-			   struct pred_stack *stack,
-			   struct filter_pred *src)
-{
-	struct filter_pred *dest = &filter->preds[idx];
-	struct filter_pred *left;
-	struct filter_pred *right;
-
-	*dest = *src;
-	dest->index = idx;
-
-	if (dest->op == OP_OR || dest->op == OP_AND) {
-		right = __pop_pred_stack(stack);
-		left = __pop_pred_stack(stack);
-		if (!left || !right)
-			return -EINVAL;
-		/*
-		 * If both children can be folded
-		 * and they are the same op as this op or a leaf,
-		 * then this op can be folded.
-		 */
-		if (left->index & FILTER_PRED_FOLD &&
-		    ((left->op == dest->op && !left->not) ||
-		     left->left == FILTER_PRED_INVALID) &&
-		    right->index & FILTER_PRED_FOLD &&
-		    ((right->op == dest->op && !right->not) ||
-		     right->left == FILTER_PRED_INVALID))
-			dest->index |= FILTER_PRED_FOLD;
-
-		dest->left = left->index & ~FILTER_PRED_FOLD;
-		dest->right = right->index & ~FILTER_PRED_FOLD;
-		left->parent = dest->index & ~FILTER_PRED_FOLD;
-		right->parent = dest->index | FILTER_PRED_IS_RIGHT;
-	} else {
-		/*
-		 * Make dest->left invalid to be used as a quick
-		 * way to know this is a leaf node.
-		 */
-		dest->left = FILTER_PRED_INVALID;
-
-		/* All leafs allow folding the parent ops. */
-		dest->index |= FILTER_PRED_FOLD;
-	}
-
-	return __push_pred_stack(stack, dest);
-}
-
-static void __free_preds(struct event_filter *filter)
+static void free_prog(struct event_filter *filter)
 {
+	struct prog_entry *prog;
 	int i;
 
-	if (filter->preds) {
-		for (i = 0; i < filter->n_preds; i++)
-			kfree(filter->preds[i].ops);
-		kfree(filter->preds);
-		filter->preds = NULL;
-	}
-	filter->a_preds = 0;
-	filter->n_preds = 0;
+	prog = rcu_access_pointer(filter->prog);
+	if (!prog)
+		return;
+
+	for (i = 0; prog[i].pred; i++)
+		kfree(prog[i].pred);
+	kfree(prog);
 }
 
 static void filter_disable(struct trace_event_file *file)
@@ -866,7 +1013,7 @@ static void __free_filter(struct event_filter *filter)
 	if (!filter)
 		return;
 
-	__free_preds(filter);
+	free_prog(filter);
 	kfree(filter->filter_string);
 	kfree(filter);
 }
@@ -876,38 +1023,6 @@ void free_event_filter(struct event_filter *filter)
 	__free_filter(filter);
 }
 
-static struct event_filter *__alloc_filter(void)
-{
-	struct event_filter *filter;
-
-	filter = kzalloc(sizeof(*filter), GFP_KERNEL);
-	return filter;
-}
-
-static int __alloc_preds(struct event_filter *filter, int n_preds)
-{
-	struct filter_pred *pred;
-	int i;
-
-	if (filter->preds)
-		__free_preds(filter);
-
-	filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL);
-
-	if (!filter->preds)
-		return -ENOMEM;
-
-	filter->a_preds = n_preds;
-	filter->n_preds = 0;
-
-	for (i = 0; i < n_preds; i++) {
-		pred = &filter->preds[i];
-		pred->fn = filter_pred_none;
-	}
-
-	return 0;
-}
-
 static inline void __remove_filter(struct trace_event_file *file)
 {
 	filter_disable(file);
@@ -944,27 +1059,6 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
 	}
 }
 
-static int filter_add_pred(struct filter_parse_state *ps,
-			   struct event_filter *filter,
-			   struct filter_pred *pred,
-			   struct pred_stack *stack)
-{
-	int err;
-
-	if (WARN_ON(filter->n_preds == filter->a_preds)) {
-		parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
-		return -ENOSPC;
-	}
-
-	err = filter_set_pred(filter, filter->n_preds, stack, pred);
-	if (err)
-		return err;
-
-	filter->n_preds++;
-
-	return 0;
-}
-
 int filter_assign_type(const char *type)
 {
 	if (strstr(type, "__data_loc") && strstr(type, "char"))
@@ -976,761 +1070,449 @@ int filter_assign_type(const char *type)
 	return FILTER_OTHER;
 }
 
-static bool is_legal_op(struct ftrace_event_field *field, enum filter_op_ids op)
-{
-	if (is_string_field(field) &&
-	    (op != OP_EQ && op != OP_NE && op != OP_GLOB))
-		return false;
-	if (!is_string_field(field) && op == OP_GLOB)
-		return false;
-
-	return true;
-}
-
 static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op,
 					    int field_size, int field_is_signed)
 {
 	filter_pred_fn_t fn = NULL;
+	int pred_func_index = -1;
+
+	switch (op) {
+	case OP_EQ:
+	case OP_NE:
+		break;
+	default:
+		if (WARN_ON_ONCE(op < PRED_FUNC_START))
+			return NULL;
+		pred_func_index = op - PRED_FUNC_START;
+		if (WARN_ON_ONCE(pred_func_index > PRED_FUNC_MAX))
+			return NULL;
+	}
 
 	switch (field_size) {
 	case 8:
-		if (op == OP_EQ || op == OP_NE)
+		if (pred_func_index < 0)
 			fn = filter_pred_64;
 		else if (field_is_signed)
-			fn = pred_funcs_s64[op - PRED_FUNC_START];
+			fn = pred_funcs_s64[pred_func_index];
 		else
-			fn = pred_funcs_u64[op - PRED_FUNC_START];
+			fn = pred_funcs_u64[pred_func_index];
 		break;
 	case 4:
-		if (op == OP_EQ || op == OP_NE)
+		if (pred_func_index < 0)
 			fn = filter_pred_32;
 		else if (field_is_signed)
-			fn = pred_funcs_s32[op - PRED_FUNC_START];
+			fn = pred_funcs_s32[pred_func_index];
 		else
-			fn = pred_funcs_u32[op - PRED_FUNC_START];
+			fn = pred_funcs_u32[pred_func_index];
 		break;
 	case 2:
-		if (op == OP_EQ || op == OP_NE)
+		if (pred_func_index < 0)
 			fn = filter_pred_16;
 		else if (field_is_signed)
-			fn = pred_funcs_s16[op - PRED_FUNC_START];
+			fn = pred_funcs_s16[pred_func_index];
 		else
-			fn = pred_funcs_u16[op - PRED_FUNC_START];
+			fn = pred_funcs_u16[pred_func_index];
 		break;
 	case 1:
-		if (op == OP_EQ || op == OP_NE)
+		if (pred_func_index < 0)
 			fn = filter_pred_8;
 		else if (field_is_signed)
-			fn = pred_funcs_s8[op - PRED_FUNC_START];
+			fn = pred_funcs_s8[pred_func_index];
 		else
-			fn = pred_funcs_u8[op - PRED_FUNC_START];
+			fn = pred_funcs_u8[pred_func_index];
 		break;
 	}
 
 	return fn;
 }
 
-static int init_pred(struct filter_parse_state *ps,
-		     struct ftrace_event_field *field,
-		     struct filter_pred *pred)
-
+/* Called when a predicate is encountered by predicate_parse() */
+static int parse_pred(const char *str, void *data,
+		      int pos, struct filter_parse_error *pe,
+		      struct filter_pred **pred_ptr)
 {
-	filter_pred_fn_t fn = filter_pred_none;
-	unsigned long long val;
+	struct trace_event_call *call = data;
+	struct ftrace_event_field *field;
+	struct filter_pred *pred = NULL;
+	char num_buf[24];	/* Big enough to hold an address */
+	char *field_name;
+	char q;
+	u64 val;
+	int len;
 	int ret;
+	int op;
+	int s;
+	int i = 0;
 
-	pred->offset = field->offset;
-
-	if (!is_legal_op(field, pred->op)) {
-		parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0);
-		return -EINVAL;
-	}
-
-	if (field->filter_type == FILTER_COMM) {
-		filter_build_regex(pred);
-		fn = filter_pred_comm;
-		pred->regex.field_len = TASK_COMM_LEN;
-	} else if (is_string_field(field)) {
-		filter_build_regex(pred);
-
-		if (field->filter_type == FILTER_STATIC_STRING) {
-			fn = filter_pred_string;
-			pred->regex.field_len = field->size;
-		} else if (field->filter_type == FILTER_DYN_STRING)
-			fn = filter_pred_strloc;
-		else
-			fn = filter_pred_pchar;
-	} else if (is_function_field(field)) {
-		if (strcmp(field->name, "ip")) {
-			parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0);
-			return -EINVAL;
-		}
-	} else {
-		if (field->is_signed)
-			ret = kstrtoll(pred->regex.pattern, 0, &val);
-		else
-			ret = kstrtoull(pred->regex.pattern, 0, &val);
-		if (ret) {
-			parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
-			return -EINVAL;
-		}
-		pred->val = val;
+	/* First find the field to associate to */
+	while (isspace(str[i]))
+		i++;
+	s = i;
 
-		if (field->filter_type == FILTER_CPU)
-			fn = filter_pred_cpu;
-		else
-			fn = select_comparison_fn(pred->op, field->size,
-					  field->is_signed);
-		if (!fn) {
-			parse_error(ps, FILT_ERR_INVALID_OP, 0);
-			return -EINVAL;
-		}
-	}
+	while (isalnum(str[i]) || str[i] == '_')
+		i++;
 
-	if (pred->op == OP_NE)
-		pred->not ^= 1;
-
-	pred->fn = fn;
-	return 0;
-}
-
-static void parse_init(struct filter_parse_state *ps,
-		       struct filter_op *ops,
-		       char *infix_string)
-{
-	memset(ps, '\0', sizeof(*ps));
+	len = i - s;
 
-	ps->infix.string = infix_string;
-	ps->infix.cnt = strlen(infix_string);
-	ps->ops = ops;
+	if (!len)
+		return -1;
 
-	INIT_LIST_HEAD(&ps->opstack);
-	INIT_LIST_HEAD(&ps->postfix);
-}
-
-static char infix_next(struct filter_parse_state *ps)
-{
-	if (!ps->infix.cnt)
-		return 0;
-
-	ps->infix.cnt--;
-
-	return ps->infix.string[ps->infix.tail++];
-}
-
-static char infix_peek(struct filter_parse_state *ps)
-{
-	if (ps->infix.tail == strlen(ps->infix.string))
-		return 0;
-
-	return ps->infix.string[ps->infix.tail];
-}
-
-static void infix_advance(struct filter_parse_state *ps)
-{
-	if (!ps->infix.cnt)
-		return;
-
-	ps->infix.cnt--;
-	ps->infix.tail++;
-}
-
-static inline int is_precedence_lower(struct filter_parse_state *ps,
-				      int a, int b)
-{
-	return ps->ops[a].precedence < ps->ops[b].precedence;
-}
+	field_name = kmemdup_nul(str + s, len, GFP_KERNEL);
+	if (!field_name)
+		return -ENOMEM;
 
-static inline int is_op_char(struct filter_parse_state *ps, char c)
-{
-	int i;
+	/* Make sure that the field exists */
 
-	for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
-		if (ps->ops[i].string[0] == c)
-			return 1;
+	field = trace_find_event_field(call, field_name);
+	kfree(field_name);
+	if (!field) {
+		parse_error(pe, FILT_ERR_FIELD_NOT_FOUND, pos + i);
+		return -EINVAL;
 	}
 
-	return 0;
-}
+	while (isspace(str[i]))
+		i++;
 
-static int infix_get_op(struct filter_parse_state *ps, char firstc)
-{
-	char nextc = infix_peek(ps);
-	char opstr[3];
-	int i;
-
-	opstr[0] = firstc;
-	opstr[1] = nextc;
-	opstr[2] = '\0';
-
-	for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
-		if (!strcmp(opstr, ps->ops[i].string)) {
-			infix_advance(ps);
-			return ps->ops[i].id;
-		}
+	/* Make sure this op is supported */
+	for (op = 0; ops[op]; op++) {
+		/* This is why '<=' must come before '<' in ops[] */
+		if (strncmp(str + i, ops[op], strlen(ops[op])) == 0)
+			break;
 	}
 
-	opstr[1] = '\0';
-
-	for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
-		if (!strcmp(opstr, ps->ops[i].string))
-			return ps->ops[i].id;
+	if (!ops[op]) {
+		parse_error(pe, FILT_ERR_INVALID_OP, pos + i);
+		goto err_free;
 	}
 
-	return OP_NONE;
-}
-
-static inline void clear_operand_string(struct filter_parse_state *ps)
-{
-	memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL);
-	ps->operand.tail = 0;
-}
+	i += strlen(ops[op]);
 
-static inline int append_operand_char(struct filter_parse_state *ps, char c)
-{
-	if (ps->operand.tail == MAX_FILTER_STR_VAL - 1)
-		return -EINVAL;
-
-	ps->operand.string[ps->operand.tail++] = c;
-
-	return 0;
-}
+	while (isspace(str[i]))
+		i++;
 
-static int filter_opstack_push(struct filter_parse_state *ps,
-			       enum filter_op_ids op)
-{
-	struct opstack_op *opstack_op;
+	s = i;
 
-	opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL);
-	if (!opstack_op)
+	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+	if (!pred)
 		return -ENOMEM;
 
-	opstack_op->op = op;
-	list_add(&opstack_op->list, &ps->opstack);
-
-	return 0;
-}
+	pred->field = field;
+	pred->offset = field->offset;
+	pred->op = op;
 
-static int filter_opstack_empty(struct filter_parse_state *ps)
-{
-	return list_empty(&ps->opstack);
-}
+	if (ftrace_event_is_function(call)) {
+		/*
+		 * Perf does things different with function events.
+		 * It only allows an "ip" field, and expects a string.
+		 * But the string does not need to be surrounded by quotes.
+		 * If it is a string, the assigned function as a nop,
+		 * (perf doesn't use it) and grab everything.
+		 */
+		if (strcmp(field->name, "ip") != 0) {
+			 parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i);
+			 goto err_free;
+		 }
+		 pred->fn = filter_pred_none;
+
+		 /*
+		  * Quotes are not required, but if they exist then we need
+		  * to read them till we hit a matching one.
+		  */
+		 if (str[i] == '\'' || str[i] == '"')
+			 q = str[i];
+		 else
+			 q = 0;
+
+		 for (i++; str[i]; i++) {
+			 if (q && str[i] == q)
+				 break;
+			 if (!q && (str[i] == ')' || str[i] == '&' ||
+				    str[i] == '|'))
+				 break;
+		 }
+		 /* Skip quotes */
+		 if (q)
+			 s++;
+		len = i - s;
+		if (len >= MAX_FILTER_STR_VAL) {
+			parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i);
+			goto err_free;
+		}
 
-static int filter_opstack_top(struct filter_parse_state *ps)
-{
-	struct opstack_op *opstack_op;
+		pred->regex.len = len;
+		strncpy(pred->regex.pattern, str + s, len);
+		pred->regex.pattern[len] = 0;
+
+	/* This is either a string, or an integer */
+	} else if (str[i] == '\'' || str[i] == '"') {
+		char q = str[i];
+
+		/* Make sure the op is OK for strings */
+		switch (op) {
+		case OP_NE:
+			pred->not = 1;
+			/* Fall through */
+		case OP_GLOB:
+		case OP_EQ:
+			break;
+		default:
+			parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i);
+			goto err_free;
+		}
 
-	if (filter_opstack_empty(ps))
-		return OP_NONE;
+		/* Make sure the field is OK for strings */
+		if (!is_string_field(field)) {
+			parse_error(pe, FILT_ERR_EXPECT_DIGIT, pos + i);
+			goto err_free;
+		}
 
-	opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
+		for (i++; str[i]; i++) {
+			if (str[i] == q)
+				break;
+		}
+		if (!str[i]) {
+			parse_error(pe, FILT_ERR_MISSING_QUOTE, pos + i);
+			goto err_free;
+		}
 
-	return opstack_op->op;
-}
+		/* Skip quotes */
+		s++;
+		len = i - s;
+		if (len >= MAX_FILTER_STR_VAL) {
+			parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i);
+			goto err_free;
+		}
 
-static int filter_opstack_pop(struct filter_parse_state *ps)
-{
-	struct opstack_op *opstack_op;
-	enum filter_op_ids op;
+		pred->regex.len = len;
+		strncpy(pred->regex.pattern, str + s, len);
+		pred->regex.pattern[len] = 0;
 
-	if (filter_opstack_empty(ps))
-		return OP_NONE;
+		filter_build_regex(pred);
 
-	opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
-	op = opstack_op->op;
-	list_del(&opstack_op->list);
+		if (field->filter_type == FILTER_COMM) {
+			pred->fn = filter_pred_comm;
 
-	kfree(opstack_op);
+		} else if (field->filter_type == FILTER_STATIC_STRING) {
+			pred->fn = filter_pred_string;
+			pred->regex.field_len = field->size;
 
-	return op;
-}
+		} else if (field->filter_type == FILTER_DYN_STRING)
+			pred->fn = filter_pred_strloc;
+		else
+			pred->fn = filter_pred_pchar;
+		/* go past the last quote */
+		i++;
 
-static void filter_opstack_clear(struct filter_parse_state *ps)
-{
-	while (!filter_opstack_empty(ps))
-		filter_opstack_pop(ps);
-}
+	} else if (isdigit(str[i])) {
 
-static char *curr_operand(struct filter_parse_state *ps)
-{
-	return ps->operand.string;
-}
+		/* Make sure the field is not a string */
+		if (is_string_field(field)) {
+			parse_error(pe, FILT_ERR_EXPECT_STRING, pos + i);
+			goto err_free;
+		}
 
-static int postfix_append_operand(struct filter_parse_state *ps, char *operand)
-{
-	struct postfix_elt *elt;
+		if (op == OP_GLOB) {
+			parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i);
+			goto err_free;
+		}
 
-	elt = kmalloc(sizeof(*elt), GFP_KERNEL);
-	if (!elt)
-		return -ENOMEM;
+		/* We allow 0xDEADBEEF */
+		while (isalnum(str[i]))
+			i++;
 
-	elt->op = OP_NONE;
-	elt->operand = kstrdup(operand, GFP_KERNEL);
-	if (!elt->operand) {
-		kfree(elt);
-		return -ENOMEM;
-	}
+		len = i - s;
+		/* 0xfeedfacedeadbeef is 18 chars max */
+		if (len >= sizeof(num_buf)) {
+			parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i);
+			goto err_free;
+		}
 
-	list_add_tail(&elt->list, &ps->postfix);
+		strncpy(num_buf, str + s, len);
+		num_buf[len] = 0;
 
-	return 0;
-}
+		/* Make sure it is a value */
+		if (field->is_signed)
+			ret = kstrtoll(num_buf, 0, &val);
+		else
+			ret = kstrtoull(num_buf, 0, &val);
+		if (ret) {
+			parse_error(pe, FILT_ERR_ILLEGAL_INTVAL, pos + s);
+			goto err_free;
+		}
 
-static int postfix_append_op(struct filter_parse_state *ps, enum filter_op_ids op)
-{
-	struct postfix_elt *elt;
+		pred->val = val;
 
-	elt = kmalloc(sizeof(*elt), GFP_KERNEL);
-	if (!elt)
-		return -ENOMEM;
+		if (field->filter_type == FILTER_CPU)
+			pred->fn = filter_pred_cpu;
+		else {
+			pred->fn = select_comparison_fn(pred->op, field->size,
+							field->is_signed);
+			if (pred->op == OP_NE)
+				pred->not = 1;
+		}
 
-	elt->op = op;
-	elt->operand = NULL;
+	} else {
+		parse_error(pe, FILT_ERR_INVALID_VALUE, pos + i);
+		goto err_free;
+	}
 
-	list_add_tail(&elt->list, &ps->postfix);
+	*pred_ptr = pred;
+	return i;
 
-	return 0;
+err_free:
+	kfree(pred);
+	return -EINVAL;
 }
 
-static void postfix_clear(struct filter_parse_state *ps)
-{
-	struct postfix_elt *elt;
+enum {
+	TOO_MANY_CLOSE		= -1,
+	TOO_MANY_OPEN		= -2,
+	MISSING_QUOTE		= -3,
+};
 
-	while (!list_empty(&ps->postfix)) {
-		elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
-		list_del(&elt->list);
-		kfree(elt->operand);
-		kfree(elt);
-	}
-}
+/*
+ * Read the filter string once to calculate the number of predicates
+ * as well as how deep the parentheses go.
+ *
+ * Returns:
+ *   0 - everything is fine (err is undefined)
+ *  -1 - too many ')'
+ *  -2 - too many '('
+ *  -3 - No matching quote
+ */
+static int calc_stack(const char *str, int *parens, int *preds, int *err)
+{
+	bool is_pred = false;
+	int nr_preds = 0;
+	int open = 1; /* Count the expression as "(E)" */
+	int last_quote = 0;
+	int max_open = 1;
+	int quote = 0;
+	int i;
 
-static int filter_parse(struct filter_parse_state *ps)
-{
-	enum filter_op_ids op, top_op;
-	int in_string = 0;
-	char ch;
+	*err = 0;
 
-	while ((ch = infix_next(ps))) {
-		if (ch == '"') {
-			in_string ^= 1;
+	for (i = 0; str[i]; i++) {
+		if (isspace(str[i]))
 			continue;
-		}
-
-		if (in_string)
-			goto parse_operand;
-
-		if (isspace(ch))
+		if (quote) {
+			if (str[i] == quote)
+			       quote = 0;
 			continue;
+		}
 
-		if (is_op_char(ps, ch)) {
-			op = infix_get_op(ps, ch);
-			if (op == OP_NONE) {
-				parse_error(ps, FILT_ERR_INVALID_OP, 0);
-				return -EINVAL;
-			}
-
-			if (strlen(curr_operand(ps))) {
-				postfix_append_operand(ps, curr_operand(ps));
-				clear_operand_string(ps);
-			}
-
-			while (!filter_opstack_empty(ps)) {
-				top_op = filter_opstack_top(ps);
-				if (!is_precedence_lower(ps, top_op, op)) {
-					top_op = filter_opstack_pop(ps);
-					postfix_append_op(ps, top_op);
-					continue;
-				}
+		switch (str[i]) {
+		case '\'':
+		case '"':
+			quote = str[i];
+			last_quote = i;
+			break;
+		case '|':
+		case '&':
+			if (str[i+1] != str[i])
 				break;
-			}
-
-			filter_opstack_push(ps, op);
+			is_pred = false;
 			continue;
-		}
-
-		if (ch == '(') {
-			filter_opstack_push(ps, OP_OPEN_PAREN);
+		case '(':
+			is_pred = false;
+			open++;
+			if (open > max_open)
+				max_open = open;
 			continue;
-		}
-
-		if (ch == ')') {
-			if (strlen(curr_operand(ps))) {
-				postfix_append_operand(ps, curr_operand(ps));
-				clear_operand_string(ps);
-			}
-
-			top_op = filter_opstack_pop(ps);
-			while (top_op != OP_NONE) {
-				if (top_op == OP_OPEN_PAREN)
-					break;
-				postfix_append_op(ps, top_op);
-				top_op = filter_opstack_pop(ps);
-			}
-			if (top_op == OP_NONE) {
-				parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
-				return -EINVAL;
+		case ')':
+			is_pred = false;
+			if (open == 1) {
+				*err = i;
+				return TOO_MANY_CLOSE;
 			}
+			open--;
 			continue;
 		}
-parse_operand:
-		if (append_operand_char(ps, ch)) {
-			parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0);
-			return -EINVAL;
-		}
-	}
-
-	if (strlen(curr_operand(ps)))
-		postfix_append_operand(ps, curr_operand(ps));
-
-	while (!filter_opstack_empty(ps)) {
-		top_op = filter_opstack_pop(ps);
-		if (top_op == OP_NONE)
-			break;
-		if (top_op == OP_OPEN_PAREN) {
-			parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
-			return -EINVAL;
+		if (!is_pred) {
+			nr_preds++;
+			is_pred = true;
 		}
-		postfix_append_op(ps, top_op);
-	}
-
-	return 0;
-}
-
-static struct filter_pred *create_pred(struct filter_parse_state *ps,
-				       struct trace_event_call *call,
-				       enum filter_op_ids op,
-				       char *operand1, char *operand2)
-{
-	struct ftrace_event_field *field;
-	static struct filter_pred pred;
-
-	memset(&pred, 0, sizeof(pred));
-	pred.op = op;
-
-	if (op == OP_AND || op == OP_OR)
-		return &pred;
-
-	if (!operand1 || !operand2) {
-		parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
-		return NULL;
 	}
 
-	field = trace_find_event_field(call, operand1);
-	if (!field) {
-		parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
-		return NULL;
+	if (quote) {
+		*err = last_quote;
+		return MISSING_QUOTE;
 	}
 
-	strcpy(pred.regex.pattern, operand2);
-	pred.regex.len = strlen(pred.regex.pattern);
-	pred.field = field;
-	return init_pred(ps, field, &pred) ? NULL : &pred;
-}
+	if (open != 1) {
+		int level = open;
 
-static int check_preds(struct filter_parse_state *ps)
-{
-	int n_normal_preds = 0, n_logical_preds = 0;
-	struct postfix_elt *elt;
-	int cnt = 0;
-
-	list_for_each_entry(elt, &ps->postfix, list) {
-		if (elt->op == OP_NONE) {
-			cnt++;
-			continue;
-		}
-
-		if (elt->op == OP_AND || elt->op == OP_OR) {
-			n_logical_preds++;
-			cnt--;
-			continue;
+		/* find the bad open */
+		for (i--; i; i--) {
+			if (quote) {
+				if (str[i] == quote)
+					quote = 0;
+				continue;
+			}
+			switch (str[i]) {
+			case '(':
+				if (level == open) {
+					*err = i;
+					return TOO_MANY_OPEN;
+				}
+				level--;
+				break;
+			case ')':
+				level++;
+				break;
+			case '\'':
+			case '"':
+				quote = str[i];
+				break;
+			}
 		}
-		if (elt->op != OP_NOT)
-			cnt--;
-		n_normal_preds++;
-		/* all ops should have operands */
-		if (cnt < 0)
-			break;
-	}
-
-	if (cnt != 1 || !n_normal_preds || n_logical_preds >= n_normal_preds) {
-		parse_error(ps, FILT_ERR_INVALID_FILTER, 0);
-		return -EINVAL;
+		/* First character is the '(' with missing ')' */
+		*err = 0;
+		return TOO_MANY_OPEN;
 	}
 
+	/* Set the size of the required stacks */
+	*parens = max_open;
+	*preds = nr_preds;
 	return 0;
 }
 
-static int count_preds(struct filter_parse_state *ps)
-{
-	struct postfix_elt *elt;
-	int n_preds = 0;
-
-	list_for_each_entry(elt, &ps->postfix, list) {
-		if (elt->op == OP_NONE)
-			continue;
-		n_preds++;
-	}
-
-	return n_preds;
-}
-
-struct check_pred_data {
-	int count;
-	int max;
-};
-
-static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred,
-			      int *err, void *data)
-{
-	struct check_pred_data *d = data;
-
-	if (WARN_ON(d->count++ > d->max)) {
-		*err = -EINVAL;
-		return WALK_PRED_ABORT;
-	}
-	return WALK_PRED_DEFAULT;
-}
-
-/*
- * The tree is walked at filtering of an event. If the tree is not correctly
- * built, it may cause an infinite loop. Check here that the tree does
- * indeed terminate.
- */
-static int check_pred_tree(struct event_filter *filter,
-			   struct filter_pred *root)
-{
-	struct check_pred_data data = {
-		/*
-		 * The max that we can hit a node is three times.
-		 * Once going down, once coming up from left, and
-		 * once coming up from right. This is more than enough
-		 * since leafs are only hit a single time.
-		 */
-		.max   = 3 * filter->n_preds,
-		.count = 0,
-	};
-
-	return walk_pred_tree(filter->preds, root,
-			      check_pred_tree_cb, &data);
-}
-
-static int count_leafs_cb(enum move_type move, struct filter_pred *pred,
-			  int *err, void *data)
-{
-	int *count = data;
-
-	if ((move == MOVE_DOWN) &&
-	    (pred->left == FILTER_PRED_INVALID))
-		(*count)++;
-
-	return WALK_PRED_DEFAULT;
-}
-
-static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
-{
-	int count = 0, ret;
-
-	ret = walk_pred_tree(preds, root, count_leafs_cb, &count);
-	WARN_ON(ret);
-	return count;
-}
-
-struct fold_pred_data {
-	struct filter_pred *root;
-	int count;
-	int children;
-};
-
-static int fold_pred_cb(enum move_type move, struct filter_pred *pred,
-			int *err, void *data)
-{
-	struct fold_pred_data *d = data;
-	struct filter_pred *root = d->root;
-
-	if (move != MOVE_DOWN)
-		return WALK_PRED_DEFAULT;
-	if (pred->left != FILTER_PRED_INVALID)
-		return WALK_PRED_DEFAULT;
-
-	if (WARN_ON(d->count == d->children)) {
-		*err = -EINVAL;
-		return WALK_PRED_ABORT;
-	}
-
-	pred->index &= ~FILTER_PRED_FOLD;
-	root->ops[d->count++] = pred->index;
-	return WALK_PRED_DEFAULT;
-}
-
-static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
-{
-	struct fold_pred_data data = {
-		.root  = root,
-		.count = 0,
-	};
-	int children;
-
-	/* No need to keep the fold flag */
-	root->index &= ~FILTER_PRED_FOLD;
-
-	/* If the root is a leaf then do nothing */
-	if (root->left == FILTER_PRED_INVALID)
-		return 0;
-
-	/* count the children */
-	children = count_leafs(preds, &preds[root->left]);
-	children += count_leafs(preds, &preds[root->right]);
-
-	root->ops = kcalloc(children, sizeof(*root->ops), GFP_KERNEL);
-	if (!root->ops)
-		return -ENOMEM;
-
-	root->val = children;
-	data.children = children;
-	return walk_pred_tree(preds, root, fold_pred_cb, &data);
-}
-
-static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred,
-			     int *err, void *data)
-{
-	struct filter_pred *preds = data;
-
-	if (move != MOVE_DOWN)
-		return WALK_PRED_DEFAULT;
-	if (!(pred->index & FILTER_PRED_FOLD))
-		return WALK_PRED_DEFAULT;
-
-	*err = fold_pred(preds, pred);
-	if (*err)
-		return WALK_PRED_ABORT;
-
-	/* eveyrhing below is folded, continue with parent */
-	return WALK_PRED_PARENT;
-}
-
-/*
- * To optimize the processing of the ops, if we have several "ors" or
- * "ands" together, we can put them in an array and process them all
- * together speeding up the filter logic.
- */
-static int fold_pred_tree(struct event_filter *filter,
-			   struct filter_pred *root)
-{
-	return walk_pred_tree(filter->preds, root, fold_pred_tree_cb,
-			      filter->preds);
-}
-
-static int replace_preds(struct trace_event_call *call,
+static int process_preds(struct trace_event_call *call,
+			 const char *filter_string,
 			 struct event_filter *filter,
-			 struct filter_parse_state *ps,
-			 bool dry_run)
+			 struct filter_parse_error *pe)
 {
-	char *operand1 = NULL, *operand2 = NULL;
-	struct filter_pred *pred;
-	struct filter_pred *root;
-	struct postfix_elt *elt;
-	struct pred_stack stack = { }; /* init to NULL */
-	int err;
-	int n_preds = 0;
-
-	n_preds = count_preds(ps);
-	if (n_preds >= MAX_FILTER_PRED) {
-		parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
-		return -ENOSPC;
-	}
-
-	err = check_preds(ps);
-	if (err)
-		return err;
-
-	if (!dry_run) {
-		err = __alloc_pred_stack(&stack, n_preds);
-		if (err)
-			return err;
-		err = __alloc_preds(filter, n_preds);
-		if (err)
-			goto fail;
-	}
-
-	n_preds = 0;
-	list_for_each_entry(elt, &ps->postfix, list) {
-		if (elt->op == OP_NONE) {
-			if (!operand1)
-				operand1 = elt->operand;
-			else if (!operand2)
-				operand2 = elt->operand;
-			else {
-				parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
-				err = -EINVAL;
-				goto fail;
-			}
-			continue;
-		}
-
-		if (elt->op == OP_NOT) {
-			if (!n_preds || operand1 || operand2) {
-				parse_error(ps, FILT_ERR_ILLEGAL_NOT_OP, 0);
-				err = -EINVAL;
-				goto fail;
-			}
-			if (!dry_run)
-				filter->preds[n_preds - 1].not ^= 1;
-			continue;
-		}
-
-		if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
-			parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
-			err = -ENOSPC;
-			goto fail;
-		}
-
-		pred = create_pred(ps, call, elt->op, operand1, operand2);
-		if (!pred) {
-			err = -EINVAL;
-			goto fail;
-		}
+	struct prog_entry *prog;
+	int nr_parens;
+	int nr_preds;
+	int index;
+	int ret;
 
-		if (!dry_run) {
-			err = filter_add_pred(ps, filter, pred, &stack);
-			if (err)
-				goto fail;
+	ret = calc_stack(filter_string, &nr_parens, &nr_preds, &index);
+	if (ret < 0) {
+		switch (ret) {
+		case MISSING_QUOTE:
+			parse_error(pe, FILT_ERR_MISSING_QUOTE, index);
+			break;
+		case TOO_MANY_OPEN:
+			parse_error(pe, FILT_ERR_TOO_MANY_OPEN, index);
+			break;
+		default:
+			parse_error(pe, FILT_ERR_TOO_MANY_CLOSE, index);
 		}
-
-		operand1 = operand2 = NULL;
+		return ret;
 	}
 
-	if (!dry_run) {
-		/* We should have one item left on the stack */
-		pred = __pop_pred_stack(&stack);
-		if (!pred)
-			return -EINVAL;
-		/* This item is where we start from in matching */
-		root = pred;
-		/* Make sure the stack is empty */
-		pred = __pop_pred_stack(&stack);
-		if (WARN_ON(pred)) {
-			err = -EINVAL;
-			filter->root = NULL;
-			goto fail;
-		}
-		err = check_pred_tree(filter, root);
-		if (err)
-			goto fail;
-
-		/* Optimize the tree */
-		err = fold_pred_tree(filter, root);
-		if (err)
-			goto fail;
-
-		/* We don't set root until we know it works */
-		barrier();
-		filter->root = root;
-	}
+	if (!nr_preds)
+		return -EINVAL;
 
-	err = 0;
-fail:
-	__free_pred_stack(&stack);
-	return err;
+	prog = predicate_parse(filter_string, nr_parens, nr_preds,
+			       parse_pred, call, pe);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	rcu_assign_pointer(filter->prog, prog);
+	return 0;
 }
 
 static inline void event_set_filtered_flag(struct trace_event_file *file)
@@ -1780,72 +1562,53 @@ struct filter_list {
 	struct event_filter	*filter;
 };
 
-static int replace_system_preds(struct trace_subsystem_dir *dir,
+static int process_system_preds(struct trace_subsystem_dir *dir,
 				struct trace_array *tr,
-				struct filter_parse_state *ps,
+				struct filter_parse_error *pe,
 				char *filter_string)
 {
 	struct trace_event_file *file;
 	struct filter_list *filter_item;
+	struct event_filter *filter = NULL;
 	struct filter_list *tmp;
 	LIST_HEAD(filter_list);
 	bool fail = true;
 	int err;
 
 	list_for_each_entry(file, &tr->events, list) {
-		if (file->system != dir)
-			continue;
-
-		/*
-		 * Try to see if the filter can be applied
-		 *  (filter arg is ignored on dry_run)
-		 */
-		err = replace_preds(file->event_call, NULL, ps, true);
-		if (err)
-			event_set_no_set_filter_flag(file);
-		else
-			event_clear_no_set_filter_flag(file);
-	}
-
-	list_for_each_entry(file, &tr->events, list) {
-		struct event_filter *filter;
 
 		if (file->system != dir)
 			continue;
 
-		if (event_no_set_filter_flag(file))
-			continue;
-
-		filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
-		if (!filter_item)
-			goto fail_mem;
-
-		list_add_tail(&filter_item->list, &filter_list);
-
-		filter_item->filter = __alloc_filter();
-		if (!filter_item->filter)
+		filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+		if (!filter)
 			goto fail_mem;
-		filter = filter_item->filter;
 
-		/* Can only fail on no memory */
-		err = replace_filter_string(filter, filter_string);
-		if (err)
+		filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
+		if (!filter->filter_string)
 			goto fail_mem;
 
-		err = replace_preds(file->event_call, filter, ps, false);
+		err = process_preds(file->event_call, filter_string, filter, pe);
 		if (err) {
 			filter_disable(file);
-			parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
-			append_filter_err(ps, filter);
+			parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+			append_filter_err(pe, filter);
 		} else
 			event_set_filtered_flag(file);
+
+
+		filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
+		if (!filter_item)
+			goto fail_mem;
+
+		list_add_tail(&filter_item->list, &filter_list);
 		/*
 		 * Regardless of if this returned an error, we still
 		 * replace the filter for the call.
 		 */
-		filter = event_filter(file);
-		event_set_filter(file, filter_item->filter);
-		filter_item->filter = filter;
+		filter_item->filter = event_filter(file);
+		event_set_filter(file, filter);
+		filter = NULL;
 
 		fail = false;
 	}
@@ -1871,9 +1634,10 @@ static int replace_system_preds(struct trace_subsystem_dir *dir,
 		list_del(&filter_item->list);
 		kfree(filter_item);
 	}
-	parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+	parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
 	return -EINVAL;
  fail_mem:
+	kfree(filter);
 	/* If any call succeeded, we still need to sync */
 	if (!fail)
 		synchronize_sched();
@@ -1885,47 +1649,42 @@ static int replace_system_preds(struct trace_subsystem_dir *dir,
 	return -ENOMEM;
 }
 
-static int create_filter_start(char *filter_str, bool set_str,
-			       struct filter_parse_state **psp,
+static int create_filter_start(char *filter_string, bool set_str,
+			       struct filter_parse_error **pse,
 			       struct event_filter **filterp)
 {
 	struct event_filter *filter;
-	struct filter_parse_state *ps = NULL;
+	struct filter_parse_error *pe = NULL;
 	int err = 0;
 
-	WARN_ON_ONCE(*psp || *filterp);
+	if (WARN_ON_ONCE(*pse || *filterp))
+		return -EINVAL;
 
-	/* allocate everything, and if any fails, free all and fail */
-	filter = __alloc_filter();
-	if (filter && set_str)
-		err = replace_filter_string(filter, filter_str);
+	filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+	if (filter && set_str) {
+		filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
+		if (!filter->filter_string)
+			err = -ENOMEM;
+	}
 
-	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+	pe = kzalloc(sizeof(*pe), GFP_KERNEL);
 
-	if (!filter || !ps || err) {
-		kfree(ps);
+	if (!filter || !pe || err) {
+		kfree(pe);
 		__free_filter(filter);
 		return -ENOMEM;
 	}
 
 	/* we're committed to creating a new filter */
 	*filterp = filter;
-	*psp = ps;
+	*pse = pe;
 
-	parse_init(ps, filter_ops, filter_str);
-	err = filter_parse(ps);
-	if (err && set_str)
-		append_filter_err(ps, filter);
-	return err;
+	return 0;
 }
 
-static void create_filter_finish(struct filter_parse_state *ps)
+static void create_filter_finish(struct filter_parse_error *pe)
 {
-	if (ps) {
-		filter_opstack_clear(ps);
-		postfix_clear(ps);
-		kfree(ps);
-	}
+	kfree(pe);
 }
 
 /**
@@ -1945,26 +1704,20 @@ static void create_filter_finish(struct filter_parse_state *ps)
  * freeing it.
  */
 static int create_filter(struct trace_event_call *call,
-			 char *filter_str, bool set_str,
+			 char *filter_string, bool set_str,
 			 struct event_filter **filterp)
 {
-	struct event_filter *filter = NULL;
-	struct filter_parse_state *ps = NULL;
+	struct filter_parse_error *pe = NULL;
 	int err;
 
-	err = create_filter_start(filter_str, set_str, &ps, &filter);
-	if (!err) {
-		err = replace_preds(call, filter, ps, false);
-		if (err && set_str)
-			append_filter_err(ps, filter);
-	}
-	if (err && !set_str) {
-		free_event_filter(filter);
-		filter = NULL;
-	}
-	create_filter_finish(ps);
+	err = create_filter_start(filter_string, set_str, &pe, filterp);
+	if (err)
+		return err;
+
+	err = process_preds(call, filter_string, *filterp, pe);
+	if (err && set_str)
+		append_filter_err(pe, *filterp);
 
-	*filterp = filter;
 	return err;
 }
 
@@ -1988,24 +1741,22 @@ static int create_system_filter(struct trace_subsystem_dir *dir,
 				struct trace_array *tr,
 				char *filter_str, struct event_filter **filterp)
 {
-	struct event_filter *filter = NULL;
-	struct filter_parse_state *ps = NULL;
+	struct filter_parse_error *pe = NULL;
 	int err;
 
-	err = create_filter_start(filter_str, true, &ps, &filter);
+	err = create_filter_start(filter_str, true, &pe, filterp);
 	if (!err) {
-		err = replace_system_preds(dir, tr, ps, filter_str);
+		err = process_system_preds(dir, tr, pe, filter_str);
 		if (!err) {
 			/* System filters just show a default message */
-			kfree(filter->filter_string);
-			filter->filter_string = NULL;
+			kfree((*filterp)->filter_string);
+			(*filterp)->filter_string = NULL;
 		} else {
-			append_filter_err(ps, filter);
+			append_filter_err(pe, *filterp);
 		}
 	}
-	create_filter_finish(ps);
+	create_filter_finish(pe);
 
-	*filterp = filter;
 	return err;
 }
 
@@ -2013,7 +1764,7 @@ static int create_system_filter(struct trace_subsystem_dir *dir,
 int apply_event_filter(struct trace_event_file *file, char *filter_string)
 {
 	struct trace_event_call *call = file->event_call;
-	struct event_filter *filter;
+	struct event_filter *filter = NULL;
 	int err;
 
 	if (!strcmp(strstrip(filter_string), "0")) {
@@ -2066,7 +1817,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
 {
 	struct event_subsystem *system = dir->subsystem;
 	struct trace_array *tr = dir->tr;
-	struct event_filter *filter;
+	struct event_filter *filter = NULL;
 	int err = 0;
 
 	mutex_lock(&event_mutex);
@@ -2186,66 +1937,80 @@ static int __ftrace_function_set_filter(int filter, char *buf, int len,
 	return ret;
 }
 
-static int ftrace_function_check_pred(struct filter_pred *pred, int leaf)
+static int ftrace_function_check_pred(struct filter_pred *pred)
 {
 	struct ftrace_event_field *field = pred->field;
 
-	if (leaf) {
-		/*
-		 * Check the leaf predicate for function trace, verify:
-		 *  - only '==' and '!=' is used
-		 *  - the 'ip' field is used
-		 */
-		if ((pred->op != OP_EQ) && (pred->op != OP_NE))
-			return -EINVAL;
+	/*
+	 * Check the predicate for function trace, verify:
+	 *  - only '==' and '!=' is used
+	 *  - the 'ip' field is used
+	 */
+	if ((pred->op != OP_EQ) && (pred->op != OP_NE))
+		return -EINVAL;
 
-		if (strcmp(field->name, "ip"))
-			return -EINVAL;
-	} else {
-		/*
-		 * Check the non leaf predicate for function trace, verify:
-		 *  - only '||' is used
-		*/
-		if (pred->op != OP_OR)
-			return -EINVAL;
-	}
+	if (strcmp(field->name, "ip"))
+		return -EINVAL;
 
 	return 0;
 }
 
-static int ftrace_function_set_filter_cb(enum move_type move,
-					 struct filter_pred *pred,
-					 int *err, void *data)
+static int ftrace_function_set_filter_pred(struct filter_pred *pred,
+					   struct function_filter_data *data)
 {
+	int ret;
+
 	/* Checking the node is valid for function trace. */
-	if ((move != MOVE_DOWN) ||
-	    (pred->left != FILTER_PRED_INVALID)) {
-		*err = ftrace_function_check_pred(pred, 0);
-	} else {
-		*err = ftrace_function_check_pred(pred, 1);
-		if (*err)
-			return WALK_PRED_ABORT;
-
-		*err = __ftrace_function_set_filter(pred->op == OP_EQ,
-						    pred->regex.pattern,
-						    pred->regex.len,
-						    data);
-	}
+	ret = ftrace_function_check_pred(pred);
+	if (ret)
+		return ret;
 
-	return (*err) ? WALK_PRED_ABORT : WALK_PRED_DEFAULT;
+	return __ftrace_function_set_filter(pred->op == OP_EQ,
+					    pred->regex.pattern,
+					    pred->regex.len,
+					    data);
+}
+
+static bool is_or(struct prog_entry *prog, int i)
+{
+	int target;
+
+	/*
+	 * Only "||" is allowed for function events, thus,
+	 * all true branches should jump to true, and any
+	 * false branch should jump to false.
+	 */
+	target = prog[i].target + 1;
+	/* True and false have NULL preds (all prog entries should jump to one */
+	if (prog[target].pred)
+		return false;
+
+	/* prog[target].target is 1 for TRUE, 0 for FALSE */
+	return prog[i].when_to_branch == prog[target].target;
 }
 
 static int ftrace_function_set_filter(struct perf_event *event,
 				      struct event_filter *filter)
 {
+	struct prog_entry *prog = rcu_dereference_protected(filter->prog,
+						lockdep_is_held(&event_mutex));
 	struct function_filter_data data = {
 		.first_filter  = 1,
 		.first_notrace = 1,
 		.ops           = &event->ftrace_ops,
 	};
+	int i;
+
+	for (i = 0; prog[i].pred; i++) {
+		struct filter_pred *pred = prog[i].pred;
 
-	return walk_pred_tree(filter->preds, filter->root,
-			      ftrace_function_set_filter_cb, &data);
+		if (!is_or(prog, i))
+			return -EINVAL;
+
+		if (ftrace_function_set_filter_pred(pred, &data) < 0)
+			return -EINVAL;
+	}
+	return 0;
 }
 #else
 static int ftrace_function_set_filter(struct perf_event *event,
@@ -2259,7 +2024,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
 			      char *filter_str)
 {
 	int err;
-	struct event_filter *filter;
+	struct event_filter *filter = NULL;
 	struct trace_event_call *call;
 
 	mutex_lock(&event_mutex);
@@ -2375,7 +2140,7 @@ static struct test_filter_data_t {
 #undef YES
 #undef NO
 
-#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t))
+#define DATA_CNT ARRAY_SIZE(test_filter_data)
 
 static int test_pred_visited;
 
@@ -2388,26 +2153,28 @@ static int test_pred_visited_fn(struct filter_pred *pred, void *event)
 	return 1;
 }
 
-static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred,
-			     int *err, void *data)
+static void update_pred_fn(struct event_filter *filter, char *fields)
 {
-	char *fields = data;
+	struct prog_entry *prog = rcu_dereference_protected(filter->prog,
+						lockdep_is_held(&event_mutex));
+	int i;
 
-	if ((move == MOVE_DOWN) &&
-	    (pred->left == FILTER_PRED_INVALID)) {
+	for (i = 0; prog[i].pred; i++) {
+		struct filter_pred *pred = prog[i].pred;
 		struct ftrace_event_field *field = pred->field;
 
+		WARN_ON_ONCE(!pred->fn);
+
 		if (!field) {
-			WARN(1, "all leafs should have field defined");
-			return WALK_PRED_DEFAULT;
+			WARN_ONCE(1, "all leafs should have field defined %d", i);
+			continue;
 		}
+
 		if (!strchr(fields, *field->name))
-			return WALK_PRED_DEFAULT;
+			continue;
 
-		WARN_ON(!pred->fn);
 		pred->fn = test_pred_visited_fn;
 	}
-	return WALK_PRED_DEFAULT;
 }
 
 static __init int ftrace_test_event_filter(void)
@@ -2431,20 +2198,22 @@ static __init int ftrace_test_event_filter(void)
 			break;
 		}
 
+		/* Needed to dereference filter->prog */
+		mutex_lock(&event_mutex);
 		/*
 		 * The preemption disabling is not really needed for self
 		 * tests, but the rcu dereference will complain without it.
 		 */
 		preempt_disable();
 		if (*d->not_visited)
-			walk_pred_tree(filter->preds, filter->root,
-				       test_walk_pred_cb,
-				       d->not_visited);
+			update_pred_fn(filter, d->not_visited);
 
 		test_pred_visited = 0;
 		err = filter_match_preds(filter, &d->rec);
 		preempt_enable();
 
+		mutex_unlock(&event_mutex);
+
 		__free_filter(filter);
 
 		if (test_pred_visited) {
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 1e1558c99d56..046c716a6536 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -20,15 +20,39 @@
 #include <linux/slab.h>
 #include <linux/stacktrace.h>
 #include <linux/rculist.h>
+#include <linux/tracefs.h>
 
 #include "tracing_map.h"
 #include "trace.h"
 
+#define SYNTH_SYSTEM		"synthetic"
+#define SYNTH_FIELDS_MAX	16
+
+#define STR_VAR_LEN_MAX		32 /* must be multiple of sizeof(u64) */
+
 struct hist_field;
 
-typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event);
+typedef u64 (*hist_field_fn_t) (struct hist_field *field,
+				struct tracing_map_elt *elt,
+				struct ring_buffer_event *rbe,
+				void *event);
 
 #define HIST_FIELD_OPERANDS_MAX	2
+#define HIST_FIELDS_MAX		(TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX)
+#define HIST_ACTIONS_MAX	8
+
+enum field_op_id {
+	FIELD_OP_NONE,
+	FIELD_OP_PLUS,
+	FIELD_OP_MINUS,
+	FIELD_OP_UNARY_MINUS,
+};
+
+struct hist_var {
+	char				*name;
+	struct hist_trigger_data	*hist_data;
+	unsigned int			idx;
+};
 
 struct hist_field {
 	struct ftrace_event_field	*field;
@@ -37,27 +61,49 @@ struct hist_field {
 	unsigned int			size;
 	unsigned int			offset;
 	unsigned int                    is_signed;
+	const char			*type;
 	struct hist_field		*operands[HIST_FIELD_OPERANDS_MAX];
+	struct hist_trigger_data	*hist_data;
+	struct hist_var			var;
+	enum field_op_id		operator;
+	char				*system;
+	char				*event_name;
+	char				*name;
+	unsigned int			var_idx;
+	unsigned int			var_ref_idx;
+	bool                            read_once;
 };
 
-static u64 hist_field_none(struct hist_field *field, void *event)
+static u64 hist_field_none(struct hist_field *field,
+			   struct tracing_map_elt *elt,
+			   struct ring_buffer_event *rbe,
+			   void *event)
 {
 	return 0;
 }
 
-static u64 hist_field_counter(struct hist_field *field, void *event)
+static u64 hist_field_counter(struct hist_field *field,
+			      struct tracing_map_elt *elt,
+			      struct ring_buffer_event *rbe,
+			      void *event)
 {
 	return 1;
 }
 
-static u64 hist_field_string(struct hist_field *hist_field, void *event)
+static u64 hist_field_string(struct hist_field *hist_field,
+			     struct tracing_map_elt *elt,
+			     struct ring_buffer_event *rbe,
+			     void *event)
 {
 	char *addr = (char *)(event + hist_field->field->offset);
 
 	return (u64)(unsigned long)addr;
 }
 
-static u64 hist_field_dynstring(struct hist_field *hist_field, void *event)
+static u64 hist_field_dynstring(struct hist_field *hist_field,
+				struct tracing_map_elt *elt,
+				struct ring_buffer_event *rbe,
+				void *event)
 {
 	u32 str_item = *(u32 *)(event + hist_field->field->offset);
 	int str_loc = str_item & 0xffff;
@@ -66,24 +112,74 @@ static u64 hist_field_dynstring(struct hist_field *hist_field, void *event)
 	return (u64)(unsigned long)addr;
 }
 
-static u64 hist_field_pstring(struct hist_field *hist_field, void *event)
+static u64 hist_field_pstring(struct hist_field *hist_field,
+			      struct tracing_map_elt *elt,
+			      struct ring_buffer_event *rbe,
+			      void *event)
 {
 	char **addr = (char **)(event + hist_field->field->offset);
 
 	return (u64)(unsigned long)*addr;
 }
 
-static u64 hist_field_log2(struct hist_field *hist_field, void *event)
+static u64 hist_field_log2(struct hist_field *hist_field,
+			   struct tracing_map_elt *elt,
+			   struct ring_buffer_event *rbe,
+			   void *event)
 {
 	struct hist_field *operand = hist_field->operands[0];
 
-	u64 val = operand->fn(operand, event);
+	u64 val = operand->fn(operand, elt, rbe, event);
 
 	return (u64) ilog2(roundup_pow_of_two(val));
 }
 
+static u64 hist_field_plus(struct hist_field *hist_field,
+			   struct tracing_map_elt *elt,
+			   struct ring_buffer_event *rbe,
+			   void *event)
+{
+	struct hist_field *operand1 = hist_field->operands[0];
+	struct hist_field *operand2 = hist_field->operands[1];
+
+	u64 val1 = operand1->fn(operand1, elt, rbe, event);
+	u64 val2 = operand2->fn(operand2, elt, rbe, event);
+
+	return val1 + val2;
+}
+
+static u64 hist_field_minus(struct hist_field *hist_field,
+			    struct tracing_map_elt *elt,
+			    struct ring_buffer_event *rbe,
+			    void *event)
+{
+	struct hist_field *operand1 = hist_field->operands[0];
+	struct hist_field *operand2 = hist_field->operands[1];
+
+	u64 val1 = operand1->fn(operand1, elt, rbe, event);
+	u64 val2 = operand2->fn(operand2, elt, rbe, event);
+
+	return val1 - val2;
+}
+
+static u64 hist_field_unary_minus(struct hist_field *hist_field,
+				  struct tracing_map_elt *elt,
+				  struct ring_buffer_event *rbe,
+				  void *event)
+{
+	struct hist_field *operand = hist_field->operands[0];
+
+	s64 sval = (s64)operand->fn(operand, elt, rbe, event);
+	u64 val = (u64)-sval;
+
+	return val;
+}
+
 #define DEFINE_HIST_FIELD_FN(type)					\
-static u64 hist_field_##type(struct hist_field *hist_field, void *event)\
+	static u64 hist_field_##type(struct hist_field *hist_field,	\
+				     struct tracing_map_elt *elt,	\
+				     struct ring_buffer_event *rbe,	\
+				     void *event)			\
 {									\
 	type *addr = (type *)(event + hist_field->field->offset);	\
 									\
@@ -126,6 +222,19 @@ enum hist_field_flags {
 	HIST_FIELD_FL_SYSCALL		= 1 << 7,
 	HIST_FIELD_FL_STACKTRACE	= 1 << 8,
 	HIST_FIELD_FL_LOG2		= 1 << 9,
+	HIST_FIELD_FL_TIMESTAMP		= 1 << 10,
+	HIST_FIELD_FL_TIMESTAMP_USECS	= 1 << 11,
+	HIST_FIELD_FL_VAR		= 1 << 12,
+	HIST_FIELD_FL_EXPR		= 1 << 13,
+	HIST_FIELD_FL_VAR_REF		= 1 << 14,
+	HIST_FIELD_FL_CPU		= 1 << 15,
+	HIST_FIELD_FL_ALIAS		= 1 << 16,
+};
+
+struct var_defs {
+	unsigned int	n_vars;
+	char		*name[TRACING_MAP_VARS_MAX];
+	char		*expr[TRACING_MAP_VARS_MAX];
 };
 
 struct hist_trigger_attrs {
@@ -133,25 +242,1437 @@ struct hist_trigger_attrs {
 	char		*vals_str;
 	char		*sort_key_str;
 	char		*name;
+	char		*clock;
 	bool		pause;
 	bool		cont;
 	bool		clear;
+	bool		ts_in_usecs;
 	unsigned int	map_bits;
+
+	char		*assignment_str[TRACING_MAP_VARS_MAX];
+	unsigned int	n_assignments;
+
+	char		*action_str[HIST_ACTIONS_MAX];
+	unsigned int	n_actions;
+
+	struct var_defs	var_defs;
+};
+
+struct field_var {
+	struct hist_field	*var;
+	struct hist_field	*val;
+};
+
+struct field_var_hist {
+	struct hist_trigger_data	*hist_data;
+	char				*cmd;
 };
 
 struct hist_trigger_data {
-	struct hist_field               *fields[TRACING_MAP_FIELDS_MAX];
+	struct hist_field               *fields[HIST_FIELDS_MAX];
 	unsigned int			n_vals;
 	unsigned int			n_keys;
 	unsigned int			n_fields;
+	unsigned int			n_vars;
 	unsigned int			key_size;
 	struct tracing_map_sort_key	sort_keys[TRACING_MAP_SORT_KEYS_MAX];
 	unsigned int			n_sort_keys;
 	struct trace_event_file		*event_file;
 	struct hist_trigger_attrs	*attrs;
 	struct tracing_map		*map;
+	bool				enable_timestamps;
+	bool				remove;
+	struct hist_field               *var_refs[TRACING_MAP_VARS_MAX];
+	unsigned int			n_var_refs;
+
+	struct action_data		*actions[HIST_ACTIONS_MAX];
+	unsigned int			n_actions;
+
+	struct hist_field               *synth_var_refs[SYNTH_FIELDS_MAX];
+	unsigned int                    n_synth_var_refs;
+	struct field_var		*field_vars[SYNTH_FIELDS_MAX];
+	unsigned int			n_field_vars;
+	unsigned int			n_field_var_str;
+	struct field_var_hist		*field_var_hists[SYNTH_FIELDS_MAX];
+	unsigned int			n_field_var_hists;
+
+	struct field_var		*max_vars[SYNTH_FIELDS_MAX];
+	unsigned int			n_max_vars;
+	unsigned int			n_max_var_str;
+};
+
+struct synth_field {
+	char *type;
+	char *name;
+	size_t size;
+	bool is_signed;
+	bool is_string;
+};
+
+struct synth_event {
+	struct list_head			list;
+	int					ref;
+	char					*name;
+	struct synth_field			**fields;
+	unsigned int				n_fields;
+	unsigned int				n_u64;
+	struct trace_event_class		class;
+	struct trace_event_call			call;
+	struct tracepoint			*tp;
+};
+
+struct action_data;
+
+typedef void (*action_fn_t) (struct hist_trigger_data *hist_data,
+			     struct tracing_map_elt *elt, void *rec,
+			     struct ring_buffer_event *rbe,
+			     struct action_data *data, u64 *var_ref_vals);
+
+struct action_data {
+	action_fn_t		fn;
+	unsigned int		n_params;
+	char			*params[SYNTH_FIELDS_MAX];
+
+	union {
+		struct {
+			unsigned int		var_ref_idx;
+			char			*match_event;
+			char			*match_event_system;
+			char			*synth_event_name;
+			struct synth_event	*synth_event;
+		} onmatch;
+
+		struct {
+			char			*var_str;
+			char			*fn_name;
+			unsigned int		max_var_ref_idx;
+			struct hist_field	*max_var;
+			struct hist_field	*var;
+		} onmax;
+	};
+};
+
+
+static char last_hist_cmd[MAX_FILTER_STR_VAL];
+static char hist_err_str[MAX_FILTER_STR_VAL];
+
+static void last_cmd_set(char *str)
+{
+	if (!str)
+		return;
+
+	strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1);
+}
+
+static void hist_err(char *str, char *var)
+{
+	int maxlen = MAX_FILTER_STR_VAL - 1;
+
+	if (!str)
+		return;
+
+	if (strlen(hist_err_str))
+		return;
+
+	if (!var)
+		var = "";
+
+	if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen)
+		return;
+
+	strcat(hist_err_str, str);
+	strcat(hist_err_str, var);
+}
+
+static void hist_err_event(char *str, char *system, char *event, char *var)
+{
+	char err[MAX_FILTER_STR_VAL];
+
+	if (system && var)
+		snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var);
+	else if (system)
+		snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event);
+	else
+		strncpy(err, var, MAX_FILTER_STR_VAL);
+
+	hist_err(str, err);
+}
+
+static void hist_err_clear(void)
+{
+	hist_err_str[0] = '\0';
+}
+
+static bool have_hist_err(void)
+{
+	if (strlen(hist_err_str))
+		return true;
+
+	return false;
+}
+
+static LIST_HEAD(synth_event_list);
+static DEFINE_MUTEX(synth_event_mutex);
+
+struct synth_trace_event {
+	struct trace_entry	ent;
+	u64			fields[];
+};
+
+static int synth_event_define_fields(struct trace_event_call *call)
+{
+	struct synth_trace_event trace;
+	int offset = offsetof(typeof(trace), fields);
+	struct synth_event *event = call->data;
+	unsigned int i, size, n_u64;
+	char *name, *type;
+	bool is_signed;
+	int ret = 0;
+
+	for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
+		size = event->fields[i]->size;
+		is_signed = event->fields[i]->is_signed;
+		type = event->fields[i]->type;
+		name = event->fields[i]->name;
+		ret = trace_define_field(call, type, name, offset, size,
+					 is_signed, FILTER_OTHER);
+		if (ret)
+			break;
+
+		if (event->fields[i]->is_string) {
+			offset += STR_VAR_LEN_MAX;
+			n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+		} else {
+			offset += sizeof(u64);
+			n_u64++;
+		}
+	}
+
+	event->n_u64 = n_u64;
+
+	return ret;
+}
+
+static bool synth_field_signed(char *type)
+{
+	if (strncmp(type, "u", 1) == 0)
+		return false;
+
+	return true;
+}
+
+static int synth_field_is_string(char *type)
+{
+	if (strstr(type, "char[") != NULL)
+		return true;
+
+	return false;
+}
+
+static int synth_field_string_size(char *type)
+{
+	char buf[4], *end, *start;
+	unsigned int len;
+	int size, err;
+
+	start = strstr(type, "char[");
+	if (start == NULL)
+		return -EINVAL;
+	start += strlen("char[");
+
+	end = strchr(type, ']');
+	if (!end || end < start)
+		return -EINVAL;
+
+	len = end - start;
+	if (len > 3)
+		return -EINVAL;
+
+	strncpy(buf, start, len);
+	buf[len] = '\0';
+
+	err = kstrtouint(buf, 0, &size);
+	if (err)
+		return err;
+
+	if (size > STR_VAR_LEN_MAX)
+		return -EINVAL;
+
+	return size;
+}
+
+static int synth_field_size(char *type)
+{
+	int size = 0;
+
+	if (strcmp(type, "s64") == 0)
+		size = sizeof(s64);
+	else if (strcmp(type, "u64") == 0)
+		size = sizeof(u64);
+	else if (strcmp(type, "s32") == 0)
+		size = sizeof(s32);
+	else if (strcmp(type, "u32") == 0)
+		size = sizeof(u32);
+	else if (strcmp(type, "s16") == 0)
+		size = sizeof(s16);
+	else if (strcmp(type, "u16") == 0)
+		size = sizeof(u16);
+	else if (strcmp(type, "s8") == 0)
+		size = sizeof(s8);
+	else if (strcmp(type, "u8") == 0)
+		size = sizeof(u8);
+	else if (strcmp(type, "char") == 0)
+		size = sizeof(char);
+	else if (strcmp(type, "unsigned char") == 0)
+		size = sizeof(unsigned char);
+	else if (strcmp(type, "int") == 0)
+		size = sizeof(int);
+	else if (strcmp(type, "unsigned int") == 0)
+		size = sizeof(unsigned int);
+	else if (strcmp(type, "long") == 0)
+		size = sizeof(long);
+	else if (strcmp(type, "unsigned long") == 0)
+		size = sizeof(unsigned long);
+	else if (strcmp(type, "pid_t") == 0)
+		size = sizeof(pid_t);
+	else if (synth_field_is_string(type))
+		size = synth_field_string_size(type);
+
+	return size;
+}
+
+static const char *synth_field_fmt(char *type)
+{
+	const char *fmt = "%llu";
+
+	if (strcmp(type, "s64") == 0)
+		fmt = "%lld";
+	else if (strcmp(type, "u64") == 0)
+		fmt = "%llu";
+	else if (strcmp(type, "s32") == 0)
+		fmt = "%d";
+	else if (strcmp(type, "u32") == 0)
+		fmt = "%u";
+	else if (strcmp(type, "s16") == 0)
+		fmt = "%d";
+	else if (strcmp(type, "u16") == 0)
+		fmt = "%u";
+	else if (strcmp(type, "s8") == 0)
+		fmt = "%d";
+	else if (strcmp(type, "u8") == 0)
+		fmt = "%u";
+	else if (strcmp(type, "char") == 0)
+		fmt = "%d";
+	else if (strcmp(type, "unsigned char") == 0)
+		fmt = "%u";
+	else if (strcmp(type, "int") == 0)
+		fmt = "%d";
+	else if (strcmp(type, "unsigned int") == 0)
+		fmt = "%u";
+	else if (strcmp(type, "long") == 0)
+		fmt = "%ld";
+	else if (strcmp(type, "unsigned long") == 0)
+		fmt = "%lu";
+	else if (strcmp(type, "pid_t") == 0)
+		fmt = "%d";
+	else if (synth_field_is_string(type))
+		fmt = "%s";
+
+	return fmt;
+}
+
+static enum print_line_t print_synth_event(struct trace_iterator *iter,
+					   int flags,
+					   struct trace_event *event)
+{
+	struct trace_array *tr = iter->tr;
+	struct trace_seq *s = &iter->seq;
+	struct synth_trace_event *entry;
+	struct synth_event *se;
+	unsigned int i, n_u64;
+	char print_fmt[32];
+	const char *fmt;
+
+	entry = (struct synth_trace_event *)iter->ent;
+	se = container_of(event, struct synth_event, call.event);
+
+	trace_seq_printf(s, "%s: ", se->name);
+
+	for (i = 0, n_u64 = 0; i < se->n_fields; i++) {
+		if (trace_seq_has_overflowed(s))
+			goto end;
+
+		fmt = synth_field_fmt(se->fields[i]->type);
+
+		/* parameter types */
+		if (tr->trace_flags & TRACE_ITER_VERBOSE)
+			trace_seq_printf(s, "%s ", fmt);
+
+		snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt);
+
+		/* parameter values */
+		if (se->fields[i]->is_string) {
+			trace_seq_printf(s, print_fmt, se->fields[i]->name,
+					 (char *)&entry->fields[n_u64],
+					 i == se->n_fields - 1 ? "" : " ");
+			n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+		} else {
+			trace_seq_printf(s, print_fmt, se->fields[i]->name,
+					 entry->fields[n_u64],
+					 i == se->n_fields - 1 ? "" : " ");
+			n_u64++;
+		}
+	}
+end:
+	trace_seq_putc(s, '\n');
+
+	return trace_handle_return(s);
+}
+
+static struct trace_event_functions synth_event_funcs = {
+	.trace		= print_synth_event
+};
+
+static notrace void trace_event_raw_event_synth(void *__data,
+						u64 *var_ref_vals,
+						unsigned int var_ref_idx)
+{
+	struct trace_event_file *trace_file = __data;
+	struct synth_trace_event *entry;
+	struct trace_event_buffer fbuffer;
+	struct ring_buffer *buffer;
+	struct synth_event *event;
+	unsigned int i, n_u64;
+	int fields_size = 0;
+
+	event = trace_file->event_call->data;
+
+	if (trace_trigger_soft_disabled(trace_file))
+		return;
+
+	fields_size = event->n_u64 * sizeof(u64);
+
+	/*
+	 * Avoid ring buffer recursion detection, as this event
+	 * is being performed within another event.
+	 */
+	buffer = trace_file->tr->trace_buffer.buffer;
+	ring_buffer_nest_start(buffer);
+
+	entry = trace_event_buffer_reserve(&fbuffer, trace_file,
+					   sizeof(*entry) + fields_size);
+	if (!entry)
+		goto out;
+
+	for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
+		if (event->fields[i]->is_string) {
+			char *str_val = (char *)(long)var_ref_vals[var_ref_idx + i];
+			char *str_field = (char *)&entry->fields[n_u64];
+
+			strscpy(str_field, str_val, STR_VAR_LEN_MAX);
+			n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+		} else {
+			entry->fields[n_u64] = var_ref_vals[var_ref_idx + i];
+			n_u64++;
+		}
+	}
+
+	trace_event_buffer_commit(&fbuffer);
+out:
+	ring_buffer_nest_end(buffer);
+}
+
+static void free_synth_event_print_fmt(struct trace_event_call *call)
+{
+	if (call) {
+		kfree(call->print_fmt);
+		call->print_fmt = NULL;
+	}
+}
+
+static int __set_synth_event_print_fmt(struct synth_event *event,
+				       char *buf, int len)
+{
+	const char *fmt;
+	int pos = 0;
+	int i;
+
+	/* When len=0, we just calculate the needed length */
+#define LEN_OR_ZERO (len ? len - pos : 0)
+
+	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+	for (i = 0; i < event->n_fields; i++) {
+		fmt = synth_field_fmt(event->fields[i]->type);
+		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s",
+				event->fields[i]->name, fmt,
+				i == event->n_fields - 1 ? "" : ", ");
+	}
+	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+	for (i = 0; i < event->n_fields; i++) {
+		pos += snprintf(buf + pos, LEN_OR_ZERO,
+				", REC->%s", event->fields[i]->name);
+	}
+
+#undef LEN_OR_ZERO
+
+	/* return the length of print_fmt */
+	return pos;
+}
+
+static int set_synth_event_print_fmt(struct trace_event_call *call)
+{
+	struct synth_event *event = call->data;
+	char *print_fmt;
+	int len;
+
+	/* First: called with 0 length to calculate the needed length */
+	len = __set_synth_event_print_fmt(event, NULL, 0);
+
+	print_fmt = kmalloc(len + 1, GFP_KERNEL);
+	if (!print_fmt)
+		return -ENOMEM;
+
+	/* Second: actually write the @print_fmt */
+	__set_synth_event_print_fmt(event, print_fmt, len + 1);
+	call->print_fmt = print_fmt;
+
+	return 0;
+}
+
+static void free_synth_field(struct synth_field *field)
+{
+	kfree(field->type);
+	kfree(field->name);
+	kfree(field);
+}
+
+static struct synth_field *parse_synth_field(char *field_type,
+					     char *field_name)
+{
+	struct synth_field *field;
+	int len, ret = 0;
+	char *array;
+
+	if (field_type[0] == ';')
+		field_type++;
+
+	len = strlen(field_name);
+	if (field_name[len - 1] == ';')
+		field_name[len - 1] = '\0';
+
+	field = kzalloc(sizeof(*field), GFP_KERNEL);
+	if (!field)
+		return ERR_PTR(-ENOMEM);
+
+	len = strlen(field_type) + 1;
+	array = strchr(field_name, '[');
+	if (array)
+		len += strlen(array);
+	field->type = kzalloc(len, GFP_KERNEL);
+	if (!field->type) {
+		ret = -ENOMEM;
+		goto free;
+	}
+	strcat(field->type, field_type);
+	if (array) {
+		strcat(field->type, array);
+		*array = '\0';
+	}
+
+	field->size = synth_field_size(field->type);
+	if (!field->size) {
+		ret = -EINVAL;
+		goto free;
+	}
+
+	if (synth_field_is_string(field->type))
+		field->is_string = true;
+
+	field->is_signed = synth_field_signed(field->type);
+
+	field->name = kstrdup(field_name, GFP_KERNEL);
+	if (!field->name) {
+		ret = -ENOMEM;
+		goto free;
+	}
+ out:
+	return field;
+ free:
+	free_synth_field(field);
+	field = ERR_PTR(ret);
+	goto out;
+}
+
+static void free_synth_tracepoint(struct tracepoint *tp)
+{
+	if (!tp)
+		return;
+
+	kfree(tp->name);
+	kfree(tp);
+}
+
+static struct tracepoint *alloc_synth_tracepoint(char *name)
+{
+	struct tracepoint *tp;
+
+	tp = kzalloc(sizeof(*tp), GFP_KERNEL);
+	if (!tp)
+		return ERR_PTR(-ENOMEM);
+
+	tp->name = kstrdup(name, GFP_KERNEL);
+	if (!tp->name) {
+		kfree(tp);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return tp;
+}
+
+typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals,
+				    unsigned int var_ref_idx);
+
+static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
+			       unsigned int var_ref_idx)
+{
+	struct tracepoint *tp = event->tp;
+
+	if (unlikely(atomic_read(&tp->key.enabled) > 0)) {
+		struct tracepoint_func *probe_func_ptr;
+		synth_probe_func_t probe_func;
+		void *__data;
+
+		if (!(cpu_online(raw_smp_processor_id())))
+			return;
+
+		probe_func_ptr = rcu_dereference_sched((tp)->funcs);
+		if (probe_func_ptr) {
+			do {
+				probe_func = probe_func_ptr->func;
+				__data = probe_func_ptr->data;
+				probe_func(__data, var_ref_vals, var_ref_idx);
+			} while ((++probe_func_ptr)->func);
+		}
+	}
+}
+
+static struct synth_event *find_synth_event(const char *name)
+{
+	struct synth_event *event;
+
+	list_for_each_entry(event, &synth_event_list, list) {
+		if (strcmp(event->name, name) == 0)
+			return event;
+	}
+
+	return NULL;
+}
+
+static int register_synth_event(struct synth_event *event)
+{
+	struct trace_event_call *call = &event->call;
+	int ret = 0;
+
+	event->call.class = &event->class;
+	event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL);
+	if (!event->class.system) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	event->tp = alloc_synth_tracepoint(event->name);
+	if (IS_ERR(event->tp)) {
+		ret = PTR_ERR(event->tp);
+		event->tp = NULL;
+		goto out;
+	}
+
+	INIT_LIST_HEAD(&call->class->fields);
+	call->event.funcs = &synth_event_funcs;
+	call->class->define_fields = synth_event_define_fields;
+
+	ret = register_trace_event(&call->event);
+	if (!ret) {
+		ret = -ENODEV;
+		goto out;
+	}
+	call->flags = TRACE_EVENT_FL_TRACEPOINT;
+	call->class->reg = trace_event_reg;
+	call->class->probe = trace_event_raw_event_synth;
+	call->data = event;
+	call->tp = event->tp;
+
+	ret = trace_add_event_call(call);
+	if (ret) {
+		pr_warn("Failed to register synthetic event: %s\n",
+			trace_event_name(call));
+		goto err;
+	}
+
+	ret = set_synth_event_print_fmt(call);
+	if (ret < 0) {
+		trace_remove_event_call(call);
+		goto err;
+	}
+ out:
+	return ret;
+ err:
+	unregister_trace_event(&call->event);
+	goto out;
+}
+
+static int unregister_synth_event(struct synth_event *event)
+{
+	struct trace_event_call *call = &event->call;
+	int ret;
+
+	ret = trace_remove_event_call(call);
+
+	return ret;
+}
+
+static void free_synth_event(struct synth_event *event)
+{
+	unsigned int i;
+
+	if (!event)
+		return;
+
+	for (i = 0; i < event->n_fields; i++)
+		free_synth_field(event->fields[i]);
+
+	kfree(event->fields);
+	kfree(event->name);
+	kfree(event->class.system);
+	free_synth_tracepoint(event->tp);
+	free_synth_event_print_fmt(&event->call);
+	kfree(event);
+}
+
+static struct synth_event *alloc_synth_event(char *event_name, int n_fields,
+					     struct synth_field **fields)
+{
+	struct synth_event *event;
+	unsigned int i;
+
+	event = kzalloc(sizeof(*event), GFP_KERNEL);
+	if (!event) {
+		event = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	event->name = kstrdup(event_name, GFP_KERNEL);
+	if (!event->name) {
+		kfree(event);
+		event = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL);
+	if (!event->fields) {
+		free_synth_event(event);
+		event = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	for (i = 0; i < n_fields; i++)
+		event->fields[i] = fields[i];
+
+	event->n_fields = n_fields;
+ out:
+	return event;
+}
+
+static void action_trace(struct hist_trigger_data *hist_data,
+			 struct tracing_map_elt *elt, void *rec,
+			 struct ring_buffer_event *rbe,
+			 struct action_data *data, u64 *var_ref_vals)
+{
+	struct synth_event *event = data->onmatch.synth_event;
+
+	trace_synth(event, var_ref_vals, data->onmatch.var_ref_idx);
+}
+
+struct hist_var_data {
+	struct list_head list;
+	struct hist_trigger_data *hist_data;
+};
+
+static void add_or_delete_synth_event(struct synth_event *event, int delete)
+{
+	if (delete)
+		free_synth_event(event);
+	else {
+		mutex_lock(&synth_event_mutex);
+		if (!find_synth_event(event->name))
+			list_add(&event->list, &synth_event_list);
+		else
+			free_synth_event(event);
+		mutex_unlock(&synth_event_mutex);
+	}
+}
+
+static int create_synth_event(int argc, char **argv)
+{
+	struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
+	struct synth_event *event = NULL;
+	bool delete_event = false;
+	int i, n_fields = 0, ret = 0;
+	char *name;
+
+	mutex_lock(&synth_event_mutex);
+
+	/*
+	 * Argument syntax:
+	 *  - Add synthetic event: <event_name> field[;field] ...
+	 *  - Remove synthetic event: !<event_name> field[;field] ...
+	 *      where 'field' = type field_name
+	 */
+	if (argc < 1) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	name = argv[0];
+	if (name[0] == '!') {
+		delete_event = true;
+		name++;
+	}
+
+	event = find_synth_event(name);
+	if (event) {
+		if (delete_event) {
+			if (event->ref) {
+				event = NULL;
+				ret = -EBUSY;
+				goto out;
+			}
+			list_del(&event->list);
+			goto out;
+		}
+		event = NULL;
+		ret = -EEXIST;
+		goto out;
+	} else if (delete_event)
+		goto out;
+
+	if (argc < 2) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	for (i = 1; i < argc - 1; i++) {
+		if (strcmp(argv[i], ";") == 0)
+			continue;
+		if (n_fields == SYNTH_FIELDS_MAX) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		field = parse_synth_field(argv[i], argv[i + 1]);
+		if (IS_ERR(field)) {
+			ret = PTR_ERR(field);
+			goto err;
+		}
+		fields[n_fields] = field;
+		i++; n_fields++;
+	}
+
+	if (i < argc) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	event = alloc_synth_event(name, n_fields, fields);
+	if (IS_ERR(event)) {
+		ret = PTR_ERR(event);
+		event = NULL;
+		goto err;
+	}
+ out:
+	mutex_unlock(&synth_event_mutex);
+
+	if (event) {
+		if (delete_event) {
+			ret = unregister_synth_event(event);
+			add_or_delete_synth_event(event, !ret);
+		} else {
+			ret = register_synth_event(event);
+			add_or_delete_synth_event(event, ret);
+		}
+	}
+
+	return ret;
+ err:
+	mutex_unlock(&synth_event_mutex);
+
+	for (i = 0; i < n_fields; i++)
+		free_synth_field(fields[i]);
+	free_synth_event(event);
+
+	return ret;
+}
+
+static int release_all_synth_events(void)
+{
+	struct list_head release_events;
+	struct synth_event *event, *e;
+	int ret = 0;
+
+	INIT_LIST_HEAD(&release_events);
+
+	mutex_lock(&synth_event_mutex);
+
+	list_for_each_entry(event, &synth_event_list, list) {
+		if (event->ref) {
+			mutex_unlock(&synth_event_mutex);
+			return -EBUSY;
+		}
+	}
+
+	list_splice_init(&event->list, &release_events);
+
+	mutex_unlock(&synth_event_mutex);
+
+	list_for_each_entry_safe(event, e, &release_events, list) {
+		list_del(&event->list);
+
+		ret = unregister_synth_event(event);
+		add_or_delete_synth_event(event, !ret);
+	}
+
+	return ret;
+}
+
+
+static void *synth_events_seq_start(struct seq_file *m, loff_t *pos)
+{
+	mutex_lock(&synth_event_mutex);
+
+	return seq_list_start(&synth_event_list, *pos);
+}
+
+static void *synth_events_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &synth_event_list, pos);
+}
+
+static void synth_events_seq_stop(struct seq_file *m, void *v)
+{
+	mutex_unlock(&synth_event_mutex);
+}
+
+static int synth_events_seq_show(struct seq_file *m, void *v)
+{
+	struct synth_field *field;
+	struct synth_event *event = v;
+	unsigned int i;
+
+	seq_printf(m, "%s\t", event->name);
+
+	for (i = 0; i < event->n_fields; i++) {
+		field = event->fields[i];
+
+		/* parameter values */
+		seq_printf(m, "%s %s%s", field->type, field->name,
+			   i == event->n_fields - 1 ? "" : "; ");
+	}
+
+	seq_putc(m, '\n');
+
+	return 0;
+}
+
+static const struct seq_operations synth_events_seq_op = {
+	.start  = synth_events_seq_start,
+	.next   = synth_events_seq_next,
+	.stop   = synth_events_seq_stop,
+	.show   = synth_events_seq_show
+};
+
+static int synth_events_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
+		ret = release_all_synth_events();
+		if (ret < 0)
+			return ret;
+	}
+
+	return seq_open(file, &synth_events_seq_op);
+}
+
+static ssize_t synth_events_write(struct file *file,
+				  const char __user *buffer,
+				  size_t count, loff_t *ppos)
+{
+	return trace_parse_run_command(file, buffer, count, ppos,
+				       create_synth_event);
+}
+
+static const struct file_operations synth_events_fops = {
+	.open           = synth_events_open,
+	.write		= synth_events_write,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = seq_release,
+};
+
+static u64 hist_field_timestamp(struct hist_field *hist_field,
+				struct tracing_map_elt *elt,
+				struct ring_buffer_event *rbe,
+				void *event)
+{
+	struct hist_trigger_data *hist_data = hist_field->hist_data;
+	struct trace_array *tr = hist_data->event_file->tr;
+
+	u64 ts = ring_buffer_event_time_stamp(rbe);
+
+	if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr))
+		ts = ns2usecs(ts);
+
+	return ts;
+}
+
+static u64 hist_field_cpu(struct hist_field *hist_field,
+			  struct tracing_map_elt *elt,
+			  struct ring_buffer_event *rbe,
+			  void *event)
+{
+	int cpu = smp_processor_id();
+
+	return cpu;
+}
+
+static struct hist_field *
+check_field_for_var_ref(struct hist_field *hist_field,
+			struct hist_trigger_data *var_data,
+			unsigned int var_idx)
+{
+	struct hist_field *found = NULL;
+
+	if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR_REF) {
+		if (hist_field->var.idx == var_idx &&
+		    hist_field->var.hist_data == var_data) {
+			found = hist_field;
+		}
+	}
+
+	return found;
+}
+
+static struct hist_field *
+check_field_for_var_refs(struct hist_trigger_data *hist_data,
+			 struct hist_field *hist_field,
+			 struct hist_trigger_data *var_data,
+			 unsigned int var_idx,
+			 unsigned int level)
+{
+	struct hist_field *found = NULL;
+	unsigned int i;
+
+	if (level > 3)
+		return found;
+
+	if (!hist_field)
+		return found;
+
+	found = check_field_for_var_ref(hist_field, var_data, var_idx);
+	if (found)
+		return found;
+
+	for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
+		struct hist_field *operand;
+
+		operand = hist_field->operands[i];
+		found = check_field_for_var_refs(hist_data, operand, var_data,
+						 var_idx, level + 1);
+		if (found)
+			return found;
+	}
+
+	return found;
+}
+
+static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data,
+				       struct hist_trigger_data *var_data,
+				       unsigned int var_idx)
+{
+	struct hist_field *hist_field, *found = NULL;
+	unsigned int i;
+
+	for_each_hist_field(i, hist_data) {
+		hist_field = hist_data->fields[i];
+		found = check_field_for_var_refs(hist_data, hist_field,
+						 var_data, var_idx, 0);
+		if (found)
+			return found;
+	}
+
+	for (i = 0; i < hist_data->n_synth_var_refs; i++) {
+		hist_field = hist_data->synth_var_refs[i];
+		found = check_field_for_var_refs(hist_data, hist_field,
+						 var_data, var_idx, 0);
+		if (found)
+			return found;
+	}
+
+	return found;
+}
+
+static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data,
+					   unsigned int var_idx)
+{
+	struct trace_array *tr = hist_data->event_file->tr;
+	struct hist_field *found = NULL;
+	struct hist_var_data *var_data;
+
+	list_for_each_entry(var_data, &tr->hist_vars, list) {
+		if (var_data->hist_data == hist_data)
+			continue;
+		found = find_var_ref(var_data->hist_data, hist_data, var_idx);
+		if (found)
+			break;
+	}
+
+	return found;
+}
+
+static bool check_var_refs(struct hist_trigger_data *hist_data)
+{
+	struct hist_field *field;
+	bool found = false;
+	int i;
+
+	for_each_hist_field(i, hist_data) {
+		field = hist_data->fields[i];
+		if (field && field->flags & HIST_FIELD_FL_VAR) {
+			if (find_any_var_ref(hist_data, field->var.idx)) {
+				found = true;
+				break;
+			}
+		}
+	}
+
+	return found;
+}
+
+static struct hist_var_data *find_hist_vars(struct hist_trigger_data *hist_data)
+{
+	struct trace_array *tr = hist_data->event_file->tr;
+	struct hist_var_data *var_data, *found = NULL;
+
+	list_for_each_entry(var_data, &tr->hist_vars, list) {
+		if (var_data->hist_data == hist_data) {
+			found = var_data;
+			break;
+		}
+	}
+
+	return found;
+}
+
+static bool field_has_hist_vars(struct hist_field *hist_field,
+				unsigned int level)
+{
+	int i;
+
+	if (level > 3)
+		return false;
+
+	if (!hist_field)
+		return false;
+
+	if (hist_field->flags & HIST_FIELD_FL_VAR ||
+	    hist_field->flags & HIST_FIELD_FL_VAR_REF)
+		return true;
+
+	for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
+		struct hist_field *operand;
+
+		operand = hist_field->operands[i];
+		if (field_has_hist_vars(operand, level + 1))
+			return true;
+	}
+
+	return false;
+}
+
+static bool has_hist_vars(struct hist_trigger_data *hist_data)
+{
+	struct hist_field *hist_field;
+	int i;
+
+	for_each_hist_field(i, hist_data) {
+		hist_field = hist_data->fields[i];
+		if (field_has_hist_vars(hist_field, 0))
+			return true;
+	}
+
+	return false;
+}
+
+static int save_hist_vars(struct hist_trigger_data *hist_data)
+{
+	struct trace_array *tr = hist_data->event_file->tr;
+	struct hist_var_data *var_data;
+
+	var_data = find_hist_vars(hist_data);
+	if (var_data)
+		return 0;
+
+	if (trace_array_get(tr) < 0)
+		return -ENODEV;
+
+	var_data = kzalloc(sizeof(*var_data), GFP_KERNEL);
+	if (!var_data) {
+		trace_array_put(tr);
+		return -ENOMEM;
+	}
+
+	var_data->hist_data = hist_data;
+	list_add(&var_data->list, &tr->hist_vars);
+
+	return 0;
+}
+
+static void remove_hist_vars(struct hist_trigger_data *hist_data)
+{
+	struct trace_array *tr = hist_data->event_file->tr;
+	struct hist_var_data *var_data;
+
+	var_data = find_hist_vars(hist_data);
+	if (!var_data)
+		return;
+
+	if (WARN_ON(check_var_refs(hist_data)))
+		return;
+
+	list_del(&var_data->list);
+
+	kfree(var_data);
+
+	trace_array_put(tr);
+}
+
+static struct hist_field *find_var_field(struct hist_trigger_data *hist_data,
+					 const char *var_name)
+{
+	struct hist_field *hist_field, *found = NULL;
+	int i;
+
+	for_each_hist_field(i, hist_data) {
+		hist_field = hist_data->fields[i];
+		if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR &&
+		    strcmp(hist_field->var.name, var_name) == 0) {
+			found = hist_field;
+			break;
+		}
+	}
+
+	return found;
+}
+
+static struct hist_field *find_var(struct hist_trigger_data *hist_data,
+				   struct trace_event_file *file,
+				   const char *var_name)
+{
+	struct hist_trigger_data *test_data;
+	struct event_trigger_data *test;
+	struct hist_field *hist_field;
+
+	hist_field = find_var_field(hist_data, var_name);
+	if (hist_field)
+		return hist_field;
+
+	list_for_each_entry_rcu(test, &file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			test_data = test->private_data;
+			hist_field = find_var_field(test_data, var_name);
+			if (hist_field)
+				return hist_field;
+		}
+	}
+
+	return NULL;
+}
+
+static struct trace_event_file *find_var_file(struct trace_array *tr,
+					      char *system,
+					      char *event_name,
+					      char *var_name)
+{
+	struct hist_trigger_data *var_hist_data;
+	struct hist_var_data *var_data;
+	struct trace_event_file *file, *found = NULL;
+
+	if (system)
+		return find_event_file(tr, system, event_name);
+
+	list_for_each_entry(var_data, &tr->hist_vars, list) {
+		var_hist_data = var_data->hist_data;
+		file = var_hist_data->event_file;
+		if (file == found)
+			continue;
+
+		if (find_var_field(var_hist_data, var_name)) {
+			if (found) {
+				hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
+				return NULL;
+			}
+
+			found = file;
+		}
+	}
+
+	return found;
+}
+
+static struct hist_field *find_file_var(struct trace_event_file *file,
+					const char *var_name)
+{
+	struct hist_trigger_data *test_data;
+	struct event_trigger_data *test;
+	struct hist_field *hist_field;
+
+	list_for_each_entry_rcu(test, &file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			test_data = test->private_data;
+			hist_field = find_var_field(test_data, var_name);
+			if (hist_field)
+				return hist_field;
+		}
+	}
+
+	return NULL;
+}
+
+static struct hist_field *
+find_match_var(struct hist_trigger_data *hist_data, char *var_name)
+{
+	struct trace_array *tr = hist_data->event_file->tr;
+	struct hist_field *hist_field, *found = NULL;
+	struct trace_event_file *file;
+	unsigned int i;
+
+	for (i = 0; i < hist_data->n_actions; i++) {
+		struct action_data *data = hist_data->actions[i];
+
+		if (data->fn == action_trace) {
+			char *system = data->onmatch.match_event_system;
+			char *event_name = data->onmatch.match_event;
+
+			file = find_var_file(tr, system, event_name, var_name);
+			if (!file)
+				continue;
+			hist_field = find_file_var(file, var_name);
+			if (hist_field) {
+				if (found) {
+					hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
+					return ERR_PTR(-EINVAL);
+				}
+
+				found = hist_field;
+			}
+		}
+	}
+	return found;
+}
+
+static struct hist_field *find_event_var(struct hist_trigger_data *hist_data,
+					 char *system,
+					 char *event_name,
+					 char *var_name)
+{
+	struct trace_array *tr = hist_data->event_file->tr;
+	struct hist_field *hist_field = NULL;
+	struct trace_event_file *file;
+
+	if (!system || !event_name) {
+		hist_field = find_match_var(hist_data, var_name);
+		if (IS_ERR(hist_field))
+			return NULL;
+		if (hist_field)
+			return hist_field;
+	}
+
+	file = find_var_file(tr, system, event_name, var_name);
+	if (!file)
+		return NULL;
+
+	hist_field = find_file_var(file, var_name);
+
+	return hist_field;
+}
+
+struct hist_elt_data {
+	char *comm;
+	u64 *var_ref_vals;
+	char *field_var_str[SYNTH_FIELDS_MAX];
 };
 
+static u64 hist_field_var_ref(struct hist_field *hist_field,
+			      struct tracing_map_elt *elt,
+			      struct ring_buffer_event *rbe,
+			      void *event)
+{
+	struct hist_elt_data *elt_data;
+	u64 var_val = 0;
+
+	elt_data = elt->private_data;
+	var_val = elt_data->var_ref_vals[hist_field->var_ref_idx];
+
+	return var_val;
+}
+
+static bool resolve_var_refs(struct hist_trigger_data *hist_data, void *key,
+			     u64 *var_ref_vals, bool self)
+{
+	struct hist_trigger_data *var_data;
+	struct tracing_map_elt *var_elt;
+	struct hist_field *hist_field;
+	unsigned int i, var_idx;
+	bool resolved = true;
+	u64 var_val = 0;
+
+	for (i = 0; i < hist_data->n_var_refs; i++) {
+		hist_field = hist_data->var_refs[i];
+		var_idx = hist_field->var.idx;
+		var_data = hist_field->var.hist_data;
+
+		if (var_data == NULL) {
+			resolved = false;
+			break;
+		}
+
+		if ((self && var_data != hist_data) ||
+		    (!self && var_data == hist_data))
+			continue;
+
+		var_elt = tracing_map_lookup(var_data->map, key);
+		if (!var_elt) {
+			resolved = false;
+			break;
+		}
+
+		if (!tracing_map_var_set(var_elt, var_idx)) {
+			resolved = false;
+			break;
+		}
+
+		if (self || !hist_field->read_once)
+			var_val = tracing_map_read_var(var_elt, var_idx);
+		else
+			var_val = tracing_map_read_var_once(var_elt, var_idx);
+
+		var_ref_vals[i] = var_val;
+	}
+
+	return resolved;
+}
+
 static const char *hist_field_name(struct hist_field *field,
 				   unsigned int level)
 {
@@ -162,8 +1683,26 @@ static const char *hist_field_name(struct hist_field *field,
 
 	if (field->field)
 		field_name = field->field->name;
-	else if (field->flags & HIST_FIELD_FL_LOG2)
+	else if (field->flags & HIST_FIELD_FL_LOG2 ||
+		 field->flags & HIST_FIELD_FL_ALIAS)
 		field_name = hist_field_name(field->operands[0], ++level);
+	else if (field->flags & HIST_FIELD_FL_CPU)
+		field_name = "cpu";
+	else if (field->flags & HIST_FIELD_FL_EXPR ||
+		 field->flags & HIST_FIELD_FL_VAR_REF) {
+		if (field->system) {
+			static char full_name[MAX_FILTER_STR_VAL];
+
+			strcat(full_name, field->system);
+			strcat(full_name, ".");
+			strcat(full_name, field->event_name);
+			strcat(full_name, ".");
+			strcat(full_name, field->name);
+			field_name = full_name;
+		} else
+			field_name = field->name;
+	} else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
+		field_name = "common_timestamp";
 
 	if (field_name == NULL)
 		field_name = "";
@@ -232,16 +1771,119 @@ static int parse_map_size(char *str)
 
 static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
 {
+	unsigned int i;
+
 	if (!attrs)
 		return;
 
+	for (i = 0; i < attrs->n_assignments; i++)
+		kfree(attrs->assignment_str[i]);
+
+	for (i = 0; i < attrs->n_actions; i++)
+		kfree(attrs->action_str[i]);
+
 	kfree(attrs->name);
 	kfree(attrs->sort_key_str);
 	kfree(attrs->keys_str);
 	kfree(attrs->vals_str);
+	kfree(attrs->clock);
 	kfree(attrs);
 }
 
+static int parse_action(char *str, struct hist_trigger_attrs *attrs)
+{
+	int ret = -EINVAL;
+
+	if (attrs->n_actions >= HIST_ACTIONS_MAX)
+		return ret;
+
+	if ((strncmp(str, "onmatch(", strlen("onmatch(")) == 0) ||
+	    (strncmp(str, "onmax(", strlen("onmax(")) == 0)) {
+		attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL);
+		if (!attrs->action_str[attrs->n_actions]) {
+			ret = -ENOMEM;
+			return ret;
+		}
+		attrs->n_actions++;
+		ret = 0;
+	}
+
+	return ret;
+}
+
+static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
+{
+	int ret = 0;
+
+	if ((strncmp(str, "key=", strlen("key=")) == 0) ||
+	    (strncmp(str, "keys=", strlen("keys=")) == 0)) {
+		attrs->keys_str = kstrdup(str, GFP_KERNEL);
+		if (!attrs->keys_str) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	} else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
+		 (strncmp(str, "vals=", strlen("vals=")) == 0) ||
+		 (strncmp(str, "values=", strlen("values=")) == 0)) {
+		attrs->vals_str = kstrdup(str, GFP_KERNEL);
+		if (!attrs->vals_str) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	} else if (strncmp(str, "sort=", strlen("sort=")) == 0) {
+		attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
+		if (!attrs->sort_key_str) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	} else if (strncmp(str, "name=", strlen("name=")) == 0) {
+		attrs->name = kstrdup(str, GFP_KERNEL);
+		if (!attrs->name) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	} else if (strncmp(str, "clock=", strlen("clock=")) == 0) {
+		strsep(&str, "=");
+		if (!str) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		str = strstrip(str);
+		attrs->clock = kstrdup(str, GFP_KERNEL);
+		if (!attrs->clock) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	} else if (strncmp(str, "size=", strlen("size=")) == 0) {
+		int map_bits = parse_map_size(str);
+
+		if (map_bits < 0) {
+			ret = map_bits;
+			goto out;
+		}
+		attrs->map_bits = map_bits;
+	} else {
+		char *assignment;
+
+		if (attrs->n_assignments == TRACING_MAP_VARS_MAX) {
+			hist_err("Too many variables defined: ", str);
+			ret = -EINVAL;
+			goto out;
+		}
+
+		assignment = kstrdup(str, GFP_KERNEL);
+		if (!assignment) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		attrs->assignment_str[attrs->n_assignments++] = assignment;
+	}
+ out:
+	return ret;
+}
+
 static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
 {
 	struct hist_trigger_attrs *attrs;
@@ -254,35 +1896,21 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
 	while (trigger_str) {
 		char *str = strsep(&trigger_str, ":");
 
-		if ((strncmp(str, "key=", strlen("key=")) == 0) ||
-		    (strncmp(str, "keys=", strlen("keys=")) == 0))
-			attrs->keys_str = kstrdup(str, GFP_KERNEL);
-		else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
-			 (strncmp(str, "vals=", strlen("vals=")) == 0) ||
-			 (strncmp(str, "values=", strlen("values=")) == 0))
-			attrs->vals_str = kstrdup(str, GFP_KERNEL);
-		else if (strncmp(str, "sort=", strlen("sort=")) == 0)
-			attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
-		else if (strncmp(str, "name=", strlen("name=")) == 0)
-			attrs->name = kstrdup(str, GFP_KERNEL);
-		else if (strcmp(str, "pause") == 0)
+		if (strchr(str, '=')) {
+			ret = parse_assignment(str, attrs);
+			if (ret)
+				goto free;
+		} else if (strcmp(str, "pause") == 0)
 			attrs->pause = true;
 		else if ((strcmp(str, "cont") == 0) ||
 			 (strcmp(str, "continue") == 0))
 			attrs->cont = true;
 		else if (strcmp(str, "clear") == 0)
 			attrs->clear = true;
-		else if (strncmp(str, "size=", strlen("size=")) == 0) {
-			int map_bits = parse_map_size(str);
-
-			if (map_bits < 0) {
-				ret = map_bits;
+		else {
+			ret = parse_action(str, attrs);
+			if (ret)
 				goto free;
-			}
-			attrs->map_bits = map_bits;
-		} else {
-			ret = -EINVAL;
-			goto free;
 		}
 	}
 
@@ -291,6 +1919,14 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
 		goto free;
 	}
 
+	if (!attrs->clock) {
+		attrs->clock = kstrdup("global", GFP_KERNEL);
+		if (!attrs->clock) {
+			ret = -ENOMEM;
+			goto free;
+		}
+	}
+
 	return attrs;
  free:
 	destroy_hist_trigger_attrs(attrs);
@@ -313,64 +1949,203 @@ static inline void save_comm(char *comm, struct task_struct *task)
 	memcpy(comm, task->comm, TASK_COMM_LEN);
 }
 
-static void hist_trigger_elt_comm_free(struct tracing_map_elt *elt)
+static void hist_elt_data_free(struct hist_elt_data *elt_data)
 {
-	kfree((char *)elt->private_data);
+	unsigned int i;
+
+	for (i = 0; i < SYNTH_FIELDS_MAX; i++)
+		kfree(elt_data->field_var_str[i]);
+
+	kfree(elt_data->comm);
+	kfree(elt_data);
 }
 
-static int hist_trigger_elt_comm_alloc(struct tracing_map_elt *elt)
+static void hist_trigger_elt_data_free(struct tracing_map_elt *elt)
+{
+	struct hist_elt_data *elt_data = elt->private_data;
+
+	hist_elt_data_free(elt_data);
+}
+
+static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
 {
 	struct hist_trigger_data *hist_data = elt->map->private_data;
+	unsigned int size = TASK_COMM_LEN;
+	struct hist_elt_data *elt_data;
 	struct hist_field *key_field;
-	unsigned int i;
+	unsigned int i, n_str;
+
+	elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL);
+	if (!elt_data)
+		return -ENOMEM;
 
 	for_each_hist_key_field(i, hist_data) {
 		key_field = hist_data->fields[i];
 
 		if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
-			unsigned int size = TASK_COMM_LEN + 1;
-
-			elt->private_data = kzalloc(size, GFP_KERNEL);
-			if (!elt->private_data)
+			elt_data->comm = kzalloc(size, GFP_KERNEL);
+			if (!elt_data->comm) {
+				kfree(elt_data);
 				return -ENOMEM;
+			}
 			break;
 		}
 	}
 
+	n_str = hist_data->n_field_var_str + hist_data->n_max_var_str;
+
+	size = STR_VAR_LEN_MAX;
+
+	for (i = 0; i < n_str; i++) {
+		elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL);
+		if (!elt_data->field_var_str[i]) {
+			hist_elt_data_free(elt_data);
+			return -ENOMEM;
+		}
+	}
+
+	elt->private_data = elt_data;
+
 	return 0;
 }
 
-static void hist_trigger_elt_comm_copy(struct tracing_map_elt *to,
-				       struct tracing_map_elt *from)
+static void hist_trigger_elt_data_init(struct tracing_map_elt *elt)
+{
+	struct hist_elt_data *elt_data = elt->private_data;
+
+	if (elt_data->comm)
+		save_comm(elt_data->comm, current);
+}
+
+static const struct tracing_map_ops hist_trigger_elt_data_ops = {
+	.elt_alloc	= hist_trigger_elt_data_alloc,
+	.elt_free	= hist_trigger_elt_data_free,
+	.elt_init	= hist_trigger_elt_data_init,
+};
+
+static const char *get_hist_field_flags(struct hist_field *hist_field)
+{
+	const char *flags_str = NULL;
+
+	if (hist_field->flags & HIST_FIELD_FL_HEX)
+		flags_str = "hex";
+	else if (hist_field->flags & HIST_FIELD_FL_SYM)
+		flags_str = "sym";
+	else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
+		flags_str = "sym-offset";
+	else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
+		flags_str = "execname";
+	else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
+		flags_str = "syscall";
+	else if (hist_field->flags & HIST_FIELD_FL_LOG2)
+		flags_str = "log2";
+	else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS)
+		flags_str = "usecs";
+
+	return flags_str;
+}
+
+static void expr_field_str(struct hist_field *field, char *expr)
 {
-	char *comm_from = from->private_data;
-	char *comm_to = to->private_data;
+	if (field->flags & HIST_FIELD_FL_VAR_REF)
+		strcat(expr, "$");
+
+	strcat(expr, hist_field_name(field, 0));
 
-	if (comm_from)
-		memcpy(comm_to, comm_from, TASK_COMM_LEN + 1);
+	if (field->flags && !(field->flags & HIST_FIELD_FL_VAR_REF)) {
+		const char *flags_str = get_hist_field_flags(field);
+
+		if (flags_str) {
+			strcat(expr, ".");
+			strcat(expr, flags_str);
+		}
+	}
 }
 
-static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt)
+static char *expr_str(struct hist_field *field, unsigned int level)
 {
-	char *comm = elt->private_data;
+	char *expr;
+
+	if (level > 1)
+		return NULL;
+
+	expr = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
+	if (!expr)
+		return NULL;
+
+	if (!field->operands[0]) {
+		expr_field_str(field, expr);
+		return expr;
+	}
+
+	if (field->operator == FIELD_OP_UNARY_MINUS) {
+		char *subexpr;
 
-	if (comm)
-		save_comm(comm, current);
+		strcat(expr, "-(");
+		subexpr = expr_str(field->operands[0], ++level);
+		if (!subexpr) {
+			kfree(expr);
+			return NULL;
+		}
+		strcat(expr, subexpr);
+		strcat(expr, ")");
+
+		kfree(subexpr);
+
+		return expr;
+	}
+
+	expr_field_str(field->operands[0], expr);
+
+	switch (field->operator) {
+	case FIELD_OP_MINUS:
+		strcat(expr, "-");
+		break;
+	case FIELD_OP_PLUS:
+		strcat(expr, "+");
+		break;
+	default:
+		kfree(expr);
+		return NULL;
+	}
+
+	expr_field_str(field->operands[1], expr);
+
+	return expr;
 }
 
-static const struct tracing_map_ops hist_trigger_elt_comm_ops = {
-	.elt_alloc	= hist_trigger_elt_comm_alloc,
-	.elt_copy	= hist_trigger_elt_comm_copy,
-	.elt_free	= hist_trigger_elt_comm_free,
-	.elt_init	= hist_trigger_elt_comm_init,
-};
+static int contains_operator(char *str)
+{
+	enum field_op_id field_op = FIELD_OP_NONE;
+	char *op;
+
+	op = strpbrk(str, "+-");
+	if (!op)
+		return FIELD_OP_NONE;
+
+	switch (*op) {
+	case '-':
+		if (*str == '-')
+			field_op = FIELD_OP_UNARY_MINUS;
+		else
+			field_op = FIELD_OP_MINUS;
+		break;
+	case '+':
+		field_op = FIELD_OP_PLUS;
+		break;
+	default:
+		break;
+	}
+
+	return field_op;
+}
 
 static void destroy_hist_field(struct hist_field *hist_field,
 			       unsigned int level)
 {
 	unsigned int i;
 
-	if (level > 2)
+	if (level > 3)
 		return;
 
 	if (!hist_field)
@@ -379,11 +2154,17 @@ static void destroy_hist_field(struct hist_field *hist_field,
 	for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++)
 		destroy_hist_field(hist_field->operands[i], level + 1);
 
+	kfree(hist_field->var.name);
+	kfree(hist_field->name);
+	kfree(hist_field->type);
+
 	kfree(hist_field);
 }
 
-static struct hist_field *create_hist_field(struct ftrace_event_field *field,
-					    unsigned long flags)
+static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
+					    struct ftrace_event_field *field,
+					    unsigned long flags,
+					    char *var_name)
 {
 	struct hist_field *hist_field;
 
@@ -394,8 +2175,22 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
 	if (!hist_field)
 		return NULL;
 
+	hist_field->hist_data = hist_data;
+
+	if (flags & HIST_FIELD_FL_EXPR || flags & HIST_FIELD_FL_ALIAS)
+		goto out; /* caller will populate */
+
+	if (flags & HIST_FIELD_FL_VAR_REF) {
+		hist_field->fn = hist_field_var_ref;
+		goto out;
+	}
+
 	if (flags & HIST_FIELD_FL_HITCOUNT) {
 		hist_field->fn = hist_field_counter;
+		hist_field->size = sizeof(u64);
+		hist_field->type = kstrdup("u64", GFP_KERNEL);
+		if (!hist_field->type)
+			goto free;
 		goto out;
 	}
 
@@ -407,8 +2202,29 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
 	if (flags & HIST_FIELD_FL_LOG2) {
 		unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
 		hist_field->fn = hist_field_log2;
-		hist_field->operands[0] = create_hist_field(field, fl);
+		hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
 		hist_field->size = hist_field->operands[0]->size;
+		hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL);
+		if (!hist_field->type)
+			goto free;
+		goto out;
+	}
+
+	if (flags & HIST_FIELD_FL_TIMESTAMP) {
+		hist_field->fn = hist_field_timestamp;
+		hist_field->size = sizeof(u64);
+		hist_field->type = kstrdup("u64", GFP_KERNEL);
+		if (!hist_field->type)
+			goto free;
+		goto out;
+	}
+
+	if (flags & HIST_FIELD_FL_CPU) {
+		hist_field->fn = hist_field_cpu;
+		hist_field->size = sizeof(int);
+		hist_field->type = kstrdup("unsigned int", GFP_KERNEL);
+		if (!hist_field->type)
+			goto free;
 		goto out;
 	}
 
@@ -418,6 +2234,11 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
 	if (is_string_field(field)) {
 		flags |= HIST_FIELD_FL_STRING;
 
+		hist_field->size = MAX_FILTER_STR_VAL;
+		hist_field->type = kstrdup(field->type, GFP_KERNEL);
+		if (!hist_field->type)
+			goto free;
+
 		if (field->filter_type == FILTER_STATIC_STRING)
 			hist_field->fn = hist_field_string;
 		else if (field->filter_type == FILTER_DYN_STRING)
@@ -425,6 +2246,12 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
 		else
 			hist_field->fn = hist_field_pstring;
 	} else {
+		hist_field->size = field->size;
+		hist_field->is_signed = field->is_signed;
+		hist_field->type = kstrdup(field->type, GFP_KERNEL);
+		if (!hist_field->type)
+			goto free;
+
 		hist_field->fn = select_value_fn(field->size,
 						 field->is_signed);
 		if (!hist_field->fn) {
@@ -436,14 +2263,23 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
 	hist_field->field = field;
 	hist_field->flags = flags;
 
+	if (var_name) {
+		hist_field->var.name = kstrdup(var_name, GFP_KERNEL);
+		if (!hist_field->var.name)
+			goto free;
+	}
+
 	return hist_field;
+ free:
+	destroy_hist_field(hist_field, 0);
+	return NULL;
 }
 
 static void destroy_hist_fields(struct hist_trigger_data *hist_data)
 {
 	unsigned int i;
 
-	for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) {
+	for (i = 0; i < HIST_FIELDS_MAX; i++) {
 		if (hist_data->fields[i]) {
 			destroy_hist_field(hist_data->fields[i], 0);
 			hist_data->fields[i] = NULL;
@@ -451,69 +2287,1612 @@ static void destroy_hist_fields(struct hist_trigger_data *hist_data)
 	}
 }
 
-static int create_hitcount_val(struct hist_trigger_data *hist_data)
+static int init_var_ref(struct hist_field *ref_field,
+			struct hist_field *var_field,
+			char *system, char *event_name)
 {
-	hist_data->fields[HITCOUNT_IDX] =
-		create_hist_field(NULL, HIST_FIELD_FL_HITCOUNT);
-	if (!hist_data->fields[HITCOUNT_IDX])
-		return -ENOMEM;
+	int err = 0;
+
+	ref_field->var.idx = var_field->var.idx;
+	ref_field->var.hist_data = var_field->hist_data;
+	ref_field->size = var_field->size;
+	ref_field->is_signed = var_field->is_signed;
+	ref_field->flags |= var_field->flags &
+		(HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
+
+	if (system) {
+		ref_field->system = kstrdup(system, GFP_KERNEL);
+		if (!ref_field->system)
+			return -ENOMEM;
+	}
 
-	hist_data->n_vals++;
+	if (event_name) {
+		ref_field->event_name = kstrdup(event_name, GFP_KERNEL);
+		if (!ref_field->event_name) {
+			err = -ENOMEM;
+			goto free;
+		}
+	}
 
-	if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
+	if (var_field->var.name) {
+		ref_field->name = kstrdup(var_field->var.name, GFP_KERNEL);
+		if (!ref_field->name) {
+			err = -ENOMEM;
+			goto free;
+		}
+	} else if (var_field->name) {
+		ref_field->name = kstrdup(var_field->name, GFP_KERNEL);
+		if (!ref_field->name) {
+			err = -ENOMEM;
+			goto free;
+		}
+	}
+
+	ref_field->type = kstrdup(var_field->type, GFP_KERNEL);
+	if (!ref_field->type) {
+		err = -ENOMEM;
+		goto free;
+	}
+ out:
+	return err;
+ free:
+	kfree(ref_field->system);
+	kfree(ref_field->event_name);
+	kfree(ref_field->name);
+
+	goto out;
+}
+
+static struct hist_field *create_var_ref(struct hist_field *var_field,
+					 char *system, char *event_name)
+{
+	unsigned long flags = HIST_FIELD_FL_VAR_REF;
+	struct hist_field *ref_field;
+
+	ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL);
+	if (ref_field) {
+		if (init_var_ref(ref_field, var_field, system, event_name)) {
+			destroy_hist_field(ref_field, 0);
+			return NULL;
+		}
+	}
+
+	return ref_field;
+}
+
+static bool is_var_ref(char *var_name)
+{
+	if (!var_name || strlen(var_name) < 2 || var_name[0] != '$')
+		return false;
+
+	return true;
+}
+
+static char *field_name_from_var(struct hist_trigger_data *hist_data,
+				 char *var_name)
+{
+	char *name, *field;
+	unsigned int i;
+
+	for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) {
+		name = hist_data->attrs->var_defs.name[i];
+
+		if (strcmp(var_name, name) == 0) {
+			field = hist_data->attrs->var_defs.expr[i];
+			if (contains_operator(field) || is_var_ref(field))
+				continue;
+			return field;
+		}
+	}
+
+	return NULL;
+}
+
+static char *local_field_var_ref(struct hist_trigger_data *hist_data,
+				 char *system, char *event_name,
+				 char *var_name)
+{
+	struct trace_event_call *call;
+
+	if (system && event_name) {
+		call = hist_data->event_file->event_call;
+
+		if (strcmp(system, call->class->system) != 0)
+			return NULL;
+
+		if (strcmp(event_name, trace_event_name(call)) != 0)
+			return NULL;
+	}
+
+	if (!!system != !!event_name)
+		return NULL;
+
+	if (!is_var_ref(var_name))
+		return NULL;
+
+	var_name++;
+
+	return field_name_from_var(hist_data, var_name);
+}
+
+static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
+					char *system, char *event_name,
+					char *var_name)
+{
+	struct hist_field *var_field = NULL, *ref_field = NULL;
+
+	if (!is_var_ref(var_name))
+		return NULL;
+
+	var_name++;
+
+	var_field = find_event_var(hist_data, system, event_name, var_name);
+	if (var_field)
+		ref_field = create_var_ref(var_field, system, event_name);
+
+	if (!ref_field)
+		hist_err_event("Couldn't find variable: $",
+			       system, event_name, var_name);
+
+	return ref_field;
+}
+
+static struct ftrace_event_field *
+parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
+	    char *field_str, unsigned long *flags)
+{
+	struct ftrace_event_field *field = NULL;
+	char *field_name, *modifier, *str;
+
+	modifier = str = kstrdup(field_str, GFP_KERNEL);
+	if (!modifier)
+		return ERR_PTR(-ENOMEM);
+
+	field_name = strsep(&modifier, ".");
+	if (modifier) {
+		if (strcmp(modifier, "hex") == 0)
+			*flags |= HIST_FIELD_FL_HEX;
+		else if (strcmp(modifier, "sym") == 0)
+			*flags |= HIST_FIELD_FL_SYM;
+		else if (strcmp(modifier, "sym-offset") == 0)
+			*flags |= HIST_FIELD_FL_SYM_OFFSET;
+		else if ((strcmp(modifier, "execname") == 0) &&
+			 (strcmp(field_name, "common_pid") == 0))
+			*flags |= HIST_FIELD_FL_EXECNAME;
+		else if (strcmp(modifier, "syscall") == 0)
+			*flags |= HIST_FIELD_FL_SYSCALL;
+		else if (strcmp(modifier, "log2") == 0)
+			*flags |= HIST_FIELD_FL_LOG2;
+		else if (strcmp(modifier, "usecs") == 0)
+			*flags |= HIST_FIELD_FL_TIMESTAMP_USECS;
+		else {
+			hist_err("Invalid field modifier: ", modifier);
+			field = ERR_PTR(-EINVAL);
+			goto out;
+		}
+	}
+
+	if (strcmp(field_name, "common_timestamp") == 0) {
+		*flags |= HIST_FIELD_FL_TIMESTAMP;
+		hist_data->enable_timestamps = true;
+		if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS)
+			hist_data->attrs->ts_in_usecs = true;
+	} else if (strcmp(field_name, "cpu") == 0)
+		*flags |= HIST_FIELD_FL_CPU;
+	else {
+		field = trace_find_event_field(file->event_call, field_name);
+		if (!field || !field->size) {
+			hist_err("Couldn't find field: ", field_name);
+			field = ERR_PTR(-EINVAL);
+			goto out;
+		}
+	}
+ out:
+	kfree(str);
+
+	return field;
+}
+
+static struct hist_field *create_alias(struct hist_trigger_data *hist_data,
+				       struct hist_field *var_ref,
+				       char *var_name)
+{
+	struct hist_field *alias = NULL;
+	unsigned long flags = HIST_FIELD_FL_ALIAS | HIST_FIELD_FL_VAR;
+
+	alias = create_hist_field(hist_data, NULL, flags, var_name);
+	if (!alias)
+		return NULL;
+
+	alias->fn = var_ref->fn;
+	alias->operands[0] = var_ref;
+
+	if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) {
+		destroy_hist_field(alias, 0);
+		return NULL;
+	}
+
+	return alias;
+}
+
+static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
+				     struct trace_event_file *file, char *str,
+				     unsigned long *flags, char *var_name)
+{
+	char *s, *ref_system = NULL, *ref_event = NULL, *ref_var = str;
+	struct ftrace_event_field *field = NULL;
+	struct hist_field *hist_field = NULL;
+	int ret = 0;
+
+	s = strchr(str, '.');
+	if (s) {
+		s = strchr(++s, '.');
+		if (s) {
+			ref_system = strsep(&str, ".");
+			if (!str) {
+				ret = -EINVAL;
+				goto out;
+			}
+			ref_event = strsep(&str, ".");
+			if (!str) {
+				ret = -EINVAL;
+				goto out;
+			}
+			ref_var = str;
+		}
+	}
+
+	s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var);
+	if (!s) {
+		hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var);
+		if (hist_field) {
+			hist_data->var_refs[hist_data->n_var_refs] = hist_field;
+			hist_field->var_ref_idx = hist_data->n_var_refs++;
+			if (var_name) {
+				hist_field = create_alias(hist_data, hist_field, var_name);
+				if (!hist_field) {
+					ret = -ENOMEM;
+					goto out;
+				}
+			}
+			return hist_field;
+		}
+	} else
+		str = s;
+
+	field = parse_field(hist_data, file, str, flags);
+	if (IS_ERR(field)) {
+		ret = PTR_ERR(field);
+		goto out;
+	}
+
+	hist_field = create_hist_field(hist_data, field, *flags, var_name);
+	if (!hist_field) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	return hist_field;
+ out:
+	return ERR_PTR(ret);
+}
+
+static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
+				     struct trace_event_file *file,
+				     char *str, unsigned long flags,
+				     char *var_name, unsigned int level);
+
+static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
+				      struct trace_event_file *file,
+				      char *str, unsigned long flags,
+				      char *var_name, unsigned int level)
+{
+	struct hist_field *operand1, *expr = NULL;
+	unsigned long operand_flags;
+	int ret = 0;
+	char *s;
+
+	/* we support only -(xxx) i.e. explicit parens required */
+
+	if (level > 3) {
+		hist_err("Too many subexpressions (3 max): ", str);
+		ret = -EINVAL;
+		goto free;
+	}
+
+	str++; /* skip leading '-' */
+
+	s = strchr(str, '(');
+	if (s)
+		str++;
+	else {
+		ret = -EINVAL;
+		goto free;
+	}
+
+	s = strrchr(str, ')');
+	if (s)
+		*s = '\0';
+	else {
+		ret = -EINVAL; /* no closing ')' */
+		goto free;
+	}
+
+	flags |= HIST_FIELD_FL_EXPR;
+	expr = create_hist_field(hist_data, NULL, flags, var_name);
+	if (!expr) {
+		ret = -ENOMEM;
+		goto free;
+	}
+
+	operand_flags = 0;
+	operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
+	if (IS_ERR(operand1)) {
+		ret = PTR_ERR(operand1);
+		goto free;
+	}
+
+	expr->flags |= operand1->flags &
+		(HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
+	expr->fn = hist_field_unary_minus;
+	expr->operands[0] = operand1;
+	expr->operator = FIELD_OP_UNARY_MINUS;
+	expr->name = expr_str(expr, 0);
+	expr->type = kstrdup(operand1->type, GFP_KERNEL);
+	if (!expr->type) {
+		ret = -ENOMEM;
+		goto free;
+	}
+
+	return expr;
+ free:
+	destroy_hist_field(expr, 0);
+	return ERR_PTR(ret);
+}
+
+static int check_expr_operands(struct hist_field *operand1,
+			       struct hist_field *operand2)
+{
+	unsigned long operand1_flags = operand1->flags;
+	unsigned long operand2_flags = operand2->flags;
+
+	if ((operand1_flags & HIST_FIELD_FL_VAR_REF) ||
+	    (operand1_flags & HIST_FIELD_FL_ALIAS)) {
+		struct hist_field *var;
+
+		var = find_var_field(operand1->var.hist_data, operand1->name);
+		if (!var)
+			return -EINVAL;
+		operand1_flags = var->flags;
+	}
+
+	if ((operand2_flags & HIST_FIELD_FL_VAR_REF) ||
+	    (operand2_flags & HIST_FIELD_FL_ALIAS)) {
+		struct hist_field *var;
+
+		var = find_var_field(operand2->var.hist_data, operand2->name);
+		if (!var)
+			return -EINVAL;
+		operand2_flags = var->flags;
+	}
+
+	if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) !=
+	    (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) {
+		hist_err("Timestamp units in expression don't match", NULL);
 		return -EINVAL;
+	}
 
 	return 0;
 }
 
-static int create_val_field(struct hist_trigger_data *hist_data,
-			    unsigned int val_idx,
-			    struct trace_event_file *file,
-			    char *field_str)
+static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
+				     struct trace_event_file *file,
+				     char *str, unsigned long flags,
+				     char *var_name, unsigned int level)
 {
-	struct ftrace_event_field *field = NULL;
-	unsigned long flags = 0;
-	char *field_name;
+	struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL;
+	unsigned long operand_flags;
+	int field_op, ret = -EINVAL;
+	char *sep, *operand1_str;
+
+	if (level > 3) {
+		hist_err("Too many subexpressions (3 max): ", str);
+		return ERR_PTR(-EINVAL);
+	}
+
+	field_op = contains_operator(str);
+
+	if (field_op == FIELD_OP_NONE)
+		return parse_atom(hist_data, file, str, &flags, var_name);
+
+	if (field_op == FIELD_OP_UNARY_MINUS)
+		return parse_unary(hist_data, file, str, flags, var_name, ++level);
+
+	switch (field_op) {
+	case FIELD_OP_MINUS:
+		sep = "-";
+		break;
+	case FIELD_OP_PLUS:
+		sep = "+";
+		break;
+	default:
+		goto free;
+	}
+
+	operand1_str = strsep(&str, sep);
+	if (!operand1_str || !str)
+		goto free;
+
+	operand_flags = 0;
+	operand1 = parse_atom(hist_data, file, operand1_str,
+			      &operand_flags, NULL);
+	if (IS_ERR(operand1)) {
+		ret = PTR_ERR(operand1);
+		operand1 = NULL;
+		goto free;
+	}
+
+	/* rest of string could be another expression e.g. b+c in a+b+c */
+	operand_flags = 0;
+	operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
+	if (IS_ERR(operand2)) {
+		ret = PTR_ERR(operand2);
+		operand2 = NULL;
+		goto free;
+	}
+
+	ret = check_expr_operands(operand1, operand2);
+	if (ret)
+		goto free;
+
+	flags |= HIST_FIELD_FL_EXPR;
+
+	flags |= operand1->flags &
+		(HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
+
+	expr = create_hist_field(hist_data, NULL, flags, var_name);
+	if (!expr) {
+		ret = -ENOMEM;
+		goto free;
+	}
+
+	operand1->read_once = true;
+	operand2->read_once = true;
+
+	expr->operands[0] = operand1;
+	expr->operands[1] = operand2;
+	expr->operator = field_op;
+	expr->name = expr_str(expr, 0);
+	expr->type = kstrdup(operand1->type, GFP_KERNEL);
+	if (!expr->type) {
+		ret = -ENOMEM;
+		goto free;
+	}
+
+	switch (field_op) {
+	case FIELD_OP_MINUS:
+		expr->fn = hist_field_minus;
+		break;
+	case FIELD_OP_PLUS:
+		expr->fn = hist_field_plus;
+		break;
+	default:
+		ret = -EINVAL;
+		goto free;
+	}
+
+	return expr;
+ free:
+	destroy_hist_field(operand1, 0);
+	destroy_hist_field(operand2, 0);
+	destroy_hist_field(expr, 0);
+
+	return ERR_PTR(ret);
+}
+
+static char *find_trigger_filter(struct hist_trigger_data *hist_data,
+				 struct trace_event_file *file)
+{
+	struct event_trigger_data *test;
+
+	list_for_each_entry_rcu(test, &file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			if (test->private_data == hist_data)
+				return test->filter_str;
+		}
+	}
+
+	return NULL;
+}
+
+static struct event_command trigger_hist_cmd;
+static int event_hist_trigger_func(struct event_command *cmd_ops,
+				   struct trace_event_file *file,
+				   char *glob, char *cmd, char *param);
+
+static bool compatible_keys(struct hist_trigger_data *target_hist_data,
+			    struct hist_trigger_data *hist_data,
+			    unsigned int n_keys)
+{
+	struct hist_field *target_hist_field, *hist_field;
+	unsigned int n, i, j;
+
+	if (hist_data->n_fields - hist_data->n_vals != n_keys)
+		return false;
+
+	i = hist_data->n_vals;
+	j = target_hist_data->n_vals;
+
+	for (n = 0; n < n_keys; n++) {
+		hist_field = hist_data->fields[i + n];
+		target_hist_field = target_hist_data->fields[j + n];
+
+		if (strcmp(hist_field->type, target_hist_field->type) != 0)
+			return false;
+		if (hist_field->size != target_hist_field->size)
+			return false;
+		if (hist_field->is_signed != target_hist_field->is_signed)
+			return false;
+	}
+
+	return true;
+}
+
+static struct hist_trigger_data *
+find_compatible_hist(struct hist_trigger_data *target_hist_data,
+		     struct trace_event_file *file)
+{
+	struct hist_trigger_data *hist_data;
+	struct event_trigger_data *test;
+	unsigned int n_keys;
+
+	n_keys = target_hist_data->n_fields - target_hist_data->n_vals;
+
+	list_for_each_entry_rcu(test, &file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			hist_data = test->private_data;
+
+			if (compatible_keys(target_hist_data, hist_data, n_keys))
+				return hist_data;
+		}
+	}
+
+	return NULL;
+}
+
+static struct trace_event_file *event_file(struct trace_array *tr,
+					   char *system, char *event_name)
+{
+	struct trace_event_file *file;
+
+	file = __find_event_file(tr, system, event_name);
+	if (!file)
+		return ERR_PTR(-EINVAL);
+
+	return file;
+}
+
+static struct hist_field *
+find_synthetic_field_var(struct hist_trigger_data *target_hist_data,
+			 char *system, char *event_name, char *field_name)
+{
+	struct hist_field *event_var;
+	char *synthetic_name;
+
+	synthetic_name = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
+	if (!synthetic_name)
+		return ERR_PTR(-ENOMEM);
+
+	strcpy(synthetic_name, "synthetic_");
+	strcat(synthetic_name, field_name);
+
+	event_var = find_event_var(target_hist_data, system, event_name, synthetic_name);
+
+	kfree(synthetic_name);
+
+	return event_var;
+}
+
+/**
+ * create_field_var_hist - Automatically create a histogram and var for a field
+ * @target_hist_data: The target hist trigger
+ * @subsys_name: Optional subsystem name
+ * @event_name: Optional event name
+ * @field_name: The name of the field (and the resulting variable)
+ *
+ * Hist trigger actions fetch data from variables, not directly from
+ * events.  However, for convenience, users are allowed to directly
+ * specify an event field in an action, which will be automatically
+ * converted into a variable on their behalf.
+
+ * If a user specifies a field on an event that isn't the event the
+ * histogram currently being defined (the target event histogram), the
+ * only way that can be accomplished is if a new hist trigger is
+ * created and the field variable defined on that.
+ *
+ * This function creates a new histogram compatible with the target
+ * event (meaning a histogram with the same key as the target
+ * histogram), and creates a variable for the specified field, but
+ * with 'synthetic_' prepended to the variable name in order to avoid
+ * collision with normal field variables.
+ *
+ * Return: The variable created for the field.
+ */
+static struct hist_field *
+create_field_var_hist(struct hist_trigger_data *target_hist_data,
+		      char *subsys_name, char *event_name, char *field_name)
+{
+	struct trace_array *tr = target_hist_data->event_file->tr;
+	struct hist_field *event_var = ERR_PTR(-EINVAL);
+	struct hist_trigger_data *hist_data;
+	unsigned int i, n, first = true;
+	struct field_var_hist *var_hist;
+	struct trace_event_file *file;
+	struct hist_field *key_field;
+	char *saved_filter;
+	char *cmd;
+	int ret;
+
+	if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) {
+		hist_err_event("onmatch: Too many field variables defined: ",
+			       subsys_name, event_name, field_name);
+		return ERR_PTR(-EINVAL);
+	}
+
+	file = event_file(tr, subsys_name, event_name);
+
+	if (IS_ERR(file)) {
+		hist_err_event("onmatch: Event file not found: ",
+			       subsys_name, event_name, field_name);
+		ret = PTR_ERR(file);
+		return ERR_PTR(ret);
+	}
+
+	/*
+	 * Look for a histogram compatible with target.  We'll use the
+	 * found histogram specification to create a new matching
+	 * histogram with our variable on it.  target_hist_data is not
+	 * yet a registered histogram so we can't use that.
+	 */
+	hist_data = find_compatible_hist(target_hist_data, file);
+	if (!hist_data) {
+		hist_err_event("onmatch: Matching event histogram not found: ",
+			       subsys_name, event_name, field_name);
+		return ERR_PTR(-EINVAL);
+	}
+
+	/* See if a synthetic field variable has already been created */
+	event_var = find_synthetic_field_var(target_hist_data, subsys_name,
+					     event_name, field_name);
+	if (!IS_ERR_OR_NULL(event_var))
+		return event_var;
+
+	var_hist = kzalloc(sizeof(*var_hist), GFP_KERNEL);
+	if (!var_hist)
+		return ERR_PTR(-ENOMEM);
+
+	cmd = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
+	if (!cmd) {
+		kfree(var_hist);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* Use the same keys as the compatible histogram */
+	strcat(cmd, "keys=");
+
+	for_each_hist_key_field(i, hist_data) {
+		key_field = hist_data->fields[i];
+		if (!first)
+			strcat(cmd, ",");
+		strcat(cmd, key_field->field->name);
+		first = false;
+	}
+
+	/* Create the synthetic field variable specification */
+	strcat(cmd, ":synthetic_");
+	strcat(cmd, field_name);
+	strcat(cmd, "=");
+	strcat(cmd, field_name);
+
+	/* Use the same filter as the compatible histogram */
+	saved_filter = find_trigger_filter(hist_data, file);
+	if (saved_filter) {
+		strcat(cmd, " if ");
+		strcat(cmd, saved_filter);
+	}
+
+	var_hist->cmd = kstrdup(cmd, GFP_KERNEL);
+	if (!var_hist->cmd) {
+		kfree(cmd);
+		kfree(var_hist);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* Save the compatible histogram information */
+	var_hist->hist_data = hist_data;
+
+	/* Create the new histogram with our variable */
+	ret = event_hist_trigger_func(&trigger_hist_cmd, file,
+				      "", "hist", cmd);
+	if (ret) {
+		kfree(cmd);
+		kfree(var_hist->cmd);
+		kfree(var_hist);
+		hist_err_event("onmatch: Couldn't create histogram for field: ",
+			       subsys_name, event_name, field_name);
+		return ERR_PTR(ret);
+	}
+
+	kfree(cmd);
+
+	/* If we can't find the variable, something went wrong */
+	event_var = find_synthetic_field_var(target_hist_data, subsys_name,
+					     event_name, field_name);
+	if (IS_ERR_OR_NULL(event_var)) {
+		kfree(var_hist->cmd);
+		kfree(var_hist);
+		hist_err_event("onmatch: Couldn't find synthetic variable: ",
+			       subsys_name, event_name, field_name);
+		return ERR_PTR(-EINVAL);
+	}
+
+	n = target_hist_data->n_field_var_hists;
+	target_hist_data->field_var_hists[n] = var_hist;
+	target_hist_data->n_field_var_hists++;
+
+	return event_var;
+}
+
+static struct hist_field *
+find_target_event_var(struct hist_trigger_data *hist_data,
+		      char *subsys_name, char *event_name, char *var_name)
+{
+	struct trace_event_file *file = hist_data->event_file;
+	struct hist_field *hist_field = NULL;
+
+	if (subsys_name) {
+		struct trace_event_call *call;
+
+		if (!event_name)
+			return NULL;
+
+		call = file->event_call;
+
+		if (strcmp(subsys_name, call->class->system) != 0)
+			return NULL;
+
+		if (strcmp(event_name, trace_event_name(call)) != 0)
+			return NULL;
+	}
+
+	hist_field = find_var_field(hist_data, var_name);
+
+	return hist_field;
+}
+
+static inline void __update_field_vars(struct tracing_map_elt *elt,
+				       struct ring_buffer_event *rbe,
+				       void *rec,
+				       struct field_var **field_vars,
+				       unsigned int n_field_vars,
+				       unsigned int field_var_str_start)
+{
+	struct hist_elt_data *elt_data = elt->private_data;
+	unsigned int i, j, var_idx;
+	u64 var_val;
+
+	for (i = 0, j = field_var_str_start; i < n_field_vars; i++) {
+		struct field_var *field_var = field_vars[i];
+		struct hist_field *var = field_var->var;
+		struct hist_field *val = field_var->val;
+
+		var_val = val->fn(val, elt, rbe, rec);
+		var_idx = var->var.idx;
+
+		if (val->flags & HIST_FIELD_FL_STRING) {
+			char *str = elt_data->field_var_str[j++];
+			char *val_str = (char *)(uintptr_t)var_val;
+
+			strscpy(str, val_str, STR_VAR_LEN_MAX);
+			var_val = (u64)(uintptr_t)str;
+		}
+		tracing_map_set_var(elt, var_idx, var_val);
+	}
+}
+
+static void update_field_vars(struct hist_trigger_data *hist_data,
+			      struct tracing_map_elt *elt,
+			      struct ring_buffer_event *rbe,
+			      void *rec)
+{
+	__update_field_vars(elt, rbe, rec, hist_data->field_vars,
+			    hist_data->n_field_vars, 0);
+}
+
+static void update_max_vars(struct hist_trigger_data *hist_data,
+			    struct tracing_map_elt *elt,
+			    struct ring_buffer_event *rbe,
+			    void *rec)
+{
+	__update_field_vars(elt, rbe, rec, hist_data->max_vars,
+			    hist_data->n_max_vars, hist_data->n_field_var_str);
+}
+
+static struct hist_field *create_var(struct hist_trigger_data *hist_data,
+				     struct trace_event_file *file,
+				     char *name, int size, const char *type)
+{
+	struct hist_field *var;
+	int idx;
+
+	if (find_var(hist_data, file, name) && !hist_data->remove) {
+		var = ERR_PTR(-EINVAL);
+		goto out;
+	}
+
+	var = kzalloc(sizeof(struct hist_field), GFP_KERNEL);
+	if (!var) {
+		var = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	idx = tracing_map_add_var(hist_data->map);
+	if (idx < 0) {
+		kfree(var);
+		var = ERR_PTR(-EINVAL);
+		goto out;
+	}
+
+	var->flags = HIST_FIELD_FL_VAR;
+	var->var.idx = idx;
+	var->var.hist_data = var->hist_data = hist_data;
+	var->size = size;
+	var->var.name = kstrdup(name, GFP_KERNEL);
+	var->type = kstrdup(type, GFP_KERNEL);
+	if (!var->var.name || !var->type) {
+		kfree(var->var.name);
+		kfree(var->type);
+		kfree(var);
+		var = ERR_PTR(-ENOMEM);
+	}
+ out:
+	return var;
+}
+
+static struct field_var *create_field_var(struct hist_trigger_data *hist_data,
+					  struct trace_event_file *file,
+					  char *field_name)
+{
+	struct hist_field *val = NULL, *var = NULL;
+	unsigned long flags = HIST_FIELD_FL_VAR;
+	struct field_var *field_var;
 	int ret = 0;
 
-	if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX))
+	if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) {
+		hist_err("Too many field variables defined: ", field_name);
+		ret = -EINVAL;
+		goto err;
+	}
+
+	val = parse_atom(hist_data, file, field_name, &flags, NULL);
+	if (IS_ERR(val)) {
+		hist_err("Couldn't parse field variable: ", field_name);
+		ret = PTR_ERR(val);
+		goto err;
+	}
+
+	var = create_var(hist_data, file, field_name, val->size, val->type);
+	if (IS_ERR(var)) {
+		hist_err("Couldn't create or find variable: ", field_name);
+		kfree(val);
+		ret = PTR_ERR(var);
+		goto err;
+	}
+
+	field_var = kzalloc(sizeof(struct field_var), GFP_KERNEL);
+	if (!field_var) {
+		kfree(val);
+		kfree(var);
+		ret =  -ENOMEM;
+		goto err;
+	}
+
+	field_var->var = var;
+	field_var->val = val;
+ out:
+	return field_var;
+ err:
+	field_var = ERR_PTR(ret);
+	goto out;
+}
+
+/**
+ * create_target_field_var - Automatically create a variable for a field
+ * @target_hist_data: The target hist trigger
+ * @subsys_name: Optional subsystem name
+ * @event_name: Optional event name
+ * @var_name: The name of the field (and the resulting variable)
+ *
+ * Hist trigger actions fetch data from variables, not directly from
+ * events.  However, for convenience, users are allowed to directly
+ * specify an event field in an action, which will be automatically
+ * converted into a variable on their behalf.
+
+ * This function creates a field variable with the name var_name on
+ * the hist trigger currently being defined on the target event.  If
+ * subsys_name and event_name are specified, this function simply
+ * verifies that they do in fact match the target event subsystem and
+ * event name.
+ *
+ * Return: The variable created for the field.
+ */
+static struct field_var *
+create_target_field_var(struct hist_trigger_data *target_hist_data,
+			char *subsys_name, char *event_name, char *var_name)
+{
+	struct trace_event_file *file = target_hist_data->event_file;
+
+	if (subsys_name) {
+		struct trace_event_call *call;
+
+		if (!event_name)
+			return NULL;
+
+		call = file->event_call;
+
+		if (strcmp(subsys_name, call->class->system) != 0)
+			return NULL;
+
+		if (strcmp(event_name, trace_event_name(call)) != 0)
+			return NULL;
+	}
+
+	return create_field_var(target_hist_data, file, var_name);
+}
+
+static void onmax_print(struct seq_file *m,
+			struct hist_trigger_data *hist_data,
+			struct tracing_map_elt *elt,
+			struct action_data *data)
+{
+	unsigned int i, save_var_idx, max_idx = data->onmax.max_var->var.idx;
+
+	seq_printf(m, "\n\tmax: %10llu", tracing_map_read_var(elt, max_idx));
+
+	for (i = 0; i < hist_data->n_max_vars; i++) {
+		struct hist_field *save_val = hist_data->max_vars[i]->val;
+		struct hist_field *save_var = hist_data->max_vars[i]->var;
+		u64 val;
+
+		save_var_idx = save_var->var.idx;
+
+		val = tracing_map_read_var(elt, save_var_idx);
+
+		if (save_val->flags & HIST_FIELD_FL_STRING) {
+			seq_printf(m, "  %s: %-32s", save_var->var.name,
+				   (char *)(uintptr_t)(val));
+		} else
+			seq_printf(m, "  %s: %10llu", save_var->var.name, val);
+	}
+}
+
+static void onmax_save(struct hist_trigger_data *hist_data,
+		       struct tracing_map_elt *elt, void *rec,
+		       struct ring_buffer_event *rbe,
+		       struct action_data *data, u64 *var_ref_vals)
+{
+	unsigned int max_idx = data->onmax.max_var->var.idx;
+	unsigned int max_var_ref_idx = data->onmax.max_var_ref_idx;
+
+	u64 var_val, max_val;
+
+	var_val = var_ref_vals[max_var_ref_idx];
+	max_val = tracing_map_read_var(elt, max_idx);
+
+	if (var_val <= max_val)
+		return;
+
+	tracing_map_set_var(elt, max_idx, var_val);
+
+	update_max_vars(hist_data, elt, rbe, rec);
+}
+
+static void onmax_destroy(struct action_data *data)
+{
+	unsigned int i;
+
+	destroy_hist_field(data->onmax.max_var, 0);
+	destroy_hist_field(data->onmax.var, 0);
+
+	kfree(data->onmax.var_str);
+	kfree(data->onmax.fn_name);
+
+	for (i = 0; i < data->n_params; i++)
+		kfree(data->params[i]);
+
+	kfree(data);
+}
+
+static int onmax_create(struct hist_trigger_data *hist_data,
+			struct action_data *data)
+{
+	struct trace_event_file *file = hist_data->event_file;
+	struct hist_field *var_field, *ref_field, *max_var;
+	unsigned int var_ref_idx = hist_data->n_var_refs;
+	struct field_var *field_var;
+	char *onmax_var_str, *param;
+	unsigned long flags;
+	unsigned int i;
+	int ret = 0;
+
+	onmax_var_str = data->onmax.var_str;
+	if (onmax_var_str[0] != '$') {
+		hist_err("onmax: For onmax(x), x must be a variable: ", onmax_var_str);
 		return -EINVAL;
+	}
+	onmax_var_str++;
 
-	field_name = strsep(&field_str, ".");
-	if (field_str) {
-		if (strcmp(field_str, "hex") == 0)
-			flags |= HIST_FIELD_FL_HEX;
-		else {
+	var_field = find_target_event_var(hist_data, NULL, NULL, onmax_var_str);
+	if (!var_field) {
+		hist_err("onmax: Couldn't find onmax variable: ", onmax_var_str);
+		return -EINVAL;
+	}
+
+	flags = HIST_FIELD_FL_VAR_REF;
+	ref_field = create_hist_field(hist_data, NULL, flags, NULL);
+	if (!ref_field)
+		return -ENOMEM;
+
+	if (init_var_ref(ref_field, var_field, NULL, NULL)) {
+		destroy_hist_field(ref_field, 0);
+		ret = -ENOMEM;
+		goto out;
+	}
+	hist_data->var_refs[hist_data->n_var_refs] = ref_field;
+	ref_field->var_ref_idx = hist_data->n_var_refs++;
+	data->onmax.var = ref_field;
+
+	data->fn = onmax_save;
+	data->onmax.max_var_ref_idx = var_ref_idx;
+	max_var = create_var(hist_data, file, "max", sizeof(u64), "u64");
+	if (IS_ERR(max_var)) {
+		hist_err("onmax: Couldn't create onmax variable: ", "max");
+		ret = PTR_ERR(max_var);
+		goto out;
+	}
+	data->onmax.max_var = max_var;
+
+	for (i = 0; i < data->n_params; i++) {
+		param = kstrdup(data->params[i], GFP_KERNEL);
+		if (!param) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		field_var = create_target_field_var(hist_data, NULL, NULL, param);
+		if (IS_ERR(field_var)) {
+			hist_err("onmax: Couldn't create field variable: ", param);
+			ret = PTR_ERR(field_var);
+			kfree(param);
+			goto out;
+		}
+
+		hist_data->max_vars[hist_data->n_max_vars++] = field_var;
+		if (field_var->val->flags & HIST_FIELD_FL_STRING)
+			hist_data->n_max_var_str++;
+
+		kfree(param);
+	}
+ out:
+	return ret;
+}
+
+static int parse_action_params(char *params, struct action_data *data)
+{
+	char *param, *saved_param;
+	int ret = 0;
+
+	while (params) {
+		if (data->n_params >= SYNTH_FIELDS_MAX)
+			goto out;
+
+		param = strsep(&params, ",");
+		if (!param) {
 			ret = -EINVAL;
 			goto out;
 		}
+
+		param = strstrip(param);
+		if (strlen(param) < 2) {
+			hist_err("Invalid action param: ", param);
+			ret = -EINVAL;
+			goto out;
+		}
+
+		saved_param = kstrdup(param, GFP_KERNEL);
+		if (!saved_param) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		data->params[data->n_params++] = saved_param;
 	}
+ out:
+	return ret;
+}
 
-	field = trace_find_event_field(file->event_call, field_name);
-	if (!field || !field->size) {
+static struct action_data *onmax_parse(char *str)
+{
+	char *onmax_fn_name, *onmax_var_str;
+	struct action_data *data;
+	int ret = -EINVAL;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return ERR_PTR(-ENOMEM);
+
+	onmax_var_str = strsep(&str, ")");
+	if (!onmax_var_str || !str) {
 		ret = -EINVAL;
-		goto out;
+		goto free;
+	}
+
+	data->onmax.var_str = kstrdup(onmax_var_str, GFP_KERNEL);
+	if (!data->onmax.var_str) {
+		ret = -ENOMEM;
+		goto free;
+	}
+
+	strsep(&str, ".");
+	if (!str)
+		goto free;
+
+	onmax_fn_name = strsep(&str, "(");
+	if (!onmax_fn_name || !str)
+		goto free;
+
+	if (strncmp(onmax_fn_name, "save", strlen("save")) == 0) {
+		char *params = strsep(&str, ")");
+
+		if (!params) {
+			ret = -EINVAL;
+			goto free;
+		}
+
+		ret = parse_action_params(params, data);
+		if (ret)
+			goto free;
+	} else
+		goto free;
+
+	data->onmax.fn_name = kstrdup(onmax_fn_name, GFP_KERNEL);
+	if (!data->onmax.fn_name) {
+		ret = -ENOMEM;
+		goto free;
+	}
+ out:
+	return data;
+ free:
+	onmax_destroy(data);
+	data = ERR_PTR(ret);
+	goto out;
+}
+
+static void onmatch_destroy(struct action_data *data)
+{
+	unsigned int i;
+
+	mutex_lock(&synth_event_mutex);
+
+	kfree(data->onmatch.match_event);
+	kfree(data->onmatch.match_event_system);
+	kfree(data->onmatch.synth_event_name);
+
+	for (i = 0; i < data->n_params; i++)
+		kfree(data->params[i]);
+
+	if (data->onmatch.synth_event)
+		data->onmatch.synth_event->ref--;
+
+	kfree(data);
+
+	mutex_unlock(&synth_event_mutex);
+}
+
+static void destroy_field_var(struct field_var *field_var)
+{
+	if (!field_var)
+		return;
+
+	destroy_hist_field(field_var->var, 0);
+	destroy_hist_field(field_var->val, 0);
+
+	kfree(field_var);
+}
+
+static void destroy_field_vars(struct hist_trigger_data *hist_data)
+{
+	unsigned int i;
+
+	for (i = 0; i < hist_data->n_field_vars; i++)
+		destroy_field_var(hist_data->field_vars[i]);
+}
+
+static void save_field_var(struct hist_trigger_data *hist_data,
+			   struct field_var *field_var)
+{
+	hist_data->field_vars[hist_data->n_field_vars++] = field_var;
+
+	if (field_var->val->flags & HIST_FIELD_FL_STRING)
+		hist_data->n_field_var_str++;
+}
+
+
+static void destroy_synth_var_refs(struct hist_trigger_data *hist_data)
+{
+	unsigned int i;
+
+	for (i = 0; i < hist_data->n_synth_var_refs; i++)
+		destroy_hist_field(hist_data->synth_var_refs[i], 0);
+}
+
+static void save_synth_var_ref(struct hist_trigger_data *hist_data,
+			 struct hist_field *var_ref)
+{
+	hist_data->synth_var_refs[hist_data->n_synth_var_refs++] = var_ref;
+
+	hist_data->var_refs[hist_data->n_var_refs] = var_ref;
+	var_ref->var_ref_idx = hist_data->n_var_refs++;
+}
+
+static int check_synth_field(struct synth_event *event,
+			     struct hist_field *hist_field,
+			     unsigned int field_pos)
+{
+	struct synth_field *field;
+
+	if (field_pos >= event->n_fields)
+		return -EINVAL;
+
+	field = event->fields[field_pos];
+
+	if (strcmp(field->type, hist_field->type) != 0)
+		return -EINVAL;
+
+	return 0;
+}
+
+static struct hist_field *
+onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data,
+		 char *system, char *event, char *var)
+{
+	struct hist_field *hist_field;
+
+	var++; /* skip '$' */
+
+	hist_field = find_target_event_var(hist_data, system, event, var);
+	if (!hist_field) {
+		if (!system) {
+			system = data->onmatch.match_event_system;
+			event = data->onmatch.match_event;
+		}
+
+		hist_field = find_event_var(hist_data, system, event, var);
+	}
+
+	if (!hist_field)
+		hist_err_event("onmatch: Couldn't find onmatch param: $", system, event, var);
+
+	return hist_field;
+}
+
+static struct hist_field *
+onmatch_create_field_var(struct hist_trigger_data *hist_data,
+			 struct action_data *data, char *system,
+			 char *event, char *var)
+{
+	struct hist_field *hist_field = NULL;
+	struct field_var *field_var;
+
+	/*
+	 * First try to create a field var on the target event (the
+	 * currently being defined).  This will create a variable for
+	 * unqualified fields on the target event, or if qualified,
+	 * target fields that have qualified names matching the target.
+	 */
+	field_var = create_target_field_var(hist_data, system, event, var);
+
+	if (field_var && !IS_ERR(field_var)) {
+		save_field_var(hist_data, field_var);
+		hist_field = field_var->var;
+	} else {
+		field_var = NULL;
+		/*
+		 * If no explicit system.event is specfied, default to
+		 * looking for fields on the onmatch(system.event.xxx)
+		 * event.
+		 */
+		if (!system) {
+			system = data->onmatch.match_event_system;
+			event = data->onmatch.match_event;
+		}
+
+		/*
+		 * At this point, we're looking at a field on another
+		 * event.  Because we can't modify a hist trigger on
+		 * another event to add a variable for a field, we need
+		 * to create a new trigger on that event and create the
+		 * variable at the same time.
+		 */
+		hist_field = create_field_var_hist(hist_data, system, event, var);
+		if (IS_ERR(hist_field))
+			goto free;
+	}
+ out:
+	return hist_field;
+ free:
+	destroy_field_var(field_var);
+	hist_field = NULL;
+	goto out;
+}
+
+static int onmatch_create(struct hist_trigger_data *hist_data,
+			  struct trace_event_file *file,
+			  struct action_data *data)
+{
+	char *event_name, *param, *system = NULL;
+	struct hist_field *hist_field, *var_ref;
+	unsigned int i, var_ref_idx;
+	unsigned int field_pos = 0;
+	struct synth_event *event;
+	int ret = 0;
+
+	mutex_lock(&synth_event_mutex);
+	event = find_synth_event(data->onmatch.synth_event_name);
+	if (!event) {
+		hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name);
+		mutex_unlock(&synth_event_mutex);
+		return -EINVAL;
+	}
+	event->ref++;
+	mutex_unlock(&synth_event_mutex);
+
+	var_ref_idx = hist_data->n_var_refs;
+
+	for (i = 0; i < data->n_params; i++) {
+		char *p;
+
+		p = param = kstrdup(data->params[i], GFP_KERNEL);
+		if (!param) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		system = strsep(&param, ".");
+		if (!param) {
+			param = (char *)system;
+			system = event_name = NULL;
+		} else {
+			event_name = strsep(&param, ".");
+			if (!param) {
+				kfree(p);
+				ret = -EINVAL;
+				goto err;
+			}
+		}
+
+		if (param[0] == '$')
+			hist_field = onmatch_find_var(hist_data, data, system,
+						      event_name, param);
+		else
+			hist_field = onmatch_create_field_var(hist_data, data,
+							      system,
+							      event_name,
+							      param);
+
+		if (!hist_field) {
+			kfree(p);
+			ret = -EINVAL;
+			goto err;
+		}
+
+		if (check_synth_field(event, hist_field, field_pos) == 0) {
+			var_ref = create_var_ref(hist_field, system, event_name);
+			if (!var_ref) {
+				kfree(p);
+				ret = -ENOMEM;
+				goto err;
+			}
+
+			save_synth_var_ref(hist_data, var_ref);
+			field_pos++;
+			kfree(p);
+			continue;
+		}
+
+		hist_err_event("onmatch: Param type doesn't match synthetic event field type: ",
+			       system, event_name, param);
+		kfree(p);
+		ret = -EINVAL;
+		goto err;
+	}
+
+	if (field_pos != event->n_fields) {
+		hist_err("onmatch: Param count doesn't match synthetic event field count: ", event->name);
+		ret = -EINVAL;
+		goto err;
+	}
+
+	data->fn = action_trace;
+	data->onmatch.synth_event = event;
+	data->onmatch.var_ref_idx = var_ref_idx;
+ out:
+	return ret;
+ err:
+	mutex_lock(&synth_event_mutex);
+	event->ref--;
+	mutex_unlock(&synth_event_mutex);
+
+	goto out;
+}
+
+static struct action_data *onmatch_parse(struct trace_array *tr, char *str)
+{
+	char *match_event, *match_event_system;
+	char *synth_event_name, *params;
+	struct action_data *data;
+	int ret = -EINVAL;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return ERR_PTR(-ENOMEM);
+
+	match_event = strsep(&str, ")");
+	if (!match_event || !str) {
+		hist_err("onmatch: Missing closing paren: ", match_event);
+		goto free;
+	}
+
+	match_event_system = strsep(&match_event, ".");
+	if (!match_event) {
+		hist_err("onmatch: Missing subsystem for match event: ", match_event_system);
+		goto free;
+	}
+
+	if (IS_ERR(event_file(tr, match_event_system, match_event))) {
+		hist_err_event("onmatch: Invalid subsystem or event name: ",
+			       match_event_system, match_event, NULL);
+		goto free;
+	}
+
+	data->onmatch.match_event = kstrdup(match_event, GFP_KERNEL);
+	if (!data->onmatch.match_event) {
+		ret = -ENOMEM;
+		goto free;
+	}
+
+	data->onmatch.match_event_system = kstrdup(match_event_system, GFP_KERNEL);
+	if (!data->onmatch.match_event_system) {
+		ret = -ENOMEM;
+		goto free;
+	}
+
+	strsep(&str, ".");
+	if (!str) {
+		hist_err("onmatch: Missing . after onmatch(): ", str);
+		goto free;
+	}
+
+	synth_event_name = strsep(&str, "(");
+	if (!synth_event_name || !str) {
+		hist_err("onmatch: Missing opening paramlist paren: ", synth_event_name);
+		goto free;
 	}
 
-	hist_data->fields[val_idx] = create_hist_field(field, flags);
-	if (!hist_data->fields[val_idx]) {
+	data->onmatch.synth_event_name = kstrdup(synth_event_name, GFP_KERNEL);
+	if (!data->onmatch.synth_event_name) {
 		ret = -ENOMEM;
+		goto free;
+	}
+
+	params = strsep(&str, ")");
+	if (!params || !str || (str && strlen(str))) {
+		hist_err("onmatch: Missing closing paramlist paren: ", params);
+		goto free;
+	}
+
+	ret = parse_action_params(params, data);
+	if (ret)
+		goto free;
+ out:
+	return data;
+ free:
+	onmatch_destroy(data);
+	data = ERR_PTR(ret);
+	goto out;
+}
+
+static int create_hitcount_val(struct hist_trigger_data *hist_data)
+{
+	hist_data->fields[HITCOUNT_IDX] =
+		create_hist_field(hist_data, NULL, HIST_FIELD_FL_HITCOUNT, NULL);
+	if (!hist_data->fields[HITCOUNT_IDX])
+		return -ENOMEM;
+
+	hist_data->n_vals++;
+	hist_data->n_fields++;
+
+	if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int __create_val_field(struct hist_trigger_data *hist_data,
+			      unsigned int val_idx,
+			      struct trace_event_file *file,
+			      char *var_name, char *field_str,
+			      unsigned long flags)
+{
+	struct hist_field *hist_field;
+	int ret = 0;
+
+	hist_field = parse_expr(hist_data, file, field_str, flags, var_name, 0);
+	if (IS_ERR(hist_field)) {
+		ret = PTR_ERR(hist_field);
 		goto out;
 	}
 
+	hist_data->fields[val_idx] = hist_field;
+
 	++hist_data->n_vals;
+	++hist_data->n_fields;
 
-	if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
+	if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
 		ret = -EINVAL;
  out:
 	return ret;
 }
 
+static int create_val_field(struct hist_trigger_data *hist_data,
+			    unsigned int val_idx,
+			    struct trace_event_file *file,
+			    char *field_str)
+{
+	if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX))
+		return -EINVAL;
+
+	return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0);
+}
+
+static int create_var_field(struct hist_trigger_data *hist_data,
+			    unsigned int val_idx,
+			    struct trace_event_file *file,
+			    char *var_name, char *expr_str)
+{
+	unsigned long flags = 0;
+
+	if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
+		return -EINVAL;
+
+	if (find_var(hist_data, file, var_name) && !hist_data->remove) {
+		hist_err("Variable already defined: ", var_name);
+		return -EINVAL;
+	}
+
+	flags |= HIST_FIELD_FL_VAR;
+	hist_data->n_vars++;
+	if (WARN_ON(hist_data->n_vars > TRACING_MAP_VARS_MAX))
+		return -EINVAL;
+
+	return __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags);
+}
+
 static int create_val_fields(struct hist_trigger_data *hist_data,
 			     struct trace_event_file *file)
 {
 	char *fields_str, *field_str;
-	unsigned int i, j;
+	unsigned int i, j = 1;
 	int ret;
 
 	ret = create_hitcount_val(hist_data);
@@ -533,12 +3912,15 @@ static int create_val_fields(struct hist_trigger_data *hist_data,
 		field_str = strsep(&fields_str, ",");
 		if (!field_str)
 			break;
+
 		if (strcmp(field_str, "hitcount") == 0)
 			continue;
+
 		ret = create_val_field(hist_data, j++, file, field_str);
 		if (ret)
 			goto out;
 	}
+
 	if (fields_str && (strcmp(fields_str, "hitcount") != 0))
 		ret = -EINVAL;
  out:
@@ -551,12 +3933,13 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 			    struct trace_event_file *file,
 			    char *field_str)
 {
-	struct ftrace_event_field *field = NULL;
+	struct hist_field *hist_field = NULL;
+
 	unsigned long flags = 0;
 	unsigned int key_size;
 	int ret = 0;
 
-	if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX))
+	if (WARN_ON(key_idx >= HIST_FIELDS_MAX))
 		return -EINVAL;
 
 	flags |= HIST_FIELD_FL_KEY;
@@ -564,57 +3947,40 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 	if (strcmp(field_str, "stacktrace") == 0) {
 		flags |= HIST_FIELD_FL_STACKTRACE;
 		key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH;
+		hist_field = create_hist_field(hist_data, NULL, flags, NULL);
 	} else {
-		char *field_name = strsep(&field_str, ".");
-
-		if (field_str) {
-			if (strcmp(field_str, "hex") == 0)
-				flags |= HIST_FIELD_FL_HEX;
-			else if (strcmp(field_str, "sym") == 0)
-				flags |= HIST_FIELD_FL_SYM;
-			else if (strcmp(field_str, "sym-offset") == 0)
-				flags |= HIST_FIELD_FL_SYM_OFFSET;
-			else if ((strcmp(field_str, "execname") == 0) &&
-				 (strcmp(field_name, "common_pid") == 0))
-				flags |= HIST_FIELD_FL_EXECNAME;
-			else if (strcmp(field_str, "syscall") == 0)
-				flags |= HIST_FIELD_FL_SYSCALL;
-			else if (strcmp(field_str, "log2") == 0)
-				flags |= HIST_FIELD_FL_LOG2;
-			else {
-				ret = -EINVAL;
-				goto out;
-			}
+		hist_field = parse_expr(hist_data, file, field_str, flags,
+					NULL, 0);
+		if (IS_ERR(hist_field)) {
+			ret = PTR_ERR(hist_field);
+			goto out;
 		}
 
-		field = trace_find_event_field(file->event_call, field_name);
-		if (!field || !field->size) {
+		if (hist_field->flags & HIST_FIELD_FL_VAR_REF) {
+			hist_err("Using variable references as keys not supported: ", field_str);
+			destroy_hist_field(hist_field, 0);
 			ret = -EINVAL;
 			goto out;
 		}
 
-		if (is_string_field(field))
-			key_size = MAX_FILTER_STR_VAL;
-		else
-			key_size = field->size;
+		key_size = hist_field->size;
 	}
 
-	hist_data->fields[key_idx] = create_hist_field(field, flags);
-	if (!hist_data->fields[key_idx]) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	hist_data->fields[key_idx] = hist_field;
 
 	key_size = ALIGN(key_size, sizeof(u64));
 	hist_data->fields[key_idx]->size = key_size;
 	hist_data->fields[key_idx]->offset = key_offset;
+
 	hist_data->key_size += key_size;
+
 	if (hist_data->key_size > HIST_KEY_SIZE_MAX) {
 		ret = -EINVAL;
 		goto out;
 	}
 
 	hist_data->n_keys++;
+	hist_data->n_fields++;
 
 	if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX))
 		return -EINVAL;
@@ -658,21 +4024,113 @@ static int create_key_fields(struct hist_trigger_data *hist_data,
 	return ret;
 }
 
+static int create_var_fields(struct hist_trigger_data *hist_data,
+			     struct trace_event_file *file)
+{
+	unsigned int i, j = hist_data->n_vals;
+	int ret = 0;
+
+	unsigned int n_vars = hist_data->attrs->var_defs.n_vars;
+
+	for (i = 0; i < n_vars; i++) {
+		char *var_name = hist_data->attrs->var_defs.name[i];
+		char *expr = hist_data->attrs->var_defs.expr[i];
+
+		ret = create_var_field(hist_data, j++, file, var_name, expr);
+		if (ret)
+			goto out;
+	}
+ out:
+	return ret;
+}
+
+static void free_var_defs(struct hist_trigger_data *hist_data)
+{
+	unsigned int i;
+
+	for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) {
+		kfree(hist_data->attrs->var_defs.name[i]);
+		kfree(hist_data->attrs->var_defs.expr[i]);
+	}
+
+	hist_data->attrs->var_defs.n_vars = 0;
+}
+
+static int parse_var_defs(struct hist_trigger_data *hist_data)
+{
+	char *s, *str, *var_name, *field_str;
+	unsigned int i, j, n_vars = 0;
+	int ret = 0;
+
+	for (i = 0; i < hist_data->attrs->n_assignments; i++) {
+		str = hist_data->attrs->assignment_str[i];
+		for (j = 0; j < TRACING_MAP_VARS_MAX; j++) {
+			field_str = strsep(&str, ",");
+			if (!field_str)
+				break;
+
+			var_name = strsep(&field_str, "=");
+			if (!var_name || !field_str) {
+				hist_err("Malformed assignment: ", var_name);
+				ret = -EINVAL;
+				goto free;
+			}
+
+			if (n_vars == TRACING_MAP_VARS_MAX) {
+				hist_err("Too many variables defined: ", var_name);
+				ret = -EINVAL;
+				goto free;
+			}
+
+			s = kstrdup(var_name, GFP_KERNEL);
+			if (!s) {
+				ret = -ENOMEM;
+				goto free;
+			}
+			hist_data->attrs->var_defs.name[n_vars] = s;
+
+			s = kstrdup(field_str, GFP_KERNEL);
+			if (!s) {
+				kfree(hist_data->attrs->var_defs.name[n_vars]);
+				ret = -ENOMEM;
+				goto free;
+			}
+			hist_data->attrs->var_defs.expr[n_vars++] = s;
+
+			hist_data->attrs->var_defs.n_vars = n_vars;
+		}
+	}
+
+	return ret;
+ free:
+	free_var_defs(hist_data);
+
+	return ret;
+}
+
 static int create_hist_fields(struct hist_trigger_data *hist_data,
 			      struct trace_event_file *file)
 {
 	int ret;
 
+	ret = parse_var_defs(hist_data);
+	if (ret)
+		goto out;
+
 	ret = create_val_fields(hist_data, file);
 	if (ret)
 		goto out;
 
-	ret = create_key_fields(hist_data, file);
+	ret = create_var_fields(hist_data, file);
 	if (ret)
 		goto out;
 
-	hist_data->n_fields = hist_data->n_vals + hist_data->n_keys;
+	ret = create_key_fields(hist_data, file);
+	if (ret)
+		goto out;
  out:
+	free_var_defs(hist_data);
+
 	return ret;
 }
 
@@ -695,7 +4153,7 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
 	char *fields_str = hist_data->attrs->sort_key_str;
 	struct tracing_map_sort_key *sort_key;
 	int descending, ret = 0;
-	unsigned int i, j;
+	unsigned int i, j, k;
 
 	hist_data->n_sort_keys = 1; /* we always have at least one, hitcount */
 
@@ -743,12 +4201,19 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
 			continue;
 		}
 
-		for (j = 1; j < hist_data->n_fields; j++) {
+		for (j = 1, k = 1; j < hist_data->n_fields; j++) {
+			unsigned int idx;
+
 			hist_field = hist_data->fields[j];
+			if (hist_field->flags & HIST_FIELD_FL_VAR)
+				continue;
+
+			idx = k++;
+
 			test_name = hist_field_name(hist_field, 0);
 
 			if (strcmp(field_name, test_name) == 0) {
-				sort_key->field_idx = j;
+				sort_key->field_idx = idx;
 				descending = is_descending(field_str);
 				if (descending < 0) {
 					ret = descending;
@@ -763,16 +4228,230 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
 			break;
 		}
 	}
+
 	hist_data->n_sort_keys = i;
  out:
 	return ret;
 }
 
+static void destroy_actions(struct hist_trigger_data *hist_data)
+{
+	unsigned int i;
+
+	for (i = 0; i < hist_data->n_actions; i++) {
+		struct action_data *data = hist_data->actions[i];
+
+		if (data->fn == action_trace)
+			onmatch_destroy(data);
+		else if (data->fn == onmax_save)
+			onmax_destroy(data);
+		else
+			kfree(data);
+	}
+}
+
+static int parse_actions(struct hist_trigger_data *hist_data)
+{
+	struct trace_array *tr = hist_data->event_file->tr;
+	struct action_data *data;
+	unsigned int i;
+	int ret = 0;
+	char *str;
+
+	for (i = 0; i < hist_data->attrs->n_actions; i++) {
+		str = hist_data->attrs->action_str[i];
+
+		if (strncmp(str, "onmatch(", strlen("onmatch(")) == 0) {
+			char *action_str = str + strlen("onmatch(");
+
+			data = onmatch_parse(tr, action_str);
+			if (IS_ERR(data)) {
+				ret = PTR_ERR(data);
+				break;
+			}
+			data->fn = action_trace;
+		} else if (strncmp(str, "onmax(", strlen("onmax(")) == 0) {
+			char *action_str = str + strlen("onmax(");
+
+			data = onmax_parse(action_str);
+			if (IS_ERR(data)) {
+				ret = PTR_ERR(data);
+				break;
+			}
+			data->fn = onmax_save;
+		} else {
+			ret = -EINVAL;
+			break;
+		}
+
+		hist_data->actions[hist_data->n_actions++] = data;
+	}
+
+	return ret;
+}
+
+static int create_actions(struct hist_trigger_data *hist_data,
+			  struct trace_event_file *file)
+{
+	struct action_data *data;
+	unsigned int i;
+	int ret = 0;
+
+	for (i = 0; i < hist_data->attrs->n_actions; i++) {
+		data = hist_data->actions[i];
+
+		if (data->fn == action_trace) {
+			ret = onmatch_create(hist_data, file, data);
+			if (ret)
+				return ret;
+		} else if (data->fn == onmax_save) {
+			ret = onmax_create(hist_data, data);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return ret;
+}
+
+static void print_actions(struct seq_file *m,
+			  struct hist_trigger_data *hist_data,
+			  struct tracing_map_elt *elt)
+{
+	unsigned int i;
+
+	for (i = 0; i < hist_data->n_actions; i++) {
+		struct action_data *data = hist_data->actions[i];
+
+		if (data->fn == onmax_save)
+			onmax_print(m, hist_data, elt, data);
+	}
+}
+
+static void print_onmax_spec(struct seq_file *m,
+			     struct hist_trigger_data *hist_data,
+			     struct action_data *data)
+{
+	unsigned int i;
+
+	seq_puts(m, ":onmax(");
+	seq_printf(m, "%s", data->onmax.var_str);
+	seq_printf(m, ").%s(", data->onmax.fn_name);
+
+	for (i = 0; i < hist_data->n_max_vars; i++) {
+		seq_printf(m, "%s", hist_data->max_vars[i]->var->var.name);
+		if (i < hist_data->n_max_vars - 1)
+			seq_puts(m, ",");
+	}
+	seq_puts(m, ")");
+}
+
+static void print_onmatch_spec(struct seq_file *m,
+			       struct hist_trigger_data *hist_data,
+			       struct action_data *data)
+{
+	unsigned int i;
+
+	seq_printf(m, ":onmatch(%s.%s).", data->onmatch.match_event_system,
+		   data->onmatch.match_event);
+
+	seq_printf(m, "%s(", data->onmatch.synth_event->name);
+
+	for (i = 0; i < data->n_params; i++) {
+		if (i)
+			seq_puts(m, ",");
+		seq_printf(m, "%s", data->params[i]);
+	}
+
+	seq_puts(m, ")");
+}
+
+static bool actions_match(struct hist_trigger_data *hist_data,
+			  struct hist_trigger_data *hist_data_test)
+{
+	unsigned int i, j;
+
+	if (hist_data->n_actions != hist_data_test->n_actions)
+		return false;
+
+	for (i = 0; i < hist_data->n_actions; i++) {
+		struct action_data *data = hist_data->actions[i];
+		struct action_data *data_test = hist_data_test->actions[i];
+
+		if (data->fn != data_test->fn)
+			return false;
+
+		if (data->n_params != data_test->n_params)
+			return false;
+
+		for (j = 0; j < data->n_params; j++) {
+			if (strcmp(data->params[j], data_test->params[j]) != 0)
+				return false;
+		}
+
+		if (data->fn == action_trace) {
+			if (strcmp(data->onmatch.synth_event_name,
+				   data_test->onmatch.synth_event_name) != 0)
+				return false;
+			if (strcmp(data->onmatch.match_event_system,
+				   data_test->onmatch.match_event_system) != 0)
+				return false;
+			if (strcmp(data->onmatch.match_event,
+				   data_test->onmatch.match_event) != 0)
+				return false;
+		} else if (data->fn == onmax_save) {
+			if (strcmp(data->onmax.var_str,
+				   data_test->onmax.var_str) != 0)
+				return false;
+			if (strcmp(data->onmax.fn_name,
+				   data_test->onmax.fn_name) != 0)
+				return false;
+		}
+	}
+
+	return true;
+}
+
+
+static void print_actions_spec(struct seq_file *m,
+			       struct hist_trigger_data *hist_data)
+{
+	unsigned int i;
+
+	for (i = 0; i < hist_data->n_actions; i++) {
+		struct action_data *data = hist_data->actions[i];
+
+		if (data->fn == action_trace)
+			print_onmatch_spec(m, hist_data, data);
+		else if (data->fn == onmax_save)
+			print_onmax_spec(m, hist_data, data);
+	}
+}
+
+static void destroy_field_var_hists(struct hist_trigger_data *hist_data)
+{
+	unsigned int i;
+
+	for (i = 0; i < hist_data->n_field_var_hists; i++) {
+		kfree(hist_data->field_var_hists[i]->cmd);
+		kfree(hist_data->field_var_hists[i]);
+	}
+}
+
 static void destroy_hist_data(struct hist_trigger_data *hist_data)
 {
+	if (!hist_data)
+		return;
+
 	destroy_hist_trigger_attrs(hist_data->attrs);
 	destroy_hist_fields(hist_data);
 	tracing_map_destroy(hist_data->map);
+
+	destroy_actions(hist_data);
+	destroy_field_vars(hist_data);
+	destroy_field_var_hists(hist_data);
+	destroy_synth_var_refs(hist_data);
+
 	kfree(hist_data);
 }
 
@@ -781,7 +4460,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
 	struct tracing_map *map = hist_data->map;
 	struct ftrace_event_field *field;
 	struct hist_field *hist_field;
-	int i, idx;
+	int i, idx = 0;
 
 	for_each_hist_field(i, hist_data) {
 		hist_field = hist_data->fields[i];
@@ -792,6 +4471,9 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
 
 			if (hist_field->flags & HIST_FIELD_FL_STACKTRACE)
 				cmp_fn = tracing_map_cmp_none;
+			else if (!field)
+				cmp_fn = tracing_map_cmp_num(hist_field->size,
+							     hist_field->is_signed);
 			else if (is_string_field(field))
 				cmp_fn = tracing_map_cmp_string;
 			else
@@ -800,36 +4482,29 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
 			idx = tracing_map_add_key_field(map,
 							hist_field->offset,
 							cmp_fn);
-
-		} else
+		} else if (!(hist_field->flags & HIST_FIELD_FL_VAR))
 			idx = tracing_map_add_sum_field(map);
 
 		if (idx < 0)
 			return idx;
-	}
-
-	return 0;
-}
-
-static bool need_tracing_map_ops(struct hist_trigger_data *hist_data)
-{
-	struct hist_field *key_field;
-	unsigned int i;
-
-	for_each_hist_key_field(i, hist_data) {
-		key_field = hist_data->fields[i];
 
-		if (key_field->flags & HIST_FIELD_FL_EXECNAME)
-			return true;
+		if (hist_field->flags & HIST_FIELD_FL_VAR) {
+			idx = tracing_map_add_var(map);
+			if (idx < 0)
+				return idx;
+			hist_field->var.idx = idx;
+			hist_field->var.hist_data = hist_data;
+		}
 	}
 
-	return false;
+	return 0;
 }
 
 static struct hist_trigger_data *
 create_hist_data(unsigned int map_bits,
 		 struct hist_trigger_attrs *attrs,
-		 struct trace_event_file *file)
+		 struct trace_event_file *file,
+		 bool remove)
 {
 	const struct tracing_map_ops *map_ops = NULL;
 	struct hist_trigger_data *hist_data;
@@ -840,6 +4515,12 @@ create_hist_data(unsigned int map_bits,
 		return ERR_PTR(-ENOMEM);
 
 	hist_data->attrs = attrs;
+	hist_data->remove = remove;
+	hist_data->event_file = file;
+
+	ret = parse_actions(hist_data);
+	if (ret)
+		goto free;
 
 	ret = create_hist_fields(hist_data, file);
 	if (ret)
@@ -849,8 +4530,7 @@ create_hist_data(unsigned int map_bits,
 	if (ret)
 		goto free;
 
-	if (need_tracing_map_ops(hist_data))
-		map_ops = &hist_trigger_elt_comm_ops;
+	map_ops = &hist_trigger_elt_data_ops;
 
 	hist_data->map = tracing_map_create(map_bits, hist_data->key_size,
 					    map_ops, hist_data);
@@ -863,12 +4543,6 @@ create_hist_data(unsigned int map_bits,
 	ret = create_tracing_map_fields(hist_data);
 	if (ret)
 		goto free;
-
-	ret = tracing_map_init(hist_data->map);
-	if (ret)
-		goto free;
-
-	hist_data->event_file = file;
  out:
 	return hist_data;
  free:
@@ -882,18 +4556,39 @@ create_hist_data(unsigned int map_bits,
 }
 
 static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
-				    struct tracing_map_elt *elt,
-				    void *rec)
+				    struct tracing_map_elt *elt, void *rec,
+				    struct ring_buffer_event *rbe,
+				    u64 *var_ref_vals)
 {
+	struct hist_elt_data *elt_data;
 	struct hist_field *hist_field;
-	unsigned int i;
+	unsigned int i, var_idx;
 	u64 hist_val;
 
+	elt_data = elt->private_data;
+	elt_data->var_ref_vals = var_ref_vals;
+
 	for_each_hist_val_field(i, hist_data) {
 		hist_field = hist_data->fields[i];
-		hist_val = hist_field->fn(hist_field, rec);
+		hist_val = hist_field->fn(hist_field, elt, rbe, rec);
+		if (hist_field->flags & HIST_FIELD_FL_VAR) {
+			var_idx = hist_field->var.idx;
+			tracing_map_set_var(elt, var_idx, hist_val);
+			continue;
+		}
 		tracing_map_update_sum(elt, i, hist_val);
 	}
+
+	for_each_hist_key_field(i, hist_data) {
+		hist_field = hist_data->fields[i];
+		if (hist_field->flags & HIST_FIELD_FL_VAR) {
+			hist_val = hist_field->fn(hist_field, elt, rbe, rec);
+			var_idx = hist_field->var.idx;
+			tracing_map_set_var(elt, var_idx, hist_val);
+		}
+	}
+
+	update_field_vars(hist_data, elt, rbe, rec);
 }
 
 static inline void add_to_key(char *compound_key, void *key,
@@ -920,15 +4615,31 @@ static inline void add_to_key(char *compound_key, void *key,
 	memcpy(compound_key + key_field->offset, key, size);
 }
 
-static void event_hist_trigger(struct event_trigger_data *data, void *rec)
+static void
+hist_trigger_actions(struct hist_trigger_data *hist_data,
+		     struct tracing_map_elt *elt, void *rec,
+		     struct ring_buffer_event *rbe, u64 *var_ref_vals)
+{
+	struct action_data *data;
+	unsigned int i;
+
+	for (i = 0; i < hist_data->n_actions; i++) {
+		data = hist_data->actions[i];
+		data->fn(hist_data, elt, rec, rbe, data, var_ref_vals);
+	}
+}
+
+static void event_hist_trigger(struct event_trigger_data *data, void *rec,
+			       struct ring_buffer_event *rbe)
 {
 	struct hist_trigger_data *hist_data = data->private_data;
 	bool use_compound_key = (hist_data->n_keys > 1);
 	unsigned long entries[HIST_STACKTRACE_DEPTH];
+	u64 var_ref_vals[TRACING_MAP_VARS_MAX];
 	char compound_key[HIST_KEY_SIZE_MAX];
+	struct tracing_map_elt *elt = NULL;
 	struct stack_trace stacktrace;
 	struct hist_field *key_field;
-	struct tracing_map_elt *elt;
 	u64 field_contents;
 	void *key = NULL;
 	unsigned int i;
@@ -949,7 +4660,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
 
 			key = entries;
 		} else {
-			field_contents = key_field->fn(key_field, rec);
+			field_contents = key_field->fn(key_field, elt, rbe, rec);
 			if (key_field->flags & HIST_FIELD_FL_STRING) {
 				key = (void *)(unsigned long)field_contents;
 				use_compound_key = true;
@@ -964,9 +4675,18 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
 	if (use_compound_key)
 		key = compound_key;
 
+	if (hist_data->n_var_refs &&
+	    !resolve_var_refs(hist_data, key, var_ref_vals, false))
+		return;
+
 	elt = tracing_map_insert(hist_data->map, key);
-	if (elt)
-		hist_trigger_elt_update(hist_data, elt, rec);
+	if (!elt)
+		return;
+
+	hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals);
+
+	if (resolve_var_refs(hist_data, key, var_ref_vals, true))
+		hist_trigger_actions(hist_data, elt, rec, rbe, var_ref_vals);
 }
 
 static void hist_trigger_stacktrace_print(struct seq_file *m,
@@ -1023,7 +4743,13 @@ hist_trigger_entry_print(struct seq_file *m,
 			seq_printf(m, "%s: [%llx] %-55s", field_name,
 				   uval, str);
 		} else if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
-			char *comm = elt->private_data;
+			struct hist_elt_data *elt_data = elt->private_data;
+			char *comm;
+
+			if (WARN_ON_ONCE(!elt_data))
+				return;
+
+			comm = elt_data->comm;
 
 			uval = *(u64 *)(key + key_field->offset);
 			seq_printf(m, "%s: %-16s[%10llu]", field_name,
@@ -1067,6 +4793,10 @@ hist_trigger_entry_print(struct seq_file *m,
 	for (i = 1; i < hist_data->n_vals; i++) {
 		field_name = hist_field_name(hist_data->fields[i], 0);
 
+		if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR ||
+		    hist_data->fields[i]->flags & HIST_FIELD_FL_EXPR)
+			continue;
+
 		if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) {
 			seq_printf(m, "  %s: %10llx", field_name,
 				   tracing_map_read_sum(elt, i));
@@ -1076,6 +4806,8 @@ hist_trigger_entry_print(struct seq_file *m,
 		}
 	}
 
+	print_actions(m, hist_data, elt);
+
 	seq_puts(m, "\n");
 }
 
@@ -1144,6 +4876,11 @@ static int hist_show(struct seq_file *m, void *v)
 			hist_trigger_show(m, data, n++);
 	}
 
+	if (have_hist_err()) {
+		seq_printf(m, "\nERROR: %s\n", hist_err_str);
+		seq_printf(m, "  Last command: %s\n", last_hist_cmd);
+	}
+
  out_unlock:
 	mutex_unlock(&event_mutex);
 
@@ -1162,36 +4899,31 @@ const struct file_operations event_hist_fops = {
 	.release = single_release,
 };
 
-static const char *get_hist_field_flags(struct hist_field *hist_field)
-{
-	const char *flags_str = NULL;
-
-	if (hist_field->flags & HIST_FIELD_FL_HEX)
-		flags_str = "hex";
-	else if (hist_field->flags & HIST_FIELD_FL_SYM)
-		flags_str = "sym";
-	else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
-		flags_str = "sym-offset";
-	else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
-		flags_str = "execname";
-	else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
-		flags_str = "syscall";
-	else if (hist_field->flags & HIST_FIELD_FL_LOG2)
-		flags_str = "log2";
-
-	return flags_str;
-}
-
 static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
 {
 	const char *field_name = hist_field_name(hist_field, 0);
 
-	seq_printf(m, "%s", field_name);
+	if (hist_field->var.name)
+		seq_printf(m, "%s=", hist_field->var.name);
+
+	if (hist_field->flags & HIST_FIELD_FL_CPU)
+		seq_puts(m, "cpu");
+	else if (field_name) {
+		if (hist_field->flags & HIST_FIELD_FL_VAR_REF ||
+		    hist_field->flags & HIST_FIELD_FL_ALIAS)
+			seq_putc(m, '$');
+		seq_printf(m, "%s", field_name);
+	} else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP)
+		seq_puts(m, "common_timestamp");
+
 	if (hist_field->flags) {
-		const char *flags_str = get_hist_field_flags(hist_field);
+		if (!(hist_field->flags & HIST_FIELD_FL_VAR_REF) &&
+		    !(hist_field->flags & HIST_FIELD_FL_EXPR)) {
+			const char *flags = get_hist_field_flags(hist_field);
 
-		if (flags_str)
-			seq_printf(m, ".%s", flags_str);
+			if (flags)
+				seq_printf(m, ".%s", flags);
+		}
 	}
 }
 
@@ -1200,7 +4932,8 @@ static int event_hist_trigger_print(struct seq_file *m,
 				    struct event_trigger_data *data)
 {
 	struct hist_trigger_data *hist_data = data->private_data;
-	struct hist_field *key_field;
+	struct hist_field *field;
+	bool have_var = false;
 	unsigned int i;
 
 	seq_puts(m, "hist:");
@@ -1211,25 +4944,47 @@ static int event_hist_trigger_print(struct seq_file *m,
 	seq_puts(m, "keys=");
 
 	for_each_hist_key_field(i, hist_data) {
-		key_field = hist_data->fields[i];
+		field = hist_data->fields[i];
 
 		if (i > hist_data->n_vals)
 			seq_puts(m, ",");
 
-		if (key_field->flags & HIST_FIELD_FL_STACKTRACE)
+		if (field->flags & HIST_FIELD_FL_STACKTRACE)
 			seq_puts(m, "stacktrace");
 		else
-			hist_field_print(m, key_field);
+			hist_field_print(m, field);
 	}
 
 	seq_puts(m, ":vals=");
 
 	for_each_hist_val_field(i, hist_data) {
+		field = hist_data->fields[i];
+		if (field->flags & HIST_FIELD_FL_VAR) {
+			have_var = true;
+			continue;
+		}
+
 		if (i == HITCOUNT_IDX)
 			seq_puts(m, "hitcount");
 		else {
 			seq_puts(m, ",");
-			hist_field_print(m, hist_data->fields[i]);
+			hist_field_print(m, field);
+		}
+	}
+
+	if (have_var) {
+		unsigned int n = 0;
+
+		seq_puts(m, ":");
+
+		for_each_hist_val_field(i, hist_data) {
+			field = hist_data->fields[i];
+
+			if (field->flags & HIST_FIELD_FL_VAR) {
+				if (n++)
+					seq_puts(m, ",");
+				hist_field_print(m, field);
+			}
 		}
 	}
 
@@ -1237,28 +4992,36 @@ static int event_hist_trigger_print(struct seq_file *m,
 
 	for (i = 0; i < hist_data->n_sort_keys; i++) {
 		struct tracing_map_sort_key *sort_key;
+		unsigned int idx, first_key_idx;
+
+		/* skip VAR vals */
+		first_key_idx = hist_data->n_vals - hist_data->n_vars;
 
 		sort_key = &hist_data->sort_keys[i];
+		idx = sort_key->field_idx;
+
+		if (WARN_ON(idx >= HIST_FIELDS_MAX))
+			return -EINVAL;
 
 		if (i > 0)
 			seq_puts(m, ",");
 
-		if (sort_key->field_idx == HITCOUNT_IDX)
+		if (idx == HITCOUNT_IDX)
 			seq_puts(m, "hitcount");
 		else {
-			unsigned int idx = sort_key->field_idx;
-
-			if (WARN_ON(idx >= TRACING_MAP_FIELDS_MAX))
-				return -EINVAL;
-
+			if (idx >= first_key_idx)
+				idx += hist_data->n_vars;
 			hist_field_print(m, hist_data->fields[idx]);
 		}
 
 		if (sort_key->descending)
 			seq_puts(m, ".descending");
 	}
-
 	seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits));
+	if (hist_data->enable_timestamps)
+		seq_printf(m, ":clock=%s", hist_data->attrs->clock);
+
+	print_actions_spec(m, hist_data);
 
 	if (data->filter_str)
 		seq_printf(m, " if %s", data->filter_str);
@@ -1286,6 +5049,21 @@ static int event_hist_trigger_init(struct event_trigger_ops *ops,
 	return 0;
 }
 
+static void unregister_field_var_hists(struct hist_trigger_data *hist_data)
+{
+	struct trace_event_file *file;
+	unsigned int i;
+	char *cmd;
+	int ret;
+
+	for (i = 0; i < hist_data->n_field_var_hists; i++) {
+		file = hist_data->field_var_hists[i]->hist_data->event_file;
+		cmd = hist_data->field_var_hists[i]->cmd;
+		ret = event_hist_trigger_func(&trigger_hist_cmd, file,
+					      "!hist", "hist", cmd);
+	}
+}
+
 static void event_hist_trigger_free(struct event_trigger_ops *ops,
 				    struct event_trigger_data *data)
 {
@@ -1298,7 +5076,13 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops,
 	if (!data->ref) {
 		if (data->name)
 			del_named_trigger(data);
+
 		trigger_data_free(data);
+
+		remove_hist_vars(hist_data);
+
+		unregister_field_var_hists(hist_data);
+
 		destroy_hist_data(hist_data);
 	}
 }
@@ -1425,6 +5209,15 @@ static bool hist_trigger_match(struct event_trigger_data *data,
 			return false;
 		if (key_field->offset != key_field_test->offset)
 			return false;
+		if (key_field->size != key_field_test->size)
+			return false;
+		if (key_field->is_signed != key_field_test->is_signed)
+			return false;
+		if (!!key_field->var.name != !!key_field_test->var.name)
+			return false;
+		if (key_field->var.name &&
+		    strcmp(key_field->var.name, key_field_test->var.name) != 0)
+			return false;
 	}
 
 	for (i = 0; i < hist_data->n_sort_keys; i++) {
@@ -1440,6 +5233,9 @@ static bool hist_trigger_match(struct event_trigger_data *data,
 	    (strcmp(data->filter_str, data_test->filter_str) != 0))
 		return false;
 
+	if (!actions_match(hist_data, hist_data_test))
+		return false;
+
 	return true;
 }
 
@@ -1456,6 +5252,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 		if (named_data) {
 			if (!hist_trigger_match(data, named_data, named_data,
 						true)) {
+				hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name);
 				ret = -EINVAL;
 				goto out;
 			}
@@ -1475,13 +5272,16 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 				test->paused = false;
 			else if (hist_data->attrs->clear)
 				hist_clear(test);
-			else
+			else {
+				hist_err("Hist trigger already exists", NULL);
 				ret = -EEXIST;
+			}
 			goto out;
 		}
 	}
  new:
 	if (hist_data->attrs->cont || hist_data->attrs->clear) {
+		hist_err("Can't clear or continue a nonexistent hist trigger", NULL);
 		ret = -ENOENT;
 		goto out;
 	}
@@ -1490,7 +5290,6 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 		data->paused = true;
 
 	if (named_data) {
-		destroy_hist_data(data->private_data);
 		data->private_data = named_data->private_data;
 		set_named_trigger_data(data, named_data);
 		data->ops = &event_hist_trigger_named_ops;
@@ -1502,8 +5301,32 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 			goto out;
 	}
 
-	list_add_rcu(&data->list, &file->triggers);
+	if (hist_data->enable_timestamps) {
+		char *clock = hist_data->attrs->clock;
+
+		ret = tracing_set_clock(file->tr, hist_data->attrs->clock);
+		if (ret) {
+			hist_err("Couldn't set trace_clock: ", clock);
+			goto out;
+		}
+
+		tracing_set_time_stamp_abs(file->tr, true);
+	}
+
+	if (named_data)
+		destroy_hist_data(hist_data);
+
 	ret++;
+ out:
+	return ret;
+}
+
+static int hist_trigger_enable(struct event_trigger_data *data,
+			       struct trace_event_file *file)
+{
+	int ret = 0;
+
+	list_add_tail_rcu(&data->list, &file->triggers);
 
 	update_cond_flag(file);
 
@@ -1512,10 +5335,55 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 		update_cond_flag(file);
 		ret--;
 	}
- out:
+
 	return ret;
 }
 
+static bool have_hist_trigger_match(struct event_trigger_data *data,
+				    struct trace_event_file *file)
+{
+	struct hist_trigger_data *hist_data = data->private_data;
+	struct event_trigger_data *test, *named_data = NULL;
+	bool match = false;
+
+	if (hist_data->attrs->name)
+		named_data = find_named_trigger(hist_data->attrs->name);
+
+	list_for_each_entry_rcu(test, &file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			if (hist_trigger_match(data, test, named_data, false)) {
+				match = true;
+				break;
+			}
+		}
+	}
+
+	return match;
+}
+
+static bool hist_trigger_check_refs(struct event_trigger_data *data,
+				    struct trace_event_file *file)
+{
+	struct hist_trigger_data *hist_data = data->private_data;
+	struct event_trigger_data *test, *named_data = NULL;
+
+	if (hist_data->attrs->name)
+		named_data = find_named_trigger(hist_data->attrs->name);
+
+	list_for_each_entry_rcu(test, &file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			if (!hist_trigger_match(data, test, named_data, false))
+				continue;
+			hist_data = test->private_data;
+			if (check_var_refs(hist_data))
+				return true;
+			break;
+		}
+	}
+
+	return false;
+}
+
 static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
 				    struct event_trigger_data *data,
 				    struct trace_event_file *file)
@@ -1541,17 +5409,55 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
 
 	if (unregistered && test->ops->free)
 		test->ops->free(test->ops, test);
+
+	if (hist_data->enable_timestamps) {
+		if (!hist_data->remove || unregistered)
+			tracing_set_time_stamp_abs(file->tr, false);
+	}
+}
+
+static bool hist_file_check_refs(struct trace_event_file *file)
+{
+	struct hist_trigger_data *hist_data;
+	struct event_trigger_data *test;
+
+	list_for_each_entry_rcu(test, &file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			hist_data = test->private_data;
+			if (check_var_refs(hist_data))
+				return true;
+		}
+	}
+
+	return false;
 }
 
 static void hist_unreg_all(struct trace_event_file *file)
 {
 	struct event_trigger_data *test, *n;
+	struct hist_trigger_data *hist_data;
+	struct synth_event *se;
+	const char *se_name;
+
+	if (hist_file_check_refs(file))
+		return;
 
 	list_for_each_entry_safe(test, n, &file->triggers, list) {
 		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			hist_data = test->private_data;
 			list_del_rcu(&test->list);
 			trace_event_trigger_enable_disable(file, 0);
+
+			mutex_lock(&synth_event_mutex);
+			se_name = trace_event_name(file->event_call);
+			se = find_synth_event(se_name);
+			if (se)
+				se->ref--;
+			mutex_unlock(&synth_event_mutex);
+
 			update_cond_flag(file);
+			if (hist_data->enable_timestamps)
+				tracing_set_time_stamp_abs(file->tr, false);
 			if (test->ops->free)
 				test->ops->free(test->ops, test);
 		}
@@ -1567,16 +5473,54 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
 	struct hist_trigger_attrs *attrs;
 	struct event_trigger_ops *trigger_ops;
 	struct hist_trigger_data *hist_data;
-	char *trigger;
+	struct synth_event *se;
+	const char *se_name;
+	bool remove = false;
+	char *trigger, *p;
 	int ret = 0;
 
+	if (glob && strlen(glob)) {
+		last_cmd_set(param);
+		hist_err_clear();
+	}
+
 	if (!param)
 		return -EINVAL;
 
-	/* separate the trigger from the filter (k:v [if filter]) */
-	trigger = strsep(&param, " \t");
-	if (!trigger)
-		return -EINVAL;
+	if (glob[0] == '!')
+		remove = true;
+
+	/*
+	 * separate the trigger from the filter (k:v [if filter])
+	 * allowing for whitespace in the trigger
+	 */
+	p = trigger = param;
+	do {
+		p = strstr(p, "if");
+		if (!p)
+			break;
+		if (p == param)
+			return -EINVAL;
+		if (*(p - 1) != ' ' && *(p - 1) != '\t') {
+			p++;
+			continue;
+		}
+		if (p >= param + strlen(param) - strlen("if") - 1)
+			return -EINVAL;
+		if (*(p + strlen("if")) != ' ' && *(p + strlen("if")) != '\t') {
+			p++;
+			continue;
+		}
+		break;
+	} while (p);
+
+	if (!p)
+		param = NULL;
+	else {
+		*(p - 1) = '\0';
+		param = strstrip(p);
+		trigger = strstrip(trigger);
+	}
 
 	attrs = parse_hist_trigger_attrs(trigger);
 	if (IS_ERR(attrs))
@@ -1585,7 +5529,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
 	if (attrs->map_bits)
 		hist_trigger_bits = attrs->map_bits;
 
-	hist_data = create_hist_data(hist_trigger_bits, attrs, file);
+	hist_data = create_hist_data(hist_trigger_bits, attrs, file, remove);
 	if (IS_ERR(hist_data)) {
 		destroy_hist_trigger_attrs(attrs);
 		return PTR_ERR(hist_data);
@@ -1593,10 +5537,11 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
 
 	trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
 
-	ret = -ENOMEM;
 	trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
-	if (!trigger_data)
+	if (!trigger_data) {
+		ret = -ENOMEM;
 		goto out_free;
+	}
 
 	trigger_data->count = -1;
 	trigger_data->ops = trigger_ops;
@@ -1614,8 +5559,24 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
 			goto out_free;
 	}
 
-	if (glob[0] == '!') {
+	if (remove) {
+		if (!have_hist_trigger_match(trigger_data, file))
+			goto out_free;
+
+		if (hist_trigger_check_refs(trigger_data, file)) {
+			ret = -EBUSY;
+			goto out_free;
+		}
+
 		cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
+
+		mutex_lock(&synth_event_mutex);
+		se_name = trace_event_name(file->event_call);
+		se = find_synth_event(se_name);
+		if (se)
+			se->ref--;
+		mutex_unlock(&synth_event_mutex);
+
 		ret = 0;
 		goto out_free;
 	}
@@ -1632,14 +5593,47 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
 		goto out_free;
 	} else if (ret < 0)
 		goto out_free;
+
+	if (get_named_trigger_data(trigger_data))
+		goto enable;
+
+	if (has_hist_vars(hist_data))
+		save_hist_vars(hist_data);
+
+	ret = create_actions(hist_data, file);
+	if (ret)
+		goto out_unreg;
+
+	ret = tracing_map_init(hist_data->map);
+	if (ret)
+		goto out_unreg;
+enable:
+	ret = hist_trigger_enable(trigger_data, file);
+	if (ret)
+		goto out_unreg;
+
+	mutex_lock(&synth_event_mutex);
+	se_name = trace_event_name(file->event_call);
+	se = find_synth_event(se_name);
+	if (se)
+		se->ref++;
+	mutex_unlock(&synth_event_mutex);
+
 	/* Just return zero, not the number of registered triggers */
 	ret = 0;
  out:
+	if (ret == 0)
+		hist_err_clear();
+
 	return ret;
+ out_unreg:
+	cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
  out_free:
 	if (cmd_ops->set_filter)
 		cmd_ops->set_filter(NULL, trigger_data, NULL);
 
+	remove_hist_vars(hist_data);
+
 	kfree(trigger_data);
 
 	destroy_hist_data(hist_data);
@@ -1669,7 +5663,8 @@ __init int register_trigger_hist_cmd(void)
 }
 
 static void
-hist_enable_trigger(struct event_trigger_data *data, void *rec)
+hist_enable_trigger(struct event_trigger_data *data, void *rec,
+		    struct ring_buffer_event *event)
 {
 	struct enable_trigger_data *enable_data = data->private_data;
 	struct event_trigger_data *test;
@@ -1685,7 +5680,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec)
 }
 
 static void
-hist_enable_count_trigger(struct event_trigger_data *data, void *rec)
+hist_enable_count_trigger(struct event_trigger_data *data, void *rec,
+			  struct ring_buffer_event *event)
 {
 	if (!data->count)
 		return;
@@ -1693,7 +5689,7 @@ hist_enable_count_trigger(struct event_trigger_data *data, void *rec)
 	if (data->count != -1)
 		(data->count)--;
 
-	hist_enable_trigger(data, rec);
+	hist_enable_trigger(data, rec, event);
 }
 
 static struct event_trigger_ops hist_enable_trigger_ops = {
@@ -1798,3 +5794,31 @@ __init int register_trigger_hist_enable_disable_cmds(void)
 
 	return ret;
 }
+
+static __init int trace_events_hist_init(void)
+{
+	struct dentry *entry = NULL;
+	struct dentry *d_tracer;
+	int err = 0;
+
+	d_tracer = tracing_init_dentry();
+	if (IS_ERR(d_tracer)) {
+		err = PTR_ERR(d_tracer);
+		goto err;
+	}
+
+	entry = tracefs_create_file("synthetic_events", 0644, d_tracer,
+				    NULL, &synth_events_fops);
+	if (!entry) {
+		err = -ENODEV;
+		goto err;
+	}
+
+	return err;
+ err:
+	pr_warn("Could not create tracefs 'synthetic_events' entry\n");
+
+	return err;
+}
+
+fs_initcall(trace_events_hist_init);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 87411482a46f..d18249683682 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -63,7 +63,8 @@ void trigger_data_free(struct event_trigger_data *data)
  * any trigger that should be deferred, ETT_NONE if nothing to defer.
  */
 enum event_trigger_type
-event_triggers_call(struct trace_event_file *file, void *rec)
+event_triggers_call(struct trace_event_file *file, void *rec,
+		    struct ring_buffer_event *event)
 {
 	struct event_trigger_data *data;
 	enum event_trigger_type tt = ETT_NONE;
@@ -76,7 +77,7 @@ event_triggers_call(struct trace_event_file *file, void *rec)
 		if (data->paused)
 			continue;
 		if (!rec) {
-			data->ops->func(data, rec);
+			data->ops->func(data, rec, event);
 			continue;
 		}
 		filter = rcu_dereference_sched(data->filter);
@@ -86,7 +87,7 @@ event_triggers_call(struct trace_event_file *file, void *rec)
 			tt |= data->cmd_ops->trigger_type;
 			continue;
 		}
-		data->ops->func(data, rec);
+		data->ops->func(data, rec, event);
 	}
 	return tt;
 }
@@ -96,7 +97,6 @@ EXPORT_SYMBOL_GPL(event_triggers_call);
  * event_triggers_post_call - Call 'post_triggers' for a trace event
  * @file: The trace_event_file associated with the event
  * @tt: enum event_trigger_type containing a set bit for each trigger to invoke
- * @rec: The trace entry for the event
  *
  * For each trigger associated with an event, invoke the trigger
  * function registered with the associated trigger command, if the
@@ -107,8 +107,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call);
  */
 void
 event_triggers_post_call(struct trace_event_file *file,
-			 enum event_trigger_type tt,
-			 void *rec)
+			 enum event_trigger_type tt)
 {
 	struct event_trigger_data *data;
 
@@ -116,7 +115,7 @@ event_triggers_post_call(struct trace_event_file *file,
 		if (data->paused)
 			continue;
 		if (data->cmd_ops->trigger_type & tt)
-			data->ops->func(data, rec);
+			data->ops->func(data, NULL, NULL);
 	}
 }
 EXPORT_SYMBOL_GPL(event_triggers_post_call);
@@ -482,9 +481,10 @@ clear_event_triggers(struct trace_array *tr)
 	struct trace_event_file *file;
 
 	list_for_each_entry(file, &tr->events, list) {
-		struct event_trigger_data *data;
-		list_for_each_entry_rcu(data, &file->triggers, list) {
+		struct event_trigger_data *data, *n;
+		list_for_each_entry_safe(data, n, &file->triggers, list) {
 			trace_event_trigger_enable_disable(file, 0);
+			list_del_rcu(&data->list);
 			if (data->ops->free)
 				data->ops->free(data->ops, data);
 		}
@@ -641,6 +641,7 @@ event_trigger_callback(struct event_command *cmd_ops,
 	trigger_data->count = -1;
 	trigger_data->ops = trigger_ops;
 	trigger_data->cmd_ops = cmd_ops;
+	trigger_data->private_data = file;
 	INIT_LIST_HEAD(&trigger_data->list);
 	INIT_LIST_HEAD(&trigger_data->named_list);
 
@@ -908,8 +909,15 @@ void set_named_trigger_data(struct event_trigger_data *data,
 	data->named_data = named_data;
 }
 
+struct event_trigger_data *
+get_named_trigger_data(struct event_trigger_data *data)
+{
+	return data->named_data;
+}
+
 static void
-traceon_trigger(struct event_trigger_data *data, void *rec)
+traceon_trigger(struct event_trigger_data *data, void *rec,
+		struct ring_buffer_event *event)
 {
 	if (tracing_is_on())
 		return;
@@ -918,7 +926,8 @@ traceon_trigger(struct event_trigger_data *data, void *rec)
 }
 
 static void
-traceon_count_trigger(struct event_trigger_data *data, void *rec)
+traceon_count_trigger(struct event_trigger_data *data, void *rec,
+		      struct ring_buffer_event *event)
 {
 	if (tracing_is_on())
 		return;
@@ -933,7 +942,8 @@ traceon_count_trigger(struct event_trigger_data *data, void *rec)
 }
 
 static void
-traceoff_trigger(struct event_trigger_data *data, void *rec)
+traceoff_trigger(struct event_trigger_data *data, void *rec,
+		 struct ring_buffer_event *event)
 {
 	if (!tracing_is_on())
 		return;
@@ -942,7 +952,8 @@ traceoff_trigger(struct event_trigger_data *data, void *rec)
 }
 
 static void
-traceoff_count_trigger(struct event_trigger_data *data, void *rec)
+traceoff_count_trigger(struct event_trigger_data *data, void *rec,
+		       struct ring_buffer_event *event)
 {
 	if (!tracing_is_on())
 		return;
@@ -1039,13 +1050,20 @@ static struct event_command trigger_traceoff_cmd = {
 
 #ifdef CONFIG_TRACER_SNAPSHOT
 static void
-snapshot_trigger(struct event_trigger_data *data, void *rec)
+snapshot_trigger(struct event_trigger_data *data, void *rec,
+		 struct ring_buffer_event *event)
 {
-	tracing_snapshot();
+	struct trace_event_file *file = data->private_data;
+
+	if (file)
+		tracing_snapshot_instance(file->tr);
+	else
+		tracing_snapshot();
 }
 
 static void
-snapshot_count_trigger(struct event_trigger_data *data, void *rec)
+snapshot_count_trigger(struct event_trigger_data *data, void *rec,
+		       struct ring_buffer_event *event)
 {
 	if (!data->count)
 		return;
@@ -1053,7 +1071,7 @@ snapshot_count_trigger(struct event_trigger_data *data, void *rec)
 	if (data->count != -1)
 		(data->count)--;
 
-	snapshot_trigger(data, rec);
+	snapshot_trigger(data, rec, event);
 }
 
 static int
@@ -1063,7 +1081,7 @@ register_snapshot_trigger(char *glob, struct event_trigger_ops *ops,
 {
 	int ret = register_trigger(glob, ops, data, file);
 
-	if (ret > 0 && tracing_alloc_snapshot() != 0) {
+	if (ret > 0 && tracing_alloc_snapshot_instance(file->tr) != 0) {
 		unregister_trigger(glob, ops, data, file);
 		ret = 0;
 	}
@@ -1141,13 +1159,15 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; }
 #endif
 
 static void
-stacktrace_trigger(struct event_trigger_data *data, void *rec)
+stacktrace_trigger(struct event_trigger_data *data, void *rec,
+		   struct ring_buffer_event *event)
 {
 	trace_dump_stack(STACK_SKIP);
 }
 
 static void
-stacktrace_count_trigger(struct event_trigger_data *data, void *rec)
+stacktrace_count_trigger(struct event_trigger_data *data, void *rec,
+			 struct ring_buffer_event *event)
 {
 	if (!data->count)
 		return;
@@ -1155,7 +1175,7 @@ stacktrace_count_trigger(struct event_trigger_data *data, void *rec)
 	if (data->count != -1)
 		(data->count)--;
 
-	stacktrace_trigger(data, rec);
+	stacktrace_trigger(data, rec, event);
 }
 
 static int
@@ -1217,7 +1237,8 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void)
 }
 
 static void
-event_enable_trigger(struct event_trigger_data *data, void *rec)
+event_enable_trigger(struct event_trigger_data *data, void *rec,
+		     struct ring_buffer_event *event)
 {
 	struct enable_trigger_data *enable_data = data->private_data;
 
@@ -1228,7 +1249,8 @@ event_enable_trigger(struct event_trigger_data *data, void *rec)
 }
 
 static void
-event_enable_count_trigger(struct event_trigger_data *data, void *rec)
+event_enable_count_trigger(struct event_trigger_data *data, void *rec,
+			   struct ring_buffer_event *event)
 {
 	struct enable_trigger_data *enable_data = data->private_data;
 
@@ -1242,7 +1264,7 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec)
 	if (data->count != -1)
 		(data->count)--;
 
-	event_enable_trigger(data, rec);
+	event_enable_trigger(data, rec, event);
 }
 
 int event_enable_trigger_print(struct seq_file *m,
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 548e62eb5c46..45630a76ed3a 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -14,6 +14,13 @@
 
 #include "trace_output.h"
 
+/* Stub function for events with triggers */
+static int ftrace_event_register(struct trace_event_call *call,
+				 enum trace_reg type, void *data)
+{
+	return 0;
+}
+
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM	ftrace
 
@@ -117,7 +124,7 @@ static void __always_unused ____ftrace_check_##name(void)		\
 
 #undef __dynamic_array
 #define __dynamic_array(type, item)					\
-	ret = trace_define_field(event_call, #type, #item,		\
+	ret = trace_define_field(event_call, #type "[]", #item,  \
 				 offsetof(typeof(field), item),		\
 				 0, is_signed_type(type), filter_type);\
 	if (ret)							\
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1cd3fb4d70f8..daa81571b22a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -512,8 +512,6 @@ static int __register_trace_kprobe(struct trace_kprobe *tk)
 	if (ret == 0)
 		tk->tp.flags |= TP_FLAG_REGISTERED;
 	else {
-		pr_warn("Could not insert probe at %s+%lu: %d\n",
-			trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret);
 		if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) {
 			pr_warn("This probe might be able to register after target module is loaded. Continue.\n");
 			ret = 0;
@@ -1289,6 +1287,35 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 			      head, NULL);
 }
 NOKPROBE_SYMBOL(kretprobe_perf_func);
+
+int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
+			const char **symbol, u64 *probe_offset,
+			u64 *probe_addr, bool perf_type_tracepoint)
+{
+	const char *pevent = trace_event_name(event->tp_event);
+	const char *group = event->tp_event->class->system;
+	struct trace_kprobe *tk;
+
+	if (perf_type_tracepoint)
+		tk = find_trace_kprobe(pevent, group);
+	else
+		tk = event->tp_event->data;
+	if (!tk)
+		return -EINVAL;
+
+	*fd_type = trace_kprobe_is_return(tk) ? BPF_FD_TYPE_KRETPROBE
+					      : BPF_FD_TYPE_KPROBE;
+	if (tk->symbol) {
+		*symbol = tk->symbol;
+		*probe_offset = tk->rp.kp.offset;
+		*probe_addr = 0;
+	} else {
+		*symbol = NULL;
+		*probe_offset = 0;
+		*probe_addr = (unsigned long)tk->rp.kp.addr;
+	}
+	return 0;
+}
 #endif	/* CONFIG_PERF_EVENTS */
 
 /*
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 3c7bfc4bf5e9..4237eba4ef20 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -472,7 +472,7 @@ static __init int stack_trace_init(void)
 			NULL, &stack_trace_fops);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
-	trace_create_file("stack_trace_filter", 0444, d_tracer,
+	trace_create_file("stack_trace_filter", 0644, d_tracer,
 			  &trace_ops, &stack_trace_filter_fops);
 #endif
 
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 2014f4351ae0..bf89a51e740d 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -55,6 +55,7 @@ struct trace_uprobe {
 	struct list_head		list;
 	struct trace_uprobe_filter	filter;
 	struct uprobe_consumer		consumer;
+	struct path			path;
 	struct inode			*inode;
 	char				*filename;
 	unsigned long			offset;
@@ -151,6 +152,8 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
 		return;
 
 	ret = strncpy_from_user(dst, src, maxlen);
+	if (ret == maxlen)
+		dst[--ret] = '\0';
 
 	if (ret < 0) {	/* Failed to fetch string */
 		((u8 *)get_rloc_data(dest))[0] = '\0';
@@ -287,7 +290,7 @@ static void free_trace_uprobe(struct trace_uprobe *tu)
 	for (i = 0; i < tu->tp.nr_args; i++)
 		traceprobe_free_probe_arg(&tu->tp.args[i]);
 
-	iput(tu->inode);
+	path_put(&tu->path);
 	kfree(tu->tp.call.class->system);
 	kfree(tu->tp.call.name);
 	kfree(tu->filename);
@@ -361,7 +364,6 @@ end:
 static int create_trace_uprobe(int argc, char **argv)
 {
 	struct trace_uprobe *tu;
-	struct inode *inode;
 	char *arg, *event, *group, *filename;
 	char buf[MAX_EVENT_NAME_LEN];
 	struct path path;
@@ -369,7 +371,6 @@ static int create_trace_uprobe(int argc, char **argv)
 	bool is_delete, is_return;
 	int i, ret;
 
-	inode = NULL;
 	ret = 0;
 	is_delete = false;
 	is_return = false;
@@ -435,21 +436,16 @@ static int create_trace_uprobe(int argc, char **argv)
 	}
 	/* Find the last occurrence, in case the path contains ':' too. */
 	arg = strrchr(argv[1], ':');
-	if (!arg) {
-		ret = -EINVAL;
-		goto fail_address_parse;
-	}
+	if (!arg)
+		return -EINVAL;
 
 	*arg++ = '\0';
 	filename = argv[1];
 	ret = kern_path(filename, LOOKUP_FOLLOW, &path);
 	if (ret)
-		goto fail_address_parse;
-
-	inode = igrab(d_inode(path.dentry));
-	path_put(&path);
+		return ret;
 
-	if (!inode || !S_ISREG(inode->i_mode)) {
+	if (!d_is_reg(path.dentry)) {
 		ret = -EINVAL;
 		goto fail_address_parse;
 	}
@@ -488,7 +484,7 @@ static int create_trace_uprobe(int argc, char **argv)
 		goto fail_address_parse;
 	}
 	tu->offset = offset;
-	tu->inode = inode;
+	tu->path = path;
 	tu->filename = kstrdup(filename, GFP_KERNEL);
 
 	if (!tu->filename) {
@@ -556,7 +552,7 @@ error:
 	return ret;
 
 fail_address_parse:
-	iput(inode);
+	path_put(&path);
 
 	pr_info("Failed to parse address or file.\n");
 
@@ -602,24 +598,9 @@ static int probes_seq_show(struct seq_file *m, void *v)
 	char c = is_ret_probe(tu) ? 'r' : 'p';
 	int i;
 
-	seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system,
-			trace_event_name(&tu->tp.call));
-	seq_printf(m, " %s:", tu->filename);
-
-	/* Don't print "0x  (null)" when offset is 0 */
-	if (tu->offset) {
-		seq_printf(m, "0x%px", (void *)tu->offset);
-	} else {
-		switch (sizeof(void *)) {
-		case 4:
-			seq_printf(m, "0x00000000");
-			break;
-		case 8:
-		default:
-			seq_printf(m, "0x0000000000000000");
-			break;
-		}
-	}
+	seq_printf(m, "%c:%s/%s %s:0x%0*lx", c, tu->tp.call.class->system,
+			trace_event_name(&tu->tp.call), tu->filename,
+			(int)(sizeof(void *) * 2), tu->offset);
 
 	for (i = 0; i < tu->tp.nr_args; i++)
 		seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
@@ -935,6 +916,7 @@ probe_event_enable(struct trace_uprobe *tu, struct trace_event_file *file,
 		goto err_flags;
 
 	tu->consumer.filter = filter;
+	tu->inode = d_real_inode(tu->path.dentry);
 	ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
 	if (ret)
 		goto err_buffer;
@@ -980,6 +962,7 @@ probe_event_disable(struct trace_uprobe *tu, struct trace_event_file *file)
 	WARN_ON(!uprobe_filter_is_empty(&tu->filter));
 
 	uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
+	tu->inode = NULL;
 	tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE;
 
 	uprobe_buffer_disable();
@@ -1178,6 +1161,28 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
 {
 	__uprobe_perf_func(tu, func, regs, ucb, dsize);
 }
+
+int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
+			const char **filename, u64 *probe_offset,
+			bool perf_type_tracepoint)
+{
+	const char *pevent = trace_event_name(event->tp_event);
+	const char *group = event->tp_event->class->system;
+	struct trace_uprobe *tu;
+
+	if (perf_type_tracepoint)
+		tu = find_probe_event(pevent, group);
+	else
+		tu = event->tp_event->data;
+	if (!tu)
+		return -EINVAL;
+
+	*fd_type = is_ret_probe(tu) ? BPF_FD_TYPE_URETPROBE
+				    : BPF_FD_TYPE_UPROBE;
+	*filename = tu->filename;
+	*probe_offset = tu->offset;
+	return 0;
+}
 #endif	/* CONFIG_PERF_EVENTS */
 
 static int
@@ -1350,7 +1355,6 @@ struct trace_event_call *
 create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
 {
 	struct trace_uprobe *tu;
-	struct inode *inode;
 	struct path path;
 	int ret;
 
@@ -1358,11 +1362,8 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
 	if (ret)
 		return ERR_PTR(ret);
 
-	inode = igrab(d_inode(path.dentry));
-	path_put(&path);
-
-	if (!inode || !S_ISREG(inode->i_mode)) {
-		iput(inode);
+	if (!d_is_reg(path.dentry)) {
+		path_put(&path);
 		return ERR_PTR(-EINVAL);
 	}
 
@@ -1377,11 +1378,12 @@ create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
 	if (IS_ERR(tu)) {
 		pr_info("Failed to allocate trace_uprobe.(%d)\n",
 			(int)PTR_ERR(tu));
+		path_put(&path);
 		return ERR_CAST(tu);
 	}
 
 	tu->offset = offs;
-	tu->inode = inode;
+	tu->path = path;
 	tu->filename = kstrdup(name, GFP_KERNEL);
 	init_trace_event_call(tu, &tu->tp.call);
 
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 07e75344725b..5cadb1b8b5fe 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -66,6 +66,73 @@ u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i)
 	return (u64)atomic64_read(&elt->fields[i].sum);
 }
 
+/**
+ * tracing_map_set_var - Assign a tracing_map_elt's variable field
+ * @elt: The tracing_map_elt
+ * @i: The index of the given variable associated with the tracing_map_elt
+ * @n: The value to assign
+ *
+ * Assign n to variable i associated with the specified tracing_map_elt
+ * instance.  The index i is the index returned by the call to
+ * tracing_map_add_var() when the tracing map was set up.
+ */
+void tracing_map_set_var(struct tracing_map_elt *elt, unsigned int i, u64 n)
+{
+	atomic64_set(&elt->vars[i], n);
+	elt->var_set[i] = true;
+}
+
+/**
+ * tracing_map_var_set - Return whether or not a variable has been set
+ * @elt: The tracing_map_elt
+ * @i: The index of the given variable associated with the tracing_map_elt
+ *
+ * Return true if the variable has been set, false otherwise.  The
+ * index i is the index returned by the call to tracing_map_add_var()
+ * when the tracing map was set up.
+ */
+bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i)
+{
+	return elt->var_set[i];
+}
+
+/**
+ * tracing_map_read_var - Return the value of a tracing_map_elt's variable field
+ * @elt: The tracing_map_elt
+ * @i: The index of the given variable associated with the tracing_map_elt
+ *
+ * Retrieve the value of the variable i associated with the specified
+ * tracing_map_elt instance.  The index i is the index returned by the
+ * call to tracing_map_add_var() when the tracing map was set
+ * up.
+ *
+ * Return: The variable value associated with field i for elt.
+ */
+u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i)
+{
+	return (u64)atomic64_read(&elt->vars[i]);
+}
+
+/**
+ * tracing_map_read_var_once - Return and reset a tracing_map_elt's variable field
+ * @elt: The tracing_map_elt
+ * @i: The index of the given variable associated with the tracing_map_elt
+ *
+ * Retrieve the value of the variable i associated with the specified
+ * tracing_map_elt instance, and reset the variable to the 'not set'
+ * state.  The index i is the index returned by the call to
+ * tracing_map_add_var() when the tracing map was set up.  The reset
+ * essentially makes the variable a read-once variable if it's only
+ * accessed using this function.
+ *
+ * Return: The variable value associated with field i for elt.
+ */
+u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i)
+{
+	elt->var_set[i] = false;
+	return (u64)atomic64_read(&elt->vars[i]);
+}
+
 int tracing_map_cmp_string(void *val_a, void *val_b)
 {
 	char *a = val_a;
@@ -171,6 +238,28 @@ int tracing_map_add_sum_field(struct tracing_map *map)
 }
 
 /**
+ * tracing_map_add_var - Add a field describing a tracing_map var
+ * @map: The tracing_map
+ *
+ * Add a var to the map and return the index identifying it in the map
+ * and associated tracing_map_elts.  This is the index used for
+ * instance to update a var for a particular tracing_map_elt using
+ * tracing_map_update_var() or reading it via tracing_map_read_var().
+ *
+ * Return: The index identifying the var in the map and associated
+ * tracing_map_elts, or -EINVAL on error.
+ */
+int tracing_map_add_var(struct tracing_map *map)
+{
+	int ret = -EINVAL;
+
+	if (map->n_vars < TRACING_MAP_VARS_MAX)
+		ret = map->n_vars++;
+
+	return ret;
+}
+
+/**
  * tracing_map_add_key_field - Add a field describing a tracing_map key
  * @map: The tracing_map
  * @offset: The offset within the key
@@ -280,6 +369,11 @@ static void tracing_map_elt_clear(struct tracing_map_elt *elt)
 		if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64)
 			atomic64_set(&elt->fields[i].sum, 0);
 
+	for (i = 0; i < elt->map->n_vars; i++) {
+		atomic64_set(&elt->vars[i], 0);
+		elt->var_set[i] = false;
+	}
+
 	if (elt->map->ops && elt->map->ops->elt_clear)
 		elt->map->ops->elt_clear(elt);
 }
@@ -306,6 +400,8 @@ static void tracing_map_elt_free(struct tracing_map_elt *elt)
 	if (elt->map->ops && elt->map->ops->elt_free)
 		elt->map->ops->elt_free(elt);
 	kfree(elt->fields);
+	kfree(elt->vars);
+	kfree(elt->var_set);
 	kfree(elt->key);
 	kfree(elt);
 }
@@ -333,6 +429,18 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
 		goto free;
 	}
 
+	elt->vars = kcalloc(map->n_vars, sizeof(*elt->vars), GFP_KERNEL);
+	if (!elt->vars) {
+		err = -ENOMEM;
+		goto free;
+	}
+
+	elt->var_set = kcalloc(map->n_vars, sizeof(*elt->var_set), GFP_KERNEL);
+	if (!elt->var_set) {
+		err = -ENOMEM;
+		goto free;
+	}
+
 	tracing_map_elt_init_fields(elt);
 
 	if (map->ops && map->ops->elt_alloc) {
@@ -414,7 +522,9 @@ static inline struct tracing_map_elt *
 __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
 {
 	u32 idx, key_hash, test_key;
+	int dup_try = 0;
 	struct tracing_map_entry *entry;
+	struct tracing_map_elt *val;
 
 	key_hash = jhash(key, map->key_size, 0);
 	if (key_hash == 0)
@@ -426,11 +536,33 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
 		entry = TRACING_MAP_ENTRY(map->map, idx);
 		test_key = entry->key;
 
-		if (test_key && test_key == key_hash && entry->val &&
-		    keys_match(key, entry->val->key, map->key_size)) {
-			if (!lookup_only)
-				atomic64_inc(&map->hits);
-			return entry->val;
+		if (test_key && test_key == key_hash) {
+			val = READ_ONCE(entry->val);
+			if (val &&
+			    keys_match(key, val->key, map->key_size)) {
+				if (!lookup_only)
+					atomic64_inc(&map->hits);
+				return val;
+			} else if (unlikely(!val)) {
+				/*
+				 * The key is present. But, val (pointer to elt
+				 * struct) is still NULL. which means some other
+				 * thread is in the process of inserting an
+				 * element.
+				 *
+				 * On top of that, it's key_hash is same as the
+				 * one being inserted right now. So, it's
+				 * possible that the element has the same
+				 * key as well.
+				 */
+
+				dup_try++;
+				if (dup_try > map->map_size) {
+					atomic64_inc(&map->drops);
+					break;
+				}
+				continue;
+			}
 		}
 
 		if (!test_key) {
@@ -452,6 +584,13 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
 				atomic64_inc(&map->hits);
 
 				return entry->val;
+			} else {
+				/*
+				 * cmpxchg() failed. Loop around once
+				 * more to check what key was inserted.
+				 */
+				dup_try++;
+				continue;
 			}
 		}
 
@@ -816,67 +955,15 @@ create_sort_entry(void *key, struct tracing_map_elt *elt)
 	return sort_entry;
 }
 
-static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt)
-{
-	struct tracing_map_elt *dup_elt;
-	unsigned int i;
-
-	dup_elt = tracing_map_elt_alloc(elt->map);
-	if (IS_ERR(dup_elt))
-		return NULL;
-
-	if (elt->map->ops && elt->map->ops->elt_copy)
-		elt->map->ops->elt_copy(dup_elt, elt);
-
-	dup_elt->private_data = elt->private_data;
-	memcpy(dup_elt->key, elt->key, elt->map->key_size);
-
-	for (i = 0; i < elt->map->n_fields; i++) {
-		atomic64_set(&dup_elt->fields[i].sum,
-			     atomic64_read(&elt->fields[i].sum));
-		dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn;
-	}
-
-	return dup_elt;
-}
-
-static int merge_dup(struct tracing_map_sort_entry **sort_entries,
-		     unsigned int target, unsigned int dup)
-{
-	struct tracing_map_elt *target_elt, *elt;
-	bool first_dup = (target - dup) == 1;
-	int i;
-
-	if (first_dup) {
-		elt = sort_entries[target]->elt;
-		target_elt = copy_elt(elt);
-		if (!target_elt)
-			return -ENOMEM;
-		sort_entries[target]->elt = target_elt;
-		sort_entries[target]->elt_copied = true;
-	} else
-		target_elt = sort_entries[target]->elt;
-
-	elt = sort_entries[dup]->elt;
-
-	for (i = 0; i < elt->map->n_fields; i++)
-		atomic64_add(atomic64_read(&elt->fields[i].sum),
-			     &target_elt->fields[i].sum);
-
-	sort_entries[dup]->dup = true;
-
-	return 0;
-}
-
-static int merge_dups(struct tracing_map_sort_entry **sort_entries,
+static void detect_dups(struct tracing_map_sort_entry **sort_entries,
 		      int n_entries, unsigned int key_size)
 {
 	unsigned int dups = 0, total_dups = 0;
-	int err, i, j;
+	int i;
 	void *key;
 
 	if (n_entries < 2)
-		return total_dups;
+		return;
 
 	sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *),
 	     (int (*)(const void *, const void *))cmp_entries_dup, NULL);
@@ -885,30 +972,14 @@ static int merge_dups(struct tracing_map_sort_entry **sort_entries,
 	for (i = 1; i < n_entries; i++) {
 		if (!memcmp(sort_entries[i]->key, key, key_size)) {
 			dups++; total_dups++;
-			err = merge_dup(sort_entries, i - dups, i);
-			if (err)
-				return err;
 			continue;
 		}
 		key = sort_entries[i]->key;
 		dups = 0;
 	}
 
-	if (!total_dups)
-		return total_dups;
-
-	for (i = 0, j = 0; i < n_entries; i++) {
-		if (!sort_entries[i]->dup) {
-			sort_entries[j] = sort_entries[i];
-			if (j++ != i)
-				sort_entries[i] = NULL;
-		} else {
-			destroy_sort_entry(sort_entries[i]);
-			sort_entries[i] = NULL;
-		}
-	}
-
-	return total_dups;
+	WARN_ONCE(total_dups > 0,
+		  "Duplicates detected: %d\n", total_dups);
 }
 
 static bool is_key(struct tracing_map *map, unsigned int field_idx)
@@ -1034,10 +1105,7 @@ int tracing_map_sort_entries(struct tracing_map *map,
 		return 1;
 	}
 
-	ret = merge_dups(entries, n_entries, map->key_size);
-	if (ret < 0)
-		goto free;
-	n_entries -= ret;
+	detect_dups(entries, n_entries, map->key_size);
 
 	if (is_key(map, sort_keys[0].field_idx))
 		cmp_entries_fn = cmp_entries_key;
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
index 5b5bbf8ae550..053eb92b2d31 100644
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -10,6 +10,7 @@
 #define TRACING_MAP_VALS_MAX		3
 #define TRACING_MAP_FIELDS_MAX		(TRACING_MAP_KEYS_MAX + \
 					 TRACING_MAP_VALS_MAX)
+#define TRACING_MAP_VARS_MAX		16
 #define TRACING_MAP_SORT_KEYS_MAX	2
 
 typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b);
@@ -137,6 +138,8 @@ struct tracing_map_field {
 struct tracing_map_elt {
 	struct tracing_map		*map;
 	struct tracing_map_field	*fields;
+	atomic64_t			*vars;
+	bool				*var_set;
 	void				*key;
 	void				*private_data;
 };
@@ -192,6 +195,7 @@ struct tracing_map {
 	int				key_idx[TRACING_MAP_KEYS_MAX];
 	unsigned int			n_keys;
 	struct tracing_map_sort_key	sort_key;
+	unsigned int			n_vars;
 	atomic64_t			hits;
 	atomic64_t			drops;
 };
@@ -215,11 +219,6 @@ struct tracing_map {
  *	Element allocation occurs before tracing begins, when the
  *	tracing_map_init() call is made by client code.
  *
- * @elt_copy: At certain points in the lifetime of an element, it may
- *	need to be copied.  The copy should include a copy of the
- *	client-allocated data, which can be copied into the 'to'
- *	element from the 'from' element.
- *
  * @elt_free: When a tracing_map_elt is freed, this function is called
  *	and allows client-allocated per-element data to be freed.
  *
@@ -233,8 +232,6 @@ struct tracing_map {
  */
 struct tracing_map_ops {
 	int			(*elt_alloc)(struct tracing_map_elt *elt);
-	void			(*elt_copy)(struct tracing_map_elt *to,
-					    struct tracing_map_elt *from);
 	void			(*elt_free)(struct tracing_map_elt *elt);
 	void			(*elt_clear)(struct tracing_map_elt *elt);
 	void			(*elt_init)(struct tracing_map_elt *elt);
@@ -248,6 +245,7 @@ tracing_map_create(unsigned int map_bits,
 extern int tracing_map_init(struct tracing_map *map);
 
 extern int tracing_map_add_sum_field(struct tracing_map *map);
+extern int tracing_map_add_var(struct tracing_map *map);
 extern int tracing_map_add_key_field(struct tracing_map *map,
 				     unsigned int offset,
 				     tracing_map_cmp_fn_t cmp_fn);
@@ -267,7 +265,13 @@ extern int tracing_map_cmp_none(void *val_a, void *val_b);
 
 extern void tracing_map_update_sum(struct tracing_map_elt *elt,
 				   unsigned int i, u64 n);
+extern void tracing_map_set_var(struct tracing_map_elt *elt,
+				unsigned int i, u64 n);
+extern bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i);
 extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i);
+extern u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i);
+extern u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i);
+
 extern void tracing_map_set_field_descr(struct tracing_map *map,
 					unsigned int i,
 					unsigned int key_offset,
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 671b13457387..6dc6356c3327 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -207,7 +207,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
 			lockdep_is_held(&tracepoints_mutex));
 	old = func_add(&tp_funcs, func, prio);
 	if (IS_ERR(old)) {
-		WARN_ON_ONCE(1);
+		WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
 		return PTR_ERR(old);
 	}
 
@@ -239,7 +239,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
 			lockdep_is_held(&tracepoints_mutex));
 	old = func_remove(&tp_funcs, func);
 	if (IS_ERR(old)) {
-		WARN_ON_ONCE(1);
+		WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
 		return PTR_ERR(old);
 	}
 
@@ -257,7 +257,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
 }
 
 /**
- * tracepoint_probe_register -  Connect a probe to a tracepoint
+ * tracepoint_probe_register_prio -  Connect a probe to a tracepoint with priority
  * @tp: tracepoint
  * @probe: probe handler
  * @data: tracepoint data
@@ -290,7 +290,6 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio);
  * @tp: tracepoint
  * @probe: probe handler
  * @data: tracepoint data
- * @prio: priority of this function over other registered functions
  *
  * Returns 0 if ok, error value on error.
  * Note: if @tp is within a module, the caller is responsible for
diff --git a/kernel/umh.c b/kernel/umh.c
index f76b3ff876cf..30db93fd7e39 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -25,6 +25,8 @@
 #include <linux/ptrace.h>
 #include <linux/async.h>
 #include <linux/uaccess.h>
+#include <linux/shmem_fs.h>
+#include <linux/pipe_fs_i.h>
 
 #include <trace/events/module.h>
 
@@ -97,9 +99,13 @@ static int call_usermodehelper_exec_async(void *data)
 
 	commit_creds(new);
 
-	retval = do_execve(getname_kernel(sub_info->path),
-			   (const char __user *const __user *)sub_info->argv,
-			   (const char __user *const __user *)sub_info->envp);
+	if (sub_info->file)
+		retval = do_execve_file(sub_info->file,
+					sub_info->argv, sub_info->envp);
+	else
+		retval = do_execve(getname_kernel(sub_info->path),
+				   (const char __user *const __user *)sub_info->argv,
+				   (const char __user *const __user *)sub_info->envp);
 out:
 	sub_info->retval = retval;
 	/*
@@ -185,6 +191,8 @@ static void call_usermodehelper_exec_work(struct work_struct *work)
 		if (pid < 0) {
 			sub_info->retval = pid;
 			umh_complete(sub_info);
+		} else {
+			sub_info->pid = pid;
 		}
 	}
 }
@@ -393,6 +401,117 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
 }
 EXPORT_SYMBOL(call_usermodehelper_setup);
 
+struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
+		int (*init)(struct subprocess_info *info, struct cred *new),
+		void (*cleanup)(struct subprocess_info *info), void *data)
+{
+	struct subprocess_info *sub_info;
+
+	sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL);
+	if (!sub_info)
+		return NULL;
+
+	INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
+	sub_info->path = "none";
+	sub_info->file = file;
+	sub_info->init = init;
+	sub_info->cleanup = cleanup;
+	sub_info->data = data;
+	return sub_info;
+}
+
+static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
+{
+	struct umh_info *umh_info = info->data;
+	struct file *from_umh[2];
+	struct file *to_umh[2];
+	int err;
+
+	/* create pipe to send data to umh */
+	err = create_pipe_files(to_umh, 0);
+	if (err)
+		return err;
+	err = replace_fd(0, to_umh[0], 0);
+	fput(to_umh[0]);
+	if (err < 0) {
+		fput(to_umh[1]);
+		return err;
+	}
+
+	/* create pipe to receive data from umh */
+	err = create_pipe_files(from_umh, 0);
+	if (err) {
+		fput(to_umh[1]);
+		replace_fd(0, NULL, 0);
+		return err;
+	}
+	err = replace_fd(1, from_umh[1], 0);
+	fput(from_umh[1]);
+	if (err < 0) {
+		fput(to_umh[1]);
+		replace_fd(0, NULL, 0);
+		fput(from_umh[0]);
+		return err;
+	}
+
+	umh_info->pipe_to_umh = to_umh[1];
+	umh_info->pipe_from_umh = from_umh[0];
+	return 0;
+}
+
+static void umh_save_pid(struct subprocess_info *info)
+{
+	struct umh_info *umh_info = info->data;
+
+	umh_info->pid = info->pid;
+}
+
+/**
+ * fork_usermode_blob - fork a blob of bytes as a usermode process
+ * @data: a blob of bytes that can be do_execv-ed as a file
+ * @len: length of the blob
+ * @info: information about usermode process (shouldn't be NULL)
+ *
+ * Returns either negative error or zero which indicates success
+ * in executing a blob of bytes as a usermode process. In such
+ * case 'struct umh_info *info' is populated with two pipes
+ * and a pid of the process. The caller is responsible for health
+ * check of the user process, killing it via pid, and closing the
+ * pipes when user process is no longer needed.
+ */
+int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
+{
+	struct subprocess_info *sub_info;
+	struct file *file;
+	ssize_t written;
+	loff_t pos = 0;
+	int err;
+
+	file = shmem_kernel_file_setup("", len, 0);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	written = kernel_write(file, data, len, &pos);
+	if (written != len) {
+		err = written;
+		if (err >= 0)
+			err = -ENOMEM;
+		goto out;
+	}
+
+	err = -ENOMEM;
+	sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup,
+						  umh_save_pid, info);
+	if (!sub_info)
+		goto out;
+
+	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
+out:
+	fput(file);
+	return err;
+}
+EXPORT_SYMBOL_GPL(fork_usermode_blob);
+
 /**
  * call_usermodehelper_exec - start a usermode application
  * @sub_info: information about the subprocessa
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 246d4d4ce5c7..492c255e6c5a 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -1235,6 +1235,7 @@ bool current_in_userns(const struct user_namespace *target_ns)
 {
 	return in_userns(target_ns, current_user_ns());
 }
+EXPORT_SYMBOL(current_in_userns);
 
 static inline struct user_namespace *to_user_ns(struct ns_common *ns)
 {
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 913fe4336d2b..dcd6be1996fe 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -19,6 +19,8 @@
 #include <linux/proc_ns.h>
 #include <linux/sched/task.h>
 
+static struct kmem_cache *uts_ns_cache __ro_after_init;
+
 static struct ucounts *inc_uts_namespaces(struct user_namespace *ns)
 {
 	return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES);
@@ -33,7 +35,7 @@ static struct uts_namespace *create_uts_ns(void)
 {
 	struct uts_namespace *uts_ns;
 
-	uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
+	uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL);
 	if (uts_ns)
 		kref_init(&uts_ns->kref);
 	return uts_ns;
@@ -42,7 +44,7 @@ static struct uts_namespace *create_uts_ns(void)
 /*
  * Clone a new ns copying an original utsname, setting refcount to 1
  * @old_ns: namespace to clone
- * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise
+ * Return ERR_PTR(-ENOMEM) on error (failure to allocate), new ns otherwise
  */
 static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
 					  struct uts_namespace *old_ns)
@@ -75,7 +77,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
 	return ns;
 
 fail_free:
-	kfree(ns);
+	kmem_cache_free(uts_ns_cache, ns);
 fail_dec:
 	dec_uts_namespaces(ucounts);
 fail:
@@ -113,7 +115,7 @@ void free_uts_ns(struct kref *kref)
 	dec_uts_namespaces(ns->ucounts);
 	put_user_ns(ns->user_ns);
 	ns_free_inum(&ns->ns);
-	kfree(ns);
+	kmem_cache_free(uts_ns_cache, ns);
 }
 
 static inline struct uts_namespace *to_uts_ns(struct ns_common *ns)
@@ -169,3 +171,13 @@ const struct proc_ns_operations utsns_operations = {
 	.install	= utsns_install,
 	.owner		= utsns_owner,
 };
+
+void __init uts_ns_init(void)
+{
+	uts_ns_cache = kmem_cache_create_usercopy(
+			"uts_namespace", sizeof(struct uts_namespace), 0,
+			SLAB_PANIC|SLAB_ACCOUNT,
+			offsetof(struct uts_namespace, name),
+			sizeof_field(struct uts_namespace, name),
+			NULL);
+}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ca7959be8aaa..9f9983b0a27d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -66,7 +66,7 @@ enum {
 	 * be executing on any CPU.  The pool behaves as an unbound one.
 	 *
 	 * Note that DISASSOCIATED should be flipped only while holding
-	 * attach_mutex to avoid changing binding state while
+	 * wq_pool_attach_mutex to avoid changing binding state while
 	 * worker_attach_to_pool() is in progress.
 	 */
 	POOL_MANAGER_ACTIVE	= 1 << 0,	/* being managed */
@@ -123,7 +123,7 @@ enum {
  *    cpu or grabbing pool->lock is enough for read access.  If
  *    POOL_DISASSOCIATED is set, it's identical to L.
  *
- * A: pool->attach_mutex protected.
+ * A: wq_pool_attach_mutex protected.
  *
  * PL: wq_pool_mutex protected.
  *
@@ -166,7 +166,6 @@ struct worker_pool {
 						/* L: hash of busy workers */
 
 	struct worker		*manager;	/* L: purely informational */
-	struct mutex		attach_mutex;	/* attach/detach exclusion */
 	struct list_head	workers;	/* A: attached workers */
 	struct completion	*detach_completion; /* all workers detached */
 
@@ -297,6 +296,7 @@ static bool wq_numa_enabled;		/* unbound NUMA affinity enabled */
 static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
 
 static DEFINE_MUTEX(wq_pool_mutex);	/* protects pools and workqueues list */
+static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
 static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
 static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */
 
@@ -399,14 +399,14 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
  * @worker: iteration cursor
  * @pool: worker_pool to iterate workers of
  *
- * This must be called with @pool->attach_mutex.
+ * This must be called with wq_pool_attach_mutex.
  *
  * The if/else clause exists only for the lockdep assertion and can be
  * ignored.
  */
 #define for_each_pool_worker(worker, pool)				\
 	list_for_each_entry((worker), &(pool)->workers, node)		\
-		if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \
+		if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \
 		else
 
 /**
@@ -1724,7 +1724,7 @@ static struct worker *alloc_worker(int node)
 static void worker_attach_to_pool(struct worker *worker,
 				   struct worker_pool *pool)
 {
-	mutex_lock(&pool->attach_mutex);
+	mutex_lock(&wq_pool_attach_mutex);
 
 	/*
 	 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
@@ -1733,37 +1733,40 @@ static void worker_attach_to_pool(struct worker *worker,
 	set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
 
 	/*
-	 * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains
-	 * stable across this function.  See the comments above the
-	 * flag definition for details.
+	 * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains
+	 * stable across this function.  See the comments above the flag
+	 * definition for details.
 	 */
 	if (pool->flags & POOL_DISASSOCIATED)
 		worker->flags |= WORKER_UNBOUND;
 
 	list_add_tail(&worker->node, &pool->workers);
+	worker->pool = pool;
 
-	mutex_unlock(&pool->attach_mutex);
+	mutex_unlock(&wq_pool_attach_mutex);
 }
 
 /**
  * worker_detach_from_pool() - detach a worker from its pool
  * @worker: worker which is attached to its pool
- * @pool: the pool @worker is attached to
  *
  * Undo the attaching which had been done in worker_attach_to_pool().  The
  * caller worker shouldn't access to the pool after detached except it has
  * other reference to the pool.
  */
-static void worker_detach_from_pool(struct worker *worker,
-				    struct worker_pool *pool)
+static void worker_detach_from_pool(struct worker *worker)
 {
+	struct worker_pool *pool = worker->pool;
 	struct completion *detach_completion = NULL;
 
-	mutex_lock(&pool->attach_mutex);
+	mutex_lock(&wq_pool_attach_mutex);
+
 	list_del(&worker->node);
+	worker->pool = NULL;
+
 	if (list_empty(&pool->workers))
 		detach_completion = pool->detach_completion;
-	mutex_unlock(&pool->attach_mutex);
+	mutex_unlock(&wq_pool_attach_mutex);
 
 	/* clear leftover flags without pool->lock after it is detached */
 	worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
@@ -1799,7 +1802,6 @@ static struct worker *create_worker(struct worker_pool *pool)
 	if (!worker)
 		goto fail;
 
-	worker->pool = pool;
 	worker->id = id;
 
 	if (pool->cpu >= 0)
@@ -2086,6 +2088,12 @@ __acquires(&pool->lock)
 	worker->current_pwq = pwq;
 	work_color = get_work_color(work);
 
+	/*
+	 * Record wq name for cmdline and debug reporting, may get
+	 * overridden through set_worker_desc().
+	 */
+	strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN);
+
 	list_del_init(&work->entry);
 
 	/*
@@ -2181,7 +2189,6 @@ __acquires(&pool->lock)
 	worker->current_work = NULL;
 	worker->current_func = NULL;
 	worker->current_pwq = NULL;
-	worker->desc_valid = false;
 	pwq_dec_nr_in_flight(pwq, work_color);
 }
 
@@ -2206,6 +2213,16 @@ static void process_scheduled_works(struct worker *worker)
 	}
 }
 
+static void set_pf_worker(bool val)
+{
+	mutex_lock(&wq_pool_attach_mutex);
+	if (val)
+		current->flags |= PF_WQ_WORKER;
+	else
+		current->flags &= ~PF_WQ_WORKER;
+	mutex_unlock(&wq_pool_attach_mutex);
+}
+
 /**
  * worker_thread - the worker thread function
  * @__worker: self
@@ -2224,7 +2241,7 @@ static int worker_thread(void *__worker)
 	struct worker_pool *pool = worker->pool;
 
 	/* tell the scheduler that this is a workqueue worker */
-	worker->task->flags |= PF_WQ_WORKER;
+	set_pf_worker(true);
 woke_up:
 	spin_lock_irq(&pool->lock);
 
@@ -2232,11 +2249,11 @@ woke_up:
 	if (unlikely(worker->flags & WORKER_DIE)) {
 		spin_unlock_irq(&pool->lock);
 		WARN_ON_ONCE(!list_empty(&worker->entry));
-		worker->task->flags &= ~PF_WQ_WORKER;
+		set_pf_worker(false);
 
 		set_task_comm(worker->task, "kworker/dying");
 		ida_simple_remove(&pool->worker_ida, worker->id);
-		worker_detach_from_pool(worker, pool);
+		worker_detach_from_pool(worker);
 		kfree(worker);
 		return 0;
 	}
@@ -2335,7 +2352,7 @@ static int rescuer_thread(void *__rescuer)
 	 * Mark rescuer as worker too.  As WORKER_PREP is never cleared, it
 	 * doesn't participate in concurrency management.
 	 */
-	rescuer->task->flags |= PF_WQ_WORKER;
+	set_pf_worker(true);
 repeat:
 	set_current_state(TASK_IDLE);
 
@@ -2367,7 +2384,6 @@ repeat:
 		worker_attach_to_pool(rescuer, pool);
 
 		spin_lock_irq(&pool->lock);
-		rescuer->pool = pool;
 
 		/*
 		 * Slurp in all works issued via this workqueue and
@@ -2417,10 +2433,9 @@ repeat:
 		if (need_more_worker(pool))
 			wake_up_worker(pool);
 
-		rescuer->pool = NULL;
 		spin_unlock_irq(&pool->lock);
 
-		worker_detach_from_pool(rescuer, pool);
+		worker_detach_from_pool(rescuer);
 
 		spin_lock_irq(&wq_mayday_lock);
 	}
@@ -2429,7 +2444,7 @@ repeat:
 
 	if (should_stop) {
 		__set_current_state(TASK_RUNNING);
-		rescuer->task->flags &= ~PF_WQ_WORKER;
+		set_pf_worker(false);
 		return 0;
 	}
 
@@ -3271,7 +3286,6 @@ static int init_worker_pool(struct worker_pool *pool)
 
 	timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
 
-	mutex_init(&pool->attach_mutex);
 	INIT_LIST_HEAD(&pool->workers);
 
 	ida_init(&pool->worker_ida);
@@ -3354,10 +3368,10 @@ static void put_unbound_pool(struct worker_pool *pool)
 	WARN_ON(pool->nr_workers || pool->nr_idle);
 	spin_unlock_irq(&pool->lock);
 
-	mutex_lock(&pool->attach_mutex);
+	mutex_lock(&wq_pool_attach_mutex);
 	if (!list_empty(&pool->workers))
 		pool->detach_completion = &detach_completion;
-	mutex_unlock(&pool->attach_mutex);
+	mutex_unlock(&wq_pool_attach_mutex);
 
 	if (pool->detach_completion)
 		wait_for_completion(pool->detach_completion);
@@ -3700,8 +3714,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
 
 	lockdep_assert_held(&wq_pool_mutex);
 
-	ctx = kzalloc(sizeof(*ctx) + nr_node_ids * sizeof(ctx->pwq_tbl[0]),
-		      GFP_KERNEL);
+	ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL);
 
 	new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
 	tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
@@ -4347,7 +4360,6 @@ void set_worker_desc(const char *fmt, ...)
 		va_start(args, fmt);
 		vsnprintf(worker->desc, sizeof(worker->desc), fmt, args);
 		va_end(args);
-		worker->desc_valid = true;
 	}
 }
 
@@ -4371,7 +4383,6 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)
 	char desc[WORKER_DESC_LEN] = { };
 	struct pool_workqueue *pwq = NULL;
 	struct workqueue_struct *wq = NULL;
-	bool desc_valid = false;
 	struct worker *worker;
 
 	if (!(task->flags & PF_WQ_WORKER))
@@ -4384,22 +4395,18 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)
 	worker = kthread_probe_data(task);
 
 	/*
-	 * Carefully copy the associated workqueue's workfn and name.  Keep
-	 * the original last '\0' in case the original contains garbage.
+	 * Carefully copy the associated workqueue's workfn, name and desc.
+	 * Keep the original last '\0' in case the original is garbage.
 	 */
 	probe_kernel_read(&fn, &worker->current_func, sizeof(fn));
 	probe_kernel_read(&pwq, &worker->current_pwq, sizeof(pwq));
 	probe_kernel_read(&wq, &pwq->wq, sizeof(wq));
 	probe_kernel_read(name, wq->name, sizeof(name) - 1);
-
-	/* copy worker description */
-	probe_kernel_read(&desc_valid, &worker->desc_valid, sizeof(desc_valid));
-	if (desc_valid)
-		probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);
+	probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);
 
 	if (fn || name[0] || desc[0]) {
 		printk("%sWorkqueue: %s %pf", log_lvl, name, fn);
-		if (desc[0])
+		if (strcmp(name, desc))
 			pr_cont(" (%s)", desc);
 		pr_cont("\n");
 	}
@@ -4579,6 +4586,47 @@ void show_workqueue_state(void)
 	rcu_read_unlock_sched();
 }
 
+/* used to show worker information through /proc/PID/{comm,stat,status} */
+void wq_worker_comm(char *buf, size_t size, struct task_struct *task)
+{
+	int off;
+
+	/* always show the actual comm */
+	off = strscpy(buf, task->comm, size);
+	if (off < 0)
+		return;
+
+	/* stabilize PF_WQ_WORKER and worker pool association */
+	mutex_lock(&wq_pool_attach_mutex);
+
+	if (task->flags & PF_WQ_WORKER) {
+		struct worker *worker = kthread_data(task);
+		struct worker_pool *pool = worker->pool;
+
+		if (pool) {
+			spin_lock_irq(&pool->lock);
+			/*
+			 * ->desc tracks information (wq name or
+			 * set_worker_desc()) for the latest execution.  If
+			 * current, prepend '+', otherwise '-'.
+			 */
+			if (worker->desc[0] != '\0') {
+				if (worker->current_work)
+					scnprintf(buf + off, size - off, "+%s",
+						  worker->desc);
+				else
+					scnprintf(buf + off, size - off, "-%s",
+						  worker->desc);
+			}
+			spin_unlock_irq(&pool->lock);
+		}
+	}
+
+	mutex_unlock(&wq_pool_attach_mutex);
+}
+
+#ifdef CONFIG_SMP
+
 /*
  * CPU hotplug.
  *
@@ -4600,7 +4648,7 @@ static void unbind_workers(int cpu)
 	struct worker *worker;
 
 	for_each_cpu_worker_pool(pool, cpu) {
-		mutex_lock(&pool->attach_mutex);
+		mutex_lock(&wq_pool_attach_mutex);
 		spin_lock_irq(&pool->lock);
 
 		/*
@@ -4616,7 +4664,7 @@ static void unbind_workers(int cpu)
 		pool->flags |= POOL_DISASSOCIATED;
 
 		spin_unlock_irq(&pool->lock);
-		mutex_unlock(&pool->attach_mutex);
+		mutex_unlock(&wq_pool_attach_mutex);
 
 		/*
 		 * Call schedule() so that we cross rq->lock and thus can
@@ -4657,7 +4705,7 @@ static void rebind_workers(struct worker_pool *pool)
 {
 	struct worker *worker;
 
-	lockdep_assert_held(&pool->attach_mutex);
+	lockdep_assert_held(&wq_pool_attach_mutex);
 
 	/*
 	 * Restore CPU affinity of all workers.  As all idle workers should
@@ -4727,7 +4775,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
 	static cpumask_t cpumask;
 	struct worker *worker;
 
-	lockdep_assert_held(&pool->attach_mutex);
+	lockdep_assert_held(&wq_pool_attach_mutex);
 
 	/* is @cpu allowed for @pool? */
 	if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
@@ -4762,14 +4810,14 @@ int workqueue_online_cpu(unsigned int cpu)
 	mutex_lock(&wq_pool_mutex);
 
 	for_each_pool(pool, pi) {
-		mutex_lock(&pool->attach_mutex);
+		mutex_lock(&wq_pool_attach_mutex);
 
 		if (pool->cpu == cpu)
 			rebind_workers(pool);
 		else if (pool->cpu < 0)
 			restore_unbound_workers_cpumask(pool, cpu);
 
-		mutex_unlock(&pool->attach_mutex);
+		mutex_unlock(&wq_pool_attach_mutex);
 	}
 
 	/* update NUMA affinity of unbound workqueues */
@@ -4799,8 +4847,6 @@ int workqueue_offline_cpu(unsigned int cpu)
 	return 0;
 }
 
-#ifdef CONFIG_SMP
-
 struct work_for_cpu {
 	struct work_struct work;
 	long (*fn)(void *);
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index d390d1be3748..66fbb5a9e633 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -31,13 +31,12 @@ struct worker {
 	struct work_struct	*current_work;	/* L: work being processed */
 	work_func_t		current_func;	/* L: current_work's fn */
 	struct pool_workqueue	*current_pwq; /* L: current_work's pwq */
-	bool			desc_valid;	/* ->desc is valid */
 	struct list_head	scheduled;	/* L: scheduled works */
 
 	/* 64 bytes boundary on 64bit, 32 on 32bit */
 
 	struct task_struct	*task;		/* I: worker task */
-	struct worker_pool	*pool;		/* I: the associated pool */
+	struct worker_pool	*pool;		/* A: the associated pool */
 						/* L: for rescuers */
 	struct list_head	node;		/* A: anchored at pool->workers */
 						/* A: runs through worker->node */