From d83a7cb375eec21f04c83542395d08b2f6641da2 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 13 Feb 2017 19:42:40 -0600
Subject: livepatch: change to a per-task consistency model

Change livepatch to use a basic per-task consistency model.  This is the
foundation which will eventually enable us to patch those ~10% of
security patches which change function or data semantics.  This is the
biggest remaining piece needed to make livepatch more generally useful.

This code stems from the design proposal made by Vojtech [1] in November
2014.  It's a hybrid of kGraft and kpatch: it uses kGraft's per-task
consistency and syscall barrier switching combined with kpatch's stack
trace switching.  There are also a number of fallback options which make
it quite flexible.

Patches are applied on a per-task basis, when the task is deemed safe to
switch over.  When a patch is enabled, livepatch enters into a
transition state where tasks are converging to the patched state.
Usually this transition state can complete in a few seconds.  The same
sequence occurs when a patch is disabled, except the tasks converge from
the patched state to the unpatched state.

An interrupt handler inherits the patched state of the task it
interrupts.  The same is true for forked tasks: the child inherits the
patched state of the parent.

Livepatch uses several complementary approaches to determine when it's
safe to patch tasks:

1. The first and most effective approach is stack checking of sleeping
   tasks.  If no affected functions are on the stack of a given task,
   the task is patched.  In most cases this will patch most or all of
   the tasks on the first try.  Otherwise it'll keep trying
   periodically.  This option is only available if the architecture has
   reliable stacks (HAVE_RELIABLE_STACKTRACE).

2. The second approach, if needed, is kernel exit switching.  A
   task is switched when it returns to user space from a system call, a
   user space IRQ, or a signal.  It's useful in the following cases:

   a) Patching I/O-bound user tasks which are sleeping on an affected
      function.  In this case you have to send SIGSTOP and SIGCONT to
      force them to exit the kernel and be patched.
   b) Patching CPU-bound user tasks.  If the task is highly CPU-bound
      then it will get patched the next time it gets interrupted by an
      IRQ.
   c) In the future it could be useful for applying patches for
      architectures which don't yet have HAVE_RELIABLE_STACKTRACE.  In
      this case you would have to signal most of the tasks on the
      system.  However this isn't supported yet because there's
      currently no way to patch kthreads without
      HAVE_RELIABLE_STACKTRACE.

3. For idle "swapper" tasks, since they don't ever exit the kernel, they
   instead have a klp_update_patch_state() call in the idle loop which
   allows them to be patched before the CPU enters the idle state.

   (Note there's not yet such an approach for kthreads.)

All the above approaches may be skipped by setting the 'immediate' flag
in the 'klp_patch' struct, which will disable per-task consistency and
patch all tasks immediately.  This can be useful if the patch doesn't
change any function or data semantics.  Note that, even with this flag
set, it's possible that some tasks may still be running with an old
version of the function, until that function returns.

There's also an 'immediate' flag in the 'klp_func' struct which allows
you to specify that certain functions in the patch can be applied
without per-task consistency.  This might be useful if you want to patch
a common function like schedule(), and the function change doesn't need
consistency but the rest of the patch does.

For architectures which don't have HAVE_RELIABLE_STACKTRACE, the user
must set patch->immediate which causes all tasks to be patched
immediately.  This option should be used with care, only when the patch
doesn't change any function or data semantics.

In the future, architectures which don't have HAVE_RELIABLE_STACKTRACE
may be allowed to use per-task consistency if we can come up with
another way to patch kthreads.

The /sys/kernel/livepatch/<patch>/transition file shows whether a patch
is in transition.  Only a single patch (the topmost patch on the stack)
can be in transition at a given time.  A patch can remain in transition
indefinitely, if any of the tasks are stuck in the initial patch state.

A transition can be reversed and effectively canceled by writing the
opposite value to the /sys/kernel/livepatch/<patch>/enabled file while
the transition is in progress.  Then all the tasks will attempt to
converge back to the original patch state.

[1] https://lkml.kernel.org/r/20141107140458.GA21774@suse.cz

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Miroslav Benes <mbenes@suse.cz>
Acked-by: Ingo Molnar <mingo@kernel.org>        # for the scheduler changes
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/init_task.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux/init_task.h')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 91d9049f0039..5a791055b176 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -15,6 +15,7 @@
 #include <linux/sched/autogroup.h>
 #include <net/net_namespace.h>
 #include <linux/sched/rt.h>
+#include <linux/livepatch.h>
 #include <linux/mm_types.h>
 
 #include <asm/thread_info.h>
@@ -202,6 +203,13 @@ extern struct cred init_cred;
 # define INIT_KASAN(tsk)
 #endif
 
+#ifdef CONFIG_LIVEPATCH
+# define INIT_LIVEPATCH(tsk)						\
+	.patch_state = KLP_UNDEFINED,
+#else
+# define INIT_LIVEPATCH(tsk)
+#endif
+
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 # define INIT_TASK_TI(tsk)			\
 	.thread_info = INIT_THREAD_INFO(tsk),	\
@@ -288,6 +296,7 @@ extern struct cred init_cred;
 	INIT_VTIME(tsk)							\
 	INIT_NUMA_BALANCING(tsk)					\
 	INIT_KASAN(tsk)							\
+	INIT_LIVEPATCH(tsk)						\
 }
 
 
-- 
cgit v1.2.3


From e4e55b47ed9ae2c05ff062601ff6dacbe9dc4775 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Fri, 24 Mar 2017 20:46:33 +0900
Subject: LSM: Revive security_task_alloc() hook and per "struct task_struct"
 security blob.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We switched from "struct task_struct"->security to "struct cred"->security
in Linux 2.6.29. But not all LSM modules were happy with that change.
The TOMOYO LSM module is one example which wants to use a per "struct
task_struct" security blob, for TOMOYO's security context is defined based
on "struct task_struct" rather than "struct cred". The AppArmor LSM module
is another example which wants to use it, for AppArmor is currently abusing
the cred a little bit to store the change_hat and setexeccon info. Although
security_task_free() hook was revived in Linux 3.4 because Yama LSM module
wanted to release per "struct task_struct" security blob,
security_task_alloc() hook and "struct task_struct"->security field were
not revived. Nowadays, we are getting proposals for lightweight LSM modules
which want to use a per "struct task_struct" security blob.

We are already allowing multiple concurrent LSM modules (up to one fully
armored module which uses "struct cred"->security field or exclusive hooks
like security_xfrm_state_pol_flow_match(), plus unlimited number of
lightweight modules which do not use "struct cred"->security nor exclusive
hooks) as long as they are built into the kernel. But this patch does not
implement variable length "struct task_struct"->security field which will
become needed when multiple LSM modules want to use "struct task_struct"->
security field. Although it won't be difficult to implement variable length
"struct task_struct"->security field, let's think about it after this patch
is merged.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: John Johansen <john.johansen@canonical.com>
Acked-by: Serge Hallyn <serge@hallyn.com>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Tested-by: Djalal Harouni <tixxdz@gmail.com>
Acked-by: José Bollo <jobol@nonadev.net>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: Eric Paris <eparis@parisplace.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: James Morris <james.l.morris@oracle.com>
Cc: José Bollo <jobol@nonadev.net>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 include/linux/init_task.h | 7 +++++++
 include/linux/lsm_hooks.h | 9 ++++++++-
 include/linux/sched.h     | 4 ++++
 include/linux/security.h  | 7 +++++++
 kernel/fork.c             | 7 ++++++-
 security/security.c       | 5 +++++
 6 files changed, 37 insertions(+), 2 deletions(-)

(limited to 'include/linux/init_task.h')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 91d9049f0039..926f2f553cc5 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -210,6 +210,12 @@ extern struct cred init_cred;
 # define INIT_TASK_TI(tsk)
 #endif
 
+#ifdef CONFIG_SECURITY
+#define INIT_TASK_SECURITY .security = NULL,
+#else
+#define INIT_TASK_SECURITY
+#endif
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -288,6 +294,7 @@ extern struct cred init_cred;
 	INIT_VTIME(tsk)							\
 	INIT_NUMA_BALANCING(tsk)					\
 	INIT_KASAN(tsk)							\
+	INIT_TASK_SECURITY						\
 }
 
 
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 1aa63335de9e..080f34e66017 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -533,8 +533,13 @@
  *	manual page for definitions of the @clone_flags.
  *	@clone_flags contains the flags indicating what should be shared.
  *	Return 0 if permission is granted.
+ * @task_alloc:
+ *	@task task being allocated.
+ *	@clone_flags contains the flags indicating what should be shared.
+ *	Handle allocation of task-related resources.
+ *	Returns a zero on success, negative values on failure.
  * @task_free:
- *	@task task being freed
+ *	@task task about to be freed.
  *	Handle release of task-related resources. (Note that this can be called
  *	from interrupt context.)
  * @cred_alloc_blank:
@@ -1482,6 +1487,7 @@ union security_list_options {
 	int (*file_open)(struct file *file, const struct cred *cred);
 
 	int (*task_create)(unsigned long clone_flags);
+	int (*task_alloc)(struct task_struct *task, unsigned long clone_flags);
 	void (*task_free)(struct task_struct *task);
 	int (*cred_alloc_blank)(struct cred *cred, gfp_t gfp);
 	void (*cred_free)(struct cred *cred);
@@ -1748,6 +1754,7 @@ struct security_hook_heads {
 	struct list_head file_receive;
 	struct list_head file_open;
 	struct list_head task_create;
+	struct list_head task_alloc;
 	struct list_head task_free;
 	struct list_head cred_alloc_blank;
 	struct list_head cred_free;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d67eee84fd43..71b8df306bb0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1037,6 +1037,10 @@ struct task_struct {
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 	/* A live task holds one reference: */
 	atomic_t			stack_refcount;
+#endif
+#ifdef CONFIG_SECURITY
+	/* Used by LSM modules for access restriction: */
+	void				*security;
 #endif
 	/* CPU-specific state of this task: */
 	struct thread_struct		thread;
diff --git a/include/linux/security.h b/include/linux/security.h
index 97df7bac5b48..af675b576645 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -308,6 +308,7 @@ int security_file_send_sigiotask(struct task_struct *tsk,
 int security_file_receive(struct file *file);
 int security_file_open(struct file *file, const struct cred *cred);
 int security_task_create(unsigned long clone_flags);
+int security_task_alloc(struct task_struct *task, unsigned long clone_flags);
 void security_task_free(struct task_struct *task);
 int security_cred_alloc_blank(struct cred *cred, gfp_t gfp);
 void security_cred_free(struct cred *cred);
@@ -861,6 +862,12 @@ static inline int security_task_create(unsigned long clone_flags)
 	return 0;
 }
 
+static inline int security_task_alloc(struct task_struct *task,
+				      unsigned long clone_flags)
+{
+	return 0;
+}
+
 static inline void security_task_free(struct task_struct *task)
 { }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 6c463c80e93d..3d32513d6c73 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1679,9 +1679,12 @@ static __latent_entropy struct task_struct *copy_process(
 		goto bad_fork_cleanup_perf;
 	/* copy all the process information */
 	shm_init_task(p);
-	retval = copy_semundo(clone_flags, p);
+	retval = security_task_alloc(p, clone_flags);
 	if (retval)
 		goto bad_fork_cleanup_audit;
+	retval = copy_semundo(clone_flags, p);
+	if (retval)
+		goto bad_fork_cleanup_security;
 	retval = copy_files(clone_flags, p);
 	if (retval)
 		goto bad_fork_cleanup_semundo;
@@ -1903,6 +1906,8 @@ bad_fork_cleanup_files:
 	exit_files(p); /* blocking */
 bad_fork_cleanup_semundo:
 	exit_sem(p);
+bad_fork_cleanup_security:
+	security_task_free(p);
 bad_fork_cleanup_audit:
 	audit_free(p);
 bad_fork_cleanup_perf:
diff --git a/security/security.c b/security/security.c
index 2f15488dc6bc..549bddcc2116 100644
--- a/security/security.c
+++ b/security/security.c
@@ -937,6 +937,11 @@ int security_task_create(unsigned long clone_flags)
 	return call_int_hook(task_create, 0, clone_flags);
 }
 
+int security_task_alloc(struct task_struct *task, unsigned long clone_flags)
+{
+	return call_int_hook(task_alloc, 0, task, clone_flags);
+}
+
 void security_task_free(struct task_struct *task)
 {
 	call_void_hook(task_free, task);
-- 
cgit v1.2.3


From e96a7705e7d3fef96aec9b590c63b2f6f7d2ba22 Mon Sep 17 00:00:00 2001
From: Xunlei Pang <xlpang@redhat.com>
Date: Thu, 23 Mar 2017 15:56:08 +0100
Subject: sched/rtmutex/deadline: Fix a PI crash for deadline tasks

A crash happened while I was playing with deadline PI rtmutex.

    BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
    IP: [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30
    PGD 232a75067 PUD 230947067 PMD 0
    Oops: 0000 [#1] SMP
    CPU: 1 PID: 10994 Comm: a.out Not tainted

    Call Trace:
    [<ffffffff810b658c>] enqueue_task+0x2c/0x80
    [<ffffffff810ba763>] activate_task+0x23/0x30
    [<ffffffff810d0ab5>] pull_dl_task+0x1d5/0x260
    [<ffffffff810d0be6>] pre_schedule_dl+0x16/0x20
    [<ffffffff8164e783>] __schedule+0xd3/0x900
    [<ffffffff8164efd9>] schedule+0x29/0x70
    [<ffffffff8165035b>] __rt_mutex_slowlock+0x4b/0xc0
    [<ffffffff81650501>] rt_mutex_slowlock+0xd1/0x190
    [<ffffffff810eeb33>] rt_mutex_timed_lock+0x53/0x60
    [<ffffffff810ecbfc>] futex_lock_pi.isra.18+0x28c/0x390
    [<ffffffff810ed8b0>] do_futex+0x190/0x5b0
    [<ffffffff810edd50>] SyS_futex+0x80/0x180

This is because rt_mutex_enqueue_pi() and rt_mutex_dequeue_pi()
are only protected by pi_lock when operating on the pi waiters, while
rt_mutex_get_top_task() accesses them with the rq lock held but
without holding pi_lock.

In order to tackle it, we introduce a new "pi_top_task" pointer
cached in task_struct, and add a new rt_mutex_update_top_task()
to update its value. It is called by rt_mutex_setprio(), which
holds both the owner's pi_lock and the rq lock. Thus "pi_top_task"
can be safely accessed by enqueue_task_dl() under the rq lock.

Originally-From: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Xunlei Pang <xlpang@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170323150216.157682758@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/init_task.h |  1 +
 include/linux/sched.h     |  2 ++
 include/linux/sched/rt.h  |  1 +
 kernel/fork.c             |  1 +
 kernel/locking/rtmutex.c  | 29 +++++++++++++++++++++--------
 kernel/sched/core.c       |  2 ++
 6 files changed, 28 insertions(+), 8 deletions(-)

(limited to 'include/linux/init_task.h')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 91d9049f0039..2c487e0879d5 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -181,6 +181,7 @@ extern struct cred init_cred;
 #ifdef CONFIG_RT_MUTEXES
 # define INIT_RT_MUTEXES(tsk)						\
 	.pi_waiters = RB_ROOT,						\
+	.pi_top_task = NULL,						\
 	.pi_waiters_leftmost = NULL,
 #else
 # define INIT_RT_MUTEXES(tsk)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d67eee84fd43..1ea2eee7bc4f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -775,6 +775,8 @@ struct task_struct {
 	/* PI waiters blocked on a rt_mutex held by this task: */
 	struct rb_root			pi_waiters;
 	struct rb_node			*pi_waiters_leftmost;
+	/* Updated under owner's pi_lock and rq lock */
+	struct task_struct		*pi_top_task;
 	/* Deadlock detection and priority inheritance handling: */
 	struct rt_mutex_waiter		*pi_blocked_on;
 #endif
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index 3bd668414f61..10ee7eeb0ee2 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -21,6 +21,7 @@ static inline int rt_task(struct task_struct *p)
 extern int rt_mutex_getprio(struct task_struct *p);
 extern void rt_mutex_setprio(struct task_struct *p, int prio);
 extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
+extern void rt_mutex_update_top_task(struct task_struct *p);
 extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
 extern void rt_mutex_adjust_pi(struct task_struct *p);
 static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
diff --git a/kernel/fork.c b/kernel/fork.c
index 6c463c80e93d..b30196a00b0d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1438,6 +1438,7 @@ static void rt_mutex_init_task(struct task_struct *p)
 #ifdef CONFIG_RT_MUTEXES
 	p->pi_waiters = RB_ROOT;
 	p->pi_waiters_leftmost = NULL;
+	p->pi_top_task = NULL;
 	p->pi_blocked_on = NULL;
 #endif
 }
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 71ecf0624410..bc05b104eaed 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -322,6 +322,19 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
 	RB_CLEAR_NODE(&waiter->pi_tree_entry);
 }
 
+/*
+ * Must hold both p->pi_lock and task_rq(p)->lock.
+ */
+void rt_mutex_update_top_task(struct task_struct *p)
+{
+	if (!task_has_pi_waiters(p)) {
+		p->pi_top_task = NULL;
+		return;
+	}
+
+	p->pi_top_task = task_top_pi_waiter(p)->task;
+}
+
 /*
  * Calculate task priority from the waiter tree priority
  *
@@ -337,12 +350,12 @@ int rt_mutex_getprio(struct task_struct *task)
 		   task->normal_prio);
 }
 
+/*
+ * Must hold either task->pi_lock or task_rq(task)->lock.
+ */
 struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
 {
-	if (likely(!task_has_pi_waiters(task)))
-		return NULL;
-
-	return task_top_pi_waiter(task)->task;
+	return task->pi_top_task;
 }
 
 /*
@@ -351,12 +364,12 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
  */
 int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
 {
-	if (!task_has_pi_waiters(task))
+	struct task_struct *top_task = rt_mutex_get_top_task(task);
+
+	if (!top_task)
 		return newprio;
 
-	if (task_top_pi_waiter(task)->task->prio <= newprio)
-		return task_top_pi_waiter(task)->task->prio;
-	return newprio;
+	return min(top_task->prio, newprio);
 }
 
 /*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ab9f6ac099a7..e1f44ec701c8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3713,6 +3713,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		goto out_unlock;
 	}
 
+	rt_mutex_update_top_task(p);
+
 	trace_sched_pi_setprio(p, prio);
 	oldprio = p->prio;
 
-- 
cgit v1.2.3