From 769071ac9f20b6a447410c7eaa55d1a5233ef40c Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:26:52 +0000 Subject: ns: Introduce Time Namespace Time Namespace isolates clock values. The kernel provides access to several clocks CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME, etc. CLOCK_REALTIME System-wide clock that measures real (i.e., wall-clock) time. CLOCK_MONOTONIC Clock that cannot be set and represents monotonic time since some unspecified starting point. CLOCK_BOOTTIME Identical to CLOCK_MONOTONIC, except it also includes any time that the system is suspended. For many users, the time namespace means the ability to changes date and time in a container (CLOCK_REALTIME). Providing per namespace notions of CLOCK_REALTIME would be complex with a massive overhead, but has a dubious value. But in the context of checkpoint/restore functionality, monotonic and boottime clocks become interesting. Both clocks are monotonic with unspecified starting points. These clocks are widely used to measure time slices and set timers. After restoring or migrating processes, it has to be guaranteed that they never go backward. In an ideal case, the behavior of these clocks should be the same as for a case when a whole system is suspended. All this means that it is required to set CLOCK_MONOTONIC and CLOCK_BOOTTIME clocks, which can be achieved by adding per-namespace offsets for clocks. A time namespace is similar to a pid namespace in the way how it is created: unshare(CLONE_NEWTIME) system call creates a new time namespace, but doesn't set it to the current process. Then all children of the process will be born in the new time namespace, or a process can use the setns() system call to join a namespace. This scheme allows setting clock offsets for a namespace, before any processes appear in it. All available clone flags have been used, so CLONE_NEWTIME uses the highest bit of CSIGNAL. It means that it can be used only with the unshare() and the clone3() system calls. [ tglx: Adjusted paragraph about clone3() to reality and massaged the changelog a bit. ] Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://criu.org/Time_namespace Link: https://lists.openvz.org/pipermail/criu/2018-June/041504.html Link: https://lore.kernel.org/r/20191112012724.250792-4-dima@arista.com --- fs/proc/namespaces.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index dd2b35f78b09..8b5c720fe5d7 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -33,6 +33,10 @@ static const struct proc_ns_operations *ns_entries[] = { #ifdef CONFIG_CGROUPS &cgroupns_operations, #endif +#ifdef CONFIG_TIME_NS + &timens_operations, + &timens_for_children_operations, +#endif }; static const char *proc_ns_get_link(struct dentry *dentry, -- cgit v1.2.3 From 6cd889d43c40b13f81a44c41896781ce70244769 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:02 +0000 Subject: timerfd: Make timerfd_settime() time namespace aware timerfd_settime() accepts an absolute value of the expiration time if TFD_TIMER_ABSTIME is specified. This value is in the task's time namespace and has to be converted to the host's time namespace. Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-14-dima@arista.com --- fs/timerfd.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/timerfd.c b/fs/timerfd.c index ac7f59a58f94..c5509d2448e3 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -26,6 +26,7 @@ #include #include #include +#include struct timerfd_ctx { union { @@ -196,6 +197,8 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags, } if (texp != 0) { + if (flags & TFD_TIMER_ABSTIME) + texp = timens_ktime_to_host(clockid, texp); if (isalarm(ctx)) { if (flags & TFD_TIMER_ABSTIME) alarm_start(&ctx->t.alarm, texp); -- cgit v1.2.3 From 0efc8bb0bb5fdfd529a23073ee15478b5d5e3839 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 12 Nov 2019 01:27:07 +0000 Subject: fs/proc: Respect boottime inside time namespace for /proc/uptime Make sure that /proc/uptime is adjusted to the tasks time namespace. Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-19-dima@arista.com --- fs/proc/uptime.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index a4c2791ab70b..5a1b228964fb 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -5,6 +5,7 @@ #include #include #include +#include #include static int uptime_proc_show(struct seq_file *m, void *v) @@ -20,6 +21,8 @@ static int uptime_proc_show(struct seq_file *m, void *v) nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; ktime_get_boottime_ts64(&uptime); + timens_add_boottime(&uptime); + idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); idle.tv_nsec = rem; seq_printf(m, "%lu.%02lu %lu.%02lu\n", -- cgit v1.2.3 From 04a8682a71becdb639ec9c0d82b315a2baef7a5d Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 12 Nov 2019 01:27:16 +0000 Subject: fs/proc: Introduce /proc/pid/timens_offsets API to set time namespace offsets for children processes, i.e.: echo "$clockid $offset_sec $offset_nsec" > /proc/self/timens_offsets Co-developed-by: Dmitry Safonov Signed-off-by: Andrei Vagin Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20191112012724.250792-28-dima@arista.com --- fs/proc/base.c | 94 ++++++++++++++++++++++++++++++++++++++ include/linux/time_namespace.h | 10 ++++ kernel/time/namespace.c | 101 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 205 insertions(+) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index ebea9501afb8..5adc6390ac3a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -94,6 +94,7 @@ #include #include #include +#include #include #include "internal.h" #include "fd.h" @@ -1533,6 +1534,96 @@ static const struct file_operations proc_pid_sched_autogroup_operations = { #endif /* CONFIG_SCHED_AUTOGROUP */ +#ifdef CONFIG_TIME_NS +static int timens_offsets_show(struct seq_file *m, void *v) +{ + struct task_struct *p; + + p = get_proc_task(file_inode(m->file)); + if (!p) + return -ESRCH; + proc_timens_show_offsets(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t timens_offsets_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct inode *inode = file_inode(file); + struct proc_timens_offset offsets[2]; + char *kbuf = NULL, *pos, *next_line; + struct task_struct *p; + int ret, noffsets; + + /* Only allow < page size writes at the beginning of the file */ + if ((*ppos != 0) || (count >= PAGE_SIZE)) + return -EINVAL; + + /* Slurp in the user data */ + kbuf = memdup_user_nul(buf, count); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + /* Parse the user data */ + ret = -EINVAL; + noffsets = 0; + for (pos = kbuf; pos; pos = next_line) { + struct proc_timens_offset *off = &offsets[noffsets]; + int err; + + /* Find the end of line and ensure we don't look past it */ + next_line = strchr(pos, '\n'); + if (next_line) { + *next_line = '\0'; + next_line++; + if (*next_line == '\0') + next_line = NULL; + } + + err = sscanf(pos, "%u %lld %lu", &off->clockid, + &off->val.tv_sec, &off->val.tv_nsec); + if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC) + goto out; + noffsets++; + if (noffsets == ARRAY_SIZE(offsets)) { + if (next_line) + count = next_line - kbuf; + break; + } + } + + ret = -ESRCH; + p = get_proc_task(inode); + if (!p) + goto out; + ret = proc_timens_set_offset(file, p, offsets, noffsets); + put_task_struct(p); + if (ret) + goto out; + + ret = count; +out: + kfree(kbuf); + return ret; +} + +static int timens_offsets_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, timens_offsets_show, inode); +} + +static const struct file_operations proc_timens_offsets_operations = { + .open = timens_offsets_open, + .read = seq_read, + .write = timens_offsets_write, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_TIME_NS */ + static ssize_t comm_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { @@ -3015,6 +3106,9 @@ static const struct pid_entry tgid_base_stuff[] = { #endif #ifdef CONFIG_SCHED_AUTOGROUP REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), +#endif +#ifdef CONFIG_TIME_NS + REG("timens_offsets", S_IRUGO|S_IWUSR, proc_timens_offsets_operations), #endif REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 04a2ba8b8a06..824d54e057eb 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -52,6 +52,16 @@ static inline void put_time_ns(struct time_namespace *ns) kref_put(&ns->kref, free_time_ns); } +void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m); + +struct proc_timens_offset { + int clockid; + struct timespec64 val; +}; + +int proc_timens_set_offset(struct file *file, struct task_struct *p, + struct proc_timens_offset *offsets, int n); + static inline void timens_add_monotonic(struct timespec64 *ts) { struct timens_offsets *ns_offsets = ¤t->nsproxy->time_ns->offsets; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 0732964803b9..12858507d75a 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -334,6 +335,106 @@ static struct user_namespace *timens_owner(struct ns_common *ns) return to_time_ns(ns)->user_ns; } +static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts) +{ + seq_printf(m, "%d %lld %ld\n", clockid, ts->tv_sec, ts->tv_nsec); +} + +void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m) +{ + struct ns_common *ns; + struct time_namespace *time_ns; + + ns = timens_for_children_get(p); + if (!ns) + return; + time_ns = to_time_ns(ns); + + show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic); + show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime); + put_time_ns(time_ns); +} + +int proc_timens_set_offset(struct file *file, struct task_struct *p, + struct proc_timens_offset *offsets, int noffsets) +{ + struct ns_common *ns; + struct time_namespace *time_ns; + struct timespec64 tp; + int i, err; + + ns = timens_for_children_get(p); + if (!ns) + return -ESRCH; + time_ns = to_time_ns(ns); + + if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) { + put_time_ns(time_ns); + return -EPERM; + } + + for (i = 0; i < noffsets; i++) { + struct proc_timens_offset *off = &offsets[i]; + + switch (off->clockid) { + case CLOCK_MONOTONIC: + ktime_get_ts64(&tp); + break; + case CLOCK_BOOTTIME: + ktime_get_boottime_ts64(&tp); + break; + default: + err = -EINVAL; + goto out; + } + + err = -ERANGE; + + if (off->val.tv_sec > KTIME_SEC_MAX || + off->val.tv_sec < -KTIME_SEC_MAX) + goto out; + + tp = timespec64_add(tp, off->val); + /* + * KTIME_SEC_MAX is divided by 2 to be sure that KTIME_MAX is + * still unreachable. + */ + if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2) + goto out; + } + + mutex_lock(&offset_lock); + if (time_ns->frozen_offsets) { + err = -EACCES; + goto out_unlock; + } + + err = 0; + /* Don't report errors after this line */ + for (i = 0; i < noffsets; i++) { + struct proc_timens_offset *off = &offsets[i]; + struct timespec64 *offset = NULL; + + switch (off->clockid) { + case CLOCK_MONOTONIC: + offset = &time_ns->offsets.monotonic; + break; + case CLOCK_BOOTTIME: + offset = &time_ns->offsets.boottime; + break; + } + + *offset = off->val; + } + +out_unlock: + mutex_unlock(&offset_lock); +out: + put_time_ns(time_ns); + + return err; +} + const struct proc_ns_operations timens_operations = { .name = "time", .type = CLONE_NEWTIME, -- cgit v1.2.3