Message ID | 20201221150956.226097-2-khorenko@virtuozzo.com |
---|---|
State | New |
Series | "Series without cover letter" |
Headers | show |
diff --git a/fs/proc/array.c b/fs/proc/array.c index ba712f18e5ff..735876a51a18 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -555,16 +555,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, start_time = task->real_start_time; #ifdef CONFIG_VE - if (!is_super) { - u64 offset = get_exec_env()->real_start_time; - start_time -= (unsigned long long)offset; - } - /* tasks inside a CT can have negative start time e.g. if the CT was - * migrated from another hw node, in which case we will report 0 in - * order not to confuse userspace */ - if ((s64)start_time < 0) - start_time = 0; + if (!is_super) + start_time = task->real_start_time_ct; #endif + /* convert nsec -> ticks */ start_time = nsec_to_clock_t(start_time); diff --git a/include/linux/sched.h b/include/linux/sched.h index 19ca9cc0f3b9..9846553f7039 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -839,7 +839,6 @@ struct task_struct { #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN struct vtime vtime; #endif - #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif @@ -853,6 +852,12 @@ struct task_struct { /* Boot based time in nsecs: */ u64 real_start_time; + /* + * This is a Container-side copy of 'real_start_time' field + * shown from inside of a Container and modified by host. + */ + u64 real_start_time_ct; + /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */ unsigned long min_flt; unsigned long maj_flt; diff --git a/include/linux/ve.h b/include/linux/ve.h index 3aa0ea0b1bab..ab8da4dceec1 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -148,6 +148,22 @@ static u64 ve_get_uptime(struct ve_struct *ve) return ktime_get_boot_ns() - ve->real_start_time; } +static inline void ve_set_task_start_time(struct ve_struct *ve, + struct task_struct *t) +{ + /* + * Mitigate memory access reordering risks by doing double check, + * 'is_running' could be read as 1 before we see + * 'real_start_time' updated here. 
If it's still 0, + * we know 'is_running' is being modified right NOW in + * parallel so it's safe to say that start time is also 0. + */ + if (!ve->is_running || !ve->real_start_time) + t->real_start_time_ct = 0; + else + t->real_start_time_ct = ve_get_uptime(ve); +} + extern void monotonic_abs_to_ve(clockid_t which_clock, struct timespec64 *tp); extern void monotonic_ve_to_abs(clockid_t which_clock, struct timespec64 *tp); diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 947191def935..0d7697ead271 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -232,5 +232,11 @@ struct prctl_mm_map { /* Control reclaim behavior when allocating memory */ #define PR_SET_IO_FLUSHER 57 #define PR_GET_IO_FLUSHER 58 +/* Set task container related fields */ +#define PR_SET_TASK_CT_FIELDS 1000 +#define PR_TASK_CT_FIELDS_START_TIME (1UL << 0) +struct prctl_task_ct_fields { + __s64 real_start_time; +}; #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index da7bda595d44..c996de548127 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -91,6 +91,7 @@ #include <linux/kcov.h> #include <linux/livepatch.h> #include <linux/thread_info.h> +#include <linux/ve.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -1711,6 +1712,9 @@ static __latent_entropy struct task_struct *copy_process( int retval; struct task_struct *p; struct multiprocess_signals delayed; +#ifdef CONFIG_VE + struct ve_struct *ve = get_exec_env(); +#endif /* * Don't allow sharing the root directory with processes in a different @@ -1863,6 +1867,14 @@ static __latent_entropy struct task_struct *copy_process( p->start_time = ktime_get_ns(); p->real_start_time = ktime_get_boot_ns(); + + p->real_start_time_ct = 0; + +#ifdef CONFIG_VE + if (!ve_is_super(ve)) + ve_set_task_start_time(ve, p); +#endif + p->io_context = NULL; audit_set_context(p, NULL); cgroup_fork(p); diff --git a/kernel/sys.c b/kernel/sys.c index b0914885cd68..7ca6e68eafa9 100644 --- 
a/kernel/sys.c +++ b/kernel/sys.c @@ -2261,6 +2261,26 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) } #endif +static int prctl_set_task_ct_fields(struct task_struct *t, unsigned long arg, + unsigned long flags) +{ + struct prctl_task_ct_fields params; +#ifdef CONFIG_VE + struct ve_struct *ve = t->task_ve; + + if (!ve_is_super(ve) && !ve->is_pseudosuper) + return -EPERM; +#endif + + if (copy_from_user(&params, (const void __user *)arg, sizeof(params))) + return -EFAULT; + + if (flags & PR_TASK_CT_FIELDS_START_TIME) + t->real_start_time_ct = (u64)params.real_start_time; + + return 0; +} + static int propagate_has_child_subreaper(struct task_struct *p, void *data) { /* @@ -2538,6 +2558,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER; break; + case PR_SET_TASK_CT_FIELDS: + error = prctl_set_task_ct_fields(me, arg2, arg3); + break; default: error = -EINVAL; break; diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index 0c6630c6616a..3f53641455ad 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -820,6 +820,8 @@ static void ve_attach(struct cgroup_taskset *tset) if (cpuid_override_on()) set_tsk_thread_flag(task, TIF_CPUID_OVERRIDE); + + ve_set_task_start_time(ve, task); task->task_ve = ve; } }
On Mon, Dec 21, 2020 at 06:09:56PM +0300, Konstantin Khorenko wrote: > From: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com> > > Introduced 'real_start_time_ct' field in task_struct. > > The value is READ: > 1. When the process lives inside of a ve group and any process > inside of the same ve group wants to know its start time by reading > its /proc/[pid]/stat file. > 2. At container suspend operation to store this value to a dump image. > > The value is WRITTEN: > 1. At creation time (copy_process function) > 1.1. If a process is being created outside of ve group / on host, then > this value is initialized to 0 > 1.2. If a process is being created by process already living in ve > group, this value is calculated as host_uptime - ve_start_time (i.e. the ve uptime). > > 2. During attach to ve (ve_attach function). The process can be created on > a host and later attached to ve. Its container start_time value has been > already initialized to 0 at creation time. After the process enters the > domain of a ve, the value should be initialized. > Note that the process can be attached to a non-running container, in which > case its start_time value should not be calculated and left initialized to > 0. > > 3. At container restore via prctl (prctl_set_task_ct_fields function). > In this case the value is only settable outside of a container. > During restore the processes would be created from the dump image. > At restore step each process will execute prctl to set its start_time > value, read from the dump. This would only be permitted during > pseudosuper ve mode. The value is set as is (read from the dump), without > any calculations.
> > https://jira.sw.ru/browse/PSBM-64123 > > Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com> > > (cherry picked from vz7 commit eca790eaed527bae7029b4ae1cd557ce847ac6c0) > Signed-off-by: Konstantin Khorenko <khorenko@virtuozzo.com> Reviewed-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com> > > v2: rebased to branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz branch > v3: added missing ve.h include > --- > fs/proc/array.c | 12 +++--------- > include/linux/sched.h | 7 ++++++- > include/linux/ve.h | 16 ++++++++++++++++ > include/uapi/linux/prctl.h | 6 ++++++ > kernel/fork.c | 12 ++++++++++++ > kernel/sys.c | 23 +++++++++++++++++++++++ > kernel/ve/ve.c | 2 ++ > 7 files changed, 68 insertions(+), 10 deletions(-) > > diff --git a/fs/proc/array.c b/fs/proc/array.c > index ba712f18e5ff..735876a51a18 100644 > --- a/fs/proc/array.c > +++ b/fs/proc/array.c > @@ -555,16 +555,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, > start_time = task->real_start_time; > > #ifdef CONFIG_VE > - if (!is_super) { > - u64 offset = get_exec_env()->real_start_time; > - start_time -= (unsigned long long)offset; > - } > - /* tasks inside a CT can have negative start time e.g. 
if the CT was > - * migrated from another hw node, in which case we will report 0 in > - * order not to confuse userspace */ > - if ((s64)start_time < 0) > - start_time = 0; > + if (!is_super) > + start_time = task->real_start_time_ct; > #endif > + > /* convert nsec -> ticks */ > start_time = nsec_to_clock_t(start_time); > > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 19ca9cc0f3b9..9846553f7039 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -839,7 +839,6 @@ struct task_struct { > #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN > struct vtime vtime; > #endif > - > #ifdef CONFIG_NO_HZ_FULL > atomic_t tick_dep_mask; > #endif > @@ -853,6 +852,12 @@ struct task_struct { > /* Boot based time in nsecs: */ > u64 real_start_time; > > + /* > + * This is a Container-side copy of 'real_start_time' field > + * shown from inside of a Container and modified by host. > + */ > + u64 real_start_time_ct; > + > /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */ > unsigned long min_flt; > unsigned long maj_flt; > diff --git a/include/linux/ve.h b/include/linux/ve.h > index 3aa0ea0b1bab..ab8da4dceec1 100644 > --- a/include/linux/ve.h > +++ b/include/linux/ve.h > @@ -148,6 +148,22 @@ static u64 ve_get_uptime(struct ve_struct *ve) > return ktime_get_boot_ns() - ve->real_start_time; > } > > +static inline void ve_set_task_start_time(struct ve_struct *ve, > + struct task_struct *t) > +{ > + /* > + * Mitigate memory access reordering risks by doing double check, > + * 'is_running' could be read as 1 before we see > + * 'real_start_time' updated here. If it's still 0, > + * we know 'is_running' is being modified right NOW in > + * parallel so it's safe to say that start time is also 0. 
> + */ > + if (!ve->is_running || !ve->real_start_time) > + t->real_start_time_ct = 0; > + else > + t->real_start_time_ct = ve_get_uptime(ve); > +} > + > extern void monotonic_abs_to_ve(clockid_t which_clock, struct timespec64 *tp); > extern void monotonic_ve_to_abs(clockid_t which_clock, struct timespec64 *tp); > > diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h > index 947191def935..0d7697ead271 100644 > --- a/include/uapi/linux/prctl.h > +++ b/include/uapi/linux/prctl.h > @@ -232,5 +232,11 @@ struct prctl_mm_map { > /* Control reclaim behavior when allocating memory */ > #define PR_SET_IO_FLUSHER 57 > #define PR_GET_IO_FLUSHER 58 > +/* Set task container related fields */ > +#define PR_SET_TASK_CT_FIELDS 1000 > +#define PR_TASK_CT_FIELDS_START_TIME (1UL << 0) > > +struct prctl_task_ct_fields { > + __s64 real_start_time; > +}; > #endif /* _LINUX_PRCTL_H */ > diff --git a/kernel/fork.c b/kernel/fork.c > index da7bda595d44..c996de548127 100644 > --- a/kernel/fork.c > +++ b/kernel/fork.c > @@ -91,6 +91,7 @@ > #include <linux/kcov.h> > #include <linux/livepatch.h> > #include <linux/thread_info.h> > +#include <linux/ve.h> > > #include <asm/pgtable.h> > #include <asm/pgalloc.h> > @@ -1711,6 +1712,9 @@ static __latent_entropy struct task_struct *copy_process( > int retval; > struct task_struct *p; > struct multiprocess_signals delayed; > +#ifdef CONFIG_VE > + struct ve_struct *ve = get_exec_env(); > +#endif > > /* > * Don't allow sharing the root directory with processes in a different > @@ -1863,6 +1867,14 @@ static __latent_entropy struct task_struct *copy_process( > > p->start_time = ktime_get_ns(); > p->real_start_time = ktime_get_boot_ns(); > + > + p->real_start_time_ct = 0; > + > +#ifdef CONFIG_VE > + if (!ve_is_super(ve)) > + ve_set_task_start_time(ve, p); > +#endif > + > p->io_context = NULL; > audit_set_context(p, NULL); > cgroup_fork(p); > diff --git a/kernel/sys.c b/kernel/sys.c > index b0914885cd68..7ca6e68eafa9 100644 > --- 
a/kernel/sys.c > +++ b/kernel/sys.c > @@ -2261,6 +2261,26 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) > } > #endif > > +static int prctl_set_task_ct_fields(struct task_struct *t, unsigned long arg, > + unsigned long flags) > +{ > + struct prctl_task_ct_fields params; > +#ifdef CONFIG_VE > + struct ve_struct *ve = t->task_ve; > + > + if (!ve_is_super(ve) && !ve->is_pseudosuper) > + return -EPERM; > +#endif > + > + if (copy_from_user(&params, (const void __user *)arg, sizeof(params))) > + return -EFAULT; > + > + if (flags & PR_TASK_CT_FIELDS_START_TIME) > + t->real_start_time_ct = (u64)params.real_start_time; > + > + return 0; > +} > + > static int propagate_has_child_subreaper(struct task_struct *p, void *data) > { > /* > @@ -2538,6 +2558,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, > > error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER; > break; > + case PR_SET_TASK_CT_FIELDS: > + error = prctl_set_task_ct_fields(me, arg2, arg3); > + break; > default: > error = -EINVAL; > break; > diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c > index 0c6630c6616a..3f53641455ad 100644 > --- a/kernel/ve/ve.c > +++ b/kernel/ve/ve.c > @@ -820,6 +820,8 @@ static void ve_attach(struct cgroup_taskset *tset) > > if (cpuid_override_on()) > set_tsk_thread_flag(task, TIF_CPUID_OVERRIDE); > + > + ve_set_task_start_time(ve, task); > task->task_ve = ve; > } > } > -- > 2.28.0 >