[rh8,v3,2/2] ve/proc: Added separate start time field to task_struct to show in container

Submitted by Konstantin Khorenko on Dec. 21, 2020, 3:09 p.m.

Details

Message ID 20201221150956.226097-2-khorenko@virtuozzo.com
State New
Series "Series without cover letter"
Headers show

Commit Message

Konstantin Khorenko Dec. 21, 2020, 3:09 p.m.
From: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>

Introduced 'real_start_time_ct' field in task_struct.

The value is READ:
1. When the process lives inside of a ve group and any process
inside of the same ve group wants to know its start time by reading
its /proc/[pid]/stat file.
2. At container suspend operation to store this value to a dump image.

The value is WRITTEN:
1. At creation time (copy_process function)
1.1. If a process is being created outside of ve group / on host, then
this value is initialized to 0
1.2. If a process is being created by a process already living in a ve
group, this value is set to the ve uptime, i.e. host_uptime - ve_start_time.

2. During attach to ve. (ve_attach function). The process can be created on
a host and later attached to ve. Its container start_time value has been
already initialized to 0 at creation time. After the process enters the
domain of a ve, the value should be initialized.
Note that the process can be attached to a non-running container, in which
case its start_time value should not be calculated and is left initialized to
0.

3. At container restore via prctl (prctl_set_task_ct_fields function).
In this case the value is only settable outside of a container.
During restore the processes would be created from the dump image.
At restore step each process will execute prctl to set its start_time
value, read from the dump. This would only be permitted during
pseudosuper ve mode. The value is set as is (read from the dump), without
any calculations.

https://jira.sw.ru/browse/PSBM-64123

Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>

(cherry picked from vz7 commit eca790eaed527bae7029b4ae1cd557ce847ac6c0)
Signed-off-by: Konstantin Khorenko <khorenko@virtuozzo.com>

v2: rebased to branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz branch
v3: added missing ve.h include
---
 fs/proc/array.c            | 12 +++---------
 include/linux/sched.h      |  7 ++++++-
 include/linux/ve.h         | 16 ++++++++++++++++
 include/uapi/linux/prctl.h |  6 ++++++
 kernel/fork.c              | 12 ++++++++++++
 kernel/sys.c               | 23 +++++++++++++++++++++++
 kernel/ve/ve.c             |  2 ++
 7 files changed, 68 insertions(+), 10 deletions(-)

Patch hide | download patch | download mbox

diff --git a/fs/proc/array.c b/fs/proc/array.c
index ba712f18e5ff..735876a51a18 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -555,16 +555,10 @@  static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	start_time = task->real_start_time;
 
 #ifdef CONFIG_VE
-	if (!is_super) {
-		u64 offset = get_exec_env()->real_start_time;
-		start_time -= (unsigned long long)offset;
-	}
-	/* tasks inside a CT can have negative start time e.g. if the CT was
-	 * migrated from another hw node, in which case we will report 0 in
-	 * order not to confuse userspace */
-	if ((s64)start_time < 0)
-		start_time = 0;
+	if (!is_super)
+		start_time = task->real_start_time_ct;
 #endif
+
 	/* convert nsec -> ticks */
 	start_time = nsec_to_clock_t(start_time);
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 19ca9cc0f3b9..9846553f7039 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -839,7 +839,6 @@  struct task_struct {
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 	struct vtime			vtime;
 #endif
-
 #ifdef CONFIG_NO_HZ_FULL
 	atomic_t			tick_dep_mask;
 #endif
@@ -853,6 +852,12 @@  struct task_struct {
 	/* Boot based time in nsecs: */
 	u64				real_start_time;
 
+	/*
+	 * This is a Container-side copy of 'real_start_time' field
+	 * shown from inside of a Container and modified by host.
+	 */
+	u64				real_start_time_ct;
+
 	/* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
 	unsigned long			min_flt;
 	unsigned long			maj_flt;
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 3aa0ea0b1bab..ab8da4dceec1 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -148,6 +148,22 @@  static u64 ve_get_uptime(struct ve_struct *ve)
 	return ktime_get_boot_ns() - ve->real_start_time;
 }
 
+static inline void ve_set_task_start_time(struct ve_struct *ve,
+					  struct task_struct *t)
+{
+	/*
+	 * Mitigate memory access reordering risks by doing double check,
+	 * 'is_running' could be read as 1 before we see
+	 * 'real_start_time' updated here. If it's still 0,
+	 * we know 'is_running' is being modified right NOW in
+	 * parallel so it's safe to say that start time is also 0.
+	 */
+	if (!ve->is_running || !ve->real_start_time)
+		t->real_start_time_ct = 0;
+	else
+		t->real_start_time_ct = ve_get_uptime(ve);
+}
+
 extern void monotonic_abs_to_ve(clockid_t which_clock, struct timespec64 *tp);
 extern void monotonic_ve_to_abs(clockid_t which_clock, struct timespec64 *tp);
 
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 947191def935..0d7697ead271 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -232,5 +232,11 @@  struct prctl_mm_map {
 /* Control reclaim behavior when allocating memory */
 #define PR_SET_IO_FLUSHER		57
 #define PR_GET_IO_FLUSHER		58
+/* Set task container related fields */
+#define PR_SET_TASK_CT_FIELDS		1000
+#define PR_TASK_CT_FIELDS_START_TIME	(1UL << 0)
 
+struct prctl_task_ct_fields {
+	__s64 real_start_time;
+};
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index da7bda595d44..c996de548127 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -91,6 +91,7 @@ 
 #include <linux/kcov.h>
 #include <linux/livepatch.h>
 #include <linux/thread_info.h>
+#include <linux/ve.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -1711,6 +1712,9 @@  static __latent_entropy struct task_struct *copy_process(
 	int retval;
 	struct task_struct *p;
 	struct multiprocess_signals delayed;
+#ifdef CONFIG_VE
+	struct ve_struct *ve = get_exec_env();
+#endif
 
 	/*
 	 * Don't allow sharing the root directory with processes in a different
@@ -1863,6 +1867,14 @@  static __latent_entropy struct task_struct *copy_process(
 
 	p->start_time = ktime_get_ns();
 	p->real_start_time = ktime_get_boot_ns();
+
+	p->real_start_time_ct = 0;
+
+#ifdef CONFIG_VE
+	if (!ve_is_super(ve))
+		ve_set_task_start_time(ve, p);
+#endif
+
 	p->io_context = NULL;
 	audit_set_context(p, NULL);
 	cgroup_fork(p);
diff --git a/kernel/sys.c b/kernel/sys.c
index b0914885cd68..7ca6e68eafa9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2261,6 +2261,26 @@  static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
 }
 #endif
 
+static int prctl_set_task_ct_fields(struct task_struct *t, unsigned long arg,
+				    unsigned long flags)
+{
+	struct prctl_task_ct_fields params;
+#ifdef CONFIG_VE
+	struct ve_struct *ve = t->task_ve;
+
+	if (!ve_is_super(ve) && !ve->is_pseudosuper)
+		return -EPERM;
+#endif
+
+	if (copy_from_user(&params, (const void __user *)arg, sizeof(params)))
+		return -EFAULT;
+
+	if (flags & PR_TASK_CT_FIELDS_START_TIME)
+		t->real_start_time_ct = (u64)params.real_start_time;
+
+	return 0;
+}
+
 static int propagate_has_child_subreaper(struct task_struct *p, void *data)
 {
 	/*
@@ -2538,6 +2558,9 @@  SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 
 		error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
 		break;
+	case PR_SET_TASK_CT_FIELDS:
+		error = prctl_set_task_ct_fields(me, arg2, arg3);
+		break;
 	default:
 		error = -EINVAL;
 		break;
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 0c6630c6616a..3f53641455ad 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -820,6 +820,8 @@  static void ve_attach(struct cgroup_taskset *tset)
 
 		if (cpuid_override_on())
 			set_tsk_thread_flag(task, TIF_CPUID_OVERRIDE);
+
+		ve_set_task_start_time(ve, task);
 		task->task_ve = ve;
 	}
 }

Comments

Valeriy Vdovin Dec. 22, 2020, 9:42 a.m.
On Mon, Dec 21, 2020 at 06:09:56PM +0300, Konstantin Khorenko wrote:
> From: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
> 
> Introduced 'real_start_time_ct' field in task_struct.
> 
> The value is READ:
> 1. When the process lives inside of a ve group and any process
> inside of the same ve group wants to know it's start time by reading
> it's /proc/[pid]/stat file.
> 2. At container suspend operation to store this value to a dump image.
> 
> The value is WRITTEN:
> 1. At creation time (copy_process function)
> 1.1. If a process is being created outside of ve group / on host, then
> this value is initialized to 0
> 1.2. If a process is being created by process already living in ve
> group, this value is calculated as host_uptime - ve_uptime.
> 
> 2. During attach to ve. (ve_attach function). The process can be created on
> a host and later attached to ve. It's container's start_time value has been
> already initialized to 0 at creation time. After the process enters the
> domain of a ve, the value should be initialized.
> Note that the process can be attached to a non-running container, in which
> case it's start_time value should not be calculated and left initialized to
> 0.
> 
> 3. At container restore via prctl (prctl_set_task_ct_fields function).
> In this case the value is only settable outside of a container.
> During restore the processes would be created from the dump image.
> At restore step each process will execute prctl to set it's start_time
> value, read from the dump. This would only be permitted during
> pseudosuper ve mode. The value is set as is (read from the dump), without
> any calculations.
> 
> https://jira.sw.ru/browse/PSBM-64123
> 
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
> 
> (cherry picked from vz7 commit eca790eaed527bae7029b4ae1cd557ce847ac6c0)
> Signed-off-by: Konstantin Khorenko <khorenko@virtuozzo.com>
Reviewed-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
> 
> v2: rebased to branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz branch
> v3: added missing ve.h include
> ---
>  fs/proc/array.c            | 12 +++---------
>  include/linux/sched.h      |  7 ++++++-
>  include/linux/ve.h         | 16 ++++++++++++++++
>  include/uapi/linux/prctl.h |  6 ++++++
>  kernel/fork.c              | 12 ++++++++++++
>  kernel/sys.c               | 23 +++++++++++++++++++++++
>  kernel/ve/ve.c             |  2 ++
>  7 files changed, 68 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/proc/array.c b/fs/proc/array.c
> index ba712f18e5ff..735876a51a18 100644
> --- a/fs/proc/array.c
> +++ b/fs/proc/array.c
> @@ -555,16 +555,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
>  	start_time = task->real_start_time;
>  
>  #ifdef CONFIG_VE
> -	if (!is_super) {
> -		u64 offset = get_exec_env()->real_start_time;
> -		start_time -= (unsigned long long)offset;
> -	}
> -	/* tasks inside a CT can have negative start time e.g. if the CT was
> -	 * migrated from another hw node, in which case we will report 0 in
> -	 * order not to confuse userspace */
> -	if ((s64)start_time < 0)
> -		start_time = 0;
> +	if (!is_super)
> +		start_time = task->real_start_time_ct;
>  #endif
> +
>  	/* convert nsec -> ticks */
>  	start_time = nsec_to_clock_t(start_time);
>  
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 19ca9cc0f3b9..9846553f7039 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -839,7 +839,6 @@ struct task_struct {
>  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
>  	struct vtime			vtime;
>  #endif
> -
>  #ifdef CONFIG_NO_HZ_FULL
>  	atomic_t			tick_dep_mask;
>  #endif
> @@ -853,6 +852,12 @@ struct task_struct {
>  	/* Boot based time in nsecs: */
>  	u64				real_start_time;
>  
> +	/*
> +	 * This is a Container-side copy of 'real_start_time' field
> +	 * shown from inside of a Container and modified by host.
> +	 */
> +	u64				real_start_time_ct;
> +
>  	/* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
>  	unsigned long			min_flt;
>  	unsigned long			maj_flt;
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index 3aa0ea0b1bab..ab8da4dceec1 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -148,6 +148,22 @@ static u64 ve_get_uptime(struct ve_struct *ve)
>  	return ktime_get_boot_ns() - ve->real_start_time;
>  }
>  
> +static inline void ve_set_task_start_time(struct ve_struct *ve,
> +					  struct task_struct *t)
> +{
> +	/*
> +	 * Mitigate memory access reordering risks by doing double check,
> +	 * 'is_running' could be read as 1 before we see
> +	 * 'real_start_time' updated here. If it's still 0,
> +	 * we know 'is_running' is being modified right NOW in
> +	 * parallel so it's safe to say that start time is also 0.
> +	 */
> +	if (!ve->is_running || !ve->real_start_time)
> +		t->real_start_time_ct = 0;
> +	else
> +		t->real_start_time_ct = ve_get_uptime(ve);
> +}
> +
>  extern void monotonic_abs_to_ve(clockid_t which_clock, struct timespec64 *tp);
>  extern void monotonic_ve_to_abs(clockid_t which_clock, struct timespec64 *tp);
>  
> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> index 947191def935..0d7697ead271 100644
> --- a/include/uapi/linux/prctl.h
> +++ b/include/uapi/linux/prctl.h
> @@ -232,5 +232,11 @@ struct prctl_mm_map {
>  /* Control reclaim behavior when allocating memory */
>  #define PR_SET_IO_FLUSHER		57
>  #define PR_GET_IO_FLUSHER		58
> +/* Set task container related fields */
> +#define PR_SET_TASK_CT_FIELDS		1000
> +#define PR_TASK_CT_FIELDS_START_TIME	(1UL << 0)
>  
> +struct prctl_task_ct_fields {
> +	__s64 real_start_time;
> +};
>  #endif /* _LINUX_PRCTL_H */
> diff --git a/kernel/fork.c b/kernel/fork.c
> index da7bda595d44..c996de548127 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -91,6 +91,7 @@
>  #include <linux/kcov.h>
>  #include <linux/livepatch.h>
>  #include <linux/thread_info.h>
> +#include <linux/ve.h>
>  
>  #include <asm/pgtable.h>
>  #include <asm/pgalloc.h>
> @@ -1711,6 +1712,9 @@ static __latent_entropy struct task_struct *copy_process(
>  	int retval;
>  	struct task_struct *p;
>  	struct multiprocess_signals delayed;
> +#ifdef CONFIG_VE
> +	struct ve_struct *ve = get_exec_env();
> +#endif
>  
>  	/*
>  	 * Don't allow sharing the root directory with processes in a different
> @@ -1863,6 +1867,14 @@ static __latent_entropy struct task_struct *copy_process(
>  
>  	p->start_time = ktime_get_ns();
>  	p->real_start_time = ktime_get_boot_ns();
> +
> +	p->real_start_time_ct = 0;
> +
> +#ifdef CONFIG_VE
> +	if (!ve_is_super(ve))
> +		ve_set_task_start_time(ve, p);
> +#endif
> +
>  	p->io_context = NULL;
>  	audit_set_context(p, NULL);
>  	cgroup_fork(p);
> diff --git a/kernel/sys.c b/kernel/sys.c
> index b0914885cd68..7ca6e68eafa9 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -2261,6 +2261,26 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
>  }
>  #endif
>  
> +static int prctl_set_task_ct_fields(struct task_struct *t, unsigned long arg,
> +				    unsigned long flags)
> +{
> +	struct prctl_task_ct_fields params;
> +#ifdef CONFIG_VE
> +	struct ve_struct *ve = t->task_ve;
> +
> +	if (!ve_is_super(ve) && !ve->is_pseudosuper)
> +		return -EPERM;
> +#endif
> +
> +	if (copy_from_user(&params, (const void __user *)arg, sizeof(params)))
> +		return -EFAULT;
> +
> +	if (flags & PR_TASK_CT_FIELDS_START_TIME)
> +		t->real_start_time_ct = (u64)params.real_start_time;
> +
> +	return 0;
> +}
> +
>  static int propagate_has_child_subreaper(struct task_struct *p, void *data)
>  {
>  	/*
> @@ -2538,6 +2558,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>  
>  		error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
>  		break;
> +	case PR_SET_TASK_CT_FIELDS:
> +		error = prctl_set_task_ct_fields(me, arg2, arg3);
> +		break;
>  	default:
>  		error = -EINVAL;
>  		break;
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index 0c6630c6616a..3f53641455ad 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -820,6 +820,8 @@ static void ve_attach(struct cgroup_taskset *tset)
>  
>  		if (cpuid_override_on())
>  			set_tsk_thread_flag(task, TIF_CPUID_OVERRIDE);
> +
> +		ve_set_task_start_time(ve, task);
>  		task->task_ve = ve;
>  	}
>  }
> -- 
> 2.28.0
>