[RH7,1/1] ve/proc: Added separate start time field to task_struct to show in container

Submitted by Valeriy Vdovin on Jan. 13, 2020, 8:25 a.m.

Details

Message ID 1578903941-8393-2-git-send-email-valeriy.vdovin@virtuozzo.com
State New
Series "Series without cover letter"
Headers show

Commit Message

Valeriy Vdovin Jan. 13, 2020, 8:25 a.m.
https://jira.sw.ru/browse/PSBM-100083
Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
---
 fs/proc/array.c            | 16 ++++------------
 include/linux/sched.h      |  5 +++++
 include/uapi/linux/prctl.h |  7 +++++++
 kernel/fork.c              | 15 +++++++++++++++
 kernel/sys.c               | 23 +++++++++++++++++++++++
 kernel/ve/ve.c             | 22 ++++++++++++++++++++++
 6 files changed, 76 insertions(+), 12 deletions(-)

Patch hide | download patch | download mbox

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 3aa8a7d..fb611b1 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -611,19 +611,11 @@  static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		(unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC
 				+ task->real_start_time.tv_nsec;
 #ifdef CONFIG_VE
-	if (!is_super) {
-		struct timespec *ve_start_ts =
-				&get_exec_env()->real_start_timespec;
-		start_time -=
-			(unsigned long long)ve_start_ts->tv_sec * NSEC_PER_SEC
-				+ ve_start_ts->tv_nsec;
-	}
-	/* tasks inside a CT can have negative start time e.g. if the CT was
-	 * migrated from another hw node, in which case we will report 0 in
-	 * order not to confuse userspace */
-	if ((s64)start_time < 0)
-		start_time = 0;
+	if (!is_super)
+		start_time = (unsigned long long)
+			timespec_to_ns(&task->real_start_time_ct);
 #endif
+
 	/* convert nsec -> ticks */
 	start_time = nsec_to_clock_t(start_time);
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 07f9954..0832904 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1934,6 +1934,11 @@  struct task_struct {
 	struct wake_q_node wake_q;
 	struct prev_cputime prev_cputime;
 	struct vtime vtime;
+	/*
+	 * this is a container-side copy of 'real_start_time' field
+	 * shown from inside of a container and modified by host.
+	 */
+	struct timespec real_start_time_ct;
 #endif /* __GENKSYMS__ */
 };
 
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 02376de..b185113 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -204,5 +204,12 @@  struct prctl_mm_map {
 # define PR_SPEC_DISABLE		(1UL << 2)
 # define PR_SPEC_FORCE_DISABLE		(1UL << 3)
 # define PR_SPEC_DISABLE_NOEXEC		(1UL << 4)
+/* Set task container related fields */
+#define PR_SET_TASK_CT_FIELDS	1000
+#define PR_TASK_CT_FIELDS_START_TIME	(1ULL << 0)
+
+struct prctl_task_ct_fields {
+	__s64 real_start_time;
+};
 
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 3d74228..dbc584c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1350,6 +1350,10 @@  static struct task_struct *copy_process(unsigned long clone_flags,
 	struct task_struct *p;
 	void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
 
+#ifdef CONFIG_VE
+	struct ve_struct *ve = get_exec_env();
+#endif
+
 	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
 		return ERR_PTR(-EINVAL);
 
@@ -1472,6 +1476,17 @@  static struct task_struct *copy_process(unsigned long clone_flags,
 	do_posix_clock_monotonic_gettime(&p->start_time);
 	p->real_start_time = p->start_time;
 	monotonic_to_bootbased(&p->real_start_time);
+
+	p->real_start_time_ct.tv_sec = 0;
+	p->real_start_time_ct.tv_nsec = 0;
+
+#ifdef CONFIG_VE
+	if (!ve_is_super(ve) && ve->is_running) {
+		p->real_start_time_ct = timespec_sub(p->real_start_time,
+			ve->real_start_timespec);
+	}
+#endif
+
 	p->io_context = NULL;
 	p->audit_context = NULL;
 	if (clone_flags & CLONE_THREAD)
diff --git a/kernel/sys.c b/kernel/sys.c
index c4d633ef..2ce16c7 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2457,6 +2457,26 @@  static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
 }
 #endif
 
+static int prctl_set_task_ct_fields(struct task_struct *t, unsigned long arg,
+		unsigned long flags)
+{
+	struct prctl_task_ct_fields params;
+#ifdef CONFIG_VE
+	struct ve_struct *ve = t->task_ve;
+
+	if (!ve_is_super(ve) && !ve->is_pseudosuper)
+		return -EPERM;
+#endif
+
+	if (copy_from_user(&params, (const void __user *)arg, sizeof(params)))
+		return -EFAULT;
+
+	if (flags & PR_TASK_CT_FIELDS_START_TIME)
+		t->real_start_time_ct = ns_to_timespec(params.real_start_time);
+
+	return 0;
+}
+
 int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which)
 {
 	return -EINVAL;
@@ -2684,6 +2704,9 @@  SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			return -EINVAL;
 		error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
 		break;
+	case PR_SET_TASK_CT_FIELDS:
+		error = prctl_set_task_ct_fields(me, arg2, arg3);
+		break;
 	default:
 		error = -EINVAL;
 		break;
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index ad3a698..110c37e 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -339,6 +339,7 @@  static void kthread_attach_fn(struct kthread_work *w)
 	err = cgroup_attach_task_all(target, current);
 	if (err)
 		goto out;
+
 out:
 	work->result = err;
 	complete(&work->done);
@@ -506,6 +507,15 @@  static int ve_start_container(struct ve_struct *ve)
 		ve->real_start_timespec = tsk->real_start_time;
 	}
 
+	/*
+	 * 'current' belongs to this ve, but it's ct-relative start time hasn't
+	 * been initialized correctly at copy_process, because ve's start time
+	 * was zero. On the other hand, we know this task is this ve's
+	 * init (pid 1), * so we set it's relative start time to 0.
+	 */
+	tsk->real_start_time_ct.tv_sec = 0;
+	tsk->real_start_time_ct.tv_nsec = 0;
+
 	/* The value is wrong, but it is never compared to process
 	 * start times */
 	ve->start_jiffies = get_jiffies_64();
@@ -837,6 +847,7 @@  static void ve_attach(struct cgroup *cg, struct cgroup_taskset *tset)
 {
 	struct ve_struct *ve = cgroup_ve(cg);
 	struct task_struct *task;
+	struct timespec host_uptime;
 
 	cgroup_taskset_for_each(task, cg, tset) {
 		/* this probihibts ptracing of task entered to VE from host system */
@@ -850,6 +861,17 @@  static void ve_attach(struct cgroup *cg, struct cgroup_taskset *tset)
 		/* Leave parent exec domain */
 		task->parent_exec_id--;
 
+		if (ve->is_running) {
+			/* Viewed from inside of a container these processes
+			 * are newly created, so their actual container start
+			 * times should be set now.
+			 */
+			do_posix_clock_monotonic_gettime(&host_uptime);
+			monotonic_to_bootbased(&host_uptime);
+			task->real_start_time_ct = timespec_sub(host_uptime,
+				ve->real_start_timespec);
+		}
+
 		task->task_ve = ve;
 	}
 

Comments

Pavel Tikhomirov Jan. 13, 2020, 2:36 p.m.
Empty commit message body, please add here some small explanation about 
when (copy_process, ve_attach and prctl_set_task_ct_fields) we set it 
and with what value (0, starttime-ve.start_time, gettime-ve.start_time) 
and why.

On 1/13/20 11:25 AM, Valeriy Vdovin wrote:
> https://jira.sw.ru/browse/PSBM-100083
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
> ---
>   fs/proc/array.c            | 16 ++++------------
>   include/linux/sched.h      |  5 +++++
>   include/uapi/linux/prctl.h |  7 +++++++
>   kernel/fork.c              | 15 +++++++++++++++
>   kernel/sys.c               | 23 +++++++++++++++++++++++
>   kernel/ve/ve.c             | 22 ++++++++++++++++++++++
>   6 files changed, 76 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/proc/array.c b/fs/proc/array.c
> index 3aa8a7d..fb611b1 100644
> --- a/fs/proc/array.c
> +++ b/fs/proc/array.c
> @@ -611,19 +611,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
>   		(unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC
>   				+ task->real_start_time.tv_nsec;
>   #ifdef CONFIG_VE
> -	if (!is_super) {
> -		struct timespec *ve_start_ts =
> -				&get_exec_env()->real_start_timespec;
> -		start_time -=
> -			(unsigned long long)ve_start_ts->tv_sec * NSEC_PER_SEC
> -				+ ve_start_ts->tv_nsec;
> -	}
> -	/* tasks inside a CT can have negative start time e.g. if the CT was
> -	 * migrated from another hw node, in which case we will report 0 in
> -	 * order not to confuse userspace */
> -	if ((s64)start_time < 0)
> -		start_time = 0;
> +	if (!is_super)
> +		start_time = (unsigned long long)
> +			timespec_to_ns(&task->real_start_time_ct);
>   #endif
> +
>   	/* convert nsec -> ticks */
>   	start_time = nsec_to_clock_t(start_time);
>   
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 07f9954..0832904 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1934,6 +1934,11 @@ struct task_struct {
>   	struct wake_q_node wake_q;
>   	struct prev_cputime prev_cputime;
>   	struct vtime vtime;
> +	/*
> +	 * this is a container-side copy of 'real_start_time' field
> +	 * shown from inside of a container and modified by host.
> +	 */
> +	struct timespec real_start_time_ct;
>   #endif /* __GENKSYMS__ */
>   };
>   
> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> index 02376de..b185113 100644
> --- a/include/uapi/linux/prctl.h
> +++ b/include/uapi/linux/prctl.h
> @@ -204,5 +204,12 @@ struct prctl_mm_map {
>   # define PR_SPEC_DISABLE		(1UL << 2)
>   # define PR_SPEC_FORCE_DISABLE		(1UL << 3)
>   # define PR_SPEC_DISABLE_NOEXEC		(1UL << 4)
> +/* Set task container related fields */
> +#define PR_SET_TASK_CT_FIELDS	1000
> +#define PR_TASK_CT_FIELDS_START_TIME	(1ULL << 0)
> +
> +struct prctl_task_ct_fields {
> +	__s64 real_start_time;
> +};
>   
>   #endif /* _LINUX_PRCTL_H */
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 3d74228..dbc584c 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1350,6 +1350,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>   	struct task_struct *p;
>   	void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
>   
> +#ifdef CONFIG_VE
> +	struct ve_struct *ve = get_exec_env();
> +#endif
> +
>   	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
>   		return ERR_PTR(-EINVAL);
>   
> @@ -1472,6 +1476,17 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>   	do_posix_clock_monotonic_gettime(&p->start_time);
>   	p->real_start_time = p->start_time;
>   	monotonic_to_bootbased(&p->real_start_time);
> +
> +	p->real_start_time_ct.tv_sec = 0;
> +	p->real_start_time_ct.tv_nsec = 0;



> +
> +#ifdef CONFIG_VE

We might need smp_rmb between reading is_running and real_start_timespec 
(and corresponding smp_wmb in ve_start_container), or use ve->op_sem to 
be 100% safe here in case of cpu instruction reordering. These can 
decrease performance of process creation.

> +	if (!ve_is_super(ve) && ve->is_running) {
> +		p->real_start_time_ct = timespec_sub(p->real_start_time,
> +			ve->real_start_timespec);
> +	}
> +#endif
> +
>   	p->io_context = NULL;
>   	p->audit_context = NULL;
>   	if (clone_flags & CLONE_THREAD)
> diff --git a/kernel/sys.c b/kernel/sys.c
> index c4d633ef..2ce16c7 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -2457,6 +2457,26 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
>   }
>   #endif
>   
> +static int prctl_set_task_ct_fields(struct task_struct *t, unsigned long arg,
> +		unsigned long flags)
> +{
> +	struct prctl_task_ct_fields params;
> +#ifdef CONFIG_VE
> +	struct ve_struct *ve = t->task_ve;
> +
> +	if (!ve_is_super(ve) && !ve->is_pseudosuper)
> +		return -EPERM;
> +#endif
> +
> +	if (copy_from_user(&params, (const void __user *)arg, sizeof(params)))
> +		return -EFAULT;
> +
> +	if (flags & PR_TASK_CT_FIELDS_START_TIME)
> +		t->real_start_time_ct = ns_to_timespec(params.real_start_time);
> +
> +	return 0;
> +}
> +
>   int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which)
>   {
>   	return -EINVAL;
> @@ -2684,6 +2704,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>   			return -EINVAL;
>   		error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
>   		break;
> +	case PR_SET_TASK_CT_FIELDS:
> +		error = prctl_set_task_ct_fields(me, arg2, arg3);
> +		break;
>   	default:
>   		error = -EINVAL;
>   		break;
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index ad3a698..110c37e 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -339,6 +339,7 @@ static void kthread_attach_fn(struct kthread_work *w)
>   	err = cgroup_attach_task_all(target, current);
>   	if (err)
>   		goto out;
> +

Excess empty line.

>   out:
>   	work->result = err;
>   	complete(&work->done);
> @@ -506,6 +507,15 @@ static int ve_start_container(struct ve_struct *ve)
>   		ve->real_start_timespec = tsk->real_start_time;
>   	}
>   
> +	/*
> +	 * 'current' belongs to this ve, but it's ct-relative start time hasn't
> +	 * been initialized correctly at copy_process, because ve's start time
> +	 * was zero. On the other hand, we know this task is this ve's
> +	 * init (pid 1), * so we set it's relative start time to 0.
> +	 */
> +	tsk->real_start_time_ct.tv_sec = 0;
> +	tsk->real_start_time_ct.tv_nsec = 0;

Please move to ve_attach. I think real_start_time_ct should be set on 
ve_attach for current process and all others even if ve is not running.

> +
>   	/* The value is wrong, but it is never compared to process
>   	 * start times */
>   	ve->start_jiffies = get_jiffies_64();
> @@ -837,6 +847,7 @@ static void ve_attach(struct cgroup *cg, struct cgroup_taskset *tset)
>   {
>   	struct ve_struct *ve = cgroup_ve(cg);
>   	struct task_struct *task;
> +	struct timespec host_uptime;
>   
>   	cgroup_taskset_for_each(task, cg, tset) {
>   		/* this probihibts ptracing of task entered to VE from host system */
> @@ -850,6 +861,17 @@ static void ve_attach(struct cgroup *cg, struct cgroup_taskset *tset)
>   		/* Leave parent exec domain */
>   		task->parent_exec_id--;
>   
> +		if (ve->is_running) {

Same thing with lack of lock/memory barrier here.

> +			/* Viewed from inside of a container these processes
> +			 * are newly created, so their actual container start
> +			 * times should be set now.
> +			 */
> +			do_posix_clock_monotonic_gettime(&host_uptime);
> +			monotonic_to_bootbased(&host_uptime);
> +			task->real_start_time_ct = timespec_sub(host_uptime,
> +				ve->real_start_timespec);
> +		}

} else {
	tsk->real_start_time_ct.tv_sec = 0;
	tsk->real_start_time_ct.tv_nsec = 0;
}

These looks more generic for me.

> +
>   		task->task_ve = ve;
>   	}
>   
>
Pavel Tikhomirov Jan. 13, 2020, 2:59 p.m.
On 1/13/20 5:36 PM, Pavel Tikhomirov wrote:
> Empty commit message body, please add here some small explanation about 
> when (copy_process, ve_attach and prctl_set_task_ct_fields) we set it 
> and with what value (0, starttime-ve.start_time, gettime-ve.start_time) 
> and why.
> 
> On 1/13/20 11:25 AM, Valeriy Vdovin wrote:
>> https://jira.sw.ru/browse/PSBM-100083
>> Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
>> ---
>>   fs/proc/array.c            | 16 ++++------------
>>   include/linux/sched.h      |  5 +++++
>>   include/uapi/linux/prctl.h |  7 +++++++
>>   kernel/fork.c              | 15 +++++++++++++++
>>   kernel/sys.c               | 23 +++++++++++++++++++++++
>>   kernel/ve/ve.c             | 22 ++++++++++++++++++++++
>>   6 files changed, 76 insertions(+), 12 deletions(-)
>>
>> diff --git a/fs/proc/array.c b/fs/proc/array.c
>> index 3aa8a7d..fb611b1 100644
>> --- a/fs/proc/array.c
>> +++ b/fs/proc/array.c
>> @@ -611,19 +611,11 @@ static int do_task_stat(struct seq_file *m, 
>> struct pid_namespace *ns,
>>           (unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC
>>                   + task->real_start_time.tv_nsec;
>>   #ifdef CONFIG_VE
>> -    if (!is_super) {
>> -        struct timespec *ve_start_ts =
>> -                &get_exec_env()->real_start_timespec;
>> -        start_time -=
>> -            (unsigned long long)ve_start_ts->tv_sec * NSEC_PER_SEC
>> -                + ve_start_ts->tv_nsec;
>> -    }
>> -    /* tasks inside a CT can have negative start time e.g. if the CT was
>> -     * migrated from another hw node, in which case we will report 0 in
>> -     * order not to confuse userspace */
>> -    if ((s64)start_time < 0)
>> -        start_time = 0;
>> +    if (!is_super)
>> +        start_time = (unsigned long long)
>> +            timespec_to_ns(&task->real_start_time_ct);
>>   #endif
>> +
>>       /* convert nsec -> ticks */
>>       start_time = nsec_to_clock_t(start_time);
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 07f9954..0832904 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -1934,6 +1934,11 @@ struct task_struct {
>>       struct wake_q_node wake_q;
>>       struct prev_cputime prev_cputime;
>>       struct vtime vtime;
>> +    /*
>> +     * this is a container-side copy of 'real_start_time' field
>> +     * shown from inside of a container and modified by host.
>> +     */
>> +    struct timespec real_start_time_ct;
>>   #endif /* __GENKSYMS__ */
>>   };
>> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
>> index 02376de..b185113 100644
>> --- a/include/uapi/linux/prctl.h
>> +++ b/include/uapi/linux/prctl.h
>> @@ -204,5 +204,12 @@ struct prctl_mm_map {
>>   # define PR_SPEC_DISABLE        (1UL << 2)
>>   # define PR_SPEC_FORCE_DISABLE        (1UL << 3)
>>   # define PR_SPEC_DISABLE_NOEXEC        (1UL << 4)
>> +/* Set task container related fields */
>> +#define PR_SET_TASK_CT_FIELDS    1000
>> +#define PR_TASK_CT_FIELDS_START_TIME    (1ULL << 0)
>> +
>> +struct prctl_task_ct_fields {
>> +    __s64 real_start_time;
>> +};
>>   #endif /* _LINUX_PRCTL_H */
>> diff --git a/kernel/fork.c b/kernel/fork.c
>> index 3d74228..dbc584c 100644
>> --- a/kernel/fork.c
>> +++ b/kernel/fork.c
>> @@ -1350,6 +1350,10 @@ static struct task_struct 
>> *copy_process(unsigned long clone_flags,
>>       struct task_struct *p;
>>       void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
>> +#ifdef CONFIG_VE
>> +    struct ve_struct *ve = get_exec_env();
>> +#endif
>> +
>>       if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == 
>> (CLONE_NEWNS|CLONE_FS))
>>           return ERR_PTR(-EINVAL);
>> @@ -1472,6 +1476,17 @@ static struct task_struct 
>> *copy_process(unsigned long clone_flags,
>>       do_posix_clock_monotonic_gettime(&p->start_time);
>>       p->real_start_time = p->start_time;
>>       monotonic_to_bootbased(&p->real_start_time);
>> +
>> +    p->real_start_time_ct.tv_sec = 0;
>> +    p->real_start_time_ct.tv_nsec = 0;
> 
> 
> 
>> +
>> +#ifdef CONFIG_VE
> 
> We might need smp_rmb between reading is_running and real_start_timespec 
> (and corresponding smp_wmb in ve_start_container), or use ve->op_sem to 
> be 100% safe here in case of cpu instruction reordering. These can 
> decrease performance of process creation. >
>> +    if (!ve_is_super(ve) && ve->is_running) {

Discussed with Kostya, we can just check that ve->real_start_timespec is 
not zero here explicitly and we won't need any lock and barrier here.

Please write a small comment about it. Actually we still leave here a 
small chance of failure in case we would really have a CT with zero 
real_start_timespec, but we have the same issue in ve_start_container 
already =)

>> +        p->real_start_time_ct = timespec_sub(p->real_start_time,
>> +            ve->real_start_timespec);
>> +    }
>> +#endif
>> +
>>       p->io_context = NULL;
>>       p->audit_context = NULL;
>>       if (clone_flags & CLONE_THREAD)
>> diff --git a/kernel/sys.c b/kernel/sys.c
>> index c4d633ef..2ce16c7 100644
>> --- a/kernel/sys.c
>> +++ b/kernel/sys.c
>> @@ -2457,6 +2457,26 @@ static int prctl_get_tid_address(struct 
>> task_struct *me, int __user **tid_addr)
>>   }
>>   #endif
>> +static int prctl_set_task_ct_fields(struct task_struct *t, unsigned 
>> long arg,
>> +        unsigned long flags)
>> +{
>> +    struct prctl_task_ct_fields params;
>> +#ifdef CONFIG_VE
>> +    struct ve_struct *ve = t->task_ve;
>> +
>> +    if (!ve_is_super(ve) && !ve->is_pseudosuper)
>> +        return -EPERM;
>> +#endif
>> +
>> +    if (copy_from_user(&params, (const void __user *)arg, 
>> sizeof(params)))
>> +        return -EFAULT;
>> +
>> +    if (flags & PR_TASK_CT_FIELDS_START_TIME)
>> +        t->real_start_time_ct = ns_to_timespec(params.real_start_time);
>> +
>> +    return 0;
>> +}
>> +
>>   int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned 
>> long which)
>>   {
>>       return -EINVAL;
>> @@ -2684,6 +2704,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned 
>> long, arg2, unsigned long, arg3,
>>               return -EINVAL;
>>           error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
>>           break;
>> +    case PR_SET_TASK_CT_FIELDS:
>> +        error = prctl_set_task_ct_fields(me, arg2, arg3);
>> +        break;
>>       default:
>>           error = -EINVAL;
>>           break;
>> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
>> index ad3a698..110c37e 100644
>> --- a/kernel/ve/ve.c
>> +++ b/kernel/ve/ve.c
>> @@ -339,6 +339,7 @@ static void kthread_attach_fn(struct kthread_work *w)
>>       err = cgroup_attach_task_all(target, current);
>>       if (err)
>>           goto out;
>> +
> 
> Excess empty line.
> 
>>   out:
>>       work->result = err;
>>       complete(&work->done);
>> @@ -506,6 +507,15 @@ static int ve_start_container(struct ve_struct *ve)
>>           ve->real_start_timespec = tsk->real_start_time;
>>       }
>> +    /*
>> +     * 'current' belongs to this ve, but it's ct-relative start time 
>> hasn't
>> +     * been initialized correctly at copy_process, because ve's start 
>> time
>> +     * was zero. On the other hand, we know this task is this ve's
>> +     * init (pid 1), * so we set it's relative start time to 0.
>> +     */
>> +    tsk->real_start_time_ct.tv_sec = 0;
>> +    tsk->real_start_time_ct.tv_nsec = 0;
> 
> Please move to ve_attach. I think real_start_time_ct should be set on 
> ve_attach for current process and all others even if ve is not running.
> 
>> +
>>       /* The value is wrong, but it is never compared to process
>>        * start times */
>>       ve->start_jiffies = get_jiffies_64();
>> @@ -837,6 +847,7 @@ static void ve_attach(struct cgroup *cg, struct 
>> cgroup_taskset *tset)
>>   {
>>       struct ve_struct *ve = cgroup_ve(cg);
>>       struct task_struct *task;
>> +    struct timespec host_uptime;
>>       cgroup_taskset_for_each(task, cg, tset) {
>>           /* this probihibts ptracing of task entered to VE from host 
>> system */
>> @@ -850,6 +861,17 @@ static void ve_attach(struct cgroup *cg, struct 
>> cgroup_taskset *tset)
>>           /* Leave parent exec domain */
>>           task->parent_exec_id--;
>> +        if (ve->is_running) {
> 
> Same thing with lack of lock/memory barrier here.

Same here.

> 
>> +            /* Viewed from inside of a container these processes
>> +             * are newly created, so their actual container start
>> +             * times should be set now.
>> +             */
>> +            do_posix_clock_monotonic_gettime(&host_uptime);
>> +            monotonic_to_bootbased(&host_uptime);
>> +            task->real_start_time_ct = timespec_sub(host_uptime,
>> +                ve->real_start_timespec);
>> +        }
> 
> } else {
>      tsk->real_start_time_ct.tv_sec = 0;
>      tsk->real_start_time_ct.tv_nsec = 0;
> }
> 
> These looks more generic for me.
> 
>> +
>>           task->task_ve = ve;
>>       }
>>
>