[1/2] Added separate start time field to task_struct to show in container

Submitted by Valeriy Vdovin on Dec. 31, 2019, 11:41 a.m.

Details

Message ID 1577792473-23925-2-git-send-email-valeriy.vdovin@virtuozzo.com
State New
Series "Start time of a task inside a container."
Headers show

Commit Message

Valeriy Vdovin Dec. 31, 2019, 11:41 a.m.
https://jira.sw.ru/browse/PSBM-100083
Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
---
 fs/proc/array.c            | 16 ++++------------
 include/linux/sched.h      |  5 +++++
 include/uapi/linux/prctl.h |  9 +++++++++
 kernel/fork.c              | 20 ++++++++++++++++++++
 kernel/sys.c               | 22 ++++++++++++++++++++++
 kernel/ve/ve.c             | 27 +++++++++++++++++++++++++++
 6 files changed, 87 insertions(+), 12 deletions(-)

Patch hide | download patch | download mbox

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 3aa8a7d..a6d2834 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -611,19 +611,11 @@  static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		(unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC
 				+ task->real_start_time.tv_nsec;
 #ifdef CONFIG_VE
-	if (!is_super) {
-		struct timespec *ve_start_ts =
-				&get_exec_env()->real_start_timespec;
-		start_time -=
-			(unsigned long long)ve_start_ts->tv_sec * NSEC_PER_SEC
-				+ ve_start_ts->tv_nsec;
-	}
-	/* tasks inside a CT can have negative start time e.g. if the CT was
-	 * migrated from another hw node, in which case we will report 0 in
-	 * order not to confuse userspace */
-	if ((s64)start_time < 0)
-		start_time = 0;
+	if (!is_super && !task->task_ve->is_pseudosuper)
+		start_time = (unsigned long long)
+			timespec_to_ns(&task->real_start_time_ct);
 #endif
+
 	/* convert nsec -> ticks */
 	start_time = nsec_to_clock_t(start_time);
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 07f9954..0832904 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1934,6 +1934,11 @@  struct task_struct {
 	struct wake_q_node wake_q;
 	struct prev_cputime prev_cputime;
 	struct vtime vtime;
+	/*
+	 * this is a container-side copy of 'real_start_time' field
+	 * shown from inside of a container and modified by host.
+	 */
+	struct timespec real_start_time_ct;
 #endif /* __GENKSYMS__ */
 };
 
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 02376de..6de7b22 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -195,6 +195,7 @@  struct prctl_mm_map {
 /* Per task speculation control */
 #define PR_GET_SPECULATION_CTRL		52
 #define PR_SET_SPECULATION_CTRL		53
+
 /* Speculation control variants */
 # define PR_SPEC_STORE_BYPASS		0
 /* Return and control values for PR_SET/GET_SPECULATION_CTRL */
@@ -205,4 +206,12 @@  struct prctl_mm_map {
 # define PR_SPEC_FORCE_DISABLE		(1UL << 3)
 # define PR_SPEC_DISABLE_NOEXEC		(1UL << 4)
 
+/* Set task container related fields */
+#define PR_SET_TASK_CT_FIELDS	54
+#define PR_TASK_CT_FIELDS_START_TIME	(1ULL << 0)
+
+struct prctl_task_ct_fields {
+	__s64 real_start_time;
+};
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 3d74228..7bc1957 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1348,8 +1348,14 @@  static struct task_struct *copy_process(unsigned long clone_flags,
 {
 	int retval;
 	struct task_struct *p;
+
 	void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
 
+#ifdef CONFIG_VE
+	struct ve_struct *ve = get_exec_env();
+#endif
+
+
 	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
 		return ERR_PTR(-EINVAL);
 
@@ -1472,6 +1478,20 @@  static struct task_struct *copy_process(unsigned long clone_flags,
 	do_posix_clock_monotonic_gettime(&p->start_time);
 	p->real_start_time = p->start_time;
 	monotonic_to_bootbased(&p->real_start_time);
+
+	/*
+	 * Initially set container-relative start time for newly started
+	 * processes. For init process this value is invalid and will be
+	 * changed at container start.
+	 */
+	p->real_start_time_ct = p->real_start_time;
+
+#ifdef CONFIG_VE
+	set_normalized_timespec(&p->real_start_time_ct,
+		p->real_start_time.tv_sec - ve->real_start_timespec.tv_sec,
+		p->real_start_time.tv_nsec - ve->real_start_timespec.tv_nsec);
+#endif
+
 	p->io_context = NULL;
 	p->audit_context = NULL;
 	if (clone_flags & CLONE_THREAD)
diff --git a/kernel/sys.c b/kernel/sys.c
index c4d633ef..2c9b3e6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2457,6 +2457,25 @@  static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
 }
 #endif
 
+static int prctl_set_task_ct_fields(struct task_struct *t, unsigned long arg,
+		unsigned long flags)
+{
+	struct prctl_task_ct_fields params;
+
+#ifdef CONFIG_VE
+	if (!ve_is_super(get_exec_env()) && !t->task_ve->is_pseudosuper)
+		return -EPERM;
+#endif
+
+	if (copy_from_user(&params, (const void __user *)arg, sizeof(params)))
+		return -EFAULT;
+
+	if (flags & PR_TASK_CT_FIELDS_START_TIME)
+		t->real_start_time_ct = ns_to_timespec(params.real_start_time);
+
+	return 0;
+}
+
 int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which)
 {
 	return -EINVAL;
@@ -2684,6 +2703,9 @@  SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			return -EINVAL;
 		error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
 		break;
+	case PR_SET_TASK_CT_FIELDS:
+		error = prctl_set_task_ct_fields(me, arg2, arg3);
+		break;
 	default:
 		error = -EINVAL;
 		break;
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index ad3a698..662fffe 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -308,6 +308,21 @@  struct kthread_attach_work {
 	int result;
 };
 
+static void adj_start_time_at_ve_change(struct task_struct *t,
+	struct ve_struct *old_ve, struct ve_struct *new_ve)
+{
+	int64_t cur_stime_ns;
+	int64_t cur_ve_stime_ns;
+	int64_t tgt_ve_stime_ns;
+
+	cur_stime_ns = timespec_to_ns(&t->real_start_time);
+	cur_ve_stime_ns = timespec_to_ns(&old_ve->real_start_timespec);
+	tgt_ve_stime_ns = timespec_to_ns(&new_ve->real_start_timespec);
+
+	cur_stime_ns += cur_ve_stime_ns - tgt_ve_stime_ns;
+	t->real_start_time_ct = ns_to_timespec(cur_stime_ns);
+}
+
 static void kthread_attach_fn(struct kthread_work *w)
 {
 	struct kthread_attach_work *work = container_of(w,
@@ -336,9 +351,12 @@  static void kthread_attach_fn(struct kthread_work *w)
 	if (err)
 		goto out;
 
+	adj_start_time_at_ve_change(current, current->task_ve, target->task_ve);
+
 	err = cgroup_attach_task_all(target, current);
 	if (err)
 		goto out;
+
 out:
 	work->result = err;
 	complete(&work->done);
@@ -506,6 +524,15 @@  static int ve_start_container(struct ve_struct *ve)
 		ve->real_start_timespec = tsk->real_start_time;
 	}
 
+	/*
+	 * 'current' belongs to this ve, but it's ct-relative start time hasn't
+	 * been initialized correctly at copy_process, because ve's start time
+     * was * zero. On the other hand, we know this task is this ve's
+     * init (pid 1), * so we set it's relative start time to 0.
+	 */
+	tsk->real_start_time_ct.tv_sec = 0;
+	tsk->real_start_time_ct.tv_nsec = 0;
+
 	/* The value is wrong, but it is never compared to process
 	 * start times */
 	ve->start_jiffies = get_jiffies_64();

Comments

Pavel Tikhomirov Jan. 9, 2020, 12:52 p.m.
When you send patches to devel@ list, please don't forget to add a 
"project" tag to each patch, especially if you have patches to different 
"project"s in one series.

e.g.:

a) if you send patch to VZ7 kernel you would have rh7/RH7/RHEL7 in the 
PATCH tag - [PATCH RH7 x/y]
b) if you send patch to VZ7 criu you would have criu/CRIU in the PATCH 
tag - [PATCH criu x/y]

Also best practice for writing commit message is to add 
"subsystem[/subsystem]:" part in the beggining for components you touch, 
for you kernel patch it would be something like "ve/proc: Add separate 
start ..."

Please see 
https://www.kernel.org/doc/html/v4.17/process/submitting-patches.html#describe-your-changes 
and 
https://www.kernel.org/doc/html/v4.17/process/submitting-patches.html#the-canonical-patch-format

On 12/31/19 2:41 PM, Valeriy Vdovin wrote:
> https://jira.sw.ru/browse/PSBM-100083
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
> ---
>   fs/proc/array.c            | 16 ++++------------
>   include/linux/sched.h      |  5 +++++
>   include/uapi/linux/prctl.h |  9 +++++++++
>   kernel/fork.c              | 20 ++++++++++++++++++++
>   kernel/sys.c               | 22 ++++++++++++++++++++++
>   kernel/ve/ve.c             | 27 +++++++++++++++++++++++++++
>   6 files changed, 87 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/proc/array.c b/fs/proc/array.c
> index 3aa8a7d..a6d2834 100644
> --- a/fs/proc/array.c
> +++ b/fs/proc/array.c
> @@ -611,19 +611,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
>   		(unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC
>   				+ task->real_start_time.tv_nsec;
>   #ifdef CONFIG_VE
> -	if (!is_super) {
> -		struct timespec *ve_start_ts =
> -				&get_exec_env()->real_start_timespec;
> -		start_time -=
> -			(unsigned long long)ve_start_ts->tv_sec * NSEC_PER_SEC
> -				+ ve_start_ts->tv_nsec;
> -	}
> -	/* tasks inside a CT can have negative start time e.g. if the CT was
> -	 * migrated from another hw node, in which case we will report 0 in
> -	 * order not to confuse userspace */
> -	if ((s64)start_time < 0)
> -		start_time = 0;
> +	if (!is_super && !task->task_ve->is_pseudosuper)

Here you have a problem:

a) "task" here is not a current task but a task you are reading 
/proc/pid/stat from, likely you want to check if current task is 
pseudosuper or not here instead.
b) You don't need to check pseudosuper here as criu does not read 
/proc/pid/stat on restore (where pseudosuper can be set).

So only leave "if (!is_super)". It's enough.

> +		start_time = (unsigned long long)
> +			timespec_to_ns(&task->real_start_time_ct);
>   #endif
> +
>   	/* convert nsec -> ticks */
>   	start_time = nsec_to_clock_t(start_time);
>   
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 07f9954..0832904 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1934,6 +1934,11 @@ struct task_struct {
>   	struct wake_q_node wake_q;
>   	struct prev_cputime prev_cputime;
>   	struct vtime vtime;
> +	/*
> +	 * this is a container-side copy of 'real_start_time' field
> +	 * shown from inside of a container and modified by host.
> +	 */
> +	struct timespec real_start_time_ct; >   #endif /* __GENKSYMS__ */
>   };
>   
> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> index 02376de..6de7b22 100644
> --- a/include/uapi/linux/prctl.h
> +++ b/include/uapi/linux/prctl.h
> @@ -195,6 +195,7 @@ struct prctl_mm_map {
>   /* Per task speculation control */
>   #define PR_GET_SPECULATION_CTRL		52
>   #define PR_SET_SPECULATION_CTRL		53 > +

Excess empty line.

>   /* Speculation control variants */
>   # define PR_SPEC_STORE_BYPASS		0
>   /* Return and control values for PR_SET/GET_SPECULATION_CTRL */
> @@ -205,4 +206,12 @@ struct prctl_mm_map {
>   # define PR_SPEC_FORCE_DISABLE		(1UL << 3)
>   # define PR_SPEC_DISABLE_NOEXEC		(1UL << 4)
>   
> +/* Set task container related fields */
> +#define PR_SET_TASK_CT_FIELDS	54

In ms linux we already have:
define PR_PAC_RESET_KEYS               54

And also we need to think of what happens if they will take our ID in 
future and we will need to rebase on these change. So either 1) we need 
to chose big (INT_MAX or something like these) number which will be 
never used by ms linux prctl or 2) we need a way in criu to detect if 
our prctl changed id in future. (1) seems a lesser evil for me.

> +#define PR_TASK_CT_FIELDS_START_TIME	(1ULL << 0)
> +
> +struct prctl_task_ct_fields {
> +	__s64 real_start_time;
> +};
> + >   #endif /* _LINUX_PRCTL_H */
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 3d74228..7bc1957 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1348,8 +1348,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>   {
>   	int retval;
>   	struct task_struct *p;
> +

Excess empty line.

>   	void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
>   
> +#ifdef CONFIG_VE
> +	struct ve_struct *ve = get_exec_env();
> +#endif
> +
> +

Excess empty line.

>   	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
>   		return ERR_PTR(-EINVAL);
>   
> @@ -1472,6 +1478,20 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>   	do_posix_clock_monotonic_gettime(&p->start_time);
>   	p->real_start_time = p->start_time;
>   	monotonic_to_bootbased(&p->real_start_time);
> +
> +	/*
> +	 * Initially set container-relative start time for newly started
> +	 * processes. For init process this value is invalid and will be
> +	 * changed at container start.
> +	 */
> +	p->real_start_time_ct = p->real_start_time;
> +
> +#ifdef CONFIG_VE
> +	set_normalized_timespec(&p->real_start_time_ct,
> +		p->real_start_time.tv_sec - ve->real_start_timespec.tv_sec,
> +		p->real_start_time.tv_nsec - ve->real_start_timespec.tv_nsec);

I thought we would:

c) set real_start_time_ct for CT processes only, for host processes it 
has no sense.
d) set real_start_time_ct here to 0 if CT is not yet started. See 
ve_struct->is_running.

> +#endif
> +
>   	p->io_context = NULL;
>   	p->audit_context = NULL;
>   	if (clone_flags & CLONE_THREAD)
> diff --git a/kernel/sys.c b/kernel/sys.c
> index c4d633ef..2c9b3e6 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -2457,6 +2457,25 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
>   }
>   #endif
>   
> +static int prctl_set_task_ct_fields(struct task_struct *t, unsigned long arg,
> +		unsigned long flags)
> +{
> +	struct prctl_task_ct_fields params;
> +
> +#ifdef CONFIG_VE
> +	if (!ve_is_super(get_exec_env()) && !t->task_ve->is_pseudosuper)

Same as (a), but here "t" and "current" are the same thing. It is 
inconsistent to use both get_exec_env() and t->task_ve in one function.

Please add "ve" variable same like you do in copy_process and use it in 
both places, here you may initialize it from t->task_ve if you prefer.

> +		return -EPERM;
> +#endif
> +
> +	if (copy_from_user(&params, (const void __user *)arg, sizeof(params)))
> +		return -EFAULT;
> +
> +	if (flags & PR_TASK_CT_FIELDS_START_TIME)
> +		t->real_start_time_ct = ns_to_timespec(params.real_start_time);
> +
> +	return 0;
> +}
> +
>   int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which)
>   {
>   	return -EINVAL;
> @@ -2684,6 +2703,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>   			return -EINVAL;
>   		error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
>   		break;
> +	case PR_SET_TASK_CT_FIELDS:
> +		error = prctl_set_task_ct_fields(me, arg2, arg3);
> +		break;
>   	default:
>   		error = -EINVAL;
>   		break;
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index ad3a698..662fffe 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -308,6 +308,21 @@ struct kthread_attach_work {
>   	int result;
>   };
>   
> +static void adj_start_time_at_ve_change(struct task_struct *t,
> +	struct ve_struct *old_ve, struct ve_struct *new_ve)
> +{
> +	int64_t cur_stime_ns;
> +	int64_t cur_ve_stime_ns;
> +	int64_t tgt_ve_stime_ns;
> +
> +	cur_stime_ns = timespec_to_ns(&t->real_start_time);
> +	cur_ve_stime_ns = timespec_to_ns(&old_ve->real_start_timespec);
> +	tgt_ve_stime_ns = timespec_to_ns(&new_ve->real_start_timespec);
> +
> +	cur_stime_ns += cur_ve_stime_ns - tgt_ve_stime_ns;
> +	t->real_start_time_ct = ns_to_timespec(cur_stime_ns);
> +}
> +
>   static void kthread_attach_fn(struct kthread_work *w)
>   {
>   	struct kthread_attach_work *work = container_of(w,
> @@ -336,9 +351,12 @@ static void kthread_attach_fn(struct kthread_work *w)
>   	if (err)
>   		goto out;
>   
> +	adj_start_time_at_ve_change(current, current->task_ve, target->task_ve);

I think we should instead set real_start_time_ct in ve_attach similar as 
we do in copy_process. These will also cover the case we forgot about - 
attaching to ve cgroup.

There are only two ways to get into ve cgroup:
- be cloned while parent is in ve cgroup,
- attach to ve cgroup through ve_attach.

In both cases if the VE is not running we should set real_start_time_ct 
to 0, no sense in negative values. If the VE is running we should set 
real_start_time_ct to current time relative to VE start. In first case 
it is p->real_start_time - ve->real_start_timespec in second case it is 
(likely, please check it first) do_posix_clock_monotonic_gettime() - 
ve->real_start_timespec.

> +
>   	err = cgroup_attach_task_all(target, current);
>   	if (err)
>   		goto out;
> +
>   out:
>   	work->result = err;
>   	complete(&work->done);
> @@ -506,6 +524,15 @@ static int ve_start_container(struct ve_struct *ve)
>   		ve->real_start_timespec = tsk->real_start_time;
>   	}
>   
> +	/*
> +	 * 'current' belongs to this ve, but it's ct-relative start time hasn't
> +	 * been initialized correctly at copy_process, because ve's start time

> +     * was * zero. On the other hand, we know this task is this ve's
> +     * init (pid 1), * so we set it's relative start time to 0.

Bad alignment.

> +	 */
> +	tsk->real_start_time_ct.tv_sec = 0;
> +	tsk->real_start_time_ct.tv_nsec = 0;
> +
>   	/* The value is wrong, but it is never compared to process
>   	 * start times */
>   	ve->start_jiffies = get_jiffies_64();
>