[CRIU,2/2] dump/restore: Maintain proper start_time param from /proc/[pid]/stat for each task

Submitted by Valeriy Vdovin on Jan. 13, 2020, 8:25 a.m.

Details

Message ID 1578903941-8393-3-git-send-email-valeriy.vdovin@virtuozzo.com
State New
Series "Series without cover letter"
Headers show

Commit Message

Valeriy Vdovin Jan. 13, 2020, 8:25 a.m.
https://jira.sw.ru/browse/PSBM-100083
Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
---
 criu/cr-dump.c         | 49 ++++++++++++++++++++++++++++++++++++
 criu/cr-restore.c      | 68 ++++++++++++++++++++++++++++++--------------------
 criu/include/crtools.h | 31 +++++++++++++++++++++++
 images/core.proto      |  2 ++
 4 files changed, 123 insertions(+), 27 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/cr-dump.c b/criu/cr-dump.c
index 45626e8..751141f 100644
--- a/criu/cr-dump.c
+++ b/criu/cr-dump.c
@@ -1037,6 +1037,47 @@  int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread
 	return ret;
 }
 
+struct get_internal_start_time_rq {
+	int pid;
+	unsigned long long result;
+};
+
+static int child_get_internal_start_time(void *arg)
+{
+	struct proc_pid_stat p;
+	struct get_internal_start_time_rq *r =
+		(struct get_internal_start_time_rq *)arg;
+
+	/* We need to join ve to access container relative
+	 * value of task's start_time, otherwize we will see
+	 * start_time visible to host.
+	 */
+	join_veX(r->pid);
+
+	parse_pid_stat(r->pid, &p);
+	r->result = p.start_time;
+	return 0;
+}
+
+static int dump_task_internal_start_time(int pid, TaskCoreEntry *tc)
+{
+	int ret;
+	struct get_internal_start_time_rq r = {
+		.pid = pid,
+		.result = 0
+	};
+
+	ret = call_in_child_process(child_get_internal_start_time, &r);
+	if (ret) {
+		pr_err("Failed to exec in child\n");
+		return ret;
+	}
+
+	tc->has_start_time = 1;
+	tc->start_time = r.result;
+	return 0;
+}
+
 static int dump_task_core_all(struct parasite_ctl *ctl,
 			      struct pstree_item *item,
 			      const struct proc_pid_stat *stat,
@@ -1063,6 +1104,14 @@  static int dump_task_core_all(struct parasite_ctl *ctl,
 	core->tc->task_state = item->pid->state;
 	core->tc->exit_code = 0;
 
+	ret = dump_task_internal_start_time(pid, core->tc);
+	if (ret) {
+		pr_err("Failed to dump start_time for task %d\n", pid);
+		goto err;
+	}
+
+	pr_info("Dumped start_time of task %d is %lu\n", pid, core->tc->start_time);
+
 	if (stat->tty_nr) {
 		struct pstree_item *p = item;
 
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 170beab..c9777ae 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -947,6 +947,40 @@  static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc)
 static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core);
 static int prepare_mm(pid_t pid, struct task_restore_args *args);
 
+static int restore_start_time(int pid, CoreEntry *core)
+{
+	unsigned long long total_nsec;
+	unsigned long flags;
+	long tps;
+	struct prctl_task_ct_fields ct_fields;
+
+	if (!core->tc->has_start_time) {
+		pr_warn("Skipping restore_start_time for old image version.\n");
+		return -1;
+	}
+
+	tps = sysconf(_SC_CLK_TCK);
+	if (tps == -1) {
+		pr_perror("Failed to get clock ticks via sysconf");
+		return -1;
+	}
+
+	total_nsec = core->tc->start_time * (NSEC_PER_SEC / tps);
+
+	ct_fields.real_start_time = total_nsec;
+	flags = PR_TASK_CT_FIELDS_START_TIME;
+
+	if (prctl(PR_SET_TASK_CT_FIELDS, (unsigned long)&ct_fields, flags, 0, 0)) {
+		pr_perror("Can't set process start time");
+		return -1;
+	}
+
+	pr_info("Restored start_time of task %d is %lu\n",
+		pid, core->tc->start_time);
+
+	return 0;
+}
+
 static int restore_one_alive_task(int pid, CoreEntry *core)
 {
 	unsigned args_len;
@@ -955,6 +989,8 @@  static int restore_one_alive_task(int pid, CoreEntry *core)
 
 	rst_mem_switch_to_private();
 
+	restore_start_time(pid, core);
+
 	args_len = round_up(sizeof(*ta) + sizeof(struct thread_restore_args) *
 			current->nr_threads, page_size());
 	ta = mmap(NULL, args_len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
@@ -1122,7 +1158,7 @@  static int wait_on_helpers_zombies(void)
 
 static int wait_exiting_children(char *prefix);
 
-static int restore_one_zombie(CoreEntry *core)
+static int restore_one_zombie(int pid, CoreEntry *core)
 {
 	int exit_code = core->tc->exit_code;
 
@@ -1134,6 +1170,8 @@  static int restore_one_zombie(CoreEntry *core)
 	if (lazy_pages_setup_zombie(vpid(current)))
 		return -1;
 
+	restore_start_time(pid, core);
+
 	prctl(PR_SET_NAME, (long)(void *)core->tc->comm, 0, 0, 0);
 
 	if (task_entries != NULL) {
@@ -1142,6 +1180,7 @@  static int restore_one_zombie(CoreEntry *core)
 		zombie_prepare_signals();
 	}
 
+
 	if (exit_code & 0x7f) {
 		int signr;
 
@@ -1344,7 +1383,7 @@  static int restore_one_task(int pid, CoreEntry *core)
 	if (task_alive(current))
 		ret = restore_one_alive_task(pid, core);
 	else if (current->pid->state == TASK_DEAD)
-		ret = restore_one_zombie(core);
+		ret = restore_one_zombie(pid, core);
 	else if (current->pid->state == TASK_HELPER) {
 		ret = restore_one_helper();
 	} else {
@@ -2156,31 +2195,6 @@  static int write_restored_pid(void)
 
 extern char *get_dumpee_veid(pid_t pid_real);
 
-#define join_veX(pid)	join_ve(pid, true)
-
-/*
- * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
- * of files on CT mounts for security. All mounts created after join_veX are
- * marked as CT mounts, including all mounts of the root_yard temporary mntns.
- * So if you do join_ve0 you can be blocked from executing anything.
- *
- * https://jira.sw.ru/browse/PSBM-98702
- *
- * note: If for some reason we will desperately need to execute binaries from
- * mounts in the root_yard temporary mntns from VE0 we have an option:
- *
- * In restore_root_task before calling join_veX we can clone a helper process
- * which will create CT userns and mntns first (all mounts are marked as host
- * mounts), next after join_veX in restore_root_task we create another helper
- * process which setns'es to these user and mnt namespaces, and from these
- * helper we can clone CT init process obviousely without CLONE_NEWNS and
- * CLONE_NEWUSER. These way userns, mntns, ve will be preserved for all tasks
- * but all mounts cloned from host will be marked as host mounts, and execution
- * on them will be allowed even from VE0.
- */
-
-#define join_ve0(pid)	join_ve(pid, false)
-
 /*
  * To eliminate overhead we don't parse VE cgroup mountpoint
  * but presume to find it in known place. Otherwise simply
diff --git a/criu/include/crtools.h b/criu/include/crtools.h
index c3f7cb3..acb1faf 100644
--- a/criu/include/crtools.h
+++ b/criu/include/crtools.h
@@ -44,6 +44,37 @@  extern void pr_check_features(const char *offset, const char *sep, int width);
 			.actor = name##_cb,			\
 	}
 
+#define join_veX(pid)	join_ve(pid, true)
+
+/*
+ * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
+ * of files on CT mounts for security. All mounts created after join_veX are
+ * marked as CT mounts, including all mounts of the root_yard temporary mntns.
+ * So if you do join_ve0 you can be blocked from executing anything.
+ *
+ * https://jira.sw.ru/browse/PSBM-98702
+ *
+ * note: If for some reason we will desperately need to execute binaries from
+ * mounts in the root_yard temporary mntns from VE0 we have an option:
+ *
+ * In restore_root_task before calling join_veX we can clone a helper process
+ * which will create CT userns and mntns first (all mounts are marked as host
+ * mounts), next after join_veX in restore_root_task we create another helper
+ * process which setns'es to these user and mnt namespaces, and from these
+ * helper we can clone CT init process obviousely without CLONE_NEWNS and
+ * CLONE_NEWUSER. These way userns, mntns, ve will be preserved for all tasks
+ * but all mounts cloned from host will be marked as host mounts, and execution
+ * on them will be allowed even from VE0.
+ */
+
+#define join_ve0(pid)	join_ve(pid, false)
+
+/*
+ * To eliminate overhead we don't parse VE cgroup mountpoint
+ * but presume to find it in known place. Otherwise simply
+ * don't enter into veX with one warning.
+ */
+
 int join_ve(pid_t pid, bool veX);
 
 #endif /* __CR_CRTOOLS_H__ */
diff --git a/images/core.proto b/images/core.proto
index 6ef5f50..c164c7a 100644
--- a/images/core.proto
+++ b/images/core.proto
@@ -50,6 +50,7 @@  message task_core_entry_VZ730 {
 	optional int32			tty_nr		= 15;
 	optional int32			tty_pgrp	= 16;
 	repeated sa_entry		sigactions	= 17;
+	optional uint64			start_time	= 19;
 }
 
 message task_core_entry {
@@ -79,6 +80,7 @@  message task_core_entry {
 	repeated sa_entry		sigactions	= 15;
 	optional int32			tty_nr		= 16;
 	optional int32			tty_pgrp	= 17;
+	optional uint64			start_time	= 19;
 }
 
 message task_kobj_ids_entry {

Comments

Pavel Tikhomirov Jan. 13, 2020, 3:23 p.m.
Two small nits, everything else is OK.

On 1/13/20 11:25 AM, Valeriy Vdovin wrote:
> https://jira.sw.ru/browse/PSBM-100083
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
> ---
>   criu/cr-dump.c         | 49 ++++++++++++++++++++++++++++++++++++
>   criu/cr-restore.c      | 68 ++++++++++++++++++++++++++++++--------------------
>   criu/include/crtools.h | 31 +++++++++++++++++++++++
>   images/core.proto      |  2 ++
>   4 files changed, 123 insertions(+), 27 deletions(-)
> 
> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
> index 45626e8..751141f 100644
> --- a/criu/cr-dump.c
> +++ b/criu/cr-dump.c
> @@ -1037,6 +1037,47 @@ int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread
>   	return ret;
>   }
>   
> +struct get_internal_start_time_rq {
> +	int pid;
> +	unsigned long long result;
> +};
> +
> +static int child_get_internal_start_time(void *arg)
> +{
> +	struct proc_pid_stat p;
> +	struct get_internal_start_time_rq *r =
> +		(struct get_internal_start_time_rq *)arg;
> +
> +	/* We need to join ve to access container relative
> +	 * value of task's start_time, otherwize we will see
> +	 * start_time visible to host.
> +	 */
> +	join_veX(r->pid);
> +
> +	parse_pid_stat(r->pid, &p);
> +	r->result = p.start_time;
> +	return 0;
> +}
> +
> +static int dump_task_internal_start_time(int pid, TaskCoreEntry *tc)
> +{
> +	int ret;
> +	struct get_internal_start_time_rq r = {
> +		.pid = pid,
> +		.result = 0
> +	};
> +
> +	ret = call_in_child_process(child_get_internal_start_time, &r);
> +	if (ret) {
> +		pr_err("Failed to exec in child\n");
> +		return ret;
> +	}
> +
> +	tc->has_start_time = 1;
> +	tc->start_time = r.result;
> +	return 0;
> +}
> +
>   static int dump_task_core_all(struct parasite_ctl *ctl,
>   			      struct pstree_item *item,
>   			      const struct proc_pid_stat *stat,
> @@ -1063,6 +1104,14 @@ static int dump_task_core_all(struct parasite_ctl *ctl,
>   	core->tc->task_state = item->pid->state;
>   	core->tc->exit_code = 0;
>   
> +	ret = dump_task_internal_start_time(pid, core->tc);
> +	if (ret) {
> +		pr_err("Failed to dump start_time for task %d\n", pid);
> +		goto err;
> +	}
> +
> +	pr_info("Dumped start_time of task %d is %lu\n", pid, core->tc->start_time);
> +
>   	if (stat->tty_nr) {
>   		struct pstree_item *p = item;
>   
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index 170beab..c9777ae 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -947,6 +947,40 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc)
>   static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core);
>   static int prepare_mm(pid_t pid, struct task_restore_args *args);
>   
> +static int restore_start_time(int pid, CoreEntry *core)
> +{
> +	unsigned long long total_nsec;
> +	unsigned long flags;
> +	long tps;
> +	struct prctl_task_ct_fields ct_fields;
> +
> +	if (!core->tc->has_start_time) {
> +		pr_warn("Skipping restore_start_time for old image version.\n");
> +		return -1;
> +	}
> +
> +	tps = sysconf(_SC_CLK_TCK);
> +	if (tps == -1) {
> +		pr_perror("Failed to get clock ticks via sysconf");
> +		return -1;
> +	}
> +
> +	total_nsec = core->tc->start_time * (NSEC_PER_SEC / tps);
> +
> +	ct_fields.real_start_time = total_nsec;
> +	flags = PR_TASK_CT_FIELDS_START_TIME;
> +
> +	if (prctl(PR_SET_TASK_CT_FIELDS, (unsigned long)&ct_fields, flags, 0, 0)) {
> +		pr_perror("Can't set process start time");
> +		return -1;
> +	}
> +
> +	pr_info("Restored start_time of task %d is %lu\n",
> +		pid, core->tc->start_time);
> +
> +	return 0;
> +}
> +
>   static int restore_one_alive_task(int pid, CoreEntry *core)
>   {
>   	unsigned args_len;
> @@ -955,6 +989,8 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
>   
>   	rst_mem_switch_to_private();
>   
> +	restore_start_time(pid, core);
> +
>   	args_len = round_up(sizeof(*ta) + sizeof(struct thread_restore_args) *
>   			current->nr_threads, page_size());
>   	ta = mmap(NULL, args_len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
> @@ -1122,7 +1158,7 @@ static int wait_on_helpers_zombies(void)
>   
>   static int wait_exiting_children(char *prefix);
>   
> -static int restore_one_zombie(CoreEntry *core)
> +static int restore_one_zombie(int pid, CoreEntry *core)
>   {
>   	int exit_code = core->tc->exit_code;
>   
> @@ -1134,6 +1170,8 @@ static int restore_one_zombie(CoreEntry *core)
>   	if (lazy_pages_setup_zombie(vpid(current)))
>   		return -1;
>   
> +	restore_start_time(pid, core);
> +
>   	prctl(PR_SET_NAME, (long)(void *)core->tc->comm, 0, 0, 0);
>   
>   	if (task_entries != NULL) {
> @@ -1142,6 +1180,7 @@ static int restore_one_zombie(CoreEntry *core)
>   		zombie_prepare_signals();
>   	}
>   
> +

Excess empty line.

>   	if (exit_code & 0x7f) {
>   		int signr;
>   
> @@ -1344,7 +1383,7 @@ static int restore_one_task(int pid, CoreEntry *core)
>   	if (task_alive(current))
>   		ret = restore_one_alive_task(pid, core);
>   	else if (current->pid->state == TASK_DEAD)
> -		ret = restore_one_zombie(core);
> +		ret = restore_one_zombie(pid, core);
>   	else if (current->pid->state == TASK_HELPER) {
>   		ret = restore_one_helper();
>   	} else {
> @@ -2156,31 +2195,6 @@ static int write_restored_pid(void)
>   
>   extern char *get_dumpee_veid(pid_t pid_real);
>   
> -#define join_veX(pid)	join_ve(pid, true)
> -
> -/*
> - * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
> - * of files on CT mounts for security. All mounts created after join_veX are
> - * marked as CT mounts, including all mounts of the root_yard temporary mntns.
> - * So if you do join_ve0 you can be blocked from executing anything.
> - *
> - * https://jira.sw.ru/browse/PSBM-98702
> - *
> - * note: If for some reason we will desperately need to execute binaries from
> - * mounts in the root_yard temporary mntns from VE0 we have an option:
> - *
> - * In restore_root_task before calling join_veX we can clone a helper process
> - * which will create CT userns and mntns first (all mounts are marked as host
> - * mounts), next after join_veX in restore_root_task we create another helper
> - * process which setns'es to these user and mnt namespaces, and from these
> - * helper we can clone CT init process obviousely without CLONE_NEWNS and
> - * CLONE_NEWUSER. These way userns, mntns, ve will be preserved for all tasks
> - * but all mounts cloned from host will be marked as host mounts, and execution
> - * on them will be allowed even from VE0.
> - */
> -
> -#define join_ve0(pid)	join_ve(pid, false)
> -
>   /*
>    * To eliminate overhead we don't parse VE cgroup mountpoint
>    * but presume to find it in known place. Otherwise simply
> diff --git a/criu/include/crtools.h b/criu/include/crtools.h
> index c3f7cb3..acb1faf 100644
> --- a/criu/include/crtools.h
> +++ b/criu/include/crtools.h
> @@ -44,6 +44,37 @@ extern void pr_check_features(const char *offset, const char *sep, int width);
>   			.actor = name##_cb,			\
>   	}
>   
> +#define join_veX(pid)	join_ve(pid, true)
> +
> +/*
> + * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
> + * of files on CT mounts for security. All mounts created after join_veX are
> + * marked as CT mounts, including all mounts of the root_yard temporary mntns.
> + * So if you do join_ve0 you can be blocked from executing anything.
> + *
> + * https://jira.sw.ru/browse/PSBM-98702
> + *
> + * note: If for some reason we will desperately need to execute binaries from
> + * mounts in the root_yard temporary mntns from VE0 we have an option:
> + *
> + * In restore_root_task before calling join_veX we can clone a helper process
> + * which will create CT userns and mntns first (all mounts are marked as host
> + * mounts), next after join_veX in restore_root_task we create another helper
> + * process which setns'es to these user and mnt namespaces, and from these
> + * helper we can clone CT init process obviousely without CLONE_NEWNS and
> + * CLONE_NEWUSER. These way userns, mntns, ve will be preserved for all tasks
> + * but all mounts cloned from host will be marked as host mounts, and execution
> + * on them will be allowed even from VE0.
> + */
> +
> +#define join_ve0(pid)	join_ve(pid, false)
> +
> +/*
> + * To eliminate overhead we don't parse VE cgroup mountpoint
> + * but presume to find it in known place. Otherwise simply
> + * don't enter into veX with one warning.
> + */

Duplicated comment.

> +
>   int join_ve(pid_t pid, bool veX);
>   
>   #endif /* __CR_CRTOOLS_H__ */
> diff --git a/images/core.proto b/images/core.proto
> index 6ef5f50..c164c7a 100644
> --- a/images/core.proto
> +++ b/images/core.proto
> @@ -50,6 +50,7 @@ message task_core_entry_VZ730 {
>   	optional int32			tty_nr		= 15;
>   	optional int32			tty_pgrp	= 16;
>   	repeated sa_entry		sigactions	= 17;
> +	optional uint64			start_time	= 19;
>   }
>   
>   message task_core_entry {
> @@ -79,6 +80,7 @@ message task_core_entry {
>   	repeated sa_entry		sigactions	= 15;
>   	optional int32			tty_nr		= 16;
>   	optional int32			tty_pgrp	= 17;
> +	optional uint64			start_time	= 19;
>   }
>   
>   message task_kobj_ids_entry {
>
Pavel Tikhomirov Jan. 13, 2020, 3:38 p.m.
> +
> +	if (prctl(PR_SET_TASK_CT_FIELDS, (unsigned long)&ct_fields, flags, 0, 0)) {
> +		pr_perror("Can't set process start time");
> +		return -1;
> +	}

If we will update criu while still using older kernel (without prctl) 
and suspend some ct it will not resume, as these will fail with EINVAL. 
Looks like we still need to detect if prctl is supported via kdat.
Pavel Tikhomirov Jan. 14, 2020, 8:27 a.m.
Initially I missed it, but ignoring retcode is really bad mistake.

On 1/13/20 11:25 AM, Valeriy Vdovin wrote:
> https://jira.sw.ru/browse/PSBM-100083
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
> ---
>   criu/cr-dump.c         | 49 ++++++++++++++++++++++++++++++++++++
>   criu/cr-restore.c      | 68 ++++++++++++++++++++++++++++++--------------------
>   criu/include/crtools.h | 31 +++++++++++++++++++++++
>   images/core.proto      |  2 ++
>   4 files changed, 123 insertions(+), 27 deletions(-)
> 
> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
> index 45626e8..751141f 100644
> --- a/criu/cr-dump.c
> +++ b/criu/cr-dump.c
> @@ -1037,6 +1037,47 @@ int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread
>   	return ret;
>   }
>   
> +struct get_internal_start_time_rq {
> +	int pid;
> +	unsigned long long result;
> +};
> +
> +static int child_get_internal_start_time(void *arg)
> +{
> +	struct proc_pid_stat p;
> +	struct get_internal_start_time_rq *r =
> +		(struct get_internal_start_time_rq *)arg;
> +
> +	/* We need to join ve to access container relative
> +	 * value of task's start_time, otherwize we will see
> +	 * start_time visible to host.
> +	 */
> +	join_veX(r->pid);

Not checking return code here.

> +
> +	parse_pid_stat(r->pid, &p);

And here.

> +	r->result = p.start_time;
> +	return 0;
> +}
> +
> +static int dump_task_internal_start_time(int pid, TaskCoreEntry *tc)
> +{
> +	int ret;
> +	struct get_internal_start_time_rq r = {
> +		.pid = pid,
> +		.result = 0
> +	};
> +
> +	ret = call_in_child_process(child_get_internal_start_time, &r);
> +	if (ret) {
> +		pr_err("Failed to exec in child\n");
> +		return ret;
> +	}
> +
> +	tc->has_start_time = 1;
> +	tc->start_time = r.result;
> +	return 0;
> +}
> +
>   static int dump_task_core_all(struct parasite_ctl *ctl,
>   			      struct pstree_item *item,
>   			      const struct proc_pid_stat *stat,
> @@ -1063,6 +1104,14 @@ static int dump_task_core_all(struct parasite_ctl *ctl,
>   	core->tc->task_state = item->pid->state;
>   	core->tc->exit_code = 0;
>   
> +	ret = dump_task_internal_start_time(pid, core->tc);
> +	if (ret) {
> +		pr_err("Failed to dump start_time for task %d\n", pid);
> +		goto err;
> +	}
> +
> +	pr_info("Dumped start_time of task %d is %lu\n", pid, core->tc->start_time);
> +
>   	if (stat->tty_nr) {
>   		struct pstree_item *p = item;
>   
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index 170beab..c9777ae 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -947,6 +947,40 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc)
>   static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core);
>   static int prepare_mm(pid_t pid, struct task_restore_args *args);
>   
> +static int restore_start_time(int pid, CoreEntry *core)
> +{
> +	unsigned long long total_nsec;
> +	unsigned long flags;
> +	long tps;
> +	struct prctl_task_ct_fields ct_fields;
> +
> +	if (!core->tc->has_start_time) {
> +		pr_warn("Skipping restore_start_time for old image version.\n");
> +		return -1;
> +	}
> +
> +	tps = sysconf(_SC_CLK_TCK);
> +	if (tps == -1) {
> +		pr_perror("Failed to get clock ticks via sysconf");
> +		return -1;
> +	}
> +
> +	total_nsec = core->tc->start_time * (NSEC_PER_SEC / tps);
> +
> +	ct_fields.real_start_time = total_nsec;
> +	flags = PR_TASK_CT_FIELDS_START_TIME;
> +
> +	if (prctl(PR_SET_TASK_CT_FIELDS, (unsigned long)&ct_fields, flags, 0, 0)) {
> +		pr_perror("Can't set process start time");
> +		return -1;
> +	}
> +
> +	pr_info("Restored start_time of task %d is %lu\n",
> +		pid, core->tc->start_time);
> +
> +	return 0;
> +}
> +
>   static int restore_one_alive_task(int pid, CoreEntry *core)
>   {
>   	unsigned args_len;
> @@ -955,6 +989,8 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
>   
>   	rst_mem_switch_to_private();
>   
> +	restore_start_time(pid, core);

And here. You should explicitly return 0 in case of EINVAL returned from 
prctl and write a warning that prctl is not supported, and you should 
also return 0 in case of !has_start_time for backward compatibility. 
Sysconf errors and all other errors from prctl do matter, we can't 
ignore them.

I think we need to rework your patch via kdat to call 
prctl(PR_SET_TASK_CT_FIELDS) only once if it's not supported by kernel.

> +
>   	args_len = round_up(sizeof(*ta) + sizeof(struct thread_restore_args) *
>   			current->nr_threads, page_size());
>   	ta = mmap(NULL, args_len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
> @@ -1122,7 +1158,7 @@ static int wait_on_helpers_zombies(void)
>   
>   static int wait_exiting_children(char *prefix);
>   
> -static int restore_one_zombie(CoreEntry *core)
> +static int restore_one_zombie(int pid, CoreEntry *core)
>   {
>   	int exit_code = core->tc->exit_code;
>   
> @@ -1134,6 +1170,8 @@ static int restore_one_zombie(CoreEntry *core)
>   	if (lazy_pages_setup_zombie(vpid(current)))
>   		return -1;
>   
> +	restore_start_time(pid, core);
> +
>   	prctl(PR_SET_NAME, (long)(void *)core->tc->comm, 0, 0, 0);
>   
>   	if (task_entries != NULL) {
> @@ -1142,6 +1180,7 @@ static int restore_one_zombie(CoreEntry *core)
>   		zombie_prepare_signals();
>   	}
>   
> +
>   	if (exit_code & 0x7f) {
>   		int signr;
>   
> @@ -1344,7 +1383,7 @@ static int restore_one_task(int pid, CoreEntry *core)
>   	if (task_alive(current))
>   		ret = restore_one_alive_task(pid, core);
>   	else if (current->pid->state == TASK_DEAD)
> -		ret = restore_one_zombie(core);
> +		ret = restore_one_zombie(pid, core);
>   	else if (current->pid->state == TASK_HELPER) {
>   		ret = restore_one_helper();
>   	} else {
> @@ -2156,31 +2195,6 @@ static int write_restored_pid(void)
>   
>   extern char *get_dumpee_veid(pid_t pid_real);
>   
> -#define join_veX(pid)	join_ve(pid, true)
> -
> -/*
> - * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
> - * of files on CT mounts for security. All mounts created after join_veX are
> - * marked as CT mounts, including all mounts of the root_yard temporary mntns.
> - * So if you do join_ve0 you can be blocked from executing anything.
> - *
> - * https://jira.sw.ru/browse/PSBM-98702
> - *
> - * note: If for some reason we will desperately need to execute binaries from
> - * mounts in the root_yard temporary mntns from VE0 we have an option:
> - *
> - * In restore_root_task before calling join_veX we can clone a helper process
> - * which will create CT userns and mntns first (all mounts are marked as host
> - * mounts), next after join_veX in restore_root_task we create another helper
> - * process which setns'es to these user and mnt namespaces, and from these
> - * helper we can clone CT init process obviousely without CLONE_NEWNS and
> - * CLONE_NEWUSER. These way userns, mntns, ve will be preserved for all tasks
> - * but all mounts cloned from host will be marked as host mounts, and execution
> - * on them will be allowed even from VE0.
> - */
> -
> -#define join_ve0(pid)	join_ve(pid, false)
> -
>   /*
>    * To eliminate overhead we don't parse VE cgroup mountpoint
>    * but presume to find it in known place. Otherwise simply
> diff --git a/criu/include/crtools.h b/criu/include/crtools.h
> index c3f7cb3..acb1faf 100644
> --- a/criu/include/crtools.h
> +++ b/criu/include/crtools.h
> @@ -44,6 +44,37 @@ extern void pr_check_features(const char *offset, const char *sep, int width);
>   			.actor = name##_cb,			\
>   	}
>   
> +#define join_veX(pid)	join_ve(pid, true)
> +
> +/*
> + * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
> + * of files on CT mounts for security. All mounts created after join_veX are
> + * marked as CT mounts, including all mounts of the root_yard temporary mntns.
> + * So if you do join_ve0 you can be blocked from executing anything.
> + *
> + * https://jira.sw.ru/browse/PSBM-98702
> + *
> + * note: If for some reason we will desperately need to execute binaries from
> + * mounts in the root_yard temporary mntns from VE0 we have an option:
> + *
> + * In restore_root_task before calling join_veX we can clone a helper process
> + * which will create CT userns and mntns first (all mounts are marked as host
> + * mounts), next after join_veX in restore_root_task we create another helper
> + * process which setns'es to these user and mnt namespaces, and from these
> + * helper we can clone CT init process obviousely without CLONE_NEWNS and
> + * CLONE_NEWUSER. These way userns, mntns, ve will be preserved for all tasks
> + * but all mounts cloned from host will be marked as host mounts, and execution
> + * on them will be allowed even from VE0.
> + */
> +
> +#define join_ve0(pid)	join_ve(pid, false)
> +
> +/*
> + * To eliminate overhead we don't parse VE cgroup mountpoint
> + * but presume to find it in known place. Otherwise simply
> + * don't enter into veX with one warning.
> + */
> +
>   int join_ve(pid_t pid, bool veX);
>   
>   #endif /* __CR_CRTOOLS_H__ */
> diff --git a/images/core.proto b/images/core.proto
> index 6ef5f50..c164c7a 100644
> --- a/images/core.proto
> +++ b/images/core.proto
> @@ -50,6 +50,7 @@ message task_core_entry_VZ730 {
>   	optional int32			tty_nr		= 15;
>   	optional int32			tty_pgrp	= 16;
>   	repeated sa_entry		sigactions	= 17;
> +	optional uint64			start_time	= 19;
>   }
>   
>   message task_core_entry {
> @@ -79,6 +80,7 @@ message task_core_entry {
>   	repeated sa_entry		sigactions	= 15;
>   	optional int32			tty_nr		= 16;
>   	optional int32			tty_pgrp	= 17;
> +	optional uint64			start_time	= 19;
>   }
>   
>   message task_kobj_ids_entry {
>