[CRIU,v5,1/1] dump/restore: Maintain proper start_time param from /proc/[pid]/stat for each task

Submitted by Valeriy Vdovin on Feb. 4, 2020, 1:02 p.m.

Details

Message ID 1580821366-3934-2-git-send-email-valeriy.vdovin@virtuozzo.com
State New
Series "dump/restore: Maintain proper start_time param from /proc/[pid]/stat for each task"
Headers show

Commit Message

Valeriy Vdovin Feb. 4, 2020, 1:02 p.m.
https://jira.sw.ru/browse/PSBM-64123

Introduce a 'start_time' field into the core image.
It is saved at dump time for each process inside a dumped container
and set back on each new process during the container restore
operation. After restore the container sees this saved value instead
of the real start time of the restored processes.

Signed-off-by: Valeriy Vdovin <valeriy.vdovin@virtuozzo.com>
---
 criu/cr-dump.c          |  60 +++++++++++++++++++++++++++-
 criu/cr-restore.c       | 104 ++++++++++++++++++++++++++++++++++++------------
 criu/include/crtools.h  |  25 ++++++++++++
 criu/include/kerndat.h  |   1 +
 criu/include/prctl.h    |  10 +++++
 criu/include/restorer.h |   9 +++++
 criu/kerndat.c          |  20 ++++++++++
 criu/pie/restorer.c     |  25 ++++++++++++
 images/core.proto       |   1 +
 9 files changed, 228 insertions(+), 27 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/cr-dump.c b/criu/cr-dump.c
index 45626e8..e2520e5 100644
--- a/criu/cr-dump.c
+++ b/criu/cr-dump.c
@@ -996,12 +996,64 @@  static int dump_task_ids(struct pstree_item *item, const struct cr_imgset *cr_im
 	return pb_write_one(img_from_set(cr_imgset, CR_FD_IDS), item->ids, PB_IDS);
 }
 
-int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread *ti)
+struct get_internal_start_time_rq {
+	int pid;			/* in: task whose start_time is requested */
+	unsigned long long result;	/* out: start_time parsed from /proc/[pid]/stat */
+};
+
+static int child_get_internal_start_time(void *arg)
+{
+	struct get_internal_start_time_rq *rq = arg;
+	struct proc_pid_stat st;
+
+	/*
+	 * The start_time shown in /proc/[pid]/stat depends on the ve of
+	 * the reader: join the task's ve first, otherwise the value
+	 * visible to the host is read instead of the container one.
+	 */
+	if (join_veX(rq->pid) != 0) {
+		pr_err("Failed to join ve, owning process %d\n", rq->pid);
+		return -1;
+	}
+
+	if (parse_pid_stat(rq->pid, &st) != 0) {
+		pr_err("Failed to parse /proc/[pid]/stat for process: %d\n", rq->pid);
+		return -1;
+	}
+
+	rq->result = st.start_time;
+	return 0;
+}
+
+/* Save @pid's start_time, read from inside its ve by a helper, into @thread_core. */
+static int dump_thread_ve_start_time(int pid, ThreadCoreEntry *thread_core)
+{
+	int ret;
+	struct get_internal_start_time_rq r = { .pid = pid };
+
+	ret = call_in_child_process(child_get_internal_start_time, &r);
+	if (ret) {
+		pr_err("Failed to get internal start_time of a process from ve\n");
+		return ret;
+	}
+
+	thread_core->has_start_time = true;
+	thread_core->start_time = r.result;
+
+	/* start_time is a 64-bit image field: %lu is too narrow on 32-bit builds. */
+	pr_info("Dumped start_time for task %d is %llu\n",
+		pid, (unsigned long long)thread_core->start_time);
+	return 0;
+}
 
+int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread *ti)
 {
 	int ret;
 	ThreadCoreEntry *tc = core->thread_core;
 
+	if (dump_thread_ve_start_time(pid, tc))
+		return -1;
+
 	ret = collect_lsm_profile(pid, tc->creds);
 	if (!ret) {
 		/*
@@ -1215,11 +1267,15 @@  static int dump_one_zombie(const struct pstree_item *item,
 	int ret = -1;
 	struct cr_img *img;
 
-	core = core_entry_alloc(0, 1);
+	core = core_entry_alloc(1, 1);
 	if (!core)
 		return -1;
 
 	strlcpy((char *)core->tc->comm, pps->comm, TASK_COMM_LEN);
+
+	if (dump_thread_ve_start_time(vpid(item), core->thread_core))
+		return -1;
+
 	core->tc->task_state = TASK_DEAD;
 	core->tc->exit_code = pps->exit_code;
 
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 170beab..0e4a619 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -74,6 +74,7 @@ 
 #include "sigframe.h"
 #include "fdstore.h"
 #include "spfs.h"
+#include "prctl.h"
 
 #include "parasite-syscall.h"
 #include "files-reg.h"
@@ -1122,6 +1123,63 @@  static int wait_on_helpers_zombies(void)
 
 static int wait_exiting_children(char *prefix);
 
+/*
+ * Set the start_time saved in @core on the calling task, which must be the
+ * zombie being restored; @pid is only used for logging. Returns 0 on success
+ * or when there is nothing to restore, non-zero on failure.
+ *
+ * Zombie's start_time restoration differs from the one for a live task.
+ * For a live task the work is done in the pie sigreturn restorer: each
+ * thread executes the restorer blob before doing a sigreturn and sets its
+ * own start_time there via prctl.
+ *
+ * Zombie process is special. We don't support multithreaded zombies and we
+ * don't run any parasite code. We just create one single-threaded process in
+ * this very callstack, patch some stats and kill it back. We are in this
+ * single zombie thread right now, so prctl can be called right here.
+ */
+static int restore_zombie_start_time(CoreEntry *core, pid_t pid)
+{
+	int ret;
+	unsigned long flags;
+	long ticks_per_sec;
+	struct prctl_task_ct_fields ct_fields;
+
+	if (!kdat.task_ct_fields_supported)
+		return 0;
+
+	if (!core->thread_core) {
+		pr_info("Skipping zombie start_time restore. thread_core missing in dump.\n");
+		return 0;
+	}
+
+	if (!core->thread_core->has_start_time) {
+		pr_info("Skipping start_time restore for old image format.\n");
+		return 0;
+	}
+
+	ticks_per_sec = sysconf(_SC_CLK_TCK);
+	if (ticks_per_sec <= 0) {
+		pr_perror("Failed to get clock ticks via sysconf");
+		return -1;
+	}
+
+	/* Convert the clock ticks stored in the image to nanoseconds. */
+	ct_fields.real_start_time = core->thread_core->start_time * (NSEC_PER_SEC / ticks_per_sec);
+	flags = PR_TASK_CT_FIELDS_START_TIME;
+
+	ret = prctl(PR_SET_TASK_CT_FIELDS, (unsigned long)&ct_fields, flags, 0, 0);
+	if (ret) {
+		pr_perror("Failed to restore zombie start_time");
+		return ret;
+	}
+
+	/* start_time is a 64-bit image field: %lu is too narrow on 32-bit builds. */
+	pr_info("Restored zombie start_time for task %d is %llu\n",
+		pid, (unsigned long long)core->thread_core->start_time);
+	return 0;
+}
+
 static int restore_one_zombie(CoreEntry *core)
 {
 	int exit_code = core->tc->exit_code;
@@ -1136,6 +1194,9 @@  static int restore_one_zombie(CoreEntry *core)
 
 	prctl(PR_SET_NAME, (long)(void *)core->tc->comm, 0, 0, 0);
 
+	if (restore_zombie_start_time(core, vpid(current)))
+		return -1;
+
 	if (task_entries != NULL) {
 		if (wait_exiting_children("zombie"))
 			return -1;
@@ -2156,31 +2217,6 @@  static int write_restored_pid(void)
 
 extern char *get_dumpee_veid(pid_t pid_real);
 
-#define join_veX(pid)	join_ve(pid, true)
-
-/*
- * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
- * of files on CT mounts for security. All mounts created after join_veX are
- * marked as CT mounts, including all mounts of the root_yard temporary mntns.
- * So if you do join_ve0 you can be blocked from executing anything.
- *
- * https://jira.sw.ru/browse/PSBM-98702
- *
- * note: If for some reason we will desperately need to execute binaries from
- * mounts in the root_yard temporary mntns from VE0 we have an option:
- *
- * In restore_root_task before calling join_veX we can clone a helper process
- * which will create CT userns and mntns first (all mounts are marked as host
- * mounts), next after join_veX in restore_root_task we create another helper
- * process which setns'es to these user and mnt namespaces, and from these
- * helper we can clone CT init process obviousely without CLONE_NEWNS and
- * CLONE_NEWUSER. These way userns, mntns, ve will be preserved for all tasks
- * but all mounts cloned from host will be marked as host mounts, and execution
- * on them will be allowed even from VE0.
- */
-
-#define join_ve0(pid)	join_ve(pid, false)
-
 /*
  * To eliminate overhead we don't parse VE cgroup mountpoint
  * but presume to find it in known place. Otherwise simply
@@ -3455,6 +3491,7 @@  static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
 
 	long new_sp;
 	long ret;
+	long ticks_per_sec;
 
 	long rst_mem_size;
 	long memzone_size;
@@ -3483,6 +3520,12 @@  static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
 	BUILD_BUG_ON(sizeof(struct task_restore_args) & 1);
 	BUILD_BUG_ON(sizeof(struct thread_restore_args) & 1);
 
+	ticks_per_sec = sysconf(_SC_CLK_TCK);
+	if (ticks_per_sec == -1) {
+		pr_perror("Failed to get clock ticks via sysconf");
+		return -1;
+	}
+
 	/*
 	 * Read creds info for every thread and allocate memory
 	 * needed so we can use this data inside restorer.
@@ -3740,6 +3783,17 @@  static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
 			strncpy(thread_args[i].comm, core->tc->comm, TASK_COMM_LEN - 1);
 		thread_args[i].comm[TASK_COMM_LEN - 1] = 0;
 
+		thread_args[i].restore_start_time = false;
+		if (kdat.task_ct_fields_supported) {
+			if (!tcore->thread_core->has_start_time) {
+				pr_warn("Skipping restore_start_time for old image version.\n");
+			} else {
+				thread_args[i].start_time =
+					tcore->thread_core->start_time * (NSEC_PER_SEC / ticks_per_sec);
+				thread_args[i].restore_start_time = true;
+			}
+		}
+
 		if (thread_args[i].pid != pid)
 			core_entry__free_unpacked(tcore, NULL);
 
diff --git a/criu/include/crtools.h b/criu/include/crtools.h
index c3f7cb3..902d18a 100644
--- a/criu/include/crtools.h
+++ b/criu/include/crtools.h
@@ -44,6 +44,31 @@  extern void pr_check_features(const char *offset, const char *sep, int width);
 			.actor = name##_cb,			\
 	}
 
+#define join_veX(pid)	join_ve(pid, true)
+
+/*
+ * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
+ * of files on CT mounts for security. All mounts created after join_veX are
+ * marked as CT mounts, including all mounts of the root_yard temporary mntns.
+ * So if you do join_ve0 you can be blocked from executing anything.
+ *
+ * https://jira.sw.ru/browse/PSBM-98702
+ *
+ * note: If for some reason we will desperately need to execute binaries from
+ * mounts in the root_yard temporary mntns from VE0 we have an option:
+ *
+ * In restore_root_task before calling join_veX we can clone a helper process
+ * which will create CT userns and mntns first (all mounts are marked as host
+ * mounts), next after join_veX in restore_root_task we create another helper
+ * process which setns'es to these user and mnt namespaces, and from this
+ * helper we can clone CT init process obviously without CLONE_NEWNS and
+ * CLONE_NEWUSER. This way userns, mntns, ve will be preserved for all tasks
+ * but all mounts cloned from host will be marked as host mounts, and execution
+ * on them will be allowed even from VE0.
+ */
+
+#define join_ve0(pid)	join_ve(pid, false)
+
 int join_ve(pid_t pid, bool veX);
 
 #endif /* __CR_CRTOOLS_H__ */
diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h
index 72fcbf3..5042e94 100644
--- a/criu/include/kerndat.h
+++ b/criu/include/kerndat.h
@@ -77,6 +77,7 @@  struct kerndat_s {
 	bool has_inotify_setnextwd;
 	bool has_kcmp_epoll_tfd;
 	bool has_ve_ctty;
+	bool task_ct_fields_supported;
 };
 
 extern struct kerndat_s kdat;
diff --git a/criu/include/prctl.h b/criu/include/prctl.h
index 8e7fef3..52b9a30 100644
--- a/criu/include/prctl.h
+++ b/criu/include/prctl.h
@@ -82,4 +82,14 @@  struct prctl_mm_map {
 # define PR_GET_THP_DISABLE	42
 #endif
 
+#ifndef PR_SET_TASK_CT_FIELDS
+/* Set task container related fields */
+#define PR_SET_TASK_CT_FIELDS	1000
+#define PR_TASK_CT_FIELDS_START_TIME	(1ULL << 0)	/* flag passed when real_start_time is set */
+
+struct prctl_task_ct_fields {
+	s64 real_start_time;	/* NOTE(review): callers pass ticks scaled to nanoseconds - confirm unit */
+};
+#endif
+
 #endif /* __CR_PRCTL_H__ */
diff --git a/criu/include/restorer.h b/criu/include/restorer.h
index 07f0439..9f43dd8 100644
--- a/criu/include/restorer.h
+++ b/criu/include/restorer.h
@@ -112,6 +112,15 @@  struct thread_restore_args {
 	bool				seccomp_force_tsync;
 
 	char				comm[TASK_COMM_LEN];
+
+	/* set to true if start_time value is valid
+	 * and should be used for restore process
+	 */
+	bool				restore_start_time;
+	/* start_time of a recovered process, already converted
+	 * from /proc/pid/stat(22) clock ticks to nanoseconds
+	 */
+	unsigned long long		start_time;
 } __aligned(64);
 
 typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args);
diff --git a/criu/kerndat.c b/criu/kerndat.c
index b516789..3cfae1f 100644
--- a/criu/kerndat.c
+++ b/criu/kerndat.c
@@ -846,6 +846,25 @@  static int kerndat_x86_has_ptrace_fpu_xsave_bug(void)
 	return 0;
 }
 
+
+/* Probe whether the running kernel knows the PR_SET_TASK_CT_FIELDS prctl. */
+static void kerndat_task_ct_fields_supported(void)
+{
+	long ret;
+	struct prctl_task_ct_fields ct_fields = { 0 };
+
+	/* The flags argument is 0, so a supporting kernel does nothing and
+	 * succeeds, while an unknown prctl code is rejected with EINVAL. */
+	ret = prctl(PR_SET_TASK_CT_FIELDS, (unsigned long)&ct_fields, 0, 0, 0);
+	if (ret && errno == EINVAL) {
+		kdat.task_ct_fields_supported = false;
+		pr_info("prctl code PR_SET_TASK_CT_FIELDS is unsupported by kernel version. "
+			"Restore of task's start_time field will be skipped.\n");
+	} else {
+		kdat.task_ct_fields_supported = true;
+	}
+}
+
 #define KERNDAT_CACHE_FILE	KDAT_RUNDIR"/criu.kdat"
 #define KERNDAT_CACHE_FILE_TMP	KDAT_RUNDIR"/.criu.kdat"
 
@@ -1141,6 +1160,7 @@  int kerndat_init(void)
 	if (!ret)
 		ret = kerndat_nl_repair();
 
+	kerndat_task_ct_fields_supported();
 	kerndat_ve_ctty();
 	kerndat_lsm();
 	kerndat_mmap_min_addr();
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index 0351986..945a00a 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -113,6 +113,28 @@  void parasite_cleanup(void)
 {
 }
 
+/* Apply args->start_time to the calling thread when the restorer was asked to. */
+static int restore_start_time(struct thread_restore_args *args)
+{
+	int ret;
+	unsigned long flags = PR_TASK_CT_FIELDS_START_TIME;
+	struct prctl_task_ct_fields ct_fields;
+
+	if (!args->restore_start_time)
+		return 0;
+
+	ct_fields.real_start_time = args->start_time;
+	ret = sys_prctl_safe(PR_SET_TASK_CT_FIELDS, (unsigned long)&ct_fields, flags, 0);
+	if (ret) {
+		/* A failure aborts the thread restore, so log it as an error. */
+		pr_err("Failed to restore start_time\n");
+		return ret;
+	}
+
+	pr_info("Restored start_time to %lld\n", args->start_time);
+	return 0;
+}
+
 extern void cr_restore_rt (void) asm ("__cr_restore_rt")
 			__attribute__ ((visibility ("hidden")));
 
@@ -523,6 +545,9 @@  static int restore_thread_common(struct thread_restore_args *args)
 
 	restore_sched_info(&args->sp);
 
+	if (restore_start_time(args))
+		return -1;
+
 	if (restore_nonsigframe_gpregs(&args->gpregs))
 		return -1;
 
diff --git a/images/core.proto b/images/core.proto
index 6ef5f50..41f412f 100644
--- a/images/core.proto
+++ b/images/core.proto
@@ -119,6 +119,7 @@  message thread_core_entry {
 	optional uint32			seccomp_filter	= 12;
 
 	optional string			comm		= 13;
+	optional uint64			start_time	= 14;
 }
 
 message task_rlimits_entry {