criu: dump and restore cpu affinity of each thread

Submitted by Sang Yan on Nov. 26, 2020, 9:17 a.m.

Details

Message ID 20201126091756.7027-1-sangyan@huawei.com
State New
Series "criu: dump and restore cpu affinity of each thread"
Headers show

Commit Message

Sang Yan Nov. 26, 2020, 9:17 a.m.
CRIU should dump and restore the CPU affinity of every
process and thread.

Add a thread_cpuallow_entry field to thread_core_entry
to save the CPU affinity mask.

Restore it after the threads have been restored but before they resume running.

Signed-off-by: Sang Yan <sangyan@huawei.com>
---
 compel/arch/arm/plugins/std/syscalls/syscall.def   |  1 +
 .../ppc64/plugins/std/syscalls/syscall-ppc64.tbl   |  1 +
 .../s390/plugins/std/syscalls/syscall-s390.tbl     |  1 +
 .../arch/x86/plugins/std/syscalls/syscall_32.tbl   |  1 +
 .../arch/x86/plugins/std/syscalls/syscall_64.tbl   |  1 +
 criu/cr-dump.c                                     | 14 +++++++++++
 criu/cr-restore.c                                  | 22 ++++++++++++++++
 criu/include/restorer.h                            |  3 +++
 criu/pie/restorer.c                                | 29 ++++++++++++++++++++++
 criu/pstree.c                                      |  7 ++++++
 images/core.proto                                  |  5 ++++
 11 files changed, 85 insertions(+)

Patch hide | download patch | download mbox

diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def
index f7ebc85..1c70388 100644
--- a/compel/arch/arm/plugins/std/syscalls/syscall.def
+++ b/compel/arch/arm/plugins/std/syscalls/syscall.def
@@ -116,3 +116,4 @@  fsopen				430	430	(char *fsname, unsigned int flags)
 fsconfig			431	431	(int fd, unsigned int cmd, const char *key, const char *value, int aux)
 fsmount				432	432	(int fd, unsigned int flags, unsigned int attr_flags)
 clone3				435	435	(struct clone_args *uargs, size_t size)
+sched_setaffinity		122	241	(int fd, size_t cpusetsize, const cpu_set_t *mask)
diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
index 1afaf1e..460daf8 100644
--- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
+++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
@@ -112,3 +112,4 @@  __NR_fsopen		430		sys_fsopen		(char *fsname, unsigned int flags)
 __NR_fsconfig		431		sys_fsconfig		(int fd, unsigned int cmd, const char *key, const char *value, int aux)
 __NR_fsmount		432		sys_fsmount		(int fd, unsigned int flags, unsigned int attr_flags)
 __NR_clone3		435		sys_clone3		(struct clone_args *uargs, size_t size)
+__NR_sched_setaffinity	222		sys_sched_setaffinity	(int fd, size_t cpusetsize, const cpu_set_t *mask)
diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
index ae6fdb5..c0bba39 100644
--- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
+++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
@@ -112,3 +112,4 @@  __NR_fsopen		430		sys_fsopen		(char *fsname, unsigned int flags)
 __NR_fsconfig		431		sys_fsconfig		(int fd, unsigned int cmd, const char *key, const char *value, int aux)
 __NR_fsmount		432		sys_fsmount		(int fd, unsigned int flags, unsigned int attr_flags)
 __NR_clone3		435		sys_clone3		(struct clone_args *uargs, size_t size)
+__NR_sched_setaffinity	239		sys_sched_setaffinity	(int fd, size_t cpusetsize, const cpu_set_t *mask)
diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
index 7a48711..29c13e3 100644
--- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
+++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
@@ -63,6 +63,7 @@  __NR_mincore		218		sys_mincore		(void *addr, unsigned long size, unsigned char *
 __NR_madvise		219		sys_madvise		(unsigned long start, size_t len, int behavior)
 __NR_gettid		224		sys_gettid		(void)
 __NR_futex		240		sys_futex		(uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3)
+__NR_sched_setaffinity	241		sys_sched_setaffinity	(int fd, size_t cpusetsize, const cpu_set_t *mask)
 __NR_set_thread_area	243		sys_set_thread_area	(user_desc_t *info)
 __NR_get_thread_area	244		sys_get_thread_area	(user_desc_t *info)
 __NR_io_setup		245		sys_io_setup		(unsigned nr_reqs, aio_context_t *ctx32p)
diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
index 6667c07..74f5482 100644
--- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
+++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
@@ -73,6 +73,7 @@  __NR_mount			165		sys_mount		(char *dev_nmae, char *dir_name, char *type, unsign
 __NR_umount2			166		sys_umount2		(char *name, int flags)
 __NR_gettid			186		sys_gettid		(void)
 __NR_futex			202		sys_futex		(uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3)
+__NR_sched_setaffinity		203		sys_sched_setaffinity	(int fd, size_t cpusetsize, const cpu_set_t *mask)
 __NR_set_thread_area		205		sys_set_thread_area	(user_desc_t *info)
 __NR_io_setup			206		sys_io_setup		(unsigned nr_events, aio_context_t *ctx)
 __NR_io_getevents		208		sys_io_getevents	(aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo)
diff --git a/criu/cr-dump.c b/criu/cr-dump.c
index 193a49c..6ffd526 100644
--- a/criu/cr-dump.c
+++ b/criu/cr-dump.c
@@ -140,6 +140,7 @@  static int dump_sched_info(int pid, ThreadCoreEntry *tc)
 {
 	int ret;
 	struct sched_param sp;
+	cpu_set_t cpumask;
 
 	BUILD_BUG_ON(SCHED_OTHER != 0); /* default in proto message */
 
@@ -185,6 +186,19 @@  static int dump_sched_info(int pid, ThreadCoreEntry *tc)
 	tc->has_sched_nice = true;
 	tc->sched_nice = ret;
 
+	pr_info("\tdumping cpu_allowed for %d\n", pid);
+	memset(&cpumask, 0, sizeof(cpumask)); /* the raw syscall may fill only a prefix */
+	ret = syscall(__NR_sched_getaffinity, pid, sizeof(cpumask), &cpumask);
+	if (ret < 0) {
+		pr_perror("Can't get sched affinity for %d", pid);
+		return -1;
+	}
+	/* __CPU_SETSIZE is a bit count (1024); the mask object is sizeof(cpumask) bytes. */
+	memcpy(tc->cpu_allowed->cpumask, &cpumask, sizeof(cpumask));
+	pr_info("\t 0x%lx, 0x%lx, 0x%lx, 0x%lx\n",
+		tc->cpu_allowed->cpumask[3], tc->cpu_allowed->cpumask[2],
+		tc->cpu_allowed->cpumask[1], tc->cpu_allowed->cpumask[0]);
+
 	return 0;
 }
 
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 8af2e29..375eb54 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -118,6 +118,7 @@  static int prepare_restorer_blob(void);
 static int prepare_rlimits(int pid, struct task_restore_args *, CoreEntry *core);
 static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core);
 static int prepare_signals(int pid, struct task_restore_args *, CoreEntry *core);
+static int prepare_alloweds(int pid, struct task_restore_args *ta, CoreEntry *leader_core);
 
 /*
  * Architectures can overwrite this function to restore registers that are not
@@ -922,6 +923,9 @@  static int restore_one_alive_task(int pid, CoreEntry *core)
 	if (prepare_signals(pid, ta, core))
 		return -1;
 
+	if (prepare_alloweds(pid, ta, core))
+		return -1;
+
 	if (prepare_posix_timers(pid, ta, core))
 		return -1;
 
@@ -3225,6 +3229,23 @@  out:
 	return ret;
 }
 
+static int prepare_alloweds(int pid, struct task_restore_args *ta, CoreEntry *leader_core)
+{
+	int t;
+
+	/* Note the RM_PRIVATE position first; sigreturn_restore() fixes the field up later. */
+	ta->cpualloweds = (cpu_set_t *)rst_mem_align_cpos(RM_PRIVATE);
+	for (t = 0; t < current->nr_threads; t++) {
+		/* One cpu_set_t slot per thread, filled from that thread's core entry. */
+		cpu_set_t *slot = rst_mem_alloc(sizeof(*slot), RM_PRIVATE);
+
+		if (slot == NULL)
+			return -1;
+		memcpy(slot, current->core[t]->thread_core->cpu_allowed->cpumask, sizeof(*slot));
+	}
+	return 0;
+}
+
 extern void __gcov_flush(void) __attribute__((weak));
 void __gcov_flush(void) {}
 
@@ -3684,6 +3705,7 @@  static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
 	RST_MEM_FIXUP_PPTR(task_args->timerfd);
 	RST_MEM_FIXUP_PPTR(task_args->posix_timers);
 	RST_MEM_FIXUP_PPTR(task_args->siginfo);
+	RST_MEM_FIXUP_PPTR(task_args->cpualloweds);
 	RST_MEM_FIXUP_PPTR(task_args->rlims);
 	RST_MEM_FIXUP_PPTR(task_args->helpers);
 	RST_MEM_FIXUP_PPTR(task_args->zombies);
diff --git a/criu/include/restorer.h b/criu/include/restorer.h
index dfb4e6b..67df9f5 100644
--- a/criu/include/restorer.h
+++ b/criu/include/restorer.h
@@ -1,6 +1,7 @@ 
 #ifndef __CR_RESTORER_H__
 #define __CR_RESTORER_H__
 
+#include <sched.h>
 #include <signal.h>
 #include <limits.h>
 #include <sys/resource.h>
@@ -162,6 +163,8 @@  struct task_restore_args {
 	siginfo_t			*siginfo;
 	unsigned int			siginfo_n;
 
+	cpu_set_t			*cpualloweds;
+
 	struct rst_tcp_sock		*tcp_socks;
 	unsigned int			tcp_socks_n;
 
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index b3d7e2b..833b6bb 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -432,6 +432,31 @@  static int restore_signals(siginfo_t *ptr, int nr, bool group)
 	return 0;
 }
 
+static int restore_cpuallowed(struct task_restore_args *args)
+{
+	int idx, tid, err;
+
+	/* cpualloweds[idx] is the saved mask for thread_args[idx]. */
+	for (idx = 0; idx < args->nr_threads; idx++) {
+		cpu_set_t *allowed = args->cpualloweds + idx;
+
+		tid = args->thread_args[idx].pid;
+		pr_info("Restoring %d cpu_allowed %lx, %lx, %lx, %lx\n", tid,
+			allowed->__bits[3], allowed->__bits[2],
+			allowed->__bits[1], allowed->__bits[0]);
+
+		err = sys_sched_setaffinity(tid, sizeof(cpu_set_t), allowed);
+		if (!err)
+			continue;
+		/* First failure aborts the loop; the syscall result is returned as is. */
+		pr_err("\t Restore %d cpumask failed.\n", tid);
+		return err;
+	}
+
+	/* All threads had their affinity applied. */
+	return 0;
+}
+
 static int restore_seccomp_filter(pid_t tid, struct thread_restore_args *args)
 {
 	unsigned int flags = args->seccomp_force_tsync ? SECCOMP_FILTER_FLAG_TSYNC : 0;
@@ -1900,6 +1925,10 @@  long __export_restore_task(struct task_restore_args *args)
 	if (ret)
 		goto core_restore_end;
 
+	ret = restore_cpuallowed(args);
+	if (ret)
+		goto core_restore_end;
+
 	restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD);
 
 	rst_tcp_socks_all(args);
diff --git a/criu/pstree.c b/criu/pstree.c
index f1513dc..d338377 100644
--- a/criu/pstree.c
+++ b/criu/pstree.c
@@ -58,11 +58,13 @@  CoreEntry *core_entry_alloc(int th, int tsk)
 		CredsEntry *ce = NULL;
 
 		sz += sizeof(ThreadCoreEntry) + sizeof(ThreadSasEntry) + sizeof(CredsEntry);
+		sz += sizeof(ThreadCpuallowEntry);
 
 		sz += CR_CAP_SIZE * sizeof(ce->cap_inh[0]);
 		sz += CR_CAP_SIZE * sizeof(ce->cap_prm[0]);
 		sz += CR_CAP_SIZE * sizeof(ce->cap_eff[0]);
 		sz += CR_CAP_SIZE * sizeof(ce->cap_bnd[0]);
+		sz += __CPU_SETSIZE;
 		/*
 		 * @groups are dynamic and allocated
 		 * on demand.
@@ -127,6 +129,11 @@  CoreEntry *core_entry_alloc(int th, int tsk)
 			ce->cap_eff	= xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_eff[0]));
 			ce->cap_bnd	= xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_bnd[0]));
 
+			core->thread_core->cpu_allowed = xptr_pull(&m, ThreadCpuallowEntry);
+			thread_cpuallow_entry__init(core->thread_core->cpu_allowed);
+			core->thread_core->cpu_allowed->n_cpumask = __CPU_SETSIZE / sizeof(uint64_t);
+			core->thread_core->cpu_allowed->cpumask = xptr_pull_s(&m, __CPU_SETSIZE);
+
 			if (arch_alloc_thread_info(core)) {
 				xfree(core);
 				core = NULL;
diff --git a/images/core.proto b/images/core.proto
index 9e9e393..d9788fd 100644
--- a/images/core.proto
+++ b/images/core.proto
@@ -81,6 +81,10 @@  message thread_sas_entry {
 	required uint32			ss_flags	= 3;
 }
 
+message thread_cpuallow_entry {
+	repeated uint64			cpumask		= 1; /* thread's cpu_set_t bytes, stored as 64-bit words */
+}
+
 message thread_core_entry {
 	required uint64			futex_rla	= 1;
 	required uint32			futex_rla_len	= 2;
@@ -99,6 +103,7 @@  message thread_core_entry {
 
 	optional string			comm		= 13;
 	optional uint64			blk_sigset_extended	= 14;
+	required thread_cpuallow_entry	cpu_allowed	= 15;
 }
 
 message task_rlimits_entry {