[Devel] pcs7: freeze -- Wait for exiting tasks to complete

Submitted by Cyrill Gorcunov on Oct. 7, 2016, 6:07 p.m.

Details

Message ID 1475863626-3342-1-git-send-email-gorcunov@virtuozzo.com
State New
Series "pcs7: freeze -- Wait for exiting tasks to complete"
Headers show

Commit Message

Cyrill Gorcunov Oct. 7, 2016, 6:07 p.m.
There is a small race between exitings tasks and cgroup freezer:
it can report a pid of exiting process which we will consider
as a zombie and refuse to checkpoint. Lets do a trick instead:
if we meets such task just wait a bit and repeate a freezing.

https://jira.sw.ru/browse/PSBM-53282

Signed-off-by: Cyrill Gorcunov <gorcunov@virtuozzo.com>
---
 criu/seize.c | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/seize.c b/criu/seize.c
index f39412f..d6723c1 100644
--- a/criu/seize.c
+++ b/criu/seize.c
@@ -136,11 +136,17 @@  static int seize_cgroup_tree(char *root_path, const char *state)
 			if (stat(buf, &st) == -1 && errno == ENOENT)
 				continue;
 
-			/* fails when meets a zombie */
+			/*
+			 * fails when meets a zombie, or eixting process:
+			 * there is a small race in a kernel -- the process
+			 * may start exiting and we are trying to freeze it
+			 * before it compete exit procedure. The caller simply
+			 * should wait a bit and try freezing again.
+			 */
 			pr_err("zombie %d (comm %s) found while seizing\n",
 			       pid, __task_comm_info(pid));
 			fclose(f);
-			return -1;
+			return -EPERM;
 		}
 	}
 	fclose(f);
@@ -153,6 +159,7 @@  static int seize_cgroup_tree(char *root_path, const char *state)
 
 	while ((de = readdir(dir))) {
 		struct stat st;
+		int ret;
 
 		if (dir_dots(de))
 			continue;
@@ -168,9 +175,10 @@  static int seize_cgroup_tree(char *root_path, const char *state)
 		if (!S_ISDIR(st.st_mode))
 			continue;
 
-		if (seize_cgroup_tree(path, state) < 0) {
+		ret = seize_cgroup_tree(path, state);
+		if (ret < 0) {
 			closedir(dir);
-			return -1;
+			return ret;
 		}
 	}
 	closedir(dir);
@@ -328,7 +336,7 @@  static int freeze_processes(void)
 
 	static const unsigned long step_ms = 100;
 	unsigned long nr_attempts = (opts.timeout * 1000000) / step_ms;
-	unsigned long i;
+	unsigned long i = 0;
 
 	const struct timespec req = {
 		.tv_nsec	= step_ms * 1000000,
@@ -374,7 +382,7 @@  static int freeze_processes(void)
 		 * not read @tasks pids while freezer in
 		 * transition stage.
 		 */
-		for (i = 0; i <= nr_attempts; i++) {
+		for (; i <= nr_attempts; i++) {
 			state = get_freezer_state(fd);
 			if (!state) {
 				close(fd);
@@ -398,7 +406,18 @@  static int freeze_processes(void)
 		pr_debug("freezing processes: %lu attempts done\n", i);
 	}
 
-	exit_code = seize_cgroup_tree(opts.freeze_cgroup, state);
+	/*
+	 * Pay attention on @i variable -- it's continuation.
+	 */
+	for (; i <= nr_attempts; i++) {
+		exit_code = seize_cgroup_tree(opts.freeze_cgroup, state);
+		if (exit_code == -EPERM) {
+			if (alarm_timeouted())
+				goto err;
+			nanosleep(&req, NULL);
+		} else
+			break;
+	}
 
 err:
 	if (exit_code == 0 || freezer_thawed) {