[11/12] criu: Kill tasks even when the network is unlocked

Submitted by Dmitry Safonov on Nov. 9, 2019, 10:20 p.m.

Details

Message ID 20191109222045.89507-12-dima@arista.com
State Accepted
Series "compel: Add __must_check"
Commit 4a64f4394d7c1b8f5dd720ba77ea54580f6b3776
Headers show

Commit Message

Dmitry Safonov Nov. 9, 2019, 10:20 p.m.
Currently if anything fails after network has been unlocked tasks aren't
killed. Which doesn't work anyway: any stage sets `ret` and nothing
later gets called. Which means the tasks aren't resumed properly.
Furthermore, functions like catch_tasks() and compel_stop_on_syscall()
return failure on the first error.

Let's do the cleanup even when the network is unlocked.
If we want to keep the mess and ignore failures - a cli option should be
introduced for that (and existing code should be reworked with decisions
what is critical and what can be ignored).

Move "Restore finished successfully" message accordingly where
everything is evidently good.

While at here, any late failure will result not only in cleanup but in
criu returning error code.

Which in result makes tests to fail in such case:
> ======================= Run zdtm/static/inotify04 in ns ========================
> Start test
> ./inotify04 --pidfile=inotify04.pid --outfile=inotify04.out --dirname=inotify04.test
> Run criu dump
> =[log]=> dump/zdtm/static/inotify04/84/1/dump.log
> ------------------------ grep Error ------------------------
> (00.119763) fsnotify: 			openable (inode match) as zdtm/static/inotify04.test/inotify-testfile
> (00.119766) fsnotify: 	Dumping /zdtm/static/inotify04.test/inotify-testfile as path for handle
> (00.119769) fsnotify: id 0x00000b flags 0x000800
> (00.119787) 88 fdinfo 5: pos:                0 flags:             4000/0
> (00.119796) Warn  (criu/fsnotify.c:336): fsnotify: The 0x00000c inotify events will be dropped
> ------------------------ ERROR OVER ------------------------
> Run criu restore
> =[log]=> dump/zdtm/static/inotify04/84/1/restore.log
> ------------------------ grep Error ------------------------
> (00.391582) 123 was stopped
> (00.391667) 106 was trapped
> (00.391674) 106 (native) is going to execute the syscall 11, required is 11
> (00.391697) 106 was stopped
> (00.391720) Error (compel/src/lib/infect.c:1439): Task 123 is in unexpected state: b7f
> (00.391736) Error (compel/src/lib/infect.c:1447): Task stopped with 11: Segmentation fault
> ------------------------ ERROR OVER ------------------------
> 5: Old maps lost: set([])
> 5: New maps appeared: set([u'10000-1a000 rwxp', u'1a000-24000 rw-p'])
> ############### Test zdtm/static/inotify04 FAIL at maps compare ################
> Send the 9 signal to  106
> Wait for zdtm/static/inotify04(106) to die for 0.100000
> ======================= Test zdtm/static/inotify04 PASS ========================

Signed-off-by: Dmitry Safonov <dima@arista.com>
---
 criu/cr-restore.c | 50 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 32 insertions(+), 18 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 41f78cb7a2f9..953f28e45801 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -1975,7 +1975,7 @@  static void finalize_restore(void)
 	}
 }
 
-static void finalize_restore_detach(int status)
+static int finalize_restore_detach(void)
 {
 	struct pstree_item *item;
 
@@ -1989,16 +1989,21 @@  static void finalize_restore_detach(int status)
 		for (i = 0; i < item->nr_threads; i++) {
 			pid = item->threads[i].real;
 			if (pid < 0) {
-				BUG_ON(status >= 0);
-				break;
+				pr_err("pstree item has unvalid pid %d\n", pid);
+				continue;
 			}
 
-			if (arch_set_thread_regs_nosigrt(&item->threads[i]))
+			if (arch_set_thread_regs_nosigrt(&item->threads[i])) {
 				pr_perror("Restoring regs for %d failed", pid);
-			if (ptrace(PTRACE_DETACH, pid, NULL, 0))
-				pr_perror("Unable to execute %d", pid);
+				return -1;
+			}
+			if (ptrace(PTRACE_DETACH, pid, NULL, 0)) {
+				pr_perror("Unable to detach %d", pid);
+				return -1;
+			}
 		}
 	}
+	return 0;
 }
 
 static void ignore_kids(void)
@@ -2256,32 +2261,37 @@  skip_ns_bouncing:
 
 	/*
 	 * -------------------------------------------------------------
-	 * Below this line nothing should fail, because network is unlocked
+	 * Network is unlocked. If something fails below - we lose data
+	 * or a connection.
 	 */
 	attach_to_tasks(root_seized);
 
-	ret = restore_switch_stage(CR_STATE_RESTORE_CREDS);
-	BUG_ON(ret);
+	if (restore_switch_stage(CR_STATE_RESTORE_CREDS))
+		goto out_kill_network_unlocked;
 
 	timing_stop(TIME_RESTORE);
 
-	ret = catch_tasks(root_seized, &flag);
+	if (catch_tasks(root_seized, &flag)) {
+		pr_err("Can't catch all tasks\n");
+		goto out_kill_network_unlocked;
+	}
 
 	if (lazy_pages_finish_restore())
-		goto out_kill;
+		goto out_kill_network_unlocked;
 
-	pr_info("Restore finished successfully. Resuming tasks.\n");
 	__restore_switch_stage(CR_STATE_COMPLETE);
 
-	if (ret == 0)
-		ret = compel_stop_on_syscall(task_entries->nr_threads,
-			__NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag);
+	ret = compel_stop_on_syscall(task_entries->nr_threads,
+		__NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag);
+	if (ret) {
+		pr_err("Can't stop all tasks on rt_sigreturn\n");
+		goto out_kill_network_unlocked;
+	}
 
 	if (clear_breakpoints())
 		pr_err("Unable to flush breakpoints\n");
 
-	if (ret == 0)
-		finalize_restore();
+	finalize_restore();
 
 	ret = run_scripts(ACT_PRE_RESUME);
 	if (ret)
@@ -2293,8 +2303,10 @@  skip_ns_bouncing:
 	fini_cgroup();
 
 	/* Detaches from processes and they continue run through sigreturn. */
-	finalize_restore_detach(ret);
+	if (finalize_restore_detach())
+		goto out_kill_network_unlocked;
 
+	pr_info("Restore finished successfully. Tasks resumed.\n");
 	write_stats(RESTORE_STATS);
 
 	ret = run_scripts(ACT_POST_RESUME);
@@ -2306,6 +2318,8 @@  skip_ns_bouncing:
 
 	return 0;
 
+out_kill_network_unlocked:
+	pr_err("Killing processes because of failure on restore.\nThe Network was unlocked so some data or a connection may have been lost.\n");
 out_kill:
 	/*
 	 * The processes can be killed only when all of them have been created,