[v2,07/11] pstree: rework init reparent handling for pid namespaces

Submitted by Pavel Tikhomirov on May 30, 2017, 7:30 a.m.

Details

Message ID 20170530073045.9847-1-ptikhomirov@virtuozzo.com
State New
Series "rework init child-reaper reparent handling for pidnses"
Headers show

Commit Message

Pavel Tikhomirov May 30, 2017, 7:30 a.m.
- Put code into new handle_init_reparent, make it pidns relative
and call it for each pidns.

- Consider the case when a process tree branch (subtree) is reparented to init
(the parent of the root of this branch died), ripping some session in two
pieces, so that the representative of this session in the reparented branch
can not inherit its session if we simply try to fork the tree as is.
The patch adds the helper can_inherit_sid to find such "adopted" branches and
re-reparent them to helpers.

Previously we had only direct children of init handled.

- We need many helpers for one session as:

1) The leader of a session, if it is already dead, can not be recreated as
a helper in an arbitrary pidns, but only in a pidns which is an ancestor of
the pidns of any alive process of this session (a session's processes can't
leave the pidns in which the session was created).

Moreover, a session can be created only on the proper level: the sid array of
an alive process can end with several zeros, meaning that after the creation
of the session the processes entered several more pidnses, so we need to
cut these extra levels before creating the leader.

2) We can not re-reparent a branch directly to the session leader as the latter
can be in another pidns, thus we create an additional helper in our init's pidns,
and its children will reparent to init.

If parents of session processes are in multiple pidnses we will need
helper per each such pidns, to be able to re-reparent them. See test
with setns for an example

- Collect all helper processes in a separate list, so that it is
easier to find them with get_helper for other possibly
existing pieces of this sid. Branches re-reparented to such helpers
are temporarily out of the tree and are also skipped by the walk over items
in for_each_pssubtree_item.

- In collect_child_pids, collect zombies and helpers which will reparent to
the init of a pidns on that pidns init instead of on the root task.

- A process tree which had only reparents to the pidns init process
(no child-subreaper reparents) will be restored fine. One tricky case,
when we need to re-reparent and the session leader is in the same pidns as us
while our parent is in a lower pidns, will fail - it happens when somebody
enters the pidns, does setsid and then does clone(CLONE_PARENT).

v2: treat get_free_pids returning 0 as an error
Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
---
 criu/cr-restore.c |  10 ++-
 criu/pstree.c     | 243 +++++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 177 insertions(+), 76 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index c45791b..53df6f0 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -751,8 +751,14 @@  static int collect_child_pids(int state, unsigned int *n)
 	 * process and they have to be collected too.
 	 */
 
-	if (current == root_item) {
-		for_each_pstree_item(pi) {
+	if (last_level_pid(current->pid) == INIT_PID) {
+		for_each_pssubtree_item(pi, current) {
+			/* Skip items from sub-namespaces */
+			while (pi && pi->ids->pid_ns_id != current->ids->pid_ns_id)
+				pi = pssubtree_item_next(pi, current, true);
+			if (!pi)
+				break;
+
 			if (pi->pid->state != TASK_HELPER &&
 			    pi->pid->state != TASK_DEAD)
 				continue;
diff --git a/criu/pstree.c b/criu/pstree.c
index 2e7a13d..9262833 100644
--- a/criu/pstree.c
+++ b/criu/pstree.c
@@ -899,91 +899,178 @@  int get_free_pids(struct ns_id *ns, pid_t *pids)
 	return MAX_NS_NESTING - i - 1;
 }
 
-static int prepare_pstree_ids(void)
+static int can_inherit_sid(struct pstree_item *item)
+{
+	struct pstree_item *parent;
+	parent = item->parent;
+	while (parent) {
+		/* parent will give the right sid to item */
+		if (vsid(item) == vsid(parent))
+			return 1;
+		/* non-leader can't give children sid different from it's own */
+		if (!is_session_leader(parent))
+			break;
+		/* some other ancestor can have the right pid for item */
+		parent = parent->parent;
+	}
+	return 0;
+}
+
+static struct pstree_item *get_helper(int sid, unsigned int id, struct list_head *helpers)
+{
+	struct pstree_item *helper;
+	list_for_each_entry(helper, helpers, sibling)
+		if (vsid(helper) == sid && helper->ids->pid_ns_id == id)
+			return helper;
+	return NULL;
+}
+
+static int handle_init_reparent(struct ns_id *ns, void *oarg)
 {
-	struct pstree_item *item, *child, *helper, *tmp;
+	struct pstree_item *init, *item, *helper, *branch, *tmp;
 	LIST_HEAD(helpers);
 
-	pid_t current_pgid = getpgid(getpid());
-	if (!list_empty(&top_pid_ns->children))
-		return 0;
+	init = __pstree_item_by_virt(ns, INIT_PID);
 
-	/*
-	 * Some task can be reparented to init. A helper task should be added
-	 * for restoring sid of such tasks. The helper tasks will be exited
-	 * immediately after forking children and all children will be
-	 * reparented to init.
-	 */
-	list_for_each_entry(item, &root_item->children, sibling) {
-		struct pstree_item *leader;
+	for_each_pssubtree_item(item, init) {
+skip:
+		if (!item)
+			break;
 
-		/*
-		 * If a child belongs to the root task's session or it's
-		 * a session leader himself -- this is a simple case, we
-		 * just proceed in a normal way.
-		 */
-		if (equal_pid(item->sid, root_item->sid) || is_session_leader(item))
+		/* Skip pidns's reaper */
+		if (item == init)
 			continue;
 
-		leader = pstree_item_by_virt(vsid(item));
-		BUG_ON(leader == NULL);
-		if (leader->pid->state != TASK_UNDEF) {
-			pid_t pid;
+		/* Session leaders do setsid() */
+		if (is_session_leader(item)) {
+			/*
+			 * Stop on pidns init, it's descendants
+			 * will be handled from it's pidns.
+			 */
+			if (last_level_pid(item->pid) == INIT_PID)
+				goto skip_descendants;
+			continue;
+		}
 
-			pid = get_free_pid(top_pid_ns);
-			if (pid < 0)
-				break;
-			helper = lookup_create_item(&pid, 1, item->ids->pid_ns_id);
-			if (helper == NULL)
+		if (can_inherit_sid(item))
+			goto skip_descendants;
+
+		helper = get_helper(vsid(item), ns->id, &helpers);
+		if (!helper) {
+			struct pstree_item *leader;
+			pid_t pid[MAX_NS_NESTING];
+			int level;
+
+			leader = pstree_item_by_virt(vsid(item));
+			BUG_ON(leader == NULL);
+
+			if (leader->pid->level > init->pid->level)
+				/*
+				 * If leader is in lower pidns, then item's branch
+				 * couldn't have been reparented to init from leader
+				 * - will manage item in other pidns
+				 *
+				 * FIXME One tricky case which does not fit these rule
+				 * is doing CLONE_PARENT after entering pidns and setsid.
+				 */
+				goto skip_descendants;
+
+			if (leader->pid->state == TASK_UNDEF) {
+				struct ns_id *leader_pid_ns = ns;
+				struct pstree_item *linit;
+				int i;
+
+				/*
+				 * Search a proper pidns where session leader helper
+				 * can be created (using the fact that all processes
+				 * of some session should be in pidns of leader or
+				 * some ancestor pidns)
+				 */
+				for (i = 0; i < init->pid->level - leader->pid->level; i++) {
+					BUG_ON(!leader_pid_ns->parent);
+					leader_pid_ns = leader_pid_ns->parent;
+				}
+				BUG_ON(!leader_pid_ns);
+				linit = __pstree_item_by_virt(leader_pid_ns, INIT_PID);
+				BUG_ON(!linit);
+
+				pr_info("Add a session leader helper %d\n", vsid(item));
+
+				memcpy(leader->sid, item->sid, PID_SIZE(leader->sid->level));
+				memcpy(leader->pgid, item->sid, PID_SIZE(leader->pgid->level));
+				leader->ids = linit->ids;
+				leader->parent = linit;
+
+				list_add_tail(&leader->sibling, &leader->parent->children);
+				init_pstree_helper(leader);
+			}
+			BUG_ON(!is_session_leader(leader));
+
+			level = get_free_pids(ns, pid);
+			if (level <= 0)
 				return -1;
 
-			pr_info("Session leader %d\n", vsid(item));
+			helper = lookup_create_item(&pid[MAX_NS_NESTING - level], level, ns->id);
+			if (helper == NULL)
+				return -1;
 
-			vsid(helper) = vsid(item);
-			vpgid(helper) = vpgid(leader);
-			helper->ids = leader->ids;
+			memcpy(helper->sid, item->sid, PID_SIZE(helper->sid->level));
+			memcpy(helper->pgid, leader->pgid, PID_SIZE(leader->pgid->level));
+			helper->ids = init->ids;
 			helper->parent = leader;
-			list_add(&helper->sibling, &leader->children);
 
-			pr_info("Attach %d to the task %d\n",
-					vpid(helper), vpid(leader));
-		} else {
-			helper = leader;
-			vsid(helper) = vsid(item);
-			vpgid(helper) = vsid(item);
-			helper->parent = root_item;
-			helper->ids = root_item->ids;
 			list_add_tail(&helper->sibling, &helpers);
-		}
-		if (init_pstree_helper(helper)) {
-			pr_err("Can't init helper\n");
-			return -1;
+			init_pstree_helper(helper);
+
+			pr_info("Add a helper %d for restoring SID %d\n", vpid(helper), vsid(helper));
 		}
 
-		pr_info("Add a helper %d for restoring SID %d\n",
-				vpid(helper), vsid(helper));
+		branch = item;
+		while (branch->parent && branch->parent != init)
+			branch = branch->parent;
+		pr_info("Attach %d to the temporary task %d\n", vpid(branch), vpid(helper));
 
-		child = list_entry(item->sibling.prev, struct pstree_item, sibling);
-		item = child;
+		if (branch->sibling.next == &init->children)
+			/* Last child of init */
+			item = NULL;
+		else
+			/* Skip the subtree that we're reparenting to helper */
+			item = list_entry(branch->sibling.next, struct pstree_item, sibling);
+
+		/* Re-reparent branch */
+		branch->parent = helper;
+		list_move(&branch->sibling, &helper->children);
+		goto skip;
+skip_descendants:
+		/* Descendants of non-leader should be fine, skip them */
+		item = pssubtree_item_next(item, init, true);
+		goto skip;
+	}
 
-		/*
-		 * Stack on helper task all children with target sid.
-		 */
-		list_for_each_entry_safe_continue(child, tmp, &root_item->children, sibling) {
-			if (!equal_pid(child->sid, helper->sid))
-				continue;
-			if (is_session_leader(child))
-				continue;
+	list_for_each_entry_safe(helper, tmp, &helpers, sibling) {
+		list_move(&helper->sibling, &helper->parent->children);
+		pr_info("Attach helper %d to the task %d\n", vpid(helper), vpid(helper->parent));
+	}
+	return 0;
+}
 
-			pr_info("Attach %d to the temporary task %d\n",
-					vpid(child), vpid(helper));
+static int prepare_pstree_ids(void)
+{
+	struct pstree_item *item, *helper;
+	pid_t current_pgid = getpgid(getpid());
 
-			child->parent = helper;
-			list_move(&child->sibling, &helper->children);
-		}
-	}
+	if (!list_empty(&top_pid_ns->children))
+		return 0;
+
+	/*
+	 * Some task can be reparented to init. A helper task should be added
+	 * for restoring sid of such tasks. The helper tasks will be exited
+	 * immediately after forking children and all children will be
+	 * reparented to init.
+	 */
+	if (walk_namespaces(&pid_ns_desc, handle_init_reparent, NULL))
+		return -1;
 
-	/* Try to connect helpers to session leaders */
 	for_each_pstree_item(item) {
 		if (!item->parent) /* skip the root task */
 			continue;
@@ -994,12 +1081,25 @@  static int prepare_pstree_ids(void)
 		if (!is_session_leader(item)) {
 			struct pstree_item *parent;
 
-			if (equal_pid(item->parent->sid, item->sid))
-				continue;
-
-			/* the task could fork a child before and after setsid() */
+			/* Lookup the leader, it could fork a child before and after setsid() */
 			parent = item->parent;
-			while (parent && !equal_pid(parent->pid, item->sid)) {
+			while (parent) {
+				/* Found leader */
+				if (equal_pid(parent->pid, item->sid))
+					break;
+
+				/* Inherited sid from parent */
+				if (equal_pid(parent->sid, item->sid)) {
+					parent = parent->parent;
+					continue;
+				}
+
+				/* Non-leader parent has different sid */
+				if (!is_session_leader(parent)) {
+					pr_err("Can't find a session leader for %d\n", vsid(item));
+					return -1;
+				}
+
 				if (parent->born_sid != -1 && parent->born_sid != vsid(item)) {
 					pr_err("Can't figure out which sid (%d or %d)"
 						"the process %d was born with\n",
@@ -1015,14 +1115,9 @@  static int prepare_pstree_ids(void)
 				pr_err("Can't find a session leader for %d\n", vsid(item));
 				return -1;
 			}
-
-			continue;
 		}
 	}
 
-	/* All other helpers are session leaders for own sessions */
-	list_splice(&helpers, &root_item->children);
-
 	/* Add a process group leader if it is absent  */
 	for_each_pstree_item(item) {
 		struct pid *pid;