[4/5] epoll: Add ability to restore migrated targets

Submitted by Cyrill Gorcunov on Dec. 7, 2018, 11:30 a.m.

Details

Message ID 20181207113043.21992-5-gorcunov@gmail.com
State New
Series "epoll: Add support of migrated targets"
Headers show

Commit Message

Cyrill Gorcunov Dec. 7, 2018, 11:30 a.m.
Some target files might belong to a different process:
say, the main epoll descriptor is inherited on fork, but the
child opens a new file and adds it to the parent's epoll. To
handle such a situation we use the saved pid of the process
which owns the target, and at the restore stage just queue this
target in the fdstore engine. Then we notify the waiters, which
fetch the target and add it to the epoll descriptor.

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
---
 criu/cr-restore.c        |   7 ++
 criu/eventpoll.c         | 238 +++++++++++++++++++++++++++++++++++++--
 criu/files.c             |   4 +
 criu/include/eventpoll.h |   3 +
 4 files changed, 245 insertions(+), 7 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 7bbe79aaed4b..af78505a2895 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -251,6 +251,9 @@  static int crtools_prepare_shared(void)
 	if (prepare_cgroup())
 		return -1;
 
+	if (eventpoll_prepare_shared())
+		return -1;
+
 	return 0;
 }
 
@@ -381,6 +384,10 @@  static int root_prepare_shared(void)
 	if (ret)
 		goto err;
 
+	ret = eventpoll_prepare_targets();
+	if (ret)
+		goto err;
+
 	show_saved_files();
 err:
 	return ret;
diff --git a/criu/eventpoll.c b/criu/eventpoll.c
index 21879165d1a4..4407d4d1ccee 100644
--- a/criu/eventpoll.c
+++ b/criu/eventpoll.c
@@ -26,6 +26,8 @@ 
 #include "kerndat.h"
 #include "file-ids.h"
 #include "kcmp-ids.h"
+#include "fdstore.h"
+#include "rst-malloc.h"
 
 #include "protobuf.h"
 #include "images/eventpoll.pb-c.h"
@@ -54,11 +56,41 @@  struct eventpoll_dinfo {
 	int				efd;
 };
 
+static LIST_HEAD(rst_epoll_list);
+
 struct eventpoll_file_info {
+	struct list_head		list;
+
 	EventpollFileEntry		*efe;
 	struct file_desc		d;
 };
 
+typedef union {
+	struct {
+		pid_t		pid;
+		unsigned int	tfd;
+	};
+	uint64_t		v;
+} epoll_target_key_t;
+
+typedef struct epoll_target_waiter {
+	void			*next;
+	pid_t			pid;
+} epoll_target_waiter_t;
+
+struct epoll_target {
+	struct rb_node		node;
+	epoll_target_key_t	key;
+	epoll_target_waiter_t	*waiters;
+	atomic_t		fdstore_ready;
+	int			fdstore_id;
+};
+
+static struct rb_root *epoll_targets_tree;
+
+static struct epoll_target *epoll_alloc_target(pid_t pid, unsigned int fd);
+static struct epoll_target *epoll_lookup_target(pid_t pid, unsigned int fd, bool allocate);
+
 /* Checks if file descriptor @lfd is eventfd */
 int is_eventpoll_link(char *link)
 {
@@ -180,11 +212,10 @@  int flush_eventpoll_dinfo_queue(void)
 			 * opened and added).
 			 */
 			if (t->pid != dinfo->pid) {
-				pr_err("kid_lookup_epoll: pid mismatch %d %d efd %d tfd %d toff %u\n",
-				       dinfo->pid, t->pid, dinfo->efd, tfde->tfd, dinfo->toff[i].off);
+				pr_debug("kid_lookup_epoll: pid mismatch %d %d efd %d tfd %d toff %u\n",
+					 dinfo->pid, t->pid, dinfo->efd, tfde->tfd, dinfo->toff[i].off);
 				tfde->has_pid = true;
 				tfde->pid = t->pid;
-				goto err;
 			}
 
 			tfde->tfd = t->idx;
@@ -453,10 +484,55 @@  static int eventpoll_open(struct file_desc *d, int *new_fd)
 	return -1;
 }
 
+int eventpoll_notify_target(pid_t pid, unsigned int tfd)
+{
+	epoll_target_waiter_t *w;
+	struct epoll_target *t;
+
+	t = epoll_lookup_target(pid, tfd, false);
+	if (!t)
+		return 0;
+
+	t->fdstore_id = fdstore_add(tfd);
+	if (t->fdstore_id < 0) {
+		pr_err("epoll_target: fdstore fails pid %d tfd %u\n",
+		       pid, tfd);
+		return -1;
+	}
+
+	pr_debug("epoll_target: pid %d tfd %u fdstore %d\n",
+		 pid, tfd, t->fdstore_id);
+
+	atomic_set(&t->fdstore_ready, 1);
+
+	for (w = t->waiters; w; w = w->next) {
+		pr_debug("epoll_target: pid %d tfd %u wake %d\n",
+			 pid, tfd, w->pid);
+		set_fds_event(w->pid);
+	}
+
+	return 0;
+}
+
 static int epoll_not_ready_tfd(EventpollTfdEntry *tdefe)
 {
 	struct fdinfo_list_entry *fle;
 
+	if (tdefe->has_pid) {
+		struct epoll_target *t;
+
+		t = epoll_lookup_target(tdefe->pid, tdefe->tfd, false);
+		if (!t) {
+			pr_err("epoll_target: No target found pid %d fd %u\n",
+			       tdefe->pid, tdefe->tfd);
+			return 0;
+		}
+
+		pr_debug("epoll_target: found pid %d fd %u fdstore %d\n",
+			 t->key.pid, t->key.tfd, t->fdstore_id);
+		return !atomic_read(&t->fdstore_ready) ? 1 : 0;
+	}
+
 	list_for_each_entry(fle, &rsti(current)->fds, ps_list) {
 		if (tdefe->tfd != fle->fe->fd)
 			continue;
@@ -477,17 +553,49 @@  static int epoll_not_ready_tfd(EventpollTfdEntry *tdefe)
 static int eventpoll_retore_tfd(int fd, int id, EventpollTfdEntry *tdefe)
 {
 	struct epoll_event event;
+	int tfd = tdefe->tfd;
+	int new_tfd = -1;
+	int ret = 0;
+
+	if (tdefe->has_pid) {
+		struct epoll_target *t;
+
+		t = epoll_lookup_target(tdefe->pid, tdefe->tfd, false);
+		if (!t) {
+			pr_err("epoll_target: Target disappeared pid %d fd %u\n",
+			       tdefe->pid, tdefe->tfd);
+			return -1;
+		}
+
+		if (!atomic_read(&t->fdstore_ready)) {
+			pr_err("epoll_target: Unexpected fdstore_ready pid %d fd %u\n",
+			       tdefe->pid, tdefe->tfd);
+			return -1;
+		}
+
+		new_tfd = fdstore_get(t->fdstore_id);
+		if (new_tfd < 0) {
+			pr_err("epoll_target: Can't fetch fdstore_id %d pid %d fd %u\n",
+			       t->fdstore_id, tdefe->pid, tdefe->tfd);
+			return -1;
+		}
+
+		pr_debug("epoll_target: fdstore_id %d pid %d fd %u -> %d\n",
+			 t->fdstore_id, tdefe->pid, tdefe->tfd, new_tfd);
+		tdefe->tfd = new_tfd;
+	}
 
 	pr_info_eventpoll_tfd("Restore ", id, tdefe);
 
 	event.events	= tdefe->events;
 	event.data.u64	= tdefe->data;
-	if (epoll_ctl(fd, EPOLL_CTL_ADD, tdefe->tfd, &event)) {
+	ret = epoll_ctl(fd, EPOLL_CTL_ADD, tdefe->tfd, &event);
+	if (ret)
 		pr_perror("Can't add event on %#08x", id);
-		return -1;
-	}
 
-	return 0;
+	tdefe->tfd = tfd;
+	close_safe(&new_tfd);
+	return ret;
 }
 
 static int eventpoll_post_open(struct file_desc *d, int fd)
@@ -514,6 +622,121 @@  static struct file_desc_ops desc_ops = {
 	.open		= eventpoll_open,
 };
 
+static struct epoll_target *epoll_alloc_target(pid_t pid, unsigned int tfd)
+{
+	struct epoll_target *t = shmalloc(sizeof(*t));
+	if (!t) {
+		pr_err("epoll_target: Can't allocate pid %d tfd %u\n", pid, tfd);
+		return NULL;
+	}
+
+	memzero_p(t);
+
+	rb_init_node(&t->node);
+	atomic_set(&t->fdstore_ready, 0);
+	t->fdstore_id = -1;
+	t->key.pid = pid;
+	t->key.tfd = tfd;
+
+	pr_debug("epoll_target: Allocated pid %d tfd %u\n", pid, tfd);
+	return t;
+}
+
+static struct epoll_target *epoll_lookup_target(pid_t pid, unsigned int tfd, bool allocate)
+{
+	struct rb_node *node = epoll_targets_tree->rb_node;
+	struct epoll_target *t = NULL;
+
+	struct rb_node **new = &epoll_targets_tree->rb_node;
+	struct rb_node *parent = NULL;
+
+	epoll_target_key_t key = {
+		.pid	= pid,
+		.tfd	= tfd,
+	};
+
+	while (node) {
+		struct epoll_target *this = rb_entry(node, struct epoll_target, node);
+
+		parent = *new;
+		if (key.v < this->key.v)
+			node = node->rb_left, new = &((*new)->rb_left);
+		else if (key.v > this->key.v)
+			node = node->rb_right, new = &((*new)->rb_right);
+		else
+			return this;
+	}
+
+	if (!allocate)
+		return NULL;
+
+	t = epoll_alloc_target(pid, tfd);
+	if (!t)
+		return NULL;
+
+	rb_link_and_balance(epoll_targets_tree, &t->node, parent, new);
+	return t;
+}
+
+int eventpoll_prepare_targets(void)
+{
+	struct eventpoll_file_info *info;
+	size_t i;
+
+	list_for_each_entry(info, &rst_epoll_list, list) {
+		for (i = 0; i < info->efe->n_tfd; i++) {
+			EventpollTfdEntry *tfde = info->efe->tfd[i];
+			epoll_target_waiter_t *waiter;
+			struct fdinfo_list_entry *fle;
+			struct epoll_target *t;
+
+			if (!tfde->has_pid)
+				continue;
+
+			fle = file_master(&info->d);
+
+			/*
+			 * Should never happen since we save pids for
+			 * foreign tasks only.
+			 */
+			if (unlikely(tfde->pid == vpid(fle->task))) {
+				pr_warn_once("epoll_target: Same pid %d\n", tfde->pid);
+				continue;
+			}
+
+			pr_debug("epoll_target: Foreign pid %d tfd %d waiter %d\n",
+				 tfde->pid, tfde->tfd, vpid(fle->task));
+
+			t = epoll_lookup_target(tfde->pid, tfde->tfd, true);
+			if (!t)
+				return -ENOMEM;
+
+			waiter = shmalloc(sizeof(*waiter));
+			if (waiter) {
+				epoll_target_waiter_t *w = t->waiters;
+				waiter->next = w, t->waiters = waiter;
+				waiter->pid = vpid(fle->task);
+			} else {
+				pr_err("epoll_target: Can't allocate waiter\n");
+				return -ENOMEM;
+			}
+		}
+	}
+
+	return 0;
+}
+
+int eventpoll_prepare_shared(void)
+{
+	epoll_targets_tree = shmalloc(sizeof(*epoll_targets_tree));
+	if (!epoll_targets_tree) {
+		pr_err("Can't allocate targets tree\n");
+		return -ENOMEM;
+	}
+	*epoll_targets_tree = RB_ROOT;
+	return 0;
+}
+
 static int collect_one_epoll_tfd(void *o, ProtobufCMessage *msg, struct cr_img *i)
 {
 	EventpollTfdEntry *tfde;
@@ -557,6 +780,7 @@  static int collect_one_epoll(void *o, ProtobufCMessage *msg, struct cr_img *i)
 	struct eventpoll_file_info *info = o;
 
 	info->efe = pb_msg(msg, EventpollFileEntry);
+	list_add_tail(&info->list, &rst_epoll_list);
 	pr_info_eventpoll("Collected ", info->efe);
 	return file_desc_add(&info->d, info->efe->id, &desc_ops);
 }
diff --git a/criu/files.c b/criu/files.c
index 0f88912a44cf..3e0c8c4592dd 100644
--- a/criu/files.c
+++ b/criu/files.c
@@ -1110,6 +1110,10 @@  int setup_and_serve_out(struct fdinfo_list_entry *fle, int new_fd)
 
 	if (serve_out_fd(pid, fle->fe->fd, d))
 		return -1;
+
+	if (eventpoll_notify_target(pid, fle->fe->fd))
+		return -1;
+
 	return 0;
 }
 
diff --git a/criu/include/eventpoll.h b/criu/include/eventpoll.h
index 411c5c93fb16..60b726d347b7 100644
--- a/criu/include/eventpoll.h
+++ b/criu/include/eventpoll.h
@@ -5,6 +5,9 @@ 
 
 extern int is_eventpoll_link(char *link);
 extern int flush_eventpoll_dinfo_queue(void);
+extern int eventpoll_prepare_shared(void);
+extern int eventpoll_prepare_targets(void);
+extern int eventpoll_notify_target(pid_t pid, unsigned int tfd);
 
 extern const struct fdtype_ops eventpoll_dump_ops;
 extern struct collect_image_info epoll_tfd_cinfo;