[18/18,v2] SCM: Dump and restore SCM_RIGHTs

Submitted by Pavel Emelianov on July 13, 2017, 11:24 a.m.

Details

Message ID 4cd92dfc-7583-eeaf-f0d6-8ca2d0011f8f@virtuozzo.com
State Accepted
Series "Support descriptors sent over unix sockets"
Commit eab62f81331bf65e25da5e68f1235a0ceea83720
Headers show

Commit Message

Pavel Emelianov July 13, 2017, 11:24 a.m.
Most of the pieces have already been described in the previous patches :)
so here's the summary.

* Dump:

When receiving a message, also receive any SCM-s (already there), and when
an SCM_RIGHTS one is met -- go ahead and just dump the received descriptors
using the regular code, but taking current as the victim task.

A few words about file path resolution -- since we dump path-ed files
by receiving them from the victim's parasite, such files sent via sockets
should still work OK, as we still receive them, just from another socket.

Several problems here:

1. Unix sockets sent via unix sockets form knots. Not supported.
2. Eventpolls sent via unix sockets might themselves poll unix sockets.
   Knots again. Not supported either.

* Restore:

On restore we need to make the unix socket wait for the soon-to-be-scm-sent
descriptors to get restored, so we need to find them and then set up a
dependency. After that, a fake fdinfo entry is attached to the respective
file descs; once sent, the respective descriptors are closed.

https://github.com/xemul/criu/issues/251

v2: Addressed comments from Kirill

* Moved prepare_scms before adding fake fles (with comment)
* Add scm-only fles as fake, thus removing close_scm_fds
* Try hard to find any suitable fle to use as the scm one when
  queuing them for the unix socket scm list; only allocate a new
  one if really needed

Signed-off-by: Pavel Emelyanov <xemul@virtuozzo.com>
---
 criu/cr-restore.c      |  23 ++++++++
 criu/include/sockets.h |   2 +
 criu/sk-queue.c        | 140 ++++++++++++++++++++++++++++++++++++++++++++-
 criu/sk-unix.c         | 152 ++++++++++++++++++++++++++++++++++++++++++++++++-
 images/sk-packet.proto |   6 ++
 5 files changed, 318 insertions(+), 5 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index e14fa06..e11d724 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -359,6 +359,29 @@  static int root_prepare_shared(void)
 	if (ret)
 		goto err;
 
+	/*
+	 * This should be called with all packets collected AND all
+	 * fdescs and fles prepared BUT post-prep-s not run.
+	 *
+	 * Also, add_fake_fds_masters() should go afterwards
+	 *
+	 * 1)It may add a master file there, and this master must be
+	 * resolved in add_fake_fds_masters(). Otherwise the task,
+	 * which is the owner of this just added master, may not have
+	 * rights to create the master (imagine, scm file is a socket 
+	 * of a net_ns, which can't be assigned by the task);
+	 * 
+	 * 2)Another case -- there was not a task, which has
+	 * permissions to create a socket, and you added it in
+	 * prepare_scms(). In this case, we mustn't add one more fle 
+	 * in add_fake_fds_masters() -- and if this function is 
+	 * called after prepare_scms(), it won't add anything. This 
+	 * will reduce number of fake files, we add.
+	 */
+	ret = prepare_scms();
+	if (ret)
+		goto err;
+
 	/* This func may add new files, so it must be called before post prepare */
 	ret = add_fake_fds_masters();
 	if (ret)
diff --git a/criu/include/sockets.h b/criu/include/sockets.h
index 3fa8017..1bd5c67 100644
--- a/criu/include/sockets.h
+++ b/criu/include/sockets.h
@@ -38,6 +38,8 @@  extern int collect_sockets(struct ns_id *);
 extern struct collect_image_info inet_sk_cinfo;
 extern struct collect_image_info unix_sk_cinfo;
 extern int fix_external_unix_sockets(void);
+extern int prepare_scms(void);
+extern int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids);
 
 extern struct collect_image_info netlink_sk_cinfo;
 
diff --git a/criu/sk-queue.c b/criu/sk-queue.c
index 77e203e..f3ebd6c 100644
--- a/criu/sk-queue.c
+++ b/criu/sk-queue.c
@@ -18,9 +18,9 @@ 
 #include "util.h"
 #include "util-pie.h"
 #include "sockets.h"
-
+#include "xmalloc.h"
 #include "sk-queue.h"
-
+#include "files.h"
 #include "protobuf.h"
 #include "images/sk-packet.pb-c.h"
 
@@ -28,6 +28,8 @@  struct sk_packet {
 	struct list_head	list;
 	SkPacketEntry		*entry;
 	char        		*data;
+	unsigned		scm_len;
+	int			*scm;
 };
 
 static LIST_HEAD(packets_list);
@@ -37,12 +39,22 @@  static int collect_one_packet(void *obj, ProtobufCMessage *msg, struct cr_img *i
 	struct sk_packet *pkt = obj;
 
 	pkt->entry = pb_msg(msg, SkPacketEntry);
-
+	pkt->scm = NULL;
 	pkt->data = xmalloc(pkt->entry->length);
 	if (pkt->data ==NULL)
 		return -1;
 
 	/*
+	 * See dump_packet_cmsg() -- only SCM_RIGHTS are supported and
+	 * only 1 of that kind is possible, thus not more than 1 SCMs
+	 * on a packet.
+	 */
+	if (pkt->entry->n_scm > 1) {
+		pr_err("More than 1 SCM is not possible\n");
+		return -1;
+	}
+
+	/*
 	 * NOTE: packet must be added to the tail. Otherwise sequence
 	 * will be broken.
 	 */
@@ -64,6 +76,50 @@  struct collect_image_info sk_queues_cinfo = {
 	.collect = collect_one_packet,
 };
 
+static int dump_scm_rights(struct cmsghdr *ch, SkPacketEntry *pe)
+{
+	int nr_fds, *fds, i;
+	void *buf;
+	ScmEntry *scme;
+
+	nr_fds = (ch->cmsg_len - sizeof(*ch)) / sizeof(int);
+	fds = (int *)CMSG_DATA(ch);
+
+	buf = xmalloc(sizeof(ScmEntry) + nr_fds * sizeof(uint32_t));
+	if (!buf)
+		return -1;
+
+	scme = xptr_pull(&buf, ScmEntry);
+	scm_entry__init(scme);
+	scme->type = SCM_RIGHTS;
+	scme->n_rights = nr_fds;
+	scme->rights = xptr_pull_s(&buf, nr_fds * sizeof(uint32_t));
+
+	for (i = 0; i < nr_fds; i++) {
+		int ftyp;
+
+		if (dump_my_file(fds[i], &scme->rights[i], &ftyp))
+			return -1;
+
+		/*
+		 * Unix sent over Unix or Epoll with some other sh*t
+		 * sent over unix (maybe with this very unix polled)
+		 * are tricky and not supported for now. (XXX -- todo)
+		 */
+		if (ftyp == FD_TYPES__UNIXSK || ftyp == FD_TYPES__EVENTPOLL) {
+			pr_err("Can't dump send %d (unix/epoll) fd\n", ftyp);
+			return -1;
+		}
+	}
+
+	i = pe->n_scm++;
+	if (xrealloc_safe(&pe->scm, pe->n_scm * sizeof(ScmEntry*)))
+		return -1;
+
+	pe->scm[i] = scme;
+	return 0;
+}
+
 /*
  * Maximum size of the control messages. XXX -- is there any
  * way to get this value out of the kernel?
@@ -73,8 +129,26 @@  struct collect_image_info sk_queues_cinfo = {
 static int dump_packet_cmsg(struct msghdr *mh, SkPacketEntry *pe)
 {
 	struct cmsghdr *ch;
+	int n_rights = 0;
 
 	for (ch = CMSG_FIRSTHDR(mh); ch; ch = CMSG_NXTHDR(mh, ch)) {
+		if (ch->cmsg_type == SCM_RIGHTS) {
+			if (n_rights) {
+				/*
+				 * Even if user is sending more than one cmsg with
+				 * rights, kernel merges them all together on recv.
+				 */
+				pr_err("Unexpected 2nd SCM_RIGHTS from the kernel\n");
+				return -1;
+			}
+
+			if (dump_scm_rights(ch, pe))
+				return -1;
+
+			n_rights++;
+			continue;
+		}
+
 		pr_err("Control messages in queue, not supported\n");
 		return -1;
 	}
@@ -82,6 +156,18 @@  static int dump_packet_cmsg(struct msghdr *mh, SkPacketEntry *pe)
 	return 0;
 }
 
+static void release_cmsg(SkPacketEntry *pe)
+{
+	int i;
+
+	for (i = 0; i < pe->n_scm; i++)
+		xfree(pe->scm[i]);
+	xfree(pe->scm);
+
+	pe->n_scm = 0;
+	pe->scm = NULL;
+}
+
 int dump_sk_queue(int sock_fd, int sock_id)
 {
 	SkPacketEntry pe = SK_PACKET_ENTRY__INIT;
@@ -181,6 +267,9 @@  int dump_sk_queue(int sock_fd, int sock_id)
 			ret = -EIO;
 			goto err_set_sock;
 		}
+
+		if (pe.scm)
+			release_cmsg(&pe);
 	}
 	ret = 0;
 
@@ -209,6 +298,11 @@  static int send_one_pkt(int fd, struct sk_packet *pkt)
 	iov.iov_base = pkt->data;
 	iov.iov_len = entry->length;
 
+	if (pkt->scm != NULL) {
+		mh.msg_controllen = pkt->scm_len;
+		mh.msg_control = pkt->scm;
+	}
+
 	/*
 	 * Don't try to use sendfile here, because it use sendpage() and
 	 * all data are split on pages and a new skb is allocated for
@@ -264,3 +358,43 @@  int restore_sk_queue(int fd, unsigned int peer_id)
 out:
 	return ret;
 }
+
+int prepare_scms(void)
+{
+	struct sk_packet *pkt;
+
+	pr_info("Preparing SCMs\n");
+	list_for_each_entry(pkt, &packets_list, list) {
+		SkPacketEntry *pe = pkt->entry;
+		ScmEntry *se;
+		struct cmsghdr *ch;
+
+		if (!pe->n_scm)
+			continue;
+
+		se = pe->scm[0]; /* Only 1 SCM is possible */
+
+		if (se->type == SCM_RIGHTS) {
+			pkt->scm_len = CMSG_SPACE(se->n_rights * sizeof(int));
+			pkt->scm = xmalloc(pkt->scm_len);
+			if (!pkt->scm)
+				return -1;
+
+			ch = (struct cmsghdr *)pkt->scm; /* FIXME -- via msghdr */
+			ch->cmsg_level = SOL_SOCKET;
+			ch->cmsg_type = SCM_RIGHTS;
+			ch->cmsg_len = CMSG_LEN(se->n_rights * sizeof(int));
+
+			if (unix_note_scm_rights(pe->id_for, se->rights,
+						(int *)CMSG_DATA(ch), se->n_rights))
+				return -1;
+
+			continue;
+		}
+
+		pr_err("Unsupported scm %d in image\n", se->type);
+		return -1;
+	}
+
+	return 0;
+}
diff --git a/criu/sk-unix.c b/criu/sk-unix.c
index 42ce1bb..3963a4a 100644
--- a/criu/sk-unix.c
+++ b/criu/sk-unix.c
@@ -798,6 +798,7 @@  struct unix_sk_info {
 	struct file_desc d;
 	struct list_head connected; /* List of sockets, connected to me */
 	struct list_head node; /* To link in peer's connected list  */
+	struct list_head scm_fles;
 
 	/*
 	 * For DGRAM sockets with queues, we should only restore the queue
@@ -809,6 +810,11 @@  struct unix_sk_info {
 	u8 listen:1;
 };
 
+struct scm_fle {
+	struct list_head l;
+	struct fdinfo_list_entry *fle;
+};
+
 #define USK_PAIR_MASTER		0x1
 #define USK_PAIR_SLAVE		0x2
 
@@ -824,6 +830,141 @@  static struct unix_sk_info *find_unix_sk_by_ino(int ino)
 	return NULL;
 }
 
+static struct unix_sk_info *find_queuer_for(int id)
+{
+	struct unix_sk_info *ui;
+
+	list_for_each_entry(ui, &unix_sockets, list) {
+		if (ui->queuer == id)
+			return ui;
+	}
+
+	return NULL;
+}
+
+static struct fdinfo_list_entry *get_fle_for_scm(struct file_desc *tgt,
+		struct pstree_item *owner)
+{
+	struct fdinfo_list_entry *fle;
+	FdinfoEntry *e = NULL;
+	int fd;
+
+	list_for_each_entry(fle, &tgt->fd_info_head, desc_list) {
+		if (fle->task == owner)
+			/*
+			 * Owner already has this file in its fdtable.
+			 * Just use one.
+			 */
+			return fle;
+
+		e = fle->fe; /* keep any for further reference */
+	}
+
+	/*
+	 * Some other task restores this file. Pretend that
+	 * we're another user of it.
+	 */
+	fd = find_unused_fd(owner, -1);
+	pr_info("`- will add SCM-only %d fd\n", fd);
+
+	if (e != NULL) {
+		e = dup_fdinfo(e, fd, 0);
+		if (!e) {
+			pr_err("Can't duplicate fdinfo for scm\n");
+			return NULL;
+		}
+	} else {
+		/*
+		 * This can happen if the file in question is
+		 * sent over the socket and closed. In this case
+		 * we need to ... invent a new one!
+		 */
+
+		e = xmalloc(sizeof(*e));
+		if (!e)
+			return NULL;
+
+		fdinfo_entry__init(e);
+		e->id = tgt->id;
+		e->type = tgt->ops->type;
+		e->fd = fd;
+		e->flags = 0;
+	}
+
+	/*
+	 * Make this fle fake, so that files collecting engine
+	 * closes them at the end.
+	 */
+	return collect_fd_to(vpid(owner), e, rsti(owner), tgt, true);
+}
+
+int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids)
+{
+	struct unix_sk_info *ui;
+	struct pstree_item *owner;
+	int i;
+
+	ui = find_queuer_for(id_for);
+	if (!ui) {
+		pr_err("Can't find sender for %d\n", id_for);
+		return -1;
+	}
+
+	pr_info("Found queuer for %d -> %d\n", id_for, ui->ue->id);
+	/*
+	 * This is the task that will restore this socket
+	 */
+	owner = file_master(&ui->d)->task;
+
+	pr_info("-> will set up deps\n");
+	/*
+	 * The ui will send data to the rights receiver. Add a fake fle
+	 * for the file and a dependency.
+	 */
+	for (i = 0; i < n_ids; i++) {
+		struct file_desc *tgt;
+		struct scm_fle *sfle;
+
+		tgt = find_file_desc_raw(FD_TYPES__UND, file_ids[i]);
+		if (!tgt) {
+			pr_err("Can't find fdesc to send\n");
+			return -1;
+		}
+
+		pr_info("scm: add file %d -> %d\n", tgt->id, vpid(owner));
+		sfle = xmalloc(sizeof(*sfle));
+		if (!sfle)
+			return -1;
+
+		sfle->fle = get_fle_for_scm(tgt, owner);
+		if (!sfle->fle) {
+			pr_err("Can't request new fle for scm\n");
+			return -1;
+		}
+
+		list_add_tail(&sfle->l, &ui->scm_fles);
+		fds[i] = sfle->fle->fe->fd;
+	}
+
+	return 0;
+}
+
+static int chk_restored_scms(struct unix_sk_info *ui)
+{
+	struct scm_fle *sf, *n;
+
+	list_for_each_entry_safe(sf, n, &ui->scm_fles, l) {
+		if (sf->fle->stage < FLE_OPEN)
+			return 1;
+
+		/* Optimization for the next pass */
+		list_del(&sf->l);
+		xfree(sf);
+	}
+
+	return 0;
+}
+
 static int wake_connected_sockets(struct unix_sk_info *ui)
 {
 	struct fdinfo_list_entry *fle;
@@ -1322,12 +1463,18 @@  static int open_unix_sk(struct file_desc *d, int *new_fd)
 	struct unix_sk_info *ui;
 	int ret;
 
+	ui = container_of(d, struct unix_sk_info, d);
+
+	/* FIXME -- only queue restore may be postponed */
+	if (chk_restored_scms(ui)) {
+		pr_info("scm: Wait for tgt to restore\n");
+		return 1;
+	}
+
 	fle = file_master(d);
 	if (fle->stage >= FLE_OPEN)
 		return post_open_unix_sk(d, fle->fe->fd);
 
-	ui = container_of(d, struct unix_sk_info, d);
-
 	if (inherited_fd(d, new_fd)) {
 		ui->ue->uflags |= USK_INHERIT;
 		ret = *new_fd >= 0 ? 0 : -1;
@@ -1440,6 +1587,7 @@  static int collect_one_unixsk(void *o, ProtobufCMessage *base, struct cr_img *i)
 	ui->listen = 0;
 	INIT_LIST_HEAD(&ui->connected);
 	INIT_LIST_HEAD(&ui->node);
+	INIT_LIST_HEAD(&ui->scm_fles);
 	ui->flags = 0;
 	fixup_sock_net_ns_id(&ui->ue->ns_id, &ui->ue->has_ns_id);
 
diff --git a/images/sk-packet.proto b/images/sk-packet.proto
index 27b48e4..009b461 100644
--- a/images/sk-packet.proto
+++ b/images/sk-packet.proto
@@ -1,8 +1,14 @@ 
 syntax = "proto2";
 
+message scm_entry {
+	required uint32			type		= 1;
+	repeated uint32			rights		= 2;
+}
+
 message sk_packet_entry {
 	required uint32		id_for		= 1;
 	required uint32		length		= 2;
 	// optional bytes		addr	= 3;
 	// optional sk_ucred_entry	ucred	= 128;
+	repeated scm_entry	scm		= 4;
 }

Comments

Kirill Tkhai Aug. 9, 2017, 6:17 p.m.
On 13.07.2017 14:24, Pavel Emelyanov wrote:
> Most of the pieces has already been described in the previous patches :)
> so here's the summary.
> 
> * Dump:
> 
> When receiving a message, also receive any SCM-s (already there) and when
> SCM_RIGHTs one is met -- go ahead and just dump received descriptors using
> regular code, but taking current as the victim task.
> 
> Few words about file paths resolution -- since we do dump path-ed files
> by receiving them from victim's parasite, such files sent via sockets
> should still work OK, as we still receive them, just from another socket.
> 
> Several problems here:
> 
> 1. Unix sockets sent via unix sockets form knots. Not supported.
> 2. Eventpolls sent via unix might themseves poll unix sockets. Knots
>    again. Not supported either.
> 
> * Restore:
> 
> On restore we need to make unix socket wait for the soon-to-be-scm-sent
> descriptors to get restored, so we need to find them, then put a dependency.
> After that, the fake fdinfo entry is attached to the respective file
> descs, when sent the respective descriptors are closed.
> 
> https://github.com/xemul/criu/issues/251
> 
> v2: Addressed comments from Kirill
> 
> * Moved prepare_scms before adding fake fles (with comment)
> * Add scm-only fles as fake, thus removing close_scm_fds
> * Try hard finding any suitable fle to use as scm one when 
>   queuing them for unix socket scm list, only allocate a new
>   one if really needed
> 
> Signed-off-by: Pavel Emelyanov <xemul@virtuozzo.com>

Skipped this one. I thought, it's v2 as the v2 series.

Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>

> ---
>  criu/cr-restore.c      |  23 ++++++++
>  criu/include/sockets.h |   2 +
>  criu/sk-queue.c        | 140 ++++++++++++++++++++++++++++++++++++++++++++-
>  criu/sk-unix.c         | 152 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  images/sk-packet.proto |   6 ++
>  5 files changed, 318 insertions(+), 5 deletions(-)
> 
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index e14fa06..e11d724 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -359,6 +359,29 @@ static int root_prepare_shared(void)
>  	if (ret)
>  		goto err;
>  
> +	/*
> +	 * This should be called with all packets collected AND all
> +	 * fdescs and fles prepared BUT post-prep-s not run.
> +	 *
> +	 * Also, add_fake_fds_masters() should go afterwards
> +	 *
> +	 * 1)It may add a master file there, and this master must be
> +	 * resolved in add_fake_fds_masters(). Otherwise the task,
> +	 * which is the owner of this just added master, may not have
> +	 * rights to create the master (imagine, scm file is a socket 
> +	 * of a net_ns, which can't be assigned by the task);
> +	 * 
> +	 * 2)Another case -- there was not a task, which has
> +	 * permittions to create a socket, and you added it in
> +	 * prepare_scms(). In this case, we mustn't add one more fle 
> +	 * in add_fake_fds_masters() -- and if this function is 
> +	 * called after prepare_scms(), it won't add anything. This 
> +	 * will reduce number of fake files, we add.
> +	 */
> +	ret = prepare_scms();
> +	if (ret)
> +		goto err;
> +
>  	/* This func may add new files, so it must be called before post prepare */
>  	ret = add_fake_fds_masters();
>  	if (ret)
> diff --git a/criu/include/sockets.h b/criu/include/sockets.h
> index 3fa8017..1bd5c67 100644
> --- a/criu/include/sockets.h
> +++ b/criu/include/sockets.h
> @@ -38,6 +38,8 @@ extern int collect_sockets(struct ns_id *);
>  extern struct collect_image_info inet_sk_cinfo;
>  extern struct collect_image_info unix_sk_cinfo;
>  extern int fix_external_unix_sockets(void);
> +extern int prepare_scms(void);
> +extern int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids);
>  
>  extern struct collect_image_info netlink_sk_cinfo;
>  
> diff --git a/criu/sk-queue.c b/criu/sk-queue.c
> index 77e203e..f3ebd6c 100644
> --- a/criu/sk-queue.c
> +++ b/criu/sk-queue.c
> @@ -18,9 +18,9 @@
>  #include "util.h"
>  #include "util-pie.h"
>  #include "sockets.h"
> -
> +#include "xmalloc.h"
>  #include "sk-queue.h"
> -
> +#include "files.h"
>  #include "protobuf.h"
>  #include "images/sk-packet.pb-c.h"
>  
> @@ -28,6 +28,8 @@ struct sk_packet {
>  	struct list_head	list;
>  	SkPacketEntry		*entry;
>  	char        		*data;
> +	unsigned		scm_len;
> +	int			*scm;
>  };
>  
>  static LIST_HEAD(packets_list);
> @@ -37,12 +39,22 @@ static int collect_one_packet(void *obj, ProtobufCMessage *msg, struct cr_img *i
>  	struct sk_packet *pkt = obj;
>  
>  	pkt->entry = pb_msg(msg, SkPacketEntry);
> -
> +	pkt->scm = NULL;
>  	pkt->data = xmalloc(pkt->entry->length);
>  	if (pkt->data ==NULL)
>  		return -1;
>  
>  	/*
> +	 * See dump_packet_cmsg() -- only SCM_RIGHTS are supported and
> +	 * only 1 of that kind is possible, thus not more than 1 SCMs
> +	 * on a packet.
> +	 */
> +	if (pkt->entry->n_scm > 1) {
> +		pr_err("More than 1 SCM is not possible\n");
> +		return -1;
> +	}
> +
> +	/*
>  	 * NOTE: packet must be added to the tail. Otherwise sequence
>  	 * will be broken.
>  	 */
> @@ -64,6 +76,50 @@ struct collect_image_info sk_queues_cinfo = {
>  	.collect = collect_one_packet,
>  };
>  
> +static int dump_scm_rights(struct cmsghdr *ch, SkPacketEntry *pe)
> +{
> +	int nr_fds, *fds, i;
> +	void *buf;
> +	ScmEntry *scme;
> +
> +	nr_fds = (ch->cmsg_len - sizeof(*ch)) / sizeof(int);
> +	fds = (int *)CMSG_DATA(ch);
> +
> +	buf = xmalloc(sizeof(ScmEntry) + nr_fds * sizeof(uint32_t));
> +	if (!buf)
> +		return -1;
> +
> +	scme = xptr_pull(&buf, ScmEntry);
> +	scm_entry__init(scme);
> +	scme->type = SCM_RIGHTS;
> +	scme->n_rights = nr_fds;
> +	scme->rights = xptr_pull_s(&buf, nr_fds * sizeof(uint32_t));
> +
> +	for (i = 0; i < nr_fds; i++) {
> +		int ftyp;
> +
> +		if (dump_my_file(fds[i], &scme->rights[i], &ftyp))
> +			return -1;
> +
> +		/*
> +		 * Unix sent over Unix or Epoll with some other sh*t
> +		 * sent over unix (maybe with this very unix polled)
> +		 * are tricky and not supported for now. (XXX -- todo)
> +		 */
> +		if (ftyp == FD_TYPES__UNIXSK || ftyp == FD_TYPES__EVENTPOLL) {
> +			pr_err("Can't dump send %d (unix/epoll) fd\n", ftyp);
> +			return -1;
> +		}
> +	}
> +
> +	i = pe->n_scm++;
> +	if (xrealloc_safe(&pe->scm, pe->n_scm * sizeof(ScmEntry*)))
> +		return -1;
> +
> +	pe->scm[i] = scme;
> +	return 0;
> +}
> +
>  /*
>   * Maximum size of the control messages. XXX -- is there any
>   * way to get this value out of the kernel?
> @@ -73,8 +129,26 @@ struct collect_image_info sk_queues_cinfo = {
>  static int dump_packet_cmsg(struct msghdr *mh, SkPacketEntry *pe)
>  {
>  	struct cmsghdr *ch;
> +	int n_rights = 0;
>  
>  	for (ch = CMSG_FIRSTHDR(mh); ch; ch = CMSG_NXTHDR(mh, ch)) {
> +		if (ch->cmsg_type == SCM_RIGHTS) {
> +			if (n_rights) {
> +				/*
> +				 * Even if user is sending more than one cmsg with
> +				 * rights, kernel merges them alltogether on recv.
> +				 */
> +				pr_err("Unexpected 2nd SCM_RIGHTS from the kernel\n");
> +				return -1;
> +			}
> +
> +			if (dump_scm_rights(ch, pe))
> +				return -1;
> +
> +			n_rights++;
> +			continue;
> +		}
> +
>  		pr_err("Control messages in queue, not supported\n");
>  		return -1;
>  	}
> @@ -82,6 +156,18 @@ static int dump_packet_cmsg(struct msghdr *mh, SkPacketEntry *pe)
>  	return 0;
>  }
>  
> +static void release_cmsg(SkPacketEntry *pe)
> +{
> +	int i;
> +
> +	for (i = 0; i < pe->n_scm; i++)
> +		xfree(pe->scm[i]);
> +	xfree(pe->scm);
> +
> +	pe->n_scm = 0;
> +	pe->scm = NULL;
> +}
> +
>  int dump_sk_queue(int sock_fd, int sock_id)
>  {
>  	SkPacketEntry pe = SK_PACKET_ENTRY__INIT;
> @@ -181,6 +267,9 @@ int dump_sk_queue(int sock_fd, int sock_id)
>  			ret = -EIO;
>  			goto err_set_sock;
>  		}
> +
> +		if (pe.scm)
> +			release_cmsg(&pe);
>  	}
>  	ret = 0;
>  
> @@ -209,6 +298,11 @@ static int send_one_pkt(int fd, struct sk_packet *pkt)
>  	iov.iov_base = pkt->data;
>  	iov.iov_len = entry->length;
>  
> +	if (pkt->scm != NULL) {
> +		mh.msg_controllen = pkt->scm_len;
> +		mh.msg_control = pkt->scm;
> +	}
> +
>  	/*
>  	 * Don't try to use sendfile here, because it use sendpage() and
>  	 * all data are split on pages and a new skb is allocated for
> @@ -264,3 +358,43 @@ int restore_sk_queue(int fd, unsigned int peer_id)
>  out:
>  	return ret;
>  }
> +
> +int prepare_scms(void)
> +{
> +	struct sk_packet *pkt;
> +
> +	pr_info("Preparing SCMs\n");
> +	list_for_each_entry(pkt, &packets_list, list) {
> +		SkPacketEntry *pe = pkt->entry;
> +		ScmEntry *se;
> +		struct cmsghdr *ch;
> +
> +		if (!pe->n_scm)
> +			continue;
> +
> +		se = pe->scm[0]; /* Only 1 SCM is possible */
> +
> +		if (se->type == SCM_RIGHTS) {
> +			pkt->scm_len = CMSG_SPACE(se->n_rights * sizeof(int));
> +			pkt->scm = xmalloc(pkt->scm_len);
> +			if (!pkt->scm)
> +				return -1;
> +
> +			ch = (struct cmsghdr *)pkt->scm; /* FIXME -- via msghdr */
> +			ch->cmsg_level = SOL_SOCKET;
> +			ch->cmsg_type = SCM_RIGHTS;
> +			ch->cmsg_len = CMSG_LEN(se->n_rights * sizeof(int));
> +
> +			if (unix_note_scm_rights(pe->id_for, se->rights,
> +						(int *)CMSG_DATA(ch), se->n_rights))
> +				return -1;
> +
> +			continue;
> +		}
> +
> +		pr_err("Unsupported scm %d in image\n", se->type);
> +		return -1;
> +	}
> +
> +	return 0;
> +}
> diff --git a/criu/sk-unix.c b/criu/sk-unix.c
> index 42ce1bb..3963a4a 100644
> --- a/criu/sk-unix.c
> +++ b/criu/sk-unix.c
> @@ -798,6 +798,7 @@ struct unix_sk_info {
>  	struct file_desc d;
>  	struct list_head connected; /* List of sockets, connected to me */
>  	struct list_head node; /* To link in peer's connected list  */
> +	struct list_head scm_fles;
>  
>  	/*
>  	 * For DGRAM sockets with queues, we should only restore the queue
> @@ -809,6 +810,11 @@ struct unix_sk_info {
>  	u8 listen:1;
>  };
>  
> +struct scm_fle {
> +	struct list_head l;
> +	struct fdinfo_list_entry *fle;
> +};
> +
>  #define USK_PAIR_MASTER		0x1
>  #define USK_PAIR_SLAVE		0x2
>  
> @@ -824,6 +830,141 @@ static struct unix_sk_info *find_unix_sk_by_ino(int ino)
>  	return NULL;
>  }
>  
> +static struct unix_sk_info *find_queuer_for(int id)
> +{
> +	struct unix_sk_info *ui;
> +
> +	list_for_each_entry(ui, &unix_sockets, list) {
> +		if (ui->queuer == id)
> +			return ui;
> +	}
> +
> +	return NULL;
> +}
> +
> +static struct fdinfo_list_entry *get_fle_for_scm(struct file_desc *tgt,
> +		struct pstree_item *owner)
> +{
> +	struct fdinfo_list_entry *fle;
> +	FdinfoEntry *e = NULL;
> +	int fd;
> +
> +	list_for_each_entry(fle, &tgt->fd_info_head, desc_list) {
> +		if (fle->task == owner)
> +			/*
> +			 * Owner already has this file in its fdtable.
> +			 * Just use one.
> +			 */
> +			return fle;
> +
> +		e = fle->fe; /* keep any for further reference */
> +	}
> +
> +	/*
> +	 * Some other task restores this file. Pretend that
> +	 * we're another user of it.
> +	 */
> +	fd = find_unused_fd(owner, -1);
> +	pr_info("`- will add SCM-only %d fd\n", fd);
> +
> +	if (e != NULL) {
> +		e = dup_fdinfo(e, fd, 0);
> +		if (!e) {
> +			pr_err("Can't duplicate fdinfo for scm\n");
> +			return NULL;
> +		}
> +	} else {
> +		/*
> +		 * This can happen if the file in question is
> +		 * sent over the socket and closed. In this case
> +		 * we need to ... invent a new one!
> +		 */
> +
> +		e = xmalloc(sizeof(*e));
> +		if (!e)
> +			return NULL;
> +
> +		fdinfo_entry__init(e);
> +		e->id = tgt->id;
> +		e->type = tgt->ops->type;
> +		e->fd = fd;
> +		e->flags = 0;
> +	}
> +
> +	/*
> +	 * Make this fle fake, so that files collecting engine
> +	 * closes them at the end.
> +	 */
> +	return collect_fd_to(vpid(owner), e, rsti(owner), tgt, true);
> +}
> +
> +int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids)
> +{
> +	struct unix_sk_info *ui;
> +	struct pstree_item *owner;
> +	int i;
> +
> +	ui = find_queuer_for(id_for);
> +	if (!ui) {
> +		pr_err("Can't find sender for %d\n", id_for);
> +		return -1;
> +	}
> +
> +	pr_info("Found queuer for %d -> %d\n", id_for, ui->ue->id);
> +	/*
> +	 * This is the task that will restore this socket
> +	 */
> +	owner = file_master(&ui->d)->task;
> +
> +	pr_info("-> will set up deps\n");
> +	/*
> +	 * The ui will send data to the rights receiver. Add a fake fle
> +	 * for the file and a dependency.
> +	 */
> +	for (i = 0; i < n_ids; i++) {
> +		struct file_desc *tgt;
> +		struct scm_fle *sfle;
> +
> +		tgt = find_file_desc_raw(FD_TYPES__UND, file_ids[i]);
> +		if (!tgt) {
> +			pr_err("Can't find fdesc to send\n");
> +			return -1;
> +		}
> +
> +		pr_info("scm: add file %d -> %d\n", tgt->id, vpid(owner));
> +		sfle = xmalloc(sizeof(*sfle));
> +		if (!sfle)
> +			return -1;
> +
> +		sfle->fle = get_fle_for_scm(tgt, owner);
> +		if (!sfle->fle) {
> +			pr_err("Can't request new fle for scm\n");
> +			return -1;
> +		}
> +
> +		list_add_tail(&sfle->l, &ui->scm_fles);
> +		fds[i] = sfle->fle->fe->fd;
> +	}
> +
> +	return 0;
> +}
> +
> +static int chk_restored_scms(struct unix_sk_info *ui)
> +{
> +	struct scm_fle *sf, *n;
> +
> +	list_for_each_entry_safe(sf, n, &ui->scm_fles, l) {
> +		if (sf->fle->stage < FLE_OPEN)
> +			return 1;
> +
> +		/* Optimization for the next pass */
> +		list_del(&sf->l);
> +		xfree(sf);
> +	}
> +
> +	return 0;
> +}
> +
>  static int wake_connected_sockets(struct unix_sk_info *ui)
>  {
>  	struct fdinfo_list_entry *fle;
> @@ -1322,12 +1463,18 @@ static int open_unix_sk(struct file_desc *d, int *new_fd)
>  	struct unix_sk_info *ui;
>  	int ret;
>  
> +	ui = container_of(d, struct unix_sk_info, d);
> +
> +	/* FIXME -- only queue restore may be postponed */
> +	if (chk_restored_scms(ui)) {
> +		pr_info("scm: Wait for tgt to restore\n");
> +		return 1;
> +	}
> +
>  	fle = file_master(d);
>  	if (fle->stage >= FLE_OPEN)
>  		return post_open_unix_sk(d, fle->fe->fd);
>  
> -	ui = container_of(d, struct unix_sk_info, d);
> -
>  	if (inherited_fd(d, new_fd)) {
>  		ui->ue->uflags |= USK_INHERIT;
>  		ret = *new_fd >= 0 ? 0 : -1;
> @@ -1440,6 +1587,7 @@ static int collect_one_unixsk(void *o, ProtobufCMessage *base, struct cr_img *i)
>  	ui->listen = 0;
>  	INIT_LIST_HEAD(&ui->connected);
>  	INIT_LIST_HEAD(&ui->node);
> +	INIT_LIST_HEAD(&ui->scm_fles);
>  	ui->flags = 0;
>  	fixup_sock_net_ns_id(&ui->ue->ns_id, &ui->ue->has_ns_id);
>  
> diff --git a/images/sk-packet.proto b/images/sk-packet.proto
> index 27b48e4..009b461 100644
> --- a/images/sk-packet.proto
> +++ b/images/sk-packet.proto
> @@ -1,8 +1,14 @@
>  syntax = "proto2";
>  
> +message scm_entry {
> +	required uint32			type		= 1;
> +	repeated uint32			rights		= 2;
> +}
> +
>  message sk_packet_entry {
>  	required uint32		id_for		= 1;
>  	required uint32		length		= 2;
>  	// optional bytes		addr	= 3;
>  	// optional sk_ucred_entry	ucred	= 128;
> +	repeated scm_entry	scm		= 4;
>  }
>