[18/18] SCM: Dump and restore SCM_RIGHTs

Submitted by Pavel Emelyanov on July 10, 2017, 9:42 a.m.

Details

Message ID 28a366b7-4d90-583e-4c23-d0f563187c4d@virtuozzo.com
State New
Series "Support descriptors sent over unix sockets"

Commit Message

Pavel Emelyanov July 10, 2017, 9:42 a.m.
Most of the pieces have already been described in the previous patches :)
so here's the summary.

* Dump:

When receiving a message, also receive any SCMs (already there), and when
an SCM_RIGHTS one is met -- go ahead and dump the received descriptors
using the regular code, but taking current as the victim task.
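
For reference, here is a minimal sketch (not part of the patch) of what
picking SCM_RIGHTS off a peeked unix queue looks like at the recvmsg()
level; `sk` stands for the socket being dumped and the buffer sizes are
arbitrary:

#include <stdio.h>
#include <sys/socket.h>
#include <sys/uio.h>

static int peek_scm_rights(int sk)
{
	char data[4096], ctl[1024];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr mh = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= ctl,
		.msg_controllen	= sizeof(ctl),
	};
	struct cmsghdr *ch;

	/* Peek one message together with its control data */
	if (recvmsg(sk, &mh, MSG_PEEK) < 0)
		return -1;

	for (ch = CMSG_FIRSTHDR(&mh); ch; ch = CMSG_NXTHDR(&mh, ch)) {
		if (ch->cmsg_level == SOL_SOCKET && ch->cmsg_type == SCM_RIGHTS) {
			int *fds = (int *)CMSG_DATA(ch);
			int i, n = (ch->cmsg_len - CMSG_LEN(0)) / sizeof(int);

			/* The patch dumps each such fd, with current as the victim */
			for (i = 0; i < n; i++)
				printf("received fd %d\n", fds[i]);
		}
	}

	return 0;
}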

A few words about file path resolution -- since we dump path-ed files
by receiving them from the victim's parasite, such files sent via sockets
should still work OK, as we still receive them, just from another socket.

Several problems here:

1. Unix sockets sent via unix sockets form knots. Not supported.
2. Eventpolls sent via unix might themselves poll unix sockets. Knots
   again. Not supported either.

* Restore:

On restore we need to make the unix socket wait for the soon-to-be-scm-sent
descriptors to get restored, so we need to find them and add a dependency.
After that, a fake fdinfo entry is attached to the respective file descs;
once the packet is sent, the respective descriptors are closed.
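
Again just a sketch, mirroring what send_one_pkt() below ends up doing:
rebuild the SCM_RIGHTS control message from the restored descriptors, push
the packet, then close the temporary fds. `peer_fd`, `data`, `len`, `fds`
and `n_fds` are stand-ins, and n_fds <= 16 is assumed for brevity:

#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/uio.h>

static int resend_with_rights(int peer_fd, void *data, size_t len,
			      int *fds, int n_fds)
{
	char ctl[CMSG_SPACE(16 * sizeof(int))];
	struct iovec iov = { .iov_base = data, .iov_len = len };
	struct msghdr mh = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= ctl,
		.msg_controllen	= CMSG_SPACE(n_fds * sizeof(int)),
	};
	struct cmsghdr *ch = CMSG_FIRSTHDR(&mh);
	int i;

	ch->cmsg_level = SOL_SOCKET;
	ch->cmsg_type = SCM_RIGHTS;
	ch->cmsg_len = CMSG_LEN(n_fds * sizeof(int));
	memcpy(CMSG_DATA(ch), fds, n_fds * sizeof(int));

	if (sendmsg(peer_fd, &mh, 0) != (ssize_t)len)
		return -1;

	/* Once queued, the temporary descriptors are not needed anymore */
	for (i = 0; i < n_fds; i++)
		close(fds[i]);

	return 0;
}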

https://github.com/xemul/criu/issues/251

Signed-off-by: Pavel Emelyanov <xemul@virtuozzo.com>
---
 criu/cr-restore.c      |   8 +++
 criu/include/sockets.h |   2 +
 criu/sk-queue.c        | 157 ++++++++++++++++++++++++++++++++++++++++++++++++-
 criu/sk-unix.c         | 127 ++++++++++++++++++++++++++++++++++++++-
 images/sk-packet.proto |   6 ++
 5 files changed, 295 insertions(+), 5 deletions(-)


diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index e14fa06..b9ef49c 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -364,6 +364,14 @@  static int root_prepare_shared(void)
 	if (ret)
 		goto err;
 
+	/*
+	 * This should be called with all packets collected AND all
+	 * fdescs and fles prepared BUT post-prep-s not run.
+	 */
+	ret = prepare_scms();
+	if (ret)
+		goto err;
+
 	ret = run_post_prepare();
 	if (ret)
 		goto err;
diff --git a/criu/include/sockets.h b/criu/include/sockets.h
index 3fa8017..1bd5c67 100644
--- a/criu/include/sockets.h
+++ b/criu/include/sockets.h
@@ -38,6 +38,8 @@  extern int collect_sockets(struct ns_id *);
 extern struct collect_image_info inet_sk_cinfo;
 extern struct collect_image_info unix_sk_cinfo;
 extern int fix_external_unix_sockets(void);
+extern int prepare_scms(void);
+extern int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids);
 
 extern struct collect_image_info netlink_sk_cinfo;
 
diff --git a/criu/sk-queue.c b/criu/sk-queue.c
index 77e203e..953db66 100644
--- a/criu/sk-queue.c
+++ b/criu/sk-queue.c
@@ -18,9 +18,9 @@ 
 #include "util.h"
 #include "util-pie.h"
 #include "sockets.h"
-
+#include "xmalloc.h"
 #include "sk-queue.h"
-
+#include "files.h"
 #include "protobuf.h"
 #include "images/sk-packet.pb-c.h"
 
@@ -28,6 +28,8 @@  struct sk_packet {
 	struct list_head	list;
 	SkPacketEntry		*entry;
 	char        		*data;
+	unsigned		scm_len;
+	int			*scm;
 };
 
 static LIST_HEAD(packets_list);
@@ -37,12 +39,22 @@  static int collect_one_packet(void *obj, ProtobufCMessage *msg, struct cr_img *i
 	struct sk_packet *pkt = obj;
 
 	pkt->entry = pb_msg(msg, SkPacketEntry);
-
+	pkt->scm = NULL;
 	pkt->data = xmalloc(pkt->entry->length);
 	if (pkt->data ==NULL)
 		return -1;
 
 	/*
+	 * See dump_packet_cmsg() -- only SCM_RIGHTS are supported and
+	 * only 1 of that kind is possible, thus not more than 1 SCMs
+	 * on a packet.
+	 */
+	if (pkt->entry->n_scm > 1) {
+		pr_err("More than 1 SCM is not possible\n");
+		return -1;
+	}
+
+	/*
 	 * NOTE: packet must be added to the tail. Otherwise sequence
 	 * will be broken.
 	 */
@@ -64,6 +76,50 @@  struct collect_image_info sk_queues_cinfo = {
 	.collect = collect_one_packet,
 };
 
+static int dump_scm_rights(struct cmsghdr *ch, SkPacketEntry *pe)
+{
+	int nr_fds, *fds, i;
+	void *buf;
+	ScmEntry *scme;
+
+	nr_fds = (ch->cmsg_len - sizeof(*ch)) / sizeof(int);
+	fds = (int *)CMSG_DATA(ch);
+
+	buf = xmalloc(sizeof(ScmEntry) + nr_fds * sizeof(uint32_t));
+	if (!buf)
+		return -1;
+
+	scme = xptr_pull(&buf, ScmEntry);
+	scm_entry__init(scme);
+	scme->type = SCM_RIGHTS;
+	scme->n_rights = nr_fds;
+	scme->rights = xptr_pull_s(&buf, nr_fds * sizeof(uint32_t));
+
+	for (i = 0; i < nr_fds; i++) {
+		int ftyp;
+
+		if (dump_my_file(fds[i], &scme->rights[i], &ftyp))
+			return -1;
+
+		/*
+		 * Unix sent over Unix or Epoll with some other sh*t
+		 * sent over unix (maybe with this very unix polled)
+		 * are tricky and not supported for now. (XXX -- todo)
+		 */
+		if (ftyp == FD_TYPES__UNIXSK || ftyp == FD_TYPES__EVENTPOLL) {
+			pr_err("Can't dump send %d (unix/epoll) fd\n", ftyp);
+			return -1;
+		}
+	}
+
+	i = pe->n_scm++;
+	if (xrealloc_safe(&pe->scm, pe->n_scm * sizeof(ScmEntry*)))
+		return -1;
+
+	pe->scm[i] = scme;
+	return 0;
+}
+
 /*
  * Maximum size of the control messages. XXX -- is there any
  * way to get this value out of the kernel?
@@ -73,8 +129,26 @@  struct collect_image_info sk_queues_cinfo = {
 static int dump_packet_cmsg(struct msghdr *mh, SkPacketEntry *pe)
 {
 	struct cmsghdr *ch;
+	int n_rights = 0;
 
 	for (ch = CMSG_FIRSTHDR(mh); ch; ch = CMSG_NXTHDR(mh, ch)) {
+		if (ch->cmsg_type == SCM_RIGHTS) {
+			if (n_rights) {
+				/*
+				 * Even if user is sending more than one cmsg with
+				 * rights, kernel merges them alltogether on recv.
+				 */
+				pr_err("Unexpected 2nd SCM_RIGHTS from the kernel\n");
+				return -1;
+			}
+
+			if (dump_scm_rights(ch, pe))
+				return -1;
+
+			n_rights++;
+			continue;
+		}
+
 		pr_err("Control messages in queue, not supported\n");
 		return -1;
 	}
@@ -82,6 +156,18 @@  static int dump_packet_cmsg(struct msghdr *mh, SkPacketEntry *pe)
 	return 0;
 }
 
+static void release_cmsg(SkPacketEntry *pe)
+{
+	int i;
+
+	for (i = 0; i < pe->n_scm; i++)
+		xfree(pe->scm[i]);
+	xfree(pe->scm);
+
+	pe->n_scm = 0;
+	pe->scm = NULL;
+}
+
 int dump_sk_queue(int sock_fd, int sock_id)
 {
 	SkPacketEntry pe = SK_PACKET_ENTRY__INIT;
@@ -181,6 +267,9 @@  int dump_sk_queue(int sock_fd, int sock_id)
 			ret = -EIO;
 			goto err_set_sock;
 		}
+
+		if (pe.scm)
+			release_cmsg(&pe);
 	}
 	ret = 0;
 
@@ -197,6 +286,20 @@  err_brk:
 	return ret;
 }
 
+static void close_scm_fds(struct sk_packet *pkt)
+{
+	int i, *fds, n_fds;
+	struct cmsghdr *ch = (struct cmsghdr *)pkt->scm;
+
+	fds = (int *)CMSG_DATA(ch);
+	n_fds = (ch->cmsg_len - sizeof(struct cmsghdr)) / sizeof(int);
+
+	for (i = 0; i < n_fds; i++) {
+		if (close(fds[i]))
+			pr_warn("scm: Error closing sent fd\n");
+	}
+}
+
 static int send_one_pkt(int fd, struct sk_packet *pkt)
 {
 	int ret;
@@ -209,6 +312,11 @@  static int send_one_pkt(int fd, struct sk_packet *pkt)
 	iov.iov_base = pkt->data;
 	iov.iov_len = entry->length;
 
+	if (pkt->scm != NULL) {
+		mh.msg_controllen = pkt->scm_len;
+		mh.msg_control = pkt->scm;
+	}
+
 	/*
 	 * Don't try to use sendfile here, because it use sendpage() and
 	 * all data are split on pages and a new skb is allocated for
@@ -229,6 +337,9 @@  static int send_one_pkt(int fd, struct sk_packet *pkt)
 		return -1;
 	}
 
+	if (pkt->scm != NULL)
+		close_scm_fds(pkt);
+
 	return 0;
 }
 
@@ -264,3 +375,43 @@  int restore_sk_queue(int fd, unsigned int peer_id)
 out:
 	return ret;
 }
+
+int prepare_scms(void)
+{
+	struct sk_packet *pkt;
+
+	pr_info("Preparing SCMs\n");
+	list_for_each_entry(pkt, &packets_list, list) {
+		SkPacketEntry *pe = pkt->entry;
+		ScmEntry *se;
+		struct cmsghdr *ch;
+
+		if (!pe->n_scm)
+			continue;
+
+		se = pe->scm[0]; /* Only 1 SCM is possible */
+
+		if (se->type == SCM_RIGHTS) {
+			pkt->scm_len = CMSG_SPACE(se->n_rights * sizeof(int));
+			pkt->scm = xmalloc(pkt->scm_len);
+			if (!pkt->scm)
+				return -1;
+
+			ch = (struct cmsghdr *)pkt->scm; /* FIXME -- via msghdr */
+			ch->cmsg_level = SOL_SOCKET;
+			ch->cmsg_type = SCM_RIGHTS;
+			ch->cmsg_len = CMSG_LEN(se->n_rights * sizeof(int));
+
+			if (unix_note_scm_rights(pe->id_for, se->rights,
+						(int *)CMSG_DATA(ch), se->n_rights))
+				return -1;
+
+			continue;
+		}
+
+		pr_err("Unsupported scm %d in image\n", se->type);
+		return -1;
+	}
+
+	return 0;
+}
diff --git a/criu/sk-unix.c b/criu/sk-unix.c
index 42ce1bb..165adc9 100644
--- a/criu/sk-unix.c
+++ b/criu/sk-unix.c
@@ -798,6 +798,7 @@  struct unix_sk_info {
 	struct file_desc d;
 	struct list_head connected; /* List of sockets, connected to me */
 	struct list_head node; /* To link in peer's connected list  */
+	struct list_head scm_fles;
 
 	/*
 	 * For DGRAM sockets with queues, we should only restore the queue
@@ -809,6 +810,11 @@  struct unix_sk_info {
 	u8 listen:1;
 };
 
+struct scm_fle {
+	struct list_head l;
+	struct fdinfo_list_entry *fle;
+};
+
 #define USK_PAIR_MASTER		0x1
 #define USK_PAIR_SLAVE		0x2
 
@@ -824,6 +830,116 @@  static struct unix_sk_info *find_unix_sk_by_ino(int ino)
 	return NULL;
 }
 
+static struct unix_sk_info *find_queuer_for(int id)
+{
+	struct unix_sk_info *ui;
+
+	list_for_each_entry(ui, &unix_sockets, list) {
+		if (ui->queuer == id)
+			return ui;
+	}
+
+	return NULL;
+}
+
+int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids)
+{
+	struct unix_sk_info *ui;
+	struct pstree_item *owner;
+	int i;
+
+	ui = find_queuer_for(id_for);
+	if (!ui) {
+		pr_err("Can't find sender for %d\n", id_for);
+		return -1;
+	}
+
+	pr_info("Found queuer for %d -> %d\n", id_for, ui->ue->id);
+	/*
+	 * This is the task that will restore this socket
+	 */
+	owner = file_master(&ui->d)->task;
+
+	pr_info("-> will set up deps\n");
+	/*
+	 * The ui will send data to the rights receiver. Add a fake fle
+	 * for the file and a dependency.
+	 */
+	for (i = 0; i < n_ids; i++) {
+		struct file_desc *tgt;
+		struct fdinfo_list_entry *fle;
+		struct scm_fle *sfle;
+		FdinfoEntry *e;
+		int fd;
+
+		tgt = find_file_desc_raw(FD_TYPES__UND, file_ids[i]);
+		if (!tgt) {
+			pr_err("Can't find fdesc to send\n");
+			return -1;
+		}
+
+		pr_info("`- Found %d file\n", file_ids[i]);
+		fd = find_unused_fd(owner, -1);
+
+		fle = try_file_master(tgt);
+		if (fle) {
+			e = dup_fdinfo(fle->fe, fd, 0);
+			if (!e) {
+				pr_err("Can't duplicate fdinfo for scm\n");
+				return -1;
+			}
+		} else {
+			/*
+			 * This can happen if the file in question is
+			 * sent over the socket and closed. In this case
+			 * we need to ... invent a new one!
+			 */
+
+			e = xmalloc(sizeof(*e));
+			if (!e)
+				return -1;
+
+			fdinfo_entry__init(e);
+			e->id = tgt->id;
+			e->type = tgt->ops->type;
+			e->fd = fd;
+			e->flags = 0;
+		}
+
+		pr_info("scm: add %d -> %d.fd[%d]\n", tgt->id, vpid(owner), fd);
+		sfle = xmalloc(sizeof(*sfle));
+		if (!sfle)
+			return -1;
+
+		sfle->fle = collect_fd_to(vpid(owner), e, rsti(owner), tgt, false);
+		if (!sfle->fle) {
+			pr_err("Can't request new fle for scm\n");
+			return -1;
+		}
+
+		list_add_tail(&sfle->l, &ui->scm_fles);
+		fds[i] = fd;
+	}
+
+	return 0;
+}
+
+static int chk_restored_scms(struct unix_sk_info *ui)
+{
+	struct scm_fle *sf, *n;
+
+	list_for_each_entry_safe(sf, n, &ui->scm_fles, l) {
+		if (sf->fle->stage != FLE_RESTORED)
+			return 1;
+
+		/* Optimization for the next pass */
+		list_del(&sf->l);
+		xfree(sf);
+	}
+
+	return 0;
+}
+
 static int wake_connected_sockets(struct unix_sk_info *ui)
 {
 	struct fdinfo_list_entry *fle;
@@ -1322,12 +1438,18 @@  static int open_unix_sk(struct file_desc *d, int *new_fd)
 	struct unix_sk_info *ui;
 	int ret;
 
+	ui = container_of(d, struct unix_sk_info, d);
+
+	/* FIXME -- only queue restore may be postponed */
+	if (chk_restored_scms(ui)) {
+		pr_info("scm: Wait for tgt to restore\n");
+		return 1;
+	}
+
 	fle = file_master(d);
 	if (fle->stage >= FLE_OPEN)
 		return post_open_unix_sk(d, fle->fe->fd);
 
-	ui = container_of(d, struct unix_sk_info, d);
-
 	if (inherited_fd(d, new_fd)) {
 		ui->ue->uflags |= USK_INHERIT;
 		ret = *new_fd >= 0 ? 0 : -1;
@@ -1440,6 +1562,7 @@  static int collect_one_unixsk(void *o, ProtobufCMessage *base, struct cr_img *i)
 	ui->listen = 0;
 	INIT_LIST_HEAD(&ui->connected);
 	INIT_LIST_HEAD(&ui->node);
+	INIT_LIST_HEAD(&ui->scm_fles);
 	ui->flags = 0;
 	fixup_sock_net_ns_id(&ui->ue->ns_id, &ui->ue->has_ns_id);
 
diff --git a/images/sk-packet.proto b/images/sk-packet.proto
index 27b48e4..009b461 100644
--- a/images/sk-packet.proto
+++ b/images/sk-packet.proto
@@ -1,8 +1,14 @@ 
 syntax = "proto2";
 
+message scm_entry {
+	required uint32			type		= 1;
+	repeated uint32			rights		= 2;
+}
+
 message sk_packet_entry {
 	required uint32		id_for		= 1;
 	required uint32		length		= 2;
 	// optional bytes		addr	= 3;
 	// optional sk_ucred_entry	ucred	= 128;
+	repeated scm_entry	scm		= 4;
 }

Comments

Kirill Tkhai July 11, 2017, 12:38 p.m.
On Mon, Jul 10, 2017 at 12:42, Pavel Emelyanov wrote:
> Most of the pieces has already been described in the previous patches :)
> so here's the summary.
> 
> * Dump:
> 
> When receiving a message, also receive any SCM-s (already there) and when
> SCM_RIGHTs one is met -- go ahead and just dump received descriptors using
> regular code, but taking current as the victim task.
> 
> Few words about file paths resolution -- since we do dump path-ed files
> by receiving them from victim's parasite, such files sent via sockets
> should still work OK, as we still receive them, just from another socket.
> 
> Several problems here:
> 
> 1. Unix sockets sent via unix sockets form knots. Not supported.
> 2. Eventpolls sent via unix might themseves poll unix sockets. Knots
>    again. Not supported either.
> 
> * Restore:
> 
> On restore we need to make unix socket wait for the soon-to-be-scm-sent
> descriptors to get restored, so we need to find them, then put a dependency.
> After that, the fake fdinfo entry is attached to the respective file
> descs, when sent the respective descriptors are closed.
> 
> https://github.com/xemul/criu/issues/251
> 
> Signed-off-by: Pavel Emelyanov <xemul@virtuozzo.com>
> ---
>  criu/cr-restore.c      |   8 +++
>  criu/include/sockets.h |   2 +
>  criu/sk-queue.c        | 157 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  criu/sk-unix.c         | 127 ++++++++++++++++++++++++++++++++++++++-
>  images/sk-packet.proto |   6 ++
>  5 files changed, 295 insertions(+), 5 deletions(-)
> 
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index e14fa06..b9ef49c 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -364,6 +364,14 @@ static int root_prepare_shared(void)
>  	if (ret)
>  		goto err;
>  
> +	/*
> +	 * This should be called with all packets collected AND all
> +	 * fdescs and fles prepared BUT post-prep-s not run.
> +	 */
> +	ret = prepare_scms();
> +	if (ret)
> +		goto err;

This should be called before add_fake_fds_masters():

1) It may add a master file there, and this master
must be resolved in add_fake_fds_masters(). Otherwise
the task that owns the just-added master may not have
the rights to create it (imagine the scm file is a
socket of a net_ns which can't be assigned by the
task);

2) Another case -- there was no task with permissions
to create the socket, and you added one in
prepare_scms(). In this case, we mustn't add one more
fle in add_fake_fds_masters() -- and if that function
is called after prepare_scms(), it won't add anything.
This reduces the number of fake files we add.
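
I.e., something like the following ordering (a sketch only; the exact
place of add_fake_fds_masters() in root_prepare_shared() is as described
above, the surrounding calls are unchanged):

	/* ... sockets, fdescs and fles collected above ... */

	ret = prepare_scms();		/* may add new (fake) master fles */
	if (ret)
		goto err;

	ret = add_fake_fds_masters();	/* resolve masters, incl. the new ones */
	if (ret)
		goto err;

	ret = run_post_prepare();
	if (ret)
		goto err;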

> +
>  	ret = run_post_prepare();
>  	if (ret)
>  		goto err;
> diff --git a/criu/include/sockets.h b/criu/include/sockets.h
> index 3fa8017..1bd5c67 100644
> --- a/criu/include/sockets.h
> +++ b/criu/include/sockets.h
> @@ -38,6 +38,8 @@ extern int collect_sockets(struct ns_id *);
>  extern struct collect_image_info inet_sk_cinfo;
>  extern struct collect_image_info unix_sk_cinfo;
>  extern int fix_external_unix_sockets(void);
> +extern int prepare_scms(void);
> +extern int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids);
>  
>  extern struct collect_image_info netlink_sk_cinfo;
>  
> diff --git a/criu/sk-queue.c b/criu/sk-queue.c
> index 77e203e..953db66 100644
> --- a/criu/sk-queue.c
> +++ b/criu/sk-queue.c
> @@ -18,9 +18,9 @@
>  #include "util.h"
>  #include "util-pie.h"
>  #include "sockets.h"
> -
> +#include "xmalloc.h"
>  #include "sk-queue.h"
> -
> +#include "files.h"
>  #include "protobuf.h"
>  #include "images/sk-packet.pb-c.h"
>  
> @@ -28,6 +28,8 @@ struct sk_packet {
>  	struct list_head	list;
>  	SkPacketEntry		*entry;
>  	char        		*data;
> +	unsigned		scm_len;
> +	int			*scm;
>  };
>  
>  static LIST_HEAD(packets_list);
> @@ -37,12 +39,22 @@ static int collect_one_packet(void *obj, ProtobufCMessage *msg, struct cr_img *i
>  	struct sk_packet *pkt = obj;
>  
>  	pkt->entry = pb_msg(msg, SkPacketEntry);
> -
> +	pkt->scm = NULL;
>  	pkt->data = xmalloc(pkt->entry->length);
>  	if (pkt->data ==NULL)
>  		return -1;
>  
>  	/*
> +	 * See dump_packet_cmsg() -- only SCM_RIGHTS are supported and
> +	 * only 1 of that kind is possible, thus not more than 1 SCMs
> +	 * on a packet.
> +	 */
> +	if (pkt->entry->n_scm > 1) {
> +		pr_err("More than 1 SCM is not possible\n");
> +		return -1;
> +	}
> +
> +	/*
>  	 * NOTE: packet must be added to the tail. Otherwise sequence
>  	 * will be broken.
>  	 */
> @@ -64,6 +76,50 @@ struct collect_image_info sk_queues_cinfo = {
>  	.collect = collect_one_packet,
>  };
>  
> +static int dump_scm_rights(struct cmsghdr *ch, SkPacketEntry *pe)
> +{
> +	int nr_fds, *fds, i;
> +	void *buf;
> +	ScmEntry *scme;
> +
> +	nr_fds = (ch->cmsg_len - sizeof(*ch)) / sizeof(int);
> +	fds = (int *)CMSG_DATA(ch);
> +
> +	buf = xmalloc(sizeof(ScmEntry) + nr_fds * sizeof(uint32_t));
> +	if (!buf)
> +		return -1;
> +
> +	scme = xptr_pull(&buf, ScmEntry);
> +	scm_entry__init(scme);
> +	scme->type = SCM_RIGHTS;
> +	scme->n_rights = nr_fds;
> +	scme->rights = xptr_pull_s(&buf, nr_fds * sizeof(uint32_t));
> +
> +	for (i = 0; i < nr_fds; i++) {
> +		int ftyp;
> +
> +		if (dump_my_file(fds[i], &scme->rights[i], &ftyp))
> +			return -1;
> +
> +		/*
> +		 * Unix sent over Unix or Epoll with some other sh*t
> +		 * sent over unix (maybe with this very unix polled)
> +		 * are tricky and not supported for now. (XXX -- todo)
> +		 */
> +		if (ftyp == FD_TYPES__UNIXSK || ftyp == FD_TYPES__EVENTPOLL) {
> +			pr_err("Can't dump send %d (unix/epoll) fd\n", ftyp);
> +			return -1;
> +		}
> +	}
> +
> +	i = pe->n_scm++;
> +	if (xrealloc_safe(&pe->scm, pe->n_scm * sizeof(ScmEntry*)))
> +		return -1;
> +
> +	pe->scm[i] = scme;
> +	return 0;
> +}
> +
>  /*
>   * Maximum size of the control messages. XXX -- is there any
>   * way to get this value out of the kernel?
> @@ -73,8 +129,26 @@ struct collect_image_info sk_queues_cinfo = {
>  static int dump_packet_cmsg(struct msghdr *mh, SkPacketEntry *pe)
>  {
>  	struct cmsghdr *ch;
> +	int n_rights = 0;
>  
>  	for (ch = CMSG_FIRSTHDR(mh); ch; ch = CMSG_NXTHDR(mh, ch)) {
> +		if (ch->cmsg_type == SCM_RIGHTS) {
> +			if (n_rights) {
> +				/*
> +				 * Even if user is sending more than one cmsg with
> +				 * rights, kernel merges them alltogether on recv.
> +				 */
> +				pr_err("Unexpected 2nd SCM_RIGHTS from the kernel\n");
> +				return -1;
> +			}
> +
> +			if (dump_scm_rights(ch, pe))
> +				return -1;
> +
> +			n_rights++;
> +			continue;
> +		}
> +
>  		pr_err("Control messages in queue, not supported\n");
>  		return -1;
>  	}
> @@ -82,6 +156,18 @@ static int dump_packet_cmsg(struct msghdr *mh, SkPacketEntry *pe)
>  	return 0;
>  }
>  
> +static void release_cmsg(SkPacketEntry *pe)
> +{
> +	int i;
> +
> +	for (i = 0; i < pe->n_scm; i++)
> +		xfree(pe->scm[i]);
> +	xfree(pe->scm);
> +
> +	pe->n_scm = 0;
> +	pe->scm = NULL;
> +}
> +
>  int dump_sk_queue(int sock_fd, int sock_id)
>  {
>  	SkPacketEntry pe = SK_PACKET_ENTRY__INIT;
> @@ -181,6 +267,9 @@ int dump_sk_queue(int sock_fd, int sock_id)
>  			ret = -EIO;
>  			goto err_set_sock;
>  		}
> +
> +		if (pe.scm)
> +			release_cmsg(&pe);
>  	}
>  	ret = 0;
>  
> @@ -197,6 +286,20 @@ err_brk:
>  	return ret;
>  }
>  
> +static void close_scm_fds(struct sk_packet *pkt)
> +{
> +	int i, *fds, n_fds;
> +	struct cmsghdr *ch = (struct cmsghdr *)pkt->scm;
> +
> +	fds = (int *)CMSG_DATA(ch);
> +	n_fds = (ch->cmsg_len - sizeof(struct cmsghdr)) / sizeof(int);
> +
> +	for (i = 0; i < n_fds; i++) {
> +		if (close(fds[i]))
> +			pr_warn("scm: Error closing sent fd\n");
> +	}
> +}
> +
>  static int send_one_pkt(int fd, struct sk_packet *pkt)
>  {
>  	int ret;
> @@ -209,6 +312,11 @@ static int send_one_pkt(int fd, struct sk_packet *pkt)
>  	iov.iov_base = pkt->data;
>  	iov.iov_len = entry->length;
>  
> +	if (pkt->scm != NULL) {
> +		mh.msg_controllen = pkt->scm_len;
> +		mh.msg_control = pkt->scm;
> +	}
> +
>  	/*
>  	 * Don't try to use sendfile here, because it use sendpage() and
>  	 * all data are split on pages and a new skb is allocated for
> @@ -229,6 +337,9 @@ static int send_one_pkt(int fd, struct sk_packet *pkt)
>  		return -1;
>  	}
>  
> +	if (pkt->scm != NULL)
> +		close_scm_fds(pkt);
> +
>  	return 0;
>  }
>  
> @@ -264,3 +375,43 @@ int restore_sk_queue(int fd, unsigned int peer_id)
>  out:
>  	return ret;
>  }

The previous patches don't make restore_sk_queue() find the fle
whose fd is used to put the next packet into the queue. So this
series does not support multi-queuer queues?

It seems we don't need "unix: Split resolv and interconnect (v2)";
we just need to select the correct fd from the task's fles and use
it in restore_sk_queue().

> +
> +int prepare_scms(void)
> +{
> +	struct sk_packet *pkt;
> +
> +	pr_info("Preparing SCMs\n");
> +	list_for_each_entry(pkt, &packets_list, list) {
> +		SkPacketEntry *pe = pkt->entry;
> +		ScmEntry *se;
> +		struct cmsghdr *ch;
> +
> +		if (!pe->n_scm)
> +			continue;
> +
> +		se = pe->scm[0]; /* Only 1 SCM is possible */
> +
> +		if (se->type == SCM_RIGHTS) {
> +			pkt->scm_len = CMSG_SPACE(se->n_rights * sizeof(int));
> +			pkt->scm = xmalloc(pkt->scm_len);
> +			if (!pkt->scm)
> +				return -1;
> +
> +			ch = (struct cmsghdr *)pkt->scm; /* FIXME -- via msghdr */
> +			ch->cmsg_level = SOL_SOCKET;
> +			ch->cmsg_type = SCM_RIGHTS;
> +			ch->cmsg_len = CMSG_LEN(se->n_rights * sizeof(int));
> +
> +			if (unix_note_scm_rights(pe->id_for, se->rights,
> +						(int *)CMSG_DATA(ch), se->n_rights))
> +				return -1;
> +
> +			continue;
> +		}
> +
> +		pr_err("Unsupported scm %d in image\n", se->type);
> +		return -1;
> +	}
> +
> +	return 0;
> +}
> diff --git a/criu/sk-unix.c b/criu/sk-unix.c
> index 42ce1bb..165adc9 100644
> --- a/criu/sk-unix.c
> +++ b/criu/sk-unix.c
> @@ -798,6 +798,7 @@ struct unix_sk_info {
>  	struct file_desc d;
>  	struct list_head connected; /* List of sockets, connected to me */
>  	struct list_head node; /* To link in peer's connected list  */
> +	struct list_head scm_fles;
>  
>  	/*
>  	 * For DGRAM sockets with queues, we should only restore the queue
> @@ -809,6 +810,11 @@ struct unix_sk_info {
>  	u8 listen:1;
>  };
>  
> +struct scm_fle {
> +	struct list_head l;
> +	struct fdinfo_list_entry *fle;
> +};
> +
>  #define USK_PAIR_MASTER		0x1
>  #define USK_PAIR_SLAVE		0x2
>  
> @@ -824,6 +830,116 @@ static struct unix_sk_info *find_unix_sk_by_ino(int ino)
>  	return NULL;
>  }
>  
> +static struct unix_sk_info *find_queuer_for(int id)
> +{
> +	struct unix_sk_info *ui;
> +
> +	list_for_each_entry(ui, &unix_sockets, list) {
> +		if (ui->queuer == id)
> +			return ui;
> +	}
> +
> +	return NULL;
> +}
> +
> +int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids)
> +{
> +	struct unix_sk_info *ui;
> +	struct pstree_item *owner;
> +	int i;
> +
> +	ui = find_queuer_for(id_for);
> +	if (!ui) {
> +		pr_err("Can't find sender for %d\n", id_for);
> +		return -1;
> +	}
> +
> +	pr_info("Found queuer for %d -> %d\n", id_for, ui->ue->id);
> +	/*
> +	 * This is the task that will restore this socket
> +	 */
> +	owner = file_master(&ui->d)->task;
> +
> +	pr_info("-> will set up deps\n");
> +	/*
> +	 * The ui will send data to the rights receiver. Add a fake fle
> +	 * for the file and a dependency.
> +	 */
> +	for (i = 0; i < n_ids; i++) {
> +		struct file_desc *tgt;
> +		struct fdinfo_list_entry *fle;
> +		struct scm_fle *sfle;
> +		FdinfoEntry *e;
> +		int fd;
> +
> +		tgt = find_file_desc_raw(FD_TYPES__UND, file_ids[i]);
> +		if (!tgt) {
> +			pr_err("Can't find fdesc to send\n");
> +			return -1;
> +		}
> +
> +		pr_info("`- Found %d file\n", file_ids[i]);
> +		fd = find_unused_fd(owner, -1);

If I haven't missed something, I don't see where you try
to find an fle among the task's file descriptors to use for
queuing. It seems we add a fake fle for every packet in
a queue. If so, that's not good for performance, and we
should first check whether the task already owns such an
fle (rough sketch below) instead.
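
Roughly along these lines (a sketch only; the fds list on rst_info and
the ps_list link / desc field on struct fdinfo_list_entry are my
assumptions about the current fle bookkeeping, the real names may differ):

static struct fdinfo_list_entry *find_existing_fle(struct pstree_item *owner,
						   struct file_desc *tgt)
{
	struct fdinfo_list_entry *fle;

	/* Assumed: rsti(owner)->fds links the task's fles via ps_list */
	list_for_each_entry(fle, &rsti(owner)->fds, ps_list)
		if (fle->desc == tgt)
			return fle;

	return NULL;
}

Then unix_note_scm_rights() could reuse the found fle's fd instead of
going through find_unused_fd()/collect_fd_to() every time.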

> +
> +		fle = try_file_master(tgt);
> +		if (fle) {
> +			e = dup_fdinfo(fle->fe, fd, 0);
> +			if (!e) {
> +				pr_err("Can't duplicate fdinfo for scm\n");
> +				return -1;
> +			}
> +		} else {
> +			/*
> +			 * This can happen if the file in question is
> +			 * sent over the socket and closed. In this case
> +			 * we need to ... invent a new one!
> +			 */
> +
> +			e = xmalloc(sizeof(*e));
> +			if (!e)
> +				return -1;
> +
> +			fdinfo_entry__init(e);
> +			e->id = tgt->id;
> +			e->type = tgt->ops->type;
> +			e->fd = fd;
> +			e->flags = 0;

I'd move the code around fdinfo_entry__init() into a separate function,
as we already have 3 places like it.

> +		}
> +
> +		pr_info("scm: add %d -> %d.fd[%d]\n", tgt->id, vpid(owner), fd);
> +		sfle = xmalloc(sizeof(*sfle));
> +		if (!sfle)
> +			return -1;
> +
> +		sfle->fle = collect_fd_to(vpid(owner), e, rsti(owner), tgt, false);
> +		if (!sfle->fle) {
> +			pr_err("Can't request new fle for scm\n");
> +			return -1;
> +		}
> +
> +		list_add_tail(&sfle->l, &ui->scm_fles);
> +		fds[i] = fd;
> +	}
> +
> +	return 0;
> +}
> +
> +static int chk_restored_scms(struct unix_sk_info *ui)
> +{
> +	struct scm_fle *sf, *n;
> +
> +	list_for_each_entry_safe(sf, n, &ui->scm_fles, l) {
> +		if (sf->fle->stage != FLE_RESTORED)
> +			return 1;

We need to wait for sf->fle->bound here instead. I suppose
the above introduces circular dependencies for cases with
several sockets and an epoll that is listening to them.

> +
> +		/* Optimization for the next pass */
> +		list_del(&sf->l);
> +		xfree(sf);
> +	}
> +
> +	return 0;
> +}
> +
>  static int wake_connected_sockets(struct unix_sk_info *ui)
>  {
>  	struct fdinfo_list_entry *fle;
> @@ -1322,12 +1438,18 @@ static int open_unix_sk(struct file_desc *d, int *new_fd)
>  	struct unix_sk_info *ui;
>  	int ret;
>  
> +	ui = container_of(d, struct unix_sk_info, d);
> +
> +	/* FIXME -- only queue restore may be postponed */
> +	if (chk_restored_scms(ui)) {
> +		pr_info("scm: Wait for tgt to restore\n");
> +		return 1;
> +	}

This is too strong a limitation, we delay other tasks here.
We may open the socket and serve it out, and we don't need
to wait for the queuers to do that.

It's better to wait for the packet's master fle right
before we restore the queue, and to use it from our
task's fle list.

> +
>  	fle = file_master(d);
>  	if (fle->stage >= FLE_OPEN)
>  		return post_open_unix_sk(d, fle->fe->fd);
>  
> -	ui = container_of(d, struct unix_sk_info, d);
> -
>  	if (inherited_fd(d, new_fd)) {
>  		ui->ue->uflags |= USK_INHERIT;
>  		ret = *new_fd >= 0 ? 0 : -1;
> @@ -1440,6 +1562,7 @@ static int collect_one_unixsk(void *o, ProtobufCMessage *base, struct cr_img *i)
>  	ui->listen = 0;
>  	INIT_LIST_HEAD(&ui->connected);
>  	INIT_LIST_HEAD(&ui->node);
> +	INIT_LIST_HEAD(&ui->scm_fles);
>  	ui->flags = 0;
>  	fixup_sock_net_ns_id(&ui->ue->ns_id, &ui->ue->has_ns_id);
>  
> diff --git a/images/sk-packet.proto b/images/sk-packet.proto
> index 27b48e4..009b461 100644
> --- a/images/sk-packet.proto
> +++ b/images/sk-packet.proto
> @@ -1,8 +1,14 @@
>  syntax = "proto2";
>  
> +message scm_entry {
> +	required uint32			type		= 1;
> +	repeated uint32			rights		= 2;
> +}
> +
>  message sk_packet_entry {
>  	required uint32		id_for		= 1;
>  	required uint32		length		= 2;
>  	// optional bytes		addr	= 3;
>  	// optional sk_ucred_entry	ucred	= 128;
> +	repeated scm_entry	scm		= 4;
>  }
> -- 
> 2.1.4
> 
> _______________________________________________
> CRIU mailing list
> CRIU@openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
Pavel Emelyanov July 11, 2017, 3:49 p.m.
>> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
>> index e14fa06..b9ef49c 100644
>> --- a/criu/cr-restore.c
>> +++ b/criu/cr-restore.c
>> @@ -364,6 +364,14 @@ static int root_prepare_shared(void)
>>  	if (ret)
>>  		goto err;
>>  
>> +	/*
>> +	 * This should be called with all packets collected AND all
>> +	 * fdescs and fles prepared BUT post-prep-s not run.
>> +	 */
>> +	ret = prepare_scms();
>> +	if (ret)
>> +		goto err;
> 
> This should be called before add_fake_fds_masters():
> 
> 1)It may add a master file there, and this master
> must be resolved in add_fake_fds_masters(). Otherwise
> the task, which is the owner of this just added master,
> may not have rights to create the master (imagine,
> scm file is a socket of a net_ns, which can't be
> assigned by the task);
> 
> 2)Another case -- there was not a task, which has
> permittions to create a socket, and you added it
> in prepare_scms(). In this case, we mustn't add
> one more fle in add_fake_fds_masters() -- and
> if this function is called after prepare_scms(),
> it won't add anything. This will reduce number
> of fake files, we add.

OK, will move it and add this text as a code comment.

>> +
>>  	ret = run_post_prepare();
>>  	if (ret)
>>  		goto err;
>> diff --git a/criu/include/sockets.h b/criu/include/sockets.h
>> index 3fa8017..1bd5c67 100644
>> --- a/criu/include/sockets.h
>> +++ b/criu/include/sockets.h
>> @@ -38,6 +38,8 @@ extern int collect_sockets(struct ns_id *);
>>  extern struct collect_image_info inet_sk_cinfo;
>>  extern struct collect_image_info unix_sk_cinfo;
>>  extern int fix_external_unix_sockets(void);
>> +extern int prepare_scms(void);
>> +extern int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids);
>>  
>>  extern struct collect_image_info netlink_sk_cinfo;
>>  
>> diff --git a/criu/sk-queue.c b/criu/sk-queue.c
>> index 77e203e..953db66 100644
>> --- a/criu/sk-queue.c
>> +++ b/criu/sk-queue.c
>> @@ -18,9 +18,9 @@
>>  #include "util.h"
>>  #include "util-pie.h"
>>  #include "sockets.h"
>> -
>> +#include "xmalloc.h"
>>  #include "sk-queue.h"
>> -
>> +#include "files.h"
>>  #include "protobuf.h"
>>  #include "images/sk-packet.pb-c.h"
>>  
>> @@ -28,6 +28,8 @@ struct sk_packet {
>>  	struct list_head	list;
>>  	SkPacketEntry		*entry;
>>  	char        		*data;
>> +	unsigned		scm_len;
>> +	int			*scm;
>>  };
>>  
>>  static LIST_HEAD(packets_list);
>> @@ -37,12 +39,22 @@ static int collect_one_packet(void *obj, ProtobufCMessage *msg, struct cr_img *i
>>  	struct sk_packet *pkt = obj;
>>  
>>  	pkt->entry = pb_msg(msg, SkPacketEntry);
>> -
>> +	pkt->scm = NULL;
>>  	pkt->data = xmalloc(pkt->entry->length);
>>  	if (pkt->data ==NULL)
>>  		return -1;
>>  
>>  	/*
>> +	 * See dump_packet_cmsg() -- only SCM_RIGHTS are supported and
>> +	 * only 1 of that kind is possible, thus not more than 1 SCMs
>> +	 * on a packet.
>> +	 */
>> +	if (pkt->entry->n_scm > 1) {
>> +		pr_err("More than 1 SCM is not possible\n");
>> +		return -1;
>> +	}
>> +
>> +	/*
>>  	 * NOTE: packet must be added to the tail. Otherwise sequence
>>  	 * will be broken.
>>  	 */
>> @@ -64,6 +76,50 @@ struct collect_image_info sk_queues_cinfo = {
>>  	.collect = collect_one_packet,
>>  };
>>  
>> +static int dump_scm_rights(struct cmsghdr *ch, SkPacketEntry *pe)
>> +{
>> +	int nr_fds, *fds, i;
>> +	void *buf;
>> +	ScmEntry *scme;
>> +
>> +	nr_fds = (ch->cmsg_len - sizeof(*ch)) / sizeof(int);
>> +	fds = (int *)CMSG_DATA(ch);
>> +
>> +	buf = xmalloc(sizeof(ScmEntry) + nr_fds * sizeof(uint32_t));
>> +	if (!buf)
>> +		return -1;
>> +
>> +	scme = xptr_pull(&buf, ScmEntry);
>> +	scm_entry__init(scme);
>> +	scme->type = SCM_RIGHTS;
>> +	scme->n_rights = nr_fds;
>> +	scme->rights = xptr_pull_s(&buf, nr_fds * sizeof(uint32_t));
>> +
>> +	for (i = 0; i < nr_fds; i++) {
>> +		int ftyp;
>> +
>> +		if (dump_my_file(fds[i], &scme->rights[i], &ftyp))
>> +			return -1;
>> +
>> +		/*
>> +		 * Unix sent over Unix or Epoll with some other sh*t
>> +		 * sent over unix (maybe with this very unix polled)
>> +		 * are tricky and not supported for now. (XXX -- todo)
>> +		 */
>> +		if (ftyp == FD_TYPES__UNIXSK || ftyp == FD_TYPES__EVENTPOLL) {
>> +			pr_err("Can't dump send %d (unix/epoll) fd\n", ftyp);
>> +			return -1;
>> +		}
>> +	}
>> +
>> +	i = pe->n_scm++;
>> +	if (xrealloc_safe(&pe->scm, pe->n_scm * sizeof(ScmEntry*)))
>> +		return -1;
>> +
>> +	pe->scm[i] = scme;
>> +	return 0;
>> +}
>> +
>>  /*
>>   * Maximum size of the control messages. XXX -- is there any
>>   * way to get this value out of the kernel?
>> @@ -73,8 +129,26 @@ struct collect_image_info sk_queues_cinfo = {
>>  static int dump_packet_cmsg(struct msghdr *mh, SkPacketEntry *pe)
>>  {
>>  	struct cmsghdr *ch;
>> +	int n_rights = 0;
>>  
>>  	for (ch = CMSG_FIRSTHDR(mh); ch; ch = CMSG_NXTHDR(mh, ch)) {
>> +		if (ch->cmsg_type == SCM_RIGHTS) {
>> +			if (n_rights) {
>> +				/*
>> +				 * Even if user is sending more than one cmsg with
>> +				 * rights, kernel merges them alltogether on recv.
>> +				 */
>> +				pr_err("Unexpected 2nd SCM_RIGHTS from the kernel\n");
>> +				return -1;
>> +			}
>> +
>> +			if (dump_scm_rights(ch, pe))
>> +				return -1;
>> +
>> +			n_rights++;
>> +			continue;
>> +		}
>> +
>>  		pr_err("Control messages in queue, not supported\n");
>>  		return -1;
>>  	}
>> @@ -82,6 +156,18 @@ static int dump_packet_cmsg(struct msghdr *mh, SkPacketEntry *pe)
>>  	return 0;
>>  }
>>  
>> +static void release_cmsg(SkPacketEntry *pe)
>> +{
>> +	int i;
>> +
>> +	for (i = 0; i < pe->n_scm; i++)
>> +		xfree(pe->scm[i]);
>> +	xfree(pe->scm);
>> +
>> +	pe->n_scm = 0;
>> +	pe->scm = NULL;
>> +}
>> +
>>  int dump_sk_queue(int sock_fd, int sock_id)
>>  {
>>  	SkPacketEntry pe = SK_PACKET_ENTRY__INIT;
>> @@ -181,6 +267,9 @@ int dump_sk_queue(int sock_fd, int sock_id)
>>  			ret = -EIO;
>>  			goto err_set_sock;
>>  		}
>> +
>> +		if (pe.scm)
>> +			release_cmsg(&pe);
>>  	}
>>  	ret = 0;
>>  
>> @@ -197,6 +286,20 @@ err_brk:
>>  	return ret;
>>  }
>>  
>> +static void close_scm_fds(struct sk_packet *pkt)
>> +{
>> +	int i, *fds, n_fds;
>> +	struct cmsghdr *ch = (struct cmsghdr *)pkt->scm;
>> +
>> +	fds = (int *)CMSG_DATA(ch);
>> +	n_fds = (ch->cmsg_len - sizeof(struct cmsghdr)) / sizeof(int);
>> +
>> +	for (i = 0; i < n_fds; i++) {
>> +		if (close(fds[i]))
>> +			pr_warn("scm: Error closing sent fd\n");
>> +	}
>> +}
>> +
>>  static int send_one_pkt(int fd, struct sk_packet *pkt)
>>  {
>>  	int ret;
>> @@ -209,6 +312,11 @@ static int send_one_pkt(int fd, struct sk_packet *pkt)
>>  	iov.iov_base = pkt->data;
>>  	iov.iov_len = entry->length;
>>  
>> +	if (pkt->scm != NULL) {
>> +		mh.msg_controllen = pkt->scm_len;
>> +		mh.msg_control = pkt->scm;
>> +	}
>> +
>>  	/*
>>  	 * Don't try to use sendfile here, because it use sendpage() and
>>  	 * all data are split on pages and a new skb is allocated for
>> @@ -229,6 +337,9 @@ static int send_one_pkt(int fd, struct sk_packet *pkt)
>>  		return -1;
>>  	}
>>  
>> +	if (pkt->scm != NULL)
>> +		close_scm_fds(pkt);
>> +
>>  	return 0;
>>  }
>>  
>> @@ -264,3 +375,43 @@ int restore_sk_queue(int fd, unsigned int peer_id)
>>  out:
>>  	return ret;
>>  }
> 
> The previous patches don't make restore_sk_queue() find a fle,
> which fd is used to set next packet in queue. This series does
> not support multi-queuer queues?

No. You're talking about "restore sender address" patches :) I remember that,
and that's a separate issue. Right now if we have several sockets sending
data to one, on restore we pick the first peer and make it restore the whole
queue.

> It seems, we don't need "unix: Split resolv and interconnect (v2)",
> it's just need to select a correct fd from task fles, and to use
> it in restore_sk_queue().

We do. Inside unix_note_scm_rights() I need to find a queuer for a queue
and this lookup is only possible after the resolve and interconnect split
I've made before.

>> +
>> +int prepare_scms(void)
>> +{
>> +	struct sk_packet *pkt;
>> +
>> +	pr_info("Preparing SCMs\n");
>> +	list_for_each_entry(pkt, &packets_list, list) {
>> +		SkPacketEntry *pe = pkt->entry;
>> +		ScmEntry *se;
>> +		struct cmsghdr *ch;
>> +
>> +		if (!pe->n_scm)
>> +			continue;
>> +
>> +		se = pe->scm[0]; /* Only 1 SCM is possible */
>> +
>> +		if (se->type == SCM_RIGHTS) {
>> +			pkt->scm_len = CMSG_SPACE(se->n_rights * sizeof(int));
>> +			pkt->scm = xmalloc(pkt->scm_len);
>> +			if (!pkt->scm)
>> +				return -1;
>> +
>> +			ch = (struct cmsghdr *)pkt->scm; /* FIXME -- via msghdr */
>> +			ch->cmsg_level = SOL_SOCKET;
>> +			ch->cmsg_type = SCM_RIGHTS;
>> +			ch->cmsg_len = CMSG_LEN(se->n_rights * sizeof(int));
>> +
>> +			if (unix_note_scm_rights(pe->id_for, se->rights,
>> +						(int *)CMSG_DATA(ch), se->n_rights))
>> +				return -1;
>> +
>> +			continue;
>> +		}
>> +
>> +		pr_err("Unsupported scm %d in image\n", se->type);
>> +		return -1;
>> +	}
>> +
>> +	return 0;
>> +}
>> diff --git a/criu/sk-unix.c b/criu/sk-unix.c
>> index 42ce1bb..165adc9 100644
>> --- a/criu/sk-unix.c
>> +++ b/criu/sk-unix.c
>> @@ -798,6 +798,7 @@ struct unix_sk_info {
>>  	struct file_desc d;
>>  	struct list_head connected; /* List of sockets, connected to me */
>>  	struct list_head node; /* To link in peer's connected list  */
>> +	struct list_head scm_fles;
>>  
>>  	/*
>>  	 * For DGRAM sockets with queues, we should only restore the queue
>> @@ -809,6 +810,11 @@ struct unix_sk_info {
>>  	u8 listen:1;
>>  };
>>  
>> +struct scm_fle {
>> +	struct list_head l;
>> +	struct fdinfo_list_entry *fle;
>> +};
>> +
>>  #define USK_PAIR_MASTER		0x1
>>  #define USK_PAIR_SLAVE		0x2
>>  
>> @@ -824,6 +830,116 @@ static struct unix_sk_info *find_unix_sk_by_ino(int ino)
>>  	return NULL;
>>  }
>>  
>> +static struct unix_sk_info *find_queuer_for(int id)
>> +{
>> +	struct unix_sk_info *ui;
>> +
>> +	list_for_each_entry(ui, &unix_sockets, list) {
>> +		if (ui->queuer == id)
>> +			return ui;
>> +	}
>> +
>> +	return NULL;
>> +}
>> +
>> +int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids)
>> +{
>> +	struct unix_sk_info *ui;
>> +	struct pstree_item *owner;
>> +	int i;
>> +
>> +	ui = find_queuer_for(id_for);
>> +	if (!ui) {
>> +		pr_err("Can't find sender for %d\n", id_for);
>> +		return -1;
>> +	}
>> +
>> +	pr_info("Found queuer for %d -> %d\n", id_for, ui->ue->id);
>> +	/*
>> +	 * This is the task that will restore this socket
>> +	 */
>> +	owner = file_master(&ui->d)->task;
>> +
>> +	pr_info("-> will set up deps\n");
>> +	/*
>> +	 * The ui will send data to the rights receiver. Add a fake fle
>> +	 * for the file and a dependency.
>> +	 */
>> +	for (i = 0; i < n_ids; i++) {
>> +		struct file_desc *tgt;
>> +		struct fdinfo_list_entry *fle;
>> +		struct scm_fle *sfle;
>> +		FdinfoEntry *e;
>> +		int fd;
>> +
>> +		tgt = find_file_desc_raw(FD_TYPES__UND, file_ids[i]);
>> +		if (!tgt) {
>> +			pr_err("Can't find fdesc to send\n");
>> +			return -1;
>> +		}
>> +
>> +		pr_info("`- Found %d file\n", file_ids[i]);
>> +		fd = find_unused_fd(owner, -1);
> 
> If I haven't missed something, I don't see, where you try
> to found the fle in task's file descriptor  to use it for
> queuing. It seems, we add a fake fle for every packet in
> a queue. If it's so, it's not performance-good, and firstly
> we should try to check, that task already owns the fle instead
> of that.

Yes, I did this deliberately. Adding one more descriptor for a file
would cost us several more syscalls. Fortunately, queues with scms
will not be a frequent case.

>> +
>> +		fle = try_file_master(tgt);
>> +		if (fle) {
>> +			e = dup_fdinfo(fle->fe, fd, 0);
>> +			if (!e) {
>> +				pr_err("Can't duplicate fdinfo for scm\n");
>> +				return -1;
>> +			}
>> +		} else {
>> +			/*
>> +			 * This can happen if the file in question is
>> +			 * sent over the socket and closed. In this case
>> +			 * we need to ... invent a new one!
>> +			 */
>> +
>> +			e = xmalloc(sizeof(*e));
>> +			if (!e)
>> +				return -1;
>> +
>> +			fdinfo_entry__init(e);
>> +			e->id = tgt->id;
>> +			e->type = tgt->ops->type;
>> +			e->fd = fd;
>> +			e->flags = 0;
> 
> I'd add the code around fdinfo_entry__init() in separate function
> as we already have 3 places like it.

OK, will do. What are the others?

>> +		}
>> +
>> +		pr_info("scm: add %d -> %d.fd[%d]\n", tgt->id, vpid(owner), fd);
>> +		sfle = xmalloc(sizeof(*sfle));
>> +		if (!sfle)
>> +			return -1;
>> +
>> +		sfle->fle = collect_fd_to(vpid(owner), e, rsti(owner), tgt, false);
>> +		if (!sfle->fle) {
>> +			pr_err("Can't request new fle for scm\n");
>> +			return -1;
>> +		}
>> +
>> +		list_add_tail(&sfle->l, &ui->scm_fles);
>> +		fds[i] = fd;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int chk_restored_scms(struct unix_sk_info *ui)
>> +{
>> +	struct scm_fle *sf, *n;
>> +
>> +	list_for_each_entry_safe(sf, n, &ui->scm_fles, l) {
>> +		if (sf->fle->stage != FLE_RESTORED)
>> +			return 1;
> 
> It's need to wait for sf->fle->bound here instead. 

Why?

> I suppose, the above introduces some circular dependencies for cases
> with several sockets having epoll, which is listening them.

Nope :) On dump I explicitly ban unix sockets and epolls sent via unix
sockets. So here we can only wait for other kinds of files, which cannot
be waiting on us in turn.

>> +
>> +		/* Optimization for the next pass */
>> +		list_del(&sf->l);
>> +		xfree(sf);
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>  static int wake_connected_sockets(struct unix_sk_info *ui)
>>  {
>>  	struct fdinfo_list_entry *fle;
>> @@ -1322,12 +1438,18 @@ static int open_unix_sk(struct file_desc *d, int *new_fd)
>>  	struct unix_sk_info *ui;
>>  	int ret;
>>  
>> +	ui = container_of(d, struct unix_sk_info, d);
>> +
>> +	/* FIXME -- only queue restore may be postponed */
>> +	if (chk_restored_scms(ui)) {
>> +		pr_info("scm: Wait for tgt to restore\n");
>> +		return 1;
>> +	}
> 
> It's too strong limitation, we delay another tasks here.

Other tasks? No, we just ask the files restore loop to go and try
to open some other files.

> We may open the socket and to serve it out, and we do
> not need wait queuers for these actions.

Not always. There are several cases where we do a socketpair(), then
restore a queue, then close one of the ends immediately.
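
I.e. roughly this pattern (illustration only, not literal CRIU code;
`peer_id` stands for the queue's image id):

	int sp[2];

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sp) < 0)
		return -1;

	/* Stuff the dumped queue in through one end... */
	if (restore_sk_queue(sp[1], peer_id))
		return -1;

	/* ...and that end dies right away; sp[0] keeps the queued data */
	close(sp[1]);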

> It's better to wait for the master fle of packet right
> before we restore the queue, and to use it from our
> task fle list.

The master may sit in some other task's list.

>> +
>>  	fle = file_master(d);
>>  	if (fle->stage >= FLE_OPEN)
>>  		return post_open_unix_sk(d, fle->fe->fd);
>>  
>> -	ui = container_of(d, struct unix_sk_info, d);
>> -
>>  	if (inherited_fd(d, new_fd)) {
>>  		ui->ue->uflags |= USK_INHERIT;
>>  		ret = *new_fd >= 0 ? 0 : -1;
>> @@ -1440,6 +1562,7 @@ static int collect_one_unixsk(void *o, ProtobufCMessage *base, struct cr_img *i)
>>  	ui->listen = 0;
>>  	INIT_LIST_HEAD(&ui->connected);
>>  	INIT_LIST_HEAD(&ui->node);
>> +	INIT_LIST_HEAD(&ui->scm_fles);
>>  	ui->flags = 0;
>>  	fixup_sock_net_ns_id(&ui->ue->ns_id, &ui->ue->has_ns_id);
>>  
>> diff --git a/images/sk-packet.proto b/images/sk-packet.proto
>> index 27b48e4..009b461 100644
>> --- a/images/sk-packet.proto
>> +++ b/images/sk-packet.proto
>> @@ -1,8 +1,14 @@
>>  syntax = "proto2";
>>  
>> +message scm_entry {
>> +	required uint32			type		= 1;
>> +	repeated uint32			rights		= 2;
>> +}
>> +
>>  message sk_packet_entry {
>>  	required uint32		id_for		= 1;
>>  	required uint32		length		= 2;
>>  	// optional bytes		addr	= 3;
>>  	// optional sk_ucred_entry	ucred	= 128;
>> +	repeated scm_entry	scm		= 4;
>>  }
>> -- 
>> 2.1.4
>>
>> _______________________________________________
>> CRIU mailing list
>> CRIU@openvz.org
>> https://lists.openvz.org/mailman/listinfo/criu
> .
>
Kirill Tkhai July 12, 2017, 9:41 a.m.
On 11.07.2017 18:49, Pavel Emelyanov wrote:
> 
>>> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
>>> index e14fa06..b9ef49c 100644
>>> --- a/criu/cr-restore.c
>>> +++ b/criu/cr-restore.c
>>> @@ -364,6 +364,14 @@ static int root_prepare_shared(void)
>>>  	if (ret)
>>>  		goto err;
>>>  
>>> +	/*
>>> +	 * This should be called with all packets collected AND all
>>> +	 * fdescs and fles prepared BUT post-prep-s not run.
>>> +	 */
>>> +	ret = prepare_scms();
>>> +	if (ret)
>>> +		goto err;
>>
>> This should be called before add_fake_fds_masters():
>>
>> 1)It may add a master file there, and this master
>> must be resolved in add_fake_fds_masters(). Otherwise
>> the task, which is the owner of this just added master,
>> may not have rights to create the master (imagine,
>> scm file is a socket of a net_ns, which can't be
>> assigned by the task);
>>
>> 2)Another case -- there was not a task, which has
>> permittions to create a socket, and you added it
>> in prepare_scms(). In this case, we mustn't add
>> one more fle in add_fake_fds_masters() -- and
>> if this function is called after prepare_scms(),
>> it won't add anything. This will reduce number
>> of fake files, we add.
> 
> OK, will move and add this text as code comment.
> 
>>> +
>>>  	ret = run_post_prepare();
>>>  	if (ret)
>>>  		goto err;
>>> diff --git a/criu/include/sockets.h b/criu/include/sockets.h
>>> index 3fa8017..1bd5c67 100644
>>> --- a/criu/include/sockets.h
>>> +++ b/criu/include/sockets.h
>>> @@ -38,6 +38,8 @@ extern int collect_sockets(struct ns_id *);
>>>  extern struct collect_image_info inet_sk_cinfo;
>>>  extern struct collect_image_info unix_sk_cinfo;
>>>  extern int fix_external_unix_sockets(void);
>>> +extern int prepare_scms(void);
>>> +extern int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids);
>>>  
>>>  extern struct collect_image_info netlink_sk_cinfo;
>>>  
>>> diff --git a/criu/sk-queue.c b/criu/sk-queue.c
>>> index 77e203e..953db66 100644
>>> --- a/criu/sk-queue.c
>>> +++ b/criu/sk-queue.c
>>> @@ -18,9 +18,9 @@
>>>  #include "util.h"
>>>  #include "util-pie.h"
>>>  #include "sockets.h"
>>> -
>>> +#include "xmalloc.h"
>>>  #include "sk-queue.h"
>>> -
>>> +#include "files.h"
>>>  #include "protobuf.h"
>>>  #include "images/sk-packet.pb-c.h"
>>>  
>>> @@ -28,6 +28,8 @@ struct sk_packet {
>>>  	struct list_head	list;
>>>  	SkPacketEntry		*entry;
>>>  	char        		*data;
>>> +	unsigned		scm_len;
>>> +	int			*scm;
>>>  };
>>>  
>>>  static LIST_HEAD(packets_list);
>>> @@ -37,12 +39,22 @@ static int collect_one_packet(void *obj, ProtobufCMessage *msg, struct cr_img *i
>>>  	struct sk_packet *pkt = obj;
>>>  
>>>  	pkt->entry = pb_msg(msg, SkPacketEntry);
>>> -
>>> +	pkt->scm = NULL;
>>>  	pkt->data = xmalloc(pkt->entry->length);
>>>  	if (pkt->data ==NULL)
>>>  		return -1;
>>>  
>>>  	/*
>>> +	 * See dump_packet_cmsg() -- only SCM_RIGHTS are supported and
>>> +	 * only 1 of that kind is possible, thus not more than 1 SCMs
>>> +	 * on a packet.
>>> +	 */
>>> +	if (pkt->entry->n_scm > 1) {
>>> +		pr_err("More than 1 SCM is not possible\n");
>>> +		return -1;
>>> +	}
>>> +
>>> +	/*
>>>  	 * NOTE: packet must be added to the tail. Otherwise sequence
>>>  	 * will be broken.
>>>  	 */
>>> @@ -64,6 +76,50 @@ struct collect_image_info sk_queues_cinfo = {
>>>  	.collect = collect_one_packet,
>>>  };
>>>  
>>> +static int dump_scm_rights(struct cmsghdr *ch, SkPacketEntry *pe)
>>> +{
>>> +	int nr_fds, *fds, i;
>>> +	void *buf;
>>> +	ScmEntry *scme;
>>> +
>>> +	nr_fds = (ch->cmsg_len - sizeof(*ch)) / sizeof(int);
>>> +	fds = (int *)CMSG_DATA(ch);
>>> +
>>> +	buf = xmalloc(sizeof(ScmEntry) + nr_fds * sizeof(uint32_t));
>>> +	if (!buf)
>>> +		return -1;
>>> +
>>> +	scme = xptr_pull(&buf, ScmEntry);
>>> +	scm_entry__init(scme);
>>> +	scme->type = SCM_RIGHTS;
>>> +	scme->n_rights = nr_fds;
>>> +	scme->rights = xptr_pull_s(&buf, nr_fds * sizeof(uint32_t));
>>> +
>>> +	for (i = 0; i < nr_fds; i++) {
>>> +		int ftyp;
>>> +
>>> +		if (dump_my_file(fds[i], &scme->rights[i], &ftyp))
>>> +			return -1;
>>> +
>>> +		/*
>>> +		 * Unix sent over Unix or Epoll with some other sh*t
>>> +		 * sent over unix (maybe with this very unix polled)
>>> +		 * are tricky and not supported for now. (XXX -- todo)
>>> +		 */
>>> +		if (ftyp == FD_TYPES__UNIXSK || ftyp == FD_TYPES__EVENTPOLL) {
>>> +			pr_err("Can't dump send %d (unix/epoll) fd\n", ftyp);
>>> +			return -1;
>>> +		}
>>> +	}
>>> +
>>> +	i = pe->n_scm++;
>>> +	if (xrealloc_safe(&pe->scm, pe->n_scm * sizeof(ScmEntry*)))
>>> +		return -1;
>>> +
>>> +	pe->scm[i] = scme;
>>> +	return 0;
>>> +}
>>> +
>>>  /*
>>>   * Maximum size of the control messages. XXX -- is there any
>>>   * way to get this value out of the kernel?
>>> @@ -73,8 +129,26 @@ struct collect_image_info sk_queues_cinfo = {
>>>  static int dump_packet_cmsg(struct msghdr *mh, SkPacketEntry *pe)
>>>  {
>>>  	struct cmsghdr *ch;
>>> +	int n_rights = 0;
>>>  
>>>  	for (ch = CMSG_FIRSTHDR(mh); ch; ch = CMSG_NXTHDR(mh, ch)) {
>>> +		if (ch->cmsg_type == SCM_RIGHTS) {
>>> +			if (n_rights) {
>>> +				/*
>>> +				 * Even if user is sending more than one cmsg with
>>> +				 * rights, kernel merges them alltogether on recv.
>>> +				 */
>>> +				pr_err("Unexpected 2nd SCM_RIGHTS from the kernel\n");
>>> +				return -1;
>>> +			}
>>> +
>>> +			if (dump_scm_rights(ch, pe))
>>> +				return -1;
>>> +
>>> +			n_rights++;
>>> +			continue;
>>> +		}
>>> +
>>>  		pr_err("Control messages in queue, not supported\n");
>>>  		return -1;
>>>  	}
>>> @@ -82,6 +156,18 @@ static int dump_packet_cmsg(struct msghdr *mh, SkPacketEntry *pe)
>>>  	return 0;
>>>  }
>>>  
>>> +static void release_cmsg(SkPacketEntry *pe)
>>> +{
>>> +	int i;
>>> +
>>> +	for (i = 0; i < pe->n_scm; i++)
>>> +		xfree(pe->scm[i]);
>>> +	xfree(pe->scm);
>>> +
>>> +	pe->n_scm = 0;
>>> +	pe->scm = NULL;
>>> +}
>>> +
>>>  int dump_sk_queue(int sock_fd, int sock_id)
>>>  {
>>>  	SkPacketEntry pe = SK_PACKET_ENTRY__INIT;
>>> @@ -181,6 +267,9 @@ int dump_sk_queue(int sock_fd, int sock_id)
>>>  			ret = -EIO;
>>>  			goto err_set_sock;
>>>  		}
>>> +
>>> +		if (pe.scm)
>>> +			release_cmsg(&pe);
>>>  	}
>>>  	ret = 0;
>>>  
>>> @@ -197,6 +286,20 @@ err_brk:
>>>  	return ret;
>>>  }
>>>  
>>> +static void close_scm_fds(struct sk_packet *pkt)
>>> +{
>>> +	int i, *fds, n_fds;
>>> +	struct cmsghdr *ch = (struct cmsghdr *)pkt->scm;
>>> +
>>> +	fds = (int *)CMSG_DATA(ch);
>>> +	n_fds = (ch->cmsg_len - sizeof(struct cmsghdr)) / sizeof(int);
>>> +
>>> +	for (i = 0; i < n_fds; i++) {
>>> +		if (close(fds[i]))
>>> +			pr_warn("scm: Error closing sent fd\n");
>>> +	}
>>> +}
>>> +
>>>  static int send_one_pkt(int fd, struct sk_packet *pkt)
>>>  {
>>>  	int ret;
>>> @@ -209,6 +312,11 @@ static int send_one_pkt(int fd, struct sk_packet *pkt)
>>>  	iov.iov_base = pkt->data;
>>>  	iov.iov_len = entry->length;
>>>  
>>> +	if (pkt->scm != NULL) {
>>> +		mh.msg_controllen = pkt->scm_len;
>>> +		mh.msg_control = pkt->scm;
>>> +	}
>>> +
>>>  	/*
>>>  	 * Don't try to use sendfile here, because it use sendpage() and
>>>  	 * all data are split on pages and a new skb is allocated for
>>> @@ -229,6 +337,9 @@ static int send_one_pkt(int fd, struct sk_packet *pkt)
>>>  		return -1;
>>>  	}
>>>  
>>> +	if (pkt->scm != NULL)
>>> +		close_scm_fds(pkt);
>>> +
>>>  	return 0;
>>>  }
>>>  
>>> @@ -264,3 +375,43 @@ int restore_sk_queue(int fd, unsigned int peer_id)
>>>  out:
>>>  	return ret;
>>>  }
>>
>> The previous patches don't make restore_sk_queue() find a fle,
>> which fd is used to set next packet in queue. This series does
>> not support multi-queuer queues?
> 
> No. You're talking about "restore sender address" patches :) I remember that,
> and that's a separate issue. Right now if we have several sockets sending
> data to one, on restore we pick the first peer and make it restore the whole
> queue.
> 
>> It seems, we don't need "unix: Split resolv and interconnect (v2)",
>> it's just need to select a correct fd from task fles, and to use
>> it in restore_sk_queue().
> 
> We do. Inside unix_note_scm_rights() I need to find a queuer for a queue
> and this lookup is only possible after the resolve and interconnect split
> I've made before.
> 
>>> +
>>> +int prepare_scms(void)
>>> +{
>>> +	struct sk_packet *pkt;
>>> +
>>> +	pr_info("Preparing SCMs\n");
>>> +	list_for_each_entry(pkt, &packets_list, list) {
>>> +		SkPacketEntry *pe = pkt->entry;
>>> +		ScmEntry *se;
>>> +		struct cmsghdr *ch;
>>> +
>>> +		if (!pe->n_scm)
>>> +			continue;
>>> +
>>> +		se = pe->scm[0]; /* Only 1 SCM is possible */
>>> +
>>> +		if (se->type == SCM_RIGHTS) {
>>> +			pkt->scm_len = CMSG_SPACE(se->n_rights * sizeof(int));
>>> +			pkt->scm = xmalloc(pkt->scm_len);
>>> +			if (!pkt->scm)
>>> +				return -1;
>>> +
>>> +			ch = (struct cmsghdr *)pkt->scm; /* FIXME -- via msghdr */
>>> +			ch->cmsg_level = SOL_SOCKET;
>>> +			ch->cmsg_type = SCM_RIGHTS;
>>> +			ch->cmsg_len = CMSG_LEN(se->n_rights * sizeof(int));
>>> +
>>> +			if (unix_note_scm_rights(pe->id_for, se->rights,
>>> +						(int *)CMSG_DATA(ch), se->n_rights))
>>> +				return -1;
>>> +
>>> +			continue;
>>> +		}
>>> +
>>> +		pr_err("Unsupported scm %d in image\n", se->type);
>>> +		return -1;
>>> +	}
>>> +
>>> +	return 0;
>>> +}
>>> diff --git a/criu/sk-unix.c b/criu/sk-unix.c
>>> index 42ce1bb..165adc9 100644
>>> --- a/criu/sk-unix.c
>>> +++ b/criu/sk-unix.c
>>> @@ -798,6 +798,7 @@ struct unix_sk_info {
>>>  	struct file_desc d;
>>>  	struct list_head connected; /* List of sockets, connected to me */
>>>  	struct list_head node; /* To link in peer's connected list  */
>>> +	struct list_head scm_fles;
>>>  
>>>  	/*
>>>  	 * For DGRAM sockets with queues, we should only restore the queue
>>> @@ -809,6 +810,11 @@ struct unix_sk_info {
>>>  	u8 listen:1;
>>>  };
>>>  
>>> +struct scm_fle {
>>> +	struct list_head l;
>>> +	struct fdinfo_list_entry *fle;
>>> +};
>>> +
>>>  #define USK_PAIR_MASTER		0x1
>>>  #define USK_PAIR_SLAVE		0x2
>>>  
>>> @@ -824,6 +830,116 @@ static struct unix_sk_info *find_unix_sk_by_ino(int ino)
>>>  	return NULL;
>>>  }
>>>  
>>> +static struct unix_sk_info *find_queuer_for(int id)
>>> +{
>>> +	struct unix_sk_info *ui;
>>> +
>>> +	list_for_each_entry(ui, &unix_sockets, list) {
>>> +		if (ui->queuer == id)
>>> +			return ui;
>>> +	}
>>> +
>>> +	return NULL;
>>> +}
>>> +
>>> +int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids)
>>> +{
>>> +	struct unix_sk_info *ui;
>>> +	struct pstree_item *owner;
>>> +	int i;
>>> +
>>> +	ui = find_queuer_for(id_for);
>>> +	if (!ui) {
>>> +		pr_err("Can't find sender for %d\n", id_for);
>>> +		return -1;
>>> +	}
>>> +
>>> +	pr_info("Found queuer for %d -> %d\n", id_for, ui->ue->id);
>>> +	/*
>>> +	 * This is the task that will restore this socket
>>> +	 */
>>> +	owner = file_master(&ui->d)->task;
>>> +
>>> +	pr_info("-> will set up deps\n");
>>> +	/*
>>> +	 * The ui will send data to the rights receiver. Add a fake fle
>>> +	 * for the file and a dependency.
>>> +	 */
>>> +	for (i = 0; i < n_ids; i++) {
>>> +		struct file_desc *tgt;
>>> +		struct fdinfo_list_entry *fle;
>>> +		struct scm_fle *sfle;
>>> +		FdinfoEntry *e;
>>> +		int fd;
>>> +
>>> +		tgt = find_file_desc_raw(FD_TYPES__UND, file_ids[i]);
>>> +		if (!tgt) {
>>> +			pr_err("Can't find fdesc to send\n");
>>> +			return -1;
>>> +		}
>>> +
>>> +		pr_info("`- Found %d file\n", file_ids[i]);
>>> +		fd = find_unused_fd(owner, -1);
>>
>> If I haven't missed something, I don't see, where you try
>> to found the fle in task's file descriptor  to use it for
>> queuing. It seems, we add a fake fle for every packet in
>> a queue. If it's so, it's not performance-good, and firstly
>> we should try to check, that task already owns the fle instead
>> of that.
> 
> Yes, I made this deliberately. Adding one more descriptor for a file
> will take us several syscalls more. Fortunately queues with scms will
> not be a frequent case.

Your series restores queues with file descriptors in scm, and one of
the queuers is used to do that. The task of this queuer obtains additional
file descriptors. Right? If so, why don't we analyze the task's
descriptors to see whether it already has the one we add? That's what
I'm asking about, and it seems not to require extra file descriptors
or system calls (it's possible to just analyze fd identifiers). I'm not
sure we understood each other on this question %) Do you mean the same?
 
>>> +
>>> +		fle = try_file_master(tgt);
>>> +		if (fle) {
>>> +			e = dup_fdinfo(fle->fe, fd, 0);
>>> +			if (!e) {
>>> +				pr_err("Can't duplicate fdinfo for scm\n");
>>> +				return -1;
>>> +			}
>>> +		} else {
>>> +			/*
>>> +			 * This can happen if the file in question is
>>> +			 * sent over the socket and closed. In this case
>>> +			 * we need to ... invent a new one!
>>> +			 */
>>> +
>>> +			e = xmalloc(sizeof(*e));
>>> +			if (!e)
>>> +				return -1;
>>> +
>>> +			fdinfo_entry__init(e);
>>> +			e->id = tgt->id;
>>> +			e->type = tgt->ops->type;
>>> +			e->fd = fd;
>>> +			e->flags = 0;
>>
>> I'd add the code around fdinfo_entry__init() in separate function
>> as we already have 3 places like it.
> 
> OK, will do. What are the others?

I mean the places where fdinfo_entry__init() is used:
$ git grep fdinfo_entry__init
criu/files.c:   fdinfo_entry__init(e);
criu/files.c:   fdinfo_entry__init(e);
criu/sk-unix.c:                 fdinfo_entry__init(e);

Something like

e = create_fdinfo_entry(tgt->id, tgt->ops->type, fd, 0, false);
if (!e)
	return -1;

"false" is to use shmem or not.

> 
>>> +		}
>>> +
>>> +		pr_info("scm: add %d -> %d.fd[%d]\n", tgt->id, vpid(owner), fd);
>>> +		sfle = xmalloc(sizeof(*sfle));
>>> +		if (!sfle)
>>> +			return -1;
>>> +
>>> +		sfle->fle = collect_fd_to(vpid(owner), e, rsti(owner), tgt, false);
>>> +		if (!sfle->fle) {
>>> +			pr_err("Can't request new fle for scm\n");
>>> +			return -1;
>>> +		}
>>> +
>>> +		list_add_tail(&sfle->l, &ui->scm_fles);
>>> +		fds[i] = fd;
>>> +	}
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static int chk_restored_scms(struct unix_sk_info *ui)
>>> +{
>>> +	struct scm_fle *sf, *n;
>>> +
>>> +	list_for_each_entry_safe(sf, n, &ui->scm_fles, l) {
>>> +		if (sf->fle->stage != FLE_RESTORED)
>>> +			return 1;
>>
>> It's need to wait for sf->fle->bound here instead. 
> 
> Why?

Then what are scms? If these are the files contained in the queue
messages, we just need to wait till they are open, don't we?
 
>> I suppose, the above introduces some circular dependencies for cases
>> with several sockets having epoll, which is listening them.
> 
> Nope :) On dump I explicitly ban unix sockets and epolls sent via unix
> sockets. So we can only wait here for some other files, that cannot wait us.
> 
>>> +
>>> +		/* Optimization for the next pass */
>>> +		list_del(&sf->l);
>>> +		xfree(sf);
>>> +	}
>>> +
>>> +	return 0;
>>> +}
>>> +
>>>  static int wake_connected_sockets(struct unix_sk_info *ui)
>>>  {
>>>  	struct fdinfo_list_entry *fle;
>>> @@ -1322,12 +1438,18 @@ static int open_unix_sk(struct file_desc *d, int *new_fd)
>>>  	struct unix_sk_info *ui;
>>>  	int ret;
>>>  
>>> +	ui = container_of(d, struct unix_sk_info, d);
>>> +
>>> +	/* FIXME -- only queue restore may be postponed */
>>> +	if (chk_restored_scms(ui)) {
>>> +		pr_info("scm: Wait for tgt to restore\n");
>>> +		return 1;
>>> +	}
>>
>> It's too strong limitation, we delay another tasks here.
> 
> Another tasks? No, we just ask files restore loop to go and try to open
> some other files.

But other tasks may be waiting for the file descriptors you're serving out.
Those tasks will be delayed.
 
>> We may open the socket and to serve it out, and we do
>> not need wait queuers for these actions.
> 
> Not always. There are several cases when we do socketpair() then restore
> a queue then close one of the ends immediately.

Even if the second end is closed, other tasks may want this
slave fle, and their eventpolls may depend on it. I think
we should wait only at the moments when we must wait, and not
do that in other cases.
 
>> It's better to wait for the master fle of packet right
>> before we restore the queue, and to use it from our
>> task fle list.
> 
> Master may sit in some other's list.
> 
>>> +
>>>  	fle = file_master(d);
>>>  	if (fle->stage >= FLE_OPEN)
>>>  		return post_open_unix_sk(d, fle->fe->fd);
>>>  
>>> -	ui = container_of(d, struct unix_sk_info, d);
>>> -
>>>  	if (inherited_fd(d, new_fd)) {
>>>  		ui->ue->uflags |= USK_INHERIT;
>>>  		ret = *new_fd >= 0 ? 0 : -1;
>>> @@ -1440,6 +1562,7 @@ static int collect_one_unixsk(void *o, ProtobufCMessage *base, struct cr_img *i)
>>>  	ui->listen = 0;
>>>  	INIT_LIST_HEAD(&ui->connected);
>>>  	INIT_LIST_HEAD(&ui->node);
>>> +	INIT_LIST_HEAD(&ui->scm_fles);
>>>  	ui->flags = 0;
>>>  	fixup_sock_net_ns_id(&ui->ue->ns_id, &ui->ue->has_ns_id);
>>>  
>>> diff --git a/images/sk-packet.proto b/images/sk-packet.proto
>>> index 27b48e4..009b461 100644
>>> --- a/images/sk-packet.proto
>>> +++ b/images/sk-packet.proto
>>> @@ -1,8 +1,14 @@
>>>  syntax = "proto2";
>>>  
>>> +message scm_entry {
>>> +	required uint32			type		= 1;
>>> +	repeated uint32			rights		= 2;
>>> +}
>>> +
>>>  message sk_packet_entry {
>>>  	required uint32		id_for		= 1;
>>>  	required uint32		length		= 2;
>>>  	// optional bytes		addr	= 3;
>>>  	// optional sk_ucred_entry	ucred	= 128;
>>> +	repeated scm_entry	scm		= 4;
>>>  }
>>> -- 
>>> 2.1.4
>>>
>>> _______________________________________________
>>> CRIU mailing list
>>> CRIU@openvz.org
>>> https://lists.openvz.org/mailman/listinfo/criu
>> .
>>
>
Pavel Emelyanov July 12, 2017, 12:10 p.m.
>>>> +	for (i = 0; i < n_ids; i++) {
>>>> +		struct file_desc *tgt;
>>>> +		struct fdinfo_list_entry *fle;
>>>> +		struct scm_fle *sfle;
>>>> +		FdinfoEntry *e;
>>>> +		int fd;
>>>> +
>>>> +		tgt = find_file_desc_raw(FD_TYPES__UND, file_ids[i]);
>>>> +		if (!tgt) {
>>>> +			pr_err("Can't find fdesc to send\n");
>>>> +			return -1;
>>>> +		}
>>>> +
>>>> +		pr_info("`- Found %d file\n", file_ids[i]);
>>>> +		fd = find_unused_fd(owner, -1);
>>>
>>> If I haven't missed something, I don't see, where you try
>>> to found the fle in task's file descriptor  to use it for
>>> queuing. It seems, we add a fake fle for every packet in
>>> a queue. If it's so, it's not performance-good, and firstly
>>> we should try to check, that task already owns the fle instead
>>> of that.
>>
>> Yes, I made this deliberately. Adding one more descriptor for a file
>> will take us several syscalls more. Fortunately queues with scms will
>> not be a frequent case.
> 
> Your series restore queues with file descriptors in scm, and one of
> the queuers is used to to that. Task of this queuer obtains additional
> file descriptors. Right?

Right. The task owning the master fle of the queuer does so.

> If so, why we don't try to analyze the task's
> descriptors, if it already has the one, we add? 

We can. In that case we'll need to add a flag on the fle, saying that
this fle can be closed by scm-sending code. I thought about it, but
wanted to make the first version simpler.

> I'm asking about that,
> and this seems to not require excess file descriptors or system calls
> (it's possible to just analyze fd identifiers). I'm not sure we understood
> each other in question %) Do you mean the same?

You're right. If a task that is to send a descriptor already owns the
relevant fle, we can just reuse that. I wanted to do it incrementally,
as this would require more changes and would complicate maintainers'
review :P

At the same time, I don't expect tons of scm-rights messages and think
that saving a couple of syscalls and one descriptor might not be worth the
effort. But ... anyway.

>>>> +
>>>> +		fle = try_file_master(tgt);
>>>> +		if (fle) {
>>>> +			e = dup_fdinfo(fle->fe, fd, 0);
>>>> +			if (!e) {
>>>> +				pr_err("Can't duplicate fdinfo for scm\n");
>>>> +				return -1;
>>>> +			}
>>>> +		} else {
>>>> +			/*
>>>> +			 * This can happen if the file in question is
>>>> +			 * sent over the socket and closed. In this case
>>>> +			 * we need to ... invent a new one!
>>>> +			 */
>>>> +
>>>> +			e = xmalloc(sizeof(*e));
>>>> +			if (!e)
>>>> +				return -1;
>>>> +
>>>> +			fdinfo_entry__init(e);
>>>> +			e->id = tgt->id;
>>>> +			e->type = tgt->ops->type;
>>>> +			e->fd = fd;
>>>> +			e->flags = 0;
>>>
>>> I'd add the code around fdinfo_entry__init() in separate function
>>> as we already have 3 places like it.
>>
>> OK, will do. What are the others?
> 
> I mean the places, where fdinfo_entry__init() is used:
> $ git grep fdinfo_entry__init
> criu/files.c:   fdinfo_entry__init(e);
> criu/files.c:   fdinfo_entry__init(e);
> criu/sk-unix.c:                 fdinfo_entry__init(e);
> 
> Something like
> 
> e = create_fdinfo_entry(tgt->id, tgt->ops->type, fd, 0, false);
> if (!e)
> 	return -1;
> 
> "false" is to use shmem or not.

Ah, I see. So the options we have now are having N lines of assignments vs
having a routine with M arguments. I would choose the 2nd only if M < N,
but your proposal is 4 assignments vs 5 arguments.

>>>> +		}
>>>> +
>>>> +		pr_info("scm: add %d -> %d.fd[%d]\n", tgt->id, vpid(owner), fd);
>>>> +		sfle = xmalloc(sizeof(*sfle));
>>>> +		if (!sfle)
>>>> +			return -1;
>>>> +
>>>> +		sfle->fle = collect_fd_to(vpid(owner), e, rsti(owner), tgt, false);
>>>> +		if (!sfle->fle) {
>>>> +			pr_err("Can't request new fle for scm\n");
>>>> +			return -1;
>>>> +		}
>>>> +
>>>> +		list_add_tail(&sfle->l, &ui->scm_fles);
>>>> +		fds[i] = fd;
>>>> +	}
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +static int chk_restored_scms(struct unix_sk_info *ui)
>>>> +{
>>>> +	struct scm_fle *sf, *n;
>>>> +
>>>> +	list_for_each_entry_safe(sf, n, &ui->scm_fles, l) {
>>>> +		if (sf->fle->stage != FLE_RESTORED)
>>>> +			return 1;
>>>
>>> It's need to wait for sf->fle->bound here instead. 
>>
>> Why?
> 
> Then, what is scms? If these are the files, which contain in the
> queue messages, we just need to wait till they open, don't we?

Here we wait for a file descriptor to appear in the task's fdtable, so that
we can reference it. Do you want to say that waiting for FLE_OPEN is
enough? I need to make sure that fle->fd does reference the correct file,
in other words, that the file has been put into the proper descriptor already.
Is that so? If yes, I will fix this place, but what's the difference between
FLE_OPEN and FLE_RESTORED then?

>>> I suppose, the above introduces some circular dependencies for cases
>>> with several sockets having epoll, which is listening them.
>>
>> Nope :) On dump I explicitly ban unix sockets and epolls sent via unix
>> sockets. So we can only wait here for some other files, that cannot wait us.
>>
>>>> +
>>>> +		/* Optimization for the next pass */
>>>> +		list_del(&sf->l);
>>>> +		xfree(sf);
>>>> +	}
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>>  static int wake_connected_sockets(struct unix_sk_info *ui)
>>>>  {
>>>>  	struct fdinfo_list_entry *fle;
>>>> @@ -1322,12 +1438,18 @@ static int open_unix_sk(struct file_desc *d, int *new_fd)
>>>>  	struct unix_sk_info *ui;
>>>>  	int ret;
>>>>  
>>>> +	ui = container_of(d, struct unix_sk_info, d);
>>>> +
>>>> +	/* FIXME -- only queue restore may be postponed */
>>>> +	if (chk_restored_scms(ui)) {
>>>> +		pr_info("scm: Wait for tgt to restore\n");
>>>> +		return 1;
>>>> +	}
>>>
>>> It's too strong limitation, we delay another tasks here.
>>
>> Another tasks? No, we just ask files restore loop to go and try to open
>> some other files.
> 
> But another tasks may wait file descriptors you're serving out.
> This tasks will be delayed.

Mm... Yes, it will. Isn't that the cost of synchronization anyway? The FIXME
comment above it is exactly about that -- complicating the code for the sake
of (hopefully better) performance.

>>> We may open the socket and to serve it out, and we do
>>> not need wait queuers for these actions.
>>
>> Not always. There are several cases when we do socketpair() then restore
>> a queue then close one of the ends immediately.
> 
> Even if the second end is closed, anither tasks may want this
> slave fle, and their eventpolls may depend on it.
> I think, we should wait only in the moments, when we must wait, and do
> not do that in another cases.

This is valid, but "must" is to be replaced with "should" :)
Is anything broken with the current approach? Or is it just suboptimal?

-- Pavel
Kirill Tkhai July 12, 2017, 1:04 p.m.
On 12.07.2017 15:10, Pavel Emelyanov wrote:
> 
>>>>> +	for (i = 0; i < n_ids; i++) {
>>>>> +		struct file_desc *tgt;
>>>>> +		struct fdinfo_list_entry *fle;
>>>>> +		struct scm_fle *sfle;
>>>>> +		FdinfoEntry *e;
>>>>> +		int fd;
>>>>> +
>>>>> +		tgt = find_file_desc_raw(FD_TYPES__UND, file_ids[i]);
>>>>> +		if (!tgt) {
>>>>> +			pr_err("Can't find fdesc to send\n");
>>>>> +			return -1;
>>>>> +		}
>>>>> +
>>>>> +		pr_info("`- Found %d file\n", file_ids[i]);
>>>>> +		fd = find_unused_fd(owner, -1);
>>>>
>>>> If I haven't missed something, I don't see, where you try
>>>> to found the fle in task's file descriptor  to use it for
>>>> queuing. It seems, we add a fake fle for every packet in
>>>> a queue. If it's so, it's not performance-good, and firstly
>>>> we should try to check, that task already owns the fle instead
>>>> of that.
>>>
>>> Yes, I made this deliberately. Adding one more descriptor for a file
>>> will take us several syscalls more. Fortunately queues with scms will
>>> not be a frequent case.
>>
>> Your series restore queues with file descriptors in scm, and one of
>> the queuers is used to to that. Task of this queuer obtains additional
>> file descriptors. Right?
> 
> Right. Task of the master fle of the queuer does so.
> 
>> If so, why we don't try to analyze the task's
>> descriptors, if it already has the one, we add? 
> 
> We can. In that case we'll need to add a flag on the fle, saying that
> this fle can be closed by scm-sending code. I thought about it, but
> wanted to make the first version simpler.

Hm, I skipped this on the first look, but you introduce a duplicate of
generic code in close_scm_fds(). Why can't a new scm be added as
a fake fle? If so, the generic code will close it like other things of
that sort.

So the algorithm may be much simpler. If the task has a fle, use it.
If the task has none, add a fle with the fake flag, and the generic code
will close it later.

I don't think it's good to duplicate generic code.
I see your comment below vvvv, but it does not seem to suppose
deleting close_scm_fds() later.
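
To illustrate (a sketch only: it assumes the last argument of collect_fd_to()
is exactly that fake flag), the scm entry would simply be collected as fake,
and no dedicated close pass would be needed:

	/* Mark the scm entry as fake, so the generic files engine
	 * closes it after restore instead of close_scm_fds() */
	sfle->fle = collect_fd_to(vpid(owner), e, rsti(owner), tgt, true);
	if (!sfle->fle) {
		pr_err("Can't request new fle for scm\n");
		return -1;
	}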
 
>> I'm asking about that,
>> and this seems to not require excess file descriptors or system calls
>> (it's possible to just analyze fd identifiers). I'm not sure we understood
>> each other in question %) Do you mean the same?
> 
> You're right. If a task that is to send a descriptor already owns the
> relevant fle, we can just reuse that. I wanted to do it incrementally,
> as this would require more changes and would complicate maintainers'
> review :P
> 
> At the same time, I don't expect tons of scm-rights messages and think
> that saving a couple of syscalls and one descriptors might not worth the 
> efforts. But ... anyway.

This is the comment I referred to ^^^^

>>>>> +
>>>>> +		fle = try_file_master(tgt);
>>>>> +		if (fle) {
>>>>> +			e = dup_fdinfo(fle->fe, fd, 0);
>>>>> +			if (!e) {
>>>>> +				pr_err("Can't duplicate fdinfo for scm\n");
>>>>> +				return -1;
>>>>> +			}
>>>>> +		} else {
>>>>> +			/*
>>>>> +			 * This can happen if the file in question is
>>>>> +			 * sent over the socket and closed. In this case
>>>>> +			 * we need to ... invent a new one!
>>>>> +			 */
>>>>> +
>>>>> +			e = xmalloc(sizeof(*e));
>>>>> +			if (!e)
>>>>> +				return -1;
>>>>> +
>>>>> +			fdinfo_entry__init(e);
>>>>> +			e->id = tgt->id;
>>>>> +			e->type = tgt->ops->type;
>>>>> +			e->fd = fd;
>>>>> +			e->flags = 0;
>>>>
>>>> I'd add the code around fdinfo_entry__init() in separate function
>>>> as we already have 3 places like it.
>>>
>>> OK, will do. What are the others?
>>
>> I mean the places, where fdinfo_entry__init() is used:
>> $ git grep fdinfo_entry__init
>> criu/files.c:   fdinfo_entry__init(e);
>> criu/files.c:   fdinfo_entry__init(e);
>> criu/sk-unix.c:                 fdinfo_entry__init(e);
>>
>> Something like
>>
>> e = create_fdinfo_entry(tgt->id, tgt->ops->type, fd, 0, false);
>> if (!e)
>> 	return -1;
>>
>> "false" is to use shmem or not.
> 
> Ah, I see. So the options we have now is having N lines of assignments vs
> having a routine with M arguments. I would chose the 2nd only if M < N,
> but your proposal is 4 assignments vs 5 arguments.
> 
>>>>> +		}
>>>>> +
>>>>> +		pr_info("scm: add %d -> %d.fd[%d]\n", tgt->id, vpid(owner), fd);
>>>>> +		sfle = xmalloc(sizeof(*sfle));
>>>>> +		if (!sfle)
>>>>> +			return -1;
>>>>> +
>>>>> +		sfle->fle = collect_fd_to(vpid(owner), e, rsti(owner), tgt, false);
>>>>> +		if (!sfle->fle) {
>>>>> +			pr_err("Can't request new fle for scm\n");
>>>>> +			return -1;
>>>>> +		}
>>>>> +
>>>>> +		list_add_tail(&sfle->l, &ui->scm_fles);
>>>>> +		fds[i] = fd;
>>>>> +	}
>>>>> +
>>>>> +	return 0;
>>>>> +}
>>>>> +
>>>>> +static int chk_restored_scms(struct unix_sk_info *ui)
>>>>> +{
>>>>> +	struct scm_fle *sf, *n;
>>>>> +
>>>>> +	list_for_each_entry_safe(sf, n, &ui->scm_fles, l) {
>>>>> +		if (sf->fle->stage != FLE_RESTORED)
>>>>> +			return 1;
>>>>
>>>> It's need to wait for sf->fle->bound here instead. 
>>>
>>> Why?
>>
>> Then, what is scms? If these are the files, which contain in the
>> queue messages, we just need to wait till they open, don't we?
> 
> Here we wait for a file descriptor to appear in the task's fdtable, so that
> we could reference one. Do you want to say that waiting for FLE_OPEN is
> enough? I need make sure that fle->fd does reference the correct files, 
> in other words, file has been put into proper descriptor already. Is that
> so? If yes, I will fix this place, but what's the difference between
> FLE_OPEN and FLE_RESTORED then?

FLE_OPEN means "the file is just open, but its type-specific part is not configured yet".
It means the file is right after the open() syscall or socket() syscall etc.
It seems to be enough, as long as unix sockets do not transfer file attributes
in addition to the flags we set in setup_and_serve_out::fcntl(fle->flags).
If they did, we would have to extend setup_and_serve_out() for this case, but I don't
think such attributes exist in Linux: the whole generic part is done
in setup_and_serve_out(), and if such attributes existed, this function would
contain them.
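
So the check could be relaxed to something like the following sketch (it
assumes the stages are ordered FLE_INITIALIZED < FLE_OPEN < FLE_RESTORED,
so "< FLE_OPEN" means the descriptor is not in the owner's fdtable yet):

static int chk_restored_scms(struct unix_sk_info *ui)
{
	struct scm_fle *sf, *n;

	list_for_each_entry_safe(sf, n, &ui->scm_fles, l) {
		/* FLE_OPEN is enough: the fd already sits in the
		 * owner's fdtable, only type-specific setup is left */
		if (sf->fle->stage < FLE_OPEN)
			return 1;

		/* Optimization for the next pass */
		list_del(&sf->l);
		xfree(sf);
	}

	return 0;
}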
 
>>>> I suppose, the above introduces some circular dependencies for cases
>>>> with several sockets having epoll, which is listening them.
>>>
>>> Nope :) On dump I explicitly ban unix sockets and epolls sent via unix
>>> sockets. So we can only wait here for some other files, that cannot wait us.
>>>
>>>>> +
>>>>> +		/* Optimization for the next pass */
>>>>> +		list_del(&sf->l);
>>>>> +		xfree(sf);
>>>>> +	}
>>>>> +
>>>>> +	return 0;
>>>>> +}
>>>>> +
>>>>>  static int wake_connected_sockets(struct unix_sk_info *ui)
>>>>>  {
>>>>>  	struct fdinfo_list_entry *fle;
>>>>> @@ -1322,12 +1438,18 @@ static int open_unix_sk(struct file_desc *d, int *new_fd)
>>>>>  	struct unix_sk_info *ui;
>>>>>  	int ret;
>>>>>  
>>>>> +	ui = container_of(d, struct unix_sk_info, d);
>>>>> +
>>>>> +	/* FIXME -- only queue restore may be postponed */
>>>>> +	if (chk_restored_scms(ui)) {
>>>>> +		pr_info("scm: Wait for tgt to restore\n");
>>>>> +		return 1;
>>>>> +	}
>>>>
>>>> It's too strong limitation, we delay another tasks here.
>>>
>>> Another tasks? No, we just ask files restore loop to go and try to open
>>> some other files.
>>
>> But another tasks may wait file descriptors you're serving out.
>> This tasks will be delayed.
> 
> Mm... Yes, it will. Isn't it the cost of synchronization anyway? The FIXME
> comment above it is exactly about it -- complicating the code for the sake
> of (hopefully) performance.

I don't think this will complicate the code. We moved to the asynchronous file
restore engine precisely to avoid "big criu locks". Now all dependencies
are as strong as they need to be, and no more. It does not seem good
to move away from this path.
 
>>>> We may open the socket and to serve it out, and we do
>>>> not need wait queuers for these actions.
>>>
>>> Not always. There are several cases when we do socketpair() then restore
>>> a queue then close one of the ends immediately.
>>
>> Even if the second end is closed, anither tasks may want this
>> slave fle, and their eventpolls may depend on it.
>> I think, we should wait only in the moments, when we must wait, and do
>> not do that in another cases.
> 
> This is valid, but must is to be replaced with should :)
> Is anything broken with the current approach? Or just suboptimal?

Yes, this is suboptimal, and it's really easy to avoid, I think.
It would be difficult to rework this later, if we found the limitations
so strong that they don't allow restoring some types of files.
Pavel Emelyanov July 12, 2017, 2:19 p.m.
>>> If so, why we don't try to analyze the task's
>>> descriptors, if it already has the one, we add? 
>>
>> We can. In that case we'll need to add a flag on the fle, saying that
>> this fle can be closed by scm-sending code. I thought about it, but
>> wanted to make the first version simpler.
> 
> Hm. I skip this in the first look, but you introduces a duplicate of
> generic code in close_scm_fds(). Why can't a new scm be added as
> a fake fle? If so, generic code will close it as another thing of such
> sort.
> 
> So, the algorithm may be much simplier. If task has a fle, use it.
> If task has no, add a fle with fake flag, and generic code will close
> it later.

Do we have this close-if-flag-is-set code already?

> I don't think, that it's good to duplicate a generic code.
> I see your comment below vvvv, but it seems it does not suppose to
> delete close_scm_fds() later.

It does. The flag you're talking about above is to be added (unless it's there already).

Another thing about it -- consider that you've added this flag to the generic code.
Calling close on the descriptor should happen strictly after the respective fle
has been sent over the unix socket with scm. How would the _generic_ code make
sure that has happened?

>>>>>> +		}
>>>>>> +
>>>>>> +		pr_info("scm: add %d -> %d.fd[%d]\n", tgt->id, vpid(owner), fd);
>>>>>> +		sfle = xmalloc(sizeof(*sfle));
>>>>>> +		if (!sfle)
>>>>>> +			return -1;
>>>>>> +
>>>>>> +		sfle->fle = collect_fd_to(vpid(owner), e, rsti(owner), tgt, false);
>>>>>> +		if (!sfle->fle) {
>>>>>> +			pr_err("Can't request new fle for scm\n");
>>>>>> +			return -1;
>>>>>> +		}
>>>>>> +
>>>>>> +		list_add_tail(&sfle->l, &ui->scm_fles);
>>>>>> +		fds[i] = fd;
>>>>>> +	}
>>>>>> +
>>>>>> +	return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static int chk_restored_scms(struct unix_sk_info *ui)
>>>>>> +{
>>>>>> +	struct scm_fle *sf, *n;
>>>>>> +
>>>>>> +	list_for_each_entry_safe(sf, n, &ui->scm_fles, l) {
>>>>>> +		if (sf->fle->stage != FLE_RESTORED)
>>>>>> +			return 1;
>>>>>
>>>>> It's need to wait for sf->fle->bound here instead. 
>>>>
>>>> Why?
>>>
>>> Then, what is scms? If these are the files, which contain in the
>>> queue messages, we just need to wait till they open, don't we?
>>
>> Here we wait for a file descriptor to appear in the task's fdtable, so that
>> we could reference one. Do you want to say that waiting for FLE_OPEN is
>> enough? I need make sure that fle->fd does reference the correct files, 
>> in other words, file has been put into proper descriptor already. Is that
>> so? If yes, I will fix this place, but what's the difference between
>> FLE_OPEN and FLE_RESTORED then?
> 
> FLE_OPEN means "file is just open, but it's type-specific part is not configured yet".
> It means the file is right after the open() syscall or socket() syscall etc.
> It seems to be enough, if unix sockets do not transfer file attributes
> in additional to flags we set in setup_and_serve_out::fcntl(fle->flags).
> If so, we should extent setup_and_serve_out() in this case, but I don't
> think such attributes exist in Linux: the whole generic part is made
> in setup_and_serve_out(), if the existed, this function would contained
> them.

OK, fair enough. Please send the patch with comments on the constants in
question. I'll update this piece in this set.

>>>>> I suppose, the above introduces some circular dependencies for cases
>>>>> with several sockets having epoll, which is listening them.
>>>>
>>>> Nope :) On dump I explicitly ban unix sockets and epolls sent via unix
>>>> sockets. So we can only wait here for some other files, that cannot wait us.
>>>>
>>>>>> +
>>>>>> +		/* Optimization for the next pass */
>>>>>> +		list_del(&sf->l);
>>>>>> +		xfree(sf);
>>>>>> +	}
>>>>>> +
>>>>>> +	return 0;
>>>>>> +}
>>>>>> +
>>>>>>  static int wake_connected_sockets(struct unix_sk_info *ui)
>>>>>>  {
>>>>>>  	struct fdinfo_list_entry *fle;
>>>>>> @@ -1322,12 +1438,18 @@ static int open_unix_sk(struct file_desc *d, int *new_fd)
>>>>>>  	struct unix_sk_info *ui;
>>>>>>  	int ret;
>>>>>>  
>>>>>> +	ui = container_of(d, struct unix_sk_info, d);
>>>>>> +
>>>>>> +	/* FIXME -- only queue restore may be postponed */
>>>>>> +	if (chk_restored_scms(ui)) {
>>>>>> +		pr_info("scm: Wait for tgt to restore\n");
>>>>>> +		return 1;
>>>>>> +	}
>>>>>
>>>>> It's too strong limitation, we delay another tasks here.
>>>>
>>>> Another tasks? No, we just ask files restore loop to go and try to open
>>>> some other files.
>>>
>>> But another tasks may wait file descriptors you're serving out.
>>> This tasks will be delayed.
>>
>> Mm... Yes, it will. Isn't it the cost of synchronization anyway? The FIXME
>> comment above it is exactly about it -- complicating the code for the sake
>> of (hopefully) performance.
> 
> I don't think this will complicate the code. We moved to asynchronous file
> restore engine just to do not have "big criu locks".

There are no locks here with this if; it just reorders the sequence of fle->open
calls.

> Now all dependences
> are as strong as they are need, and not more. It seems to be not good
> to move away from this path.
>  
>>>>> We may open the socket and to serve it out, and we do
>>>>> not need wait queuers for these actions.
>>>>
>>>> Not always. There are several cases when we do socketpair() then restore
>>>> a queue then close one of the ends immediately.
>>>
>>> Even if the second end is closed, anither tasks may want this
>>> slave fle, and their eventpolls may depend on it.
>>> I think, we should wait only in the moments, when we must wait, and do
>>> not do that in another cases.
>>
>> This is valid, but must is to be replaced with should :)
>> Is anything broken with the current approach? Or just suboptimal?
> 
> Yes, this is suboptiomal, and it's really easy to avoid that, I think.

Actually it's not. As I said earlier, there are cases when you cannot restore
a socket without its queue and restore the queue later, so in these cases you
need to skip the whole restoration. For the cases when you can restore a socket
and postpone the queue restore, you may do the first step and then return 1,
requesting yet another ->open call. With this, only the unix ->open logic
becomes noticeably more complex.
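
To spell out what such a split would look like (a rough sketch; ui->sk,
make_unix_sk() and restore_unix_queue() are made up for illustration and
ui->sk is assumed to start at -1; only the return-code convention -- 1 means
"call ->open again later", 0 means done, negative means error -- is the one
the files-restore loop actually uses):

static int open_unix_sk(struct file_desc *d, int *new_fd)
{
	struct unix_sk_info *ui = container_of(d, struct unix_sk_info, d);

	/* Phase 1: create the socket once */
	if (ui->sk < 0) {
		ui->sk = make_unix_sk(ui);
		if (ui->sk < 0)
			return -1;
	}

	/* Phase 2: postpone until the files to be sent as SCM_RIGHTs exist */
	if (chk_restored_scms(ui))
		return 1;

	/* Phase 3: restore the queue and hand the descriptor back */
	if (restore_unix_queue(ui->sk, ui))
		return -1;

	*new_fd = ui->sk;
	return 0;
}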

> It would be difficult to rework that later, if we found the limitations
> as too strong that they don't allow to restore some type of files.

While I do agree with this in general, I see no point in splitting the unix sk
restore into open, postpone, restore the queue (optionally), postpone again, do
post_open. This doesn't seem to solve any real problem; it makes something (what?)
better for a really rare case and complicates the ->open logic for unix sockets.

-- Pavel
Kirill Tkhai July 12, 2017, 2:42 p.m.
On 12.07.2017 17:19, Pavel Emelyanov wrote:
> 
>>>> If so, why we don't try to analyze the task's
>>>> descriptors, if it already has the one, we add? 
>>>
>>> We can. In that case we'll need to add a flag on the fle, saying that
>>> this fle can be closed by scm-sending code. I thought about it, but
>>> wanted to make the first version simpler.
>>
>> Hm. I skip this in the first look, but you introduces a duplicate of
>> generic code in close_scm_fds(). Why can't a new scm be added as
>> a fake fle? If so, generic code will close it as another thing of such
>> sort.
>>
>> So, the algorithm may be much simplier. If task has a fle, use it.
>> If task has no, add a fle with fake flag, and generic code will close
>> it later.
> 
> Do we have this close-if-flag-is-set code already?

fdinfo_list_entry::fake

>> I don't think, that it's good to duplicate a generic code.
>> I see your comment below vvvv, but it seems it does not suppose to
>> delete close_scm_fds() later.
> 
> It does. The flag you're talking above is to be added (unless it's there already).

There is already a generic one ^^^
 
> Another thing about it -- consider you've added this flag to the generic code. 
> Calling close on the descriptor should happen strictly after the respective fle 
> has been sent over the unix socket with scm. How should the _generic_ code make 
> sure it has happened?
> 
>>>>>>> +		}
>>>>>>> +
>>>>>>> +		pr_info("scm: add %d -> %d.fd[%d]\n", tgt->id, vpid(owner), fd);
>>>>>>> +		sfle = xmalloc(sizeof(*sfle));
>>>>>>> +		if (!sfle)
>>>>>>> +			return -1;
>>>>>>> +
>>>>>>> +		sfle->fle = collect_fd_to(vpid(owner), e, rsti(owner), tgt, false);
>>>>>>> +		if (!sfle->fle) {
>>>>>>> +			pr_err("Can't request new fle for scm\n");
>>>>>>> +			return -1;
>>>>>>> +		}
>>>>>>> +
>>>>>>> +		list_add_tail(&sfle->l, &ui->scm_fles);
>>>>>>> +		fds[i] = fd;
>>>>>>> +	}
>>>>>>> +
>>>>>>> +	return 0;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static int chk_restored_scms(struct unix_sk_info *ui)
>>>>>>> +{
>>>>>>> +	struct scm_fle *sf, *n;
>>>>>>> +
>>>>>>> +	list_for_each_entry_safe(sf, n, &ui->scm_fles, l) {
>>>>>>> +		if (sf->fle->stage != FLE_RESTORED)
>>>>>>> +			return 1;
>>>>>>
>>>>>> It's need to wait for sf->fle->bound here instead. 
>>>>>
>>>>> Why?
>>>>
>>>> Then, what is scms? If these are the files, which contain in the
>>>> queue messages, we just need to wait till they open, don't we?
>>>
>>> Here we wait for a file descriptor to appear in the task's fdtable, so that
>>> we could reference one. Do you want to say that waiting for FLE_OPEN is
>>> enough? I need make sure that fle->fd does reference the correct files, 
>>> in other words, file has been put into proper descriptor already. Is that
>>> so? If yes, I will fix this place, but what's the difference between
>>> FLE_OPEN and FLE_RESTORED then?
>>
>> FLE_OPEN means "file is just open, but it's type-specific part is not configured yet".
>> It means the file is right after the open() syscall or socket() syscall etc.
>> It seems to be enough, if unix sockets do not transfer file attributes
>> in additional to flags we set in setup_and_serve_out::fcntl(fle->flags).
>> If so, we should extent setup_and_serve_out() in this case, but I don't
>> think such attributes exist in Linux: the whole generic part is made
>> in setup_and_serve_out(), if the existed, this function would contained
>> them.
> 
> OK, fair enough. Please, send the patch with comments to the constants in
> questions. I'll update this piece in this set.

OK
 
>>>>>> I suppose, the above introduces some circular dependencies for cases
>>>>>> with several sockets having epoll, which is listening them.
>>>>>
>>>>> Nope :) On dump I explicitly ban unix sockets and epolls sent via unix
>>>>> sockets. So we can only wait here for some other files, that cannot wait us.
>>>>>
>>>>>>> +
>>>>>>> +		/* Optimization for the next pass */
>>>>>>> +		list_del(&sf->l);
>>>>>>> +		xfree(sf);
>>>>>>> +	}
>>>>>>> +
>>>>>>> +	return 0;
>>>>>>> +}
>>>>>>> +
>>>>>>>  static int wake_connected_sockets(struct unix_sk_info *ui)
>>>>>>>  {
>>>>>>>  	struct fdinfo_list_entry *fle;
>>>>>>> @@ -1322,12 +1438,18 @@ static int open_unix_sk(struct file_desc *d, int *new_fd)
>>>>>>>  	struct unix_sk_info *ui;
>>>>>>>  	int ret;
>>>>>>>  
>>>>>>> +	ui = container_of(d, struct unix_sk_info, d);
>>>>>>> +
>>>>>>> +	/* FIXME -- only queue restore may be postponed */
>>>>>>> +	if (chk_restored_scms(ui)) {
>>>>>>> +		pr_info("scm: Wait for tgt to restore\n");
>>>>>>> +		return 1;
>>>>>>> +	}
>>>>>>
>>>>>> It's too strong limitation, we delay another tasks here.
>>>>>
>>>>> Another tasks? No, we just ask files restore loop to go and try to open
>>>>> some other files.
>>>>
>>>> But another tasks may wait file descriptors you're serving out.
>>>> This tasks will be delayed.
>>>
>>> Mm... Yes, it will. Isn't it the cost of synchronization anyway? The FIXME
>>> comment above it is exactly about it -- complicating the code for the sake
>>> of (hopefully) performance.
>>
>> I don't think this will complicate the code. We moved to asynchronous file
>> restore engine just to do not have "big criu locks".
> 
> There's no locks here with this if, it just reorders the sequence of fle->open
> calls.

These are dependencies, and they influence the open order.
 
>> Now all dependences
>> are as strong as they are need, and not more. It seems to be not good
>> to move away from this path.
>>  
>>>>>> We may open the socket and to serve it out, and we do
>>>>>> not need wait queuers for these actions.
>>>>>
>>>>> Not always. There are several cases when we do socketpair() then restore
>>>>> a queue then close one of the ends immediately.
>>>>
>>>> Even if the second end is closed, anither tasks may want this
>>>> slave fle, and their eventpolls may depend on it.
>>>> I think, we should wait only in the moments, when we must wait, and do
>>>> not do that in another cases.
>>>
>>> This is valid, but must is to be replaced with should :)
>>> Is anything broken with the current approach? Or just suboptimal?
>>
>> Yes, this is suboptiomal, and it's really easy to avoid that, I think.
> 
> Actually it's not. As I said earlier, there are cases when you cannot restore
> a socket without a queue and restore the queue later, so for these cases you
> need to skip the whole restoration. For the cases when you can restore a socket
> and postpone the queue restore, you may do the first step and then return 1
> requesting for yet another ->open call. With this only the unix ->open logic
> becomes noticeably more complex.

Could you give an example of a case when a socket needs to wait till other sockets
are restored before it can call the socket() syscall?
 
>> It would be difficult to rework that later, if we found the limitations
>> as too strong that they don't allow to restore some type of files.
> 
> While I do agree with this in general, I see no points in splitting the unix sk
> restore into open, postpone, restore the queue (optionally), postpone again, do
> post_open. This doesn't seem to solve any real problem, makes something (what?)
> better for a really rare case and complicates the ->open logic for unix sockets.

There is no need to introduce multiple cases. There is a generic rule:
queues with scms or msg names are restored by the socket itself, and the socket
uses the task's (maybe ghost) fles to do that.
Pavel Emelyanov July 13, 2017, 9:48 a.m.
On 07/12/2017 05:42 PM, Kirill Tkhai wrote:
> On 12.07.2017 17:19, Pavel Emelyanov wrote:
>>
>>>>> If so, why we don't try to analyze the task's
>>>>> descriptors, if it already has the one, we add? 
>>>>
>>>> We can. In that case we'll need to add a flag on the fle, saying that
>>>> this fle can be closed by scm-sending code. I thought about it, but
>>>> wanted to make the first version simpler.
>>>
>>> Hm. I skip this in the first look, but you introduces a duplicate of
>>> generic code in close_scm_fds(). Why can't a new scm be added as
>>> a fake fle? If so, generic code will close it as another thing of such
>>> sort.
>>>
>>> So, the algorithm may be much simplier. If task has a fle, use it.
>>> If task has no, add a fle with fake flag, and generic code will close
>>> it later.
>>
>> Do we have this close-if-flag-is-set code already?
> 
> fdinfo_list_entry::fake
> 
>>> I don't think, that it's good to duplicate a generic code.
>>> I see your comment below vvvv, but it seems it does not suppose to
>>> delete close_scm_fds() later.
>>
>> It does. The flag you're talking above is to be added (unless it's there already).
> 
> There is already a generic ^^^

OK, will take a look.

>>>>>>> I suppose, the above introduces some circular dependencies for cases
>>>>>>> with several sockets having epoll, which is listening them.
>>>>>>
>>>>>> Nope :) On dump I explicitly ban unix sockets and epolls sent via unix
>>>>>> sockets. So we can only wait here for some other files, that cannot wait us.
>>>>>>
>>>>>>>> +
>>>>>>>> +		/* Optimization for the next pass */
>>>>>>>> +		list_del(&sf->l);
>>>>>>>> +		xfree(sf);
>>>>>>>> +	}
>>>>>>>> +
>>>>>>>> +	return 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>>  static int wake_connected_sockets(struct unix_sk_info *ui)
>>>>>>>>  {
>>>>>>>>  	struct fdinfo_list_entry *fle;
>>>>>>>> @@ -1322,12 +1438,18 @@ static int open_unix_sk(struct file_desc *d, int *new_fd)
>>>>>>>>  	struct unix_sk_info *ui;
>>>>>>>>  	int ret;
>>>>>>>>  
>>>>>>>> +	ui = container_of(d, struct unix_sk_info, d);
>>>>>>>> +
>>>>>>>> +	/* FIXME -- only queue restore may be postponed */
>>>>>>>> +	if (chk_restored_scms(ui)) {
>>>>>>>> +		pr_info("scm: Wait for tgt to restore\n");
>>>>>>>> +		return 1;
>>>>>>>> +	}
>>>>>>>
>>>>>>> It's too strong limitation, we delay another tasks here.
>>>>>>
>>>>>> Another tasks? No, we just ask files restore loop to go and try to open
>>>>>> some other files.
>>>>>
>>>>> But another tasks may wait file descriptors you're serving out.
>>>>> This tasks will be delayed.
>>>>
>>>> Mm... Yes, it will. Isn't it the cost of synchronization anyway? The FIXME
>>>> comment above it is exactly about it -- complicating the code for the sake
>>>> of (hopefully) performance.
>>>
>>> I don't think this will complicate the code. We moved to asynchronous file
>>> restore engine just to do not have "big criu locks".
>>
>> There's no locks here with this if, it just reorders the sequence of fle->open
>> calls.
> 
> It's dependencies and it influence on open order.

Of course. This is why we introduced the ability to call ->open again and again -- to
re-order openings the way we need. And I still don't understand what the problem is with
the fact that I delay opening of a socket. "This changes the order" and "delays others" are
not real problems; the biggest problem they introduce is a couple of additional CPU
ticks needed to call unix's open for the 2nd time.
 
>>> Now all dependences
>>> are as strong as they are need, and not more. It seems to be not good
>>> to move away from this path.
>>>  
>>>>>>> We may open the socket and to serve it out, and we do
>>>>>>> not need wait queuers for these actions.
>>>>>>
>>>>>> Not always. There are several cases when we do socketpair() then restore
>>>>>> a queue then close one of the ends immediately.
>>>>>
>>>>> Even if the second end is closed, anither tasks may want this
>>>>> slave fle, and their eventpolls may depend on it.
>>>>> I think, we should wait only in the moments, when we must wait, and do
>>>>> not do that in another cases.
>>>>
>>>> This is valid, but must is to be replaced with should :)
>>>> Is anything broken with the current approach? Or just suboptimal?
>>>
>>> Yes, this is suboptiomal, and it's really easy to avoid that, I think.
>>
>> Actually it's not. As I said earlier, there are cases when you cannot restore
>> a socket without a queue and restore the queue later, so for these cases you
>> need to skip the whole restoration. For the cases when you can restore a socket
>> and postpone the queue restore, you may do the first step and then return 1
>> requesting for yet another ->open call. With this only the unix ->open logic
>> becomes noticeably more complex.
> 
> Could you give an example of a case, when a socket needs to wait fill another sockets
> are restored, before it can call socket() syscall?

With SCMs -- easily. open_unix_sk_standalone(), the branch (ui->ue->state == TCP_ESTABLISHED) && !ui->ue->peer
or ui->ue->type == SOCK_DGRAM && !ui->queuer. Both call socketpair(), restore the queue at once, then close
one of the ends. If I have an SCM_RIGHTs message in a queue, I need to wait for e.g. an inet socket to be opened
before I can call socketpair().
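
I.e. the flow in those branches is roughly the following (a sketch with error
paths trimmed; treat the exact restore_sk_queue() signature as approximate):

	int sk[2];

	/* Files referenced by SCM_RIGHTs in the queue must exist first */
	if (chk_restored_scms(ui))
		return 1;

	if (socketpair(PF_UNIX, ui->ue->type, 0, sk) < 0)
		return -1;

	/* The queue is restored through the second end ... */
	if (restore_sk_queue(sk[1], ui->ue->id))
		return -1;

	/* ... which is then closed right away */
	close(sk[1]);
	*new_fd = sk[0];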

>>> It would be difficult to rework that later, if we found the limitations
>>> as too strong that they don't allow to restore some type of files.
>>
>> While I do agree with this in general, I see no points in splitting the unix sk
>> restore into open, postpone, restore the queue (optionally), postpone again, do
>> post_open. This doesn't seem to solve any real problem, makes something (what?)
>> better for a really rare case and complicates the ->open logic for unix sockets.
> 
> It's not need to introduce multy-cases. There is a generic rule:
> queues with scms or msg names are restored by the socket itself, and the socket
> uses task's (maybe ghost) fles to do that.

No, queues are not restored by the socket itself; they are restored by the socket's queuer, which
is some other socket.

-- Pavel
Kirill Tkhai July 13, 2017, 10:41 a.m.
On 13.07.2017 12:48, Pavel Emelyanov wrote:
> On 07/12/2017 05:42 PM, Kirill Tkhai wrote:
>> On 12.07.2017 17:19, Pavel Emelyanov wrote:
>>>
>>>>>> If so, why we don't try to analyze the task's
>>>>>> descriptors, if it already has the one, we add? 
>>>>>
>>>>> We can. In that case we'll need to add a flag on the fle, saying that
>>>>> this fle can be closed by scm-sending code. I thought about it, but
>>>>> wanted to make the first version simpler.
>>>>
>>>> Hm. I skip this in the first look, but you introduces a duplicate of
>>>> generic code in close_scm_fds(). Why can't a new scm be added as
>>>> a fake fle? If so, generic code will close it as another thing of such
>>>> sort.
>>>>
>>>> So, the algorithm may be much simplier. If task has a fle, use it.
>>>> If task has no, add a fle with fake flag, and generic code will close
>>>> it later.
>>>
>>> Do we have this close-if-flag-is-set code already?
>>
>> fdinfo_list_entry::fake
>>
>>>> I don't think, that it's good to duplicate a generic code.
>>>> I see your comment below vvvv, but it seems it does not suppose to
>>>> delete close_scm_fds() later.
>>>
>>> It does. The flag you're talking above is to be added (unless it's there already).
>>
>> There is already a generic ^^^
> 
> OK, will take a look.
> 
>>>>>>>> I suppose, the above introduces some circular dependencies for cases
>>>>>>>> with several sockets having epoll, which is listening them.
>>>>>>>
>>>>>>> Nope :) On dump I explicitly ban unix sockets and epolls sent via unix
>>>>>>> sockets. So we can only wait here for some other files, that cannot wait us.
>>>>>>>
>>>>>>>>> +
>>>>>>>>> +		/* Optimization for the next pass */
>>>>>>>>> +		list_del(&sf->l);
>>>>>>>>> +		xfree(sf);
>>>>>>>>> +	}
>>>>>>>>> +
>>>>>>>>> +	return 0;
>>>>>>>>> +}
>>>>>>>>> +
>>>>>>>>>  static int wake_connected_sockets(struct unix_sk_info *ui)
>>>>>>>>>  {
>>>>>>>>>  	struct fdinfo_list_entry *fle;
>>>>>>>>> @@ -1322,12 +1438,18 @@ static int open_unix_sk(struct file_desc *d, int *new_fd)
>>>>>>>>>  	struct unix_sk_info *ui;
>>>>>>>>>  	int ret;
>>>>>>>>>  
>>>>>>>>> +	ui = container_of(d, struct unix_sk_info, d);
>>>>>>>>> +
>>>>>>>>> +	/* FIXME -- only queue restore may be postponed */
>>>>>>>>> +	if (chk_restored_scms(ui)) {
>>>>>>>>> +		pr_info("scm: Wait for tgt to restore\n");
>>>>>>>>> +		return 1;
>>>>>>>>> +	}
>>>>>>>>
>>>>>>>> It's too strong limitation, we delay another tasks here.
>>>>>>>
>>>>>>> Another tasks? No, we just ask files restore loop to go and try to open
>>>>>>> some other files.
>>>>>>
>>>>>> But another tasks may wait file descriptors you're serving out.
>>>>>> This tasks will be delayed.
>>>>>
>>>>> Mm... Yes, it will. Isn't it the cost of synchronization anyway? The FIXME
>>>>> comment above it is exactly about it -- complicating the code for the sake
>>>>> of (hopefully) performance.
>>>>
>>>> I don't think this will complicate the code. We moved to asynchronous file
>>>> restore engine just to do not have "big criu locks".
>>>
>>> There's no locks here with this if, it just reorders the sequence of fle->open
>>> calls.
>>
>> It's dependencies and it influence on open order.
> 
> Of course. This is what we introduced the ability to call ->open again and again -- to
> re-order openings the way we need. And I still don't understand what's the problem with
> the fact I delay opening of a socket. "This changes the order" and "delays others" are
> not real problems, the biggest problem they introduce is a couple of additional CPU
> ticks needed to call unix's open for the 2nd time.

My idea is just to restore as much in parallel as possible. When you don't
send the master fle to other tasks, those tasks may reach a stage when the
fle is the only thing they need at the current moment of restore. And
they will just wait for you.

Also, you will need to serve the master fle out as soon as possible when you
implement scm with unix fds.
  
>>>> Now all dependences
>>>> are as strong as they are need, and not more. It seems to be not good
>>>> to move away from this path.
>>>>  
>>>>>>>> We may open the socket and to serve it out, and we do
>>>>>>>> not need wait queuers for these actions.
>>>>>>>
>>>>>>> Not always. There are several cases when we do socketpair() then restore
>>>>>>> a queue then close one of the ends immediately.
>>>>>>
>>>>>> Even if the second end is closed, anither tasks may want this
>>>>>> slave fle, and their eventpolls may depend on it.
>>>>>> I think, we should wait only in the moments, when we must wait, and do
>>>>>> not do that in another cases.
>>>>>
>>>>> This is valid, but must is to be replaced with should :)
>>>>> Is anything broken with the current approach? Or just suboptimal?
>>>>
>>>> Yes, this is suboptiomal, and it's really easy to avoid that, I think.
>>>
>>> Actually it's not. As I said earlier, there are cases when you cannot restore
>>> a socket without a queue and restore the queue later, so for these cases you
>>> need to skip the whole restoration. For the cases when you can restore a socket
>>> and postpone the queue restore, you may do the first step and then return 1
>>> requesting for yet another ->open call. With this only the unix ->open logic
>>> becomes noticeably more complex.
>>
>> Could you give an example of a case, when a socket needs to wait fill another sockets
>> are restored, before it can call socket() syscall?
> 
> With SCMs -- easily. open_unix_sk_standalone(), branch (ui->ue->state == TCP_ESTABLISHED) && !ui->ue->peer
> or ui->ue->type == SOCK_DGRAM && !ui->queuer. Both call socketpair(), restore queue at once, then close
> one of the ends. If I have SCM_RIGHTs message in a queue I need to wait for e.g. inet socket to be opened
> before I can call socketpair().

Then only this type of socket should wait till the scm dependencies are restored
before it creates a socket pair.

And the above holds only till the moment unix fds in scm are implemented. When
they are, this type of socket will also fit the generic model, but we will need
a more difficult solution for the second, closed end. I think we will need to
open the socketpair, serve the first end out and then pin the second end open as
a fake file.

>>>> It would be difficult to rework that later, if we found the limitations
>>>> as too strong that they don't allow to restore some type of files.
>>>
>>> While I do agree with this in general, I see no points in splitting the unix sk
>>> restore into open, postpone, restore the queue (optionally), postpone again, do
>>> post_open. This doesn't seem to solve any real problem, makes something (what?)
>>> better for a really rare case and complicates the ->open logic for unix sockets.
>>
>> It's not need to introduce multy-cases. There is a generic rule:
>> queues with scms or msg names are restored by the socket itself, and the socket
>> uses task's (maybe ghost) fles to do that.
> 
> No queues are not restored by the socket itself, they are restored by the socket queuer, that
> is some other socket
Pavel Emelyanov July 13, 2017, 11:41 a.m.
On 07/13/2017 01:41 PM, Kirill Tkhai wrote:
> On 13.07.2017 12:48, Pavel Emelyanov wrote:
>> On 07/12/2017 05:42 PM, Kirill Tkhai wrote:
>>> On 12.07.2017 17:19, Pavel Emelyanov wrote:
>>>>
>>>>>>> If so, why we don't try to analyze the task's
>>>>>>> descriptors, if it already has the one, we add? 
>>>>>>
>>>>>> We can. In that case we'll need to add a flag on the fle, saying that
>>>>>> this fle can be closed by scm-sending code. I thought about it, but
>>>>>> wanted to make the first version simpler.
>>>>>
>>>>> Hm. I skip this in the first look, but you introduces a duplicate of
>>>>> generic code in close_scm_fds(). Why can't a new scm be added as
>>>>> a fake fle? If so, generic code will close it as another thing of such
>>>>> sort.
>>>>>
>>>>> So, the algorithm may be much simplier. If task has a fle, use it.
>>>>> If task has no, add a fle with fake flag, and generic code will close
>>>>> it later.
>>>>
>>>> Do we have this close-if-flag-is-set code already?
>>>
>>> fdinfo_list_entry::fake
>>>
>>>>> I don't think, that it's good to duplicate a generic code.
>>>>> I see your comment below vvvv, but it seems it does not suppose to
>>>>> delete close_scm_fds() later.
>>>>
>>>> It does. The flag you're talking above is to be added (unless it's there already).
>>>
>>> There is already a generic ^^^
>>
>> OK, will take a look.
>>
>>>>>>>>> I suppose, the above introduces some circular dependencies for cases
>>>>>>>>> with several sockets having epoll, which is listening them.
>>>>>>>>
>>>>>>>> Nope :) On dump I explicitly ban unix sockets and epolls sent via unix
>>>>>>>> sockets. So we can only wait here for some other files, that cannot wait us.
>>>>>>>>
>>>>>>>>>> +
>>>>>>>>>> +		/* Optimization for the next pass */
>>>>>>>>>> +		list_del(&sf->l);
>>>>>>>>>> +		xfree(sf);
>>>>>>>>>> +	}
>>>>>>>>>> +
>>>>>>>>>> +	return 0;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>>  static int wake_connected_sockets(struct unix_sk_info *ui)
>>>>>>>>>>  {
>>>>>>>>>>  	struct fdinfo_list_entry *fle;
>>>>>>>>>> @@ -1322,12 +1438,18 @@ static int open_unix_sk(struct file_desc *d, int *new_fd)
>>>>>>>>>>  	struct unix_sk_info *ui;
>>>>>>>>>>  	int ret;
>>>>>>>>>>  
>>>>>>>>>> +	ui = container_of(d, struct unix_sk_info, d);
>>>>>>>>>> +
>>>>>>>>>> +	/* FIXME -- only queue restore may be postponed */
>>>>>>>>>> +	if (chk_restored_scms(ui)) {
>>>>>>>>>> +		pr_info("scm: Wait for tgt to restore\n");
>>>>>>>>>> +		return 1;
>>>>>>>>>> +	}
>>>>>>>>>
>>>>>>>>> It's too strong limitation, we delay another tasks here.
>>>>>>>>
>>>>>>>> Another tasks? No, we just ask files restore loop to go and try to open
>>>>>>>> some other files.
>>>>>>>
>>>>>>> But another tasks may wait file descriptors you're serving out.
>>>>>>> This tasks will be delayed.
>>>>>>
>>>>>> Mm... Yes, it will. Isn't it the cost of synchronization anyway? The FIXME
>>>>>> comment above it is exactly about it -- complicating the code for the sake
>>>>>> of (hopefully) performance.
>>>>>
>>>>> I don't think this will complicate the code. We moved to asynchronous file
>>>>> restore engine just to do not have "big criu locks".
>>>>
>>>> There's no locks here with this if, it just reorders the sequence of fle->open
>>>> calls.
>>>
>>> It's dependencies and it influence on open order.
>>
>> Of course. This is what we introduced the ability to call ->open again and again -- to
>> re-order openings the way we need. And I still don't understand what's the problem with
>> the fact I delay opening of a socket. "This changes the order" and "delays others" are
>> not real problems, the biggest problem they introduce is a couple of additional CPU
>> ticks needed to call unix's open for the 2nd time.
> 
> My idea is just to restore as much in parallel, as possible. 

I generally agree with that, but scms are not the case we should break our backs over.

> When you don't send master fle to another tasks, these tasks may reach the stage,

Come on, this is a pure race. Whether or not the peer task will go to sleep because
we've skipped unix open this time is a question of hitting or missing the time
frame it takes to call unix_open by pointer and check that the return code
is 1. How many CPU ticks (not even sched ticks) is that?!

> when the fle is the only thing they are need at the current moment of restore. And
> they will just wait you.
> 
> Also, you will need to serve master fle out as soon as possible, when you
> will implement scm with unix fds.

When we implement scm with unix fds, just serving the master out ASAP won't help; we'll
need to reshuffle much more code to resolve circular and mutual scms.

>>>>> Now all dependences
>>>>> are as strong as they are need, and not more. It seems to be not good
>>>>> to move away from this path.
>>>>>  
>>>>>>>>> We may open the socket and to serve it out, and we do
>>>>>>>>> not need wait queuers for these actions.
>>>>>>>>
>>>>>>>> Not always. There are several cases when we do socketpair() then restore
>>>>>>>> a queue then close one of the ends immediately.
>>>>>>>
>>>>>>> Even if the second end is closed, anither tasks may want this
>>>>>>> slave fle, and their eventpolls may depend on it.
>>>>>>> I think, we should wait only in the moments, when we must wait, and do
>>>>>>> not do that in another cases.
>>>>>>
>>>>>> This is valid, but must is to be replaced with should :)
>>>>>> Is anything broken with the current approach? Or just suboptimal?
>>>>>
>>>>> Yes, this is suboptiomal, and it's really easy to avoid that, I think.
>>>>
>>>> Actually it's not. As I said earlier, there are cases when you cannot restore
>>>> a socket without a queue and restore the queue later, so for these cases you
>>>> need to skip the whole restoration. For the cases when you can restore a socket
>>>> and postpone the queue restore, you may do the first step and then return 1
>>>> requesting for yet another ->open call. With this only the unix ->open logic
>>>> becomes noticeably more complex.
>>>
>>> Could you give an example of a case, when a socket needs to wait fill another sockets
>>> are restored, before it can call socket() syscall?
>>
>> With SCMs -- easily. open_unix_sk_standalone(), branch (ui->ue->state == TCP_ESTABLISHED) && !ui->ue->peer
>> or ui->ue->type == SOCK_DGRAM && !ui->queuer. Both call socketpair(), restore queue at once, then close
>> one of the ends. If I have SCM_RIGHTs message in a queue I need to wait for e.g. inet socket to be opened
>> before I can call socketpair().
> 
> Then only this type of sockets should wait till scm dependences are restored
> before is creates a socket pair.

Yes, but what's the point? This won't help us resolve the generic unix-sent-via-unix case anyway.
The change you're asking for doubles the checks for whether to call socket() or
socketpair(), for the sake of spectral parallelism. I can agree with this change, but not as a
part of this set; it's large enough already.

> And the above is only till the moment unix fds in scm are implemented. When
> they are, this type of socket also will fit generic model, but we will need
> more difficult solution for the second closed end. I think, we will need to
> open socketpair, serve first end out and then to pin second end open as fake
> file.
> 
>>>>> It would be difficult to rework that later, if we found the limitations
>>>>> as too strong that they don't allow to restore some type of files.
>>>>
>>>> While I do agree with this in general, I see no points in splitting the unix sk
>>>> restore into open, postpone, restore the queue (optionally), postpone again, do
>>>> post_open. This doesn't seem to solve any real problem, makes something (what?)
>>>> better for a really rare case and complicates the ->open logic for unix sockets.
>>>
>>> It's not need to introduce multy-cases. There is a generic rule:
>>> queues with scms or msg names are restored by the socket itself, and the socket
>>> uses task's (maybe ghost) fles to do that.
>>
>> No queues are not restored by the socket itself, they are restored by the socket queuer, that
>> is some other socket
> .
> 

-- Pavel
Kirill Tkhai July 13, 2017, 12:15 p.m.
On 13.07.2017 14:41, Pavel Emelyanov wrote:
> On 07/13/2017 01:41 PM, Kirill Tkhai wrote:
>> On 13.07.2017 12:48, Pavel Emelyanov wrote:
>>> On 07/12/2017 05:42 PM, Kirill Tkhai wrote:
>>>> On 12.07.2017 17:19, Pavel Emelyanov wrote:
>>>>>
>>>>>>>> If so, why we don't try to analyze the task's
>>>>>>>> descriptors, if it already has the one, we add? 
>>>>>>>
>>>>>>> We can. In that case we'll need to add a flag on the fle, saying that
>>>>>>> this fle can be closed by scm-sending code. I thought about it, but
>>>>>>> wanted to make the first version simpler.
>>>>>>
>>>>>> Hm. I skip this in the first look, but you introduces a duplicate of
>>>>>> generic code in close_scm_fds(). Why can't a new scm be added as
>>>>>> a fake fle? If so, generic code will close it as another thing of such
>>>>>> sort.
>>>>>>
>>>>>> So, the algorithm may be much simplier. If task has a fle, use it.
>>>>>> If task has no, add a fle with fake flag, and generic code will close
>>>>>> it later.
>>>>>
>>>>> Do we have this close-if-flag-is-set code already?
>>>>
>>>> fdinfo_list_entry::fake
>>>>
>>>>>> I don't think, that it's good to duplicate a generic code.
>>>>>> I see your comment below vvvv, but it seems it does not suppose to
>>>>>> delete close_scm_fds() later.
>>>>>
>>>>> It does. The flag you're talking above is to be added (unless it's there already).
>>>>
>>>> There is already a generic ^^^
>>>
>>> OK, will take a look.
>>>
>>>>>>>>>> I suppose, the above introduces some circular dependencies for cases
>>>>>>>>>> with several sockets having epoll, which is listening them.
>>>>>>>>>
>>>>>>>>> Nope :) On dump I explicitly ban unix sockets and epolls sent via unix
>>>>>>>>> sockets. So we can only wait here for some other files, that cannot wait us.
>>>>>>>>>
>>>>>>>>>>> +
>>>>>>>>>>> +		/* Optimization for the next pass */
>>>>>>>>>>> +		list_del(&sf->l);
>>>>>>>>>>> +		xfree(sf);
>>>>>>>>>>> +	}
>>>>>>>>>>> +
>>>>>>>>>>> +	return 0;
>>>>>>>>>>> +}
>>>>>>>>>>> +
>>>>>>>>>>>  static int wake_connected_sockets(struct unix_sk_info *ui)
>>>>>>>>>>>  {
>>>>>>>>>>>  	struct fdinfo_list_entry *fle;
>>>>>>>>>>> @@ -1322,12 +1438,18 @@ static int open_unix_sk(struct file_desc *d, int *new_fd)
>>>>>>>>>>>  	struct unix_sk_info *ui;
>>>>>>>>>>>  	int ret;
>>>>>>>>>>>  
>>>>>>>>>>> +	ui = container_of(d, struct unix_sk_info, d);
>>>>>>>>>>> +
>>>>>>>>>>> +	/* FIXME -- only queue restore may be postponed */
>>>>>>>>>>> +	if (chk_restored_scms(ui)) {
>>>>>>>>>>> +		pr_info("scm: Wait for tgt to restore\n");
>>>>>>>>>>> +		return 1;
>>>>>>>>>>> +	}
>>>>>>>>>>
>>>>>>>>>> It's too strong limitation, we delay another tasks here.
>>>>>>>>>
>>>>>>>>> Another tasks? No, we just ask files restore loop to go and try to open
>>>>>>>>> some other files.
>>>>>>>>
>>>>>>>> But another tasks may wait file descriptors you're serving out.
>>>>>>>> This tasks will be delayed.
>>>>>>>
>>>>>>> Mm... Yes, it will. Isn't it the cost of synchronization anyway? The FIXME
>>>>>>> comment above it is exactly about it -- complicating the code for the sake
>>>>>>> of (hopefully) performance.
>>>>>>
>>>>>> I don't think this will complicate the code. We moved to asynchronous file
>>>>>> restore engine just to do not have "big criu locks".
>>>>>
>>>>> There's no locks here with this if, it just reorders the sequence of fle->open
>>>>> calls.
>>>>
>>>> It's dependencies and it influence on open order.
>>>
>>> Of course. This is what we introduced the ability to call ->open again and again -- to
>>> re-order openings the way we need. And I still don't understand what's the problem with
>>> the fact I delay opening of a socket. "This changes the order" and "delays others" are
>>> not real problems, the biggest problem they introduce is a couple of additional CPU
>>> ticks needed to call unix's open for the 2nd time.
>>
>> My idea is just to restore as much in parallel, as possible. 
> 
> I generally agree with that, but scms is not the case we should break our backs with.
> 
>> When you don't send master fle to another tasks, these tasks may reach the stage,
> 
> Come on, this is pure race. Whether or not the peer task would go sleeping because
> we've skipped unix open this time is the question of hitting or missing the time
> frame that it takes to call unix_open by pointer and checking that the return code
> is 1. How many CPU ticks (not even sched ticks) is that?!

This can't be accounted for, because it depends on the configuration and on the relationships
between tasks. I'm just saying that when you make a resource available as soon as it's ready,
you reduce the length of the dependency chains that exist between tasks at any given moment.

When you don't, those chains of dependencies become longer. And this is not a question of
ticks or of the relationship between two separate tasks: a single delay may lead to many
sleeps in many tasks, all of which can be avoided by the rule of making a resource available
right after it becomes ready.

My point is just this.
 
>> when the fle is the only thing they are need at the current moment of restore. And
>> they will just wait you.
>>
>> Also, you will need to serve master fle out as soon as possible, when you
>> will implement scm with unix fds.
> 
> When we'll implement scm with unix fds just serving master out ASAP won't help, we'll
> need to reshuffle much more code to resolve circular and mutual scms.
> 
>>>>>> Now all dependences
>>>>>> are as strong as they are need, and not more. It seems to be not good
>>>>>> to move away from this path.
>>>>>>  
>>>>>>>>>> We may open the socket and to serve it out, and we do
>>>>>>>>>> not need wait queuers for these actions.
>>>>>>>>>
>>>>>>>>> Not always. There are several cases when we do socketpair() then restore
>>>>>>>>> a queue then close one of the ends immediately.
>>>>>>>>
>>>>>>>> Even if the second end is closed, anither tasks may want this
>>>>>>>> slave fle, and their eventpolls may depend on it.
>>>>>>>> I think, we should wait only in the moments, when we must wait, and do
>>>>>>>> not do that in another cases.
>>>>>>>
>>>>>>> This is valid, but must is to be replaced with should :)
>>>>>>> Is anything broken with the current approach? Or just suboptimal?
>>>>>>
>>>>>> Yes, this is suboptiomal, and it's really easy to avoid that, I think.
>>>>>
>>>>> Actually it's not. As I said earlier, there are cases when you cannot restore
>>>>> a socket without a queue and restore the queue later, so for these cases you
>>>>> need to skip the whole restoration. For the cases when you can restore a socket
>>>>> and postpone the queue restore, you may do the first step and then return 1
>>>>> requesting for yet another ->open call. With this only the unix ->open logic
>>>>> becomes noticeably more complex.
>>>>
>>>> Could you give an example of a case, when a socket needs to wait fill another sockets
>>>> are restored, before it can call socket() syscall?
>>>
>>> With SCMs -- easily. open_unix_sk_standalone(), branch (ui->ue->state == TCP_ESTABLISHED) && !ui->ue->peer
>>> or ui->ue->type == SOCK_DGRAM && !ui->queuer. Both call socketpair(), restore queue at once, then close
>>> one of the ends. If I have SCM_RIGHTs message in a queue I need to wait for e.g. inet socket to be opened
>>> before I can call socketpair().
>>
>> Then only this type of sockets should wait till scm dependences are restored
>> before is creates a socket pair.
> 
> Yes, but what's the point? This won't help us resolving generic unix sent via unix case anyway. 
> The change you're asking for is doubling the checks for whether or not to call socket() or
> socketpair() for the sake of spectral parallelism. I can agree with this change, but not as a
> part of this set, it's large enough already.

I assume the solution for the generic unix-sockets-sent-via-unix case will continue in the same
direction. The sequence would be as follows (a rough sketch follows below):
1) open
2) serve out
3) wait till all scm-s are ready (if an scm is a master of our task, wait till it's FLE_OPEN, otherwise till it's received, i.e. FLE_FINISHED);
   also wait till the owners of msg_names are bound (unix_sk_info::bound); these sockets should also be received as fake fles if needed
4) restore your own queue (using the ready scm-s or sending via the msg_names owners)

There is no circular dependency here.

Unix socket queues without such particulars will be restored by their peers.
But if a socket has an scm or msg_names, its peer field will be forcibly zeroed and
it will restore its queue by itself.
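
Roughly, steps 3 and 4 could look like this. It's only a sketch: FLE_OPEN, FLE_FINISHED and
unix_sk_info::bound are the existing notions, everything else is a hypothetical name used just
for illustration:

#include <stdbool.h>

struct unix_sk_info;	/* CRIU-internal, fields elided */

/* Hypothetical helpers for this illustration: */
extern bool scm_fles_ready(struct unix_sk_info *ui);		/* every scm fle is FLE_OPEN or FLE_FINISHED */
extern bool msg_name_owners_bound(struct unix_sk_info *ui);	/* unix_sk_info::bound for every msg_names owner */
extern int restore_own_queue(struct unix_sk_info *ui);		/* step 4 */

static int restore_queue_with_scms(struct unix_sk_info *ui)
{
	/* Steps 1-2 (open + serve out) are assumed to have run already. */

	/* Step 3: wait for every scm fle and every msg_names owner. */
	if (!scm_fles_ready(ui) || !msg_name_owners_bound(ui))
		return 1;	/* not ready yet -- ask to be called again */

	/* Step 4: push the queue back using the ready fds / bound names. */
	return restore_own_queue(ui);
}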
 
>> And the above is only till the moment unix fds in scm are implemented. When
>> they are, this type of socket also will fit generic model, but we will need
>> more difficult solution for the second closed end. I think, we will need to
>> open socketpair, serve first end out and then to pin second end open as fake
>> file.
>>
>>>>>> It would be difficult to rework that later, if we found the limitations
>>>>>> as too strong that they don't allow to restore some type of files.
>>>>>
>>>>> While I do agree with this in general, I see no points in splitting the unix sk
>>>>> restore into open, postpone, restore the queue (optionally), postpone again, do
>>>>> post_open. This doesn't seem to solve any real problem, makes something (what?)
>>>>> better for a really rare case and complicates the ->open logic for unix sockets.
>>>>
>>>> It's not need to introduce multy-cases. There is a generic rule:
>>>> queues with scms or msg names are restored by the socket itself, and the socket
>>>> uses task's (maybe ghost) fles to do that.
>>>
>>> No queues are not restored by the socket itself, they are restored by the socket queuer, that
>>> is some other socket
>> .
>>
Pavel Emelyanov July 13, 2017, 12:26 p.m.
>>> When you don't send master fle to another tasks, these tasks may reach the stage,
>>
>> Come on, this is pure race. Whether or not the peer task would go sleeping because
>> we've skipped unix open this time is the question of hitting or missing the time
>> frame that it takes to call unix_open by pointer and checking that the return code
>> is 1. How many CPU ticks (not even sched ticks) is that?!
> 
> This can't be accounted, because of this depends on configuration and relationships
> between tasks. I just say, that when you make available a resource as soon as it's ready,
> you reduces the lengths of chains of dependences between tasks existing at a moment.

Yes.

> When it's not made so, the chains of dependencies existing at a moment become longer.
> And this is not a question of ticks and relationships between two separate tasks.
> An only delay may lead to many sleeps of many tasks, which may be avoided by the
> rule to make resource available right after it becomes ready.
> 
> My point is just this.

Got it. Now get my point -- shuffling the code further to make this happen will
cost us time and (!) readability. The current files.c and its satellites are complex enough,
and splitting unix_open into more stages won't make them simpler. The outcome of this
would be noticeable only in extremely rare cases (we've had no bug reports about being unable
to C/R due to SCMs in a queue). IOW -- the result isn't worth the pain.

>>> when the fle is the only thing they are need at the current moment of restore. And
>>> they will just wait you.
>>>
>>> Also, you will need to serve master fle out as soon as possible, when you
>>> will implement scm with unix fds.
>>
>> When we'll implement scm with unix fds just serving master out ASAP won't help, we'll
>> need to reshuffle much more code to resolve circular and mutual scms.
>>
>>>>>>> Now all dependences
>>>>>>> are as strong as they are need, and not more. It seems to be not good
>>>>>>> to move away from this path.
>>>>>>>  
>>>>>>>>>>> We may open the socket and to serve it out, and we do
>>>>>>>>>>> not need wait queuers for these actions.
>>>>>>>>>>
>>>>>>>>>> Not always. There are several cases when we do socketpair() then restore
>>>>>>>>>> a queue then close one of the ends immediately.
>>>>>>>>>
>>>>>>>>> Even if the second end is closed, anither tasks may want this
>>>>>>>>> slave fle, and their eventpolls may depend on it.
>>>>>>>>> I think, we should wait only in the moments, when we must wait, and do
>>>>>>>>> not do that in another cases.
>>>>>>>>
>>>>>>>> This is valid, but must is to be replaced with should :)
>>>>>>>> Is anything broken with the current approach? Or just suboptimal?
>>>>>>>
>>>>>>> Yes, this is suboptiomal, and it's really easy to avoid that, I think.
>>>>>>
>>>>>> Actually it's not. As I said earlier, there are cases when you cannot restore
>>>>>> a socket without a queue and restore the queue later, so for these cases you
>>>>>> need to skip the whole restoration. For the cases when you can restore a socket
>>>>>> and postpone the queue restore, you may do the first step and then return 1
>>>>>> requesting for yet another ->open call. With this only the unix ->open logic
>>>>>> becomes noticeably more complex.
>>>>>
>>>>> Could you give an example of a case, when a socket needs to wait fill another sockets
>>>>> are restored, before it can call socket() syscall?
>>>>
>>>> With SCMs -- easily. open_unix_sk_standalone(), branch (ui->ue->state == TCP_ESTABLISHED) && !ui->ue->peer
>>>> or ui->ue->type == SOCK_DGRAM && !ui->queuer. Both call socketpair(), restore queue at once, then close
>>>> one of the ends. If I have SCM_RIGHTs message in a queue I need to wait for e.g. inet socket to be opened
>>>> before I can call socketpair().
>>>
>>> Then only this type of sockets should wait till scm dependences are restored
>>> before is creates a socket pair.
>>
>> Yes, but what's the point? This won't help us resolving generic unix sent via unix case anyway. 
>> The change you're asking for is doubling the checks for whether or not to call socket() or
>> socketpair() for the sake of spectral parallelism. I can agree with this change, but not as a
>> part of this set, it's large enough already.
> 
> Generic unix sockets sent via unix case solution will continue this way, I assume.
> There will be sequence:
> 1)open
> 2)serve out

That doesn't work for the cases I've mentioned above. You need to do step 4 before this point
in the current files.c scheme, or fix the scheme to allow keeping a socketpair's peer around temporarily.

> 3)wait till all scm are ready (if scm is master of our task, wait till it's FLE_OPEN, otherwise till it's received, i.e. FLE_FINISHED).
>   also wait till owners of msg_names are bound (unix_sk_info::bound). these sockets also should be received as fake fles if we need.
> 4)restore your own queue (Using ready scms or sending via msg_names owners)

Not your own queue, but your peer's. If we could sendmsg() data into our own queue, things
would become much simpler.
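
For context, queueing an SCM_RIGHTS message always goes through the socket's peer (the queuer),
roughly as in the self-contained sketch below. It is not the code from this series; it just
shows why the descriptors being sent must already be open in the queuer:

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Push "len" bytes plus "nr_fds" descriptors into the receive queue of
 * whatever socket "queuer_sk" is connected to. */
static int send_scm_rights(int queuer_sk, const int *fds, int nr_fds,
			   const void *data, size_t len)
{
	char cbuf[CMSG_SPACE(sizeof(int) * nr_fds)];
	struct iovec iov = {
		.iov_base = (void *)data,
		.iov_len = len,
	};
	struct msghdr mh = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = cbuf,
		.msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *ch;

	memset(cbuf, 0, sizeof(cbuf));

	ch = CMSG_FIRSTHDR(&mh);
	ch->cmsg_level = SOL_SOCKET;
	ch->cmsg_type = SCM_RIGHTS;
	ch->cmsg_len = CMSG_LEN(sizeof(int) * nr_fds);
	memcpy(CMSG_DATA(ch), fds, sizeof(int) * nr_fds);

	return sendmsg(queuer_sk, &mh, 0) < 0 ? -1 : 0;
}

Note the message always lands in the peer's receive queue, never in queuer_sk's own one,
which is exactly the asymmetry discussed above.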

> There is no a circular dependency.

You can send socket A via socket B, then socket B via socket A. That's the circular
dependency I'm talking about. It can be untied, but not with the existing code -- it
needs to account for temporary socketpairs.

> Unix socket queues without such particulars will be restored by their peers.
> But if a socket has a scm or msg_names it's peer field will forced zeroed and
> it will restore its queue by itself.
>  
>>> And the above is only till the moment unix fds in scm are implemented. When
>>> they are, this type of socket also will fit generic model, but we will need
>>> more difficult solution for the second closed end. I think, we will need to
>>> open socketpair, serve first end out and then to pin second end open as fake
>>> file.
>>>
>>>>>>> It would be difficult to rework that later, if we found the limitations
>>>>>>> as too strong that they don't allow to restore some type of files.
>>>>>>
>>>>>> While I do agree with this in general, I see no points in splitting the unix sk
>>>>>> restore into open, postpone, restore the queue (optionally), postpone again, do
>>>>>> post_open. This doesn't seem to solve any real problem, makes something (what?)
>>>>>> better for a really rare case and complicates the ->open logic for unix sockets.
>>>>>
>>>>> It's not need to introduce multy-cases. There is a generic rule:
>>>>> queues with scms or msg names are restored by the socket itself, and the socket
>>>>> uses task's (maybe ghost) fles to do that.
>>>>
>>>> No queues are not restored by the socket itself, they are restored by the socket queuer, that
>>>> is some other socket

-- Pavel
Kirill Tkhai July 13, 2017, 1:17 p.m.
On 13.07.2017 15:26, Pavel Emelyanov wrote:
> 
>>>> When you don't send master fle to another tasks, these tasks may reach the stage,
>>>
>>> Come on, this is pure race. Whether or not the peer task would go sleeping because
>>> we've skipped unix open this time is the question of hitting or missing the time
>>> frame that it takes to call unix_open by pointer and checking that the return code
>>> is 1. How many CPU ticks (not even sched ticks) is that?!
>>
>> This can't be accounted, because of this depends on configuration and relationships
>> between tasks. I just say, that when you make available a resource as soon as it's ready,
>> you reduces the lengths of chains of dependences between tasks existing at a moment.
> 
> Yes.
> 
>> When it's not made so, the chains of dependencies existing at a moment become longer.
>> And this is not a question of ticks and relationships between two separate tasks.
>> An only delay may lead to many sleeps of many tasks, which may be avoided by the
>> rule to make resource available right after it becomes ready.
>>
>> My point is just this.
> 
> Got it. Now get my point -- shuffling the code further to make this happen will
> cost us time and (!) readability. Current files. and satellites is complex enough,
> splitting unix_open into more stages won't make it simpler. The outcome from this
> may be noticeable in extremely rare cases (we've had no bugs for inability to
> C/R due to SCMs in queue). IOW -- the result doesn't worth the pain.

I don't think it will make readability worse, as it's just a matter of moving this check
to the appropriate places. The check is a single function call, so it doesn't seem
hard to follow.

But anyway, I've already said everything I think about this. The rest is up to you.

>>>> when the fle is the only thing they are need at the current moment of restore. And
>>>> they will just wait you.
>>>>
>>>> Also, you will need to serve master fle out as soon as possible, when you
>>>> will implement scm with unix fds.
>>>
>>> When we'll implement scm with unix fds just serving master out ASAP won't help, we'll
>>> need to reshuffle much more code to resolve circular and mutual scms.
>>>
>>>>>>>> Now all dependences
>>>>>>>> are as strong as they are need, and not more. It seems to be not good
>>>>>>>> to move away from this path.
>>>>>>>>  
>>>>>>>>>>>> We may open the socket and to serve it out, and we do
>>>>>>>>>>>> not need wait queuers for these actions.
>>>>>>>>>>>
>>>>>>>>>>> Not always. There are several cases when we do socketpair() then restore
>>>>>>>>>>> a queue then close one of the ends immediately.
>>>>>>>>>>
>>>>>>>>>> Even if the second end is closed, anither tasks may want this
>>>>>>>>>> slave fle, and their eventpolls may depend on it.
>>>>>>>>>> I think, we should wait only in the moments, when we must wait, and do
>>>>>>>>>> not do that in another cases.
>>>>>>>>>
>>>>>>>>> This is valid, but must is to be replaced with should :)
>>>>>>>>> Is anything broken with the current approach? Or just suboptimal?
>>>>>>>>
>>>>>>>> Yes, this is suboptiomal, and it's really easy to avoid that, I think.
>>>>>>>
>>>>>>> Actually it's not. As I said earlier, there are cases when you cannot restore
>>>>>>> a socket without a queue and restore the queue later, so for these cases you
>>>>>>> need to skip the whole restoration. For the cases when you can restore a socket
>>>>>>> and postpone the queue restore, you may do the first step and then return 1
>>>>>>> requesting for yet another ->open call. With this only the unix ->open logic
>>>>>>> becomes noticeably more complex.
>>>>>>
>>>>>> Could you give an example of a case, when a socket needs to wait fill another sockets
>>>>>> are restored, before it can call socket() syscall?
>>>>>
>>>>> With SCMs -- easily. open_unix_sk_standalone(), branch (ui->ue->state == TCP_ESTABLISHED) && !ui->ue->peer
>>>>> or ui->ue->type == SOCK_DGRAM && !ui->queuer. Both call socketpair(), restore queue at once, then close
>>>>> one of the ends. If I have SCM_RIGHTs message in a queue I need to wait for e.g. inet socket to be opened
>>>>> before I can call socketpair().
>>>>
>>>> Then only this type of sockets should wait till scm dependences are restored
>>>> before is creates a socket pair.
>>>
>>> Yes, but what's the point? This won't help us resolving generic unix sent via unix case anyway. 
>>> The change you're asking for is doubling the checks for whether or not to call socket() or
>>> socketpair() for the sake of spectral parallelism. I can agree with this change, but not as a
>>> part of this set, it's large enough already.
>>
>> Generic unix sockets sent via unix case solution will continue this way, I assume.
>> There will be sequence:
>> 1)open
>> 2)serve out
> 
> Doesn't work for the cases I've mentioned above. You need to do step 4 before this point
> in current files.c scheme. Or fix it to allow for keeping socketpair's peers temporarily

Yes, a temporary fle will be needed there.

> 
>> 3)wait till all scm are ready (if scm is master of our task, wait till it's FLE_OPEN, otherwise till it's received, i.e. FLE_FINISHED).
>>   also wait till owners of msg_names are bound (unix_sk_info::bound). these sockets also should be received as fake fles if we need.
>> 4)restore your own queue (Using ready scms or sending via msg_names owners)
> 
> Not your own, but your peer's queue. If we could sendmsg() data into our queue things
> would become much simpler.

Maybe, if we don't find that a circular dependency with epoll is possible there.
 
>> There is no a circular dependency.
> 
> You can send socket A via socket B, then socket B via socket A. That's the circular
> dependency I'm talking about. It can be untied, but not with the existing code, it
> need to account for temporary socketpairs.

It's not a circular dependency -- it won't lead us to a deadlock in the scheme I've drawn above.
 
>> Unix socket queues without such particulars will be restored by their peers.
>> But if a socket has a scm or msg_names it's peer field will forced zeroed and
>> it will restore its queue by itself.
>>  
>>>> And the above is only till the moment unix fds in scm are implemented. When
>>>> they are, this type of socket also will fit generic model, but we will need
>>>> more difficult solution for the second closed end. I think, we will need to
>>>> open socketpair, serve first end out and then to pin second end open as fake
>>>> file.
>>>>
>>>>>>>> It would be difficult to rework that later, if we found the limitations
>>>>>>>> as too strong that they don't allow to restore some type of files.
>>>>>>>
>>>>>>> While I do agree with this in general, I see no points in splitting the unix sk
>>>>>>> restore into open, postpone, restore the queue (optionally), postpone again, do
>>>>>>> post_open. This doesn't seem to solve any real problem, makes something (what?)
>>>>>>> better for a really rare case and complicates the ->open logic for unix sockets.
>>>>>>
>>>>>> It's not need to introduce multy-cases. There is a generic rule:
>>>>>> queues with scms or msg names are restored by the socket itself, and the socket
>>>>>> uses task's (maybe ghost) fles to do that.
>>>>>
>>>>> No queues are not restored by the socket itself, they are restored by the socket queuer, that
>>>>> is some other socket
> 
> -- Pavel
>