[3/5] Process Migration using Sockets

Submitted by Rodrigo Bruno on Feb. 11, 2017, 3:34 a.m.

Details

Message ID f6871028389e9f80038a12b9292a77ad.squirrel@146.193.41.131
State New
Series "Series without cover letter"
Headers show

Commit Message

Rodrigo Bruno Feb. 11, 2017, 3:34 a.m.
criu/Makefile.crtools        |   4 +
 criu/cr-dump.c               |  16 +++
 criu/cr-restore.c            |   6 +
 criu/crtools.c               |  20 +++-
 criu/image.c                 |  47 +++++++-
 criu/img-remote.c            | 275
+++++++++++++++++++++++++++++++++++++++++++
 criu/include/cr_options.h    |   1 +
 criu/include/img-remote.h    |  83 +++++++++++++
 criu/include/protobuf-desc.h |   4 +
 criu/page-xfer.c             |  64 +++++++---
 criu/pagemap.c               |  67 ++++++++---
 criu/protobuf-desc.c         |   1 +
 criu/util.c                  |  31 +++--
 images/Makefile              |   1 +
 images/remote-image.proto    |  20 ++++
 15 files changed, 599 insertions(+), 41 deletions(-)

    This patch introduces the --remote option and the necessary code
changes to
    support it. This leaves user the option to decide if the checkpoint
data is to
    be stored on disk or sent through the network (through the image-proxy).
    The latter forwards the data to the destination node where image-cache
    receives it.

    The overall communication is performed as follows:
    src_node CRIU dump -> (sends images through UNIX sockets) ->      
image-proxy
                                                                               |
                                                                               V
    dst_node: CRIU restore <- (receives images through UNIX sockets)<-
image-cache

    Communication between image-proxy and image-cache is done through a
single
    TCP connection.

    Running criu with --remote option is like this:

    dst_node# criu image-cache -d --port <port> -o /tmp/image-cache.log
    --local-cache-path <local_cache_path> ...
    dst_node# criu restore --remote -o /tmp/image-cache.log
    --local-cache-path <local_cache_path> ...
    src_node# criu image-proxy -d --port <port> --address <dst_node> -o
/tmp/image-proxy.log
    --local-proxy-path <local_proxy_path> ...
    src_node# criu dump -t <pid> --remote -o /tmp/dump.log
    --local-proxy-path <local_proxy_path> ...

    Signed-off-by: Rodrigo Bruno <rbruno@gsd.inesc-id.pt>
    Signed-off-by: Katerina Koukiou <k.koukiou@gmail.com>

Patch hide | download patch | download mbox

diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools
index e740798..4d21199 100644
--- a/criu/Makefile.crtools
+++ b/criu/Makefile.crtools
@@ -26,6 +26,10 @@  obj-y			+= files-reg.o
 obj-y			+= fsnotify.o
 obj-y			+= image-desc.o
 obj-y			+= image.o
+obj-y			+= img-remote.o
+obj-y			+= img-proxy.o
+obj-y			+= img-cache.o
+obj-y			+= img-remote-proto.o
 obj-y			+= ipc_ns.o
 obj-y			+= irmap.o
 obj-y			+= kcmp-ids.o
diff --git a/criu/cr-dump.c b/criu/cr-dump.c
index 5e708a3..f7a4efb 100644
--- a/criu/cr-dump.c
+++ b/criu/cr-dump.c
@@ -83,6 +83,7 @@ 
 #include "seize.h"
 #include "fault-injection.h"
 #include "dump.h"
+#include "img-remote.h"

 static char loc_buf[PAGE_SIZE];

@@ -1497,6 +1498,11 @@  int cr_pre_dump_tasks(pid_t pid)
 	struct pstree_item *item;
 	int ret = -1;

+	if (opts.remote && push_snapshot_id() < 0) {
+		pr_err("Failed to push image namespace.\n");
+		goto err;
+	}
+
 	root_item = alloc_pstree_item();
 	if (!root_item)
 		goto err;
@@ -1632,6 +1638,11 @@  static int cr_dump_finish(int ret)

 	close_service_fd(CR_PROC_FD_OFF);

+	if (opts.remote && (finish_remote_dump() < 0)) {
+		pr_err("Finish remote dump failed.\n");
+		return post_dump_ret ? : 1;
+	}
+
 	if (ret) {
 		pr_err("Dumping FAILED.\n");
 	} else {
@@ -1652,6 +1663,11 @@  int cr_dump_tasks(pid_t pid)
 	pr_info("Dumping processes (pid: %d)\n", pid);
 	pr_info("========================================\n");

+	if (opts.remote && push_snapshot_id() < 0) {
+		pr_err("Failed to push image namespace.\n");
+		goto err;
+	}
+
 	root_item = alloc_pstree_item();
 	if (!root_item)
 		goto err;
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index dc7bef0..4c1b419 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -30,6 +30,7 @@ 
 #include "cr_options.h"
 #include "servicefd.h"
 #include "image.h"
+#include "img-remote.h"
 #include "util.h"
 #include "util-pie.h"
 #include "criu-log.h"
@@ -1954,6 +1955,11 @@  int cr_restore_tasks(void)
 		goto err;

 	ret = restore_root_task(root_item);
+
+	if (opts.remote && (finish_remote_restore() < 0)) {
+		pr_err("Finish remote restore failed.\n");
+		goto err;
+	}
 err:
 	cr_plugin_fini(CR_PLUGIN_STAGE__RESTORE, ret);
 	return ret;
diff --git a/criu/crtools.c b/criu/crtools.c
index 1b5cbd1..622fc6c 100644
--- a/criu/crtools.c
+++ b/criu/crtools.c
@@ -52,6 +52,7 @@ 

 #include "setproctitle.h"
 #include "sysctl.h"
+#include "img-remote.h"

 struct cr_options opts;

@@ -74,6 +75,8 @@  void init_opts(void)
 	opts.ghost_limit = DEFAULT_GHOST_LIMIT;
 	opts.timeout = DEFAULT_TIMEOUT;
 	opts.empty_ns = 0;
+	opts.addr = DEFAULT_CACHE_HOST;
+	opts.port = DEFAULT_CACHE_PORT;
 }

 static int parse_join_ns(const char *ptr)
@@ -280,6 +283,7 @@  int main(int argc, char *argv[], char *envp[])
 		{ "cgroup-dump-controller",	required_argument,	0, 1082	},
 		{ SK_INFLIGHT_PARAM,		no_argument,		0, 1083	},
 		{ "deprecated",			no_argument,		0, 1084 },
+		{ "remote",			no_argument,		0, 1085 },
 		{ "display-stats",		no_argument,		0, 1086 },
 		{ "weak-sysctls",		no_argument,		0, 1087 },
 		{ },
@@ -594,6 +598,9 @@  int main(int argc, char *argv[], char *envp[])
 			pr_msg("Turn deprecated stuff ON\n");
 			opts.deprecated_ok = true;
 			break;
+		case 1085:
+			opts.remote = true;
+			break;
 		case 1086:
 			opts.display_stats = true;
 			break;
@@ -755,6 +762,12 @@  int main(int argc, char *argv[], char *envp[])
 	if (!strcmp(argv[optind], "page-server"))
 		return cr_page_server(opts.daemon_mode, -1) > 0 ? 0 : 1;

+	if (!strcmp(argv[optind], "image-cache"))
+		return image_cache(opts.daemon_mode, DEFAULT_CACHE_SOCKET, opts.port);
+
+	if (!strcmp(argv[optind], "image-proxy"))
+		return image_proxy(opts.daemon_mode, DEFAULT_PROXY_SOCKET, opts.addr,
opts.port);
+
 	if (!strcmp(argv[optind], "service"))
 		return cr_service(opts.daemon_mode);

@@ -781,6 +794,8 @@  usage:
 "  criu page-server\n"
 "  criu service [<options>]\n"
 "  criu dedup\n"
+"  criu image-cache [<options>]\n"
+"  criu image-proxy [<options>]\n"
 "\n"
 "Commands:\n"
 "  dump           checkpoint a process/tree identified by pid\n"
@@ -793,6 +808,8 @@  usage:
 "  dedup          remove duplicates in memory dump\n"
 "  cpuinfo dump   writes cpu information into image file\n"
 "  cpuinfo check  validates cpu information read from image file\n"
+"  image-cache    launch destination-side cache for images sent from the
source-side\n"
+"  image-proxy    launch source-side proxy to sent images to the
destination-side\n"
 	);

 	if (usage_error) {
@@ -836,6 +853,7 @@  usage:
 "                            macvlan[IFNAME]:OUTNAME\n"
 "                            mnt[COOKIE]:ROOT\n"
 "\n"
+"  --remote              dump/restore images directly to/from remote node
using image-proxy/image-cache\n"
 "* Special resources support:\n"
 "     --" SK_EST_PARAM "  checkpoint/restore established TCP connections\n"
 "     --" SK_INFLIGHT_PARAM "   skip (ignore) in-flight TCP connections\n"
@@ -922,7 +940,7 @@  usage:
 "\n"
 "Page/Service server options:\n"
 "  --address ADDR        address of server or service\n"
-"  --port PORT           port of page server\n"
+"  --port PORT           port of page serve or service\n"
 "  -d|--daemon           run in the background after creating socket\n"
 "\n"
 "Other options:\n"
diff --git a/criu/image.c b/criu/image.c
index 0ad6b89..567c8f1 100644
--- a/criu/image.c
+++ b/criu/image.c
@@ -16,6 +16,7 @@ 
 #include "xmalloc.h"
 #include "images/inventory.pb-c.h"
 #include "images/pagemap.pb-c.h"
+#include "img-remote.h"

 bool ns_per_id = false;
 bool img_common_magic = true;
@@ -312,13 +313,53 @@  static int img_write_magic(struct cr_img *img, int
oflags, int type)
 	return write_img(img, &imgset_template[type].magic);
 }

+int do_open_remote_image(int dfd, char *path, int flags)
+{
+	char *snapshot_id = NULL;
+	int ret;
+
+	/* When using namespaces, the current dir is changed so we need to
+	 * change to previous working dir and back to correctly open the image
+	 * proxy and cache sockets. */
+	int save = dirfd(opendir("."));
+	if (fchdir(get_service_fd(IMG_FD_OFF)) < 0) {
+		pr_debug("fchdir to dfd failed!\n");
+		return -1;
+	}
+
+	snapshot_id = get_snapshot_id_from_idx(dfd);
+
+	if (snapshot_id == NULL)
+		ret = -1;
+	else if (flags == O_RDONLY) {
+		pr_debug("do_open_remote_image RDONLY path=%s snapshot_id=%s\n",
+				  path, snapshot_id);
+		ret = read_remote_image_connection(snapshot_id, path);
+	} else {
+		pr_debug("do_open_remote_image WDONLY path=%s snapshot_id=%s\n",
+				  path, snapshot_id);
+		ret = write_remote_image_connection(snapshot_id, path, O_WRONLY);
+	}
+
+	if (fchdir(save) < 0) {
+		pr_debug("fchdir to save failed!\n");
+		return -1;
+	}
+	close(save);
+
+	return ret;
+}
+
 static int do_open_image(struct cr_img *img, int dfd, int type, unsigned
long oflags, char *path)
 {
 	int ret, flags;

 	flags = oflags & ~(O_NOBUF | O_SERVICE | O_FORCE_LOCAL);

-	ret = openat(dfd, path, flags, CR_FD_PERM);
+	if (opts.remote && !(oflags & O_FORCE_LOCAL))
+		ret = do_open_remote_image(dfd, path, flags);
+	else
+		ret = openat(dfd, path, flags, CR_FD_PERM);
 	if (ret < 0) {
 		if (!(flags & O_CREAT) && (errno == ENOENT || ret == -ENOENT)) {
 			pr_info("No %s image\n", path);
@@ -420,7 +461,9 @@  int open_image_dir(char *dir)
 	close(fd);
 	fd = ret;

-	if (opts.img_parent) {
+	if (opts.remote) {
+		init_snapshot_id(dir);
+	} else if (opts.img_parent) {
 		ret = symlinkat(opts.img_parent, fd, CR_PARENT_LINK);
 		if (ret < 0 && errno != EEXIST) {
 			pr_perror("Can't link parent snapshot");
diff --git a/criu/img-remote.c b/criu/img-remote.c
new file mode 100644
index 0000000..337cb4a
--- /dev/null
+++ b/criu/img-remote.c
@@ -0,0 +1,275 @@ 
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include "xmalloc.h"
+#include "criu-log.h"
+#include "img-remote.h"
+#include "img-remote-proto.h"
+#include "images/remote-image.pb-c.h"
+#include "protobuf-desc.h"
+#include <fcntl.h>
+#include "servicefd.h"
+#include "common/compiler.h"
+#include "cr_options.h"
+
+#define PB_LOCAL_IMAGE_SIZE PATHLEN
+
+static char *snapshot_id;
+bool restoring = true;
+
+LIST_HEAD(snapshot_head);
+
+/* A snapshot is a dump or pre-dump operation. Each snapshot is
identified by an
+ * ID which corresponds to the working directory specefied by the user.
+ */
+struct snapshot {
+	char snapshot_id[PATHLEN];
+	struct list_head l;
+};
+
+struct snapshot *new_snapshot(char *snapshot_id)
+{
+	struct snapshot *s = malloc(sizeof(struct snapshot));
+
+	if (!s) {
+		pr_perror("Failed to allocate snapshot structure");
+		return NULL;
+	}
+	strncpy(s->snapshot_id, snapshot_id, PATHLEN);
+	return s;
+}
+
+void add_snapshot(struct snapshot *snapshot)
+{
+	list_add_tail(&(snapshot->l), &snapshot_head);
+}
+
+int read_remote_image_connection(char *snapshot_id, char *path)
+{
+	int error;
+	int sockfd = setup_UNIX_client_socket(restoring ? DEFAULT_CACHE_SOCKET:
DEFAULT_PROXY_SOCKET);
+
+	if (sockfd < 0) {
+		pr_perror("Error opening local connection for %s:%s", path, snapshot_id);
+		return -1;
+	}
+
+	if (write_header(sockfd, snapshot_id, path, O_RDONLY) < 0) {
+		pr_perror("Error writing header for %s:%s", path, snapshot_id);
+		return -1;
+	}
+
+	if (read_reply_header(sockfd, &error) < 0) {
+		pr_perror("Error reading reply header for %s:%s", path, snapshot_id);
+		return -1;
+	}
+	if (!error || !strncmp(path, RESTORE_FINISH, sizeof(RESTORE_FINISH)))
+		return sockfd;
+	else if (error == ENOENT) {
+		pr_info("Image does not exist (%s:%s)\n", path, snapshot_id);
+		close(sockfd);
+		return -ENOENT;
+	}
+	pr_perror("Unexpected error returned: %d (%s:%s)\n", error, path,
snapshot_id);
+	close(sockfd);
+	return -1;
+}
+
+int write_remote_image_connection(char *snapshot_id, char *path, int flags)
+{
+	int sockfd = setup_UNIX_client_socket(DEFAULT_PROXY_SOCKET);
+
+	if (sockfd < 0)
+		return -1;
+
+	if (write_header(sockfd, snapshot_id, path, flags) < 0) {
+		pr_perror("Error writing header for %s:%s", path, snapshot_id);
+		return -1;
+	}
+	return sockfd;
+}
+
+int finish_remote_dump(void)
+{
+	pr_info("Dump side is calling finish\n");
+	int fd = write_remote_image_connection(NULL_SNAPSHOT_ID, DUMP_FINISH,
O_WRONLY);
+
+	if (fd == -1) {
+		pr_perror("Unable to open finish dump connection");
+		return -1;
+	}
+
+	close(fd);
+	return 0;
+}
+
+int finish_remote_restore(void)
+{
+	pr_info("Restore side is calling finish\n");
+	int fd = read_remote_image_connection(NULL_SNAPSHOT_ID, RESTORE_FINISH);
+
+	if (fd == -1) {
+		pr_perror("Unable to open finish restore connection");
+		return -1;
+	}
+
+	close(fd);
+	return 0;
+}
+
+int skip_remote_bytes(int fd, unsigned long len)
+{
+	static char buf[4096];
+	int n = 0;
+	unsigned long curr = 0;
+
+	for (; curr < len; ) {
+		n = read(fd, buf, min(len - curr, (unsigned long)4096));
+		if (n == 0) {
+			pr_perror("Unexpected end of stream (skipping %lx/%lx bytes)",
+				curr, len);
+			return -1;
+		} else if (n > 0) {
+			curr += n;
+		} else {
+			pr_perror("Error while skipping bytes from stream (%lx/%lx)",
+				curr, len);
+			return -1;
+		}
+	}
+
+	if (curr != len) {
+		pr_perror("Unable to skip the current number of bytes: %lx instead of
%lx",
+			curr, len);
+		return -1;
+	}
+	return 0;
+}
+
+static int pull_snapshot_ids(void)
+{
+	int n, sockfd;
+	SnapshotIdEntry *ls;
+	struct snapshot *s = NULL;
+
+	sockfd = read_remote_image_connection(NULL_SNAPSHOT_ID, PARENT_IMG);
+
+	/* The connection was successful but there is not file. */
+	if (sockfd < 0 && errno == ENOENT)
+		return 0;
+	else if (sockfd < 0) {
+		pr_perror("Unable to open snapshot id read connection");
+		return -1;
+	}
+
+	while (1) {
+		n = pb_read_obj(sockfd, (void **)&ls, PB_SNAPSHOT_ID);
+		if (!n) {
+			close(sockfd);
+			return n;
+		} else if (n < 0) {
+			pr_perror("Unable to read remote snapshot ids");
+			close(sockfd);
+			return n;
+		}
+
+		s = new_snapshot(ls->snapshot_id);
+		if (!s) {
+			pr_perror("Unable create new snapshot structure");
+			close(sockfd);
+			return -1;
+		}
+		add_snapshot(s);
+		pr_info("[read_snapshot ids] parent = %s\n", ls->snapshot_id);
+	}
+	free(ls);
+	close(sockfd);
+	return n;
+}
+
+int push_snapshot_id(void)
+{
+	int n;
+	restoring = false;
+	SnapshotIdEntry rn = SNAPSHOT_ID_ENTRY__INIT;
+	int sockfd = write_remote_image_connection(NULL_SNAPSHOT_ID, PARENT_IMG,
O_APPEND);
+
+	if (sockfd < 0) {
+		pr_perror("Unable to open snapshot id push connection");
+		return -1;
+	}
+
+	rn.snapshot_id = xmalloc(sizeof(char) * PATHLEN);
+	if (!rn.snapshot_id) {
+		pr_perror("Unable to allocate snapshot id buffer");
+		close(sockfd);
+		return -1;
+	}
+	strncpy(rn.snapshot_id, snapshot_id, PATHLEN);
+
+	n = pb_write_obj(sockfd, &rn, PB_SNAPSHOT_ID);
+
+	xfree(rn.snapshot_id);
+	close(sockfd);
+	return n;
+}
+
+void init_snapshot_id(char *si)
+{
+	snapshot_id = si;
+}
+
+char *get_curr_snapshot_id(void)
+{
+	return snapshot_id;
+}
+
+int get_curr_snapshot_id_idx(void)
+{
+	struct snapshot *si;
+	int idx = 0;
+
+	if (list_empty(&snapshot_head))
+		pull_snapshot_ids();
+
+	list_for_each_entry(si, &snapshot_head, l) {
+	if (!strncmp(si->snapshot_id, snapshot_id, PATHLEN))
+			return idx;
+		idx++;
+	}
+
+	pr_perror("Error, could not find current snapshot id (%s) fd",
snapshot_id);
+	return -1;
+}
+
+char *get_snapshot_id_from_idx(int idx)
+{
+	struct snapshot *si;
+
+	if (list_empty(&snapshot_head))
+		pull_snapshot_ids();
+
+	/* Note: if idx is the service fd then we need the current
+	 * snapshot_id idx. Else we need a parent snapshot_id idx.
+	 */
+	if (idx == get_service_fd(IMG_FD_OFF))
+		idx = get_curr_snapshot_id_idx();
+
+	list_for_each_entry(si, &snapshot_head, l) {
+		if (!idx)
+			return si->snapshot_id;
+		idx--;
+	}
+
+	pr_perror("Error, could not find snapshot id for idx %d", idx);
+	return NULL;
+}
+
+int get_curr_parent_snapshot_id_idx(void)
+{
+	return get_curr_snapshot_id_idx() - 1;
+}
diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h
index c2f4b10..16e9243 100644
--- a/criu/include/cr_options.h
+++ b/criu/include/cr_options.h
@@ -115,6 +115,7 @@  struct cr_options {
 	 * to turn one ON while the code is in.
 	 */
 	bool			deprecated_ok;
+	bool			remote;
 	bool			display_stats;
 	bool			weak_sysctls;
 };
diff --git a/criu/include/img-remote.h b/criu/include/img-remote.h
new file mode 100644
index 0000000..6ffc429
--- /dev/null
+++ b/criu/include/img-remote.h
@@ -0,0 +1,83 @@ 
+#include <limits.h>
+#include <stdbool.h>
+
+#ifndef IMAGE_REMOTE_H
+#define	IMAGE_REMOTE_H
+
+#define PATHLEN PATH_MAX
+#define DUMP_FINISH "DUMP_FINISH"
+#define RESTORE_FINISH "RESTORE_FINISH"
+#define PARENT_IMG "parent"
+#define NULL_SNAPSHOT_ID "null"
+#define DEFAULT_CACHE_SOCKET "img-cache.sock"
+#define DEFAULT_PROXY_SOCKET "img-proxy.sock"
+#define DEFAULT_CACHE_PORT 9996
+#define DEFAULT_CACHE_HOST "localhost"
+
+/* Called by restore to get the fd correspondent to a particular path.
This call
+ * will block until the connection is received.
+ */
+int read_remote_image_connection(char *snapshot_id, char *path);
+
+/* Called by dump to create a socket connection to the restore side. The
socket
+ * fd is returned for further writing operations.
+ */
+int write_remote_image_connection(char *snapshot_id, char *path, int flags);
+
+/* Called by dump/restore when everything is dumped/restored. This function
+ * creates a new connection with a special control name. The receiver
side uses
+ * it to ack that no more files are coming.
+ */
+int finish_remote_dump();
+int finish_remote_restore();
+
+/* Starts an image proxy daemon (dump side). It receives image files through
+ * socket connections and forwards them to the image cache (restore side).
+ */
+int image_proxy(bool background, char *local_proxy_path, char
*cache_host, unsigned short cache_port);
+
+/* Starts an image cache daemon (restore side). It receives image files
through
+ * socket connections and caches them until they are requested by the
restore
+ * process.
+ */
+int image_cache(bool background, char *local_cache_path, unsigned short
cache_port);
+
+/* Reads (discards) 'len' bytes from fd. This is used to emulate the
function
+ * lseek, which is used to advance the file needle.
+ */
+int skip_remote_bytes(int fd, unsigned long len);
+
+/* To support iterative migration, the concept of snapshot_id is introduced
+ * (only when remote migration is enabled). Each image is tagged with one
+ * snapshot_id. The snapshot_id is the image directory used for the
operation
+ * that creates the image (either predump or dump). Images stored in memory
+ * (both in Image Proxy and Image Cache) are identified by their name and
+ * snapshot_id. Snapshot_ids are ordered so that we can find parent pagemaps
+ * (that will be used when restoring the process).
+ */
+
+/* Sets the current snapshot_id */
+void init_snapshot_id(char *ns);
+
+/* Returns the current snapshot_id. */
+char *get_curr_snapshot_id();
+
+/* Returns the snapshot_id index representing the current snapshot_id. This
+ * index represents the hierarchy position. For example: images tagged with
+ * the snapshot_id with index 1 are more recent than the images tagged with
+ * the snapshot_id with index 0.
+ */
+int get_curr_snapshot_id_idx();
+
+/* Returns the snapshot_id associated with the snapshot_id index. */
+char *get_snapshot_id_from_idx(int idx);
+
+/* Pushes the current snapshot_id into the snapshot_id hierarchy (into
the Image
+ * Proxy and Image Cache).
+ */
+int push_snapshot_id();
+
+/* Returns the snapshot id index that preceeds the current snapshot_id. */
+int get_curr_parent_snapshot_id_idx();
+
+#endif
diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h
index 6c76b49..43ac534 100644
--- a/criu/include/protobuf-desc.h
+++ b/criu/include/protobuf-desc.h
@@ -59,6 +59,10 @@  enum {
 	PB_BINFMT_MISC,		/* 50 */
 	PB_TTY_DATA,
 	PB_AUTOFS,
+	PB_REMOTE_IMAGE,        /* Header for images sent from proxy to cache.*/
+	PB_LOCAL_IMAGE,         /* Header for reading/writing images from/to
proxy or cache. */
+	PB_LOCAL_IMAGE_REPLY,	/* Header for reading/writing images reply. */
+	PB_SNAPSHOT_ID,         /* Contains a single id. Used for
reading/writing ids from proxy or cache. */

 	/* PB_AUTOGEN_STOP */

diff --git a/criu/page-xfer.c b/criu/page-xfer.c
index 4def13a..60f60ef 100644
--- a/criu/page-xfer.c
+++ b/criu/page-xfer.c
@@ -17,6 +17,7 @@ 
 #include "protobuf.h"
 #include "images/pagemap.pb-c.h"
 #include "fcntl.h"
+#include "img-remote.h"

 static int page_server_sk = -1;

@@ -170,15 +171,17 @@  static int write_pages_loc(struct page_xfer *xfer,
 		int p, unsigned long len)
 {
 	ssize_t ret;
+	ssize_t curr = 0;

-	ret = splice(p, NULL, img_raw_fd(xfer->pi), NULL, len, SPLICE_F_MOVE);
-	if (ret == -1) {
-		pr_perror("Unable to spice data");
-		return -1;
-	}
-	if (ret != len) {
-		pr_err("Only %zu of %lu bytes have been spliced\n", ret, len);
-		return -1;
+	while (1) {
+		ret = splice(p, NULL, img_raw_fd(xfer->pi), NULL, len, SPLICE_F_MOVE);
+		if (ret == -1) {
+			pr_perror("Unable to spice data");
+			return -1;
+		}
+		curr += ret;
+		if (curr == len)
+			break;
 	}

 	return 0;
@@ -288,13 +291,29 @@  static int open_page_local_xfer(struct page_xfer
*xfer, int fd_type, long id)
 		int pfd;
 		int pr_flags = (fd_type == CR_FD_PAGEMAP) ? PR_TASK : PR_SHMEM;

-		pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
-		if (pfd < 0 && errno == ENOENT)
-			goto out;
+
+		if (opts.remote) {
+			/* Note: we are replacing a real directory FD for a snapshot_id
+			 * index. Since we need the parent of the current snapshot_id,
+			 * we want the current snapshot_id index minus one. It is
+			 * possible that dfd is already a snapshot_id index. We test it
+			 * by comparing it to the service FD. When opening an image (see
+			 * do_open_image) we convert the snapshot_id index into a real
+			 * snapshot_id.
+			 */
+			pfd = get_curr_snapshot_id_idx() - 1;
+			if (pfd < 0)
+				goto out;
+		} else {
+			pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
+			if (pfd < 0 && errno == ENOENT)
+				goto out;
+		}

 		xfer->parent = xmalloc(sizeof(*xfer->parent));
 		if (!xfer->parent) {
-			close(pfd);
+			if (!opts.remote)
+				close(pfd);
 			return -1;
 		}

@@ -303,10 +322,12 @@  static int open_page_local_xfer(struct page_xfer
*xfer, int fd_type, long id)
 			pr_perror("No parent image found, though parent directory is set");
 			xfree(xfer->parent);
 			xfer->parent = NULL;
-			close(pfd);
+			if (!opts.remote)
+				close(pfd);
 			goto out;
 		}
-		close(pfd);
+		if (!opts.remote)
+			close(pfd);
 	}

 out:
@@ -411,9 +432,16 @@  int check_parent_local_xfer(int fd_type, int id)
 	struct stat st;
 	int ret, pfd;

-	pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
-	if (pfd < 0 && errno == ENOENT)
-		return 0;
+	if (opts.remote) {
+		pfd = get_curr_parent_snapshot_id_idx();
+		pr_err("Unable to get parent snapsgot id");
+		if (pfd == -1)
+			return -1;
+	} else {
+		pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
+		if (pfd < 0 && errno == ENOENT)
+			return 0;
+	}

 	snprintf(path, sizeof(path), imgset_template[fd_type].fmt, id);
 	ret = fstatat(pfd, path, &st, 0);
@@ -475,6 +503,8 @@  int check_parent_page_xfer(int fd_type, long id)
 {
 	if (opts.use_page_server)
 		return check_parent_server_xfer(fd_type, id);
+	else if (opts.remote)
+		return get_curr_parent_snapshot_id_idx() == -1 ? 0 : 1;
 	else
 		return check_parent_local_xfer(fd_type, id);
 }
diff --git a/criu/pagemap.c b/criu/pagemap.c
index 8e854c9..201f985 100644
--- a/criu/pagemap.c
+++ b/criu/pagemap.c
@@ -14,6 +14,7 @@ 
 #include "xmalloc.h"
 #include "protobuf.h"
 #include "images/pagemap.pb-c.h"
+#include "img-remote.h"

 #ifndef SEEK_DATA
 #define SEEK_DATA	3
@@ -136,8 +137,12 @@  static void skip_pagemap_pages(struct page_read *pr,
unsigned long len)
 	if (!len)
 		return;

-	if (!pr->pe->in_parent)
+	if (!pr->pe->in_parent) {
+		if (opts.remote)
+			if (skip_remote_bytes(img_raw_fd(pr->pi), len))
+				pr_perror("Error skipping remote bytes");
 		pr->pi_off += len;
+	}
 	pr->cvaddr += len;
 }

@@ -155,7 +160,7 @@  static int seek_pagemap(struct page_read *pr, unsigned
long vaddr)
 			break;

 		if (vaddr >= start && vaddr < end) {
-			skip_pagemap_pages(pr, start - pr->cvaddr);
+			skip_pagemap_pages(pr, start > pr->cvaddr ? start - pr->cvaddr : 0);
 			return 1;
 		}

@@ -171,7 +176,7 @@  adv:
 static int seek_pagemap_page(struct page_read *pr, unsigned long vaddr)
 {
 	if (seek_pagemap(pr, vaddr)) {
-		skip_pagemap_pages(pr, vaddr - pr->cvaddr);
+		skip_pagemap_pages(pr, vaddr > pr->cvaddr ? vaddr - pr->cvaddr : 0);
 		return 1;
 	}

@@ -248,6 +253,7 @@  static int read_local_page(struct page_read *pr,
unsigned long vaddr,
 {
 	int fd = img_raw_fd(pr->pi);
 	int ret;
+	size_t curr = 0;

 	/*
 	 * Flush any pending async requests if any not to break the
@@ -257,10 +263,15 @@  static int read_local_page(struct page_read *pr,
unsigned long vaddr,
 		return -1;

 	pr_debug("\tpr%u Read page from self %lx/%"PRIx64"\n", pr->id,
pr->cvaddr, pr->pi_off);
-	ret = pread(fd, buf, len, pr->pi_off);
-	if (ret != len) {
-		pr_perror("Can't read mapping page %d", ret);
-		return -1;
+	while (1) {
+		ret = read(fd, buf + curr, len - curr);
+		if (ret < 1) {
+			pr_perror("Can't read mapping page %d", ret);
+			return -1;
+		}
+		curr += ret;
+		if (curr == len)
+			break;
 	}

 	if (opts.auto_dedup) {
@@ -358,7 +369,7 @@  static int maybe_read_page(struct page_read *pr,
unsigned long vaddr,
 	int ret;
 	unsigned long len = nr * PAGE_SIZE;

-	if (flags & PR_ASYNC)
+	if (flags & PR_ASYNC && !opts.remote)
 		ret = enqueue_async_page(pr, vaddr, len, buf);
 	else
 		ret = read_local_page(pr, vaddr, len, buf);
@@ -459,9 +470,24 @@  static int try_open_parent(int dfd, int pid, struct
page_read *pr, int pr_flags)
 	int pfd, ret;
 	struct page_read *parent = NULL;

-	pfd = openat(dfd, CR_PARENT_LINK, O_RDONLY);
-	if (pfd < 0 && errno == ENOENT)
-		goto out;
+	if (opts.remote) {
+		/* Note: we are replacing a real directory FD for a snapshot_id
+		 * index. Since we need the parent of the current snapshot_id,
+		 * we want the current snapshot_id index minus one. It is
+		 * possible that dfd is already a snapshot_id index. We test it
+		 * by comparing it to the service FD. When opening an image (see
+		 * do_open_image) we convert the snapshot_id index into a real
+		 * snapshot_id.
+		 */
+		pfd = dfd == get_service_fd(IMG_FD_OFF) ?
+			get_curr_snapshot_id_idx() - 1 : dfd - 1;
+		if (pfd < 0)
+			goto out;
+	} else {
+		pfd = openat(dfd, CR_PARENT_LINK, O_RDONLY);
+		if (pfd < 0 && errno == ENOENT)
+			goto out;
+	}

 	parent = xmalloc(sizeof(*parent));
 	if (!parent)
@@ -476,7 +502,8 @@  static int try_open_parent(int dfd, int pid, struct
page_read *pr, int pr_flags)
 		parent = NULL;
 	}

-	close(pfd);
+	if (!opts.remote)
+		close(pfd);
 out:
 	pr->parent = parent;
 	return 0;
@@ -484,7 +511,8 @@  out:
 err_free:
 	xfree(parent);
 err_cl:
-	close(pfd);
+	if (!opts.remote)
+		close(pfd);
 	return -1;
 }

@@ -501,7 +529,18 @@  static int init_pagemaps(struct page_read *pr)
 	off_t fsize;
 	int nr_pmes, nr_realloc;

-	fsize = img_raw_size(pr->pmi);
+	if (!opts.remote)
+		fsize = img_raw_size(pr->pmi);
+	else
+		/*
+		 * FIXME - There is no easy way to estimate the size of the
+		 * pagemap that is still to be read from the socket. Possible
+		 * solution is to ask Image Proxy or Image Cache about the size
+		 * of the image. 1024 is a wild guess (more space is allocated
+		 * if needed).
+		 */
+		fsize = 1024;
+
 	if (fsize < 0)
 		return -1;

diff --git a/criu/protobuf-desc.c b/criu/protobuf-desc.c
index 41c2080..bfe00c5 100644
--- a/criu/protobuf-desc.c
+++ b/criu/protobuf-desc.c
@@ -62,6 +62,7 @@ 
 #include "images/seccomp.pb-c.h"
 #include "images/binfmt-misc.pb-c.h"
 #include "images/autofs.pb-c.h"
+#include "images/remote-image.pb-c.h"

 struct cr_pb_message_desc cr_pb_descs[PB_MAX];

diff --git a/criu/util.c b/criu/util.c
index 4b0bcfd..d826732 100644
--- a/criu/util.c
+++ b/criu/util.c
@@ -516,29 +516,46 @@  int copy_file(int fd_in, int fd_out, size_t bytes)
 {
 	ssize_t written = 0;
 	size_t chunk = bytes ? bytes : 4096;
+	char *buffer = (char*) malloc(chunk);
+	ssize_t ret;

 	while (1) {
-		ssize_t ret;
-
-		ret = sendfile(fd_out, fd_in, NULL, chunk);
+		if (opts.remote) {
+			ret = read(fd_in, buffer, chunk);
+			if (ret < 0) {
+				pr_perror("Can't read from fd_in\n");
+				ret = -1;
+				goto err;
+			}
+			if (write(fd_out, buffer, ret) != ret) {
+				pr_perror("Couldn't write all read bytes\n");
+				ret = -1;
+				goto err;
+			}
+		}
+		else
+                        ret = sendfile(fd_out, fd_in, NULL, chunk);
 		if (ret < 0) {
 			pr_perror("Can't send data to ghost file");
-			return -1;
+			ret = -1;
+			goto err;
 		}

 		if (ret == 0) {
 			if (bytes && (written != bytes)) {
 				pr_err("Ghost file size mismatch %zu/%zu\n",
 						written, bytes);
-				return -1;
+				ret = -1;
+				goto err;
 			}
 			break;
 		}

 		written += ret;
 	}
-
-	return 0;
+err:
+	free(buffer);
+	return ret;
 }

 int read_fd_link(int lfd, char *buf, size_t size)
diff --git a/images/Makefile b/images/Makefile
index eb18526..5b60948 100644
--- a/images/Makefile
+++ b/images/Makefile
@@ -61,6 +61,7 @@  proto-obj-y	+= time.o
 proto-obj-y	+= sysctl.o
 proto-obj-y	+= autofs.o
 proto-obj-y	+= macvlan.o
+proto-obj-y	+= remote-image.o

 CFLAGS		+= -iquote $(obj)/

diff --git a/images/remote-image.proto b/images/remote-image.proto
new file mode 100644
index 0000000..1212627
--- /dev/null
+++ b/images/remote-image.proto
@@ -0,0 +1,20 @@ 
+message local_image_entry {
+	required string name		= 1;
+	required string snapshot_id	= 2;
+	required uint32 open_mode	= 3;
+}
+
+message remote_image_entry {
+	required string name		= 1;
+	required string snapshot_id	= 2;
+	required uint32 open_mode	= 3;
+	required uint64 size		= 4;
+}
+
+message local_image_reply_entry {
+	required uint32 error           = 1;
+}
+
+message snapshot_id_entry {
+	required string snapshot_id	= 1;
+}

Comments

Pavel Emelianov Feb. 14, 2017, 10:26 a.m.
OK, all three patches are

Acked-by: Pavel Emelyanov <xemul@virtuozzo.com>

Andrey, plz, merge them into criu-dev.


Rodrigo, here's the list of what should be done with the cache/proxy
in order to have them merged into master.

0. Document the whole thing :)
   Please, add articles for newly introduced actions and options to
   https://criu.org/CLI page.
   Also, it would be good to have an article describing the protocols
   involved.

1. Make the unix sockets reside in work-dir.
   The good thing is that we've get rid of the socket name option :)
   But looking at do_open_remote_image() I see that it fchdir-s to
   image dir before connecting to proxy/cache. Better solution is to
   put the socket into workdir.

   1a. After this the option -D|--images-dir should become optional.
       Provided the --remote is given CRIU should work purely on the
       work-dir and not generate anything in the images-dir.

2. Tune up the image_cache and image_proxy commands to accept the
   --status-fd and --pidfile options.
   Presumably the very cr_daemon() call should be equipped with 
   everything that should be done for daemonizing and proxy/cache
   tasks should just call it :)

3. Fix local connections not to generate per-image threads. There
   can be many images and it's not nice to stress the system with
   such amount of threads. Please, look at how criu/uffd.c manages
   multiple descriptors with page-faults using the epoll stuff.

   3a. The accept_remote_image_connections() seem not to work well
       with opts.ps_socket scenario as the former just calls accept()
       on whatever socket is passed there, while the opts.ps_socket
       is already an established socket for data transfer.

4. No strings in protocol. Now the hard-coded "RESTORE_FINISH" string
   (and DUMP_FINISHED one) is used to terminate the communication.
   Need to tune up the protobuf objects to send boolean (or integer)
   EOF sign rather that the string.

5. Check how proxy/cache works with incremental dumps. Looking at the
   skip_remote_bytes() I think that image-cache and -proxy still do not
   work well with stacked pages images. Probably for those we'll need
   the page-server or lazy-pages -like protocol that would request the
   needed regions and receive it back rather than read bytes from
   sockets simply to skip those.

6. Add support for cache/proxy into go-phaul code. I haven't yet finished
   with the prototype, but plan to do it soon, so once the above steps
   are done we'll be able to proceed with this one.

-- Pavel

On 02/11/2017 06:34 AM, rbruno@gsd.inesc-id.pt wrote:
>  criu/Makefile.crtools        |   4 +
>  criu/cr-dump.c               |  16 +++
>  criu/cr-restore.c            |   6 +
>  criu/crtools.c               |  20 +++-
>  criu/image.c                 |  47 +++++++-
>  criu/img-remote.c            | 275
> +++++++++++++++++++++++++++++++++++++++++++
>  criu/include/cr_options.h    |   1 +
>  criu/include/img-remote.h    |  83 +++++++++++++
>  criu/include/protobuf-desc.h |   4 +
>  criu/page-xfer.c             |  64 +++++++---
>  criu/pagemap.c               |  67 ++++++++---
>  criu/protobuf-desc.c         |   1 +
>  criu/util.c                  |  31 +++--
>  images/Makefile              |   1 +
>  images/remote-image.proto    |  20 ++++
>  15 files changed, 599 insertions(+), 41 deletions(-)
> 
>     This patch introduces the --remote option and the necessary code
> changes to
>     support it. This leaves user the option to decide if the checkpoint
> data is to
>     be stored on disk or sent through the network (through the image-proxy).
>     The latter forwards the data to the destination node where image-cache
>     receives it.
> 
>     The overall communication is performed as follows:
>     src_node CRIU dump -> (sends images through UNIX sockets) ->      
> image-proxy
>                                                                                |
>                                                                                V
>     dst_node: CRIU restore <- (receives images through UNIX sockets)<-
> image-cache
> 
>     Communication between image-proxy and image-cache is done through a
> single
>     TCP connection.
> 
>     Running criu with --remote option is like this:
> 
>     dst_node# criu image-cache -d --port <port> -o /tmp/image-cache.log
>     --local-cache-path <local_cache_path> ...
>     dst_node# criu restore --remote -o /tmp/image-cache.log
>     --local-cache-path <local_cache_path> ...
>     src_node# criu image-proxy -d --port <port> --address <dst_node> -o
> /tmp/image-proxy.log
>     --local-proxy-path <local_proxy_path> ...
>     src_node# criu dump -t <pid> --remote -o /tmp/dump.log
>     --local-proxy-path <local_proxy_path> ...
> 
>     Signed-off-by: Rodrigo Bruno <rbruno@gsd.inesc-id.pt>
>     Signed-off-by: Katerina Koukiou <k.koukiou@gmail.com>
> 
> diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools
> index e740798..4d21199 100644
> --- a/criu/Makefile.crtools
> +++ b/criu/Makefile.crtools
> @@ -26,6 +26,10 @@ obj-y			+= files-reg.o
>  obj-y			+= fsnotify.o
>  obj-y			+= image-desc.o
>  obj-y			+= image.o
> +obj-y			+= img-remote.o
> +obj-y			+= img-proxy.o
> +obj-y			+= img-cache.o
> +obj-y			+= img-remote-proto.o
>  obj-y			+= ipc_ns.o
>  obj-y			+= irmap.o
>  obj-y			+= kcmp-ids.o
> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
> index 5e708a3..f7a4efb 100644
> --- a/criu/cr-dump.c
> +++ b/criu/cr-dump.c
> @@ -83,6 +83,7 @@
>  #include "seize.h"
>  #include "fault-injection.h"
>  #include "dump.h"
> +#include "img-remote.h"
> 
>  static char loc_buf[PAGE_SIZE];
> 
> @@ -1497,6 +1498,11 @@ int cr_pre_dump_tasks(pid_t pid)
>  	struct pstree_item *item;
>  	int ret = -1;
> 
> +	if (opts.remote && push_snapshot_id() < 0) {
> +		pr_err("Failed to push image namespace.\n");
> +		goto err;
> +	}
> +
>  	root_item = alloc_pstree_item();
>  	if (!root_item)
>  		goto err;
> @@ -1632,6 +1638,11 @@ static int cr_dump_finish(int ret)
> 
>  	close_service_fd(CR_PROC_FD_OFF);
> 
> +	if (opts.remote && (finish_remote_dump() < 0)) {
> +		pr_err("Finish remote dump failed.\n");
> +		return post_dump_ret ? : 1;
> +	}
> +
>  	if (ret) {
>  		pr_err("Dumping FAILED.\n");
>  	} else {
> @@ -1652,6 +1663,11 @@ int cr_dump_tasks(pid_t pid)
>  	pr_info("Dumping processes (pid: %d)\n", pid);
>  	pr_info("========================================\n");
> 
> +	if (opts.remote && push_snapshot_id() < 0) {
> +		pr_err("Failed to push image namespace.\n");
> +		goto err;
> +	}
> +
>  	root_item = alloc_pstree_item();
>  	if (!root_item)
>  		goto err;
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index dc7bef0..4c1b419 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -30,6 +30,7 @@
>  #include "cr_options.h"
>  #include "servicefd.h"
>  #include "image.h"
> +#include "img-remote.h"
>  #include "util.h"
>  #include "util-pie.h"
>  #include "criu-log.h"
> @@ -1954,6 +1955,11 @@ int cr_restore_tasks(void)
>  		goto err;
> 
>  	ret = restore_root_task(root_item);
> +
> +	if (opts.remote && (finish_remote_restore() < 0)) {
> +		pr_err("Finish remote restore failed.\n");
> +		goto err;
> +	}
>  err:
>  	cr_plugin_fini(CR_PLUGIN_STAGE__RESTORE, ret);
>  	return ret;
> diff --git a/criu/crtools.c b/criu/crtools.c
> index 1b5cbd1..622fc6c 100644
> --- a/criu/crtools.c
> +++ b/criu/crtools.c
> @@ -52,6 +52,7 @@
> 
>  #include "setproctitle.h"
>  #include "sysctl.h"
> +#include "img-remote.h"
> 
>  struct cr_options opts;
> 
> @@ -74,6 +75,8 @@ void init_opts(void)
>  	opts.ghost_limit = DEFAULT_GHOST_LIMIT;
>  	opts.timeout = DEFAULT_TIMEOUT;
>  	opts.empty_ns = 0;
> +	opts.addr = DEFAULT_CACHE_HOST;
> +	opts.port = DEFAULT_CACHE_PORT;
>  }
> 
>  static int parse_join_ns(const char *ptr)
> @@ -280,6 +283,7 @@ int main(int argc, char *argv[], char *envp[])
>  		{ "cgroup-dump-controller",	required_argument,	0, 1082	},
>  		{ SK_INFLIGHT_PARAM,		no_argument,		0, 1083	},
>  		{ "deprecated",			no_argument,		0, 1084 },
> +		{ "remote",			no_argument,		0, 1085 },
>  		{ "display-stats",		no_argument,		0, 1086 },
>  		{ "weak-sysctls",		no_argument,		0, 1087 },
>  		{ },
> @@ -594,6 +598,9 @@ int main(int argc, char *argv[], char *envp[])
>  			pr_msg("Turn deprecated stuff ON\n");
>  			opts.deprecated_ok = true;
>  			break;
> +		case 1085:
> +			opts.remote = true;
> +			break;
>  		case 1086:
>  			opts.display_stats = true;
>  			break;
> @@ -755,6 +762,12 @@ int main(int argc, char *argv[], char *envp[])
>  	if (!strcmp(argv[optind], "page-server"))
>  		return cr_page_server(opts.daemon_mode, -1) > 0 ? 0 : 1;
> 
> +	if (!strcmp(argv[optind], "image-cache"))
> +		return image_cache(opts.daemon_mode, DEFAULT_CACHE_SOCKET, opts.port);
> +
> +	if (!strcmp(argv[optind], "image-proxy"))
> +		return image_proxy(opts.daemon_mode, DEFAULT_PROXY_SOCKET, opts.addr,
> opts.port);
> +
>  	if (!strcmp(argv[optind], "service"))
>  		return cr_service(opts.daemon_mode);
> 
> @@ -781,6 +794,8 @@ usage:
>  "  criu page-server\n"
>  "  criu service [<options>]\n"
>  "  criu dedup\n"
> +"  criu image-cache [<options>]\n"
> +"  criu image-proxy [<options>]\n"
>  "\n"
>  "Commands:\n"
>  "  dump           checkpoint a process/tree identified by pid\n"
> @@ -793,6 +808,8 @@ usage:
>  "  dedup          remove duplicates in memory dump\n"
>  "  cpuinfo dump   writes cpu information into image file\n"
>  "  cpuinfo check  validates cpu information read from image file\n"
> +"  image-cache    launch destination-side cache for images sent from the
> source-side\n"
> +"  image-proxy    launch source-side proxy to sent images to the
> destination-side\n"
>  	);
> 
>  	if (usage_error) {
> @@ -836,6 +853,7 @@ usage:
>  "                            macvlan[IFNAME]:OUTNAME\n"
>  "                            mnt[COOKIE]:ROOT\n"
>  "\n"
> +"  --remote              dump/restore images directly to/from remote node
> using image-proxy/image-cache\n"
>  "* Special resources support:\n"
>  "     --" SK_EST_PARAM "  checkpoint/restore established TCP connections\n"
>  "     --" SK_INFLIGHT_PARAM "   skip (ignore) in-flight TCP connections\n"
> @@ -922,7 +940,7 @@ usage:
>  "\n"
>  "Page/Service server options:\n"
>  "  --address ADDR        address of server or service\n"
> -"  --port PORT           port of page server\n"
> +"  --port PORT           port of page serve or service\n"
>  "  -d|--daemon           run in the background after creating socket\n"
>  "\n"
>  "Other options:\n"
> diff --git a/criu/image.c b/criu/image.c
> index 0ad6b89..567c8f1 100644
> --- a/criu/image.c
> +++ b/criu/image.c
> @@ -16,6 +16,7 @@
>  #include "xmalloc.h"
>  #include "images/inventory.pb-c.h"
>  #include "images/pagemap.pb-c.h"
> +#include "img-remote.h"
> 
>  bool ns_per_id = false;
>  bool img_common_magic = true;
> @@ -312,13 +313,53 @@ static int img_write_magic(struct cr_img *img, int
> oflags, int type)
>  	return write_img(img, &imgset_template[type].magic);
>  }
> 
> +int do_open_remote_image(int dfd, char *path, int flags)
> +{
> +	char *snapshot_id = NULL;
> +	int ret;
> +
> +	/* When using namespaces, the current dir is changed so we need to
> +	 * change to previous working dir and back to correctly open the image
> +	 * proxy and cache sockets. */
> +	int save = dirfd(opendir("."));
> +	if (fchdir(get_service_fd(IMG_FD_OFF)) < 0) {
> +		pr_debug("fchdir to dfd failed!\n");
> +		return -1;
> +	}
> +
> +	snapshot_id = get_snapshot_id_from_idx(dfd);
> +
> +	if (snapshot_id == NULL)
> +		ret = -1;
> +	else if (flags == O_RDONLY) {
> +		pr_debug("do_open_remote_image RDONLY path=%s snapshot_id=%s\n",
> +				  path, snapshot_id);
> +		ret = read_remote_image_connection(snapshot_id, path);
> +	} else {
> +		pr_debug("do_open_remote_image WDONLY path=%s snapshot_id=%s\n",
> +				  path, snapshot_id);
> +		ret = write_remote_image_connection(snapshot_id, path, O_WRONLY);
> +	}
> +
> +	if (fchdir(save) < 0) {
> +		pr_debug("fchdir to save failed!\n");
> +		return -1;
> +	}
> +	close(save);
> +
> +	return ret;
> +}
> +
>  static int do_open_image(struct cr_img *img, int dfd, int type, unsigned
> long oflags, char *path)
>  {
>  	int ret, flags;
> 
>  	flags = oflags & ~(O_NOBUF | O_SERVICE | O_FORCE_LOCAL);
> 
> -	ret = openat(dfd, path, flags, CR_FD_PERM);
> +	if (opts.remote && !(oflags & O_FORCE_LOCAL))
> +		ret = do_open_remote_image(dfd, path, flags);
> +	else
> +		ret = openat(dfd, path, flags, CR_FD_PERM);
>  	if (ret < 0) {
>  		if (!(flags & O_CREAT) && (errno == ENOENT || ret == -ENOENT)) {
>  			pr_info("No %s image\n", path);
> @@ -420,7 +461,9 @@ int open_image_dir(char *dir)
>  	close(fd);
>  	fd = ret;
> 
> -	if (opts.img_parent) {
> +	if (opts.remote) {
> +		init_snapshot_id(dir);
> +	} else if (opts.img_parent) {
>  		ret = symlinkat(opts.img_parent, fd, CR_PARENT_LINK);
>  		if (ret < 0 && errno != EEXIST) {
>  			pr_perror("Can't link parent snapshot");
> diff --git a/criu/img-remote.c b/criu/img-remote.c
> new file mode 100644
> index 0000000..337cb4a
> --- /dev/null
> +++ b/criu/img-remote.c
> @@ -0,0 +1,275 @@
> +#include <unistd.h>
> +#include <stdlib.h>
> +#include <stdio.h>
> +#include <sys/types.h>
> +#include <sys/socket.h>
> +#include <netinet/in.h>
> +#include <netdb.h>
> +#include "xmalloc.h"
> +#include "criu-log.h"
> +#include "img-remote.h"
> +#include "img-remote-proto.h"
> +#include "images/remote-image.pb-c.h"
> +#include "protobuf-desc.h"
> +#include <fcntl.h>
> +#include "servicefd.h"
> +#include "common/compiler.h"
> +#include "cr_options.h"
> +
> +#define PB_LOCAL_IMAGE_SIZE PATHLEN
> +
> +static char *snapshot_id;
> +bool restoring = true;
> +
> +LIST_HEAD(snapshot_head);
> +
> +/* A snapshot is a dump or pre-dump operation. Each snapshot is
> identified by an
> + * ID which corresponds to the working directory specefied by the user.
> + */
> +struct snapshot {
> +	char snapshot_id[PATHLEN];
> +	struct list_head l;
> +};
> +
> +struct snapshot *new_snapshot(char *snapshot_id)
> +{
> +	struct snapshot *s = malloc(sizeof(struct snapshot));
> +
> +	if (!s) {
> +		pr_perror("Failed to allocate snapshot structure");
> +		return NULL;
> +	}
> +	strncpy(s->snapshot_id, snapshot_id, PATHLEN);
> +	return s;
> +}
> +
> +void add_snapshot(struct snapshot *snapshot)
> +{
> +	list_add_tail(&(snapshot->l), &snapshot_head);
> +}
> +
> +int read_remote_image_connection(char *snapshot_id, char *path)
> +{
> +	int error;
> +	int sockfd = setup_UNIX_client_socket(restoring ? DEFAULT_CACHE_SOCKET:
> DEFAULT_PROXY_SOCKET);
> +
> +	if (sockfd < 0) {
> +		pr_perror("Error opening local connection for %s:%s", path, snapshot_id);
> +		return -1;
> +	}
> +
> +	if (write_header(sockfd, snapshot_id, path, O_RDONLY) < 0) {
> +		pr_perror("Error writing header for %s:%s", path, snapshot_id);
> +		return -1;
> +	}
> +
> +	if (read_reply_header(sockfd, &error) < 0) {
> +		pr_perror("Error reading reply header for %s:%s", path, snapshot_id);
> +		return -1;
> +	}
> +	if (!error || !strncmp(path, RESTORE_FINISH, sizeof(RESTORE_FINISH)))
> +		return sockfd;
> +	else if (error == ENOENT) {
> +		pr_info("Image does not exist (%s:%s)\n", path, snapshot_id);
> +		close(sockfd);
> +		return -ENOENT;
> +	}
> +	pr_perror("Unexpected error returned: %d (%s:%s)\n", error, path,
> snapshot_id);
> +	close(sockfd);
> +	return -1;
> +}
> +
> +int write_remote_image_connection(char *snapshot_id, char *path, int flags)
> +{
> +	int sockfd = setup_UNIX_client_socket(DEFAULT_PROXY_SOCKET);
> +
> +	if (sockfd < 0)
> +		return -1;
> +
> +	if (write_header(sockfd, snapshot_id, path, flags) < 0) {
> +		pr_perror("Error writing header for %s:%s", path, snapshot_id);
> +		return -1;
> +	}
> +	return sockfd;
> +}
> +
> +int finish_remote_dump(void)
> +{
> +	pr_info("Dump side is calling finish\n");
> +	int fd = write_remote_image_connection(NULL_SNAPSHOT_ID, DUMP_FINISH,
> O_WRONLY);
> +
> +	if (fd == -1) {
> +		pr_perror("Unable to open finish dump connection");
> +		return -1;
> +	}
> +
> +	close(fd);
> +	return 0;
> +}
> +
> +int finish_remote_restore(void)
> +{
> +	pr_info("Restore side is calling finish\n");
> +	int fd = read_remote_image_connection(NULL_SNAPSHOT_ID, RESTORE_FINISH);
> +
> +	if (fd == -1) {
> +		pr_perror("Unable to open finish restore connection");
> +		return -1;
> +	}
> +
> +	close(fd);
> +	return 0;
> +}
> +
> +int skip_remote_bytes(int fd, unsigned long len)
> +{
> +	static char buf[4096];
> +	int n = 0;
> +	unsigned long curr = 0;
> +
> +	for (; curr < len; ) {
> +		n = read(fd, buf, min(len - curr, (unsigned long)4096));
> +		if (n == 0) {
> +			pr_perror("Unexpected end of stream (skipping %lx/%lx bytes)",
> +				curr, len);
> +			return -1;
> +		} else if (n > 0) {
> +			curr += n;
> +		} else {
> +			pr_perror("Error while skipping bytes from stream (%lx/%lx)",
> +				curr, len);
> +			return -1;
> +		}
> +	}
> +
> +	if (curr != len) {
> +		pr_perror("Unable to skip the current number of bytes: %lx instead of
> %lx",
> +			curr, len);
> +		return -1;
> +	}
> +	return 0;
> +}
> +
> +static int pull_snapshot_ids(void)
> +{
> +	int n, sockfd;
> +	SnapshotIdEntry *ls;
> +	struct snapshot *s = NULL;
> +
> +	sockfd = read_remote_image_connection(NULL_SNAPSHOT_ID, PARENT_IMG);
> +
> +	/* The connection was successful but there is not file. */
> +	if (sockfd < 0 && errno == ENOENT)
> +		return 0;
> +	else if (sockfd < 0) {
> +		pr_perror("Unable to open snapshot id read connection");
> +		return -1;
> +	}
> +
> +	while (1) {
> +		n = pb_read_obj(sockfd, (void **)&ls, PB_SNAPSHOT_ID);
> +		if (!n) {
> +			close(sockfd);
> +			return n;
> +		} else if (n < 0) {
> +			pr_perror("Unable to read remote snapshot ids");
> +			close(sockfd);
> +			return n;
> +		}
> +
> +		s = new_snapshot(ls->snapshot_id);
> +		if (!s) {
> +			pr_perror("Unable create new snapshot structure");
> +			close(sockfd);
> +			return -1;
> +		}
> +		add_snapshot(s);
> +		pr_info("[read_snapshot ids] parent = %s\n", ls->snapshot_id);
> +	}
> +	free(ls);
> +	close(sockfd);
> +	return n;
> +}
> +
> +int push_snapshot_id(void)
> +{
> +	int n;
> +	restoring = false;
> +	SnapshotIdEntry rn = SNAPSHOT_ID_ENTRY__INIT;
> +	int sockfd = write_remote_image_connection(NULL_SNAPSHOT_ID, PARENT_IMG,
> O_APPEND);
> +
> +	if (sockfd < 0) {
> +		pr_perror("Unable to open snapshot id push connection");
> +		return -1;
> +	}
> +
> +	rn.snapshot_id = xmalloc(sizeof(char) * PATHLEN);
> +	if (!rn.snapshot_id) {
> +		pr_perror("Unable to allocate snapshot id buffer");
> +		close(sockfd);
> +		return -1;
> +	}
> +	strncpy(rn.snapshot_id, snapshot_id, PATHLEN);
> +
> +	n = pb_write_obj(sockfd, &rn, PB_SNAPSHOT_ID);
> +
> +	xfree(rn.snapshot_id);
> +	close(sockfd);
> +	return n;
> +}
> +
> +void init_snapshot_id(char *si)
> +{
> +	snapshot_id = si;
> +}
> +
> +char *get_curr_snapshot_id(void)
> +{
> +	return snapshot_id;
> +}
> +
> +int get_curr_snapshot_id_idx(void)
> +{
> +	struct snapshot *si;
> +	int idx = 0;
> +
> +	if (list_empty(&snapshot_head))
> +		pull_snapshot_ids();
> +
> +	list_for_each_entry(si, &snapshot_head, l) {
> +	if (!strncmp(si->snapshot_id, snapshot_id, PATHLEN))
> +			return idx;
> +		idx++;
> +	}
> +
> +	pr_perror("Error, could not find current snapshot id (%s) fd",
> snapshot_id);
> +	return -1;
> +}
> +
> +char *get_snapshot_id_from_idx(int idx)
> +{
> +	struct snapshot *si;
> +
> +	if (list_empty(&snapshot_head))
> +		pull_snapshot_ids();
> +
> +	/* Note: if idx is the service fd then we need the current
> +	 * snapshot_id idx. Else we need a parent snapshot_id idx.
> +	 */
> +	if (idx == get_service_fd(IMG_FD_OFF))
> +		idx = get_curr_snapshot_id_idx();
> +
> +	list_for_each_entry(si, &snapshot_head, l) {
> +		if (!idx)
> +			return si->snapshot_id;
> +		idx--;
> +	}
> +
> +	pr_perror("Error, could not find snapshot id for idx %d", idx);
> +	return NULL;
> +}
> +
> +int get_curr_parent_snapshot_id_idx(void)
> +{
> +	return get_curr_snapshot_id_idx() - 1;
> +}
> diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h
> index c2f4b10..16e9243 100644
> --- a/criu/include/cr_options.h
> +++ b/criu/include/cr_options.h
> @@ -115,6 +115,7 @@ struct cr_options {
>  	 * to turn one ON while the code is in.
>  	 */
>  	bool			deprecated_ok;
> +	bool			remote;
>  	bool			display_stats;
>  	bool			weak_sysctls;
>  };
> diff --git a/criu/include/img-remote.h b/criu/include/img-remote.h
> new file mode 100644
> index 0000000..6ffc429
> --- /dev/null
> +++ b/criu/include/img-remote.h
> @@ -0,0 +1,83 @@
> +#include <limits.h>
> +#include <stdbool.h>
> +
> +#ifndef IMAGE_REMOTE_H
> +#define	IMAGE_REMOTE_H
> +
> +#define PATHLEN PATH_MAX
> +#define DUMP_FINISH "DUMP_FINISH"
> +#define RESTORE_FINISH "RESTORE_FINISH"
> +#define PARENT_IMG "parent"
> +#define NULL_SNAPSHOT_ID "null"
> +#define DEFAULT_CACHE_SOCKET "img-cache.sock"
> +#define DEFAULT_PROXY_SOCKET "img-proxy.sock"
> +#define DEFAULT_CACHE_PORT 9996
> +#define DEFAULT_CACHE_HOST "localhost"
> +
> +/* Called by restore to get the fd correspondent to a particular path.
> This call
> + * will block until the connection is received.
> + */
> +int read_remote_image_connection(char *snapshot_id, char *path);
> +
> +/* Called by dump to create a socket connection to the restore side. The
> socket
> + * fd is returned for further writing operations.
> + */
> +int write_remote_image_connection(char *snapshot_id, char *path, int flags);
> +
> +/* Called by dump/restore when everything is dumped/restored. This function
> + * creates a new connection with a special control name. The receiver
> side uses
> + * it to ack that no more files are coming.
> + */
> +int finish_remote_dump();
> +int finish_remote_restore();
> +
> +/* Starts an image proxy daemon (dump side). It receives image files through
> + * socket connections and forwards them to the image cache (restore side).
> + */
> +int image_proxy(bool background, char *local_proxy_path, char
> *cache_host, unsigned short cache_port);
> +
> +/* Starts an image cache daemon (restore side). It receives image files
> through
> + * socket connections and caches them until they are requested by the
> restore
> + * process.
> + */
> +int image_cache(bool background, char *local_cache_path, unsigned short
> cache_port);
> +
> +/* Reads (discards) 'len' bytes from fd. This is used to emulate the
> function
> + * lseek, which is used to advance the file needle.
> + */
> +int skip_remote_bytes(int fd, unsigned long len);
> +
> +/* To support iterative migration, the concept of snapshot_id is introduced
> + * (only when remote migration is enabled). Each image is tagged with one
> + * snapshot_id. The snapshot_id is the image directory used for the
> operation
> + * that creates the image (either predump or dump). Images stored in memory
> + * (both in Image Proxy and Image Cache) are identified by their name and
> + * snapshot_id. Snapshot_ids are ordered so that we can find parent pagemaps
> + * (that will be used when restoring the process).
> + */
> +
> +/* Sets the current snapshot_id */
> +void init_snapshot_id(char *ns);
> +
> +/* Returns the current snapshot_id. */
> +char *get_curr_snapshot_id();
> +
> +/* Returns the snapshot_id index representing the current snapshot_id. This
> + * index represents the hierarchy position. For example: images tagged with
> + * the snapshot_id with index 1 are more recent than the images tagged with
> + * the snapshot_id with index 0.
> + */
> +int get_curr_snapshot_id_idx();
> +
> +/* Returns the snapshot_id associated with the snapshot_id index. */
> +char *get_snapshot_id_from_idx(int idx);
> +
> +/* Pushes the current snapshot_id into the snapshot_id hierarchy (into
> the Image
> + * Proxy and Image Cache).
> + */
> +int push_snapshot_id();
> +
> +/* Returns the snapshot id index that preceeds the current snapshot_id. */
> +int get_curr_parent_snapshot_id_idx();
> +
> +#endif
> diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h
> index 6c76b49..43ac534 100644
> --- a/criu/include/protobuf-desc.h
> +++ b/criu/include/protobuf-desc.h
> @@ -59,6 +59,10 @@ enum {
>  	PB_BINFMT_MISC,		/* 50 */
>  	PB_TTY_DATA,
>  	PB_AUTOFS,
> +	PB_REMOTE_IMAGE,        /* Header for images sent from proxy to cache.*/
> +	PB_LOCAL_IMAGE,         /* Header for reading/writing images from/to
> proxy or cache. */
> +	PB_LOCAL_IMAGE_REPLY,	/* Header for reading/writing images reply. */
> +	PB_SNAPSHOT_ID,         /* Contains a single id. Used for
> reading/writing ids from proxy or cache. */
> 
>  	/* PB_AUTOGEN_STOP */
> 
> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
> index 4def13a..60f60ef 100644
> --- a/criu/page-xfer.c
> +++ b/criu/page-xfer.c
> @@ -17,6 +17,7 @@
>  #include "protobuf.h"
>  #include "images/pagemap.pb-c.h"
>  #include "fcntl.h"
> +#include "img-remote.h"
> 
>  static int page_server_sk = -1;
> 
> @@ -170,15 +171,17 @@ static int write_pages_loc(struct page_xfer *xfer,
>  		int p, unsigned long len)
>  {
>  	ssize_t ret;
> +	ssize_t curr = 0;
> 
> -	ret = splice(p, NULL, img_raw_fd(xfer->pi), NULL, len, SPLICE_F_MOVE);
> -	if (ret == -1) {
> -		pr_perror("Unable to spice data");
> -		return -1;
> -	}
> -	if (ret != len) {
> -		pr_err("Only %zu of %lu bytes have been spliced\n", ret, len);
> -		return -1;
> +	while (1) {
> +		ret = splice(p, NULL, img_raw_fd(xfer->pi), NULL, len, SPLICE_F_MOVE);
> +		if (ret == -1) {
> +			pr_perror("Unable to spice data");
> +			return -1;
> +		}
> +		curr += ret;
> +		if (curr == len)
> +			break;
>  	}
> 
>  	return 0;
> @@ -288,13 +291,29 @@ static int open_page_local_xfer(struct page_xfer
> *xfer, int fd_type, long id)
>  		int pfd;
>  		int pr_flags = (fd_type == CR_FD_PAGEMAP) ? PR_TASK : PR_SHMEM;
> 
> -		pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
> -		if (pfd < 0 && errno == ENOENT)
> -			goto out;
> +
> +		if (opts.remote) {
> +			/* Note: we are replacing a real directory FD for a snapshot_id
> +			 * index. Since we need the parent of the current snapshot_id,
> +			 * we want the current snapshot_id index minus one. It is
> +			 * possible that dfd is already a snapshot_id index. We test it
> +			 * by comparing it to the service FD. When opening an image (see
> +			 * do_open_image) we convert the snapshot_id index into a real
> +			 * snapshot_id.
> +			 */
> +			pfd = get_curr_snapshot_id_idx() - 1;
> +			if (pfd < 0)
> +				goto out;
> +		} else {
> +			pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
> +			if (pfd < 0 && errno == ENOENT)
> +				goto out;
> +		}
> 
>  		xfer->parent = xmalloc(sizeof(*xfer->parent));
>  		if (!xfer->parent) {
> -			close(pfd);
> +			if (!opts.remote)
> +				close(pfd);
>  			return -1;
>  		}
> 
> @@ -303,10 +322,12 @@ static int open_page_local_xfer(struct page_xfer
> *xfer, int fd_type, long id)
>  			pr_perror("No parent image found, though parent directory is set");
>  			xfree(xfer->parent);
>  			xfer->parent = NULL;
> -			close(pfd);
> +			if (!opts.remote)
> +				close(pfd);
>  			goto out;
>  		}
> -		close(pfd);
> +		if (!opts.remote)
> +			close(pfd);
>  	}
> 
>  out:
> @@ -411,9 +432,16 @@ int check_parent_local_xfer(int fd_type, int id)
>  	struct stat st;
>  	int ret, pfd;
> 
> -	pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
> -	if (pfd < 0 && errno == ENOENT)
> -		return 0;
> +	if (opts.remote) {
> +		pfd = get_curr_parent_snapshot_id_idx();
> +		pr_err("Unable to get parent snapsgot id");
> +		if (pfd == -1)
> +			return -1;
> +	} else {
> +		pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
> +		if (pfd < 0 && errno == ENOENT)
> +			return 0;
> +	}
> 
>  	snprintf(path, sizeof(path), imgset_template[fd_type].fmt, id);
>  	ret = fstatat(pfd, path, &st, 0);
> @@ -475,6 +503,8 @@ int check_parent_page_xfer(int fd_type, long id)
>  {
>  	if (opts.use_page_server)
>  		return check_parent_server_xfer(fd_type, id);
> +	else if (opts.remote)
> +		return get_curr_parent_snapshot_id_idx() == -1 ? 0 : 1;
>  	else
>  		return check_parent_local_xfer(fd_type, id);
>  }
> diff --git a/criu/pagemap.c b/criu/pagemap.c
> index 8e854c9..201f985 100644
> --- a/criu/pagemap.c
> +++ b/criu/pagemap.c
> @@ -14,6 +14,7 @@
>  #include "xmalloc.h"
>  #include "protobuf.h"
>  #include "images/pagemap.pb-c.h"
> +#include "img-remote.h"
> 
>  #ifndef SEEK_DATA
>  #define SEEK_DATA	3
> @@ -136,8 +137,12 @@ static void skip_pagemap_pages(struct page_read *pr,
> unsigned long len)
>  	if (!len)
>  		return;
> 
> -	if (!pr->pe->in_parent)
> +	if (!pr->pe->in_parent) {
> +		if (opts.remote)
> +			if (skip_remote_bytes(img_raw_fd(pr->pi), len))
> +				pr_perror("Error skipping remote bytes");
>  		pr->pi_off += len;
> +	}
>  	pr->cvaddr += len;
>  }
> 
> @@ -155,7 +160,7 @@ static int seek_pagemap(struct page_read *pr, unsigned
> long vaddr)
>  			break;
> 
>  		if (vaddr >= start && vaddr < end) {
> -			skip_pagemap_pages(pr, start - pr->cvaddr);
> +			skip_pagemap_pages(pr, start > pr->cvaddr ? start - pr->cvaddr : 0);
>  			return 1;
>  		}
> 
> @@ -171,7 +176,7 @@ adv:
>  static int seek_pagemap_page(struct page_read *pr, unsigned long vaddr)
>  {
>  	if (seek_pagemap(pr, vaddr)) {
> -		skip_pagemap_pages(pr, vaddr - pr->cvaddr);
> +		skip_pagemap_pages(pr, vaddr > pr->cvaddr ? vaddr - pr->cvaddr : 0);
>  		return 1;
>  	}
> 
> @@ -248,6 +253,7 @@ static int read_local_page(struct page_read *pr,
> unsigned long vaddr,
>  {
>  	int fd = img_raw_fd(pr->pi);
>  	int ret;
> +	size_t curr = 0;
> 
>  	/*
>  	 * Flush any pending async requests if any not to break the
> @@ -257,10 +263,15 @@ static int read_local_page(struct page_read *pr,
> unsigned long vaddr,
>  		return -1;
> 
>  	pr_debug("\tpr%u Read page from self %lx/%"PRIx64"\n", pr->id,
> pr->cvaddr, pr->pi_off);
> -	ret = pread(fd, buf, len, pr->pi_off);
> -	if (ret != len) {
> -		pr_perror("Can't read mapping page %d", ret);
> -		return -1;
> +	while (1) {
> +		ret = read(fd, buf + curr, len - curr);
> +		if (ret < 1) {
> +			pr_perror("Can't read mapping page %d", ret);
> +			return -1;
> +		}
> +		curr += ret;
> +		if (curr == len)
> +			break;
>  	}
> 
>  	if (opts.auto_dedup) {
> @@ -358,7 +369,7 @@ static int maybe_read_page(struct page_read *pr,
> unsigned long vaddr,
>  	int ret;
>  	unsigned long len = nr * PAGE_SIZE;
> 
> -	if (flags & PR_ASYNC)
> +	if (flags & PR_ASYNC && !opts.remote)
>  		ret = enqueue_async_page(pr, vaddr, len, buf);
>  	else
>  		ret = read_local_page(pr, vaddr, len, buf);
> @@ -459,9 +470,24 @@ static int try_open_parent(int dfd, int pid, struct
> page_read *pr, int pr_flags)
>  	int pfd, ret;
>  	struct page_read *parent = NULL;
> 
> -	pfd = openat(dfd, CR_PARENT_LINK, O_RDONLY);
> -	if (pfd < 0 && errno == ENOENT)
> -		goto out;
> +	if (opts.remote) {
> +		/* Note: we are replacing a real directory FD for a snapshot_id
> +		 * index. Since we need the parent of the current snapshot_id,
> +		 * we want the current snapshot_id index minus one. It is
> +		 * possible that dfd is already a snapshot_id index. We test it
> +		 * by comparing it to the service FD. When opening an image (see
> +		 * do_open_image) we convert the snapshot_id index into a real
> +		 * snapshot_id.
> +		 */
> +		pfd = dfd == get_service_fd(IMG_FD_OFF) ?
> +			get_curr_snapshot_id_idx() - 1 : dfd - 1;
> +		if (pfd < 0)
> +			goto out;
> +	} else {
> +		pfd = openat(dfd, CR_PARENT_LINK, O_RDONLY);
> +		if (pfd < 0 && errno == ENOENT)
> +			goto out;
> +	}
> 
>  	parent = xmalloc(sizeof(*parent));
>  	if (!parent)
> @@ -476,7 +502,8 @@ static int try_open_parent(int dfd, int pid, struct
> page_read *pr, int pr_flags)
>  		parent = NULL;
>  	}
> 
> -	close(pfd);
> +	if (!opts.remote)
> +		close(pfd);
>  out:
>  	pr->parent = parent;
>  	return 0;
> @@ -484,7 +511,8 @@ out:
>  err_free:
>  	xfree(parent);
>  err_cl:
> -	close(pfd);
> +	if (!opts.remote)
> +		close(pfd);
>  	return -1;
>  }
> 
> @@ -501,7 +529,18 @@ static int init_pagemaps(struct page_read *pr)
>  	off_t fsize;
>  	int nr_pmes, nr_realloc;
> 
> -	fsize = img_raw_size(pr->pmi);
> +	if (!opts.remote)
> +		fsize = img_raw_size(pr->pmi);
> +	else
> +		/*
> +		 * FIXME - There is no easy way to estimate the size of the
> +		 * pagemap that is still to be read from the socket. Possible
> +		 * solution is to ask Image Proxy or Image Cache about the size
> +		 * of the image. 1024 is a wild guess (more space is allocated
> +		 * if needed).
> +		 */
> +		fsize = 1024;
> +
>  	if (fsize < 0)
>  		return -1;
> 
> diff --git a/criu/protobuf-desc.c b/criu/protobuf-desc.c
> index 41c2080..bfe00c5 100644
> --- a/criu/protobuf-desc.c
> +++ b/criu/protobuf-desc.c
> @@ -62,6 +62,7 @@
>  #include "images/seccomp.pb-c.h"
>  #include "images/binfmt-misc.pb-c.h"
>  #include "images/autofs.pb-c.h"
> +#include "images/remote-image.pb-c.h"
> 
>  struct cr_pb_message_desc cr_pb_descs[PB_MAX];
> 
> diff --git a/criu/util.c b/criu/util.c
> index 4b0bcfd..d826732 100644
> --- a/criu/util.c
> +++ b/criu/util.c
> @@ -516,29 +516,46 @@ int copy_file(int fd_in, int fd_out, size_t bytes)
>  {
>  	ssize_t written = 0;
>  	size_t chunk = bytes ? bytes : 4096;
> +	char *buffer = (char*) malloc(chunk);
> +	ssize_t ret;
> 
>  	while (1) {
> -		ssize_t ret;
> -
> -		ret = sendfile(fd_out, fd_in, NULL, chunk);
> +		if (opts.remote) {
> +			ret = read(fd_in, buffer, chunk);
> +			if (ret < 0) {
> +				pr_perror("Can't read from fd_in\n");
> +				ret = -1;
> +				goto err;
> +			}
> +			if (write(fd_out, buffer, ret) != ret) {
> +				pr_perror("Couldn't write all read bytes\n");
> +				ret = -1;
> +				goto err;
> +			}
> +		}
> +		else
> +                        ret = sendfile(fd_out, fd_in, NULL, chunk);
>  		if (ret < 0) {
>  			pr_perror("Can't send data to ghost file");
> -			return -1;
> +			ret = -1;
> +			goto err;
>  		}
> 
>  		if (ret == 0) {
>  			if (bytes && (written != bytes)) {
>  				pr_err("Ghost file size mismatch %zu/%zu\n",
>  						written, bytes);
> -				return -1;
> +				ret = -1;
> +				goto err;
>  			}
>  			break;
>  		}
> 
>  		written += ret;
>  	}
> -
> -	return 0;
> +err:
> +	free(buffer);
> +	return ret;
>  }
> 
>  int read_fd_link(int lfd, char *buf, size_t size)
> diff --git a/images/Makefile b/images/Makefile
> index eb18526..5b60948 100644
> --- a/images/Makefile
> +++ b/images/Makefile
> @@ -61,6 +61,7 @@ proto-obj-y	+= time.o
>  proto-obj-y	+= sysctl.o
>  proto-obj-y	+= autofs.o
>  proto-obj-y	+= macvlan.o
> +proto-obj-y	+= remote-image.o
> 
>  CFLAGS		+= -iquote $(obj)/
> 
> diff --git a/images/remote-image.proto b/images/remote-image.proto
> new file mode 100644
> index 0000000..1212627
> --- /dev/null
> +++ b/images/remote-image.proto
> @@ -0,0 +1,20 @@
> +message local_image_entry {
> +	required string name		= 1;
> +	required string snapshot_id	= 2;
> +	required uint32 open_mode	= 3;
> +}
> +
> +message remote_image_entry {
> +	required string name		= 1;
> +	required string snapshot_id	= 2;
> +	required uint32 open_mode	= 3;
> +	required uint64 size		= 4;
> +}
> +
> +message local_image_reply_entry {
> +	required uint32 error           = 1;
> +}
> +
> +message snapshot_id_entry {
> +	required string snapshot_id	= 1;
> +}
> 
> 
> _______________________________________________
> CRIU mailing list
> CRIU@openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
> .
>