[PATCHv2,1/2] net: add nftables c/r

Submitted by Alexander Mikhalitsyn on Nov. 12, 2019, 4:06 p.m.

Details

Message ID 20191112160643.24403-1-alexander.mikhalitsyn@virtuozzo.com
State New
Series "Series without cover letter"
Headers show

Commit Message

Alexander Mikhalitsyn Nov. 12, 2019, 4:06 p.m.
From: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>

After Centos-8 nft used instead of iptables. But we had never supported nft rules in
CRIU, and after c/r all rules are flushed.

Path to nft tool can be changed via CR_NFTABLES environment variable
similar to CR_IPTABLES.

Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
Signed-off-by: Alexander Mikhalitsyn <alexander@mihalicyn.com>
---
 criu/image-desc.c         |  1 +
 criu/include/image-desc.h |  1 +
 criu/include/magic.h      |  1 +
 criu/include/util.h       |  2 ++
 criu/net.c                | 65 +++++++++++++++++++++++++++++++++++++--
 criu/util.c               | 39 +++++++++++++++++++++++
 6 files changed, 107 insertions(+), 2 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/image-desc.c b/criu/image-desc.c
index 81cd0748..ae5d817f 100644
--- a/criu/image-desc.c
+++ b/criu/image-desc.c
@@ -76,6 +76,7 @@  struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = {
 	FD_ENTRY_F(RULE,	"rule-%u", O_NOBUF),
 	FD_ENTRY_F(IPTABLES,	"iptables-%u", O_NOBUF),
 	FD_ENTRY_F(IP6TABLES,	"ip6tables-%u", O_NOBUF),
+	FD_ENTRY_F(NFTABLES,	"nftables-%u", O_NOBUF),
 	FD_ENTRY_F(TMPFS_IMG,	"tmpfs-%u.tar.gz", O_NOBUF),
 	FD_ENTRY_F(TMPFS_DEV,	"tmpfs-dev-%u.tar.gz", O_NOBUF),
 	FD_ENTRY_F(AUTOFS,	"autofs-%u", O_NOBUF),
diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h
index fea80a71..6db8bf94 100644
--- a/criu/include/image-desc.h
+++ b/criu/include/image-desc.h
@@ -42,6 +42,7 @@  enum {
 	CR_FD_RULE,
 	CR_FD_IPTABLES,
 	CR_FD_IP6TABLES,
+	CR_FD_NFTABLES,
 	CR_FD_NETNS,
 	CR_FD_NETNF_CT,
 	CR_FD_NETNF_EXP,
diff --git a/criu/include/magic.h b/criu/include/magic.h
index 05101f43..1a583f4e 100644
--- a/criu/include/magic.h
+++ b/criu/include/magic.h
@@ -103,6 +103,7 @@ 
 #define TMPFS_DEV_MAGIC		RAW_IMAGE_MAGIC
 #define IPTABLES_MAGIC		RAW_IMAGE_MAGIC
 #define IP6TABLES_MAGIC		RAW_IMAGE_MAGIC
+#define NFTABLES_MAGIC		RAW_IMAGE_MAGIC
 #define NETNF_CT_MAGIC		RAW_IMAGE_MAGIC
 #define NETNF_EXP_MAGIC		RAW_IMAGE_MAGIC
 
diff --git a/criu/include/util.h b/criu/include/util.h
index a14be722..57b46dcc 100644
--- a/criu/include/util.h
+++ b/criu/include/util.h
@@ -252,6 +252,8 @@  static inline bool issubpath(const char *path, const char *sub_path)
 		(end == '/' || end == '\0');
 }
 
+int check_cmd_exists(const char *cmd);
+
 /*
  * mkdir -p
  */
diff --git a/criu/net.c b/criu/net.c
index fe9b51ad..4737b604 100644
--- a/criu/net.c
+++ b/criu/net.c
@@ -1739,12 +1739,12 @@  static int run_ip_tool(char *arg1, char *arg2, char *arg3, char *arg4, int fdin,
 	return 0;
 }
 
-static int run_iptables_tool(char *def_cmd, int fdin, int fdout)
+static int run_tool(const char *env_var, char *def_cmd, int fdin, int fdout)
 {
 	int ret;
 	char *cmd;
 
-	cmd = getenv("CR_IPTABLES");
+	cmd = getenv(env_var);
 	if (!cmd)
 		cmd = def_cmd;
 	pr_debug("\tRunning %s for %s\n", cmd, def_cmd);
@@ -1755,6 +1755,16 @@  static int run_iptables_tool(char *def_cmd, int fdin, int fdout)
 	return ret;
 }
 
+static int run_iptables_tool(char *def_cmd, int fdin, int fdout)
+{
+	return run_tool("CR_IPTABLES", def_cmd, fdin, fdout);
+}
+
+static int run_nftables_tool(char *def_cmd, int fdin, int fdout)
+{
+	return run_tool("CR_NFTABLES", def_cmd, fdin, fdout);
+}
+
 static inline int dump_ifaddr(struct cr_imgset *fds)
 {
 	struct cr_img *img = img_from_set(fds, CR_FD_IFADDR);
@@ -1818,6 +1828,21 @@  static inline int dump_iptables(struct cr_imgset *fds)
 	return 0;
 }
 
+static inline int dump_nftables(struct cr_imgset *fds)
+{
+	struct cr_img *img;
+
+	/* we not dump nftables if nft utility isn't present */
+	if (!check_cmd_exists("nft"))
+		return 0;
+
+	img = img_from_set(fds, CR_FD_NFTABLES);
+	if (run_nftables_tool("nft list ruleset", -1, img_raw_fd(img)))
+		return -1;
+
+	return 0;
+}
+
 static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds)
 {
 	void *buf, *o_buf;
@@ -2082,6 +2107,38 @@  out:
 	return ret;
 }
 
+static inline int restore_nftables(int pid)
+{
+	int ret = -1;
+	struct cr_img *img;
+
+	img = open_image(CR_FD_NFTABLES, O_RSTR, pid);
+	if (img == NULL)
+		return -1;
+	if (empty_image(img)) {
+		/* Backward compatibility */
+		pr_info("Skipping nft restore, no image");
+		ret = 0;
+		goto out;
+	}
+
+	/*
+	 * At this point we already know, that image may contain nftables info,
+	 * if nft utility not present we need to warn user about possible
+	 * problems
+	 */
+	if (!check_cmd_exists("nft")) {
+		pr_warn("Skipping nft restore, no nft utility");
+		ret = 0;
+		goto out;
+	}
+
+	ret = run_nftables_tool("nft -f /proc/self/fd/0", img_raw_fd(img), -1);
+out:
+	close_image(img);
+	return ret;
+}
+
 int read_net_ns_img(void)
 {
 	struct ns_id *ns;
@@ -2299,6 +2356,8 @@  int dump_net_ns(struct ns_id *ns)
 			ret = dump_rule(fds);
 		if (!ret)
 			ret = dump_iptables(fds);
+		if (!ret)
+			ret = dump_nftables(fds);
 		if (!ret)
 			ret = dump_netns_conf(ns, fds);
 	} else if (ns->type != NS_ROOT) {
@@ -2392,6 +2451,8 @@  static int prepare_net_ns_second_stage(struct ns_id *ns)
 			ret = restore_rule(nsid);
 		if (!ret)
 			ret = restore_iptables(nsid);
+		if (!ret)
+			ret = restore_nftables(nsid);
 	}
 
 	if (!ret)
diff --git a/criu/util.c b/criu/util.c
index 2a3d7abc..b4b1564f 100644
--- a/criu/util.c
+++ b/criu/util.c
@@ -795,6 +795,45 @@  struct vma_area *alloc_vma_area(void)
 	return p;
 }
 
+static int __check_cmd_exists(const char *path)
+{
+	return (access(path, F_OK) == 0);
+}
+
+int check_cmd_exists(const char *cmd)
+{
+	int ret = 0;
+	char buf[255];
+	char *env_path, *dup, *s, *p;
+
+	env_path = getenv("PATH");
+
+	/* ok, the simply __check_cmd_exists */
+	if (!env_path || cmd[0] == '/')
+		return __check_cmd_exists(cmd);
+
+	dup = strdup(env_path);
+
+	/* let's try to find program in PATH */
+	s = dup;
+	p = NULL;
+	do {
+		p = strchr(s, ':');
+		if (p != NULL)
+			p[0] = 0;
+
+		sprintf(buf, "%s/%s", s, cmd);
+		if ((ret = __check_cmd_exists(buf)))
+			goto free;
+
+		s = p + 1;
+	} while (p != NULL);
+
+free:
+	free(dup);
+	return ret;
+}
+
 int mkdirpat(int fd, const char *path, int mode)
 {
 	size_t i;

Comments

Andrei Vagin Nov. 13, 2019, 8:45 a.m.
On Tue, Nov 12, 2019 at 07:06:42PM +0300, Alexander Mikhalitsyn wrote:
> From: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
> 
> After Centos-8 nft used instead of iptables. But we had never supported nft rules in
> CRIU, and after c/r all rules are flushed.
> 
> Path to nft tool can be changed via CR_NFTABLES environment variable
> similar to CR_IPTABLES.


Can we use libnftnl? This should be faster, because we will not need to
fork a new process and exec a binary.

> 
> Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
> Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
> Signed-off-by: Alexander Mikhalitsyn <alexander@mihalicyn.com>
> ---
>  criu/image-desc.c         |  1 +
>  criu/include/image-desc.h |  1 +
>  criu/include/magic.h      |  1 +
>  criu/include/util.h       |  2 ++
>  criu/net.c                | 65 +++++++++++++++++++++++++++++++++++++--
>  criu/util.c               | 39 +++++++++++++++++++++++
>  6 files changed, 107 insertions(+), 2 deletions(-)
> 
> diff --git a/criu/image-desc.c b/criu/image-desc.c
> index 81cd0748..ae5d817f 100644
> --- a/criu/image-desc.c
> +++ b/criu/image-desc.c
> @@ -76,6 +76,7 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = {
>  	FD_ENTRY_F(RULE,	"rule-%u", O_NOBUF),
>  	FD_ENTRY_F(IPTABLES,	"iptables-%u", O_NOBUF),
>  	FD_ENTRY_F(IP6TABLES,	"ip6tables-%u", O_NOBUF),
> +	FD_ENTRY_F(NFTABLES,	"nftables-%u", O_NOBUF),
>  	FD_ENTRY_F(TMPFS_IMG,	"tmpfs-%u.tar.gz", O_NOBUF),
>  	FD_ENTRY_F(TMPFS_DEV,	"tmpfs-dev-%u.tar.gz", O_NOBUF),
>  	FD_ENTRY_F(AUTOFS,	"autofs-%u", O_NOBUF),
> diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h
> index fea80a71..6db8bf94 100644
> --- a/criu/include/image-desc.h
> +++ b/criu/include/image-desc.h
> @@ -42,6 +42,7 @@ enum {
>  	CR_FD_RULE,
>  	CR_FD_IPTABLES,
>  	CR_FD_IP6TABLES,
> +	CR_FD_NFTABLES,
>  	CR_FD_NETNS,
>  	CR_FD_NETNF_CT,
>  	CR_FD_NETNF_EXP,
> diff --git a/criu/include/magic.h b/criu/include/magic.h
> index 05101f43..1a583f4e 100644
> --- a/criu/include/magic.h
> +++ b/criu/include/magic.h
> @@ -103,6 +103,7 @@
>  #define TMPFS_DEV_MAGIC		RAW_IMAGE_MAGIC
>  #define IPTABLES_MAGIC		RAW_IMAGE_MAGIC
>  #define IP6TABLES_MAGIC		RAW_IMAGE_MAGIC
> +#define NFTABLES_MAGIC		RAW_IMAGE_MAGIC
>  #define NETNF_CT_MAGIC		RAW_IMAGE_MAGIC
>  #define NETNF_EXP_MAGIC		RAW_IMAGE_MAGIC
>  
> diff --git a/criu/include/util.h b/criu/include/util.h
> index a14be722..57b46dcc 100644
> --- a/criu/include/util.h
> +++ b/criu/include/util.h
> @@ -252,6 +252,8 @@ static inline bool issubpath(const char *path, const char *sub_path)
>  		(end == '/' || end == '\0');
>  }
>  
> +int check_cmd_exists(const char *cmd);
> +
>  /*
>   * mkdir -p
>   */
> diff --git a/criu/net.c b/criu/net.c
> index fe9b51ad..4737b604 100644
> --- a/criu/net.c
> +++ b/criu/net.c
> @@ -1739,12 +1739,12 @@ static int run_ip_tool(char *arg1, char *arg2, char *arg3, char *arg4, int fdin,
>  	return 0;
>  }
>  
> -static int run_iptables_tool(char *def_cmd, int fdin, int fdout)
> +static int run_tool(const char *env_var, char *def_cmd, int fdin, int fdout)
>  {
>  	int ret;
>  	char *cmd;
>  
> -	cmd = getenv("CR_IPTABLES");
> +	cmd = getenv(env_var);
>  	if (!cmd)
>  		cmd = def_cmd;
>  	pr_debug("\tRunning %s for %s\n", cmd, def_cmd);
> @@ -1755,6 +1755,16 @@ static int run_iptables_tool(char *def_cmd, int fdin, int fdout)
>  	return ret;
>  }
>  
> +static int run_iptables_tool(char *def_cmd, int fdin, int fdout)
> +{
> +	return run_tool("CR_IPTABLES", def_cmd, fdin, fdout);
> +}
> +
> +static int run_nftables_tool(char *def_cmd, int fdin, int fdout)
> +{
> +	return run_tool("CR_NFTABLES", def_cmd, fdin, fdout);
> +}
> +
>  static inline int dump_ifaddr(struct cr_imgset *fds)
>  {
>  	struct cr_img *img = img_from_set(fds, CR_FD_IFADDR);
> @@ -1818,6 +1828,21 @@ static inline int dump_iptables(struct cr_imgset *fds)
>  	return 0;
>  }
>  
> +static inline int dump_nftables(struct cr_imgset *fds)
> +{
> +	struct cr_img *img;
> +
> +	/* we not dump nftables if nft utility isn't present */
> +	if (!check_cmd_exists("nft"))
> +		return 0;
> +
> +	img = img_from_set(fds, CR_FD_NFTABLES);
> +	if (run_nftables_tool("nft list ruleset", -1, img_raw_fd(img)))
> +		return -1;
> +
> +	return 0;
> +}
> +
>  static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds)
>  {
>  	void *buf, *o_buf;
> @@ -2082,6 +2107,38 @@ out:
>  	return ret;
>  }
>  
> +static inline int restore_nftables(int pid)

why do we need to inline this function?

> +{
> +	int ret = -1;
> +	struct cr_img *img;
> +
> +	img = open_image(CR_FD_NFTABLES, O_RSTR, pid);
> +	if (img == NULL)
> +		return -1;
> +	if (empty_image(img)) {
> +		/* Backward compatibility */
> +		pr_info("Skipping nft restore, no image");
> +		ret = 0;
> +		goto out;
> +	}
> +
> +	/*
> +	 * At this point we already know, that image may contain nftables info,
> +	 * if nft utility not present we need to warn user about possible
> +	 * problems
> +	 */
> +	if (!check_cmd_exists("nft")) {
> +		pr_warn("Skipping nft restore, no nft utility");

This must be the error and need to return -1. If we have rules in a
image file, we don't want to ignore them.

> +		ret = 0;
> +		goto out;
> +	}
> +
> +	ret = run_nftables_tool("nft -f /proc/self/fd/0", img_raw_fd(img), -1);


> +out:
> +	close_image(img);
> +	return ret;
> +}
> +
>  int read_net_ns_img(void)
>  {
>  	struct ns_id *ns;
> @@ -2299,6 +2356,8 @@ int dump_net_ns(struct ns_id *ns)
>  			ret = dump_rule(fds);
>  		if (!ret)
>  			ret = dump_iptables(fds);
> +		if (!ret)
> +			ret = dump_nftables(fds);
>  		if (!ret)
>  			ret = dump_netns_conf(ns, fds);
>  	} else if (ns->type != NS_ROOT) {
> @@ -2392,6 +2451,8 @@ static int prepare_net_ns_second_stage(struct ns_id *ns)
>  			ret = restore_rule(nsid);
>  		if (!ret)
>  			ret = restore_iptables(nsid);
> +		if (!ret)
> +			ret = restore_nftables(nsid);
>  	}
>  
>  	if (!ret)
> diff --git a/criu/util.c b/criu/util.c
> index 2a3d7abc..b4b1564f 100644
> --- a/criu/util.c
> +++ b/criu/util.c
> @@ -795,6 +795,45 @@ struct vma_area *alloc_vma_area(void)
>  	return p;
>  }
>  
> +static int __check_cmd_exists(const char *path)
> +{
> +	return (access(path, F_OK) == 0);
> +}
> +
> +int check_cmd_exists(const char *cmd)
> +{
> +	int ret = 0;
> +	char buf[255];
> +	char *env_path, *dup, *s, *p;
> +
> +	env_path = getenv("PATH");
> +
> +	/* ok, the simply __check_cmd_exists */
> +	if (!env_path || cmd[0] == '/')
> +		return __check_cmd_exists(cmd);
> +
> +	dup = strdup(env_path);
> +
	dup = xstrdup(env_path);
	if (dup == NULL)
		return -1;

> +	/* let's try to find program in PATH */
> +	s = dup;
> +	p = NULL;
> +	do {
> +		p = strchr(s, ':');
> +		if (p != NULL)
> +			p[0] = 0;
> +
> +		sprintf(buf, "%s/%s", s, cmd);
> +		if ((ret = __check_cmd_exists(buf)))

		ret = __check_cmd_exists(buf);
		if (ret)

> +			goto free;
> +
> +		s = p + 1;
> +	} while (p != NULL);
> +
> +free:
> +	free(dup);

	xfree(dup);

> +	return ret;
> +}
> +
>  int mkdirpat(int fd, const char *path, int mode)
>  {
>  	size_t i;
> -- 
> 2.17.1
> 
> _______________________________________________
> CRIU mailing list
> CRIU@openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
Alexander Mikhalitsyn Nov. 13, 2019, 2:14 p.m.
On Wed, 13 Nov 2019 00:45:20 -0800
Andrei Vagin <avagin@gmail.com> wrote:

> On Tue, Nov 12, 2019 at 07:06:42PM +0300, Alexander Mikhalitsyn wrote:
> > From: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
> > 
> > After Centos-8 nft used instead of iptables. But we had never supported nft rules in
> > CRIU, and after c/r all rules are flushed.
> > 
> > Path to nft tool can be changed via CR_NFTABLES environment variable
> > similar to CR_IPTABLES.
> 
> 
> Can we use libnftnl? This should be faster, because we will not need to
> fork a new process and exec a binary.
> 
Good point, but we have some problems with that. Currently, when we do "nft list ruleset" it output have special format (that corresponds to formal grammar defined by (nftables/src/parser_bison.y)). It's not format that supported by libnftnl. Currently libnftnl supports only dumping ruleset in "command format" as a sequence of nft commands that describes current ruleset. But if we try to use this format on Checkpoint in CRIU, then we have a problems on Restore - we need to execute a lot (maybe) "nft ..." commands. In performance terms it may be even worse then as we doing now. Of course, we can grab this parser from nftables to CRIU, but it's a lot of code and then we need additional compile-time deps - Bison for example. I don't know what to choose... :)

> > 
> > Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
> > Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
> > Signed-off-by: Alexander Mikhalitsyn <alexander@mihalicyn.com>
> > ---
> >  criu/image-desc.c         |  1 +
> >  criu/include/image-desc.h |  1 +
> >  criu/include/magic.h      |  1 +
> >  criu/include/util.h       |  2 ++
> >  criu/net.c                | 65 +++++++++++++++++++++++++++++++++++++--
> >  criu/util.c               | 39 +++++++++++++++++++++++
> >  6 files changed, 107 insertions(+), 2 deletions(-)
> > 
> > diff --git a/criu/image-desc.c b/criu/image-desc.c
> > index 81cd0748..ae5d817f 100644
> > --- a/criu/image-desc.c
> > +++ b/criu/image-desc.c
> > @@ -76,6 +76,7 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = {
> >  	FD_ENTRY_F(RULE,	"rule-%u", O_NOBUF),
> >  	FD_ENTRY_F(IPTABLES,	"iptables-%u", O_NOBUF),
> >  	FD_ENTRY_F(IP6TABLES,	"ip6tables-%u", O_NOBUF),
> > +	FD_ENTRY_F(NFTABLES,	"nftables-%u", O_NOBUF),
> >  	FD_ENTRY_F(TMPFS_IMG,	"tmpfs-%u.tar.gz", O_NOBUF),
> >  	FD_ENTRY_F(TMPFS_DEV,	"tmpfs-dev-%u.tar.gz", O_NOBUF),
> >  	FD_ENTRY_F(AUTOFS,	"autofs-%u", O_NOBUF),
> > diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h
> > index fea80a71..6db8bf94 100644
> > --- a/criu/include/image-desc.h
> > +++ b/criu/include/image-desc.h
> > @@ -42,6 +42,7 @@ enum {
> >  	CR_FD_RULE,
> >  	CR_FD_IPTABLES,
> >  	CR_FD_IP6TABLES,
> > +	CR_FD_NFTABLES,
> >  	CR_FD_NETNS,
> >  	CR_FD_NETNF_CT,
> >  	CR_FD_NETNF_EXP,
> > diff --git a/criu/include/magic.h b/criu/include/magic.h
> > index 05101f43..1a583f4e 100644
> > --- a/criu/include/magic.h
> > +++ b/criu/include/magic.h
> > @@ -103,6 +103,7 @@
> >  #define TMPFS_DEV_MAGIC		RAW_IMAGE_MAGIC
> >  #define IPTABLES_MAGIC		RAW_IMAGE_MAGIC
> >  #define IP6TABLES_MAGIC		RAW_IMAGE_MAGIC
> > +#define NFTABLES_MAGIC		RAW_IMAGE_MAGIC
> >  #define NETNF_CT_MAGIC		RAW_IMAGE_MAGIC
> >  #define NETNF_EXP_MAGIC		RAW_IMAGE_MAGIC
> >  
> > diff --git a/criu/include/util.h b/criu/include/util.h
> > index a14be722..57b46dcc 100644
> > --- a/criu/include/util.h
> > +++ b/criu/include/util.h
> > @@ -252,6 +252,8 @@ static inline bool issubpath(const char *path, const char *sub_path)
> >  		(end == '/' || end == '\0');
> >  }
> >  
> > +int check_cmd_exists(const char *cmd);
> > +
> >  /*
> >   * mkdir -p
> >   */
> > diff --git a/criu/net.c b/criu/net.c
> > index fe9b51ad..4737b604 100644
> > --- a/criu/net.c
> > +++ b/criu/net.c
> > @@ -1739,12 +1739,12 @@ static int run_ip_tool(char *arg1, char *arg2, char *arg3, char *arg4, int fdin,
> >  	return 0;
> >  }
> >  
> > -static int run_iptables_tool(char *def_cmd, int fdin, int fdout)
> > +static int run_tool(const char *env_var, char *def_cmd, int fdin, int fdout)
> >  {
> >  	int ret;
> >  	char *cmd;
> >  
> > -	cmd = getenv("CR_IPTABLES");
> > +	cmd = getenv(env_var);
> >  	if (!cmd)
> >  		cmd = def_cmd;
> >  	pr_debug("\tRunning %s for %s\n", cmd, def_cmd);
> > @@ -1755,6 +1755,16 @@ static int run_iptables_tool(char *def_cmd, int fdin, int fdout)
> >  	return ret;
> >  }
> >  
> > +static int run_iptables_tool(char *def_cmd, int fdin, int fdout)
> > +{
> > +	return run_tool("CR_IPTABLES", def_cmd, fdin, fdout);
> > +}
> > +
> > +static int run_nftables_tool(char *def_cmd, int fdin, int fdout)
> > +{
> > +	return run_tool("CR_NFTABLES", def_cmd, fdin, fdout);
> > +}
> > +
> >  static inline int dump_ifaddr(struct cr_imgset *fds)
> >  {
> >  	struct cr_img *img = img_from_set(fds, CR_FD_IFADDR);
> > @@ -1818,6 +1828,21 @@ static inline int dump_iptables(struct cr_imgset *fds)
> >  	return 0;
> >  }
> >  
> > +static inline int dump_nftables(struct cr_imgset *fds)
> > +{
> > +	struct cr_img *img;
> > +
> > +	/* we not dump nftables if nft utility isn't present */
> > +	if (!check_cmd_exists("nft"))
> > +		return 0;
> > +
> > +	img = img_from_set(fds, CR_FD_NFTABLES);
> > +	if (run_nftables_tool("nft list ruleset", -1, img_raw_fd(img)))
> > +		return -1;
> > +
> > +	return 0;
> > +}
> > +
> >  static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds)
> >  {
> >  	void *buf, *o_buf;
> > @@ -2082,6 +2107,38 @@ out:
> >  	return ret;
> >  }
> >  
> > +static inline int restore_nftables(int pid)
> 
> why do we need to inline this function?
> 
> > +{
> > +	int ret = -1;
> > +	struct cr_img *img;
> > +
> > +	img = open_image(CR_FD_NFTABLES, O_RSTR, pid);
> > +	if (img == NULL)
> > +		return -1;
> > +	if (empty_image(img)) {
> > +		/* Backward compatibility */
> > +		pr_info("Skipping nft restore, no image");
> > +		ret = 0;
> > +		goto out;
> > +	}
> > +
> > +	/*
> > +	 * At this point we already know, that image may contain nftables info,
> > +	 * if nft utility not present we need to warn user about possible
> > +	 * problems
> > +	 */
> > +	if (!check_cmd_exists("nft")) {
> > +		pr_warn("Skipping nft restore, no nft utility");
> 
> This must be the error and need to return -1. If we have rules in a
> image file, we don't want to ignore them.
> 
> > +		ret = 0;
> > +		goto out;
> > +	}
> > +
> > +	ret = run_nftables_tool("nft -f /proc/self/fd/0", img_raw_fd(img), -1);
> 
> 
> > +out:
> > +	close_image(img);
> > +	return ret;
> > +}
> > +
> >  int read_net_ns_img(void)
> >  {
> >  	struct ns_id *ns;
> > @@ -2299,6 +2356,8 @@ int dump_net_ns(struct ns_id *ns)
> >  			ret = dump_rule(fds);
> >  		if (!ret)
> >  			ret = dump_iptables(fds);
> > +		if (!ret)
> > +			ret = dump_nftables(fds);
> >  		if (!ret)
> >  			ret = dump_netns_conf(ns, fds);
> >  	} else if (ns->type != NS_ROOT) {
> > @@ -2392,6 +2451,8 @@ static int prepare_net_ns_second_stage(struct ns_id *ns)
> >  			ret = restore_rule(nsid);
> >  		if (!ret)
> >  			ret = restore_iptables(nsid);
> > +		if (!ret)
> > +			ret = restore_nftables(nsid);
> >  	}
> >  
> >  	if (!ret)
> > diff --git a/criu/util.c b/criu/util.c
> > index 2a3d7abc..b4b1564f 100644
> > --- a/criu/util.c
> > +++ b/criu/util.c
> > @@ -795,6 +795,45 @@ struct vma_area *alloc_vma_area(void)
> >  	return p;
> >  }
> >  
> > +static int __check_cmd_exists(const char *path)
> > +{
> > +	return (access(path, F_OK) == 0);
> > +}
> > +
> > +int check_cmd_exists(const char *cmd)
> > +{
> > +	int ret = 0;
> > +	char buf[255];
> > +	char *env_path, *dup, *s, *p;
> > +
> > +	env_path = getenv("PATH");
> > +
> > +	/* ok, the simply __check_cmd_exists */
> > +	if (!env_path || cmd[0] == '/')
> > +		return __check_cmd_exists(cmd);
> > +
> > +	dup = strdup(env_path);
> > +
> 	dup = xstrdup(env_path);
> 	if (dup == NULL)
> 		return -1;
> 
> > +	/* let's try to find program in PATH */
> > +	s = dup;
> > +	p = NULL;
> > +	do {
> > +		p = strchr(s, ':');
> > +		if (p != NULL)
> > +			p[0] = 0;
> > +
> > +		sprintf(buf, "%s/%s", s, cmd);
> > +		if ((ret = __check_cmd_exists(buf)))
> 
> 		ret = __check_cmd_exists(buf);
> 		if (ret)
> 
> > +			goto free;
> > +
> > +		s = p + 1;
> > +	} while (p != NULL);
> > +
> > +free:
> > +	free(dup);
> 
> 	xfree(dup);
> 
> > +	return ret;
> > +}
> > +
> >  int mkdirpat(int fd, const char *path, int mode)
> >  {
> >  	size_t i;
> > -- 
> > 2.17.1
> > 
> > _______________________________________________
> > CRIU mailing list
> > CRIU@openvz.org
> > https://lists.openvz.org/mailman/listinfo/criu
Andrei Vagin Nov. 14, 2019, 4:29 a.m.
On Wed, Nov 13, 2019 at 02:14:06PM +0000, Alexander Mikhalitsyn wrote:
> On Wed, 13 Nov 2019 00:45:20 -0800
> Andrei Vagin <avagin@gmail.com> wrote:
> 
> > On Tue, Nov 12, 2019 at 07:06:42PM +0300, Alexander Mikhalitsyn wrote:
> > > From: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
> > > 
> > > After Centos-8 nft used instead of iptables. But we had never supported nft rules in
> > > CRIU, and after c/r all rules are flushed.
> > > 
> > > Path to nft tool can be changed via CR_NFTABLES environment variable
> > > similar to CR_IPTABLES.
> > 
> > 
> > Can we use libnftnl? This should be faster, because we will not need to
> > fork a new process and exec a binary.
> > 

> Good point, but we have some problems with that. Currently, when we do
> "nft list ruleset" it output have special format (that corresponds to
> formal grammar defined by (nftables/src/parser_bison.y)). It's not
> format that supported by libnftnl. Currently libnftnl supports only
> dumping ruleset in "command format" as a sequence of nft commands that
> describes current ruleset. But if we try to use this format on
> Checkpoint in CRIU, then we have a problems on Restore - we need to
> execute a lot (maybe) "nft ..." commands. In performance terms it may
> be even worse then as we doing now. Of course, we can grab this parser
> from nftables to CRIU, but it's a lot of code and then we need
> additional compile-time deps - Bison for example. I don't know what to
> choose... :)

Maybe we can use this way:

on dump:
  * create a netlink socket
  * send a request to list all rules
  * save raw netlink messages in an image file.

on restore:
  * create a netlink socket
  * send netlink messages from the image file into the socket.

"ip addr save" and "ip addr restore" work this way.
https://github.com/shemminger/iproute2/blob/master/ip/ipaddress.c#L1552
https://github.com/shemminger/iproute2/blob/master/ip/ipaddress.c#L1618

> 
> > > 
> > > Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
> > > Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
> > > Signed-off-by: Alexander Mikhalitsyn <alexander@mihalicyn.com>
> > > ---
> > >  criu/image-desc.c         |  1 +
> > >  criu/include/image-desc.h |  1 +
> > >  criu/include/magic.h      |  1 +
> > >  criu/include/util.h       |  2 ++
> > >  criu/net.c                | 65 +++++++++++++++++++++++++++++++++++++--
> > >  criu/util.c               | 39 +++++++++++++++++++++++
> > >  6 files changed, 107 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/criu/image-desc.c b/criu/image-desc.c
> > > index 81cd0748..ae5d817f 100644
> > > --- a/criu/image-desc.c
> > > +++ b/criu/image-desc.c
> > > @@ -76,6 +76,7 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = {
> > >  	FD_ENTRY_F(RULE,	"rule-%u", O_NOBUF),
> > >  	FD_ENTRY_F(IPTABLES,	"iptables-%u", O_NOBUF),
> > >  	FD_ENTRY_F(IP6TABLES,	"ip6tables-%u", O_NOBUF),
> > > +	FD_ENTRY_F(NFTABLES,	"nftables-%u", O_NOBUF),
> > >  	FD_ENTRY_F(TMPFS_IMG,	"tmpfs-%u.tar.gz", O_NOBUF),
> > >  	FD_ENTRY_F(TMPFS_DEV,	"tmpfs-dev-%u.tar.gz", O_NOBUF),
> > >  	FD_ENTRY_F(AUTOFS,	"autofs-%u", O_NOBUF),
> > > diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h
> > > index fea80a71..6db8bf94 100644
> > > --- a/criu/include/image-desc.h
> > > +++ b/criu/include/image-desc.h
> > > @@ -42,6 +42,7 @@ enum {
> > >  	CR_FD_RULE,
> > >  	CR_FD_IPTABLES,
> > >  	CR_FD_IP6TABLES,
> > > +	CR_FD_NFTABLES,
> > >  	CR_FD_NETNS,
> > >  	CR_FD_NETNF_CT,
> > >  	CR_FD_NETNF_EXP,
> > > diff --git a/criu/include/magic.h b/criu/include/magic.h
> > > index 05101f43..1a583f4e 100644
> > > --- a/criu/include/magic.h
> > > +++ b/criu/include/magic.h
> > > @@ -103,6 +103,7 @@
> > >  #define TMPFS_DEV_MAGIC		RAW_IMAGE_MAGIC
> > >  #define IPTABLES_MAGIC		RAW_IMAGE_MAGIC
> > >  #define IP6TABLES_MAGIC		RAW_IMAGE_MAGIC
> > > +#define NFTABLES_MAGIC		RAW_IMAGE_MAGIC
> > >  #define NETNF_CT_MAGIC		RAW_IMAGE_MAGIC
> > >  #define NETNF_EXP_MAGIC		RAW_IMAGE_MAGIC
> > >  
> > > diff --git a/criu/include/util.h b/criu/include/util.h
> > > index a14be722..57b46dcc 100644
> > > --- a/criu/include/util.h
> > > +++ b/criu/include/util.h
> > > @@ -252,6 +252,8 @@ static inline bool issubpath(const char *path, const char *sub_path)
> > >  		(end == '/' || end == '\0');
> > >  }
> > >  
> > > +int check_cmd_exists(const char *cmd);
> > > +
> > >  /*
> > >   * mkdir -p
> > >   */
> > > diff --git a/criu/net.c b/criu/net.c
> > > index fe9b51ad..4737b604 100644
> > > --- a/criu/net.c
> > > +++ b/criu/net.c
> > > @@ -1739,12 +1739,12 @@ static int run_ip_tool(char *arg1, char *arg2, char *arg3, char *arg4, int fdin,
> > >  	return 0;
> > >  }
> > >  
> > > -static int run_iptables_tool(char *def_cmd, int fdin, int fdout)
> > > +static int run_tool(const char *env_var, char *def_cmd, int fdin, int fdout)
> > >  {
> > >  	int ret;
> > >  	char *cmd;
> > >  
> > > -	cmd = getenv("CR_IPTABLES");
> > > +	cmd = getenv(env_var);
> > >  	if (!cmd)
> > >  		cmd = def_cmd;
> > >  	pr_debug("\tRunning %s for %s\n", cmd, def_cmd);
> > > @@ -1755,6 +1755,16 @@ static int run_iptables_tool(char *def_cmd, int fdin, int fdout)
> > >  	return ret;
> > >  }
> > >  
> > > +static int run_iptables_tool(char *def_cmd, int fdin, int fdout)
> > > +{
> > > +	return run_tool("CR_IPTABLES", def_cmd, fdin, fdout);
> > > +}
> > > +
> > > +static int run_nftables_tool(char *def_cmd, int fdin, int fdout)
> > > +{
> > > +	return run_tool("CR_NFTABLES", def_cmd, fdin, fdout);
> > > +}
> > > +
> > >  static inline int dump_ifaddr(struct cr_imgset *fds)
> > >  {
> > >  	struct cr_img *img = img_from_set(fds, CR_FD_IFADDR);
> > > @@ -1818,6 +1828,21 @@ static inline int dump_iptables(struct cr_imgset *fds)
> > >  	return 0;
> > >  }
> > >  
> > > +static inline int dump_nftables(struct cr_imgset *fds)
> > > +{
> > > +	struct cr_img *img;
> > > +
> > > +	/* we not dump nftables if nft utility isn't present */
> > > +	if (!check_cmd_exists("nft"))
> > > +		return 0;
> > > +
> > > +	img = img_from_set(fds, CR_FD_NFTABLES);
> > > +	if (run_nftables_tool("nft list ruleset", -1, img_raw_fd(img)))
> > > +		return -1;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > >  static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds)
> > >  {
> > >  	void *buf, *o_buf;
> > > @@ -2082,6 +2107,38 @@ out:
> > >  	return ret;
> > >  }
> > >  
> > > +static inline int restore_nftables(int pid)
> > 
> > why do we need to inline this function?
> > 
> > > +{
> > > +	int ret = -1;
> > > +	struct cr_img *img;
> > > +
> > > +	img = open_image(CR_FD_NFTABLES, O_RSTR, pid);
> > > +	if (img == NULL)
> > > +		return -1;
> > > +	if (empty_image(img)) {
> > > +		/* Backward compatibility */
> > > +		pr_info("Skipping nft restore, no image");
> > > +		ret = 0;
> > > +		goto out;
> > > +	}
> > > +
> > > +	/*
> > > +	 * At this point we already know, that image may contain nftables info,
> > > +	 * if nft utility not present we need to warn user about possible
> > > +	 * problems
> > > +	 */
> > > +	if (!check_cmd_exists("nft")) {
> > > +		pr_warn("Skipping nft restore, no nft utility");
> > 
> > This must be the error and need to return -1. If we have rules in a
> > image file, we don't want to ignore them.
> > 
> > > +		ret = 0;
> > > +		goto out;
> > > +	}
> > > +
> > > +	ret = run_nftables_tool("nft -f /proc/self/fd/0", img_raw_fd(img), -1);
> > 
> > 
> > > +out:
> > > +	close_image(img);
> > > +	return ret;
> > > +}
> > > +
> > >  int read_net_ns_img(void)
> > >  {
> > >  	struct ns_id *ns;
> > > @@ -2299,6 +2356,8 @@ int dump_net_ns(struct ns_id *ns)
> > >  			ret = dump_rule(fds);
> > >  		if (!ret)
> > >  			ret = dump_iptables(fds);
> > > +		if (!ret)
> > > +			ret = dump_nftables(fds);
> > >  		if (!ret)
> > >  			ret = dump_netns_conf(ns, fds);
> > >  	} else if (ns->type != NS_ROOT) {
> > > @@ -2392,6 +2451,8 @@ static int prepare_net_ns_second_stage(struct ns_id *ns)
> > >  			ret = restore_rule(nsid);
> > >  		if (!ret)
> > >  			ret = restore_iptables(nsid);
> > > +		if (!ret)
> > > +			ret = restore_nftables(nsid);
> > >  	}
> > >  
> > >  	if (!ret)
> > > diff --git a/criu/util.c b/criu/util.c
> > > index 2a3d7abc..b4b1564f 100644
> > > --- a/criu/util.c
> > > +++ b/criu/util.c
> > > @@ -795,6 +795,45 @@ struct vma_area *alloc_vma_area(void)
> > >  	return p;
> > >  }
> > >  
> > > +static int __check_cmd_exists(const char *path)
> > > +{
> > > +	return (access(path, F_OK) == 0);
> > > +}
> > > +
> > > +int check_cmd_exists(const char *cmd)
> > > +{
> > > +	int ret = 0;
> > > +	char buf[255];
> > > +	char *env_path, *dup, *s, *p;
> > > +
> > > +	env_path = getenv("PATH");
> > > +
> > > +	/* ok, the simply __check_cmd_exists */
> > > +	if (!env_path || cmd[0] == '/')
> > > +		return __check_cmd_exists(cmd);
> > > +
> > > +	dup = strdup(env_path);
> > > +
> > 	dup = xstrdup(env_path);
> > 	if (dup == NULL)
> > 		return -1;
> > 
> > > +	/* let's try to find program in PATH */
> > > +	s = dup;
> > > +	p = NULL;
> > > +	do {
> > > +		p = strchr(s, ':');
> > > +		if (p != NULL)
> > > +			p[0] = 0;
> > > +
> > > +		sprintf(buf, "%s/%s", s, cmd);
> > > +		if ((ret = __check_cmd_exists(buf)))
> > 
> > 		ret = __check_cmd_exists(buf);
> > 		if (ret)
> > 
> > > +			goto free;
> > > +
> > > +		s = p + 1;
> > > +	} while (p != NULL);
> > > +
> > > +free:
> > > +	free(dup);
> > 
> > 	xfree(dup);
> > 
> > > +	return ret;
> > > +}
> > > +
> > >  int mkdirpat(int fd, const char *path, int mode)
> > >  {
> > >  	size_t i;
> > > -- 
> > > 2.17.1
> > > 
> > > _______________________________________________
> > > CRIU mailing list
> > > CRIU@openvz.org
> > > https://lists.openvz.org/mailman/listinfo/criu
Kirill Gorkunov Nov. 14, 2019, 7:04 a.m.
On Wed, Nov 13, 2019 at 08:29:15PM -0800, Andrei Vagin (C) wrote:
> On Wed, Nov 13, 2019 at 02:14:06PM +0000, Alexander Mikhalitsyn wrote:
> > On Wed, 13 Nov 2019 00:45:20 -0800
> > Andrei Vagin <avagin@gmail.com> wrote:
> > 
> > > On Tue, Nov 12, 2019 at 07:06:42PM +0300, Alexander Mikhalitsyn wrote:
> > > > From: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
> > > > 
> > > > After Centos-8 nft used instead of iptables. But we had never supported nft rules in
> > > > CRIU, and after c/r all rules are flushed.
> > > > 
> > > > Path to nft tool can be changed via CR_NFTABLES environment variable
> > > > similar to CR_IPTABLES.
> > > 
> > > 
> > > Can we use libnftnl? This should be faster, because we will not need to
> > > fork a new process and exec a binary.
> > > 
> 
> > Good point, but we have some problems with that. Currently, when we do
> > "nft list ruleset" it output have special format (that corresponds to
> > formal grammar defined by (nftables/src/parser_bison.y)). It's not
> > format that supported by libnftnl. Currently libnftnl supports only
> > dumping ruleset in "command format" as a sequence of nft commands that
> > describes current ruleset. But if we try to use this format on
> > Checkpoint in CRIU, then we have a problems on Restore - we need to
> > execute a lot (maybe) "nft ..." commands. In performance terms it may
> > be even worse then as we doing now. Of course, we can grab this parser
> > from nftables to CRIU, but it's a lot of code and then we need
> > additional compile-time deps - Bison for example. I don't know what to
> > choose... :)
> 
> Maybe we can use this way:
> 
> on dump:
>   * create a netlink socket
>   * send a request to list all rules
>   * save raw netlink messages in an image file.
> 
> on restore:
>   * create a netlink socket
>   * send netlink messages from the image file into the socket.
> 
> "ip addr save" and "ip addr restore" work this way.
> https://github.com/shemminger/iproute2/blob/master/ip/ipaddress.c#L1552
> https://github.com/shemminger/iproute2/blob/master/ip/ipaddress.c#L1618

Which implies to move in part of netlink protocol into criu.
Personally I don't mind but to me this looks a bit complex.
I would rather stick with not that fast but more reliable
usage of exec'ing CR_NFTABLES and the we could try to
implement builtin (netlink based) data fetch from
the kernel.
Andrei Vagin Nov. 14, 2019, 8:02 a.m.
On Wed, Nov 13, 2019, 11:04 PM Kirill Gorkunov <gorcunov@virtuozzo.com>
wrote:

> On Wed, Nov 13, 2019 at 08:29:15PM -0800, Andrei Vagin (C) wrote:
> > On Wed, Nov 13, 2019 at 02:14:06PM +0000, Alexander Mikhalitsyn wrote:
> > > On Wed, 13 Nov 2019 00:45:20 -0800
> > > Andrei Vagin <avagin@gmail.com> wrote:
> > >
> > > > On Tue, Nov 12, 2019 at 07:06:42PM +0300, Alexander Mikhalitsyn
> wrote:
> > > > > From: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
> > > > >
> > > > > After Centos-8 nft used instead of iptables. But we had never
> supported nft rules in
> > > > > CRIU, and after c/r all rules are flushed.
> > > > >
> > > > > Path to nft tool can be changed via CR_NFTABLES environment
> variable
> > > > > similar to CR_IPTABLES.
> > > >
> > > >
> > > > Can we use libnftnl? This should be faster, because we will not need
> to
> > > > fork a new process and exec a binary.
> > > >
> >
> > > Good point, but we have some problems with that. Currently, when we do
> > > "nft list ruleset" it output have special format (that corresponds to
> > > formal grammar defined by (nftables/src/parser_bison.y)). It's not
> > > format that supported by libnftnl. Currently libnftnl supports only
> > > dumping ruleset in "command format" as a sequence of nft commands that
> > > describes current ruleset. But if we try to use this format on
> > > Checkpoint in CRIU, then we have a problems on Restore - we need to
> > > execute a lot (maybe) "nft ..." commands. In performance terms it may
> > > be even worse then as we doing now. Of course, we can grab this parser
> > > from nftables to CRIU, but it's a lot of code and then we need
> > > additional compile-time deps - Bison for example. I don't know what to
> > > choose... :)
> >
> > Maybe we can use this way:
> >
> > on dump:
> >   * create a netlink socket
> >   * send a request to list all rules
> >   * save raw netlink messages in an image file.
> >
> > on restore:
> >   * create a netlink socket
> >   * send netlink messages from the image file into the socket.
> >
> > "ip addr save" and "ip addr restore" work this way.
> > https://github.com/shemminger/iproute2/blob/master/ip/ipaddress.c#L1552
> > https://github.com/shemminger/iproute2/blob/master/ip/ipaddress.c#L1618
>
> Which implies to move in part of netlink protocol into criu.
> Personally I don't mind but to me this looks a bit complex.
> I would rather stick with not that fast but more reliable
> usage of exec'ing CR_NFTABLES and the we could try to
> implement builtin (netlink based) data fetch from
> the kernel.
>

I believe we already have all required parts of netlink in the criu code
base. And our experience with the iptable tool showed that tools are not
always reliable. I think it's worth to try to implement this and see how it
will looks like. I think it should not take more than an hour or two.
Kirill Gorkunov Nov. 14, 2019, 8:32 a.m.
On Thu, Nov 14, 2019 at 12:02:01AM -0800, Andrei Vagin wrote:
...
>      Which implies to move in part of netlink protocol into criu.
>      Personally I don't mind but to me this looks a bit complex.
>      I would rather stick with not that fast but more reliable
>      usage of exec'ing CR_NFTABLES and the we could try to
>      implement builtin (netlink based) data fetch from
>      the kernel.
> 
> I believe we already have all required parts of netlink in the criu code
> base. And our experience with the iptable tool showed that tools are not
> always reliable. I think it's worth to try to implement this and see how
> it will looks like. I think it should not take more than an hour or two.

I already heard this phrase "it should not take more than an hour or two"
once or two times -- it ended up in a week coding marathon, so no longer
buy it :) I must confess I didn't look into kernel sources to figure out
which exactly things are needed but have a serious doubts it is that easy.

Still if Alexander would be able to estimate what we need to fetch by hands
this would be great!

	Cyrill
Kirill Gorkunov Nov. 14, 2019, 5:01 p.m.
On Thu, Nov 14, 2019 at 11:32:05AM +0300, Cyrill Gorcunov wrote:
>
> Still if Alexander would be able to estimate what we need to fetch by hands
> this would be great!

Been talking to Andrew f2f so we've got a consensus -- for vanilla criu
we definitely should use netlink protocol and fetch rulesets directly,
which should not be that hard but maybe a little time consuming task.
For vz7 instance we should figure out how urgent the support for nft
rules is needed and in case of real need we could make a faster fix
via execve and then once vanilla variant is ready -- switch to netlink.
Alexander Mikhalitsyn Nov. 18, 2019, 1:01 p.m.
On Thu, 14 Nov 2019 20:01:09 +0300
Kirill Gorkunov <gorcunov@virtuozzo.com> wrote:

> On Thu, Nov 14, 2019 at 11:32:05AM +0300, Cyrill Gorcunov wrote:
> >
> > Still if Alexander would be able to estimate what we need to fetch by hands
> > this would be great!
> 
> Been talking to Andrew f2f so we've got a consensus -- for vanilla criu
> we definitely should use netlink protocol and fetch rulesets directly,
> which should not be that hard but maybe a little time consuming task.
> For vz7 instance we should figure out how urgent the support for nft
> rules is needed and in case of real need we could make a faster fix
> via execve and then once vanilla variant is ready -- switch to netlink.
Dear friends,

I've made some investigation and tried to write initial implementation of nftables dump/restore using libnftnl. I would like to tell you what I did and what conclusions I came to:

I) It's *not* simple. Why?

In nft we operate the concept of "ruleset" (table, chains, rules, sets). We need to dump ruleset i.e. all tables, all chains (that assigned to some table), all rules (that assigned to some chain), all sets. To do that we need to form sequence of NETLINK messages: NFT_MSG_GETTABLE, NFT_MSG_GETCHAIN, NFT_MSG_GETRULE, NFT_MSG_GETSET (and for each set element NFT_MSG_GETSETELEM). To each of this requests kernel respond with special NETLINK messages that describes table, rule, etc. But. We *can't* simply save this binary netlink response messages and send that when needed to recreate for example table of rule! Why? Because this message contains more data, that we need to restore structure (message type ID the same when we get table or when we create table - that's ok).

Consider example (dump/creation of the same table)

Message that we can construct from netlink response to GETTABLE request:
(I've only changed flags to R (request) and A (ack))
----------------	------------------
|  0000000068  |	| message length |
| 02560 | R-A- |	|  type | flags  |
|  1574068571  |	| sequence number|
|  0000004410  |	|     port ID    |
----------------	------------------
|00002|-B|01024|	|len |flags| type|
|00017|--|00001|	|len |flags| type|
| 74 65 73 74  |	|      data      |	 t e s t
| 31 32 33 33  |	|      data      |	 1 2 3 3
| 34 34 73 64  |	|      data      |	 4 4 s d
| 00 00 00 00  |	|      data      |	        
|00008|--|00002|	|len |flags| type|
| 00 00 00 00  |	|      data      |	        
|00008|--|00003|	|len |flags| type|
| 00 00 00 00  |	|      data      |	        
|00012|--|00004|	|len |flags| type|
| 00 00 00 00  |	|      data      |	        
| 00 00 00 24  |	|      data      |	       $
----------------	------------------

This is the *same* message that I construct by using libnftnl
----------------	------------------
|  0000000040  |	| message length |
| 02560 | R-A- |	|  type | flags  |
|  1574068472  |	| sequence number|
|  0000004217  |	|     port ID    |
----------------	------------------
|00017|--|00001|	|len |flags| type|
| 74 65 73 74  |	|      data      |	 t e s t
| 31 32 33 33  |	|      data      |	 1 2 3 3
| 34 34 73 64  |	|      data      |	 4 4 s d
| 00 00 00 00  |	|      data      |	        
----------------	------------------

As you see in first message we have some additional data - it's for example "kernel handle" of table, and other attributes. Idealy, after GETTABLE request I need to do several things with response:
1. change flags to R A (request, ack).
2. Obtain and remove extra attributes (NFTA_TABLE_USE attribute for table and so on)

II) Approach, described in previous point looks ugly for me. Why? Because I need to make some changes in raw netlink messages and it may provoke bugs and so on.

But...

III) I think that we have two possible solutions to this problem:
1. We could try some hack -> in libnftnl we have several helper functions that could convert raw netlink messages to libnftnl-defined structures (struct nftnl_table, struct nftnl_chain, etc). We can parse response netlink messages to netlink structures, then vice versa construct payload to create table (chain, rule, set)! :) Looks a bit uggly. But it's better than make some changes in binary NETLINK messages by hands. A? When we use this way all extra attributes will be stripped! Look on nftnl_table_nlmsg_build_payload() function from libnftnl (file table.c) and compare it to nftnl_table_parse_attr_cb() for instance.
2. Another option is to make new feature in libnftnl library to dump ruleset to binary file (or json/xml) and then reconstruct ruleset from file. It's hard. But in this case we make some good feature in libnftnl and guys from netlink will support this code.

I would like to hear your opinion and advices. :)

Thank you
Regards, Alex
Alexander Mikhalitsyn Nov. 20, 2019, 1:48 p.m.
On Mon, 18 Nov 2019 16:01:50 +0300
Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com> wrote:

> On Thu, 14 Nov 2019 20:01:09 +0300
> Kirill Gorkunov <gorcunov@virtuozzo.com> wrote:
> 
> > On Thu, Nov 14, 2019 at 11:32:05AM +0300, Cyrill Gorcunov wrote:
> > >
> > > Still if Alexander would be able to estimate what we need to fetch by hands
> > > this would be great!
> > 
> > Been talking to Andrew f2f so we've got a consensus -- for vanilla criu
> > we definitely should use netlink protocol and fetch rulesets directly,
> > which should not be that hard but maybe a little time consuming task.
> > For vz7 instance we should figure out how urgent the support for nft
> > rules is needed and in case of real need we could make a faster fix
> > via execve and then once vanilla variant is ready -- switch to netlink.
> Dear friends,
> 
> I've made some investigation and tried to write initial implementation of nftables dump/restore using libnftnl. I would like to tell you what I did and what conclusions I came to:
> 
> I) It's *not* simple. Why?
> 
> In nft we operate the concept of "ruleset" (table, chains, rules, sets). We need to dump ruleset i.e. all tables, all chains (that assigned to some table), all rules (that assigned to some chain), all sets. To do that we need to form sequence of NETLINK messages: NFT_MSG_GETTABLE, NFT_MSG_GETCHAIN, NFT_MSG_GETRULE, NFT_MSG_GETSET (and for each set element NFT_MSG_GETSETELEM). To each of this requests kernel respond with special NETLINK messages that describes table, rule, etc. But. We *can't* simply save this binary netlink response messages and send that when needed to recreate for example table of rule! Why? Because this message contains more data, that we need to restore structure (message type ID the same when we get table or when we create table - that's ok).
> 
> Consider example (dump/creation of the same table)
> 
> Message that we can construct from netlink response to GETTABLE request:
> (I've only changed flags to R (request) and A (ack))
> ----------------	------------------
> |  0000000068  |	| message length |
> | 02560 | R-A- |	|  type | flags  |
> |  1574068571  |	| sequence number|
> |  0000004410  |	|     port ID    |
> ----------------	------------------
> |00002|-B|01024|	|len |flags| type|
> |00017|--|00001|	|len |flags| type|
> | 74 65 73 74  |	|      data      |	 t e s t
> | 31 32 33 33  |	|      data      |	 1 2 3 3
> | 34 34 73 64  |	|      data      |	 4 4 s d
> | 00 00 00 00  |	|      data      |	        
> |00008|--|00002|	|len |flags| type|
> | 00 00 00 00  |	|      data      |	        
> |00008|--|00003|	|len |flags| type|
> | 00 00 00 00  |	|      data      |	        
> |00012|--|00004|	|len |flags| type|
> | 00 00 00 00  |	|      data      |	        
> | 00 00 00 24  |	|      data      |	       $
> ----------------	------------------
> 
> This is the *same* message that I construct by using libnftnl
> ----------------	------------------
> |  0000000040  |	| message length |
> | 02560 | R-A- |	|  type | flags  |
> |  1574068472  |	| sequence number|
> |  0000004217  |	|     port ID    |
> ----------------	------------------
> |00017|--|00001|	|len |flags| type|
> | 74 65 73 74  |	|      data      |	 t e s t
> | 31 32 33 33  |	|      data      |	 1 2 3 3
> | 34 34 73 64  |	|      data      |	 4 4 s d
> | 00 00 00 00  |	|      data      |	        
> ----------------	------------------
> 
> As you see in first message we have some additional data - it's for example "kernel handle" of table, and other attributes. Idealy, after GETTABLE request I need to do several things with response:
> 1. change flags to R A (request, ack).
> 2. Obtain and remove extra attributes (NFTA_TABLE_USE attribute for table and so on)
> 
> II) Approach, described in previous point looks ugly for me. Why? Because I need to make some changes in raw netlink messages and it may provoke bugs and so on.
> 
> But...
> 
> III) I think that we have two possible solutions to this problem:
> 1. We could try some hack -> in libnftnl we have several helper functions that could convert raw netlink messages to libnftnl-defined structures (struct nftnl_table, struct nftnl_chain, etc). We can parse response netlink messages to netlink structures, then vice versa construct payload to create table (chain, rule, set)! :) Looks a bit uggly. But it's better than make some changes in binary NETLINK messages by hands. A? When we use this way all extra attributes will be stripped! Look on nftnl_table_nlmsg_build_payload() function from libnftnl (file table.c) and compare it to nftnl_table_parse_attr_cb() for instance.
> 2. Another option is to make new feature in libnftnl library to dump ruleset to binary file (or json/xml) and then reconstruct ruleset from file. It's hard. But in this case we make some good feature in libnftnl and guys from netlink will support this code.
> 
> I would like to hear your opinion and advices. :)
> 
> Thank you
> Regards, Alex

Hi guys,

I came with good news :)
I wrote a question to netfilter-devel mailing list about which API we could use that guaranteed to be unchanged. And Pablo Neira Ayuso adviced to take a look on API described on LIBNFTABLES(3) man. This API should not changed in near future. I've already implemented full ruleset dump/restore stand alone utilities using that API. All works just perfect. But we need to link CRIU with libnftables.

Guys, what do you think about it? Is it okay, to add new CRIU dependency - libnftables?

Regards, Alex.
Kirill Gorkunov Nov. 20, 2019, 2:12 p.m.
On Wed, Nov 20, 2019 at 04:48:56PM +0300, Alexander Mikhalitsyn wrote:
> 
> Hi guys,
> 
> I came with good news :)
> I wrote a question to netfilter-devel mailing list about which API we
> could use that guaranteed to be unchanged. And Pablo Neira Ayuso adviced
> to take a look on API described on LIBNFTABLES(3) man. This API should
> not changed in near future. I've already implemented full ruleset dump/restore
> stand alone utilities using that API. All works just perfect. But we need
> to link CRIU with libnftables.
> 
> Guys, what do you think about it? Is it okay, to add new CRIU dependency - libnftables?

As to me, this should be ok. Still the final solution is up to Andrew.