[1/2] sk-inet: Add initial support for raw sockets

Submitted by Cyrill Gorcunov on Feb. 22, 2017, 1:53 p.m.

Details

Message ID 1487771608-11470-2-git-send-email-gorcunov@openvz.org
State New
Series "net: Add support for raw sockets"
Headers show

Commit Message

Cyrill Gorcunov Feb. 22, 2017, 1:53 p.m.
For raw sockets we need the DIAG module extension, so if
collecting sockets fails, don't exit with an error but warn
the user instead; if we really meet a raw socket we will
exit later, at the socket lookup stage.

Strictly speaking we could use procfs parsing instead, but
that would be way more complex than the well-known diag
approach, and taking into account that raw sockets are
not that widely used, let's support them only when the diag
module is present in the system.

This patch adds initial raw socket support,
complete enough to handle the SO_IP_SET request from the
ipset tool (needed by modern containers). But the
code might need extensions/fixes in the future.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
---
 criu/cr-check.c        | 11 ++++++++++
 criu/include/sk-inet.h | 12 +++++++++++
 criu/sk-inet.c         | 58 ++++++++++++++++++++++++++++++++++++--------------
 criu/sockets.c         | 35 ++++++++++++++++++++++++++++++
 images/sk-inet.proto   |  2 ++
 5 files changed, 102 insertions(+), 16 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/cr-check.c b/criu/cr-check.c
index c8261255fdeb..3e487a774403 100644
--- a/criu/cr-check.c
+++ b/criu/cr-check.c
@@ -49,6 +49,7 @@ 
 #include "cr_options.h"
 #include "libnetlink.h"
 #include "net.h"
+#include "inet_diag.h"
 #include "linux/userfaultfd.h"
 #include "restorer.h"
 
@@ -1090,6 +1091,14 @@  static int check_sk_netns(void)
 	return 0;
 }
 
+static int check_net_diag_raw(void)
+{
+	check_sock_diag();
+	return !socket_test_collect_bit(AF_INET, IPPROTO_RAW) &&
+		!socket_test_collect_bit(AF_INET6, IPPROTO_RAW);
+}
+
+
 static int check_compat_cr(void)
 {
 	if (kdat_compat_sigreturn_test())
@@ -1202,6 +1211,7 @@  int cr_check(void)
 		ret |= check_userns();
 		ret |= check_loginuid();
 		ret |= check_sk_netns();
+		ret |= check_net_diag_raw();
 	}
 
 	/*
@@ -1254,6 +1264,7 @@  static struct feature_list feature_list[] = {
 	{ "lazy_pages", check_uffd },
 	{ "compat_cr", check_compat_cr },
 	{ "sk_ns", check_sk_netns },
+	{ "net_diag_raw", check_net_diag_raw },
 	{ NULL, NULL },
 };
 
diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h
index bf6fb1d77ddf..5d996581c18d 100644
--- a/criu/include/sk-inet.h
+++ b/criu/include/sk-inet.h
@@ -16,6 +16,18 @@ 
 #define TCP_REPAIR_OPTIONS	22
 #endif
 
+#ifndef IP_HDRINCL
+# define IP_HDRINCL		3
+#endif
+
+#ifndef IP_NODEFRAG
+# define IP_NODEFRAG		22
+#endif
+
+#ifndef IPV6_HDRINCL
+# define IPV6_HDRINCL		36
+#endif
+
 struct inet_sk_desc {
 	struct socket_desc	sd;
 	unsigned int		type;
diff --git a/criu/sk-inet.c b/criu/sk-inet.c
index ee6ce60e2b12..4efe6eb4cc11 100644
--- a/criu/sk-inet.c
+++ b/criu/sk-inet.c
@@ -101,7 +101,7 @@  static void show_one_inet_img(const char *act, const InetSkEntry *e)
 		e->state, src_addr);
 }
 
-static int can_dump_ipproto(int ino, int proto)
+static int can_dump_ipproto(int ino, int proto, int type)
 {
 	/* Make sure it's a proto we support */
 	switch (proto) {
@@ -111,8 +111,12 @@  static int can_dump_ipproto(int ino, int proto)
 	case IPPROTO_UDPLITE:
 		break;
 	default:
-		pr_err("Unsupported proto %d for socket %x\n", proto, ino);
-		return 0;
+		/* Raw sockets may have any protocol inside */
+		if (type != SOCK_RAW) {
+			pr_err("Unsupported proto %d (type %d) for socket %x\n",
+			       proto, type, ino);
+			return 0;
+		}
 	}
 
 	return 1;
@@ -142,9 +146,9 @@  static int can_dump_inet_sk(const struct inet_sk_desc *sk)
 		return 1;
 	}
 
-	if (sk->type != SOCK_STREAM) {
+	if (sk->type != SOCK_STREAM && sk->type != SOCK_RAW) {
 		pr_err("Can't dump %d inet socket %x. "
-				"Only can stream and dgram.\n",
+				"Only can stream, dgram and raw.\n",
 				sk->type, sk->sd.ino);
 		return 0;
 	}
@@ -288,12 +292,24 @@  err:
 	return NULL;
 }
 
-static int dump_ip_opts(int sk, IpOptsEntry *ioe)
+
+static int dump_ip_opts(int family, int type, int sk, IpOptsEntry *ioe)
 {
 	int ret = 0;
 
-	ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind);
-	ioe->has_freebind = ioe->freebind;
+	if (type == SOCK_RAW) {
+		if (family == AF_INET6) {
+			ret |= dump_opt(sk, SOL_IPV6, IPV6_HDRINCL, &ioe->hdrincl);
+		} else {
+			ret |= dump_opt(sk, SOL_IP, IP_HDRINCL, &ioe->hdrincl);
+			ret |= dump_opt(sk, SOL_IP, IP_NODEFRAG, &ioe->nodefrag);
+			ioe->has_nodefrag = ioe->nodefrag;
+		}
+		ioe->has_hdrincl = ioe->hdrincl;
+	} else {
+		ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind);
+		ioe->has_freebind = ioe->freebind;
+	}
 
 	return ret;
 }
@@ -323,14 +339,18 @@  static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa
 	InetSkEntry ie = INET_SK_ENTRY__INIT;
 	IpOptsEntry ipopts = IP_OPTS_ENTRY__INIT;
 	SkOptsEntry skopts = SK_OPTS_ENTRY__INIT;
-	int ret = -1, err = -1, proto;
+	int ret = -1, err = -1, proto, type;
 
 	ret = do_dump_opt(lfd, SOL_SOCKET, SO_PROTOCOL,
 					&proto, sizeof(proto));
 	if (ret)
 		goto err;
+	ret = do_dump_opt(lfd, SOL_SOCKET, SO_TYPE,
+			  &type, sizeof(type));
+	if (ret)
+		goto err;
 
-	if (!can_dump_ipproto(p->stat.st_ino, proto))
+	if (!can_dump_ipproto(p->stat.st_ino, proto, type))
 		goto err;
 
 	sk = (struct inet_sk_desc *)lookup_socket(p->stat.st_ino, family, proto);
@@ -410,7 +430,7 @@  static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa
 	memcpy(ie.src_addr, sk->src_addr, pb_repeated_size(&ie, src_addr));
 	memcpy(ie.dst_addr, sk->dst_addr, pb_repeated_size(&ie, dst_addr));
 
-	if (dump_ip_opts(lfd, &ipopts))
+	if (dump_ip_opts(family, sk->type, lfd, &ipopts))
 		goto err;
 
 	if (dump_socket_opts(lfd, &skopts))
@@ -424,7 +444,7 @@  static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa
 
 	switch (proto) {
 	case IPPROTO_TCP:
-		err = dump_one_tcp(lfd, sk);
+		err = (sk->type != SOCK_RAW) ? dump_one_tcp(lfd, sk) : 0;
 		break;
 	default:
 		err = 0;
@@ -606,12 +626,18 @@  static int post_open_inet_sk(struct file_desc *d, int sk)
 	return 0;
 }
 
-int restore_ip_opts(int sk, IpOptsEntry *ioe)
+int restore_ip_opts(int family, int sk, IpOptsEntry *ioe)
 {
 	int ret = 0;
 
 	if (ioe->has_freebind)
 		ret |= restore_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind);
+	if (ioe->has_nodefrag)
+		ret |= restore_opt(sk, SOL_IP, IP_NODEFRAG, &ioe->nodefrag);
+	if (ioe->has_hdrincl)
+		ret |= restore_opt(sk, family == AF_INET6 ? SOL_IPV6 : SOL_IP,
+				   family == AF_INET6 ? IPV6_HDRINCL : IP_HDRINCL,
+				   &ioe->hdrincl);
 
 	return ret;
 }
@@ -635,7 +661,7 @@  static int open_inet_sk(struct file_desc *d, int *new_fd)
 		return -1;
 	}
 
-	if ((ie->type != SOCK_STREAM) && (ie->type != SOCK_DGRAM)) {
+	if ((ie->type != SOCK_STREAM) && (ie->type != SOCK_DGRAM) && (ie->type != SOCK_RAW)) {
 		pr_err("Unsupported socket type: %d\n", ie->type);
 		return -1;
 	}
@@ -713,7 +739,7 @@  done:
 	if (rst_file_params(sk, ie->fown, ie->flags))
 		goto err;
 
-	if (ie->ip_opts && restore_ip_opts(sk, ie->ip_opts))
+	if (ie->ip_opts && restore_ip_opts(ie->family, sk, ie->ip_opts))
 		goto err;
 
 	if (restore_socket_opts(sk, ie->opts))
@@ -780,7 +806,7 @@  int inet_bind(int sk, struct inet_sk_info *ii)
 	 * sockets could not be bound to them in this moment
 	 * without setting IP_FREEBIND.
 	 */
-	if (ii->ie->family == AF_INET6) {
+	if (ii->ie->family == AF_INET6 && ii->ie->proto != IPPROTO_RAW) {
 		int yes = 1;
 
 		if (restore_opt(sk, SOL_IP, IP_FREEBIND, &yes))
diff --git a/criu/sockets.c b/criu/sockets.c
index 420eee8252e2..06938f27ce05 100644
--- a/criu/sockets.c
+++ b/criu/sockets.c
@@ -61,9 +61,11 @@  enum socket_cl_bits
 	INET_TCP_CL_BIT,
 	INET_UDP_CL_BIT,
 	INET_UDPLITE_CL_BIT,
+	INET_RAW_CL_BIT,
 	INET6_TCP_CL_BIT,
 	INET6_UDP_CL_BIT,
 	INET6_UDPLITE_CL_BIT,
+	INET6_RAW_CL_BIT,
 	UNIX_CL_BIT,
 	PACKET_CL_BIT,
 	_MAX_CL_BIT,
@@ -89,6 +91,8 @@  enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsigned int proto)
 			return INET_UDP_CL_BIT;
 		if (proto == IPPROTO_UDPLITE)
 			return INET_UDPLITE_CL_BIT;
+		if (proto == IPPROTO_RAW)
+			return INET_RAW_CL_BIT;
 	}
 	if (family == AF_INET6) {
 		if (proto == IPPROTO_TCP)
@@ -97,6 +101,8 @@  enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsigned int proto)
 			return INET6_UDP_CL_BIT;
 		if (proto == IPPROTO_UDPLITE)
 			return INET6_UDPLITE_CL_BIT;
+		if (proto == IPPROTO_RAW)
+			return INET6_RAW_CL_BIT;
 	}
 
 	pr_err("Unknown pair family %d proto %d\n", family, proto);
@@ -598,6 +604,9 @@  static int inet_receive_one(struct nlmsghdr *h, struct ns_id *ns, void *arg)
 	case IPPROTO_TCP:
 		type = SOCK_STREAM;
 		break;
+	case IPPROTO_RAW:
+		type = SOCK_RAW;
+		break;
 	case IPPROTO_UDP:
 	case IPPROTO_UDPLITE:
 		type = SOCK_DGRAM;
@@ -620,6 +629,14 @@  static int do_collect_req(int nl, struct sock_diag_req *req, int size,
 
 	if (tmp == 0)
 		set_collect_bit(req->r.n.sdiag_family, req->r.n.sdiag_protocol);
+	else if (tmp == -ENOENT &&
+		 ((req->r.n.sdiag_family == AF_INET ||
+		   req->r.n.sdiag_family == AF_INET6) &&
+		  req->r.n.sdiag_protocol == IPPROTO_RAW)) {
+		pr_warn("No support for DIAG module on family %s with protocol IPPROTO_RAW, may fail later\n",
+			req->r.n.sdiag_family == AF_INET ? "IPv4" : "IPv6");
+		tmp = 0;
+	}
 
 	return tmp;
 }
@@ -677,6 +694,15 @@  int collect_sockets(struct ns_id *ns)
 	if (tmp)
 		err = tmp;
 
+	/* Collect IPv4 RAW sockets */
+	req.r.i.sdiag_family	= AF_INET;
+	req.r.i.sdiag_protocol	= IPPROTO_RAW;
+	req.r.i.idiag_ext	= 0;
+	req.r.i.idiag_states	= -1; /* All */
+	tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, ns, &req.r.i);
+	if (tmp)
+		err = tmp;
+
 	/* Collect IPv6 TCP sockets */
 	req.r.i.sdiag_family	= AF_INET6;
 	req.r.i.sdiag_protocol	= IPPROTO_TCP;
@@ -708,6 +734,15 @@  int collect_sockets(struct ns_id *ns)
 	if (tmp)
 		err = tmp;
 
+	/* Collect IPv6 RAW sockets */
+	req.r.i.sdiag_family	= AF_INET6;
+	req.r.i.sdiag_protocol	= IPPROTO_RAW;
+	req.r.i.idiag_ext	= 0;
+	req.r.i.idiag_states	= -1; /* All */
+	tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, ns, &req.r.i);
+	if (tmp)
+		err = tmp;
+
 	req.r.p.sdiag_family	= AF_PACKET;
 	req.r.p.sdiag_protocol	= 0;
 	req.r.p.pdiag_show	= PACKET_SHOW_INFO | PACKET_SHOW_MCLIST |
diff --git a/images/sk-inet.proto b/images/sk-inet.proto
index 09c5a47d2464..173c74a40df7 100644
--- a/images/sk-inet.proto
+++ b/images/sk-inet.proto
@@ -6,6 +6,8 @@  import "sk-opts.proto";
 
 message ip_opts_entry {
 	optional bool		freebind	= 1;
+	optional bool		hdrincl		= 2;
+	optional bool		nodefrag	= 3;
 }
 
 message inet_sk_entry {

Comments

Andrey Vagin March 15, 2017, 10:33 p.m.
On Wed, Feb 22, 2017 at 04:53:27PM +0300, Cyrill Gorcunov wrote:
> For raw sockets we need DIAG module extension, so in case
> if we're failing while collecting socket don't exit with
> error but warn a user and if we really meet raw socket
> we will exit later on socket's lookup stage.
> 
> Strictly speaking we can use procfs parsing instead but
> this gonna be a way more complex that well-known diag
> approach and taking into account that raw sockets are
> note that widely used lets support only when diag module
> is present in the system.
> 
> In the patch the initial raw sockets support added
> compelte enough to handle SO_IP_SET request from
> ipset tool (needed by modern containers). But the
> code might need extention/fixes in future.
> 
> Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
> ---
>  criu/cr-check.c        | 11 ++++++++++
>  criu/include/sk-inet.h | 12 +++++++++++
>  criu/sk-inet.c         | 58 ++++++++++++++++++++++++++++++++++++--------------
>  criu/sockets.c         | 35 ++++++++++++++++++++++++++++++
>  images/sk-inet.proto   |  2 ++
>  5 files changed, 102 insertions(+), 16 deletions(-)
> 
> diff --git a/criu/cr-check.c b/criu/cr-check.c
> index c8261255fdeb..3e487a774403 100644
> --- a/criu/cr-check.c
> +++ b/criu/cr-check.c
> @@ -49,6 +49,7 @@
>  #include "cr_options.h"
>  #include "libnetlink.h"
>  #include "net.h"
> +#include "inet_diag.h"
>  #include "linux/userfaultfd.h"
>  #include "restorer.h"
>  
> @@ -1090,6 +1091,14 @@ static int check_sk_netns(void)
>  	return 0;
>  }
>  
> +static int check_net_diag_raw(void)
> +{
> +	check_sock_diag();
> +	return !socket_test_collect_bit(AF_INET, IPPROTO_RAW) &&
> +		!socket_test_collect_bit(AF_INET6, IPPROTO_RAW);
> +}
> +
> +
>  static int check_compat_cr(void)
>  {
>  	if (kdat_compat_sigreturn_test())
> @@ -1202,6 +1211,7 @@ int cr_check(void)
>  		ret |= check_userns();
>  		ret |= check_loginuid();
>  		ret |= check_sk_netns();
> +		ret |= check_net_diag_raw();
>  	}
>  
>  	/*
> @@ -1254,6 +1264,7 @@ static struct feature_list feature_list[] = {
>  	{ "lazy_pages", check_uffd },
>  	{ "compat_cr", check_compat_cr },
>  	{ "sk_ns", check_sk_netns },
> +	{ "net_diag_raw", check_net_diag_raw },
>  	{ NULL, NULL },
>  };
>  
> diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h
> index bf6fb1d77ddf..5d996581c18d 100644
> --- a/criu/include/sk-inet.h
> +++ b/criu/include/sk-inet.h
> @@ -16,6 +16,18 @@
>  #define TCP_REPAIR_OPTIONS	22
>  #endif
>  
> +#ifndef IP_HDRINCL
> +# define IP_HDRINCL		3
> +#endif
> +
> +#ifndef IP_NODEFRAG
> +# define IP_NODEFRAG		22
> +#endif
> +
> +#ifndef IPV6_HDRINCL
> +# define IPV6_HDRINCL		36
> +#endif
> +
>  struct inet_sk_desc {
>  	struct socket_desc	sd;
>  	unsigned int		type;
> diff --git a/criu/sk-inet.c b/criu/sk-inet.c
> index ee6ce60e2b12..4efe6eb4cc11 100644
> --- a/criu/sk-inet.c
> +++ b/criu/sk-inet.c
> @@ -101,7 +101,7 @@ static void show_one_inet_img(const char *act, const InetSkEntry *e)
>  		e->state, src_addr);
>  }
>  
> -static int can_dump_ipproto(int ino, int proto)
> +static int can_dump_ipproto(int ino, int proto, int type)
>  {
>  	/* Make sure it's a proto we support */
>  	switch (proto) {
> @@ -111,8 +111,12 @@ static int can_dump_ipproto(int ino, int proto)
>  	case IPPROTO_UDPLITE:
>  		break;
>  	default:
> -		pr_err("Unsupported proto %d for socket %x\n", proto, ino);
> -		return 0;
> +		/* Raw sockets may have any protocol inside */
> +		if (type != SOCK_RAW) {

maybe we can check type out of this function?

> +			pr_err("Unsupported proto %d (type %d) for socket %x\n",
> +			       proto, type, ino);
> +			return 0;
> +		}
>  	}
>  
>  	return 1;
> @@ -142,9 +146,9 @@ static int can_dump_inet_sk(const struct inet_sk_desc *sk)
>  		return 1;
>  	}
>  
> -	if (sk->type != SOCK_STREAM) {
> +	if (sk->type != SOCK_STREAM && sk->type != SOCK_RAW) {
>  		pr_err("Can't dump %d inet socket %x. "
> -				"Only can stream and dgram.\n",
> +				"Only can stream, dgram and raw.\n",
>  				sk->type, sk->sd.ino);
>  		return 0;
>  	}
> @@ -288,12 +292,24 @@ err:
>  	return NULL;
>  }
>  
> -static int dump_ip_opts(int sk, IpOptsEntry *ioe)
> +
> +static int dump_ip_opts(int family, int type, int sk, IpOptsEntry *ioe)
>  {
>  	int ret = 0;
>  
> -	ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind);
> -	ioe->has_freebind = ioe->freebind;
> +	if (type == SOCK_RAW) {
> +		if (family == AF_INET6) {
> +			ret |= dump_opt(sk, SOL_IPV6, IPV6_HDRINCL, &ioe->hdrincl);
> +		} else {
> +			ret |= dump_opt(sk, SOL_IP, IP_HDRINCL, &ioe->hdrincl);
> +			ret |= dump_opt(sk, SOL_IP, IP_NODEFRAG, &ioe->nodefrag);
> +			ioe->has_nodefrag = ioe->nodefrag;
> +		}
> +		ioe->has_hdrincl = ioe->hdrincl;
> +	} else {
> +		ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind);

is IP_FREEBIND not suitable for raw sockets?

> +		ioe->has_freebind = ioe->freebind;
> +	}
>  
>  	return ret;
>  }
> @@ -323,14 +339,18 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa
>  	InetSkEntry ie = INET_SK_ENTRY__INIT;
>  	IpOptsEntry ipopts = IP_OPTS_ENTRY__INIT;
>  	SkOptsEntry skopts = SK_OPTS_ENTRY__INIT;
> -	int ret = -1, err = -1, proto;
> +	int ret = -1, err = -1, proto, type;
>  
>  	ret = do_dump_opt(lfd, SOL_SOCKET, SO_PROTOCOL,
>  					&proto, sizeof(proto));
>  	if (ret)
>  		goto err;
> +	ret = do_dump_opt(lfd, SOL_SOCKET, SO_TYPE,
> +			  &type, sizeof(type));
> +	if (ret)
> +		goto err;
>  
> -	if (!can_dump_ipproto(p->stat.st_ino, proto))
> +	if (!can_dump_ipproto(p->stat.st_ino, proto, type))

something like this

	if (type != SOCK_RAW && !can_dump_ipproto(p->stat.st_ino, proto, type)) {

>  		goto err;
>  
>  	sk = (struct inet_sk_desc *)lookup_socket(p->stat.st_ino, family, proto);
> @@ -410,7 +430,7 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa
>  	memcpy(ie.src_addr, sk->src_addr, pb_repeated_size(&ie, src_addr));
>  	memcpy(ie.dst_addr, sk->dst_addr, pb_repeated_size(&ie, dst_addr));
>  
> -	if (dump_ip_opts(lfd, &ipopts))
> +	if (dump_ip_opts(family, sk->type, lfd, &ipopts))
>  		goto err;
>  
>  	if (dump_socket_opts(lfd, &skopts))
> @@ -424,7 +444,7 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa
>  
>  	switch (proto) {
>  	case IPPROTO_TCP:
> -		err = dump_one_tcp(lfd, sk);
> +		err = (sk->type != SOCK_RAW) ? dump_one_tcp(lfd, sk) : 0;
>  		break;
>  	default:
>  		err = 0;
> @@ -606,12 +626,18 @@ static int post_open_inet_sk(struct file_desc *d, int sk)
>  	return 0;
>  }
>  
> -int restore_ip_opts(int sk, IpOptsEntry *ioe)
> +int restore_ip_opts(int family, int sk, IpOptsEntry *ioe)
>  {
>  	int ret = 0;
>  
>  	if (ioe->has_freebind)
>  		ret |= restore_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind);
> +	if (ioe->has_nodefrag)
> +		ret |= restore_opt(sk, SOL_IP, IP_NODEFRAG, &ioe->nodefrag);
> +	if (ioe->has_hdrincl)
> +		ret |= restore_opt(sk, family == AF_INET6 ? SOL_IPV6 : SOL_IP,
> +				   family == AF_INET6 ? IPV6_HDRINCL : IP_HDRINCL,
> +				   &ioe->hdrincl);
>  
>  	return ret;
>  }
> @@ -635,7 +661,7 @@ static int open_inet_sk(struct file_desc *d, int *new_fd)
>  		return -1;
>  	}
>  
> -	if ((ie->type != SOCK_STREAM) && (ie->type != SOCK_DGRAM)) {
> +	if ((ie->type != SOCK_STREAM) && (ie->type != SOCK_DGRAM) && (ie->type != SOCK_RAW)) {
>  		pr_err("Unsupported socket type: %d\n", ie->type);
>  		return -1;
>  	}
> @@ -713,7 +739,7 @@ done:
>  	if (rst_file_params(sk, ie->fown, ie->flags))
>  		goto err;
>  
> -	if (ie->ip_opts && restore_ip_opts(sk, ie->ip_opts))
> +	if (ie->ip_opts && restore_ip_opts(ie->family, sk, ie->ip_opts))
>  		goto err;
>  
>  	if (restore_socket_opts(sk, ie->opts))
> @@ -780,7 +806,7 @@ int inet_bind(int sk, struct inet_sk_info *ii)
>  	 * sockets could not be bound to them in this moment
>  	 * without setting IP_FREEBIND.
>  	 */
> -	if (ii->ie->family == AF_INET6) {
> +	if (ii->ie->family == AF_INET6 && ii->ie->proto != IPPROTO_RAW) {
>  		int yes = 1;
>  
>  		if (restore_opt(sk, SOL_IP, IP_FREEBIND, &yes))
> diff --git a/criu/sockets.c b/criu/sockets.c
> index 420eee8252e2..06938f27ce05 100644
> --- a/criu/sockets.c
> +++ b/criu/sockets.c
> @@ -61,9 +61,11 @@ enum socket_cl_bits
>  	INET_TCP_CL_BIT,
>  	INET_UDP_CL_BIT,
>  	INET_UDPLITE_CL_BIT,
> +	INET_RAW_CL_BIT,
>  	INET6_TCP_CL_BIT,
>  	INET6_UDP_CL_BIT,
>  	INET6_UDPLITE_CL_BIT,
> +	INET6_RAW_CL_BIT,
>  	UNIX_CL_BIT,
>  	PACKET_CL_BIT,
>  	_MAX_CL_BIT,
> @@ -89,6 +91,8 @@ enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsigned int proto)
>  			return INET_UDP_CL_BIT;
>  		if (proto == IPPROTO_UDPLITE)
>  			return INET_UDPLITE_CL_BIT;
> +		if (proto == IPPROTO_RAW)
> +			return INET_RAW_CL_BIT;
>  	}
>  	if (family == AF_INET6) {
>  		if (proto == IPPROTO_TCP)
> @@ -97,6 +101,8 @@ enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsigned int proto)
>  			return INET6_UDP_CL_BIT;
>  		if (proto == IPPROTO_UDPLITE)
>  			return INET6_UDPLITE_CL_BIT;
> +		if (proto == IPPROTO_RAW)
> +			return INET6_RAW_CL_BIT;
>  	}
>  
>  	pr_err("Unknown pair family %d proto %d\n", family, proto);
> @@ -598,6 +604,9 @@ static int inet_receive_one(struct nlmsghdr *h, struct ns_id *ns, void *arg)
>  	case IPPROTO_TCP:
>  		type = SOCK_STREAM;
>  		break;
> +	case IPPROTO_RAW:
> +		type = SOCK_RAW;
> +		break;
>  	case IPPROTO_UDP:
>  	case IPPROTO_UDPLITE:
>  		type = SOCK_DGRAM;
> @@ -620,6 +629,14 @@ static int do_collect_req(int nl, struct sock_diag_req *req, int size,
>  
>  	if (tmp == 0)
>  		set_collect_bit(req->r.n.sdiag_family, req->r.n.sdiag_protocol);
> +	else if (tmp == -ENOENT &&
> +		 ((req->r.n.sdiag_family == AF_INET ||
> +		   req->r.n.sdiag_family == AF_INET6) &&
> +		  req->r.n.sdiag_protocol == IPPROTO_RAW)) {
> +		pr_warn("No support for DIAG module on family %s with protocol IPPROTO_RAW, may fail later\n",
> +			req->r.n.sdiag_family == AF_INET ? "IPv4" : "IPv6");
> +		tmp = 0;


can you handle this error in collect_sockets() like we do for netlink
and packet sockets?

> +	}
>  
>  	return tmp;
>  }
> @@ -677,6 +694,15 @@ int collect_sockets(struct ns_id *ns)
>  	if (tmp)
>  		err = tmp;
>  
> +	/* Collect IPv4 RAW sockets */
> +	req.r.i.sdiag_family	= AF_INET;
> +	req.r.i.sdiag_protocol	= IPPROTO_RAW;
> +	req.r.i.idiag_ext	= 0;
> +	req.r.i.idiag_states	= -1; /* All */
> +	tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, ns, &req.r.i);
> +	if (tmp)
> +		err = tmp;
> +
>  	/* Collect IPv6 TCP sockets */
>  	req.r.i.sdiag_family	= AF_INET6;
>  	req.r.i.sdiag_protocol	= IPPROTO_TCP;
> @@ -708,6 +734,15 @@ int collect_sockets(struct ns_id *ns)
>  	if (tmp)
>  		err = tmp;
>  
> +	/* Collect IPv6 RAW sockets */
> +	req.r.i.sdiag_family	= AF_INET6;
> +	req.r.i.sdiag_protocol	= IPPROTO_RAW;
> +	req.r.i.idiag_ext	= 0;
> +	req.r.i.idiag_states	= -1; /* All */
> +	tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, ns, &req.r.i);
> +	if (tmp)
> +		err = tmp;
> +
>  	req.r.p.sdiag_family	= AF_PACKET;
>  	req.r.p.sdiag_protocol	= 0;
>  	req.r.p.pdiag_show	= PACKET_SHOW_INFO | PACKET_SHOW_MCLIST |
> diff --git a/images/sk-inet.proto b/images/sk-inet.proto
> index 09c5a47d2464..173c74a40df7 100644
> --- a/images/sk-inet.proto
> +++ b/images/sk-inet.proto
> @@ -6,6 +6,8 @@ import "sk-opts.proto";
>  
>  message ip_opts_entry {
>  	optional bool		freebind	= 1;
> +	optional bool		hdrincl		= 2;
> +	optional bool		nodefrag	= 3;
>  }
>  
>  message inet_sk_entry {
> -- 
> 2.7.4
>
Cyrill Gorcunov March 22, 2017, 2:55 p.m.
On Wed, Mar 15, 2017 at 03:33:18PM -0700, Andrey Vagin wrote:
> >  
> > -static int can_dump_ipproto(int ino, int proto)
> > +static int can_dump_ipproto(int ino, int proto, int type)
> >  {
> >  	/* Make sure it's a proto we support */
> >  	switch (proto) {
> > @@ -111,8 +111,12 @@ static int can_dump_ipproto(int ino, int proto)
> >  	case IPPROTO_UDPLITE:
> >  		break;
> >  	default:
> > -		pr_err("Unsupported proto %d for socket %x\n", proto, ino);
> > -		return 0;
> > +		/* Raw sockets may have any protocol inside */
> > +		if (type != SOCK_RAW) {
> 
> maybe we can check type out of this function?

Whether an ipproto can be dumped depends on two factors -- the protocol
itself and the socket type. I think the opposite -- this place is very
suitable for the check.

> >  
> > -	ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind);
> > -	ioe->has_freebind = ioe->freebind;
> > +	if (type == SOCK_RAW) {
> > +		if (family == AF_INET6) {
> > +			ret |= dump_opt(sk, SOL_IPV6, IPV6_HDRINCL, &ioe->hdrincl);
> > +		} else {
> > +			ret |= dump_opt(sk, SOL_IP, IP_HDRINCL, &ioe->hdrincl);
> > +			ret |= dump_opt(sk, SOL_IP, IP_NODEFRAG, &ioe->nodefrag);
> > +			ioe->has_nodefrag = ioe->nodefrag;
> > +		}
> > +		ioe->has_hdrincl = ioe->hdrincl;
> > +	} else {
> > +		ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind);
> 
> is IP_FREEBIND not suitable for raw sockets?

Well, strictly speaking kernel won't fail if we setup freebind here but
for first patches I don't want to modify it. I'll have to revisit raw
sockets more anyway and then I'll probably move it in.

> >  
> > -	if (!can_dump_ipproto(p->stat.st_ino, proto))
> > +	if (!can_dump_ipproto(p->stat.st_ino, proto, type))
> 
> something like this
> 
> 	if (type != SOCK_RAW && !can_dump_ipproto(p->stat.st_ino, proto, type))) {

No no, as I said I think can_dump_ipproto is better place for that.

> > +	else if (tmp == -ENOENT &&
> > +		 ((req->r.n.sdiag_family == AF_INET ||
> > +		   req->r.n.sdiag_family == AF_INET6) &&
> > +		  req->r.n.sdiag_protocol == IPPROTO_RAW)) {
> > +		pr_warn("No support for DIAG module on family %s with protocol IPPROTO_RAW, may fail later\n",
> > +			req->r.n.sdiag_family == AF_INET ? "IPv4" : "IPv6");
> > +		tmp = 0;
> 
> 
> can you handle this error in collect_sockets() like we do for netlink
> and packet sockers

There was some complication, iirc. Would you allow me to do it on top?

	Cyrill