[v7,6/8] net: add support for macvlan link types

Submitted by Tycho Andersen on Oct. 19, 2016, 4:54 a.m.

Details

Message ID 1476852848-31714-7-git-send-email-tycho.andersen@canonical.com
State Superseded
Series "Series without cover letter"
Headers show

Commit Message

Tycho Andersen Oct. 19, 2016, 4:54 a.m.
While this is in principle similar to how veths are handled, we have to do
things in two different ways depending on whether or not there is a user
namespace involved, because there is no way to ask the kernel to attach a
macvlan NIC to a device in a net ns that we don't have CAP_NET_ADMIN in.

So we do it in two ways:

a. If we are in a user namespace, we create the device in usernsd and use
   IFLA_NET_NS_FD to set the netns which it should be created in (saving
   us a "move into this netns" step).

b. If we aren't in a user namespace, we could still be in a net namespace,
   so we use IFLA_LINK_NETNSID to set the namespace that the i/o device will be
   in. Then we open a netlink socket from criu's netns and use
   IFLA_NET_NS_FD to tell the kernel to create the macvlan device in the
   target's namespace.

v2: * s/CLONE_NEWNET/CLONE_NEWUSER
    * Don't bother to dump IFLA_LINK and IFLA_LINK_NETNSID. Although we
      need to provide these on restore, there's no kernel interface that
      persists these. To populate IFLA_LINK, we require users pass
      --macvlan-pair, and we create a NETNSID relation as needed and pass
      that in for macvlan links (although this infrastructure could be used
      elsewhere for links that need it in the future, since it is in the
      hoisted populate_newlink_req()).
    * use new external command instead of creating a --macvlan-pair option

v3: add a feature check for linux/net_namespace.h, since not every arch in
    travis has this (new-ish) header

v4: * include sys/types.h instead of linux/if.h to get IFF_UP flag
    * remove old doc addition about --macvlan-pair option

v5: define IFLA_LINK_NETNSID and RTM_NEWNSID if they don't exist

v6: define IFLA_MACVLAN_FLAGS and bump the size of IFLA_MACVLAN_MAX when
    necessary

v7: * remove unused struct macvlan_pair
    * split feature test for linux/net_namespace.h into separate patch
    * move IFLA_INFO_MAX testing in dump_one_netdev to the right patch
    * add documentation for newlink_extras fields
    * split changeflags into separate patch
    * use existing netnsid if we get EEXIST
    * move macvlan code to a helper function
    * use netnsid to restore in userns case, and not pid

Signed-off-by: Tycho Andersen <tycho.andersen@canonical.com>
---
 criu/crtools.c            |   1 +
 criu/external.c           |   9 ++
 criu/include/libnetlink.h |   4 +
 criu/include/net.h        |   2 +
 criu/net.c                | 315 ++++++++++++++++++++++++++++++++++++++++++++--
 images/Makefile           |   1 +
 images/macvlan.proto      |   4 +
 images/netdev.proto       |   4 +
 8 files changed, 330 insertions(+), 10 deletions(-)
 create mode 100644 images/macvlan.proto

Patch hide | download patch | download mbox

diff --git a/criu/crtools.c b/criu/crtools.c
index 8b5ec5d..8deb20a 100644
--- a/criu/crtools.c
+++ b/criu/crtools.c
@@ -842,6 +842,7 @@  usage:
 "                        Formats of RES on restore:\n"
 "                            dev[VAL]:DEVPATH\n"
 "                            veth[IFNAME]:OUTNAME{@BRIDGE}\n"
+"                            macvlan[IFNAME]:OUTNAME\n"
 "\n"
 "* Special resources support:\n"
 "     --" SK_EST_PARAM "  checkpoint/restore established TCP connections\n"
diff --git a/criu/external.c b/criu/external.c
index bc6c6d4..d8fee21 100644
--- a/criu/external.c
+++ b/criu/external.c
@@ -3,6 +3,9 @@ 
 #include "cr_options.h"
 #include "xmalloc.h"
 #include "external.h"
+#include "util.h"
+
+#include "net.h"
 
 int add_external(char *key)
 {
@@ -12,6 +15,12 @@  int add_external(char *key)
 	if (!ext)
 		return -1;
 	ext->id = key;
+
+	if (strstartswith(key, "macvlan") && macvlan_ext_add(ext) < 0) {
+		xfree(ext);
+		return -1;
+	}
+
 	list_add(&ext->node, &opts.external);
 
 	return 0;
diff --git a/criu/include/libnetlink.h b/criu/include/libnetlink.h
index 591af0e..0549ef9 100644
--- a/criu/include/libnetlink.h
+++ b/criu/include/libnetlink.h
@@ -13,5 +13,9 @@  extern int addattr_l(struct nlmsghdr *n, int maxlen, int type,
 #define NLMSG_TAIL(nmsg) \
 	((struct rtattr *) (((void *) (nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
 
+/*
+ * Given a pointer to the struct rtgenmsg payload of a netlink message,
+ * yield a pointer to the first rtattr located right after the (aligned)
+ * rtgenmsg header. Only defined here when no header provided it already.
+ */
+#ifndef NETNS_RTA
+#define NETNS_RTA(r) \
+	((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct rtgenmsg))))
+#endif
 
 #endif /* __CR_LIBNETLINK_H__ */
diff --git a/criu/include/net.h b/criu/include/net.h
index b367e34..d621da1 100644
--- a/criu/include/net.h
+++ b/criu/include/net.h
@@ -4,6 +4,7 @@ 
 #include <linux/netlink.h>
 
 #include "list.h"
+#include "external.h"
 
 struct cr_imgset;
 extern int dump_net_ns(int ns_id);
@@ -30,6 +31,7 @@  extern int read_ns_sys_file(char *path, char *buf, int len);
 extern int restore_link_parms(NetDeviceEntry *nde, int nlsk);
 
 extern int veth_pair_add(char *in, char *out);
+extern int macvlan_ext_add(struct external *ext);
 extern int move_veth_to_bridge(void);
 
 #endif /* __CR_NET_H__ */
diff --git a/criu/net.c b/criu/net.c
index 4c502cd..1e688af 100644
--- a/criu/net.c
+++ b/criu/net.c
@@ -1,6 +1,7 @@ 
 #include <unistd.h>
 #include <sys/socket.h>
 #include <linux/netlink.h>
+#include <linux/net_namespace.h>
 #include <linux/rtnetlink.h>
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_conntrack.h>
@@ -10,6 +11,7 @@ 
 #include <sys/wait.h>
 #include <sched.h>
 #include <sys/mount.h>
+#include <sys/types.h>
 #include <net/if.h>
 #include <linux/sockios.h>
 #include <libnl3/netlink/msg.h>
@@ -34,6 +36,22 @@ 
 #include "protobuf.h"
 #include "images/netdev.pb-c.h"
 
+/*
+ * Fallback definitions for build environments whose kernel headers do not
+ * provide these netlink attribute / message identifiers as macros.
+ */
+#ifndef IFLA_LINK_NETNSID
+#define IFLA_LINK_NETNSID	37
+#endif
+
+#ifndef RTM_NEWNSID
+#define RTM_NEWNSID		88
+#endif
+
+#ifndef IFLA_MACVLAN_FLAGS
+#define IFLA_MACVLAN_FLAGS 2
+#endif
+
+#ifndef IFLA_MACVLAN_MAX
+#define IFLA_MACVLAN_MAX 6
+#endif
+
 static int ns_sysfs_fd = -1;
 
 int read_ns_sys_file(char *path, char *buf, int len)
@@ -508,6 +526,37 @@  static int dump_bridge(NetDeviceEntry *nde, struct cr_imgset *imgset, struct nla
 	return write_netdev_img(nde, imgset, info);
 }
 
+/*
+ * Dump the macvlan-specific part of a link.
+ *
+ * Parses the nested IFLA_INFO_DATA attribute from @info, copies the
+ * mandatory IFLA_MACVLAN_MODE and the optional IFLA_MACVLAN_FLAGS into a
+ * MacvlanLinkEntry, attaches it to @nde and writes the entry to @imgset.
+ *
+ * Returns the result of write_netdev_img(), or -1 when the link info is
+ * missing, cannot be parsed or carries no mode.
+ */
+static int dump_macvlan(NetDeviceEntry *nde, struct cr_imgset *imgset, struct nlattr **info)
+{
+	MacvlanLinkEntry macvlan = MACVLAN_LINK_ENTRY__INIT;
+	int ret;
+	/* nla_parse_nested() fills indexes 0..maxtype inclusive, hence the +1 */
+	struct nlattr *data[IFLA_MACVLAN_MAX + 1];
+
+	if (!info || !info[IFLA_INFO_DATA]) {
+		pr_err("no data for macvlan\n");
+		return -1;
+	}
+
+	ret = nla_parse_nested(data, IFLA_MACVLAN_MAX, info[IFLA_INFO_DATA], NULL);
+	if (ret < 0) {
+		pr_err("failed to parse macvlan data\n");
+		return -1;
+	}
+
+	if (!data[IFLA_MACVLAN_MODE]) {
+		pr_err("macvlan mode required for %s\n", nde->name);
+		return -1;
+	}
+
+	macvlan.mode = *((u32 *)RTA_DATA(data[IFLA_MACVLAN_MODE]));
+
+	if (data[IFLA_MACVLAN_FLAGS]) {
+		/*
+		 * flags is an optional protobuf field: without has_flags it
+		 * is not serialized and the restore side never sees it.
+		 */
+		macvlan.has_flags = true;
+		macvlan.flags = *((u16 *) RTA_DATA(data[IFLA_MACVLAN_FLAGS]));
+	}
+
+	/* macvlan lives on the stack; it is only referenced until the write below */
+	nde->macvlan = &macvlan;
+	return write_netdev_img(nde, imgset, info);
+}
+
 static int dump_one_ethernet(struct ifinfomsg *ifi, char *kind,
 		struct nlattr **tb, struct cr_imgset *fds)
 {
@@ -540,6 +589,8 @@  static int dump_one_ethernet(struct ifinfomsg *ifi, char *kind,
 
 		pr_warn("GRE tap device %s not supported natively\n", name);
 	}
+	if (!strcmp(kind, "macvlan"))
+		return dump_one_netdev(ND_TYPE__MACVLAN, ifi, tb, fds, dump_macvlan);
 
 	return dump_unknown_device(ifi, kind, tb, fds);
 }
@@ -841,8 +892,17 @@  struct newlink_req {
 	char buf[1024];
 };
 
+/* Optional extra things to be provided at the top level of the NEWLINK
+ * request. populate_newlink_req() only emits an attribute when the
+ * corresponding field is non-negative, so set unused fields to -1.
+ */
+struct newlink_extras {
+	int netns_id;		/* IFLA_LINK_NETNSID */
+	int link;		/* IFLA_LINK */
+	int target_netns;	/* IFLA_NET_NS_FD */
+};
+
 static int populate_newlink_req(struct newlink_req *req, int msg_type, NetDeviceEntry *nde,
-		int (*link_info)(NetDeviceEntry *, struct newlink_req *))
+		int (*link_info)(NetDeviceEntry *, struct newlink_req *), struct newlink_extras *extras)
 {
 	memset(req, 0, sizeof(*req));
 
@@ -860,6 +920,17 @@  static int populate_newlink_req(struct newlink_req *req, int msg_type, NetDevice
 		req->i.ifi_index = nde->ifindex;
 	req->i.ifi_flags = nde->flags;
 
+	if (extras) {
+		if (extras->netns_id >= 0)
+			addattr_l(&req->h, sizeof(*req), IFLA_LINK_NETNSID, &extras->netns_id, sizeof(extras->netns_id));
+
+		if (extras->link >= 0)
+			addattr_l(&req->h, sizeof(*req), IFLA_LINK, &extras->link, sizeof(extras->link));
+
+		if (extras->target_netns >= 0)
+			addattr_l(&req->h, sizeof(*req), IFLA_NET_NS_FD, &extras->target_netns, sizeof(extras->target_netns));
+
+	}
 
 	addattr_l(&req->h, sizeof(*req), IFLA_IFNAME, nde->name, strlen(nde->name));
 	addattr_l(&req->h, sizeof(*req), IFLA_MTU, &nde->mtu, sizeof(nde->mtu));
@@ -889,11 +960,12 @@  static int populate_newlink_req(struct newlink_req *req, int msg_type, NetDevice
 }
 
 static int do_rtm_link_req(int msg_type, NetDeviceEntry *nde, int nlsk,
-		int (*link_info)(NetDeviceEntry *, struct newlink_req *))
+		int (*link_info)(NetDeviceEntry *, struct newlink_req *),
+		struct newlink_extras *extras)
 {
 	struct newlink_req req;
 
-	if (populate_newlink_req(&req, msg_type, nde, link_info) < 0)
+	if (populate_newlink_req(&req, msg_type, nde, link_info, extras) < 0)
 		return -1;
 
 	return do_rtnl_req(nlsk, &req, req.h.nlmsg_len, restore_link_cb, NULL, NULL);
@@ -901,14 +973,15 @@  static int do_rtm_link_req(int msg_type, NetDeviceEntry *nde, int nlsk,
 
 int restore_link_parms(NetDeviceEntry *nde, int nlsk)
 {
-	return do_rtm_link_req(RTM_SETLINK, nde, nlsk, NULL);
+	return do_rtm_link_req(RTM_SETLINK, nde, nlsk, NULL, NULL);
 }
 
 static int restore_one_link(NetDeviceEntry *nde, int nlsk,
-		int (*link_info)(NetDeviceEntry *, struct newlink_req *))
+		int (*link_info)(NetDeviceEntry *, struct newlink_req *),
+		struct newlink_extras *extras)
 {
 	pr_info("Restoring netdev %s idx %d\n", nde->name, nde->ifindex);
-	return do_rtm_link_req(RTM_NEWLINK, nde, nlsk, link_info);
+	return do_rtm_link_req(RTM_NEWLINK, nde, nlsk, link_info, extras);
 }
 
 #ifndef VETH_INFO_MAX
@@ -1004,6 +1077,216 @@  static int changeflags(int s, char *name, short flags)
 	return 0;
 }
 
+/*
+ * link_info callback used when building the NEWLINK request for a macvlan
+ * device: appends IFLA_INFO_KIND "macvlan" followed by a nested
+ * IFLA_INFO_DATA attribute holding the mode and, if present in the image,
+ * the flags.
+ *
+ * Returns 0 on success, -1 if @nde carries no macvlan entry.
+ */
+static int macvlan_link_info(NetDeviceEntry *nde, struct newlink_req *req)
+{
+	struct rtattr *macvlan_data;
+	MacvlanLinkEntry *macvlan = nde->macvlan;
+
+	if (!macvlan) {
+		pr_err("Missing macvlan link entry %d\n", nde->ifindex);
+		return -1;
+	}
+
+	addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "macvlan", 7);
+
+	/* Open the nested attribute; its length is fixed up once the children are in */
+	macvlan_data = NLMSG_TAIL(&req->h);
+	addattr_l(&req->h, sizeof(*req), IFLA_INFO_DATA, NULL, 0);
+
+	addattr_l(&req->h, sizeof(*req), IFLA_MACVLAN_MODE, &macvlan->mode, sizeof(macvlan->mode));
+
+	if (macvlan->has_flags) {
+		/*
+		 * dump_macvlan() reads this attribute as a u16, so send
+		 * exactly 16 bits instead of the 32-bit protobuf field, whose
+		 * leading two bytes are the wrong half on big-endian hosts.
+		 */
+		u16 flags = macvlan->flags;
+
+		addattr_l(&req->h, sizeof(*req), IFLA_MACVLAN_FLAGS, &flags, sizeof(flags));
+	}
+
+	macvlan_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)macvlan_data;
+
+	return 0;
+}
+
+/*
+ * userns_call() handler: sends the prepared newlink request passed in @arg
+ * over a freshly opened rtnetlink socket, after appending IFLA_NET_NS_FD
+ * set to the received @fd. @pid is unused.
+ *
+ * Returns the do_rtnl_req() result, or -1 if the socket can't be created.
+ */
+static int userns_restore_one_link(void *arg, int fd, pid_t pid)
+{
+	struct newlink_req *link_req = arg;
+	int sk, err;
+
+	sk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+	if (sk < 0) {
+		pr_perror("Can't create nlk socket");
+		return -1;
+	}
+
+	/* The fd handed over with the call names the netns the link must land in */
+	addattr_l(&link_req->h, sizeof(*link_req), IFLA_NET_NS_FD, &fd, sizeof(fd));
+
+	err = do_rtnl_req(sk, link_req, link_req->h.nlmsg_len, restore_link_cb, NULL, NULL);
+
+	close(sk);
+	return err;
+}
+
+/*
+ * Netlink reply callback that extracts the NETNSA_NSID attribute following
+ * the rtgenmsg header of @nlh and stores it in the int pointed to by @arg.
+ *
+ * Returns 0 when a non-negative nsid was found; -1 otherwise, in which
+ * case the int behind @arg is left untouched.
+ */
+static int get_nsid_cb(struct nlmsghdr *nlh, void *arg)
+{
+	struct rtgenmsg *rthdr;
+	struct rtattr *rta;
+	int len, nsid = -1, *netnsid = arg;
+
+	rthdr = NLMSG_DATA(nlh);
+	len = nlh->nlmsg_len - NLMSG_SPACE(sizeof(*rthdr));
+
+	if (len < 0)
+		return -1;
+
+	rta = NETNS_RTA(rthdr);
+
+	while (RTA_OK(rta, len)) {
+		if (rta->rta_type == NETNSA_NSID)
+			nsid = *((int *) RTA_DATA(rta));
+		rta = RTA_NEXT(rta, len);
+	}
+
+	/* Check the value that was read, not the (never negative) pointer */
+	if (nsid < 0) {
+		pr_err("Didn't get a netnsid back from netlink?\n");
+		return -1;
+	}
+
+	*netnsid = nsid;
+	return 0;
+}
+
+/*
+ * Return the nsid under which the netns behind the NS_FD_OFF service fd is
+ * known, as seen through @nlsk, creating the id with RTM_NEWNSID if needed.
+ * If the kernel answers -EEXIST the already assigned id is looked up with
+ * RTM_GETNSID instead. The result is cached for subsequent calls.
+ *
+ * Returns the (non-negative) nsid, or -1 on error.
+ */
+static int get_criu_netnsid(int nlsk)
+{
+	static int netnsid = -1;
+	struct {
+		struct nlmsghdr n;
+		struct rtgenmsg g;
+		char buf[1024];
+	} req;
+	int ns_fd = get_service_fd(NS_FD_OFF), i;
+
+	/* 0 is a valid nsid (it is the first one we request), only <0 means "unset" */
+	if (netnsid >= 0)
+		return netnsid;
+
+	/*
+	 * NOTE(review): every path through the loop body returns, so only
+	 * nsid 0 is ever requested. Retrying with the next id may have been
+	 * intended for "id in use" errors -- confirm before relying on it.
+	 */
+	for (i = 0; i < 10; i++) {
+		int ret;
+
+		memset(&req, 0, sizeof(req));
+
+		req.n.nlmsg_len = NLMSG_LENGTH(sizeof(req.g));
+		req.n.nlmsg_flags = NLM_F_REQUEST|NLM_F_ACK;
+		req.n.nlmsg_type = RTM_NEWNSID;
+		req.n.nlmsg_seq = CR_NLMSG_SEQ;
+
+		addattr_l(&req.n, sizeof(req), NETNSA_FD, &ns_fd, sizeof(ns_fd));
+		addattr_l(&req.n, sizeof(req), NETNSA_NSID, &i, sizeof(i));
+
+		ret = do_rtnl_req(nlsk, &req, req.n.nlmsg_len, NULL, NULL, NULL);
+		if (ret < 0) {
+			if (ret == -EEXIST) {
+				int existing = -1;
+
+				/* An id exists already: ask for it rather than failing */
+				req.n.nlmsg_type = RTM_GETNSID;
+				ret = do_rtnl_req(nlsk, &req, req.n.nlmsg_len, get_nsid_cb, NULL, &existing);
+				if (ret < 0 || existing < 0) {
+					pr_err("Couldn't get netnsid: %d\n", ret);
+					return -1;
+				}
+
+				netnsid = existing;
+				return netnsid;
+			}
+			errno = -ret;
+			pr_perror("couldn't create new netnsid");
+			return -1;
+		}
+
+		netnsid = i;
+		return netnsid;
+	}
+
+	pr_err("tried to create too many netnsids\n");
+	return -1;
+}
+
+/*
+ * Restore a macvlan device described by @nde.
+ *
+ * The parent device's ifindex comes from the "macvlan[<name>]" external
+ * resource and is passed as IFLA_LINK together with the nsid returned by
+ * get_criu_netnsid() as IFLA_LINK_NETNSID. The link is then created either
+ *  - via usernsd when restoring into a user namespace (the current netns fd
+ *    is handed over with the call), or
+ *  - through a netlink socket opened inside the netns behind NS_FD_OFF,
+ *    with IFLA_NET_NS_FD pointing at the current netns.
+ * Finally the interface flags are re-applied with changeflags().
+ *
+ * Returns 0 on success, a negative value on failure.
+ */
+static int restore_one_macvlan(NetDeviceEntry *nde, int nlsk)
+{
+	struct newlink_extras extras = {
+		.netns_id = -1,
+		.link = -1,
+		.target_netns = -1,
+	};
+	char key[100], *val;
+	int my_netns = -1, ret = -1, s;
+
+	snprintf(key, sizeof(key), "macvlan[%s]", nde->name);
+	val = external_lookup_data(key);
+	if (IS_ERR_OR_NULL(val)) {
+		pr_err("a macvlan parent for %s is required\n", nde->name);
+		return -1;
+	}
+
+	/* The external's data is the parent ifindex stored as a pointer-sized integer */
+	extras.link = (int) (unsigned long) val;
+
+	extras.netns_id = get_criu_netnsid(nlsk);
+	if (extras.netns_id < 0) {
+		pr_err("failed to get criu's netnsid\n");
+		return -1;
+	}
+
+	my_netns = open_proc(PROC_SELF, "ns/net");
+	if (my_netns < 0) {
+		pr_perror("couldn't get my netns");
+		return -1;
+	}
+
+	if (root_ns_mask & CLONE_NEWUSER) {
+		struct newlink_req req;
+
+		if (populate_newlink_req(&req, RTM_NEWLINK, nde, macvlan_link_info, &extras) < 0)
+			goto out;
+
+		if (userns_call(userns_restore_one_link, 0, &req, sizeof(req), my_netns) < 0) {
+			pr_err("couldn't restore macvlan interface %s via usernsd\n", nde->name);
+			goto out;
+		}
+	} else {
+		int root_nlsk, sk_errno, ns_fd = get_service_fd(NS_FD_OFF);
+
+		extras.target_netns = my_netns;
+
+		if (setns(ns_fd, CLONE_NEWNET) < 0) {
+			pr_perror("couldn't setns to parent ns");
+			goto out;
+		}
+
+		/*
+		 * Don't bail out on failure yet: we must switch back to our
+		 * own netns first. Remember errno for the later report.
+		 */
+		root_nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+		sk_errno = errno;
+
+		if (setns(my_netns, CLONE_NEWNET) < 0) {
+			pr_perror("couldn't setns back to my ns");
+			if (root_nlsk >= 0)
+				close(root_nlsk);
+			goto out;
+		}
+
+		if (root_nlsk < 0) {
+			errno = sk_errno;
+			pr_perror("Can't create nlk socket");
+			goto out;
+		}
+
+		ret = restore_one_link(nde, root_nlsk, macvlan_link_info, &extras);
+		close(root_nlsk);
+		if (ret < 0)
+			goto out;
+	}
+
+	/* We have to change the flags of the NDE manually here because
+	 * we used IFLA_LINK_NETNSID to restore it, which creates the
+	 * device and then shuts it down when it changes the device's
+	 * namespace, but doesn't start it back up when it goes to the
+	 * other namespace. So, we restore its state here.
+	 */
+	s = socket(AF_LOCAL, SOCK_STREAM, 0);
+	if (s < 0) {
+		pr_perror("couldn't open socket for flag changing");
+		/* ret may hold a success code from the link creation above */
+		ret = -1;
+		goto out;
+	}
+	ret = changeflags(s, nde->name, nde->flags);
+	close(s);
+
+out:
+	if (my_netns >= 0)
+		close(my_netns);
+	return ret;
+}
+
 static int restore_link(NetDeviceEntry *nde, int nlsk)
 {
 	pr_info("Restoring link %s type %d\n", nde->name, nde->type);
@@ -1013,14 +1296,15 @@  static int restore_link(NetDeviceEntry *nde, int nlsk)
 	case ND_TYPE__EXTLINK:  /* see comment in images/netdev.proto */
 		return restore_link_parms(nde, nlsk);
 	case ND_TYPE__VENET:
-		return restore_one_link(nde, nlsk, venet_link_info);
+		return restore_one_link(nde, nlsk, venet_link_info, NULL);
 	case ND_TYPE__VETH:
-		return restore_one_link(nde, nlsk, veth_link_info);
+		return restore_one_link(nde, nlsk, veth_link_info, NULL);
 	case ND_TYPE__TUN:
 		return restore_one_tun(nde, nlsk);
 	case ND_TYPE__BRIDGE:
-		return restore_one_link(nde, nlsk, bridge_link_info);
-
+		return restore_one_link(nde, nlsk, bridge_link_info, NULL);
+	case ND_TYPE__MACVLAN:
+		return restore_one_macvlan(nde, nlsk);
 	default:
 		pr_err("Unsupported link type %d\n", nde->type);
 		break;
@@ -1673,6 +1957,17 @@  int veth_pair_add(char *in, char *out)
 	return add_external(e_str);
 }
 
+/*
+ * Resolve the device named in the value part of a "macvlan..." external
+ * resource to its ifindex and stash that index in @ext->data.
+ *
+ * Returns 0 on success, -1 if no device name was given or it can't be
+ * resolved.
+ */
+int macvlan_ext_add(struct external *ext)
+{
+	const char *parent = external_val(ext);
+
+	/*
+	 * external_val() is defined outside this file; guard against it
+	 * yielding NULL (e.g. a key given without a value), which must not
+	 * reach if_nametoindex().
+	 */
+	if (!parent) {
+		pr_err("no parent device given for %s\n", ext->id);
+		return -1;
+	}
+
+	/* if_nametoindex() returns 0 on failure, which maps to a NULL data pointer */
+	ext->data = (void *) (unsigned long) if_nametoindex(parent);
+	if (ext->data == 0) {
+		pr_perror("can't get ifindex of %s", ext->id);
+		return -1;
+	}
+
+	return 0;
+}
+
 /*
  * The setns() syscall (called by switch_ns()) can be extremely
  * slow. If we call it two or more times from the same task the
diff --git a/images/Makefile b/images/Makefile
index cf50794..eb18526 100644
--- a/images/Makefile
+++ b/images/Makefile
@@ -60,6 +60,7 @@  proto-obj-y	+= binfmt-misc.o
 proto-obj-y	+= time.o
 proto-obj-y	+= sysctl.o
 proto-obj-y	+= autofs.o
+proto-obj-y	+= macvlan.o
 
 CFLAGS		+= -iquote $(obj)/
 
diff --git a/images/macvlan.proto b/images/macvlan.proto
new file mode 100644
index 0000000..c9c9045
--- /dev/null
+++ b/images/macvlan.proto
@@ -0,0 +1,4 @@ 
+syntax = "proto2";
+
+// Macvlan-specific link parameters stored alongside a net_device_entry.
+message macvlan_link_entry {
+	// value of the IFLA_MACVLAN_MODE attribute
+	required uint32	mode	= 1;
+	// value of the IFLA_MACVLAN_FLAGS attribute, when one was dumped
+	optional uint32 flags	= 2;
+}
diff --git a/images/netdev.proto b/images/netdev.proto
index 19b501c..2f2f3d1 100644
--- a/images/netdev.proto
+++ b/images/netdev.proto
@@ -1,5 +1,6 @@ 
 syntax = "proto2";
 
+import "macvlan.proto";
 import "opts.proto";
 import "tun.proto";
 import "sysctl.proto";
@@ -20,6 +21,7 @@  enum nd_type {
 	 */
 	VENET		= 5;
 	BRIDGE		= 6;
+	MACVLAN		= 7;
 }
 
 message net_device_entry {
@@ -38,6 +40,8 @@  message net_device_entry {
 	repeated sysctl_entry conf4	= 9;
 
 	repeated sysctl_entry conf6	= 10;
+
+	optional macvlan_link_entry	macvlan		= 11;
 }
 
 message netns_entry {