[2/4] sfd: Rework install, clone helpers to use fcntl

Submitted by Cyrill Gorcunov on May 30, 2017, 6:13 p.m.

Details

Message ID 1496168013-32656-3-git-send-email-gorcunov@openvz.org
State New
Series "sfd: Rework service fd engine"
Headers show

Commit Message

Cyrill Gorcunov May 30, 2017, 6:13 p.m.
dup3 closes opened target descriptor silently which already can
be borrowed by some other code (for example with containers with
huge memory usage, say 256G, the page transfer pipe may already
open the fd we reserved for service engine), so we should use
fcntl here instead and exit with error is such situation
happened.

Moreover, for calling convenience if there is previous
sfd already assigned and marked in sfd map -- just close
it before setting up new instance.

Another problem is clone_service_fd, we may have a situation
(say fdt_shared testcase) where service_fd_id downgrates so
we're trying to reinstall already opened desciptors. For
a while we simply close the desctination because it is
early-restore-forking stage where we hardly ever get
collistion with already opened files, but still it's
not 100% protected and we need to improve sfd engine
more making tracking per-id based together with proper
locking scheme.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
---
 criu/servicefd.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 79 insertions(+), 12 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/servicefd.c b/criu/servicefd.c
index 185021d2ef25..245891b79cbc 100644
--- a/criu/servicefd.c
+++ b/criu/servicefd.c
@@ -8,18 +8,46 @@ 
 
 #include "servicefd.h"
 #include "bitops.h"
-#include "log.h"
+#include "criu-log.h"
 
 #include "common/bug.h"
 
 #undef	LOG_PREFIX
 #define LOG_PREFIX "sfd: "
 
+// #define SFD_DEBUG
+
+#ifdef SFD_DEBUG
+# define pr_sfd_debug(fmt, ...)	pr_debug(fmt, ##__VA_ARGS__)
+#else
+# define pr_sfd_debug(fmt, ...)
+#endif
+
 static DECLARE_BITMAP(sfd_map, SERVICE_FD_MAX);
 
 static int service_fd_rlim_cur;
 static int service_fd_id;
 
+#define __gen_type_str(x)	[x] = __stringify_1(x)
+static char * const type_str[SERVICE_FD_MAX] = {
+	__gen_type_str(LOG_FD_OFF),
+	__gen_type_str(IMG_FD_OFF),
+	__gen_type_str(PROC_FD_OFF),
+	__gen_type_str(PROC_PID_FD_OFF),
+	__gen_type_str(CTL_TTY_OFF),
+	__gen_type_str(SELF_STDIN_OFF),
+	__gen_type_str(CR_PROC_FD_OFF),
+	__gen_type_str(ROOT_FD_OFF),
+	__gen_type_str(CGROUP_YARD),
+	__gen_type_str(USERNSD_SK),
+	__gen_type_str(NS_FD_OFF),
+	__gen_type_str(TRANSPORT_FD_OFF),
+	__gen_type_str(RPC_SK_OFF),
+	__gen_type_str(FDSTORE_SK_OFF),
+	__gen_type_str(LAZY_PAGES_SK_OFF),
+};
+#undef __gen_type_str
+
 int init_service_fd(void)
 {
 	struct rlimit64 rlimit;
@@ -62,11 +90,30 @@  int reserve_service_fd(enum sfd_type type)
 int install_service_fd(enum sfd_type type, int fd)
 {
 	int sfd = __get_service_fd(type, service_fd_id);
+	int ret;
 
 	BUG_ON((int)type <= SERVICE_FD_MIN || (int)type >= SERVICE_FD_MAX);
+	BUG_ON(type >= ARRAY_SIZE(type_str));
+
+	pr_sfd_debug("install %d/%d (%s)\n", service_fd_id, sfd, type_str[type]);
+	if (test_bit(type, sfd_map)) {
+		pr_sfd_debug("\tclose previous %d/%d (%s)\n",
+			     service_fd_id, sfd, type_str[type]);
+		close_service_fd(type);
+	}
 
-	if (dup3(fd, sfd, O_CLOEXEC) != sfd) {
-		pr_perror("Dup %d -> %d failed", fd, sfd);
+	ret = fcntl(fd, F_DUPFD_CLOEXEC, sfd);
+	if (ret != sfd) {
+		if (ret < 0) {
+			pr_perror("%d/%d -> %d/%d (%s) transition failed",
+				  service_fd_id, fd, sfd,
+				  service_fd_id, type_str[type]);
+		} else {
+			pr_err("%d/%d (%s) is busy\n",
+			       service_fd_id, sfd,
+			       type_str[type]);
+			close(ret);
+		}
 		return -1;
 	}
 
@@ -92,34 +139,54 @@  int close_service_fd(enum sfd_type type)
 
 	close(fd);
 	clear_bit(type, sfd_map);
+	pr_sfd_debug("close %d/%d (%s)\n", service_fd_id, fd, type_str[type]);
 	return 0;
 }
 
 int clone_service_fd(int id)
 {
-	int ret = -1, i;
+	int ret, i;
 
 	if (service_fd_id == id)
 		return 0;
 
+	pr_sfd_debug("clone %d/%d\n", service_fd_id, id);
 	for (i = SERVICE_FD_MIN + 1; i < SERVICE_FD_MAX; i++) {
 		int old = get_service_fd(i);
 		int new = __get_service_fd(i, id);
 
 		if (old < 0)
 			continue;
-		ret = dup2(old, new);
-		if (ret == -1) {
-			if (errno == EBADF)
-				continue;
-			pr_perror("Unable to clone %d->%d", old, new);
+
+		/*
+		 * Cloning of service fds happen at early stage
+		 * where no other files needed are opened, so
+		 * for simplicity just close the destination.
+		 *
+		 * FIXME: Still better to invent more deep tracking
+		 * of service files opened, say every rsti(item)
+		 * structure should have own sfd_map and everything
+		 * should be under appropriate shared locking.
+		 */
+		close(new);
+		ret = fcntl(old, F_DUPFD, new);
+		if (ret < 0 || ret != new) {
+			if (ret < 0) {
+				pr_perror("clone %d/%d -> %d/%d (%s) transition failed",
+					  service_fd_id, old, id, new, type_str[i]);
+			} else {
+				pr_err("clone %d/%d -> %d/%d (%s) busy\n",
+				       service_fd_id, old, id, new, type_str[i]);
+				close(ret);
+			}
+			return -1;
 		}
+		pr_sfd_debug("clone %d/%d -> %d/%d (%s)\n",
+			     service_fd_id, old, id, new, type_str[i]);
 	}
 
 	service_fd_id = id;
-	ret = 0;
-
-	return ret;
+	return 0;
 }
 
 bool is_any_service_fd(int fd)

Comments

Andrey Vagin May 30, 2017, 9:47 p.m.
On Tue, May 30, 2017 at 09:13:31PM +0300, Cyrill Gorcunov wrote:
> dup3 closes opened target descriptor silently which already can
> be borrowed by some other code (for example with containers with
> huge memory usage, say 256G, the page transfer pipe may already
> open the fd we reserved for service engine), so we should use
> fcntl here instead and exit with error is such situation
> happened.
> 
> Moreover, for calling convenience if there is previous
> sfd already assigned and marked in sfd map -- just close
> it before setting up new instance.
> 
> Another problem is clone_service_fd, we may have a situation
> (say fdt_shared testcase) where service_fd_id downgrates so
> we're trying to reinstall already opened desciptors. For
> a while we simply close the desctination because it is
> early-restore-forking stage where we hardly ever get
> collistion with already opened files, but still it's
> not 100% protected and we need to improve sfd engine
> more making tracking per-id based together with proper
> locking scheme.
> 
> Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
> ---
>  criu/servicefd.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 79 insertions(+), 12 deletions(-)
> 
> diff --git a/criu/servicefd.c b/criu/servicefd.c
> index 185021d2ef25..245891b79cbc 100644
> --- a/criu/servicefd.c
> +++ b/criu/servicefd.c
> @@ -8,18 +8,46 @@
>  
>  #include "servicefd.h"
>  #include "bitops.h"
> -#include "log.h"
> +#include "criu-log.h"
>  
>  #include "common/bug.h"
>  
>  #undef	LOG_PREFIX
>  #define LOG_PREFIX "sfd: "
>  
> +// #define SFD_DEBUG
> +
> +#ifdef SFD_DEBUG
> +# define pr_sfd_debug(fmt, ...)	pr_debug(fmt, ##__VA_ARGS__)
> +#else
> +# define pr_sfd_debug(fmt, ...)
> +#endif
> +
>  static DECLARE_BITMAP(sfd_map, SERVICE_FD_MAX);
>  
>  static int service_fd_rlim_cur;
>  static int service_fd_id;
>  
> +#define __gen_type_str(x)	[x] = __stringify_1(x)
> +static char * const type_str[SERVICE_FD_MAX] = {
> +	__gen_type_str(LOG_FD_OFF),
> +	__gen_type_str(IMG_FD_OFF),
> +	__gen_type_str(PROC_FD_OFF),
> +	__gen_type_str(PROC_PID_FD_OFF),
> +	__gen_type_str(CTL_TTY_OFF),
> +	__gen_type_str(SELF_STDIN_OFF),
> +	__gen_type_str(CR_PROC_FD_OFF),
> +	__gen_type_str(ROOT_FD_OFF),
> +	__gen_type_str(CGROUP_YARD),
> +	__gen_type_str(USERNSD_SK),
> +	__gen_type_str(NS_FD_OFF),
> +	__gen_type_str(TRANSPORT_FD_OFF),
> +	__gen_type_str(RPC_SK_OFF),
> +	__gen_type_str(FDSTORE_SK_OFF),
> +	__gen_type_str(LAZY_PAGES_SK_OFF),
> +};
> +#undef __gen_type_str
> +
>  int init_service_fd(void)
>  {
>  	struct rlimit64 rlimit;
> @@ -62,11 +90,30 @@ int reserve_service_fd(enum sfd_type type)
>  int install_service_fd(enum sfd_type type, int fd)
>  {
>  	int sfd = __get_service_fd(type, service_fd_id);
> +	int ret;
>  
>  	BUG_ON((int)type <= SERVICE_FD_MIN || (int)type >= SERVICE_FD_MAX);
> +	BUG_ON(type >= ARRAY_SIZE(type_str));
> +
> +	pr_sfd_debug("install %d/%d (%s)\n", service_fd_id, sfd, type_str[type]);
> +	if (test_bit(type, sfd_map)) {
> +		pr_sfd_debug("\tclose previous %d/%d (%s)\n",
> +			     service_fd_id, sfd, type_str[type]);
> +		close_service_fd(type);
> +	}
>  
> -	if (dup3(fd, sfd, O_CLOEXEC) != sfd) {
> -		pr_perror("Dup %d -> %d failed", fd, sfd);
> +	ret = fcntl(fd, F_DUPFD_CLOEXEC, sfd);
> +	if (ret != sfd) {
> +		if (ret < 0) {
> +			pr_perror("%d/%d -> %d/%d (%s) transition failed",
> +				  service_fd_id, fd, sfd,
> +				  service_fd_id, type_str[type]);
> +		} else {
> +			pr_err("%d/%d (%s) is busy\n",
> +			       service_fd_id, sfd,
> +			       type_str[type]);
> +			close(ret);
> +		}
>  		return -1;
>  	}
>  
> @@ -92,34 +139,54 @@ int close_service_fd(enum sfd_type type)
>  
>  	close(fd);
>  	clear_bit(type, sfd_map);
> +	pr_sfd_debug("close %d/%d (%s)\n", service_fd_id, fd, type_str[type]);
>  	return 0;
>  }
>  
>  int clone_service_fd(int id)
>  {
> -	int ret = -1, i;
> +	int ret, i;
>  
>  	if (service_fd_id == id)
>  		return 0;
>  
> +	pr_sfd_debug("clone %d/%d\n", service_fd_id, id);
>  	for (i = SERVICE_FD_MIN + 1; i < SERVICE_FD_MAX; i++) {
>  		int old = get_service_fd(i);
>  		int new = __get_service_fd(i, id);
>  
>  		if (old < 0)
>  			continue;
> -		ret = dup2(old, new);
> -		if (ret == -1) {
> -			if (errno == EBADF)
> -				continue;
> -			pr_perror("Unable to clone %d->%d", old, new);
> +
> +		/*
> +		 * Cloning of service fds happen at early stage
> +		 * where no other files needed are opened, so
> +		 * for simplicity just close the destination.

Why should we close the destination? Who could open it?

> +		 *
> +		 * FIXME: Still better to invent more deep tracking
> +		 * of service files opened, say every rsti(item)
> +		 * structure should have own sfd_map and everything
> +		 * should be under appropriate shared locking.
> +		 */
> +		close(new);

I don't understand why we need this close ^^^^

If it is need, why close + fcntl is better than dup2?

> +		ret = fcntl(old, F_DUPFD, new);
> +		if (ret < 0 || ret != new) {
> +			if (ret < 0) {
> +				pr_perror("clone %d/%d -> %d/%d (%s) transition failed",
> +					  service_fd_id, old, id, new, type_str[i]);
> +			} else {
> +				pr_err("clone %d/%d -> %d/%d (%s) busy\n",
> +				       service_fd_id, old, id, new, type_str[i]);
> +				close(ret);
> +			}
> +			return -1;
>  		}
> +		pr_sfd_debug("clone %d/%d -> %d/%d (%s)\n",
> +			     service_fd_id, old, id, new, type_str[i]);
>  	}
>  
>  	service_fd_id = id;
> -	ret = 0;
> -
> -	return ret;
> +	return 0;
>  }
>  
>  bool is_any_service_fd(int fd)
> -- 
> 2.7.4
>
Cyrill Gorcunov May 30, 2017, 10:14 p.m.
On Tue, May 30, 2017 at 02:47:55PM -0700, Andrey Vagin wrote:
> > +
> > +		/*
> > +		 * Cloning of service fds happen at early stage
> > +		 * where no other files needed are opened, so
> > +		 * for simplicity just close the destination.
> 
> Why should we close the destination? Who could open it?

Our fdt_shared test already does it :( There is a transition
from service_fd_id = 2 to 0, where 0 already has these
files opened. If you uncomment SDF_DEBUG on top of this
patch and run the test you'll see.

> 
> > +		 *
> > +		 * FIXME: Still better to invent more deep tracking
> > +		 * of service files opened, say every rsti(item)
> > +		 * structure should have own sfd_map and everything
> > +		 * should be under appropriate shared locking.
> > +		 */
> > +		close(new);
> 
> I don't understand why we need this close ^^^^
> 
> If it is need, why close + fcntl is better than dup2?

This is temp solution, see FIXME above. We need more deep
rework here, so sfd will be per id. At moment it's not
anyhow better then dup, but for _now_ only.