[v2,12/14] files: Make tasks set their own service_fd_base

Submitted by Andrei Vagin on Dec. 28, 2017, 7 p.m.

Details

Message ID 20171228190029.GB3513@outlook.office365.com
State New
Series "Introduce custom per-task service fds placement"
Headers show

Commit Message

Andrei Vagin Dec. 28, 2017, 7 p.m.
On Thu, Dec 28, 2017 at 04:10:29PM +0300, Kirill Tkhai wrote:
> Currently, we set rlim(RLIMIT_NOFILE) unlimited
> and service_fd_rlim_cur to place service fds.
> This leads to a signify problem: every task uses
> the biggest possible files_struct in kernel, and
> it consumes excess memory after restore
> in comparation to dump. In some situations this
> may end in restore fail as there is no enough
> memory in memory cgroup of on node.
> 
> The patch fixes the problem by introducing
> task-measured service_fd_base. It's calculated
> in dependence of max used file fd and is placed
> near the right border of kernel-allocated memory
> hunk for task's fds (see alloc_fdtable() for
> details). This reduces kernel-allocated files_struct
> to 512 fds for the most process in standard linux
> system (I've analysed the processes in my work system).
> 
> Also, since the "standard processes" will have the same
> service_fd_base, clone_service_fd() won't have to
> actualy dup() their service fds for them like we
> have at the moment. This is the one of reasons why
> we still keep service fds as a range of fds,
> and do not try to use unused holes in task fds.
> 
> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> 
> v2: Add a handle for very big fd numbers near service_fd_rlim_cur.
> ---
>  criu/util.c |   57 +++++++++++++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 53 insertions(+), 4 deletions(-)
> 
> diff --git a/criu/util.c b/criu/util.c
> index d45bb7d61..8a25ab2cf 100644
> --- a/criu/util.c
> +++ b/criu/util.c
> @@ -527,7 +527,7 @@ int close_service_fd(enum sfd_type type)
>  static void move_service_fd(struct pstree_item *me, int type, int new_id, int new_base)
>  {
>  	int old = get_service_fd(type);
> -	int new = __get_service_fd(type, new_id);
> +	int new = new_base - type - SERVICE_FD_MAX * new_id;
>  	int ret;
>  
>  	if (old < 0)
> @@ -540,24 +540,73 @@ static void move_service_fd(struct pstree_item *me, int type, int new_id, int ne
>  		close(old);
>  }
>  
> +static int choose_service_fd_base(struct pstree_item *me)
> +{
> +	int nr, real_nr, fdt_nr = 1, id = rsti(me)->service_fd_id;
> +
> +	if (rsti(me)->fdt) {
> +		/* The base is set by owner of fdt (id 0) */
> +		if (id != 0)
> +			return service_fd_base;
> +		fdt_nr = rsti(me)->fdt->nr;
> +	}
> +	/* Now find process's max used fd number */
> +	if (!list_empty(&rsti(me)->fds))
> +		nr = list_entry(rsti(me)->fds.prev,
> +				struct fdinfo_list_entry, ps_list)->fe->fd;
> +	else
> +		nr = -1;
> +
> +	nr = max(nr, inh_fd_max);
> +	/*
> +	 * Service fds go after max fd near right border of alignment:
> +	 *
> +	 * ...|max_fd|max_fd+1|...|sfd first|...|sfd last (aligned)|
> +	 *
> +	 * So, they take maximum numbers of area allocated by kernel.
> +	 * See linux alloc_fdtable() for details.
> +	 */
> +	nr += (SERVICE_FD_MAX - SERVICE_FD_MIN) * fdt_nr;
> +	nr += 16; /* Safety pad */
> +	real_nr = nr;
> +
> +	nr /= (1024 / sizeof(void *));
> +	nr = 1 << (32 - __builtin_clz(nr + 1));
> +	nr *= (1024 / sizeof(void *));
> +
> +	if (nr > service_fd_rlim_cur) {
> +		/* Right border is bigger, than rlim. OK, then just aligned value is enough */
> +		nr = round_down(service_fd_rlim_cur, (1024 / sizeof(void *)));
> +		if (nr < real_nr) {
> +			pr_err("Can't chose service_fd_base: %d %d\n", nr, real_nr);
> +			return -1;
> +		}
> +	}
> +
> +	return nr;
> +}
> +
>  int clone_service_fd(struct pstree_item *me)
>  {
>  	int id, new_base, i, ret = -1;
>  
> -	new_base = service_fd_base;
> +	new_base = choose_service_fd_base(me);
>  	id = rsti(me)->service_fd_id;
>  
> -	if (service_fd_id == id)
> +	if (new_base == -1)
> +		return -1;
> +	if (service_fd_base == new_base && service_fd_id == id)
>  		return 0;

For the root process, service_fd_base is changed 1048576 -> 256 and
there is one service fd, so for the root process fdtable will be
bigger than it has to be. Actually we have a problem each time when
a child process has a smaller fdtable than a parent process.


And here is a test which shows this problem with more collors
t a/test/zdtm/static/env00.c b/test/zdtm/static/env00.c
index 1feabfa9f..2505cb54c 100644

before dump:
[root@fc24 criu]# echo  $((`cat /sys/fs/cgroup/memory/test/memory.kmem.usage_in_bytes` >> 10))
12276

after restore:
[root@fc24 criu]# echo  $((`cat /sys/fs/cgroup/memory/test/memory.kmem.usage_in_bytes` >> 10))
108432

>  
>  	/* Dup sfds in memmove() style: they may overlap */
> -	if (get_service_fd(LOG_FD_OFF) > __get_service_fd(LOG_FD_OFF, id))
> +	if (get_service_fd(LOG_FD_OFF) > new_base - LOG_FD_OFF - SERVICE_FD_MAX * id)
>  		for (i = SERVICE_FD_MIN + 1; i < SERVICE_FD_MAX; i++)
>  			move_service_fd(me, i, id, new_base);
>  	else
>  		for (i = SERVICE_FD_MAX - 1; i > SERVICE_FD_MIN; i--)
>  			move_service_fd(me, i, id, new_base);
>  
> +	service_fd_base = new_base;
>  	service_fd_id = id;
>  	ret = 0;
>  
>

Patch hide | download patch | download mbox

--- a/test/zdtm/static/env00.c
+++ b/test/zdtm/static/env00.c
@@ -1,6 +1,7 @@ 
 #include <errno.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sched.h>
 
 #include "zdtmtst.h"
 
@@ -13,9 +14,22 @@  TEST_OPTION(envname, string, "environment variable name", 1);
 int main(int argc, char **argv)
 {
        char *env;
+       int i;
+       pid_t pid;
 
        test_init(argc, argv);
 
+       for (i = 0; i < 10; i++) {
+               pid = fork();
+               if (pid == 0) {
+                       while (1)
+                               sleep(1);
+                       return 0;
+               }
+       }
+
+       dup2(1, 1000000);
+
        if (setenv(envname, test_author, 1)) {
                pr_perror("Can't set env var \"%s\" to \"%s\"", envname, test_author);
                exit(1);