[12/12] files: Make tasks set their own service_fd_base

Submitted by Kirill Tkhai on Dec. 26, 2017, 3:47 p.m.

Details

Message ID 151430324798.1302.9674008184180460412.stgit@localhost.localdomain
State New
Series "Introduce custom per-task service fds placement"
Headers show

Commit Message

Kirill Tkhai Dec. 26, 2017, 3:47 p.m.
Currently, we set rlim(RLIMIT_NOFILE) to unlimited
and service_fd_rlim_cur to place service fds.
This leads to a significant problem: every task uses
the biggest possible files_struct in the kernel, and
it consumes excess memory after restore
in comparison to dump. In some situations this
may end in a restore failure as there is not enough
memory in the memory cgroup or on the node.

The patch fixes the problem by introducing a
per-task service_fd_base. It is calculated
depending on the max used file fd and is placed
near the right border of the kernel-allocated memory
chunk for the task's fds (see alloc_fdtable() for
details). This reduces the kernel-allocated files_struct
to 512 fds for most processes on a standard Linux
system (I've analysed the processes on my work system).

Also, since the "standard processes" will have the same
service_fd_base, clone_service_fd() won't have to
actually dup() their service fds for them as we
do at the moment. This is one of the reasons why
we still keep service fds as a range of fds,
and do not try to use unused holes in task fds.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
 criu/util.c |   48 ++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 44 insertions(+), 4 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/util.c b/criu/util.c
index 2ae4d8e64..ef8fd522b 100644
--- a/criu/util.c
+++ b/criu/util.c
@@ -524,7 +524,7 @@  int close_service_fd(enum sfd_type type)
 static void move_service_fd(struct pstree_item *me, int type, int new_id, int new_base)
 {
 	int old = get_service_fd(type);
-	int new = __get_service_fd(type, new_id);
+	int new = new_base - type - SERVICE_FD_MAX * new_id;
 	int ret;
 
 	if (old < 0)
@@ -537,24 +537,64 @@  static void move_service_fd(struct pstree_item *me, int type, int new_id, int ne
 		close(old);
 }
 
+static int choose_service_fd_base(struct pstree_item *me)
+{
+	int nr, fdt_nr = 1, id = rsti(me)->service_fd_id;
+
+	if (rsti(me)->fdt) {
+		/* The base is set by owner of fdt (id 0) */
+		if (id != 0)
+			return service_fd_base;
+		fdt_nr = rsti(me)->fdt->nr;
+	}
+	/* Now find process's max used fd number */
+	if (!list_empty(&rsti(me)->fds))
+		nr = list_entry(rsti(me)->fds.prev,
+				struct fdinfo_list_entry, ps_list)->fe->fd;
+	else
+		nr = -1;
+
+	nr = max(nr, inh_fd_max);
+	/*
+	 * Service fds go after max fd near right border of alignment:
+	 *
+	 * ...|max_fd|max_fd+1|...|sfd first|...|sfd last (aligned)|
+	 *
+	 * So, they take maximum numbers of area allocated by kernel.
+	 * See linux alloc_fdtable() for details.
+	 */
+	nr += (SERVICE_FD_MAX - SERVICE_FD_MIN) * fdt_nr;
+	nr += 64; /* Safety pad */
+
+        nr /= (1024 / sizeof(void *));
+        nr = 1 << (32 - __builtin_clz(nr + 1));
+        nr *= (1024 / sizeof(void *));
+
+	/* Make sure, service fds with different bases do not overlap */
+	BUILD_BUG_ON(1024 / sizeof(void *) <= SERVICE_FD_MAX);
+
+	return nr;
+}
+
 int clone_service_fd(struct pstree_item *me)
 {
 	int id, new_base, i, ret = -1;
 
-	new_base = service_fd_base;
+	new_base = choose_service_fd_base(me);
 	id = rsti(me)->service_fd_id;
 
-	if (service_fd_id == id)
+	if (service_fd_base == new_base && service_fd_id == id)
 		return 0;
 
 	/* Dup sfds in memmove() style: they may overlap */
-	if (get_service_fd(LOG_FD_OFF) > __get_service_fd(LOG_FD_OFF, id))
+	if (get_service_fd(LOG_FD_OFF) > new_base - LOG_FD_OFF - SERVICE_FD_MAX * id)
 		for (i = SERVICE_FD_MIN + 1; i < SERVICE_FD_MAX; i++)
 			move_service_fd(me, i, id, new_base);
 	else
 		for (i = SERVICE_FD_MAX - 1; i > SERVICE_FD_MIN; i--)
 			move_service_fd(me, i, id, new_base);
 
+	service_fd_base = new_base;
 	service_fd_id = id;
 	ret = 0;