[19/28] seccomp: Add restore of per-thread filters

Submitted by Kirill Gorkunov on April 2, 2018, 11:55 a.m.

Details

Message ID 20180402115508.GA10384@uranus
State Rejected
Series "seccomp, v2: Add support for per-thread tracking"
Headers show

Commit Message

Kirill Gorkunov April 2, 2018, 11:55 a.m.
On Thu, Mar 22, 2018 at 03:13:24PM -0700, Andrey Vagin wrote:
> On Wed, Mar 21, 2018 at 12:43:04AM +0300, Cyrill Gorcunov wrote:
> > From: Cyrill Gorcunov <gorcunov@virtuozzo.com>
> > 
> > https://jira.sw.ru/browse/PSBM-78762
> > 
> > Signed-off-by: Cyrill Gorcunov <gorcunov@virtuozzo.com>
> > ---
> >  criu/cr-restore.c       |  10 ++--
> >  criu/include/restorer.h |  15 ++++-
> >  criu/include/seccomp.h  |   9 ++-
> >  criu/pie/restorer.c     | 106 +++++++++++++++++++++-------------
> >  criu/seccomp.c          | 149 ++++++++++++++++++++++++++++++++++--------------
> >  5 files changed, 195 insertions(+), 94 deletions(-)
> 
> 
> 200 lines of code!!! Pls, write what is going on here in a commit
> message.
---
Andrew, take a look please if such change log looks better?
---
From 9a37591540bce128968008cfedba65b73d7620f9 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@virtuozzo.com>
Date: Mon, 26 Feb 2018 18:34:33 +0300
Subject: [PATCH 19/29] seccomp: Add restore of per-thread filters

Since we're carrying filter chains on per-thread basis
we can start restoring them the way it should:

 - each thread_restore_args carries own filter chain
   read from a core image
 - once thread is created its filter is restored back
   via seccomp call

https://jira.sw.ru/browse/PSBM-78762

Signed-off-by: Cyrill Gorcunov <gorcunov@virtuozzo.com>
---
 criu/cr-restore.c       |  10 ++--
 criu/include/restorer.h |  15 ++++-
 criu/include/seccomp.h  |   9 ++-
 criu/pie/restorer.c     | 106 +++++++++++++++++++++-------------
 criu/seccomp.c          | 149 ++++++++++++++++++++++++++++++++++--------------
 5 files changed, 195 insertions(+), 94 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index b9f8de5e82b1..3025ec4032c2 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -331,7 +331,7 @@  static int root_prepare_shared(void)
 	if (prepare_remaps())
 		return -1;
 
-	if (prepare_seccomp_filters())
+	if (seccomp_read_image())
 		return -1;
 
 	if (collect_images(cinfos, ARRAY_SIZE(cinfos)))
@@ -1031,7 +1031,7 @@  static int restore_one_alive_task(int pid, CoreEntry *core)
 	if (prepare_timerfds(ta))
 		return -1;
 
-	if (seccomp_filters_get_rst_pos(core, ta) < 0)
+	if (seccomp_prepare_threads(current, ta) < 0)
 		return -1;
 
 	if (prepare_itimers(pid, ta, core) < 0)
@@ -3668,12 +3668,8 @@  static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
 	RST_MEM_FIXUP_PPTR(task_args->rlims);
 	RST_MEM_FIXUP_PPTR(task_args->helpers);
 	RST_MEM_FIXUP_PPTR(task_args->zombies);
-	RST_MEM_FIXUP_PPTR(task_args->seccomp_filters);
 	RST_MEM_FIXUP_PPTR(task_args->vma_ios);
 
-	if (core->thread_core->has_seccomp_mode)
-		task_args->seccomp_mode = core->thread_core->seccomp_mode;
-
 	task_args->compatible_mode = core_is_compat(core);
 
 	if (opts.check_only)
@@ -3763,6 +3759,8 @@  static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
 		if (ret)
 			goto err;
 
+		seccomp_rst_reloc(&thread_args[i]);
+
 		thread_args[i].mz = mz + i;
 		sigframe = (struct rt_sigframe *)&mz[i].rt_sigframe;
 
diff --git a/criu/include/restorer.h b/criu/include/restorer.h
index 15307d9c0701..3767d9d25088 100644
--- a/criu/include/restorer.h
+++ b/criu/include/restorer.h
@@ -4,6 +4,7 @@ 
 #include <signal.h>
 #include <limits.h>
 #include <sys/resource.h>
+#include <linux/filter.h>
 
 #include "common/config.h"
 #include "types.h"
@@ -76,6 +77,11 @@  struct thread_creds_args {
 	unsigned long			mem_pos_next;
 };
 
+struct thread_seccomp_filter {
+	struct sock_fprog		sock_fprog;
+	unsigned int			flags;
+};
+
 struct thread_restore_args {
 	struct restore_mem_zone		*mz;
 
@@ -100,6 +106,12 @@  struct thread_restore_args {
 
 	bool				check_only;
 	struct thread_creds_args	*creds_args;
+
+	int				seccomp_mode;
+	unsigned long			seccomp_filters_pos;
+	struct thread_seccomp_filter	*seccomp_filters;
+	void				*seccomp_filters_data;
+	unsigned int			seccomp_filters_n;
 } __aligned(64);
 
 typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args);
@@ -163,9 +175,6 @@  struct task_restore_args {
 	pid_t				*zombies;
 	unsigned int			zombies_n;
 
-	struct sock_fprog		*seccomp_filters;
-	unsigned int			seccomp_filters_n;
-
 	/* * * * * * * * * * * * * * * * * * * * */
 
 	unsigned long			task_size;
diff --git a/criu/include/seccomp.h b/criu/include/seccomp.h
index 47c24c9719c1..1808e3d610c3 100644
--- a/criu/include/seccomp.h
+++ b/criu/include/seccomp.h
@@ -27,6 +27,8 @@ 
 #define SECCOMP_FILTER_FLAG_TSYNC 1
 #endif
 
+struct thread_restore_args;
+struct task_restore_args;
 struct pstree_item;
 struct rb_node;
 
@@ -69,7 +71,8 @@  extern void seccomp_free_entries(void);
 extern int seccomp_dump_thread(pid_t tid_real, ThreadCoreEntry *thread_core);
 extern int seccomp_collect_dump_filters(void);
 
-extern int prepare_seccomp_filters(void);
-struct task_restore_args;
-extern int seccomp_filters_get_rst_pos(CoreEntry *item, struct task_restore_args *);
+extern int seccomp_read_image(void);
+extern int seccomp_prepare_threads(struct pstree_item *item, struct task_restore_args *ta);
+extern void seccomp_rst_reloc(struct thread_restore_args *thread_arg);
+
 #endif
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index 091026103805..5ede206c55ef 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -395,54 +395,82 @@  static int restore_signals(siginfo_t *ptr, int nr, bool group)
 	return 0;
 }
 
-static int restore_seccomp(struct task_restore_args *args)
+static int restore_seccomp_filter(pid_t tid, struct thread_restore_args *args)
 {
+	size_t i;
 	int ret;
 
-	switch (args->seccomp_mode) {
-	case SECCOMP_MODE_DISABLED:
-		return 0;
-	case SECCOMP_MODE_STRICT:
-		ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
+	for (i = 0; i < args->seccomp_filters_n; i++) {
+		struct thread_seccomp_filter *filter = &args->seccomp_filters[i];
+
+		ret = sys_seccomp(SECCOMP_SET_MODE_FILTER, filter->flags, (void *)&filter->sock_fprog);
 		if (ret < 0) {
-			pr_err("prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT) returned %d\n", ret);
-			goto die;
+			if (ret == -ENOSYS) {
+				pr_debug("seccomp: sys_seccomp is not supported in kernel, "
+					 "switching to prctl interface\n");
+				ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
+						(long)(void *)&filter->sock_fprog, 0, 0);
+				if (ret) {
+					pr_err("seccomp: PR_SET_SECCOMP returned %d on tid %d\n",
+					       ret, tid);
+					return -1;
+				}
+			} else {
+				pr_err("seccomp: SECCOMP_SET_MODE_FILTER returned %d on tid %d\n",
+				       ret, tid);
+				return -1;
+			}
 		}
-		return 0;
-	case SECCOMP_MODE_FILTER: {
-		int i;
-		void *filter_data;
-
-		filter_data = &args->seccomp_filters[args->seccomp_filters_n];
-
-		for (i = 0; i < args->seccomp_filters_n; i++) {
-			struct sock_fprog *fprog = &args->seccomp_filters[i];
+	}
 
-			fprog->filter = filter_data;
+	return 0;
+}
 
-			/* We always TSYNC here, since we require that the
-			 * creds for all threads be the same; this means we
-			 * don't have to restore_seccomp() in threads, and that
-			 * future TSYNC behavior will be correct.
-			 */
-			ret = sys_seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, (char *) fprog);
-			if (ret < 0) {
-				pr_err("sys_seccomp() returned %d\n", ret);
-				goto die;
-			}
+static int restore_seccomp(struct thread_restore_args *args)
+{
+	pid_t tid = 0;
+	int ret, i;
 
-			filter_data += fprog->len * sizeof(struct sock_filter);
+	for (i = 0; i < MAX_NS_NESTING; i++) {
+		if (args->pid[i] == 0) {
+			tid = args->pid[i - 1];
+			break;
 		}
+	}
 
-		return 0;
+	if (tid != sys_gettid()) {
+		pr_err("seccomp: Unexpected tid %d != %d\n",
+		       tid, (pid_t)sys_gettid());
+		return -1;
 	}
+
+	switch (args->seccomp_mode) {
+	case SECCOMP_MODE_DISABLED:
+		return 0;
+		break;
+	case SECCOMP_MODE_STRICT:
+		ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
+		if (ret < 0) {
+			pr_err("seccomp: SECCOMP_MODE_STRICT returned %d on tid %d\n",
+			       ret, tid);
+		}
+		break;
+	case SECCOMP_MODE_FILTER:
+		ret = restore_seccomp_filter(tid, args);
+		break;
 	default:
-		goto die;
+		pr_err("seccomp: Unknown seccomp mode %d on tid %d\n",
+		       args->seccomp_mode, tid);
+		ret = -1;
+		break;
 	}
 
-	return 0;
-die:
-	return -1;
+	if (!ret) {
+		pr_debug("seccomp: Restored mode %d on tid %d\n",
+			 args->seccomp_mode, tid);
+	}
+
+	return ret;
 }
 
 static int restore_robust_futex(struct thread_restore_args *args)
@@ -541,6 +569,10 @@  long __export_restore_thread(struct thread_restore_args *args)
 		sys_close(fd);
 	}
 
+	/* Make sure it's before creds restore */
+	if (restore_seccomp(args))
+		goto core_restore_end;
+
 	ret = restore_creds(args->creds_args, args->ta->proc_fd);
 	if (ret)
 		goto core_restore_end;
@@ -559,9 +591,6 @@  long __export_restore_thread(struct thread_restore_args *args)
 	restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD);
 	restore_pdeath_sig(args);
 
-	if (args->ta->seccomp_mode != SECCOMP_MODE_DISABLED)
-		pr_info("Restoring seccomp mode %d for %ld\n", args->ta->seccomp_mode, sys_getpid());
-
 	restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS);
 
 	futex_dec_and_wake(&thread_inprogress);
@@ -1680,8 +1709,7 @@  long __export_restore_task(struct task_restore_args *args)
 	/* The kernel restricts setting seccomp to uid 0 in the current user
 	 * ns, so we must do this before restore_creds.
 	 */
-	pr_info("restoring seccomp mode %d for %ld\n", args->seccomp_mode, sys_getpid());
-	if (restore_seccomp(args))
+	if (restore_seccomp(args->t))
 		goto core_restore_end;
 
 	/*
diff --git a/criu/seccomp.c b/criu/seccomp.c
index a4304e645de7..dd608265731a 100644
--- a/criu/seccomp.c
+++ b/criu/seccomp.c
@@ -24,6 +24,8 @@ 
 static struct rb_root seccomp_tid_rb_root = RB_ROOT;
 static struct seccomp_entry *seccomp_tid_entry_root;
 
+static SeccompEntry *seccomp_img_entry;
+
 struct seccomp_entry *seccomp_lookup(pid_t tid_real, bool create, bool mandatory)
 {
 	struct seccomp_entry *entry = NULL;
@@ -293,10 +295,8 @@  int seccomp_collect_dump_filters(void)
 	return 0;
 }
 
-/* Populated on restore by prepare_seccomp_filters */
-static SeccompEntry *se;
-
-int prepare_seccomp_filters(void)
+/* The seccomp_img_entry will be shared between all children */
+int seccomp_read_image(void)
 {
 	struct cr_img *img;
 	int ret;
@@ -305,66 +305,129 @@  int prepare_seccomp_filters(void)
 	if (!img)
 		return -1;
 
-	ret = pb_read_one_eof(img, &se, PB_SECCOMP);
+	ret = pb_read_one_eof(img, &seccomp_img_entry, PB_SECCOMP);
 	close_image(img);
 	if (ret <= 0)
 		return 0; /* there were no filters */
 
-	BUG_ON(!se);
+	BUG_ON(!seccomp_img_entry);
 
 	return 0;
 }
 
-int seccomp_filters_get_rst_pos(CoreEntry *core, struct task_restore_args *ta)
+/* seccomp_img_entry will be freed per-children after forking */
+static void free_seccomp_filters(void)
 {
-	SeccompFilter *sf = NULL;
-	struct sock_fprog *arr = NULL;
-	void *filter_data = NULL;
-	int ret = -1, i, n_filters;
-	size_t filter_size = 0;
+	if (seccomp_img_entry) {
+		seccomp_entry__free_unpacked(seccomp_img_entry, NULL);
+		seccomp_img_entry = NULL;
+	}
+}
 
-	ta->seccomp_filters_n = 0;
+void seccomp_rst_reloc(struct thread_restore_args *args)
+{
+	size_t j, off;
 
-	if (!core->thread_core->has_seccomp_filter)
-		return 0;
+	if (!args->seccomp_filters_n)
+		return;
 
-	ta->seccomp_filters = (struct sock_fprog *)rst_mem_align_cpos(RM_PRIVATE);
+	args->seccomp_filters = rst_mem_remap_ptr(args->seccomp_filters_pos, RM_PRIVATE);
+	args->seccomp_filters_data = (void *)args->seccomp_filters +
+			args->seccomp_filters_n * sizeof(struct thread_seccomp_filter);
 
-	BUG_ON(core->thread_core->seccomp_filter > se->n_seccomp_filters);
-	sf = se->seccomp_filters[core->thread_core->seccomp_filter];
+	for (j = off = 0; j < args->seccomp_filters_n; j++) {
+		struct thread_seccomp_filter *f = &args->seccomp_filters[j];
 
-	while (1) {
-		ta->seccomp_filters_n++;
-		filter_size += sf->filter.len;
+		f->sock_fprog.filter = args->seccomp_filters_data + off;
+		off += f->sock_fprog.len * sizeof(struct sock_filter);
+	}
+}
 
-		if (!sf->has_prev)
-			break;
+int seccomp_prepare_threads(struct pstree_item *item, struct task_restore_args *ta)
+{
+	struct thread_restore_args *args_array = (struct thread_restore_args *)(&ta[1]);
+	size_t i, j, nr_filters, filters_size, rst_size, off;
 
-		sf = se->seccomp_filters[sf->prev];
-	}
+	for (i = 0; i < item->nr_threads; i++) {
+		ThreadCoreEntry *thread_core = item->core[i]->thread_core;
+		struct thread_restore_args *args = &args_array[i];
+		SeccompFilter *sf;
 
-	n_filters = ta->seccomp_filters_n;
-	arr = rst_mem_alloc(sizeof(struct sock_fprog) * n_filters + filter_size, RM_PRIVATE);
-	if (!arr)
-		goto out;
+		args->seccomp_mode		= SECCOMP_MODE_DISABLED;
+		args->seccomp_filters_pos	= 0;
+		args->seccomp_filters_n		= 0;
+		args->seccomp_filters		= NULL;
+		args->seccomp_filters_data	= NULL;
 
-	filter_data = &arr[n_filters];
-	sf = se->seccomp_filters[core->thread_core->seccomp_filter];
-	for (i = 0; i < n_filters; i++) {
-		struct sock_fprog *fprog = &arr[i];
+		if (thread_core->has_seccomp_mode)
+			args->seccomp_mode = thread_core->seccomp_mode;
 
-		BUG_ON(sf->filter.len % sizeof(struct sock_filter));
-		fprog->len = sf->filter.len / sizeof(struct sock_filter);
+		if (args->seccomp_mode != SECCOMP_MODE_FILTER)
+			continue;
 
-		memcpy(filter_data, sf->filter.data, sf->filter.len);
+		if (thread_core->seccomp_filter >= seccomp_img_entry->n_seccomp_filters) {
+			pr_err("Corrupted filter index on tid %d (%u > %zu)\n",
+			       item->threads[i]->ns[0].virt, thread_core->seccomp_filter,
+			       seccomp_img_entry->n_seccomp_filters);
+			return -1;
+		}
 
-		filter_data += sf->filter.len;
-		sf = se->seccomp_filters[sf->prev];
-	}
+		sf = seccomp_img_entry->seccomp_filters[thread_core->seccomp_filter];
+		if (sf->filter.len % (sizeof(struct sock_filter))) {
+			pr_err("Corrupted filter len on tid %d (index %u)\n",
+			       item->threads[i]->ns[0].virt,
+			       thread_core->seccomp_filter);
+			return -1;
+		}
+		filters_size = sf->filter.len;
+		nr_filters = 1;
+
+		while (sf->has_prev) {
+			if (sf->prev >= seccomp_img_entry->n_seccomp_filters) {
+				pr_err("Corrupted filter index on tid %d (%u > %zu)\n",
+				       item->threads[i]->ns[0].virt, sf->prev,
+				       seccomp_img_entry->n_seccomp_filters);
+				return -1;
+			}
+
+			sf = seccomp_img_entry->seccomp_filters[sf->prev];
+			if (sf->filter.len % (sizeof(struct sock_filter))) {
+				pr_err("Corrupted filter len on tid %d (index %u)\n",
+				       item->threads[i]->ns[0].virt, sf->prev);
+				return -1;
+			}
+			filters_size += sf->filter.len;
+			nr_filters++;
+		}
 
-	ret = 0;
+		args->seccomp_filters_n = nr_filters;
 
-out:
-	seccomp_entry__free_unpacked(se, NULL);
-	return ret;
+		rst_size = filters_size + nr_filters * sizeof(struct thread_seccomp_filter);
+		args->seccomp_filters_pos = rst_mem_align_cpos(RM_PRIVATE);
+		args->seccomp_filters = rst_mem_alloc(rst_size, RM_PRIVATE);
+		if (!args->seccomp_filters) {
+			pr_err("Can't allocate %zu bytes for filters on tid %d\n",
+			       rst_size, item->threads[i]->ns[0].virt);
+			return -ENOMEM;
+		}
+		args->seccomp_filters_data = (void *)args->seccomp_filters +
+			nr_filters * sizeof(struct thread_seccomp_filter);
+
+		sf = seccomp_img_entry->seccomp_filters[thread_core->seccomp_filter];
+		for (j = off = 0; j < nr_filters; j++) {
+			struct thread_seccomp_filter *f = &args->seccomp_filters[j];
+
+			f->sock_fprog.len	= sf->filter.len / sizeof(struct sock_filter);
+			f->sock_fprog.filter	= args->seccomp_filters_data + off;
+			f->flags		= sf->flags;
+
+			memcpy(f->sock_fprog.filter, sf->filter.data, sf->filter.len);
+
+			off += sf->filter.len;
+			sf = seccomp_img_entry->seccomp_filters[sf->prev];
+		}
+	}
+
+	free_seccomp_filters();
+	return 0;
 }