[19/28] seccomp: Add restore of per-thread filters

Submitted by Cyrill Gorcunov on March 20, 2018, 9:43 p.m.

Details

Message ID 20180320214313.25326-20-gorcunov@gmail.com
State New
Series "seccomp, v2: Add support for per-thread tracking"
Headers show

Commit Message

Cyrill Gorcunov March 20, 2018, 9:43 p.m.
From: Cyrill Gorcunov <gorcunov@virtuozzo.com>

https://jira.sw.ru/browse/PSBM-78762

Signed-off-by: Cyrill Gorcunov <gorcunov@virtuozzo.com>
---
 criu/cr-restore.c       |  10 ++--
 criu/include/restorer.h |  15 ++++-
 criu/include/seccomp.h  |   9 ++-
 criu/pie/restorer.c     | 106 +++++++++++++++++++++-------------
 criu/seccomp.c          | 149 ++++++++++++++++++++++++++++++++++--------------
 5 files changed, 195 insertions(+), 94 deletions(-)

Patch hide | download patch | download mbox

diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index b9f8de5e82b1..3025ec4032c2 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -331,7 +331,7 @@  static int root_prepare_shared(void)
 	if (prepare_remaps())
 		return -1;
 
-	if (prepare_seccomp_filters())
+	if (seccomp_read_image())
 		return -1;
 
 	if (collect_images(cinfos, ARRAY_SIZE(cinfos)))
@@ -1031,7 +1031,7 @@  static int restore_one_alive_task(int pid, CoreEntry *core)
 	if (prepare_timerfds(ta))
 		return -1;
 
-	if (seccomp_filters_get_rst_pos(core, ta) < 0)
+	if (seccomp_prepare_threads(current, ta) < 0)
 		return -1;
 
 	if (prepare_itimers(pid, ta, core) < 0)
@@ -3668,12 +3668,8 @@  static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
 	RST_MEM_FIXUP_PPTR(task_args->rlims);
 	RST_MEM_FIXUP_PPTR(task_args->helpers);
 	RST_MEM_FIXUP_PPTR(task_args->zombies);
-	RST_MEM_FIXUP_PPTR(task_args->seccomp_filters);
 	RST_MEM_FIXUP_PPTR(task_args->vma_ios);
 
-	if (core->thread_core->has_seccomp_mode)
-		task_args->seccomp_mode = core->thread_core->seccomp_mode;
-
 	task_args->compatible_mode = core_is_compat(core);
 
 	if (opts.check_only)
@@ -3763,6 +3759,8 @@  static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
 		if (ret)
 			goto err;
 
+		seccomp_rst_reloc(&thread_args[i]);
+
 		thread_args[i].mz = mz + i;
 		sigframe = (struct rt_sigframe *)&mz[i].rt_sigframe;
 
diff --git a/criu/include/restorer.h b/criu/include/restorer.h
index 15307d9c0701..3767d9d25088 100644
--- a/criu/include/restorer.h
+++ b/criu/include/restorer.h
@@ -4,6 +4,7 @@ 
 #include <signal.h>
 #include <limits.h>
 #include <sys/resource.h>
+#include <linux/filter.h>
 
 #include "common/config.h"
 #include "types.h"
@@ -76,6 +77,11 @@  struct thread_creds_args {
 	unsigned long			mem_pos_next;
 };
 
+struct thread_seccomp_filter {
+	struct sock_fprog		sock_fprog;
+	unsigned int			flags;
+};
+
 struct thread_restore_args {
 	struct restore_mem_zone		*mz;
 
@@ -100,6 +106,12 @@  struct thread_restore_args {
 
 	bool				check_only;
 	struct thread_creds_args	*creds_args;
+
+	int				seccomp_mode;
+	unsigned long			seccomp_filters_pos;
+	struct thread_seccomp_filter	*seccomp_filters;
+	void				*seccomp_filters_data;
+	unsigned int			seccomp_filters_n;
 } __aligned(64);
 
 typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args);
@@ -163,9 +175,6 @@  struct task_restore_args {
 	pid_t				*zombies;
 	unsigned int			zombies_n;
 
-	struct sock_fprog		*seccomp_filters;
-	unsigned int			seccomp_filters_n;
-
 	/* * * * * * * * * * * * * * * * * * * * */
 
 	unsigned long			task_size;
diff --git a/criu/include/seccomp.h b/criu/include/seccomp.h
index 47c24c9719c1..1808e3d610c3 100644
--- a/criu/include/seccomp.h
+++ b/criu/include/seccomp.h
@@ -27,6 +27,8 @@ 
 #define SECCOMP_FILTER_FLAG_TSYNC 1
 #endif
 
+struct thread_restore_args;
+struct task_restore_args;
 struct pstree_item;
 struct rb_node;
 
@@ -69,7 +71,8 @@  extern void seccomp_free_entries(void);
 extern int seccomp_dump_thread(pid_t tid_real, ThreadCoreEntry *thread_core);
 extern int seccomp_collect_dump_filters(void);
 
-extern int prepare_seccomp_filters(void);
-struct task_restore_args;
-extern int seccomp_filters_get_rst_pos(CoreEntry *item, struct task_restore_args *);
+extern int seccomp_read_image(void);
+extern int seccomp_prepare_threads(struct pstree_item *item, struct task_restore_args *ta);
+extern void seccomp_rst_reloc(struct thread_restore_args *thread_arg);
+
 #endif
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index 091026103805..5ede206c55ef 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -395,54 +395,82 @@  static int restore_signals(siginfo_t *ptr, int nr, bool group)
 	return 0;
 }
 
-static int restore_seccomp(struct task_restore_args *args)
+static int restore_seccomp_filter(pid_t tid, struct thread_restore_args *args)
 {
+	size_t i;
 	int ret;
 
-	switch (args->seccomp_mode) {
-	case SECCOMP_MODE_DISABLED:
-		return 0;
-	case SECCOMP_MODE_STRICT:
-		ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
+	for (i = 0; i < args->seccomp_filters_n; i++) {
+		struct thread_seccomp_filter *filter = &args->seccomp_filters[i];
+
+		ret = sys_seccomp(SECCOMP_SET_MODE_FILTER, filter->flags, (void *)&filter->sock_fprog);
 		if (ret < 0) {
-			pr_err("prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT) returned %d\n", ret);
-			goto die;
+			if (ret == -ENOSYS) {
+				pr_debug("seccomp: sys_seccomp is not supported in kernel, "
+					 "switching to prctl interface\n");
+				ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
+						(long)(void *)&filter->sock_fprog, 0, 0);
+				if (ret) {
+					pr_err("seccomp: PR_SET_SECCOMP returned %d on tid %d\n",
+					       ret, tid);
+					return -1;
+				}
+			} else {
+				pr_err("seccomp: SECCOMP_SET_MODE_FILTER returned %d on tid %d\n",
+				       ret, tid);
+				return -1;
+			}
 		}
-		return 0;
-	case SECCOMP_MODE_FILTER: {
-		int i;
-		void *filter_data;
-
-		filter_data = &args->seccomp_filters[args->seccomp_filters_n];
-
-		for (i = 0; i < args->seccomp_filters_n; i++) {
-			struct sock_fprog *fprog = &args->seccomp_filters[i];
+	}
 
-			fprog->filter = filter_data;
+	return 0;
+}
 
-			/* We always TSYNC here, since we require that the
-			 * creds for all threads be the same; this means we
-			 * don't have to restore_seccomp() in threads, and that
-			 * future TSYNC behavior will be correct.
-			 */
-			ret = sys_seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, (char *) fprog);
-			if (ret < 0) {
-				pr_err("sys_seccomp() returned %d\n", ret);
-				goto die;
-			}
+static int restore_seccomp(struct thread_restore_args *args)
+{
+	pid_t tid = 0;
+	int ret, i;
 
-			filter_data += fprog->len * sizeof(struct sock_filter);
+	for (i = 0; i < MAX_NS_NESTING; i++) {
+		if (args->pid[i] == 0) {
+			tid = args->pid[i - 1];
+			break;
 		}
+	}
 
-		return 0;
+	if (tid != sys_gettid()) {
+		pr_err("seccomp: Unexpected tid %d != %d\n",
+		       tid, (pid_t)sys_gettid());
+		return -1;
 	}
+
+	switch (args->seccomp_mode) {
+	case SECCOMP_MODE_DISABLED:
+		return 0;
+		break;
+	case SECCOMP_MODE_STRICT:
+		ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
+		if (ret < 0) {
+			pr_err("seccomp: SECCOMP_MODE_STRICT returned %d on tid %d\n",
+			       ret, tid);
+		}
+		break;
+	case SECCOMP_MODE_FILTER:
+		ret = restore_seccomp_filter(tid, args);
+		break;
 	default:
-		goto die;
+		pr_err("seccomp: Unknown seccomp mode %d on tid %d\n",
+		       args->seccomp_mode, tid);
+		ret = -1;
+		break;
 	}
 
-	return 0;
-die:
-	return -1;
+	if (!ret) {
+		pr_debug("seccomp: Restored mode %d on tid %d\n",
+			 args->seccomp_mode, tid);
+	}
+
+	return ret;
 }
 
 static int restore_robust_futex(struct thread_restore_args *args)
@@ -541,6 +569,10 @@  long __export_restore_thread(struct thread_restore_args *args)
 		sys_close(fd);
 	}
 
+	/* Make sure it's before creds restore */
+	if (restore_seccomp(args))
+		goto core_restore_end;
+
 	ret = restore_creds(args->creds_args, args->ta->proc_fd);
 	if (ret)
 		goto core_restore_end;
@@ -559,9 +591,6 @@  long __export_restore_thread(struct thread_restore_args *args)
 	restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD);
 	restore_pdeath_sig(args);
 
-	if (args->ta->seccomp_mode != SECCOMP_MODE_DISABLED)
-		pr_info("Restoring seccomp mode %d for %ld\n", args->ta->seccomp_mode, sys_getpid());
-
 	restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS);
 
 	futex_dec_and_wake(&thread_inprogress);
@@ -1680,8 +1709,7 @@  long __export_restore_task(struct task_restore_args *args)
 	/* The kernel restricts setting seccomp to uid 0 in the current user
 	 * ns, so we must do this before restore_creds.
 	 */
-	pr_info("restoring seccomp mode %d for %ld\n", args->seccomp_mode, sys_getpid());
-	if (restore_seccomp(args))
+	if (restore_seccomp(args->t))
 		goto core_restore_end;
 
 	/*
diff --git a/criu/seccomp.c b/criu/seccomp.c
index a4304e645de7..dd608265731a 100644
--- a/criu/seccomp.c
+++ b/criu/seccomp.c
@@ -24,6 +24,8 @@ 
 static struct rb_root seccomp_tid_rb_root = RB_ROOT;
 static struct seccomp_entry *seccomp_tid_entry_root;
 
+static SeccompEntry *seccomp_img_entry;
+
 struct seccomp_entry *seccomp_lookup(pid_t tid_real, bool create, bool mandatory)
 {
 	struct seccomp_entry *entry = NULL;
@@ -293,10 +295,8 @@  int seccomp_collect_dump_filters(void)
 	return 0;
 }
 
-/* Populated on restore by prepare_seccomp_filters */
-static SeccompEntry *se;
-
-int prepare_seccomp_filters(void)
+/* The seccomp_img_entry will be shared between all children */
+int seccomp_read_image(void)
 {
 	struct cr_img *img;
 	int ret;
@@ -305,66 +305,129 @@  int prepare_seccomp_filters(void)
 	if (!img)
 		return -1;
 
-	ret = pb_read_one_eof(img, &se, PB_SECCOMP);
+	ret = pb_read_one_eof(img, &seccomp_img_entry, PB_SECCOMP);
 	close_image(img);
 	if (ret <= 0)
 		return 0; /* there were no filters */
 
-	BUG_ON(!se);
+	BUG_ON(!seccomp_img_entry);
 
 	return 0;
 }
 
-int seccomp_filters_get_rst_pos(CoreEntry *core, struct task_restore_args *ta)
+/* seccomp_img_entry will be freed per-children after forking */
+static void free_seccomp_filters(void)
 {
-	SeccompFilter *sf = NULL;
-	struct sock_fprog *arr = NULL;
-	void *filter_data = NULL;
-	int ret = -1, i, n_filters;
-	size_t filter_size = 0;
+	if (seccomp_img_entry) {
+		seccomp_entry__free_unpacked(seccomp_img_entry, NULL);
+		seccomp_img_entry = NULL;
+	}
+}
 
-	ta->seccomp_filters_n = 0;
+void seccomp_rst_reloc(struct thread_restore_args *args)
+{
+	size_t j, off;
 
-	if (!core->thread_core->has_seccomp_filter)
-		return 0;
+	if (!args->seccomp_filters_n)
+		return;
 
-	ta->seccomp_filters = (struct sock_fprog *)rst_mem_align_cpos(RM_PRIVATE);
+	args->seccomp_filters = rst_mem_remap_ptr(args->seccomp_filters_pos, RM_PRIVATE);
+	args->seccomp_filters_data = (void *)args->seccomp_filters +
+			args->seccomp_filters_n * sizeof(struct thread_seccomp_filter);
 
-	BUG_ON(core->thread_core->seccomp_filter > se->n_seccomp_filters);
-	sf = se->seccomp_filters[core->thread_core->seccomp_filter];
+	for (j = off = 0; j < args->seccomp_filters_n; j++) {
+		struct thread_seccomp_filter *f = &args->seccomp_filters[j];
 
-	while (1) {
-		ta->seccomp_filters_n++;
-		filter_size += sf->filter.len;
+		f->sock_fprog.filter = args->seccomp_filters_data + off;
+		off += f->sock_fprog.len * sizeof(struct sock_filter);
+	}
+}
 
-		if (!sf->has_prev)
-			break;
+int seccomp_prepare_threads(struct pstree_item *item, struct task_restore_args *ta)
+{
+	struct thread_restore_args *args_array = (struct thread_restore_args *)(&ta[1]);
+	size_t i, j, nr_filters, filters_size, rst_size, off;
 
-		sf = se->seccomp_filters[sf->prev];
-	}
+	for (i = 0; i < item->nr_threads; i++) {
+		ThreadCoreEntry *thread_core = item->core[i]->thread_core;
+		struct thread_restore_args *args = &args_array[i];
+		SeccompFilter *sf;
 
-	n_filters = ta->seccomp_filters_n;
-	arr = rst_mem_alloc(sizeof(struct sock_fprog) * n_filters + filter_size, RM_PRIVATE);
-	if (!arr)
-		goto out;
+		args->seccomp_mode		= SECCOMP_MODE_DISABLED;
+		args->seccomp_filters_pos	= 0;
+		args->seccomp_filters_n		= 0;
+		args->seccomp_filters		= NULL;
+		args->seccomp_filters_data	= NULL;
 
-	filter_data = &arr[n_filters];
-	sf = se->seccomp_filters[core->thread_core->seccomp_filter];
-	for (i = 0; i < n_filters; i++) {
-		struct sock_fprog *fprog = &arr[i];
+		if (thread_core->has_seccomp_mode)
+			args->seccomp_mode = thread_core->seccomp_mode;
 
-		BUG_ON(sf->filter.len % sizeof(struct sock_filter));
-		fprog->len = sf->filter.len / sizeof(struct sock_filter);
+		if (args->seccomp_mode != SECCOMP_MODE_FILTER)
+			continue;
 
-		memcpy(filter_data, sf->filter.data, sf->filter.len);
+		if (thread_core->seccomp_filter >= seccomp_img_entry->n_seccomp_filters) {
+			pr_err("Corrupted filter index on tid %d (%u > %zu)\n",
+			       item->threads[i]->ns[0].virt, thread_core->seccomp_filter,
+			       seccomp_img_entry->n_seccomp_filters);
+			return -1;
+		}
 
-		filter_data += sf->filter.len;
-		sf = se->seccomp_filters[sf->prev];
-	}
+		sf = seccomp_img_entry->seccomp_filters[thread_core->seccomp_filter];
+		if (sf->filter.len % (sizeof(struct sock_filter))) {
+			pr_err("Corrupted filter len on tid %d (index %u)\n",
+			       item->threads[i]->ns[0].virt,
+			       thread_core->seccomp_filter);
+			return -1;
+		}
+		filters_size = sf->filter.len;
+		nr_filters = 1;
+
+		while (sf->has_prev) {
+			if (sf->prev >= seccomp_img_entry->n_seccomp_filters) {
+				pr_err("Corrupted filter index on tid %d (%u > %zu)\n",
+				       item->threads[i]->ns[0].virt, sf->prev,
+				       seccomp_img_entry->n_seccomp_filters);
+				return -1;
+			}
+
+			sf = seccomp_img_entry->seccomp_filters[sf->prev];
+			if (sf->filter.len % (sizeof(struct sock_filter))) {
+				pr_err("Corrupted filter len on tid %d (index %u)\n",
+				       item->threads[i]->ns[0].virt, sf->prev);
+				return -1;
+			}
+			filters_size += sf->filter.len;
+			nr_filters++;
+		}
 
-	ret = 0;
+		args->seccomp_filters_n = nr_filters;
 
-out:
-	seccomp_entry__free_unpacked(se, NULL);
-	return ret;
+		rst_size = filters_size + nr_filters * sizeof(struct thread_seccomp_filter);
+		args->seccomp_filters_pos = rst_mem_align_cpos(RM_PRIVATE);
+		args->seccomp_filters = rst_mem_alloc(rst_size, RM_PRIVATE);
+		if (!args->seccomp_filters) {
+			pr_err("Can't allocate %zu bytes for filters on tid %d\n",
+			       rst_size, item->threads[i]->ns[0].virt);
+			return -ENOMEM;
+		}
+		args->seccomp_filters_data = (void *)args->seccomp_filters +
+			nr_filters * sizeof(struct thread_seccomp_filter);
+
+		sf = seccomp_img_entry->seccomp_filters[thread_core->seccomp_filter];
+		for (j = off = 0; j < nr_filters; j++) {
+			struct thread_seccomp_filter *f = &args->seccomp_filters[j];
+
+			f->sock_fprog.len	= sf->filter.len / sizeof(struct sock_filter);
+			f->sock_fprog.filter	= args->seccomp_filters_data + off;
+			f->flags		= sf->flags;
+
+			memcpy(f->sock_fprog.filter, sf->filter.data, sf->filter.len);
+
+			off += sf->filter.len;
+			sf = seccomp_img_entry->seccomp_filters[sf->prev];
+		}
+	}
+
+	free_seccomp_filters();
+	return 0;
 }

Comments

Andrey Vagin March 22, 2018, 10:13 p.m.
On Wed, Mar 21, 2018 at 12:43:04AM +0300, Cyrill Gorcunov wrote:
> From: Cyrill Gorcunov <gorcunov@virtuozzo.com>
> 
> https://jira.sw.ru/browse/PSBM-78762
> 
> Signed-off-by: Cyrill Gorcunov <gorcunov@virtuozzo.com>
> ---
>  criu/cr-restore.c       |  10 ++--
>  criu/include/restorer.h |  15 ++++-
>  criu/include/seccomp.h  |   9 ++-
>  criu/pie/restorer.c     | 106 +++++++++++++++++++++-------------
>  criu/seccomp.c          | 149 ++++++++++++++++++++++++++++++++++--------------
>  5 files changed, 195 insertions(+), 94 deletions(-)


200 lines of code!!! Pls, write what is going on here in a commit
message.

> 
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index b9f8de5e82b1..3025ec4032c2 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -331,7 +331,7 @@ static int root_prepare_shared(void)
>  	if (prepare_remaps())
>  		return -1;
>  
> -	if (prepare_seccomp_filters())
> +	if (seccomp_read_image())
>  		return -1;
>  
>  	if (collect_images(cinfos, ARRAY_SIZE(cinfos)))
> @@ -1031,7 +1031,7 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
>  	if (prepare_timerfds(ta))
>  		return -1;
>  
> -	if (seccomp_filters_get_rst_pos(core, ta) < 0)
> +	if (seccomp_prepare_threads(current, ta) < 0)
>  		return -1;
>  
>  	if (prepare_itimers(pid, ta, core) < 0)
> @@ -3668,12 +3668,8 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
>  	RST_MEM_FIXUP_PPTR(task_args->rlims);
>  	RST_MEM_FIXUP_PPTR(task_args->helpers);
>  	RST_MEM_FIXUP_PPTR(task_args->zombies);
> -	RST_MEM_FIXUP_PPTR(task_args->seccomp_filters);
>  	RST_MEM_FIXUP_PPTR(task_args->vma_ios);
>  
> -	if (core->thread_core->has_seccomp_mode)
> -		task_args->seccomp_mode = core->thread_core->seccomp_mode;
> -
>  	task_args->compatible_mode = core_is_compat(core);
>  
>  	if (opts.check_only)
> @@ -3763,6 +3759,8 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
>  		if (ret)
>  			goto err;
>  
> +		seccomp_rst_reloc(&thread_args[i]);
> +
>  		thread_args[i].mz = mz + i;
>  		sigframe = (struct rt_sigframe *)&mz[i].rt_sigframe;
>  
> diff --git a/criu/include/restorer.h b/criu/include/restorer.h
> index 15307d9c0701..3767d9d25088 100644
> --- a/criu/include/restorer.h
> +++ b/criu/include/restorer.h
> @@ -4,6 +4,7 @@
>  #include <signal.h>
>  #include <limits.h>
>  #include <sys/resource.h>
> +#include <linux/filter.h>
>  
>  #include "common/config.h"
>  #include "types.h"
> @@ -76,6 +77,11 @@ struct thread_creds_args {
>  	unsigned long			mem_pos_next;
>  };
>  
> +struct thread_seccomp_filter {
> +	struct sock_fprog		sock_fprog;
> +	unsigned int			flags;
> +};
> +
>  struct thread_restore_args {
>  	struct restore_mem_zone		*mz;
>  
> @@ -100,6 +106,12 @@ struct thread_restore_args {
>  
>  	bool				check_only;
>  	struct thread_creds_args	*creds_args;
> +
> +	int				seccomp_mode;
> +	unsigned long			seccomp_filters_pos;
> +	struct thread_seccomp_filter	*seccomp_filters;
> +	void				*seccomp_filters_data;
> +	unsigned int			seccomp_filters_n;
>  } __aligned(64);
>  
>  typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args);
> @@ -163,9 +175,6 @@ struct task_restore_args {
>  	pid_t				*zombies;
>  	unsigned int			zombies_n;
>  
> -	struct sock_fprog		*seccomp_filters;
> -	unsigned int			seccomp_filters_n;
> -
>  	/* * * * * * * * * * * * * * * * * * * * */
>  
>  	unsigned long			task_size;
> diff --git a/criu/include/seccomp.h b/criu/include/seccomp.h
> index 47c24c9719c1..1808e3d610c3 100644
> --- a/criu/include/seccomp.h
> +++ b/criu/include/seccomp.h
> @@ -27,6 +27,8 @@
>  #define SECCOMP_FILTER_FLAG_TSYNC 1
>  #endif
>  
> +struct thread_restore_args;
> +struct task_restore_args;
>  struct pstree_item;
>  struct rb_node;
>  
> @@ -69,7 +71,8 @@ extern void seccomp_free_entries(void);
>  extern int seccomp_dump_thread(pid_t tid_real, ThreadCoreEntry *thread_core);
>  extern int seccomp_collect_dump_filters(void);
>  
> -extern int prepare_seccomp_filters(void);
> -struct task_restore_args;
> -extern int seccomp_filters_get_rst_pos(CoreEntry *item, struct task_restore_args *);
> +extern int seccomp_read_image(void);
> +extern int seccomp_prepare_threads(struct pstree_item *item, struct task_restore_args *ta);
> +extern void seccomp_rst_reloc(struct thread_restore_args *thread_arg);
> +
>  #endif
> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
> index 091026103805..5ede206c55ef 100644
> --- a/criu/pie/restorer.c
> +++ b/criu/pie/restorer.c
> @@ -395,54 +395,82 @@ static int restore_signals(siginfo_t *ptr, int nr, bool group)
>  	return 0;
>  }
>  
> -static int restore_seccomp(struct task_restore_args *args)
> +static int restore_seccomp_filter(pid_t tid, struct thread_restore_args *args)
>  {
> +	size_t i;
>  	int ret;
>  
> -	switch (args->seccomp_mode) {
> -	case SECCOMP_MODE_DISABLED:
> -		return 0;
> -	case SECCOMP_MODE_STRICT:
> -		ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
> +	for (i = 0; i < args->seccomp_filters_n; i++) {
> +		struct thread_seccomp_filter *filter = &args->seccomp_filters[i];
> +
> +		ret = sys_seccomp(SECCOMP_SET_MODE_FILTER, filter->flags, (void *)&filter->sock_fprog);
>  		if (ret < 0) {
> -			pr_err("prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT) returned %d\n", ret);
> -			goto die;
> +			if (ret == -ENOSYS) {
> +				pr_debug("seccomp: sys_seccomp is not supported in kernel, "
> +					 "switching to prctl interface\n");
> +				ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
> +						(long)(void *)&filter->sock_fprog, 0, 0);
> +				if (ret) {
> +					pr_err("seccomp: PR_SET_SECCOMP returned %d on tid %d\n",
> +					       ret, tid);
> +					return -1;
> +				}
> +			} else {
> +				pr_err("seccomp: SECCOMP_SET_MODE_FILTER returned %d on tid %d\n",
> +				       ret, tid);
> +				return -1;
> +			}
>  		}
> -		return 0;
> -	case SECCOMP_MODE_FILTER: {
> -		int i;
> -		void *filter_data;
> -
> -		filter_data = &args->seccomp_filters[args->seccomp_filters_n];
> -
> -		for (i = 0; i < args->seccomp_filters_n; i++) {
> -			struct sock_fprog *fprog = &args->seccomp_filters[i];
> +	}
>  
> -			fprog->filter = filter_data;
> +	return 0;
> +}
>  
> -			/* We always TSYNC here, since we require that the
> -			 * creds for all threads be the same; this means we
> -			 * don't have to restore_seccomp() in threads, and that
> -			 * future TSYNC behavior will be correct.
> -			 */
> -			ret = sys_seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, (char *) fprog);
> -			if (ret < 0) {
> -				pr_err("sys_seccomp() returned %d\n", ret);
> -				goto die;
> -			}
> +static int restore_seccomp(struct thread_restore_args *args)
> +{
> +	pid_t tid = 0;
> +	int ret, i;
>  
> -			filter_data += fprog->len * sizeof(struct sock_filter);
> +	for (i = 0; i < MAX_NS_NESTING; i++) {
> +		if (args->pid[i] == 0) {
> +			tid = args->pid[i - 1];
> +			break;
>  		}
> +	}
>  
> -		return 0;
> +	if (tid != sys_gettid()) {
> +		pr_err("seccomp: Unexpected tid %d != %d\n",
> +		       tid, (pid_t)sys_gettid());
> +		return -1;
>  	}
> +
> +	switch (args->seccomp_mode) {
> +	case SECCOMP_MODE_DISABLED:
> +		return 0;
> +		break;
> +	case SECCOMP_MODE_STRICT:
> +		ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
> +		if (ret < 0) {
> +			pr_err("seccomp: SECCOMP_MODE_STRICT returned %d on tid %d\n",
> +			       ret, tid);
> +		}
> +		break;
> +	case SECCOMP_MODE_FILTER:
> +		ret = restore_seccomp_filter(tid, args);
> +		break;
>  	default:
> -		goto die;
> +		pr_err("seccomp: Unknown seccomp mode %d on tid %d\n",
> +		       args->seccomp_mode, tid);
> +		ret = -1;
> +		break;
>  	}
>  
> -	return 0;
> -die:
> -	return -1;
> +	if (!ret) {
> +		pr_debug("seccomp: Restored mode %d on tid %d\n",
> +			 args->seccomp_mode, tid);
> +	}
> +
> +	return ret;
>  }
>  
>  static int restore_robust_futex(struct thread_restore_args *args)
> @@ -541,6 +569,10 @@ long __export_restore_thread(struct thread_restore_args *args)
>  		sys_close(fd);
>  	}
>  
> +	/* Make sure it's before creds restore */
> +	if (restore_seccomp(args))
> +		goto core_restore_end;
> +
>  	ret = restore_creds(args->creds_args, args->ta->proc_fd);
>  	if (ret)
>  		goto core_restore_end;
> @@ -559,9 +591,6 @@ long __export_restore_thread(struct thread_restore_args *args)
>  	restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD);
>  	restore_pdeath_sig(args);
>  
> -	if (args->ta->seccomp_mode != SECCOMP_MODE_DISABLED)
> -		pr_info("Restoring seccomp mode %d for %ld\n", args->ta->seccomp_mode, sys_getpid());
> -
>  	restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS);
>  
>  	futex_dec_and_wake(&thread_inprogress);
> @@ -1680,8 +1709,7 @@ long __export_restore_task(struct task_restore_args *args)
>  	/* The kernel restricts setting seccomp to uid 0 in the current user
>  	 * ns, so we must do this before restore_creds.
>  	 */
> -	pr_info("restoring seccomp mode %d for %ld\n", args->seccomp_mode, sys_getpid());
> -	if (restore_seccomp(args))
> +	if (restore_seccomp(args->t))
>  		goto core_restore_end;
>  
>  	/*
> diff --git a/criu/seccomp.c b/criu/seccomp.c
> index a4304e645de7..dd608265731a 100644
> --- a/criu/seccomp.c
> +++ b/criu/seccomp.c
> @@ -24,6 +24,8 @@
>  static struct rb_root seccomp_tid_rb_root = RB_ROOT;
>  static struct seccomp_entry *seccomp_tid_entry_root;
>  
> +static SeccompEntry *seccomp_img_entry;
> +
>  struct seccomp_entry *seccomp_lookup(pid_t tid_real, bool create, bool mandatory)
>  {
>  	struct seccomp_entry *entry = NULL;
> @@ -293,10 +295,8 @@ int seccomp_collect_dump_filters(void)
>  	return 0;
>  }
>  
> -/* Populated on restore by prepare_seccomp_filters */
> -static SeccompEntry *se;
> -
> -int prepare_seccomp_filters(void)
> +/* The seccomp_img_entry will be shared between all children */
> +int seccomp_read_image(void)
>  {
>  	struct cr_img *img;
>  	int ret;
> @@ -305,66 +305,129 @@ int prepare_seccomp_filters(void)
>  	if (!img)
>  		return -1;
>  
> -	ret = pb_read_one_eof(img, &se, PB_SECCOMP);
> +	ret = pb_read_one_eof(img, &seccomp_img_entry, PB_SECCOMP);
>  	close_image(img);
>  	if (ret <= 0)
>  		return 0; /* there were no filters */
>  
> -	BUG_ON(!se);
> +	BUG_ON(!seccomp_img_entry);
>  
>  	return 0;
>  }
>  
> -int seccomp_filters_get_rst_pos(CoreEntry *core, struct task_restore_args *ta)
> +/* seccomp_img_entry will be freed per-children after forking */
> +static void free_seccomp_filters(void)
>  {
> -	SeccompFilter *sf = NULL;
> -	struct sock_fprog *arr = NULL;
> -	void *filter_data = NULL;
> -	int ret = -1, i, n_filters;
> -	size_t filter_size = 0;
> +	if (seccomp_img_entry) {
> +		seccomp_entry__free_unpacked(seccomp_img_entry, NULL);
> +		seccomp_img_entry = NULL;
> +	}
> +}
>  
> -	ta->seccomp_filters_n = 0;
> +void seccomp_rst_reloc(struct thread_restore_args *args)
> +{
> +	size_t j, off;
>  
> -	if (!core->thread_core->has_seccomp_filter)
> -		return 0;
> +	if (!args->seccomp_filters_n)
> +		return;
>  
> -	ta->seccomp_filters = (struct sock_fprog *)rst_mem_align_cpos(RM_PRIVATE);
> +	args->seccomp_filters = rst_mem_remap_ptr(args->seccomp_filters_pos, RM_PRIVATE);
> +	args->seccomp_filters_data = (void *)args->seccomp_filters +
> +			args->seccomp_filters_n * sizeof(struct thread_seccomp_filter);
>  
> -	BUG_ON(core->thread_core->seccomp_filter > se->n_seccomp_filters);
> -	sf = se->seccomp_filters[core->thread_core->seccomp_filter];
> +	for (j = off = 0; j < args->seccomp_filters_n; j++) {
> +		struct thread_seccomp_filter *f = &args->seccomp_filters[j];
>  
> -	while (1) {
> -		ta->seccomp_filters_n++;
> -		filter_size += sf->filter.len;
> +		f->sock_fprog.filter = args->seccomp_filters_data + off;
> +		off += f->sock_fprog.len * sizeof(struct sock_filter);
> +	}
> +}
>  
> -		if (!sf->has_prev)
> -			break;
> +int seccomp_prepare_threads(struct pstree_item *item, struct task_restore_args *ta)
> +{
> +	struct thread_restore_args *args_array = (struct thread_restore_args *)(&ta[1]);
> +	size_t i, j, nr_filters, filters_size, rst_size, off;
>  
> -		sf = se->seccomp_filters[sf->prev];
> -	}
> +	for (i = 0; i < item->nr_threads; i++) {
> +		ThreadCoreEntry *thread_core = item->core[i]->thread_core;
> +		struct thread_restore_args *args = &args_array[i];
> +		SeccompFilter *sf;
>  
> -	n_filters = ta->seccomp_filters_n;
> -	arr = rst_mem_alloc(sizeof(struct sock_fprog) * n_filters + filter_size, RM_PRIVATE);
> -	if (!arr)
> -		goto out;
> +		args->seccomp_mode		= SECCOMP_MODE_DISABLED;
> +		args->seccomp_filters_pos	= 0;
> +		args->seccomp_filters_n		= 0;
> +		args->seccomp_filters		= NULL;
> +		args->seccomp_filters_data	= NULL;
>  
> -	filter_data = &arr[n_filters];
> -	sf = se->seccomp_filters[core->thread_core->seccomp_filter];
> -	for (i = 0; i < n_filters; i++) {
> -		struct sock_fprog *fprog = &arr[i];
> +		if (thread_core->has_seccomp_mode)
> +			args->seccomp_mode = thread_core->seccomp_mode;
>  
> -		BUG_ON(sf->filter.len % sizeof(struct sock_filter));
> -		fprog->len = sf->filter.len / sizeof(struct sock_filter);
> +		if (args->seccomp_mode != SECCOMP_MODE_FILTER)
> +			continue;
>  
> -		memcpy(filter_data, sf->filter.data, sf->filter.len);
> +		if (thread_core->seccomp_filter >= seccomp_img_entry->n_seccomp_filters) {
> +			pr_err("Corrupted filter index on tid %d (%u > %zu)\n",
> +			       item->threads[i]->ns[0].virt, thread_core->seccomp_filter,
> +			       seccomp_img_entry->n_seccomp_filters);
> +			return -1;
> +		}
>  
> -		filter_data += sf->filter.len;
> -		sf = se->seccomp_filters[sf->prev];
> -	}
> +		sf = seccomp_img_entry->seccomp_filters[thread_core->seccomp_filter];
> +		if (sf->filter.len % (sizeof(struct sock_filter))) {
> +			pr_err("Corrupted filter len on tid %d (index %u)\n",
> +			       item->threads[i]->ns[0].virt,
> +			       thread_core->seccomp_filter);
> +			return -1;
> +		}
> +		filters_size = sf->filter.len;
> +		nr_filters = 1;
> +
> +		while (sf->has_prev) {
> +			if (sf->prev >= seccomp_img_entry->n_seccomp_filters) {
> +				pr_err("Corrupted filter index on tid %d (%u > %zu)\n",
> +				       item->threads[i]->ns[0].virt, sf->prev,
> +				       seccomp_img_entry->n_seccomp_filters);
> +				return -1;
> +			}
> +
> +			sf = seccomp_img_entry->seccomp_filters[sf->prev];
> +			if (sf->filter.len % (sizeof(struct sock_filter))) {
> +				pr_err("Corrupted filter len on tid %d (index %u)\n",
> +				       item->threads[i]->ns[0].virt, sf->prev);
> +				return -1;
> +			}
> +			filters_size += sf->filter.len;
> +			nr_filters++;
> +		}
>  
> -	ret = 0;
> +		args->seccomp_filters_n = nr_filters;
>  
> -out:
> -	seccomp_entry__free_unpacked(se, NULL);
> -	return ret;
> +		rst_size = filters_size + nr_filters * sizeof(struct thread_seccomp_filter);
> +		args->seccomp_filters_pos = rst_mem_align_cpos(RM_PRIVATE);
> +		args->seccomp_filters = rst_mem_alloc(rst_size, RM_PRIVATE);
> +		if (!args->seccomp_filters) {
> +			pr_err("Can't allocate %zu bytes for filters on tid %d\n",
> +			       rst_size, item->threads[i]->ns[0].virt);
> +			return -ENOMEM;
> +		}
> +		args->seccomp_filters_data = (void *)args->seccomp_filters +
> +			nr_filters * sizeof(struct thread_seccomp_filter);
> +
> +		sf = seccomp_img_entry->seccomp_filters[thread_core->seccomp_filter];
> +		for (j = off = 0; j < nr_filters; j++) {
> +			struct thread_seccomp_filter *f = &args->seccomp_filters[j];
> +
> +			f->sock_fprog.len	= sf->filter.len / sizeof(struct sock_filter);
> +			f->sock_fprog.filter	= args->seccomp_filters_data + off;
> +			f->flags		= sf->flags;
> +
> +			memcpy(f->sock_fprog.filter, sf->filter.data, sf->filter.len);
> +
> +			off += sf->filter.len;
> +			sf = seccomp_img_entry->seccomp_filters[sf->prev];
> +		}
> +	}
> +
> +	free_seccomp_filters();
> +	return 0;
>  }
> -- 
> 2.14.3
>
Cyrill Gorcunov March 23, 2018, 7:01 a.m.
On Thu, Mar 22, 2018 at 03:13:24PM -0700, Andrey Vagin wrote:
> On Wed, Mar 21, 2018 at 12:43:04AM +0300, Cyrill Gorcunov wrote:
> > From: Cyrill Gorcunov <gorcunov@virtuozzo.com>
> > 
> > https://jira.sw.ru/browse/PSBM-78762
> > 
> > Signed-off-by: Cyrill Gorcunov <gorcunov@virtuozzo.com>
> > ---
> >  criu/cr-restore.c       |  10 ++--
> >  criu/include/restorer.h |  15 ++++-
> >  criu/include/seccomp.h  |   9 ++-
> >  criu/pie/restorer.c     | 106 +++++++++++++++++++++-------------
> >  criu/seccomp.c          | 149 ++++++++++++++++++++++++++++++++++--------------
> >  5 files changed, 195 insertions(+), 94 deletions(-)
> 
> 
> 200 lines of code!!! Pls, write what is going on here in a commit
> message.

Mmmm, actually there is not that much I can put into a message :)
Simply because I didn't do somthing new here in code but rather
move syscall from per-task to per-thread place. Maybe something like this?
---

Currently all threads are inheriting filters from the group leader
via SECCOMP_FILTER_FLAG_TSYNC but we already prepared dumper to
carry filters on per-thread basis in core*.img files. Thus to
make per-thread restore of seccomp filters we drop old
SECCOMP_FILTER_FLAG_TSYNC call and make each thread to restore
own chain from the image.