[net-next,1/3] bpf, seccomp: Add eBPF filter capabilities

Submitted by Sargun Dhillon on Feb. 13, 2018, 3:42 p.m.

Details

Message ID 20180213154255.GA3301@ircssh-2.c.rugged-nimbus-611.internal
State New
Series "Series without cover letter"
Headers show

Commit Message

Sargun Dhillon Feb. 13, 2018, 3:42 p.m.
From: Sargun Dhillon <sargun@netflix.com>

This introduces the BPF_PROG_TYPE_SECCOMP bpf program type. It is meant
to be used for seccomp filters as an alternative to cBPF filters. The
program type has relatively limited capabilities in terms of helpers,
but that can be extended later on.

It also introduces a new mechanism to attach these filters via the
prctl and seccomp syscalls -- SECCOMP_MODE_FILTER_EXTENDED, and
SECCOMP_SET_MODE_FILTER_EXTENDED respectively.

Signed-off-by: Sargun Dhillon <sargun@sargun.me>
---
 arch/Kconfig                 |   7 ++
 include/linux/bpf_types.h    |   3 +
 include/uapi/linux/bpf.h     |   2 +
 include/uapi/linux/seccomp.h |  15 +++--
 kernel/bpf/syscall.c         |   1 +
 kernel/seccomp.c             | 148 +++++++++++++++++++++++++++++++++++++------
 6 files changed, 150 insertions(+), 26 deletions(-)

Patch hide | download patch | download mbox

diff --git a/arch/Kconfig b/arch/Kconfig
index 76c0b54443b1..db675888577c 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -401,6 +401,13 @@  config SECCOMP_FILTER
 
 	  See Documentation/prctl/seccomp_filter.txt for details.
 
+config SECCOMP_FILTER_EXTENDED
+	bool "Extended BPF seccomp filters"
+	depends on SECCOMP_FILTER && BPF_SYSCALL
+	help
+	  Enables seccomp filters to be written in eBPF, as opposed
+	  to just cBPF filters.
+
 config HAVE_GCC_PLUGINS
 	bool
 	help
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 19b8349a3809..945c65c4e461 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -22,6 +22,9 @@  BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
 #ifdef CONFIG_CGROUP_BPF
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
 #endif
+#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
+BPF_PROG_TYPE(BPF_PROG_TYPE_SECCOMP, seccomp)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index db6bdc375126..5f96cb7ed954 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1,3 +1,4 @@ 
+
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
  *
@@ -133,6 +134,7 @@  enum bpf_prog_type {
 	BPF_PROG_TYPE_SOCK_OPS,
 	BPF_PROG_TYPE_SK_SKB,
 	BPF_PROG_TYPE_CGROUP_DEVICE,
+	BPF_PROG_TYPE_SECCOMP,
 };
 
 enum bpf_attach_type {
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 2a0bd9dd104d..7da8b39f2a6a 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -7,14 +7,17 @@ 
 
 
 /* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, <mode>) */
-#define SECCOMP_MODE_DISABLED	0 /* seccomp is not in use. */
-#define SECCOMP_MODE_STRICT	1 /* uses hard-coded filter. */
-#define SECCOMP_MODE_FILTER	2 /* uses user-supplied filter. */
+#define SECCOMP_MODE_DISABLED		0 /* seccomp is not in use. */
+#define SECCOMP_MODE_STRICT		1 /* uses hard-coded filter. */
+#define SECCOMP_MODE_FILTER		2 /* uses user-supplied filter. */
+#define SECCOMP_MODE_FILTER_EXTENDED	3 /* uses eBPF filter from fd */
 
 /* Valid operations for seccomp syscall. */
-#define SECCOMP_SET_MODE_STRICT		0
-#define SECCOMP_SET_MODE_FILTER		1
-#define SECCOMP_GET_ACTION_AVAIL	2
+#define SECCOMP_SET_MODE_STRICT			0
+#define SECCOMP_SET_MODE_FILTER			1
+#define SECCOMP_GET_ACTION_AVAIL		2
+#define SECCOMP_SET_MODE_FILTER_EXTENDED	3
+
 
 /* Valid flags for SECCOMP_SET_MODE_FILTER */
 #define SECCOMP_FILTER_FLAG_TSYNC	1
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e24aa3241387..86d6ec8b916d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1202,6 +1202,7 @@  static int bpf_prog_load(union bpf_attr *attr)
 
 	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
 	    type != BPF_PROG_TYPE_CGROUP_SKB &&
+	    type != BPF_PROG_TYPE_SECCOMP &&
 	    !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 940fa408a288..b30dd25c1cb8 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -37,6 +37,7 @@ 
 #include <linux/security.h>
 #include <linux/tracehook.h>
 #include <linux/uaccess.h>
+#include <linux/bpf.h>
 
 /**
  * struct seccomp_filter - container for seccomp BPF programs
@@ -367,17 +368,6 @@  static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 
 	BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
 
-	/*
-	 * Installing a seccomp filter requires that the task has
-	 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
-	 * This avoids scenarios where unprivileged tasks can affect the
-	 * behavior of privileged children.
-	 */
-	if (!task_no_new_privs(current) &&
-	    security_capable_noaudit(current_cred(), current_user_ns(),
-				     CAP_SYS_ADMIN) != 0)
-		return ERR_PTR(-EACCES);
-
 	/* Allocate a new seccomp_filter */
 	sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
 	if (!sfilter)
@@ -423,6 +413,48 @@  seccomp_prepare_user_filter(const char __user *user_filter)
 	return filter;
 }
 
+#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
+/**
+ * seccomp_prepare_extended_filter - prepares a user-supplied eBPF fd
+ * @user_filter: pointer to the user data containing an fd.
+ *
+ * Returns 0 on success and non-zero otherwise.
+ */
+static struct seccomp_filter *
+seccomp_prepare_extended_filter(const char __user *user_fd)
+{
+	struct seccomp_filter *sfilter;
+	struct bpf_prog *fp;
+	int fd;
+
+	/* Fetch the fd from userspace */
+	if (get_user(fd, (int __user *)user_fd))
+		return ERR_PTR(-EFAULT);
+
+	/* Allocate a new seccomp_filter */
+	sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
+	if (!sfilter)
+		return ERR_PTR(-ENOMEM);
+
+	fp = bpf_prog_get_type(fd, BPF_PROG_TYPE_SECCOMP);
+	if (IS_ERR(fp)) {
+		kfree(sfilter);
+		return ERR_CAST(fp);
+	}
+
+	sfilter->prog = fp;
+	refcount_set(&sfilter->usage, 1);
+
+	return sfilter;
+}
+#else
+static struct seccomp_filter *
+seccomp_prepare_extended_filter(const char __user *filter_fd)
+{
+	return ERR_PTR(-EINVAL);
+}
+#endif
+
 /**
  * seccomp_attach_filter: validate and attach filter
  * @flags:  flags to change filter behavior
@@ -492,7 +524,10 @@  void get_seccomp_filter(struct task_struct *tsk)
 static inline void seccomp_filter_free(struct seccomp_filter *filter)
 {
 	if (filter) {
-		bpf_prog_destroy(filter->prog);
+		if (bpf_prog_was_classic(filter->prog))
+			bpf_prog_destroy(filter->prog);
+		else
+			bpf_prog_put(filter->prog);
 		kfree(filter);
 	}
 }
@@ -842,18 +877,36 @@  static long seccomp_set_mode_strict(void)
  * Returns 0 on success or -EINVAL on failure.
  */
 static long seccomp_set_mode_filter(unsigned int flags,
-				    const char __user *filter)
+				    const char __user *filter,
+				    unsigned long filter_type)
 {
-	const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
+	/* We use SECCOMP_MODE_FILTER for both eBPF and cBPF filters */
+	const unsigned long filter_mode = SECCOMP_MODE_FILTER;
 	struct seccomp_filter *prepared = NULL;
 	long ret = -EINVAL;
 
 	/* Validate flags. */
 	if (flags & ~SECCOMP_FILTER_FLAG_MASK)
 		return -EINVAL;
+	/*
+	 * Installing a seccomp filter requires that the task has
+	 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
+	 * This avoids scenarios where unprivileged tasks can affect the
+	 * behavior of privileged children.
+	 */
+	if (!task_no_new_privs(current) &&
+	    security_capable_noaudit(current_cred(), current_user_ns(),
+				     CAP_SYS_ADMIN) != 0)
+		return -EACCES;
 
 	/* Prepare the new filter before holding any locks. */
-	prepared = seccomp_prepare_user_filter(filter);
+	if (filter_type == SECCOMP_SET_MODE_FILTER_EXTENDED)
+		prepared = seccomp_prepare_extended_filter(filter);
+	else if (filter_type == SECCOMP_SET_MODE_FILTER)
+		prepared = seccomp_prepare_user_filter(filter);
+	else
+		return -EINVAL;
+
 	if (IS_ERR(prepared))
 		return PTR_ERR(prepared);
 
@@ -867,7 +920,7 @@  static long seccomp_set_mode_filter(unsigned int flags,
 
 	spin_lock_irq(&current->sighand->siglock);
 
-	if (!seccomp_may_assign_mode(seccomp_mode))
+	if (!seccomp_may_assign_mode(filter_mode))
 		goto out;
 
 	ret = seccomp_attach_filter(flags, prepared);
@@ -876,7 +929,7 @@  static long seccomp_set_mode_filter(unsigned int flags,
 	/* Do not free the successfully attached filter. */
 	prepared = NULL;
 
-	seccomp_assign_mode(current, seccomp_mode);
+	seccomp_assign_mode(current, filter_mode);
 out:
 	spin_unlock_irq(&current->sighand->siglock);
 	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
@@ -926,7 +979,9 @@  static long do_seccomp(unsigned int op, unsigned int flags,
 			return -EINVAL;
 		return seccomp_set_mode_strict();
 	case SECCOMP_SET_MODE_FILTER:
-		return seccomp_set_mode_filter(flags, uargs);
+		return seccomp_set_mode_filter(flags, uargs, op);
+	case SECCOMP_SET_MODE_FILTER_EXTENDED:
+		return seccomp_set_mode_filter(flags, uargs, op);
 	case SECCOMP_GET_ACTION_AVAIL:
 		if (flags != 0)
 			return -EINVAL;
@@ -969,6 +1024,10 @@  long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
 		op = SECCOMP_SET_MODE_FILTER;
 		uargs = filter;
 		break;
+	case SECCOMP_MODE_FILTER_EXTENDED:
+		op = SECCOMP_SET_MODE_FILTER_EXTENDED;
+		uargs = filter;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -1040,8 +1099,7 @@  long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
 	if (IS_ERR(filter))
 		return PTR_ERR(filter);
 
-	fprog = filter->prog->orig_prog;
-	if (!fprog) {
+	if (!bpf_prog_was_classic(filter->prog)) {
 		/* This must be a new non-cBPF filter, since we save
 		 * every cBPF filter's orig_prog above when
 		 * CONFIG_CHECKPOINT_RESTORE is enabled.
@@ -1050,6 +1108,7 @@  long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
 		goto out;
 	}
 
+	fprog = filter->prog->orig_prog;
 	ret = fprog->len;
 	if (!data)
 		goto out;
@@ -1239,6 +1298,55 @@  static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
 	return 0;
 }
 
+#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
+static bool seccomp_is_valid_access(int off, int size,
+				    enum bpf_access_type type,
+				    struct bpf_insn_access_aux *info)
+{
+	if (type != BPF_READ)
+		return false;
+
+	if (off < 0 || off + size > sizeof(struct seccomp_data))
+		return false;
+
+	switch (off) {
+	case bpf_ctx_range_till(struct seccomp_data, args[0], args[5]):
+		return (size == sizeof(__u64));
+	case bpf_ctx_range(struct seccomp_data, nr):
+		return (size == FIELD_SIZEOF(struct seccomp_data, nr));
+	case bpf_ctx_range(struct seccomp_data, arch):
+		return (size == FIELD_SIZEOF(struct seccomp_data, arch));
+	case bpf_ctx_range(struct seccomp_data, instruction_pointer):
+		return (size == FIELD_SIZEOF(struct seccomp_data,
+					     instruction_pointer));
+	}
+
+	return false;
+}
+
+static const struct bpf_func_proto *
+seccomp_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_get_current_uid_gid:
+		return &bpf_get_current_uid_gid_proto;
+	case BPF_FUNC_trace_printk:
+		if (capable(CAP_SYS_ADMIN))
+			return bpf_get_trace_printk_proto();
+	default:
+		return NULL;
+	}
+}
+
+const struct bpf_prog_ops seccomp_prog_ops = {
+};
+
+const struct bpf_verifier_ops seccomp_verifier_ops = {
+	.get_func_proto		= seccomp_func_proto,
+	.is_valid_access	= seccomp_is_valid_access,
+};
+#endif /* CONFIG_SECCOMP_FILTER_EXTENDED */
+
 static struct ctl_path seccomp_sysctl_path[] = {
 	{ .procname = "kernel", },
 	{ .procname = "seccomp", },

Comments

Kees Cook Feb. 13, 2018, 8:34 p.m.
On Tue, Feb 13, 2018 at 7:42 AM, Sargun Dhillon <sargun@sargun.me> wrote:
> From: Sargun Dhillon <sargun@netflix.com>
>
> This introduces the BPF_PROG_TYPE_SECCOMP bpf program type. It is meant
> to be used for seccomp filters as an alternative to cBPF filters. The
> program type has relatively limited capabilities in terms of helpers,
> but that can be extended later on.
>
> It also introduces a new mechanism to attach these filters via the
> prctl and seccomp syscalls -- SECCOMP_MODE_FILTER_EXTENDED, and
> SECCOMP_SET_MODE_FILTER_EXTENDED respectively.
>
> Signed-off-by: Sargun Dhillon <sargun@sargun.me>
> ---
>  arch/Kconfig                 |   7 ++
>  include/linux/bpf_types.h    |   3 +
>  include/uapi/linux/bpf.h     |   2 +
>  include/uapi/linux/seccomp.h |  15 +++--
>  kernel/bpf/syscall.c         |   1 +
>  kernel/seccomp.c             | 148 +++++++++++++++++++++++++++++++++++++------
>  6 files changed, 150 insertions(+), 26 deletions(-)
>
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 76c0b54443b1..db675888577c 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -401,6 +401,13 @@ config SECCOMP_FILTER
>
>           See Documentation/prctl/seccomp_filter.txt for details.
>
> +config SECCOMP_FILTER_EXTENDED
> +       bool "Extended BPF seccomp filters"
> +       depends on SECCOMP_FILTER && BPF_SYSCALL
> +       help
> +         Enables seccomp filters to be written in eBPF, as opposed
> +         to just cBPF filters.
> +
>  config HAVE_GCC_PLUGINS
>         bool
>         help
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index 19b8349a3809..945c65c4e461 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -22,6 +22,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
>  #ifdef CONFIG_CGROUP_BPF
>  BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
>  #endif
> +#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
> +BPF_PROG_TYPE(BPF_PROG_TYPE_SECCOMP, seccomp)
> +#endif
>
>  BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
>  BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index db6bdc375126..5f96cb7ed954 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1,3 +1,4 @@
> +
>  /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
>  /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
>   *
> @@ -133,6 +134,7 @@ enum bpf_prog_type {
>         BPF_PROG_TYPE_SOCK_OPS,
>         BPF_PROG_TYPE_SK_SKB,
>         BPF_PROG_TYPE_CGROUP_DEVICE,
> +       BPF_PROG_TYPE_SECCOMP,
>  };
>
>  enum bpf_attach_type {
> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> index 2a0bd9dd104d..7da8b39f2a6a 100644
> --- a/include/uapi/linux/seccomp.h
> +++ b/include/uapi/linux/seccomp.h
> @@ -7,14 +7,17 @@
>
>
>  /* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, <mode>) */
> -#define SECCOMP_MODE_DISABLED  0 /* seccomp is not in use. */
> -#define SECCOMP_MODE_STRICT    1 /* uses hard-coded filter. */
> -#define SECCOMP_MODE_FILTER    2 /* uses user-supplied filter. */
> +#define SECCOMP_MODE_DISABLED          0 /* seccomp is not in use. */
> +#define SECCOMP_MODE_STRICT            1 /* uses hard-coded filter. */
> +#define SECCOMP_MODE_FILTER            2 /* uses user-supplied filter. */
> +#define SECCOMP_MODE_FILTER_EXTENDED   3 /* uses eBPF filter from fd */

This MODE flag isn't needed: it's still using a filter, and the
interface changes should be sufficient with
SECCOMP_SET_MODE_FILTER_EXTENDED below.

>  /* Valid operations for seccomp syscall. */
> -#define SECCOMP_SET_MODE_STRICT                0
> -#define SECCOMP_SET_MODE_FILTER                1
> -#define SECCOMP_GET_ACTION_AVAIL       2
> +#define SECCOMP_SET_MODE_STRICT                        0
> +#define SECCOMP_SET_MODE_FILTER                        1
> +#define SECCOMP_GET_ACTION_AVAIL               2
> +#define SECCOMP_SET_MODE_FILTER_EXTENDED       3

It seems like this should be a FILTER flag, not a syscall op change?

> +
>
>  /* Valid flags for SECCOMP_SET_MODE_FILTER */
>  #define SECCOMP_FILTER_FLAG_TSYNC      1
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index e24aa3241387..86d6ec8b916d 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1202,6 +1202,7 @@ static int bpf_prog_load(union bpf_attr *attr)
>
>         if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
>             type != BPF_PROG_TYPE_CGROUP_SKB &&
> +           type != BPF_PROG_TYPE_SECCOMP &&
>             !capable(CAP_SYS_ADMIN))
>                 return -EPERM;

So only init_ns-CAP_SYS_ADMIN would be able to use seccomp eBPF?

> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 940fa408a288..b30dd25c1cb8 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -37,6 +37,7 @@
>  #include <linux/security.h>
>  #include <linux/tracehook.h>
>  #include <linux/uaccess.h>
> +#include <linux/bpf.h>
>
>  /**
>   * struct seccomp_filter - container for seccomp BPF programs
> @@ -367,17 +368,6 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
>
>         BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
>
> -       /*
> -        * Installing a seccomp filter requires that the task has
> -        * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
> -        * This avoids scenarios where unprivileged tasks can affect the
> -        * behavior of privileged children.
> -        */
> -       if (!task_no_new_privs(current) &&
> -           security_capable_noaudit(current_cred(), current_user_ns(),
> -                                    CAP_SYS_ADMIN) != 0)
> -               return ERR_PTR(-EACCES);
> -
>         /* Allocate a new seccomp_filter */
>         sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
>         if (!sfilter)
> @@ -423,6 +413,48 @@ seccomp_prepare_user_filter(const char __user *user_filter)
>         return filter;
>  }
>
> +#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
> +/**
> + * seccomp_prepare_extended_filter - prepares a user-supplied eBPF fd
> + * @user_filter: pointer to the user data containing an fd.
> + *
> + * Returns 0 on success and non-zero otherwise.
> + */
> +static struct seccomp_filter *
> +seccomp_prepare_extended_filter(const char __user *user_fd)
> +{
> +       struct seccomp_filter *sfilter;
> +       struct bpf_prog *fp;
> +       int fd;
> +
> +       /* Fetch the fd from userspace */
> +       if (get_user(fd, (int __user *)user_fd))
> +               return ERR_PTR(-EFAULT);
> +
> +       /* Allocate a new seccomp_filter */
> +       sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
> +       if (!sfilter)
> +               return ERR_PTR(-ENOMEM);
> +
> +       fp = bpf_prog_get_type(fd, BPF_PROG_TYPE_SECCOMP);
> +       if (IS_ERR(fp)) {
> +               kfree(sfilter);
> +               return ERR_CAST(fp);
> +       }
> +
> +       sfilter->prog = fp;
> +       refcount_set(&sfilter->usage, 1);
> +
> +       return sfilter;
> +}
> +#else
> +static struct seccomp_filter *
> +seccomp_prepare_extended_filter(const char __user *filter_fd)
> +{
> +       return ERR_PTR(-EINVAL);
> +}
> +#endif
> +
>  /**
>   * seccomp_attach_filter: validate and attach filter
>   * @flags:  flags to change filter behavior
> @@ -492,7 +524,10 @@ void get_seccomp_filter(struct task_struct *tsk)
>  static inline void seccomp_filter_free(struct seccomp_filter *filter)
>  {
>         if (filter) {
> -               bpf_prog_destroy(filter->prog);
> +               if (bpf_prog_was_classic(filter->prog))
> +                       bpf_prog_destroy(filter->prog);
> +               else
> +                       bpf_prog_put(filter->prog);
>                 kfree(filter);
>         }
>  }
> @@ -842,18 +877,36 @@ static long seccomp_set_mode_strict(void)
>   * Returns 0 on success or -EINVAL on failure.
>   */
>  static long seccomp_set_mode_filter(unsigned int flags,
> -                                   const char __user *filter)
> +                                   const char __user *filter,
> +                                   unsigned long filter_type)

I think this can just live in flags?

>  {
> -       const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
> +       /* We use SECCOMP_MODE_FILTER for both eBPF and cBPF filters */
> +       const unsigned long filter_mode = SECCOMP_MODE_FILTER;
>         struct seccomp_filter *prepared = NULL;
>         long ret = -EINVAL;
>
>         /* Validate flags. */
>         if (flags & ~SECCOMP_FILTER_FLAG_MASK)
>                 return -EINVAL;
> +       /*
> +        * Installing a seccomp filter requires that the task has
> +        * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
> +        * This avoids scenarios where unprivileged tasks can affect the
> +        * behavior of privileged children.
> +        */
> +       if (!task_no_new_privs(current) &&
> +           security_capable_noaudit(current_cred(), current_user_ns(),
> +                                    CAP_SYS_ADMIN) != 0)
> +               return -EACCES;

This changes the order of checks -- before, too-large filters would
get EINVAL even if they lacked the needed permissions. As long as this
doesn't break anything in the real world, it should be fine, but I
might want to instead create a perm-check function and just call it in
both functions. (And likely write a self-test that checks this order,
if it doesn't already exist.)

>
>         /* Prepare the new filter before holding any locks. */
> -       prepared = seccomp_prepare_user_filter(filter);
> +       if (filter_type == SECCOMP_SET_MODE_FILTER_EXTENDED)
> +               prepared = seccomp_prepare_extended_filter(filter);
> +       else if (filter_type == SECCOMP_SET_MODE_FILTER)
> +               prepared = seccomp_prepare_user_filter(filter);
> +       else
> +               return -EINVAL;
> +
>         if (IS_ERR(prepared))
>                 return PTR_ERR(prepared);
>
> @@ -867,7 +920,7 @@ static long seccomp_set_mode_filter(unsigned int flags,
>
>         spin_lock_irq(&current->sighand->siglock);
>
> -       if (!seccomp_may_assign_mode(seccomp_mode))
> +       if (!seccomp_may_assign_mode(filter_mode))
>                 goto out;
>
>         ret = seccomp_attach_filter(flags, prepared);
> @@ -876,7 +929,7 @@ static long seccomp_set_mode_filter(unsigned int flags,
>         /* Do not free the successfully attached filter. */
>         prepared = NULL;
>
> -       seccomp_assign_mode(current, seccomp_mode);
> +       seccomp_assign_mode(current, filter_mode);

With a filter flag, the above hunks don't need to be changed, for example.

>  out:
>         spin_unlock_irq(&current->sighand->siglock);
>         if (flags & SECCOMP_FILTER_FLAG_TSYNC)
> @@ -926,7 +979,9 @@ static long do_seccomp(unsigned int op, unsigned int flags,
>                         return -EINVAL;
>                 return seccomp_set_mode_strict();
>         case SECCOMP_SET_MODE_FILTER:
> -               return seccomp_set_mode_filter(flags, uargs);
> +               return seccomp_set_mode_filter(flags, uargs, op);
> +       case SECCOMP_SET_MODE_FILTER_EXTENDED:
> +               return seccomp_set_mode_filter(flags, uargs, op);

And this isn't needed, since it would be passed as a flag.

>         case SECCOMP_GET_ACTION_AVAIL:
>                 if (flags != 0)
>                         return -EINVAL;
> @@ -969,6 +1024,10 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
>                 op = SECCOMP_SET_MODE_FILTER;
>                 uargs = filter;
>                 break;
> +       case SECCOMP_MODE_FILTER_EXTENDED:
> +               op = SECCOMP_SET_MODE_FILTER_EXTENDED;
> +               uargs = filter;
> +               break;

Same.

>         default:
>                 return -EINVAL;
>         }
> @@ -1040,8 +1099,7 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
>         if (IS_ERR(filter))
>                 return PTR_ERR(filter);
>
> -       fprog = filter->prog->orig_prog;
> -       if (!fprog) {
> +       if (!bpf_prog_was_classic(filter->prog)) {
>                 /* This must be a new non-cBPF filter, since we save
>                  * every cBPF filter's orig_prog above when
>                  * CONFIG_CHECKPOINT_RESTORE is enabled.
> @@ -1050,6 +1108,7 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
>                 goto out;
>         }
>
> +       fprog = filter->prog->orig_prog;

I wonder if it would be easier to review to split eBPF install from
the eBPF "get filter" changes as separate patches?

>         ret = fprog->len;
>         if (!data)
>                 goto out;
> @@ -1239,6 +1298,55 @@ static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
>         return 0;
>  }
>
> +#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
> +static bool seccomp_is_valid_access(int off, int size,
> +                                   enum bpf_access_type type,
> +                                   struct bpf_insn_access_aux *info)
> +{
> +       if (type != BPF_READ)
> +               return false;
> +
> +       if (off < 0 || off + size > sizeof(struct seccomp_data))
> +               return false;
> +
> +       switch (off) {
> +       case bpf_ctx_range_till(struct seccomp_data, args[0], args[5]):
> +               return (size == sizeof(__u64));
> +       case bpf_ctx_range(struct seccomp_data, nr):
> +               return (size == FIELD_SIZEOF(struct seccomp_data, nr));
> +       case bpf_ctx_range(struct seccomp_data, arch):
> +               return (size == FIELD_SIZEOF(struct seccomp_data, arch));
> +       case bpf_ctx_range(struct seccomp_data, instruction_pointer):
> +               return (size == FIELD_SIZEOF(struct seccomp_data,
> +                                            instruction_pointer));
> +       }
> +
> +       return false;
> +}
> +
> +static const struct bpf_func_proto *
> +seccomp_func_proto(enum bpf_func_id func_id)
> +{
> +       switch (func_id) {
> +       case BPF_FUNC_get_current_uid_gid:
> +               return &bpf_get_current_uid_gid_proto;
> +       case BPF_FUNC_trace_printk:
> +               if (capable(CAP_SYS_ADMIN))
> +                       return bpf_get_trace_printk_proto();
> +       default:
> +               return NULL;
> +       }
> +}

This makes me so uncomfortable. :) Why is uid/gid needed? Why add
printk support here? (And why is it CAP_SYS_ADMIN checked if the
entire filter is CAP_SYS_ADMIN checked before being attached?)

> +
> +const struct bpf_prog_ops seccomp_prog_ops = {
> +};
> +
> +const struct bpf_verifier_ops seccomp_verifier_ops = {
> +       .get_func_proto         = seccomp_func_proto,
> +       .is_valid_access        = seccomp_is_valid_access,
> +};
> +#endif /* CONFIG_SECCOMP_FILTER_EXTENDED */
> +
>  static struct ctl_path seccomp_sysctl_path[] = {
>         { .procname = "kernel", },
>         { .procname = "seccomp", },
> --
> 2.14.1
>

-Kees
Sargun Dhillon Feb. 17, 2018, 6:29 a.m.
On Tue, Feb 13, 2018 at 12:34 PM, Kees Cook <keescook@chromium.org> wrote:
> On Tue, Feb 13, 2018 at 7:42 AM, Sargun Dhillon <sargun@sargun.me> wrote:
>> From: Sargun Dhillon <sargun@netflix.com>
>>
>> This introduces the BPF_PROG_TYPE_SECCOMP bpf program type. It is meant
>> to be used for seccomp filters as an alternative to cBPF filters. The
>> program type has relatively limited capabilities in terms of helpers,
>> but that can be extended later on.
>>
>> It also introduces a new mechanism to attach these filters via the
>> prctl and seccomp syscalls -- SECCOMP_MODE_FILTER_EXTENDED, and
>> SECCOMP_SET_MODE_FILTER_EXTENDED respectively.
>>
>> Signed-off-by: Sargun Dhillon <sargun@sargun.me>
>> ---
>>  arch/Kconfig                 |   7 ++
>>  include/linux/bpf_types.h    |   3 +
>>  include/uapi/linux/bpf.h     |   2 +
>>  include/uapi/linux/seccomp.h |  15 +++--
>>  kernel/bpf/syscall.c         |   1 +
>>  kernel/seccomp.c             | 148 +++++++++++++++++++++++++++++++++++++------
>>  6 files changed, 150 insertions(+), 26 deletions(-)
>>
>> diff --git a/arch/Kconfig b/arch/Kconfig
>> index 76c0b54443b1..db675888577c 100644
>> --- a/arch/Kconfig
>> +++ b/arch/Kconfig
>> @@ -401,6 +401,13 @@ config SECCOMP_FILTER
>>
>>           See Documentation/prctl/seccomp_filter.txt for details.
>>
>> +config SECCOMP_FILTER_EXTENDED
>> +       bool "Extended BPF seccomp filters"
>> +       depends on SECCOMP_FILTER && BPF_SYSCALL
>> +       help
>> +         Enables seccomp filters to be written in eBPF, as opposed
>> +         to just cBPF filters.
>> +
>>  config HAVE_GCC_PLUGINS
>>         bool
>>         help
>> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
>> index 19b8349a3809..945c65c4e461 100644
>> --- a/include/linux/bpf_types.h
>> +++ b/include/linux/bpf_types.h
>> @@ -22,6 +22,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
>>  #ifdef CONFIG_CGROUP_BPF
>>  BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
>>  #endif
>> +#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
>> +BPF_PROG_TYPE(BPF_PROG_TYPE_SECCOMP, seccomp)
>> +#endif
>>
>>  BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
>>  BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index db6bdc375126..5f96cb7ed954 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -1,3 +1,4 @@
>> +
>>  /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
>>  /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
>>   *
>> @@ -133,6 +134,7 @@ enum bpf_prog_type {
>>         BPF_PROG_TYPE_SOCK_OPS,
>>         BPF_PROG_TYPE_SK_SKB,
>>         BPF_PROG_TYPE_CGROUP_DEVICE,
>> +       BPF_PROG_TYPE_SECCOMP,
>>  };
>>
>>  enum bpf_attach_type {
>> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
>> index 2a0bd9dd104d..7da8b39f2a6a 100644
>> --- a/include/uapi/linux/seccomp.h
>> +++ b/include/uapi/linux/seccomp.h
>> @@ -7,14 +7,17 @@
>>
>>
>>  /* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, <mode>) */
>> -#define SECCOMP_MODE_DISABLED  0 /* seccomp is not in use. */
>> -#define SECCOMP_MODE_STRICT    1 /* uses hard-coded filter. */
>> -#define SECCOMP_MODE_FILTER    2 /* uses user-supplied filter. */
>> +#define SECCOMP_MODE_DISABLED          0 /* seccomp is not in use. */
>> +#define SECCOMP_MODE_STRICT            1 /* uses hard-coded filter. */
>> +#define SECCOMP_MODE_FILTER            2 /* uses user-supplied filter. */
>> +#define SECCOMP_MODE_FILTER_EXTENDED   3 /* uses eBPF filter from fd */
>
> This MODE flag isn't needed: it's still using a filter, and the
> interface changes should be sufficient with
> SECCOMP_SET_MODE_FILTER_EXTENDED below.
>
>>  /* Valid operations for seccomp syscall. */
>> -#define SECCOMP_SET_MODE_STRICT                0
>> -#define SECCOMP_SET_MODE_FILTER                1
>> -#define SECCOMP_GET_ACTION_AVAIL       2
>> +#define SECCOMP_SET_MODE_STRICT                        0
>> +#define SECCOMP_SET_MODE_FILTER                        1
>> +#define SECCOMP_GET_ACTION_AVAIL               2
>> +#define SECCOMP_SET_MODE_FILTER_EXTENDED       3
>
> It seems like this should be a FILTER flag, not a syscall op change?
>
>> +
>>
>>  /* Valid flags for SECCOMP_SET_MODE_FILTER */
>>  #define SECCOMP_FILTER_FLAG_TSYNC      1
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index e24aa3241387..86d6ec8b916d 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -1202,6 +1202,7 @@ static int bpf_prog_load(union bpf_attr *attr)
>>
>>         if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
>>             type != BPF_PROG_TYPE_CGROUP_SKB &&
>> +           type != BPF_PROG_TYPE_SECCOMP &&
>>             !capable(CAP_SYS_ADMIN))
>>                 return -EPERM;
>
> So only init_ns-CAP_SYS_ADMIN would be able to use seccomp eBPF?
>
No, this is specifically so non-init CAP_SYS_ADMIN cal load BPF
filters that are either socket_filter, cgroup_skb, or seccomp.

>> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
>> index 940fa408a288..b30dd25c1cb8 100644
>> --- a/kernel/seccomp.c
>> +++ b/kernel/seccomp.c
>> @@ -37,6 +37,7 @@
>>  #include <linux/security.h>
>>  #include <linux/tracehook.h>
>>  #include <linux/uaccess.h>
>> +#include <linux/bpf.h>
>>
>>  /**
>>   * struct seccomp_filter - container for seccomp BPF programs
>> @@ -367,17 +368,6 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
>>
>>         BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
>>
>> -       /*
>> -        * Installing a seccomp filter requires that the task has
>> -        * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
>> -        * This avoids scenarios where unprivileged tasks can affect the
>> -        * behavior of privileged children.
>> -        */
>> -       if (!task_no_new_privs(current) &&
>> -           security_capable_noaudit(current_cred(), current_user_ns(),
>> -                                    CAP_SYS_ADMIN) != 0)
>> -               return ERR_PTR(-EACCES);
>> -
>>         /* Allocate a new seccomp_filter */
>>         sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
>>         if (!sfilter)
>> @@ -423,6 +413,48 @@ seccomp_prepare_user_filter(const char __user *user_filter)
>>         return filter;
>>  }
>>
>> +#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
>> +/**
>> + * seccomp_prepare_extended_filter - prepares a user-supplied eBPF fd
>> + * @user_filter: pointer to the user data containing an fd.
>> + *
>> + * Returns 0 on success and non-zero otherwise.
>> + */
>> +static struct seccomp_filter *
>> +seccomp_prepare_extended_filter(const char __user *user_fd)
>> +{
>> +       struct seccomp_filter *sfilter;
>> +       struct bpf_prog *fp;
>> +       int fd;
>> +
>> +       /* Fetch the fd from userspace */
>> +       if (get_user(fd, (int __user *)user_fd))
>> +               return ERR_PTR(-EFAULT);
>> +
>> +       /* Allocate a new seccomp_filter */
>> +       sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
>> +       if (!sfilter)
>> +               return ERR_PTR(-ENOMEM);
>> +
>> +       fp = bpf_prog_get_type(fd, BPF_PROG_TYPE_SECCOMP);
>> +       if (IS_ERR(fp)) {
>> +               kfree(sfilter);
>> +               return ERR_CAST(fp);
>> +       }
>> +
>> +       sfilter->prog = fp;
>> +       refcount_set(&sfilter->usage, 1);
>> +
>> +       return sfilter;
>> +}
>> +#else
>> +static struct seccomp_filter *
>> +seccomp_prepare_extended_filter(const char __user *filter_fd)
>> +{
>> +       return ERR_PTR(-EINVAL);
>> +}
>> +#endif
>> +
>>  /**
>>   * seccomp_attach_filter: validate and attach filter
>>   * @flags:  flags to change filter behavior
>> @@ -492,7 +524,10 @@ void get_seccomp_filter(struct task_struct *tsk)
>>  static inline void seccomp_filter_free(struct seccomp_filter *filter)
>>  {
>>         if (filter) {
>> -               bpf_prog_destroy(filter->prog);
>> +               if (bpf_prog_was_classic(filter->prog))
>> +                       bpf_prog_destroy(filter->prog);
>> +               else
>> +                       bpf_prog_put(filter->prog);
>>                 kfree(filter);
>>         }
>>  }
>> @@ -842,18 +877,36 @@ static long seccomp_set_mode_strict(void)
>>   * Returns 0 on success or -EINVAL on failure.
>>   */
>>  static long seccomp_set_mode_filter(unsigned int flags,
>> -                                   const char __user *filter)
>> +                                   const char __user *filter,
>> +                                   unsigned long filter_type)
>
> I think this can just live in flags?
>
>>  {
>> -       const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
>> +       /* We use SECCOMP_MODE_FILTER for both eBPF and cBPF filters */
>> +       const unsigned long filter_mode = SECCOMP_MODE_FILTER;
>>         struct seccomp_filter *prepared = NULL;
>>         long ret = -EINVAL;
>>
>>         /* Validate flags. */
>>         if (flags & ~SECCOMP_FILTER_FLAG_MASK)
>>                 return -EINVAL;
>> +       /*
>> +        * Installing a seccomp filter requires that the task has
>> +        * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
>> +        * This avoids scenarios where unprivileged tasks can affect the
>> +        * behavior of privileged children.
>> +        */
>> +       if (!task_no_new_privs(current) &&
>> +           security_capable_noaudit(current_cred(), current_user_ns(),
>> +                                    CAP_SYS_ADMIN) != 0)
>> +               return -EACCES;
>
> This changes the order of checks -- before, too-large filters would
> get EINVAL even if they lacked the needed permissions. As long as this
> doesn't break anything in the real world, it should be fine, but I
> might want to instead create a perm-check function and just call it in
> both functions. (And likely write a self-test that checks this order,
> if it doesn't already exist.)
>
>>
>>         /* Prepare the new filter before holding any locks. */
>> -       prepared = seccomp_prepare_user_filter(filter);
>> +       if (filter_type == SECCOMP_SET_MODE_FILTER_EXTENDED)
>> +               prepared = seccomp_prepare_extended_filter(filter);
>> +       else if (filter_type == SECCOMP_SET_MODE_FILTER)
>> +               prepared = seccomp_prepare_user_filter(filter);
>> +       else
>> +               return -EINVAL;
>> +
>>         if (IS_ERR(prepared))
>>                 return PTR_ERR(prepared);
>>
>> @@ -867,7 +920,7 @@ static long seccomp_set_mode_filter(unsigned int flags,
>>
>>         spin_lock_irq(&current->sighand->siglock);
>>
>> -       if (!seccomp_may_assign_mode(seccomp_mode))
>> +       if (!seccomp_may_assign_mode(filter_mode))
>>                 goto out;
>>
>>         ret = seccomp_attach_filter(flags, prepared);
>> @@ -876,7 +929,7 @@ static long seccomp_set_mode_filter(unsigned int flags,
>>         /* Do not free the successfully attached filter. */
>>         prepared = NULL;
>>
>> -       seccomp_assign_mode(current, seccomp_mode);
>> +       seccomp_assign_mode(current, filter_mode);
>
> With a filter flag, the above hunks don't need to be changed, for example.
>
>>  out:
>>         spin_unlock_irq(&current->sighand->siglock);
>>         if (flags & SECCOMP_FILTER_FLAG_TSYNC)
>> @@ -926,7 +979,9 @@ static long do_seccomp(unsigned int op, unsigned int flags,
>>                         return -EINVAL;
>>                 return seccomp_set_mode_strict();
>>         case SECCOMP_SET_MODE_FILTER:
>> -               return seccomp_set_mode_filter(flags, uargs);
>> +               return seccomp_set_mode_filter(flags, uargs, op);
>> +       case SECCOMP_SET_MODE_FILTER_EXTENDED:
>> +               return seccomp_set_mode_filter(flags, uargs, op);
>
> And this isn't needed, since it would be passed as a flag.
>
>>         case SECCOMP_GET_ACTION_AVAIL:
>>                 if (flags != 0)
>>                         return -EINVAL;
>> @@ -969,6 +1024,10 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
>>                 op = SECCOMP_SET_MODE_FILTER;
>>                 uargs = filter;
>>                 break;
>> +       case SECCOMP_MODE_FILTER_EXTENDED:
>> +               op = SECCOMP_SET_MODE_FILTER_EXTENDED;
>> +               uargs = filter;
>> +               break;
>
> Same.
>
>>         default:
>>                 return -EINVAL;
>>         }
>> @@ -1040,8 +1099,7 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
>>         if (IS_ERR(filter))
>>                 return PTR_ERR(filter);
>>
>> -       fprog = filter->prog->orig_prog;
>> -       if (!fprog) {
>> +       if (!bpf_prog_was_classic(filter->prog)) {
>>                 /* This must be a new non-cBPF filter, since we save
>>                  * every cBPF filter's orig_prog above when
>>                  * CONFIG_CHECKPOINT_RESTORE is enabled.
>> @@ -1050,6 +1108,7 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
>>                 goto out;
>>         }
>>
>> +       fprog = filter->prog->orig_prog;
>
> I wonder if it would be easier to review to split eBPF install from
> the eBPF "get filter" changes as separate patches?
>
Yes, will respin. Thanks for your feedback. I appreciate the quick review.

>>         ret = fprog->len;
>>         if (!data)
>>                 goto out;
>> @@ -1239,6 +1298,55 @@ static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
>>         return 0;
>>  }
>>
>> +#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
>> +static bool seccomp_is_valid_access(int off, int size,
>> +                                   enum bpf_access_type type,
>> +                                   struct bpf_insn_access_aux *info)
>> +{
>> +       if (type != BPF_READ)
>> +               return false;
>> +
>> +       if (off < 0 || off + size > sizeof(struct seccomp_data))
>> +               return false;
>> +
>> +       switch (off) {
>> +       case bpf_ctx_range_till(struct seccomp_data, args[0], args[5]):
>> +               return (size == sizeof(__u64));
>> +       case bpf_ctx_range(struct seccomp_data, nr):
>> +               return (size == FIELD_SIZEOF(struct seccomp_data, nr));
>> +       case bpf_ctx_range(struct seccomp_data, arch):
>> +               return (size == FIELD_SIZEOF(struct seccomp_data, arch));
>> +       case bpf_ctx_range(struct seccomp_data, instruction_pointer):
>> +               return (size == FIELD_SIZEOF(struct seccomp_data,
>> +                                            instruction_pointer));
>> +       }
>> +
>> +       return false;
>> +}
>> +
>> +static const struct bpf_func_proto *
>> +seccomp_func_proto(enum bpf_func_id func_id)
>> +{
>> +       switch (func_id) {
>> +       case BPF_FUNC_get_current_uid_gid:
>> +               return &bpf_get_current_uid_gid_proto;
>> +       case BPF_FUNC_trace_printk:
>> +               if (capable(CAP_SYS_ADMIN))
>> +                       return bpf_get_trace_printk_proto();
>> +       default:
>> +               return NULL;
>> +       }
>> +}
>
> This makes me so uncomfortable. :) Why is uid/gid needed? Why add
> printk support here? (And why is it CAP_SYS_ADMIN checked if the
> entire filter is CAP_SYS_ADMIN checked before being attached?)
>
See comment above. Anyone can load filters. You can load the filter as
a normal user, drop privliged, and install the filter later with
cap_sys_admin, or no_new_privs.
>> +
>> +const struct bpf_prog_ops seccomp_prog_ops = {
>> +};
>> +
>> +const struct bpf_verifier_ops seccomp_verifier_ops = {
>> +       .get_func_proto         = seccomp_func_proto,
>> +       .is_valid_access        = seccomp_is_valid_access,
>> +};
>> +#endif /* CONFIG_SECCOMP_FILTER_EXTENDED */
>> +
>>  static struct ctl_path seccomp_sysctl_path[] = {
>>         { .procname = "kernel", },
>>         { .procname = "seccomp", },
>> --
>> 2.14.1
>>
>
> -Kees
>
> --
> Kees Cook
> Pixel Security