[v7,3/6] seccomp: add a way to get a listener fd from ptrace

Submitted by Tycho Andersen on Sept. 27, 2018, 3:11 p.m.

Details

Message ID 20180927151119.9989-4-tycho@tycho.ws
State New
Series "seccomp trap to userspace"
Headers show

Commit Message

Tycho Andersen Sept. 27, 2018, 3:11 p.m.
As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace()
version which can acquire filters is useful. There are at least two reasons
this is preferable, even though it uses ptrace:

1. You can control tasks that aren't cooperating with you
2. You can control tasks whose filters block sendmsg() and socket(); if the
   task installs a filter which blocks these calls, there's no way with
   SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task.

v2: fix a bug where listener mode was not unset when an unused fd was not
    available
v3: fix refcounting bug (Oleg)
v4: * change the listener's fd flags to be 0
    * rename GET_LISTENER to NEW_LISTENER (Matthew)
v5: * add capable(CAP_SYS_ADMIN) requirement
v7: * point the new listener at the right filter (Jann)

Signed-off-by: Tycho Andersen <tycho@tycho.ws>
CC: Kees Cook <keescook@chromium.org>
CC: Andy Lutomirski <luto@amacapital.net>
CC: Oleg Nesterov <oleg@redhat.com>
CC: Eric W. Biederman <ebiederm@xmission.com>
CC: "Serge E. Hallyn" <serge@hallyn.com>
CC: Christian Brauner <christian.brauner@ubuntu.com>
CC: Tyler Hicks <tyhicks@canonical.com>
CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
---
 include/linux/seccomp.h                       |  7 ++
 include/uapi/linux/ptrace.h                   |  2 +
 kernel/ptrace.c                               |  4 ++
 kernel/seccomp.c                              | 31 +++++++++
 tools/testing/selftests/seccomp/seccomp_bpf.c | 68 +++++++++++++++++++
 5 files changed, 112 insertions(+)

Patch hide | download patch | download mbox

diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 017444b5efed..234c61b37405 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -83,6 +83,8 @@  static inline int seccomp_mode(struct seccomp *s)
 #ifdef CONFIG_SECCOMP_FILTER
 extern void put_seccomp_filter(struct task_struct *tsk);
 extern void get_seccomp_filter(struct task_struct *tsk);
+extern long seccomp_new_listener(struct task_struct *task,
+				 unsigned long filter_off);
 #else  /* CONFIG_SECCOMP_FILTER */
 static inline void put_seccomp_filter(struct task_struct *tsk)
 {
@@ -92,6 +94,11 @@  static inline void get_seccomp_filter(struct task_struct *tsk)
 {
 	return;
 }
+static inline long seccomp_new_listener(struct task_struct *task,
+					unsigned long filter_off)
+{
+	return -EINVAL;
+}
 #endif /* CONFIG_SECCOMP_FILTER */
 
 #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
index d5a1b8a492b9..e80ecb1bd427 100644
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -73,6 +73,8 @@  struct seccomp_metadata {
 	__u64 flags;		/* Output: filter's flags */
 };
 
+#define PTRACE_SECCOMP_NEW_LISTENER	0x420e
+
 /* Read signals from a shared (process wide) queue */
 #define PTRACE_PEEKSIGINFO_SHARED	(1 << 0)
 
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 21fec73d45d4..289960ac181b 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1096,6 +1096,10 @@  int ptrace_request(struct task_struct *child, long request,
 		ret = seccomp_get_metadata(child, addr, datavp);
 		break;
 
+	case PTRACE_SECCOMP_NEW_LISTENER:
+		ret = seccomp_new_listener(child, addr);
+		break;
+
 	default:
 		break;
 	}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 44a31ac8373a..17685803a2af 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1777,4 +1777,35 @@  static struct file *init_listener(struct task_struct *task,
 
 	return ret;
 }
+
+long seccomp_new_listener(struct task_struct *task,
+			  unsigned long filter_off)
+{
+	struct seccomp_filter *filter;
+	struct file *listener;
+	int fd;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	filter = get_nth_filter(task, filter_off);
+	if (IS_ERR(filter))
+		return PTR_ERR(filter);
+
+	fd = get_unused_fd_flags(0);
+	if (fd < 0) {
+		__put_seccomp_filter(filter);
+		return fd;
+	}
+
+	listener = init_listener(task, filter);
+	__put_seccomp_filter(filter);
+	if (IS_ERR(listener)) {
+		put_unused_fd(fd);
+		return PTR_ERR(listener);
+	}
+
+	fd_install(fd, listener);
+	return fd;
+}
 #endif
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 5f4b836a6792..c6ba3ed5392e 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -193,6 +193,10 @@  int seccomp(unsigned int op, unsigned int flags, void *args)
 }
 #endif
 
+#ifndef PTRACE_SECCOMP_NEW_LISTENER
+#define PTRACE_SECCOMP_NEW_LISTENER 0x420e
+#endif
+
 #if __BYTE_ORDER == __LITTLE_ENDIAN
 #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
 #elif __BYTE_ORDER == __BIG_ENDIAN
@@ -3175,6 +3179,70 @@  TEST(get_user_notification_syscall)
 	EXPECT_EQ(0, WEXITSTATUS(status));
 }
 
+TEST(get_user_notification_ptrace)
+{
+	pid_t pid;
+	int status, listener;
+	int sk_pair[2];
+	char c;
+	struct seccomp_notif req = {};
+	struct seccomp_notif_resp resp = {};
+
+	ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		EXPECT_EQ(user_trap_syscall(__NR_getpid, 0), 0);
+
+		/* Test that we get ENOSYS while not attached */
+		EXPECT_EQ(syscall(__NR_getpid), -1);
+		EXPECT_EQ(errno, ENOSYS);
+
+		/* Signal we're ready and have installed the filter. */
+		EXPECT_EQ(write(sk_pair[1], "J", 1), 1);
+
+		EXPECT_EQ(read(sk_pair[1], &c, 1), 1);
+		EXPECT_EQ(c, 'H');
+
+		exit(syscall(__NR_getpid) != USER_NOTIF_MAGIC);
+	}
+
+	EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
+	EXPECT_EQ(c, 'J');
+
+	EXPECT_EQ(ptrace(PTRACE_ATTACH, pid), 0);
+	EXPECT_EQ(waitpid(pid, NULL, 0), pid);
+	listener = ptrace(PTRACE_SECCOMP_NEW_LISTENER, pid, 0);
+	EXPECT_GE(listener, 0);
+
+	/* EBUSY for second listener */
+	EXPECT_EQ(ptrace(PTRACE_SECCOMP_NEW_LISTENER, pid, 0), -1);
+	EXPECT_EQ(errno, EBUSY);
+
+	EXPECT_EQ(ptrace(PTRACE_DETACH, pid, NULL, 0), 0);
+
+	/* Now signal we are done and respond with magic */
+	EXPECT_EQ(write(sk_pair[0], "H", 1), 1);
+
+	req.len = sizeof(req);
+	EXPECT_EQ(ioctl(listener, SECCOMP_NOTIF_RECV, &req), sizeof(req));
+
+	resp.len = sizeof(resp);
+	resp.id = req.id;
+	resp.error = 0;
+	resp.val = USER_NOTIF_MAGIC;
+
+	EXPECT_EQ(ioctl(listener, SECCOMP_NOTIF_SEND, &resp), sizeof(resp));
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+
+	close(listener);
+}
+
 /*
  * Check that a pid in a child namespace still shows up as valid in ours.
  */

Comments

Jann Horn via Containers Sept. 27, 2018, 4:20 p.m.
On Thu, Sep 27, 2018 at 5:11 PM Tycho Andersen <tycho@tycho.ws> wrote:
> As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace()
> version which can acquire filters is useful. There are at least two reasons
> this is preferable, even though it uses ptrace:
>
> 1. You can control tasks that aren't cooperating with you
> 2. You can control tasks whose filters block sendmsg() and socket(); if the
>    task installs a filter which blocks these calls, there's no way with
>    SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task.
>
> v2: fix a bug where listener mode was not unset when an unused fd was not
>     available
> v3: fix refcounting bug (Oleg)
> v4: * change the listener's fd flags to be 0
>     * rename GET_LISTENER to NEW_LISTENER (Matthew)
> v5: * add capable(CAP_SYS_ADMIN) requirement
> v7: * point the new listener at the right filter (Jann)
>
> Signed-off-by: Tycho Andersen <tycho@tycho.ws>
> CC: Kees Cook <keescook@chromium.org>
> CC: Andy Lutomirski <luto@amacapital.net>
> CC: Oleg Nesterov <oleg@redhat.com>
> CC: Eric W. Biederman <ebiederm@xmission.com>
> CC: "Serge E. Hallyn" <serge@hallyn.com>
> CC: Christian Brauner <christian.brauner@ubuntu.com>
> CC: Tyler Hicks <tyhicks@canonical.com>
> CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>

If you address the two nits below, you can add:
Reviewed-by: Jann Horn <jannh@google.com>

>  include/linux/seccomp.h                       |  7 ++
>  include/uapi/linux/ptrace.h                   |  2 +
>  kernel/ptrace.c                               |  4 ++
>  kernel/seccomp.c                              | 31 +++++++++
>  tools/testing/selftests/seccomp/seccomp_bpf.c | 68 +++++++++++++++++++
>  5 files changed, 112 insertions(+)
>
> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> index 017444b5efed..234c61b37405 100644
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -83,6 +83,8 @@ static inline int seccomp_mode(struct seccomp *s)
>  #ifdef CONFIG_SECCOMP_FILTER
>  extern void put_seccomp_filter(struct task_struct *tsk);
>  extern void get_seccomp_filter(struct task_struct *tsk);
> +extern long seccomp_new_listener(struct task_struct *task,
> +                                unsigned long filter_off);

Nit: Sorry, I only noticed this just now, but this should have return
type int, not long. ptrace_request() returns an int, and an fd is also
normally represented as an int, not a long.

>  #else  /* CONFIG_SECCOMP_FILTER */
>  static inline void put_seccomp_filter(struct task_struct *tsk)
>  {
> @@ -92,6 +94,11 @@ static inline void get_seccomp_filter(struct task_struct *tsk)
>  {
>         return;
>  }
> +static inline long seccomp_new_listener(struct task_struct *task,
> +                                       unsigned long filter_off)
> +{
> +       return -EINVAL;
> +}
>  #endif /* CONFIG_SECCOMP_FILTER */
>
>  #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
> diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
> index d5a1b8a492b9..e80ecb1bd427 100644
> --- a/include/uapi/linux/ptrace.h
> +++ b/include/uapi/linux/ptrace.h
> @@ -73,6 +73,8 @@ struct seccomp_metadata {
>         __u64 flags;            /* Output: filter's flags */
>  };
>
> +#define PTRACE_SECCOMP_NEW_LISTENER    0x420e
> +
>  /* Read signals from a shared (process wide) queue */
>  #define PTRACE_PEEKSIGINFO_SHARED      (1 << 0)
>
> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index 21fec73d45d4..289960ac181b 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -1096,6 +1096,10 @@ int ptrace_request(struct task_struct *child, long request,
>                 ret = seccomp_get_metadata(child, addr, datavp);
>                 break;
>
> +       case PTRACE_SECCOMP_NEW_LISTENER:
> +               ret = seccomp_new_listener(child, addr);
> +               break;
> +
>         default:
>                 break;
>         }
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 44a31ac8373a..17685803a2af 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task,
>
>         return ret;
>  }
> +
> +long seccomp_new_listener(struct task_struct *task,
> +                         unsigned long filter_off)
> +{
> +       struct seccomp_filter *filter;
> +       struct file *listener;
> +       int fd;
> +
> +       if (!capable(CAP_SYS_ADMIN))
> +               return -EACCES;
> +
> +       filter = get_nth_filter(task, filter_off);
> +       if (IS_ERR(filter))
> +               return PTR_ERR(filter);
> +
> +       fd = get_unused_fd_flags(0);

s/0/O_CLOEXEC/ ? If userspace needs a non-cloexec fd, userspace can
easily unset O_CLOEXEC; but the reverse isn't true, because it'd be
racy.

> +       if (fd < 0) {
> +               __put_seccomp_filter(filter);
> +               return fd;
> +       }
> +
> +       listener = init_listener(task, filter);
> +       __put_seccomp_filter(filter);
> +       if (IS_ERR(listener)) {
> +               put_unused_fd(fd);
> +               return PTR_ERR(listener);
> +       }
> +
> +       fd_install(fd, listener);
> +       return fd;
> +}
>  #endif
> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
> index 5f4b836a6792..c6ba3ed5392e 100644
> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> @@ -193,6 +193,10 @@ int seccomp(unsigned int op, unsigned int flags, void *args)
>  }
>  #endif
>
> +#ifndef PTRACE_SECCOMP_NEW_LISTENER
> +#define PTRACE_SECCOMP_NEW_LISTENER 0x420e
> +#endif
> +
>  #if __BYTE_ORDER == __LITTLE_ENDIAN
>  #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
>  #elif __BYTE_ORDER == __BIG_ENDIAN
> @@ -3175,6 +3179,70 @@ TEST(get_user_notification_syscall)
>         EXPECT_EQ(0, WEXITSTATUS(status));
>  }
>
> +TEST(get_user_notification_ptrace)
> +{
> +       pid_t pid;
> +       int status, listener;
> +       int sk_pair[2];
> +       char c;
> +       struct seccomp_notif req = {};
> +       struct seccomp_notif_resp resp = {};
> +
> +       ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
> +
> +       pid = fork();
> +       ASSERT_GE(pid, 0);
> +
> +       if (pid == 0) {
> +               EXPECT_EQ(user_trap_syscall(__NR_getpid, 0), 0);
> +
> +               /* Test that we get ENOSYS while not attached */
> +               EXPECT_EQ(syscall(__NR_getpid), -1);
> +               EXPECT_EQ(errno, ENOSYS);
> +
> +               /* Signal we're ready and have installed the filter. */
> +               EXPECT_EQ(write(sk_pair[1], "J", 1), 1);
> +
> +               EXPECT_EQ(read(sk_pair[1], &c, 1), 1);
> +               EXPECT_EQ(c, 'H');
> +
> +               exit(syscall(__NR_getpid) != USER_NOTIF_MAGIC);
> +       }
> +
> +       EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
> +       EXPECT_EQ(c, 'J');
> +
> +       EXPECT_EQ(ptrace(PTRACE_ATTACH, pid), 0);
> +       EXPECT_EQ(waitpid(pid, NULL, 0), pid);
> +       listener = ptrace(PTRACE_SECCOMP_NEW_LISTENER, pid, 0);
> +       EXPECT_GE(listener, 0);
> +
> +       /* EBUSY for second listener */
> +       EXPECT_EQ(ptrace(PTRACE_SECCOMP_NEW_LISTENER, pid, 0), -1);
> +       EXPECT_EQ(errno, EBUSY);
> +
> +       EXPECT_EQ(ptrace(PTRACE_DETACH, pid, NULL, 0), 0);
> +
> +       /* Now signal we are done and respond with magic */
> +       EXPECT_EQ(write(sk_pair[0], "H", 1), 1);
> +
> +       req.len = sizeof(req);
> +       EXPECT_EQ(ioctl(listener, SECCOMP_NOTIF_RECV, &req), sizeof(req));
> +
> +       resp.len = sizeof(resp);
> +       resp.id = req.id;
> +       resp.error = 0;
> +       resp.val = USER_NOTIF_MAGIC;
> +
> +       EXPECT_EQ(ioctl(listener, SECCOMP_NOTIF_SEND, &resp), sizeof(resp));
> +
> +       EXPECT_EQ(waitpid(pid, &status, 0), pid);
> +       EXPECT_EQ(true, WIFEXITED(status));
> +       EXPECT_EQ(0, WEXITSTATUS(status));
> +
> +       close(listener);
> +}
> +
>  /*
>   * Check that a pid in a child namespace still shows up as valid in ours.
>   */
> --
> 2.17.1
>
Tycho Andersen Sept. 27, 2018, 4:34 p.m.
On Thu, Sep 27, 2018 at 06:20:23PM +0200, Jann Horn wrote:
> On Thu, Sep 27, 2018 at 5:11 PM Tycho Andersen <tycho@tycho.ws> wrote:
> > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace()
> > version which can acquire filters is useful. There are at least two reasons
> > this is preferable, even though it uses ptrace:
> >
> > 1. You can control tasks that aren't cooperating with you
> > 2. You can control tasks whose filters block sendmsg() and socket(); if the
> >    task installs a filter which blocks these calls, there's no way with
> >    SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task.
> >
> > v2: fix a bug where listener mode was not unset when an unused fd was not
> >     available
> > v3: fix refcounting bug (Oleg)
> > v4: * change the listener's fd flags to be 0
> >     * rename GET_LISTENER to NEW_LISTENER (Matthew)
> > v5: * add capable(CAP_SYS_ADMIN) requirement
> > v7: * point the new listener at the right filter (Jann)
> >
> > Signed-off-by: Tycho Andersen <tycho@tycho.ws>
> > CC: Kees Cook <keescook@chromium.org>
> > CC: Andy Lutomirski <luto@amacapital.net>
> > CC: Oleg Nesterov <oleg@redhat.com>
> > CC: Eric W. Biederman <ebiederm@xmission.com>
> > CC: "Serge E. Hallyn" <serge@hallyn.com>
> > CC: Christian Brauner <christian.brauner@ubuntu.com>
> > CC: Tyler Hicks <tyhicks@canonical.com>
> > CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
> 
> If you address the two nits below, you can add:
> Reviewed-by: Jann Horn <jannh@google.com>

Thanks!

> >  include/linux/seccomp.h                       |  7 ++
> >  include/uapi/linux/ptrace.h                   |  2 +
> >  kernel/ptrace.c                               |  4 ++
> >  kernel/seccomp.c                              | 31 +++++++++
> >  tools/testing/selftests/seccomp/seccomp_bpf.c | 68 +++++++++++++++++++
> >  5 files changed, 112 insertions(+)
> >
> > diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> > index 017444b5efed..234c61b37405 100644
> > --- a/include/linux/seccomp.h
> > +++ b/include/linux/seccomp.h
> > @@ -83,6 +83,8 @@ static inline int seccomp_mode(struct seccomp *s)
> >  #ifdef CONFIG_SECCOMP_FILTER
> >  extern void put_seccomp_filter(struct task_struct *tsk);
> >  extern void get_seccomp_filter(struct task_struct *tsk);
> > +extern long seccomp_new_listener(struct task_struct *task,
> > +                                unsigned long filter_off);
> 
> Nit: Sorry, I only noticed this just now, but this should have return
> type int, not long. ptrace_request() returns an int, and an fd is also
> normally represented as an int, not a long.

Ugh, I could have sworn I checked this. In particular because the
other seccomp code that's called from ptrace returns a long too :)

I'll fix that for the next version, and send a different patch for the
other two.

> >  #else  /* CONFIG_SECCOMP_FILTER */
> >  static inline void put_seccomp_filter(struct task_struct *tsk)
> >  {
> > @@ -92,6 +94,11 @@ static inline void get_seccomp_filter(struct task_struct *tsk)
> >  {
> >         return;
> >  }
> > +static inline long seccomp_new_listener(struct task_struct *task,
> > +                                       unsigned long filter_off)
> > +{
> > +       return -EINVAL;
> > +}
> >  #endif /* CONFIG_SECCOMP_FILTER */
> >
> >  #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
> > diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
> > index d5a1b8a492b9..e80ecb1bd427 100644
> > --- a/include/uapi/linux/ptrace.h
> > +++ b/include/uapi/linux/ptrace.h
> > @@ -73,6 +73,8 @@ struct seccomp_metadata {
> >         __u64 flags;            /* Output: filter's flags */
> >  };
> >
> > +#define PTRACE_SECCOMP_NEW_LISTENER    0x420e
> > +
> >  /* Read signals from a shared (process wide) queue */
> >  #define PTRACE_PEEKSIGINFO_SHARED      (1 << 0)
> >
> > diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> > index 21fec73d45d4..289960ac181b 100644
> > --- a/kernel/ptrace.c
> > +++ b/kernel/ptrace.c
> > @@ -1096,6 +1096,10 @@ int ptrace_request(struct task_struct *child, long request,
> >                 ret = seccomp_get_metadata(child, addr, datavp);
> >                 break;
> >
> > +       case PTRACE_SECCOMP_NEW_LISTENER:
> > +               ret = seccomp_new_listener(child, addr);
> > +               break;
> > +
> >         default:
> >                 break;
> >         }
> > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > index 44a31ac8373a..17685803a2af 100644
> > --- a/kernel/seccomp.c
> > +++ b/kernel/seccomp.c
> > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task,
> >
> >         return ret;
> >  }
> > +
> > +long seccomp_new_listener(struct task_struct *task,
> > +                         unsigned long filter_off)
> > +{
> > +       struct seccomp_filter *filter;
> > +       struct file *listener;
> > +       int fd;
> > +
> > +       if (!capable(CAP_SYS_ADMIN))
> > +               return -EACCES;
> > +
> > +       filter = get_nth_filter(task, filter_off);
> > +       if (IS_ERR(filter))
> > +               return PTR_ERR(filter);
> > +
> > +       fd = get_unused_fd_flags(0);
> 
> s/0/O_CLOEXEC/ ? If userspace needs a non-cloexec fd, userspace can
> easily unset O_CLOEXEC; but the reverse isn't true, because it'd be
> racy.

Sure, will do.

Tycho
Jann Horn via Containers Sept. 27, 2018, 5:35 p.m.
On Thu, Sep 27, 2018 at 5:11 PM Tycho Andersen <tycho@tycho.ws> wrote:
>
> As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace()
> version which can acquire filters is useful. There are at least two reasons
> this is preferable, even though it uses ptrace:
>
> 1. You can control tasks that aren't cooperating with you
> 2. You can control tasks whose filters block sendmsg() and socket(); if the
>    task installs a filter which blocks these calls, there's no way with
>    SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task.
[...]
> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index 21fec73d45d4..289960ac181b 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -1096,6 +1096,10 @@ int ptrace_request(struct task_struct *child, long request,
>                 ret = seccomp_get_metadata(child, addr, datavp);
>                 break;
>
> +       case PTRACE_SECCOMP_NEW_LISTENER:
> +               ret = seccomp_new_listener(child, addr);
> +               break;

Actually, could you amend this to also ensure that `data == 0` and
return -EINVAL otherwise? Then if we want to abuse `data` for passing
flags in the future, we don't have to worry about what happens if
someone passes in garbage as `data`.
Tycho Andersen Sept. 27, 2018, 6:09 p.m.
On Thu, Sep 27, 2018 at 07:35:06PM +0200, Jann Horn wrote:
> On Thu, Sep 27, 2018 at 5:11 PM Tycho Andersen <tycho@tycho.ws> wrote:
> >
> > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace()
> > version which can acquire filters is useful. There are at least two reasons
> > this is preferable, even though it uses ptrace:
> >
> > 1. You can control tasks that aren't cooperating with you
> > 2. You can control tasks whose filters block sendmsg() and socket(); if the
> >    task installs a filter which blocks these calls, there's no way with
> >    SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task.
> [...]
> > diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> > index 21fec73d45d4..289960ac181b 100644
> > --- a/kernel/ptrace.c
> > +++ b/kernel/ptrace.c
> > @@ -1096,6 +1096,10 @@ int ptrace_request(struct task_struct *child, long request,
> >                 ret = seccomp_get_metadata(child, addr, datavp);
> >                 break;
> >
> > +       case PTRACE_SECCOMP_NEW_LISTENER:
> > +               ret = seccomp_new_listener(child, addr);
> > +               break;
> 
> Actually, could you amend this to also ensure that `data == 0` and
> return -EINVAL otherwise? Then if we want to abuse `data` for passing
> flags in the future, we don't have to worry about what happens if
> someone passes in garbage as `data`.

Yes, good idea. Thanks!

Tycho
Kees Cook Sept. 27, 2018, 9:53 p.m.
On Thu, Sep 27, 2018 at 8:11 AM, Tycho Andersen <tycho@tycho.ws> wrote:
> As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace()
> version which can acquire filters is useful. There are at least two reasons
> this is preferable, even though it uses ptrace:
>
> 1. You can control tasks that aren't cooperating with you
> 2. You can control tasks whose filters block sendmsg() and socket(); if the
>    task installs a filter which blocks these calls, there's no way with
>    SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task.
>
> v2: fix a bug where listener mode was not unset when an unused fd was not
>     available
> v3: fix refcounting bug (Oleg)
> v4: * change the listener's fd flags to be 0
>     * rename GET_LISTENER to NEW_LISTENER (Matthew)
> v5: * add capable(CAP_SYS_ADMIN) requirement
> v7: * point the new listener at the right filter (Jann)
>
> Signed-off-by: Tycho Andersen <tycho@tycho.ws>
> CC: Kees Cook <keescook@chromium.org>
> CC: Andy Lutomirski <luto@amacapital.net>
> CC: Oleg Nesterov <oleg@redhat.com>
> CC: Eric W. Biederman <ebiederm@xmission.com>
> CC: "Serge E. Hallyn" <serge@hallyn.com>
> CC: Christian Brauner <christian.brauner@ubuntu.com>
> CC: Tyler Hicks <tyhicks@canonical.com>
> CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
> ---
>  include/linux/seccomp.h                       |  7 ++
>  include/uapi/linux/ptrace.h                   |  2 +
>  kernel/ptrace.c                               |  4 ++
>  kernel/seccomp.c                              | 31 +++++++++
>  tools/testing/selftests/seccomp/seccomp_bpf.c | 68 +++++++++++++++++++
>  5 files changed, 112 insertions(+)
>
> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> index 017444b5efed..234c61b37405 100644
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -83,6 +83,8 @@ static inline int seccomp_mode(struct seccomp *s)
>  #ifdef CONFIG_SECCOMP_FILTER
>  extern void put_seccomp_filter(struct task_struct *tsk);
>  extern void get_seccomp_filter(struct task_struct *tsk);
> +extern long seccomp_new_listener(struct task_struct *task,
> +                                unsigned long filter_off);
>  #else  /* CONFIG_SECCOMP_FILTER */
>  static inline void put_seccomp_filter(struct task_struct *tsk)
>  {
> @@ -92,6 +94,11 @@ static inline void get_seccomp_filter(struct task_struct *tsk)
>  {
>         return;
>  }
> +static inline long seccomp_new_listener(struct task_struct *task,
> +                                       unsigned long filter_off)
> +{
> +       return -EINVAL;
> +}
>  #endif /* CONFIG_SECCOMP_FILTER */
>
>  #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
> diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
> index d5a1b8a492b9..e80ecb1bd427 100644
> --- a/include/uapi/linux/ptrace.h
> +++ b/include/uapi/linux/ptrace.h
> @@ -73,6 +73,8 @@ struct seccomp_metadata {
>         __u64 flags;            /* Output: filter's flags */
>  };
>
> +#define PTRACE_SECCOMP_NEW_LISTENER    0x420e
> +
>  /* Read signals from a shared (process wide) queue */
>  #define PTRACE_PEEKSIGINFO_SHARED      (1 << 0)
>
> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index 21fec73d45d4..289960ac181b 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -1096,6 +1096,10 @@ int ptrace_request(struct task_struct *child, long request,
>                 ret = seccomp_get_metadata(child, addr, datavp);
>                 break;
>
> +       case PTRACE_SECCOMP_NEW_LISTENER:
> +               ret = seccomp_new_listener(child, addr);
> +               break;
> +
>         default:
>                 break;
>         }
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 44a31ac8373a..17685803a2af 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task,
>
>         return ret;
>  }
> +
> +long seccomp_new_listener(struct task_struct *task,
> +                         unsigned long filter_off)
> +{
> +       struct seccomp_filter *filter;
> +       struct file *listener;
> +       int fd;
> +
> +       if (!capable(CAP_SYS_ADMIN))
> +               return -EACCES;
> +
> +       filter = get_nth_filter(task, filter_off);
> +       if (IS_ERR(filter))
> +               return PTR_ERR(filter);
> +
> +       fd = get_unused_fd_flags(0);
> +       if (fd < 0) {
> +               __put_seccomp_filter(filter);
> +               return fd;
> +       }
> +
> +       listener = init_listener(task, filter);
> +       __put_seccomp_filter(filter);
> +       if (IS_ERR(listener)) {
> +               put_unused_fd(fd);
> +               return PTR_ERR(listener);
> +       }
> +
> +       fd_install(fd, listener);
> +       return fd;
> +}

Observation both here and with SECCOMP_FILTER_FLAG_NEW_LISTENER:
nothing actually checks that there is a RET_USER_NOTIF bpf rule in the
filter. *shrug* Not a problem, just a weird state.

>  #endif
> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
> index 5f4b836a6792..c6ba3ed5392e 100644
> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> @@ -193,6 +193,10 @@ int seccomp(unsigned int op, unsigned int flags, void *args)
>  }
>  #endif
>
> +#ifndef PTRACE_SECCOMP_NEW_LISTENER
> +#define PTRACE_SECCOMP_NEW_LISTENER 0x420e
> +#endif
> +
>  #if __BYTE_ORDER == __LITTLE_ENDIAN
>  #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
>  #elif __BYTE_ORDER == __BIG_ENDIAN
> @@ -3175,6 +3179,70 @@ TEST(get_user_notification_syscall)
>         EXPECT_EQ(0, WEXITSTATUS(status));
>  }
>
> +TEST(get_user_notification_ptrace)
> +{
> +       pid_t pid;
> +       int status, listener;
> +       int sk_pair[2];
> +       char c;
> +       struct seccomp_notif req = {};
> +       struct seccomp_notif_resp resp = {};
> +
> +       ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
> +
> +       pid = fork();
> +       ASSERT_GE(pid, 0);
> +
> +       if (pid == 0) {
> +               EXPECT_EQ(user_trap_syscall(__NR_getpid, 0), 0);
> +
> +               /* Test that we get ENOSYS while not attached */
> +               EXPECT_EQ(syscall(__NR_getpid), -1);
> +               EXPECT_EQ(errno, ENOSYS);
> +
> +               /* Signal we're ready and have installed the filter. */
> +               EXPECT_EQ(write(sk_pair[1], "J", 1), 1);
> +
> +               EXPECT_EQ(read(sk_pair[1], &c, 1), 1);
> +               EXPECT_EQ(c, 'H');
> +
> +               exit(syscall(__NR_getpid) != USER_NOTIF_MAGIC);
> +       }
> +
> +       EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
> +       EXPECT_EQ(c, 'J');
> +
> +       EXPECT_EQ(ptrace(PTRACE_ATTACH, pid), 0);
> +       EXPECT_EQ(waitpid(pid, NULL, 0), pid);
> +       listener = ptrace(PTRACE_SECCOMP_NEW_LISTENER, pid, 0);
> +       EXPECT_GE(listener, 0);
> +
> +       /* EBUSY for second listener */
> +       EXPECT_EQ(ptrace(PTRACE_SECCOMP_NEW_LISTENER, pid, 0), -1);
> +       EXPECT_EQ(errno, EBUSY);
> +
> +       EXPECT_EQ(ptrace(PTRACE_DETACH, pid, NULL, 0), 0);
> +
> +       /* Now signal we are done and respond with magic */
> +       EXPECT_EQ(write(sk_pair[0], "H", 1), 1);
> +
> +       req.len = sizeof(req);
> +       EXPECT_EQ(ioctl(listener, SECCOMP_NOTIF_RECV, &req), sizeof(req));
> +
> +       resp.len = sizeof(resp);
> +       resp.id = req.id;
> +       resp.error = 0;
> +       resp.val = USER_NOTIF_MAGIC;
> +
> +       EXPECT_EQ(ioctl(listener, SECCOMP_NOTIF_SEND, &resp), sizeof(resp));
> +
> +       EXPECT_EQ(waitpid(pid, &status, 0), pid);
> +       EXPECT_EQ(true, WIFEXITED(status));
> +       EXPECT_EQ(0, WEXITSTATUS(status));
> +
> +       close(listener);
> +}
> +
>  /*
>   * Check that a pid in a child namespace still shows up as valid in ours.
>   */
> --
> 2.17.1
>

And FWIW, I agree with Jann's review notes here too. :) Looks good!

-Kees
Christian Brauner Oct. 8, 2018, 3:16 p.m.
On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote:
> As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace()
> version which can acquire filters is useful. There are at least two reasons
> this is preferable, even though it uses ptrace:
> 
> 1. You can control tasks that aren't cooperating with you
> 2. You can control tasks whose filters block sendmsg() and socket(); if the
>    task installs a filter which blocks these calls, there's no way with
>    SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task.

So for the slow of mind aka me:
I'm not sure I completely understand this problem. Can you outline how
sendmsg() and socket() are involved in this?

I'm also not sure that this holds (but I might misunderstand the
problem) afaict, you could do try to get the fd out via CLONE_FILES and
other means so something like: 

// let's pretend the libc wrapper for clone actually has sane semantics
pid = clone(CLONE_FILES);
if (pid == 0) {
        fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);

        // Now this fd will be valid in both parent and child.
        // If you haven't blocked it you can inform the parent what
        // the fd number is via pipe2(). If you have blocked it you can
        // use dup2() and dup to a known fd number.
}

> 
> v2: fix a bug where listener mode was not unset when an unused fd was not
>     available
> v3: fix refcounting bug (Oleg)
> v4: * change the listener's fd flags to be 0
>     * rename GET_LISTENER to NEW_LISTENER (Matthew)
> v5: * add capable(CAP_SYS_ADMIN) requirement
> v7: * point the new listener at the right filter (Jann)
> 
> Signed-off-by: Tycho Andersen <tycho@tycho.ws>
> CC: Kees Cook <keescook@chromium.org>
> CC: Andy Lutomirski <luto@amacapital.net>
> CC: Oleg Nesterov <oleg@redhat.com>
> CC: Eric W. Biederman <ebiederm@xmission.com>
> CC: "Serge E. Hallyn" <serge@hallyn.com>
> CC: Christian Brauner <christian.brauner@ubuntu.com>
> CC: Tyler Hicks <tyhicks@canonical.com>
> CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
> ---
>  include/linux/seccomp.h                       |  7 ++
>  include/uapi/linux/ptrace.h                   |  2 +
>  kernel/ptrace.c                               |  4 ++
>  kernel/seccomp.c                              | 31 +++++++++
>  tools/testing/selftests/seccomp/seccomp_bpf.c | 68 +++++++++++++++++++
>  5 files changed, 112 insertions(+)
> 
> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> index 017444b5efed..234c61b37405 100644
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -83,6 +83,8 @@ static inline int seccomp_mode(struct seccomp *s)
>  #ifdef CONFIG_SECCOMP_FILTER
>  extern void put_seccomp_filter(struct task_struct *tsk);
>  extern void get_seccomp_filter(struct task_struct *tsk);
> +extern long seccomp_new_listener(struct task_struct *task,
> +				 unsigned long filter_off);
>  #else  /* CONFIG_SECCOMP_FILTER */
>  static inline void put_seccomp_filter(struct task_struct *tsk)
>  {
> @@ -92,6 +94,11 @@ static inline void get_seccomp_filter(struct task_struct *tsk)
>  {
>  	return;
>  }
> +static inline long seccomp_new_listener(struct task_struct *task,
> +					unsigned long filter_off)
> +{
> +	return -EINVAL;
> +}
>  #endif /* CONFIG_SECCOMP_FILTER */
>  
>  #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
> diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
> index d5a1b8a492b9..e80ecb1bd427 100644
> --- a/include/uapi/linux/ptrace.h
> +++ b/include/uapi/linux/ptrace.h
> @@ -73,6 +73,8 @@ struct seccomp_metadata {
>  	__u64 flags;		/* Output: filter's flags */
>  };
>  
> +#define PTRACE_SECCOMP_NEW_LISTENER	0x420e
> +
>  /* Read signals from a shared (process wide) queue */
>  #define PTRACE_PEEKSIGINFO_SHARED	(1 << 0)
>  
> diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> index 21fec73d45d4..289960ac181b 100644
> --- a/kernel/ptrace.c
> +++ b/kernel/ptrace.c
> @@ -1096,6 +1096,10 @@ int ptrace_request(struct task_struct *child, long request,
>  		ret = seccomp_get_metadata(child, addr, datavp);
>  		break;
>  
> +	case PTRACE_SECCOMP_NEW_LISTENER:
> +		ret = seccomp_new_listener(child, addr);
> +		break;
> +
>  	default:
>  		break;
>  	}
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 44a31ac8373a..17685803a2af 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task,
>  
>  	return ret;
>  }
> +
> +long seccomp_new_listener(struct task_struct *task,
> +			  unsigned long filter_off)
> +{
> +	struct seccomp_filter *filter;
> +	struct file *listener;
> +	int fd;
> +
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EACCES;

I know this might have been discussed a while back but why exactly do we
require CAP_SYS_ADMIN in init_userns and not in the target userns? What
if I want to do a setns()fd, CLONE_NEWUSER) to the target process and
use ptrace from in there?

> +
> +	filter = get_nth_filter(task, filter_off);
> +	if (IS_ERR(filter))
> +		return PTR_ERR(filter);
> +
> +	fd = get_unused_fd_flags(0);
> +	if (fd < 0) {
> +		__put_seccomp_filter(filter);
> +		return fd;
> +	}
> +
> +	listener = init_listener(task, filter);
> +	__put_seccomp_filter(filter);
> +	if (IS_ERR(listener)) {
> +		put_unused_fd(fd);
> +		return PTR_ERR(listener);
> +	}
> +
> +	fd_install(fd, listener);
> +	return fd;
> +}
>  #endif
> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
> index 5f4b836a6792..c6ba3ed5392e 100644
> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> @@ -193,6 +193,10 @@ int seccomp(unsigned int op, unsigned int flags, void *args)
>  }
>  #endif
>  
> +#ifndef PTRACE_SECCOMP_NEW_LISTENER
> +#define PTRACE_SECCOMP_NEW_LISTENER 0x420e
> +#endif
> +
>  #if __BYTE_ORDER == __LITTLE_ENDIAN
>  #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
>  #elif __BYTE_ORDER == __BIG_ENDIAN
> @@ -3175,6 +3179,70 @@ TEST(get_user_notification_syscall)
>  	EXPECT_EQ(0, WEXITSTATUS(status));
>  }
>  
> +TEST(get_user_notification_ptrace)
> +{
> +	pid_t pid;
> +	int status, listener;
> +	int sk_pair[2];
> +	char c;
> +	struct seccomp_notif req = {};
> +	struct seccomp_notif_resp resp = {};
> +
> +	ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
> +
> +	pid = fork();
> +	ASSERT_GE(pid, 0);
> +
> +	if (pid == 0) {
> +		EXPECT_EQ(user_trap_syscall(__NR_getpid, 0), 0);
> +
> +		/* Test that we get ENOSYS while not attached */
> +		EXPECT_EQ(syscall(__NR_getpid), -1);
> +		EXPECT_EQ(errno, ENOSYS);
> +
> +		/* Signal we're ready and have installed the filter. */
> +		EXPECT_EQ(write(sk_pair[1], "J", 1), 1);
> +
> +		EXPECT_EQ(read(sk_pair[1], &c, 1), 1);
> +		EXPECT_EQ(c, 'H');
> +
> +		exit(syscall(__NR_getpid) != USER_NOTIF_MAGIC);
> +	}
> +
> +	EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
> +	EXPECT_EQ(c, 'J');
> +
> +	EXPECT_EQ(ptrace(PTRACE_ATTACH, pid), 0);
> +	EXPECT_EQ(waitpid(pid, NULL, 0), pid);
> +	listener = ptrace(PTRACE_SECCOMP_NEW_LISTENER, pid, 0);
> +	EXPECT_GE(listener, 0);
> +
> +	/* EBUSY for second listener */
> +	EXPECT_EQ(ptrace(PTRACE_SECCOMP_NEW_LISTENER, pid, 0), -1);
> +	EXPECT_EQ(errno, EBUSY);
> +
> +	EXPECT_EQ(ptrace(PTRACE_DETACH, pid, NULL, 0), 0);
> +
> +	/* Now signal we are done and respond with magic */
> +	EXPECT_EQ(write(sk_pair[0], "H", 1), 1);
> +
> +	req.len = sizeof(req);
> +	EXPECT_EQ(ioctl(listener, SECCOMP_NOTIF_RECV, &req), sizeof(req));
> +
> +	resp.len = sizeof(resp);
> +	resp.id = req.id;
> +	resp.error = 0;
> +	resp.val = USER_NOTIF_MAGIC;
> +
> +	EXPECT_EQ(ioctl(listener, SECCOMP_NOTIF_SEND, &resp), sizeof(resp));
> +
> +	EXPECT_EQ(waitpid(pid, &status, 0), pid);
> +	EXPECT_EQ(true, WIFEXITED(status));
> +	EXPECT_EQ(0, WEXITSTATUS(status));
> +
> +	close(listener);
> +}
> +
>  /*
>   * Check that a pid in a child namespace still shows up as valid in ours.
>   */
> -- 
> 2.17.1
> 
> _______________________________________________
> Containers mailing list
> Containers@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/containers
Jann Horn via Containers Oct. 8, 2018, 3:33 p.m.
On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote:
>
> On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote:
> > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace()
> > version which can acquire filters is useful. There are at least two reasons
> > this is preferable, even though it uses ptrace:
> >
> > 1. You can control tasks that aren't cooperating with you
> > 2. You can control tasks whose filters block sendmsg() and socket(); if the
> >    task installs a filter which blocks these calls, there's no way with
> >    SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task.
>
> So for the slow of mind aka me:
> I'm not sure I completely understand this problem. Can you outline how
> sendmsg() and socket() are involved in this?
>
> I'm also not sure that this holds (but I might misunderstand the
> problem) afaict, you could do try to get the fd out via CLONE_FILES and
> other means so something like:
>
> // let's pretend the libc wrapper for clone actually has sane semantics
> pid = clone(CLONE_FILES);
> if (pid == 0) {
>         fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
>
>         // Now this fd will be valid in both parent and child.
>         // If you haven't blocked it you can inform the parent what
>         // the fd number is via pipe2(). If you have blocked it you can
>         // use dup2() and dup to a known fd number.
> }
>
> >
> > v2: fix a bug where listener mode was not unset when an unused fd was not
> >     available
> > v3: fix refcounting bug (Oleg)
> > v4: * change the listener's fd flags to be 0
> >     * rename GET_LISTENER to NEW_LISTENER (Matthew)
> > v5: * add capable(CAP_SYS_ADMIN) requirement
> > v7: * point the new listener at the right filter (Jann)
> >
> > Signed-off-by: Tycho Andersen <tycho@tycho.ws>
> > CC: Kees Cook <keescook@chromium.org>
> > CC: Andy Lutomirski <luto@amacapital.net>
> > CC: Oleg Nesterov <oleg@redhat.com>
> > CC: Eric W. Biederman <ebiederm@xmission.com>
> > CC: "Serge E. Hallyn" <serge@hallyn.com>
> > CC: Christian Brauner <christian.brauner@ubuntu.com>
> > CC: Tyler Hicks <tyhicks@canonical.com>
> > CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
> > ---
> >  include/linux/seccomp.h                       |  7 ++
> >  include/uapi/linux/ptrace.h                   |  2 +
> >  kernel/ptrace.c                               |  4 ++
> >  kernel/seccomp.c                              | 31 +++++++++
> >  tools/testing/selftests/seccomp/seccomp_bpf.c | 68 +++++++++++++++++++
> >  5 files changed, 112 insertions(+)
> >
> > diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> > index 017444b5efed..234c61b37405 100644
> > --- a/include/linux/seccomp.h
> > +++ b/include/linux/seccomp.h
> > @@ -83,6 +83,8 @@ static inline int seccomp_mode(struct seccomp *s)
> >  #ifdef CONFIG_SECCOMP_FILTER
> >  extern void put_seccomp_filter(struct task_struct *tsk);
> >  extern void get_seccomp_filter(struct task_struct *tsk);
> > +extern long seccomp_new_listener(struct task_struct *task,
> > +                              unsigned long filter_off);
> >  #else  /* CONFIG_SECCOMP_FILTER */
> >  static inline void put_seccomp_filter(struct task_struct *tsk)
> >  {
> > @@ -92,6 +94,11 @@ static inline void get_seccomp_filter(struct task_struct *tsk)
> >  {
> >       return;
> >  }
> > +static inline long seccomp_new_listener(struct task_struct *task,
> > +                                     unsigned long filter_off)
> > +{
> > +     return -EINVAL;
> > +}
> >  #endif /* CONFIG_SECCOMP_FILTER */
> >
> >  #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
> > diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
> > index d5a1b8a492b9..e80ecb1bd427 100644
> > --- a/include/uapi/linux/ptrace.h
> > +++ b/include/uapi/linux/ptrace.h
> > @@ -73,6 +73,8 @@ struct seccomp_metadata {
> >       __u64 flags;            /* Output: filter's flags */
> >  };
> >
> > +#define PTRACE_SECCOMP_NEW_LISTENER  0x420e
> > +
> >  /* Read signals from a shared (process wide) queue */
> >  #define PTRACE_PEEKSIGINFO_SHARED    (1 << 0)
> >
> > diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> > index 21fec73d45d4..289960ac181b 100644
> > --- a/kernel/ptrace.c
> > +++ b/kernel/ptrace.c
> > @@ -1096,6 +1096,10 @@ int ptrace_request(struct task_struct *child, long request,
> >               ret = seccomp_get_metadata(child, addr, datavp);
> >               break;
> >
> > +     case PTRACE_SECCOMP_NEW_LISTENER:
> > +             ret = seccomp_new_listener(child, addr);
> > +             break;
> > +
> >       default:
> >               break;
> >       }
> > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > index 44a31ac8373a..17685803a2af 100644
> > --- a/kernel/seccomp.c
> > +++ b/kernel/seccomp.c
> > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task,
> >
> >       return ret;
> >  }
> > +
> > +long seccomp_new_listener(struct task_struct *task,
> > +                       unsigned long filter_off)
> > +{
> > +     struct seccomp_filter *filter;
> > +     struct file *listener;
> > +     int fd;
> > +
> > +     if (!capable(CAP_SYS_ADMIN))
> > +             return -EACCES;
>
> I know this might have been discussed a while back but why exactly do we
> require CAP_SYS_ADMIN in init_userns and not in the target userns? What
> if I want to do a setns()fd, CLONE_NEWUSER) to the target process and
> use ptrace from in there?

See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
. Basically, the problem is that this doesn't just give you capability
over the target task, but also over every other task that has the same
filter installed; you need some sort of "is the caller capable over
the filter and anyone who uses it" check.
Christian Brauner Oct. 8, 2018, 4:21 p.m.
On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote:
> On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote:
> >
> > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote:
> > > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace()
> > > version which can acquire filters is useful. There are at least two reasons
> > > this is preferable, even though it uses ptrace:
> > >
> > > 1. You can control tasks that aren't cooperating with you
> > > 2. You can control tasks whose filters block sendmsg() and socket(); if the
> > >    task installs a filter which blocks these calls, there's no way with
> > >    SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task.
> >
> > So for the slow of mind aka me:
> > I'm not sure I completely understand this problem. Can you outline how
> > sendmsg() and socket() are involved in this?
> >
> > I'm also not sure that this holds (but I might misunderstand the
> > problem) afaict, you could do try to get the fd out via CLONE_FILES and
> > other means so something like:
> >
> > // let's pretend the libc wrapper for clone actually has sane semantics
> > pid = clone(CLONE_FILES);
> > if (pid == 0) {
> >         fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
> >
> >         // Now this fd will be valid in both parent and child.
> >         // If you haven't blocked it you can inform the parent what
> >         // the fd number is via pipe2(). If you have blocked it you can
> >         // use dup2() and dup to a known fd number.
> > }
> >
> > >
> > > v2: fix a bug where listener mode was not unset when an unused fd was not
> > >     available
> > > v3: fix refcounting bug (Oleg)
> > > v4: * change the listener's fd flags to be 0
> > >     * rename GET_LISTENER to NEW_LISTENER (Matthew)
> > > v5: * add capable(CAP_SYS_ADMIN) requirement
> > > v7: * point the new listener at the right filter (Jann)
> > >
> > > Signed-off-by: Tycho Andersen <tycho@tycho.ws>
> > > CC: Kees Cook <keescook@chromium.org>
> > > CC: Andy Lutomirski <luto@amacapital.net>
> > > CC: Oleg Nesterov <oleg@redhat.com>
> > > CC: Eric W. Biederman <ebiederm@xmission.com>
> > > CC: "Serge E. Hallyn" <serge@hallyn.com>
> > > CC: Christian Brauner <christian.brauner@ubuntu.com>
> > > CC: Tyler Hicks <tyhicks@canonical.com>
> > > CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
> > > ---
> > >  include/linux/seccomp.h                       |  7 ++
> > >  include/uapi/linux/ptrace.h                   |  2 +
> > >  kernel/ptrace.c                               |  4 ++
> > >  kernel/seccomp.c                              | 31 +++++++++
> > >  tools/testing/selftests/seccomp/seccomp_bpf.c | 68 +++++++++++++++++++
> > >  5 files changed, 112 insertions(+)
> > >
> > > diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> > > index 017444b5efed..234c61b37405 100644
> > > --- a/include/linux/seccomp.h
> > > +++ b/include/linux/seccomp.h
> > > @@ -83,6 +83,8 @@ static inline int seccomp_mode(struct seccomp *s)
> > >  #ifdef CONFIG_SECCOMP_FILTER
> > >  extern void put_seccomp_filter(struct task_struct *tsk);
> > >  extern void get_seccomp_filter(struct task_struct *tsk);
> > > +extern long seccomp_new_listener(struct task_struct *task,
> > > +                              unsigned long filter_off);
> > >  #else  /* CONFIG_SECCOMP_FILTER */
> > >  static inline void put_seccomp_filter(struct task_struct *tsk)
> > >  {
> > > @@ -92,6 +94,11 @@ static inline void get_seccomp_filter(struct task_struct *tsk)
> > >  {
> > >       return;
> > >  }
> > > +static inline long seccomp_new_listener(struct task_struct *task,
> > > +                                     unsigned long filter_off)
> > > +{
> > > +     return -EINVAL;
> > > +}
> > >  #endif /* CONFIG_SECCOMP_FILTER */
> > >
> > >  #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
> > > diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
> > > index d5a1b8a492b9..e80ecb1bd427 100644
> > > --- a/include/uapi/linux/ptrace.h
> > > +++ b/include/uapi/linux/ptrace.h
> > > @@ -73,6 +73,8 @@ struct seccomp_metadata {
> > >       __u64 flags;            /* Output: filter's flags */
> > >  };
> > >
> > > +#define PTRACE_SECCOMP_NEW_LISTENER  0x420e
> > > +
> > >  /* Read signals from a shared (process wide) queue */
> > >  #define PTRACE_PEEKSIGINFO_SHARED    (1 << 0)
> > >
> > > diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> > > index 21fec73d45d4..289960ac181b 100644
> > > --- a/kernel/ptrace.c
> > > +++ b/kernel/ptrace.c
> > > @@ -1096,6 +1096,10 @@ int ptrace_request(struct task_struct *child, long request,
> > >               ret = seccomp_get_metadata(child, addr, datavp);
> > >               break;
> > >
> > > +     case PTRACE_SECCOMP_NEW_LISTENER:
> > > +             ret = seccomp_new_listener(child, addr);
> > > +             break;
> > > +
> > >       default:
> > >               break;
> > >       }
> > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > > index 44a31ac8373a..17685803a2af 100644
> > > --- a/kernel/seccomp.c
> > > +++ b/kernel/seccomp.c
> > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task,
> > >
> > >       return ret;
> > >  }
> > > +
> > > +long seccomp_new_listener(struct task_struct *task,
> > > +                       unsigned long filter_off)
> > > +{
> > > +     struct seccomp_filter *filter;
> > > +     struct file *listener;
> > > +     int fd;
> > > +
> > > +     if (!capable(CAP_SYS_ADMIN))
> > > +             return -EACCES;
> >
> > I know this might have been discussed a while back but why exactly do we
> > require CAP_SYS_ADMIN in init_userns and not in the target userns? What
> > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and
> > use ptrace from in there?
> 
> See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
> . Basically, the problem is that this doesn't just give you capability
> over the target task, but also over every other task that has the same
> filter installed; you need some sort of "is the caller capable over
> the filter and anyone who uses it" check.

Thanks.
But then this new ptrace feature as it stands is imho currently broken.
If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you
are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself
if you are ns_cpabable(CAP_SYS_ADMIN) then either the new ptrace() api
extension should be fixed to allow for this too or the seccomp() way of
retrieving the pid - which I really think we want - needs to be fixed to
require capable(CAP_SYS_ADMIN) too.
The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho -
the preferred way to solve this.
Everything else will just be confusing.
Jann Horn via Containers Oct. 8, 2018, 4:42 p.m.
On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote:
> On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote:
> > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote:
> > >
> > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote:
> > > > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace()
> > > > version which can acquire filters is useful. There are at least two reasons
> > > > this is preferable, even though it uses ptrace:
> > > >
> > > > 1. You can control tasks that aren't cooperating with you
> > > > 2. You can control tasks whose filters block sendmsg() and socket(); if the
> > > >    task installs a filter which blocks these calls, there's no way with
> > > >    SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task.
> > >
> > > So for the slow of mind aka me:
> > > I'm not sure I completely understand this problem. Can you outline how
> > > sendmsg() and socket() are involved in this?
> > >
> > > I'm also not sure that this holds (but I might misunderstand the
> > > problem) afaict, you could do try to get the fd out via CLONE_FILES and
> > > other means so something like:
> > >
> > > // let's pretend the libc wrapper for clone actually has sane semantics
> > > pid = clone(CLONE_FILES);
> > > if (pid == 0) {
> > >         fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
> > >
> > >         // Now this fd will be valid in both parent and child.
> > >         // If you haven't blocked it you can inform the parent what
> > >         // the fd number is via pipe2(). If you have blocked it you can
> > >         // use dup2() and dup to a known fd number.
> > > }
> > >
> > > >
> > > > v2: fix a bug where listener mode was not unset when an unused fd was not
> > > >     available
> > > > v3: fix refcounting bug (Oleg)
> > > > v4: * change the listener's fd flags to be 0
> > > >     * rename GET_LISTENER to NEW_LISTENER (Matthew)
> > > > v5: * add capable(CAP_SYS_ADMIN) requirement
> > > > v7: * point the new listener at the right filter (Jann)
> > > >
> > > > Signed-off-by: Tycho Andersen <tycho@tycho.ws>
> > > > CC: Kees Cook <keescook@chromium.org>
> > > > CC: Andy Lutomirski <luto@amacapital.net>
> > > > CC: Oleg Nesterov <oleg@redhat.com>
> > > > CC: Eric W. Biederman <ebiederm@xmission.com>
> > > > CC: "Serge E. Hallyn" <serge@hallyn.com>
> > > > CC: Christian Brauner <christian.brauner@ubuntu.com>
> > > > CC: Tyler Hicks <tyhicks@canonical.com>
> > > > CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
> > > > ---
> > > >  include/linux/seccomp.h                       |  7 ++
> > > >  include/uapi/linux/ptrace.h                   |  2 +
> > > >  kernel/ptrace.c                               |  4 ++
> > > >  kernel/seccomp.c                              | 31 +++++++++
> > > >  tools/testing/selftests/seccomp/seccomp_bpf.c | 68 +++++++++++++++++++
> > > >  5 files changed, 112 insertions(+)
> > > >
> > > > diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> > > > index 017444b5efed..234c61b37405 100644
> > > > --- a/include/linux/seccomp.h
> > > > +++ b/include/linux/seccomp.h
> > > > @@ -83,6 +83,8 @@ static inline int seccomp_mode(struct seccomp *s)
> > > >  #ifdef CONFIG_SECCOMP_FILTER
> > > >  extern void put_seccomp_filter(struct task_struct *tsk);
> > > >  extern void get_seccomp_filter(struct task_struct *tsk);
> > > > +extern long seccomp_new_listener(struct task_struct *task,
> > > > +                              unsigned long filter_off);
> > > >  #else  /* CONFIG_SECCOMP_FILTER */
> > > >  static inline void put_seccomp_filter(struct task_struct *tsk)
> > > >  {
> > > > @@ -92,6 +94,11 @@ static inline void get_seccomp_filter(struct task_struct *tsk)
> > > >  {
> > > >       return;
> > > >  }
> > > > +static inline long seccomp_new_listener(struct task_struct *task,
> > > > +                                     unsigned long filter_off)
> > > > +{
> > > > +     return -EINVAL;
> > > > +}
> > > >  #endif /* CONFIG_SECCOMP_FILTER */
> > > >
> > > >  #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
> > > > diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
> > > > index d5a1b8a492b9..e80ecb1bd427 100644
> > > > --- a/include/uapi/linux/ptrace.h
> > > > +++ b/include/uapi/linux/ptrace.h
> > > > @@ -73,6 +73,8 @@ struct seccomp_metadata {
> > > >       __u64 flags;            /* Output: filter's flags */
> > > >  };
> > > >
> > > > +#define PTRACE_SECCOMP_NEW_LISTENER  0x420e
> > > > +
> > > >  /* Read signals from a shared (process wide) queue */
> > > >  #define PTRACE_PEEKSIGINFO_SHARED    (1 << 0)
> > > >
> > > > diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> > > > index 21fec73d45d4..289960ac181b 100644
> > > > --- a/kernel/ptrace.c
> > > > +++ b/kernel/ptrace.c
> > > > @@ -1096,6 +1096,10 @@ int ptrace_request(struct task_struct *child, long request,
> > > >               ret = seccomp_get_metadata(child, addr, datavp);
> > > >               break;
> > > >
> > > > +     case PTRACE_SECCOMP_NEW_LISTENER:
> > > > +             ret = seccomp_new_listener(child, addr);
> > > > +             break;
> > > > +
> > > >       default:
> > > >               break;
> > > >       }
> > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > > > index 44a31ac8373a..17685803a2af 100644
> > > > --- a/kernel/seccomp.c
> > > > +++ b/kernel/seccomp.c
> > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task,
> > > >
> > > >       return ret;
> > > >  }
> > > > +
> > > > +long seccomp_new_listener(struct task_struct *task,
> > > > +                       unsigned long filter_off)
> > > > +{
> > > > +     struct seccomp_filter *filter;
> > > > +     struct file *listener;
> > > > +     int fd;
> > > > +
> > > > +     if (!capable(CAP_SYS_ADMIN))
> > > > +             return -EACCES;
> > >
> > > I know this might have been discussed a while back but why exactly do we
> > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What
> > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and
> > > use ptrace from in there?
> >
> > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
> > . Basically, the problem is that this doesn't just give you capability
> > over the target task, but also over every other task that has the same
> > filter installed; you need some sort of "is the caller capable over
> > the filter and anyone who uses it" check.
>
> Thanks.
> But then this new ptrace feature as it stands is imho currently broken.
> If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you
> are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself
> if you are ns_cpabable(CAP_SYS_ADMIN) then either the new ptrace() api
> extension should be fixed to allow for this too or the seccomp() way of
> retrieving the pid - which I really think we want - needs to be fixed to
> require capable(CAP_SYS_ADMIN) too.
> The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho -
> the preferred way to solve this.
> Everything else will just be confusing.

First you say "broken", then you say "confusing". Which one do you mean?

Regarding requiring ns_capable() for ptrace: That means that you'll
have to stash namespace information in the seccomp filter. You'd also
potentially be eliding the LSM check that would normally have to occur
between the tracer and the tracee; but I guess that's probably fine?
CAP_SYS_ADMIN in the init namespace already has some abilities that
LSMs can't observe; you could argue that CAP_SYS_ADMIN in another
namespace should have similar semantics, but I'm not sure whether that
matches what the LSM people want as semantics.
Tycho Andersen Oct. 8, 2018, 6 p.m.
On Mon, Oct 08, 2018 at 05:16:30PM +0200, Christian Brauner wrote:
> On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote:
> > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace()
> > version which can acquire filters is useful. There are at least two reasons
> > this is preferable, even though it uses ptrace:
> > 
> > 1. You can control tasks that aren't cooperating with you
> > 2. You can control tasks whose filters block sendmsg() and socket(); if the
> >    task installs a filter which blocks these calls, there's no way with
> >    SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task.
> 
> So for the slow of mind aka me:
> I'm not sure I completely understand this problem. Can you outline how
> sendmsg() and socket() are involved in this?
> 
> I'm also not sure that this holds (but I might misunderstand the
> problem) afaict, you could do try to get the fd out via CLONE_FILES and
> other means so something like: 
> 
> // let's pretend the libc wrapper for clone actually has sane semantics
> pid = clone(CLONE_FILES);
> if (pid == 0) {
>         fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
> 
>         // Now this fd will be valid in both parent and child.
>         // If you haven't blocked it you can inform the parent what
>         // the fd number is via pipe2(). If you have blocked it you can
>         // use dup2() and dup to a known fd number.
> }

But what if your seccomp filter wants to block both pipe2() and
dup2()? Whatever syscall you want to use to do this could be blocked
by some seccomp policy, which means you might not be able to use this
feature in some cases.

Perhaps it's unlikely, and we can just go forward knowing this. But it
seems like it is worth at least acknowledging that you can wedge
yourself into a corner.

Tycho
Christian Brauner Oct. 8, 2018, 6:18 p.m.
On Mon, Oct 08, 2018 at 06:42:00PM +0200, Jann Horn wrote:
> On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote:
> > On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote:
> > > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote:
> > > >
> > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote:
> > > > > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace()
> > > > > version which can acquire filters is useful. There are at least two reasons
> > > > > this is preferable, even though it uses ptrace:
> > > > >
> > > > > 1. You can control tasks that aren't cooperating with you
> > > > > 2. You can control tasks whose filters block sendmsg() and socket(); if the
> > > > >    task installs a filter which blocks these calls, there's no way with
> > > > >    SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task.
> > > >
> > > > So for the slow of mind aka me:
> > > > I'm not sure I completely understand this problem. Can you outline how
> > > > sendmsg() and socket() are involved in this?
> > > >
> > > > I'm also not sure that this holds (but I might misunderstand the
> > > > problem) afaict, you could do try to get the fd out via CLONE_FILES and
> > > > other means so something like:
> > > >
> > > > // let's pretend the libc wrapper for clone actually has sane semantics
> > > > pid = clone(CLONE_FILES);
> > > > if (pid == 0) {
> > > >         fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
> > > >
> > > >         // Now this fd will be valid in both parent and child.
> > > >         // If you haven't blocked it you can inform the parent what
> > > >         // the fd number is via pipe2(). If you have blocked it you can
> > > >         // use dup2() and dup to a known fd number.
> > > > }
> > > >
> > > > >
> > > > > v2: fix a bug where listener mode was not unset when an unused fd was not
> > > > >     available
> > > > > v3: fix refcounting bug (Oleg)
> > > > > v4: * change the listener's fd flags to be 0
> > > > >     * rename GET_LISTENER to NEW_LISTENER (Matthew)
> > > > > v5: * add capable(CAP_SYS_ADMIN) requirement
> > > > > v7: * point the new listener at the right filter (Jann)
> > > > >
> > > > > Signed-off-by: Tycho Andersen <tycho@tycho.ws>
> > > > > CC: Kees Cook <keescook@chromium.org>
> > > > > CC: Andy Lutomirski <luto@amacapital.net>
> > > > > CC: Oleg Nesterov <oleg@redhat.com>
> > > > > CC: Eric W. Biederman <ebiederm@xmission.com>
> > > > > CC: "Serge E. Hallyn" <serge@hallyn.com>
> > > > > CC: Christian Brauner <christian.brauner@ubuntu.com>
> > > > > CC: Tyler Hicks <tyhicks@canonical.com>
> > > > > CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
> > > > > ---
> > > > >  include/linux/seccomp.h                       |  7 ++
> > > > >  include/uapi/linux/ptrace.h                   |  2 +
> > > > >  kernel/ptrace.c                               |  4 ++
> > > > >  kernel/seccomp.c                              | 31 +++++++++
> > > > >  tools/testing/selftests/seccomp/seccomp_bpf.c | 68 +++++++++++++++++++
> > > > >  5 files changed, 112 insertions(+)
> > > > >
> > > > > diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> > > > > index 017444b5efed..234c61b37405 100644
> > > > > --- a/include/linux/seccomp.h
> > > > > +++ b/include/linux/seccomp.h
> > > > > @@ -83,6 +83,8 @@ static inline int seccomp_mode(struct seccomp *s)
> > > > >  #ifdef CONFIG_SECCOMP_FILTER
> > > > >  extern void put_seccomp_filter(struct task_struct *tsk);
> > > > >  extern void get_seccomp_filter(struct task_struct *tsk);
> > > > > +extern long seccomp_new_listener(struct task_struct *task,
> > > > > +                              unsigned long filter_off);
> > > > >  #else  /* CONFIG_SECCOMP_FILTER */
> > > > >  static inline void put_seccomp_filter(struct task_struct *tsk)
> > > > >  {
> > > > > @@ -92,6 +94,11 @@ static inline void get_seccomp_filter(struct task_struct *tsk)
> > > > >  {
> > > > >       return;
> > > > >  }
> > > > > +static inline long seccomp_new_listener(struct task_struct *task,
> > > > > +                                     unsigned long filter_off)
> > > > > +{
> > > > > +     return -EINVAL;
> > > > > +}
> > > > >  #endif /* CONFIG_SECCOMP_FILTER */
> > > > >
> > > > >  #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
> > > > > diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
> > > > > index d5a1b8a492b9..e80ecb1bd427 100644
> > > > > --- a/include/uapi/linux/ptrace.h
> > > > > +++ b/include/uapi/linux/ptrace.h
> > > > > @@ -73,6 +73,8 @@ struct seccomp_metadata {
> > > > >       __u64 flags;            /* Output: filter's flags */
> > > > >  };
> > > > >
> > > > > +#define PTRACE_SECCOMP_NEW_LISTENER  0x420e
> > > > > +
> > > > >  /* Read signals from a shared (process wide) queue */
> > > > >  #define PTRACE_PEEKSIGINFO_SHARED    (1 << 0)
> > > > >
> > > > > diff --git a/kernel/ptrace.c b/kernel/ptrace.c
> > > > > index 21fec73d45d4..289960ac181b 100644
> > > > > --- a/kernel/ptrace.c
> > > > > +++ b/kernel/ptrace.c
> > > > > @@ -1096,6 +1096,10 @@ int ptrace_request(struct task_struct *child, long request,
> > > > >               ret = seccomp_get_metadata(child, addr, datavp);
> > > > >               break;
> > > > >
> > > > > +     case PTRACE_SECCOMP_NEW_LISTENER:
> > > > > +             ret = seccomp_new_listener(child, addr);
> > > > > +             break;
> > > > > +
> > > > >       default:
> > > > >               break;
> > > > >       }
> > > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > > > > index 44a31ac8373a..17685803a2af 100644
> > > > > --- a/kernel/seccomp.c
> > > > > +++ b/kernel/seccomp.c
> > > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task,
> > > > >
> > > > >       return ret;
> > > > >  }
> > > > > +
> > > > > +long seccomp_new_listener(struct task_struct *task,
> > > > > +                       unsigned long filter_off)
> > > > > +{
> > > > > +     struct seccomp_filter *filter;
> > > > > +     struct file *listener;
> > > > > +     int fd;
> > > > > +
> > > > > +     if (!capable(CAP_SYS_ADMIN))
> > > > > +             return -EACCES;
> > > >
> > > > I know this might have been discussed a while back but why exactly do we
> > > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What
> > > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and
> > > > use ptrace from in there?
> > >
> > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
> > > . Basically, the problem is that this doesn't just give you capability
> > > over the target task, but also over every other task that has the same
> > > filter installed; you need some sort of "is the caller capable over
> > > the filter and anyone who uses it" check.
> >
> > Thanks.
> > But then this new ptrace feature as it stands is imho currently broken.
> > If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you
> > are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself
> > if you are ns_cpabable(CAP_SYS_ADMIN) then either the new ptrace() api
> > extension should be fixed to allow for this too or the seccomp() way of
> > retrieving the pid - which I really think we want - needs to be fixed to
> > require capable(CAP_SYS_ADMIN) too.
> > The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho -
> > the preferred way to solve this.
> > Everything else will just be confusing.
> 
> First you say "broken", then you say "confusing". Which one do you mean?

Both. It's broken in so far as it places a seemingly unnecessary
restriction that could be fixed. You outlined one possible fix yourself
in the link you provided. And it's confusing in so far as there is a way
via seccomp() to get the fd without said requirement.

> 
> Regarding requiring ns_capable() for ptrace: That means that you'll
> have to stash namespace information in the seccomp filter. You'd also
> potentially be eliding the LSM check that would normally have to occur
> between the tracer and the tracee; but I guess that's probably fine?
> CAP_SYS_ADMIN in the init namespace already has some abilities that
> LSMs can't observe; you could argue that CAP_SYS_ADMIN in another
> namespace should have similar semantics, but I'm not sure whether that
> matches what the LSM people want as semantics.
Christian Brauner Oct. 8, 2018, 6:41 p.m.
On Mon, Oct 08, 2018 at 12:00:43PM -0600, Tycho Andersen wrote:
> On Mon, Oct 08, 2018 at 05:16:30PM +0200, Christian Brauner wrote:
> > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote:
> > > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace()
> > > version which can acquire filters is useful. There are at least two reasons
> > > this is preferable, even though it uses ptrace:
> > > 
> > > 1. You can control tasks that aren't cooperating with you
> > > 2. You can control tasks whose filters block sendmsg() and socket(); if the
> > >    task installs a filter which blocks these calls, there's no way with
> > >    SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task.
> > 
> > So for the slow of mind aka me:
> > I'm not sure I completely understand this problem. Can you outline how
> > sendmsg() and socket() are involved in this?
> > 
> > I'm also not sure that this holds (but I might misunderstand the
> > problem) afaict, you could do try to get the fd out via CLONE_FILES and
> > other means so something like: 
> > 
> > // let's pretend the libc wrapper for clone actually has sane semantics
> > pid = clone(CLONE_FILES);
> > if (pid == 0) {
> >         fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
> > 
> >         // Now this fd will be valid in both parent and child.
> >         // If you haven't blocked it you can inform the parent what
> >         // the fd number is via pipe2(). If you have blocked it you can
> >         // use dup2() and dup to a known fd number.
> > }
> 
> But what if your seccomp filter wants to block both pipe2() and
> dup2()? Whatever syscall you want to use to do this could be blocked

(Fwiw, setup shared memory before the clone(CLONE_FILES) and write the
fd in the shared memory. :))


> by some seccomp policy, which means you might not be able to use this
> feature in some cases.

> 
> Perhaps it's unlikely, and we can just go forward knowing this. But it
> seems like it is worth at least acknowledging that you can wedge
> yourself into a corner.

Sure, if you try really really hard to shoot yourself in the foot you'll
always be able to. From the top of my hat I'd say you can at least
probably filter the seccomp() syscall with the listener argument. Once
you've loaded the policy you're out of luck.
You also might be seccomp confided and not be able to use the ptrace()
syscall. AppArmor might prevent you from using ptrace()ing etc. pp.

So I think we really want both ways but the seccomp interface is way
cleaner. To get the fd from ptrace() you need three syscalls. With
seccomp() you need one.
Jann Horn via Containers Oct. 9, 2018, 12:39 p.m.
On Mon, Oct 8, 2018 at 8:18 PM Christian Brauner <christian@brauner.io> wrote:
> On Mon, Oct 08, 2018 at 06:42:00PM +0200, Jann Horn wrote:
> > On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote:
> > > On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote:
> > > > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote:
> > > > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > > > > > index 44a31ac8373a..17685803a2af 100644
> > > > > > --- a/kernel/seccomp.c
> > > > > > +++ b/kernel/seccomp.c
> > > > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task,
> > > > > >
> > > > > >       return ret;
> > > > > >  }
> > > > > > +
> > > > > > +long seccomp_new_listener(struct task_struct *task,
> > > > > > +                       unsigned long filter_off)
> > > > > > +{
> > > > > > +     struct seccomp_filter *filter;
> > > > > > +     struct file *listener;
> > > > > > +     int fd;
> > > > > > +
> > > > > > +     if (!capable(CAP_SYS_ADMIN))
> > > > > > +             return -EACCES;
> > > > >
> > > > > I know this might have been discussed a while back but why exactly do we
> > > > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What
> > > > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and
> > > > > use ptrace from in there?
> > > >
> > > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
> > > > . Basically, the problem is that this doesn't just give you capability
> > > > over the target task, but also over every other task that has the same
> > > > filter installed; you need some sort of "is the caller capable over
> > > > the filter and anyone who uses it" check.
> > >
> > > Thanks.
> > > But then this new ptrace feature as it stands is imho currently broken.
> > > If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you
> > > are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself
> > > if you are ns_cpabable(CAP_SYS_ADMIN)

Actually, you don't need CAP_SYS_ADMIN for seccomp() at all as long as
you enable the NNP flag, I think?

> > > then either the new ptrace() api
> > > extension should be fixed to allow for this too or the seccomp() way of
> > > retrieving the pid - which I really think we want - needs to be fixed to
> > > require capable(CAP_SYS_ADMIN) too.
> > > The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho -
> > > the preferred way to solve this.
> > > Everything else will just be confusing.
> >
> > First you say "broken", then you say "confusing". Which one do you mean?
>
> Both. It's broken in so far as it places a seemingly unnecessary
> restriction that could be fixed. You outlined one possible fix yourself
> in the link you provided.

If by "possible fix" you mean "check whether the seccomp filter is
only attached to a single task": That wouldn't fundamentally change
the situation, it would only add an additional special case.

> And it's confusing in so far as there is a way
> via seccomp() to get the fd without said requirement.

I don't find it confusing at all. seccomp() and ptrace() are very
different situations: When you use seccomp(), infrastructure is
already in place for ensuring that your filter is only applied to
processes over which you are capable, and propagation is limited by
inheritance from your task down. When you use ptrace(), you need a
pretty different sort of access check that checks whether you're
privileged over ancestors, siblings and so on of the target task.

But thinking about it more, I think that CAP_SYS_ADMIN over the saved
current->mm->user_ns of the task that installed the filter (stored as
a "struct user_namespace *" in the filter) should be acceptable.
Christian Brauner Oct. 9, 2018, 1:28 p.m.
On Tue, Oct 09, 2018 at 02:39:53PM +0200, Jann Horn wrote:
> On Mon, Oct 8, 2018 at 8:18 PM Christian Brauner <christian@brauner.io> wrote:
> > On Mon, Oct 08, 2018 at 06:42:00PM +0200, Jann Horn wrote:
> > > On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote:
> > > > On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote:
> > > > > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote:
> > > > > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > > > > > > index 44a31ac8373a..17685803a2af 100644
> > > > > > > --- a/kernel/seccomp.c
> > > > > > > +++ b/kernel/seccomp.c
> > > > > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task,
> > > > > > >
> > > > > > >       return ret;
> > > > > > >  }
> > > > > > > +
> > > > > > > +long seccomp_new_listener(struct task_struct *task,
> > > > > > > +                       unsigned long filter_off)
> > > > > > > +{
> > > > > > > +     struct seccomp_filter *filter;
> > > > > > > +     struct file *listener;
> > > > > > > +     int fd;
> > > > > > > +
> > > > > > > +     if (!capable(CAP_SYS_ADMIN))
> > > > > > > +             return -EACCES;
> > > > > >
> > > > > > I know this might have been discussed a while back but why exactly do we
> > > > > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What
> > > > > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and
> > > > > > use ptrace from in there?
> > > > >
> > > > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
> > > > > . Basically, the problem is that this doesn't just give you capability
> > > > > over the target task, but also over every other task that has the same
> > > > > filter installed; you need some sort of "is the caller capable over
> > > > > the filter and anyone who uses it" check.
> > > >
> > > > Thanks.
> > > > But then this new ptrace feature as it stands is imho currently broken.
> > > > If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you
> > > > are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself
> > > > if you are ns_cpabable(CAP_SYS_ADMIN)
> 
> Actually, you don't need CAP_SYS_ADMIN for seccomp() at all as long as
> you enable the NNP flag, I think?

Yes, if you turn on NNP you don't even need sys_admin.

> 
> > > > then either the new ptrace() api
> > > > extension should be fixed to allow for this too or the seccomp() way of
> > > > retrieving the pid - which I really think we want - needs to be fixed to
> > > > require capable(CAP_SYS_ADMIN) too.
> > > > The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho -
> > > > the preferred way to solve this.
> > > > Everything else will just be confusing.
> > >
> > > First you say "broken", then you say "confusing". Which one do you mean?
> >
> > Both. It's broken in so far as it places a seemingly unnecessary
> > restriction that could be fixed. You outlined one possible fix yourself
> > in the link you provided.
> 
> If by "possible fix" you mean "check whether the seccomp filter is
> only attached to a single task": That wouldn't fundamentally change
> the situation, it would only add an additional special case.
> 
> > And it's confusing in so far as there is a way
> > via seccomp() to get the fd without said requirement.
> 
> I don't find it confusing at all. seccomp() and ptrace() are very

Fine, then that's a matter of opinion. I find it counterintuitive that
you can get an fd without privileges via one interface but not via
another.

> different situations: When you use seccomp(), infrastructure is

Sure. Note, that this is _one_ of the reasons why I want to make sure we
keep the native seccomp() only based way of getting an fd without
forcing userspace to switching to a differnet kernel api.

> already in place for ensuring that your filter is only applied to
> processes over which you are capable, and propagation is limited by
> inheritance from your task down. When you use ptrace(), you need a
> pretty different sort of access check that checks whether you're
> privileged over ancestors, siblings and so on of the target task.

So, don't get me wrong I'm not arguing against the ptrace() interface in
general. If this is something that people find useful, fine. But, I
would like to have a simple single-syscall pure-seccomp() based way of
getting an fd, i.e. what we have in patch 1 of this series.

> 
> But thinking about it more, I think that CAP_SYS_ADMIN over the saved
> current->mm->user_ns of the task that installed the filter (stored as
> a "struct user_namespace *" in the filter) should be acceptable.

Hm... Why not CAP_SYS_PTRACE?


One more thing. Citing from [1] 

> I think there's a security problem here. Imagine the following scenario:
> 
> 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF
> 2. task A forks off a child B
> 3. task B uses setuid(1) to drop its privileges
> 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1)
> or via execve()
> 5. task C (the attacker, uid==1) attaches to task B via ptrace
> 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B

Sorry, to be late to the party but would this really pass
__ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me
that it would... Doesn't look like it would get past:

 	tcred = __task_cred(task);
	if (uid_eq(caller_uid, tcred->euid) &&
	    uid_eq(caller_uid, tcred->suid) &&
	    uid_eq(caller_uid, tcred->uid)  &&
	    gid_eq(caller_gid, tcred->egid) &&
	    gid_eq(caller_gid, tcred->sgid) &&
	    gid_eq(caller_gid, tcred->gid))
		goto ok;
	if (ptrace_has_cap(tcred->user_ns, mode))
		goto ok;
	rcu_read_unlock();
	return -EPERM;
ok:
	rcu_read_unlock();
	mm = task->mm;
	if (mm &&
	    ((get_dumpable(mm) != SUID_DUMP_USER) &&
	     !ptrace_has_cap(mm->user_ns, mode)))
	    return -EPERM;

> 7. because the seccomp filter is shared by task A and task B, task C
> is now able to influence syscall results for syscalls performed by
> task A

[1]: https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
Jann Horn via Containers Oct. 9, 2018, 1:36 p.m.
+cc selinux people explicitly, since they probably have opinions on this

On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote:
> On Tue, Oct 09, 2018 at 02:39:53PM +0200, Jann Horn wrote:
> > On Mon, Oct 8, 2018 at 8:18 PM Christian Brauner <christian@brauner.io> wrote:
> > > On Mon, Oct 08, 2018 at 06:42:00PM +0200, Jann Horn wrote:
> > > > On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote:
> > > > > > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote:
> > > > > > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > > > > > > > index 44a31ac8373a..17685803a2af 100644
> > > > > > > > --- a/kernel/seccomp.c
> > > > > > > > +++ b/kernel/seccomp.c
> > > > > > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task,
> > > > > > > >
> > > > > > > >       return ret;
> > > > > > > >  }
> > > > > > > > +
> > > > > > > > +long seccomp_new_listener(struct task_struct *task,
> > > > > > > > +                       unsigned long filter_off)
> > > > > > > > +{
> > > > > > > > +     struct seccomp_filter *filter;
> > > > > > > > +     struct file *listener;
> > > > > > > > +     int fd;
> > > > > > > > +
> > > > > > > > +     if (!capable(CAP_SYS_ADMIN))
> > > > > > > > +             return -EACCES;
> > > > > > >
> > > > > > > I know this might have been discussed a while back but why exactly do we
> > > > > > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What
> > > > > > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and
> > > > > > > use ptrace from in there?
> > > > > >
> > > > > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
> > > > > > . Basically, the problem is that this doesn't just give you capability
> > > > > > over the target task, but also over every other task that has the same
> > > > > > filter installed; you need some sort of "is the caller capable over
> > > > > > the filter and anyone who uses it" check.
> > > > >
> > > > > Thanks.
> > > > > But then this new ptrace feature as it stands is imho currently broken.
> > > > > If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you
> > > > > are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself
> > > > > if you are ns_cpabable(CAP_SYS_ADMIN)
> >
> > Actually, you don't need CAP_SYS_ADMIN for seccomp() at all as long as
> > you enable the NNP flag, I think?
>
> Yes, if you turn on NNP you don't even need sys_admin.
>
> >
> > > > > then either the new ptrace() api
> > > > > extension should be fixed to allow for this too or the seccomp() way of
> > > > > retrieving the pid - which I really think we want - needs to be fixed to
> > > > > require capable(CAP_SYS_ADMIN) too.
> > > > > The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho -
> > > > > the preferred way to solve this.
> > > > > Everything else will just be confusing.
> > > >
> > > > First you say "broken", then you say "confusing". Which one do you mean?
> > >
> > > Both. It's broken in so far as it places a seemingly unnecessary
> > > restriction that could be fixed. You outlined one possible fix yourself
> > > in the link you provided.
> >
> > If by "possible fix" you mean "check whether the seccomp filter is
> > only attached to a single task": That wouldn't fundamentally change
> > the situation, it would only add an additional special case.
> >
> > > And it's confusing in so far as there is a way
> > > via seccomp() to get the fd without said requirement.
> >
> > I don't find it confusing at all. seccomp() and ptrace() are very
>
> Fine, then that's a matter of opinion. I find it counterintuitive that
> you can get an fd without privileges via one interface but not via
> another.
>
> > different situations: When you use seccomp(), infrastructure is
>
> Sure. Note, that this is _one_ of the reasons why I want to make sure we
> keep the native seccomp() only based way of getting an fd without
> forcing userspace to switching to a differnet kernel api.
>
> > already in place for ensuring that your filter is only applied to
> > processes over which you are capable, and propagation is limited by
> > inheritance from your task down. When you use ptrace(), you need a
> > pretty different sort of access check that checks whether you're
> > privileged over ancestors, siblings and so on of the target task.
>
> So, don't get me wrong I'm not arguing against the ptrace() interface in
> general. If this is something that people find useful, fine. But, I
> would like to have a simple single-syscall pure-seccomp() based way of
> getting an fd, i.e. what we have in patch 1 of this series.

Yeah, I also prefer the seccomp() one.

> > But thinking about it more, I think that CAP_SYS_ADMIN over the saved
> > current->mm->user_ns of the task that installed the filter (stored as
> > a "struct user_namespace *" in the filter) should be acceptable.
>
> Hm... Why not CAP_SYS_PTRACE?

Because LSMs like SELinux add extra checks that apply even if you have
CAP_SYS_PTRACE, and this would subvert those. The only capability I
know of that lets you bypass LSM checks by design (if no LSM blocks
the capability itself) is CAP_SYS_ADMIN.

> One more thing. Citing from [1]
>
> > I think there's a security problem here. Imagine the following scenario:
> >
> > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF
> > 2. task A forks off a child B
> > 3. task B uses setuid(1) to drop its privileges
> > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1)
> > or via execve()
> > 5. task C (the attacker, uid==1) attaches to task B via ptrace
> > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B
>
> Sorry, to be late to the party but would this really pass
> __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me
> that it would... Doesn't look like it would get past:
>
>         tcred = __task_cred(task);
>         if (uid_eq(caller_uid, tcred->euid) &&
>             uid_eq(caller_uid, tcred->suid) &&
>             uid_eq(caller_uid, tcred->uid)  &&
>             gid_eq(caller_gid, tcred->egid) &&
>             gid_eq(caller_gid, tcred->sgid) &&
>             gid_eq(caller_gid, tcred->gid))
>                 goto ok;
>         if (ptrace_has_cap(tcred->user_ns, mode))
>                 goto ok;
>         rcu_read_unlock();
>         return -EPERM;
> ok:
>         rcu_read_unlock();
>         mm = task->mm;
>         if (mm &&
>             ((get_dumpable(mm) != SUID_DUMP_USER) &&
>              !ptrace_has_cap(mm->user_ns, mode)))
>             return -EPERM;

Which specific check would prevent task C from attaching to task B? If
the UIDs match, the first "goto ok" executes; and you're dumpable, so
you don't trigger the second "return -EPERM".

> > 7. because the seccomp filter is shared by task A and task B, task C
> > is now able to influence syscall results for syscalls performed by
> > task A
>
> [1]: https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
Christian Brauner Oct. 9, 2018, 1:49 p.m.
On Tue, Oct 09, 2018 at 03:36:04PM +0200, Jann Horn wrote:
> +cc selinux people explicitly, since they probably have opinions on this
> 
> On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote:
> > On Tue, Oct 09, 2018 at 02:39:53PM +0200, Jann Horn wrote:
> > > On Mon, Oct 8, 2018 at 8:18 PM Christian Brauner <christian@brauner.io> wrote:
> > > > On Mon, Oct 08, 2018 at 06:42:00PM +0200, Jann Horn wrote:
> > > > > On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote:
> > > > > > > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote:
> > > > > > > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > > > > > > > > index 44a31ac8373a..17685803a2af 100644
> > > > > > > > > --- a/kernel/seccomp.c
> > > > > > > > > +++ b/kernel/seccomp.c
> > > > > > > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task,
> > > > > > > > >
> > > > > > > > >       return ret;
> > > > > > > > >  }
> > > > > > > > > +
> > > > > > > > > +long seccomp_new_listener(struct task_struct *task,
> > > > > > > > > +                       unsigned long filter_off)
> > > > > > > > > +{
> > > > > > > > > +     struct seccomp_filter *filter;
> > > > > > > > > +     struct file *listener;
> > > > > > > > > +     int fd;
> > > > > > > > > +
> > > > > > > > > +     if (!capable(CAP_SYS_ADMIN))
> > > > > > > > > +             return -EACCES;
> > > > > > > >
> > > > > > > > I know this might have been discussed a while back but why exactly do we
> > > > > > > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What
> > > > > > > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and
> > > > > > > > use ptrace from in there?
> > > > > > >
> > > > > > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
> > > > > > > . Basically, the problem is that this doesn't just give you capability
> > > > > > > over the target task, but also over every other task that has the same
> > > > > > > filter installed; you need some sort of "is the caller capable over
> > > > > > > the filter and anyone who uses it" check.
> > > > > >
> > > > > > Thanks.
> > > > > > But then this new ptrace feature as it stands is imho currently broken.
> > > > > > If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you
> > > > > > are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself
> > > > > > if you are ns_cpabable(CAP_SYS_ADMIN)
> > >
> > > Actually, you don't need CAP_SYS_ADMIN for seccomp() at all as long as
> > > you enable the NNP flag, I think?
> >
> > Yes, if you turn on NNP you don't even need sys_admin.
> >
> > >
> > > > > > then either the new ptrace() api
> > > > > > extension should be fixed to allow for this too or the seccomp() way of
> > > > > > retrieving the pid - which I really think we want - needs to be fixed to
> > > > > > require capable(CAP_SYS_ADMIN) too.
> > > > > > The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho -
> > > > > > the preferred way to solve this.
> > > > > > Everything else will just be confusing.
> > > > >
> > > > > First you say "broken", then you say "confusing". Which one do you mean?
> > > >
> > > > Both. It's broken in so far as it places a seemingly unnecessary
> > > > restriction that could be fixed. You outlined one possible fix yourself
> > > > in the link you provided.
> > >
> > > If by "possible fix" you mean "check whether the seccomp filter is
> > > only attached to a single task": That wouldn't fundamentally change
> > > the situation, it would only add an additional special case.
> > >
> > > > And it's confusing in so far as there is a way
> > > > via seccomp() to get the fd without said requirement.
> > >
> > > I don't find it confusing at all. seccomp() and ptrace() are very
> >
> > Fine, then that's a matter of opinion. I find it counterintuitive that
> > you can get an fd without privileges via one interface but not via
> > another.
> >
> > > different situations: When you use seccomp(), infrastructure is
> >
> > Sure. Note, that this is _one_ of the reasons why I want to make sure we
> > keep the native seccomp() only based way of getting an fd without
> > forcing userspace to switching to a differnet kernel api.
> >
> > > already in place for ensuring that your filter is only applied to
> > > processes over which you are capable, and propagation is limited by
> > > inheritance from your task down. When you use ptrace(), you need a
> > > pretty different sort of access check that checks whether you're
> > > privileged over ancestors, siblings and so on of the target task.
> >
> > So, don't get me wrong I'm not arguing against the ptrace() interface in
> > general. If this is something that people find useful, fine. But, I
> > would like to have a simple single-syscall pure-seccomp() based way of
> > getting an fd, i.e. what we have in patch 1 of this series.
> 
> Yeah, I also prefer the seccomp() one.
> 
> > > But thinking about it more, I think that CAP_SYS_ADMIN over the saved
> > > current->mm->user_ns of the task that installed the filter (stored as
> > > a "struct user_namespace *" in the filter) should be acceptable.
> >
> > Hm... Why not CAP_SYS_PTRACE?
> 
> Because LSMs like SELinux add extra checks that apply even if you have
> CAP_SYS_PTRACE, and this would subvert those. The only capability I
> know of that lets you bypass LSM checks by design (if no LSM blocks
> the capability itself) is CAP_SYS_ADMIN.
> 
> > One more thing. Citing from [1]
> >
> > > I think there's a security problem here. Imagine the following scenario:
> > >
> > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF
> > > 2. task A forks off a child B
> > > 3. task B uses setuid(1) to drop its privileges
> > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1)
> > > or via execve()
> > > 5. task C (the attacker, uid==1) attaches to task B via ptrace
> > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B
> >
> > Sorry, to be late to the party but would this really pass
> > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me
> > that it would... Doesn't look like it would get past:
> >
> >         tcred = __task_cred(task);
> >         if (uid_eq(caller_uid, tcred->euid) &&
> >             uid_eq(caller_uid, tcred->suid) &&
> >             uid_eq(caller_uid, tcred->uid)  &&
> >             gid_eq(caller_gid, tcred->egid) &&
> >             gid_eq(caller_gid, tcred->sgid) &&
> >             gid_eq(caller_gid, tcred->gid))
> >                 goto ok;
> >         if (ptrace_has_cap(tcred->user_ns, mode))
> >                 goto ok;
> >         rcu_read_unlock();
> >         return -EPERM;
> > ok:
> >         rcu_read_unlock();
> >         mm = task->mm;
> >         if (mm &&
> >             ((get_dumpable(mm) != SUID_DUMP_USER) &&
> >              !ptrace_has_cap(mm->user_ns, mode)))
> >             return -EPERM;
> 
> Which specific check would prevent task C from attaching to task B? If
> the UIDs match, the first "goto ok" executes; and you're dumpable, so
> you don't trigger the second "return -EPERM".

You'd also need CAP_SYS_PTRACE in the mm->user_ns which you shouldn't
have if you did a setuid to an unpriv user. (But I always find that code
confusing.)

> 
> > > 7. because the seccomp filter is shared by task A and task B, task C
> > > is now able to influence syscall results for syscalls performed by
> > > task A
> >
> > [1]: https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
Jann Horn via Containers Oct. 9, 2018, 1:50 p.m.
On Tue, Oct 9, 2018 at 3:49 PM Christian Brauner <christian@brauner.io> wrote:
>
> On Tue, Oct 09, 2018 at 03:36:04PM +0200, Jann Horn wrote:
> > +cc selinux people explicitly, since they probably have opinions on this
> >
> > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote:
> > > On Tue, Oct 09, 2018 at 02:39:53PM +0200, Jann Horn wrote:
> > > > On Mon, Oct 8, 2018 at 8:18 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > On Mon, Oct 08, 2018 at 06:42:00PM +0200, Jann Horn wrote:
> > > > > > On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > > On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote:
> > > > > > > > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote:
> > > > > > > > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > > > > > > > > > index 44a31ac8373a..17685803a2af 100644
> > > > > > > > > > --- a/kernel/seccomp.c
> > > > > > > > > > +++ b/kernel/seccomp.c
> > > > > > > > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task,
> > > > > > > > > >
> > > > > > > > > >       return ret;
> > > > > > > > > >  }
> > > > > > > > > > +
> > > > > > > > > > +long seccomp_new_listener(struct task_struct *task,
> > > > > > > > > > +                       unsigned long filter_off)
> > > > > > > > > > +{
> > > > > > > > > > +     struct seccomp_filter *filter;
> > > > > > > > > > +     struct file *listener;
> > > > > > > > > > +     int fd;
> > > > > > > > > > +
> > > > > > > > > > +     if (!capable(CAP_SYS_ADMIN))
> > > > > > > > > > +             return -EACCES;
> > > > > > > > >
> > > > > > > > > I know this might have been discussed a while back but why exactly do we
> > > > > > > > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What
> > > > > > > > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and
> > > > > > > > > use ptrace from in there?
> > > > > > > >
> > > > > > > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
> > > > > > > > . Basically, the problem is that this doesn't just give you capability
> > > > > > > > over the target task, but also over every other task that has the same
> > > > > > > > filter installed; you need some sort of "is the caller capable over
> > > > > > > > the filter and anyone who uses it" check.
> > > > > > >
> > > > > > > Thanks.
> > > > > > > But then this new ptrace feature as it stands is imho currently broken.
> > > > > > > If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you
> > > > > > > are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself
> > > > > > > if you are ns_cpabable(CAP_SYS_ADMIN)
> > > >
> > > > Actually, you don't need CAP_SYS_ADMIN for seccomp() at all as long as
> > > > you enable the NNP flag, I think?
> > >
> > > Yes, if you turn on NNP you don't even need sys_admin.
> > >
> > > >
> > > > > > > then either the new ptrace() api
> > > > > > > extension should be fixed to allow for this too or the seccomp() way of
> > > > > > > retrieving the pid - which I really think we want - needs to be fixed to
> > > > > > > require capable(CAP_SYS_ADMIN) too.
> > > > > > > The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho -
> > > > > > > the preferred way to solve this.
> > > > > > > Everything else will just be confusing.
> > > > > >
> > > > > > First you say "broken", then you say "confusing". Which one do you mean?
> > > > >
> > > > > Both. It's broken in so far as it places a seemingly unnecessary
> > > > > restriction that could be fixed. You outlined one possible fix yourself
> > > > > in the link you provided.
> > > >
> > > > If by "possible fix" you mean "check whether the seccomp filter is
> > > > only attached to a single task": That wouldn't fundamentally change
> > > > the situation, it would only add an additional special case.
> > > >
> > > > > And it's confusing in so far as there is a way
> > > > > via seccomp() to get the fd without said requirement.
> > > >
> > > > I don't find it confusing at all. seccomp() and ptrace() are very
> > >
> > > Fine, then that's a matter of opinion. I find it counterintuitive that
> > > you can get an fd without privileges via one interface but not via
> > > another.
> > >
> > > > different situations: When you use seccomp(), infrastructure is
> > >
> > > Sure. Note, that this is _one_ of the reasons why I want to make sure we
> > > keep the native seccomp() only based way of getting an fd without
> > > forcing userspace to switching to a differnet kernel api.
> > >
> > > > already in place for ensuring that your filter is only applied to
> > > > processes over which you are capable, and propagation is limited by
> > > > inheritance from your task down. When you use ptrace(), you need a
> > > > pretty different sort of access check that checks whether you're
> > > > privileged over ancestors, siblings and so on of the target task.
> > >
> > > So, don't get me wrong I'm not arguing against the ptrace() interface in
> > > general. If this is something that people find useful, fine. But, I
> > > would like to have a simple single-syscall pure-seccomp() based way of
> > > getting an fd, i.e. what we have in patch 1 of this series.
> >
> > Yeah, I also prefer the seccomp() one.
> >
> > > > But thinking about it more, I think that CAP_SYS_ADMIN over the saved
> > > > current->mm->user_ns of the task that installed the filter (stored as
> > > > a "struct user_namespace *" in the filter) should be acceptable.
> > >
> > > Hm... Why not CAP_SYS_PTRACE?
> >
> > Because LSMs like SELinux add extra checks that apply even if you have
> > CAP_SYS_PTRACE, and this would subvert those. The only capability I
> > know of that lets you bypass LSM checks by design (if no LSM blocks
> > the capability itself) is CAP_SYS_ADMIN.
> >
> > > One more thing. Citing from [1]
> > >
> > > > I think there's a security problem here. Imagine the following scenario:
> > > >
> > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF
> > > > 2. task A forks off a child B
> > > > 3. task B uses setuid(1) to drop its privileges
> > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1)
> > > > or via execve()
> > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace
> > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B
> > >
> > > Sorry, to be late to the party but would this really pass
> > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me
> > > that it would... Doesn't look like it would get past:
> > >
> > >         tcred = __task_cred(task);
> > >         if (uid_eq(caller_uid, tcred->euid) &&
> > >             uid_eq(caller_uid, tcred->suid) &&
> > >             uid_eq(caller_uid, tcred->uid)  &&
> > >             gid_eq(caller_gid, tcred->egid) &&
> > >             gid_eq(caller_gid, tcred->sgid) &&
> > >             gid_eq(caller_gid, tcred->gid))
> > >                 goto ok;
> > >         if (ptrace_has_cap(tcred->user_ns, mode))
> > >                 goto ok;
> > >         rcu_read_unlock();
> > >         return -EPERM;
> > > ok:
> > >         rcu_read_unlock();
> > >         mm = task->mm;
> > >         if (mm &&
> > >             ((get_dumpable(mm) != SUID_DUMP_USER) &&
> > >              !ptrace_has_cap(mm->user_ns, mode)))
> > >             return -EPERM;
> >
> > Which specific check would prevent task C from attaching to task B? If
> > the UIDs match, the first "goto ok" executes; and you're dumpable, so
> > you don't trigger the second "return -EPERM".
>
> You'd also need CAP_SYS_PTRACE in the mm->user_ns which you shouldn't
> have if you did a setuid to an unpriv user. (But I always find that code
> confusing.)

Only if the target hasn't gone through execve() since setuid().
Christian Brauner Oct. 9, 2018, 2:09 p.m.
On Tue, Oct 09, 2018 at 03:50:53PM +0200, Jann Horn wrote:
> On Tue, Oct 9, 2018 at 3:49 PM Christian Brauner <christian@brauner.io> wrote:
> >
> > On Tue, Oct 09, 2018 at 03:36:04PM +0200, Jann Horn wrote:
> > > +cc selinux people explicitly, since they probably have opinions on this
> > >
> > > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote:
> > > > On Tue, Oct 09, 2018 at 02:39:53PM +0200, Jann Horn wrote:
> > > > > On Mon, Oct 8, 2018 at 8:18 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > On Mon, Oct 08, 2018 at 06:42:00PM +0200, Jann Horn wrote:
> > > > > > > On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > > > On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote:
> > > > > > > > > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > > > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote:
> > > > > > > > > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > > > > > > > > > > index 44a31ac8373a..17685803a2af 100644
> > > > > > > > > > > --- a/kernel/seccomp.c
> > > > > > > > > > > +++ b/kernel/seccomp.c
> > > > > > > > > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task,
> > > > > > > > > > >
> > > > > > > > > > >       return ret;
> > > > > > > > > > >  }
> > > > > > > > > > > +
> > > > > > > > > > > +long seccomp_new_listener(struct task_struct *task,
> > > > > > > > > > > +                       unsigned long filter_off)
> > > > > > > > > > > +{
> > > > > > > > > > > +     struct seccomp_filter *filter;
> > > > > > > > > > > +     struct file *listener;
> > > > > > > > > > > +     int fd;
> > > > > > > > > > > +
> > > > > > > > > > > +     if (!capable(CAP_SYS_ADMIN))
> > > > > > > > > > > +             return -EACCES;
> > > > > > > > > >
> > > > > > > > > > I know this might have been discussed a while back but why exactly do we
> > > > > > > > > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What
> > > > > > > > > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and
> > > > > > > > > > use ptrace from in there?
> > > > > > > > >
> > > > > > > > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
> > > > > > > > > . Basically, the problem is that this doesn't just give you capability
> > > > > > > > > over the target task, but also over every other task that has the same
> > > > > > > > > filter installed; you need some sort of "is the caller capable over
> > > > > > > > > the filter and anyone who uses it" check.
> > > > > > > >
> > > > > > > > Thanks.
> > > > > > > > But then this new ptrace feature as it stands is imho currently broken.
> > > > > > > > If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you
> > > > > > > > are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself
> > > > > > > > if you are ns_cpabable(CAP_SYS_ADMIN)
> > > > >
> > > > > Actually, you don't need CAP_SYS_ADMIN for seccomp() at all as long as
> > > > > you enable the NNP flag, I think?
> > > >
> > > > Yes, if you turn on NNP you don't even need sys_admin.
> > > >
> > > > >
> > > > > > > > then either the new ptrace() api
> > > > > > > > extension should be fixed to allow for this too or the seccomp() way of
> > > > > > > > retrieving the pid - which I really think we want - needs to be fixed to
> > > > > > > > require capable(CAP_SYS_ADMIN) too.
> > > > > > > > The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho -
> > > > > > > > the preferred way to solve this.
> > > > > > > > Everything else will just be confusing.
> > > > > > >
> > > > > > > First you say "broken", then you say "confusing". Which one do you mean?
> > > > > >
> > > > > > Both. It's broken in so far as it places a seemingly unnecessary
> > > > > > restriction that could be fixed. You outlined one possible fix yourself
> > > > > > in the link you provided.
> > > > >
> > > > > If by "possible fix" you mean "check whether the seccomp filter is
> > > > > only attached to a single task": That wouldn't fundamentally change
> > > > > the situation, it would only add an additional special case.
> > > > >
> > > > > > And it's confusing in so far as there is a way
> > > > > > via seccomp() to get the fd without said requirement.
> > > > >
> > > > > I don't find it confusing at all. seccomp() and ptrace() are very
> > > >
> > > > Fine, then that's a matter of opinion. I find it counterintuitive that
> > > > you can get an fd without privileges via one interface but not via
> > > > another.
> > > >
> > > > > different situations: When you use seccomp(), infrastructure is
> > > >
> > > > Sure. Note, that this is _one_ of the reasons why I want to make sure we
> > > > keep the native seccomp() only based way of getting an fd without
> > > > forcing userspace to switching to a differnet kernel api.
> > > >
> > > > > already in place for ensuring that your filter is only applied to
> > > > > processes over which you are capable, and propagation is limited by
> > > > > inheritance from your task down. When you use ptrace(), you need a
> > > > > pretty different sort of access check that checks whether you're
> > > > > privileged over ancestors, siblings and so on of the target task.
> > > >
> > > > So, don't get me wrong I'm not arguing against the ptrace() interface in
> > > > general. If this is something that people find useful, fine. But, I
> > > > would like to have a simple single-syscall pure-seccomp() based way of
> > > > getting an fd, i.e. what we have in patch 1 of this series.
> > >
> > > Yeah, I also prefer the seccomp() one.
> > >
> > > > > But thinking about it more, I think that CAP_SYS_ADMIN over the saved
> > > > > current->mm->user_ns of the task that installed the filter (stored as
> > > > > a "struct user_namespace *" in the filter) should be acceptable.
> > > >
> > > > Hm... Why not CAP_SYS_PTRACE?
> > >
> > > Because LSMs like SELinux add extra checks that apply even if you have
> > > CAP_SYS_PTRACE, and this would subvert those. The only capability I
> > > know of that lets you bypass LSM checks by design (if no LSM blocks
> > > the capability itself) is CAP_SYS_ADMIN.
> > >
> > > > One more thing. Citing from [1]
> > > >
> > > > > I think there's a security problem here. Imagine the following scenario:
> > > > >
> > > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF
> > > > > 2. task A forks off a child B
> > > > > 3. task B uses setuid(1) to drop its privileges
> > > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1)
> > > > > or via execve()
> > > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace
> > > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B
> > > >
> > > > Sorry, to be late to the party but would this really pass
> > > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me
> > > > that it would... Doesn't look like it would get past:
> > > >
> > > >         tcred = __task_cred(task);
> > > >         if (uid_eq(caller_uid, tcred->euid) &&
> > > >             uid_eq(caller_uid, tcred->suid) &&
> > > >             uid_eq(caller_uid, tcred->uid)  &&
> > > >             gid_eq(caller_gid, tcred->egid) &&
> > > >             gid_eq(caller_gid, tcred->sgid) &&
> > > >             gid_eq(caller_gid, tcred->gid))
> > > >                 goto ok;
> > > >         if (ptrace_has_cap(tcred->user_ns, mode))
> > > >                 goto ok;
> > > >         rcu_read_unlock();
> > > >         return -EPERM;
> > > > ok:
> > > >         rcu_read_unlock();
> > > >         mm = task->mm;
> > > >         if (mm &&
> > > >             ((get_dumpable(mm) != SUID_DUMP_USER) &&
> > > >              !ptrace_has_cap(mm->user_ns, mode)))
> > > >             return -EPERM;
> > >
> > > Which specific check would prevent task C from attaching to task B? If
> > > the UIDs match, the first "goto ok" executes; and you're dumpable, so
> > > you don't trigger the second "return -EPERM".
> >
> > You'd also need CAP_SYS_PTRACE in the mm->user_ns which you shouldn't
> > have if you did a setuid to an unpriv user. (But I always find that code
> > confusing.)
> 
> Only if the target hasn't gone through execve() since setuid().

Sorry if I want to know this in excessive detail but I'd like to
understand this properly so bear with me :)
- If task B has setuid()ed and prctl(PR_SET_DUMPABLE, 1)ed but not
  execve()ed then C won't pass ptrace_has_cap(mm->user_ns, mode).
- If task B has setuid()ed, exeved()ed it will get its dumpable flag set
  to /proc/sys/fs/suid_dumpable which by default is 0. So C won't pass
  (get_dumpable(mm) != SUID_DUMP_USER).
In both cases PTRACE_ATTACH shouldn't work. Now, if
/proc/sys/fs/suid_dumpable is 1 I'd find it acceptable for this to work.
This is an administrator choice.
Jann Horn via Containers Oct. 9, 2018, 3:26 p.m.
On Tue, Oct 9, 2018 at 4:09 PM Christian Brauner <christian@brauner.io> wrote:
> On Tue, Oct 09, 2018 at 03:50:53PM +0200, Jann Horn wrote:
> > On Tue, Oct 9, 2018 at 3:49 PM Christian Brauner <christian@brauner.io> wrote:
> > > On Tue, Oct 09, 2018 at 03:36:04PM +0200, Jann Horn wrote:
> > > > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > One more thing. Citing from [1]
> > > > >
> > > > > > I think there's a security problem here. Imagine the following scenario:
> > > > > >
> > > > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF
> > > > > > 2. task A forks off a child B
> > > > > > 3. task B uses setuid(1) to drop its privileges
> > > > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1)
> > > > > > or via execve()
> > > > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace
> > > > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B
> > > > >
> > > > > Sorry, to be late to the party but would this really pass
> > > > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me
> > > > > that it would... Doesn't look like it would get past:
> > > > >
> > > > >         tcred = __task_cred(task);
> > > > >         if (uid_eq(caller_uid, tcred->euid) &&
> > > > >             uid_eq(caller_uid, tcred->suid) &&
> > > > >             uid_eq(caller_uid, tcred->uid)  &&
> > > > >             gid_eq(caller_gid, tcred->egid) &&
> > > > >             gid_eq(caller_gid, tcred->sgid) &&
> > > > >             gid_eq(caller_gid, tcred->gid))
> > > > >                 goto ok;
> > > > >         if (ptrace_has_cap(tcred->user_ns, mode))
> > > > >                 goto ok;
> > > > >         rcu_read_unlock();
> > > > >         return -EPERM;
> > > > > ok:
> > > > >         rcu_read_unlock();
> > > > >         mm = task->mm;
> > > > >         if (mm &&
> > > > >             ((get_dumpable(mm) != SUID_DUMP_USER) &&
> > > > >              !ptrace_has_cap(mm->user_ns, mode)))
> > > > >             return -EPERM;
> > > >
> > > > Which specific check would prevent task C from attaching to task B? If
> > > > the UIDs match, the first "goto ok" executes; and you're dumpable, so
> > > > you don't trigger the second "return -EPERM".
> > >
> > > You'd also need CAP_SYS_PTRACE in the mm->user_ns which you shouldn't
> > > have if you did a setuid to an unpriv user. (But I always find that code
> > > confusing.)
> >
> > Only if the target hasn't gone through execve() since setuid().
>
> Sorry if I want to know this in excessive detail but I'd like to
> understand this properly so bear with me :)
> - If task B has setuid()ed and prctl(PR_SET_DUMPABLE, 1)ed but not
>   execve()ed then C won't pass ptrace_has_cap(mm->user_ns, mode).

Yeah.

> - If task B has setuid()ed, exeved()ed it will get its dumpable flag set
>   to /proc/sys/fs/suid_dumpable

Not if you changed all UIDs (e.g. by calling setuid() as root). In
that case, setup_new_exec() calls "set_dumpable(current->mm,
SUID_DUMP_USER)".

> which by default is 0. So C won't pass
>   (get_dumpable(mm) != SUID_DUMP_USER).
> In both cases PTRACE_ATTACH shouldn't work. Now, if
> /proc/sys/fs/suid_dumpable is 1 I'd find it acceptable for this to work.
> This is an administrator choice.
Christian Brauner Oct. 9, 2018, 4:20 p.m.
On Tue, Oct 09, 2018 at 05:26:26PM +0200, Jann Horn wrote:
> On Tue, Oct 9, 2018 at 4:09 PM Christian Brauner <christian@brauner.io> wrote:
> > On Tue, Oct 09, 2018 at 03:50:53PM +0200, Jann Horn wrote:
> > > On Tue, Oct 9, 2018 at 3:49 PM Christian Brauner <christian@brauner.io> wrote:
> > > > On Tue, Oct 09, 2018 at 03:36:04PM +0200, Jann Horn wrote:
> > > > > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > One more thing. Citing from [1]
> > > > > >
> > > > > > > I think there's a security problem here. Imagine the following scenario:
> > > > > > >
> > > > > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF
> > > > > > > 2. task A forks off a child B
> > > > > > > 3. task B uses setuid(1) to drop its privileges
> > > > > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1)
> > > > > > > or via execve()
> > > > > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace
> > > > > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B
> > > > > >
> > > > > > Sorry, to be late to the party but would this really pass
> > > > > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me
> > > > > > that it would... Doesn't look like it would get past:
> > > > > >
> > > > > >         tcred = __task_cred(task);
> > > > > >         if (uid_eq(caller_uid, tcred->euid) &&
> > > > > >             uid_eq(caller_uid, tcred->suid) &&
> > > > > >             uid_eq(caller_uid, tcred->uid)  &&
> > > > > >             gid_eq(caller_gid, tcred->egid) &&
> > > > > >             gid_eq(caller_gid, tcred->sgid) &&
> > > > > >             gid_eq(caller_gid, tcred->gid))
> > > > > >                 goto ok;
> > > > > >         if (ptrace_has_cap(tcred->user_ns, mode))
> > > > > >                 goto ok;
> > > > > >         rcu_read_unlock();
> > > > > >         return -EPERM;
> > > > > > ok:
> > > > > >         rcu_read_unlock();
> > > > > >         mm = task->mm;
> > > > > >         if (mm &&
> > > > > >             ((get_dumpable(mm) != SUID_DUMP_USER) &&
> > > > > >              !ptrace_has_cap(mm->user_ns, mode)))
> > > > > >             return -EPERM;
> > > > >
> > > > > Which specific check would prevent task C from attaching to task B? If
> > > > > the UIDs match, the first "goto ok" executes; and you're dumpable, so
> > > > > you don't trigger the second "return -EPERM".
> > > >
> > > > You'd also need CAP_SYS_PTRACE in the mm->user_ns which you shouldn't
> > > > have if you did a setuid to an unpriv user. (But I always find that code
> > > > confusing.)
> > >
> > > Only if the target hasn't gone through execve() since setuid().
> >
> > Sorry if I want to know this in excessive detail but I'd like to
> > understand this properly so bear with me :)
> > - If task B has setuid()ed and prctl(PR_SET_DUMPABLE, 1)ed but not
> >   execve()ed then C won't pass ptrace_has_cap(mm->user_ns, mode).
> 
> Yeah.
> 
> > - If task B has setuid()ed, exeved()ed it will get its dumpable flag set
> >   to /proc/sys/fs/suid_dumpable
> 
> Not if you changed all UIDs (e.g. by calling setuid() as root). In
> that case, setup_new_exec() calls "set_dumpable(current->mm,
> SUID_DUMP_USER)".

Actually, looking at this when C is trying to PTRACE_ATTACH to B as an
unprivileged user even if B execve()ed and it is dumpable C still
wouldn't have CAP_SYS_PTRACE in the mm->user_ns unless it already is
privileged over mm->user_ns which means it must be in an ancestor
user_ns.

> 
> > which by default is 0. So C won't pass
> >   (get_dumpable(mm) != SUID_DUMP_USER).
> > In both cases PTRACE_ATTACH shouldn't work. Now, if
> > /proc/sys/fs/suid_dumpable is 1 I'd find it acceptable for this to work.
> > This is an administrator choice.
Jann Horn via Containers Oct. 9, 2018, 4:26 p.m.
On Tue, Oct 9, 2018 at 6:20 PM Christian Brauner <christian@brauner.io> wrote:
> On Tue, Oct 09, 2018 at 05:26:26PM +0200, Jann Horn wrote:
> > On Tue, Oct 9, 2018 at 4:09 PM Christian Brauner <christian@brauner.io> wrote:
> > > On Tue, Oct 09, 2018 at 03:50:53PM +0200, Jann Horn wrote:
> > > > On Tue, Oct 9, 2018 at 3:49 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > On Tue, Oct 09, 2018 at 03:36:04PM +0200, Jann Horn wrote:
> > > > > > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > > One more thing. Citing from [1]
> > > > > > >
> > > > > > > > I think there's a security problem here. Imagine the following scenario:
> > > > > > > >
> > > > > > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF
> > > > > > > > 2. task A forks off a child B
> > > > > > > > 3. task B uses setuid(1) to drop its privileges
> > > > > > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1)
> > > > > > > > or via execve()
> > > > > > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace
> > > > > > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B
> > > > > > >
> > > > > > > Sorry, to be late to the party but would this really pass
> > > > > > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me
> > > > > > > that it would... Doesn't look like it would get past:
> > > > > > >
> > > > > > >         tcred = __task_cred(task);
> > > > > > >         if (uid_eq(caller_uid, tcred->euid) &&
> > > > > > >             uid_eq(caller_uid, tcred->suid) &&
> > > > > > >             uid_eq(caller_uid, tcred->uid)  &&
> > > > > > >             gid_eq(caller_gid, tcred->egid) &&
> > > > > > >             gid_eq(caller_gid, tcred->sgid) &&
> > > > > > >             gid_eq(caller_gid, tcred->gid))
> > > > > > >                 goto ok;
> > > > > > >         if (ptrace_has_cap(tcred->user_ns, mode))
> > > > > > >                 goto ok;
> > > > > > >         rcu_read_unlock();
> > > > > > >         return -EPERM;
> > > > > > > ok:
> > > > > > >         rcu_read_unlock();
> > > > > > >         mm = task->mm;
> > > > > > >         if (mm &&
> > > > > > >             ((get_dumpable(mm) != SUID_DUMP_USER) &&
> > > > > > >              !ptrace_has_cap(mm->user_ns, mode)))
> > > > > > >             return -EPERM;
> > > > > >
> > > > > > Which specific check would prevent task C from attaching to task B? If
> > > > > > the UIDs match, the first "goto ok" executes; and you're dumpable, so
> > > > > > you don't trigger the second "return -EPERM".
> > > > >
> > > > > You'd also need CAP_SYS_PTRACE in the mm->user_ns which you shouldn't
> > > > > have if you did a setuid to an unpriv user. (But I always find that code
> > > > > confusing.)
> > > >
> > > > Only if the target hasn't gone through execve() since setuid().
> > >
> > > Sorry if I want to know this in excessive detail but I'd like to
> > > understand this properly so bear with me :)
> > > - If task B has setuid()ed and prctl(PR_SET_DUMPABLE, 1)ed but not
> > >   execve()ed then C won't pass ptrace_has_cap(mm->user_ns, mode).
> >
> > Yeah.
> >
> > > - If task B has setuid()ed, exeved()ed it will get its dumpable flag set
> > >   to /proc/sys/fs/suid_dumpable
> >
> > Not if you changed all UIDs (e.g. by calling setuid() as root). In
> > that case, setup_new_exec() calls "set_dumpable(current->mm,
> > SUID_DUMP_USER)".
>
> Actually, looking at this when C is trying to PTRACE_ATTACH to B as an
> unprivileged user even if B execve()ed and it is dumpable C still
> wouldn't have CAP_SYS_PTRACE in the mm->user_ns unless it already is
> privileged over mm->user_ns which means it must be in an ancestor
> user_ns.

Huh? Why would you need CAP_SYS_PTRACE for anything here? You can
ptrace another process running under your UID just fine, no matter
what the namespaces are. I'm not sure what you're saying.
Christian Brauner Oct. 10, 2018, 12:54 p.m.
On Tue, Oct 09, 2018 at 06:26:47PM +0200, Jann Horn wrote:
> On Tue, Oct 9, 2018 at 6:20 PM Christian Brauner <christian@brauner.io> wrote:
> > On Tue, Oct 09, 2018 at 05:26:26PM +0200, Jann Horn wrote:
> > > On Tue, Oct 9, 2018 at 4:09 PM Christian Brauner <christian@brauner.io> wrote:
> > > > On Tue, Oct 09, 2018 at 03:50:53PM +0200, Jann Horn wrote:
> > > > > On Tue, Oct 9, 2018 at 3:49 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > On Tue, Oct 09, 2018 at 03:36:04PM +0200, Jann Horn wrote:
> > > > > > > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > > > One more thing. Citing from [1]
> > > > > > > >
> > > > > > > > > I think there's a security problem here. Imagine the following scenario:
> > > > > > > > >
> > > > > > > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF
> > > > > > > > > 2. task A forks off a child B
> > > > > > > > > 3. task B uses setuid(1) to drop its privileges
> > > > > > > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1)
> > > > > > > > > or via execve()
> > > > > > > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace
> > > > > > > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B
> > > > > > > >
> > > > > > > > Sorry, to be late to the party but would this really pass
> > > > > > > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me
> > > > > > > > that it would... Doesn't look like it would get past:
> > > > > > > >
> > > > > > > >         tcred = __task_cred(task);
> > > > > > > >         if (uid_eq(caller_uid, tcred->euid) &&
> > > > > > > >             uid_eq(caller_uid, tcred->suid) &&
> > > > > > > >             uid_eq(caller_uid, tcred->uid)  &&
> > > > > > > >             gid_eq(caller_gid, tcred->egid) &&
> > > > > > > >             gid_eq(caller_gid, tcred->sgid) &&
> > > > > > > >             gid_eq(caller_gid, tcred->gid))
> > > > > > > >                 goto ok;
> > > > > > > >         if (ptrace_has_cap(tcred->user_ns, mode))
> > > > > > > >                 goto ok;
> > > > > > > >         rcu_read_unlock();
> > > > > > > >         return -EPERM;
> > > > > > > > ok:
> > > > > > > >         rcu_read_unlock();
> > > > > > > >         mm = task->mm;
> > > > > > > >         if (mm &&
> > > > > > > >             ((get_dumpable(mm) != SUID_DUMP_USER) &&
> > > > > > > >              !ptrace_has_cap(mm->user_ns, mode)))
> > > > > > > >             return -EPERM;
> > > > > > >
> > > > > > > Which specific check would prevent task C from attaching to task B? If
> > > > > > > the UIDs match, the first "goto ok" executes; and you're dumpable, so
> > > > > > > you don't trigger the second "return -EPERM".
> > > > > >
> > > > > > You'd also need CAP_SYS_PTRACE in the mm->user_ns which you shouldn't
> > > > > > have if you did a setuid to an unpriv user. (But I always find that code
> > > > > > confusing.)
> > > > >
> > > > > Only if the target hasn't gone through execve() since setuid().
> > > >
> > > > Sorry if I want to know this in excessive detail but I'd like to
> > > > understand this properly so bear with me :)
> > > > - If task B has setuid()ed and prctl(PR_SET_DUMPABLE, 1)ed but not
> > > >   execve()ed then C won't pass ptrace_has_cap(mm->user_ns, mode).
> > >
> > > Yeah.
> > >
> > > > - If task B has setuid()ed, exeved()ed it will get its dumpable flag set
> > > >   to /proc/sys/fs/suid_dumpable
> > >
> > > Not if you changed all UIDs (e.g. by calling setuid() as root). In
> > > that case, setup_new_exec() calls "set_dumpable(current->mm,
> > > SUID_DUMP_USER)".
> >
> > Actually, looking at this when C is trying to PTRACE_ATTACH to B as an
> > unprivileged user even if B execve()ed and it is dumpable C still
> > wouldn't have CAP_SYS_PTRACE in the mm->user_ns unless it already is
> > privileged over mm->user_ns which means it must be in an ancestor
> > user_ns.
> 
> Huh? Why would you need CAP_SYS_PTRACE for anything here? You can
> ptrace another process running under your UID just fine, no matter
> what the namespaces are. I'm not sure what you're saying.

Sorry, I was out the door yesterday when answering this and was too
brief. I forgot to mention: /proc/sys/kernel/yama/ptrace_scope. It
should be enabled by default on nearly all distros and even if not -
which is an administrators choice - you can usually easily enable it via
sysctl.

1 ("restricted ptrace") [default value]
When  performing an operation that requires a PTRACE_MODE_ATTACH check,
the calling process must either have the CAP_SYS_PTRACE capability in
the user namespace of the target process or it must have a prede‐ fined
relationship with the target process.  By default, the predefined
relationship is that the target process must be a descendant of the
caller.

If you don't have it set you're already susceptible to all kinds of
other attacks and I'm still not convinced we need to bring out the big
capable(CAP_SYS_ADMIN) gun here.
Christian Brauner Oct. 10, 2018, 1:09 p.m.
On Wed, Oct 10, 2018 at 02:54:22PM +0200, Christian Brauner wrote:
> On Tue, Oct 09, 2018 at 06:26:47PM +0200, Jann Horn wrote:
> > On Tue, Oct 9, 2018 at 6:20 PM Christian Brauner <christian@brauner.io> wrote:
> > > On Tue, Oct 09, 2018 at 05:26:26PM +0200, Jann Horn wrote:
> > > > On Tue, Oct 9, 2018 at 4:09 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > On Tue, Oct 09, 2018 at 03:50:53PM +0200, Jann Horn wrote:
> > > > > > On Tue, Oct 9, 2018 at 3:49 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > > On Tue, Oct 09, 2018 at 03:36:04PM +0200, Jann Horn wrote:
> > > > > > > > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > > > > One more thing. Citing from [1]
> > > > > > > > >
> > > > > > > > > > I think there's a security problem here. Imagine the following scenario:
> > > > > > > > > >
> > > > > > > > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF
> > > > > > > > > > 2. task A forks off a child B
> > > > > > > > > > 3. task B uses setuid(1) to drop its privileges
> > > > > > > > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1)
> > > > > > > > > > or via execve()
> > > > > > > > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace
> > > > > > > > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B
> > > > > > > > >
> > > > > > > > > Sorry, to be late to the party but would this really pass
> > > > > > > > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me
> > > > > > > > > that it would... Doesn't look like it would get past:
> > > > > > > > >
> > > > > > > > >         tcred = __task_cred(task);
> > > > > > > > >         if (uid_eq(caller_uid, tcred->euid) &&
> > > > > > > > >             uid_eq(caller_uid, tcred->suid) &&
> > > > > > > > >             uid_eq(caller_uid, tcred->uid)  &&
> > > > > > > > >             gid_eq(caller_gid, tcred->egid) &&
> > > > > > > > >             gid_eq(caller_gid, tcred->sgid) &&
> > > > > > > > >             gid_eq(caller_gid, tcred->gid))
> > > > > > > > >                 goto ok;
> > > > > > > > >         if (ptrace_has_cap(tcred->user_ns, mode))
> > > > > > > > >                 goto ok;
> > > > > > > > >         rcu_read_unlock();
> > > > > > > > >         return -EPERM;
> > > > > > > > > ok:
> > > > > > > > >         rcu_read_unlock();
> > > > > > > > >         mm = task->mm;
> > > > > > > > >         if (mm &&
> > > > > > > > >             ((get_dumpable(mm) != SUID_DUMP_USER) &&
> > > > > > > > >              !ptrace_has_cap(mm->user_ns, mode)))
> > > > > > > > >             return -EPERM;
> > > > > > > >
> > > > > > > > Which specific check would prevent task C from attaching to task B? If
> > > > > > > > the UIDs match, the first "goto ok" executes; and you're dumpable, so
> > > > > > > > you don't trigger the second "return -EPERM".
> > > > > > >
> > > > > > > You'd also need CAP_SYS_PTRACE in the mm->user_ns which you shouldn't
> > > > > > > have if you did a setuid to an unpriv user. (But I always find that code
> > > > > > > confusing.)
> > > > > >
> > > > > > Only if the target hasn't gone through execve() since setuid().
> > > > >
> > > > > Sorry if I want to know this in excessive detail but I'd like to
> > > > > understand this properly so bear with me :)
> > > > > - If task B has setuid()ed and prctl(PR_SET_DUMPABLE, 1)ed but not
> > > > >   execve()ed then C won't pass ptrace_has_cap(mm->user_ns, mode).
> > > >
> > > > Yeah.
> > > >
> > > > > - If task B has setuid()ed, exeved()ed it will get its dumpable flag set
> > > > >   to /proc/sys/fs/suid_dumpable
> > > >
> > > > Not if you changed all UIDs (e.g. by calling setuid() as root). In
> > > > that case, setup_new_exec() calls "set_dumpable(current->mm,
> > > > SUID_DUMP_USER)".
> > >
> > > Actually, looking at this when C is trying to PTRACE_ATTACH to B as an
> > > unprivileged user even if B execve()ed and it is dumpable C still
> > > wouldn't have CAP_SYS_PTRACE in the mm->user_ns unless it already is
> > > privileged over mm->user_ns which means it must be in an ancestor
> > > user_ns.
> > 
> > Huh? Why would you need CAP_SYS_PTRACE for anything here? You can
> > ptrace another process running under your UID just fine, no matter
> > what the namespaces are. I'm not sure what you're saying.
> 
> Sorry, I was out the door yesterday when answering this and was too
> brief. I forgot to mention: /proc/sys/kernel/yama/ptrace_scope. It
> should be enabled by default on nearly all distros and even if not -
> which is an administrators choice - you can usually easily enable it via
> sysctl.
> 
> 1 ("restricted ptrace") [default value]
> When  performing an operation that requires a PTRACE_MODE_ATTACH check,
> the calling process must either have the CAP_SYS_PTRACE capability in
> the user namespace of the target process or it must have a prede‐ fined
> relationship with the target process.  By default, the predefined
> relationship is that the target process must be a descendant of the
> caller.
> 
> If you don't have it set you're already susceptible to all kinds of
> other attacks and I'm still not convinced we need to bring out the big
> capable(CAP_SYS_ADMIN) gun here.

That being said, given that Tycho agreed to leave in the native
seccomp() way of retrieving an fd from the task without the sys_admin
restriction [1] which we prefer and if we merge it with aforementioned
feature I care way less about whether or not the restriction is upheld
for ptrace() or not.

[1]: https://lists.linuxfoundation.org/pipermail/containers/2018-October/039553.html
Jann Horn via Containers Oct. 10, 2018, 1:10 p.m.
On Wed, Oct 10, 2018 at 2:54 PM Christian Brauner <christian@brauner.io> wrote:
> On Tue, Oct 09, 2018 at 06:26:47PM +0200, Jann Horn wrote:
> > On Tue, Oct 9, 2018 at 6:20 PM Christian Brauner <christian@brauner.io> wrote:
> > > On Tue, Oct 09, 2018 at 05:26:26PM +0200, Jann Horn wrote:
> > > > On Tue, Oct 9, 2018 at 4:09 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > On Tue, Oct 09, 2018 at 03:50:53PM +0200, Jann Horn wrote:
> > > > > > On Tue, Oct 9, 2018 at 3:49 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > > On Tue, Oct 09, 2018 at 03:36:04PM +0200, Jann Horn wrote:
> > > > > > > > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > > > > One more thing. Citing from [1]
> > > > > > > > >
> > > > > > > > > > I think there's a security problem here. Imagine the following scenario:
> > > > > > > > > >
> > > > > > > > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF
> > > > > > > > > > 2. task A forks off a child B
> > > > > > > > > > 3. task B uses setuid(1) to drop its privileges
> > > > > > > > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1)
> > > > > > > > > > or via execve()
> > > > > > > > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace
> > > > > > > > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B
> > > > > > > > >
> > > > > > > > > Sorry, to be late to the party but would this really pass
> > > > > > > > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me
> > > > > > > > > that it would... Doesn't look like it would get past:
> > > > > > > > >
> > > > > > > > >         tcred = __task_cred(task);
> > > > > > > > >         if (uid_eq(caller_uid, tcred->euid) &&
> > > > > > > > >             uid_eq(caller_uid, tcred->suid) &&
> > > > > > > > >             uid_eq(caller_uid, tcred->uid)  &&
> > > > > > > > >             gid_eq(caller_gid, tcred->egid) &&
> > > > > > > > >             gid_eq(caller_gid, tcred->sgid) &&
> > > > > > > > >             gid_eq(caller_gid, tcred->gid))
> > > > > > > > >                 goto ok;
> > > > > > > > >         if (ptrace_has_cap(tcred->user_ns, mode))
> > > > > > > > >                 goto ok;
> > > > > > > > >         rcu_read_unlock();
> > > > > > > > >         return -EPERM;
> > > > > > > > > ok:
> > > > > > > > >         rcu_read_unlock();
> > > > > > > > >         mm = task->mm;
> > > > > > > > >         if (mm &&
> > > > > > > > >             ((get_dumpable(mm) != SUID_DUMP_USER) &&
> > > > > > > > >              !ptrace_has_cap(mm->user_ns, mode)))
> > > > > > > > >             return -EPERM;
> > > > > > > >
> > > > > > > > Which specific check would prevent task C from attaching to task B? If
> > > > > > > > the UIDs match, the first "goto ok" executes; and you're dumpable, so
> > > > > > > > you don't trigger the second "return -EPERM".
> > > > > > >
> > > > > > > You'd also need CAP_SYS_PTRACE in the mm->user_ns which you shouldn't
> > > > > > > have if you did a setuid to an unpriv user. (But I always find that code
> > > > > > > confusing.)
> > > > > >
> > > > > > Only if the target hasn't gone through execve() since setuid().
> > > > >
> > > > > Sorry if I want to know this in excessive detail but I'd like to
> > > > > understand this properly so bear with me :)
> > > > > - If task B has setuid()ed and prctl(PR_SET_DUMPABLE, 1)ed but not
> > > > >   execve()ed then C won't pass ptrace_has_cap(mm->user_ns, mode).
> > > >
> > > > Yeah.
> > > >
> > > > > - If task B has setuid()ed, exeved()ed it will get its dumpable flag set
> > > > >   to /proc/sys/fs/suid_dumpable
> > > >
> > > > Not if you changed all UIDs (e.g. by calling setuid() as root). In
> > > > that case, setup_new_exec() calls "set_dumpable(current->mm,
> > > > SUID_DUMP_USER)".
> > >
> > > Actually, looking at this when C is trying to PTRACE_ATTACH to B as an
> > > unprivileged user even if B execve()ed and it is dumpable C still
> > > wouldn't have CAP_SYS_PTRACE in the mm->user_ns unless it already is
> > > privileged over mm->user_ns which means it must be in an ancestor
> > > user_ns.
> >
> > Huh? Why would you need CAP_SYS_PTRACE for anything here? You can
> > ptrace another process running under your UID just fine, no matter
> > what the namespaces are. I'm not sure what you're saying.
>
> Sorry, I was out the door yesterday when answering this and was too
> brief. I forgot to mention: /proc/sys/kernel/yama/ptrace_scope. It
> should be enabled by default on nearly all distros

"nearly all distros"? AFAIK it's off on Debian, for starters. And Yama
still doesn't help you if one of the tasks enters a new user namespace
or whatever.

Yama is a little bit of extra, heuristic, **opt-in** hardening enabled
in some configurations. It is **not** a fundamental building block you
can rely on.

> and even if not -
> which is an administrators choice - you can usually easily enable it via
> sysctl.

Opt-in security isn't good enough. Kernel interfaces should still be
safe to use even on a system that has all the LSM stuff disabled in
the kernel config.

> 1 ("restricted ptrace") [default value]
> When  performing an operation that requires a PTRACE_MODE_ATTACH check,
> the calling process must either have the CAP_SYS_PTRACE capability in
> the user namespace of the target process or it must have a prede‐ fined
> relationship with the target process.  By default, the predefined
> relationship is that the target process must be a descendant of the
> caller.
>
> If you don't have it set you're already susceptible to all kinds of
> other attacks

Oh? Can you be more specific, please?

> and I'm still not convinced we need to bring out the big
> capable(CAP_SYS_ADMIN) gun here.
Christian Brauner Oct. 10, 2018, 1:18 p.m.
On Wed, Oct 10, 2018 at 03:10:11PM +0200, Jann Horn wrote:
> On Wed, Oct 10, 2018 at 2:54 PM Christian Brauner <christian@brauner.io> wrote:
> > On Tue, Oct 09, 2018 at 06:26:47PM +0200, Jann Horn wrote:
> > > On Tue, Oct 9, 2018 at 6:20 PM Christian Brauner <christian@brauner.io> wrote:
> > > > On Tue, Oct 09, 2018 at 05:26:26PM +0200, Jann Horn wrote:
> > > > > On Tue, Oct 9, 2018 at 4:09 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > On Tue, Oct 09, 2018 at 03:50:53PM +0200, Jann Horn wrote:
> > > > > > > On Tue, Oct 9, 2018 at 3:49 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > > > On Tue, Oct 09, 2018 at 03:36:04PM +0200, Jann Horn wrote:
> > > > > > > > > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > > > > > One more thing. Citing from [1]
> > > > > > > > > >
> > > > > > > > > > > I think there's a security problem here. Imagine the following scenario:
> > > > > > > > > > >
> > > > > > > > > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF
> > > > > > > > > > > 2. task A forks off a child B
> > > > > > > > > > > 3. task B uses setuid(1) to drop its privileges
> > > > > > > > > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1)
> > > > > > > > > > > or via execve()
> > > > > > > > > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace
> > > > > > > > > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B
> > > > > > > > > >
> > > > > > > > > > Sorry, to be late to the party but would this really pass
> > > > > > > > > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me
> > > > > > > > > > that it would... Doesn't look like it would get past:
> > > > > > > > > >
> > > > > > > > > >         tcred = __task_cred(task);
> > > > > > > > > >         if (uid_eq(caller_uid, tcred->euid) &&
> > > > > > > > > >             uid_eq(caller_uid, tcred->suid) &&
> > > > > > > > > >             uid_eq(caller_uid, tcred->uid)  &&
> > > > > > > > > >             gid_eq(caller_gid, tcred->egid) &&
> > > > > > > > > >             gid_eq(caller_gid, tcred->sgid) &&
> > > > > > > > > >             gid_eq(caller_gid, tcred->gid))
> > > > > > > > > >                 goto ok;
> > > > > > > > > >         if (ptrace_has_cap(tcred->user_ns, mode))
> > > > > > > > > >                 goto ok;
> > > > > > > > > >         rcu_read_unlock();
> > > > > > > > > >         return -EPERM;
> > > > > > > > > > ok:
> > > > > > > > > >         rcu_read_unlock();
> > > > > > > > > >         mm = task->mm;
> > > > > > > > > >         if (mm &&
> > > > > > > > > >             ((get_dumpable(mm) != SUID_DUMP_USER) &&
> > > > > > > > > >              !ptrace_has_cap(mm->user_ns, mode)))
> > > > > > > > > >             return -EPERM;
> > > > > > > > >
> > > > > > > > > Which specific check would prevent task C from attaching to task B? If
> > > > > > > > > the UIDs match, the first "goto ok" executes; and you're dumpable, so
> > > > > > > > > you don't trigger the second "return -EPERM".
> > > > > > > >
> > > > > > > > You'd also need CAP_SYS_PTRACE in the mm->user_ns which you shouldn't
> > > > > > > > have if you did a setuid to an unpriv user. (But I always find that code
> > > > > > > > confusing.)
> > > > > > >
> > > > > > > Only if the target hasn't gone through execve() since setuid().
> > > > > >
> > > > > > Sorry if I want to know this in excessive detail but I'd like to
> > > > > > understand this properly so bear with me :)
> > > > > > - If task B has setuid()ed and prctl(PR_SET_DUMPABLE, 1)ed but not
> > > > > >   execve()ed then C won't pass ptrace_has_cap(mm->user_ns, mode).
> > > > >
> > > > > Yeah.
> > > > >
> > > > > > - If task B has setuid()ed, exeved()ed it will get its dumpable flag set
> > > > > >   to /proc/sys/fs/suid_dumpable
> > > > >
> > > > > Not if you changed all UIDs (e.g. by calling setuid() as root). In
> > > > > that case, setup_new_exec() calls "set_dumpable(current->mm,
> > > > > SUID_DUMP_USER)".
> > > >
> > > > Actually, looking at this when C is trying to PTRACE_ATTACH to B as an
> > > > unprivileged user even if B execve()ed and it is dumpable C still
> > > > wouldn't have CAP_SYS_PTRACE in the mm->user_ns unless it already is
> > > > privileged over mm->user_ns which means it must be in an ancestor
> > > > user_ns.
> > >
> > > Huh? Why would you need CAP_SYS_PTRACE for anything here? You can
> > > ptrace another process running under your UID just fine, no matter
> > > what the namespaces are. I'm not sure what you're saying.
> >
> > Sorry, I was out the door yesterday when answering this and was too
> > brief. I forgot to mention: /proc/sys/kernel/yama/ptrace_scope. It
> > should be enabled by default on nearly all distros
> 
> "nearly all distros"? AFAIK it's off on Debian, for starters. And Yama
> still doesn't help you if one of the tasks enters a new user namespace
> or whatever.
> 
> Yama is a little bit of extra, heuristic, **opt-in** hardening enabled
> in some configurations. It is **not** a fundamental building block you
> can rely on.
> 
> > and even if not -
> > which is an administrators choice - you can usually easily enable it via
> > sysctl.
> 
> Opt-in security isn't good enough. Kernel interfaces should still be
> safe to use even on a system that has all the LSM stuff disabled in
> the kernel config.

Then ptrace() isn't, I guess?

But see https://lists.linuxfoundation.org/pipermail/containers/2018-October/039567.html
I don't care as long as we have a way of getting the fd without the
CAP_SYS_ADMIN requirement throught seccomp().

> 
> > 1 ("restricted ptrace") [default value]
> > When  performing an operation that requires a PTRACE_MODE_ATTACH check,
> > the calling process must either have the CAP_SYS_PTRACE capability in
> > the user namespace of the target process or it must have a prede‐ fined
> > relationship with the target process.  By default, the predefined
> > relationship is that the target process must be a descendant of the
> > caller.
> >
> > If you don't have it set you're already susceptible to all kinds of
> > other attacks
> 
> Oh? Can you be more specific, please?

I was referring to attacks where you attach to processes that run as
your user but might expose in-memory credentials or other sensitive
information, (essentially what the manpage is outlining).

> 
> > and I'm still not convinced we need to bring out the big
> > capable(CAP_SYS_ADMIN) gun here.
Paul Moore Oct. 10, 2018, 3:31 p.m.
On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote:
> +cc selinux people explicitly, since they probably have opinions on this

I just spent about twenty minutes working my way through this thread,
and digging through the containers archive trying to get a good
understanding of what you guys are trying to do, and I'm not quite
sure I understand it all.  However, from what I have seen, this
approach looks very ptrace-y to me (I imagine to others as well based
on the comments) and because of this I think ensuring the usual ptrace
access controls are evaluated, including the ptrace LSM hooks, is the
right thing to do.

If I've missed something, or I'm thinking about this wrong, please
educate me; just a heads-up that I'm largely offline for most of this
week so responses on my end are going to be delayed much more than
usual.

> On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote:
> > On Tue, Oct 09, 2018 at 02:39:53PM +0200, Jann Horn wrote:
> > > On Mon, Oct 8, 2018 at 8:18 PM Christian Brauner <christian@brauner.io> wrote:
> > > > On Mon, Oct 08, 2018 at 06:42:00PM +0200, Jann Horn wrote:
> > > > > On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote:
> > > > > > > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote:
> > > > > > > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > > > > > > > > index 44a31ac8373a..17685803a2af 100644
> > > > > > > > > --- a/kernel/seccomp.c
> > > > > > > > > +++ b/kernel/seccomp.c
> > > > > > > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task,
> > > > > > > > >
> > > > > > > > >       return ret;
> > > > > > > > >  }
> > > > > > > > > +
> > > > > > > > > +long seccomp_new_listener(struct task_struct *task,
> > > > > > > > > +                       unsigned long filter_off)
> > > > > > > > > +{
> > > > > > > > > +     struct seccomp_filter *filter;
> > > > > > > > > +     struct file *listener;
> > > > > > > > > +     int fd;
> > > > > > > > > +
> > > > > > > > > +     if (!capable(CAP_SYS_ADMIN))
> > > > > > > > > +             return -EACCES;
> > > > > > > >
> > > > > > > > I know this might have been discussed a while back but why exactly do we
> > > > > > > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What
> > > > > > > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and
> > > > > > > > use ptrace from in there?
> > > > > > >
> > > > > > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
> > > > > > > . Basically, the problem is that this doesn't just give you capability
> > > > > > > over the target task, but also over every other task that has the same
> > > > > > > filter installed; you need some sort of "is the caller capable over
> > > > > > > the filter and anyone who uses it" check.
> > > > > >
> > > > > > Thanks.
> > > > > > But then this new ptrace feature as it stands is imho currently broken.
> > > > > > If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you
> > > > > > are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself
> > > > > > if you are ns_cpabable(CAP_SYS_ADMIN)
> > >
> > > Actually, you don't need CAP_SYS_ADMIN for seccomp() at all as long as
> > > you enable the NNP flag, I think?
> >
> > Yes, if you turn on NNP you don't even need sys_admin.
> >
> > >
> > > > > > then either the new ptrace() api
> > > > > > extension should be fixed to allow for this too or the seccomp() way of
> > > > > > retrieving the pid - which I really think we want - needs to be fixed to
> > > > > > require capable(CAP_SYS_ADMIN) too.
> > > > > > The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho -
> > > > > > the preferred way to solve this.
> > > > > > Everything else will just be confusing.
> > > > >
> > > > > First you say "broken", then you say "confusing". Which one do you mean?
> > > >
> > > > Both. It's broken in so far as it places a seemingly unnecessary
> > > > restriction that could be fixed. You outlined one possible fix yourself
> > > > in the link you provided.
> > >
> > > If by "possible fix" you mean "check whether the seccomp filter is
> > > only attached to a single task": That wouldn't fundamentally change
> > > the situation, it would only add an additional special case.
> > >
> > > > And it's confusing in so far as there is a way
> > > > via seccomp() to get the fd without said requirement.
> > >
> > > I don't find it confusing at all. seccomp() and ptrace() are very
> >
> > Fine, then that's a matter of opinion. I find it counterintuitive that
> > you can get an fd without privileges via one interface but not via
> > another.
> >
> > > different situations: When you use seccomp(), infrastructure is
> >
> > Sure. Note, that this is _one_ of the reasons why I want to make sure we
> > keep the native seccomp() only based way of getting an fd without
> > forcing userspace to switching to a differnet kernel api.
> >
> > > already in place for ensuring that your filter is only applied to
> > > processes over which you are capable, and propagation is limited by
> > > inheritance from your task down. When you use ptrace(), you need a
> > > pretty different sort of access check that checks whether you're
> > > privileged over ancestors, siblings and so on of the target task.
> >
> > So, don't get me wrong I'm not arguing against the ptrace() interface in
> > general. If this is something that people find useful, fine. But, I
> > would like to have a simple single-syscall pure-seccomp() based way of
> > getting an fd, i.e. what we have in patch 1 of this series.
>
> Yeah, I also prefer the seccomp() one.
>
> > > But thinking about it more, I think that CAP_SYS_ADMIN over the saved
> > > current->mm->user_ns of the task that installed the filter (stored as
> > > a "struct user_namespace *" in the filter) should be acceptable.
> >
> > Hm... Why not CAP_SYS_PTRACE?
>
> Because LSMs like SELinux add extra checks that apply even if you have
> CAP_SYS_PTRACE, and this would subvert those. The only capability I
> know of that lets you bypass LSM checks by design (if no LSM blocks
> the capability itself) is CAP_SYS_ADMIN.
>
> > One more thing. Citing from [1]
> >
> > > I think there's a security problem here. Imagine the following scenario:
> > >
> > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF
> > > 2. task A forks off a child B
> > > 3. task B uses setuid(1) to drop its privileges
> > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1)
> > > or via execve()
> > > 5. task C (the attacker, uid==1) attaches to task B via ptrace
> > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B
> >
> > Sorry, to be late to the party but would this really pass
> > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me
> > that it would... Doesn't look like it would get past:
> >
> >         tcred = __task_cred(task);
> >         if (uid_eq(caller_uid, tcred->euid) &&
> >             uid_eq(caller_uid, tcred->suid) &&
> >             uid_eq(caller_uid, tcred->uid)  &&
> >             gid_eq(caller_gid, tcred->egid) &&
> >             gid_eq(caller_gid, tcred->sgid) &&
> >             gid_eq(caller_gid, tcred->gid))
> >                 goto ok;
> >         if (ptrace_has_cap(tcred->user_ns, mode))
> >                 goto ok;
> >         rcu_read_unlock();
> >         return -EPERM;
> > ok:
> >         rcu_read_unlock();
> >         mm = task->mm;
> >         if (mm &&
> >             ((get_dumpable(mm) != SUID_DUMP_USER) &&
> >              !ptrace_has_cap(mm->user_ns, mode)))
> >             return -EPERM;
>
> Which specific check would prevent task C from attaching to task B? If
> the UIDs match, the first "goto ok" executes; and you're dumpable, so
> you don't trigger the second "return -EPERM".
>
> > > 7. because the seccomp filter is shared by task A and task B, task C
> > > is now able to influence syscall results for syscalls performed by
> > > task A
> >
> > [1]: https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
Jann Horn via Containers Oct. 10, 2018, 3:33 p.m.
On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote:
> On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote:
> > +cc selinux people explicitly, since they probably have opinions on this
>
> I just spent about twenty minutes working my way through this thread,
> and digging through the containers archive trying to get a good
> understanding of what you guys are trying to do, and I'm not quite
> sure I understand it all.  However, from what I have seen, this
> approach looks very ptrace-y to me (I imagine to others as well based
> on the comments) and because of this I think ensuring the usual ptrace
> access controls are evaluated, including the ptrace LSM hooks, is the
> right thing to do.

Basically the problem is that this new ptrace() API does something
that doesn't just influence the target task, but also every other task
that has the same seccomp filter. So the classic ptrace check doesn't
work here.

> If I've missed something, or I'm thinking about this wrong, please
> educate me; just a heads-up that I'm largely offline for most of this
> week so responses on my end are going to be delayed much more than
> usual.
>
> > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote:
> > > On Tue, Oct 09, 2018 at 02:39:53PM +0200, Jann Horn wrote:
> > > > On Mon, Oct 8, 2018 at 8:18 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > On Mon, Oct 08, 2018 at 06:42:00PM +0200, Jann Horn wrote:
> > > > > > On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > > On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote:
> > > > > > > > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote:
> > > > > > > > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > > > > > > > > > index 44a31ac8373a..17685803a2af 100644
> > > > > > > > > > --- a/kernel/seccomp.c
> > > > > > > > > > +++ b/kernel/seccomp.c
> > > > > > > > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task,
> > > > > > > > > >
> > > > > > > > > >       return ret;
> > > > > > > > > >  }
> > > > > > > > > > +
> > > > > > > > > > +long seccomp_new_listener(struct task_struct *task,
> > > > > > > > > > +                       unsigned long filter_off)
> > > > > > > > > > +{
> > > > > > > > > > +     struct seccomp_filter *filter;
> > > > > > > > > > +     struct file *listener;
> > > > > > > > > > +     int fd;
> > > > > > > > > > +
> > > > > > > > > > +     if (!capable(CAP_SYS_ADMIN))
> > > > > > > > > > +             return -EACCES;
> > > > > > > > >
> > > > > > > > > I know this might have been discussed a while back but why exactly do we
> > > > > > > > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What
> > > > > > > > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and
> > > > > > > > > use ptrace from in there?
> > > > > > > >
> > > > > > > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
> > > > > > > > . Basically, the problem is that this doesn't just give you capability
> > > > > > > > over the target task, but also over every other task that has the same
> > > > > > > > filter installed; you need some sort of "is the caller capable over
> > > > > > > > the filter and anyone who uses it" check.
> > > > > > >
> > > > > > > Thanks.
> > > > > > > But then this new ptrace feature as it stands is imho currently broken.
> > > > > > > If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you
> > > > > > > are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself
> > > > > > > if you are ns_cpabable(CAP_SYS_ADMIN)
> > > >
> > > > Actually, you don't need CAP_SYS_ADMIN for seccomp() at all as long as
> > > > you enable the NNP flag, I think?
> > >
> > > Yes, if you turn on NNP you don't even need sys_admin.
> > >
> > > >
> > > > > > > then either the new ptrace() api
> > > > > > > extension should be fixed to allow for this too or the seccomp() way of
> > > > > > > retrieving the pid - which I really think we want - needs to be fixed to
> > > > > > > require capable(CAP_SYS_ADMIN) too.
> > > > > > > The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho -
> > > > > > > the preferred way to solve this.
> > > > > > > Everything else will just be confusing.
> > > > > >
> > > > > > First you say "broken", then you say "confusing". Which one do you mean?
> > > > >
> > > > > Both. It's broken in so far as it places a seemingly unnecessary
> > > > > restriction that could be fixed. You outlined one possible fix yourself
> > > > > in the link you provided.
> > > >
> > > > If by "possible fix" you mean "check whether the seccomp filter is
> > > > only attached to a single task": That wouldn't fundamentally change
> > > > the situation, it would only add an additional special case.
> > > >
> > > > > And it's confusing in so far as there is a way
> > > > > via seccomp() to get the fd without said requirement.
> > > >
> > > > I don't find it confusing at all. seccomp() and ptrace() are very
> > >
> > > Fine, then that's a matter of opinion. I find it counterintuitive that
> > > you can get an fd without privileges via one interface but not via
> > > another.
> > >
> > > > different situations: When you use seccomp(), infrastructure is
> > >
> > > Sure. Note, that this is _one_ of the reasons why I want to make sure we
> > > keep the native seccomp() only based way of getting an fd without
> > > forcing userspace to switching to a differnet kernel api.
> > >
> > > > already in place for ensuring that your filter is only applied to
> > > > processes over which you are capable, and propagation is limited by
> > > > inheritance from your task down. When you use ptrace(), you need a
> > > > pretty different sort of access check that checks whether you're
> > > > privileged over ancestors, siblings and so on of the target task.
> > >
> > > So, don't get me wrong I'm not arguing against the ptrace() interface in
> > > general. If this is something that people find useful, fine. But, I
> > > would like to have a simple single-syscall pure-seccomp() based way of
> > > getting an fd, i.e. what we have in patch 1 of this series.
> >
> > Yeah, I also prefer the seccomp() one.
> >
> > > > But thinking about it more, I think that CAP_SYS_ADMIN over the saved
> > > > current->mm->user_ns of the task that installed the filter (stored as
> > > > a "struct user_namespace *" in the filter) should be acceptable.
> > >
> > > Hm... Why not CAP_SYS_PTRACE?
> >
> > Because LSMs like SELinux add extra checks that apply even if you have
> > CAP_SYS_PTRACE, and this would subvert those. The only capability I
> > know of that lets you bypass LSM checks by design (if no LSM blocks
> > the capability itself) is CAP_SYS_ADMIN.
> >
> > > One more thing. Citing from [1]
> > >
> > > > I think there's a security problem here. Imagine the following scenario:
> > > >
> > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF
> > > > 2. task A forks off a child B
> > > > 3. task B uses setuid(1) to drop its privileges
> > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1)
> > > > or via execve()
> > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace
> > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B
> > >
> > > Sorry, to be late to the party but would this really pass
> > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me
> > > that it would... Doesn't look like it would get past:
> > >
> > >         tcred = __task_cred(task);
> > >         if (uid_eq(caller_uid, tcred->euid) &&
> > >             uid_eq(caller_uid, tcred->suid) &&
> > >             uid_eq(caller_uid, tcred->uid)  &&
> > >             gid_eq(caller_gid, tcred->egid) &&
> > >             gid_eq(caller_gid, tcred->sgid) &&
> > >             gid_eq(caller_gid, tcred->gid))
> > >                 goto ok;
> > >         if (ptrace_has_cap(tcred->user_ns, mode))
> > >                 goto ok;
> > >         rcu_read_unlock();
> > >         return -EPERM;
> > > ok:
> > >         rcu_read_unlock();
> > >         mm = task->mm;
> > >         if (mm &&
> > >             ((get_dumpable(mm) != SUID_DUMP_USER) &&
> > >              !ptrace_has_cap(mm->user_ns, mode)))
> > >             return -EPERM;
> >
> > Which specific check would prevent task C from attaching to task B? If
> > the UIDs match, the first "goto ok" executes; and you're dumpable, so
> > you don't trigger the second "return -EPERM".
> >
> > > > 7. because the seccomp filter is shared by task A and task B, task C
> > > > is now able to influence syscall results for syscalls performed by
> > > > task A
> > >
> > > [1]: https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
>
>
>
> --
> paul moore
> www.paul-moore.com
Christian Brauner Oct. 10, 2018, 3:39 p.m.
On Wed, Oct 10, 2018 at 05:33:43PM +0200, Jann Horn wrote:
> On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote:
> > On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote:
> > > +cc selinux people explicitly, since they probably have opinions on this
> >
> > I just spent about twenty minutes working my way through this thread,
> > and digging through the containers archive trying to get a good
> > understanding of what you guys are trying to do, and I'm not quite
> > sure I understand it all.  However, from what I have seen, this
> > approach looks very ptrace-y to me (I imagine to others as well based
> > on the comments) and because of this I think ensuring the usual ptrace
> > access controls are evaluated, including the ptrace LSM hooks, is the
> > right thing to do.
> 
> Basically the problem is that this new ptrace() API does something
> that doesn't just influence the target task, but also every other task
> that has the same seccomp filter. So the classic ptrace check doesn't
> work here.

Just to throw this into the mix: then maybe ptrace() isn't the right
interface and we should just go with the native seccomp() approach for
now.

> 
> > If I've missed something, or I'm thinking about this wrong, please
> > educate me; just a heads-up that I'm largely offline for most of this
> > week so responses on my end are going to be delayed much more than
> > usual.
> >
> > > On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote:
> > > > On Tue, Oct 09, 2018 at 02:39:53PM +0200, Jann Horn wrote:
> > > > > On Mon, Oct 8, 2018 at 8:18 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > On Mon, Oct 08, 2018 at 06:42:00PM +0200, Jann Horn wrote:
> > > > > > > On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > > > On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote:
> > > > > > > > > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote:
> > > > > > > > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote:
> > > > > > > > > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > > > > > > > > > > index 44a31ac8373a..17685803a2af 100644
> > > > > > > > > > > --- a/kernel/seccomp.c
> > > > > > > > > > > +++ b/kernel/seccomp.c
> > > > > > > > > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task,
> > > > > > > > > > >
> > > > > > > > > > >       return ret;
> > > > > > > > > > >  }
> > > > > > > > > > > +
> > > > > > > > > > > +long seccomp_new_listener(struct task_struct *task,
> > > > > > > > > > > +                       unsigned long filter_off)
> > > > > > > > > > > +{
> > > > > > > > > > > +     struct seccomp_filter *filter;
> > > > > > > > > > > +     struct file *listener;
> > > > > > > > > > > +     int fd;
> > > > > > > > > > > +
> > > > > > > > > > > +     if (!capable(CAP_SYS_ADMIN))
> > > > > > > > > > > +             return -EACCES;
> > > > > > > > > >
> > > > > > > > > > I know this might have been discussed a while back but why exactly do we
> > > > > > > > > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What
> > > > > > > > > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and
> > > > > > > > > > use ptrace from in there?
> > > > > > > > >
> > > > > > > > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
> > > > > > > > > . Basically, the problem is that this doesn't just give you capability
> > > > > > > > > over the target task, but also over every other task that has the same
> > > > > > > > > filter installed; you need some sort of "is the caller capable over
> > > > > > > > > the filter and anyone who uses it" check.
> > > > > > > >
> > > > > > > > Thanks.
> > > > > > > > But then this new ptrace feature as it stands is imho currently broken.
> > > > > > > > If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you
> > > > > > > > are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself
> > > > > > > > if you are ns_cpabable(CAP_SYS_ADMIN)
> > > > >
> > > > > Actually, you don't need CAP_SYS_ADMIN for seccomp() at all as long as
> > > > > you enable the NNP flag, I think?
> > > >
> > > > Yes, if you turn on NNP you don't even need sys_admin.
> > > >
> > > > >
> > > > > > > > then either the new ptrace() api
> > > > > > > > extension should be fixed to allow for this too or the seccomp() way of
> > > > > > > > retrieving the pid - which I really think we want - needs to be fixed to
> > > > > > > > require capable(CAP_SYS_ADMIN) too.
> > > > > > > > The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho -
> > > > > > > > the preferred way to solve this.
> > > > > > > > Everything else will just be confusing.
> > > > > > >
> > > > > > > First you say "broken", then you say "confusing". Which one do you mean?
> > > > > >
> > > > > > Both. It's broken in so far as it places a seemingly unnecessary
> > > > > > restriction that could be fixed. You outlined one possible fix yourself
> > > > > > in the link you provided.
> > > > >
> > > > > If by "possible fix" you mean "check whether the seccomp filter is
> > > > > only attached to a single task": That wouldn't fundamentally change
> > > > > the situation, it would only add an additional special case.
> > > > >
> > > > > > And it's confusing in so far as there is a way
> > > > > > via seccomp() to get the fd without said requirement.
> > > > >
> > > > > I don't find it confusing at all. seccomp() and ptrace() are very
> > > >
> > > > Fine, then that's a matter of opinion. I find it counterintuitive that
> > > > you can get an fd without privileges via one interface but not via
> > > > another.
> > > >
> > > > > different situations: When you use seccomp(), infrastructure is
> > > >
> > > > Sure. Note, that this is _one_ of the reasons why I want to make sure we
> > > > keep the native seccomp() only based way of getting an fd without
> > > > forcing userspace to switching to a differnet kernel api.
> > > >
> > > > > already in place for ensuring that your filter is only applied to
> > > > > processes over which you are capable, and propagation is limited by
> > > > > inheritance from your task down. When you use ptrace(), you need a
> > > > > pretty different sort of access check that checks whether you're
> > > > > privileged over ancestors, siblings and so on of the target task.
> > > >
> > > > So, don't get me wrong I'm not arguing against the ptrace() interface in
> > > > general. If this is something that people find useful, fine. But, I
> > > > would like to have a simple single-syscall pure-seccomp() based way of
> > > > getting an fd, i.e. what we have in patch 1 of this series.
> > >
> > > Yeah, I also prefer the seccomp() one.
> > >
> > > > > But thinking about it more, I think that CAP_SYS_ADMIN over the saved
> > > > > current->mm->user_ns of the task that installed the filter (stored as
> > > > > a "struct user_namespace *" in the filter) should be acceptable.
> > > >
> > > > Hm... Why not CAP_SYS_PTRACE?
> > >
> > > Because LSMs like SELinux add extra checks that apply even if you have
> > > CAP_SYS_PTRACE, and this would subvert those. The only capability I
> > > know of that lets you bypass LSM checks by design (if no LSM blocks
> > > the capability itself) is CAP_SYS_ADMIN.
> > >
> > > > One more thing. Citing from [1]
> > > >
> > > > > I think there's a security problem here. Imagine the following scenario:
> > > > >
> > > > > 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF
> > > > > 2. task A forks off a child B
> > > > > 3. task B uses setuid(1) to drop its privileges
> > > > > 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1)
> > > > > or via execve()
> > > > > 5. task C (the attacker, uid==1) attaches to task B via ptrace
> > > > > 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B
> > > >
> > > > Sorry, to be late to the party but would this really pass
> > > > __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me
> > > > that it would... Doesn't look like it would get past:
> > > >
> > > >         tcred = __task_cred(task);
> > > >         if (uid_eq(caller_uid, tcred->euid) &&
> > > >             uid_eq(caller_uid, tcred->suid) &&
> > > >             uid_eq(caller_uid, tcred->uid)  &&
> > > >             gid_eq(caller_gid, tcred->egid) &&
> > > >             gid_eq(caller_gid, tcred->sgid) &&
> > > >             gid_eq(caller_gid, tcred->gid))
> > > >                 goto ok;
> > > >         if (ptrace_has_cap(tcred->user_ns, mode))
> > > >                 goto ok;
> > > >         rcu_read_unlock();
> > > >         return -EPERM;
> > > > ok:
> > > >         rcu_read_unlock();
> > > >         mm = task->mm;
> > > >         if (mm &&
> > > >             ((get_dumpable(mm) != SUID_DUMP_USER) &&
> > > >              !ptrace_has_cap(mm->user_ns, mode)))
> > > >             return -EPERM;
> > >
> > > Which specific check would prevent task C from attaching to task B? If
> > > the UIDs match, the first "goto ok" executes; and you're dumpable, so
> > > you don't trigger the second "return -EPERM".
> > >
> > > > > 7. because the seccomp filter is shared by task A and task B, task C
> > > > > is now able to influence syscall results for syscalls performed by
> > > > > task A
> > > >
> > > > [1]: https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
> >
> >
> >
> > --
> > paul moore
> > www.paul-moore.com
Tycho Andersen Oct. 10, 2018, 4:54 p.m.
On Wed, Oct 10, 2018 at 05:39:57PM +0200, Christian Brauner wrote:
> On Wed, Oct 10, 2018 at 05:33:43PM +0200, Jann Horn wrote:
> > On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote:
> > > On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote:
> > > > +cc selinux people explicitly, since they probably have opinions on this
> > >
> > > I just spent about twenty minutes working my way through this thread,
> > > and digging through the containers archive trying to get a good
> > > understanding of what you guys are trying to do, and I'm not quite
> > > sure I understand it all.  However, from what I have seen, this
> > > approach looks very ptrace-y to me (I imagine to others as well based
> > > on the comments) and because of this I think ensuring the usual ptrace
> > > access controls are evaluated, including the ptrace LSM hooks, is the
> > > right thing to do.
> > 
> > Basically the problem is that this new ptrace() API does something
> > that doesn't just influence the target task, but also every other task
> > that has the same seccomp filter. So the classic ptrace check doesn't
> > work here.
> 
> Just to throw this into the mix: then maybe ptrace() isn't the right
> interface and we should just go with the native seccomp() approach for
> now.

Please no :).

I don't buy your arguments that 3-syscalls vs. one is better. If I'm
doing this setup with a new container, I have to do
clone(CLONE_FILES), do this seccomp thing, so that my parent can pick
it up again, then do another clone without CLONE_FILES, because in the
general case I don't want to share my fd table with the container,
wait on the middle task for errors, etc. So we're still doing a bunch
of setup, and it feels more awkward than ptrace, with at least as many
syscalls, and it only works for your children.

I don't mind leaving capable(CAP_SYS_ADMIN) for the ptrace() part,
though. So if that's ok, then I think we can agree :)

Tycho
Christian Brauner Oct. 10, 2018, 5:15 p.m.
On Wed, Oct 10, 2018 at 09:54:58AM -0700, Tycho Andersen wrote:
> On Wed, Oct 10, 2018 at 05:39:57PM +0200, Christian Brauner wrote:
> > On Wed, Oct 10, 2018 at 05:33:43PM +0200, Jann Horn wrote:
> > > On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote:
> > > > On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote:
> > > > > +cc selinux people explicitly, since they probably have opinions on this
> > > >
> > > > I just spent about twenty minutes working my way through this thread,
> > > > and digging through the containers archive trying to get a good
> > > > understanding of what you guys are trying to do, and I'm not quite
> > > > sure I understand it all.  However, from what I have seen, this
> > > > approach looks very ptrace-y to me (I imagine to others as well based
> > > > on the comments) and because of this I think ensuring the usual ptrace
> > > > access controls are evaluated, including the ptrace LSM hooks, is the
> > > > right thing to do.
> > > 
> > > Basically the problem is that this new ptrace() API does something
> > > that doesn't just influence the target task, but also every other task
> > > that has the same seccomp filter. So the classic ptrace check doesn't
> > > work here.
> > 
> > Just to throw this into the mix: then maybe ptrace() isn't the right
> > interface and we should just go with the native seccomp() approach for
> > now.
> 
> Please no :).
> 
> I don't buy your arguments that 3-syscalls vs. one is better. If I'm
> doing this setup with a new container, I have to do
> clone(CLONE_FILES), do this seccomp thing, so that my parent can pick
> it up again, then do another clone without CLONE_FILES, because in the
> general case I don't want to share my fd table with the container,
> wait on the middle task for errors, etc. So we're still doing a bunch
> of setup, and it feels more awkward than ptrace, with at least as many
> syscalls, and it only works for your children.

You're talking about the case where you already have shot yourself in
the foot by blocking basically all other sensible ways of getting the fd
out.

Also, this was meant to show that parts of your initial justification
for implementing the ptrace() way of getting an fd doesn't really stand.
And it doesn't really. Even with ptrace() you can get into situations
where you're not able to get an fd. (see prior threads)

> 
> I don't mind leaving capable(CAP_SYS_ADMIN) for the ptrace() part,

Again, (prior threads) ptrace() or no ptrace() is something I do not
particularly care about as long as we have the
non-capable(CAP_SYS_ADMIN) seccomp() way of getting an fd out.

> though. So if that's ok, then I think we can agree :)
> 
> Tycho
Tycho Andersen Oct. 10, 2018, 5:26 p.m.
On Wed, Oct 10, 2018 at 07:15:02PM +0200, Christian Brauner wrote:
> On Wed, Oct 10, 2018 at 09:54:58AM -0700, Tycho Andersen wrote:
> > On Wed, Oct 10, 2018 at 05:39:57PM +0200, Christian Brauner wrote:
> > > On Wed, Oct 10, 2018 at 05:33:43PM +0200, Jann Horn wrote:
> > > > On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote:
> > > > > On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote:
> > > > > > +cc selinux people explicitly, since they probably have opinions on this
> > > > >
> > > > > I just spent about twenty minutes working my way through this thread,
> > > > > and digging through the containers archive trying to get a good
> > > > > understanding of what you guys are trying to do, and I'm not quite
> > > > > sure I understand it all.  However, from what I have seen, this
> > > > > approach looks very ptrace-y to me (I imagine to others as well based
> > > > > on the comments) and because of this I think ensuring the usual ptrace
> > > > > access controls are evaluated, including the ptrace LSM hooks, is the
> > > > > right thing to do.
> > > > 
> > > > Basically the problem is that this new ptrace() API does something
> > > > that doesn't just influence the target task, but also every other task
> > > > that has the same seccomp filter. So the classic ptrace check doesn't
> > > > work here.
> > > 
> > > Just to throw this into the mix: then maybe ptrace() isn't the right
> > > interface and we should just go with the native seccomp() approach for
> > > now.
> > 
> > Please no :).
> > 
> > I don't buy your arguments that 3-syscalls vs. one is better. If I'm
> > doing this setup with a new container, I have to do
> > clone(CLONE_FILES), do this seccomp thing, so that my parent can pick
> > it up again, then do another clone without CLONE_FILES, because in the
> > general case I don't want to share my fd table with the container,
> > wait on the middle task for errors, etc. So we're still doing a bunch
> > of setup, and it feels more awkward than ptrace, with at least as many
> > syscalls, and it only works for your children.
> 
> You're talking about the case where you already have shot yourself in
> the foot by blocking basically all other sensible ways of getting the fd
> out.

Ok, but these other ways involve syscalls too (sendmsg() or whatever).
And if you're going to allow arbitrary policy from your users, you
have to be maximally flexible.

> Also, this was meant to show that parts of your initial justification
> for implementing the ptrace() way of getting an fd doesn't really stand.
> And it doesn't really. Even with ptrace() you can get into situations
> where you're not able to get an fd. (see prior threads)

Of course. I guess my point was that we shouldn't design an API that's
impossible to use. I'll drop the notes about sendmsg() from the commit
message.

Tycho
Andy Lutomirski Oct. 10, 2018, 5:45 p.m.
On Mon, Oct 8, 2018 at 11:00 AM Tycho Andersen <tycho@tycho.ws> wrote:
>
> On Mon, Oct 08, 2018 at 05:16:30PM +0200, Christian Brauner wrote:
> > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote:
> > > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace()
> > > version which can acquire filters is useful. There are at least two reasons
> > > this is preferable, even though it uses ptrace:
> > >
> > > 1. You can control tasks that aren't cooperating with you
> > > 2. You can control tasks whose filters block sendmsg() and socket(); if the
> > >    task installs a filter which blocks these calls, there's no way with
> > >    SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task.
> >
> > So for the slow of mind aka me:
> > I'm not sure I completely understand this problem. Can you outline how
> > sendmsg() and socket() are involved in this?
> >
> > I'm also not sure that this holds (but I might misunderstand the
> > problem) afaict, you could do try to get the fd out via CLONE_FILES and
> > other means so something like:
> >
> > // let's pretend the libc wrapper for clone actually has sane semantics
> > pid = clone(CLONE_FILES);
> > if (pid == 0) {
> >         fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
> >
> >         // Now this fd will be valid in both parent and child.
> >         // If you haven't blocked it you can inform the parent what
> >         // the fd number is via pipe2(). If you have blocked it you can
> >         // use dup2() and dup to a known fd number.
> > }
>
> But what if your seccomp filter wants to block both pipe2() and
> dup2()? Whatever syscall you want to use to do this could be blocked
> by some seccomp policy, which means you might not be able to use this
> feature in some cases.

You don't need a syscall at all. You can use shared memory.

>
> Perhaps it's unlikely, and we can just go forward knowing this. But it
> seems like it is worth at least acknowledging that you can wedge
> yourself into a corner.
>

I think that what we *really* want is a way to create a seccomp fitter
and activate it later (on execve or via another call to seccomp(),
perhaps).  And we already sort of have that using ptrace() but a
better interface would be nice when a real use case gets figured out.
Christian Brauner Oct. 10, 2018, 6:26 p.m.
On Wed, Oct 10, 2018 at 10:45:29AM -0700, Andy Lutomirski wrote:
> On Mon, Oct 8, 2018 at 11:00 AM Tycho Andersen <tycho@tycho.ws> wrote:
> >
> > On Mon, Oct 08, 2018 at 05:16:30PM +0200, Christian Brauner wrote:
> > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote:
> > > > As an alternative to SECCOMP_FILTER_FLAG_GET_LISTENER, perhaps a ptrace()
> > > > version which can acquire filters is useful. There are at least two reasons
> > > > this is preferable, even though it uses ptrace:
> > > >
> > > > 1. You can control tasks that aren't cooperating with you
> > > > 2. You can control tasks whose filters block sendmsg() and socket(); if the
> > > >    task installs a filter which blocks these calls, there's no way with
> > > >    SECCOMP_FILTER_FLAG_GET_LISTENER to get the fd out to the privileged task.
> > >
> > > So for the slow of mind aka me:
> > > I'm not sure I completely understand this problem. Can you outline how
> > > sendmsg() and socket() are involved in this?
> > >
> > > I'm also not sure that this holds (but I might misunderstand the
> > > problem) afaict, you could do try to get the fd out via CLONE_FILES and
> > > other means so something like:
> > >
> > > // let's pretend the libc wrapper for clone actually has sane semantics
> > > pid = clone(CLONE_FILES);
> > > if (pid == 0) {
> > >         fd = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
> > >
> > >         // Now this fd will be valid in both parent and child.
> > >         // If you haven't blocked it you can inform the parent what
> > >         // the fd number is via pipe2(). If you have blocked it you can
> > >         // use dup2() and dup to a known fd number.
> > > }
> >
> > But what if your seccomp filter wants to block both pipe2() and
> > dup2()? Whatever syscall you want to use to do this could be blocked
> > by some seccomp policy, which means you might not be able to use this
> > feature in some cases.
> 
> You don't need a syscall at all. You can use shared memory.

Yeah, I pointed that out too in the next mail. :)

> 
> >
> > Perhaps it's unlikely, and we can just go forward knowing this. But it
> > seems like it is worth at least acknowledging that you can wedge
> > yourself into a corner.
> >
> 
> I think that what we *really* want is a way to create a seccomp fitter

I thought about this exact thing when discussing my reservations about
ptrace() but I didn't want to defer this patchset any longer. But I
really like this idea of being able to get an fd *before* the filter is
loaded.

> and activate it later (on execve or via another call to seccomp(),
> perhaps).  And we already sort of have that using ptrace() but a
> better interface would be nice when a real use case gets figured out.
Christian Brauner Oct. 10, 2018, 6:28 p.m.
On Wed, Oct 10, 2018 at 10:26:22AM -0700, Tycho Andersen wrote:
> On Wed, Oct 10, 2018 at 07:15:02PM +0200, Christian Brauner wrote:
> > On Wed, Oct 10, 2018 at 09:54:58AM -0700, Tycho Andersen wrote:
> > > On Wed, Oct 10, 2018 at 05:39:57PM +0200, Christian Brauner wrote:
> > > > On Wed, Oct 10, 2018 at 05:33:43PM +0200, Jann Horn wrote:
> > > > > On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote:
> > > > > > On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote:
> > > > > > > +cc selinux people explicitly, since they probably have opinions on this
> > > > > >
> > > > > > I just spent about twenty minutes working my way through this thread,
> > > > > > and digging through the containers archive trying to get a good
> > > > > > understanding of what you guys are trying to do, and I'm not quite
> > > > > > sure I understand it all.  However, from what I have seen, this
> > > > > > approach looks very ptrace-y to me (I imagine to others as well based
> > > > > > on the comments) and because of this I think ensuring the usual ptrace
> > > > > > access controls are evaluated, including the ptrace LSM hooks, is the
> > > > > > right thing to do.
> > > > > 
> > > > > Basically the problem is that this new ptrace() API does something
> > > > > that doesn't just influence the target task, but also every other task
> > > > > that has the same seccomp filter. So the classic ptrace check doesn't
> > > > > work here.
> > > > 
> > > > Just to throw this into the mix: then maybe ptrace() isn't the right
> > > > interface and we should just go with the native seccomp() approach for
> > > > now.
> > > 
> > > Please no :).
> > > 
> > > I don't buy your arguments that 3-syscalls vs. one is better. If I'm
> > > doing this setup with a new container, I have to do
> > > clone(CLONE_FILES), do this seccomp thing, so that my parent can pick
> > > it up again, then do another clone without CLONE_FILES, because in the
> > > general case I don't want to share my fd table with the container,
> > > wait on the middle task for errors, etc. So we're still doing a bunch
> > > of setup, and it feels more awkward than ptrace, with at least as many
> > > syscalls, and it only works for your children.
> > 
> > You're talking about the case where you already have shot yourself in
> > the foot by blocking basically all other sensible ways of getting the fd
> > out.
> 
> Ok, but these other ways involve syscalls too (sendmsg() or whatever).
> And if you're going to allow arbitrary policy from your users, you
> have to be maximally flexible.

So, I totally like the idea of being able to get an fd before the filter
is active. If this could be done in seccomp()-only it would be A+ (See
Andy's mail in the other thread.)
But I really don't want to keep you working on this forever. :)

> 
> > Also, this was meant to show that parts of your initial justification
> > for implementing the ptrace() way of getting an fd doesn't really stand.
> > And it doesn't really. Even with ptrace() you can get into situations
> > where you're not able to get an fd. (see prior threads)
> 
> Of course. I guess my point was that we shouldn't design an API that's
> impossible to use. I'll drop the notes about sendmsg() from the commit
> message.
> 
> Tycho
Paul Moore Oct. 11, 2018, 7:24 a.m.
On October 10, 2018 11:34:11 AM Jann Horn <jannh@google.com> wrote:
> On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote:
>> On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote:
>>> +cc selinux people explicitly, since they probably have opinions on this
>>
>> I just spent about twenty minutes working my way through this thread,
>> and digging through the containers archive trying to get a good
>> understanding of what you guys are trying to do, and I'm not quite
>> sure I understand it all.  However, from what I have seen, this
>> approach looks very ptrace-y to me (I imagine to others as well based
>> on the comments) and because of this I think ensuring the usual ptrace
>> access controls are evaluated, including the ptrace LSM hooks, is the
>> right thing to do.
>
> Basically the problem is that this new ptrace() API does something
> that doesn't just influence the target task, but also every other task
> that has the same seccomp filter. So the classic ptrace check doesn't
> work here.

Due to some rather unfortunate events today I'm suddenly without easy access to the kernel code, but would it be possible to run the LSM ptrace access control checks against all of the affected tasks?  If it is possible, how painful would it be?

>
>> If I've missed something, or I'm thinking about this wrong, please
>> educate me; just a heads-up that I'm largely offline for most of this
>> week so responses on my end are going to be delayed much more than
>> usual.
>>
>>> On Tue, Oct 9, 2018 at 3:29 PM Christian Brauner <christian@brauner.io> wrote:
>>>> On Tue, Oct 09, 2018 at 02:39:53PM +0200, Jann Horn wrote:
>>>>> On Mon, Oct 8, 2018 at 8:18 PM Christian Brauner <christian@brauner.io> wrote:
>>>>>> On Mon, Oct 08, 2018 at 06:42:00PM +0200, Jann Horn wrote:
>>>>>>> On Mon, Oct 8, 2018 at 6:21 PM Christian Brauner <christian@brauner.io> wrote:
>>>>>>> > On Mon, Oct 08, 2018 at 05:33:22PM +0200, Jann Horn wrote:
>>>>>>> > > On Mon, Oct 8, 2018 at 5:16 PM Christian Brauner <christian@brauner.io> wrote:
>>>>>>> > > > On Thu, Sep 27, 2018 at 09:11:16AM -0600, Tycho Andersen wrote:
>>>>>>> > > > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
>>>>>>> > > > > index 44a31ac8373a..17685803a2af 100644
>>>>>>> > > > > --- a/kernel/seccomp.c
>>>>>>> > > > > +++ b/kernel/seccomp.c
>>>>>>> > > > > @@ -1777,4 +1777,35 @@ static struct file *init_listener(struct task_struct *task,
>>>>>>> > > > >
>>>>>>> > > > >       return ret;
>>>>>>> > > > >  }
>>>>>>> > > > > +
>>>>>>> > > > > +long seccomp_new_listener(struct task_struct *task,
>>>>>>> > > > > +                       unsigned long filter_off)
>>>>>>> > > > > +{
>>>>>>> > > > > +     struct seccomp_filter *filter;
>>>>>>> > > > > +     struct file *listener;
>>>>>>> > > > > +     int fd;
>>>>>>> > > > > +
>>>>>>> > > > > +     if (!capable(CAP_SYS_ADMIN))
>>>>>>> > > > > +             return -EACCES;
>>>>>>> > > >
>>>>>>> > > > I know this might have been discussed a while back but why exactly do we
>>>>>>> > > > require CAP_SYS_ADMIN in init_userns and not in the target userns? What
>>>>>>> > > > if I want to do a setns()fd, CLONE_NEWUSER) to the target process and
>>>>>>> > > > use ptrace from in there?
>>>>>>> > >
>>>>>>> > > See https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
>>>>>>> > > . Basically, the problem is that this doesn't just give you capability
>>>>>>> > > over the target task, but also over every other task that has the same
>>>>>>> > > filter installed; you need some sort of "is the caller capable over
>>>>>>> > > the filter and anyone who uses it" check.
>>>>>>> >
>>>>>>> > Thanks.
>>>>>>> > But then this new ptrace feature as it stands is imho currently broken.
>>>>>>> > If you can install a seccomp filter with SECCOMP_RET_USER_NOTIF if you
>>>>>>> > are ns_cpabable(CAP_SYS_ADMIN) and also get an fd via seccomp() itself
>>>>>>> > if you are ns_cpabable(CAP_SYS_ADMIN)
>>>>>
>>>>> Actually, you don't need CAP_SYS_ADMIN for seccomp() at all as long as
>>>>> you enable the NNP flag, I think?
>>>>
>>>> Yes, if you turn on NNP you don't even need sys_admin.
>>>>
>>>>>
>>>>>>> > then either the new ptrace() api
>>>>>>> > extension should be fixed to allow for this too or the seccomp() way of
>>>>>>> > retrieving the pid - which I really think we want - needs to be fixed to
>>>>>>> > require capable(CAP_SYS_ADMIN) too.
>>>>>>> > The solution where both require ns_capable(CAP_SYS_ADMIN) is - imho -
>>>>>>> > the preferred way to solve this.
>>>>>>> > Everything else will just be confusing.
>>>>>>>
>>>>>>> First you say "broken", then you say "confusing". Which one do you mean?
>>>>>>
>>>>>> Both. It's broken in so far as it places a seemingly unnecessary
>>>>>> restriction that could be fixed. You outlined one possible fix yourself
>>>>>> in the link you provided.
>>>>>
>>>>> If by "possible fix" you mean "check whether the seccomp filter is
>>>>> only attached to a single task": That wouldn't fundamentally change
>>>>> the situation, it would only add an additional special case.
>>>>>
>>>>>> And it's confusing in so far as there is a way
>>>>>> via seccomp() to get the fd without said requirement.
>>>>>
>>>>> I don't find it confusing at all. seccomp() and ptrace() are very
>>>>
>>>> Fine, then that's a matter of opinion. I find it counterintuitive that
>>>> you can get an fd without privileges via one interface but not via
>>>> another.
>>>>
>>>>> different situations: When you use seccomp(), infrastructure is
>>>>
>>>> Sure. Note, that this is _one_ of the reasons why I want to make sure we
>>>> keep the native seccomp() only based way of getting an fd without
>>>> forcing userspace to switching to a differnet kernel api.
>>>>
>>>>> already in place for ensuring that your filter is only applied to
>>>>> processes over which you are capable, and propagation is limited by
>>>>> inheritance from your task down. When you use ptrace(), you need a
>>>>> pretty different sort of access check that checks whether you're
>>>>> privileged over ancestors, siblings and so on of the target task.
>>>>
>>>> So, don't get me wrong I'm not arguing against the ptrace() interface in
>>>> general. If this is something that people find useful, fine. But, I
>>>> would like to have a simple single-syscall pure-seccomp() based way of
>>>> getting an fd, i.e. what we have in patch 1 of this series.
>>>
>>> Yeah, I also prefer the seccomp() one.
>>>
>>>>> But thinking about it more, I think that CAP_SYS_ADMIN over the saved
>>>>> current->mm->user_ns of the task that installed the filter (stored as
>>>>> a "struct user_namespace *" in the filter) should be acceptable.
>>>>
>>>> Hm... Why not CAP_SYS_PTRACE?
>>>
>>> Because LSMs like SELinux add extra checks that apply even if you have
>>> CAP_SYS_PTRACE, and this would subvert those. The only capability I
>>> know of that lets you bypass LSM checks by design (if no LSM blocks
>>> the capability itself) is CAP_SYS_ADMIN.
>>>
>>>> One more thing. Citing from [1]
>>>>
>>>>> I think there's a security problem here. Imagine the following scenario:
>>>>>
>>>>> 1. task A (uid==0) sets up a seccomp filter that uses SECCOMP_RET_USER_NOTIF
>>>>> 2. task A forks off a child B
>>>>> 3. task B uses setuid(1) to drop its privileges
>>>>> 4. task B becomes dumpable again, either via prctl(PR_SET_DUMPABLE, 1)
>>>>> or via execve()
>>>>> 5. task C (the attacker, uid==1) attaches to task B via ptrace
>>>>> 6. task C uses PTRACE_SECCOMP_NEW_LISTENER on task B
>>>>
>>>> Sorry, to be late to the party but would this really pass
>>>> __ptrace_may_access() in ptrace_attach()? It doesn't seem obvious to me
>>>> that it would... Doesn't look like it would get past:
>>>>
>>>>    tcred = __task_cred(task);
>>>>    if (uid_eq(caller_uid, tcred->euid) &&
>>>>        uid_eq(caller_uid, tcred->suid) &&
>>>>        uid_eq(caller_uid, tcred->uid)  &&
>>>>        gid_eq(caller_gid, tcred->egid) &&
>>>>        gid_eq(caller_gid, tcred->sgid) &&
>>>>        gid_eq(caller_gid, tcred->gid))
>>>>            goto ok;
>>>>    if (ptrace_has_cap(tcred->user_ns, mode))
>>>>            goto ok;
>>>>    rcu_read_unlock();
>>>>    return -EPERM;
>>>> ok:
>>>>    rcu_read_unlock();
>>>>    mm = task->mm;
>>>>    if (mm &&
>>>>        ((get_dumpable(mm) != SUID_DUMP_USER) &&
>>>>         !ptrace_has_cap(mm->user_ns, mode)))
>>>>        return -EPERM;
>>>
>>> Which specific check would prevent task C from attaching to task B? If
>>> the UIDs match, the first "goto ok" executes; and you're dumpable, so
>>> you don't trigger the second "return -EPERM".
>>>
>>>>> 7. because the seccomp filter is shared by task A and task B, task C
>>>>> is now able to influence syscall results for syscalls performed by
>>>>> task A
>>>>
>>>> [1]: https://lore.kernel.org/lkml/CAG48ez3R+ZJ1vwGkDfGzKX2mz6f=jjJWsO5pCvnH68P+RKO8Ow@mail.gmail.com/
>>
>>
>>
>> --
>> paul moore
>> www.paul-moore.com
Jann Horn via Containers Oct. 11, 2018, 1:39 p.m.
On Thu, Oct 11, 2018 at 9:24 AM Paul Moore <paul@paul-moore.com> wrote:
> On October 10, 2018 11:34:11 AM Jann Horn <jannh@google.com> wrote:
> > On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote:
> >> On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote:
> >>> +cc selinux people explicitly, since they probably have opinions on this
> >>
> >> I just spent about twenty minutes working my way through this thread,
> >> and digging through the containers archive trying to get a good
> >> understanding of what you guys are trying to do, and I'm not quite
> >> sure I understand it all.  However, from what I have seen, this
> >> approach looks very ptrace-y to me (I imagine to others as well based
> >> on the comments) and because of this I think ensuring the usual ptrace
> >> access controls are evaluated, including the ptrace LSM hooks, is the
> >> right thing to do.
> >
> > Basically the problem is that this new ptrace() API does something
> > that doesn't just influence the target task, but also every other task
> > that has the same seccomp filter. So the classic ptrace check doesn't
> > work here.
>
> Due to some rather unfortunate events today I'm suddenly without easy access to the kernel code, but would it be possible to run the LSM ptrace access control checks against all of the affected tasks?  If it is possible, how painful would it be?

There are currently no backlinks from seccomp filters to the tasks
that use them; the only thing you have is a refcount. If the refcount
is 1, and the target task uses the filter directly (it is the last
installed one), you'd be able to infer that the ptrace target is the
only task with a reference to the filter, and you could just do the
direct check; but if the refcount is >1, you might end up having to
take some spinlock and then iterate over all tasks' filters with that
spinlock held, or something like that.
Paul Moore Oct. 11, 2018, 11:10 p.m.
On October 11, 2018 9:40:06 AM Jann Horn <jannh@google.com> wrote:
> On Thu, Oct 11, 2018 at 9:24 AM Paul Moore <paul@paul-moore.com> wrote:
>> On October 10, 2018 11:34:11 AM Jann Horn <jannh@google.com> wrote:
>>> On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote:
>>>> On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote:
>>>>> +cc selinux people explicitly, since they probably have opinions on this
>>>>
>>>> I just spent about twenty minutes working my way through this thread,
>>>> and digging through the containers archive trying to get a good
>>>> understanding of what you guys are trying to do, and I'm not quite
>>>> sure I understand it all.  However, from what I have seen, this
>>>> approach looks very ptrace-y to me (I imagine to others as well based
>>>> on the comments) and because of this I think ensuring the usual ptrace
>>>> access controls are evaluated, including the ptrace LSM hooks, is the
>>>> right thing to do.
>>>
>>> Basically the problem is that this new ptrace() API does something
>>> that doesn't just influence the target task, but also every other task
>>> that has the same seccomp filter. So the classic ptrace check doesn't
>>> work here.
>>
>> Due to some rather unfortunate events today I'm suddenly without easy access to the kernel code, but would it be possible to run the LSM ptrace access control checks against all of the affected tasks?  If it is possible, how painful would it be?
>
> There are currently no backlinks from seccomp filters to the tasks
> that use them; the only thing you have is a refcount. If the refcount
> is 1, and the target task uses the filter directly (it is the last
> installed one), you'd be able to infer that the ptrace target is the
> only task with a reference to the filter, and you could just do the
> direct check; but if the refcount is >1, you might end up having to
> take some spinlock and then iterate over all tasks' filters with that
> spinlock held, or something like that.

That's what I was afraid of.

Unfortunately, I stand by my previous statements that we still probably want a LSM access check similar to what we currently do for ptrace.

--
paul moore
www.paul-moore.com
Andy Lutomirski Oct. 12, 2018, 1:02 a.m.
On Thu, Oct 11, 2018 at 4:10 PM Paul Moore <paul@paul-moore.com> wrote:
>
> On October 11, 2018 9:40:06 AM Jann Horn <jannh@google.com> wrote:
> > On Thu, Oct 11, 2018 at 9:24 AM Paul Moore <paul@paul-moore.com> wrote:
> >> On October 10, 2018 11:34:11 AM Jann Horn <jannh@google.com> wrote:
> >>> On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote:
> >>>> On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote:
> >>>>> +cc selinux people explicitly, since they probably have opinions on this
> >>>>
> >>>> I just spent about twenty minutes working my way through this thread,
> >>>> and digging through the containers archive trying to get a good
> >>>> understanding of what you guys are trying to do, and I'm not quite
> >>>> sure I understand it all.  However, from what I have seen, this
> >>>> approach looks very ptrace-y to me (I imagine to others as well based
> >>>> on the comments) and because of this I think ensuring the usual ptrace
> >>>> access controls are evaluated, including the ptrace LSM hooks, is the
> >>>> right thing to do.
> >>>
> >>> Basically the problem is that this new ptrace() API does something
> >>> that doesn't just influence the target task, but also every other task
> >>> that has the same seccomp filter. So the classic ptrace check doesn't
> >>> work here.
> >>
> >> Due to some rather unfortunate events today I'm suddenly without easy access to the kernel code, but would it be possible to run the LSM ptrace access control checks against all of the affected tasks?  If it is possible, how painful would it be?
> >
> > There are currently no backlinks from seccomp filters to the tasks
> > that use them; the only thing you have is a refcount. If the refcount
> > is 1, and the target task uses the filter directly (it is the last
> > installed one), you'd be able to infer that the ptrace target is the
> > only task with a reference to the filter, and you could just do the
> > direct check; but if the refcount is >1, you might end up having to
> > take some spinlock and then iterate over all tasks' filters with that
> > spinlock held, or something like that.
>
> That's what I was afraid of.
>
> Unfortunately, I stand by my previous statements that we still probably want a LSM access check similar to what we currently do for ptrace.
>

I would argue that once "LSM" enters this conversation, it just means
we're on the wrong track.  Let's try to make this work without ptrace
if possible :)  The whole seccomp() mechanism is very carefully
designed so that it's perfectly safe to install seccomp filters
without involving LSM or even involving credentials at all.

--Andy
Tycho Andersen Oct. 12, 2018, 8:02 p.m.
On Thu, Oct 11, 2018 at 06:02:06PM -0700, Andy Lutomirski wrote:
> On Thu, Oct 11, 2018 at 4:10 PM Paul Moore <paul@paul-moore.com> wrote:
> >
> > On October 11, 2018 9:40:06 AM Jann Horn <jannh@google.com> wrote:
> > > On Thu, Oct 11, 2018 at 9:24 AM Paul Moore <paul@paul-moore.com> wrote:
> > >> On October 10, 2018 11:34:11 AM Jann Horn <jannh@google.com> wrote:
> > >>> On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote:
> > >>>> On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote:
> > >>>>> +cc selinux people explicitly, since they probably have opinions on this
> > >>>>
> > >>>> I just spent about twenty minutes working my way through this thread,
> > >>>> and digging through the containers archive trying to get a good
> > >>>> understanding of what you guys are trying to do, and I'm not quite
> > >>>> sure I understand it all.  However, from what I have seen, this
> > >>>> approach looks very ptrace-y to me (I imagine to others as well based
> > >>>> on the comments) and because of this I think ensuring the usual ptrace
> > >>>> access controls are evaluated, including the ptrace LSM hooks, is the
> > >>>> right thing to do.
> > >>>
> > >>> Basically the problem is that this new ptrace() API does something
> > >>> that doesn't just influence the target task, but also every other task
> > >>> that has the same seccomp filter. So the classic ptrace check doesn't
> > >>> work here.
> > >>
> > >> Due to some rather unfortunate events today I'm suddenly without easy access to the kernel code, but would it be possible to run the LSM ptrace access control checks against all of the affected tasks?  If it is possible, how painful would it be?
> > >
> > > There are currently no backlinks from seccomp filters to the tasks
> > > that use them; the only thing you have is a refcount. If the refcount
> > > is 1, and the target task uses the filter directly (it is the last
> > > installed one), you'd be able to infer that the ptrace target is the
> > > only task with a reference to the filter, and you could just do the
> > > direct check; but if the refcount is >1, you might end up having to
> > > take some spinlock and then iterate over all tasks' filters with that
> > > spinlock held, or something like that.
> >
> > That's what I was afraid of.
> >
> > Unfortunately, I stand by my previous statements that we still probably want a LSM access check similar to what we currently do for ptrace.
> >
> 
> I would argue that once "LSM" enters this conversation, it just means
> we're on the wrong track.  Let's try to make this work without ptrace
> if possible :)  The whole seccomp() mechanism is very carefully
> designed so that it's perfectly safe to install seccomp filters
> without involving LSM or even involving credentials at all.

In a last ditch effort to save the ptrace() interface: can we just
only allow it when refcount_read(filter->usage) == 1?

Tycho
Jann Horn via Containers Oct. 12, 2018, 8:06 p.m.
On Fri, Oct 12, 2018 at 10:02 PM Tycho Andersen <tycho@tycho.ws> wrote:
> On Thu, Oct 11, 2018 at 06:02:06PM -0700, Andy Lutomirski wrote:
> > On Thu, Oct 11, 2018 at 4:10 PM Paul Moore <paul@paul-moore.com> wrote:
> > >
> > > On October 11, 2018 9:40:06 AM Jann Horn <jannh@google.com> wrote:
> > > > On Thu, Oct 11, 2018 at 9:24 AM Paul Moore <paul@paul-moore.com> wrote:
> > > >> On October 10, 2018 11:34:11 AM Jann Horn <jannh@google.com> wrote:
> > > >>> On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote:
> > > >>>> On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote:
> > > >>>>> +cc selinux people explicitly, since they probably have opinions on this
> > > >>>>
> > > >>>> I just spent about twenty minutes working my way through this thread,
> > > >>>> and digging through the containers archive trying to get a good
> > > >>>> understanding of what you guys are trying to do, and I'm not quite
> > > >>>> sure I understand it all.  However, from what I have seen, this
> > > >>>> approach looks very ptrace-y to me (I imagine to others as well based
> > > >>>> on the comments) and because of this I think ensuring the usual ptrace
> > > >>>> access controls are evaluated, including the ptrace LSM hooks, is the
> > > >>>> right thing to do.
> > > >>>
> > > >>> Basically the problem is that this new ptrace() API does something
> > > >>> that doesn't just influence the target task, but also every other task
> > > >>> that has the same seccomp filter. So the classic ptrace check doesn't
> > > >>> work here.
> > > >>
> > > >> Due to some rather unfortunate events today I'm suddenly without easy access to the kernel code, but would it be possible to run the LSM ptrace access control checks against all of the affected tasks?  If it is possible, how painful would it be?
> > > >
> > > > There are currently no backlinks from seccomp filters to the tasks
> > > > that use them; the only thing you have is a refcount. If the refcount
> > > > is 1, and the target task uses the filter directly (it is the last
> > > > installed one), you'd be able to infer that the ptrace target is the
> > > > only task with a reference to the filter, and you could just do the
> > > > direct check; but if the refcount is >1, you might end up having to
> > > > take some spinlock and then iterate over all tasks' filters with that
> > > > spinlock held, or something like that.
> > >
> > > That's what I was afraid of.
> > >
> > > Unfortunately, I stand by my previous statements that we still probably want a LSM access check similar to what we currently do for ptrace.
> > >
> >
> > I would argue that once "LSM" enters this conversation, it just means
> > we're on the wrong track.  Let's try to make this work without ptrace
> > if possible :)  The whole seccomp() mechanism is very carefully
> > designed so that it's perfectly safe to install seccomp filters
> > without involving LSM or even involving credentials at all.
>
> In a last ditch effort to save the ptrace() interface: can we just
> only allow it when refcount_read(filter->usage) == 1?

From a security perspective, I think that would be fine, assuming that
we know that the target task is stopped. (But note that if the target
process e.g. uses the filter on multiple threads, the refcount will be
higher.)
Christian Brauner Oct. 12, 2018, 8:11 p.m.
On Fri, Oct 12, 2018 at 01:02:20PM -0700, Tycho Andersen wrote:
> On Thu, Oct 11, 2018 at 06:02:06PM -0700, Andy Lutomirski wrote:
> > On Thu, Oct 11, 2018 at 4:10 PM Paul Moore <paul@paul-moore.com> wrote:
> > >
> > > On October 11, 2018 9:40:06 AM Jann Horn <jannh@google.com> wrote:
> > > > On Thu, Oct 11, 2018 at 9:24 AM Paul Moore <paul@paul-moore.com> wrote:
> > > >> On October 10, 2018 11:34:11 AM Jann Horn <jannh@google.com> wrote:
> > > >>> On Wed, Oct 10, 2018 at 5:32 PM Paul Moore <paul@paul-moore.com> wrote:
> > > >>>> On Tue, Oct 9, 2018 at 9:36 AM Jann Horn <jannh@google.com> wrote:
> > > >>>>> +cc selinux people explicitly, since they probably have opinions on this
> > > >>>>
> > > >>>> I just spent about twenty minutes working my way through this thread,
> > > >>>> and digging through the containers archive trying to get a good
> > > >>>> understanding of what you guys are trying to do, and I'm not quite
> > > >>>> sure I understand it all.  However, from what I have seen, this
> > > >>>> approach looks very ptrace-y to me (I imagine to others as well based
> > > >>>> on the comments) and because of this I think ensuring the usual ptrace
> > > >>>> access controls are evaluated, including the ptrace LSM hooks, is the
> > > >>>> right thing to do.
> > > >>>
> > > >>> Basically the problem is that this new ptrace() API does something
> > > >>> that doesn't just influence the target task, but also every other task
> > > >>> that has the same seccomp filter. So the classic ptrace check doesn't
> > > >>> work here.
> > > >>
> > > >> Due to some rather unfortunate events today I'm suddenly without easy access to the kernel code, but would it be possible to run the LSM ptrace access control checks against all of the affected tasks?  If it is possible, how painful would it be?
> > > >
> > > > There are currently no backlinks from seccomp filters to the tasks
> > > > that use them; the only thing you have is a refcount. If the refcount
> > > > is 1, and the target task uses the filter directly (it is the last
> > > > installed one), you'd be able to infer that the ptrace target is the
> > > > only task with a reference to the filter, and you could just do the
> > > > direct check; but if the refcount is >1, you might end up having to
> > > > take some spinlock and then iterate over all tasks' filters with that
> > > > spinlock held, or something like that.
> > >
> > > That's what I was afraid of.
> > >
> > > Unfortunately, I stand by my previous statements that we still probably want a LSM access check similar to what we currently do for ptrace.
> > >
> > 
> > I would argue that once "LSM" enters this conversation, it just means
> > we're on the wrong track.  Let's try to make this work without ptrace
> > if possible :)  The whole seccomp() mechanism is very carefully
> > designed so that it's perfectly safe to install seccomp filters
> > without involving LSM or even involving credentials at all.
> 
> In a last ditch effort to save the ptrace() interface: can we just
> only allow it when refcount_read(filter->usage) == 1?

I mean, the filter->usage == 1 means lets us get rid of
capable(CAP_SYS_ADMIN) making the ptrace() way of getting an fd useable
in nesting scenarios and from within user namespaces. That makes it a
whole lot more useful and aligns it with the seccomp() way of getting
the fd. So I wouldn't argue against it.
I guess it comes down to (for me) whether you consider this a necessary
part of this patchset aka meaning without it it wouldn't be useable.

Christian