[v3,4/4] seccomp: add support for passing fds via USER_NOTIF

Submitted by Tycho Andersen on May 31, 2018, 2:49 p.m.

Details

Message ID 20180531144949.24995-5-tycho@tycho.ws
State New
Series "seccomp trap to userspace"
Headers show

Commit Message

Tycho Andersen May 31, 2018, 2:49 p.m.
The idea here is that the userspace handler should be able to pass an fd
back to the trapped task, for example so it can be returned from socket().

I've proposed one API here, but I'm open to other options. In particular,
this only lets you return an fd from a syscall, which may not be enough in
all cases. For example, if an fd is written to an output parameter instead
of returned, the current API can't handle this. Another case is that
netlink takes as input fds sometimes (IFLA_NET_NS_FD, e.g.). If netlink
ever decides to install an fd and output it, we wouldn't be able to handle
this either.

Still, the vast majority of interesting cases are covered by this API, so
perhaps it is Enough.

I've left it as a separate commit for two reasons:
  * It illustrates the way in which we would grow struct seccomp_notif and
    struct seccomp_notif_resp without using netlink
  * It shows just how little code is needed to accomplish this :)

v2: new in v2
v3: no changes

Signed-off-by: Tycho Andersen <tycho@tycho.ws>
CC: Kees Cook <keescook@chromium.org>
CC: Andy Lutomirski <luto@amacapital.net>
CC: Oleg Nesterov <oleg@redhat.com>
CC: Eric W. Biederman <ebiederm@xmission.com>
CC: "Serge E. Hallyn" <serge@hallyn.com>
CC: Christian Brauner <christian.brauner@ubuntu.com>
CC: Tyler Hicks <tyhicks@canonical.com>
CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
---
 include/uapi/linux/seccomp.h                  |   2 +
 kernel/seccomp.c                              |  49 +++++++-
 tools/testing/selftests/seccomp/seccomp_bpf.c | 112 ++++++++++++++++++
 3 files changed, 161 insertions(+), 2 deletions(-)

Patch hide | download patch | download mbox

diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 8160e6cad528..3124427219cb 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -71,6 +71,8 @@  struct seccomp_notif_resp {
 	__u64 id;
 	__s32 error;
 	__s64 val;
+	__u8 return_fd;
+	__u32 fd;
 };
 
 #endif /* _UAPI_LINUX_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 6dc99c65c2f4..2ee958b3efde 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -77,6 +77,8 @@  struct seccomp_knotif {
 	/* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */
 	int error;
 	long val;
+	struct file *file;
+	unsigned int flags;
 
 	/* Signals when this has entered SECCOMP_NOTIFY_REPLIED */
 	struct completion ready;
@@ -780,10 +782,32 @@  static void seccomp_do_user_notification(int this_syscall,
 			goto remove_list;
 	}
 
-	ret = n.val;
-	err = n.error;
+	if (n.file) {
+		int fd;
+
+		fd = get_unused_fd_flags(n.flags);
+		if (fd < 0) {
+			err = fd;
+			ret = -1;
+			goto remove_list;
+		}
+
+		ret = fd;
+		err = 0;
+
+		fd_install(fd, n.file);
+		/* Don't fput, since fd has a reference now */
+		n.file = NULL;
+	} else {
+		ret = n.val;
+		err = n.error;
+	}
+
 
 remove_list:
+	if (n.file)
+		fput(n.file);
+
 	list_del(&n.list);
 out:
 	mutex_unlock(&match->notify_lock);
@@ -1598,6 +1622,27 @@  static ssize_t seccomp_notify_write(struct file *file, const char __user *buf,
 	knotif->state = SECCOMP_NOTIFY_REPLIED;
 	knotif->error = resp.error;
 	knotif->val = resp.val;
+
+	if (resp.return_fd) {
+		struct fd fd;
+
+		/*
+		 * This is a little hokey: we need a real fget() (i.e. not
+		 * __fget_light(), which is what fdget does), but we also need
+		 * the flags from strcut fd. So, we get it, put it, and get it
+		 * again for real.
+		 */
+		fd = fdget(resp.fd);
+		knotif->flags = fd.flags;
+		fdput(fd);
+
+		knotif->file = fget(resp.fd);
+		if (!knotif->file) {
+			ret = -EBADF;
+			goto out;
+		}
+	}
+
 	complete(&knotif->ready);
 out:
 	mutex_unlock(&filter->notify_lock);
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 07984f7bd601..b9a4f676566d 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -167,6 +167,8 @@  struct seccomp_notif_resp {
 	__u64 id;
 	__s32 error;
 	__s64 val;
+	__u8 return_fd;
+	__u32 fd;
 };
 #endif
 
@@ -3176,6 +3178,116 @@  TEST(get_user_notification_ptrace)
 	close(listener);
 }
 
+TEST(user_notification_pass_fd)
+{
+	pid_t pid;
+	int status, listener;
+	int sk_pair[2];
+	char c;
+	struct seccomp_notif req = {};
+	struct seccomp_notif_resp resp = {};
+	long ret;
+
+	ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		int fd;
+		char buf[16];
+
+		EXPECT_EQ(user_trap_syscall(__NR_getpid, 0), 0);
+
+		/* Signal we're ready and have installed the filter. */
+		EXPECT_EQ(write(sk_pair[1], "J", 1), 1);
+
+		EXPECT_EQ(read(sk_pair[1], &c, 1), 1);
+		EXPECT_EQ(c, 'H');
+		close(sk_pair[1]);
+
+		/* An fd from getpid(). Let the games begin. */
+		fd = syscall(__NR_getpid);
+		EXPECT_GT(fd, 0);
+		EXPECT_EQ(read(fd, buf, sizeof(buf)), 12);
+		close(fd);
+
+		exit(strcmp("hello world", buf));
+	}
+
+	EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
+	EXPECT_EQ(c, 'J');
+
+	EXPECT_EQ(ptrace(PTRACE_ATTACH, pid), 0);
+	EXPECT_EQ(waitpid(pid, NULL, 0), pid);
+	listener = ptrace(PTRACE_SECCOMP_GET_LISTENER, pid, 0);
+	EXPECT_GE(listener, 0);
+	EXPECT_EQ(ptrace(PTRACE_DETACH, pid, NULL, 0), 0);
+
+	/* Now signal we are done installing so it can do a getpid */
+	EXPECT_EQ(write(sk_pair[0], "H", 1), 1);
+	close(sk_pair[0]);
+
+	/* Make a new socket pair so we can send half across */
+	EXPECT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
+
+	ret = read_notif(listener, &req);
+	EXPECT_EQ(ret, sizeof(req));
+	EXPECT_EQ(errno, 0);
+
+	resp.id = req.id;
+	resp.return_fd = 1;
+	resp.fd = sk_pair[1];
+	EXPECT_EQ(write(listener, &resp, sizeof(resp)), sizeof(resp));
+	close(sk_pair[1]);
+
+	EXPECT_EQ(write(sk_pair[0], "hello world\0", 12), 12);
+	close(sk_pair[0]);
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+	close(listener);
+}
+
+TEST(user_notification_struct_size_mismatch)
+{
+	pid_t pid;
+	long ret;
+	int status, listener, len;
+	struct seccomp_notif req;
+	struct seccomp_notif_resp resp;
+
+	listener = user_trap_syscall(__NR_getpid,
+				     SECCOMP_FILTER_FLAG_GET_LISTENER);
+	EXPECT_GE(listener, 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		ret = syscall(__NR_getpid);
+		exit(ret != USER_NOTIF_MAGIC);
+	}
+
+	EXPECT_EQ(read(listener, &req, sizeof(req)), sizeof(req));
+
+	resp.id = req.id;
+	resp.error = 0;
+	resp.val = USER_NOTIF_MAGIC;
+
+	/*
+	 * Only write a partial structure: this is what was available before we
+	 * had fd support.
+	 */
+	len = offsetof(struct seccomp_notif_resp, val) + sizeof(resp.val);
+	EXPECT_EQ(write(listener, &resp, len), len);
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
 /*
  * TODO:
  * - add microbenchmarks

Comments

Jann Horn via Containers June 2, 2018, 1:13 p.m.
On Sat, Jun 2, 2018 at 2:58 PM Tycho Andersen <tycho@tycho.ws> wrote:
> The idea here is that the userspace handler should be able to pass an fd
> back to the trapped task, for example so it can be returned from socket().
>
> I've proposed one API here, but I'm open to other options. In particular,
> this only lets you return an fd from a syscall, which may not be enough in
> all cases. For example, if an fd is written to an output parameter instead
> of returned, the current API can't handle this. Another case is that
> netlink takes as input fds sometimes (IFLA_NET_NS_FD, e.g.). If netlink
> ever decides to install an fd and output it, we wouldn't be able to handle
> this either.
>
> Still, the vast majority of interesting cases are covered by this API, so
> perhaps it is Enough.
>
> I've left it as a separate commit for two reasons:
>   * It illustrates the way in which we would grow struct seccomp_notif and
>     struct seccomp_notif_resp without using netlink
>   * It shows just how little code is needed to accomplish this :)
[...]
> +               fd = get_unused_fd_flags(n.flags);

Here, you're using n.flags in a context where it will be tested
against O_CLOEXEC to determine whether the new fd should be
close-on-exec.

[...]
> +               /*
> +                * This is a little hokey: we need a real fget() (i.e. not
> +                * __fget_light(), which is what fdget does), but we also need
> +                * the flags from strcut fd. So, we get it, put it, and get it
> +                * again for real.
> +                */
> +               fd = fdget(resp.fd);
> +               knotif->flags = fd.flags;
> +               fdput(fd);
> +
> +               knotif->file = fget(resp.fd);
> +               if (!knotif->file) {
> +                       ret = -EBADF;
> +                       goto out;
> +               }

But here fd.flags contains the low 2 bits of the return value of
__fget_light, which are either 0 or FDPUT_FPUT (encoded as 1). This
flag states whether fdget() took a reference on the file, which is
mostly equivalent to "is the current process multithreaded?". (This is
the reason why fdget returns flags and fget doesn't - the flag from
fdget is to decide whether you'll need an fput(), which is
unconditional for fget().)

Apart from this issue, I think that in general, it's probably not a
good idea to copy the close-on-exec flag from the fd in the
supervising process - the supervising process might want all the fds
it is working with to be O_CLOEXEC independent of whether the
supervised process wants an O_CLOEXEC fd. It might make sense to add a
field for this to struct seccomp_notif_resp instead.
Tycho Andersen June 2, 2018, 6:18 p.m.
Hi Jann,

Thanks for taking a look!

On Sat, Jun 02, 2018 at 03:13:39PM +0200, Jann Horn wrote:
> On Sat, Jun 2, 2018 at 2:58 PM Tycho Andersen <tycho@tycho.ws> wrote:
> > The idea here is that the userspace handler should be able to pass an fd
> > back to the trapped task, for example so it can be returned from socket().
> >
> > I've proposed one API here, but I'm open to other options. In particular,
> > this only lets you return an fd from a syscall, which may not be enough in
> > all cases. For example, if an fd is written to an output parameter instead
> > of returned, the current API can't handle this. Another case is that
> > netlink takes as input fds sometimes (IFLA_NET_NS_FD, e.g.). If netlink
> > ever decides to install an fd and output it, we wouldn't be able to handle
> > this either.
> >
> > Still, the vast majority of interesting cases are covered by this API, so
> > perhaps it is Enough.
> >
> > I've left it as a separate commit for two reasons:
> >   * It illustrates the way in which we would grow struct seccomp_notif and
> >     struct seccomp_notif_resp without using netlink
> >   * It shows just how little code is needed to accomplish this :)
> [...]
> > +               fd = get_unused_fd_flags(n.flags);
> 
> Here, you're using n.flags in a context where it will be tested
> against O_CLOEXEC to determine whether the new fd should be
> close-on-exec.
> 
> [...]
> > +               /*
> > +                * This is a little hokey: we need a real fget() (i.e. not
> > +                * __fget_light(), which is what fdget does), but we also need
> > +                * the flags from strcut fd. So, we get it, put it, and get it
> > +                * again for real.
> > +                */
> > +               fd = fdget(resp.fd);
> > +               knotif->flags = fd.flags;
> > +               fdput(fd);
> > +
> > +               knotif->file = fget(resp.fd);
> > +               if (!knotif->file) {
> > +                       ret = -EBADF;
> > +                       goto out;
> > +               }
> 
> But here fd.flags contains the low 2 bits of the return value of
> __fget_light, which are either 0 or FDPUT_FPUT (encoded as 1). This
> flag states whether fdget() took a reference on the file, which is
> mostly equivalent to "is the current process multithreaded?". (This is
> the reason why fdget returns flags and fget doesn't - the flag from
> fdget is to decide whether you'll need an fput(), which is
> unconditional for fget().)

Oof, yes.

> Apart from this issue, I think that in general, it's probably not a
> good idea to copy the close-on-exec flag from the fd in the
> supervising process - the supervising process might want all the fds
> it is working with to be O_CLOEXEC independent of whether the
> supervised process wants an O_CLOEXEC fd. It might make sense to add a
> field for this to struct seccomp_notif_resp instead.

Yes, I wondered about this. In particular, maybe it just makes sense
to pass back the exact flags that the FD should be opened with too, so
if in the future there's some other flag we might want to twiddle, we
don't need another patch. I'll make the change for v4.

Thanks!

Tycho
Alban Crequy June 2, 2018, 7:14 p.m.
On Thu, 31 May 2018 at 16:52, Tycho Andersen <tycho@tycho.ws> wrote:
>
> The idea here is that the userspace handler should be able to pass an fd
> back to the trapped task, for example so it can be returned from socket().
>
> I've proposed one API here, but I'm open to other options. In particular,
> this only lets you return an fd from a syscall, which may not be enough in
> all cases. For example, if an fd is written to an output parameter instead
> of returned, the current API can't handle this. Another case is that
> netlink takes as input fds sometimes (IFLA_NET_NS_FD, e.g.). If netlink
> ever decides to install an fd and output it, we wouldn't be able to handle
> this either.
>
> Still, the vast majority of interesting cases are covered by this API, so
> perhaps it is Enough.
>
> I've left it as a separate commit for two reasons:
>   * It illustrates the way in which we would grow struct seccomp_notif and
>     struct seccomp_notif_resp without using netlink
>   * It shows just how little code is needed to accomplish this :)
>
> v2: new in v2
> v3: no changes
>
> Signed-off-by: Tycho Andersen <tycho@tycho.ws>
> CC: Kees Cook <keescook@chromium.org>
> CC: Andy Lutomirski <luto@amacapital.net>
> CC: Oleg Nesterov <oleg@redhat.com>
> CC: Eric W. Biederman <ebiederm@xmission.com>
> CC: "Serge E. Hallyn" <serge@hallyn.com>
> CC: Christian Brauner <christian.brauner@ubuntu.com>
> CC: Tyler Hicks <tyhicks@canonical.com>
> CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
> ---
>  include/uapi/linux/seccomp.h                  |   2 +
>  kernel/seccomp.c                              |  49 +++++++-
>  tools/testing/selftests/seccomp/seccomp_bpf.c | 112 ++++++++++++++++++
>  3 files changed, 161 insertions(+), 2 deletions(-)
>
> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> index 8160e6cad528..3124427219cb 100644
> --- a/include/uapi/linux/seccomp.h
> +++ b/include/uapi/linux/seccomp.h
> @@ -71,6 +71,8 @@ struct seccomp_notif_resp {
>         __u64 id;
>         __s32 error;
>         __s64 val;
> +       __u8 return_fd;
> +       __u32 fd;
>  };
>
>  #endif /* _UAPI_LINUX_SECCOMP_H */
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 6dc99c65c2f4..2ee958b3efde 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -77,6 +77,8 @@ struct seccomp_knotif {
>         /* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */
>         int error;
>         long val;
> +       struct file *file;
> +       unsigned int flags;
>
>         /* Signals when this has entered SECCOMP_NOTIFY_REPLIED */
>         struct completion ready;
> @@ -780,10 +782,32 @@ static void seccomp_do_user_notification(int this_syscall,
>                         goto remove_list;
>         }
>
> -       ret = n.val;
> -       err = n.error;
> +       if (n.file) {
> +               int fd;
> +
> +               fd = get_unused_fd_flags(n.flags);
> +               if (fd < 0) {
> +                       err = fd;
> +                       ret = -1;
> +                       goto remove_list;
> +               }
> +
> +               ret = fd;
> +               err = 0;
> +
> +               fd_install(fd, n.file);
> +               /* Don't fput, since fd has a reference now */
> +               n.file = NULL;

Do we want the cgroup classid and netprio to be applied here, before
the fd_install? I am looking at similar code in net/core/scm.c
scm_detach_fds():
                sock = sock_from_file(fp[i], &err);
                if (sock) {
                        sock_update_netprioidx(&sock->sk->sk_cgrp_data);
                        sock_update_classid(&sock->sk->sk_cgrp_data);
                }

The listener process might live in a different cgroup with a different
classid & netprio, so it might not be applied as the app might expect.

Also, should we update the struct sock_cgroup_data of the socket, in
order to make the BPF helper function bpf_skb_under_cgroup() work wrt
the cgroup of the target process instead of the listener process? I am
looking at cgroup_sk_alloc(). I don't know what's the correct
behaviour we want here.

> +       } else {
> +               ret = n.val;
> +               err = n.error;
> +       }
> +
>
>  remove_list:
> +       if (n.file)
> +               fput(n.file);
> +
>         list_del(&n.list);
>  out:
>         mutex_unlock(&match->notify_lock);
> @@ -1598,6 +1622,27 @@ static ssize_t seccomp_notify_write(struct file *file, const char __user *buf,
>         knotif->state = SECCOMP_NOTIFY_REPLIED;
>         knotif->error = resp.error;
>         knotif->val = resp.val;
> +
> +       if (resp.return_fd) {
> +               struct fd fd;
> +
> +               /*
> +                * This is a little hokey: we need a real fget() (i.e. not
> +                * __fget_light(), which is what fdget does), but we also need
> +                * the flags from strcut fd. So, we get it, put it, and get it
> +                * again for real.
> +                */
> +               fd = fdget(resp.fd);
> +               knotif->flags = fd.flags;
> +               fdput(fd);
> +
> +               knotif->file = fget(resp.fd);
> +               if (!knotif->file) {
> +                       ret = -EBADF;
> +                       goto out;

Should the "knotif->state = SECCOMP_NOTIFY_REPLIED" and other changes
be done after the error case here? In case of bad fd, it seems to
return -EBADF the first time and -EINVAL the next time because the
state would have been changed to SECCOMP_NOTIFY_REPLIED already.

> +               }
> +       }
> +
>         complete(&knotif->ready);
>  out:
>         mutex_unlock(&filter->notify_lock);
> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
> index 07984f7bd601..b9a4f676566d 100644
> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> @@ -167,6 +167,8 @@ struct seccomp_notif_resp {
>         __u64 id;
>         __s32 error;
>         __s64 val;
> +       __u8 return_fd;
> +       __u32 fd;
>  };
>  #endif
>
> @@ -3176,6 +3178,116 @@ TEST(get_user_notification_ptrace)
>         close(listener);
>  }
>
> +TEST(user_notification_pass_fd)
> +{
> +       pid_t pid;
> +       int status, listener;
> +       int sk_pair[2];
> +       char c;
> +       struct seccomp_notif req = {};
> +       struct seccomp_notif_resp resp = {};
> +       long ret;
> +
> +       ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
> +
> +       pid = fork();
> +       ASSERT_GE(pid, 0);
> +
> +       if (pid == 0) {
> +               int fd;
> +               char buf[16];
> +
> +               EXPECT_EQ(user_trap_syscall(__NR_getpid, 0), 0);
> +
> +               /* Signal we're ready and have installed the filter. */
> +               EXPECT_EQ(write(sk_pair[1], "J", 1), 1);
> +
> +               EXPECT_EQ(read(sk_pair[1], &c, 1), 1);
> +               EXPECT_EQ(c, 'H');
> +               close(sk_pair[1]);
> +
> +               /* An fd from getpid(). Let the games begin. */
> +               fd = syscall(__NR_getpid);
> +               EXPECT_GT(fd, 0);
> +               EXPECT_EQ(read(fd, buf, sizeof(buf)), 12);
> +               close(fd);
> +
> +               exit(strcmp("hello world", buf));
> +       }
> +
> +       EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
> +       EXPECT_EQ(c, 'J');
> +
> +       EXPECT_EQ(ptrace(PTRACE_ATTACH, pid), 0);
> +       EXPECT_EQ(waitpid(pid, NULL, 0), pid);
> +       listener = ptrace(PTRACE_SECCOMP_GET_LISTENER, pid, 0);
> +       EXPECT_GE(listener, 0);
> +       EXPECT_EQ(ptrace(PTRACE_DETACH, pid, NULL, 0), 0);
> +
> +       /* Now signal we are done installing so it can do a getpid */
> +       EXPECT_EQ(write(sk_pair[0], "H", 1), 1);
> +       close(sk_pair[0]);
> +
> +       /* Make a new socket pair so we can send half across */
> +       EXPECT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
> +
> +       ret = read_notif(listener, &req);
> +       EXPECT_EQ(ret, sizeof(req));
> +       EXPECT_EQ(errno, 0);
> +
> +       resp.id = req.id;
> +       resp.return_fd = 1;
> +       resp.fd = sk_pair[1];
> +       EXPECT_EQ(write(listener, &resp, sizeof(resp)), sizeof(resp));
> +       close(sk_pair[1]);
> +
> +       EXPECT_EQ(write(sk_pair[0], "hello world\0", 12), 12);
> +       close(sk_pair[0]);
> +
> +       EXPECT_EQ(waitpid(pid, &status, 0), pid);
> +       EXPECT_EQ(true, WIFEXITED(status));
> +       EXPECT_EQ(0, WEXITSTATUS(status));
> +       close(listener);
> +}
> +
> +TEST(user_notification_struct_size_mismatch)
> +{
> +       pid_t pid;
> +       long ret;
> +       int status, listener, len;
> +       struct seccomp_notif req;
> +       struct seccomp_notif_resp resp;
> +
> +       listener = user_trap_syscall(__NR_getpid,
> +                                    SECCOMP_FILTER_FLAG_GET_LISTENER);
> +       EXPECT_GE(listener, 0);
> +
> +       pid = fork();
> +       ASSERT_GE(pid, 0);
> +
> +       if (pid == 0) {
> +               ret = syscall(__NR_getpid);
> +               exit(ret != USER_NOTIF_MAGIC);
> +       }
> +
> +       EXPECT_EQ(read(listener, &req, sizeof(req)), sizeof(req));
> +
> +       resp.id = req.id;
> +       resp.error = 0;
> +       resp.val = USER_NOTIF_MAGIC;
> +
> +       /*
> +        * Only write a partial structure: this is what was available before we
> +        * had fd support.
> +        */
> +       len = offsetof(struct seccomp_notif_resp, val) + sizeof(resp.val);
> +       EXPECT_EQ(write(listener, &resp, len), len);
> +
> +       EXPECT_EQ(waitpid(pid, &status, 0), pid);
> +       EXPECT_EQ(true, WIFEXITED(status));
> +       EXPECT_EQ(0, WEXITSTATUS(status));
> +}
> +
>  /*
>   * TODO:
>   * - add microbenchmarks
> --
> 2.17.0
>
Tycho Andersen June 4, 2018, 12:14 a.m.
Hi Alban,

On Sat, Jun 02, 2018 at 09:14:09PM +0200, Alban Crequy wrote:
> On Thu, 31 May 2018 at 16:52, Tycho Andersen <tycho@tycho.ws> wrote:
> >
> > The idea here is that the userspace handler should be able to pass an fd
> > back to the trapped task, for example so it can be returned from socket().
> >
> > I've proposed one API here, but I'm open to other options. In particular,
> > this only lets you return an fd from a syscall, which may not be enough in
> > all cases. For example, if an fd is written to an output parameter instead
> > of returned, the current API can't handle this. Another case is that
> > netlink takes as input fds sometimes (IFLA_NET_NS_FD, e.g.). If netlink
> > ever decides to install an fd and output it, we wouldn't be able to handle
> > this either.
> >
> > Still, the vast majority of interesting cases are covered by this API, so
> > perhaps it is Enough.
> >
> > I've left it as a separate commit for two reasons:
> >   * It illustrates the way in which we would grow struct seccomp_notif and
> >     struct seccomp_notif_resp without using netlink
> >   * It shows just how little code is needed to accomplish this :)
> >
> > v2: new in v2
> > v3: no changes
> >
> > Signed-off-by: Tycho Andersen <tycho@tycho.ws>
> > CC: Kees Cook <keescook@chromium.org>
> > CC: Andy Lutomirski <luto@amacapital.net>
> > CC: Oleg Nesterov <oleg@redhat.com>
> > CC: Eric W. Biederman <ebiederm@xmission.com>
> > CC: "Serge E. Hallyn" <serge@hallyn.com>
> > CC: Christian Brauner <christian.brauner@ubuntu.com>
> > CC: Tyler Hicks <tyhicks@canonical.com>
> > CC: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
> > ---
> >  include/uapi/linux/seccomp.h                  |   2 +
> >  kernel/seccomp.c                              |  49 +++++++-
> >  tools/testing/selftests/seccomp/seccomp_bpf.c | 112 ++++++++++++++++++
> >  3 files changed, 161 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> > index 8160e6cad528..3124427219cb 100644
> > --- a/include/uapi/linux/seccomp.h
> > +++ b/include/uapi/linux/seccomp.h
> > @@ -71,6 +71,8 @@ struct seccomp_notif_resp {
> >         __u64 id;
> >         __s32 error;
> >         __s64 val;
> > +       __u8 return_fd;
> > +       __u32 fd;
> >  };
> >
> >  #endif /* _UAPI_LINUX_SECCOMP_H */
> > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > index 6dc99c65c2f4..2ee958b3efde 100644
> > --- a/kernel/seccomp.c
> > +++ b/kernel/seccomp.c
> > @@ -77,6 +77,8 @@ struct seccomp_knotif {
> >         /* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */
> >         int error;
> >         long val;
> > +       struct file *file;
> > +       unsigned int flags;
> >
> >         /* Signals when this has entered SECCOMP_NOTIFY_REPLIED */
> >         struct completion ready;
> > @@ -780,10 +782,32 @@ static void seccomp_do_user_notification(int this_syscall,
> >                         goto remove_list;
> >         }
> >
> > -       ret = n.val;
> > -       err = n.error;
> > +       if (n.file) {
> > +               int fd;
> > +
> > +               fd = get_unused_fd_flags(n.flags);
> > +               if (fd < 0) {
> > +                       err = fd;
> > +                       ret = -1;
> > +                       goto remove_list;
> > +               }
> > +
> > +               ret = fd;
> > +               err = 0;
> > +
> > +               fd_install(fd, n.file);
> > +               /* Don't fput, since fd has a reference now */
> > +               n.file = NULL;
> 
> Do we want the cgroup classid and netprio to be applied here, before
> the fd_install? I am looking at similar code in net/core/scm.c
> scm_detach_fds():
>                 sock = sock_from_file(fp[i], &err);
>                 if (sock) {
>                         sock_update_netprioidx(&sock->sk->sk_cgrp_data);
>                         sock_update_classid(&sock->sk->sk_cgrp_data);
>                 }
> 
> The listener process might live in a different cgroup with a different
> classid & netprio, so it might not be applied as the app might expect.

Thanks, I hadn't really thought about this. I think doing what
SCM_RIGHTS does makes sense -- the operation is essentially the same.

> Also, should we update the struct sock_cgroup_data of the socket, in
> order to make the BPF helper function bpf_skb_under_cgroup() work wrt
> the cgroup of the target process instead of the listener process? I am
> looking at cgroup_sk_alloc(). I don't know what's the correct
> behaviour we want here.

SCM_RIGHTS seems to omit this (I assume you mean the val field of
struct sock_cgroup_data, which seems to be a pointer to struct
cgroup*), any idea why?

> > +       } else {
> > +               ret = n.val;
> > +               err = n.error;
> > +       }
> > +
> >
> >  remove_list:
> > +       if (n.file)
> > +               fput(n.file);
> > +
> >         list_del(&n.list);
> >  out:
> >         mutex_unlock(&match->notify_lock);
> > @@ -1598,6 +1622,27 @@ static ssize_t seccomp_notify_write(struct file *file, const char __user *buf,
> >         knotif->state = SECCOMP_NOTIFY_REPLIED;
> >         knotif->error = resp.error;
> >         knotif->val = resp.val;
> > +
> > +       if (resp.return_fd) {
> > +               struct fd fd;
> > +
> > +               /*
> > +                * This is a little hokey: we need a real fget() (i.e. not
> > +                * __fget_light(), which is what fdget does), but we also need
> > +                * the flags from strcut fd. So, we get it, put it, and get it
> > +                * again for real.
> > +                */
> > +               fd = fdget(resp.fd);
> > +               knotif->flags = fd.flags;
> > +               fdput(fd);
> > +
> > +               knotif->file = fget(resp.fd);
> > +               if (!knotif->file) {
> > +                       ret = -EBADF;
> > +                       goto out;
> 
> Should the "knotif->state = SECCOMP_NOTIFY_REPLIED" and other changes
> be done after the error case here? In case of bad fd, it seems to
> return -EBADF the first time and -EINVAL the next time because the
> state would have been changed to SECCOMP_NOTIFY_REPLIED already.

Yes, good catch, thanks!

Tycho