[v2,2/3] seccomp: Introduce addfd ioctl to seccomp user notifier

Submitted by Sargun Dhillon on May 28, 2020, 11:08 a.m.

Details

Message ID 20200528110858.3265-3-sargun@sargun.me
State New
Series "Add seccomp notifier ioctl that enables adding fds"
Headers show

Commit Message

Sargun Dhillon May 28, 2020, 11:08 a.m.
This adds a seccomp notifier ioctl which allows for the listener to "add"
file descriptors to a process which originated a seccomp user
notification. This allows calls like mount, and mknod to be "implemented",
as the return value, and the arguments are data in memory. On the other
hand, calls like connect can be "implemented" using pidfd_getfd.

Unfortunately, there are calls which return file descriptors, like
open, which are vulnerable to TOC-TOU attacks, and require that the
more privileged supervisor can inspect the argument, and perform the
syscall on behalf of the process generating the notifiation. This
allows the file descriptor generated from that open call to be
returned to the calling process.

In addition, there is funcitonality to allow for replacement of
specific file descriptors, following dup2-like semantics.

Signed-off-by: Sargun Dhillon <sargun@sargun.me>
Suggested-by: Matt Denton <mpdenton@google.com>
Cc: Kees Cook <keescook@google.com>,
Cc: Jann Horn <jannh@google.com>,
Cc: Robert Sesek <rsesek@google.com>,
Cc: Chris Palmer <palmer@google.com>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
Cc: Tycho Andersen <tycho@tycho.ws>
---
 include/uapi/linux/seccomp.h |  25 +++++
 kernel/seccomp.c             | 182 ++++++++++++++++++++++++++++++++++-
 2 files changed, 206 insertions(+), 1 deletion(-)

Patch hide | download patch | download mbox

diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index c1735455bc53..c7bfe898e7a0 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -113,6 +113,27 @@  struct seccomp_notif_resp {
 	__u32 flags;
 };
 
+/* valid flags for seccomp_notif_addfd */
+#define SECCOMP_ADDFD_FLAG_SETFD	(1UL << 0) /* Specify remote fd */
+
+/**
+ * struct seccomp_notif_addfd
+ * @size: The size of the seccomp_notif_addfd datastructure
+ * @id: The ID of the seccomp notification
+ * @flags: SECCOMP_ADDFD_FLAG_*
+ * @srcfd: The local fd number
+ * @newfd: Optional remote FD number if SETFD option is set, otherwise 0.
+ * @newfd_flags: Flags the remote FD should be allocated under
+ */
+struct seccomp_notif_addfd {
+	__u64 size;
+	__u64 id;
+	__u64 flags;
+	__u32 srcfd;
+	__u32 newfd;
+	__u32 newfd_flags;
+};
+
 #define SECCOMP_IOC_MAGIC		'!'
 #define SECCOMP_IO(nr)			_IO(SECCOMP_IOC_MAGIC, nr)
 #define SECCOMP_IOR(nr, type)		_IOR(SECCOMP_IOC_MAGIC, nr, type)
@@ -124,4 +145,8 @@  struct seccomp_notif_resp {
 #define SECCOMP_IOCTL_NOTIF_SEND	SECCOMP_IOWR(1,	\
 						struct seccomp_notif_resp)
 #define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOR(2, __u64)
+/* On success, the return value is the remote process's added fd number */
+#define SECCOMP_IOCTL_NOTIF_ADDFD	SECCOMP_IOR(3,	\
+						struct seccomp_notif_addfd)
+
 #endif /* _UAPI_LINUX_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 94ae4c7502cc..02b9ba1fbee0 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -41,6 +41,9 @@ 
 #include <linux/tracehook.h>
 #include <linux/uaccess.h>
 #include <linux/anon_inodes.h>
+#include <net/netprio_cgroup.h>
+#include <net/sock.h>
+#include <net/cls_cgroup.h>
 
 enum notify_state {
 	SECCOMP_NOTIFY_INIT,
@@ -77,10 +80,42 @@  struct seccomp_knotif {
 	long val;
 	u32 flags;
 
-	/* Signals when this has entered SECCOMP_NOTIFY_REPLIED */
+	/*
+	 * Signals when this has changed states, such as the listener
+	 * dying, a new seccomp addfd message, or changing to REPLIED
+	 */
 	struct completion ready;
 
 	struct list_head list;
+
+	/* outstanding addfd requests */
+	struct list_head addfd;
+};
+
+/**
+ * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages
+ *
+ * @file: A reference to the file to install in the other task
+ * @fd: The fd number to install it at. If the fd number is -1, it means the
+ *      installing process should allocate the fd as normal.
+ * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC
+ *         is allowed.
+ * @ret: The return value of the installing process. It is set to the fd num
+ *       upon success (>= 0).
+ * @completion: Indicates that the installing process has completed fd
+ *              installation, or gone away (either due to successful
+ *              reply, or signal)
+ *
+ */
+struct seccomp_kaddfd {
+	struct file *file;
+	int fd;
+	unsigned int flags;
+
+	/* To only be set on reply */
+	int ret;
+	struct completion completion;
+	struct list_head list;
 };
 
 /**
@@ -735,6 +770,41 @@  static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
 	return filter->notif->next_id++;
 }
 
+static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
+{
+	struct socket *sock;
+	int ret, err;
+
+	/*
+	 * Remove the notification, and reset the list pointers, indicating
+	 * that it has been handled.
+	 */
+	list_del_init(&addfd->list);
+
+	ret = security_file_receive(addfd->file);
+	if (ret)
+		goto out;
+
+	if (addfd->fd == -1) {
+		ret = get_unused_fd_flags(addfd->flags);
+		if (ret >= 0)
+			fd_install(ret, get_file(addfd->file));
+	} else {
+		ret = replace_fd(addfd->fd, addfd->file, addfd->flags);
+	}
+
+	/* These are the semantics from copying FDs via SCM_RIGHTS */
+	sock = sock_from_file(addfd->file, &err);
+	if (sock) {
+		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
+		sock_update_classid(&sock->sk->sk_cgrp_data);
+	}
+
+out:
+	addfd->ret = ret;
+	complete(&addfd->completion);
+}
+
 static int seccomp_do_user_notification(int this_syscall,
 					struct seccomp_filter *match,
 					const struct seccomp_data *sd)
@@ -743,6 +813,7 @@  static int seccomp_do_user_notification(int this_syscall,
 	u32 flags = 0;
 	long ret = 0;
 	struct seccomp_knotif n = {};
+	struct seccomp_kaddfd *addfd, *tmp;
 
 	mutex_lock(&match->notify_lock);
 	err = -ENOSYS;
@@ -755,6 +826,7 @@  static int seccomp_do_user_notification(int this_syscall,
 	n.id = seccomp_next_notify_id(match);
 	init_completion(&n.ready);
 	list_add(&n.list, &match->notif->notifications);
+	INIT_LIST_HEAD(&n.addfd);
 
 	up(&match->notif->request);
 	wake_up_poll(&match->notif->wqh, EPOLLIN | EPOLLRDNORM);
@@ -763,14 +835,31 @@  static int seccomp_do_user_notification(int this_syscall,
 	/*
 	 * This is where we wait for a reply from userspace.
 	 */
+wait:
 	err = wait_for_completion_interruptible(&n.ready);
 	mutex_lock(&match->notify_lock);
 	if (err == 0) {
+		/* Check if we were woken up by a addfd message */
+		addfd = list_first_entry_or_null(&n.addfd,
+						 struct seccomp_kaddfd, list);
+		if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) {
+			seccomp_handle_addfd(addfd);
+			mutex_unlock(&match->notify_lock);
+			goto wait;
+		}
 		ret = n.val;
 		err = n.error;
 		flags = n.flags;
 	}
 
+	/* If there were any pending addfd calls, clear them out */
+	list_for_each_entry_safe(addfd, tmp, &n.addfd, list) {
+		/* The process went away before we got a chance to handle it */
+		addfd->ret = -ESRCH;
+		list_del_init(&addfd->list);
+		complete(&addfd->completion);
+	}
+
 	/*
 	 * Note that it's possible the listener died in between the time when
 	 * we were notified of a respons (or a signal) and when we were able to
@@ -1174,6 +1263,95 @@  static long seccomp_notify_id_valid(struct seccomp_filter *filter,
 	return ret;
 }
 
+static long seccomp_notify_addfd(struct seccomp_filter *filter,
+				 struct seccomp_notif_addfd __user *uaddfd)
+{
+	struct seccomp_notif_addfd addfd;
+	struct seccomp_knotif *knotif;
+	struct seccomp_kaddfd kaddfd;
+	u64 size;
+	int ret;
+
+	ret = get_user(size, &uaddfd->size);
+	if (ret)
+		return ret;
+
+	ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size);
+	if (ret)
+		return ret;
+
+	if (addfd.newfd_flags & ~O_CLOEXEC)
+		return -EINVAL;
+
+	if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD)
+		return -EINVAL;
+
+	if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD))
+		return -EINVAL;
+
+	kaddfd.file = fget(addfd.srcfd);
+	if (!kaddfd.file)
+		return -EBADF;
+
+	kaddfd.flags = addfd.newfd_flags;
+	kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ?
+		    addfd.newfd : -1;
+	init_completion(&kaddfd.completion);
+
+	ret = mutex_lock_interruptible(&filter->notify_lock);
+	if (ret < 0)
+		goto out;
+
+	knotif = find_notification(filter, addfd.id);
+	/*
+	 * We do not want to allow for FD injection to occur before the
+	 * notification has been picked up by a userspace handler, or after
+	 * the notification has been replied to.
+	 */
+	if (!knotif) {
+		ret = -ENOENT;
+		goto out_unlock;
+	}
+
+	if (knotif->state != SECCOMP_NOTIFY_SENT) {
+		ret = -EINPROGRESS;
+		goto out_unlock;
+	}
+
+	list_add(&kaddfd.list, &knotif->addfd);
+	complete(&knotif->ready);
+	mutex_unlock(&filter->notify_lock);
+
+	/* Now we wait for it to be processed */
+	ret = wait_for_completion_interruptible(&kaddfd.completion);
+	if (ret == 0) {
+		/*
+		 * We had a successful completion. The other side has already
+		 * removed us from the addfd queue, and
+		 * wait_for_completion_interruptible has a memory barrier.
+		 */
+		ret = kaddfd.ret;
+		goto out;
+	}
+
+	mutex_lock(&filter->notify_lock);
+	/*
+	 * Even though we were woken up by a signal, and not a successful
+	 * completion, a completion may have happened in the mean time.
+	 */
+	if (list_empty(&kaddfd.list))
+		ret = kaddfd.ret;
+	else
+		list_del(&kaddfd.list);
+
+out_unlock:
+	mutex_unlock(&filter->notify_lock);
+out:
+	fput(kaddfd.file);
+
+	return ret;
+}
+
 static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
 				 unsigned long arg)
 {
@@ -1187,6 +1365,8 @@  static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
 		return seccomp_notify_send(filter, buf);
 	case SECCOMP_IOCTL_NOTIF_ID_VALID:
 		return seccomp_notify_id_valid(filter, buf);
+	case SECCOMP_IOCTL_NOTIF_ADDFD:
+		return seccomp_notify_addfd(filter, buf);
 	default:
 		return -EINVAL;
 	}

Comments

Kees Cook May 29, 2020, 7:31 a.m.
On Thu, May 28, 2020 at 04:08:57AM -0700, Sargun Dhillon wrote:
> This adds a seccomp notifier ioctl which allows for the listener to "add"
> file descriptors to a process which originated a seccomp user
> notification. This allows calls like mount, and mknod to be "implemented",
> as the return value, and the arguments are data in memory. On the other
> hand, calls like connect can be "implemented" using pidfd_getfd.
> 
> Unfortunately, there are calls which return file descriptors, like
> open, which are vulnerable to TOC-TOU attacks, and require that the
> more privileged supervisor can inspect the argument, and perform the
> syscall on behalf of the process generating the notifiation. This
> allows the file descriptor generated from that open call to be
> returned to the calling process.
> 
> In addition, there is funcitonality to allow for replacement of
> specific file descriptors, following dup2-like semantics.
> 
> Signed-off-by: Sargun Dhillon <sargun@sargun.me>
> Suggested-by: Matt Denton <mpdenton@google.com>

This looks mostly really clean. When I've got more brain tomorrow I want to
double-check the locking, but I think the use of notify_lock and being
in the ioctl fully protects everything from any use-after-free-like
issues.

Notes below...

> +/* valid flags for seccomp_notif_addfd */
> +#define SECCOMP_ADDFD_FLAG_SETFD	(1UL << 0) /* Specify remote fd */

Nit: please use BIT()

> @@ -735,6 +770,41 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
>  	return filter->notif->next_id++;
>  }
>  
> +static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
> +{
> +	struct socket *sock;
> +	int ret, err;
> +
> +	/*
> +	 * Remove the notification, and reset the list pointers, indicating
> +	 * that it has been handled.
> +	 */
> +	list_del_init(&addfd->list);
> +
> +	ret = security_file_receive(addfd->file);
> +	if (ret)
> +		goto out;
> +
> +	if (addfd->fd == -1) {
> +		ret = get_unused_fd_flags(addfd->flags);
> +		if (ret >= 0)
> +			fd_install(ret, get_file(addfd->file));
> +	} else {
> +		ret = replace_fd(addfd->fd, addfd->file, addfd->flags);
> +	}
> +
> +	/* These are the semantics from copying FDs via SCM_RIGHTS */
> +	sock = sock_from_file(addfd->file, &err);
> +	if (sock) {
> +		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
> +		sock_update_classid(&sock->sk->sk_cgrp_data);
> +	}

This made my eye twitch. ;) I see this is borrowed from
scm_detach_fds()... this really feels like the kind of thing that will
quickly go out of sync. I think this "receive an fd" logic needs to be
lifted out of scm_detach_fds() so it and seccomp can share it. I'm not
sure how to parameterize it quite right, though. Perhaps:

int file_receive(int fd, unsigned long flags, struct file *file)
{
	struct socket *sock;
	int ret;

	ret = security_file_receive(file);
	if (ret)
		return ret;

	/* Install the file. */
	if (fd == -1) {
		ret = get_unused_fd_flags(flags);
		if (ret >= 0)
			fd_install(ret, get_file(file));
	} else {
		ret = replace_fd(fd, file, flags);
	}

	/* Bump the usage count. */
	sock = sock_from_file(addfd->file, &err);
	if (sock) {
		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
		sock_update_classid(&sock->sk->sk_cgrp_data);
	}

	return ret;
}


static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
{
	/*
	 * Remove the notification, and reset the list pointers, indicating
	 * that it has been handled.
	 */
	list_del_init(&addfd->list);
	addfd->ret = file_receive(addfd->fd, addfd->flags, addfd->file);
	complete(&addfd->completion);
}

scm_detach_fds()
	...
	for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax;
             i++, cmfptr++)
	{

		err = file_receive(-1, MSG_CMSG_CLOEXEC & msg->msg_flags
                                          ? O_CLOEXEC : 0, fp[i]);
		if (err < 0)
			break;
		err = put_user(err, cmfptr);
		if (err)
			/* wat */
	}
	...

I'm not sure on the put_user() failure, though. We could check early
for faults with a put_user(0, cmfptr) before the file_receive() call, or
we could just ignore it? I'm not sure what SCM does here. I guess
worst-case:

int file_receive(int fd, unsigned long flags, struct file *file,
		 int __user *fdptr)
{
		...
		ret = get_unused_fd_flags(flags);
		if (ret >= 0) {
			if (cmfptr) {
				int err;

	                	err = put_user(ret, cmfptr);
				if (err) {
					put_unused_fd(ret);
					return err;
				}
			}
			fd_install(ret, get_file(file));
		}
		...
}

> @@ -763,14 +835,31 @@ static int seccomp_do_user_notification(int this_syscall,
>  	/*
>  	 * This is where we wait for a reply from userspace.
>  	 */
> +wait:
>  	err = wait_for_completion_interruptible(&n.ready);
>  	mutex_lock(&match->notify_lock);
>  	if (err == 0) {
> +		/* Check if we were woken up by a addfd message */
> +		addfd = list_first_entry_or_null(&n.addfd,
> +						 struct seccomp_kaddfd, list);
> +		if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) {
> +			seccomp_handle_addfd(addfd);
> +			mutex_unlock(&match->notify_lock);
> +			goto wait;
> +		}
>  		ret = n.val;
>  		err = n.error;
>  		flags = n.flags;
>  	}

This feels like it needs to be done differently, but when I tried to
make it "proper" loop, I think it got more ugly:

	for (;;) {
	  	err = wait_for_completion_interruptible(&n.ready);
	  	mutex_lock(&match->notify_lock);
	  	if (err == 0) {
			/* Check if we were woken up by a addfd message */
			addfd = list_first_entry_or_null(&n.addfd,
							 struct seccomp_kaddfd, list);
			if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) {
				seccomp_handle_addfd(addfd);
				mutex_unlock(&match->notify_lock);
				continue;
			}
	 		ret = n.val;
	 		err = n.error;
	 		flags = n.flags;
		}
		break;
	}

So, I guess it's fine how you have it. :)

>  
> +	/* If there were any pending addfd calls, clear them out */
> +	list_for_each_entry_safe(addfd, tmp, &n.addfd, list) {
> +		/* The process went away before we got a chance to handle it */
> +		addfd->ret = -ESRCH;
> +		list_del_init(&addfd->list);
> +		complete(&addfd->completion);
> +	}
> +
>  	/*
>  	 * Note that it's possible the listener died in between the time when
>  	 * we were notified of a respons (or a signal) and when we were able to
> @@ -1174,6 +1263,95 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter,
>  	return ret;
>  }
>  
> +static long seccomp_notify_addfd(struct seccomp_filter *filter,
> +				 struct seccomp_notif_addfd __user *uaddfd)
> +{
> +	struct seccomp_notif_addfd addfd;
> +	struct seccomp_knotif *knotif;
> +	struct seccomp_kaddfd kaddfd;
> +	u64 size;
> +	int ret;
> +
> +	ret = get_user(size, &uaddfd->size);
> +	if (ret)
> +		return ret;
> +
> +	ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size);
> +	if (ret)
> +		return ret;
> +
> +	if (addfd.newfd_flags & ~O_CLOEXEC)
> +		return -EINVAL;
> +
> +	if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD)
> +		return -EINVAL;
> +
> +	if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD))
> +		return -EINVAL;
> +
> +	kaddfd.file = fget(addfd.srcfd);
> +	if (!kaddfd.file)
> +		return -EBADF;
> +
> +	kaddfd.flags = addfd.newfd_flags;
> +	kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ?
> +		    addfd.newfd : -1;

Given that -1 is already illegal, do we need SECCOMP_ADDFD_FLAG_SETFD?
Could a -1 for newfd just be used instead?

> +	init_completion(&kaddfd.completion);
> +
> +	ret = mutex_lock_interruptible(&filter->notify_lock);
> +	if (ret < 0)
> +		goto out;
> +
> +	knotif = find_notification(filter, addfd.id);
> +	/*
> +	 * We do not want to allow for FD injection to occur before the
> +	 * notification has been picked up by a userspace handler, or after
> +	 * the notification has been replied to.
> +	 */
> +	if (!knotif) {
> +		ret = -ENOENT;
> +		goto out_unlock;
> +	}
> +
> +	if (knotif->state != SECCOMP_NOTIFY_SENT) {
> +		ret = -EINPROGRESS;
> +		goto out_unlock;
> +	}
> +
> +	list_add(&kaddfd.list, &knotif->addfd);
> +	complete(&knotif->ready);
> +	mutex_unlock(&filter->notify_lock);
> +
> +	/* Now we wait for it to be processed */
> +	ret = wait_for_completion_interruptible(&kaddfd.completion);
> +	if (ret == 0) {
> +		/*
> +		 * We had a successful completion. The other side has already
> +		 * removed us from the addfd queue, and
> +		 * wait_for_completion_interruptible has a memory barrier.
> +		 */
> +		ret = kaddfd.ret;
> +		goto out;
> +	}
> +
> +	mutex_lock(&filter->notify_lock);
> +	/*
> +	 * Even though we were woken up by a signal, and not a successful
> +	 * completion, a completion may have happened in the mean time.
> +	 */
> +	if (list_empty(&kaddfd.list))
> +		ret = kaddfd.ret;
> +	else
> +		list_del(&kaddfd.list);
> +
> +out_unlock:
> +	mutex_unlock(&filter->notify_lock);
> +out:
> +	fput(kaddfd.file);
> +
> +	return ret;
> +}
> +
>  static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
>  				 unsigned long arg)
>  {
> @@ -1187,6 +1365,8 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
>  		return seccomp_notify_send(filter, buf);
>  	case SECCOMP_IOCTL_NOTIF_ID_VALID:
>  		return seccomp_notify_id_valid(filter, buf);
> +	case SECCOMP_IOCTL_NOTIF_ADDFD:
> +		return seccomp_notify_addfd(filter, buf);
>  	default:
>  		return -EINVAL;
>  	}
> -- 
> 2.25.1
> 

Whee! :)
Christian Brauner May 29, 2020, 7:38 a.m.
On Fri, May 29, 2020 at 12:31:37AM -0700, Kees Cook wrote:
> On Thu, May 28, 2020 at 04:08:57AM -0700, Sargun Dhillon wrote:
> > This adds a seccomp notifier ioctl which allows for the listener to "add"
> > file descriptors to a process which originated a seccomp user
> > notification. This allows calls like mount, and mknod to be "implemented",
> > as the return value, and the arguments are data in memory. On the other
> > hand, calls like connect can be "implemented" using pidfd_getfd.
> > 
> > Unfortunately, there are calls which return file descriptors, like
> > open, which are vulnerable to TOC-TOU attacks, and require that the
> > more privileged supervisor can inspect the argument, and perform the
> > syscall on behalf of the process generating the notifiation. This
> > allows the file descriptor generated from that open call to be
> > returned to the calling process.
> > 
> > In addition, there is funcitonality to allow for replacement of
> > specific file descriptors, following dup2-like semantics.
> > 
> > Signed-off-by: Sargun Dhillon <sargun@sargun.me>
> > Suggested-by: Matt Denton <mpdenton@google.com>
> 
> This looks mostly really clean. When I've got more brain tomorrow I want to
> double-check the locking, but I think the use of notify_lock and being
> in the ioctl fully protects everything from any use-after-free-like
> issues.
> 
> Notes below...
> 
> > +/* valid flags for seccomp_notif_addfd */
> > +#define SECCOMP_ADDFD_FLAG_SETFD	(1UL << 0) /* Specify remote fd */
> 
> Nit: please use BIT()

Fwiw, I don't think we can use BIT() in uapi headers, see:

commit 23b2c96fad21886c53f5e1a4ffedd45ddd2e85ba
Author: Christian Brauner <christian.brauner@ubuntu.com>
Date:   Thu Oct 24 23:25:39 2019 +0200

    seccomp: rework define for SECCOMP_USER_NOTIF_FLAG_CONTINUE

    Switch from BIT(0) to (1UL << 0).

> 
> > @@ -735,6 +770,41 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
> >  	return filter->notif->next_id++;
> >  }
> >  
> > +static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
> > +{
> > +	struct socket *sock;
> > +	int ret, err;
> > +
> > +	/*
> > +	 * Remove the notification, and reset the list pointers, indicating
> > +	 * that it has been handled.
> > +	 */
> > +	list_del_init(&addfd->list);
> > +
> > +	ret = security_file_receive(addfd->file);
> > +	if (ret)
> > +		goto out;
> > +
> > +	if (addfd->fd == -1) {
> > +		ret = get_unused_fd_flags(addfd->flags);
> > +		if (ret >= 0)
> > +			fd_install(ret, get_file(addfd->file));
> > +	} else {
> > +		ret = replace_fd(addfd->fd, addfd->file, addfd->flags);
> > +	}
> > +
> > +	/* These are the semantics from copying FDs via SCM_RIGHTS */
> > +	sock = sock_from_file(addfd->file, &err);
> > +	if (sock) {
> > +		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
> > +		sock_update_classid(&sock->sk->sk_cgrp_data);
> > +	}
> 
> This made my eye twitch. ;) I see this is borrowed from
> scm_detach_fds()... this really feels like the kind of thing that will
> quickly go out of sync. I think this "receive an fd" logic needs to be
> lifted out of scm_detach_fds() so it and seccomp can share it. I'm not
> sure how to parameterize it quite right, though. Perhaps:
> 
> int file_receive(int fd, unsigned long flags, struct file *file)
> {
> 	struct socket *sock;
> 	int ret;
> 
> 	ret = security_file_receive(file);
> 	if (ret)
> 		return ret;
> 
> 	/* Install the file. */
> 	if (fd == -1) {
> 		ret = get_unused_fd_flags(flags);
> 		if (ret >= 0)
> 			fd_install(ret, get_file(file));
> 	} else {
> 		ret = replace_fd(fd, file, flags);
> 	}
> 
> 	/* Bump the usage count. */
> 	sock = sock_from_file(addfd->file, &err);
> 	if (sock) {
> 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
> 		sock_update_classid(&sock->sk->sk_cgrp_data);
> 	}
> 
> 	return ret;
> }
> 
> 
> static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
> {
> 	/*
> 	 * Remove the notification, and reset the list pointers, indicating
> 	 * that it has been handled.
> 	 */
> 	list_del_init(&addfd->list);
> 	addfd->ret = file_receive(addfd->fd, addfd->flags, addfd->file);
> 	complete(&addfd->completion);
> }
> 
> scm_detach_fds()
> 	...
> 	for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax;
>              i++, cmfptr++)
> 	{
> 
> 		err = file_receive(-1, MSG_CMSG_CLOEXEC & msg->msg_flags
>                                           ? O_CLOEXEC : 0, fp[i]);
> 		if (err < 0)
> 			break;
> 		err = put_user(err, cmfptr);
> 		if (err)
> 			/* wat */
> 	}
> 	...
> 
> I'm not sure on the put_user() failure, though. We could check early
> for faults with a put_user(0, cmfptr) before the file_receive() call, or
> we could just ignore it? I'm not sure what SCM does here. I guess
> worst-case:
> 
> int file_receive(int fd, unsigned long flags, struct file *file,
> 		 int __user *fdptr)
> {
> 		...
> 		ret = get_unused_fd_flags(flags);
> 		if (ret >= 0) {
> 			if (cmfptr) {
> 				int err;
> 
> 	                	err = put_user(ret, cmfptr);
> 				if (err) {
> 					put_unused_fd(ret);
> 					return err;
> 				}
> 			}
> 			fd_install(ret, get_file(file));
> 		}
> 		...
> }
> 
> > @@ -763,14 +835,31 @@ static int seccomp_do_user_notification(int this_syscall,
> >  	/*
> >  	 * This is where we wait for a reply from userspace.
> >  	 */
> > +wait:
> >  	err = wait_for_completion_interruptible(&n.ready);
> >  	mutex_lock(&match->notify_lock);
> >  	if (err == 0) {
> > +		/* Check if we were woken up by a addfd message */
> > +		addfd = list_first_entry_or_null(&n.addfd,
> > +						 struct seccomp_kaddfd, list);
> > +		if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) {
> > +			seccomp_handle_addfd(addfd);
> > +			mutex_unlock(&match->notify_lock);
> > +			goto wait;
> > +		}
> >  		ret = n.val;
> >  		err = n.error;
> >  		flags = n.flags;
> >  	}
> 
> This feels like it needs to be done differently, but when I tried to
> make it "proper" loop, I think it got more ugly:
> 
> 	for (;;) {
> 	  	err = wait_for_completion_interruptible(&n.ready);
> 	  	mutex_lock(&match->notify_lock);
> 	  	if (err == 0) {
> 			/* Check if we were woken up by a addfd message */
> 			addfd = list_first_entry_or_null(&n.addfd,
> 							 struct seccomp_kaddfd, list);
> 			if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) {
> 				seccomp_handle_addfd(addfd);
> 				mutex_unlock(&match->notify_lock);
> 				continue;
> 			}
> 	 		ret = n.val;
> 	 		err = n.error;
> 	 		flags = n.flags;
> 		}
> 		break;
> 	}
> 
> So, I guess it's fine how you have it. :)
> 
> >  
> > +	/* If there were any pending addfd calls, clear them out */
> > +	list_for_each_entry_safe(addfd, tmp, &n.addfd, list) {
> > +		/* The process went away before we got a chance to handle it */
> > +		addfd->ret = -ESRCH;
> > +		list_del_init(&addfd->list);
> > +		complete(&addfd->completion);
> > +	}
> > +
> >  	/*
> >  	 * Note that it's possible the listener died in between the time when
> >  	 * we were notified of a respons (or a signal) and when we were able to
> > @@ -1174,6 +1263,95 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter,
> >  	return ret;
> >  }
> >  
> > +static long seccomp_notify_addfd(struct seccomp_filter *filter,
> > +				 struct seccomp_notif_addfd __user *uaddfd)
> > +{
> > +	struct seccomp_notif_addfd addfd;
> > +	struct seccomp_knotif *knotif;
> > +	struct seccomp_kaddfd kaddfd;
> > +	u64 size;
> > +	int ret;
> > +
> > +	ret = get_user(size, &uaddfd->size);
> > +	if (ret)
> > +		return ret;
> > +
> > +	ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size);
> > +	if (ret)
> > +		return ret;
> > +
> > +	if (addfd.newfd_flags & ~O_CLOEXEC)
> > +		return -EINVAL;
> > +
> > +	if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD)
> > +		return -EINVAL;
> > +
> > +	if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD))
> > +		return -EINVAL;
> > +
> > +	kaddfd.file = fget(addfd.srcfd);
> > +	if (!kaddfd.file)
> > +		return -EBADF;
> > +
> > +	kaddfd.flags = addfd.newfd_flags;
> > +	kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ?
> > +		    addfd.newfd : -1;
> 
> Given that -1 is already illegal, do we need SECCOMP_ADDFD_FLAG_SETFD?
> Could a -1 for newfd just be used instead?
> 
> > +	init_completion(&kaddfd.completion);
> > +
> > +	ret = mutex_lock_interruptible(&filter->notify_lock);
> > +	if (ret < 0)
> > +		goto out;
> > +
> > +	knotif = find_notification(filter, addfd.id);
> > +	/*
> > +	 * We do not want to allow for FD injection to occur before the
> > +	 * notification has been picked up by a userspace handler, or after
> > +	 * the notification has been replied to.
> > +	 */
> > +	if (!knotif) {
> > +		ret = -ENOENT;
> > +		goto out_unlock;
> > +	}
> > +
> > +	if (knotif->state != SECCOMP_NOTIFY_SENT) {
> > +		ret = -EINPROGRESS;
> > +		goto out_unlock;
> > +	}
> > +
> > +	list_add(&kaddfd.list, &knotif->addfd);
> > +	complete(&knotif->ready);
> > +	mutex_unlock(&filter->notify_lock);
> > +
> > +	/* Now we wait for it to be processed */
> > +	ret = wait_for_completion_interruptible(&kaddfd.completion);
> > +	if (ret == 0) {
> > +		/*
> > +		 * We had a successful completion. The other side has already
> > +		 * removed us from the addfd queue, and
> > +		 * wait_for_completion_interruptible has a memory barrier.
> > +		 */
> > +		ret = kaddfd.ret;
> > +		goto out;
> > +	}
> > +
> > +	mutex_lock(&filter->notify_lock);
> > +	/*
> > +	 * Even though we were woken up by a signal, and not a successful
> > +	 * completion, a completion may have happened in the mean time.
> > +	 */
> > +	if (list_empty(&kaddfd.list))
> > +		ret = kaddfd.ret;
> > +	else
> > +		list_del(&kaddfd.list);
> > +
> > +out_unlock:
> > +	mutex_unlock(&filter->notify_lock);
> > +out:
> > +	fput(kaddfd.file);
> > +
> > +	return ret;
> > +}
> > +
> >  static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
> >  				 unsigned long arg)
> >  {
> > @@ -1187,6 +1365,8 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
> >  		return seccomp_notify_send(filter, buf);
> >  	case SECCOMP_IOCTL_NOTIF_ID_VALID:
> >  		return seccomp_notify_id_valid(filter, buf);
> > +	case SECCOMP_IOCTL_NOTIF_ADDFD:
> > +		return seccomp_notify_addfd(filter, buf);
> >  	default:
> >  		return -EINVAL;
> >  	}
> > -- 
> > 2.25.1
> > 
> 
> Whee! :)
> 
> -- 
> Kees Cook
Kees Cook May 29, 2020, 7:45 a.m.
On Fri, May 29, 2020 at 09:38:28AM +0200, Christian Brauner wrote:
> On Fri, May 29, 2020 at 12:31:37AM -0700, Kees Cook wrote:
> > Nit: please use BIT()
> 
> Fwiw, I don't think we can use BIT() in uapi headers, see:

Argh. How many times do I get to trip over this? :P Thank you; yes,
please ignore my suggestion. :)
Giuseppe Scrivano May 29, 2020, 9:24 a.m.
Sargun Dhillon <sargun@sargun.me> writes:

> This adds a seccomp notifier ioctl which allows for the listener to "add"
> file descriptors to a process which originated a seccomp user
> notification. This allows calls like mount, and mknod to be "implemented",
> as the return value, and the arguments are data in memory. On the other
> hand, calls like connect can be "implemented" using pidfd_getfd.
>
> Unfortunately, there are calls which return file descriptors, like
> open, which are vulnerable to TOC-TOU attacks, and require that the
> more privileged supervisor can inspect the argument, and perform the
> syscall on behalf of the process generating the notifiation. This
> allows the file descriptor generated from that open call to be
> returned to the calling process.
>
> In addition, there is funcitonality to allow for replacement of
> specific file descriptors, following dup2-like semantics.
>
> Signed-off-by: Sargun Dhillon <sargun@sargun.me>
> Suggested-by: Matt Denton <mpdenton@google.com>
> Cc: Kees Cook <keescook@google.com>,
> Cc: Jann Horn <jannh@google.com>,
> Cc: Robert Sesek <rsesek@google.com>,
> Cc: Chris Palmer <palmer@google.com>
> Cc: Christian Brauner <christian.brauner@ubuntu.com>
> Cc: Tycho Andersen <tycho@tycho.ws>
> ---

Thanks, this is a really useful feature.

Tested-by: Giuseppe Scrivano <gscrivan@redhat.com>
Christian Brauner May 29, 2020, 10:32 a.m.
On Thu, May 28, 2020 at 04:08:57AM -0700, Sargun Dhillon wrote:
> This adds a seccomp notifier ioctl which allows for the listener to "add"
> file descriptors to a process which originated a seccomp user
> notification. This allows calls like mount, and mknod to be "implemented",
> as the return value, and the arguments are data in memory. On the other
> hand, calls like connect can be "implemented" using pidfd_getfd.
> 
> Unfortunately, there are calls which return file descriptors, like
> open, which are vulnerable to TOC-TOU attacks, and require that the
> more privileged supervisor can inspect the argument, and perform the
> syscall on behalf of the process generating the notifiation. This
> allows the file descriptor generated from that open call to be
> returned to the calling process.
> 
> In addition, there is funcitonality to allow for replacement of
> specific file descriptors, following dup2-like semantics.
> 
> Signed-off-by: Sargun Dhillon <sargun@sargun.me>
> Suggested-by: Matt Denton <mpdenton@google.com>
> Cc: Kees Cook <keescook@google.com>,
> Cc: Jann Horn <jannh@google.com>,
> Cc: Robert Sesek <rsesek@google.com>,
> Cc: Chris Palmer <palmer@google.com>
> Cc: Christian Brauner <christian.brauner@ubuntu.com>
> Cc: Tycho Andersen <tycho@tycho.ws>
> ---
>  include/uapi/linux/seccomp.h |  25 +++++
>  kernel/seccomp.c             | 182 ++++++++++++++++++++++++++++++++++-
>  2 files changed, 206 insertions(+), 1 deletion(-)
> 
> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> index c1735455bc53..c7bfe898e7a0 100644
> --- a/include/uapi/linux/seccomp.h
> +++ b/include/uapi/linux/seccomp.h
> @@ -113,6 +113,27 @@ struct seccomp_notif_resp {
>  	__u32 flags;
>  };
>  
> +/* valid flags for seccomp_notif_addfd */
> +#define SECCOMP_ADDFD_FLAG_SETFD	(1UL << 0) /* Specify remote fd */
> +
> +/**
> + * struct seccomp_notif_addfd
> + * @size: The size of the seccomp_notif_addfd datastructure
> + * @id: The ID of the seccomp notification
> + * @flags: SECCOMP_ADDFD_FLAG_*
> + * @srcfd: The local fd number
> + * @newfd: Optional remote FD number if SETFD option is set, otherwise 0.
> + * @newfd_flags: Flags the remote FD should be allocated under
> + */
> +struct seccomp_notif_addfd {
> +	__u64 size;
> +	__u64 id;
> +	__u64 flags;
> +	__u32 srcfd;
> +	__u32 newfd;
> +	__u32 newfd_flags;
> +};

This doesn't correspond to how we usually pad structs, I think:

struct seccomp_notif_addfd {
        __u64                      size;                 /*     0     8 */
        __u64                      id;                   /*     8     8 */
        __u64                      flags;                /*    16     8 */
        __u32                      srcfd;                /*    24     4 */
        __u32                      newfd;                /*    28     4 */
        __u32                      newfd_flags;          /*    32     4 */

        /* size: 40, cachelines: 1, members: 6 */
        /* padding: 4 */
        /* last cacheline: 40 bytes */
};

You can either use the packed attribute or change the flags member from
u64 to u32:

struct seccomp_notif_addfd {
	__u64 size;
	__u64 id;
	__u32 flags;
	__u32 srcfd;
	__u32 newfd;
	__u32 newfd_flags;
}

^^ This seems nicer to me and gets rid of the 4 byte padding. If we run
out of 32 flags we'll just add a second flag argument to the struct.

> +
>  #define SECCOMP_IOC_MAGIC		'!'
>  #define SECCOMP_IO(nr)			_IO(SECCOMP_IOC_MAGIC, nr)
>  #define SECCOMP_IOR(nr, type)		_IOR(SECCOMP_IOC_MAGIC, nr, type)
> @@ -124,4 +145,8 @@ struct seccomp_notif_resp {
>  #define SECCOMP_IOCTL_NOTIF_SEND	SECCOMP_IOWR(1,	\
>  						struct seccomp_notif_resp)
>  #define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOR(2, __u64)
> +/* On success, the return value is the remote process's added fd number */
> +#define SECCOMP_IOCTL_NOTIF_ADDFD	SECCOMP_IOR(3,	\
> +						struct seccomp_notif_addfd)
> +
>  #endif /* _UAPI_LINUX_SECCOMP_H */
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 94ae4c7502cc..02b9ba1fbee0 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -41,6 +41,9 @@
>  #include <linux/tracehook.h>
>  #include <linux/uaccess.h>
>  #include <linux/anon_inodes.h>
> +#include <net/netprio_cgroup.h>
> +#include <net/sock.h>
> +#include <net/cls_cgroup.h>
>  
>  enum notify_state {
>  	SECCOMP_NOTIFY_INIT,
> @@ -77,10 +80,42 @@ struct seccomp_knotif {
>  	long val;
>  	u32 flags;
>  
> -	/* Signals when this has entered SECCOMP_NOTIFY_REPLIED */
> +	/*
> +	 * Signals when this has changed states, such as the listener
> +	 * dying, a new seccomp addfd message, or changing to REPLIED
> +	 */
>  	struct completion ready;
>  
>  	struct list_head list;
> +
> +	/* outstanding addfd requests */
> +	struct list_head addfd;
> +};
> +
> +/**
> + * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages
> + *
> + * @file: A reference to the file to install in the other task
> + * @fd: The fd number to install it at. If the fd number is -1, it means the
> + *      installing process should allocate the fd as normal.
> + * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC
> + *         is allowed.
> + * @ret: The return value of the installing process. It is set to the fd num
> + *       upon success (>= 0).
> + * @completion: Indicates that the installing process has completed fd
> + *              installation, or gone away (either due to successful
> + *              reply, or signal)
> + *
> + */
> +struct seccomp_kaddfd {
> +	struct file *file;
> +	int fd;
> +	unsigned int flags;
> +
> +	/* To only be set on reply */
> +	int ret;
> +	struct completion completion;
> +	struct list_head list;
>  };
>  
>  /**
> @@ -735,6 +770,41 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
>  	return filter->notif->next_id++;
>  }
>  
> +static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
> +{
> +	struct socket *sock;
> +	int ret, err;
> +
> +	/*
> +	 * Remove the notification, and reset the list pointers, indicating
> +	 * that it has been handled.
> +	 */
> +	list_del_init(&addfd->list);
> +
> +	ret = security_file_receive(addfd->file);
> +	if (ret)
> +		goto out;
> +
> +	if (addfd->fd == -1) {
> +		ret = get_unused_fd_flags(addfd->flags);
> +		if (ret >= 0)
> +			fd_install(ret, get_file(addfd->file));
> +	} else {
> +		ret = replace_fd(addfd->fd, addfd->file, addfd->flags);
> +	}
> +
> +	/* These are the semantics from copying FDs via SCM_RIGHTS */
> +	sock = sock_from_file(addfd->file, &err);

Iiuc, if this is indeed a socket and the replace_fd() or fd_install()
has failed, you're now still transferring netprioidx and classid to the
task's cgroup. Should probably be something like:

if (sock && ret >= 0) {
	sock_update_netprioidx(&sock->sk->sk_cgrp_data);
	sock_update_classid(&sock->sk->sk_cgrp_data);
}

> +
> +out:
> +	addfd->ret = ret;
> +	complete(&addfd->completion);
> +}
> +
>  static int seccomp_do_user_notification(int this_syscall,
>  					struct seccomp_filter *match,
>  					const struct seccomp_data *sd)
> @@ -743,6 +813,7 @@ static int seccomp_do_user_notification(int this_syscall,
>  	u32 flags = 0;
>  	long ret = 0;
>  	struct seccomp_knotif n = {};
> +	struct seccomp_kaddfd *addfd, *tmp;
>  
>  	mutex_lock(&match->notify_lock);
>  	err = -ENOSYS;
> @@ -755,6 +826,7 @@ static int seccomp_do_user_notification(int this_syscall,
>  	n.id = seccomp_next_notify_id(match);
>  	init_completion(&n.ready);
>  	list_add(&n.list, &match->notif->notifications);
> +	INIT_LIST_HEAD(&n.addfd);
>  
>  	up(&match->notif->request);
>  	wake_up_poll(&match->notif->wqh, EPOLLIN | EPOLLRDNORM);
> @@ -763,14 +835,31 @@ static int seccomp_do_user_notification(int this_syscall,
>  	/*
>  	 * This is where we wait for a reply from userspace.
>  	 */
> +wait:
>  	err = wait_for_completion_interruptible(&n.ready);
>  	mutex_lock(&match->notify_lock);
>  	if (err == 0) {
> +		/* Check if we were woken up by a addfd message */
> +		addfd = list_first_entry_or_null(&n.addfd,
> +						 struct seccomp_kaddfd, list);
> +		if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) {
> +			seccomp_handle_addfd(addfd);
> +			mutex_unlock(&match->notify_lock);
> +			goto wait;
> +		}
>  		ret = n.val;
>  		err = n.error;
>  		flags = n.flags;
>  	}
>  
> +	/* If there were any pending addfd calls, clear them out */
> +	list_for_each_entry_safe(addfd, tmp, &n.addfd, list) {
> +		/* The process went away before we got a chance to handle it */
> +		addfd->ret = -ESRCH;
> +		list_del_init(&addfd->list);
> +		complete(&addfd->completion);
> +	}
> +
>  	/*
>  	 * Note that it's possible the listener died in between the time when
>  	 * we were notified of a respons (or a signal) and when we were able to
> @@ -1174,6 +1263,95 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter,
>  	return ret;
>  }
>  
> +static long seccomp_notify_addfd(struct seccomp_filter *filter,
> +				 struct seccomp_notif_addfd __user *uaddfd)
> +{
> +	struct seccomp_notif_addfd addfd;
> +	struct seccomp_knotif *knotif;
> +	struct seccomp_kaddfd kaddfd;
> +	u64 size;
> +	int ret;
> +
> +	ret = get_user(size, &uaddfd->size);
> +	if (ret)
> +		return ret;
> +
> +	ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size);
> +	if (ret)
> +		return ret;
> +
> +	if (addfd.newfd_flags & ~O_CLOEXEC)
> +		return -EINVAL;
> +
> +	if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD)
> +		return -EINVAL;
> +
> +	if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD))
> +		return -EINVAL;
> +
> +	kaddfd.file = fget(addfd.srcfd);
> +	if (!kaddfd.file)
> +		return -EBADF;
> +
> +	kaddfd.flags = addfd.newfd_flags;
> +	kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ?
> +		    addfd.newfd : -1;
> +	init_completion(&kaddfd.completion);
> +
> +	ret = mutex_lock_interruptible(&filter->notify_lock);
> +	if (ret < 0)
> +		goto out;
> +
> +	knotif = find_notification(filter, addfd.id);
> +	/*
> +	 * We do not want to allow for FD injection to occur before the
> +	 * notification has been picked up by a userspace handler, or after
> +	 * the notification has been replied to.
> +	 */

That comment ^^ should probably go above...

> +	if (!knotif) {
> +		ret = -ENOENT;
> +		goto out_unlock;
> +	}

... this vv check, no?

> +
> +	if (knotif->state != SECCOMP_NOTIFY_SENT) {
> +		ret = -EINPROGRESS;
> +		goto out_unlock;
> +	}
> +
> +	list_add(&kaddfd.list, &knotif->addfd);
> +	complete(&knotif->ready);
> +	mutex_unlock(&filter->notify_lock);
> +
> +	/* Now we wait for it to be processed */
> +	ret = wait_for_completion_interruptible(&kaddfd.completion);
> +	if (ret == 0) {
> +		/*
> +		 * We had a successful completion. The other side has already
> +		 * removed us from the addfd queue, and
> +		 * wait_for_completion_interruptible has a memory barrier.
> +		 */
> +		ret = kaddfd.ret;
> +		goto out;
> +	}
> +
> +	mutex_lock(&filter->notify_lock);
> +	/*
> +	 * Even though we were woken up by a signal, and not a successful
> +	 * completion, a completion may have happened in the mean time.
> +	 */
> +	if (list_empty(&kaddfd.list))
> +		ret = kaddfd.ret;
> +	else
> +		list_del(&kaddfd.list);
> +
> +out_unlock:
> +	mutex_unlock(&filter->notify_lock);
> +out:
> +	fput(kaddfd.file);
> +
> +	return ret;
> +}
> +
>  static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
>  				 unsigned long arg)
>  {
> @@ -1187,6 +1365,8 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
>  		return seccomp_notify_send(filter, buf);
>  	case SECCOMP_IOCTL_NOTIF_ID_VALID:
>  		return seccomp_notify_id_valid(filter, buf);
> +	case SECCOMP_IOCTL_NOTIF_ADDFD:
> +		return seccomp_notify_addfd(filter, buf);
>  	default:
>  		return -EINVAL;
>  	}
> -- 
> 2.25.1
>
Christian Brauner May 29, 2020, 1:31 p.m.
On Fri, May 29, 2020 at 12:32:55PM +0200, Christian Brauner wrote:
> On Thu, May 28, 2020 at 04:08:57AM -0700, Sargun Dhillon wrote:
> > This adds a seccomp notifier ioctl which allows for the listener to "add"
> > file descriptors to a process which originated a seccomp user
> > notification. This allows calls like mount, and mknod to be "implemented",
> > as the return value, and the arguments are data in memory. On the other
> > hand, calls like connect can be "implemented" using pidfd_getfd.
> > 
> > Unfortunately, there are calls which return file descriptors, like
> > open, which are vulnerable to TOC-TOU attacks, and require that the
> > more privileged supervisor can inspect the argument, and perform the
> > syscall on behalf of the process generating the notifiation. This
> > allows the file descriptor generated from that open call to be
> > returned to the calling process.
> > 
> > In addition, there is funcitonality to allow for replacement of
> > specific file descriptors, following dup2-like semantics.
> > 
> > Signed-off-by: Sargun Dhillon <sargun@sargun.me>
> > Suggested-by: Matt Denton <mpdenton@google.com>
> > Cc: Kees Cook <keescook@google.com>,
> > Cc: Jann Horn <jannh@google.com>,
> > Cc: Robert Sesek <rsesek@google.com>,
> > Cc: Chris Palmer <palmer@google.com>
> > Cc: Christian Brauner <christian.brauner@ubuntu.com>
> > Cc: Tycho Andersen <tycho@tycho.ws>
> > ---
> >  include/uapi/linux/seccomp.h |  25 +++++
> >  kernel/seccomp.c             | 182 ++++++++++++++++++++++++++++++++++-
> >  2 files changed, 206 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> > index c1735455bc53..c7bfe898e7a0 100644
> > --- a/include/uapi/linux/seccomp.h
> > +++ b/include/uapi/linux/seccomp.h
> > @@ -113,6 +113,27 @@ struct seccomp_notif_resp {
> >  	__u32 flags;
> >  };
> >  
> > +/* valid flags for seccomp_notif_addfd */
> > +#define SECCOMP_ADDFD_FLAG_SETFD	(1UL << 0) /* Specify remote fd */
> > +
> > +/**
> > + * struct seccomp_notif_addfd
> > + * @size: The size of the seccomp_notif_addfd datastructure
> > + * @id: The ID of the seccomp notification
> > + * @flags: SECCOMP_ADDFD_FLAG_*
> > + * @srcfd: The local fd number
> > + * @newfd: Optional remote FD number if SETFD option is set, otherwise 0.
> > + * @newfd_flags: Flags the remote FD should be allocated under
> > + */
> > +struct seccomp_notif_addfd {
> > +	__u64 size;
> > +	__u64 id;
> > +	__u64 flags;
> > +	__u32 srcfd;
> > +	__u32 newfd;
> > +	__u32 newfd_flags;
> > +};
> 
> This doesn't correspond to how we usually pad structs, I think:
> 
> struct seccomp_notif_addfd {
>         __u64                      size;                 /*     0     8 */
>         __u64                      id;                   /*     8     8 */
>         __u64                      flags;                /*    16     8 */
>         __u32                      srcfd;                /*    24     4 */
>         __u32                      newfd;                /*    28     4 */
>         __u32                      newfd_flags;          /*    32     4 */
> 
>         /* size: 40, cachelines: 1, members: 6 */
>         /* padding: 4 */
>         /* last cacheline: 40 bytes */
> };
> 
> You can either use the packed attribute or change the flags member from
> u64 to u32:
> 
> struct seccomp_notif_addfd {
> 	__u64 size;
> 	__u64 id;
> 	__u32 flags;
> 	__u32 srcfd;
> 	__u32 newfd;
> 	__u32 newfd_flags;
> }
> 
> ^^ This seems nicer to me and gets rid of the 4 byte padding. If we run
> out of 32 flags we'll just add a second flag argument to the struct.
> 
> > +
> >  #define SECCOMP_IOC_MAGIC		'!'
> >  #define SECCOMP_IO(nr)			_IO(SECCOMP_IOC_MAGIC, nr)
> >  #define SECCOMP_IOR(nr, type)		_IOR(SECCOMP_IOC_MAGIC, nr, type)
> > @@ -124,4 +145,8 @@ struct seccomp_notif_resp {
> >  #define SECCOMP_IOCTL_NOTIF_SEND	SECCOMP_IOWR(1,	\
> >  						struct seccomp_notif_resp)
> >  #define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOR(2, __u64)
> > +/* On success, the return value is the remote process's added fd number */
> > +#define SECCOMP_IOCTL_NOTIF_ADDFD	SECCOMP_IOR(3,	\
> > +						struct seccomp_notif_addfd)
> > +
> >  #endif /* _UAPI_LINUX_SECCOMP_H */
> > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > index 94ae4c7502cc..02b9ba1fbee0 100644
> > --- a/kernel/seccomp.c
> > +++ b/kernel/seccomp.c
> > @@ -41,6 +41,9 @@
> >  #include <linux/tracehook.h>
> >  #include <linux/uaccess.h>
> >  #include <linux/anon_inodes.h>
> > +#include <net/netprio_cgroup.h>
> > +#include <net/sock.h>
> > +#include <net/cls_cgroup.h>
> >  
> >  enum notify_state {
> >  	SECCOMP_NOTIFY_INIT,
> > @@ -77,10 +80,42 @@ struct seccomp_knotif {
> >  	long val;
> >  	u32 flags;
> >  
> > -	/* Signals when this has entered SECCOMP_NOTIFY_REPLIED */
> > +	/*
> > +	 * Signals when this has changed states, such as the listener
> > +	 * dying, a new seccomp addfd message, or changing to REPLIED
> > +	 */
> >  	struct completion ready;
> >  
> >  	struct list_head list;
> > +
> > +	/* outstanding addfd requests */
> > +	struct list_head addfd;
> > +};
> > +
> > +/**
> > + * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages
> > + *
> > + * @file: A reference to the file to install in the other task
> > + * @fd: The fd number to install it at. If the fd number is -1, it means the
> > + *      installing process should allocate the fd as normal.
> > + * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC
> > + *         is allowed.
> > + * @ret: The return value of the installing process. It is set to the fd num
> > + *       upon success (>= 0).
> > + * @completion: Indicates that the installing process has completed fd
> > + *              installation, or gone away (either due to successful
> > + *              reply, or signal)
> > + *
> > + */
> > +struct seccomp_kaddfd {
> > +	struct file *file;
> > +	int fd;
> > +	unsigned int flags;
> > +
> > +	/* To only be set on reply */
> > +	int ret;
> > +	struct completion completion;
> > +	struct list_head list;
> >  };
> >  
> >  /**
> > @@ -735,6 +770,41 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
> >  	return filter->notif->next_id++;
> >  }
> >  
> > +static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
> > +{
> > +	struct socket *sock;
> > +	int ret, err;
> > +
> > +	/*
> > +	 * Remove the notification, and reset the list pointers, indicating
> > +	 * that it has been handled.
> > +	 */
> > +	list_del_init(&addfd->list);
> > +
> > +	ret = security_file_receive(addfd->file);
> > +	if (ret)
> > +		goto out;
> > +
> > +	if (addfd->fd == -1) {
> > +		ret = get_unused_fd_flags(addfd->flags);
> > +		if (ret >= 0)
> > +			fd_install(ret, get_file(addfd->file));
> > +	} else {
> > +		ret = replace_fd(addfd->fd, addfd->file, addfd->flags);
> > +	}
> > +
> > +	/* These are the semantics from copying FDs via SCM_RIGHTS */
> > +	sock = sock_from_file(addfd->file, &err);
> 
> Iiuc, if this is indeed a socket and the replace_fd() or fd_install()
> has failed, you're now still transferring netprioidx and classid to the
> task's cgroup. Should probably be something like:
> 
> if (sock && ret >= 0) {
> 	sock_update_netprioidx(&sock->sk->sk_cgrp_data);
> 	sock_update_classid(&sock->sk->sk_cgrp_data);
> }
> 
> > +
> > +out:
> > +	addfd->ret = ret;
> > +	complete(&addfd->completion);
> > +}
> > +
> >  static int seccomp_do_user_notification(int this_syscall,
> >  					struct seccomp_filter *match,
> >  					const struct seccomp_data *sd)
> > @@ -743,6 +813,7 @@ static int seccomp_do_user_notification(int this_syscall,
> >  	u32 flags = 0;
> >  	long ret = 0;
> >  	struct seccomp_knotif n = {};
> > +	struct seccomp_kaddfd *addfd, *tmp;
> >  
> >  	mutex_lock(&match->notify_lock);
> >  	err = -ENOSYS;
> > @@ -755,6 +826,7 @@ static int seccomp_do_user_notification(int this_syscall,
> >  	n.id = seccomp_next_notify_id(match);
> >  	init_completion(&n.ready);
> >  	list_add(&n.list, &match->notif->notifications);
> > +	INIT_LIST_HEAD(&n.addfd);
> >  
> >  	up(&match->notif->request);
> >  	wake_up_poll(&match->notif->wqh, EPOLLIN | EPOLLRDNORM);
> > @@ -763,14 +835,31 @@ static int seccomp_do_user_notification(int this_syscall,
> >  	/*
> >  	 * This is where we wait for a reply from userspace.
> >  	 */
> > +wait:
> >  	err = wait_for_completion_interruptible(&n.ready);
> >  	mutex_lock(&match->notify_lock);
> >  	if (err == 0) {
> > +		/* Check if we were woken up by a addfd message */
> > +		addfd = list_first_entry_or_null(&n.addfd,
> > +						 struct seccomp_kaddfd, list);
> > +		if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) {
> > +			seccomp_handle_addfd(addfd);
> > +			mutex_unlock(&match->notify_lock);
> > +			goto wait;
> > +		}
> >  		ret = n.val;
> >  		err = n.error;
> >  		flags = n.flags;
> >  	}
> >  
> > +	/* If there were any pending addfd calls, clear them out */
> > +	list_for_each_entry_safe(addfd, tmp, &n.addfd, list) {
> > +		/* The process went away before we got a chance to handle it */
> > +		addfd->ret = -ESRCH;
> > +		list_del_init(&addfd->list);
> > +		complete(&addfd->completion);
> > +	}

I forgot to ask this in my first review before, don't you need a
complete(&addfd->completion) call in seccomp_notify_release() before
freeing it?

> > +
> >  	/*
> >  	 * Note that it's possible the listener died in between the time when
> >  	 * we were notified of a respons (or a signal) and when we were able to
> > @@ -1174,6 +1263,95 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter,
> >  	return ret;
> >  }
> >  
> > +static long seccomp_notify_addfd(struct seccomp_filter *filter,
> > +				 struct seccomp_notif_addfd __user *uaddfd)
> > +{
> > +	struct seccomp_notif_addfd addfd;
> > +	struct seccomp_knotif *knotif;
> > +	struct seccomp_kaddfd kaddfd;
> > +	u64 size;
> > +	int ret;
> > +
> > +	ret = get_user(size, &uaddfd->size);
> > +	if (ret)
> > +		return ret;
> > +
> > +	ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size);
> > +	if (ret)
> > +		return ret;
> > +
> > +	if (addfd.newfd_flags & ~O_CLOEXEC)
> > +		return -EINVAL;
> > +
> > +	if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD)
> > +		return -EINVAL;
> > +
> > +	if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD))
> > +		return -EINVAL;
> > +
> > +	kaddfd.file = fget(addfd.srcfd);
> > +	if (!kaddfd.file)
> > +		return -EBADF;
> > +
> > +	kaddfd.flags = addfd.newfd_flags;
> > +	kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ?
> > +		    addfd.newfd : -1;
> > +	init_completion(&kaddfd.completion);
> > +
> > +	ret = mutex_lock_interruptible(&filter->notify_lock);
> > +	if (ret < 0)
> > +		goto out;
> > +
> > +	knotif = find_notification(filter, addfd.id);
> > +	/*
> > +	 * We do not want to allow for FD injection to occur before the
> > +	 * notification has been picked up by a userspace handler, or after
> > +	 * the notification has been replied to.
> > +	 */
> 
> That comment ^^ should probably go above...
> 
> > +	if (!knotif) {
> > +		ret = -ENOENT;
> > +		goto out_unlock;
> > +	}
> 
> ... this vv check, no?
> 
> > +
> > +	if (knotif->state != SECCOMP_NOTIFY_SENT) {
> > +		ret = -EINPROGRESS;
> > +		goto out_unlock;
> > +	}
> > +
> > +	list_add(&kaddfd.list, &knotif->addfd);
> > +	complete(&knotif->ready);
> > +	mutex_unlock(&filter->notify_lock);
> > +
> > +	/* Now we wait for it to be processed */
> > +	ret = wait_for_completion_interruptible(&kaddfd.completion);
> > +	if (ret == 0) {
> > +		/*
> > +		 * We had a successful completion. The other side has already
> > +		 * removed us from the addfd queue, and
> > +		 * wait_for_completion_interruptible has a memory barrier.
> > +		 */
> > +		ret = kaddfd.ret;
> > +		goto out;
> > +	}
> > +
> > +	mutex_lock(&filter->notify_lock);
> > +	/*
> > +	 * Even though we were woken up by a signal, and not a successful
> > +	 * completion, a completion may have happened in the mean time.
> > +	 */
> > +	if (list_empty(&kaddfd.list))
> > +		ret = kaddfd.ret;
> > +	else
> > +		list_del(&kaddfd.list);
> > +
> > +out_unlock:
> > +	mutex_unlock(&filter->notify_lock);
> > +out:
> > +	fput(kaddfd.file);
> > +
> > +	return ret;
> > +}
> > +
> >  static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
> >  				 unsigned long arg)
> >  {
> > @@ -1187,6 +1365,8 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
> >  		return seccomp_notify_send(filter, buf);
> >  	case SECCOMP_IOCTL_NOTIF_ID_VALID:
> >  		return seccomp_notify_id_valid(filter, buf);
> > +	case SECCOMP_IOCTL_NOTIF_ADDFD:
> > +		return seccomp_notify_addfd(filter, buf);
> >  	default:
> >  		return -EINVAL;
> >  	}
> > -- 
> > 2.25.1
> >
Sargun Dhillon May 29, 2020, 10:35 p.m.
On Fri, May 29, 2020 at 6:31 AM Christian Brauner
<christian.brauner@ubuntu.com> wrote:
>
> > > +           /* Check if we were woken up by a addfd message */
> > > +           addfd = list_first_entry_or_null(&n.addfd,
> > > +                                            struct seccomp_kaddfd, list);
> > > +           if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) {
> > > +                   seccomp_handle_addfd(addfd);
> > > +                   mutex_unlock(&match->notify_lock);
> > > +                   goto wait;
> > > +           }
> > >             ret = n.val;
> > >             err = n.error;
> > >             flags = n.flags;
> > >     }
> > >
> > > +   /* If there were any pending addfd calls, clear them out */
> > > +   list_for_each_entry_safe(addfd, tmp, &n.addfd, list) {
> > > +           /* The process went away before we got a chance to handle it */
> > > +           addfd->ret = -ESRCH;
> > > +           list_del_init(&addfd->list);
> > > +           complete(&addfd->completion);
> > > +   }
>
> I forgot to ask this in my first review before, don't you need a
> complete(&addfd->completion) call in seccomp_notify_release() before
> freeing it?
>

When complete(&knotif->ready) is called in seccomp_notify_release,
subsequently the notifier (seccomp_do_user_notification) will be woken up and
it'll fail this check:
if (addfd && n.state != SECCOMP_NOTIFY_REPLIED)

Falling through to:
/* If there were any pending addfd calls, clear them out */
list_for_each_entry_safe(addfd, tmp, &n.addfd, list) {
    /* The process went away before we got a chance to handle it */
    addfd->ret = -ESRCH;
    list_del_init(&addfd->list);
    complete(&addfd->completion);
}

Although ESRCH isn't the "right" response, this fall through behaviour
should work.
Sargun Dhillon May 30, 2020, 1:10 a.m.
On Fri, May 29, 2020 at 12:31:37AM -0700, Kees Cook wrote:
> On Thu, May 28, 2020 at 04:08:57AM -0700, Sargun Dhillon wrote:
> > This adds a seccomp notifier ioctl which allows for the listener to "add"
> > file descriptors to a process which originated a seccomp user
> > notification. This allows calls like mount, and mknod to be "implemented",
> > as the return value, and the arguments are data in memory. On the other
> > hand, calls like connect can be "implemented" using pidfd_getfd.
> > 
> > Unfortunately, there are calls which return file descriptors, like
> > open, which are vulnerable to TOC-TOU attacks, and require that the
> > more privileged supervisor can inspect the argument, and perform the
> > syscall on behalf of the process generating the notifiation. This
> > allows the file descriptor generated from that open call to be
> > returned to the calling process.
> > 
> > In addition, there is funcitonality to allow for replacement of
> > specific file descriptors, following dup2-like semantics.
> > 
> > Signed-off-by: Sargun Dhillon <sargun@sargun.me>
> > Suggested-by: Matt Denton <mpdenton@google.com>
> 
> This looks mostly really clean. When I've got more brain tomorrow I want to
> double-check the locking, but I think the use of notify_lock and being
> in the ioctl fully protects everything from any use-after-free-like
> issues.
> 
> Notes below...
> 
> > +/* valid flags for seccomp_notif_addfd */
> > +#define SECCOMP_ADDFD_FLAG_SETFD	(1UL << 0) /* Specify remote fd */
> 
> Nit: please use BIT()
> 
> > @@ -735,6 +770,41 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
> >  	return filter->notif->next_id++;
> >  }
> >  
> > +static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
> > +{
> > +	struct socket *sock;
> > +	int ret, err;
> > +
> > +	/*
> > +	 * Remove the notification, and reset the list pointers, indicating
> > +	 * that it has been handled.
> > +	 */
> > +	list_del_init(&addfd->list);
> > +
> > +	ret = security_file_receive(addfd->file);
> > +	if (ret)
> > +		goto out;
> > +
> > +	if (addfd->fd == -1) {
> > +		ret = get_unused_fd_flags(addfd->flags);
> > +		if (ret >= 0)
> > +			fd_install(ret, get_file(addfd->file));
> > +	} else {
> > +		ret = replace_fd(addfd->fd, addfd->file, addfd->flags);
> > +	}
> > +
> > +	/* These are the semantics from copying FDs via SCM_RIGHTS */
> > +	sock = sock_from_file(addfd->file, &err);
> > +	if (sock) {
> > +		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
> > +		sock_update_classid(&sock->sk->sk_cgrp_data);
> > +	}
> 
> This made my eye twitch. ;) I see this is borrowed from
> scm_detach_fds()... this really feels like the kind of thing that will
> quickly go out of sync. I think this "receive an fd" logic needs to be
> lifted out of scm_detach_fds() so it and seccomp can share it. I'm not
> sure how to parameterize it quite right, though. Perhaps:
> 
> int file_receive(int fd, unsigned long flags, struct file *file)
> {
> 	struct socket *sock;
> 	int ret;
> 
> 	ret = security_file_receive(file);
> 	if (ret)
> 		return ret;
> 
> 	/* Install the file. */
> 	if (fd == -1) {
> 		ret = get_unused_fd_flags(flags);
> 		if (ret >= 0)
> 			fd_install(ret, get_file(file));
> 	} else {
> 		ret = replace_fd(fd, file, flags);
> 	}
> 
> 	/* Bump the usage count. */
> 	sock = sock_from_file(addfd->file, &err);
> 	if (sock) {
> 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
> 		sock_update_classid(&sock->sk->sk_cgrp_data);
> 	}
> 
> 	return ret;
> }
> 
> 
> static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
> {
> 	/*
> 	 * Remove the notification, and reset the list pointers, indicating
> 	 * that it has been handled.
> 	 */
> 	list_del_init(&addfd->list);
> 	addfd->ret = file_receive(addfd->fd, addfd->flags, addfd->file);
> 	complete(&addfd->completion);
> }
> 
> scm_detach_fds()
> 	...
> 	for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax;
>              i++, cmfptr++)
> 	{
> 
> 		err = file_receive(-1, MSG_CMSG_CLOEXEC & msg->msg_flags
>                                           ? O_CLOEXEC : 0, fp[i]);
> 		if (err < 0)
> 			break;
> 		err = put_user(err, cmfptr);
> 		if (err)
> 			/* wat */
> 	}
> 	...
> 
> I'm not sure on the put_user() failure, though. We could check early
> for faults with a put_user(0, cmfptr) before the file_receive() call, or
> we could just ignore it? I'm not sure what SCM does here. I guess
> worst-case:
> 
> int file_receive(int fd, unsigned long flags, struct file *file,
> 		 int __user *fdptr)
> {
> 		...
> 		ret = get_unused_fd_flags(flags);
> 		if (ret >= 0) {
> 			if (cmfptr) {
> 				int err;
> 
> 	                	err = put_user(ret, cmfptr);
> 				if (err) {
> 					put_unused_fd(ret);
> 					return err;
> 				}
> 			}
> 			fd_install(ret, get_file(file));
> 		}
> 		...
> }
> 
What about:

/*
 * File Receive - Retrieve a file from another process
 *
 * It can either replace an existing fd, or use a newly allocated fd. If you
 * intend on using an existing fd, replace should be false, and flags will
 * be ignored. The fd should be allocated using get_unused_fd_flags with the
 * flags that you want. It does not consume the reference to file.
 *
 * Returns 0 upon success
 */
static int __file_receive(int fd, unsigned int flags, struct file *file,
			  bool replace)
{
	struct socket *sock;
	int err;

	err = security_file_receive(file);
	if (err)
		return err;

	/* Is this an existing FD? */
	if (replace) {
		err = replace_fd(fd, file, flags);
		if (err)
			return err;
	} else {
		fd_install(fd, get_file(file));
	}

	sock = sock_from_file(file, &err);
	if (sock) {
		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
		sock_update_classid(&sock->sk->sk_cgrp_data);
	}

	return 0;
}

int file_receive_replace(int fd, unsigned int flags, struct file *file)
{
	return __file_receive(fd, flags, file, true);
}

int file_receive(int fd, struct file *file)
{
	return __file_receive(fd, 0, file, false);
}


// And then SCM reads:
	for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax;
	     i++, cmfptr++)
	{
		int new_fd;
		err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & msg->msg_flags
					  ? O_CLOEXEC : 0);
		if (err < 0)
			break;
		new_fd = err;
		err = put_user(new_fd, cmfptr);
		if (err) {
			put_unused_fd(new_fd);
			break;
		}

		err = file_receive(new_fd, fp[i]);
		if (err) {
			put_unused_fd(new_fd);
			break;
		}
	}

And our code reads:


static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
{
	int ret, err;

	/*
	 * Remove the notification, and reset the list pointers, indicating
	 * that it has been handled.
	 */
	list_del_init(&addfd->list);

	if (addfd->fd == -1) {
		ret = get_unused_fd_flags(addfd->flags);
		if (ret < 0)
			goto err;

		err = file_receive(ret, addfd->file);
		if (err) {
			put_unused_fd(ret);
			ret = err;
		}
	} else {
		ret = file_receive_replace(addfd->fd, addfd->flags,
					   addfd->file);
	}

err:
	addfd->ret = ret;
	complete(&addfd->completion);
}


And the pidfd getfd code reads:

static int pidfd_getfd(struct pid *pid, int fd)
{
	struct task_struct *task;
	struct file *file;
	int ret, err;

	task = get_pid_task(pid, PIDTYPE_PID);
	if (!task)
		return -ESRCH;

	file = __pidfd_fget(task, fd);
	put_task_struct(task);
	if (IS_ERR(file))
		return PTR_ERR(file);

	ret = get_unused_fd_flags(O_CLOEXEC);
	if (ret >= 0) {
		err = file_receive(ret, file);
		if (err) {
			put_unused_fd(ret);
			ret = err;
		}
	}

	fput(file);
	return ret;
}
Kees Cook May 30, 2020, 2:43 a.m.
On Sat, May 30, 2020 at 01:10:55AM +0000, Sargun Dhillon wrote:
> // And then SCM reads:
> 	for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax;
> 	     i++, cmfptr++)
> 	{
> 		int new_fd;
> 		err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & msg->msg_flags
> 					  ? O_CLOEXEC : 0);
> 		if (err < 0)
> 			break;
> 		new_fd = err;
> 		err = put_user(new_fd, cmfptr);
> 		if (err) {
> 			put_unused_fd(new_fd);
> 			break;
> 		}
> 
> 		err = file_receive(new_fd, fp[i]);
> 		if (err) {
> 			put_unused_fd(new_fd);
> 			break;
> 		}
> 	}
> 
> And our code reads:
> 
> 
> static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
> {
> 	int ret, err;
> 
> 	/*
> 	 * Remove the notification, and reset the list pointers, indicating
> 	 * that it has been handled.
> 	 */
> 	list_del_init(&addfd->list);
> 
> 	if (addfd->fd == -1) {
> 		ret = get_unused_fd_flags(addfd->flags);
> 		if (ret < 0)
> 			goto err;
> 
> 		err = file_receive(ret, addfd->file);
> 		if (err) {
> 			put_unused_fd(ret);
> 			ret = err;
> 		}
> 	} else {
> 		ret = file_receive_replace(addfd->fd, addfd->flags,
> 					   addfd->file);
> 	}
> 
> err:
> 	addfd->ret = ret;
> 	complete(&addfd->completion);
> }
> 
> 
> And the pidfd getfd code reads:
> 
> static int pidfd_getfd(struct pid *pid, int fd)
> {
> 	struct task_struct *task;
> 	struct file *file;
> 	int ret, err;
> 
> 	task = get_pid_task(pid, PIDTYPE_PID);
> 	if (!task)
> 		return -ESRCH;
> 
> 	file = __pidfd_fget(task, fd);
> 	put_task_struct(task);
> 	if (IS_ERR(file))
> 		return PTR_ERR(file);
> 
> 	ret = get_unused_fd_flags(O_CLOEXEC);
> 	if (ret >= 0) {
> 		err = file_receive(ret, file);
> 		if (err) {
> 			put_unused_fd(ret);
> 			ret = err;
> 		}
> 	}
> 
> 	fput(file);
> 	return ret;
> }

I mean, yes, that's certainly better, but it just seems a shame that
everyone has to do the get_unused/put_unused dance just because of how
SCM_RIGHTS does this weird put_user() in the middle.

Can anyone clarify the expected failure mode from SCM_RIGHTS? Can we
move the put_user() after instead? I think cleanup would just be:
replace_fd(fd, NULL, 0)

So:

(updated to skip sock updates on failure; thank you Christian!)

int file_receive(int fd, unsigned long flags, struct file *file)
{
	struct socket *sock;
	int ret;

	ret = security_file_receive(file);
	if (ret)
		return ret;

	/* Install the file. */
	if (fd == -1) {
		ret = get_unused_fd_flags(flags);
		if (ret >= 0)
			fd_install(ret, get_file(file));
	} else {
		ret = replace_fd(fd, file, flags);
	}

	/* Bump the sock usage counts. */
	if (ret >= 0) {
		sock = sock_from_file(addfd->file, &err);
		if (sock) {
			sock_update_netprioidx(&sock->sk->sk_cgrp_data);
			sock_update_classid(&sock->sk->sk_cgrp_data);
		}
	}

	return ret;
}

scm_detach_fds()
	...
	for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax;
             i++, cmfptr++)
	{
		int new_fd;

		err = file_receive(-1, MSG_CMSG_CLOEXEC & msg->msg_flags
                                          ? O_CLOEXEC : 0, fp[i]);
		if (err < 0)
			break;
		new_fd = err;

		err = put_user(err, cmfptr);
		if (err) {
			/*
			 * If we can't notify userspace that it got the
			 * fd, we need to unwind and remove it again.
			 */
			replace_fd(new_fd, NULL, 0);
			break;
		}
	}
	...
Jann Horn via Containers May 30, 2020, 3:17 a.m.
On Sat, May 30, 2020 at 4:43 AM Kees Cook <keescook@chromium.org> wrote:
> I mean, yes, that's certainly better, but it just seems a shame that
> everyone has to do the get_unused/put_unused dance just because of how
> SCM_RIGHTS does this weird put_user() in the middle.
>
> Can anyone clarify the expected failure mode from SCM_RIGHTS? Can we
> move the put_user() after instead?

Honestly, I think trying to remove file descriptors and such after
-EFAULT is a waste of time. If userspace runs into -EFAULT, userspace
is beyond saving and can't really do much other than exit immediately.
There are a bunch of places that will change state and then throw
-EFAULT at the end if userspace supplied an invalid address, because
trying to hold locks across userspace accesses just in case userspace
supplied a bogus address is kinda silly (and often borderline
impossible).

You can actually see that even scm_detach_fds() currently just
silently swallows errors if writing some header fields fails at the
end.
Sargun Dhillon May 30, 2020, 3:58 a.m.
> 
> I mean, yes, that's certainly better, but it just seems a shame that
> everyone has to do the get_unused/put_unused dance just because of how
> SCM_RIGHTS does this weird put_user() in the middle.
> 
> Can anyone clarify the expected failure mode from SCM_RIGHTS? Can we
> move the put_user() after instead? I think cleanup would just be:
> replace_fd(fd, NULL, 0)
> 
> So:
> 
> (updated to skip sock updates on failure; thank you Christian!)
> 
> int file_receive(int fd, unsigned long flags, struct file *file)
> {
> 	struct socket *sock;
> 	int ret;
> 
> 	ret = security_file_receive(file);
> 	if (ret)
> 		return ret;
> 
> 	/* Install the file. */
> 	if (fd == -1) {
> 		ret = get_unused_fd_flags(flags);
> 		if (ret >= 0)
> 			fd_install(ret, get_file(file));
> 	} else {
> 		ret = replace_fd(fd, file, flags);
> 	}
> 
> 	/* Bump the sock usage counts. */
> 	if (ret >= 0) {
> 		sock = sock_from_file(addfd->file, &err);
> 		if (sock) {
> 			sock_update_netprioidx(&sock->sk->sk_cgrp_data);
> 			sock_update_classid(&sock->sk->sk_cgrp_data);
> 		}
> 	}
> 
> 	return ret;
> }
> 
> scm_detach_fds()
> 	...
> 	for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax;
>              i++, cmfptr++)
> 	{
> 		int new_fd;
> 
> 		err = file_receive(-1, MSG_CMSG_CLOEXEC & msg->msg_flags
>                                           ? O_CLOEXEC : 0, fp[i]);
> 		if (err < 0)
> 			break;
> 		new_fd = err;
> 
Isn't the "right" way to do this to allocate a bunch of file descriptors,
and fill up the user buffer with them, and then install the files? This
seems to like half-install the file descriptors and then error out.

I know that's the current behaviour, but that seems like a bad idea. Do
we really want to perpetuate this half-broken state? I guess that some
userspace programs could be depending on this -- and their recovery
semantics could rely on this. I mean this is 10+ year old code.

> 		err = put_user(err, cmfptr);
> 		if (err) {
> 			/*
> 			 * If we can't notify userspace that it got the
> 			 * fd, we need to unwind and remove it again.
> 			 */
> 			replace_fd(new_fd, NULL, 0);
> 			break;
> 		}
> 	}
> 	...
> 
> 
> 
> -- 
> Kees Cook
Kees Cook May 30, 2020, 5:22 a.m.
On Sat, May 30, 2020 at 05:17:24AM +0200, Jann Horn wrote:
> On Sat, May 30, 2020 at 4:43 AM Kees Cook <keescook@chromium.org> wrote:
> > I mean, yes, that's certainly better, but it just seems a shame that
> > everyone has to do the get_unused/put_unused dance just because of how
> > SCM_RIGHTS does this weird put_user() in the middle.
> >
> > Can anyone clarify the expected failure mode from SCM_RIGHTS? Can we
> > move the put_user() after instead?
> 
> Honestly, I think trying to remove file descriptors and such after
> -EFAULT is a waste of time. If userspace runs into -EFAULT, userspace
> is beyond saving and can't really do much other than exit immediately.
> There are a bunch of places that will change state and then throw
> -EFAULT at the end if userspace supplied an invalid address, because
> trying to hold locks across userspace accesses just in case userspace
> supplied a bogus address is kinda silly (and often borderline
> impossible).

Logically, I agree. I'm more worried about the behavioral change -- if
we don't remove the fd on failure, the fd is installed with no
indication to the process that it exists (it won't know the close it --
if it keeps running -- and it may survive across exec). Before, it never
entered the file table.

> You can actually see that even scm_detach_fds() currently just
> silently swallows errors if writing some header fields fails at the
> end.

Yeah, and it's a corner case. But it should be possible (trivial, even)
to clean up on failure to retain the original results.
Kees Cook May 30, 2020, 5:47 a.m.
On Sat, May 30, 2020 at 03:58:18AM +0000, Sargun Dhillon wrote:
> Isn't the "right" way to do this to allocate a bunch of file descriptors,
> and fill up the user buffer with them, and then install the files? This
> seems to like half-install the file descriptors and then error out.
> 
> I know that's the current behaviour, but that seems like a bad idea. Do
> we really want to perpetuate this half-broken state? I guess that some
> userspace programs could be depending on this -- and their recovery
> semantics could rely on this. I mean this is 10+ year old code.

Right -- my instincts on this are to leave the behavior as-is. I've
been burned by so many "nothing could possible rely on THIS" cases. ;)

It might be worth adding a comment (or commit log note) that describes
the alternative you suggest here. But I think building a common helper
that does all of the work (and will get used in three^Wfour places now)
is the correct way to refactor this.

Oh hey! Look at scm_detach_fds_compat(). It needs this too. (And it's
missing the cgroup tracking.) That would fix:

48a87cc26c13 ("net: netprio: fd passed in SCM_RIGHTS datagram not set correctly")
d84295067fc7 ("net: net_cls: fd passed in SCM_RIGHTS datagram not set correctly")

So, yes, let's get this fixed up. I'd say first fix the missing sock
update in the compat path (so it can be CCed stable). Then fix the missing
sock update in pidfd_getfd() (so it can be CCed stable), then write the
helper with a refactoring of scm_detach_fds(), scm_detach_fds_compat(),
and pidfd_getfd(). And then add the addfd seccomp user_notif ioctl cmd.
Christian Brauner May 30, 2020, 1:58 p.m.
On Sat, May 30, 2020 at 05:17:24AM +0200, Jann Horn wrote:
> On Sat, May 30, 2020 at 4:43 AM Kees Cook <keescook@chromium.org> wrote:
> > I mean, yes, that's certainly better, but it just seems a shame that
> > everyone has to do the get_unused/put_unused dance just because of how
> > SCM_RIGHTS does this weird put_user() in the middle.
> >
> > Can anyone clarify the expected failure mode from SCM_RIGHTS? Can we
> > move the put_user() after instead?
> 
> Honestly, I think trying to remove file descriptors and such after
> -EFAULT is a waste of time. If userspace runs into -EFAULT, userspace

Agreed, we've never bothered with trying to recover from EFAULT. Just
look at kernel/fork.c:_do_fork():
	if (clone_flags & CLONE_PARENT_SETTID)
		put_user(nr, args->parent_tid);

we don't even bother even though we technically could.

> is beyond saving and can't really do much other than exit immediately.
> There are a bunch of places that will change state and then throw
> -EFAULT at the end if userspace supplied an invalid address, because
> trying to hold locks across userspace accesses just in case userspace
> supplied a bogus address is kinda silly (and often borderline
> impossible).
> 
> You can actually see that even scm_detach_fds() currently just
> silently swallows errors if writing some header fields fails at the
> end.

There's really no point in trying to save a broken scm message imho.
Al Viro May 30, 2020, 2:08 p.m.
On Fri, May 29, 2020 at 07:43:10PM -0700, Kees Cook wrote:

> Can anyone clarify the expected failure mode from SCM_RIGHTS? Can we
> move the put_user() after instead? I think cleanup would just be:
> replace_fd(fd, NULL, 0)

Bollocks.

Repeat after me: descriptor tables can be shared.  There is no
"cleanup" after you've put something there.  If you do not get
it, you have no business messing with any of this stuff.
Christian Brauner May 30, 2020, 2:13 p.m.
On Fri, May 29, 2020 at 10:47:12PM -0700, Kees Cook wrote:
> On Sat, May 30, 2020 at 03:58:18AM +0000, Sargun Dhillon wrote:
> > Isn't the "right" way to do this to allocate a bunch of file descriptors,
> > and fill up the user buffer with them, and then install the files? This
> > seems to like half-install the file descriptors and then error out.
> > 
> > I know that's the current behaviour, but that seems like a bad idea. Do
> > we really want to perpetuate this half-broken state? I guess that some
> > userspace programs could be depending on this -- and their recovery
> > semantics could rely on this. I mean this is 10+ year old code.
> 
> Right -- my instincts on this are to leave the behavior as-is. I've
> been burned by so many "nothing could possible rely on THIS" cases. ;)
> 
> It might be worth adding a comment (or commit log note) that describes
> the alternative you suggest here. But I think building a common helper
> that does all of the work (and will get used in three^Wfour places now)
> is the correct way to refactor this.

If you do this, then probably

> 
> Oh hey! Look at scm_detach_fds_compat(). It needs this too. (And it's
> missing the cgroup tracking.) That would fix:
> 
> 48a87cc26c13 ("net: netprio: fd passed in SCM_RIGHTS datagram not set correctly")
> d84295067fc7 ("net: net_cls: fd passed in SCM_RIGHTS datagram not set correctly")
> 
> So, yes, let's get this fixed up. I'd say first fix the missing sock
> update in the compat path (so it can be CCed stable). Then fix the missing

send this patch to net.

> sock update in pidfd_getfd() (so it can be CCed stable), then write the

send this patch to me.

> helper with a refactoring of scm_detach_fds(), scm_detach_fds_compat(),

this would be net-next most likely.

> and pidfd_getfd(). And then add the addfd seccomp user_notif ioctl cmd.

If you do this first, I'd suggest you resend the series here after all
this has been merged. We're not in a rush since this won't make it for
the 5.8 merge window anyway. By the time the changes land Kees might've
applied my changes to his tree so you can rebase yours on top of it
relieving Kees from fixing up merge conflicts.

About your potential net and net-next changes. Just in case you don't
know - otherwise ignore this - please read and treat
https://www.kernel.org/doc/Documentation/networking/netdev-FAQ.txt
as the gospel. Also note, that after this Sunday - assuming Linus
releases - net-next will be closed until the merge window is closed,
i.e. for _at least_ 2 weeks. After the merge window closes you can check
http://vger.kernel.org/~davem/net-next.html
which either has a picture saying "Come In We're Open" or a sign saying
"Sorry, We're Closed". Only send when the first sign is up or the wrath
of Dave might hit you. :)

Christian
Kees Cook May 30, 2020, 4:07 p.m.
On Sat, May 30, 2020 at 03:08:37PM +0100, Al Viro wrote:
> On Fri, May 29, 2020 at 07:43:10PM -0700, Kees Cook wrote:
> 
> > Can anyone clarify the expected failure mode from SCM_RIGHTS? Can we
> > move the put_user() after instead? I think cleanup would just be:
> > replace_fd(fd, NULL, 0)
> 
> Bollocks.
> 
> Repeat after me: descriptor tables can be shared.  There is no
> "cleanup" after you've put something there.

Right -- this is what I was trying to ask about, and why I didn't like
the idea of just leaving the fd in the table on failure. But yeah, there
is a race if the process is about to fork or something.

So the choice here is how to handle the put_user() failure:

- add the put_user() address to the new helper, as I suggest in [1].
  (exactly duplicates current behavior)
- just leave the fd in place (not current behavior: dumps a fd into
  the process without "agreed" notification).
- do a double put_user (once before and once after), also in [1].
  (sort of a best-effort combo of the above two. and SCM_RIGHTS is
  hardly fast-pth).

-Kees

[1] https://lore.kernel.org/linux-api/202005282345.573B917@keescook/
Kees Cook May 30, 2020, 4:09 p.m.
On Sat, May 30, 2020 at 03:58:27PM +0200, Christian Brauner wrote:
> On Sat, May 30, 2020 at 05:17:24AM +0200, Jann Horn wrote:
> > On Sat, May 30, 2020 at 4:43 AM Kees Cook <keescook@chromium.org> wrote:
> > > I mean, yes, that's certainly better, but it just seems a shame that
> > > everyone has to do the get_unused/put_unused dance just because of how
> > > SCM_RIGHTS does this weird put_user() in the middle.
> > >
> > > Can anyone clarify the expected failure mode from SCM_RIGHTS? Can we
> > > move the put_user() after instead?
> > 
> > Honestly, I think trying to remove file descriptors and such after
> > -EFAULT is a waste of time. If userspace runs into -EFAULT, userspace
> [...]
> 
> There's really no point in trying to save a broken scm message imho.

Right -- my concern is about stuffing a fd into a process without it
knowing (this is likely an overly paranoid concern, given that if the
process is getting EFAULT at the end of a list of fds, all the prior
ones will be installed too..)
Kees Cook May 30, 2020, 4:14 p.m.
On Sat, May 30, 2020 at 04:13:29PM +0200, Christian Brauner wrote:
> On Fri, May 29, 2020 at 10:47:12PM -0700, Kees Cook wrote:
> > Oh hey! Look at scm_detach_fds_compat(). It needs this too. (And it's
> > missing the cgroup tracking.) That would fix:
> > 
> > 48a87cc26c13 ("net: netprio: fd passed in SCM_RIGHTS datagram not set correctly")
> > d84295067fc7 ("net: net_cls: fd passed in SCM_RIGHTS datagram not set correctly")
> > 
> > So, yes, let's get this fixed up. I'd say first fix the missing sock
> > update in the compat path (so it can be CCed stable). Then fix the missing
> 
> send this patch to net.
> 
> > sock update in pidfd_getfd() (so it can be CCed stable), then write the
> 
> send this patch to me.
> 
> > helper with a refactoring of scm_detach_fds(), scm_detach_fds_compat(),
> 
> this would be net-next most likely.
> 
> > and pidfd_getfd(). And then add the addfd seccomp user_notif ioctl cmd.
> 
> If you do this first, I'd suggest you resend the series here after all
> this has been merged. We're not in a rush since this won't make it for
> the 5.8 merge window anyway. By the time the changes land Kees might've
> applied my changes to his tree so you can rebase yours on top of it
> relieving Kees from fixing up merge conflicts.
> 
> About your potential net and net-next changes. Just in case you don't
> know - otherwise ignore this - please read and treat
> https://www.kernel.org/doc/Documentation/networking/netdev-FAQ.txt
> as the gospel. Also note, that after this Sunday - assuming Linus
> releases - net-next will be closed until the merge window is closed,
> i.e. for _at least_ 2 weeks. After the merge window closes you can check
> http://vger.kernel.org/~davem/net-next.html
> which either has a picture saying "Come In We're Open" or a sign saying
> "Sorry, We're Closed". Only send when the first sign is up or the wrath
> of Dave might hit you. :)

Yeah, timing is awkward here. I was originally thinking it could all
just land via seccomp (with appropriate Acks). Hmmm.
Christian Brauner May 30, 2020, 4:21 p.m.
On Sat, May 30, 2020 at 09:14:50AM -0700, Kees Cook wrote:
> On Sat, May 30, 2020 at 04:13:29PM +0200, Christian Brauner wrote:
> > On Fri, May 29, 2020 at 10:47:12PM -0700, Kees Cook wrote:
> > > Oh hey! Look at scm_detach_fds_compat(). It needs this too. (And it's
> > > missing the cgroup tracking.) That would fix:
> > > 
> > > 48a87cc26c13 ("net: netprio: fd passed in SCM_RIGHTS datagram not set correctly")
> > > d84295067fc7 ("net: net_cls: fd passed in SCM_RIGHTS datagram not set correctly")
> > > 
> > > So, yes, let's get this fixed up. I'd say first fix the missing sock
> > > update in the compat path (so it can be CCed stable). Then fix the missing
> > 
> > send this patch to net.
> > 
> > > sock update in pidfd_getfd() (so it can be CCed stable), then write the
> > 
> > send this patch to me.
> > 
> > > helper with a refactoring of scm_detach_fds(), scm_detach_fds_compat(),
> > 
> > this would be net-next most likely.
> > 
> > > and pidfd_getfd(). And then add the addfd seccomp user_notif ioctl cmd.
> > 
> > If you do this first, I'd suggest you resend the series here after all
> > this has been merged. We're not in a rush since this won't make it for
> > the 5.8 merge window anyway. By the time the changes land Kees might've
> > applied my changes to his tree so you can rebase yours on top of it
> > relieving Kees from fixing up merge conflicts.
> > 
> > About your potential net and net-next changes. Just in case you don't
> > know - otherwise ignore this - please read and treat
> > https://www.kernel.org/doc/Documentation/networking/netdev-FAQ.txt
> > as the gospel. Also note, that after this Sunday - assuming Linus
> > releases - net-next will be closed until the merge window is closed,
> > i.e. for _at least_ 2 weeks. After the merge window closes you can check
> > http://vger.kernel.org/~davem/net-next.html
> > which either has a picture saying "Come In We're Open" or a sign saying
> > "Sorry, We're Closed". Only send when the first sign is up or the wrath
> > of Dave might hit you. :)
> 
> Yeah, timing is awkward here. I was originally thinking it could all
> just land via seccomp (with appropriate Acks). Hmmm.

I don't particularly care so sure. :)
Christian
Sargun Dhillon June 1, 2020, 7:02 p.m.
On Sat, May 30, 2020 at 9:07 AM Kees Cook <keescook@chromium.org> wrote:
>
> On Sat, May 30, 2020 at 03:08:37PM +0100, Al Viro wrote:
> > On Fri, May 29, 2020 at 07:43:10PM -0700, Kees Cook wrote:
> >
> > > Can anyone clarify the expected failure mode from SCM_RIGHTS? Can we
> > > move the put_user() after instead? I think cleanup would just be:
> > > replace_fd(fd, NULL, 0)
> >
> > Bollocks.
> >
> > Repeat after me: descriptor tables can be shared.  There is no
> > "cleanup" after you've put something there.
>
> Right -- this is what I was trying to ask about, and why I didn't like
> the idea of just leaving the fd in the table on failure. But yeah, there
> is a race if the process is about to fork or something.
>
> So the choice here is how to handle the put_user() failure:
>
> - add the put_user() address to the new helper, as I suggest in [1].
>   (exactly duplicates current behavior)
> - just leave the fd in place (not current behavior: dumps a fd into
>   the process without "agreed" notification).
> - do a double put_user (once before and once after), also in [1].
>   (sort of a best-effort combo of the above two. and SCM_RIGHTS is
>   hardly fast-pth).
>
> -Kees
>
> [1] https://lore.kernel.org/linux-api/202005282345.573B917@keescook/
>
> --
> Kees Cook

I'm going to suggest we stick to the approach of doing[1]:
1. Allocate FD
2. put_user
3. "Receive" and install file into FD

That is the only way to preserve the current behaviour in which userspace
is notified about *every* FD that is received via SCM_RIGHTS. The
scm_detach_fds code as it reads today does effectively what is above,
in that the fd is not installed until *after* the put user. Therefore
if put_user
gets an EFAULT or ENOMEM, it falls through to the MSG_CTRUNC bit.

The approach suggested[2] has a "change" in behaviour, in that (all in
file_receive):
1. Allocate FD
2. Receive file
3. put_user

Based on what Al Viro said, I don't think we can simply add step #4,
being "just" uninstall the FD.

[1]: https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg2179418.html
[2]: https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg2179453.html
Kees Cook June 1, 2020, 7:59 p.m.
On Mon, Jun 01, 2020 at 12:02:10PM -0700, Sargun Dhillon wrote:
> On Sat, May 30, 2020 at 9:07 AM Kees Cook <keescook@chromium.org> wrote:
> >
> > On Sat, May 30, 2020 at 03:08:37PM +0100, Al Viro wrote:
> > > On Fri, May 29, 2020 at 07:43:10PM -0700, Kees Cook wrote:
> > >
> > > > Can anyone clarify the expected failure mode from SCM_RIGHTS? Can we
> > > > move the put_user() after instead? I think cleanup would just be:
> > > > replace_fd(fd, NULL, 0)
> > >
> > > Bollocks.
> > >
> > > Repeat after me: descriptor tables can be shared.  There is no
> > > "cleanup" after you've put something there.
> >
> > Right -- this is what I was trying to ask about, and why I didn't like
> > the idea of just leaving the fd in the table on failure. But yeah, there
> > is a race if the process is about to fork or something.
> >
> > So the choice here is how to handle the put_user() failure:
> >
> > - add the put_user() address to the new helper, as I suggest in [1].
> >   (exactly duplicates current behavior)
> > - just leave the fd in place (not current behavior: dumps a fd into
> >   the process without "agreed" notification).
> > - do a double put_user (once before and once after), also in [1].
> >   (sort of a best-effort combo of the above two. and SCM_RIGHTS is
> >   hardly fast-pth).
> >
> > -Kees
> >
> > [1] https://lore.kernel.org/linux-api/202005282345.573B917@keescook/
> >
> > --
> > Kees Cook
> 
> I'm going to suggest we stick to the approach of doing[1]:
> 1. Allocate FD
> 2. put_user
> 3. "Receive" and install file into FD
> 
> That is the only way to preserve the current behaviour in which userspace
> is notified about *every* FD that is received via SCM_RIGHTS. The
> scm_detach_fds code as it reads today does effectively what is above,
> in that the fd is not installed until *after* the put user. Therefore
> if put_user
> gets an EFAULT or ENOMEM, it falls through to the MSG_CTRUNC bit.
> 
> The approach suggested[2] has a "change" in behaviour, in that (all in
> file_receive):
> 1. Allocate FD
> 2. Receive file
> 3. put_user
> 
> Based on what Al Viro said, I don't think we can simply add step #4,
> being "just" uninstall the FD.
> 
> [1]: https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg2179418.html
> [2]: https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg2179453.html

Agreed. Thanks!