seccomp: Add group_leader pid to seccomp_notif

Submitted by Sargun Dhillon on May 18, 2020, 8:32 a.m.

Details

Message ID 20200518083224.GA16270@ircssh-2.c.rugged-nimbus-611.internal
State New
Series "seccomp: Add group_leader pid to seccomp_notif"
Headers show

Commit Message

Sargun Dhillon May 18, 2020, 8:32 a.m.
On Sun, May 17, 2020 at 02:30:57PM -0700, Kees Cook wrote:
> On Sun, May 17, 2020 at 09:02:15AM -0600, Tycho Andersen wrote:
> 
> I'm going read this thread more carefully tomorrow, but I just wanted to
> mention that I'd *like* to extend seccomp_data for doing deep argument
> inspection of the new syscalls. I think it's the least bad of many
> designs, and I'll write that up in more detail. (I would *really* like
> to avoid extending seccomp's BPF language, and instead allow probing
> into the struct copied from userspace, etc.)
> 
> Anyway, it's very related to this, so, yeah, probably we need a v2 of the
> notif API, but I'll try to get all the ideas here collected in one place.
I scratched together a proposal of what I think would make a not-terrible
V2 API. I'm sure there's bugs in this code, but I think it's workable --
or at least a place to start. The biggest thing I think we should consider
is unrolling seccomp_data if we don't intend to add new BPF-accessible
fields.

If also uses read(2), so we get to take advantage of read(2)'s ability
to pass a size along with the read, as opposed to doing ioctl tricks.
It also makes programming from against it slightly simpler. I can imagine
that the send API could be similar, in that it could support write, and
thus making it 100% usable from Go (and the like) without requiring
a separate OS-thread be spun up to interact with the listener.



> 
> -- 
> Kees Cook
> _______________________________________________
> Containers mailing list
> Containers@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/containers

Patch hide | download patch | download mbox

diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index c1735455bc53..db6e5335ec6a 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -113,6 +113,41 @@  struct seccomp_notif_resp {
 	__u32 flags;
 };
 
+#define SECCOMP_DATA_SIZE_VER0	64
+
+#define SECCOMP_NOTIF_CONFIG_SIZE_VER0	16
+
+/* Optional seccomp return fields */
+/* __u32 for triggering pid */
+#define SECCOMP_NOTIF_FIELD_PID		(1UL << 1)
+/* __u32 for triggering  thread group leader id */
+#define SECCOMP_NOTIF_FIELD_TGID	(1UL << 1)
+
+struct seccomp_notif_config {
+	__u32 size;
+	/*
+	 * The supported SECCOMP_DATA_SIZE to be returned. If the size passed in
+	 * is unsupported (seccomp_data is too small or if it is larger than the
+	 * currently supported size), EINVAL or E2BIG will be, respectively,
+	 * returned, and this will be set to the latest supported size.
+	 */
+	__u32 seccomp_data_size;
+	/*
+	 * This is an OR'd together list of SECCOMP_NOTIF_FIELD_*. If an
+	 * unsupported field is requested, ENOTSUPP will be returned.
+	 */
+	__u64 optional_fields;
+};
+
+/* read(2) Notification format example:
+ * struct seccomp_notif_read {
+ *	__u64 id;
+ *	__u32 pid; if SECCOMP_NOTIF_FIELD_PID
+ *	__u32 tgid; if SECCOMP_NOTIF_FIELD_TGID
+ *	struct seccomp_data data;
+ * };
+ */
+
 #define SECCOMP_IOC_MAGIC		'!'
 #define SECCOMP_IO(nr)			_IO(SECCOMP_IOC_MAGIC, nr)
 #define SECCOMP_IOR(nr, type)		_IOR(SECCOMP_IOC_MAGIC, nr, type)
@@ -124,4 +159,10 @@  struct seccomp_notif_resp {
 #define SECCOMP_IOCTL_NOTIF_SEND	SECCOMP_IOWR(1,	\
 						struct seccomp_notif_resp)
 #define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOR(2, __u64)
+/*
+ * Configure the data structure to be returned by read(2).
+ * Returns the size of the data structure which will be returned.
+ */
+#define SECCOMP_IOCTL_NOTIF_CONFIG	SECCOMP_IOR(3,	\
+						struct seccomp_notif_config)
 #endif /* _UAPI_LINUX_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 55a6184f5990..4670cb03c390 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -95,12 +95,19 @@  struct seccomp_knotif {
  * @next_id: The id of the next request.
  * @notifications: A list of struct seccomp_knotif elements.
  * @wqh: A wait queue for poll.
+ * @read_size: The size of the expected read. If it is set to 0, it means that
+ *             reading notifications via read(2) is not yet configured. Until
+ *             then, EINVAL will be returned via read(2).
+ * @data_size: The supported size of seccomp_data.
+ * @optional_fields: The optional fields to return on read
  */
 struct notification {
 	struct semaphore request;
 	u64 next_id;
 	struct list_head notifications;
 	wait_queue_head_t wqh;
+	u32 read_size;
+	u64 optional_fields;
 };
 
 /**
@@ -1021,79 +1028,160 @@  static int seccomp_notify_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static long seccomp_notify_recv(struct seccomp_filter *filter,
-				void __user *buf)
+static void seccomp_reset_notif(struct seccomp_filter *filter,
+				u64 id)
 {
-	struct seccomp_knotif *knotif = NULL, *cur;
-	struct seccomp_notif unotif;
-	ssize_t ret;
-
-	/* Verify that we're not given garbage to keep struct extensible. */
-	ret = check_zeroed_user(buf, sizeof(unotif));
-	if (ret < 0)
-		return ret;
-	if (!ret)
-		return -EINVAL;
+	struct seccomp_knotif *cur;
 
-	memset(&unotif, 0, sizeof(unotif));
+	/*
+	 * Something went wrong. To make sure that we keep this
+	 * notification alive, let's reset it back to INIT. It
+	 * may have died when we released the lock, so we need to make
+	 * sure it's still around.
+	 */
+	mutex_lock(&filter->notify_lock);
+	list_for_each_entry(cur, &filter->notif->notifications, list) {
+		if (cur->id == id) {
+			cur->state = SECCOMP_NOTIFY_INIT;
+			up(&filter->notif->request);
+			break;
+		}
+	}
+	mutex_unlock(&filter->notify_lock);
+}
+/*
+ * Returns the next knotif available. If the return is a non-error, it will
+ * be with notify_lock held. It is up to the caller to unlock it.
+ */
+static struct seccomp_knotif *seccomp_get_notif(struct seccomp_filter *filter)
+{
+	struct seccomp_knotif *cur;
+	int ret;
 
 	ret = down_interruptible(&filter->notif->request);
 	if (ret < 0)
-		return ret;
+		return ERR_PTR(ret);
 
 	mutex_lock(&filter->notify_lock);
 	list_for_each_entry(cur, &filter->notif->notifications, list) {
 		if (cur->state == SECCOMP_NOTIFY_INIT) {
-			knotif = cur;
-			break;
+			cur->state = SECCOMP_NOTIFY_SENT;
+			wake_up_poll(&filter->notif->wqh,
+				     EPOLLOUT | EPOLLWRNORM);
+			return cur;
 		}
 	}
-
+	mutex_unlock(&filter->notify_lock);
 	/*
 	 * If we didn't find a notification, it could be that the task was
 	 * interrupted by a fatal signal between the time we were woken and
 	 * when we were able to acquire the rw lock.
 	 */
-	if (!knotif) {
-		ret = -ENOENT;
-		goto out;
-	}
+	return ERR_PTR(-ENOENT);
+}
+
+static long seccomp_notify_recv(struct seccomp_filter *filter,
+				void __user *buf)
+{
+	struct seccomp_knotif *knotif;
+	struct seccomp_notif unotif;
+	ssize_t ret;
+
+	/* Verify that we're not given garbage to keep struct extensible. */
+	ret = check_zeroed_user(buf, sizeof(unotif));
+	if (ret < 0)
+		return ret;
+	if (!ret)
+		return -EINVAL;
+
+	memset(&unotif, 0, sizeof(unotif));
+
+	knotif = seccomp_get_notif(filter);
+	if (IS_ERR(knotif))
+		return PTR_ERR(knotif);
 
 	unotif.id = knotif->id;
 	unotif.pid = task_pid_vnr(knotif->task);
 	unotif.data = *(knotif->data);
 
-	knotif->state = SECCOMP_NOTIFY_SENT;
-	wake_up_poll(&filter->notif->wqh, EPOLLOUT | EPOLLWRNORM);
-	ret = 0;
-out:
 	mutex_unlock(&filter->notify_lock);
 
-	if (ret == 0 && copy_to_user(buf, &unotif, sizeof(unotif))) {
-		ret = -EFAULT;
+	if (!copy_to_user(buf, &unotif, sizeof(unotif)))
+		return 0;
 
-		/*
-		 * Userspace screwed up. To make sure that we keep this
-		 * notification alive, let's reset it back to INIT. It
-		 * may have died when we released the lock, so we need to make
-		 * sure it's still around.
-		 */
-		knotif = NULL;
-		mutex_lock(&filter->notify_lock);
-		list_for_each_entry(cur, &filter->notif->notifications, list) {
-			if (cur->id == unotif.id) {
-				knotif = cur;
-				break;
-			}
-		}
+	seccomp_reset_notif(filter, unotif.id);
+	return -EFAULT;
+}
 
-		if (knotif) {
-			knotif->state = SECCOMP_NOTIFY_INIT;
-			up(&filter->notif->request);
-		}
-		mutex_unlock(&filter->notify_lock);
+/* Append the src value to the dst, and return the new cursor */
+#define append(dst, src)	({		\
+	typeof(src) _src = src;			\
+	memcpy(dst, &_src, sizeof(_src));	\
+	dst + sizeof(_src);			\
+})
+
+/*
+ * Append the value pointer to by the pointer, src, to the dst
+ * and return the new cursor
+ */
+#define append_ptr(dst, src)	({		\
+	typeof(src) _src = src;			\
+	memcpy(dst, _src, sizeof(*_src));	\
+	dst + sizeof(*_src);			\
+})
+
+static ssize_t seccomp_notify_read(struct file *file, char __user *buf,
+				   size_t count, loff_t *ppos)
+{
+	struct seccomp_filter *filter = file->private_data;
+	u32 optional_fields, read_size;
+	struct seccomp_knotif *knotif;
+	void *kbuf, *cursor;
+	int ret;
+	u64 id;
+
+	ret = mutex_lock_interruptible(&filter->notify_lock);
+	if (ret)
+		return ret;
+	read_size = filter->notif->read_size;
+	optional_fields = filter->notif->optional_fields;
+	mutex_unlock(&filter->notify_lock);
+
+	if (read_size == 0)
+		return -EINVAL;
+	if (count < read_size)
+		return -ENOSPC;
+
+	knotif = seccomp_get_notif(filter);
+	if (IS_ERR(knotif))
+		return PTR_ERR(knotif);
+
+	id = knotif->id;
+	kbuf = kzalloc(read_size, GFP_KERNEL);
+	if (!kbuf) {
+		ret = -ENOMEM;
+		goto out;
 	}
 
+	cursor = kbuf;
+	cursor = append(cursor, knotif->id);
+	if (filter->notif->optional_fields & SECCOMP_NOTIF_FIELD_PID)
+		cursor = append(cursor, task_pid_vnr(knotif->task));
+	if (filter->notif->optional_fields & SECCOMP_NOTIF_FIELD_TGID)
+		cursor = append(cursor, task_tgid_vnr(knotif->task));
+	cursor = append_ptr(cursor, knotif->data);
+	WARN_ON_ONCE(cursor != kbuf + read_size);
+
+	ret = copy_to_user(buf, kbuf, read_size);
+	if (!ret)
+		ret = read_size;
+
+	kfree(kbuf);
+out:
+	mutex_unlock(&filter->notify_lock);
+	if (ret < 0)
+		seccomp_reset_notif(filter, id);
+
 	return ret;
 }
 
@@ -1175,6 +1263,70 @@  static long seccomp_notify_id_valid(struct seccomp_filter *filter,
 	return ret;
 }
 
+static long seccomp_notif_config(struct seccomp_filter *filter,
+				 struct seccomp_notif_config __user *uconfig)
+{
+	struct seccomp_notif_config config;
+	/* ret_size is a minimum of 8, given id */
+	int ret, notification_size = 8;
+	u32 size;
+
+
+	ret = get_user(size, &uconfig->size);
+	if (ret)
+		return ret;
+	ret = copy_struct_from_user(&config, sizeof(config), uconfig, size);
+	if (ret)
+		return ret;
+
+	/*
+	 * This needs to be bumped every time we change seccomp_data's size
+	 */
+	BUILD_BUG_ON(sizeof(struct seccomp_data) != SECCOMP_DATA_SIZE_VER0);
+	if (config.seccomp_data_size < SECCOMP_DATA_SIZE_VER0) {
+		ret = -EINVAL;
+		goto invalid_seccomp_data_size;
+	}
+	if (config.seccomp_data_size > SECCOMP_DATA_SIZE_VER0) {
+		ret = -E2BIG;
+		goto invalid_seccomp_data_size;
+	}
+	notification_size += config.seccomp_data_size;
+
+	if (config.optional_fields & ~(SECCOMP_NOTIF_FIELD_PID|SECCOMP_NOTIF_FIELD_TGID))
+		return -ENOTSUPP;
+
+	if (config.optional_fields & SECCOMP_NOTIF_FIELD_PID)
+		notification_size += 4;
+	if (config.optional_fields & SECCOMP_NOTIF_FIELD_TGID)
+		notification_size += 4;
+
+
+	ret = mutex_lock_interruptible(&filter->notify_lock);
+	if (ret < 0)
+		return ret;
+
+	if (filter->notif->read_size) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	ret = notification_size;
+	filter->notif->read_size = notification_size;
+	filter->notif->optional_fields = config.optional_fields;
+
+out:
+	mutex_unlock(&filter->notify_lock);
+
+	return ret;
+
+invalid_seccomp_data_size:
+	if (put_user(SECCOMP_DATA_SIZE_VER0, &uconfig->seccomp_data_size))
+		return -EFAULT;
+
+	return ret;
+}
+
 static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
 				 unsigned long arg)
 {
@@ -1188,6 +1340,8 @@  static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
 		return seccomp_notify_send(filter, buf);
 	case SECCOMP_IOCTL_NOTIF_ID_VALID:
 		return seccomp_notify_id_valid(filter, buf);
+	case SECCOMP_IOCTL_NOTIF_CONFIG:
+		return seccomp_notif_config(filter, buf);
 	default:
 		return -EINVAL;
 	}
@@ -1224,6 +1378,7 @@  static const struct file_operations seccomp_notify_ops = {
 	.release = seccomp_notify_release,
 	.unlocked_ioctl = seccomp_notify_ioctl,
 	.compat_ioctl = seccomp_notify_ioctl,
+	.read = seccomp_notify_read,
 };
 
 static struct file *init_listener(struct seccomp_filter *filter)

Comments

Christian Brauner May 18, 2020, 12:45 p.m.
On Mon, May 18, 2020 at 08:32:25AM +0000, Sargun Dhillon wrote:
> On Sun, May 17, 2020 at 02:30:57PM -0700, Kees Cook wrote:
> > On Sun, May 17, 2020 at 09:02:15AM -0600, Tycho Andersen wrote:
> > 
> > I'm going read this thread more carefully tomorrow, but I just wanted to
> > mention that I'd *like* to extend seccomp_data for doing deep argument
> > inspection of the new syscalls. I think it's the least bad of many
> > designs, and I'll write that up in more detail. (I would *really* like
> > to avoid extending seccomp's BPF language, and instead allow probing
> > into the struct copied from userspace, etc.)
> > 
> > Anyway, it's very related to this, so, yeah, probably we need a v2 of the
> > notif API, but I'll try to get all the ideas here collected in one place.
> I scratched together a proposal of what I think would make a not-terrible
> V2 API. I'm sure there's bugs in this code, but I think it's workable --
> or at least a place to start. The biggest thing I think we should consider
> is unrolling seccomp_data if we don't intend to add new BPF-accessible
> fields.
> 
> If also uses read(2), so we get to take advantage of read(2)'s ability
> to pass a size along with the read, as opposed to doing ioctl tricks.
> It also makes programming from against it slightly simpler. I can imagine
> that the send API could be similar, in that it could support write, and
> thus making it 100% usable from Go (and the like) without requiring
> a separate OS-thread be spun up to interact with the listener.

I don't have strong feelings about using read() and write() here but I
think that Jann had reservations and that's why we didn't do it in the
first version. But his reservations were specifically tied to fd passing
which we never implemented:
http://lkml.iu.edu/hypermail/linux/kernel/1806.2/05995.html

But still, worth considering.

Christian
Tycho Andersen May 18, 2020, 1:23 p.m.
On Mon, May 18, 2020 at 02:45:00PM +0200, Christian Brauner wrote:
> On Mon, May 18, 2020 at 08:32:25AM +0000, Sargun Dhillon wrote:
> > On Sun, May 17, 2020 at 02:30:57PM -0700, Kees Cook wrote:
> > > On Sun, May 17, 2020 at 09:02:15AM -0600, Tycho Andersen wrote:
> > > 
> > > I'm going read this thread more carefully tomorrow, but I just wanted to
> > > mention that I'd *like* to extend seccomp_data for doing deep argument
> > > inspection of the new syscalls. I think it's the least bad of many
> > > designs, and I'll write that up in more detail. (I would *really* like
> > > to avoid extending seccomp's BPF language, and instead allow probing
> > > into the struct copied from userspace, etc.)
> > > 
> > > Anyway, it's very related to this, so, yeah, probably we need a v2 of the
> > > notif API, but I'll try to get all the ideas here collected in one place.
> > I scratched together a proposal of what I think would make a not-terrible
> > V2 API. I'm sure there's bugs in this code, but I think it's workable --
> > or at least a place to start. The biggest thing I think we should consider
> > is unrolling seccomp_data if we don't intend to add new BPF-accessible
> > fields.
> > 
> > If also uses read(2), so we get to take advantage of read(2)'s ability
> > to pass a size along with the read, as opposed to doing ioctl tricks.
> > It also makes programming from against it slightly simpler. I can imagine
> > that the send API could be similar, in that it could support write, and
> > thus making it 100% usable from Go (and the like) without requiring
> > a separate OS-thread be spun up to interact with the listener.
> 
> I don't have strong feelings about using read() and write() here but I
> think that Jann had reservations and that's why we didn't do it in the
> first version. But his reservations were specifically tied to fd passing
> which we never implemented:
> http://lkml.iu.edu/hypermail/linux/kernel/1806.2/05995.html
> 
> But still, worth considering.

There was a thread about this same time for some other API (I can't
find it now, but I can dig if you want) that suggests that "read() is
for data" and we shouldn't use it for control in APIs.

Tycho
Christian Brauner May 18, 2020, 2 p.m.
On Mon, May 18, 2020 at 07:23:55AM -0600, Tycho Andersen wrote:
> On Mon, May 18, 2020 at 02:45:00PM +0200, Christian Brauner wrote:
> > On Mon, May 18, 2020 at 08:32:25AM +0000, Sargun Dhillon wrote:
> > > On Sun, May 17, 2020 at 02:30:57PM -0700, Kees Cook wrote:
> > > > On Sun, May 17, 2020 at 09:02:15AM -0600, Tycho Andersen wrote:
> > > > 
> > > > I'm going read this thread more carefully tomorrow, but I just wanted to
> > > > mention that I'd *like* to extend seccomp_data for doing deep argument
> > > > inspection of the new syscalls. I think it's the least bad of many
> > > > designs, and I'll write that up in more detail. (I would *really* like
> > > > to avoid extending seccomp's BPF language, and instead allow probing
> > > > into the struct copied from userspace, etc.)
> > > > 
> > > > Anyway, it's very related to this, so, yeah, probably we need a v2 of the
> > > > notif API, but I'll try to get all the ideas here collected in one place.
> > > I scratched together a proposal of what I think would make a not-terrible
> > > V2 API. I'm sure there's bugs in this code, but I think it's workable --
> > > or at least a place to start. The biggest thing I think we should consider
> > > is unrolling seccomp_data if we don't intend to add new BPF-accessible
> > > fields.
> > > 
> > > If also uses read(2), so we get to take advantage of read(2)'s ability
> > > to pass a size along with the read, as opposed to doing ioctl tricks.
> > > It also makes programming from against it slightly simpler. I can imagine
> > > that the send API could be similar, in that it could support write, and
> > > thus making it 100% usable from Go (and the like) without requiring
> > > a separate OS-thread be spun up to interact with the listener.
> > 
> > I don't have strong feelings about using read() and write() here but I
> > think that Jann had reservations and that's why we didn't do it in the
> > first version. But his reservations were specifically tied to fd passing
> > which we never implemented:
> > http://lkml.iu.edu/hypermail/linux/kernel/1806.2/05995.html
> > 
> > But still, worth considering.
> 
> There was a thread about this same time for some other API (I can't
> find it now, but I can dig if you want) that suggests that "read() is
> for data" and we shouldn't use it for control in APIs.

Oh that sounds useful. Though I think you can wait with digging it out
until someone insists on using read(). :)

Christian