seccomp: Add group_leader pid to seccomp_notif

Submitted by Sargun Dhillon on May 15, 2020, 11:40 p.m.

Details

Message ID 20200515234005.32370-1-sargun@sargun.me
State New
Series "seccomp: Add group_leader pid to seccomp_notif"
Headers show

Commit Message

Sargun Dhillon May 15, 2020, 11:40 p.m.
This includes the thread group leader ID in the seccomp_notif. This is
immediately useful for opening up a pidfd for the group leader, as
pidfds only work on group leaders.

Previously, it was considered to include an actual pidfd in the
seccomp_notif structure[1], but it was suggested to avoid proliferating
mechanisms to create pidfds[2].

[1]: https://lkml.org/lkml/2020/1/24/133
[2]: https://lkml.org/lkml/2020/5/15/481

Suggested-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Sargun Dhillon <sargun@sargun.me>
---
 include/uapi/linux/seccomp.h                  |  2 +
 kernel/seccomp.c                              |  1 +
 tools/testing/selftests/seccomp/seccomp_bpf.c | 50 +++++++++++++++++++
 3 files changed, 53 insertions(+)

Patch hide | download patch | download mbox

diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index c1735455bc53..f0c272ef0f1e 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -75,6 +75,8 @@  struct seccomp_notif {
 	__u32 pid;
 	__u32 flags;
 	struct seccomp_data data;
+	__u32 tgid;
+	__u8 pad0[4];
 };
 
 /*
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 55a6184f5990..538bcbbcf4da 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1061,6 +1061,7 @@  static long seccomp_notify_recv(struct seccomp_filter *filter,
 
 	unotif.id = knotif->id;
 	unotif.pid = task_pid_vnr(knotif->task);
+	unotif.tgid = task_tgid_vnr(knotif->task);
 	unotif.data = *(knotif->data);
 
 	knotif->state = SECCOMP_NOTIFY_SENT;
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index c0aa46ce14f6..5658c6e95461 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -187,6 +187,8 @@  struct seccomp_notif {
 	__u32 pid;
 	__u32 flags;
 	struct seccomp_data data;
+	__u32 tgid;
+	__u8 pad0[4];
 };
 
 struct seccomp_notif_resp {
@@ -3226,6 +3228,8 @@  TEST(user_notification_basic)
 		req.pid = 0;
 		EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
 	}
+	EXPECT_EQ(pid, req.pid);
+	EXPECT_EQ(pid, req.tgid);
 
 	pollfd.fd = listener;
 	pollfd.events = POLLIN | POLLOUT;
@@ -3453,6 +3457,7 @@  TEST(user_notification_child_pid_ns)
 
 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
 	EXPECT_EQ(req.pid, pid);
+	EXPECT_EQ(req.tgid, pid);
 
 	resp.id = req.id;
 	resp.error = 0;
@@ -3686,6 +3691,51 @@  TEST(user_notification_continue)
 	}
 }
 
+void *getppid_thread(void *arg)
+{
+	int *tid = arg;
+
+	*tid = syscall(__NR_gettid);
+	if (*tid <= 0)
+		return (void *)(long)errno;
+	return NULL;
+}
+
+TEST(user_notification_groupleader)
+{
+	struct seccomp_notif_resp resp = {};
+	struct seccomp_notif req = {};
+	int ret, listener, tid, pid;
+	pthread_t thread;
+	void *status;
+
+	pid = getpid();
+	ASSERT_GT(pid, 0);
+
+	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+	ASSERT_EQ(0, ret) {
+		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+	}
+
+	listener = user_trap_syscall(__NR_gettid,
+				     SECCOMP_FILTER_FLAG_NEW_LISTENER);
+	ASSERT_GE(listener, 0);
+
+	ASSERT_EQ(0, pthread_create(&thread, NULL, getppid_thread, &tid));
+
+	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+	resp.id = req.id;
+	resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
+	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
+
+	ASSERT_EQ(0, pthread_join(thread, &status));
+	ASSERT_EQ(0, status);
+
+	EXPECT_EQ(tid, req.pid);
+	EXPECT_EQ(pid, req.tgid);
+}
+
+
 /*
  * TODO:
  * - add microbenchmarks

Comments

Kees Cook May 17, 2020, 7:17 a.m.
On Fri, May 15, 2020 at 04:40:05PM -0700, Sargun Dhillon wrote:
> This includes the thread group leader ID in the seccomp_notif. This is
> immediately useful for opening up a pidfd for the group leader, as
> pidfds only work on group leaders.
> 
> Previously, it was considered to include an actual pidfd in the
> seccomp_notif structure[1], but it was suggested to avoid proliferating
> mechanisms to create pidfds[2].
> 
> [1]: https://lkml.org/lkml/2020/1/24/133
> [2]: https://lkml.org/lkml/2020/5/15/481

nit: please use lore.kernel.org/lkml/ URLs

> Suggested-by: Christian Brauner <christian.brauner@ubuntu.com>
> Signed-off-by: Sargun Dhillon <sargun@sargun.me>
> ---
>  include/uapi/linux/seccomp.h                  |  2 +
>  kernel/seccomp.c                              |  1 +
>  tools/testing/selftests/seccomp/seccomp_bpf.c | 50 +++++++++++++++++++
>  3 files changed, 53 insertions(+)
> 
> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> index c1735455bc53..f0c272ef0f1e 100644
> --- a/include/uapi/linux/seccomp.h
> +++ b/include/uapi/linux/seccomp.h
> @@ -75,6 +75,8 @@ struct seccomp_notif {
>  	__u32 pid;
>  	__u32 flags;
>  	struct seccomp_data data;
> +	__u32 tgid;
> +	__u8 pad0[4];
>  };

I think we need to leave off padding and instead use __packed. If we
don't then userspace can't tell when "pad0" changes its "meaning" (i.e.
the size of seccomp_notif becomes 88 bytes with above -- either via
explicit padding like you've got or via implicit by the compiler. If
some other u32 gets added in the future, user space will still see "88"
as the size.

So I *think* the right change here is:

-};
+	__u32 tgid;
+} __packed;

Though tgid may need to go above seccomp_data... for when it grows.
Agh...

_However_, unfortunately, I appear to have no thought this through very
well, and there is actually no sanity-checking in the kernel for dealing
with an old userspace when sizes change. :( For example, if a userspace
doesn't check sizes and calls an ioctl, etc, the kernel will clobber the
user buffer if it's too small.

Even the SECCOMP_GET_NOTIF_SIZES command lacks a buffer size argument.
:(

So:

- should we just declare such userspace as "wrong"? I don't think
  that'll work, especially since what if we ever change the size of
  seccomp_data...  that predated the ..._SIZES command.

- should we add a SECCOMP_SET_SIZES command to tell the kernel what
  we're expecting? There's no "state" associated across seccomp(2)
  calls, but maybe that doesn't matter because only user_notif writes
  back to userspace. For the ioctl, the state could be part of the
  private file data? Sending seccomp_data back to userspace only
  happens here, and any changes in seccomp_data size will just be seen
  as allowing a filter to query further into it.

- should GET_SIZES report "useful" size? (i.e. exclude padding?)

> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c

Yay test updates! :)

> +TEST(user_notification_groupleader)

In my first pass of review I was going to say "can you please also check
the sizes used by the ioctl?" But that triggered the above size checking
mess in my mind.

Let me look at this more closely on Monday, and I'll proposed something.
:P

Thanks!

-Kees
Christian Brauner May 17, 2020, 10:47 a.m.
On Sun, May 17, 2020 at 12:17:14AM -0700, Kees Cook wrote:
> On Fri, May 15, 2020 at 04:40:05PM -0700, Sargun Dhillon wrote:
> > This includes the thread group leader ID in the seccomp_notif. This is
> > immediately useful for opening up a pidfd for the group leader, as
> > pidfds only work on group leaders.
> > 
> > Previously, it was considered to include an actual pidfd in the
> > seccomp_notif structure[1], but it was suggested to avoid proliferating
> > mechanisms to create pidfds[2].
> > 
> > [1]: https://lkml.org/lkml/2020/1/24/133
> > [2]: https://lkml.org/lkml/2020/5/15/481
> 
> nit: please use lore.kernel.org/lkml/ URLs
> 
> > Suggested-by: Christian Brauner <christian.brauner@ubuntu.com>
> > Signed-off-by: Sargun Dhillon <sargun@sargun.me>
> > ---
> >  include/uapi/linux/seccomp.h                  |  2 +
> >  kernel/seccomp.c                              |  1 +
> >  tools/testing/selftests/seccomp/seccomp_bpf.c | 50 +++++++++++++++++++
> >  3 files changed, 53 insertions(+)
> > 
> > diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> > index c1735455bc53..f0c272ef0f1e 100644
> > --- a/include/uapi/linux/seccomp.h
> > +++ b/include/uapi/linux/seccomp.h
> > @@ -75,6 +75,8 @@ struct seccomp_notif {
> >  	__u32 pid;
> >  	__u32 flags;
> >  	struct seccomp_data data;
> > +	__u32 tgid;
> > +	__u8 pad0[4];
> >  };
> 
> I think we need to leave off padding and instead use __packed. If we
> don't then userspace can't tell when "pad0" changes its "meaning" (i.e.
> the size of seccomp_notif becomes 88 bytes with above -- either via
> explicit padding like you've got or via implicit by the compiler. If
> some other u32 gets added in the future, user space will still see "88"
> as the size.
> 
> So I *think* the right change here is:
> 
> -};
> +	__u32 tgid;
> +} __packed;
> 
> Though tgid may need to go above seccomp_data... for when it grows.
> Agh...
> 
> _However_, unfortunately, I appear to have no thought this through very
> well, and there is actually no sanity-checking in the kernel for dealing
> with an old userspace when sizes change. :( For example, if a userspace
> doesn't check sizes and calls an ioctl, etc, the kernel will clobber the
> user buffer if it's too small.

I'd just argue that that's just userspace messing up.

> 
> Even the SECCOMP_GET_NOTIF_SIZES command lacks a buffer size argument.
> :(
> 
> So:
> 
> - should we just declare such userspace as "wrong"? I don't think
>   that'll work, especially since what if we ever change the size of
>   seccomp_data...  that predated the ..._SIZES command.

Yeah, that's nasty since the struct member in seccomp_notif would now
clobber each other.

> 
> - should we add a SECCOMP_SET_SIZES command to tell the kernel what
>   we're expecting? There's no "state" associated across seccomp(2)
>   calls, but maybe that doesn't matter because only user_notif writes
>   back to userspace. For the ioctl, the state could be part of the
>   private file data? Sending seccomp_data back to userspace only
>   happens here, and any changes in seccomp_data size will just be seen
>   as allowing a filter to query further into it.

Sounds ok-ish in my opinion.

> 
> - should GET_SIZES report "useful" size? (i.e. exclude padding?)

Or... And that's more invasive but ultimately cleaner we v2 the whole
thing so e.g. SECCOMP_IOCTL_NOTIF_RECV2, SECCOMP_IOCTL_NOTIF_SEND2, and
embedd the size argument in the structs. Userspace sets the size
argument, we use get_user() to get the size first and then
copy_struct_from_user() to handle it cleanly based on that. A similar
model as with sched (has other unrelated quirks because they messed up
something too):

static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
{
	u32 size;
	int ret;

	/* Zero the full structure, so that a short copy will be nice: */
	memset(attr, 0, sizeof(*attr));

	ret = get_user(size, &uattr->size);
	if (ret)
		return ret;

	/* ABI compatibility quirk: */
	if (!size)
		size = SCHED_ATTR_SIZE_VER0;
	if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
		goto err_size;

	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
	if (ret) {
		if (ret == -E2BIG)
			goto err_size;
		return ret;
	}

We're probably the biggest user of this right now and I'd be ok with
that change. If it's a v2 than whatever. :)

Christian
Sargun Dhillon May 17, 2020, 11:17 a.m.
On Sun, May 17, 2020 at 12:17 AM Kees Cook <keescook@chromium.org> wrote:
>
> On Fri, May 15, 2020 at 04:40:05PM -0700, Sargun Dhillon wrote:
> > This includes the thread group leader ID in the seccomp_notif. This is
> > immediately useful for opening up a pidfd for the group leader, as
> > pidfds only work on group leaders.
> >
> > Previously, it was considered to include an actual pidfd in the
> > seccomp_notif structure[1], but it was suggested to avoid proliferating
> > mechanisms to create pidfds[2].
> >
> > [1]: https://lkml.org/lkml/2020/1/24/133
> > [2]: https://lkml.org/lkml/2020/5/15/481
>
> nit: please use lore.kernel.org/lkml/ URLs
>
> > Suggested-by: Christian Brauner <christian.brauner@ubuntu.com>
> > Signed-off-by: Sargun Dhillon <sargun@sargun.me>
> > ---
> >  include/uapi/linux/seccomp.h                  |  2 +
> >  kernel/seccomp.c                              |  1 +
> >  tools/testing/selftests/seccomp/seccomp_bpf.c | 50 +++++++++++++++++++
> >  3 files changed, 53 insertions(+)
> >
> > diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> > index c1735455bc53..f0c272ef0f1e 100644
> > --- a/include/uapi/linux/seccomp.h
> > +++ b/include/uapi/linux/seccomp.h
> > @@ -75,6 +75,8 @@ struct seccomp_notif {
> >       __u32 pid;
> >       __u32 flags;
> >       struct seccomp_data data;
> > +     __u32 tgid;
> > +     __u8 pad0[4];
> >  };
>
> I think we need to leave off padding and instead use __packed. If we
> don't then userspace can't tell when "pad0" changes its "meaning" (i.e.
> the size of seccomp_notif becomes 88 bytes with above -- either via
> explicit padding like you've got or via implicit by the compiler. If
> some other u32 gets added in the future, user space will still see "88"
> as the size.
>
I've had previous feedback about using "packed". See:
https://lore.kernel.org/lkml/87o8w9bcaf.fsf@mid.deneb.enyo.de/
https://lore.kernel.org/lkml/a328b91d-fd8f-4f27-b3c2-91a9c45f18c0@rasmusvillemoes.dk/
> So I *think* the right change here is:
>
> -};
> +       __u32 tgid;
> +} __packed;
>
> Though tgid may need to go above seccomp_data... for when it grows.
> Agh...
(How) can seccomp_data grow safely, even with this extensibility mechanism?
>
> _However_, unfortunately, I appear to have no thought this through very
> well, and there is actually no sanity-checking in the kernel for dealing
> with an old userspace when sizes change. :( For example, if a userspace
> doesn't check sizes and calls an ioctl, etc, the kernel will clobber the
> user buffer if it's too small.
>
> Even the SECCOMP_GET_NOTIF_SIZES command lacks a buffer size argument.
> :(
>
> So:
>
> - should we just declare such userspace as "wrong"? I don't think
>   that'll work, especially since what if we ever change the size of
>   seccomp_data...  that predated the ..._SIZES command.
>
> - should we add a SECCOMP_SET_SIZES command to tell the kernel what
>   we're expecting? There's no "state" associated across seccomp(2)
>   calls, but maybe that doesn't matter because only user_notif writes
>   back to userspace. For the ioctl, the state could be part of the
>   private file data? Sending seccomp_data back to userspace only
>   happens here, and any changes in seccomp_data size will just be seen
>   as allowing a filter to query further into it.
Will we ever grow seccomp_data?

I suggest we throw away the _SIZES api, and just introduce RECV2, which sends
back a known, fixed format, and deprecate these dynamically sized uapi
shenanigans.

(Queue RECV3, etc..)

Maybe we do something like perf_event_open, where there's a read_format,
and that's used by the user to determine how big of a response / fields they
want to get?

>
> - should GET_SIZES report "useful" size? (i.e. exclude padding?)
>
> > diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
>
> Yay test updates! :)
>
> > +TEST(user_notification_groupleader)
>
> In my first pass of review I was going to say "can you please also check
> the sizes used by the ioctl?" But that triggered the above size checking
> mess in my mind.
>
> Let me look at this more closely on Monday, and I'll proposed something.
> :P
To summarize my set of ideas:
1. We take the ptrace-style API, where we have a request to get the tgid of
a given request ID (or any new / extensible field)
2. We add a perf_event_open style API, where you have to tell it what fields
to include in the response
3. We introduce RECV2 [through N]
4. We never extend seccomp_data, and just continue to append things to the API
5. We rev the API _once_ and unroll seccomp_data, and make it so that
new members have to be *asked for*, rather than are implicitly
included.
>
> Thanks!
>
> -Kees
>
> --
> Kees Cook
Aleksa Sarai May 17, 2020, 11:21 a.m.
On 2020-05-17, Christian Brauner <christian.brauner@ubuntu.com> wrote:
> Or... And that's more invasive but ultimately cleaner we v2 the whole
> thing so e.g. SECCOMP_IOCTL_NOTIF_RECV2, SECCOMP_IOCTL_NOTIF_SEND2, and
> embedd the size argument in the structs. Userspace sets the size
> argument, we use get_user() to get the size first and then
> copy_struct_from_user() to handle it cleanly based on that. A similar
> model as with sched (has other unrelated quirks because they messed up
> something too):
> 
> static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
> {
> 	u32 size;
> 	int ret;
> 
> 	/* Zero the full structure, so that a short copy will be nice: */
> 	memset(attr, 0, sizeof(*attr));
> 
> 	ret = get_user(size, &uattr->size);
> 	if (ret)
> 		return ret;
> 
> 	/* ABI compatibility quirk: */
> 	if (!size)
> 		size = SCHED_ATTR_SIZE_VER0;
> 	if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
> 		goto err_size;
> 
> 	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
> 	if (ret) {
> 		if (ret == -E2BIG)
> 			goto err_size;
> 		return ret;
> 	}
> 
> We're probably the biggest user of this right now and I'd be ok with
> that change. If it's a v2 than whatever. :)

I'm :+1: on a new version and switch to copy_struct_from_user(). I was a
little surprised when I found out that user_notif doesn't do it this
way a while ago (and although in theory it is userspace's fault, ideally
we could have an API that doesn't have built-in footguns).
Tycho Andersen May 17, 2020, 2:23 p.m.
On Sun, May 17, 2020 at 09:21:56PM +1000, Aleksa Sarai wrote:
> On 2020-05-17, Christian Brauner <christian.brauner@ubuntu.com> wrote:
> > Or... And that's more invasive but ultimately cleaner we v2 the whole
> > thing so e.g. SECCOMP_IOCTL_NOTIF_RECV2, SECCOMP_IOCTL_NOTIF_SEND2, and
> > embedd the size argument in the structs. Userspace sets the size
> > argument, we use get_user() to get the size first and then
> > copy_struct_from_user() to handle it cleanly based on that. A similar
> > model as with sched (has other unrelated quirks because they messed up
> > something too):
> > 
> > static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
> > {
> > 	u32 size;
> > 	int ret;
> > 
> > 	/* Zero the full structure, so that a short copy will be nice: */
> > 	memset(attr, 0, sizeof(*attr));
> > 
> > 	ret = get_user(size, &uattr->size);
> > 	if (ret)
> > 		return ret;
> > 
> > 	/* ABI compatibility quirk: */
> > 	if (!size)
> > 		size = SCHED_ATTR_SIZE_VER0;
> > 	if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
> > 		goto err_size;
> > 
> > 	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
> > 	if (ret) {
> > 		if (ret == -E2BIG)
> > 			goto err_size;
> > 		return ret;
> > 	}
> > 
> > We're probably the biggest user of this right now and I'd be ok with
> > that change. If it's a v2 than whatever. :)
> 
> I'm :+1: on a new version and switch to copy_struct_from_user(). I was a
> little surprised when I found out that user_notif doesn't do it this
> way a while ago (and although in theory it is userspace's fault, ideally
> we could have an API that doesn't have built-in footguns).

But I thought the whole point was that we couldn't do that, because
there's two things that can vary in length (struct seccomp_notif and
struct seccomp_data)?

https://lore.kernel.org/lkml/CAGXu5j+ZPxu6egE1fEr+N9+zLx3N+SJ_vbS_zzj9_hrdWrrrWQ@mail.gmail.com/

Tycho
Christian Brauner May 17, 2020, 2:33 p.m.
On Sun, May 17, 2020 at 08:23:16AM -0600, Tycho Andersen wrote:
> On Sun, May 17, 2020 at 09:21:56PM +1000, Aleksa Sarai wrote:
> > On 2020-05-17, Christian Brauner <christian.brauner@ubuntu.com> wrote:
> > > Or... And that's more invasive but ultimately cleaner we v2 the whole
> > > thing so e.g. SECCOMP_IOCTL_NOTIF_RECV2, SECCOMP_IOCTL_NOTIF_SEND2, and
> > > embedd the size argument in the structs. Userspace sets the size
> > > argument, we use get_user() to get the size first and then
> > > copy_struct_from_user() to handle it cleanly based on that. A similar
> > > model as with sched (has other unrelated quirks because they messed up
> > > something too):
> > > 
> > > static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
> > > {
> > > 	u32 size;
> > > 	int ret;
> > > 
> > > 	/* Zero the full structure, so that a short copy will be nice: */
> > > 	memset(attr, 0, sizeof(*attr));
> > > 
> > > 	ret = get_user(size, &uattr->size);
> > > 	if (ret)
> > > 		return ret;
> > > 
> > > 	/* ABI compatibility quirk: */
> > > 	if (!size)
> > > 		size = SCHED_ATTR_SIZE_VER0;
> > > 	if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
> > > 		goto err_size;
> > > 
> > > 	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
> > > 	if (ret) {
> > > 		if (ret == -E2BIG)
> > > 			goto err_size;
> > > 		return ret;
> > > 	}
> > > 
> > > We're probably the biggest user of this right now and I'd be ok with
> > > that change. If it's a v2 than whatever. :)
> > 
> > I'm :+1: on a new version and switch to copy_struct_from_user(). I was a
> > little surprised when I found out that user_notif doesn't do it this
> > way a while ago (and although in theory it is userspace's fault, ideally
> > we could have an API that doesn't have built-in footguns).
> 
> But I thought the whole point was that we couldn't do that, because
> there's two things that can vary in length (struct seccomp_notif and
> struct seccomp_data)?

I may have missed that discussion you linked.
But why wouldn't:

struct seccomp_notif2 {
	__u32 notif_size;
	__u64 id;
	__u32 pid;
	__u32 flags;
	struct seccomp_data data;
	__u32 data_size;
};

struct seccomp_notif_resp2 {
	__u32 notif_resp_size;
	__u64 id;
	__s64 val;
	__s32 error;
	__u32 flags;
};

work?
Christian Brauner May 17, 2020, 2:35 p.m.
On Sun, May 17, 2020 at 04:33:11PM +0200, Christian Brauner wrote:
> On Sun, May 17, 2020 at 08:23:16AM -0600, Tycho Andersen wrote:
> > On Sun, May 17, 2020 at 09:21:56PM +1000, Aleksa Sarai wrote:
> > > On 2020-05-17, Christian Brauner <christian.brauner@ubuntu.com> wrote:
> > > > Or... And that's more invasive but ultimately cleaner we v2 the whole
> > > > thing so e.g. SECCOMP_IOCTL_NOTIF_RECV2, SECCOMP_IOCTL_NOTIF_SEND2, and
> > > > embedd the size argument in the structs. Userspace sets the size
> > > > argument, we use get_user() to get the size first and then
> > > > copy_struct_from_user() to handle it cleanly based on that. A similar
> > > > model as with sched (has other unrelated quirks because they messed up
> > > > something too):
> > > > 
> > > > static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
> > > > {
> > > > 	u32 size;
> > > > 	int ret;
> > > > 
> > > > 	/* Zero the full structure, so that a short copy will be nice: */
> > > > 	memset(attr, 0, sizeof(*attr));
> > > > 
> > > > 	ret = get_user(size, &uattr->size);
> > > > 	if (ret)
> > > > 		return ret;
> > > > 
> > > > 	/* ABI compatibility quirk: */
> > > > 	if (!size)
> > > > 		size = SCHED_ATTR_SIZE_VER0;
> > > > 	if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
> > > > 		goto err_size;
> > > > 
> > > > 	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
> > > > 	if (ret) {
> > > > 		if (ret == -E2BIG)
> > > > 			goto err_size;
> > > > 		return ret;
> > > > 	}
> > > > 
> > > > We're probably the biggest user of this right now and I'd be ok with
> > > > that change. If it's a v2 than whatever. :)
> > > 
> > > I'm :+1: on a new version and switch to copy_struct_from_user(). I was a
> > > little surprised when I found out that user_notif doesn't do it this
> > > way a while ago (and although in theory it is userspace's fault, ideally
> > > we could have an API that doesn't have built-in footguns).
> > 
> > But I thought the whole point was that we couldn't do that, because
> > there's two things that can vary in length (struct seccomp_notif and
> > struct seccomp_data)?
> 
> I may have missed that discussion you linked.
> But why wouldn't:
> 
> struct seccomp_notif2 {
> 	__u32 notif_size;
> 	__u64 id;
> 	__u32 pid;
> 	__u32 flags;
> 	struct seccomp_data data;
> 	__u32 data_size;
> };
> 
> struct seccomp_notif_resp2 {
> 	__u32 notif_resp_size;
> 	__u64 id;
> 	__s64 val;
> 	__s32 error;
> 	__u32 flags;
> };

(Ignore the missing 32 bits here.)
Tycho Andersen May 17, 2020, 2:46 p.m.
On Sun, May 17, 2020 at 04:33:11PM +0200, Christian Brauner wrote:
> On Sun, May 17, 2020 at 08:23:16AM -0600, Tycho Andersen wrote:
> > On Sun, May 17, 2020 at 09:21:56PM +1000, Aleksa Sarai wrote:
> > > On 2020-05-17, Christian Brauner <christian.brauner@ubuntu.com> wrote:
> > > > Or... And that's more invasive but ultimately cleaner we v2 the whole
> > > > thing so e.g. SECCOMP_IOCTL_NOTIF_RECV2, SECCOMP_IOCTL_NOTIF_SEND2, and
> > > > embedd the size argument in the structs. Userspace sets the size
> > > > argument, we use get_user() to get the size first and then
> > > > copy_struct_from_user() to handle it cleanly based on that. A similar
> > > > model as with sched (has other unrelated quirks because they messed up
> > > > something too):
> > > > 
> > > > static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
> > > > {
> > > > 	u32 size;
> > > > 	int ret;
> > > > 
> > > > 	/* Zero the full structure, so that a short copy will be nice: */
> > > > 	memset(attr, 0, sizeof(*attr));
> > > > 
> > > > 	ret = get_user(size, &uattr->size);
> > > > 	if (ret)
> > > > 		return ret;
> > > > 
> > > > 	/* ABI compatibility quirk: */
> > > > 	if (!size)
> > > > 		size = SCHED_ATTR_SIZE_VER0;
> > > > 	if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
> > > > 		goto err_size;
> > > > 
> > > > 	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
> > > > 	if (ret) {
> > > > 		if (ret == -E2BIG)
> > > > 			goto err_size;
> > > > 		return ret;
> > > > 	}
> > > > 
> > > > We're probably the biggest user of this right now and I'd be ok with
> > > > that change. If it's a v2 than whatever. :)
> > > 
> > > I'm :+1: on a new version and switch to copy_struct_from_user(). I was a
> > > little surprised when I found out that user_notif doesn't do it this
> > > way a while ago (and although in theory it is userspace's fault, ideally
> > > we could have an API that doesn't have built-in footguns).
> > 
> > But I thought the whole point was that we couldn't do that, because
> > there's two things that can vary in length (struct seccomp_notif and
> > struct seccomp_data)?
> 
> I may have missed that discussion you linked.
> But why wouldn't:
> 
> struct seccomp_notif2 {
> 	__u32 notif_size;
> 	__u64 id;
> 	__u32 pid;
> 	__u32 flags;
> 	struct seccomp_data data;
> 	__u32 data_size;
> };

I guess you need to put data_size before data, otherwise old userspace
with a smaller struct seccomp_data will look in the wrong place.

But yes, that'll work if you put two sizes in, which is probably
reasonable since we're talking about two structs.

Tycho
Tycho Andersen May 17, 2020, 3:02 p.m.
On Sun, May 17, 2020 at 08:46:03AM -0600, Tycho Andersen wrote:
> On Sun, May 17, 2020 at 04:33:11PM +0200, Christian Brauner wrote:
> > struct seccomp_notif2 {
> > 	__u32 notif_size;
> > 	__u64 id;
> > 	__u32 pid;
> > 	__u32 flags;
> > 	struct seccomp_data data;
> > 	__u32 data_size;
> > };
> 
> I guess you need to put data_size before data, otherwise old userspace
> with a smaller struct seccomp_data will look in the wrong place.
> 
> But yes, that'll work if you put two sizes in, which is probably
> reasonable since we're talking about two structs.

Well, no, it doesn't either. Suppose we add a new field first to
struct seccomp_notif2:

struct seccomp_notif2 {
    __u32 notif_size;
    __u64 id;
    __u32 pid;
    __u32 flags;
    struct seccomp_data data;
    __u32 data_size;
    __u32 new_field;
};

And next we add a new field to struct secccomp_data. When a userspace
compiled with just the new seccomp_notif2 field does:

seccomp_notif2.new_field = ...;

the compiler will put it in the wrong place for the kernel with the
new seccomp_data field too.

Sort of feels like we should do:

struct seccomp_notif2 {
    struct seccomp_notif *notif;
    struct seccomp_data *data;
};

?

Tycho
Kees Cook May 17, 2020, 9:30 p.m.
On Sun, May 17, 2020 at 09:02:15AM -0600, Tycho Andersen wrote:
> On Sun, May 17, 2020 at 08:46:03AM -0600, Tycho Andersen wrote:
> > On Sun, May 17, 2020 at 04:33:11PM +0200, Christian Brauner wrote:
> > > struct seccomp_notif2 {
> > > 	__u32 notif_size;
> > > 	__u64 id;
> > > 	__u32 pid;
> > > 	__u32 flags;
> > > 	struct seccomp_data data;
> > > 	__u32 data_size;
> > > };
> > 
> > I guess you need to put data_size before data, otherwise old userspace
> > with a smaller struct seccomp_data will look in the wrong place.
> > 
> > But yes, that'll work if you put two sizes in, which is probably
> > reasonable since we're talking about two structs.
> 
> Well, no, it doesn't either. Suppose we add a new field first to
> struct seccomp_notif2:
> 
> struct seccomp_notif2 {
>     __u32 notif_size;
>     __u64 id;
>     __u32 pid;
>     __u32 flags;
>     struct seccomp_data data;
>     __u32 data_size;
>     __u32 new_field;
> };
> 
> And next we add a new field to struct secccomp_data. When a userspace
> compiled with just the new seccomp_notif2 field does:
> 
> seccomp_notif2.new_field = ...;
> 
> the compiler will put it in the wrong place for the kernel with the
> new seccomp_data field too.
> 
> Sort of feels like we should do:
> 
> struct seccomp_notif2 {
>     struct seccomp_notif *notif;
>     struct seccomp_data *data;
> };

I'm going read this thread more carefully tomorrow, but I just wanted to
mention that I'd *like* to extend seccomp_data for doing deep argument
inspection of the new syscalls. I think it's the least bad of many
designs, and I'll write that up in more detail. (I would *really* like
to avoid extending seccomp's BPF language, and instead allow probing
into the struct copied from userspace, etc.)

Anyway, it's very related to this, so, yeah, probably we need a v2 of the
notif API, but I'll try to get all the ideas here collected in one place.
Christian Brauner May 18, 2020, 12:05 p.m.
On Sun, May 17, 2020 at 02:30:57PM -0700, Kees Cook wrote:
> On Sun, May 17, 2020 at 09:02:15AM -0600, Tycho Andersen wrote:
> > On Sun, May 17, 2020 at 08:46:03AM -0600, Tycho Andersen wrote:
> > > On Sun, May 17, 2020 at 04:33:11PM +0200, Christian Brauner wrote:
> > > > struct seccomp_notif2 {
> > > > 	__u32 notif_size;
> > > > 	__u64 id;
> > > > 	__u32 pid;
> > > > 	__u32 flags;
> > > > 	struct seccomp_data data;
> > > > 	__u32 data_size;
> > > > };
> > > 
> > > I guess you need to put data_size before data, otherwise old userspace
> > > with a smaller struct seccomp_data will look in the wrong place.
> > > 
> > > But yes, that'll work if you put two sizes in, which is probably
> > > reasonable since we're talking about two structs.
> > 
> > Well, no, it doesn't either. Suppose we add a new field first to
> > struct seccomp_notif2:
> > 
> > struct seccomp_notif2 {
> >     __u32 notif_size;
> >     __u64 id;
> >     __u32 pid;
> >     __u32 flags;
> >     struct seccomp_data data;
> >     __u32 data_size;
> >     __u32 new_field;
> > };
> > 
> > And next we add a new field to struct secccomp_data. When a userspace
> > compiled with just the new seccomp_notif2 field does:
> > 
> > seccomp_notif2.new_field = ...;
> > 
> > the compiler will put it in the wrong place for the kernel with the
> > new seccomp_data field too.
> > 
> > Sort of feels like we should do:
> > 
> > struct seccomp_notif2 {
> >     struct seccomp_notif *notif;
> >     struct seccomp_data *data;
> > };
> 
> I'm going read this thread more carefully tomorrow, but I just wanted to
> mention that I'd *like* to extend seccomp_data for doing deep argument
> inspection of the new syscalls. I think it's the least bad of many
> designs, and I'll write that up in more detail. (I would *really* like
> to avoid extending seccomp's BPF language, and instead allow probing
> into the struct copied from userspace, etc.)

It's great to hear that you're on this. I haven't had time to work on
this since our kernel summit session. :/
And so far, I really like what I hear. I had the same general thought
that not extending seccomp's bpf is what we want. And to stress this
again before the mails come flooding in: we really need this in seccomp
itself not in any current or future LSM. :)

> 
> Anyway, it's very related to this, so, yeah, probably we need a v2 of the
> notif API, but I'll try to get all the ideas here collected in one place.

Cool, I was kinda worried that people would think that's a crazy idea
but I really think we're better off with a redesign. And I think that's
totally ok and cleaner than hacking around it.

Christian
Christian Brauner May 18, 2020, 12:53 p.m.
On Sun, May 17, 2020 at 09:02:15AM -0600, Tycho Andersen wrote:
> On Sun, May 17, 2020 at 08:46:03AM -0600, Tycho Andersen wrote:
> > On Sun, May 17, 2020 at 04:33:11PM +0200, Christian Brauner wrote:
> > > struct seccomp_notif2 {
> > > 	__u32 notif_size;
> > > 	__u64 id;
> > > 	__u32 pid;
> > > 	__u32 flags;
> > > 	struct seccomp_data data;
> > > 	__u32 data_size;
> > > };
> > 
> > I guess you need to put data_size before data, otherwise old userspace
> > with a smaller struct seccomp_data will look in the wrong place.
> > 
> > But yes, that'll work if you put two sizes in, which is probably
> > reasonable since we're talking about two structs.
> 
> Well, no, it doesn't either. Suppose we add a new field first to
> struct seccomp_notif2:
> 
> struct seccomp_notif2 {
>     __u32 notif_size;
>     __u64 id;
>     __u32 pid;
>     __u32 flags;
>     struct seccomp_data data;
>     __u32 data_size;
>     __u32 new_field;
> };
> 
> And next we add a new field to struct secccomp_data. When a userspace
> compiled with just the new seccomp_notif2 field does:
> 
> seccomp_notif2.new_field = ...;
> 
> the compiler will put it in the wrong place for the kernel with the
> new seccomp_data field too.
> 
> Sort of feels like we should do:
> 
> struct seccomp_notif2 {
>     struct seccomp_notif *notif;
>     struct seccomp_data *data;
> };
> 
> ?

Oh yes of course, sorry that was my stupid typo. I meant:

struct seccomp_notif2 {
    __u32 notif_size;
    __u64 id;
    __u32 pid;
    __u32 flags;
    struct seccomp_data *data;
    __u32 data_size;
    __u32 new_field;
}

at which point things should just work imho. This is similar to how the
set_tid array works. The kernel doesn't need to allocate any more too.
The kernel can just always use the currently know seccomp_data size.
If the kernel supports _less_ than what the caller expects, it can
report the supported size in data_size to userspace returning EINVAL. If
it supports more then it can just copy the known fields, I guess.

This way we don't need to add yet another ioctl...

Christian
Tycho Andersen May 18, 2020, 1:20 p.m.
On Mon, May 18, 2020 at 02:53:25PM +0200, Christian Brauner wrote:
> On Sun, May 17, 2020 at 09:02:15AM -0600, Tycho Andersen wrote:
> > On Sun, May 17, 2020 at 08:46:03AM -0600, Tycho Andersen wrote:
> > > On Sun, May 17, 2020 at 04:33:11PM +0200, Christian Brauner wrote:
> > > > struct seccomp_notif2 {
> > > > 	__u32 notif_size;
> > > > 	__u64 id;
> > > > 	__u32 pid;
> > > > 	__u32 flags;
> > > > 	struct seccomp_data data;
> > > > 	__u32 data_size;
> > > > };
> > > 
> > > I guess you need to put data_size before data, otherwise old userspace
> > > with a smaller struct seccomp_data will look in the wrong place.
> > > 
> > > But yes, that'll work if you put two sizes in, which is probably
> > > reasonable since we're talking about two structs.
> > 
> > Well, no, it doesn't either. Suppose we add a new field first to
> > struct seccomp_notif2:
> > 
> > struct seccomp_notif2 {
> >     __u32 notif_size;
> >     __u64 id;
> >     __u32 pid;
> >     __u32 flags;
> >     struct seccomp_data data;
> >     __u32 data_size;
> >     __u32 new_field;
> > };
> > 
> > And next we add a new field to struct secccomp_data. When a userspace
> > compiled with just the new seccomp_notif2 field does:
> > 
> > seccomp_notif2.new_field = ...;
> > 
> > the compiler will put it in the wrong place for the kernel with the
> > new seccomp_data field too.
> > 
> > Sort of feels like we should do:
> > 
> > struct seccomp_notif2 {
> >     struct seccomp_notif *notif;
> >     struct seccomp_data *data;
> > };
> > 
> > ?
> 
> Oh yes of course, sorry that was my stupid typo. I meant:
> 
> struct seccomp_notif2 {
>     __u32 notif_size;
>     __u64 id;
>     __u32 pid;
>     __u32 flags;
>     struct seccomp_data *data;
>     __u32 data_size;
>     __u32 new_field;
> }
> 
> at which point things should just work imho.

Are you saying that data_size is an input? Because I don't think they
Just Work otherwise.

Tycho
Kees Cook May 18, 2020, 9:10 p.m.
On Sun, May 17, 2020 at 02:30:57PM -0700, Kees Cook wrote:
> Anyway, it's very related to this, so, yeah, probably we need a v2 of the
> notif API, but I'll try to get all the ideas here collected in one place.

For future thread arch├Žologists, I put my thoughts here:
https://lore.kernel.org/lkml/202005181120.971232B7B@keescook/
Sargun Dhillon May 18, 2020, 9:37 p.m.
On Mon, May 18, 2020 at 02:53:25PM +0200, Christian Brauner wrote:
> On Sun, May 17, 2020 at 09:02:15AM -0600, Tycho Andersen wrote:
> > On Sun, May 17, 2020 at 08:46:03AM -0600, Tycho Andersen wrote:
> > > On Sun, May 17, 2020 at 04:33:11PM +0200, Christian Brauner wrote:
> > > > struct seccomp_notif2 {
> > > > 	__u32 notif_size;
> > > > 	__u64 id;
> > > > 	__u32 pid;
> > > > 	__u32 flags;
> > > > 	struct seccomp_data data;
> > > > 	__u32 data_size;
> > > > };
> > > 
> > > I guess you need to put data_size before data, otherwise old userspace
> > > with a smaller struct seccomp_data will look in the wrong place.
> > > 
> > > But yes, that'll work if you put two sizes in, which is probably
> > > reasonable since we're talking about two structs.
> > 
> > Well, no, it doesn't either. Suppose we add a new field first to
> > struct seccomp_notif2:
> > 
> > struct seccomp_notif2 {
> >     __u32 notif_size;
> >     __u64 id;
> >     __u32 pid;
> >     __u32 flags;
> >     struct seccomp_data data;
> >     __u32 data_size;
> >     __u32 new_field;
> > };
> > 
> > And next we add a new field to struct secccomp_data. When a userspace
> > compiled with just the new seccomp_notif2 field does:
> > 
> > seccomp_notif2.new_field = ...;
> > 
> > the compiler will put it in the wrong place for the kernel with the
> > new seccomp_data field too.
> > 
> > Sort of feels like we should do:
> > 
> > struct seccomp_notif2 {
> >     struct seccomp_notif *notif;
> >     struct seccomp_data *data;
> > };
> > 
> > ?
> 
> Oh yes of course, sorry that was my stupid typo. I meant:
> 
> struct seccomp_notif2 {
>     __u32 notif_size;
>     __u64 id;
>     __u32 pid;
>     __u32 flags;
>     struct seccomp_data *data;
>     __u32 data_size;
>     __u32 new_field;
> }
One big difference in the approach I described is that the user gets to ask
for specific fields, and can configure the listener upfront, versus having to
do the dance of fetching the sizes, and dynamically allocating memory.

This way userspace can just do on-stack static allocations. We can get
rid of the kbuf bits in my PR, if we incrementally fill up the userspace buffer
(copy_to_user shouldn't be *that* costly).

In addition, we're not copying a bunch of unnecessary data, or calculating
values that the user may not be interested in. This is particularly valuable
if we ever want to do things like passing optional fields (think PIDs) back.

Code-wise, it looks something like:
struct read_output_format {
	__u64 id;
	__u32 tgid;
	struct seccomp_data data;
} __packed;

TEST(user_notification_read)
{
	long ret;
	int status, pid, listener, read_size;
	struct seccomp_notif_config config = {};
	struct seccomp_notif_resp resp = {};
	struct read_output_format buf;

	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	ASSERT_EQ(0, ret) {
		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
	}

	listener = user_trap_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
	ASSERT_GE(listener, 0);

	EXPECT_EQ(read(listener, &buf, sizeof(buf)), -1) {
		EXPECT_EQ(errno, -EINVAL);
	}

	config.size = sizeof(config);
	config.seccomp_data_size = sizeof(struct seccomp_data);
	config.optional_fields = ~(0);
	/* Make sure invalid fields are not accepted */
	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_CONFIG, &config), -1);

	config.optional_fields = SECCOMP_NOTIF_FIELD_TGID;
	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_CONFIG, &config), sizeof(buf));

	pid = fork();
	ASSERT_GE(pid, 0);
	if (pid == 0)
		exit(syscall(__NR_dup, 42, 1, 1, 1) != USER_NOTIF_MAGIC);


	/* Passing a smaller value in should fail */
	EXPECT_EQ(read(listener, &buf, read_size - 1), -1) {
		EXPECT_EQ(errno, -E2BIG);
	}
	/* Passing a larger value in should succeed */
	ASSERT_EQ(read(listener, &buf, 200), sizeof(buf));
	EXPECT_EQ(buf.tgid, pid);
	EXPECT_EQ(buf.data.args[0], 42);
	EXPECT_EQ(buf.data.nr, __NR_dup);

	resp.id = buf.id;
	resp.error = 0;
	resp.val = USER_NOTIF_MAGIC;

	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);

	EXPECT_EQ(waitpid(pid, &status, 0), pid);
	EXPECT_EQ(true, WIFEXITED(status));
	EXPECT_EQ(0, WEXITSTATUS(status));
}

> 
> at which point things should just work imho. This is similar to how the
> set_tid array works. The kernel doesn't need to allocate any more too.
> The kernel can just always use the currently know seccomp_data size.
> If the kernel supports _less_ than what the caller expects, it can
> report the supported size in data_size to userspace returning EINVAL. If
> it supports more then it can just copy the known fields, I guess.
> 
> This way we don't need to add yet another ioctl...
> 
> Christian
Eric W. Biederman May 18, 2020, 11:08 p.m.
Sargun Dhillon <sargun@sargun.me> writes:

> This includes the thread group leader ID in the seccomp_notif. This is
> immediately useful for opening up a pidfd for the group leader, as
> pidfds only work on group leaders.

The code looks fine (except for the name of the test), but can you
please talk and think about this as something other than the
group leader?

The initial thread in a thread group can die, and the tgid is still
valid for the entire group.  Because the initial thread of a
process/thread group can die (but rarely does) that tends to result in
kernel code that fails when thread_group_leader dies.

To remove that class of bugs I am slowy working to remove the
thread_group_leader from the kernel entirely.

Looking at the names of the fields in the structure it looks like
there is another class of bugs to be removed by renaming PIDTYPE_PID
to PIDTYPE_TID in the kernel as well.  Just skimming the example code
it looks very simple to get confused.

Is there any chance some can modify struct seccomp_notify to do
{
	...
        union {
		__u32 pid;
                __u32 tid;
	};
        ...
}

Just to reduce the chance of confusion between the userspace pid and the
in kernel pid names?

Eric
Sargun Dhillon May 22, 2020, 5:54 p.m.
On Mon, May 18, 2020 at 4:11 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
>
> Sargun Dhillon <sargun@sargun.me> writes:
>
> > This includes the thread group leader ID in the seccomp_notif. This is
> > immediately useful for opening up a pidfd for the group leader, as
> > pidfds only work on group leaders.
>
> The code looks fine (except for the name of the test), but can you
> please talk and think about this as something other than the
> group leader?
>
> The initial thread in a thread group can die, and the tgid is still
> valid for the entire group.  Because the initial thread of a
> process/thread group can die (but rarely does) that tends to result in
> kernel code that fails when thread_group_leader dies.
>
> To remove that class of bugs I am slowy working to remove the
> thread_group_leader from the kernel entirely.
>
> Looking at the names of the fields in the structure it looks like
> there is another class of bugs to be removed by renaming PIDTYPE_PID
> to PIDTYPE_TID in the kernel as well.  Just skimming the example code
> it looks very simple to get confused.
>
> Is there any chance some can modify struct seccomp_notify to do
> {
>         ...
>         union {
>                 __u32 pid;
>                 __u32 tid;
>         };
>         ...
> }
>
> Just to reduce the chance of confusion between the userspace pid and the
> in kernel pid names?
>
> Eric
Our use cases would be unaffected by this. I think this would be a wonderful
way to move forward, but I don't know if it could break userspace.

I believe Christian's team is the biggest user of this feature in OSS right now,
so he might know.

In addition, I'm not sure where you would want the thread's ID versus the
process's ID, unless you wanted to do something like SIGSTOP, and freeze
the thread to prevent it from making more progress, or being interrupted
while you go do notifier work.

Christian & Kees,
Thoughts?