[07/14] Emulate wait4 using waitid

Submitted by Stefan O'Rear on Sept. 3, 2020, 11:23 a.m.

Details

Message ID 20200903112309.102601-8-sorear@fastmail.com
State New
Series "riscv32 support"
Headers show

Commit Message

Stefan O'Rear Sept. 3, 2020, 11:23 a.m.
riscv32 and future architectures lack wait4.
---
 src/internal/wait4_waitid.h |  1 +
 src/linux/__wait4_waitid.c  | 52 +++++++++++++++++++++++++++++++++++++
 src/linux/wait4.c           |  5 ++++
 src/process/waitpid.c       |  6 +++++
 src/stdio/pclose.c          |  6 +++++
 src/unistd/faccessat.c      |  6 ++++-
 6 files changed, 75 insertions(+), 1 deletion(-)
 create mode 100644 src/internal/wait4_waitid.h
 create mode 100644 src/linux/__wait4_waitid.c

Patch hide | download patch | download mbox

diff --git a/src/internal/wait4_waitid.h b/src/internal/wait4_waitid.h
new file mode 100644
index 00000000..0ef1b9b1
--- /dev/null
+++ b/src/internal/wait4_waitid.h
@@ -0,0 +1 @@ 
+hidden pid_t __wait4_waitid(pid_t pid, int *status, int options, void *kru, int cp);
diff --git a/src/linux/__wait4_waitid.c b/src/linux/__wait4_waitid.c
new file mode 100644
index 00000000..208f1631
--- /dev/null
+++ b/src/linux/__wait4_waitid.c
@@ -0,0 +1,52 @@ 
+#include <sys/wait.h>
+#include "syscall.h"
+#include "wait4_waitid.h"
+
+hidden pid_t __wait4_waitid(pid_t pid, int *status, int options, void *kru, int cp)
+{
+	idtype_t t;
+	int r;
+	siginfo_t info;
+
+	info.si_pid = 0;
+	if (pid < -1) {
+		t = P_PGID;
+		pid = -pid;
+	} else if (pid == -1) {
+		t = P_ALL;
+	} else if (pid == 0) {
+		t = P_PGID;
+	} else {
+		t = P_PID;
+	}
+
+	if (cp) r = __syscall_cp(SYS_waitid, t, pid, &info, options|WEXITED, kru);
+	else r = __syscall(SYS_waitid, t, pid, &info, options|WEXITED, kru);
+
+	if (r<0) return r;
+
+	if (info.si_pid && status) {
+		int sw=0;
+		switch (info.si_code) {
+		case CLD_CONTINUED:
+			sw = 0xffff;
+			break;
+		case CLD_DUMPED:
+			sw = info.si_status&0x7f | 0x80;
+			break;
+		case CLD_EXITED:
+			sw = (info.si_status&0xff) << 8;
+			break;
+		case CLD_KILLED:
+			sw = info.si_status&0x7f;
+			break;
+		case CLD_STOPPED:
+		case CLD_TRAPPED:
+			sw = ((info.si_status&0xff) << 8) + 0x7f;
+			break;
+		}
+		*status = sw;
+	}
+
+	return info.si_pid;
+}
diff --git a/src/linux/wait4.c b/src/linux/wait4.c
index 83650e34..e4d3a053 100644
--- a/src/linux/wait4.c
+++ b/src/linux/wait4.c
@@ -4,6 +4,7 @@ 
 #include <string.h>
 #include <errno.h>
 #include "syscall.h"
+#include "wait4_waitid.h"
 
 pid_t wait4(pid_t pid, int *status, int options, struct rusage *ru)
 {
@@ -26,7 +27,11 @@  pid_t wait4(pid_t pid, int *status, int options, struct rusage *ru)
 	}
 #endif
 	char *dest = ru ? (char *)&ru->ru_maxrss - 4*sizeof(long) : 0;
+#ifdef SYS_wait4
 	r = __syscall(SYS_wait4, pid, status, options, dest);
+#else
+	r = __wait4_waitid(pid, status, options, dest, 0);
+#endif
 	if (r>0 && ru && sizeof(time_t) > sizeof(long)) {
 		long kru[4];
 		memcpy(kru, dest, 4*sizeof(long));
diff --git a/src/process/waitpid.c b/src/process/waitpid.c
index 1b65bf05..49668822 100644
--- a/src/process/waitpid.c
+++ b/src/process/waitpid.c
@@ -1,7 +1,13 @@ 
+#define _GNU_SOURCE
 #include <sys/wait.h>
 #include "syscall.h"
+#include "wait4_waitid.h"
 
 pid_t waitpid(pid_t pid, int *status, int options)
 {
+#ifdef SYS_wait4
 	return syscall_cp(SYS_wait4, pid, status, options, 0);
+#else
+	return __syscall_ret(__wait4_waitid(pid, status, options, 0, 1));
+#endif
 }
diff --git a/src/stdio/pclose.c b/src/stdio/pclose.c
index 080a4262..79e3d1d5 100644
--- a/src/stdio/pclose.c
+++ b/src/stdio/pclose.c
@@ -1,4 +1,6 @@ 
 #include "stdio_impl.h"
+#include <sys/wait.h>
+#include "wait4_waitid.h"
 #include <errno.h>
 #include <unistd.h>
 
@@ -7,7 +9,11 @@  int pclose(FILE *f)
 	int status, r;
 	pid_t pid = f->pipe_pid;
 	fclose(f);
+#ifdef SYS_wait4
 	while ((r=__syscall(SYS_wait4, pid, &status, 0, 0)) == -EINTR);
+#else
+	while ((r=__wait4_waitid(pid, &status, 0, 0, 0)) == -EINTR);
+#endif
 	if (r<0) return __syscall_ret(r);
 	return status;
 }
diff --git a/src/unistd/faccessat.c b/src/unistd/faccessat.c
index 76bbd4c7..8bf34995 100644
--- a/src/unistd/faccessat.c
+++ b/src/unistd/faccessat.c
@@ -34,7 +34,6 @@  int faccessat(int fd, const char *filename, int amode, int flag)
 	char stack[1024];
 	sigset_t set;
 	pid_t pid;
-	int status;
 	int ret, p[2];
 
 	if (pipe2(p, O_CLOEXEC)) return __syscall_ret(-EBUSY);
@@ -48,7 +47,12 @@  int faccessat(int fd, const char *filename, int amode, int flag)
 	if (pid<0 || __syscall(SYS_read, p[0], &ret, sizeof ret) != sizeof(ret))
 		ret = -EBUSY;
 	__syscall(SYS_close, p[0]);
+#ifdef SYS_wait4
+	int status;
 	__syscall(SYS_wait4, pid, &status, __WCLONE, 0);
+#else
+	__syscall(SYS_waitid, P_PID, pid, &(siginfo_t){0}, __WCLONE|WEXITED, 0);
+#endif
 
 	__restore_sigs(&set);
 

Comments

Stefan O'Rear Sept. 3, 2020, 2:56 p.m.
On Thu, Sep 3, 2020, at 7:23 AM, Stefan O'Rear wrote:
> +		case CLD_STOPPED:
> +		case CLD_TRAPPED:
> +			sw = ((info.si_status&0xff) << 8) + 0x7f;
> +			break;

This is trying to be defensive but it is the cause of the strace issue
in the cover letter since the ptrace interface generates si_status
greater than 8 bits which must be visible in WSTOPSIG; the v2 will not
mask here.
Arnd Bergmann Sept. 3, 2020, 3:36 p.m.
On Thu, Sep 3, 2020 at 4:56 PM Stefan O'Rear <sorear@fastmail.com> wrote:
>
> On Thu, Sep 3, 2020, at 7:23 AM, Stefan O'Rear wrote:
> > +             case CLD_STOPPED:
> > +             case CLD_TRAPPED:
> > +                     sw = ((info.si_status&0xff) << 8) + 0x7f;
> > +                     break;
>
> This is trying to be defensive but it is the cause of the strace issue
> in the cover letter since the ptrace interface generates si_status
> greater than 8 bits which must be visible in WSTOPSIG; the v2 will not
> mask here.

Ah, I was trying to find out what exactly the masking was for since
I did not have that in my original version of the same function for Arm:


+       if (status) {
+               *status = 0;
+               switch (info.si_code) {
+               case CLD_EXITED:
+                       *status = info.si_status << 8;
+                       break;
+               case CLD_DUMPED:
+                       *status = 0x80;
+               case CLD_KILLED:
+                       *status |= info.si_status;
+                       break;
+               case CLD_TRAPPED:
+               case CLD_STOPPED:
+                       *status = info.si_status << 8 | 0x7f;
+                       break;
+               case CLD_CONTINUED:
+                       *status = 0xffff;
+                       break;
+               }
+       }

Aside from the mask, this seems to be functionally the same.

      Arnd
Stefan O'Rear Sept. 3, 2020, 3:40 p.m.
On Thu, Sep 3, 2020, at 11:36 AM, Arnd Bergmann wrote:
> On Thu, Sep 3, 2020 at 4:56 PM Stefan O'Rear <sorear@fastmail.com> wrote:
> >
> > On Thu, Sep 3, 2020, at 7:23 AM, Stefan O'Rear wrote:
> > > +             case CLD_STOPPED:
> > > +             case CLD_TRAPPED:
> > > +                     sw = ((info.si_status&0xff) << 8) + 0x7f;
> > > +                     break;
> >
> > This is trying to be defensive but it is the cause of the strace issue
> > in the cover letter since the ptrace interface generates si_status
> > greater than 8 bits which must be visible in WSTOPSIG; the v2 will not
> > mask here.
> 
> Ah, I was trying to find out what exactly the masking was for since
> I did not have that in my original version of the same function for Arm:

Where is this coming from?  I didn't want to blindly copy code from glibc
and I couldn't find any documentation of how the mapping is supposed
to work other than the POSIX descriptions of the wait and waitid functions
(which is why I missed the non-POSIX ptrace cases).

If there's an authoritative description of the mapping I would like to match
it exactly.
 
-s
Rich Felker Sept. 3, 2020, 3:49 p.m.
On Thu, Sep 03, 2020 at 07:23:02AM -0400, Stefan O'Rear wrote:
> riscv32 and future architectures lack wait4.
> ---
>  src/internal/wait4_waitid.h |  1 +
>  src/linux/__wait4_waitid.c  | 52 +++++++++++++++++++++++++++++++++++++
>  src/linux/wait4.c           |  5 ++++
>  src/process/waitpid.c       |  6 +++++
>  src/stdio/pclose.c          |  6 +++++
>  src/unistd/faccessat.c      |  6 ++++-
>  6 files changed, 75 insertions(+), 1 deletion(-)
>  create mode 100644 src/internal/wait4_waitid.h
>  create mode 100644 src/linux/__wait4_waitid.c

I really don't like introducing a new src/internal file for this. If
the code needs to be shared it should probably just be in wait4.c,
renamed to __wait4, declared in src/include/sys/wait.h according to
the usual pattern for exposing namespace-safe versions of
namespace-violating functions.

Ideally I'd like to just not need it, but I'm not sure that's
possible:

- faccessat throws away the status and doesn't have any need for the
  status or idtype emulation code. It could easily be just different
  syscalls in #ifdef/#else paths.

- pclose does need status but not the idtype part. but it's only
  listening for process termination not all the other weird stuff. I'm
  not sure if it would make sense to construct status inline here. It
  might if we had a macro like the glibc "inverse-W" macro (I forget
  its name) to construct a wait status from parts; if so this could
  later be considered for a public API (previously requested).

- waitpid and wait4 at least need the idtype and status construction.

Unfortunately, wait4 also needs the rusage conversion, which waitpid
doesn't want. This kinda defeats my idea of just exposing __wait4 from
wait4.c.

A side observation to keep in mind is that passing the argument cp
doesn't really help optimize anything; it only worked well in
c2feda4e2e because the function there is inline. Of the 4 callers you
have, faccessat and pclose have hard requirements not to be a
cancellation point and waitpid has a hard requirement to be one. But
if the function weren't used for the former, it could just always be
cancellable -- wait4 probably should have been cancellable all along,
and almost surely is on other implementations, so to use it safely
you'd have to not use, block, or handle cancellation.

So I'm leaning towards sticking with something like what you've done,
but with declaration moved to src/include/sys/wait.h, or possibly
src/internal/syscall.h (since it's essentially emulation of a
syscall). Inline in src/internal/syscall.h might also be an option;
one appealing aspect of that is that it would get rid of the #ifdefs
in source files and allow calling __sys_wait4() unconditionally with
no codegen regression on existing archs. This is analogous to
__sys_open[23].

Rich
Stefan O'Rear Sept. 3, 2020, 4:25 p.m.
On Thu, Sep 3, 2020, at 11:49 AM, Rich Felker wrote:
> On Thu, Sep 03, 2020 at 07:23:02AM -0400, Stefan O'Rear wrote:
> > riscv32 and future architectures lack wait4.
> > ---
> >  src/internal/wait4_waitid.h |  1 +
> >  src/linux/__wait4_waitid.c  | 52 +++++++++++++++++++++++++++++++++++++
> >  src/linux/wait4.c           |  5 ++++
> >  src/process/waitpid.c       |  6 +++++
> >  src/stdio/pclose.c          |  6 +++++
> >  src/unistd/faccessat.c      |  6 ++++-
> >  6 files changed, 75 insertions(+), 1 deletion(-)
> >  create mode 100644 src/internal/wait4_waitid.h
> >  create mode 100644 src/linux/__wait4_waitid.c
> 
> I really don't like introducing a new src/internal file for this. If

I agree.

> the code needs to be shared it should probably just be in wait4.c,
> renamed to __wait4, declared in src/include/sys/wait.h according to
> the usual pattern for exposing namespace-safe versions of
> namespace-violating functions.

Currently waitpid and waitid are cancellation points (required by POSIX)
but wait4 is not, which is why I did not have anything else call a function
with exactly wait4 semantics.

> Ideally I'd like to just not need it, but I'm not sure that's
> possible:
> 
> - faccessat throws away the status and doesn't have any need for the
>   status or idtype emulation code. It could easily be just different
>   syscalls in #ifdef/#else paths.

I think that is exactly what my patch does?

> - pclose does need status but not the idtype part. but it's only
>   listening for process termination not all the other weird stuff. I'm
>   not sure if it would make sense to construct status inline here. It
>   might if we had a macro like the glibc "inverse-W" macro (I forget
>   its name) to construct a wait status from parts; if so this could
>   later be considered for a public API (previously requested).

Earlier I had the wait status conversion code broken out separately
but it seemed not worth it for just pclose.

> - waitpid and wait4 at least need the idtype and status construction.
> 
> Unfortunately, wait4 also needs the rusage conversion, which waitpid
> doesn't want. This kinda defeats my idea of just exposing __wait4 from
> wait4.c.
> 
> A side observation to keep in mind is that passing the argument cp
> doesn't really help optimize anything; it only worked well in
> c2feda4e2e because the function there is inline. Of the 4 callers you
> have, faccessat and pclose have hard requirements not to be a
> cancellation point and waitpid has a hard requirement to be one. But

pclose has a hard requirement to not be a cancellation point but it also
has "If a thread is canceled during execution of pclose(), the behavior
is undefined."  The implications of the combination are confusing.

> if the function weren't used for the former, it could just always be
> cancellable -- wait4 probably should have been cancellable all along,
> and almost surely is on other implementations, so to use it safely
> you'd have to not use, block, or handle cancellation.

I have no immediate objection to making wait4 a cancellation point but
I would prefer to make that change consistently on all architectures.
I don't have a good sense of who uses cancellation and wait4 and what
compatibility constraints are imposed by code (as opposed to standards).

> So I'm leaning towards sticking with something like what you've done,
> but with declaration moved to src/include/sys/wait.h, or possibly
> src/internal/syscall.h (since it's essentially emulation of a

I'd be very happy to not have a one-line header file.  I looked at syscall.h
earlier but wasn't sure if it fit; I did not consider reserved namespace in
a public header (would this preclude use of hidden?).

Is src/linux/__exact_name_of_function.c the appropriate name for the file?

> syscall). Inline in src/internal/syscall.h might also be an option;
> one appealing aspect of that is that it would get rid of the #ifdefs
> in source files and allow calling __sys_wait4() unconditionally with
> no codegen regression on existing archs. This is analogous to
> __sys_open[23].
> 
> Rich

-s
Rich Felker Sept. 3, 2020, 4:38 p.m.
On Thu, Sep 03, 2020 at 12:25:19PM -0400, Stefan O'Rear wrote:
> 
> 
> On Thu, Sep 3, 2020, at 11:49 AM, Rich Felker wrote:
> > On Thu, Sep 03, 2020 at 07:23:02AM -0400, Stefan O'Rear wrote:
> > > riscv32 and future architectures lack wait4.
> > > ---
> > >  src/internal/wait4_waitid.h |  1 +
> > >  src/linux/__wait4_waitid.c  | 52 +++++++++++++++++++++++++++++++++++++
> > >  src/linux/wait4.c           |  5 ++++
> > >  src/process/waitpid.c       |  6 +++++
> > >  src/stdio/pclose.c          |  6 +++++
> > >  src/unistd/faccessat.c      |  6 ++++-
> > >  6 files changed, 75 insertions(+), 1 deletion(-)
> > >  create mode 100644 src/internal/wait4_waitid.h
> > >  create mode 100644 src/linux/__wait4_waitid.c
> > 
> > I really don't like introducing a new src/internal file for this. If
> 
> I agree.
> 
> > the code needs to be shared it should probably just be in wait4.c,
> > renamed to __wait4, declared in src/include/sys/wait.h according to
> > the usual pattern for exposing namespace-safe versions of
> > namespace-violating functions.
> 
> Currently waitpid and waitid are cancellation points (required by POSIX)
> but wait4 is not, which is why I did not have anything else call a function
> with exactly wait4 semantics.
> 
> > Ideally I'd like to just not need it, but I'm not sure that's
> > possible:
> > 
> > - faccessat throws away the status and doesn't have any need for the
> >   status or idtype emulation code. It could easily be just different
> >   syscalls in #ifdef/#else paths.
> 
> I think that is exactly what my patch does?

Yes, sorry I missed that. I think I saw the other three and just
assumed this one was the same since it was part of the same patch.

> > - pclose does need status but not the idtype part. but it's only
> >   listening for process termination not all the other weird stuff. I'm
> >   not sure if it would make sense to construct status inline here. It
> >   might if we had a macro like the glibc "inverse-W" macro (I forget
> >   its name) to construct a wait status from parts; if so this could
> >   later be considered for a public API (previously requested).
> 
> Earlier I had the wait status conversion code broken out separately
> but it seemed not worth it for just pclose.
> 
> > - waitpid and wait4 at least need the idtype and status construction.
> > 
> > Unfortunately, wait4 also needs the rusage conversion, which waitpid
> > doesn't want. This kinda defeats my idea of just exposing __wait4 from
> > wait4.c.
> > 
> > A side observation to keep in mind is that passing the argument cp
> > doesn't really help optimize anything; it only worked well in
> > c2feda4e2e because the function there is inline. Of the 4 callers you
> > have, faccessat and pclose have hard requirements not to be a
> > cancellation point and waitpid has a hard requirement to be one. But
> 
> pclose has a hard requirement to not be a cancellation point but it also
> has "If a thread is canceled during execution of pclose(), the behavior
> is undefined."  The implications of the combination are confusing.

It's an optional cancellation point, but if cancelled the side effects
must be correct. Since the FILE and underlying fd have already been
closed at this point, it's too late for cancellation to be acted upon;
any thing you do would leave the process in an inconsistent state,
where the caller has to assume the FILE is still valid and thus
performs UAF/DF if it does anything with it.

Note that you can't swap the order of the fd close and wait because
the child very well may not exit until the pipe is closed.

So, the only valid way pclose could act on cancellation is before
taking any action. This is almost surely not what was intended in the
standard, since it's useles, but the conclusion here is just that
nobody thought about this stuff enough to realize that allowing pclose
to be a cancellation point was fundamentally wrong.

> > if the function weren't used for the former, it could just always be
> > cancellable -- wait4 probably should have been cancellable all along,
> > and almost surely is on other implementations, so to use it safely
> > you'd have to not use, block, or handle cancellation.
> 
> I have no immediate objection to making wait4 a cancellation point but
> I would prefer to make that change consistently on all architectures.
> I don't have a good sense of who uses cancellation and wait4 and what
> compatibility constraints are imposed by code (as opposed to standards).

Yes, it should be done consistently and as a separate patch if we go
this way.

> > So I'm leaning towards sticking with something like what you've done,
> > but with declaration moved to src/include/sys/wait.h, or possibly
> > src/internal/syscall.h (since it's essentially emulation of a
> 
> I'd be very happy to not have a one-line header file.  I looked at syscall.h
> earlier but wasn't sure if it fit; I did not consider reserved namespace in
> a public header (would this preclude use of hidden?).

src/include/* are not public headers, but wrappers around the public
headers to add namespace-protected versions of functions etc. However
since this is really not a "version of wait4" but an "emulation of
SYS_wait4", it probably makes more sense to put it in
internal/syscall.h, just like __sys_open[23]. That doesn't mean it has
to be inline, just that the declaration could go there, and a macro to
just use __syscall(SYS_wait4...) directly if it exists which would
keep some small amount of #ifdef out of source files.

> Is src/linux/__exact_name_of_function.c the appropriate name for the file?

If there's an external source file still, I'd put it in
src/process/__function_name.c or src/internal/... like you did. The
existing naming here is not entirely consistent, but generally,
src/subsystem/__internal_function.c is an internal function for
implementing that subsystem/family, not an internal functon used by
other subsystems that just happens to mostly-match a public function
in that subsystem.

src/internal is a kinda sloppy mix of things, but is mostly internal
functions that are shared between two or more subsystems.

Rich
Arnd Bergmann Sept. 3, 2020, 6:08 p.m.
On Thu, Sep 3, 2020 at 5:41 PM Stefan O'Rear <sorear@fastmail.com> wrote:
>
> On Thu, Sep 3, 2020, at 11:36 AM, Arnd Bergmann wrote:
> > On Thu, Sep 3, 2020 at 4:56 PM Stefan O'Rear <sorear@fastmail.com> wrote:
> > >
> > > On Thu, Sep 3, 2020, at 7:23 AM, Stefan O'Rear wrote:
> > > > +             case CLD_STOPPED:
> > > > +             case CLD_TRAPPED:
> > > > +                     sw = ((info.si_status&0xff) << 8) + 0x7f;
> > > > +                     break;
> > >
> > > This is trying to be defensive but it is the cause of the strace issue
> > > in the cover letter since the ptrace interface generates si_status
> > > greater than 8 bits which must be visible in WSTOPSIG; the v2 will not
> > > mask here.
> >
> > Ah, I was trying to find out what exactly the masking was for since
> > I did not have that in my original version of the same function for Arm:
>
> Where is this coming from?  I didn't want to blindly copy code from glibc
> and I couldn't find any documentation of how the mapping is supposed
> to work other than the POSIX descriptions of the wait and waitid functions
> (which is why I missed the non-POSIX ptrace cases).
>
> If there's an authoritative description of the mapping I would like to match
> it exactly.

I did this prototype implementation to test the syscall changes before
I submitted them into the mainline kernel:
https://git.linaro.org/people/arnd.bergmann/musl-y2038.git/

This was never meant to get merged, but I made sure that I could
get LTP to pass just using the new minimum set of syscalls.

       Arnd