[v2] glob: implement GLOB_TILDE

Submitted by Ismael Luceno on July 24, 2019, 9:33 p.m.

Details

Message ID 20190724213338.27138-1-ismael@iodev.co.uk
State New
Series "glob: implement GLOB_TILDE"
Headers show

Commit Message

Ismael Luceno July 24, 2019, 9:33 p.m.
Signed-off-by: Ismael Luceno <ismael@iodev.co.uk>
---
 include/glob.h   |  1 +
 src/regex/glob.c | 45 +++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 44 insertions(+), 2 deletions(-)

Patch hide | download patch | download mbox

diff --git a/include/glob.h b/include/glob.h
index 76f6c1c68a23..fc8106b20c5b 100644
--- a/include/glob.h
+++ b/include/glob.h
@@ -30,6 +30,7 @@  void globfree(glob_t *);
 #define GLOB_APPEND   0x20
 #define GLOB_NOESCAPE 0x40
 #define	GLOB_PERIOD   0x80
+#define GLOB_TILDE    0x100
 
 #define GLOB_NOSPACE 1
 #define GLOB_ABORTED 2
diff --git a/src/regex/glob.c b/src/regex/glob.c
index aa1c6a4482ee..2428d5f02c2e 100644
--- a/src/regex/glob.c
+++ b/src/regex/glob.c
@@ -4,10 +4,13 @@ 
 #include <sys/stat.h>
 #include <dirent.h>
 #include <limits.h>
-#include <string.h>
 #include <stdlib.h>
 #include <errno.h>
 #include <stddef.h>
+#include <unistd.h>
+#include <pwd.h>
+#define _GNU_SOURCE
+#include <string.h>
 
 struct match
 {
@@ -182,6 +185,39 @@  static int sort(const void *a, const void *b)
 	return strcmp(*(const char **)a, *(const char **)b);
 }
 
+static int expand_tilde(char **pat, char *buf, size_t *pos)
+{
+	char *p = *pat + 1;
+	size_t i = 0;
+
+	char *name_end = strchrnul(p, '/');
+	if (*name_end) *name_end++ = 0;
+	*pat = name_end;
+
+	char *home = (*p || issetugid()) ? NULL : getenv("HOME");
+	if (!home) {
+		struct passwd pw, *res;
+		switch (*p ? getpwnam_r(p, &pw, buf, PATH_MAX, &res)
+			   : getpwuid_r(getuid(), &pw, buf, PATH_MAX, &res)) {
+		default:
+			return GLOB_ABORTED;
+		case ERANGE: case ENOMEM:
+			return GLOB_NOSPACE;
+		case 0:
+			if (!res) return GLOB_NOMATCH;
+		}
+		home = pw.pw_dir;
+	}
+	while (i < PATH_MAX - 2 && *home)
+		buf[i++] = *home++;
+	if (i > PATH_MAX - 2)
+		return GLOB_NOSPACE;
+	buf[i++] =  '/';
+	buf[i] = 0;
+	*pos = i;
+	return 0;
+}
+
 int glob(const char *restrict pat, int flags, int (*errfunc)(const char *path, int err), glob_t *restrict g)
 {
 	struct match head = { .next = NULL }, *tail = &head;
@@ -202,7 +238,12 @@  int glob(const char *restrict pat, int flags, int (*errfunc)(const char *path, i
 		char *p = strdup(pat);
 		if (!p) return GLOB_NOSPACE;
 		buf[0] = 0;
-		error = do_glob(buf, 0, 0, p, flags, errfunc, &tail);
+		size_t pos = 0;
+		char *s = p;
+		if ((flags & GLOB_TILDE) && *p == '~')
+			error = expand_tilde(&s, buf, &pos);
+		if (!error)
+			error = do_glob(buf, pos, 0, s, flags, errfunc, &tail);
 		free(p);
 	}
 

Comments

Rich Felker July 25, 2019, 3:48 a.m.
On Wed, Jul 24, 2019 at 11:33:38PM +0200, Ismael Luceno wrote:
> Signed-off-by: Ismael Luceno <ismael@iodev.co.uk>
> ---
>  include/glob.h   |  1 +
>  src/regex/glob.c | 45 +++++++++++++++++++++++++++++++++++++++++++--
>  2 files changed, 44 insertions(+), 2 deletions(-)
> 
> diff --git a/include/glob.h b/include/glob.h
> index 76f6c1c68a23..fc8106b20c5b 100644
> --- a/include/glob.h
> +++ b/include/glob.h
> @@ -30,6 +30,7 @@ void globfree(glob_t *);
>  #define GLOB_APPEND   0x20
>  #define GLOB_NOESCAPE 0x40
>  #define	GLOB_PERIOD   0x80
> +#define GLOB_TILDE    0x100
>  
>  #define GLOB_NOSPACE 1
>  #define GLOB_ABORTED 2
> diff --git a/src/regex/glob.c b/src/regex/glob.c
> index aa1c6a4482ee..2428d5f02c2e 100644
> --- a/src/regex/glob.c
> +++ b/src/regex/glob.c
> @@ -4,10 +4,13 @@
>  #include <sys/stat.h>
>  #include <dirent.h>
>  #include <limits.h>
> -#include <string.h>
>  #include <stdlib.h>
>  #include <errno.h>
>  #include <stddef.h>
> +#include <unistd.h>
> +#include <pwd.h>
> +#define _GNU_SOURCE
> +#include <string.h>

If feature test macros are used, they have to appear before any
headers are included (at the top of the file). But you can't call
GNU-namespace functions from a standard-namespace function (glob)
anyway. Instead just use __strchrnul if you need it; it's available
(musl-internal only) without any special FTMs.

>  
>  struct match
>  {
> @@ -182,6 +185,39 @@ static int sort(const void *a, const void *b)
>  	return strcmp(*(const char **)a, *(const char **)b);
>  }
>  
> +static int expand_tilde(char **pat, char *buf, size_t *pos)
> +{

I like that you've put this in a separate function.

> +	char *p = *pat + 1;
> +	size_t i = 0;
> +
> +	char *name_end = strchrnul(p, '/');
> +	if (*name_end) *name_end++ = 0;
> +	*pat = name_end;
> +
> +	char *home = (*p || issetugid()) ? NULL : getenv("HOME");

I still don't see any justification for violating the semantics of ~
because the program is suid.

Also, issetugid isn't in the namespace we can use here. Rather than
poking directly at libc.secure, if we needed this I think it would be
better to make a namespace-protected alias, but I don't think it's the
right behavior anyway.

> +	if (!home) {
> +		struct passwd pw, *res;
> +		switch (*p ? getpwnam_r(p, &pw, buf, PATH_MAX, &res)
> +			   : getpwuid_r(getuid(), &pw, buf, PATH_MAX, &res)) {

Same here. I'm not actually sure what should happen if $HOME is unset,
but it's probably whatever the Shell Command Language specification
says.

> +		default:
> +			return GLOB_ABORTED;
> +		case ERANGE: case ENOMEM:
> +			return GLOB_NOSPACE;
> +		case 0:
> +			if (!res) return GLOB_NOMATCH;

The man page says:

    If the username is invalid, or the home directory cannot be
    determined, then no substitution is performed.

and the glibc manual says:

    If the user name is not valid or the home directory cannot be
    determined for some reason the pattern is left untouched and
    itself used as the result

So I think this should not be an error. I'm not entirely clear if it's
supposed to suppress all further glob expansion and just return a
single literal result, or treat "~" or "~nosuchuser" as a literal
component and search it.

> +		}
> +		home = pw.pw_dir;
> +	}
> +	while (i < PATH_MAX - 2 && *home)
> +		buf[i++] = *home++;

This could be strnlen+memmove, but perhaps just open coding it like
you've done is simpler and lighter.

> +	if (i > PATH_MAX - 2)

Off-by-one. If it stopped due to reaching the limit in the above loop,
you'll have equality not greater-than, then overflow below. Perhaps
strnlen+memmove would be easier to ensure the safety of.

> +		return GLOB_NOSPACE;

GLOB_NOSPACE is for allocation errors. This is a no-match case, since
there can be no matches for a pattern PATH_MAX or longer.

> +	buf[i++] =  '/';
> +	buf[i] = 0;
> +	*pos = i;
> +	return 0;
> +}
> +
>  int glob(const char *restrict pat, int flags, int (*errfunc)(const char *path, int err), glob_t *restrict g)
>  {
>  	struct match head = { .next = NULL }, *tail = &head;
> @@ -202,7 +238,12 @@ int glob(const char *restrict pat, int flags, int (*errfunc)(const char *path, i
>  		char *p = strdup(pat);
>  		if (!p) return GLOB_NOSPACE;
>  		buf[0] = 0;
> -		error = do_glob(buf, 0, 0, p, flags, errfunc, &tail);
> +		size_t pos = 0;
> +		char *s = p;
> +		if ((flags & GLOB_TILDE) && *p == '~')
> +			error = expand_tilde(&s, buf, &pos);
> +		if (!error)
> +			error = do_glob(buf, pos, 0, s, flags, errfunc, &tail);
>  		free(p);
>  	}

This part looks good, but may need minor rework depending on how the
nonexistent user case is supposed to behave.

Rich
Ismael Luceno July 25, 2019, 10:33 a.m.
On 24/Jul/2019 23:48, Rich Felker wrote:
<...>
> > +	char *p = *pat + 1;
> > +	size_t i = 0;
> > +
> > +	char *name_end = strchrnul(p, '/');
> > +	if (*name_end) *name_end++ = 0;
> > +	*pat = name_end;
> > +
> > +	char *home = (*p || issetugid()) ? NULL : getenv("HOME");
> 
> I still don't see any justification for violating the semantics of ~
> because the program is suid.

I could only find a couple of implementations which don't check for
privileges:
- 4.3BSD
  + NetBSD
- uClibc
  + uClibc-ng

Almost everyone else does:
- FreeBSD
  + MidnightBSD
  + DragonFlyBSD
  + Android (Bionic)
- GNU
- Newlib
- OpenBSD
  + MirBSD
- Solaris
  + Illumos


I think most users/software would expect it.

> Also, issetugid isn't in the namespace we can use here. Rather than
> poking directly at libc.secure, if we needed this I think it would be
> better to make a namespace-protected alias, but I don't think it's the
> right behavior anyway.

I would go with that.

> > +	if (!home) {
> > +		struct passwd pw, *res;
> > +		switch (*p ? getpwnam_r(p, &pw, buf, PATH_MAX, &res)
> > +			   : getpwuid_r(getuid(), &pw, buf, PATH_MAX, &res)) {
> 
> Same here. I'm not actually sure what should happen if $HOME is unset,
> but it's probably whatever the Shell Command Language specification
> says.

All the implementations I could find fall back to getpw*.

> > +		default:
> > +			return GLOB_ABORTED;
> > +		case ERANGE: case ENOMEM:
> > +			return GLOB_NOSPACE;
> > +		case 0:
> > +			if (!res) return GLOB_NOMATCH;
> 
> The man page says:
> 
>     If the username is invalid, or the home directory cannot be
>     determined, then no substitution is performed.
> 
> and the glibc manual says:
> 
>     If the user name is not valid or the home directory cannot be
>     determined for some reason the pattern is left untouched and
>     itself used as the result
>
> So I think this should not be an error. I'm not entirely clear if it's
> supposed to suppress all further glob expansion and just return a
> single literal result, or treat "~" or "~nosuchuser" as a literal
> component and search it.

It's supposed to continue; but IMO that's neither intuitive nor safe.

I guess that's why glibc added GLOB_TILDE_CHECK, but it should be the
only possibility.

However, if you feel strongly about it, I can add _CHECK instead.

I think:
- ENOMEM should map to NOSPACE.
- a closer look reveals that all other errors should map to NOMATCH.

> > +		}
> > +		home = pw.pw_dir;
> > +	}
> > +	while (i < PATH_MAX - 2 && *home)
> > +		buf[i++] = *home++;
> 
> This could be strnlen+memmove, but perhaps just open coding it like
> you've done is simpler and lighter.
> 
> > +	if (i > PATH_MAX - 2)
> 
> Off-by-one. If it stopped due to reaching the limit in the above loop,
> you'll have equality not greater-than, then overflow below. Perhaps
> strnlen+memmove would be easier to ensure the safety of.

No overflow:

i++ => PATH_MAX - 2
i   => PATH_MAX - 1

> > +		return GLOB_NOSPACE;
> 
> GLOB_NOSPACE is for allocation errors.

Right.

> This is a no-match case, since there can be no matches for a pattern
> PATH_MAX or longer.

It must work for "~\0".
Rich Felker July 25, 2019, 2:30 p.m.
On Thu, Jul 25, 2019 at 12:33:33PM +0200, Ismael Luceno wrote:
> 
> On 24/Jul/2019 23:48, Rich Felker wrote:
> <...>
> > > +	char *p = *pat + 1;
> > > +	size_t i = 0;
> > > +
> > > +	char *name_end = strchrnul(p, '/');
> > > +	if (*name_end) *name_end++ = 0;
> > > +	*pat = name_end;
> > > +
> > > +	char *home = (*p || issetugid()) ? NULL : getenv("HOME");
> > 
> > I still don't see any justification for violating the semantics of ~
> > because the program is suid.
> 
> I could only find a couple of implementations which don't check for
> privileges:
> - 4.3BSD
>   + NetBSD
> - uClibc
>   + uClibc-ng
> 
> Almost everyone else does:
> - FreeBSD
>   + MidnightBSD
>   + DragonFlyBSD
>   + Android (Bionic)
> - GNU
> - Newlib
> - OpenBSD
>   + MirBSD
> - Solaris
>   + Illumos
> 
> 
> I think most users/software would expect it.

Up to now I've tried to avoid just being "the maintainer's job is to
say no", but here the maintainer's job is to say no. There are 3 (or
4, if you count the dynamic linker) places in musl where libc.secure
is used:

1. preopening /dev/null on fds 0-2 if they start closed
2. disallowing arbitrary user-provided timezone files
3. disallowing arbitrary user-provided locale files
4. disallowing LD_PRELOAD/LD_LIBRARY_PATH

Here, (1) is explicitly allowed by POSIX, and the environment with
them initially closed is deemed a non-conforming execution environment
anyway. (2) and (3) are somewhat unfortunate, but necessary in order
to prevent invoking-user-controlled state from causing undefined
behavior; I actually hope to remove (2) at some point by doing a
read-and-validate instead of mmap in the suid case but it's low
priority. (4) is obviously necessary.

There's nowhere in musl where we violate the contract of an interface
with the application in suid mode because there's some conceivable way
the application could do something dangerous/wrong with the result. I
was likewise adamant about _FORTIFY_SOURCE only catching actual
overflows/UB, not things that are just "likely misusage".

~ should produce $HOME. It's arguable that it should instead lookup
the homedir based on getlogin like glibc documents, but (1) this
contradicts the Shell Command Language definition, and (2) it's not
viable in musl to begin with since getlogin() just gives you
getenv("LOGNAME").

> > > +	if (!home) {
> > > +		struct passwd pw, *res;
> > > +		switch (*p ? getpwnam_r(p, &pw, buf, PATH_MAX, &res)
> > > +			   : getpwuid_r(getuid(), &pw, buf, PATH_MAX, &res)) {
> > 
> > Same here. I'm not actually sure what should happen if $HOME is unset,
> > but it's probably whatever the Shell Command Language specification
> > says.
> 
> All the implementations I could find fall back to getpw*.

This seems consistent with POSIX sh ~:

    If the login name is null (that is, the tilde-prefix contains only
    the tilde), the tilde-prefix is replaced by the value of the
    variable HOME. If HOME is unset, the results are unspecified.
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

I'm not sure if it's the best behavior but I don't have any strong
opinion on it.

> > > +		default:
> > > +			return GLOB_ABORTED;
> > > +		case ERANGE: case ENOMEM:
> > > +			return GLOB_NOSPACE;
> > > +		case 0:
> > > +			if (!res) return GLOB_NOMATCH;
> > 
> > The man page says:
> > 
> >     If the username is invalid, or the home directory cannot be
> >     determined, then no substitution is performed.
> > 
> > and the glibc manual says:
> > 
> >     If the user name is not valid or the home directory cannot be
> >     determined for some reason the pattern is left untouched and
> >     itself used as the result
> >
> > So I think this should not be an error. I'm not entirely clear if it's
> > supposed to suppress all further glob expansion and just return a
> > single literal result, or treat "~" or "~nosuchuser" as a literal
> > component and search it.
> 
> It's supposed to continue; but IMO that's neither intuitive nor safe.
> 
> I guess that's why glibc added GLOB_TILDE_CHECK, but it should be the
> only possibility.
> 
> However, if you feel strongly about it, I can add _CHECK instead.

At first I thought I'd prefer that, and still might, but POSIX doesn't
seem to mandate anything here for the shell:

    If the system does not recognize the login name, the results are
    undefined.

But in any case we should probably define and support both flags, even
if both have the same behavior. This way applications that care will
continue to get the "check" behavior if in the future there turns out
to be reason to make GLOB_TILDE do the "non-check" thing. This will
also improve glibc ABI-compat in case a caller is using the "check"
version of the flag. (BTW I didn't check, but I assume your flag value
matches the glibc one?)

> I think:
> - ENOMEM should map to NOSPACE.
> - a closer look reveals that all other errors should map to NOMATCH.

None of the three error codes are really good, but NOMATCH is probably
best. GLOB_ABORTED may be more appropriate with GLOB_ERR set (to allow
application to distinguish between an error processing the glob and
simple nonexistence of matches). Thoughts?

> > > +		}
> > > +		home = pw.pw_dir;
> > > +	}
> > > +	while (i < PATH_MAX - 2 && *home)
> > > +		buf[i++] = *home++;
> > 
> > This could be strnlen+memmove, but perhaps just open coding it like
> > you've done is simpler and lighter.
> > 
> > > +	if (i > PATH_MAX - 2)
> > 
> > Off-by-one. If it stopped due to reaching the limit in the above loop,
> > you'll have equality not greater-than, then overflow below. Perhaps
> > strnlen+memmove would be easier to ensure the safety of.
> 
> No overflow:
> 
> i++ => PATH_MAX - 2
> i   => PATH_MAX - 1

On the last iteration of the above loop that terminates due to length
limit, i<PATH_MAX-2, so i==PATH_MAX-3. The ++ in the loop body results
in i==PATH_MAX-2.

You're right that it's not an overflow, because you only write the '/'
and null terminator below, but the user's homedir pathname has been
silently truncated (unless you happened to hit !*home on the same
iteration). I think a test like "if (*home) error-out.." would work
here.

> > > +		return GLOB_NOSPACE;
> > 
> > GLOB_NOSPACE is for allocation errors.
> 
> Right.
> 
> > This is a no-match case, since there can be no matches for a pattern
> > PATH_MAX or longer.
> 
> It must work for "~\0".

Assuming ~ expands to something that fits.

Rich
Ismael Luceno July 25, 2019, 3:39 p.m.
On 25/Jul/2019 10:30, Rich Felker wrote:
<...>
> > I think:
> > - ENOMEM should map to NOSPACE.
> > - a closer look reveals that all other errors should map to NOMATCH.
> 
> None of the three error codes are really good, but NOMATCH is probably
> best. GLOB_ABORTED may be more appropriate with GLOB_ERR set (to allow
> application to distinguish between an error processing the glob and
> simple nonexistence of matches). Thoughts?

Other implementations happily expand to a literal '~', so I guess we
could keep things simple and return NOMATCH.