Message ID | 20190507164317.13562-6-cyphar@cyphar.com |
---|---|
State | New |
Series | "namei: resolveat(2) path resolution restriction API" |
Headers | show |
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 63ed39cbd3bd..72f431b1dc9c 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -461,5 +461,6 @@ 530 common getegid sys_getegid 531 common geteuid sys_geteuid 532 common getppid sys_getppid +533 common resolveat sys_resolveat # all other architectures have common numbers for new syscall, alpha # is the exception. diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 9016f4081bb9..1bc0282a67f7 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -437,3 +437,4 @@ 421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait 422 common futex_time64 sys_futex 423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval +428 common resolveat sys_resolveat diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index ab9cda5f6136..d3ae73ffaf48 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -344,3 +344,4 @@ 332 common pkey_free sys_pkey_free 333 common rseq sys_rseq # 334 through 423 are reserved to sync up with other architectures +428 common resolveat sys_resolveat diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 125c14178979..81b7389e9e58 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -423,3 +423,4 @@ 421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait 422 common futex_time64 sys_futex 423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval +428 common resolveat sys_resolveat diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 8ee3a8c18498..626aed10e2b5 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -429,3 +429,4 @@ 421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait 422 common futex_time64 sys_futex 423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval +428 common resolveat sys_resolveat diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 15f4117900ee..8cbd6032d6bf 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -362,3 +362,4 @@ 421 n32 rt_sigtimedwait_time64 compat_sys_rt_sigtimedwait_time64 422 n32 futex_time64 sys_futex 423 n32 sched_rr_get_interval_time64 sys_sched_rr_get_interval +428 n32 resolveat sys_resolveat diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index c85502e67b44..234923a1fc88 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -338,3 +338,4 @@ 327 n64 rseq sys_rseq 328 n64 io_pgetevents sys_io_pgetevents # 329 through 423 are reserved to sync up with other architectures +428 n64 resolveat sys_resolveat diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 2e063d0f837e..7b4586acf35d 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -411,3 +411,4 @@ 421 o32 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 422 o32 futex_time64 sys_futex sys_futex 423 o32 sched_rr_get_interval_time64 sys_sched_rr_get_interval sys_sched_rr_get_interval +428 o32 resolveat sys_resolveat sys_resolveat diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index b26766c6647d..19a9a92dc5f8 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -420,3 +420,4 @@ 421 32 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 422 32 futex_time64 sys_futex sys_futex 423 32 sched_rr_get_interval_time64 sys_sched_rr_get_interval sys_sched_rr_get_interval +428 common resolveat sys_resolveat sys_resolveat diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index b18abb0c3dae..bfcd75b928de 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -505,3 +505,4 @@ 421 32 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 422 32 futex_time64 sys_futex sys_futex 423 32 sched_rr_get_interval_time64 sys_sched_rr_get_interval sys_sched_rr_get_interval +428 common resolveat sys_resolveat sys_resolveat diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 02579f95f391..084e51f02e65 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -426,3 +426,4 @@ 421 32 rt_sigtimedwait_time64 - compat_sys_rt_sigtimedwait_time64 422 32 futex_time64 - sys_futex 423 32 sched_rr_get_interval_time64 - sys_sched_rr_get_interval +428 common resolveat sys_resolveat - diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index bfda678576e4..e9115c5cec72 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -426,3 +426,4 @@ 421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait 422 common futex_time64 sys_futex 423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval +428 common resolveat sys_resolveat diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index b9a5a04b2d2c..2d3fdd913d89 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -469,3 +469,4 @@ 421 32 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 422 32 futex_time64 sys_futex sys_futex 423 32 sched_rr_get_interval_time64 sys_sched_rr_get_interval sys_sched_rr_get_interval +428 common resolveat sys_resolveat sys_resolveat diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 4cd5f982b1e5..101feef22473 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -438,3 +438,4 @@ 425 i386 io_uring_setup sys_io_uring_setup __ia32_sys_io_uring_setup 426 i386 io_uring_enter sys_io_uring_enter __ia32_sys_io_uring_enter 427 i386 io_uring_register sys_io_uring_register __ia32_sys_io_uring_register +428 i386 resolveat sys_resolveat __ia32_sys_resolveat diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 64ca0d06259a..c7c197bf07bc 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -355,6 +355,7 @@ 425 common io_uring_setup __x64_sys_io_uring_setup 426 common io_uring_enter __x64_sys_io_uring_enter 427 common io_uring_register __x64_sys_io_uring_register +428 common resolveat __x64_sys_resolveat # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 6af49929de85..86c44f15dfa4 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -394,3 +394,4 @@ 421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait 422 common futex_time64 sys_futex 423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval +428 common resolveat sys_resolveat diff --git a/fs/namei.c b/fs/namei.c index 2b6a1bf4e745..2cc5b171f6ec 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3656,6 +3656,52 @@ struct file *do_filp_open(int dfd, struct filename *pathname, return filp; } +SYSCALL_DEFINE3(resolveat, int, dfd, const char __user *, path, + unsigned long, flags) +{ + int fd; + struct filename *tmp; + struct open_flags op = { + .open_flag = O_PATH, + }; + + if (flags & ~VALID_RESOLVE_FLAGS) + return -EINVAL; + + if (flags & RESOLVE_CLOEXEC) + op.open_flag |= O_CLOEXEC; + if (!(flags & RESOLVE_NOFOLLOW)) + op.lookup_flags |= LOOKUP_FOLLOW; + if (flags & RESOLVE_BENEATH) + op.lookup_flags |= LOOKUP_BENEATH; + if (flags & RESOLVE_XDEV) + op.lookup_flags |= LOOKUP_XDEV; + if (flags & RESOLVE_NO_MAGICLINKS) + op.lookup_flags |= LOOKUP_NO_MAGICLINKS; + if (flags & RESOLVE_NO_SYMLINKS) + op.lookup_flags |= LOOKUP_NO_SYMLINKS; + if (flags & RESOLVE_IN_ROOT) + op.lookup_flags |= LOOKUP_IN_ROOT; + + tmp = getname(path); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + + fd = get_unused_fd_flags(op.open_flag); + if (fd >= 0) { + struct file *f = do_filp_open(dfd, tmp, &op); + if (IS_ERR(f)) { + put_unused_fd(fd); + fd = PTR_ERR(f); + } else { + fsnotify_open(f); + fd_install(fd, f); + } + } + putname(tmp); + return fd; +} + struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, const char *name, const struct open_flags *op) { diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 1d338357df8a..c57245a21281 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -94,4 +94,17 @@ #define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ +/* Bottom 3 bits of RESOLVE_* are reserved for future ACC_MODE extensions. */ +#define RESOLVE_CLOEXEC 0x008 /* Set O_CLOEXEC on the returned fd. */ +#define RESOLVE_NOFOLLOW 0x010 /* Don't follow trailing symlinks. */ + +#define RESOLVE_RESOLUTION_TYPE 0x3E0 /* Type of path-resolution scoping we are applying. */ +#define RESOLVE_BENEATH 0x020 /* - Block "lexical" trickery like "..", symlinks, absolute paths, etc. */ +#define RESOLVE_XDEV 0x040 /* - Block mount-point crossings (includes bind-mounts). */ +#define RESOLVE_NO_MAGICLINKS 0x080 /* - Block procfs-style "magic" symlinks. */ +#define RESOLVE_NO_SYMLINKS 0x100 /* - Block all symlinks (implies AT_NO_MAGICLINKS). */ +#define RESOLVE_IN_ROOT 0x200 /* - Scope ".." resolution to dirfd (like chroot(2)). */ + +#define VALID_RESOLVE_FLAGS (RESOLVE_CLOEXEC | RESOLVE_NOFOLLOW | RESOLVE_RESOLUTION_TYPE) + #endif /* _UAPI_LINUX_FCNTL_H */
On Tue, May 7, 2019 at 9:44 AM Aleksa Sarai <cyphar@cyphar.com> wrote: > > The most obvious syscall to add support for the new LOOKUP_* scoping > flags would be openat(2) (along with the required execveat(2) change > included in this series). However, there are a few reasons to not do > this: So honestly, this last patch is what turns me off the whole thing. It goes from a nice new feature ("you can use O_NOSYMLINKS to disallow symlink traversal") to a special-case joke that isn't worth it any more. You get a useless path descrptor back from s special hacky system call, you don't actually get the useful data that you probably *want* the open to get you. Sure, you could eventually then use a *second* system call (openat with O_EMPTYPATH) to actually get something you can *use*, but at this point you've just wasted everybodys time and effort with a pointless second system call. So I really don't see the point of this whole thing. Why even bother. Nobody sane will ever use that odd two-systemcall model, and even if they did, it would be slower and inconvenient. The whole and only point of this seems to be the two lines that say if (flags & ~VALID_RESOLVE_FLAGS) return -EINVAL; but that adds absolutely zero value to anything. The argument is that "we can't add it to existing flags, because old kernels won't honor it", but that's a completely BS argument, since the user has to have a fallback anyway for the old kernel case - so we literally could much more conveniently just expose it as a prctl() or something to _ask_ the kernel what flags it honors. So to me, this whole argument means that "Oh, we'll make it really inconvenient to actually use this". If we want to introduce a new system call that allows cool new features, it should have *more* powerful semantics than the existing ones, not be clearly weaker and less useful. So how about making the new system call be something that is a *superset* of "openat()" so that people can use that, and then if it fails, just fall back to openat(). But if it succeeds, it just succeeds, and you don't need to then do other system calls to actually make it useful. Make the new system call something people *want* to use because it's useful, not a crippled useless thing that has some special case use for some limited thing and just wastes system call space. Example *useful* system call attributes: - make it like openat(), but have another argument with the "limit flags" - maybe return more status of the resulting file. People very commonly do "open->fstat" just to get the size for mmap or to check some other detail of the file before use. In other words, make the new system call *useful*. Not some castrated "not useful on its own" thing. So I still support the whole "let's make it easy to limit path lookup in sane ways", but this model of then limiting using the result sanely just makes me a sad panda. Linus
On 2019-05-24, Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Tue, May 7, 2019 at 9:44 AM Aleksa Sarai <cyphar@cyphar.com> wrote: > > > > The most obvious syscall to add support for the new LOOKUP_* scoping > > flags would be openat(2) (along with the required execveat(2) change > > included in this series). However, there are a few reasons to not do > > this: > > So honestly, this last patch is what turns me off the whole thing. > > It goes from a nice new feature ("you can use O_NOSYMLINKS to disallow > symlink traversal") to a special-case joke that isn't worth it any > more. You get a useless path descrptor back from s special hacky > system call, you don't actually get the useful data that you probably > *want* the open to get you. > > Sure, you could eventually then use a *second* system call (openat > with O_EMPTYPATH) to actually get something you can *use*, but at this > point you've just wasted everybodys time and effort with a pointless > second system call. > > So I really don't see the point of this whole thing. Why even bother. > Nobody sane will ever use that odd two-systemcall model, and even if > they did, it would be slower and inconvenient. > > The whole and only point of this seems to be the two lines that say > > if (flags & ~VALID_RESOLVE_FLAGS) > return -EINVAL; > > but that adds absolutely zero value to anything. The argument is that > "we can't add it to existing flags, because old kernels won't honor > it", but that's a completely BS argument, since the user has to have a > fallback anyway for the old kernel case - so we literally could much > more conveniently just expose it as a prctl() or something to _ask_ > the kernel what flags it honors. > > So to me, this whole argument means that "Oh, we'll make it really > inconvenient to actually use this". > > If we want to introduce a new system call that allows cool new > features, it should have *more* powerful semantics than the existing > ones, not be clearly weaker and less useful. You might not have seen the v8 of this set I sent a few days ago[1]. The new set includes an example of a feature that is possible with resolveat(2) but not with the current openat(O_PATH) interface. The feature is that you can set RESOLVE_UPGRADE_NO{READ,WRITE} which then blocks the re-opening of the file descriptor with those MAY_* modes. (Though of course you might be against the entire idea of this feature which allows for restricting the opening of magic-links.) This can't be done with openat(2) without adding even more flags such as O_PATH_UPGRADE_NOWRITE -- because O_RDONLY = 0, which means you can't distinguish the "don't allow read or write" case (we could define 0x3 for that, but that feels a tad ugly). Not to mention that broken userspace programs might already be setting O_PATH|O_RDWR. So, while making it easier for userspace to be sure these flags are working is one benefit, it's not the only reason. And outside arguments for future features, several folks (some on-list, some on LWN) argued that adding more "open" flags which aren't clearly related to the mode the file is opened with makes not-much-more sense than a separate syscall for it. Another (weaker) argument is that O_PATH should've been separate from the beginning because of how unlike an ordinary fd it is. Funnily enough, v8 does contain O_EMPTYPATH. However, this is just an example of another /proc-less interface and we need it for O_PATH descriptors even if you can do an full open in one shot with restricted path resolution. Having an O_PATH can be useful on its own (LXC takes an O_PATH of /dev/pts/ptmx inside the container and then re-opens it each time a new console is required to avoid touching paths inside the container). But it would be neat to have a way for userspace to easily check what flags the kernel honours, regardless of this patchset. > So how about making the new system call be something that is a > *superset* of "openat()" so that people can use that, and then if it > fails, just fall back to openat(). But if it succeeds, it just > succeeds, and you don't need to then do other system calls to actually > make it useful. > > Make the new system call something people *want* to use because it's > useful, not a crippled useless thing that has some special case use > for some limited thing and just wastes system call space. At the moment, I'm working on implementing userspace library wrappers which use resolveat(2) for safe handling of an untrusted rootfs. I would expect that most users of resolveat(2) would be using a library to handle it -- because to do an "mkdir -p" you need to do a fair bit of work for it to be safe unless we add LOOKUP_* flags to mkdirat(2) and every other syscall. This is true whether or not openat(2) provides this feature or if it's a separate syscall. > Example *useful* system call attributes: > > - make it like openat(), but have another argument with the "limit flags" Sure, this would also work. I didn't know if anyone was open to the idea of openat2(2). There is a follow-up question of how RESOLVE_UPGRADE_NO* flags would be handled (they aren't obviously "lookup" flags so we'd need to add more openat(2) flags to accommodate them) but I'm sure that can be ironed out once you've taken a look at that patchset. > - maybe return more status of the resulting file. People very > commonly do "open->fstat" just to get the size for mmap or to check > some other detail of the file before use. So something like resolveat(rootfd, "path/to/file", RESOLVE_IN_ROOT, &statbuf); or openat2(rootfd, "path/to/file", O_PATH, RESOLVE_IN_ROOT, &statbuf); ? Is there a large amount of overhead or downside to the current open->fstat way of doing things, or is this more of a "if we're going to add more ways of opening we might as well add more useful things"? > In other words, make the new system call *useful*. Not some castrated > "not useful on its own" thing. > > So I still support the whole "let's make it easy to limit path lookup > in sane ways", but this model of then limiting using the result sanely > just makes me a sad panda. I am glad that you agree with the general thrust, and it's just the interface that is the hang-up. [1]: https://marc.info/?l=linux-fsdevel&m=155835923516235&w=2
On Sat, May 25, 2019 at 12:03 AM Aleksa Sarai <cyphar@cyphar.com> wrote: > > You might not have seen the v8 of this set I sent a few days ago[1]. The > new set includes an example of a feature that is possible with > resolveat(2) but not with the current openat(O_PATH) interface. It's the "forced O_PATH" model that makes resolveat() basically entirely pointless to me. You can do almost nothing with an O_PATH file descriptor. Yes, it's a really cool feature, and it's great for what it is, but it's less than 0.001% of all opens people have. Even among security-conscious users, it's pointless. Yes, an O_PATH file descriptor is somewhat more "secure", but it's secure because it's mostly USELESS, and has to be converted into something else to be used. So say you are something like a static web server that actually wants to use the "don't traverse '..', don't follow symlinks" features etc. What you want to do with the end result is read() it or mmap it, or sendpage or whatever. This is why I think resolveat() is entirely pointless. Even with O_EMPTYPATH it's pointless - because you shouldn't *need* that extra "ok, now get me the actual useful fd" phase. In fact, I think resolveat() as a model is fundamentally wrong for yet another reason: O_CREAT. If you want to _create_ a new file, and you want to still have the path resolution modifiers in place, the resolveat() model is broken, because it only gives you path resolution for the lookup, and then when you do openat(O_CREAT) for the final component, you now don't have any way to limit that last component. Sure, you can probably effectively hack around it with resolveat() on everything but the last component, and then openat(O_CREAT|O_EXCL|O_NOFOLLOW) on the last component, because the O_EXCL|O_NOFOLLOW should mean that it won't do any of the unsafe things. And then (if you didn't actually want the O_EXCL), you handle the race between "somebody else got there first" by re-trying etc. So I suspect the O_CREAT thing could be worked around with extra complexity, but it's an example of how the O_PATH model really screws you over. End result: I really think resolveat() is broken. It absolutely *needs* to be a full-fledged "openat()" system call, just with added path resolution flags. > openat2(rootfd, "path/to/file", O_PATH, RESOLVE_IN_ROOT, &statbuf); Note that for O_CREAT, it either needs the 'mode' parameter too, or the statbuf would need to be an in-out thing. I think the in-out model might be nice (the varargs model with a conditional 'mode' parameter is horrid, I think), but at some point it's just bike-shedding. Also, I'm not absolutely married to the statbuf, but I do think it might be a useful extension. A *lot* of users need the size of the file for subsequent mmap() calls (or for buffer management for read/write interface) or for things like just headers (ie "Content-length:" in html etc). I'm not sure people actually want a full 'struct stat', but people historically also do st_ino/st_dev for indexing into existing user-space caches (or to check permissions like that it's on the right filesystem, but the resolve flags kind of make that less of an issue). And st_mode to verify that it's a proper regular file etc etc. You can always just leave it as NULL if you don't care, there's almost no downside to adding it, and I do think that people who want a "walk pathname carefully" model almost always would want to also check the end result anyway. Again, I'm thinking of the most obvious use cases where you want these kinds of special pathname traversals: file servers, static web content serving etc. They *all* want the file size and type when they open a file. > Is there a large amount of overhead or downside to the current > open->fstat way of doing things, or is this more of a "if we're going to > add more ways of opening we might as well add more useful things"? Right now, system calls are sadly very expensive on a lot of hardware. We used to be very proud of the fact that Linux system calls were fast, but with meltdown and retpoline etc, we're back to "system calls can be several thousand cycles each, just in overhead, on commonly available hardware". Is "several thousand cycles" fatal? Not necessarily. But I think that if we do add a new system call, particularly a fancy one for special security-conscious models, we should look at what people need and use, and want. And performance should always be a concern. I realize that people who come at this from primarily just a security issue background may think that security is the primary goal. But no. Security always needs to realize that the _primary_ goal is to have people _use_ it. Without users, security is entirely pointless. And the users part is partly performance, but mostly "it's convenient". The whole "this is Linux-specific" is a big inconvenience point, but aside from that, let's make any new interface as welcoming and simple and useful as possible. Not a "you have to do extra work" interface. Quite the reverse. Make it something that makes people go "ahh, yes, this actually means I don't have to do anything extra, because it already does everything I want for opening and checking a pathname". So the way to sell the path lookup improvements should not be "look, here's a secure way to look up paths". No. That's entirely missing the point. No, the way to do path lookup improvements is to say "look, here's a _convenient_ way to look up paths, and btw, it's also easy to make secure". Talking about securely opening things - another flag that we may want to avoid issues is a "don't open device nodes" flag. Sure, O_NONBLOCK plus checking the st_mode of the result is usually sufficient, but it's actually fairly easy to get that wrong. Things like /dev/tty and /dev/zero often need to be available for various reasons, and have been used to screw careless "open and read" users up that missed a check. I also do wonder that if the only actual user-facing interface for the resolution flags is a new system call, should we not make the *default* value be "don't open anything odd at all". So instead of saying RESOLVE_XDEV for "don't cross mount points", maybe the flags should be the other way around, and say "yes, allow mount point crossings", and "yes, explicitly allow device node opening", and "yes, allow DOTDOT" etc. Linus
On 2019-05-25, Linus Torvalds <torvalds@linux-foundation.org> wrote: > In fact, I think resolveat() as a model is fundamentally wrong for yet > another reason: O_CREAT. If you want to _create_ a new file, and you > want to still have the path resolution modifiers in place, the > resolveat() model is broken, because it only gives you path resolution > for the lookup, and then when you do openat(O_CREAT) for the final > component, you now don't have any way to limit that last component. > > Sure, you can probably effectively hack around it with resolveat() on > everything but the last component, and then > openat(O_CREAT|O_EXCL|O_NOFOLLOW) on the last component, because the > O_EXCL|O_NOFOLLOW should mean that it won't do any of the unsafe > things. And then (if you didn't actually want the O_EXCL), you handle > the race between "somebody else got there first" by re-trying etc. So > I suspect the O_CREAT thing could be worked around with extra > complexity, but it's an example of how the O_PATH model really screws > you over. > > End result: I really think resolveat() is broken. It absolutely > *needs* to be a full-fledged "openat()" system call, just with added > path resolution flags. That's a very good point. I was starting to work on O_CREAT via resolveat(2) and it definitely was much harder than most people would be happy to deal with -- I'm not even sure that I handled all the cases. I'll go for an openat2(2) then. Thinking about it some more -- since it's a new syscall, I could actually implement the O_PATH link-mode as being just the regular mode argument (since openat(2) ignores the mode for non-O_CREAT anyway). The openat(2) wrapper might be more than one-line as a result but it should avoid polluting the resolution flags with mode flags (since openat(O_PATH) needs to have a g+rwx mode for backwards-compatibility). > > openat2(rootfd, "path/to/file", O_PATH, RESOLVE_IN_ROOT, &statbuf); > > Note that for O_CREAT, it either needs the 'mode' parameter too, or > the statbuf would need to be an in-out thing. I think the in-out model > might be nice (the varargs model with a conditional 'mode' parameter > is horrid, I think), but at some point it's just bike-shedding. > > Also, I'm not absolutely married to the statbuf, but I do think it > might be a useful extension. A *lot* of users need the size of the > file for subsequent mmap() calls (or for buffer management for > read/write interface) or for things like just headers (ie > "Content-length:" in html etc). > > I'm not sure people actually want a full 'struct stat', but people > historically also do st_ino/st_dev for indexing into existing > user-space caches (or to check permissions like that it's on the right > filesystem, but the resolve flags kind of make that less of an issue). > And st_mode to verify that it's a proper regular file etc etc. > > You can always just leave it as NULL if you don't care, there's almost > no downside to adding it, and I do think that people who want a "walk > pathname carefully" model almost always would want to also check the > end result anyway. Yeah, I agree -- most folks would want to double-check what they've opened or O_PATH'd. Though I'm still not clear what is the best way of doing the "stat" argument is -- especially given how much fun architecture-specific shenanigans are in fs/stat.c (should I only use cp_new_stat64 or have a separate 64-bit syscall). > > Is there a large amount of overhead or downside to the current > > open->fstat way of doing things, or is this more of a "if we're going to > > add more ways of opening we might as well add more useful things"? > > Right now, system calls are sadly very expensive on a lot of hardware. > We used to be very proud of the fact that Linux system calls were > fast, but with meltdown and retpoline etc, we're back to "system calls > can be several thousand cycles each, just in overhead, on commonly > available hardware". > > Is "several thousand cycles" fatal? Not necessarily. But I think that > if we do add a new system call, particularly a fancy one for special > security-conscious models, we should look at what people need and use, > and want. And performance should always be a concern. > > I realize that people who come at this from primarily just a security > issue background may think that security is the primary goal. But no. > Security always needs to realize that the _primary_ goal is to have > people _use_ it. Without users, security is entirely pointless. And > the users part is partly performance, but mostly "it's convenient". Yup, I agree. I was hoping to shunt most of the convenience to userspace to avoid ruffling feathers in VFS-land, but I'm more than happy to make the kernel ABI more convenient. > The whole "this is Linux-specific" is a big inconvenience point I hope that other OSes will take our lead and have a similar interface so that the particular inconvenience can go away eventually (this was one of the arguments for resolveat(2) -- it's a clear "this is a new idea" interface rather than mixing it with other O_* flags). > Talking about securely opening things - another flag that we may want > to avoid issues is a "don't open device nodes" flag. Sure, O_NONBLOCK > plus checking the st_mode of the result is usually sufficient, but > it's actually fairly easy to get that wrong. Things like /dev/tty and > /dev/zero often need to be available for various reasons, and have > been used to screw careless "open and read" users up that missed a > check. Sure, this could be added -- though I'm sure folks would have disagreements over whether it should be a resolution flag on an open flag. > I also do wonder that if the only actual user-facing interface for the > resolution flags is a new system call, should we not make the > *default* value be "don't open anything odd at all". > > So instead of saying RESOLVE_XDEV for "don't cross mount points", > maybe the flags should be the other way around, and say "yes, allow > mount point crossings", and "yes, explicitly allow device node > opening", and "yes, allow DOTDOT" etc. This seems like a reasonable default, except for the cases of RESOLVE_BENEATH and RESOLVE_IN_ROOT (that would make using AT_FDCWD more cumbersome than necessary). But other than that, I'd be happy to invert all the other flags.
The most obvious syscall to add support for the new LOOKUP_* scoping flags would be openat(2) (along with the required execveat(2) change included in this series). However, there are a few reasons to not do this: * The new LOOKUP_* flags are intended to be security features, and openat(2) will silently ignore all unknown flags. This means that users would need to avoid foot-gunning themselves constantly when using this interface if it were part of openat(2). * Resolution scoping feels like a different operation to the existing O_* flags. And since openat(2) has limited flag space, it seems to be quite wasteful to clutter it with 5 flags that are all resolution-related. Arguably O_NOFOLLOw is also a resolution flag but its entire purpose is to error out if you encounter a trailing symlink not to scope resolution. * Other systems would be able to reimplement this syscall allowing for cross-OS standardisation rather than being hidden amongst O_* flags which may result in it not being used by all the parties that might want to use it (file servers, web servers, container runtimes, etc). * It gives us the opportunity to iterate on the O_PATH interface in the future. There are some potential security improvements that can be made to O_PATH (handling /proc/self/fd re-opening of file descriptors much more sanely) which could be made even better with some other bits (such as ACC_MODE bits which work for O_PATH). To this end, we introduce the resolveat(2) syscall. At the moment it's effectively another way of getting a bog-standard O_PATH descriptor but with the ability to use the new LOOKUP_* flags. Because resolveat(2) only provides the ability to get O_PATH descriptors, users will need to get creative with /proc/self/fd in order to get a usable file descriptor for other uses. However, in future we can add O_EMPTYPATH support to openat(2) which would allow for re-opening without procfs (though as mentioned above there are some security improvements that should be made to the interfaces). NOTE: This patch adds the syscall to all architectures using the new unified syscall numbering, but several architectures are missing newer (nr > 423) syscalls -- hence the uneven gaps in the syscall tables. Cc: Christian Brauner <christian@brauner.io> Signed-off-by: Aleksa Sarai <cyphar@cyphar.com> --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 1 + arch/ia64/kernel/syscalls/syscall.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl | 1 + arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + fs/namei.c | 46 +++++++++++++++++++++ include/uapi/linux/fcntl.h | 13 ++++++ 18 files changed, 75 insertions(+)