[v4,2/5] pid: Add PIDFD_IOCTL_GETFD to fetch file descriptors from processes

Submitted by Sargun Dhillon on Dec. 18, 2019, 11:55 p.m.

Details

Message ID 20191218235459.GA17271@ircssh-2.c.rugged-nimbus-611.internal
State New
Series "Series without cover letter"
Headers show

Commit Message

Sargun Dhillon Dec. 18, 2019, 11:55 p.m.
This adds an ioctl which allows file descriptors to be extracted
from processes based on their pidfd.

One reason to use this is to allow sandboxers to take actions on file
descriptors on the behalf of another process. For example, this can be
combined with seccomp-bpf's user notification to do on-demand fd
extraction and take privileged actions. For example, it can be used
to bind a socket to a privileged port. This is similar to ptrace, and
using ptrace parasitic code injection to extract a file descriptor from a
process, but without breaking debuggers, or paying the ptrace overhead
cost.

You must have the ability to ptrace the process in order to extract any
file descriptors from it. ptrace can already be used to extract file
descriptors based on parasitic code injections, so the permissions
model is aligned.

The ioctl takes a pointer to pidfd_getfd_args. pidfd_getfd_args contains
a size, which allows for gradual evolution of the API. There is an options
field, which can be used to state whether the fd should be opened with
CLOEXEC, or not. An additional options field may be added in the future
to include the ability to clear cgroup information about the file
descriptor at a later point. If the structure is from a newer kernel, and
includes members which make it larger than the structure that's known to
this kernel version, E2BIG will be returned.

Signed-off-by: Sargun Dhillon <sargun@sargun.me>
---
 .../userspace-api/ioctl/ioctl-number.rst      |  1 +
 MAINTAINERS                                   |  1 +
 include/uapi/linux/pidfd.h                    | 10 +++
 kernel/fork.c                                 | 77 +++++++++++++++++++
 4 files changed, 89 insertions(+)
 create mode 100644 include/uapi/linux/pidfd.h

Patch hide | download patch | download mbox

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 4ef86433bd67..9f9be681662b 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -273,6 +273,7 @@  Code  Seq#    Include File                                           Comments
                                                                      <mailto:tim@cyberelk.net>
 'p'   A1-A5  linux/pps.h                                             LinuxPPS
                                                                      <mailto:giometti@linux.it>
+'p'   B0-CF  linux/pidfd.h
 'q'   00-1F  linux/serio.h
 'q'   80-FF  linux/telephony.h                                       Internet PhoneJACK, Internet LineJACK
              linux/ixjuser.h                                         <http://web.archive.org/web/%2A/http://www.quicknet.net>
diff --git a/MAINTAINERS b/MAINTAINERS
index cc0a4a8ae06a..bc370ff59dbf 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13014,6 +13014,7 @@  M:	Christian Brauner <christian@brauner.io>
 L:	linux-kernel@vger.kernel.org
 S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git
+F:	include/uapi/linux/pidfd.h
 F:	samples/pidfd/
 F:	tools/testing/selftests/pidfd/
 F:	tools/testing/selftests/clone3/
diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
new file mode 100644
index 000000000000..90ff535be048
--- /dev/null
+++ b/include/uapi/linux/pidfd.h
@@ -0,0 +1,10 @@ 
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_PID_H
+#define _UAPI_LINUX_PID_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define PIDFD_IOCTL_GETFD	_IOWR('p', 0xb0, __u32)
+
+#endif /* _UAPI_LINUX_PID_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 2508a4f238a3..09b5f233b5a8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -94,6 +94,7 @@ 
 #include <linux/thread_info.h>
 #include <linux/stackleak.h>
 #include <linux/kasan.h>
+#include <uapi/linux/pidfd.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -1790,9 +1791,85 @@  static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
 	return poll_flags;
 }
 
+static struct file *__pidfd_getfd_fget_task(struct task_struct *task, u32 fd)
+{
+	struct file *file;
+	int ret;
+
+	ret = mutex_lock_killable(&task->signal->cred_guard_mutex);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
+		file = ERR_PTR(-EPERM);
+		goto out;
+	}
+
+	file = fget_task(task, fd);
+	if (!file)
+		file = ERR_PTR(-EBADF);
+
+out:
+	mutex_unlock(&task->signal->cred_guard_mutex);
+	return file;
+}
+
+static long pidfd_getfd(struct pid *pid, u32 fd)
+{
+	struct task_struct *task;
+	struct file *file;
+	int ret, retfd;
+
+	task = get_pid_task(pid, PIDTYPE_PID);
+	if (!task)
+		return -ESRCH;
+
+	file = __pidfd_getfd_fget_task(task, fd);
+	put_task_struct(task);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	retfd = get_unused_fd_flags(O_CLOEXEC);
+	if (retfd < 0) {
+		ret = retfd;
+		goto out;
+	}
+
+	/*
+	 * security_file_receive must come last since it may have side effects
+	 * and cannot be reversed.
+	 */
+	ret = security_file_receive(file);
+	if (ret)
+		goto out_put_fd;
+
+	fd_install(retfd, file);
+	return retfd;
+
+out_put_fd:
+	put_unused_fd(retfd);
+out:
+	fput(file);
+	return ret;
+}
+
+static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct pid *pid = file->private_data;
+
+	switch (cmd) {
+	case PIDFD_IOCTL_GETFD:
+		return pidfd_getfd(pid, arg);
+	default:
+		return -EINVAL;
+	}
+}
+
 const struct file_operations pidfd_fops = {
 	.release = pidfd_release,
 	.poll = pidfd_poll,
+	.unlocked_ioctl = pidfd_ioctl,
+	.compat_ioctl = compat_ptr_ioctl,
 #ifdef CONFIG_PROC_FS
 	.show_fdinfo = pidfd_show_fdinfo,
 #endif

Comments

Arnd Bergmann Dec. 19, 2019, 8:03 a.m.
On Thu, Dec 19, 2019 at 12:55 AM Sargun Dhillon <sargun@sargun.me> wrote:

> +#define PIDFD_IOCTL_GETFD      _IOWR('p', 0xb0, __u32)

This describes an ioctl command that reads and writes a __u32 variable
using a pointer passed as the argument, which doesn't match the
implementation:

> +static long pidfd_getfd(struct pid *pid, u32 fd)
> +{
...
> +       return retfd;

This function passes an fd as the argument and returns a new
fd, so the command number would be

#define PIDFD_IOCTL_GETFD      _IO('p', 0xb0)

While this implementation looks easy enough, and it is roughly what
I would do in case of a system call, I would recommend for an ioctl
implementation to use the __u32 pointer instead:

static long pidfd_getfd_ioctl(struct pid *pid, u32 __user *arg)
{
         int their_fd, new_fd;
         int ret;

         ret = get_user(their_fd, arg);
         if (ret)
              return ret;

        new_fd = pidfd_getfd(pid, their_fd);
        if (new_fd < 0)
                return new_fd;

         return put_user(new_fd, arg);
}

Direct argument passing in ioctls may confuse readers because it
is fairly unusual, and it doesn't work with this:

>  const struct file_operations pidfd_fops = {
>         .release = pidfd_release,
>         .poll = pidfd_poll,
> +       .unlocked_ioctl = pidfd_ioctl,
> +       .compat_ioctl = compat_ptr_ioctl,

compat_ptr_ioctl() only works if the argument is a pointer, as it
mangles the argument to turn it from a 32-bit pointer value into
a 64-bit pointer value. These are almost always the same
(arch/s390 being the sole exception), but you should not rely
on it. For now it would be find to do '.compat_ioctl = pidfd_ioctl',
but that in turn is wrong if you ever add another ioctl command
that does pass a pointer.

       Arnd
Christian Brauner Dec. 19, 2019, 10:23 a.m.
On Wed, Dec 18, 2019 at 11:55:01PM +0000, Sargun Dhillon wrote:
> This adds an ioctl which allows file descriptors to be extracted
> from processes based on their pidfd.
> 
> One reason to use this is to allow sandboxers to take actions on file
> descriptors on the behalf of another process. For example, this can be
> combined with seccomp-bpf's user notification to do on-demand fd
> extraction and take privileged actions. For example, it can be used
> to bind a socket to a privileged port. This is similar to ptrace, and
> using ptrace parasitic code injection to extract a file descriptor from a
> process, but without breaking debuggers, or paying the ptrace overhead
> cost.
> 
> You must have the ability to ptrace the process in order to extract any
> file descriptors from it. ptrace can already be used to extract file
> descriptors based on parasitic code injections, so the permissions
> model is aligned.
> 
> The ioctl takes a pointer to pidfd_getfd_args. pidfd_getfd_args contains
> a size, which allows for gradual evolution of the API. There is an options
> field, which can be used to state whether the fd should be opened with
> CLOEXEC, or not. An additional options field may be added in the future
> to include the ability to clear cgroup information about the file
> descriptor at a later point. If the structure is from a newer kernel, and
> includes members which make it larger than the structure that's known to
> this kernel version, E2BIG will be returned.

I think that whole part can go now. :)

The rest is just nits (see below).

> 
> Signed-off-by: Sargun Dhillon <sargun@sargun.me>
> ---
>  .../userspace-api/ioctl/ioctl-number.rst      |  1 +
>  MAINTAINERS                                   |  1 +
>  include/uapi/linux/pidfd.h                    | 10 +++
>  kernel/fork.c                                 | 77 +++++++++++++++++++
>  4 files changed, 89 insertions(+)
>  create mode 100644 include/uapi/linux/pidfd.h
> 
> diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
> index 4ef86433bd67..9f9be681662b 100644
> --- a/Documentation/userspace-api/ioctl/ioctl-number.rst
> +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
> @@ -273,6 +273,7 @@ Code  Seq#    Include File                                           Comments
>                                                                       <mailto:tim@cyberelk.net>
>  'p'   A1-A5  linux/pps.h                                             LinuxPPS
>                                                                       <mailto:giometti@linux.it>
> +'p'   B0-CF  linux/pidfd.h
>  'q'   00-1F  linux/serio.h
>  'q'   80-FF  linux/telephony.h                                       Internet PhoneJACK, Internet LineJACK
>               linux/ixjuser.h                                         <http://web.archive.org/web/%2A/http://www.quicknet.net>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index cc0a4a8ae06a..bc370ff59dbf 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -13014,6 +13014,7 @@ M:	Christian Brauner <christian@brauner.io>
>  L:	linux-kernel@vger.kernel.org
>  S:	Maintained
>  T:	git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git
> +F:	include/uapi/linux/pidfd.h
>  F:	samples/pidfd/
>  F:	tools/testing/selftests/pidfd/
>  F:	tools/testing/selftests/clone3/
> diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
> new file mode 100644
> index 000000000000..90ff535be048
> --- /dev/null
> +++ b/include/uapi/linux/pidfd.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +#ifndef _UAPI_LINUX_PID_H
> +#define _UAPI_LINUX_PID_H

I think that wants to be

#ifndef _UAPI_LINUX_PIDFD_H

> +
> +#include <linux/types.h>
> +#include <linux/ioctl.h>
> +
> +#define PIDFD_IOCTL_GETFD	_IOWR('p', 0xb0, __u32)

As Arnd mentioned, this defines a read-write ioctl where it just needs
to be a read-ioctl.

> +
> +#endif /* _UAPI_LINUX_PID_H */
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 2508a4f238a3..09b5f233b5a8 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -94,6 +94,7 @@
>  #include <linux/thread_info.h>
>  #include <linux/stackleak.h>
>  #include <linux/kasan.h>
> +#include <uapi/linux/pidfd.h>
>  
>  #include <asm/pgtable.h>
>  #include <asm/pgalloc.h>
> @@ -1790,9 +1791,85 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
>  	return poll_flags;
>  }
>  
> +static struct file *__pidfd_getfd_fget_task(struct task_struct *task, u32 fd)
> +{
> +	struct file *file;
> +	int ret;
> +
> +	ret = mutex_lock_killable(&task->signal->cred_guard_mutex);
> +	if (ret)
> +		return ERR_PTR(ret);
> +
> +	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
> +		file = ERR_PTR(-EPERM);
> +		goto out;
> +	}
> +
> +	file = fget_task(task, fd);
> +	if (!file)
> +		file = ERR_PTR(-EBADF);
> +
> +out:
> +	mutex_unlock(&task->signal->cred_guard_mutex);
> +	return file;
> +}
> +
> +static long pidfd_getfd(struct pid *pid, u32 fd)
> +{
> +	struct task_struct *task;
> +	struct file *file;
> +	int ret, retfd;
> +
> +	task = get_pid_task(pid, PIDTYPE_PID);
> +	if (!task)
> +		return -ESRCH;
> +
> +	file = __pidfd_getfd_fget_task(task, fd);
> +	put_task_struct(task);
> +	if (IS_ERR(file))
> +		return PTR_ERR(file);
> +
> +	retfd = get_unused_fd_flags(O_CLOEXEC);
> +	if (retfd < 0) {
> +		ret = retfd;
> +		goto out;
> +	}
> +
> +	/*
> +	 * security_file_receive must come last since it may have side effects
> +	 * and cannot be reversed.
> +	 */
> +	ret = security_file_receive(file);
> +	if (ret)
> +		goto out_put_fd;
> +
> +	fd_install(retfd, file);
> +	return retfd;
> +
> +out_put_fd:
> +	put_unused_fd(retfd);
> +out:
> +	fput(file);
> +	return ret;
> +}
> +
> +static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> +{
> +	struct pid *pid = file->private_data;
> +
> +	switch (cmd) {
> +	case PIDFD_IOCTL_GETFD:
> +		return pidfd_getfd(pid, arg);
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
>  const struct file_operations pidfd_fops = {
>  	.release = pidfd_release,
>  	.poll = pidfd_poll,
> +	.unlocked_ioctl = pidfd_ioctl,
> +	.compat_ioctl = compat_ptr_ioctl,
>  #ifdef CONFIG_PROC_FS
>  	.show_fdinfo = pidfd_show_fdinfo,
>  #endif
> -- 
> 2.20.1
>
Christian Brauner Dec. 19, 2019, 10:35 a.m.
On Thu, Dec 19, 2019 at 09:03:09AM +0100, Arnd Bergmann wrote:
> On Thu, Dec 19, 2019 at 12:55 AM Sargun Dhillon <sargun@sargun.me> wrote:
> 
> > +#define PIDFD_IOCTL_GETFD      _IOWR('p', 0xb0, __u32)
> 
> This describes an ioctl command that reads and writes a __u32 variable
> using a pointer passed as the argument, which doesn't match the
> implementation:
> 
> > +static long pidfd_getfd(struct pid *pid, u32 fd)
> > +{
> ...
> > +       return retfd;
> 
> This function passes an fd as the argument and returns a new
> fd, so the command number would be
> 
> #define PIDFD_IOCTL_GETFD      _IO('p', 0xb0)
> 
> While this implementation looks easy enough, and it is roughly what
> I would do in case of a system call, I would recommend for an ioctl

I guess this is the remaining question we should settle, i.e. what do we
prefer.
I still think that adding a new syscall for this seems a bit rich. On
the other hand it seems that a lot more people agree that using a
dedicated syscall instead of an ioctl is the correct way; especially
when it touches core kernel functionality. I mean that was one of the
takeaways from the pidfd API ioctl-vs-syscall discussion.

A syscall is nicer especially for core-kernel code like this.
So I guess the only way to find out is to try the syscall approach and
either get yelled and switch to an ioctl() or have it accepted.

What does everyone else think? Arnd, still in favor of a syscall I take
it. Oleg, you had suggested a syscall too, right? Florian, any
thoughts/worries on/about this from the glibc side?

Christian
Arnd Bergmann Dec. 19, 2019, 11:31 a.m.
On Thu, Dec 19, 2019 at 11:35 AM Christian Brauner
<christian.brauner@ubuntu.com> wrote:
> On Thu, Dec 19, 2019 at 09:03:09AM +0100, Arnd Bergmann wrote:
> > On Thu, Dec 19, 2019 at 12:55 AM Sargun Dhillon <sargun@sargun.me> wrote:

> What does everyone else think? Arnd, still in favor of a syscall I take it.

Yes, but I would not object the ioctl if others prefer that.

      Arnd
Sargun Dhillon Dec. 19, 2019, 4:15 p.m.
On Thu, Dec 19, 2019 at 2:35 AM Christian Brauner
<christian.brauner@ubuntu.com> wrote:
> I guess this is the remaining question we should settle, i.e. what do we
> prefer.
> I still think that adding a new syscall for this seems a bit rich. On
> the other hand it seems that a lot more people agree that using a
> dedicated syscall instead of an ioctl is the correct way; especially
> when it touches core kernel functionality. I mean that was one of the
> takeaways from the pidfd API ioctl-vs-syscall discussion.
>
> A syscall is nicer especially for core-kernel code like this.
> So I guess the only way to find out is to try the syscall approach and
> either get yelled and switch to an ioctl() or have it accepted.
>
> What does everyone else think? Arnd, still in favor of a syscall I take
> it. Oleg, you had suggested a syscall too, right? Florian, any
> thoughts/worries on/about this from the glibc side?
>
> Christian

My feelings towards this are that syscalls might pose a problem if we
ever want to extend this API. Of course we can have a reserved
"flags" field, and populate it later, but what if we turn out to need
a proper struct? I already know we're going to want to add one
around cgroup metadata (net_cls), and likely we'll want to add
a "steal" flag as well. As Arnd mentioned earlier, this is trivial to
fix in a traditional ioctl environment, as ioctls are "cheap". How
do we feel about potentially adding a pidfd_getfd2? Or are we
confident that reserved flags will save us?
Andy Lutomirski Dec. 20, 2019, 1:43 a.m.
On Wed, Dec 18, 2019 at 3:55 PM Sargun Dhillon <sargun@sargun.me> wrote:
>
> +
> +       if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
> +               file = ERR_PTR(-EPERM);
> +               goto out;
> +       }

I don't think this is MODE_READ.  By copying an fd from the task, you
can easily change its state.

IMO it would be really nice if pidfd could act more like a capability
here and carry a ptrace mode, for example.  But I guess it doesn't
right now.


--Andy
Aleksa Sarai Dec. 20, 2019, 4:35 a.m.
On 2019-12-19, Sargun Dhillon <sargun@sargun.me> wrote:
> On Thu, Dec 19, 2019 at 2:35 AM Christian Brauner
> <christian.brauner@ubuntu.com> wrote:
> > I guess this is the remaining question we should settle, i.e. what do we
> > prefer.
> > I still think that adding a new syscall for this seems a bit rich. On
> > the other hand it seems that a lot more people agree that using a
> > dedicated syscall instead of an ioctl is the correct way; especially
> > when it touches core kernel functionality. I mean that was one of the
> > takeaways from the pidfd API ioctl-vs-syscall discussion.
> >
> > A syscall is nicer especially for core-kernel code like this.
> > So I guess the only way to find out is to try the syscall approach and
> > either get yelled and switch to an ioctl() or have it accepted.
> >
> > What does everyone else think? Arnd, still in favor of a syscall I take
> > it. Oleg, you had suggested a syscall too, right? Florian, any
> > thoughts/worries on/about this from the glibc side?
> >
> > Christian
> 
> My feelings towards this are that syscalls might pose a problem if we
> ever want to extend this API. Of course we can have a reserved
> "flags" field, and populate it later, but what if we turn out to need
> a proper struct? I already know we're going to want to add one
> around cgroup metadata (net_cls), and likely we'll want to add
> a "steal" flag as well. As Arnd mentioned earlier, this is trivial to
> fix in a traditional ioctl environment, as ioctls are "cheap". How
> do we feel about potentially adding a pidfd_getfd2? Or are we
> confident that reserved flags will save us?

If we end up making this a syscall, then we can re-use the
copy_struct_from_user() API to make it both extensible and compatible in
both directions. I wasn't aware that this was frowned upon for ioctls
(sorry for the extra work) but there are several syscalls which use this
model for extendability (clone3, openat2, sched_setattr,
perf_events_open) so there shouldn't be any such complaints for a
syscall which is extensible.
Sargun Dhillon Dec. 20, 2019, 5:21 a.m.
On Thu, Dec 19, 2019 at 5:43 PM Andy Lutomirski <luto@kernel.org> wrote:
>
>
> I don't think this is MODE_READ.  By copying an fd from the task, you
> can easily change its state.
Would PTRACE_MODE_ATTACH_REALCREDS  work? I'm curious what
kind of state change you can cause by borrowing an FD?


>
> IMO it would be really nice if pidfd could act more like a capability
> here and carry a ptrace mode, for example.  But I guess it doesn't
> right now.
>
>
> --Andy
Christian Brauner Dec. 20, 2019, 9:20 a.m.
On Fri, Dec 20, 2019 at 2:43 AM Andy Lutomirski <luto@kernel.org> wrote:
>
> On Wed, Dec 18, 2019 at 3:55 PM Sargun Dhillon <sargun@sargun.me> wrote:
> >
> > +
> > +       if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
> > +               file = ERR_PTR(-EPERM);
> > +               goto out;
> > +       }
>
> I don't think this is MODE_READ.  By copying an fd from the task, you
> can easily change its state.
>
> IMO it would be really nice if pidfd could act more like a capability

That's ultimately what I would like to get to.

> here and carry a ptrace mode, for example.  But I guess it doesn't
> right now.

It doesn't right now for mainly two reasons.
The way I think about it is that a pidfd gets a capability at process
creation time. Before v5.3 we couldn't have done that because legacy
clone() couldn't be extended anymore. Imho, this has changed with clone3().
The other reason was that the basic properties a process can be created
with right now do not lend itself to be turned into a capability. Even
if they did
suddenly treating them like such would prevent userspace from switching to
clone3() because it would regress usecases they had.
However, for new properties this is not a problem. I have some ideas around this
(e.g. spawning private processes only reapable through pidfds and auto-cleanup
if there's no pidfd anymore).
From an implementation perspective clone3() could get a __aligned_u64 caps
(naming up for debate since we don't want people to think this is equivalent
to our current capabilities) field.
Where at process creation time you could e.g. specify PIDFD_CAP_GET_FD and
only then can you use that pidfd to get file descriptors from other processes.
You still need ptrace_access() to get the actual fd of course.

Christian
Arnd Bergmann Dec. 21, 2019, 1:53 p.m.
On Fri, Dec 20, 2019 at 4:35 AM Aleksa Sarai <cyphar@cyphar.com> wrote:
>
> On 2019-12-19, Sargun Dhillon <sargun@sargun.me> wrote:
> > On Thu, Dec 19, 2019 at 2:35 AM Christian Brauner
> > <christian.brauner@ubuntu.com> wrote:
> > > I guess this is the remaining question we should settle, i.e. what do we
> > > prefer.
> > > I still think that adding a new syscall for this seems a bit rich. On
> > > the other hand it seems that a lot more people agree that using a
> > > dedicated syscall instead of an ioctl is the correct way; especially
> > > when it touches core kernel functionality. I mean that was one of the
> > > takeaways from the pidfd API ioctl-vs-syscall discussion.
> > >
> > > A syscall is nicer especially for core-kernel code like this.
> > > So I guess the only way to find out is to try the syscall approach and
> > > either get yelled and switch to an ioctl() or have it accepted.
> > >
> > > What does everyone else think? Arnd, still in favor of a syscall I take
> > > it. Oleg, you had suggested a syscall too, right? Florian, any
> > > thoughts/worries on/about this from the glibc side?
> > >
> > > Christian
> >
> > My feelings towards this are that syscalls might pose a problem if we
> > ever want to extend this API. Of course we can have a reserved
> > "flags" field, and populate it later, but what if we turn out to need
> > a proper struct? I already know we're going to want to add one
> > around cgroup metadata (net_cls), and likely we'll want to add
> > a "steal" flag as well. As Arnd mentioned earlier, this is trivial to
> > fix in a traditional ioctl environment, as ioctls are "cheap". How
> > do we feel about potentially adding a pidfd_getfd2? Or are we
> > confident that reserved flags will save us?
>
> If we end up making this a syscall, then we can re-use the
> copy_struct_from_user() API to make it both extensible and compatible in
> both directions. I wasn't aware that this was frowned upon for ioctls
> (sorry for the extra work) but there are several syscalls which use this
> model for extendability (clone3, openat2, sched_setattr,
> perf_events_open) so there shouldn't be any such complaints for a
> syscall which is extensible.

I would still not do it for syscalls, although for other reasons:

- in an ioctl, it's better to come up with a new command code if you
  have a larger structure

- in a system call, it's best to pass all arguments as individual
  registers, the only time we use indirect data structures is when there
  are more than six arguments.

       Arnd