[08/11] fuse: Support fuse filesystems outside of init_user_ns

Submitted by Dongsu Park on Dec. 22, 2017, 2:32 p.m.

Details

Message ID c85c293e19a478353aba8e6e3ee39e5914f798d5.1512041070.git.dongsu@kinvolk.io
State New
Series "FUSE mounts from non-init user namespaces"
Headers show

Commit Message

Dongsu Park Dec. 22, 2017, 2:32 p.m.
From: Seth Forshee <seth.forshee@canonical.com>

In order to support mounts from namespaces other than
init_user_ns, fuse must translate uids and gids to/from the
userns of the process servicing requests on /dev/fuse. This
patch does that, with a couple of restrictions on the namespace:

 - The userns for the fuse connection is fixed to the namespace
   from which /dev/fuse is opened.

 - The namespace must be the same as s_user_ns.

These restrictions simplify the implementation by avoiding the
need to pass around userns references and by allowing fuse to
rely on the checks in inode_change_ok for ownership changes.
Either restriction could be relaxed in the future if needed.

For cuse the namespace used for the connection is also simply
current_user_ns() at the time /dev/cuse is opened.

Patch v4 is available: https://patchwork.kernel.org/patch/8944661/

Cc: linux-fsdevel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: Dongsu Park <dongsu@kinvolk.io>
---
 fs/fuse/cuse.c   |  3 ++-
 fs/fuse/dev.c    | 11 ++++++++---
 fs/fuse/dir.c    | 14 +++++++-------
 fs/fuse/fuse_i.h |  6 +++++-
 fs/fuse/inode.c  | 31 +++++++++++++++++++------------
 5 files changed, 41 insertions(+), 24 deletions(-)

Patch hide | download patch | download mbox

diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index e9e97803..b1b83259 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -48,6 +48,7 @@ 
 #include <linux/stat.h>
 #include <linux/module.h>
 #include <linux/uio.h>
+#include <linux/user_namespace.h>
 
 #include "fuse_i.h"
 
@@ -498,7 +499,7 @@  static int cuse_channel_open(struct inode *inode, struct file *file)
 	if (!cc)
 		return -ENOMEM;
 
-	fuse_conn_init(&cc->fc);
+	fuse_conn_init(&cc->fc, current_user_ns());
 
 	fud = fuse_dev_alloc(&cc->fc);
 	if (!fud) {
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 17f0d05b..0f780e16 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -114,8 +114,8 @@  static void __fuse_put_request(struct fuse_req *req)
 
 static void fuse_req_init_context(struct fuse_conn *fc, struct fuse_req *req)
 {
-	req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid());
-	req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid());
+	req->in.h.uid = from_kuid(fc->user_ns, current_fsuid());
+	req->in.h.gid = from_kgid(fc->user_ns, current_fsgid());
 	req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
 }
 
@@ -167,6 +167,10 @@  static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
 	__set_bit(FR_WAITING, &req->flags);
 	if (for_background)
 		__set_bit(FR_BACKGROUND, &req->flags);
+	if (req->in.h.uid == (uid_t)-1 || req->in.h.gid == (gid_t)-1) {
+		fuse_put_request(fc, req);
+		return ERR_PTR(-EOVERFLOW);
+	}
 
 	return req;
 
@@ -1260,7 +1264,8 @@  static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
 	in = &req->in;
 	reqsize = in->h.len;
 
-	if (task_active_pid_ns(current) != fc->pid_ns) {
+	if (task_active_pid_ns(current) != fc->pid_ns ||
+	    current_user_ns() != fc->user_ns) {
 		rcu_read_lock();
 		in->h.pid = pid_vnr(find_pid_ns(in->h.pid, fc->pid_ns));
 		rcu_read_unlock();
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 24967382..ad1cfac1 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -858,8 +858,8 @@  static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
 	stat->ino = attr->ino;
 	stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
 	stat->nlink = attr->nlink;
-	stat->uid = make_kuid(&init_user_ns, attr->uid);
-	stat->gid = make_kgid(&init_user_ns, attr->gid);
+	stat->uid = make_kuid(fc->user_ns, attr->uid);
+	stat->gid = make_kgid(fc->user_ns, attr->gid);
 	stat->rdev = inode->i_rdev;
 	stat->atime.tv_sec = attr->atime;
 	stat->atime.tv_nsec = attr->atimensec;
@@ -1475,17 +1475,17 @@  static bool update_mtime(unsigned ivalid, bool trust_local_mtime)
 	return true;
 }
 
-static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg,
-			   bool trust_local_cmtime)
+static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr,
+			   struct fuse_setattr_in *arg, bool trust_local_cmtime)
 {
 	unsigned ivalid = iattr->ia_valid;
 
 	if (ivalid & ATTR_MODE)
 		arg->valid |= FATTR_MODE,   arg->mode = iattr->ia_mode;
 	if (ivalid & ATTR_UID)
-		arg->valid |= FATTR_UID,    arg->uid = from_kuid(&init_user_ns, iattr->ia_uid);
+		arg->valid |= FATTR_UID,    arg->uid = from_kuid(fc->user_ns, iattr->ia_uid);
 	if (ivalid & ATTR_GID)
-		arg->valid |= FATTR_GID,    arg->gid = from_kgid(&init_user_ns, iattr->ia_gid);
+		arg->valid |= FATTR_GID,    arg->gid = from_kgid(fc->user_ns, iattr->ia_gid);
 	if (ivalid & ATTR_SIZE)
 		arg->valid |= FATTR_SIZE,   arg->size = iattr->ia_size;
 	if (ivalid & ATTR_ATIME) {
@@ -1646,7 +1646,7 @@  int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
 
 	memset(&inarg, 0, sizeof(inarg));
 	memset(&outarg, 0, sizeof(outarg));
-	iattr_to_fattr(attr, &inarg, trust_local_cmtime);
+	iattr_to_fattr(fc, attr, &inarg, trust_local_cmtime);
 	if (file) {
 		struct fuse_file *ff = file->private_data;
 		inarg.valid |= FATTR_FH;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index d5773ca6..364e65c8 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -26,6 +26,7 @@ 
 #include <linux/xattr.h>
 #include <linux/pid_namespace.h>
 #include <linux/refcount.h>
+#include <linux/user_namespace.h>
 
 /** Max number of pages that can be used in a single read request */
 #define FUSE_MAX_PAGES_PER_REQ 32
@@ -466,6 +467,9 @@  struct fuse_conn {
 	/** The pid namespace for this mount */
 	struct pid_namespace *pid_ns;
 
+	/** The user namespace for this mount */
+	struct user_namespace *user_ns;
+
 	/** Maximum read size */
 	unsigned max_read;
 
@@ -870,7 +874,7 @@  struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
 /**
  * Initialize fuse_conn
  */
-void fuse_conn_init(struct fuse_conn *fc);
+void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns);
 
 /**
  * Release reference to fuse_conn
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2f504d61..7f6b2e55 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -171,8 +171,8 @@  void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
 	inode->i_ino     = fuse_squash_ino(attr->ino);
 	inode->i_mode    = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
 	set_nlink(inode, attr->nlink);
-	inode->i_uid     = make_kuid(&init_user_ns, attr->uid);
-	inode->i_gid     = make_kgid(&init_user_ns, attr->gid);
+	inode->i_uid     = make_kuid(fc->user_ns, attr->uid);
+	inode->i_gid     = make_kgid(fc->user_ns, attr->gid);
 	inode->i_blocks  = attr->blocks;
 	inode->i_atime.tv_sec   = attr->atime;
 	inode->i_atime.tv_nsec  = attr->atimensec;
@@ -477,7 +477,8 @@  static int fuse_match_uint(substring_t *s, unsigned int *res)
 	return err;
 }
 
-static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
+static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev,
+			  struct user_namespace *user_ns)
 {
 	char *p;
 	memset(d, 0, sizeof(struct fuse_mount_data));
@@ -513,7 +514,7 @@  static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
 		case OPT_USER_ID:
 			if (fuse_match_uint(&args[0], &uv))
 				return 0;
-			d->user_id = make_kuid(current_user_ns(), uv);
+			d->user_id = make_kuid(user_ns, uv);
 			if (!uid_valid(d->user_id))
 				return 0;
 			d->user_id_present = 1;
@@ -522,7 +523,7 @@  static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
 		case OPT_GROUP_ID:
 			if (fuse_match_uint(&args[0], &uv))
 				return 0;
-			d->group_id = make_kgid(current_user_ns(), uv);
+			d->group_id = make_kgid(user_ns, uv);
 			if (!gid_valid(d->group_id))
 				return 0;
 			d->group_id_present = 1;
@@ -565,8 +566,8 @@  static int fuse_show_options(struct seq_file *m, struct dentry *root)
 	struct super_block *sb = root->d_sb;
 	struct fuse_conn *fc = get_fuse_conn_super(sb);
 
-	seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id));
-	seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id));
+	seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id));
+	seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id));
 	if (fc->default_permissions)
 		seq_puts(m, ",default_permissions");
 	if (fc->allow_other)
@@ -597,7 +598,7 @@  static void fuse_pqueue_init(struct fuse_pqueue *fpq)
 	fpq->connected = 1;
 }
 
-void fuse_conn_init(struct fuse_conn *fc)
+void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns)
 {
 	memset(fc, 0, sizeof(*fc));
 	spin_lock_init(&fc->lock);
@@ -621,6 +622,7 @@  void fuse_conn_init(struct fuse_conn *fc)
 	fc->attr_version = 1;
 	get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
 	fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
+	fc->user_ns = get_user_ns(user_ns);
 }
 EXPORT_SYMBOL_GPL(fuse_conn_init);
 
@@ -630,6 +632,7 @@  void fuse_conn_put(struct fuse_conn *fc)
 		if (fc->destroy_req)
 			fuse_request_free(fc->destroy_req);
 		put_pid_ns(fc->pid_ns);
+		put_user_ns(fc->user_ns);
 		fc->release(fc);
 	}
 }
@@ -1061,7 +1064,7 @@  static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 
 	sb->s_flags &= ~(MS_NOSEC | SB_I_VERSION);
 
-	if (!parse_fuse_opt(data, &d, is_bdev))
+	if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns))
 		goto err;
 
 	if (is_bdev) {
@@ -1086,8 +1089,12 @@  static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (!file)
 		goto err;
 
-	if ((file->f_op != &fuse_dev_operations) ||
-	    (file->f_cred->user_ns != &init_user_ns))
+	/*
+	 * Require mount to happen from the same user namespace which
+	 * opened /dev/fuse to prevent potential attacks.
+	 */
+	if (file->f_op != &fuse_dev_operations ||
+	    file->f_cred->user_ns != sb->s_user_ns)
 		goto err_fput;
 
 	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
@@ -1095,7 +1102,7 @@  static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (!fc)
 		goto err_fput;
 
-	fuse_conn_init(fc);
+	fuse_conn_init(fc, sb->s_user_ns);
 	fc->release = fuse_free_conn;
 
 	fud = fuse_dev_alloc(fc);

Comments

Serge E. Hallyn Dec. 23, 2017, 3:46 a.m.
On Fri, Dec 22, 2017 at 03:32:32PM +0100, Dongsu Park wrote:
> From: Seth Forshee <seth.forshee@canonical.com>
> 
> In order to support mounts from namespaces other than
> init_user_ns, fuse must translate uids and gids to/from the
> userns of the process servicing requests on /dev/fuse. This
> patch does that, with a couple of restrictions on the namespace:
> 
>  - The userns for the fuse connection is fixed to the namespace
>    from which /dev/fuse is opened.
> 
>  - The namespace must be the same as s_user_ns.
> 
> These restrictions simplify the implementation by avoiding the
> need to pass around userns references and by allowing fuse to
> rely on the checks in inode_change_ok for ownership changes.
> Either restriction could be relaxed in the future if needed.
> 
> For cuse the namespace used for the connection is also simply
> current_user_ns() at the time /dev/cuse is opened.
> 
> Patch v4 is available: https://patchwork.kernel.org/patch/8944661/
> 
> Cc: linux-fsdevel@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Cc: Miklos Szeredi <mszeredi@redhat.com>
> Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
> Signed-off-by: Dongsu Park <dongsu@kinvolk.io>

Acked-by: Serge Hallyn <serge@hallyn.com>

> ---
>  fs/fuse/cuse.c   |  3 ++-
>  fs/fuse/dev.c    | 11 ++++++++---
>  fs/fuse/dir.c    | 14 +++++++-------
>  fs/fuse/fuse_i.h |  6 +++++-
>  fs/fuse/inode.c  | 31 +++++++++++++++++++------------
>  5 files changed, 41 insertions(+), 24 deletions(-)
> 
> diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
> index e9e97803..b1b83259 100644
> --- a/fs/fuse/cuse.c
> +++ b/fs/fuse/cuse.c
> @@ -48,6 +48,7 @@
>  #include <linux/stat.h>
>  #include <linux/module.h>
>  #include <linux/uio.h>
> +#include <linux/user_namespace.h>
>  
>  #include "fuse_i.h"
>  
> @@ -498,7 +499,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
>  	if (!cc)
>  		return -ENOMEM;
>  
> -	fuse_conn_init(&cc->fc);
> +	fuse_conn_init(&cc->fc, current_user_ns());
>  
>  	fud = fuse_dev_alloc(&cc->fc);
>  	if (!fud) {
> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
> index 17f0d05b..0f780e16 100644
> --- a/fs/fuse/dev.c
> +++ b/fs/fuse/dev.c
> @@ -114,8 +114,8 @@ static void __fuse_put_request(struct fuse_req *req)
>  
>  static void fuse_req_init_context(struct fuse_conn *fc, struct fuse_req *req)
>  {
> -	req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid());
> -	req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid());
> +	req->in.h.uid = from_kuid(fc->user_ns, current_fsuid());
> +	req->in.h.gid = from_kgid(fc->user_ns, current_fsgid());
>  	req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
>  }
>  
> @@ -167,6 +167,10 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
>  	__set_bit(FR_WAITING, &req->flags);
>  	if (for_background)
>  		__set_bit(FR_BACKGROUND, &req->flags);
> +	if (req->in.h.uid == (uid_t)-1 || req->in.h.gid == (gid_t)-1) {
> +		fuse_put_request(fc, req);
> +		return ERR_PTR(-EOVERFLOW);
> +	}
>  
>  	return req;
>  
> @@ -1260,7 +1264,8 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
>  	in = &req->in;
>  	reqsize = in->h.len;
>  
> -	if (task_active_pid_ns(current) != fc->pid_ns) {
> +	if (task_active_pid_ns(current) != fc->pid_ns ||
> +	    current_user_ns() != fc->user_ns) {
>  		rcu_read_lock();
>  		in->h.pid = pid_vnr(find_pid_ns(in->h.pid, fc->pid_ns));
>  		rcu_read_unlock();
> diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
> index 24967382..ad1cfac1 100644
> --- a/fs/fuse/dir.c
> +++ b/fs/fuse/dir.c
> @@ -858,8 +858,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
>  	stat->ino = attr->ino;
>  	stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
>  	stat->nlink = attr->nlink;
> -	stat->uid = make_kuid(&init_user_ns, attr->uid);
> -	stat->gid = make_kgid(&init_user_ns, attr->gid);
> +	stat->uid = make_kuid(fc->user_ns, attr->uid);
> +	stat->gid = make_kgid(fc->user_ns, attr->gid);
>  	stat->rdev = inode->i_rdev;
>  	stat->atime.tv_sec = attr->atime;
>  	stat->atime.tv_nsec = attr->atimensec;
> @@ -1475,17 +1475,17 @@ static bool update_mtime(unsigned ivalid, bool trust_local_mtime)
>  	return true;
>  }
>  
> -static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg,
> -			   bool trust_local_cmtime)
> +static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr,
> +			   struct fuse_setattr_in *arg, bool trust_local_cmtime)
>  {
>  	unsigned ivalid = iattr->ia_valid;
>  
>  	if (ivalid & ATTR_MODE)
>  		arg->valid |= FATTR_MODE,   arg->mode = iattr->ia_mode;
>  	if (ivalid & ATTR_UID)
> -		arg->valid |= FATTR_UID,    arg->uid = from_kuid(&init_user_ns, iattr->ia_uid);
> +		arg->valid |= FATTR_UID,    arg->uid = from_kuid(fc->user_ns, iattr->ia_uid);
>  	if (ivalid & ATTR_GID)
> -		arg->valid |= FATTR_GID,    arg->gid = from_kgid(&init_user_ns, iattr->ia_gid);
> +		arg->valid |= FATTR_GID,    arg->gid = from_kgid(fc->user_ns, iattr->ia_gid);
>  	if (ivalid & ATTR_SIZE)
>  		arg->valid |= FATTR_SIZE,   arg->size = iattr->ia_size;
>  	if (ivalid & ATTR_ATIME) {
> @@ -1646,7 +1646,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
>  
>  	memset(&inarg, 0, sizeof(inarg));
>  	memset(&outarg, 0, sizeof(outarg));
> -	iattr_to_fattr(attr, &inarg, trust_local_cmtime);
> +	iattr_to_fattr(fc, attr, &inarg, trust_local_cmtime);
>  	if (file) {
>  		struct fuse_file *ff = file->private_data;
>  		inarg.valid |= FATTR_FH;
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index d5773ca6..364e65c8 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -26,6 +26,7 @@
>  #include <linux/xattr.h>
>  #include <linux/pid_namespace.h>
>  #include <linux/refcount.h>
> +#include <linux/user_namespace.h>
>  
>  /** Max number of pages that can be used in a single read request */
>  #define FUSE_MAX_PAGES_PER_REQ 32
> @@ -466,6 +467,9 @@ struct fuse_conn {
>  	/** The pid namespace for this mount */
>  	struct pid_namespace *pid_ns;
>  
> +	/** The user namespace for this mount */
> +	struct user_namespace *user_ns;
> +
>  	/** Maximum read size */
>  	unsigned max_read;
>  
> @@ -870,7 +874,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
>  /**
>   * Initialize fuse_conn
>   */
> -void fuse_conn_init(struct fuse_conn *fc);
> +void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns);
>  
>  /**
>   * Release reference to fuse_conn
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index 2f504d61..7f6b2e55 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -171,8 +171,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
>  	inode->i_ino     = fuse_squash_ino(attr->ino);
>  	inode->i_mode    = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
>  	set_nlink(inode, attr->nlink);
> -	inode->i_uid     = make_kuid(&init_user_ns, attr->uid);
> -	inode->i_gid     = make_kgid(&init_user_ns, attr->gid);
> +	inode->i_uid     = make_kuid(fc->user_ns, attr->uid);
> +	inode->i_gid     = make_kgid(fc->user_ns, attr->gid);
>  	inode->i_blocks  = attr->blocks;
>  	inode->i_atime.tv_sec   = attr->atime;
>  	inode->i_atime.tv_nsec  = attr->atimensec;
> @@ -477,7 +477,8 @@ static int fuse_match_uint(substring_t *s, unsigned int *res)
>  	return err;
>  }
>  
> -static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
> +static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev,
> +			  struct user_namespace *user_ns)
>  {
>  	char *p;
>  	memset(d, 0, sizeof(struct fuse_mount_data));
> @@ -513,7 +514,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
>  		case OPT_USER_ID:
>  			if (fuse_match_uint(&args[0], &uv))
>  				return 0;
> -			d->user_id = make_kuid(current_user_ns(), uv);
> +			d->user_id = make_kuid(user_ns, uv);
>  			if (!uid_valid(d->user_id))
>  				return 0;
>  			d->user_id_present = 1;
> @@ -522,7 +523,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
>  		case OPT_GROUP_ID:
>  			if (fuse_match_uint(&args[0], &uv))
>  				return 0;
> -			d->group_id = make_kgid(current_user_ns(), uv);
> +			d->group_id = make_kgid(user_ns, uv);
>  			if (!gid_valid(d->group_id))
>  				return 0;
>  			d->group_id_present = 1;
> @@ -565,8 +566,8 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
>  	struct super_block *sb = root->d_sb;
>  	struct fuse_conn *fc = get_fuse_conn_super(sb);
>  
> -	seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id));
> -	seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id));
> +	seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id));
> +	seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id));
>  	if (fc->default_permissions)
>  		seq_puts(m, ",default_permissions");
>  	if (fc->allow_other)
> @@ -597,7 +598,7 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq)
>  	fpq->connected = 1;
>  }
>  
> -void fuse_conn_init(struct fuse_conn *fc)
> +void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns)
>  {
>  	memset(fc, 0, sizeof(*fc));
>  	spin_lock_init(&fc->lock);
> @@ -621,6 +622,7 @@ void fuse_conn_init(struct fuse_conn *fc)
>  	fc->attr_version = 1;
>  	get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
>  	fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
> +	fc->user_ns = get_user_ns(user_ns);
>  }
>  EXPORT_SYMBOL_GPL(fuse_conn_init);
>  
> @@ -630,6 +632,7 @@ void fuse_conn_put(struct fuse_conn *fc)
>  		if (fc->destroy_req)
>  			fuse_request_free(fc->destroy_req);
>  		put_pid_ns(fc->pid_ns);
> +		put_user_ns(fc->user_ns);
>  		fc->release(fc);
>  	}
>  }
> @@ -1061,7 +1064,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
>  
>  	sb->s_flags &= ~(MS_NOSEC | SB_I_VERSION);
>  
> -	if (!parse_fuse_opt(data, &d, is_bdev))
> +	if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns))
>  		goto err;
>  
>  	if (is_bdev) {
> @@ -1086,8 +1089,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
>  	if (!file)
>  		goto err;
>  
> -	if ((file->f_op != &fuse_dev_operations) ||
> -	    (file->f_cred->user_ns != &init_user_ns))
> +	/*
> +	 * Require mount to happen from the same user namespace which
> +	 * opened /dev/fuse to prevent potential attacks.
> +	 */
> +	if (file->f_op != &fuse_dev_operations ||
> +	    file->f_cred->user_ns != sb->s_user_ns)
>  		goto err_fput;
>  
>  	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
> @@ -1095,7 +1102,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
>  	if (!fc)
>  		goto err_fput;
>  
> -	fuse_conn_init(fc);
> +	fuse_conn_init(fc, sb->s_user_ns);
>  	fc->release = fuse_free_conn;
>  
>  	fud = fuse_dev_alloc(fc);
> -- 
> 2.13.6
> 
> _______________________________________________
> Containers mailing list
> Containers@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/containers
Alban Crequy Jan. 17, 2018, 10:59 a.m.
[Adding Tejun, David, Tom for question about cuse]

On Fri, Dec 22, 2017 at 3:32 PM, Dongsu Park <dongsu@kinvolk.io> wrote:
> From: Seth Forshee <seth.forshee@canonical.com>
>
> In order to support mounts from namespaces other than
> init_user_ns, fuse must translate uids and gids to/from the
> userns of the process servicing requests on /dev/fuse. This
> patch does that, with a couple of restrictions on the namespace:
>
>  - The userns for the fuse connection is fixed to the namespace
>    from which /dev/fuse is opened.
>
>  - The namespace must be the same as s_user_ns.
>
> These restrictions simplify the implementation by avoiding the
> need to pass around userns references and by allowing fuse to
> rely on the checks in inode_change_ok for ownership changes.
> Either restriction could be relaxed in the future if needed.
>
> For cuse the namespace used for the connection is also simply
> current_user_ns() at the time /dev/cuse is opened.

Was a use case discussed for using cuse in a new unprivileged userns?

I ran some tests yesterday with cusexmp [1] and I could add a new char
device as an unprivileged user with:

$ unshare -U -r -m sh -c 'mount --bind /mnt/cuse /dev/cuse ; cusexmp
--maj=99 --min=30 --name=foo

where /mnt/cuse is previously mknod'ed correctly and chmod'ed 777.
Then, I could see the new device:

$ cat /proc/devices | grep foo
 99 foo

On normal distros, we don't have a /mnt/cuse chmod'ed 777 but still it
seems dangerous if the dev node can be provided otherwise and if we
don't have a use case for it.

Thoughts?

[1] https://github.com/fuse4x/fuse/blob/master/example/cusexmp.c#L9

Cheers,
Alban


> Patch v4 is available: https://patchwork.kernel.org/patch/8944661/
>
> Cc: linux-fsdevel@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Cc: Miklos Szeredi <mszeredi@redhat.com>
> Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
> Signed-off-by: Dongsu Park <dongsu@kinvolk.io>
> ---
>  fs/fuse/cuse.c   |  3 ++-
>  fs/fuse/dev.c    | 11 ++++++++---
>  fs/fuse/dir.c    | 14 +++++++-------
>  fs/fuse/fuse_i.h |  6 +++++-
>  fs/fuse/inode.c  | 31 +++++++++++++++++++------------
>  5 files changed, 41 insertions(+), 24 deletions(-)
>
> diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
> index e9e97803..b1b83259 100644
> --- a/fs/fuse/cuse.c
> +++ b/fs/fuse/cuse.c
> @@ -48,6 +48,7 @@
>  #include <linux/stat.h>
>  #include <linux/module.h>
>  #include <linux/uio.h>
> +#include <linux/user_namespace.h>
>
>  #include "fuse_i.h"
>
> @@ -498,7 +499,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
>         if (!cc)
>                 return -ENOMEM;
>
> -       fuse_conn_init(&cc->fc);
> +       fuse_conn_init(&cc->fc, current_user_ns());
>
>         fud = fuse_dev_alloc(&cc->fc);
>         if (!fud) {
> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
> index 17f0d05b..0f780e16 100644
> --- a/fs/fuse/dev.c
> +++ b/fs/fuse/dev.c
> @@ -114,8 +114,8 @@ static void __fuse_put_request(struct fuse_req *req)
>
>  static void fuse_req_init_context(struct fuse_conn *fc, struct fuse_req *req)
>  {
> -       req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid());
> -       req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid());
> +       req->in.h.uid = from_kuid(fc->user_ns, current_fsuid());
> +       req->in.h.gid = from_kgid(fc->user_ns, current_fsgid());
>         req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
>  }
>
> @@ -167,6 +167,10 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
>         __set_bit(FR_WAITING, &req->flags);
>         if (for_background)
>                 __set_bit(FR_BACKGROUND, &req->flags);
> +       if (req->in.h.uid == (uid_t)-1 || req->in.h.gid == (gid_t)-1) {
> +               fuse_put_request(fc, req);
> +               return ERR_PTR(-EOVERFLOW);
> +       }
>
>         return req;
>
> @@ -1260,7 +1264,8 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
>         in = &req->in;
>         reqsize = in->h.len;
>
> -       if (task_active_pid_ns(current) != fc->pid_ns) {
> +       if (task_active_pid_ns(current) != fc->pid_ns ||
> +           current_user_ns() != fc->user_ns) {
>                 rcu_read_lock();
>                 in->h.pid = pid_vnr(find_pid_ns(in->h.pid, fc->pid_ns));
>                 rcu_read_unlock();
> diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
> index 24967382..ad1cfac1 100644
> --- a/fs/fuse/dir.c
> +++ b/fs/fuse/dir.c
> @@ -858,8 +858,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
>         stat->ino = attr->ino;
>         stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
>         stat->nlink = attr->nlink;
> -       stat->uid = make_kuid(&init_user_ns, attr->uid);
> -       stat->gid = make_kgid(&init_user_ns, attr->gid);
> +       stat->uid = make_kuid(fc->user_ns, attr->uid);
> +       stat->gid = make_kgid(fc->user_ns, attr->gid);
>         stat->rdev = inode->i_rdev;
>         stat->atime.tv_sec = attr->atime;
>         stat->atime.tv_nsec = attr->atimensec;
> @@ -1475,17 +1475,17 @@ static bool update_mtime(unsigned ivalid, bool trust_local_mtime)
>         return true;
>  }
>
> -static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg,
> -                          bool trust_local_cmtime)
> +static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr,
> +                          struct fuse_setattr_in *arg, bool trust_local_cmtime)
>  {
>         unsigned ivalid = iattr->ia_valid;
>
>         if (ivalid & ATTR_MODE)
>                 arg->valid |= FATTR_MODE,   arg->mode = iattr->ia_mode;
>         if (ivalid & ATTR_UID)
> -               arg->valid |= FATTR_UID,    arg->uid = from_kuid(&init_user_ns, iattr->ia_uid);
> +               arg->valid |= FATTR_UID,    arg->uid = from_kuid(fc->user_ns, iattr->ia_uid);
>         if (ivalid & ATTR_GID)
> -               arg->valid |= FATTR_GID,    arg->gid = from_kgid(&init_user_ns, iattr->ia_gid);
> +               arg->valid |= FATTR_GID,    arg->gid = from_kgid(fc->user_ns, iattr->ia_gid);
>         if (ivalid & ATTR_SIZE)
>                 arg->valid |= FATTR_SIZE,   arg->size = iattr->ia_size;
>         if (ivalid & ATTR_ATIME) {
> @@ -1646,7 +1646,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
>
>         memset(&inarg, 0, sizeof(inarg));
>         memset(&outarg, 0, sizeof(outarg));
> -       iattr_to_fattr(attr, &inarg, trust_local_cmtime);
> +       iattr_to_fattr(fc, attr, &inarg, trust_local_cmtime);
>         if (file) {
>                 struct fuse_file *ff = file->private_data;
>                 inarg.valid |= FATTR_FH;
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index d5773ca6..364e65c8 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -26,6 +26,7 @@
>  #include <linux/xattr.h>
>  #include <linux/pid_namespace.h>
>  #include <linux/refcount.h>
> +#include <linux/user_namespace.h>
>
>  /** Max number of pages that can be used in a single read request */
>  #define FUSE_MAX_PAGES_PER_REQ 32
> @@ -466,6 +467,9 @@ struct fuse_conn {
>         /** The pid namespace for this mount */
>         struct pid_namespace *pid_ns;
>
> +       /** The user namespace for this mount */
> +       struct user_namespace *user_ns;
> +
>         /** Maximum read size */
>         unsigned max_read;
>
> @@ -870,7 +874,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
>  /**
>   * Initialize fuse_conn
>   */
> -void fuse_conn_init(struct fuse_conn *fc);
> +void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns);
>
>  /**
>   * Release reference to fuse_conn
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index 2f504d61..7f6b2e55 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -171,8 +171,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
>         inode->i_ino     = fuse_squash_ino(attr->ino);
>         inode->i_mode    = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
>         set_nlink(inode, attr->nlink);
> -       inode->i_uid     = make_kuid(&init_user_ns, attr->uid);
> -       inode->i_gid     = make_kgid(&init_user_ns, attr->gid);
> +       inode->i_uid     = make_kuid(fc->user_ns, attr->uid);
> +       inode->i_gid     = make_kgid(fc->user_ns, attr->gid);
>         inode->i_blocks  = attr->blocks;
>         inode->i_atime.tv_sec   = attr->atime;
>         inode->i_atime.tv_nsec  = attr->atimensec;
> @@ -477,7 +477,8 @@ static int fuse_match_uint(substring_t *s, unsigned int *res)
>         return err;
>  }
>
> -static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
> +static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev,
> +                         struct user_namespace *user_ns)
>  {
>         char *p;
>         memset(d, 0, sizeof(struct fuse_mount_data));
> @@ -513,7 +514,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
>                 case OPT_USER_ID:
>                         if (fuse_match_uint(&args[0], &uv))
>                                 return 0;
> -                       d->user_id = make_kuid(current_user_ns(), uv);
> +                       d->user_id = make_kuid(user_ns, uv);
>                         if (!uid_valid(d->user_id))
>                                 return 0;
>                         d->user_id_present = 1;
> @@ -522,7 +523,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
>                 case OPT_GROUP_ID:
>                         if (fuse_match_uint(&args[0], &uv))
>                                 return 0;
> -                       d->group_id = make_kgid(current_user_ns(), uv);
> +                       d->group_id = make_kgid(user_ns, uv);
>                         if (!gid_valid(d->group_id))
>                                 return 0;
>                         d->group_id_present = 1;
> @@ -565,8 +566,8 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
>         struct super_block *sb = root->d_sb;
>         struct fuse_conn *fc = get_fuse_conn_super(sb);
>
> -       seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id));
> -       seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id));
> +       seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id));
> +       seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id));
>         if (fc->default_permissions)
>                 seq_puts(m, ",default_permissions");
>         if (fc->allow_other)
> @@ -597,7 +598,7 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq)
>         fpq->connected = 1;
>  }
>
> -void fuse_conn_init(struct fuse_conn *fc)
> +void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns)
>  {
>         memset(fc, 0, sizeof(*fc));
>         spin_lock_init(&fc->lock);
> @@ -621,6 +622,7 @@ void fuse_conn_init(struct fuse_conn *fc)
>         fc->attr_version = 1;
>         get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
>         fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
> +       fc->user_ns = get_user_ns(user_ns);
>  }
>  EXPORT_SYMBOL_GPL(fuse_conn_init);
>
> @@ -630,6 +632,7 @@ void fuse_conn_put(struct fuse_conn *fc)
>                 if (fc->destroy_req)
>                         fuse_request_free(fc->destroy_req);
>                 put_pid_ns(fc->pid_ns);
> +               put_user_ns(fc->user_ns);
>                 fc->release(fc);
>         }
>  }
> @@ -1061,7 +1064,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
>
>         sb->s_flags &= ~(MS_NOSEC | SB_I_VERSION);
>
> -       if (!parse_fuse_opt(data, &d, is_bdev))
> +       if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns))
>                 goto err;
>
>         if (is_bdev) {
> @@ -1086,8 +1089,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
>         if (!file)
>                 goto err;
>
> -       if ((file->f_op != &fuse_dev_operations) ||
> -           (file->f_cred->user_ns != &init_user_ns))
> +       /*
> +        * Require mount to happen from the same user namespace which
> +        * opened /dev/fuse to prevent potential attacks.
> +        */
> +       if (file->f_op != &fuse_dev_operations ||
> +           file->f_cred->user_ns != sb->s_user_ns)
>                 goto err_fput;
>
>         fc = kmalloc(sizeof(*fc), GFP_KERNEL);
> @@ -1095,7 +1102,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
>         if (!fc)
>                 goto err_fput;
>
> -       fuse_conn_init(fc);
> +       fuse_conn_init(fc, sb->s_user_ns);
>         fc->release = fuse_free_conn;
>
>         fud = fuse_dev_alloc(fc);
> --
> 2.13.6
>
Seth Forshee Jan. 17, 2018, 2:29 p.m.
On Wed, Jan 17, 2018 at 11:59:06AM +0100, Alban Crequy wrote:
> [Adding Tejun, David, Tom for question about cuse]
> 
> On Fri, Dec 22, 2017 at 3:32 PM, Dongsu Park <dongsu@kinvolk.io> wrote:
> > From: Seth Forshee <seth.forshee@canonical.com>
> >
> > In order to support mounts from namespaces other than
> > init_user_ns, fuse must translate uids and gids to/from the
> > userns of the process servicing requests on /dev/fuse. This
> > patch does that, with a couple of restrictions on the namespace:
> >
> >  - The userns for the fuse connection is fixed to the namespace
> >    from which /dev/fuse is opened.
> >
> >  - The namespace must be the same as s_user_ns.
> >
> > These restrictions simplify the implementation by avoiding the
> > need to pass around userns references and by allowing fuse to
> > rely on the checks in inode_change_ok for ownership changes.
> > Either restriction could be relaxed in the future if needed.
> >
> > For cuse the namespace used for the connection is also simply
> > current_user_ns() at the time /dev/cuse is opened.
> 
> Was a use case discussed for using cuse in a new unprivileged userns?
> 
> I ran some tests yesterday with cusexmp [1] and I could add a new char
> device as an unprivileged user with:
> 
> $ unshare -U -r -m sh -c 'mount --bind /mnt/cuse /dev/cuse ; cusexmp
> --maj=99 --min=30 --name=foo
> 
> where /mnt/cuse is previously mknod'ed correctly and chmod'ed 777.
> Then, I could see the new device:
> 
> $ cat /proc/devices | grep foo
>  99 foo
> 
> On normal distros, we don't have a /mnt/cuse chmod'ed 777 but still it
> seems dangerous if the dev node can be provided otherwise and if we
> don't have a use case for it.
> 
> Thoughts?

I can't remember the specific reasons, but I had concluded that letting
unprivileged users use cuse within a user namespace isn't safe. But
having a cuse device node usable by regular users at all is equally
unsafe I suspect, so I don't think your example demonstrates any problem
specific to user namespaces. There shouldn't be any way to use a user
namespace to gain access permissions towards /dev/cuse, otherwise we
have bigger problems than cuse to worry about.

Seth
Alban Crequy Jan. 17, 2018, 6:56 p.m.
On Wed, Jan 17, 2018 at 3:29 PM, Seth Forshee
<seth.forshee@canonical.com> wrote:
> On Wed, Jan 17, 2018 at 11:59:06AM +0100, Alban Crequy wrote:
>> [Adding Tejun, David, Tom for question about cuse]
>>
>> On Fri, Dec 22, 2017 at 3:32 PM, Dongsu Park <dongsu@kinvolk.io> wrote:
>> > From: Seth Forshee <seth.forshee@canonical.com>
>> >
>> > In order to support mounts from namespaces other than
>> > init_user_ns, fuse must translate uids and gids to/from the
>> > userns of the process servicing requests on /dev/fuse. This
>> > patch does that, with a couple of restrictions on the namespace:
>> >
>> >  - The userns for the fuse connection is fixed to the namespace
>> >    from which /dev/fuse is opened.
>> >
>> >  - The namespace must be the same as s_user_ns.
>> >
>> > These restrictions simplify the implementation by avoiding the
>> > need to pass around userns references and by allowing fuse to
>> > rely on the checks in inode_change_ok for ownership changes.
>> > Either restriction could be relaxed in the future if needed.
>> >
>> > For cuse the namespace used for the connection is also simply
>> > current_user_ns() at the time /dev/cuse is opened.
>>
>> Was a use case discussed for using cuse in a new unprivileged userns?
>>
>> I ran some tests yesterday with cusexmp [1] and I could add a new char
>> device as an unprivileged user with:
>>
>> $ unshare -U -r -m sh -c 'mount --bind /mnt/cuse /dev/cuse ; cusexmp
>> --maj=99 --min=30 --name=foo
>>
>> where /mnt/cuse is previously mknod'ed correctly and chmod'ed 777.
>> Then, I could see the new device:
>>
>> $ cat /proc/devices | grep foo
>>  99 foo
>>
>> On normal distros, we don't have a /mnt/cuse chmod'ed 777 but still it
>> seems dangerous if the dev node can be provided otherwise and if we
>> don't have a use case for it.
>>
>> Thoughts?
>
> I can't remember the specific reasons, but I had concluded that letting
> unprivileged users use cuse within a user namespace isn't safe. But
> having a cuse device node usable by regular users at all is equally
> unsafe I suspect,

This makes sense.

> so I don't think your example demonstrates any problem
> specific to user namespaces. There shouldn't be any way to use a user
> namespace to gain access permissions towards /dev/cuse, otherwise we
> have bigger problems than cuse to worry about.

From my tests, the patch seem safe but I don't fully understand why that is.

I am not trying to gain more permissions towards /dev/cuse but to
create another cuse char file from within the unprivileged userns. I
tested the scenario by patching the memfs userspace FUSE driver to
generate the char device whenever the file is named "cuse" (turning
the regular file into a char device with the cuse major/minor behind
the scene):

$ unshare -U -r -m
# memfs /mnt/memfs &
# ls -l /mnt/memfs
# echo -n > /mnt/memfs/cuse
-bash: /mnt/memfs/cuse: Input/output error
# ls -l /mnt/memfs/cuse
crwxrwxrwx. 1 root root 10, 203 Jan 17 18:24 /mnt/memfs/cuse
# cat /mnt/memfs/cuse
cat: /mnt/memfs/cuse: Permission denied

But then, I could not use that char device, even though it seems to
have the correct major/minor and permissions. The kernel FUSE code
seems to call init_special_inode() to handle character devices. I
don't understand why it seems to be safe.

Thanks!
Alban
Seth Forshee Jan. 17, 2018, 7:31 p.m.
On Wed, Jan 17, 2018 at 07:56:59PM +0100, Alban Crequy wrote:
> On Wed, Jan 17, 2018 at 3:29 PM, Seth Forshee
> <seth.forshee@canonical.com> wrote:
> > On Wed, Jan 17, 2018 at 11:59:06AM +0100, Alban Crequy wrote:
> >> [Adding Tejun, David, Tom for question about cuse]
> >>
> >> On Fri, Dec 22, 2017 at 3:32 PM, Dongsu Park <dongsu@kinvolk.io> wrote:
> >> > From: Seth Forshee <seth.forshee@canonical.com>
> >> >
> >> > In order to support mounts from namespaces other than
> >> > init_user_ns, fuse must translate uids and gids to/from the
> >> > userns of the process servicing requests on /dev/fuse. This
> >> > patch does that, with a couple of restrictions on the namespace:
> >> >
> >> >  - The userns for the fuse connection is fixed to the namespace
> >> >    from which /dev/fuse is opened.
> >> >
> >> >  - The namespace must be the same as s_user_ns.
> >> >
> >> > These restrictions simplify the implementation by avoiding the
> >> > need to pass around userns references and by allowing fuse to
> >> > rely on the checks in inode_change_ok for ownership changes.
> >> > Either restriction could be relaxed in the future if needed.
> >> >
> >> > For cuse the namespace used for the connection is also simply
> >> > current_user_ns() at the time /dev/cuse is opened.
> >>
> >> Was a use case discussed for using cuse in a new unprivileged userns?
> >>
> >> I ran some tests yesterday with cusexmp [1] and I could add a new char
> >> device as an unprivileged user with:
> >>
> >> $ unshare -U -r -m sh -c 'mount --bind /mnt/cuse /dev/cuse ; cusexmp
> >> --maj=99 --min=30 --name=foo
> >>
> >> where /mnt/cuse is previously mknod'ed correctly and chmod'ed 777.
> >> Then, I could see the new device:
> >>
> >> $ cat /proc/devices | grep foo
> >>  99 foo
> >>
> >> On normal distros, we don't have a /mnt/cuse chmod'ed 777 but still it
> >> seems dangerous if the dev node can be provided otherwise and if we
> >> don't have a use case for it.
> >>
> >> Thoughts?
> >
> > I can't remember the specific reasons, but I had concluded that letting
> > unprivileged users use cuse within a user namespace isn't safe. But
> > having a cuse device node usable by regular users at all is equally
> > unsafe I suspect,
> 
> This makes sense.
> 
> > so I don't think your example demonstrates any problem
> > specific to user namespaces. There shouldn't be any way to use a user
> > namespace to gain access permissions towards /dev/cuse, otherwise we
> > have bigger problems than cuse to worry about.
> 
> From my tests, the patch seem safe but I don't fully understand why that is.
> 
> I am not trying to gain more permissions towards /dev/cuse but to
> create another cuse char file from within the unprivileged userns. I
> tested the scenario by patching the memfs userspace FUSE driver to
> generate the char device whenever the file is named "cuse" (turning
> the regular file into a char device with the cuse major/minor behind
> the scene):
> 
> $ unshare -U -r -m
> # memfs /mnt/memfs &
> # ls -l /mnt/memfs
> # echo -n > /mnt/memfs/cuse
> -bash: /mnt/memfs/cuse: Input/output error
> # ls -l /mnt/memfs/cuse
> crwxrwxrwx. 1 root root 10, 203 Jan 17 18:24 /mnt/memfs/cuse
> # cat /mnt/memfs/cuse
> cat: /mnt/memfs/cuse: Permission denied
> 
> But then, I could not use that char device, even though it seems to
> have the correct major/minor and permissions. The kernel FUSE code
> seems to call init_special_inode() to handle character devices. I
> don't understand why it seems to be safe.

Because for new mounts in non-init user namespaces alloc_super() sets
SB_I_NODEV flag in s_iflags, which disallows opening device nodes in
that filesystem.

Seth
Alban Crequy Jan. 18, 2018, 10:29 a.m.
On Wed, Jan 17, 2018 at 8:31 PM, Seth Forshee
<seth.forshee@canonical.com> wrote:
> On Wed, Jan 17, 2018 at 07:56:59PM +0100, Alban Crequy wrote:
>> On Wed, Jan 17, 2018 at 3:29 PM, Seth Forshee
>> <seth.forshee@canonical.com> wrote:
>> > On Wed, Jan 17, 2018 at 11:59:06AM +0100, Alban Crequy wrote:
>> >> [Adding Tejun, David, Tom for question about cuse]
>> >>
>> >> On Fri, Dec 22, 2017 at 3:32 PM, Dongsu Park <dongsu@kinvolk.io> wrote:
>> >> > From: Seth Forshee <seth.forshee@canonical.com>
>> >> >
>> >> > In order to support mounts from namespaces other than
>> >> > init_user_ns, fuse must translate uids and gids to/from the
>> >> > userns of the process servicing requests on /dev/fuse. This
>> >> > patch does that, with a couple of restrictions on the namespace:
>> >> >
>> >> >  - The userns for the fuse connection is fixed to the namespace
>> >> >    from which /dev/fuse is opened.
>> >> >
>> >> >  - The namespace must be the same as s_user_ns.
>> >> >
>> >> > These restrictions simplify the implementation by avoiding the
>> >> > need to pass around userns references and by allowing fuse to
>> >> > rely on the checks in inode_change_ok for ownership changes.
>> >> > Either restriction could be relaxed in the future if needed.
>> >> >
>> >> > For cuse the namespace used for the connection is also simply
>> >> > current_user_ns() at the time /dev/cuse is opened.
>> >>
>> >> Was a use case discussed for using cuse in a new unprivileged userns?
>> >>
>> >> I ran some tests yesterday with cusexmp [1] and I could add a new char
>> >> device as an unprivileged user with:
>> >>
>> >> $ unshare -U -r -m sh -c 'mount --bind /mnt/cuse /dev/cuse ; cusexmp
>> >> --maj=99 --min=30 --name=foo
>> >>
>> >> where /mnt/cuse is previously mknod'ed correctly and chmod'ed 777.
>> >> Then, I could see the new device:
>> >>
>> >> $ cat /proc/devices | grep foo
>> >>  99 foo
>> >>
>> >> On normal distros, we don't have a /mnt/cuse chmod'ed 777 but still it
>> >> seems dangerous if the dev node can be provided otherwise and if we
>> >> don't have a use case for it.
>> >>
>> >> Thoughts?
>> >
>> > I can't remember the specific reasons, but I had concluded that letting
>> > unprivileged users use cuse within a user namespace isn't safe. But
>> > having a cuse device node usable by regular users at all is equally
>> > unsafe I suspect,
>>
>> This makes sense.
>>
>> > so I don't think your example demonstrates any problem
>> > specific to user namespaces. There shouldn't be any way to use a user
>> > namespace to gain access permissions towards /dev/cuse, otherwise we
>> > have bigger problems than cuse to worry about.
>>
>> From my tests, the patch seem safe but I don't fully understand why that is.
>>
>> I am not trying to gain more permissions towards /dev/cuse but to
>> create another cuse char file from within the unprivileged userns. I
>> tested the scenario by patching the memfs userspace FUSE driver to
>> generate the char device whenever the file is named "cuse" (turning
>> the regular file into a char device with the cuse major/minor behind
>> the scene):
>>
>> $ unshare -U -r -m
>> # memfs /mnt/memfs &
>> # ls -l /mnt/memfs
>> # echo -n > /mnt/memfs/cuse
>> -bash: /mnt/memfs/cuse: Input/output error
>> # ls -l /mnt/memfs/cuse
>> crwxrwxrwx. 1 root root 10, 203 Jan 17 18:24 /mnt/memfs/cuse
>> # cat /mnt/memfs/cuse
>> cat: /mnt/memfs/cuse: Permission denied
>>
>> But then, I could not use that char device, even though it seems to
>> have the correct major/minor and permissions. The kernel FUSE code
>> seems to call init_special_inode() to handle character devices. I
>> don't understand why it seems to be safe.
>
> Because for new mounts in non-init user namespaces alloc_super() sets
> SB_I_NODEV flag in s_iflags, which disallows opening device nodes in
> that filesystem.

I see. Thanks for the explanation!
Miklos Szeredi Feb. 12, 2018, 3:57 p.m.
On Fri, Dec 22, 2017 at 3:32 PM, Dongsu Park <dongsu@kinvolk.io> wrote:
> From: Seth Forshee <seth.forshee@canonical.com>
>
> In order to support mounts from namespaces other than
> init_user_ns, fuse must translate uids and gids to/from the
> userns of the process servicing requests on /dev/fuse. This
> patch does that, with a couple of restrictions on the namespace:
>
>  - The userns for the fuse connection is fixed to the namespace
>    from which /dev/fuse is opened.
>
>  - The namespace must be the same as s_user_ns.
>
> These restrictions simplify the implementation by avoiding the
> need to pass around userns references and by allowing fuse to
> rely on the checks in inode_change_ok for ownership changes.
> Either restriction could be relaxed in the future if needed.

Can we not introduce potential userspace interface regressions?

The issue with pid namespaces fixed in commit 5d6d3a301c4e ("fuse:
allow server to run in different pid_ns") will probably bite us here
as well.

We basically need two modes of operation:

a) old, backward compatible (not introducing any new failure mores),
created with privileged mount
b) new, non-backward compatible, created with unprivileged mount

Technically there would still be a risk from breaking userspace, since
we are using the same entry point for both, but let's hope that no
practical problems come from that.

> For cuse the namespace used for the connection is also simply
> current_user_ns() at the time /dev/cuse is opened.
>
> Patch v4 is available: https://patchwork.kernel.org/patch/8944661/
>
> Cc: linux-fsdevel@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Cc: Miklos Szeredi <mszeredi@redhat.com>
> Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
> Signed-off-by: Dongsu Park <dongsu@kinvolk.io>
> ---
>  fs/fuse/cuse.c   |  3 ++-
>  fs/fuse/dev.c    | 11 ++++++++---
>  fs/fuse/dir.c    | 14 +++++++-------
>  fs/fuse/fuse_i.h |  6 +++++-
>  fs/fuse/inode.c  | 31 +++++++++++++++++++------------
>  5 files changed, 41 insertions(+), 24 deletions(-)
>
> diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
> index e9e97803..b1b83259 100644
> --- a/fs/fuse/cuse.c
> +++ b/fs/fuse/cuse.c
> @@ -48,6 +48,7 @@
>  #include <linux/stat.h>
>  #include <linux/module.h>
>  #include <linux/uio.h>
> +#include <linux/user_namespace.h>
>
>  #include "fuse_i.h"
>
> @@ -498,7 +499,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
>         if (!cc)
>                 return -ENOMEM;
>
> -       fuse_conn_init(&cc->fc);
> +       fuse_conn_init(&cc->fc, current_user_ns());
>
>         fud = fuse_dev_alloc(&cc->fc);
>         if (!fud) {
> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
> index 17f0d05b..0f780e16 100644
> --- a/fs/fuse/dev.c
> +++ b/fs/fuse/dev.c
> @@ -114,8 +114,8 @@ static void __fuse_put_request(struct fuse_req *req)
>
>  static void fuse_req_init_context(struct fuse_conn *fc, struct fuse_req *req)
>  {
> -       req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid());
> -       req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid());
> +       req->in.h.uid = from_kuid(fc->user_ns, current_fsuid());
> +       req->in.h.gid = from_kgid(fc->user_ns, current_fsgid());
>         req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
>  }
>
> @@ -167,6 +167,10 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
>         __set_bit(FR_WAITING, &req->flags);
>         if (for_background)
>                 __set_bit(FR_BACKGROUND, &req->flags);
> +       if (req->in.h.uid == (uid_t)-1 || req->in.h.gid == (gid_t)-1) {
> +               fuse_put_request(fc, req);
> +               return ERR_PTR(-EOVERFLOW);
> +       }
>
>         return req;
>
> @@ -1260,7 +1264,8 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
>         in = &req->in;
>         reqsize = in->h.len;
>
> -       if (task_active_pid_ns(current) != fc->pid_ns) {
> +       if (task_active_pid_ns(current) != fc->pid_ns ||
> +           current_user_ns() != fc->user_ns) {

I don't get it.  Why recalculate the pid if the user_ns does not match?

>                 rcu_read_lock();
>                 in->h.pid = pid_vnr(find_pid_ns(in->h.pid, fc->pid_ns));
>                 rcu_read_unlock();
> diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
> index 24967382..ad1cfac1 100644
> --- a/fs/fuse/dir.c
> +++ b/fs/fuse/dir.c
> @@ -858,8 +858,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
>         stat->ino = attr->ino;
>         stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
>         stat->nlink = attr->nlink;
> -       stat->uid = make_kuid(&init_user_ns, attr->uid);
> -       stat->gid = make_kgid(&init_user_ns, attr->gid);
> +       stat->uid = make_kuid(fc->user_ns, attr->uid);
> +       stat->gid = make_kgid(fc->user_ns, attr->gid);
>         stat->rdev = inode->i_rdev;
>         stat->atime.tv_sec = attr->atime;
>         stat->atime.tv_nsec = attr->atimensec;
> @@ -1475,17 +1475,17 @@ static bool update_mtime(unsigned ivalid, bool trust_local_mtime)
>         return true;
>  }
>
> -static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg,
> -                          bool trust_local_cmtime)
> +static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr,
> +                          struct fuse_setattr_in *arg, bool trust_local_cmtime)
>  {
>         unsigned ivalid = iattr->ia_valid;
>
>         if (ivalid & ATTR_MODE)
>                 arg->valid |= FATTR_MODE,   arg->mode = iattr->ia_mode;
>         if (ivalid & ATTR_UID)
> -               arg->valid |= FATTR_UID,    arg->uid = from_kuid(&init_user_ns, iattr->ia_uid);
> +               arg->valid |= FATTR_UID,    arg->uid = from_kuid(fc->user_ns, iattr->ia_uid);
>         if (ivalid & ATTR_GID)
> -               arg->valid |= FATTR_GID,    arg->gid = from_kgid(&init_user_ns, iattr->ia_gid);
> +               arg->valid |= FATTR_GID,    arg->gid = from_kgid(fc->user_ns, iattr->ia_gid);
>         if (ivalid & ATTR_SIZE)
>                 arg->valid |= FATTR_SIZE,   arg->size = iattr->ia_size;
>         if (ivalid & ATTR_ATIME) {
> @@ -1646,7 +1646,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
>
>         memset(&inarg, 0, sizeof(inarg));
>         memset(&outarg, 0, sizeof(outarg));
> -       iattr_to_fattr(attr, &inarg, trust_local_cmtime);
> +       iattr_to_fattr(fc, attr, &inarg, trust_local_cmtime);
>         if (file) {
>                 struct fuse_file *ff = file->private_data;
>                 inarg.valid |= FATTR_FH;
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index d5773ca6..364e65c8 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -26,6 +26,7 @@
>  #include <linux/xattr.h>
>  #include <linux/pid_namespace.h>
>  #include <linux/refcount.h>
> +#include <linux/user_namespace.h>
>
>  /** Max number of pages that can be used in a single read request */
>  #define FUSE_MAX_PAGES_PER_REQ 32
> @@ -466,6 +467,9 @@ struct fuse_conn {
>         /** The pid namespace for this mount */
>         struct pid_namespace *pid_ns;
>
> +       /** The user namespace for this mount */
> +       struct user_namespace *user_ns;
> +
>         /** Maximum read size */
>         unsigned max_read;
>
> @@ -870,7 +874,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
>  /**
>   * Initialize fuse_conn
>   */
> -void fuse_conn_init(struct fuse_conn *fc);
> +void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns);
>
>  /**
>   * Release reference to fuse_conn
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index 2f504d61..7f6b2e55 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -171,8 +171,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
>         inode->i_ino     = fuse_squash_ino(attr->ino);
>         inode->i_mode    = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
>         set_nlink(inode, attr->nlink);
> -       inode->i_uid     = make_kuid(&init_user_ns, attr->uid);
> -       inode->i_gid     = make_kgid(&init_user_ns, attr->gid);
> +       inode->i_uid     = make_kuid(fc->user_ns, attr->uid);
> +       inode->i_gid     = make_kgid(fc->user_ns, attr->gid);
>         inode->i_blocks  = attr->blocks;
>         inode->i_atime.tv_sec   = attr->atime;
>         inode->i_atime.tv_nsec  = attr->atimensec;
> @@ -477,7 +477,8 @@ static int fuse_match_uint(substring_t *s, unsigned int *res)
>         return err;
>  }
>
> -static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
> +static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev,
> +                         struct user_namespace *user_ns)
>  {
>         char *p;
>         memset(d, 0, sizeof(struct fuse_mount_data));
> @@ -513,7 +514,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
>                 case OPT_USER_ID:
>                         if (fuse_match_uint(&args[0], &uv))
>                                 return 0;
> -                       d->user_id = make_kuid(current_user_ns(), uv);
> +                       d->user_id = make_kuid(user_ns, uv);
>                         if (!uid_valid(d->user_id))
>                                 return 0;
>                         d->user_id_present = 1;
> @@ -522,7 +523,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
>                 case OPT_GROUP_ID:
>                         if (fuse_match_uint(&args[0], &uv))
>                                 return 0;
> -                       d->group_id = make_kgid(current_user_ns(), uv);
> +                       d->group_id = make_kgid(user_ns, uv);
>                         if (!gid_valid(d->group_id))
>                                 return 0;
>                         d->group_id_present = 1;
> @@ -565,8 +566,8 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
>         struct super_block *sb = root->d_sb;
>         struct fuse_conn *fc = get_fuse_conn_super(sb);
>
> -       seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id));
> -       seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id));
> +       seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id));
> +       seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id));
>         if (fc->default_permissions)
>                 seq_puts(m, ",default_permissions");
>         if (fc->allow_other)
> @@ -597,7 +598,7 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq)
>         fpq->connected = 1;
>  }
>
> -void fuse_conn_init(struct fuse_conn *fc)
> +void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns)
>  {
>         memset(fc, 0, sizeof(*fc));
>         spin_lock_init(&fc->lock);
> @@ -621,6 +622,7 @@ void fuse_conn_init(struct fuse_conn *fc)
>         fc->attr_version = 1;
>         get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
>         fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
> +       fc->user_ns = get_user_ns(user_ns);
>  }
>  EXPORT_SYMBOL_GPL(fuse_conn_init);
>
> @@ -630,6 +632,7 @@ void fuse_conn_put(struct fuse_conn *fc)
>                 if (fc->destroy_req)
>                         fuse_request_free(fc->destroy_req);
>                 put_pid_ns(fc->pid_ns);
> +               put_user_ns(fc->user_ns);
>                 fc->release(fc);
>         }
>  }
> @@ -1061,7 +1064,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
>
>         sb->s_flags &= ~(MS_NOSEC | SB_I_VERSION);
>
> -       if (!parse_fuse_opt(data, &d, is_bdev))
> +       if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns))
>                 goto err;
>
>         if (is_bdev) {
> @@ -1086,8 +1089,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
>         if (!file)
>                 goto err;
>
> -       if ((file->f_op != &fuse_dev_operations) ||
> -           (file->f_cred->user_ns != &init_user_ns))
> +       /*
> +        * Require mount to happen from the same user namespace which
> +        * opened /dev/fuse to prevent potential attacks.
> +        */
> +       if (file->f_op != &fuse_dev_operations ||
> +           file->f_cred->user_ns != sb->s_user_ns)
>                 goto err_fput;
>
>         fc = kmalloc(sizeof(*fc), GFP_KERNEL);
> @@ -1095,7 +1102,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
>         if (!fc)
>                 goto err_fput;
>
> -       fuse_conn_init(fc);
> +       fuse_conn_init(fc, sb->s_user_ns);
>         fc->release = fuse_free_conn;
>
>         fud = fuse_dev_alloc(fc);
> --
> 2.13.6
>
Eric W. Biederman Feb. 12, 2018, 4:35 p.m.
Miklos Szeredi <mszeredi@redhat.com> writes:

> On Fri, Dec 22, 2017 at 3:32 PM, Dongsu Park <dongsu@kinvolk.io> wrote:
>> From: Seth Forshee <seth.forshee@canonical.com>
>>
>> In order to support mounts from namespaces other than
>> init_user_ns, fuse must translate uids and gids to/from the
>> userns of the process servicing requests on /dev/fuse. This
>> patch does that, with a couple of restrictions on the namespace:
>>
>>  - The userns for the fuse connection is fixed to the namespace
>>    from which /dev/fuse is opened.
>>
>>  - The namespace must be the same as s_user_ns.
>>
>> These restrictions simplify the implementation by avoiding the
>> need to pass around userns references and by allowing fuse to
>> rely on the checks in inode_change_ok for ownership changes.
>> Either restriction could be relaxed in the future if needed.
>
> Can we not introduce potential userspace interface regressions?
>
> The issue with pid namespaces fixed in commit 5d6d3a301c4e ("fuse:
> allow server to run in different pid_ns") will probably bite us here
> as well.

Maybe, but unlike the pid namespace no one has been able to mount
fuse outside of init_user_ns so we are much less exposed.  I agree we
should be careful.

> We basically need two modes of operation:
>
> a) old, backward compatible (not introducing any new failure mores),
> created with privileged mount
> b) new, non-backward compatible, created with unprivileged mount
>
> Technically there would still be a risk from breaking userspace, since
> we are using the same entry point for both, but let's hope that no
> practical problems come from that.

Answering from a 10,000 foot perspective:

There are two cases.  Requests to read/write the filesystem from outside
of s_user_ns.  These run no risk of breaking userspace as this mode has
not been implemented before.

Restrictions at mount time to ensure we are not dealing with a crazy mix
of namespaces.  This has a small chance of breaking someone's crazy
setup.


Dropping requests to read/write the filesystem when the requester does
not map into s_user_ns should not be a problem to enable universally.  If
s_user_ns is init_user_ns everything maps so there is no restriction.



What we can do if we want to ensure maximum backwards compatibility
is if the fuse filesystem is mounted in init_user_ns but if device for
the communication channel is opened in some other user namespace we
can just force the communication channel to operate in init_user_ns.

That will be 100% backwards compatible in all cases and as far as I can
see remove the need for having different ``modes'' of operation.



This does look like the time to give all of this a hard look and see if
we can get these patches in shape to be merged.

Eric



>> For cuse the namespace used for the connection is also simply
>> current_user_ns() at the time /dev/cuse is opened.
>>
>> Patch v4 is available: https://patchwork.kernel.org/patch/8944661/
>>
>> Cc: linux-fsdevel@vger.kernel.org
>> Cc: linux-kernel@vger.kernel.org
>> Cc: Miklos Szeredi <mszeredi@redhat.com>
>> Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
>> Signed-off-by: Dongsu Park <dongsu@kinvolk.io>
>> ---
>>  fs/fuse/cuse.c   |  3 ++-
>>  fs/fuse/dev.c    | 11 ++++++++---
>>  fs/fuse/dir.c    | 14 +++++++-------
>>  fs/fuse/fuse_i.h |  6 +++++-
>>  fs/fuse/inode.c  | 31 +++++++++++++++++++------------
>>  5 files changed, 41 insertions(+), 24 deletions(-)
>>
>> diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
>> index e9e97803..b1b83259 100644
>> --- a/fs/fuse/cuse.c
>> +++ b/fs/fuse/cuse.c
>> @@ -48,6 +48,7 @@
>>  #include <linux/stat.h>
>>  #include <linux/module.h>
>>  #include <linux/uio.h>
>> +#include <linux/user_namespace.h>
>>
>>  #include "fuse_i.h"
>>
>> @@ -498,7 +499,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
>>         if (!cc)
>>                 return -ENOMEM;
>>
>> -       fuse_conn_init(&cc->fc);
>> +       fuse_conn_init(&cc->fc, current_user_ns());
>>
>>         fud = fuse_dev_alloc(&cc->fc);
>>         if (!fud) {
>> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
>> index 17f0d05b..0f780e16 100644
>> --- a/fs/fuse/dev.c
>> +++ b/fs/fuse/dev.c
>> @@ -114,8 +114,8 @@ static void __fuse_put_request(struct fuse_req *req)
>>
>>  static void fuse_req_init_context(struct fuse_conn *fc, struct fuse_req *req)
>>  {
>> -       req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid());
>> -       req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid());
>> +       req->in.h.uid = from_kuid(fc->user_ns, current_fsuid());
>> +       req->in.h.gid = from_kgid(fc->user_ns, current_fsgid());
>>         req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
>>  }
>>
>> @@ -167,6 +167,10 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
>>         __set_bit(FR_WAITING, &req->flags);
>>         if (for_background)
>>                 __set_bit(FR_BACKGROUND, &req->flags);
>> +       if (req->in.h.uid == (uid_t)-1 || req->in.h.gid == (gid_t)-1) {
>> +               fuse_put_request(fc, req);
>> +               return ERR_PTR(-EOVERFLOW);
>> +       }
>>
>>         return req;
>>
>> @@ -1260,7 +1264,8 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
>>         in = &req->in;
>>         reqsize = in->h.len;
>>
>> -       if (task_active_pid_ns(current) != fc->pid_ns) {
>> +       if (task_active_pid_ns(current) != fc->pid_ns ||
>> +           current_user_ns() != fc->user_ns) {
>
> I don't get it.  Why recalculate the pid if the user_ns does not match?
>
>>                 rcu_read_lock();
>>                 in->h.pid = pid_vnr(find_pid_ns(in->h.pid, fc->pid_ns));
>>                 rcu_read_unlock();
>> diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
>> index 24967382..ad1cfac1 100644
>> --- a/fs/fuse/dir.c
>> +++ b/fs/fuse/dir.c
>> @@ -858,8 +858,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
>>         stat->ino = attr->ino;
>>         stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
>>         stat->nlink = attr->nlink;
>> -       stat->uid = make_kuid(&init_user_ns, attr->uid);
>> -       stat->gid = make_kgid(&init_user_ns, attr->gid);
>> +       stat->uid = make_kuid(fc->user_ns, attr->uid);
>> +       stat->gid = make_kgid(fc->user_ns, attr->gid);
>>         stat->rdev = inode->i_rdev;
>>         stat->atime.tv_sec = attr->atime;
>>         stat->atime.tv_nsec = attr->atimensec;
>> @@ -1475,17 +1475,17 @@ static bool update_mtime(unsigned ivalid, bool trust_local_mtime)
>>         return true;
>>  }
>>
>> -static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg,
>> -                          bool trust_local_cmtime)
>> +static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr,
>> +                          struct fuse_setattr_in *arg, bool trust_local_cmtime)
>>  {
>>         unsigned ivalid = iattr->ia_valid;
>>
>>         if (ivalid & ATTR_MODE)
>>                 arg->valid |= FATTR_MODE,   arg->mode = iattr->ia_mode;
>>         if (ivalid & ATTR_UID)
>> -               arg->valid |= FATTR_UID,    arg->uid = from_kuid(&init_user_ns, iattr->ia_uid);
>> +               arg->valid |= FATTR_UID,    arg->uid = from_kuid(fc->user_ns, iattr->ia_uid);
>>         if (ivalid & ATTR_GID)
>> -               arg->valid |= FATTR_GID,    arg->gid = from_kgid(&init_user_ns, iattr->ia_gid);
>> +               arg->valid |= FATTR_GID,    arg->gid = from_kgid(fc->user_ns, iattr->ia_gid);
>>         if (ivalid & ATTR_SIZE)
>>                 arg->valid |= FATTR_SIZE,   arg->size = iattr->ia_size;
>>         if (ivalid & ATTR_ATIME) {
>> @@ -1646,7 +1646,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
>>
>>         memset(&inarg, 0, sizeof(inarg));
>>         memset(&outarg, 0, sizeof(outarg));
>> -       iattr_to_fattr(attr, &inarg, trust_local_cmtime);
>> +       iattr_to_fattr(fc, attr, &inarg, trust_local_cmtime);
>>         if (file) {
>>                 struct fuse_file *ff = file->private_data;
>>                 inarg.valid |= FATTR_FH;
>> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
>> index d5773ca6..364e65c8 100644
>> --- a/fs/fuse/fuse_i.h
>> +++ b/fs/fuse/fuse_i.h
>> @@ -26,6 +26,7 @@
>>  #include <linux/xattr.h>
>>  #include <linux/pid_namespace.h>
>>  #include <linux/refcount.h>
>> +#include <linux/user_namespace.h>
>>
>>  /** Max number of pages that can be used in a single read request */
>>  #define FUSE_MAX_PAGES_PER_REQ 32
>> @@ -466,6 +467,9 @@ struct fuse_conn {
>>         /** The pid namespace for this mount */
>>         struct pid_namespace *pid_ns;
>>
>> +       /** The user namespace for this mount */
>> +       struct user_namespace *user_ns;
>> +
>>         /** Maximum read size */
>>         unsigned max_read;
>>
>> @@ -870,7 +874,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
>>  /**
>>   * Initialize fuse_conn
>>   */
>> -void fuse_conn_init(struct fuse_conn *fc);
>> +void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns);
>>
>>  /**
>>   * Release reference to fuse_conn
>> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
>> index 2f504d61..7f6b2e55 100644
>> --- a/fs/fuse/inode.c
>> +++ b/fs/fuse/inode.c
>> @@ -171,8 +171,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
>>         inode->i_ino     = fuse_squash_ino(attr->ino);
>>         inode->i_mode    = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
>>         set_nlink(inode, attr->nlink);
>> -       inode->i_uid     = make_kuid(&init_user_ns, attr->uid);
>> -       inode->i_gid     = make_kgid(&init_user_ns, attr->gid);
>> +       inode->i_uid     = make_kuid(fc->user_ns, attr->uid);
>> +       inode->i_gid     = make_kgid(fc->user_ns, attr->gid);
>>         inode->i_blocks  = attr->blocks;
>>         inode->i_atime.tv_sec   = attr->atime;
>>         inode->i_atime.tv_nsec  = attr->atimensec;
>> @@ -477,7 +477,8 @@ static int fuse_match_uint(substring_t *s, unsigned int *res)
>>         return err;
>>  }
>>
>> -static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
>> +static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev,
>> +                         struct user_namespace *user_ns)
>>  {
>>         char *p;
>>         memset(d, 0, sizeof(struct fuse_mount_data));
>> @@ -513,7 +514,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
>>                 case OPT_USER_ID:
>>                         if (fuse_match_uint(&args[0], &uv))
>>                                 return 0;
>> -                       d->user_id = make_kuid(current_user_ns(), uv);
>> +                       d->user_id = make_kuid(user_ns, uv);
>>                         if (!uid_valid(d->user_id))
>>                                 return 0;
>>                         d->user_id_present = 1;
>> @@ -522,7 +523,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
>>                 case OPT_GROUP_ID:
>>                         if (fuse_match_uint(&args[0], &uv))
>>                                 return 0;
>> -                       d->group_id = make_kgid(current_user_ns(), uv);
>> +                       d->group_id = make_kgid(user_ns, uv);
>>                         if (!gid_valid(d->group_id))
>>                                 return 0;
>>                         d->group_id_present = 1;
>> @@ -565,8 +566,8 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
>>         struct super_block *sb = root->d_sb;
>>         struct fuse_conn *fc = get_fuse_conn_super(sb);
>>
>> -       seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id));
>> -       seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id));
>> +       seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id));
>> +       seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id));
>>         if (fc->default_permissions)
>>                 seq_puts(m, ",default_permissions");
>>         if (fc->allow_other)
>> @@ -597,7 +598,7 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq)
>>         fpq->connected = 1;
>>  }
>>
>> -void fuse_conn_init(struct fuse_conn *fc)
>> +void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns)
>>  {
>>         memset(fc, 0, sizeof(*fc));
>>         spin_lock_init(&fc->lock);
>> @@ -621,6 +622,7 @@ void fuse_conn_init(struct fuse_conn *fc)
>>         fc->attr_version = 1;
>>         get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
>>         fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
>> +       fc->user_ns = get_user_ns(user_ns);
>>  }
>>  EXPORT_SYMBOL_GPL(fuse_conn_init);
>>
>> @@ -630,6 +632,7 @@ void fuse_conn_put(struct fuse_conn *fc)
>>                 if (fc->destroy_req)
>>                         fuse_request_free(fc->destroy_req);
>>                 put_pid_ns(fc->pid_ns);
>> +               put_user_ns(fc->user_ns);
>>                 fc->release(fc);
>>         }
>>  }
>> @@ -1061,7 +1064,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
>>
>>         sb->s_flags &= ~(MS_NOSEC | SB_I_VERSION);
>>
>> -       if (!parse_fuse_opt(data, &d, is_bdev))
>> +       if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns))
>>                 goto err;
>>
>>         if (is_bdev) {
>> @@ -1086,8 +1089,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
>>         if (!file)
>>                 goto err;
>>
>> -       if ((file->f_op != &fuse_dev_operations) ||
>> -           (file->f_cred->user_ns != &init_user_ns))
>> +       /*
>> +        * Require mount to happen from the same user namespace which
>> +        * opened /dev/fuse to prevent potential attacks.
>> +        */
>> +       if (file->f_op != &fuse_dev_operations ||
>> +           file->f_cred->user_ns != sb->s_user_ns)
>>                 goto err_fput;
>>
>>         fc = kmalloc(sizeof(*fc), GFP_KERNEL);
>> @@ -1095,7 +1102,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
>>         if (!fc)
>>                 goto err_fput;
>>
>> -       fuse_conn_init(fc);
>> +       fuse_conn_init(fc, sb->s_user_ns);
>>         fc->release = fuse_free_conn;
>>
>>         fud = fuse_dev_alloc(fc);
>> --
>> 2.13.6
>>
Miklos Szeredi Feb. 13, 2018, 10:20 a.m.
On Mon, Feb 12, 2018 at 5:35 PM, Eric W. Biederman
<ebiederm@xmission.com> wrote:
> Miklos Szeredi <mszeredi@redhat.com> writes:
>
>> On Fri, Dec 22, 2017 at 3:32 PM, Dongsu Park <dongsu@kinvolk.io> wrote:
>>> From: Seth Forshee <seth.forshee@canonical.com>
>>>
>>> In order to support mounts from namespaces other than
>>> init_user_ns, fuse must translate uids and gids to/from the
>>> userns of the process servicing requests on /dev/fuse. This
>>> patch does that, with a couple of restrictions on the namespace:
>>>
>>>  - The userns for the fuse connection is fixed to the namespace
>>>    from which /dev/fuse is opened.
>>>
>>>  - The namespace must be the same as s_user_ns.
>>>
>>> These restrictions simplify the implementation by avoiding the
>>> need to pass around userns references and by allowing fuse to
>>> rely on the checks in inode_change_ok for ownership changes.
>>> Either restriction could be relaxed in the future if needed.
>>
>> Can we not introduce potential userspace interface regressions?
>>
>> The issue with pid namespaces fixed in commit 5d6d3a301c4e ("fuse:
>> allow server to run in different pid_ns") will probably bite us here
>> as well.
>
> Maybe, but unlike the pid namespace no one has been able to mount
> fuse outside of init_user_ns so we are much less exposed.  I agree we
> should be careful.

Have to wrap my head around all the rules here.

There's the may_mount() one:

    ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN)

Um, first of all, why isn't it checking current->cred->user_ns?

Ah, there it is in sget():

    ns_capable(user_ns, CAP_SYS_ADMIN)

I get the plain capable(CAP_SYS_ADMIN) check in sget_userns() if fs
doesn't have FS_USERNS_MOUNT.  This is the one that prevents fuse
mounts from being created when (current->cred->user_ns !=
&init_user_ns).

Maybe there's a logic to this web of namespaces, but I don't yet see
it.  Is it documented somewhere?

>> We basically need two modes of operation:
>>
>> a) old, backward compatible (not introducing any new failure mores),
>> created with privileged mount
>> b) new, non-backward compatible, created with unprivileged mount
>>
>> Technically there would still be a risk from breaking userspace, since
>> we are using the same entry point for both, but let's hope that no
>> practical problems come from that.
>
> Answering from a 10,000 foot perspective:
>
> There are two cases.  Requests to read/write the filesystem from outside
> of s_user_ns.  These run no risk of breaking userspace as this mode has
> not been implemented before.

This comes from the fact that (s_user_ns == &init_user_ns) and all
user namespaces are "inside" init_user_ns, right?

One question: why does current code use the from_[ug]id_munged()
variant, when the conversion can never fail.  Or can it?

> Restrictions at mount time to ensure we are not dealing with a crazy mix
> of namespaces.  This has a small chance of breaking someone's crazy
> setup.
>
>
> Dropping requests to read/write the filesystem when the requester does
> not map into s_user_ns should not be a problem to enable universally.  If
> s_user_ns is init_user_ns everything maps so there is no restriction.
>
>
>
> What we can do if we want to ensure maximum backwards compatibility
> is if the fuse filesystem is mounted in init_user_ns but if device for
> the communication channel is opened in some other user namespace we
> can just force the communication channel to operate in init_user_ns.
>
> That will be 100% backwards compatible in all cases and as far as I can
> see remove the need for having different ``modes'' of operation.

Okay.

Thanks,
Miklos
Eric W. Biederman Feb. 16, 2018, 9:52 p.m.
Miklos Szeredi <mszeredi@redhat.com> writes:

> On Mon, Feb 12, 2018 at 5:35 PM, Eric W. Biederman
> <ebiederm@xmission.com> wrote:
>> Miklos Szeredi <mszeredi@redhat.com> writes:
>>
>>> On Fri, Dec 22, 2017 at 3:32 PM, Dongsu Park <dongsu@kinvolk.io> wrote:
>>>> From: Seth Forshee <seth.forshee@canonical.com>
>>>>
>>>> In order to support mounts from namespaces other than
>>>> init_user_ns, fuse must translate uids and gids to/from the
>>>> userns of the process servicing requests on /dev/fuse. This
>>>> patch does that, with a couple of restrictions on the namespace:
>>>>
>>>>  - The userns for the fuse connection is fixed to the namespace
>>>>    from which /dev/fuse is opened.
>>>>
>>>>  - The namespace must be the same as s_user_ns.
>>>>
>>>> These restrictions simplify the implementation by avoiding the
>>>> need to pass around userns references and by allowing fuse to
>>>> rely on the checks in inode_change_ok for ownership changes.
>>>> Either restriction could be relaxed in the future if needed.
>>>
>>> Can we not introduce potential userspace interface regressions?
>>>
>>> The issue with pid namespaces fixed in commit 5d6d3a301c4e ("fuse:
>>> allow server to run in different pid_ns") will probably bite us here
>>> as well.
>>
>> Maybe, but unlike the pid namespace no one has been able to mount
>> fuse outside of init_user_ns so we are much less exposed.  I agree we
>> should be careful.
>
> Have to wrap my head around all the rules here.
>
> There's the may_mount() one:
>
>     ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN)
>
> Um, first of all, why isn't it checking current->cred->user_ns?
>
> Ah, there it is in sget():
>
>     ns_capable(user_ns, CAP_SYS_ADMIN)
>
> I get the plain capable(CAP_SYS_ADMIN) check in sget_userns() if fs
> doesn't have FS_USERNS_MOUNT.  This is the one that prevents fuse
> mounts from being created when (current->cred->user_ns !=
> &init_user_ns).
>
> Maybe there's a logic to this web of namespaces, but I don't yet see
> it.  Is it documented somewhere?

I think this is a bit simpler than the fiddly details in the
implementation might make it look.

The fundamental idea is that permission to have full control over
a mount namespace, is different than permission to have full control
over an instance of a filesystem.

Implementing that separation of permission checks gets a little bit
fiddly.  The first challenge is that there are several filesystems like
sysfs and proc whose internal mount is created outside of a process.
Then there are the file systems like nfs and afs that have ``referral
points'' that transition you to other instances of those filesystems
when you transition over them.  That is the reason why there are
exceptions for SB_KERNMOUNT and SB_SUBMOUNT.

may_mount is just the permission check for the mount namespace.  It
checks that the current process has CAP_SYS_ADMIN in the user namespace
that owns the current mount namespace.  AKA is the process allowed to
change the mount namespace.

sget is just the permission check for mounting a filesystem.  It checks
that the mounter has CAP_SYS_ADMIN over the user namespace that will own
the newly mounted filesystem.

By the time execition gets to to sget_userns in general all of the
permission checks have all been made.  But if the filesystem is not one
that supports mounting within a user namespace the code checks
capable(CAP_SYS_ADMIN).

That is more convoluted than I would like but the checks derive from the
definition of what we are doing.

>
>>> We basically need two modes of operation:
>>>
>>> a) old, backward compatible (not introducing any new failure mores),
>>> created with privileged mount
>>> b) new, non-backward compatible, created with unprivileged mount
>>>
>>> Technically there would still be a risk from breaking userspace, since
>>> we are using the same entry point for both, but let's hope that no
>>> practical problems come from that.
>>
>> Answering from a 10,000 foot perspective:
>>
>> There are two cases.  Requests to read/write the filesystem from outside
>> of s_user_ns.  These run no risk of breaking userspace as this mode has
>> not been implemented before.
>
> This comes from the fact that (s_user_ns == &init_user_ns) and all
> user namespaces are "inside" init_user_ns, right?

Yes.

> One question: why does current code use the from_[ug]id_munged()
> variant, when the conversion can never fail.  Or can it?

There is always at least (uid_t)-1 that can fail if it shows up on a
filesystem.  As far as I can tell no one was using it for a uid, there
were already uses of (uid_t)-1 as a special case, and I just grabbed it
to become INVALID_UID.

In practice the mapping can't fail unless someone malicious starts using
that id.

I believe I picked the _munged variant so in case that version hits
we are guaranteed to return the 16bit nobody user.

Eric
Eric W. Biederman Feb. 20, 2018, 2:12 a.m.
Dongsu Park <dongsu@kinvolk.io> writes:

> From: Seth Forshee <seth.forshee@canonical.com>
>
> In order to support mounts from namespaces other than
> init_user_ns, fuse must translate uids and gids to/from the
> userns of the process servicing requests on /dev/fuse. This
> patch does that, with a couple of restrictions on the namespace:
>
>  - The userns for the fuse connection is fixed to the namespace
>    from which /dev/fuse is opened.
>
>  - The namespace must be the same as s_user_ns.
>
> These restrictions simplify the implementation by avoiding the
> need to pass around userns references and by allowing fuse to
> rely on the checks in inode_change_ok for ownership changes.
> Either restriction could be relaxed in the future if needed.
>
> For cuse the namespace used for the connection is also simply
> current_user_ns() at the time /dev/cuse is opened.
>
> Patch v4 is available: https://patchwork.kernel.org/patch/8944661/
>
> Cc: linux-fsdevel@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Cc: Miklos Szeredi <mszeredi@redhat.com>
> Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
> Signed-off-by: Dongsu Park <dongsu@kinvolk.io>
> ---
>  fs/fuse/cuse.c   |  3 ++-
>  fs/fuse/dev.c    | 11 ++++++++---
>  fs/fuse/dir.c    | 14 +++++++-------
>  fs/fuse/fuse_i.h |  6 +++++-
>  fs/fuse/inode.c  | 31 +++++++++++++++++++------------
>  5 files changed, 41 insertions(+), 24 deletions(-)
>
> diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
> index e9e97803..b1b83259 100644
> --- a/fs/fuse/cuse.c
> +++ b/fs/fuse/cuse.c
> @@ -48,6 +48,7 @@
>  #include <linux/stat.h>
>  #include <linux/module.h>
>  #include <linux/uio.h>
> +#include <linux/user_namespace.h>
>  
>  #include "fuse_i.h"
>  
> @@ -498,7 +499,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
>  	if (!cc)
>  		return -ENOMEM;
>  
As noticed in the review this should probably say:
	if (current_user_ns() != &init_user_ns)
		return -EINVAL;

Just so we don't need to think about cuse being opened in a user
namespace at this point.  It is probably harmless.  But it isn't
what we are focusing on.

> -	fuse_conn_init(&cc->fc);
> +	fuse_conn_init(&cc->fc, current_user_ns());
>  
>  	fud = fuse_dev_alloc(&cc->fc);
>  	if (!fud) {


> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
> index 17f0d05b..0f780e16 100644
> --- a/fs/fuse/dev.c
> +++ b/fs/fuse/dev.c
> @@ -114,8 +114,8 @@ static void __fuse_put_request(struct fuse_req *req)
>  
>  static void fuse_req_init_context(struct fuse_conn *fc, struct fuse_req *req)
>  {
> -	req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid());
> -	req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid());
> +	req->in.h.uid = from_kuid(fc->user_ns, current_fsuid());
> +	req->in.h.gid = from_kgid(fc->user_ns, current_fsgid());
>  	req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
>  }
>  
> @@ -167,6 +167,10 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
>  	__set_bit(FR_WAITING, &req->flags);
>  	if (for_background)
>  		__set_bit(FR_BACKGROUND, &req->flags);
> +	if (req->in.h.uid == (uid_t)-1 || req->in.h.gid == (gid_t)-1) {
> +		fuse_put_request(fc, req);
> +		return ERR_PTR(-EOVERFLOW);
> +	}
>  
>  	return req;
>  
> @@ -1260,7 +1264,8 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
>  	in = &req->in;
>  	reqsize = in->h.len;
>  
> -	if (task_active_pid_ns(current) != fc->pid_ns) {
> +	if (task_active_pid_ns(current) != fc->pid_ns ||
> +	    current_user_ns() != fc->user_ns) {
>  		rcu_read_lock();
>  		in->h.pid = pid_vnr(find_pid_ns(in->h.pid, fc->pid_ns));
>  		rcu_read_unlock();

The hunk above is a rebase error.  I believe it started out by erroring
out in the same case the pid namespace case errored out.  Miklos has a
good point that we need to handle the case where we have servers running
in jails of one sort or another because at least sandstorm runs
applications in that fashion, and we have previously had error reports
about that configuration breaking.

I think we can easily fix that.  Either by adding extra translation as
we did for the pid namespace or changing the user namespace used on the
connection.  I believe extra translation like we did with the pid
namespace will be more consistent.  And again it won't be a special
case except possibly during mount.  Of course there is weirdness there.

Eric