procfs: change the owner of non-dumpable and writeable files

Submitted by Eric W. Biederman on Jan. 20, 2017, 1:57 a.m.

Details

Message ID 87r33yv6gk.fsf@xmission.com
State New
Series "procfs: change the owner of non-dumpable and writeable files"
Headers show

Commit Message

Eric W. Biederman Jan. 20, 2017, 1:57 a.m.
The patch under discussion just appears wrong.  Part of what we are
detecting when we ask if a task is dumpable is if a process might have
under gone a security transition.  If a process has undergone a security
transition (calling setuid or the like) it is quite possible that
everyone with it's current effective credentials should not be able
to access that process.

Please verify but the ptrace issue that allowed processes in a container
to call setns on our processes should be fixed as of 4.10-rc1.  And the
change has been marked for backporting.

AKA it should be this fix that removes the need for your dumpable setting.
Fixes: bfedb589252c ("mm: Add a user_ns owner to mm_struct and fix ptrace permission checks")

Now with that said I believe we want to add the following change now
that dumpable is user namespace relative.  That will use not the
GLOBAL_ROOT_UID/GID but instead uid and gid 0 in the namespace
that dumpable is relative too.

But ugh!  Your case is even more confused that I had first noticed.
Saying that a processes is undumpable is completely unnecessary
when you are entering into a new fresh user namespace.  Touching
setgroups at any point where there are other processes in the namespace
makes no sense whatsoever.  Clearing dumpable is to help not leak things
into a container when you call setns on a user namespace.

So my general impression of your use case is don't do that, it isn't
necessary and it causes you problems.

At the same time here is what I am looking at to better set the uids and
gids of proc files.

From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 3 Jan 2017 10:23:11 +1300
Subject: [PATCH] proc: Better ownership of files for non-dumpable tasks in user namespaces

Instead of making the files owned by the GLOBAL_ROOT_USER.  Make
non-dumpable files whose mm has always lived in a user namespace owned
by the user namespace root.  This allows the container root to have
things work as expected in a container.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/proc/base.c     | 88 +++++++++++++++++++++++++++++++-----------------------
 fs/proc/fd.c       | 12 +-------
 fs/proc/internal.h | 16 ++--------
 3 files changed, 53 insertions(+), 63 deletions(-)

Patch hide | download patch | download mbox

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8e7e61b28f31..e96487af2307 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1667,12 +1667,55 @@  const struct inode_operations proc_pid_link_inode_operations = {
 
 /* building an inode */
 
+void task_dump_owner(struct task_struct *task, mode_t mode,
+		     kuid_t *ruid, kgid_t *rgid)
+{
+	/* Depending on the state of dumpable compute who should own a
+	 * proc file for a task.
+	 */
+	const struct cred *cred;
+	kuid_t uid;
+	kgid_t gid;
+
+	/* Default to the tasks effective ownership */
+	rcu_read_lock();
+	cred = __task_cred(task);
+	uid = cred->euid;
+	gid = cred->egid;
+	rcu_read_unlock();
+
+	if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) {
+		struct mm_struct *mm;
+		task_lock(task);
+		mm = task->mm;
+		/* Make non-dumpable tasks owned by some root */
+		if (mm) {
+			if (get_dumpable(mm) != SUID_DUMP_USER) {
+				struct user_namespace *user_ns = mm->user_ns;
+
+				uid = make_kuid(user_ns, 0);
+				if (!uid_valid(uid))
+					uid = GLOBAL_ROOT_UID;
+
+				gid = make_kgid(user_ns, 0);
+				if (!gid_valid(gid))
+					gid = GLOBAL_ROOT_GID;
+			}
+		} else {
+			uid = GLOBAL_ROOT_UID;
+			gid = GLOBAL_ROOT_GID;
+		}
+		task_unlock(task);
+	}
+	*ruid = uid;
+	*rgid = gid;
+}
+
 struct inode *proc_pid_make_inode(struct super_block * sb,
 				  struct task_struct *task, umode_t mode)
 {
 	struct inode * inode;
 	struct proc_inode *ei;
-	const struct cred *cred;
 
 	/* We need a new inode */
 
@@ -1694,13 +1737,7 @@  struct inode *proc_pid_make_inode(struct super_block * sb,
 	if (!ei->pid)
 		goto out_unlock;
 
-	if (task_dumpable(task)) {
-		rcu_read_lock();
-		cred = __task_cred(task);
-		inode->i_uid = cred->euid;
-		inode->i_gid = cred->egid;
-		rcu_read_unlock();
-	}
+	task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
 	security_task_to_inode(task, inode);
 
 out:
@@ -1715,7 +1752,6 @@  int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 {
 	struct inode *inode = d_inode(dentry);
 	struct task_struct *task;
-	const struct cred *cred;
 	struct pid_namespace *pid = dentry->d_sb->s_fs_info;
 
 	generic_fillattr(inode, stat);
@@ -1733,12 +1769,7 @@  int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 			 */
 			return -ENOENT;
 		}
-		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
-		    task_dumpable(task)) {
-			cred = __task_cred(task);
-			stat->uid = cred->euid;
-			stat->gid = cred->egid;
-		}
+		task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid);
 	}
 	rcu_read_unlock();
 	return 0;
@@ -1765,7 +1796,6 @@  int pid_revalidate(struct dentry *dentry, unsigned int flags)
 {
 	struct inode *inode;
 	struct task_struct *task;
-	const struct cred *cred;
 
 	if (flags & LOOKUP_RCU)
 		return -ECHILD;
@@ -1774,17 +1804,8 @@  int pid_revalidate(struct dentry *dentry, unsigned int flags)
 	task = get_proc_task(inode);
 
 	if (task) {
-		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
-		    task_dumpable(task)) {
-			rcu_read_lock();
-			cred = __task_cred(task);
-			inode->i_uid = cred->euid;
-			inode->i_gid = cred->egid;
-			rcu_read_unlock();
-		} else {
-			inode->i_uid = GLOBAL_ROOT_UID;
-			inode->i_gid = GLOBAL_ROOT_GID;
-		}
+		task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
+
 		inode->i_mode &= ~(S_ISUID | S_ISGID);
 		security_task_to_inode(task, inode);
 		put_task_struct(task);
@@ -1881,7 +1902,6 @@  static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
 	bool exact_vma_exists = false;
 	struct mm_struct *mm = NULL;
 	struct task_struct *task;
-	const struct cred *cred;
 	struct inode *inode;
 	int status = 0;
 
@@ -1906,16 +1926,8 @@  static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
 	mmput(mm);
 
 	if (exact_vma_exists) {
-		if (task_dumpable(task)) {
-			rcu_read_lock();
-			cred = __task_cred(task);
-			inode->i_uid = cred->euid;
-			inode->i_gid = cred->egid;
-			rcu_read_unlock();
-		} else {
-			inode->i_uid = GLOBAL_ROOT_UID;
-			inode->i_gid = GLOBAL_ROOT_GID;
-		}
+		task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
+
 		security_task_to_inode(task, inode);
 		status = 1;
 	}
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 4274f83bf100..00ce1531b2f5 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -84,7 +84,6 @@  static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
 {
 	struct files_struct *files;
 	struct task_struct *task;
-	const struct cred *cred;
 	struct inode *inode;
 	unsigned int fd;
 
@@ -108,16 +107,7 @@  static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
 				rcu_read_unlock();
 				put_files_struct(files);
 
-				if (task_dumpable(task)) {
-					rcu_read_lock();
-					cred = __task_cred(task);
-					inode->i_uid = cred->euid;
-					inode->i_gid = cred->egid;
-					rcu_read_unlock();
-				} else {
-					inode->i_uid = GLOBAL_ROOT_UID;
-					inode->i_gid = GLOBAL_ROOT_GID;
-				}
+				task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
 
 				if (S_ISLNK(inode->i_mode)) {
 					unsigned i_mode = S_IFLNK;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 2de5194ba378..e2c3c461fa20 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -97,20 +97,8 @@  static inline struct task_struct *get_proc_task(struct inode *inode)
 	return get_pid_task(proc_pid(inode), PIDTYPE_PID);
 }
 
-static inline int task_dumpable(struct task_struct *task)
-{
-	int dumpable = 0;
-	struct mm_struct *mm;
-
-	task_lock(task);
-	mm = task->mm;
-	if (mm)
-		dumpable = get_dumpable(mm);
-	task_unlock(task);
-	if (dumpable == SUID_DUMP_USER)
-		return 1;
-	return 0;
-}
+void task_dump_owner(struct task_struct *task, mode_t mode,
+		     kuid_t *ruid, kgid_t *rgid);
 
 static inline unsigned name_to_int(const struct qstr *qstr)
 {

Comments

Aleksa Sarai Jan. 20, 2017, 2:35 a.m.
> Please verify but the ptrace issue that allowed processes in a container
> to call setns on our processes should be fixed as of 4.10-rc1.  And the
> change has been marked for backporting.

ptrace(2) is not the only issue, the issue that we had in runC is that a 
process joining a namespace may have file descriptors that refer to the 
host filesystem. If the process joining is dumpable, a racing process 
inside the container can access those file descriptors through the 
/proc/[pid]/fd/... mechanism.

See CVE-2016-9962.

> AKA it should be this fix that removes the need for your dumpable setting.
> Fixes: bfedb589252c ("mm: Add a user_ns owner to mm_struct and fix ptrace permission checks")

I will check, though from what I recall that patch doesn't fix the 
ptrace_may_access checks. Not to mention it won't help if the container 
doesn't have it's own user namespace.

> Now with that said I believe we want to add the following change now
> that dumpable is user namespace relative.  That will use not the
> GLOBAL_ROOT_UID/GID but instead uid and gid 0 in the namespace
> that dumpable is relative too.

Sure, but that's tangential to the issue under discussion.

> But ugh!  Your case is even more confused that I had first noticed.
> Saying that a processes is undumpable is completely unnecessary
> when you are entering into a new fresh user namespace.  Touching
> setgroups at any point where there are other processes in the namespace
> makes no sense whatsoever.

Currently in runC the ordering for mixed create-and-join namespaces is 
that we first join existing namespaces and _then_ create new ones. So we 
need to be non-dumpable to avoid the problem in CVE-2016-9962.

> Clearing dumpable is to help not leak things
> into a container when you call setns on a user namespace.

It is also to help not leak things into a container when you join other 
namespaces. Most notably the PID namespace.

> +	if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) {

I'd just like to draw your attention to this special case -- why is this 
special cased? What was the original reasoning behind it? Does it make 
sense for a non-dumpable process to allow someone to change the mode of 
some random /proc/[pid]/ directories?

I get the feeling that some of this logic is a bit iffy.
Eric W. Biederman Jan. 20, 2017, 4:35 a.m.
Aleksa Sarai <asarai@suse.de> writes:

>> Please verify but the ptrace issue that allowed processes in a container
>> to call setns on our processes should be fixed as of 4.10-rc1.  And the
>> change has been marked for backporting.
>
> ptrace(2) is not the only issue, the issue that we had in runC is that
> a process joining a namespace may have file descriptors that refer to
> the host filesystem. If the process joining is dumpable, a racing
> process inside the container can access those file descriptors through
> the /proc/[pid]/fd/... mechanism.
>
> See CVE-2016-9962.

Wow, I said that badly.  But the issue is ptrace_attach and the
ptrace_may_access checks.

>> AKA it should be this fix that removes the need for your dumpable setting.
>> bfedb589252c ("mm: Add a user_ns owner to mm_struct and fix ptrace permission checks")
>
> I will check, though from what I recall that patch doesn't fix the
> ptrace_may_access checks. Not to mention it won't help if the
> container doesn't have it's own user namespace.

That change very much is to the ptrace_may_access checks.

You are not playing with setgroups if you don't have your own user
namespace.  So I don't see how the other cases are relevant.

Going further there are only two namespaces that are only 3 pieces of
information that I see are relevant in this context.  The pid namespace,
the user namespace, and the process credentials.

You can not join a PID namespace you can only fork into one.

The user namespace is relevant in older kernels as joining a user
namespace would allow ptrace_has_cap check in ptrace_may_access
to pass.  After the change listed above that check does not start
passing until after exec.

The credentials on your process are relevant as changing those
allow the ptrace_may_access checks to start passing for other sets
of processes.



What I think I would do in the situation you describe is to
join what you are going to join.  Limit yourself to creating pid
namespaces with unshare.

If you are joining a user namespace set undumpable.
If you are creating a user namespace create it and then set undumpable.

fork your process into the new pid namespace.
change your pocesses credentials.
exec whatever you are execing, letting close_on_exec clean up for you.

>> Now with that said I believe we want to add the following change now
>> that dumpable is user namespace relative.  That will use not the
>> GLOBAL_ROOT_UID/GID but instead uid and gid 0 in the namespace
>> that dumpable is relative too.
>
> Sure, but that's tangential to the issue under discussion.

It is pretty much the only case I see worth considering.  Certainly I
don't see setgroups as a motivational example.

>> But ugh!  Your case is even more confused that I had first noticed.
>> Saying that a processes is undumpable is completely unnecessary
>> when you are entering into a new fresh user namespace.  Touching
>> setgroups at any point where there are other processes in the namespace
>> makes no sense whatsoever.
>
> Currently in runC the ordering for mixed create-and-join namespaces is
> that we first join existing namespaces and _then_ create new ones. So
> we need to be non-dumpable to avoid the problem in CVE-2016-9962.

Yes mixed create and join is trickier than a pure create and use case,
and trickier than a pure join case.  But see above it looks quite doable
to me to avoid being vulnerable while using the existing permissions
even in the case you describe.

>> Clearing dumpable is to help not leak things
>> into a container when you call setns on a user namespace.
>
> It is also to help not leak things into a container when you join
> other namespaces. Most notably the PID namespace.

Except that you don't strictly join a PID namespace.  You set a context
for children to run in a different PID namespace.  So you are safe
from PID namespaces as long as you don't call fork.

>> +	if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) {
>
> I'd just like to draw your attention to this special case -- why is
> this special cased? What was the original reasoning behind it? Does it
> make sense for a non-dumpable process to allow someone to change the
> mode of some random /proc/[pid]/ directories?

This has nothing at all to do with changing modes and is all about
what uid/gid are set on the proc inode.   Usually it is the uid/gid
of the process in question but occassionally for undumpable processes
it is root/root to prevent people from accessing the files in question.

> I get the feeling that some of this logic is a bit iffy.

It looks like I forgot to carry forward the comment that explains that
case in my patch.  Something I need to fix before I merge it.

/*
 * Before the /proc/pid/status file was created the only way to read
 * the effective uid of a /process was to stat /proc/pid.  Reading
 * /proc/pid/status is slow enough that procps and other packages
 * kept stating /proc/pid.  To keep the rules in /proc simple I have
 * made this apply to all per process world readable and executable
 * directories.
 */

Or in short.  I broke ps when I removed all of the special cases, and to
fix ps I added the existing special case.  Not that the uid or gid of a
directory that the whole world can access matters.

Eric
Aleksa Sarai Jan. 25, 2017, 6:43 a.m.
>>> AKA it should be this fix that removes the need for your dumpable setting.
>>> bfedb589252c ("mm: Add a user_ns owner to mm_struct and fix ptrace permission checks")
>>
>> I will check, though from what I recall that patch doesn't fix the
>> ptrace_may_access checks. Not to mention it won't help if the
>> container doesn't have it's own user namespace.
>
> That change very much is to the ptrace_may_access checks.

Sorry, you're right. I misremembered the patch.

> You are not playing with setgroups if you don't have your own user
> namespace.  So I don't see how the other cases are relevant.

There's also oom_score_adj and a few others. User namespaces were just 
the first example that I hit.

> What I think I would do in the situation you describe is to
> join what you are going to join.  Limit yourself to creating pid
> namespaces with unshare.
>
> If you are joining a user namespace set undumpable.
> If you are creating a user namespace create it and then set undumpable.

I just tried to implement this, and it works _okay_. Except that 
oom_scoore_adj also is no longer writeable if the process is not 
dumpable -- and the "create container" and "modify process" stages in 
runC are very separate and there's no easy way of reconciling the two.

Ultimately this only affects rootless containers, where security is 
simply not a major concern. However it is a bit frustrating that a 
process which is not dumpable cannot even modify _its own_ 
/proc/self/... files.

My current fix is to just not set dumpable for rootless containers, as 
it seems there's no proper way of setting dumpable in this context. It 
appears as though the only way that dumpable works is by just using 
CAP_DAC_OVERRIDE and completely ignoring the dumpable restrictions.

>>> Clearing dumpable is to help not leak things
>>> into a container when you call setns on a user namespace.
>>
>> It is also to help not leak things into a container when you join
>> other namespaces. Most notably the PID namespace.
>
> Except that you don't strictly join a PID namespace.  You set a context
> for children to run in a different PID namespace.  So you are safe
> from PID namespaces as long as you don't call fork.

But you need to fork eventually if you want to set settings on the 
process which will eventually be the container process (PID 1 in the new 
container). You can't exec before then, you need to fork first.

>>> +	if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) {
>>
>> I'd just like to draw your attention to this special case -- why is
>> this special cased? What was the original reasoning behind it? Does it
>> make sense for a non-dumpable process to allow someone to change the
>> mode of some random /proc/[pid]/ directories?
>
> This has nothing at all to do with changing modes and is all about
> what uid/gid are set on the proc inode.   Usually it is the uid/gid
> of the process in question but occassionally for undumpable processes
> it is root/root to prevent people from accessing the files in question.

My question was "why are o+rx directories set to the process's e[ug]id 
but other inodes are set to the root [ug]id?". And it's the other way 
around -- only for two directories on my system is it the case that the 
process has /proc/pid/... inodes owned by the process's e[ug]id (the 
rest are owned by root).

And it is about modes, because once you're the owner of a file you can 
change its mode (allowing other processes to access the inodes). I'm not 
sure what other purpose changing the ownership *of a directory* serves 
-- you cannot create new files (or unlink files) under /proc/self/... 
directories so u+w permissions aren't actually "useful" (as far as I can 
tell).


>> I get the feeling that some of this logic is a bit iffy.
>
> It looks like I forgot to carry forward the comment that explains that
> case in my patch.  Something I need to fix before I merge it.
>
> /*
>  * Before the /proc/pid/status file was created the only way to read
>  * the effective uid of a /process was to stat /proc/pid.  Reading
>  * /proc/pid/status is slow enough that procps and other packages
>  * kept stating /proc/pid.  To keep the rules in /proc simple I have
>  * made this apply to all per process world readable and executable
>  * directories.
>  */
>
> Or in short.  I broke ps when I removed all of the special cases, and to
> fix ps I added the existing special case.  Not that the uid or gid of a
> directory that the whole world can access matters.

Okay, but why is it being applied to _all_ subdirectories of /proc/pid? 
Why not make it *only* set the owner of /proc/pid and nothing else? 
Looks like that was not intended given the reasons you just provided.