[REVIEW] exec: Don't exec files the userns root can not read.

Submitted by Eric W. Biederman on Oct. 18, 2016, 9:15 p.m.

Details

Message ID 87k2d5nytz.fsf_-_@xmission.com
State New
Series "mm: Add a user_ns owner to mm_struct and fix ptrace_may_access"
Headers show

Commit Message

Eric W. Biederman Oct. 18, 2016, 9:15 p.m.
When the user namespace support was merged the need to prevent
ptracing an executable that is not readable was overlooked.

Correct this oversight by not letting exec succeed if during exec an
executable is not readable and the current user namespace capabilities
do not apply to the executable's file.

While it happens that distros install some files setuid and
non-readable I have not found any executable files just installed
non-readalbe.  Executables that are setuid to a user not mapped in a
user namespace are worthless, so I don't expect this to introduce
any problems in practice.

There may be a way to allow this execution to happen by setting
mm->user_ns to a more privileged user namespace and watching out for
the possibility of using dynamic linkers or other shared libraries
that the kernel loads into the mm to bypass the read-only
restriction.  But the analysis is more difficult and it would
require more code churn so I don't think the effort is worth it.

Cc: stable@vger.kernel.org
Reported-by: Jann Horn <jann@thejh.net>
Fixes: 9e4a36ece652 ("userns: Fail exec for suid and sgid binaries with ids outside our user namespace.")
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---

Tossing this out for review in case I missed something silly but this
patch seems pretty trivial.

 arch/x86/ia32/ia32_aout.c |  4 +++-
 fs/binfmt_aout.c          |  4 +++-
 fs/binfmt_elf.c           |  4 +++-
 fs/binfmt_elf_fdpic.c     |  4 +++-
 fs/binfmt_flat.c          |  4 +++-
 fs/exec.c                 | 19 ++++++++++++++++---
 include/linux/binfmts.h   |  6 +++++-
 7 files changed, 36 insertions(+), 9 deletions(-)

Patch hide | download patch | download mbox

diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index cb26f18d43af..7ad20dedd929 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -294,7 +294,9 @@  static int load_aout_binary(struct linux_binprm *bprm)
 	set_personality(PER_LINUX);
 	set_personality_ia32(false);
 
-	setup_new_exec(bprm);
+	retval = setup_new_exec(bprm);
+	if (retval)
+		return retval;
 
 	regs->cs = __USER32_CS;
 	regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index ae1b5404fced..b7b8aa03ccd0 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -242,7 +242,9 @@  static int load_aout_binary(struct linux_binprm * bprm)
 #else
 	set_personality(PER_LINUX);
 #endif
-	setup_new_exec(bprm);
+	retval = setup_new_exec(bprm);
+	if (retval)
+		return retval;
 
 	current->mm->end_code = ex.a_text +
 		(current->mm->start_code = N_TXTADDR(ex));
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 2472af2798c7..423fece0b8c4 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -852,7 +852,9 @@  static int load_elf_binary(struct linux_binprm *bprm)
 	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
 		current->flags |= PF_RANDOMIZE;
 
-	setup_new_exec(bprm);
+	retval = setup_new_exec(bprm);
+	if (retval)
+		goto out_free_dentry;
 	install_exec_creds(bprm);
 
 	/* Do this so that we can load the interpreter, if need be.  We will
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 464a972e88c1..d3099caff96d 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -352,7 +352,9 @@  static int load_elf_fdpic_binary(struct linux_binprm *bprm)
 	if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
 		current->personality |= READ_IMPLIES_EXEC;
 
-	setup_new_exec(bprm);
+	retval = setup_new_exec(bprm);
+	if (retval)
+		goto error;
 
 	set_binfmt(&elf_fdpic_format);
 
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 9b2917a30294..25ca68940ad4 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -524,7 +524,9 @@  static int load_flat_file(struct linux_binprm *bprm,
 
 		/* OK, This is the point of no return */
 		set_personality(PER_LINUX_32BIT);
-		setup_new_exec(bprm);
+		ret = setup_new_exec(bprm);
+		if (ret)
+			goto err;
 	}
 
 	/*
diff --git a/fs/exec.c b/fs/exec.c
index 6fcfb3f7b137..f724ed94ba7a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1270,12 +1270,21 @@  EXPORT_SYMBOL(flush_old_exec);
 
 void would_dump(struct linux_binprm *bprm, struct file *file)
 {
-	if (inode_permission(file_inode(file), MAY_READ) < 0)
+	struct inode *inode = file_inode(file);
+	if (inode_permission(inode, MAY_READ) < 0) {
+		struct user_namespace *user_ns = current->mm->user_ns;
 		bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
+
+		/* May the user_ns root read the executable? */
+		if (!kuid_has_mapping(user_ns, inode->i_uid) ||
+		    !kgid_has_mapping(user_ns, inode->i_gid)) {
+			bprm->interp_flags |= BINPRM_FLAGS_EXEC_INACCESSIBLE;
+		}
+	}
 }
 EXPORT_SYMBOL(would_dump);
 
-void setup_new_exec(struct linux_binprm * bprm)
+int setup_new_exec(struct linux_binprm * bprm)
 {
 	arch_pick_mmap_layout(current->mm);
 
@@ -1296,12 +1305,15 @@  void setup_new_exec(struct linux_binprm * bprm)
 	 */
 	current->mm->task_size = TASK_SIZE;
 
+	would_dump(bprm, bprm->file);
+	if (bprm->interp_flags & BINPRM_FLAGS_EXEC_INACCESSIBLE)
+		return -EPERM;
+
 	/* install the new credentials */
 	if (!uid_eq(bprm->cred->uid, current_euid()) ||
 	    !gid_eq(bprm->cred->gid, current_egid())) {
 		current->pdeath_signal = 0;
 	} else {
-		would_dump(bprm, bprm->file);
 		if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)
 			set_dumpable(current->mm, suid_dumpable);
 	}
@@ -1311,6 +1323,7 @@  void setup_new_exec(struct linux_binprm * bprm)
 	current->self_exec_id++;
 	flush_signal_handlers(current, 0);
 	do_close_on_exec(current->files);
+	return 0;
 }
 EXPORT_SYMBOL(setup_new_exec);
 
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 1303b570b18c..8e5fb9eca2ee 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -57,6 +57,10 @@  struct linux_binprm {
 #define BINPRM_FLAGS_PATH_INACCESSIBLE_BIT 2
 #define BINPRM_FLAGS_PATH_INACCESSIBLE (1 << BINPRM_FLAGS_PATH_INACCESSIBLE_BIT)
 
+/* executable is inaccessible for performing exec */
+#define BINPRM_FLAGS_EXEC_INACCESSIBLE_BIT 3
+#define BINPRM_FLAGS_EXEC_INACCESSIBLE (1 << BINPRM_FLAGS_EXEC_INACCESSIBLE_BIT)
+
 /* Function parameter for binfmt->coredump */
 struct coredump_params {
 	const siginfo_t *siginfo;
@@ -100,7 +104,7 @@  extern int prepare_binprm(struct linux_binprm *);
 extern int __must_check remove_arg_zero(struct linux_binprm *);
 extern int search_binary_handler(struct linux_binprm *);
 extern int flush_old_exec(struct linux_binprm * bprm);
-extern void setup_new_exec(struct linux_binprm * bprm);
+extern int setup_new_exec(struct linux_binprm * bprm);
 extern void would_dump(struct linux_binprm *, struct file *);
 
 extern int suid_dumpable;

Comments

Amir Goldstein Oct. 19, 2016, 6:13 a.m.
On Wed, Oct 19, 2016 at 12:15 AM, Eric W. Biederman
<ebiederm@xmission.com> wrote:
>
> When the user namespace support was merged the need to prevent
> ptracing an executable that is not readable was overlooked.
>
> Correct this oversight by not letting exec succeed if during exec an
> executable is not readable and the current user namespace capabilities
> do not apply to the executable's file.
>
> While it happens that distros install some files setuid and
> non-readable I have not found any executable files just installed
> non-readalbe.  Executables that are setuid to a user not mapped in a
> user namespace are worthless, so I don't expect this to introduce
> any problems in practice.
>
> There may be a way to allow this execution to happen by setting
> mm->user_ns to a more privileged user namespace and watching out for
> the possibility of using dynamic linkers or other shared libraries
> that the kernel loads into the mm to bypass the read-only
> restriction.  But the analysis is more difficult and it would
> require more code churn so I don't think the effort is worth it.
>
> Cc: stable@vger.kernel.org
> Reported-by: Jann Horn <jann@thejh.net>
> Fixes: 9e4a36ece652 ("userns: Fail exec for suid and sgid binaries with ids outside our user namespace.")
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---
>
> Tossing this out for review in case I missed something silly but this
> patch seems pretty trivial.
>
>  arch/x86/ia32/ia32_aout.c |  4 +++-
>  fs/binfmt_aout.c          |  4 +++-
>  fs/binfmt_elf.c           |  4 +++-
>  fs/binfmt_elf_fdpic.c     |  4 +++-
>  fs/binfmt_flat.c          |  4 +++-
>  fs/exec.c                 | 19 ++++++++++++++++---
>  include/linux/binfmts.h   |  6 +++++-
>  7 files changed, 36 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
> index cb26f18d43af..7ad20dedd929 100644
> --- a/arch/x86/ia32/ia32_aout.c
> +++ b/arch/x86/ia32/ia32_aout.c
> @@ -294,7 +294,9 @@ static int load_aout_binary(struct linux_binprm *bprm)
>         set_personality(PER_LINUX);
>         set_personality_ia32(false);
>
> -       setup_new_exec(bprm);
> +       retval = setup_new_exec(bprm);
> +       if (retval)
> +               return retval;
>
>         regs->cs = __USER32_CS;
>         regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
> diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
> index ae1b5404fced..b7b8aa03ccd0 100644
> --- a/fs/binfmt_aout.c
> +++ b/fs/binfmt_aout.c
> @@ -242,7 +242,9 @@ static int load_aout_binary(struct linux_binprm * bprm)
>  #else
>         set_personality(PER_LINUX);
>  #endif
> -       setup_new_exec(bprm);
> +       retval = setup_new_exec(bprm);
> +       if (retval)
> +               return retval;
>
>         current->mm->end_code = ex.a_text +
>                 (current->mm->start_code = N_TXTADDR(ex));
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 2472af2798c7..423fece0b8c4 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -852,7 +852,9 @@ static int load_elf_binary(struct linux_binprm *bprm)
>         if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
>                 current->flags |= PF_RANDOMIZE;
>
> -       setup_new_exec(bprm);
> +       retval = setup_new_exec(bprm);
> +       if (retval)
> +               goto out_free_dentry;
>         install_exec_creds(bprm);
>
>         /* Do this so that we can load the interpreter, if need be.  We will
> diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
> index 464a972e88c1..d3099caff96d 100644
> --- a/fs/binfmt_elf_fdpic.c
> +++ b/fs/binfmt_elf_fdpic.c
> @@ -352,7 +352,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
>         if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
>                 current->personality |= READ_IMPLIES_EXEC;
>
> -       setup_new_exec(bprm);
> +       retval = setup_new_exec(bprm);
> +       if (retval)
> +               goto error;
>
>         set_binfmt(&elf_fdpic_format);
>
> diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
> index 9b2917a30294..25ca68940ad4 100644
> --- a/fs/binfmt_flat.c
> +++ b/fs/binfmt_flat.c
> @@ -524,7 +524,9 @@ static int load_flat_file(struct linux_binprm *bprm,
>
>                 /* OK, This is the point of no return */
>                 set_personality(PER_LINUX_32BIT);
> -               setup_new_exec(bprm);
> +               ret = setup_new_exec(bprm);
> +               if (ret)
> +                       goto err;
>         }
>
>         /*
> diff --git a/fs/exec.c b/fs/exec.c
> index 6fcfb3f7b137..f724ed94ba7a 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1270,12 +1270,21 @@ EXPORT_SYMBOL(flush_old_exec);
>
>  void would_dump(struct linux_binprm *bprm, struct file *file)
>  {
> -       if (inode_permission(file_inode(file), MAY_READ) < 0)
> +       struct inode *inode = file_inode(file);
> +       if (inode_permission(inode, MAY_READ) < 0) {
> +               struct user_namespace *user_ns = current->mm->user_ns;
>                 bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
> +
> +               /* May the user_ns root read the executable? */
> +               if (!kuid_has_mapping(user_ns, inode->i_uid) ||
> +                   !kgid_has_mapping(user_ns, inode->i_gid)) {
> +                       bprm->interp_flags |= BINPRM_FLAGS_EXEC_INACCESSIBLE;
> +               }

This feels like it should belong inside
inode_permission(file_inode(file), MAY_EXEC)
which hopefully should be checked long before getting here??

> +       }
>  }
>  EXPORT_SYMBOL(would_dump);
>
> -void setup_new_exec(struct linux_binprm * bprm)
> +int setup_new_exec(struct linux_binprm * bprm)
>  {
>         arch_pick_mmap_layout(current->mm);
>
> @@ -1296,12 +1305,15 @@ void setup_new_exec(struct linux_binprm * bprm)
>          */
>         current->mm->task_size = TASK_SIZE;
>
> +       would_dump(bprm, bprm->file);
> +       if (bprm->interp_flags & BINPRM_FLAGS_EXEC_INACCESSIBLE)
> +               return -EPERM;
> +
>         /* install the new credentials */
>         if (!uid_eq(bprm->cred->uid, current_euid()) ||
>             !gid_eq(bprm->cred->gid, current_egid())) {
>                 current->pdeath_signal = 0;
>         } else {
> -               would_dump(bprm, bprm->file);
>                 if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)
>                         set_dumpable(current->mm, suid_dumpable);
>         }
> @@ -1311,6 +1323,7 @@ void setup_new_exec(struct linux_binprm * bprm)
>         current->self_exec_id++;
>         flush_signal_handlers(current, 0);
>         do_close_on_exec(current->files);
> +       return 0;
>  }
>  EXPORT_SYMBOL(setup_new_exec);
>
> diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
> index 1303b570b18c..8e5fb9eca2ee 100644
> --- a/include/linux/binfmts.h
> +++ b/include/linux/binfmts.h
> @@ -57,6 +57,10 @@ struct linux_binprm {
>  #define BINPRM_FLAGS_PATH_INACCESSIBLE_BIT 2
>  #define BINPRM_FLAGS_PATH_INACCESSIBLE (1 << BINPRM_FLAGS_PATH_INACCESSIBLE_BIT)
>
> +/* executable is inaccessible for performing exec */
> +#define BINPRM_FLAGS_EXEC_INACCESSIBLE_BIT 3
> +#define BINPRM_FLAGS_EXEC_INACCESSIBLE (1 << BINPRM_FLAGS_EXEC_INACCESSIBLE_BIT)
> +
>  /* Function parameter for binfmt->coredump */
>  struct coredump_params {
>         const siginfo_t *siginfo;
> @@ -100,7 +104,7 @@ extern int prepare_binprm(struct linux_binprm *);
>  extern int __must_check remove_arg_zero(struct linux_binprm *);
>  extern int search_binary_handler(struct linux_binprm *);
>  extern int flush_old_exec(struct linux_binprm * bprm);
> -extern void setup_new_exec(struct linux_binprm * bprm);
> +extern int setup_new_exec(struct linux_binprm * bprm);
>  extern void would_dump(struct linux_binprm *, struct file *);
>
>  extern int suid_dumpable;
> --
> 2.8.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman Oct. 19, 2016, 1:33 p.m.
Amir Goldstein <amir73il@gmail.com> writes:

>> diff --git a/fs/exec.c b/fs/exec.c
>> index 6fcfb3f7b137..f724ed94ba7a 100644
>> --- a/fs/exec.c
>> +++ b/fs/exec.c
>> @@ -1270,12 +1270,21 @@ EXPORT_SYMBOL(flush_old_exec);
>>
>>  void would_dump(struct linux_binprm *bprm, struct file *file)
>>  {
>> -       if (inode_permission(file_inode(file), MAY_READ) < 0)
>> +       struct inode *inode = file_inode(file);
>> +       if (inode_permission(inode, MAY_READ) < 0) {
>> +               struct user_namespace *user_ns = current->mm->user_ns;
>>                 bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
>> +
>> +               /* May the user_ns root read the executable? */
>> +               if (!kuid_has_mapping(user_ns, inode->i_uid) ||
>> +                   !kgid_has_mapping(user_ns, inode->i_gid)) {
>> +                       bprm->interp_flags |= BINPRM_FLAGS_EXEC_INACCESSIBLE;
>> +               }
>
> This feels like it should belong inside
> inode_permission(file_inode(file), MAY_EXEC)
> which hopefully should be checked long before getting here??

It is the active ingredient in capable_wrt_inode_uidgid and is indeed
inside of inode_permission.

What I am testing for here is if I have a process with a full
set of capabilities in current->mm->user_ns will the inode be readable.

I can see an argument for calling prepare_creds stuffing the new cred
full of capabilities.  Calling override_cred.  Calling inode_permission,
restoring the credentials.  But it seems very much like overkill and
more error prone because of the more code involved.

So I have done the simple thing that doesn't hide what is really going on.

Eric
Andy Lutomirski Oct. 19, 2016, 3:30 p.m.
On Tue, Oct 18, 2016 at 2:15 PM, Eric W. Biederman
<ebiederm@xmission.com> wrote:
>
> When the user namespace support was merged the need to prevent
> ptracing an executable that is not readable was overlooked.

Before getting too excited about this fix, isn't there a much bigger
hole that's been there forever?  Simply ptrace yourself, exec the
program, and then dump the program out.  A program that really wants
to be unreadable should have a stub: the stub is setuid and readable,
but all the stub does is to exec the real program, and the real
program should have mode 0500 or similar.

ISTM the "right" check would be to enforce that the program's new
creds can read the program, but that will break backwards
compatibility.

--Andy
Eric W. Biederman Oct. 19, 2016, 4:52 p.m.
Andy Lutomirski <luto@amacapital.net> writes:

> On Tue, Oct 18, 2016 at 2:15 PM, Eric W. Biederman
> <ebiederm@xmission.com> wrote:
>>
>> When the user namespace support was merged the need to prevent
>> ptracing an executable that is not readable was overlooked.
>
> Before getting too excited about this fix, isn't there a much bigger
> hole that's been there forever?

In this case it was a newish hole (2011) that the user namespace support
added that I am closing.  I am not super excited but I figure it is
useful to make the kernel semantics at least as secure as they were
before.

> Simply ptrace yourself, exec the
> program, and then dump the program out.  A program that really wants
> to be unreadable should have a stub: the stub is setuid and readable,
> but all the stub does is to exec the real program, and the real
> program should have mode 0500 or similar.
>
> ISTM the "right" check would be to enforce that the program's new
> creds can read the program, but that will break backwards
> compatibility.

Last I looked I had the impression that exec of a setuid program kills
the ptrace.

If we are talking about a exec of a simple unreadable executable (aka
something that sets undumpable but is not setuid or setgid).  Then I
agree it should break the ptrace as well and since those programs are as
rare as hens teeth I don't see any problem with changing the ptrace behavior
in that case.

Eric
Eric W. Biederman Oct. 19, 2016, 5:04 p.m.
ebiederm@xmission.com (Eric W. Biederman) writes:

> Amir Goldstein <amir73il@gmail.com> writes:
>
>>> diff --git a/fs/exec.c b/fs/exec.c
>>> index 6fcfb3f7b137..f724ed94ba7a 100644
>>> --- a/fs/exec.c
>>> +++ b/fs/exec.c
>>> @@ -1270,12 +1270,21 @@ EXPORT_SYMBOL(flush_old_exec);
>>>
>>>  void would_dump(struct linux_binprm *bprm, struct file *file)
>>>  {
>>> -       if (inode_permission(file_inode(file), MAY_READ) < 0)
>>> +       struct inode *inode = file_inode(file);
>>> +       if (inode_permission(inode, MAY_READ) < 0) {
>>> +               struct user_namespace *user_ns = current->mm->user_ns;
>>>                 bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
>>> +
>>> +               /* May the user_ns root read the executable? */
>>> +               if (!kuid_has_mapping(user_ns, inode->i_uid) ||
>>> +                   !kgid_has_mapping(user_ns, inode->i_gid)) {
>>> +                       bprm->interp_flags |= BINPRM_FLAGS_EXEC_INACCESSIBLE;
>>> +               }
>>
>> This feels like it should belong inside
>> inode_permission(file_inode(file), MAY_EXEC)
>> which hopefully should be checked long before getting here??
>
> It is the active ingredient in capable_wrt_inode_uidgid and is indeed
> inside of inode_permission.
>
> What I am testing for here is if I have a process with a full
> set of capabilities in current->mm->user_ns will the inode be readable.
>
> I can see an argument for calling prepare_creds stuffing the new cred
> full of capabilities.  Calling override_cred.  Calling inode_permission,
> restoring the credentials.  But it seems very much like overkill and
> more error prone because of the more code involved.
>
> So I have done the simple thing that doesn't hide what is really going on.

At the same time I can see the addition of a helper function
bool ns_inode(struct user_namespace *user_ns, struct inode *inode)
{
	return kuid_has_mapping(user_ns, inode->i_uid) &&
        	kgid_has_mapping(user_ns, inode->i_gid);
}

That abstracts out the concept instead of open codes it.

Eric
Jann Horn Oct. 19, 2016, 5:29 p.m.
On Wed, Oct 19, 2016 at 11:52:50AM -0500, Eric W. Biederman wrote:
> Andy Lutomirski <luto@amacapital.net> writes:
> > Simply ptrace yourself, exec the
> > program, and then dump the program out.  A program that really wants
> > to be unreadable should have a stub: the stub is setuid and readable,
> > but all the stub does is to exec the real program, and the real
> > program should have mode 0500 or similar.
> >
> > ISTM the "right" check would be to enforce that the program's new
> > creds can read the program, but that will break backwards
> > compatibility.
> 
> Last I looked I had the impression that exec of a setuid program kills
> the ptrace.
> 
> If we are talking about a exec of a simple unreadable executable (aka
> something that sets undumpable but is not setuid or setgid).  Then I
> agree it should break the ptrace as well and since those programs are as
> rare as hens teeth I don't see any problem with changing the ptrace behavior
> in that case.

Nope. check_unsafe_exec() sets LSM_UNSAFE_* flags in bprm->unsafe, and then
the flags are checked by the LSMs and cap_bprm_set_creds() in commoncap.c.
cap_bprm_set_creds() just degrades the execution to a non-setuid-ish one,
and e.g. ptracers stay attached.

Same thing happens if the fs struct is shared with another process or if
NO_NEW_PRIVS is active.

(Actually, it's still a bit like normal setuid execution: IIRC AT_SECURE
stays active, and the resulting process still won't be dumpable, so it's
not possible for a *new* ptracer to attach afterwards. But this is just
from memory, I'm not entirely sure.)
Andy Lutomirski Oct. 19, 2016, 5:32 p.m.
On Wed, Oct 19, 2016 at 10:29 AM, Jann Horn <jann@thejh.net> wrote:
> On Wed, Oct 19, 2016 at 11:52:50AM -0500, Eric W. Biederman wrote:
>> Andy Lutomirski <luto@amacapital.net> writes:
>> > Simply ptrace yourself, exec the
>> > program, and then dump the program out.  A program that really wants
>> > to be unreadable should have a stub: the stub is setuid and readable,
>> > but all the stub does is to exec the real program, and the real
>> > program should have mode 0500 or similar.
>> >
>> > ISTM the "right" check would be to enforce that the program's new
>> > creds can read the program, but that will break backwards
>> > compatibility.
>>
>> Last I looked I had the impression that exec of a setuid program kills
>> the ptrace.
>>
>> If we are talking about a exec of a simple unreadable executable (aka
>> something that sets undumpable but is not setuid or setgid).  Then I
>> agree it should break the ptrace as well and since those programs are as
>> rare as hens teeth I don't see any problem with changing the ptrace behavior
>> in that case.
>
> Nope. check_unsafe_exec() sets LSM_UNSAFE_* flags in bprm->unsafe, and then
> the flags are checked by the LSMs and cap_bprm_set_creds() in commoncap.c.
> cap_bprm_set_creds() just degrades the execution to a non-setuid-ish one,
> and e.g. ptracers stay attached.

I think you're right.  I ought to be completely sure because I rewrote
that code back in 2005 or so back when I thought kernel programming
was only for the cool kids.  It was probably my first kernel patch
ever and it closed an awkward-to-exploit root hole.  But it's been a
while.  (Too bad my second (IIRC) kernel patch was more mundane and
fixed the mute button on "new" Lenovo X60-era laptops and spend
several years in limbo...)

--Andy
Eric W. Biederman Oct. 19, 2016, 5:55 p.m.
Andy Lutomirski <luto@amacapital.net> writes:

> On Wed, Oct 19, 2016 at 10:29 AM, Jann Horn <jann@thejh.net> wrote:
>> On Wed, Oct 19, 2016 at 11:52:50AM -0500, Eric W. Biederman wrote:
>>> Andy Lutomirski <luto@amacapital.net> writes:
>>> > Simply ptrace yourself, exec the
>>> > program, and then dump the program out.  A program that really wants
>>> > to be unreadable should have a stub: the stub is setuid and readable,
>>> > but all the stub does is to exec the real program, and the real
>>> > program should have mode 0500 or similar.
>>> >
>>> > ISTM the "right" check would be to enforce that the program's new
>>> > creds can read the program, but that will break backwards
>>> > compatibility.
>>>
>>> Last I looked I had the impression that exec of a setuid program kills
>>> the ptrace.
>>>
>>> If we are talking about a exec of a simple unreadable executable (aka
>>> something that sets undumpable but is not setuid or setgid).  Then I
>>> agree it should break the ptrace as well and since those programs are as
>>> rare as hens teeth I don't see any problem with changing the ptrace behavior
>>> in that case.
>>
>> Nope. check_unsafe_exec() sets LSM_UNSAFE_* flags in bprm->unsafe, and then
>> the flags are checked by the LSMs and cap_bprm_set_creds() in commoncap.c.
>> cap_bprm_set_creds() just degrades the execution to a non-setuid-ish one,
>> and e.g. ptracers stay attached.
>
> I think you're right.  I ought to be completely sure because I rewrote
> that code back in 2005 or so back when I thought kernel programming
> was only for the cool kids.  It was probably my first kernel patch
> ever and it closed an awkward-to-exploit root hole.  But it's been a
> while.  (Too bad my second (IIRC) kernel patch was more mundane and
> fixed the mute button on "new" Lenovo X60-era laptops and spend
> several years in limbo...)

Ah yes and this is only a problem if the ptracer does not have
CAP_SYS_PTRACE.

If the tracer does not have sufficient permissions any opinions on
failing the exec or kicking out the ptracer?  I am leaning towards failing
the exec as it is more obvious if someone cares.  Dropping the ptracer
could be a major mystery.

Eric
Andy Lutomirski Oct. 19, 2016, 6:36 p.m.
On Oct 19, 2016 9:54 AM, "Eric W. Biederman" <ebiederm@xmission.com> wrote:
>
> Andy Lutomirski <luto@amacapital.net> writes:
>
> > On Tue, Oct 18, 2016 at 2:15 PM, Eric W. Biederman
> > <ebiederm@xmission.com> wrote:
> >>
> >> When the user namespace support was merged the need to prevent
> >> ptracing an executable that is not readable was overlooked.
> >
> > Before getting too excited about this fix, isn't there a much bigger
> > hole that's been there forever?
>
> In this case it was a newish hole (2011) that the user namespace support
> added that I am closing.  I am not super excited but I figure it is
> useful to make the kernel semantics at least as secure as they were
> before.
>

But if it was never secure in the first place...

> > Simply ptrace yourself, exec the
> > program, and then dump the program out.  A program that really wants
> > to be unreadable should have a stub: the stub is setuid and readable,
> > but all the stub does is to exec the real program, and the real
> > program should have mode 0500 or similar.
> >
> > ISTM the "right" check would be to enforce that the program's new
> > creds can read the program, but that will break backwards
> > compatibility.
>
> Last I looked I had the impression that exec of a setuid program kills
> the ptrace.

I thought it killed the setuid, not the ptrace.

(I ought to know because I rewrote that code back in 2005 or so back when I
thought kernel programming was only for the cool kids.  It was probably my
first kernel patch ever and it fixed an awkward-to-exploit race.  But it's
been a while.)
Andy Lutomirski Oct. 19, 2016, 6:38 p.m.
On Wed, Oct 19, 2016 at 10:55 AM, Eric W. Biederman
<ebiederm@xmission.com> wrote:
> Andy Lutomirski <luto@amacapital.net> writes:
>
>> On Wed, Oct 19, 2016 at 10:29 AM, Jann Horn <jann@thejh.net> wrote:
>>> On Wed, Oct 19, 2016 at 11:52:50AM -0500, Eric W. Biederman wrote:
>>>> Andy Lutomirski <luto@amacapital.net> writes:
>>>> > Simply ptrace yourself, exec the
>>>> > program, and then dump the program out.  A program that really wants
>>>> > to be unreadable should have a stub: the stub is setuid and readable,
>>>> > but all the stub does is to exec the real program, and the real
>>>> > program should have mode 0500 or similar.
>>>> >
>>>> > ISTM the "right" check would be to enforce that the program's new
>>>> > creds can read the program, but that will break backwards
>>>> > compatibility.
>>>>
>>>> Last I looked I had the impression that exec of a setuid program kills
>>>> the ptrace.
>>>>
>>>> If we are talking about a exec of a simple unreadable executable (aka
>>>> something that sets undumpable but is not setuid or setgid).  Then I
>>>> agree it should break the ptrace as well and since those programs are as
>>>> rare as hens teeth I don't see any problem with changing the ptrace behavior
>>>> in that case.
>>>
>>> Nope. check_unsafe_exec() sets LSM_UNSAFE_* flags in bprm->unsafe, and then
>>> the flags are checked by the LSMs and cap_bprm_set_creds() in commoncap.c.
>>> cap_bprm_set_creds() just degrades the execution to a non-setuid-ish one,
>>> and e.g. ptracers stay attached.
>>
>> I think you're right.  I ought to be completely sure because I rewrote
>> that code back in 2005 or so back when I thought kernel programming
>> was only for the cool kids.  It was probably my first kernel patch
>> ever and it closed an awkward-to-exploit root hole.  But it's been a
>> while.  (Too bad my second (IIRC) kernel patch was more mundane and
>> fixed the mute button on "new" Lenovo X60-era laptops and spend
>> several years in limbo...)
>
> Ah yes and this is only a problem if the ptracer does not have
> CAP_SYS_PTRACE.
>
> If the tracer does not have sufficient permissions any opinions on
> failing the exec or kicking out the ptracer?  I am leaning towards failing
> the exec as it is more obvious if someone cares.  Dropping the ptracer
> could be a major mystery.

I would suggest leaving it alone.  Changing it could break enough
things that a sysctl would be needed, and I just don't see how this is
a significant issue, especially since it's been insecure forever.
Anyone who cares should do the stub executable trick:

/sbin/foo: 04755, literally just does execve("/sbin/foo-helper");

/sbin/foo-helper: 0500.

--Andy
Eric W. Biederman Oct. 19, 2016, 9:26 p.m.
Andy Lutomirski <luto@amacapital.net> writes:

> On Wed, Oct 19, 2016 at 10:55 AM, Eric W. Biederman
> <ebiederm@xmission.com> wrote:
>> Andy Lutomirski <luto@amacapital.net> writes:
>>
>>> On Wed, Oct 19, 2016 at 10:29 AM, Jann Horn <jann@thejh.net> wrote:
>>>> On Wed, Oct 19, 2016 at 11:52:50AM -0500, Eric W. Biederman wrote:
>>>>> Andy Lutomirski <luto@amacapital.net> writes:
>>>>> > Simply ptrace yourself, exec the
>>>>> > program, and then dump the program out.  A program that really wants
>>>>> > to be unreadable should have a stub: the stub is setuid and readable,
>>>>> > but all the stub does is to exec the real program, and the real
>>>>> > program should have mode 0500 or similar.
>>>>> >
>>>>> > ISTM the "right" check would be to enforce that the program's new
>>>>> > creds can read the program, but that will break backwards
>>>>> > compatibility.
>>>>>
>>>>> Last I looked I had the impression that exec of a setuid program kills
>>>>> the ptrace.
>>>>>
>>>>> If we are talking about a exec of a simple unreadable executable (aka
>>>>> something that sets undumpable but is not setuid or setgid).  Then I
>>>>> agree it should break the ptrace as well and since those programs are as
>>>>> rare as hens teeth I don't see any problem with changing the ptrace behavior
>>>>> in that case.
>>>>
>>>> Nope. check_unsafe_exec() sets LSM_UNSAFE_* flags in bprm->unsafe, and then
>>>> the flags are checked by the LSMs and cap_bprm_set_creds() in commoncap.c.
>>>> cap_bprm_set_creds() just degrades the execution to a non-setuid-ish one,
>>>> and e.g. ptracers stay attached.
>>>
>>> I think you're right.  I ought to be completely sure because I rewrote
>>> that code back in 2005 or so back when I thought kernel programming
>>> was only for the cool kids.  It was probably my first kernel patch
>>> ever and it closed an awkward-to-exploit root hole.  But it's been a
>>> while.  (Too bad my second (IIRC) kernel patch was more mundane and
>>> fixed the mute button on "new" Lenovo X60-era laptops and spend
>>> several years in limbo...)
>>
>> Ah yes and this is only a problem if the ptracer does not have
>> CAP_SYS_PTRACE.
>>
>> If the tracer does not have sufficient permissions any opinions on
>> failing the exec or kicking out the ptracer?  I am leaning towards failing
>> the exec as it is more obvious if someone cares.  Dropping the ptracer
>> could be a major mystery.
>
> I would suggest leaving it alone.  Changing it could break enough
> things that a sysctl would be needed, and I just don't see how this is
> a significant issue, especially since it's been insecure forever.
> Anyone who cares should do the stub executable trick:
>
> /sbin/foo: 04755, literally just does execve("/sbin/foo-helper");
>
> /sbin/foo-helper: 0500.

I can't imagine what non-malware would depend on being able to
circumvent file permissions and ptrace a read-only executable.  Is there
something you are thinking of?

I know I saw someone depending on read-only executables being read-only
earlier this week on the security list, and it could definitely act as
part of a counter measure to make binaries harder to exploit.

So given that people actually expect no-read permissions to be honored
on executables (with what seem valid and sensible use cases), that
I can't see any valid reason not to honor no-read permissions, that it
takes a really convoluted setup to bypass the current no-read
permissions, and that I can't believe anyone cares about the current
behavior of ptrace I think this is worth fixing.

Eric
Andy Lutomirski Oct. 19, 2016, 11:17 p.m.
On Oct 19, 2016 2:28 PM, "Eric W. Biederman" <ebiederm@xmission.com> wrote:
>
> Andy Lutomirski <luto@amacapital.net> writes:
>
> > On Wed, Oct 19, 2016 at 10:55 AM, Eric W. Biederman
> > <ebiederm@xmission.com> wrote:
> >> Andy Lutomirski <luto@amacapital.net> writes:
> >>
> >>> On Wed, Oct 19, 2016 at 10:29 AM, Jann Horn <jann@thejh.net> wrote:
> >>>> On Wed, Oct 19, 2016 at 11:52:50AM -0500, Eric W. Biederman wrote:
> >>>>> Andy Lutomirski <luto@amacapital.net> writes:
> >>>>> > Simply ptrace yourself, exec the
> >>>>> > program, and then dump the program out.  A program that really wants
> >>>>> > to be unreadable should have a stub: the stub is setuid and readable,
> >>>>> > but all the stub does is to exec the real program, and the real
> >>>>> > program should have mode 0500 or similar.
> >>>>> >
> >>>>> > ISTM the "right" check would be to enforce that the program's new
> >>>>> > creds can read the program, but that will break backwards
> >>>>> > compatibility.
> >>>>>
> >>>>> Last I looked I had the impression that exec of a setuid program kills
> >>>>> the ptrace.
> >>>>>
> >>>>> If we are talking about a exec of a simple unreadable executable (aka
> >>>>> something that sets undumpable but is not setuid or setgid).  Then I
> >>>>> agree it should break the ptrace as well and since those programs are as
> >>>>> rare as hens teeth I don't see any problem with changing the ptrace behavior
> >>>>> in that case.
> >>>>
> >>>> Nope. check_unsafe_exec() sets LSM_UNSAFE_* flags in bprm->unsafe, and then
> >>>> the flags are checked by the LSMs and cap_bprm_set_creds() in commoncap.c.
> >>>> cap_bprm_set_creds() just degrades the execution to a non-setuid-ish one,
> >>>> and e.g. ptracers stay attached.
> >>>
> >>> I think you're right.  I ought to be completely sure because I rewrote
> >>> that code back in 2005 or so back when I thought kernel programming
> >>> was only for the cool kids.  It was probably my first kernel patch
> >>> ever and it closed an awkward-to-exploit root hole.  But it's been a
> >>> while.  (Too bad my second (IIRC) kernel patch was more mundane and
> >>> fixed the mute button on "new" Lenovo X60-era laptops and spend
> >>> several years in limbo...)
> >>
> >> Ah yes and this is only a problem if the ptracer does not have
> >> CAP_SYS_PTRACE.
> >>
> >> If the tracer does not have sufficient permissions any opinions on
> >> failing the exec or kicking out the ptracer?  I am leaning towards failing
> >> the exec as it is more obvious if someone cares.  Dropping the ptracer
> >> could be a major mystery.
> >
> > I would suggest leaving it alone.  Changing it could break enough
> > things that a sysctl would be needed, and I just don't see how this is
> > a significant issue, especially since it's been insecure forever.
> > Anyone who cares should do the stub executable trick:
> >
> > /sbin/foo: 04755, literally just does execve("/sbin/foo-helper");
> >
> > /sbin/foo-helper: 0500.
>
> I can't imagine what non-malware would depend on being able to
> circumvent file permissions and ptrace a read-only executable.  Is there
> something you are thinking of?

$ strace sudo foobar

or

$ strace auditctl

I find the current behavior somewhat odd, but I've taken advantage of
it on a semi-regular basis.

That being said, the "May the user_ns root read the executable?" test
in your patch is not strictly correct.  Do we keep a struct cred
around for the ns root?
Eric W. Biederman Nov. 17, 2016, 5:02 p.m.
With everyone heading to Kernel Summit and Plumbers I put this set of
patches down temporarily.   Now is the time to take it back up and to
make certain I am not missing something stupid in this set of patches.

There are other issues in this area as well, but these are the pieces
that I can see clearly, and have tested fixes for.

Andy as to your criticism about using strace sudo I can't possibly see
how that is effective or useful.  Under strace sudo won't run as root
today, and will immediately exit because it is not root.  Furthermore
the only place I can find non-readable executables is people hardening
suid root executables so they are more difficult to trace.  So I
definitely think we should honor the unix permissions and people's
expressed wishes.

Eric W. Biederman (3):
      ptrace: Capture the ptracer's creds not PT_PTRACE_CAP
      exec: Don't allow ptracing an exec of an unreadable file
      exec: Ensure mm->user_ns contains the execed files

 fs/exec.c                  | 26 +++++++++++++++++++++++---
 include/linux/capability.h |  2 ++
 include/linux/ptrace.h     |  1 -
 include/linux/sched.h      |  1 +
 kernel/capability.c        | 36 ++++++++++++++++++++++++++++++++++--
 kernel/ptrace.c            | 12 +++++++-----
 6 files changed, 67 insertions(+), 11 deletions(-)
Willy Tarreau Nov. 19, 2016, 7:17 a.m.
Hi Eric,

On Thu, Nov 17, 2016 at 11:02:47AM -0600, Eric W. Biederman wrote:
> 
> With everyone heading to Kernel Summit and Plumbers I put this set of
> patches down temporarily.   Now is the time to take it back up and to
> make certain I am not missing something stupid in this set of patches.

I couldn't get your patch set to apply to any of the kernels I tried,
I manually adjusted some parts but the second one has too many rejects.
What kernel should I apply this to ? Or maybe some preliminary patches
are needed ?

Thanks,
Willy
Willy Tarreau Nov. 19, 2016, 9:28 a.m.
On Sat, Nov 19, 2016 at 08:17:00AM +0100, Willy Tarreau wrote:
> Hi Eric,
> 
> On Thu, Nov 17, 2016 at 11:02:47AM -0600, Eric W. Biederman wrote:
> > 
> > With everyone heading to Kernel Summit and Plumbers I put this set of
> > patches down temporarily.   Now is the time to take it back up and to
> > make certain I am not missing something stupid in this set of patches.
> 
> I couldn't get your patch set to apply to any of the kernels I tried,
> I manually adjusted some parts but the second one has too many rejects.
> What kernel should I apply this to ? Or maybe some preliminary patches
> are needed ?

OK I finally managed to get it to work on top of 4.8.9 (required less changes
than master). I also had to drop the user_ns changes since there's no such
user_ns in mm_struct there.

I could run a test on it, that looks reasonable :

FS:

admin@vm:~$ strace -e trace=fstat,uname,ioctl,open uname
open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open(0x7f3f9a1663e3, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = 3
fstat(3, {...})                         = 0
open(0x7ffd01bbee80, O_RDONLY|O_CLOEXEC) = 3
fstat(3, {...})                         = 0
uname({...})                            = 0
fstat(1, {...})                         = 0
ioctl(1, SNDCTL_TMR_TIMEBASE or TCGETS, 0x7ffd01bbf400) = 0

admin@vm:~$ sudo strace -e trace=fstat,uname,ioctl,open uname
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open("/lib64/tls/x86_64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open("/lib64/tls/libpthread.so.0", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open("/lib64/x86_64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open("/lib64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0555, st_size=101312, ...}) = 0
open("/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0555, st_size=1479016, ...}) = 0
uname({sys="Linux", node="vm", ...})    = 0
fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(4, 64), ...}) = 0
ioctl(1, SNDCTL_TMR_TIMEBASE or TCGETS, {B9600 opost isig icanon echo ...}) = 0

Network:

admin@vm:~$ strace -e trace=socket,setsockopt,connect /tmp/nc 198.18.3 22
socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
connect(3, {...}, 110)                  = -1 ENOENT (No such file or directory)
socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
connect(3, {...}, 110)                  = -1 ENOENT (No such file or directory)
socket(PF_INET, SOCK_STREAM, IPPROTO_TCP) = 3
setsockopt(3, SOL_SOCKET, SO_REUSEADDR, 0x7ffd2c26bdbc, 4) = 0
connect(3, {...}, 16)                   = 0

admin@vm:~$ sudo strace -e trace=socket,setsockopt,connect /tmp/nc 198.18.3 22
socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
socket(PF_INET, SOCK_STREAM, IPPROTO_TCP) = 3
setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("198.18.0.3")}, 16) = 0

So in short now we can at least see what syscall fails eventhough we can't
know why. I think it can be an acceptable trade-off.

Thanks,
Willy
Willy Tarreau Nov. 19, 2016, 9:33 a.m.
On Sat, Nov 19, 2016 at 10:28:04AM +0100, Willy Tarreau wrote:
> On Sat, Nov 19, 2016 at 08:17:00AM +0100, Willy Tarreau wrote:
> > Hi Eric,
> > 
> > On Thu, Nov 17, 2016 at 11:02:47AM -0600, Eric W. Biederman wrote:
> > > 
> > > With everyone heading to Kernel Summit and Plumbers I put this set of
> > > patches down temporarily.   Now is the time to take it back up and to
> > > make certain I am not missing something stupid in this set of patches.
> > 
> > I couldn't get your patch set to apply to any of the kernels I tried,
> > I manually adjusted some parts but the second one has too many rejects.
> > What kernel should I apply this to ? Or maybe some preliminary patches
> > are needed ?
> 
> OK I finally managed to get it to work on top of 4.8.9 (required less changes
> than master). I also had to drop the user_ns changes since there's no such
> user_ns in mm_struct there.
> 
> I could run a test on it, that looks reasonable :
> 
> FS:
> 
> admin@vm:~$ strace -e trace=fstat,uname,ioctl,open uname
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7f3f9a1663e3, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = 3
> fstat(3, {...})                         = 0
> open(0x7ffd01bbee80, O_RDONLY|O_CLOEXEC) = 3
> fstat(3, {...})                         = 0
> uname({...})                            = 0
> fstat(1, {...})                         = 0
> ioctl(1, SNDCTL_TMR_TIMEBASE or TCGETS, 0x7ffd01bbf400) = 0
> 
> admin@vm:~$ sudo strace -e trace=fstat,uname,ioctl,open uname
> open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open("/lib64/tls/x86_64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open("/lib64/tls/libpthread.so.0", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open("/lib64/x86_64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open("/lib64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = 3
> fstat(3, {st_mode=S_IFREG|0555, st_size=101312, ...}) = 0
> open("/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
> fstat(3, {st_mode=S_IFREG|0555, st_size=1479016, ...}) = 0
> uname({sys="Linux", node="vm", ...})    = 0
> fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(4, 64), ...}) = 0
> ioctl(1, SNDCTL_TMR_TIMEBASE or TCGETS, {B9600 opost isig icanon echo ...}) = 0
> 
> Network:
> 
> admin@vm:~$ strace -e trace=socket,setsockopt,connect /tmp/nc 198.18.3 22
> socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
> connect(3, {...}, 110)                  = -1 ENOENT (No such file or directory)
> socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
> connect(3, {...}, 110)                  = -1 ENOENT (No such file or directory)
> socket(PF_INET, SOCK_STREAM, IPPROTO_TCP) = 3
> setsockopt(3, SOL_SOCKET, SO_REUSEADDR, 0x7ffd2c26bdbc, 4) = 0
> connect(3, {...}, 16)                   = 0
> 
> admin@vm:~$ sudo strace -e trace=socket,setsockopt,connect /tmp/nc 198.18.3 22
> socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
> connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
> socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
> connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
> socket(PF_INET, SOCK_STREAM, IPPROTO_TCP) = 3
> setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
> connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("198.18.0.3")}, 16) = 0
> 
> So in short now we can at least see what syscall fails eventhough we can't
> know why. I think it can be an acceptable trade-off.

I also tested with gdb and it behaves as desired :

admin@vm:~$ sleep 100000 &
[1] 1615
admin@vm:~$ /tmp/gdb-x86_64 -q -p 1615
Attaching to process 1615
ptrace: Operation not permitted.
(gdb) quit

admin@vm:~$ sudo cp /bin/sleep /var/tmp/
admin@vm:~$ sudo chmod 755 /var/tmp/sleep 
admin@vm:~$ /tmp/sleep 100000 &
[1] 1620
admin@vm:~$ /tmp/gdb-x86_64 -q -p 1620
Attaching to process 1620
Reading symbols from /var/tmp/sleep...(no debugging symbols found)...done.
(...)
0x00007f6723d561b0 in nanosleep () from /lib64/libpthread.so.0
(gdb) quit

Willy
Eric W. Biederman Nov. 19, 2016, 6:35 p.m.
Willy Tarreau <w@1wt.eu> writes:

> Hi Eric,
>
> On Thu, Nov 17, 2016 at 11:02:47AM -0600, Eric W. Biederman wrote:
>> 
>> With everyone heading to Kernel Summit and Plumbers I put this set of
>> patches down temporarily.   Now is the time to take it back up and to
>> make certain I am not missing something stupid in this set of patches.
>
> I couldn't get your patch set to apply to any of the kernels I tried,
> I manually adjusted some parts but the second one has too many rejects.
> What kernel should I apply this to ? Or maybe some preliminary patches
> are needed ?

It is against my for-next branch, and there is one patch in there
that is significant.

The entire patchset should be at:
git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git for-next

Eric
Eric W. Biederman Nov. 19, 2016, 6:37 p.m.
ebiederm@xmission.com (Eric W. Biederman) writes:

> Willy Tarreau <w@1wt.eu> writes:
>
>> Hi Eric,
>>
>> On Thu, Nov 17, 2016 at 11:02:47AM -0600, Eric W. Biederman wrote:
>>> 
>>> With everyone heading to Kernel Summit and Plumbers I put this set of
>>> patches down temporarily.   Now is the time to take it back up and to
>>> make certain I am not missing something stupid in this set of patches.
>>
>> I couldn't get your patch set to apply to any of the kernels I tried,
>> I manually adjusted some parts but the second one has too many rejects.
>> What kernel should I apply this to ? Or maybe some preliminary patches
>> are needed ?
>
> It is against my for-next branch, and there is one patch in there
> that is significant.
>
> The entire patchset should be at:
> git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git for-next

Grr. I typed too fast, the branch above is the base:
git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git for-testing

should be the entire thing.

Eric
Eric W. Biederman Nov. 19, 2016, 6:44 p.m.
Willy Tarreau <w@1wt.eu> writes:

> On Sat, Nov 19, 2016 at 08:17:00AM +0100, Willy Tarreau wrote:
>> Hi Eric,
>> 
>> On Thu, Nov 17, 2016 at 11:02:47AM -0600, Eric W. Biederman wrote:
>> > 
>> > With everyone heading to Kernel Summit and Plumbers I put this set of
>> > patches down temporarily.   Now is the time to take it back up and to
>> > make certain I am not missing something stupid in this set of patches.
>> 
>> I couldn't get your patch set to apply to any of the kernels I tried,
>> I manually adjusted some parts but the second one has too many rejects.
>> What kernel should I apply this to ? Or maybe some preliminary patches
>> are needed ?
>
> OK I finally managed to get it to work on top of 4.8.9 (required less changes
> than master). I also had to drop the user_ns changes since there's no such
> user_ns in mm_struct there.
>
> I could run a test on it, that looks reasonable :
>
> FS:
>
> admin@vm:~$ strace -e trace=fstat,uname,ioctl,open uname
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7f3f9a1663e3, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open(0x7ffd01bbeeb0, O_RDONLY|O_CLOEXEC) = 3
> fstat(3, {...})                         = 0
> open(0x7ffd01bbee80, O_RDONLY|O_CLOEXEC) = 3
> fstat(3, {...})                         = 0
> uname({...})                            = 0
> fstat(1, {...})                         = 0
> ioctl(1, SNDCTL_TMR_TIMEBASE or TCGETS, 0x7ffd01bbf400) = 0
>
> admin@vm:~$ sudo strace -e trace=fstat,uname,ioctl,open uname
> open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open("/lib64/tls/x86_64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open("/lib64/tls/libpthread.so.0", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open("/lib64/x86_64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
> open("/lib64/libpthread.so.0", O_RDONLY|O_CLOEXEC) = 3
> fstat(3, {st_mode=S_IFREG|0555, st_size=101312, ...}) = 0
> open("/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
> fstat(3, {st_mode=S_IFREG|0555, st_size=1479016, ...}) = 0
> uname({sys="Linux", node="vm", ...})    = 0
> fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(4, 64), ...}) = 0
> ioctl(1, SNDCTL_TMR_TIMEBASE or TCGETS, {B9600 opost isig icanon echo ...}) = 0
>
> Network:
>
> admin@vm:~$ strace -e trace=socket,setsockopt,connect /tmp/nc 198.18.3 22
> socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
> connect(3, {...}, 110)                  = -1 ENOENT (No such file or directory)
> socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
> connect(3, {...}, 110)                  = -1 ENOENT (No such file or directory)
> socket(PF_INET, SOCK_STREAM, IPPROTO_TCP) = 3
> setsockopt(3, SOL_SOCKET, SO_REUSEADDR, 0x7ffd2c26bdbc, 4) = 0
> connect(3, {...}, 16)                   = 0
>
> admin@vm:~$ sudo strace -e trace=socket,setsockopt,connect /tmp/nc 198.18.3 22
> socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
> connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
> socket(PF_FILE, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
> connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
> socket(PF_INET, SOCK_STREAM, IPPROTO_TCP) = 3
> setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
> connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("198.18.0.3")}, 16) = 0
>
> So in short now we can at least see what syscall fails eventhough we can't
> know why. I think it can be an acceptable trade-off.

Thanks for testing, and thanks for you acceptance even if I didn't make
it easy for you.


Eric