[RFC,ghak32,V2,01/13] audit: add container id

Submitted by Richard Guy Briggs on March 16, 2018, 9 a.m.

Details

Message ID e284617ad667ad8f17958dd8babb87fe1b4d7205.1521179281.git.rgb@redhat.com
State New
Series "audit: implement container id"
Headers show

Commit Message

Richard Guy Briggs March 16, 2018, 9 a.m.
Implement the proc fs write to set the audit container ID of a process,
emitting an AUDIT_CONTAINER record to document the event.

This is a write from the container orchestrator task to a proc entry of
the form /proc/PID/containerid where PID is the process ID of the newly
created task that is to become the first task in a container, or an
additional task added to a container.

The write expects up to a u64 value (unset: 18446744073709551615).

This will produce a record such as this:
type=CONTAINER msg=audit(1519903238.968:261): op=set pid=596 uid=0 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 auid=0 tty=pts0 ses=1 opid=596 old-contid=18446744073709551615 contid=123455 res=0

The "op" field indicates an initial set.  The "pid" to "ses" fields are
the orchestrator while the "opid" field is the object's PID, the process
being "contained".  Old and new container ID values are given in the
"contid" fields, while res indicates its success.

It is not permitted to self-set, unset or re-set the container ID.  A
child inherits its parent's container ID, but then can be set only once
after.

See: https://github.com/linux-audit/audit-kernel/issues/32

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
---
 fs/proc/base.c             | 37 ++++++++++++++++++++
 include/linux/audit.h      | 16 +++++++++
 include/linux/init_task.h  |  4 ++-
 include/linux/sched.h      |  1 +
 include/uapi/linux/audit.h |  2 ++
 kernel/auditsc.c           | 84 ++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 143 insertions(+), 1 deletion(-)

Patch hide | download patch | download mbox

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 60316b5..6ce4fbe 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1299,6 +1299,41 @@  static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
 	.read		= proc_sessionid_read,
 	.llseek		= generic_file_llseek,
 };
+
+static ssize_t proc_containerid_write(struct file *file, const char __user *buf,
+				   size_t count, loff_t *ppos)
+{
+	struct inode *inode = file_inode(file);
+	u64 containerid;
+	int rv;
+	struct task_struct *task = get_proc_task(inode);
+
+	if (!task)
+		return -ESRCH;
+	if (*ppos != 0) {
+		/* No partial writes. */
+		put_task_struct(task);
+		return -EINVAL;
+	}
+
+	rv = kstrtou64_from_user(buf, count, 10, &containerid);
+	if (rv < 0) {
+		put_task_struct(task);
+		return rv;
+	}
+
+	rv = audit_set_containerid(task, containerid);
+	put_task_struct(task);
+	if (rv < 0)
+		return rv;
+	return count;
+}
+
+static const struct file_operations proc_containerid_operations = {
+	.write		= proc_containerid_write,
+	.llseek		= generic_file_llseek,
+};
+
 #endif
 
 #ifdef CONFIG_FAULT_INJECTION
@@ -2961,6 +2996,7 @@  static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
+	REG("containerid", S_IWUSR, proc_containerid_operations),
 #endif
 #ifdef CONFIG_FAULT_INJECTION
 	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
@@ -3355,6 +3391,7 @@  static int proc_tid_comm_permission(struct inode *inode, int mask)
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
+	REG("containerid", S_IWUSR, proc_containerid_operations),
 #endif
 #ifdef CONFIG_FAULT_INJECTION
 	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
diff --git a/include/linux/audit.h b/include/linux/audit.h
index af410d9..fe4ba3f 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -29,6 +29,7 @@ 
 
 #define AUDIT_INO_UNSET ((unsigned long)-1)
 #define AUDIT_DEV_UNSET ((dev_t)-1)
+#define INVALID_CID AUDIT_CID_UNSET
 
 struct audit_sig_info {
 	uid_t		uid;
@@ -321,6 +322,7 @@  static inline void audit_ptrace(struct task_struct *t)
 extern int auditsc_get_stamp(struct audit_context *ctx,
 			      struct timespec64 *t, unsigned int *serial);
 extern int audit_set_loginuid(kuid_t loginuid);
+extern int audit_set_containerid(struct task_struct *tsk, u64 containerid);
 
 static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
 {
@@ -332,6 +334,11 @@  static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
 	return tsk->sessionid;
 }
 
+static inline u64 audit_get_containerid(struct task_struct *tsk)
+{
+	return tsk->containerid;
+}
+
 extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
 extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode);
 extern void __audit_bprm(struct linux_binprm *bprm);
@@ -517,6 +524,10 @@  static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
 {
 	return -1;
 }
+static inline kuid_t audit_get_containerid(struct task_struct *tsk)
+{
+	return INVALID_CID;
+}
 static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
 { }
 static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid,
@@ -581,6 +592,11 @@  static inline bool audit_loginuid_set(struct task_struct *tsk)
 	return uid_valid(audit_get_loginuid(tsk));
 }
 
+static inline bool audit_containerid_set(struct task_struct *tsk)
+{
+	return audit_get_containerid(tsk) != INVALID_CID;
+}
+
 static inline void audit_log_string(struct audit_buffer *ab, const char *buf)
 {
 	audit_log_n_string(ab, buf, strlen(buf));
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 6a53262..046bd0a 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -18,6 +18,7 @@ 
 #include <linux/sched/rt.h>
 #include <linux/livepatch.h>
 #include <linux/mm_types.h>
+#include <linux/audit.h>
 
 #include <asm/thread_info.h>
 
@@ -120,7 +121,8 @@ 
 #ifdef CONFIG_AUDITSYSCALL
 #define INIT_IDS \
 	.loginuid = INVALID_UID, \
-	.sessionid = (unsigned int)-1,
+	.sessionid = (unsigned int)-1, \
+	.containerid = INVALID_CID,
 #else
 #define INIT_IDS
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d258826..1b82191 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -796,6 +796,7 @@  struct task_struct {
 #ifdef CONFIG_AUDITSYSCALL
 	kuid_t				loginuid;
 	unsigned int			sessionid;
+	u64				containerid;
 #endif
 	struct seccomp			seccomp;
 
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 4e61a9e..921a71f 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -71,6 +71,7 @@ 
 #define AUDIT_TTY_SET		1017	/* Set TTY auditing status */
 #define AUDIT_SET_FEATURE	1018	/* Turn an audit feature on or off */
 #define AUDIT_GET_FEATURE	1019	/* Get which features are enabled */
+#define AUDIT_CONTAINER		1020	/* Define the container id and information */
 
 #define AUDIT_FIRST_USER_MSG	1100	/* Userspace messages mostly uninteresting to kernel */
 #define AUDIT_USER_AVC		1107	/* We filter this differently */
@@ -465,6 +466,7 @@  struct audit_tty_status {
 };
 
 #define AUDIT_UID_UNSET (unsigned int)-1
+#define AUDIT_CID_UNSET ((u64)-1)
 
 /* audit_rule_data supports filter rules with both integer and string
  * fields.  It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4e0a4ac..29c8482 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2073,6 +2073,90 @@  int audit_set_loginuid(kuid_t loginuid)
 	return rc;
 }
 
+static int audit_set_containerid_perm(struct task_struct *task, u64 containerid)
+{
+	struct task_struct *parent;
+	u64 pcontainerid, ccontainerid;
+
+	/* Don't allow to set our own containerid */
+	if (current == task)
+		return -EPERM;
+	/* Don't allow the containerid to be unset */
+	if (!cid_valid(containerid))
+		return -EINVAL;
+	/* if we don't have caps, reject */
+	if (!capable(CAP_AUDIT_CONTROL))
+		return -EPERM;
+	/* if containerid is unset, allow */
+	if (!audit_containerid_set(task))
+		return 0;
+	/* it is already set, and not inherited from the parent, reject */
+	ccontainerid = audit_get_containerid(task);
+	rcu_read_lock();
+	parent = rcu_dereference(task->real_parent);
+	rcu_read_unlock();
+	task_lock(parent);
+	pcontainerid = audit_get_containerid(parent);
+	task_unlock(parent);
+	if (ccontainerid != pcontainerid)
+		return -EPERM;
+	return 0;
+}
+
+static void audit_log_set_containerid(struct task_struct *task, u64 oldcontainerid,
+				      u64 containerid, int rc)
+{
+	struct audit_buffer *ab;
+	uid_t uid;
+	struct tty_struct *tty;
+
+	if (!audit_enabled)
+		return;
+
+	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONTAINER);
+	if (!ab)
+		return;
+
+	uid = from_kuid(&init_user_ns, task_uid(current));
+	tty = audit_get_tty(current);
+
+	audit_log_format(ab, "op=set pid=%d uid=%u", task_tgid_nr(current), uid);
+	audit_log_task_context(ab);
+	audit_log_format(ab, " auid=%u tty=%s ses=%u opid=%d old-contid=%llu contid=%llu res=%d",
+			 from_kuid(&init_user_ns, audit_get_loginuid(current)),
+			 tty ? tty_name(tty) : "(none)", audit_get_sessionid(current),
+			 task_tgid_nr(task), oldcontainerid, containerid, !rc);
+
+	audit_put_tty(tty);
+	audit_log_end(ab);
+}
+
+/**
+ * audit_set_containerid - set current task's audit_context containerid
+ * @containerid: containerid value
+ *
+ * Returns 0 on success, -EPERM on permission failure.
+ *
+ * Called (set) from fs/proc/base.c::proc_containerid_write().
+ */
+int audit_set_containerid(struct task_struct *task, u64 containerid)
+{
+	u64 oldcontainerid;
+	int rc;
+
+	oldcontainerid = audit_get_containerid(task);
+
+	rc = audit_set_containerid_perm(task, containerid);
+	if (!rc) {
+		task_lock(task);
+		task->containerid = containerid;
+		task_unlock(task);
+	}
+
+	audit_log_set_containerid(task, oldcontainerid, containerid, rc);
+	return rc;
+}
+
 /**
  * __audit_mq_open - record audit data for a POSIX MQ open
  * @oflag: open flag

Comments

Jonathan Corbet March 28, 2018, 6:39 p.m.
On Fri, 16 Mar 2018 05:00:28 -0400
Richard Guy Briggs <rgb@redhat.com> wrote:

> Implement the proc fs write to set the audit container ID of a process,
> emitting an AUDIT_CONTAINER record to document the event.

A little detail, but still...

> +static int audit_set_containerid_perm(struct task_struct *task, u64 containerid)
> +{
> +	struct task_struct *parent;
> +	u64 pcontainerid, ccontainerid;
> +
> +	/* Don't allow to set our own containerid */
> +	if (current == task)
> +		return -EPERM;
> +	/* Don't allow the containerid to be unset */
> +	if (!cid_valid(containerid))
> +		return -EINVAL;

I went looking for cid_valid(), but it turns out you don't add it until
patch 5.  That, I expect, will not be good for bisectability (or patch
review).

Thanks,

jon
Richard Guy Briggs March 29, 2018, 9:01 a.m.
On 2018-03-28 12:39, Jonathan Corbet wrote:
> On Fri, 16 Mar 2018 05:00:28 -0400
> Richard Guy Briggs <rgb@redhat.com> wrote:
> > Implement the proc fs write to set the audit container ID of a process,
> > emitting an AUDIT_CONTAINER record to document the event.
> 
> A little detail, but still...

I am understanding that you would prefer more context (as opposed to
operational detail) in the description, laying out the use case for this
patch(set)?

> > +static int audit_set_containerid_perm(struct task_struct *task, u64 containerid)
> > +{
> > +	struct task_struct *parent;
> > +	u64 pcontainerid, ccontainerid;
> > +
> > +	/* Don't allow to set our own containerid */
> > +	if (current == task)
> > +		return -EPERM;
> > +	/* Don't allow the containerid to be unset */
> > +	if (!cid_valid(containerid))
> > +		return -EINVAL;
> 
> I went looking for cid_valid(), but it turns out you don't add it until
> patch 5.  That, I expect, will not be good for bisectability (or patch
> review).

Nice catch, thanks Jon.  That is very likely another victim of a git
rebase to re-order afterthoughts in the right place.  I'll need to be
more careful of that class of bug, rethink my workflow, or script builds
to verify each commit is compilable.

> Thanks,
> 
> jon

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Jonathan Corbet March 29, 2018, 1:03 p.m.
On Thu, 29 Mar 2018 05:01:32 -0400
Richard Guy Briggs <rgb@redhat.com> wrote:

> > A little detail, but still...  
> 
> I am understanding that you would prefer more context (as opposed to
> operational detail) in the description, laying out the use case for this
> patch(set)?

No, sorry, "a little detail" was referring to my comment.  The use case,
I believe, has been well described.

Thanks,

jon
Richard Guy Briggs March 30, 2018, 5:06 a.m.
On 2018-03-29 07:03, Jonathan Corbet wrote:
> On Thu, 29 Mar 2018 05:01:32 -0400
> Richard Guy Briggs <rgb@redhat.com> wrote:
> 
> > > A little detail, but still...  
> > 
> > I am understanding that you would prefer more context (as opposed to
> > operational detail) in the description, laying out the use case for this
> > patch(set)?
> 
> No, sorry, "a little detail" was referring to my comment.  The use case,
> I believe, has been well described.

Ah!  "A minor nit".  :-)

> jon

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Paul Moore April 18, 2018, 11:47 p.m.
On Fri, Mar 16, 2018 at 5:00 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
> Implement the proc fs write to set the audit container ID of a process,
> emitting an AUDIT_CONTAINER record to document the event.
>
> This is a write from the container orchestrator task to a proc entry of
> the form /proc/PID/containerid where PID is the process ID of the newly
> created task that is to become the first task in a container, or an
> additional task added to a container.
>
> The write expects up to a u64 value (unset: 18446744073709551615).
>
> This will produce a record such as this:
> type=CONTAINER msg=audit(1519903238.968:261): op=set pid=596 uid=0 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 auid=0 tty=pts0 ses=1 opid=596 old-contid=18446744073709551615 contid=123455 res=0
>
> The "op" field indicates an initial set.  The "pid" to "ses" fields are
> the orchestrator while the "opid" field is the object's PID, the process
> being "contained".  Old and new container ID values are given in the
> "contid" fields, while res indicates its success.
>
> It is not permitted to self-set, unset or re-set the container ID.  A
> child inherits its parent's container ID, but then can be set only once
> after.
>
> See: https://github.com/linux-audit/audit-kernel/issues/32
>
> Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> ---
>  fs/proc/base.c             | 37 ++++++++++++++++++++
>  include/linux/audit.h      | 16 +++++++++
>  include/linux/init_task.h  |  4 ++-
>  include/linux/sched.h      |  1 +
>  include/uapi/linux/audit.h |  2 ++
>  kernel/auditsc.c           | 84 ++++++++++++++++++++++++++++++++++++++++++++++
>  6 files changed, 143 insertions(+), 1 deletion(-)
>
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 60316b5..6ce4fbe 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -1299,6 +1299,41 @@ static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
>         .read           = proc_sessionid_read,
>         .llseek         = generic_file_llseek,
>  };
> +
> +static ssize_t proc_containerid_write(struct file *file, const char __user *buf,
> +                                  size_t count, loff_t *ppos)
> +{
> +       struct inode *inode = file_inode(file);
> +       u64 containerid;
> +       int rv;
> +       struct task_struct *task = get_proc_task(inode);
> +
> +       if (!task)
> +               return -ESRCH;
> +       if (*ppos != 0) {
> +               /* No partial writes. */
> +               put_task_struct(task);
> +               return -EINVAL;
> +       }
> +
> +       rv = kstrtou64_from_user(buf, count, 10, &containerid);
> +       if (rv < 0) {
> +               put_task_struct(task);
> +               return rv;
> +       }
> +
> +       rv = audit_set_containerid(task, containerid);
> +       put_task_struct(task);
> +       if (rv < 0)
> +               return rv;
> +       return count;
> +}
> +
> +static const struct file_operations proc_containerid_operations = {
> +       .write          = proc_containerid_write,
> +       .llseek         = generic_file_llseek,
> +};
> +
>  #endif
>
>  #ifdef CONFIG_FAULT_INJECTION
> @@ -2961,6 +2996,7 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
>  #ifdef CONFIG_AUDITSYSCALL
>         REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
>         REG("sessionid",  S_IRUGO, proc_sessionid_operations),
> +       REG("containerid", S_IWUSR, proc_containerid_operations),
>  #endif
>  #ifdef CONFIG_FAULT_INJECTION
>         REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
> @@ -3355,6 +3391,7 @@ static int proc_tid_comm_permission(struct inode *inode, int mask)
>  #ifdef CONFIG_AUDITSYSCALL
>         REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
>         REG("sessionid",  S_IRUGO, proc_sessionid_operations),
> +       REG("containerid", S_IWUSR, proc_containerid_operations),
>  #endif
>  #ifdef CONFIG_FAULT_INJECTION
>         REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
> diff --git a/include/linux/audit.h b/include/linux/audit.h
> index af410d9..fe4ba3f 100644
> --- a/include/linux/audit.h
> +++ b/include/linux/audit.h
> @@ -29,6 +29,7 @@
>
>  #define AUDIT_INO_UNSET ((unsigned long)-1)
>  #define AUDIT_DEV_UNSET ((dev_t)-1)
> +#define INVALID_CID AUDIT_CID_UNSET

Why can't we just use AUDIT_CID_UNSET?  Is there an important
distinction?  If so, they shouldn't they have different values?

If we do need to keep INVALID_CID, let's rename it to
AUDIT_CID_INVALID so we have some consistency to the naming patterns
and we stress that it is an *audit* container ID.

> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index d258826..1b82191 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -796,6 +796,7 @@ struct task_struct {
>  #ifdef CONFIG_AUDITSYSCALL
>         kuid_t                          loginuid;
>         unsigned int                    sessionid;
> +       u64                             containerid;

This one line addition to the task_struct scares me the most of
anything in this patchset.  Why?  It's a field named "containerid" in
a perhaps one of the most widely used core kernel structures; the
possibilities for abuse are endless, and it's foolish to think we
would ever be able to adequately police this.

Unfortunately, we can't add the field to audit_context as things
currently stand because we don't always allocate an audit_context,
it's dependent on the system's configuration, and we need to track the
audit container ID for a given process, regardless of the audit
configuration.  Pretty much the same reason why loginuid and sessionid
are located directly in task_struct now.  As I stressed during the
design phase, I really want to keep this as an *audit* container ID
and not a general purpose kernel wide container ID.  If the kernel
ever grows a general purpose container ID token, I'll be the first in
line to convert the audit code, but I don't want audit to be that
general purpose mechanism ... audit is hated enough as-is ;)

I think the right solution to this is to create another new struct,
audit_task_info (or similar, the name really isn't that important),
which would be stored as a pointer in task_struct and would replace
the audit_context pointer, loginuid, sessionid, and the newly proposed
containerid.  The new audit_task_info would always be allocated in the
audit_alloc() function (please use kmem_cache), and the audit_context
pointer included inside would continue to be allocated based on the
existing conditions.  By keeping audit_task_info as a pointer inside
task_struct we could hide the structure definition inside
kernel/audit*.c and make it much more difficult for other subsystems
to abuse it.[1]

  struct audit_task_info {
    kuid_t loginuid;
    unsigned int sessionid;
    u64 containerid;
    struct audit_context *ctx;
  }

Actually, we might even want to consider storing audit_context in
audit_task_info (no pointer), or making it a zero length array
(ctx[0]) and going with a variable sized allocation of audit_task_info
... but all that could be done as a follow up optimization once we get
the basic idea sorted.

[1] If for some reason allocating audit_task_info becomes too much
overhead to bear (somewhat doubtful since we would only do it at task
creation), we could do some ugly tricks to directly include an
audit_task_struct chunk in task_struct but I'd like to avoid that if
possible (and I think we can).

>  #endif
>         struct seccomp                  seccomp;

...

> diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
> index 4e61a9e..921a71f 100644
> --- a/include/uapi/linux/audit.h
> +++ b/include/uapi/linux/audit.h
> @@ -71,6 +71,7 @@
>  #define AUDIT_TTY_SET          1017    /* Set TTY auditing status */
>  #define AUDIT_SET_FEATURE      1018    /* Turn an audit feature on or off */
>  #define AUDIT_GET_FEATURE      1019    /* Get which features are enabled */
> +#define AUDIT_CONTAINER                1020    /* Define the container id and information */
>
>  #define AUDIT_FIRST_USER_MSG   1100    /* Userspace messages mostly uninteresting to kernel */
>  #define AUDIT_USER_AVC         1107    /* We filter this differently */
> @@ -465,6 +466,7 @@ struct audit_tty_status {
>  };
>
>  #define AUDIT_UID_UNSET (unsigned int)-1
> +#define AUDIT_CID_UNSET ((u64)-1)

I think we need to decide if we want to distinguish between the "host"
(e.g. init ns) and "unset".  Looking at this patch (I've only quickly
skimmed the others so far) it would appear that you don't think we
need to worry about this distinction; that's fine, but let's make it
explicit with a comment in the code that AUDIT_CID_UNSET means "unset"
as well as "host".

If we do need to make a distinction, let's add a constant/macro for "host".

>  /* audit_rule_data supports filter rules with both integer and string
>   * fields.  It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and
> diff --git a/kernel/auditsc.c b/kernel/auditsc.c
> index 4e0a4ac..29c8482 100644
> --- a/kernel/auditsc.c
> +++ b/kernel/auditsc.c
> @@ -2073,6 +2073,90 @@ int audit_set_loginuid(kuid_t loginuid)
>         return rc;
>  }
>
> +static int audit_set_containerid_perm(struct task_struct *task, u64 containerid)
> +{
> +       struct task_struct *parent;
> +       u64 pcontainerid, ccontainerid;
> +
> +       /* Don't allow to set our own containerid */
> +       if (current == task)
> +               return -EPERM;

Why not?  Is there some obvious security concern that I missing?

I ask because I suppose it might be possible for some container
runtime to do a fork, setup some of the environment and them exec the
container (before you answer the obvious "namespaces!" please remember
we're not trying to define containers).

> +       /* Don't allow the containerid to be unset */
> +       if (!cid_valid(containerid))
> +               return -EINVAL;
> +       /* if we don't have caps, reject */
> +       if (!capable(CAP_AUDIT_CONTROL))
> +               return -EPERM;
> +       /* if containerid is unset, allow */
> +       if (!audit_containerid_set(task))
> +               return 0;
> +       /* it is already set, and not inherited from the parent, reject */
> +       ccontainerid = audit_get_containerid(task);
> +       rcu_read_lock();
> +       parent = rcu_dereference(task->real_parent);
> +       rcu_read_unlock();
> +       task_lock(parent);
> +       pcontainerid = audit_get_containerid(parent);
> +       task_unlock(parent);
> +       if (ccontainerid != pcontainerid)
> +               return -EPERM;
> +       return 0;
> +}
> +
> +static void audit_log_set_containerid(struct task_struct *task, u64 oldcontainerid,
> +                                     u64 containerid, int rc)
> +{
> +       struct audit_buffer *ab;
> +       uid_t uid;
> +       struct tty_struct *tty;
> +
> +       if (!audit_enabled)
> +               return;
> +
> +       ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONTAINER);
> +       if (!ab)
> +               return;
> +
> +       uid = from_kuid(&init_user_ns, task_uid(current));
> +       tty = audit_get_tty(current);
> +
> +       audit_log_format(ab, "op=set pid=%d uid=%u", task_tgid_nr(current), uid);
> +       audit_log_task_context(ab);
> +       audit_log_format(ab, " auid=%u tty=%s ses=%u opid=%d old-contid=%llu contid=%llu res=%d",
> +                        from_kuid(&init_user_ns, audit_get_loginuid(current)),
> +                        tty ? tty_name(tty) : "(none)", audit_get_sessionid(current),
> +                        task_tgid_nr(task), oldcontainerid, containerid, !rc);
> +
> +       audit_put_tty(tty);
> +       audit_log_end(ab);
> +}
> +
> +/**
> + * audit_set_containerid - set current task's audit_context containerid
> + * @containerid: containerid value
> + *
> + * Returns 0 on success, -EPERM on permission failure.
> + *
> + * Called (set) from fs/proc/base.c::proc_containerid_write().
> + */
> +int audit_set_containerid(struct task_struct *task, u64 containerid)
> +{
> +       u64 oldcontainerid;
> +       int rc;
> +
> +       oldcontainerid = audit_get_containerid(task);
> +
> +       rc = audit_set_containerid_perm(task, containerid);
> +       if (!rc) {
> +               task_lock(task);
> +               task->containerid = containerid;
> +               task_unlock(task);
> +       }
> +
> +       audit_log_set_containerid(task, oldcontainerid, containerid, rc);
> +       return rc;

Why are audit_set_containerid_perm() and audit_log_containerid()
separate functions?
Casey Schaufler April 19, 2018, 12:41 a.m.
On 4/18/2018 4:47 PM, Paul Moore wrote:
> On Fri, Mar 16, 2018 at 5:00 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
>> Implement the proc fs write to set the audit container ID of a process,
>> emitting an AUDIT_CONTAINER record to document the event.
>> ...
>>
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index d258826..1b82191 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -796,6 +796,7 @@ struct task_struct {
>>  #ifdef CONFIG_AUDITSYSCALL
>>         kuid_t                          loginuid;
>>         unsigned int                    sessionid;
>> +       u64                             containerid;
> This one line addition to the task_struct scares me the most of
> anything in this patchset.  Why?  It's a field named "containerid" in
> a perhaps one of the most widely used core kernel structures; the
> possibilities for abuse are endless, and it's foolish to think we
> would ever be able to adequately police this.

If we can get the LSM infrastructure managed task blobs from 
module stacking in ahead of this we could create a trivial security
module to manage this. It's not as if there aren't all sorts of
interactions between security modules and the audit system already.
Paul Moore April 19, 2018, 12:46 a.m.
On Wed, Apr 18, 2018 at 8:41 PM, Casey Schaufler <casey@schaufler-ca.com> wrote:
> On 4/18/2018 4:47 PM, Paul Moore wrote:
>> On Fri, Mar 16, 2018 at 5:00 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
>>> Implement the proc fs write to set the audit container ID of a process,
>>> emitting an AUDIT_CONTAINER record to document the event.
>>> ...
>>>
>>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>>> index d258826..1b82191 100644
>>> --- a/include/linux/sched.h
>>> +++ b/include/linux/sched.h
>>> @@ -796,6 +796,7 @@ struct task_struct {
>>>  #ifdef CONFIG_AUDITSYSCALL
>>>         kuid_t                          loginuid;
>>>         unsigned int                    sessionid;
>>> +       u64                             containerid;
>> This one line addition to the task_struct scares me the most of
>> anything in this patchset.  Why?  It's a field named "containerid" in
>> a perhaps one of the most widely used core kernel structures; the
>> possibilities for abuse are endless, and it's foolish to think we
>> would ever be able to adequately police this.
>
> If we can get the LSM infrastructure managed task blobs from
> module stacking in ahead of this we could create a trivial security
> module to manage this. It's not as if there aren't all sorts of
> interactions between security modules and the audit system already.

While yes, there are plenty of interactions between the two, it is
possible to use audit without the LSMs and I would like to preserve
that.  Further, I don't want to entangle two very complicated code
changes or make the audit container ID effort dependent on LSM
stacking.

You're a good salesman Casey, but you're not that good ;)
Casey Schaufler April 19, 2018, 1:15 a.m.
On 4/18/2018 5:46 PM, Paul Moore wrote:
> On Wed, Apr 18, 2018 at 8:41 PM, Casey Schaufler <casey@schaufler-ca.com> wrote:
>> On 4/18/2018 4:47 PM, Paul Moore wrote:
>>> On Fri, Mar 16, 2018 at 5:00 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
>>>> Implement the proc fs write to set the audit container ID of a process,
>>>> emitting an AUDIT_CONTAINER record to document the event.
>>>> ...
>>>>
>>>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>>>> index d258826..1b82191 100644
>>>> --- a/include/linux/sched.h
>>>> +++ b/include/linux/sched.h
>>>> @@ -796,6 +796,7 @@ struct task_struct {
>>>>  #ifdef CONFIG_AUDITSYSCALL
>>>>         kuid_t                          loginuid;
>>>>         unsigned int                    sessionid;
>>>> +       u64                             containerid;
>>> This one line addition to the task_struct scares me the most of
>>> anything in this patchset.  Why?  It's a field named "containerid" in
>>> a perhaps one of the most widely used core kernel structures; the
>>> possibilities for abuse are endless, and it's foolish to think we
>>> would ever be able to adequately police this.
>> If we can get the LSM infrastructure managed task blobs from
>> module stacking in ahead of this we could create a trivial security
>> module to manage this. It's not as if there aren't all sorts of
>> interactions between security modules and the audit system already.
> While yes, there are plenty of interactions between the two, it is
> possible to use audit without the LSMs and I would like to preserve
> that.  

Fair enough.

> Further, I don't want to entangle two very complicated code
> changes or make the audit container ID effort dependent on LSM
> stacking.

Also fair, although the use case for container audit IDs is
already pulling in audit, namespaces (yeah, I know it's not
necessary for a container to use namespaces) security modules
(stacked and/or namespaced), cgroups and who knows what else.

> You're a good salesman Casey, but you're not that good ;)

I have to keep the skills sharpened somehow!

OK, I'll grant that this isn't a great fit.
Richard Guy Briggs April 21, 2018, 2:34 p.m.
On 2018-04-18 19:47, Paul Moore wrote:
> On Fri, Mar 16, 2018 at 5:00 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
> > Implement the proc fs write to set the audit container ID of a process,
> > emitting an AUDIT_CONTAINER record to document the event.
> >
> > This is a write from the container orchestrator task to a proc entry of
> > the form /proc/PID/containerid where PID is the process ID of the newly
> > created task that is to become the first task in a container, or an
> > additional task added to a container.
> >
> > The write expects up to a u64 value (unset: 18446744073709551615).
> >
> > This will produce a record such as this:
> > type=CONTAINER msg=audit(1519903238.968:261): op=set pid=596 uid=0 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 auid=0 tty=pts0 ses=1 opid=596 old-contid=18446744073709551615 contid=123455 res=0
> >
> > The "op" field indicates an initial set.  The "pid" to "ses" fields are
> > the orchestrator while the "opid" field is the object's PID, the process
> > being "contained".  Old and new container ID values are given in the
> > "contid" fields, while res indicates its success.
> >
> > It is not permitted to self-set, unset or re-set the container ID.  A
> > child inherits its parent's container ID, but then can be set only once
> > after.
> >
> > See: https://github.com/linux-audit/audit-kernel/issues/32
> >
> > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > ---
> >  fs/proc/base.c             | 37 ++++++++++++++++++++
> >  include/linux/audit.h      | 16 +++++++++
> >  include/linux/init_task.h  |  4 ++-
> >  include/linux/sched.h      |  1 +
> >  include/uapi/linux/audit.h |  2 ++
> >  kernel/auditsc.c           | 84 ++++++++++++++++++++++++++++++++++++++++++++++
> >  6 files changed, 143 insertions(+), 1 deletion(-)
> >
> > diff --git a/fs/proc/base.c b/fs/proc/base.c
> > index 60316b5..6ce4fbe 100644
> > --- a/fs/proc/base.c
> > +++ b/fs/proc/base.c
> > @@ -1299,6 +1299,41 @@ static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
> >         .read           = proc_sessionid_read,
> >         .llseek         = generic_file_llseek,
> >  };
> > +
> > +static ssize_t proc_containerid_write(struct file *file, const char __user *buf,
> > +                                  size_t count, loff_t *ppos)
> > +{
> > +       struct inode *inode = file_inode(file);
> > +       u64 containerid;
> > +       int rv;
> > +       struct task_struct *task = get_proc_task(inode);
> > +
> > +       if (!task)
> > +               return -ESRCH;
> > +       if (*ppos != 0) {
> > +               /* No partial writes. */
> > +               put_task_struct(task);
> > +               return -EINVAL;
> > +       }
> > +
> > +       rv = kstrtou64_from_user(buf, count, 10, &containerid);
> > +       if (rv < 0) {
> > +               put_task_struct(task);
> > +               return rv;
> > +       }
> > +
> > +       rv = audit_set_containerid(task, containerid);
> > +       put_task_struct(task);
> > +       if (rv < 0)
> > +               return rv;
> > +       return count;
> > +}
> > +
> > +static const struct file_operations proc_containerid_operations = {
> > +       .write          = proc_containerid_write,
> > +       .llseek         = generic_file_llseek,
> > +};
> > +
> >  #endif
> >
> >  #ifdef CONFIG_FAULT_INJECTION
> > @@ -2961,6 +2996,7 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
> >  #ifdef CONFIG_AUDITSYSCALL
> >         REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
> >         REG("sessionid",  S_IRUGO, proc_sessionid_operations),
> > +       REG("containerid", S_IWUSR, proc_containerid_operations),
> >  #endif
> >  #ifdef CONFIG_FAULT_INJECTION
> >         REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
> > @@ -3355,6 +3391,7 @@ static int proc_tid_comm_permission(struct inode *inode, int mask)
> >  #ifdef CONFIG_AUDITSYSCALL
> >         REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
> >         REG("sessionid",  S_IRUGO, proc_sessionid_operations),
> > +       REG("containerid", S_IWUSR, proc_containerid_operations),
> >  #endif
> >  #ifdef CONFIG_FAULT_INJECTION
> >         REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
> > diff --git a/include/linux/audit.h b/include/linux/audit.h
> > index af410d9..fe4ba3f 100644
> > --- a/include/linux/audit.h
> > +++ b/include/linux/audit.h
> > @@ -29,6 +29,7 @@
> >
> >  #define AUDIT_INO_UNSET ((unsigned long)-1)
> >  #define AUDIT_DEV_UNSET ((dev_t)-1)
> > +#define INVALID_CID AUDIT_CID_UNSET
> 
> Why can't we just use AUDIT_CID_UNSET?  Is there an important
> distinction?  If so, they shouldn't they have different values?

One was intended as user-facing and the other was intended for kernel
internal.  As you point out, this does not appear to be necessary since
they are both the same type.  This was to mirror loginuid due to UID
namespace practice to seperate the two to make things very clear that a
userspace view of a UID needed to be translated from the user's user
namespace to the kernel's absolute view of UIDs from the init user
namespace.  Since container ID meanings do not depend on any namespace
context, I agree we can use just one and I'd go with AUDIT_CID_UNSET.

> If we do need to keep INVALID_CID, let's rename it to
> AUDIT_CID_INVALID so we have some consistency to the naming patterns
> and we stress that it is an *audit* container ID.
> 
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index d258826..1b82191 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -796,6 +796,7 @@ struct task_struct {
> >  #ifdef CONFIG_AUDITSYSCALL
> >         kuid_t                          loginuid;
> >         unsigned int                    sessionid;
> > +       u64                             containerid;
> 
> This one line addition to the task_struct scares me the most of
> anything in this patchset.  Why?  It's a field named "containerid" in
> a perhaps one of the most widely used core kernel structures; the
> possibilities for abuse are endless, and it's foolish to think we
> would ever be able to adequately police this.

Fair enough.

> Unfortunately, we can't add the field to audit_context as things
> currently stand because we don't always allocate an audit_context,
> it's dependent on the system's configuration, and we need to track the
> audit container ID for a given process, regardless of the audit
> configuration.  Pretty much the same reason why loginuid and sessionid
> are located directly in task_struct now.  As I stressed during the
> design phase, I really want to keep this as an *audit* container ID
> and not a general purpose kernel wide container ID.  If the kernel
> ever grows a general purpose container ID token, I'll be the first in
> line to convert the audit code, but I don't want audit to be that
> general purpose mechanism ... audit is hated enough as-is ;)

When would we need an audit container ID when audit is not enabled
enough to have an audit_context?

If it is only used for audit, and audit is the only consumer, and audit
can only use it when it is enabled, then we can just return success to
any write to the proc filehandle, or not even present it.  Nothing will
be able to know that value wasn't used.

When are loginuid and sessionid used now when audit is not enabled (or
should I say, explicitly disabled)?

> I think the right solution to this is to create another new struct,
> audit_task_info (or similar, the name really isn't that important),
> which would be stored as a pointer in task_struct and would replace
> the audit_context pointer, loginuid, sessionid, and the newly proposed
> containerid.  The new audit_task_info would always be allocated in the
> audit_alloc() function (please use kmem_cache), and the audit_context
> pointer included inside would continue to be allocated based on the
> existing conditions.  By keeping audit_task_info as a pointer inside
> task_struct we could hide the structure definition inside
> kernel/audit*.c and make it much more difficult for other subsystems
> to abuse it.[1]
> 
>   struct audit_task_info {
>     kuid_t loginuid;
>     unsigned int sessionid;
>     u64 containerid;
>     struct audit_context *ctx;
>   }

I agree this looks like a good change.

> Actually, we might even want to consider storing audit_context in
> audit_task_info (no pointer), or making it a zero length array
> (ctx[0]) and going with a variable sized allocation of audit_task_info
> ... but all that could be done as a follow up optimization once we get
> the basic idea sorted.
> 
> [1] If for some reason allocating audit_task_info becomes too much
> overhead to bear (somewhat doubtful since we would only do it at task
> creation), we could do some ugly tricks to directly include an
> audit_task_struct chunk in task_struct but I'd like to avoid that if
> possible (and I think we can).
> 
> >  #endif
> >         struct seccomp                  seccomp;
> 
> ...
> 
> > diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
> > index 4e61a9e..921a71f 100644
> > --- a/include/uapi/linux/audit.h
> > +++ b/include/uapi/linux/audit.h
> > @@ -71,6 +71,7 @@
> >  #define AUDIT_TTY_SET          1017    /* Set TTY auditing status */
> >  #define AUDIT_SET_FEATURE      1018    /* Turn an audit feature on or off */
> >  #define AUDIT_GET_FEATURE      1019    /* Get which features are enabled */
> > +#define AUDIT_CONTAINER                1020    /* Define the container id and information */
> >
> >  #define AUDIT_FIRST_USER_MSG   1100    /* Userspace messages mostly uninteresting to kernel */
> >  #define AUDIT_USER_AVC         1107    /* We filter this differently */
> > @@ -465,6 +466,7 @@ struct audit_tty_status {
> >  };
> >
> >  #define AUDIT_UID_UNSET (unsigned int)-1
> > +#define AUDIT_CID_UNSET ((u64)-1)
> 
> I think we need to decide if we want to distinguish between the "host"
> (e.g. init ns) and "unset".  Looking at this patch (I've only quickly
> skimmed the others so far) it would appear that you don't think we
> need to worry about this distinction; that's fine, but let's make it
> explicit with a comment in the code that AUDIT_CID_UNSET means "unset"
> as well as "host".

I don't see any reason to distinguish between "host" and "unset".  Since
a container doesn't have a concrete definition based in namespaces, the
initial namespace set is meaningless here.

Is there value in having a container orchestrator process have a
reserved container ID that has a policy distinct from any other
container?  If so, then I could see the value in making the distinction.
For example, I've heard of interest in systemd acting as a container
orchestrator, so if it took on that role as PID 1, then every process in
the system would inherit that ID and none would be unset.

I can't picture how having seperate "host" and "unset" values helps us.

> If we do need to make a distinction, let's add a constant/macro for "host".

Currently "unset" is -1 which fits the convention used for sessionid and
loginuid and a number of others, so I think it makes sense to stick with
that.  If we decide we need a "host" flag, would it make sense to use 0
or (u64)-2?

> >  /* audit_rule_data supports filter rules with both integer and string
> >   * fields.  It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and
> > diff --git a/kernel/auditsc.c b/kernel/auditsc.c
> > index 4e0a4ac..29c8482 100644
> > --- a/kernel/auditsc.c
> > +++ b/kernel/auditsc.c
> > @@ -2073,6 +2073,90 @@ int audit_set_loginuid(kuid_t loginuid)
> >         return rc;
> >  }
> >
> > +static int audit_set_containerid_perm(struct task_struct *task, u64 containerid)
> > +{
> > +       struct task_struct *parent;
> > +       u64 pcontainerid, ccontainerid;
> > +
> > +       /* Don't allow to set our own containerid */
> > +       if (current == task)
> > +               return -EPERM;
> 
> Why not?  Is there some obvious security concern that I missing?

We then lose the distinction in the AUDIT_CONTAINER record between the
initiating PID and the target PID.  This was outlined in the proposal.

Having said that, I'm still not sure we have protected sufficiently from
a child turning around and setting it's parent's as yet unset or
inherited audit container ID.

> I ask because I suppose it might be possible for some container
> runtime to do a fork, setup some of the environment and them exec the
> container (before you answer the obvious "namespaces!" please remember
> we're not trying to define containers).

I don't think namespaces have any bearing on this concern since none are
required.

> > +       /* Don't allow the containerid to be unset */
> > +       if (!cid_valid(containerid))
> > +               return -EINVAL;
> > +       /* if we don't have caps, reject */
> > +       if (!capable(CAP_AUDIT_CONTROL))
> > +               return -EPERM;
> > +       /* if containerid is unset, allow */
> > +       if (!audit_containerid_set(task))
> > +               return 0;
> > +       /* it is already set, and not inherited from the parent, reject */
> > +       ccontainerid = audit_get_containerid(task);
> > +       rcu_read_lock();
> > +       parent = rcu_dereference(task->real_parent);
> > +       rcu_read_unlock();
> > +       task_lock(parent);
> > +       pcontainerid = audit_get_containerid(parent);
> > +       task_unlock(parent);
> > +       if (ccontainerid != pcontainerid)
> > +               return -EPERM;
> > +       return 0;
> > +}
> > +
> > +static void audit_log_set_containerid(struct task_struct *task, u64 oldcontainerid,
> > +                                     u64 containerid, int rc)
> > +{
> > +       struct audit_buffer *ab;
> > +       uid_t uid;
> > +       struct tty_struct *tty;
> > +
> > +       if (!audit_enabled)
> > +               return;
> > +
> > +       ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONTAINER);
> > +       if (!ab)
> > +               return;
> > +
> > +       uid = from_kuid(&init_user_ns, task_uid(current));
> > +       tty = audit_get_tty(current);
> > +
> > +       audit_log_format(ab, "op=set pid=%d uid=%u", task_tgid_nr(current), uid);
> > +       audit_log_task_context(ab);
> > +       audit_log_format(ab, " auid=%u tty=%s ses=%u opid=%d old-contid=%llu contid=%llu res=%d",
> > +                        from_kuid(&init_user_ns, audit_get_loginuid(current)),
> > +                        tty ? tty_name(tty) : "(none)", audit_get_sessionid(current),
> > +                        task_tgid_nr(task), oldcontainerid, containerid, !rc);
> > +
> > +       audit_put_tty(tty);
> > +       audit_log_end(ab);
> > +}
> > +
> > +/**
> > + * audit_set_containerid - set current task's audit_context containerid
> > + * @containerid: containerid value
> > + *
> > + * Returns 0 on success, -EPERM on permission failure.
> > + *
> > + * Called (set) from fs/proc/base.c::proc_containerid_write().
> > + */
> > +int audit_set_containerid(struct task_struct *task, u64 containerid)
> > +{
> > +       u64 oldcontainerid;
> > +       int rc;
> > +
> > +       oldcontainerid = audit_get_containerid(task);
> > +
> > +       rc = audit_set_containerid_perm(task, containerid);
> > +       if (!rc) {
> > +               task_lock(task);
> > +               task->containerid = containerid;
> > +               task_unlock(task);
> > +       }
> > +
> > +       audit_log_set_containerid(task, oldcontainerid, containerid, rc);
> > +       return rc;
> 
> Why are audit_set_containerid_perm() and audit_log_containerid()
> separate functions?

(I assume you mean audit_log_set_containerid()?)
It seemed clearer that all the permission checking was in one function
and its return code could be used to report the outcome when logging the
(attempted) action.  This is the same structure as audit_set_loginuid()
and it made sense.

This would be the time to connect it to a syscall if that seems like a
good idea and remove pid, uid, auid, tty, ses fields.

> paul moore

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Paul Moore April 23, 2018, 11:15 p.m.
On Sat, Apr 21, 2018 at 10:34 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
> On 2018-04-18 19:47, Paul Moore wrote:
>> On Fri, Mar 16, 2018 at 5:00 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
>> > Implement the proc fs write to set the audit container ID of a process,
>> > emitting an AUDIT_CONTAINER record to document the event.
>> >
>> > This is a write from the container orchestrator task to a proc entry of
>> > the form /proc/PID/containerid where PID is the process ID of the newly
>> > created task that is to become the first task in a container, or an
>> > additional task added to a container.
>> >
>> > The write expects up to a u64 value (unset: 18446744073709551615).
>> >
>> > This will produce a record such as this:
>> > type=CONTAINER msg=audit(1519903238.968:261): op=set pid=596 uid=0 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 auid=0 tty=pts0 ses=1 opid=596 old-contid=18446744073709551615 contid=123455 res=0
>> >
>> > The "op" field indicates an initial set.  The "pid" to "ses" fields are
>> > the orchestrator while the "opid" field is the object's PID, the process
>> > being "contained".  Old and new container ID values are given in the
>> > "contid" fields, while res indicates its success.
>> >
>> > It is not permitted to self-set, unset or re-set the container ID.  A
>> > child inherits its parent's container ID, but then can be set only once
>> > after.
>> >
>> > See: https://github.com/linux-audit/audit-kernel/issues/32
>> >
>> > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
>> > ---
>> >  fs/proc/base.c             | 37 ++++++++++++++++++++
>> >  include/linux/audit.h      | 16 +++++++++
>> >  include/linux/init_task.h  |  4 ++-
>> >  include/linux/sched.h      |  1 +
>> >  include/uapi/linux/audit.h |  2 ++
>> >  kernel/auditsc.c           | 84 ++++++++++++++++++++++++++++++++++++++++++++++
>> >  6 files changed, 143 insertions(+), 1 deletion(-)

...

>> > diff --git a/include/linux/sched.h b/include/linux/sched.h
>> > index d258826..1b82191 100644
>> > --- a/include/linux/sched.h
>> > +++ b/include/linux/sched.h
>> > @@ -796,6 +796,7 @@ struct task_struct {
>> >  #ifdef CONFIG_AUDITSYSCALL
>> >         kuid_t                          loginuid;
>> >         unsigned int                    sessionid;
>> > +       u64                             containerid;
>>
>> This one line addition to the task_struct scares me the most of
>> anything in this patchset.  Why?  It's a field named "containerid" in
>> a perhaps one of the most widely used core kernel structures; the
>> possibilities for abuse are endless, and it's foolish to think we
>> would ever be able to adequately police this.
>
> Fair enough.
>
>> Unfortunately, we can't add the field to audit_context as things
>> currently stand because we don't always allocate an audit_context,
>> it's dependent on the system's configuration, and we need to track the
>> audit container ID for a given process, regardless of the audit
>> configuration.  Pretty much the same reason why loginuid and sessionid
>> are located directly in task_struct now.  As I stressed during the
>> design phase, I really want to keep this as an *audit* container ID
>> and not a general purpose kernel wide container ID.  If the kernel
>> ever grows a general purpose container ID token, I'll be the first in
>> line to convert the audit code, but I don't want audit to be that
>> general purpose mechanism ... audit is hated enough as-is ;)
>
> When would we need an audit container ID when audit is not enabled
> enough to have an audit_context?

I'm thinking of the audit_alloc() case where audit_filter_task()
returns AUDIT_DISABLED.

I believe this is the same reason why loginuid and sessionid live
directly in the task_struct and not in the audit_context; they need to
persist for the lifetime of the task.

> If it is only used for audit, and audit is the only consumer, and audit
> can only use it when it is enabled, then we can just return success to
> any write to the proc filehandle, or not even present it.  Nothing will
> be able to know that value wasn't used.
>
> When are loginuid and sessionid used now when audit is not enabled (or
> should I say, explicitly disabled)?

See above.  I think that should answer these questions.

>> > diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
>> > index 4e61a9e..921a71f 100644
>> > --- a/include/uapi/linux/audit.h
>> > +++ b/include/uapi/linux/audit.h
>> > @@ -71,6 +71,7 @@
>> >  #define AUDIT_TTY_SET          1017    /* Set TTY auditing status */
>> >  #define AUDIT_SET_FEATURE      1018    /* Turn an audit feature on or off */
>> >  #define AUDIT_GET_FEATURE      1019    /* Get which features are enabled */
>> > +#define AUDIT_CONTAINER                1020    /* Define the container id and information */
>> >
>> >  #define AUDIT_FIRST_USER_MSG   1100    /* Userspace messages mostly uninteresting to kernel */
>> >  #define AUDIT_USER_AVC         1107    /* We filter this differently */
>> > @@ -465,6 +466,7 @@ struct audit_tty_status {
>> >  };
>> >
>> >  #define AUDIT_UID_UNSET (unsigned int)-1
>> > +#define AUDIT_CID_UNSET ((u64)-1)
>>
>> I think we need to decide if we want to distinguish between the "host"
>> (e.g. init ns) and "unset".  Looking at this patch (I've only quickly
>> skimmed the others so far) it would appear that you don't think we
>> need to worry about this distinction; that's fine, but let's make it
>> explicit with a comment in the code that AUDIT_CID_UNSET means "unset"
>> as well as "host".
>
> I don't see any reason to distinguish between "host" and "unset".  Since
> a container doesn't have a concrete definition based in namespaces, the
> initial namespace set is meaningless here.

Okay, that sounds reasonable.

> Is there value in having a container orchestrator process have a
> reserved container ID that has a policy distinct from any other
> container?

I'm open to arguments for this idea, but I don't see a point to it right now.

> If so, then I could see the value in making the distinction.
> For example, I've heard of interest in systemd acting as a container
> orchestrator, so if it took on that role as PID 1, then every process in
> the system would inherit that ID and none would be unset.
>
> I can't picture how having seperate "host" and "unset" values helps us.

I don't have a strong feeling either way, I just wanted to ask the question.

>> >  /* audit_rule_data supports filter rules with both integer and string
>> >   * fields.  It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and
>> > diff --git a/kernel/auditsc.c b/kernel/auditsc.c
>> > index 4e0a4ac..29c8482 100644
>> > --- a/kernel/auditsc.c
>> > +++ b/kernel/auditsc.c
>> > @@ -2073,6 +2073,90 @@ int audit_set_loginuid(kuid_t loginuid)
>> >         return rc;
>> >  }
>> >
>> > +static int audit_set_containerid_perm(struct task_struct *task, u64 containerid)
>> > +{
>> > +       struct task_struct *parent;
>> > +       u64 pcontainerid, ccontainerid;
>> > +
>> > +       /* Don't allow to set our own containerid */
>> > +       if (current == task)
>> > +               return -EPERM;
>>
>> Why not?  Is there some obvious security concern that I missing?
>
> We then lose the distinction in the AUDIT_CONTAINER record between the
> initiating PID and the target PID.  This was outlined in the proposal.

I just went back and reread the v3 proposal and I still don't see a
good explanation of this.  Why is this bad?  What's the security
concern?

> Having said that, I'm still not sure we have protected sufficiently from
> a child turning around and setting it's parent's as yet unset or
> inherited audit container ID.

Yes, I believe we only want to let a task set the audit container for
it's children (or itself/threads if we decide to allow that, see
above).  There *has* to be a function to check to see if a task if a
child of a given task ... right? ... although this is likely to be a
pointer traversal and locking nightmare ... hmmm.

>> I ask because I suppose it might be possible for some container
>> runtime to do a fork, setup some of the environment and them exec the
>> container (before you answer the obvious "namespaces!" please remember
>> we're not trying to define containers).
>
> I don't think namespaces have any bearing on this concern since none are
> required.
>
>> > +       /* Don't allow the containerid to be unset */
>> > +       if (!cid_valid(containerid))
>> > +               return -EINVAL;
>> > +       /* if we don't have caps, reject */
>> > +       if (!capable(CAP_AUDIT_CONTROL))
>> > +               return -EPERM;
>> > +       /* if containerid is unset, allow */
>> > +       if (!audit_containerid_set(task))
>> > +               return 0;
>> > +       /* it is already set, and not inherited from the parent, reject */
>> > +       ccontainerid = audit_get_containerid(task);
>> > +       rcu_read_lock();
>> > +       parent = rcu_dereference(task->real_parent);
>> > +       rcu_read_unlock();
>> > +       task_lock(parent);
>> > +       pcontainerid = audit_get_containerid(parent);
>> > +       task_unlock(parent);
>> > +       if (ccontainerid != pcontainerid)
>> > +               return -EPERM;
>> > +       return 0;
>> > +}
>> > +
>> > +static void audit_log_set_containerid(struct task_struct *task, u64 oldcontainerid,
>> > +                                     u64 containerid, int rc)
>> > +{
>> > +       struct audit_buffer *ab;
>> > +       uid_t uid;
>> > +       struct tty_struct *tty;
>> > +
>> > +       if (!audit_enabled)
>> > +               return;
>> > +
>> > +       ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONTAINER);
>> > +       if (!ab)
>> > +               return;
>> > +
>> > +       uid = from_kuid(&init_user_ns, task_uid(current));
>> > +       tty = audit_get_tty(current);
>> > +
>> > +       audit_log_format(ab, "op=set pid=%d uid=%u", task_tgid_nr(current), uid);
>> > +       audit_log_task_context(ab);
>> > +       audit_log_format(ab, " auid=%u tty=%s ses=%u opid=%d old-contid=%llu contid=%llu res=%d",
>> > +                        from_kuid(&init_user_ns, audit_get_loginuid(current)),
>> > +                        tty ? tty_name(tty) : "(none)", audit_get_sessionid(current),
>> > +                        task_tgid_nr(task), oldcontainerid, containerid, !rc);
>> > +
>> > +       audit_put_tty(tty);
>> > +       audit_log_end(ab);
>> > +}
>> > +
>> > +/**
>> > + * audit_set_containerid - set current task's audit_context containerid
>> > + * @containerid: containerid value
>> > + *
>> > + * Returns 0 on success, -EPERM on permission failure.
>> > + *
>> > + * Called (set) from fs/proc/base.c::proc_containerid_write().
>> > + */
>> > +int audit_set_containerid(struct task_struct *task, u64 containerid)
>> > +{
>> > +       u64 oldcontainerid;
>> > +       int rc;
>> > +
>> > +       oldcontainerid = audit_get_containerid(task);
>> > +
>> > +       rc = audit_set_containerid_perm(task, containerid);
>> > +       if (!rc) {
>> > +               task_lock(task);
>> > +               task->containerid = containerid;
>> > +               task_unlock(task);
>> > +       }
>> > +
>> > +       audit_log_set_containerid(task, oldcontainerid, containerid, rc);
>> > +       return rc;
>>
>> Why are audit_set_containerid_perm() and audit_log_containerid()
>> separate functions?
>
> (I assume you mean audit_log_set_containerid()?)

Yep.  My fingers got tired typing in that function name and decided a
shortcut was necessary.

> It seemed clearer that all the permission checking was in one function
> and its return code could be used to report the outcome when logging the
> (attempted) action.  This is the same structure as audit_set_loginuid()
> and it made sense.

When possible I really like it when the permission checks are in the
same function as the code which does the work; it's less likely to get
abused that way (you have to willfully bypass the access checks).  The
exceptions might be if you wanted to reuse the access control code, or
insert a modular access mechanism (e.g. LSMs).

I'm less concerned about audit_log_set_containerid(), but the usual
idea of avoiding single-use function within the same scope applies
here.

> This would be the time to connect it to a syscall if that seems like a
> good idea and remove pid, uid, auid, tty, ses fields.

Ah yes, I missed that.  You know my stance on connecting records by
now (hint: yes, connect them) so I think that would be a good thing to
do for the next round.
Richard Guy Briggs April 24, 2018, 2:02 a.m.
On 2018-04-23 19:15, Paul Moore wrote:
> On Sat, Apr 21, 2018 at 10:34 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
> > On 2018-04-18 19:47, Paul Moore wrote:
> >> On Fri, Mar 16, 2018 at 5:00 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
> >> > Implement the proc fs write to set the audit container ID of a process,
> >> > emitting an AUDIT_CONTAINER record to document the event.
> >> >
> >> > This is a write from the container orchestrator task to a proc entry of
> >> > the form /proc/PID/containerid where PID is the process ID of the newly
> >> > created task that is to become the first task in a container, or an
> >> > additional task added to a container.
> >> >
> >> > The write expects up to a u64 value (unset: 18446744073709551615).
> >> >
> >> > This will produce a record such as this:
> >> > type=CONTAINER msg=audit(1519903238.968:261): op=set pid=596 uid=0 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 auid=0 tty=pts0 ses=1 opid=596 old-contid=18446744073709551615 contid=123455 res=0
> >> >
> >> > The "op" field indicates an initial set.  The "pid" to "ses" fields are
> >> > the orchestrator while the "opid" field is the object's PID, the process
> >> > being "contained".  Old and new container ID values are given in the
> >> > "contid" fields, while res indicates its success.
> >> >
> >> > It is not permitted to self-set, unset or re-set the container ID.  A
> >> > child inherits its parent's container ID, but then can be set only once
> >> > after.
> >> >
> >> > See: https://github.com/linux-audit/audit-kernel/issues/32
> >> >
> >> > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> >> > ---
> >> >  fs/proc/base.c             | 37 ++++++++++++++++++++
> >> >  include/linux/audit.h      | 16 +++++++++
> >> >  include/linux/init_task.h  |  4 ++-
> >> >  include/linux/sched.h      |  1 +
> >> >  include/uapi/linux/audit.h |  2 ++
> >> >  kernel/auditsc.c           | 84 ++++++++++++++++++++++++++++++++++++++++++++++
> >> >  6 files changed, 143 insertions(+), 1 deletion(-)
> 
> ...
> 
> >> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> >> > index d258826..1b82191 100644
> >> > --- a/include/linux/sched.h
> >> > +++ b/include/linux/sched.h
> >> > @@ -796,6 +796,7 @@ struct task_struct {
> >> >  #ifdef CONFIG_AUDITSYSCALL
> >> >         kuid_t                          loginuid;
> >> >         unsigned int                    sessionid;
> >> > +       u64                             containerid;
> >>
> >> This one line addition to the task_struct scares me the most of
> >> anything in this patchset.  Why?  It's a field named "containerid" in
> >> a perhaps one of the most widely used core kernel structures; the
> >> possibilities for abuse are endless, and it's foolish to think we
> >> would ever be able to adequately police this.
> >
> > Fair enough.
> >
> >> Unfortunately, we can't add the field to audit_context as things
> >> currently stand because we don't always allocate an audit_context,
> >> it's dependent on the system's configuration, and we need to track the
> >> audit container ID for a given process, regardless of the audit
> >> configuration.  Pretty much the same reason why loginuid and sessionid
> >> are located directly in task_struct now.  As I stressed during the
> >> design phase, I really want to keep this as an *audit* container ID
> >> and not a general purpose kernel wide container ID.  If the kernel
> >> ever grows a general purpose container ID token, I'll be the first in
> >> line to convert the audit code, but I don't want audit to be that
> >> general purpose mechanism ... audit is hated enough as-is ;)
> >
> > When would we need an audit container ID when audit is not enabled
> > enough to have an audit_context?
> 
> I'm thinking of the audit_alloc() case where audit_filter_task()
> returns AUDIT_DISABLED.

Ok, so a task could be marked as filtered but its children would still
be auditable and inheriting its parent containerid (as well at its
loginuid and sessionid)...

> I believe this is the same reason why loginuid and sessionid live
> directly in the task_struct and not in the audit_context; they need to
> persist for the lifetime of the task.

Yes, probably.

> > If it is only used for audit, and audit is the only consumer, and audit
> > can only use it when it is enabled, then we can just return success to
> > any write to the proc filehandle, or not even present it.  Nothing will
> > be able to know that value wasn't used.
> >
> > When are loginuid and sessionid used now when audit is not enabled (or
> > should I say, explicitly disabled)?
> 
> See above.  I think that should answer these questions.

Ok.

> >> > diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
> >> > index 4e61a9e..921a71f 100644
> >> > --- a/include/uapi/linux/audit.h
> >> > +++ b/include/uapi/linux/audit.h
> >> > @@ -71,6 +71,7 @@
> >> >  #define AUDIT_TTY_SET          1017    /* Set TTY auditing status */
> >> >  #define AUDIT_SET_FEATURE      1018    /* Turn an audit feature on or off */
> >> >  #define AUDIT_GET_FEATURE      1019    /* Get which features are enabled */
> >> > +#define AUDIT_CONTAINER                1020    /* Define the container id and information */
> >> >
> >> >  #define AUDIT_FIRST_USER_MSG   1100    /* Userspace messages mostly uninteresting to kernel */
> >> >  #define AUDIT_USER_AVC         1107    /* We filter this differently */
> >> > @@ -465,6 +466,7 @@ struct audit_tty_status {
> >> >  };
> >> >
> >> >  #define AUDIT_UID_UNSET (unsigned int)-1
> >> > +#define AUDIT_CID_UNSET ((u64)-1)
> >>
> >> I think we need to decide if we want to distinguish between the "host"
> >> (e.g. init ns) and "unset".  Looking at this patch (I've only quickly
> >> skimmed the others so far) it would appear that you don't think we
> >> need to worry about this distinction; that's fine, but let's make it
> >> explicit with a comment in the code that AUDIT_CID_UNSET means "unset"
> >> as well as "host".
> >
> > I don't see any reason to distinguish between "host" and "unset".  Since
> > a container doesn't have a concrete definition based in namespaces, the
> > initial namespace set is meaningless here.
> 
> Okay, that sounds reasonable.
> 
> > Is there value in having a container orchestrator process have a
> > reserved container ID that has a policy distinct from any other
> > container?
> 
> I'm open to arguments for this idea, but I don't see a point to it right now.
> 
> > If so, then I could see the value in making the distinction.
> > For example, I've heard of interest in systemd acting as a container
> > orchestrator, so if it took on that role as PID 1, then every process in
> > the system would inherit that ID and none would be unset.
> >
> > I can't picture how having seperate "host" and "unset" values helps us.
> 
> I don't have a strong feeling either way, I just wanted to ask the question.
> 
> >> >  /* audit_rule_data supports filter rules with both integer and string
> >> >   * fields.  It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and
> >> > diff --git a/kernel/auditsc.c b/kernel/auditsc.c
> >> > index 4e0a4ac..29c8482 100644
> >> > --- a/kernel/auditsc.c
> >> > +++ b/kernel/auditsc.c
> >> > @@ -2073,6 +2073,90 @@ int audit_set_loginuid(kuid_t loginuid)
> >> >         return rc;
> >> >  }
> >> >
> >> > +static int audit_set_containerid_perm(struct task_struct *task, u64 containerid)
> >> > +{
> >> > +       struct task_struct *parent;
> >> > +       u64 pcontainerid, ccontainerid;
> >> > +
> >> > +       /* Don't allow to set our own containerid */
> >> > +       if (current == task)
> >> > +               return -EPERM;
> >>
> >> Why not?  Is there some obvious security concern that I missing?
> >
> > We then lose the distinction in the AUDIT_CONTAINER record between the
> > initiating PID and the target PID.  This was outlined in the proposal.
> 
> I just went back and reread the v3 proposal and I still don't see a
> good explanation of this.  Why is this bad?  What's the security
> concern?

I don't remember, specifically.  Maybe this has been addressed by the
check for children/threads or identical parent container ID.  So, I'm
reluctantly willing to remove that check for now.

> > Having said that, I'm still not sure we have protected sufficiently from
> > a child turning around and setting it's parent's as yet unset or
> > inherited audit container ID.
> 
> Yes, I believe we only want to let a task set the audit container for
> it's children (or itself/threads if we decide to allow that, see
> above).  There *has* to be a function to check to see if a task if a
> child of a given task ... right? ... although this is likely to be a
> pointer traversal and locking nightmare ... hmmm.

Isn't that just (struct task_struct)parent == (struct
task_struct)child->parent (or ->real_parent)?

And now that I say that, it is covered by the following patch's child
check, so as long as we keep that, we should be fine.

> >> I ask because I suppose it might be possible for some container
> >> runtime to do a fork, setup some of the environment and them exec the
> >> container (before you answer the obvious "namespaces!" please remember
> >> we're not trying to define containers).
> >
> > I don't think namespaces have any bearing on this concern since none are
> > required.
> >
> >> > +       /* Don't allow the containerid to be unset */
> >> > +       if (!cid_valid(containerid))
> >> > +               return -EINVAL;
> >> > +       /* if we don't have caps, reject */
> >> > +       if (!capable(CAP_AUDIT_CONTROL))
> >> > +               return -EPERM;
> >> > +       /* if containerid is unset, allow */
> >> > +       if (!audit_containerid_set(task))
> >> > +               return 0;
> >> > +       /* it is already set, and not inherited from the parent, reject */
> >> > +       ccontainerid = audit_get_containerid(task);
> >> > +       rcu_read_lock();
> >> > +       parent = rcu_dereference(task->real_parent);
> >> > +       rcu_read_unlock();
> >> > +       task_lock(parent);
> >> > +       pcontainerid = audit_get_containerid(parent);
> >> > +       task_unlock(parent);
> >> > +       if (ccontainerid != pcontainerid)
> >> > +               return -EPERM;
> >> > +       return 0;
> >> > +}
> >> > +
> >> > +static void audit_log_set_containerid(struct task_struct *task, u64 oldcontainerid,
> >> > +                                     u64 containerid, int rc)
> >> > +{
> >> > +       struct audit_buffer *ab;
> >> > +       uid_t uid;
> >> > +       struct tty_struct *tty;
> >> > +
> >> > +       if (!audit_enabled)
> >> > +               return;
> >> > +
> >> > +       ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONTAINER);
> >> > +       if (!ab)
> >> > +               return;
> >> > +
> >> > +       uid = from_kuid(&init_user_ns, task_uid(current));
> >> > +       tty = audit_get_tty(current);
> >> > +
> >> > +       audit_log_format(ab, "op=set pid=%d uid=%u", task_tgid_nr(current), uid);
> >> > +       audit_log_task_context(ab);
> >> > +       audit_log_format(ab, " auid=%u tty=%s ses=%u opid=%d old-contid=%llu contid=%llu res=%d",
> >> > +                        from_kuid(&init_user_ns, audit_get_loginuid(current)),
> >> > +                        tty ? tty_name(tty) : "(none)", audit_get_sessionid(current),
> >> > +                        task_tgid_nr(task), oldcontainerid, containerid, !rc);
> >> > +
> >> > +       audit_put_tty(tty);
> >> > +       audit_log_end(ab);
> >> > +}
> >> > +
> >> > +/**
> >> > + * audit_set_containerid - set current task's audit_context containerid
> >> > + * @containerid: containerid value
> >> > + *
> >> > + * Returns 0 on success, -EPERM on permission failure.
> >> > + *
> >> > + * Called (set) from fs/proc/base.c::proc_containerid_write().
> >> > + */
> >> > +int audit_set_containerid(struct task_struct *task, u64 containerid)
> >> > +{
> >> > +       u64 oldcontainerid;
> >> > +       int rc;
> >> > +
> >> > +       oldcontainerid = audit_get_containerid(task);
> >> > +
> >> > +       rc = audit_set_containerid_perm(task, containerid);
> >> > +       if (!rc) {
> >> > +               task_lock(task);
> >> > +               task->containerid = containerid;
> >> > +               task_unlock(task);
> >> > +       }
> >> > +
> >> > +       audit_log_set_containerid(task, oldcontainerid, containerid, rc);
> >> > +       return rc;
> >>
> >> Why are audit_set_containerid_perm() and audit_log_containerid()
> >> separate functions?
> >
> > (I assume you mean audit_log_set_containerid()?)
> 
> Yep.  My fingers got tired typing in that function name and decided a
> shortcut was necessary.
> 
> > It seemed clearer that all the permission checking was in one function
> > and its return code could be used to report the outcome when logging the
> > (attempted) action.  This is the same structure as audit_set_loginuid()
> > and it made sense.
> 
> When possible I really like it when the permission checks are in the
> same function as the code which does the work; it's less likely to get
> abused that way (you have to willfully bypass the access checks).  The
> exceptions might be if you wanted to reuse the access control code, or
> insert a modular access mechanism (e.g. LSMs).

I don't follow how it could be abused.  The return code from the perm
check gates setting the value and is used in the success field in the
log.

> I'm less concerned about audit_log_set_containerid(), but the usual
> idea of avoiding single-use function within the same scope applies
> here.
> 
> > This would be the time to connect it to a syscall if that seems like a
> > good idea and remove pid, uid, auid, tty, ses fields.
> 
> Ah yes, I missed that.  You know my stance on connecting records by
> now (hint: yes, connect them) so I think that would be a good thing to
> do for the next round.

Ok...

> paul moore

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Paul Moore April 24, 2018, 7:01 p.m.
On Mon, Apr 23, 2018 at 10:02 PM, Richard Guy Briggs <rgb@redhat.com> wrote:
> On 2018-04-23 19:15, Paul Moore wrote:
>> On Sat, Apr 21, 2018 at 10:34 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
>> > On 2018-04-18 19:47, Paul Moore wrote:
>> >> On Fri, Mar 16, 2018 at 5:00 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
>> >> > Implement the proc fs write to set the audit container ID of a process,
>> >> > emitting an AUDIT_CONTAINER record to document the event.
>> >> >
>> >> > This is a write from the container orchestrator task to a proc entry of
>> >> > the form /proc/PID/containerid where PID is the process ID of the newly
>> >> > created task that is to become the first task in a container, or an
>> >> > additional task added to a container.
>> >> >
>> >> > The write expects up to a u64 value (unset: 18446744073709551615).
>> >> >
>> >> > This will produce a record such as this:
>> >> > type=CONTAINER msg=audit(1519903238.968:261): op=set pid=596 uid=0 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 auid=0 tty=pts0 ses=1 opid=596 old-contid=18446744073709551615 contid=123455 res=0
>> >> >
>> >> > The "op" field indicates an initial set.  The "pid" to "ses" fields are
>> >> > the orchestrator while the "opid" field is the object's PID, the process
>> >> > being "contained".  Old and new container ID values are given in the
>> >> > "contid" fields, while res indicates its success.
>> >> >
>> >> > It is not permitted to self-set, unset or re-set the container ID.  A
>> >> > child inherits its parent's container ID, but then can be set only once
>> >> > after.
>> >> >
>> >> > See: https://github.com/linux-audit/audit-kernel/issues/32
>> >> >
>> >> > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
>> >> > ---
>> >> >  fs/proc/base.c             | 37 ++++++++++++++++++++
>> >> >  include/linux/audit.h      | 16 +++++++++
>> >> >  include/linux/init_task.h  |  4 ++-
>> >> >  include/linux/sched.h      |  1 +
>> >> >  include/uapi/linux/audit.h |  2 ++
>> >> >  kernel/auditsc.c           | 84 ++++++++++++++++++++++++++++++++++++++++++++++
>> >> >  6 files changed, 143 insertions(+), 1 deletion(-)

...

>> >> >  /* audit_rule_data supports filter rules with both integer and string
>> >> >   * fields.  It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and
>> >> > diff --git a/kernel/auditsc.c b/kernel/auditsc.c
>> >> > index 4e0a4ac..29c8482 100644
>> >> > --- a/kernel/auditsc.c
>> >> > +++ b/kernel/auditsc.c
>> >> > @@ -2073,6 +2073,90 @@ int audit_set_loginuid(kuid_t loginuid)
>> >> >         return rc;
>> >> >  }
>> >> >
>> >> > +static int audit_set_containerid_perm(struct task_struct *task, u64 containerid)
>> >> > +{
>> >> > +       struct task_struct *parent;
>> >> > +       u64 pcontainerid, ccontainerid;
>> >> > +
>> >> > +       /* Don't allow to set our own containerid */
>> >> > +       if (current == task)
>> >> > +               return -EPERM;
>> >>
>> >> Why not?  Is there some obvious security concern that I missing?
>> >
>> > We then lose the distinction in the AUDIT_CONTAINER record between the
>> > initiating PID and the target PID.  This was outlined in the proposal.
>>
>> I just went back and reread the v3 proposal and I still don't see a
>> good explanation of this.  Why is this bad?  What's the security
>> concern?
>
> I don't remember, specifically.  Maybe this has been addressed by the
> check for children/threads or identical parent container ID.  So, I'm
> reluctantly willing to remove that check for now.

Okay.  For the record, if someone can explain to me why this
restriction saves us from some terrible situation I'm all for leaving
it.  I'm just opposed to restrictions without solid reasoning behind
them.

>> > Having said that, I'm still not sure we have protected sufficiently from
>> > a child turning around and setting it's parent's as yet unset or
>> > inherited audit container ID.
>>
>> Yes, I believe we only want to let a task set the audit container for
>> it's children (or itself/threads if we decide to allow that, see
>> above).  There *has* to be a function to check to see if a task if a
>> child of a given task ... right? ... although this is likely to be a
>> pointer traversal and locking nightmare ... hmmm.
>
> Isn't that just (struct task_struct)parent == (struct
> task_struct)child->parent (or ->real_parent)?
>
> And now that I say that, it is covered by the following patch's child
> check, so as long as we keep that, we should be fine.

I was thinking of checking not just current's immediate children, but
any of it's descendants as I believe that is what we want to limit,
yes?  I just worry that it isn't really practical to perform that
check.

>> >> I ask because I suppose it might be possible for some container
>> >> runtime to do a fork, setup some of the environment and them exec the
>> >> container (before you answer the obvious "namespaces!" please remember
>> >> we're not trying to define containers).
>> >
>> > I don't think namespaces have any bearing on this concern since none are
>> > required.
>> >
>> >> > +       /* Don't allow the containerid to be unset */
>> >> > +       if (!cid_valid(containerid))
>> >> > +               return -EINVAL;
>> >> > +       /* if we don't have caps, reject */
>> >> > +       if (!capable(CAP_AUDIT_CONTROL))
>> >> > +               return -EPERM;
>> >> > +       /* if containerid is unset, allow */
>> >> > +       if (!audit_containerid_set(task))
>> >> > +               return 0;
>> >> > +       /* it is already set, and not inherited from the parent, reject */
>> >> > +       ccontainerid = audit_get_containerid(task);
>> >> > +       rcu_read_lock();
>> >> > +       parent = rcu_dereference(task->real_parent);
>> >> > +       rcu_read_unlock();
>> >> > +       task_lock(parent);
>> >> > +       pcontainerid = audit_get_containerid(parent);
>> >> > +       task_unlock(parent);
>> >> > +       if (ccontainerid != pcontainerid)
>> >> > +               return -EPERM;
>> >> > +       return 0;

I'm looking at the parent checks again and I wonder if the logic above
is what we really want.  Maybe it is, but I'm not sure.

Things I'm wondering about:

* "ccontainerid" and "containerid" are too close in name, I kept
confusing myself when looking at this code.  Please change one.  Bonus
points if it is shorter.

* What if the orchestrator wants to move the task to a new container?
Right now it looks like you can only do that once, then then the
task's audit container ID will no longer be the same as real_parent
... or does the orchestrator change that?  *Can* the orchestrator
change real_parent (I suspect the answer is "no")?

* I think the key is the relationship between current and task, not
between task and task->real_parent.  I believe what we really care
about is that task is a descendant of current.  We might also want to
allow current to change the audit container ID if it holds
CAP_AUDIT_CONTROL, regardless of it's relationship with task.

>> >> > +static void audit_log_set_containerid(struct task_struct *task, u64 oldcontainerid,
>> >> > +                                     u64 containerid, int rc)
>> >> > +{
>> >> > +       struct audit_buffer *ab;
>> >> > +       uid_t uid;
>> >> > +       struct tty_struct *tty;
>> >> > +
>> >> > +       if (!audit_enabled)
>> >> > +               return;
>> >> > +
>> >> > +       ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONTAINER);
>> >> > +       if (!ab)
>> >> > +               return;
>> >> > +
>> >> > +       uid = from_kuid(&init_user_ns, task_uid(current));
>> >> > +       tty = audit_get_tty(current);
>> >> > +
>> >> > +       audit_log_format(ab, "op=set pid=%d uid=%u", task_tgid_nr(current), uid);
>> >> > +       audit_log_task_context(ab);
>> >> > +       audit_log_format(ab, " auid=%u tty=%s ses=%u opid=%d old-contid=%llu contid=%llu res=%d",
>> >> > +                        from_kuid(&init_user_ns, audit_get_loginuid(current)),
>> >> > +                        tty ? tty_name(tty) : "(none)", audit_get_sessionid(current),
>> >> > +                        task_tgid_nr(task), oldcontainerid, containerid, !rc);
>> >> > +
>> >> > +       audit_put_tty(tty);
>> >> > +       audit_log_end(ab);
>> >> > +}
>> >> > +
>> >> > +/**
>> >> > + * audit_set_containerid - set current task's audit_context containerid
>> >> > + * @containerid: containerid value
>> >> > + *
>> >> > + * Returns 0 on success, -EPERM on permission failure.
>> >> > + *
>> >> > + * Called (set) from fs/proc/base.c::proc_containerid_write().
>> >> > + */
>> >> > +int audit_set_containerid(struct task_struct *task, u64 containerid)
>> >> > +{
>> >> > +       u64 oldcontainerid;
>> >> > +       int rc;
>> >> > +
>> >> > +       oldcontainerid = audit_get_containerid(task);
>> >> > +
>> >> > +       rc = audit_set_containerid_perm(task, containerid);
>> >> > +       if (!rc) {
>> >> > +               task_lock(task);
>> >> > +               task->containerid = containerid;
>> >> > +               task_unlock(task);
>> >> > +       }
>> >> > +
>> >> > +       audit_log_set_containerid(task, oldcontainerid, containerid, rc);
>> >> > +       return rc;
>> >>
>> >> Why are audit_set_containerid_perm() and audit_log_containerid()
>> >> separate functions?
>> >
>> > (I assume you mean audit_log_set_containerid()?)
>>
>> Yep.  My fingers got tired typing in that function name and decided a
>> shortcut was necessary.
>>
>> > It seemed clearer that all the permission checking was in one function
>> > and its return code could be used to report the outcome when logging the
>> > (attempted) action.  This is the same structure as audit_set_loginuid()
>> > and it made sense.
>>
>> When possible I really like it when the permission checks are in the
>> same function as the code which does the work; it's less likely to get
>> abused that way (you have to willfully bypass the access checks).  The
>> exceptions might be if you wanted to reuse the access control code, or
>> insert a modular access mechanism (e.g. LSMs).
>
> I don't follow how it could be abused.  The return code from the perm
> check gates setting the value and is used in the success field in the
> log.

If the permission checks are in the same function body as the code
which does the work you have to either split the function, or rewrite
it, if you want to bypass the permission checks.  It may be more of a
style issue than an actual safety issue, but the comments about
single-use functions in the same scope is the tie breaker.
Richard Guy Briggs April 25, 2018, 12:40 a.m.
On 2018-04-24 15:01, Paul Moore wrote:
> On Mon, Apr 23, 2018 at 10:02 PM, Richard Guy Briggs <rgb@redhat.com> wrote:
> > On 2018-04-23 19:15, Paul Moore wrote:
> >> On Sat, Apr 21, 2018 at 10:34 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
> >> > On 2018-04-18 19:47, Paul Moore wrote:
> >> >> On Fri, Mar 16, 2018 at 5:00 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
> >> >> > Implement the proc fs write to set the audit container ID of a process,
> >> >> > emitting an AUDIT_CONTAINER record to document the event.
> >> >> >
> >> >> > This is a write from the container orchestrator task to a proc entry of
> >> >> > the form /proc/PID/containerid where PID is the process ID of the newly
> >> >> > created task that is to become the first task in a container, or an
> >> >> > additional task added to a container.
> >> >> >
> >> >> > The write expects up to a u64 value (unset: 18446744073709551615).
> >> >> >
> >> >> > This will produce a record such as this:
> >> >> > type=CONTAINER msg=audit(1519903238.968:261): op=set pid=596 uid=0 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 auid=0 tty=pts0 ses=1 opid=596 old-contid=18446744073709551615 contid=123455 res=0
> >> >> >
> >> >> > The "op" field indicates an initial set.  The "pid" to "ses" fields are
> >> >> > the orchestrator while the "opid" field is the object's PID, the process
> >> >> > being "contained".  Old and new container ID values are given in the
> >> >> > "contid" fields, while res indicates its success.
> >> >> >
> >> >> > It is not permitted to self-set, unset or re-set the container ID.  A
> >> >> > child inherits its parent's container ID, but then can be set only once
> >> >> > after.
> >> >> >
> >> >> > See: https://github.com/linux-audit/audit-kernel/issues/32
> >> >> >
> >> >> > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> >> >> > ---
> >> >> >  fs/proc/base.c             | 37 ++++++++++++++++++++
> >> >> >  include/linux/audit.h      | 16 +++++++++
> >> >> >  include/linux/init_task.h  |  4 ++-
> >> >> >  include/linux/sched.h      |  1 +
> >> >> >  include/uapi/linux/audit.h |  2 ++
> >> >> >  kernel/auditsc.c           | 84 ++++++++++++++++++++++++++++++++++++++++++++++
> >> >> >  6 files changed, 143 insertions(+), 1 deletion(-)
> 
> ...
> 
> >> >> >  /* audit_rule_data supports filter rules with both integer and string
> >> >> >   * fields.  It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and
> >> >> > diff --git a/kernel/auditsc.c b/kernel/auditsc.c
> >> >> > index 4e0a4ac..29c8482 100644
> >> >> > --- a/kernel/auditsc.c
> >> >> > +++ b/kernel/auditsc.c
> >> >> > @@ -2073,6 +2073,90 @@ int audit_set_loginuid(kuid_t loginuid)
> >> >> >         return rc;
> >> >> >  }
> >> >> >
> >> >> > +static int audit_set_containerid_perm(struct task_struct *task, u64 containerid)
> >> >> > +{
> >> >> > +       struct task_struct *parent;
> >> >> > +       u64 pcontainerid, ccontainerid;
> >> >> > +
> >> >> > +       /* Don't allow to set our own containerid */
> >> >> > +       if (current == task)
> >> >> > +               return -EPERM;
> >> >>
> >> >> Why not?  Is there some obvious security concern that I missing?
> >> >
> >> > We then lose the distinction in the AUDIT_CONTAINER record between the
> >> > initiating PID and the target PID.  This was outlined in the proposal.
> >>
> >> I just went back and reread the v3 proposal and I still don't see a
> >> good explanation of this.  Why is this bad?  What's the security
> >> concern?
> >
> > I don't remember, specifically.  Maybe this has been addressed by the
> > check for children/threads or identical parent container ID.  So, I'm
> > reluctantly willing to remove that check for now.
> 
> Okay.  For the record, if someone can explain to me why this
> restriction saves us from some terrible situation I'm all for leaving
> it.  I'm just opposed to restrictions without solid reasoning behind
> them.
> 
> >> > Having said that, I'm still not sure we have protected sufficiently from
> >> > a child turning around and setting it's parent's as yet unset or
> >> > inherited audit container ID.
> >>
> >> Yes, I believe we only want to let a task set the audit container for
> >> it's children (or itself/threads if we decide to allow that, see
> >> above).  There *has* to be a function to check to see if a task if a
> >> child of a given task ... right? ... although this is likely to be a
> >> pointer traversal and locking nightmare ... hmmm.
> >
> > Isn't that just (struct task_struct)parent == (struct
> > task_struct)child->parent (or ->real_parent)?
> >
> > And now that I say that, it is covered by the following patch's child
> > check, so as long as we keep that, we should be fine.
> 
> I was thinking of checking not just current's immediate children, but
> any of it's descendants as I believe that is what we want to limit,
> yes?  I just worry that it isn't really practical to perform that
> check.

The child check I'm talking about prevents setting a task's audit
container ID if it *has* any children or threads, so if it has children
it is automatically disqualified and its grandchildren are irrelevant.

> >> >> I ask because I suppose it might be possible for some container
> >> >> runtime to do a fork, setup some of the environment and them exec the
> >> >> container (before you answer the obvious "namespaces!" please remember
> >> >> we're not trying to define containers).
> >> >
> >> > I don't think namespaces have any bearing on this concern since none are
> >> > required.
> >> >
> >> >> > +       /* Don't allow the containerid to be unset */
> >> >> > +       if (!cid_valid(containerid))
> >> >> > +               return -EINVAL;
> >> >> > +       /* if we don't have caps, reject */
> >> >> > +       if (!capable(CAP_AUDIT_CONTROL))
> >> >> > +               return -EPERM;
> >> >> > +       /* if containerid is unset, allow */
> >> >> > +       if (!audit_containerid_set(task))
> >> >> > +               return 0;
> >> >> > +       /* it is already set, and not inherited from the parent, reject */
> >> >> > +       ccontainerid = audit_get_containerid(task);
> >> >> > +       rcu_read_lock();
> >> >> > +       parent = rcu_dereference(task->real_parent);
> >> >> > +       rcu_read_unlock();
> >> >> > +       task_lock(parent);
> >> >> > +       pcontainerid = audit_get_containerid(parent);
> >> >> > +       task_unlock(parent);
> >> >> > +       if (ccontainerid != pcontainerid)
> >> >> > +               return -EPERM;
> >> >> > +       return 0;
> 
> I'm looking at the parent checks again and I wonder if the logic above
> is what we really want.  Maybe it is, but I'm not sure.
> 
> Things I'm wondering about:
> 
> * "ccontainerid" and "containerid" are too close in name, I kept
> confusing myself when looking at this code.  Please change one.  Bonus
> points if it is shorter.

Would c_containerid and p_containerid be ok?  child_cid and parent_cid?
I'd really like it to have the same root as the parameter handed in so
teh code is easier to follow.  It would be nice to have that across
caller to local, but that's challenging.

I've been tempted to use contid or even cid everywhere instead of
containerid.  Perhaps the longer name doesn't bother me because I
like its uniqueness and I learned touch-typing in grade 9 and I like
100+ character wide terminals?  ;-)

> * What if the orchestrator wants to move the task to a new container?
> Right now it looks like you can only do that once, then then the
> task's audit container ID will no longer be the same as real_parent

A task's audit container ID can be unset or inherited, and then set
only once.  After that, if you want it moved to a new container you
can't and your only option is to spawn another peer to that task or a
child of it and set that new task's audit container ID.

Currently, the method of detecting if its audit container ID has been
set (rather than inherited) was to check its parent's audit container
ID.  The only reason to change this might be if the audit container ID
were not inheritable, but then we lose the accountability of a task
spawning another process and being able to leave its child's audit
container ID unset and unaccountable to any existing container.  I think
the relationship to the parent is crucial, and if something wants to
change audit container ID it can, by spawning childrent and leaving a
trail of container IDs in its parent processes.  (So what if a parent
dies?)

> ... or does the orchestrator change that?  *Can* the orchestrator
> change real_parent (I suspect the answer is "no")?

I don't think the orchestrator is able to change real_parent.  I've
forgotten why there is a ->parent and ->real_parent and how they can
change.  One is for the wait signal.  I don't remember the purpose of
the other.

If the parent dies before the child, the child will be re-parented on
its grandparent if the parent doesn't hang around zombified, if I
understand correctly.  If anything, a parent dying would likely further
restrict the ability to set a task's audit container ID because a parent
with an identical ID could vanish.

> * I think the key is the relationship between current and task, not
> between task and task->real_parent.  I believe what we really care
> about is that task is a descendant of current.  We might also want to
> allow current to change the audit container ID if it holds
> CAP_AUDIT_CONTROL, regardless of it's relationship with task.

Currently, a process with CAP_AUDIT_CONTROL can set the audit container
ID of any task that hasn't got children or threads, isn't itself, and
its audit container ID is inherited or unset.  This was to try to
prevent games with parents and children scratching each other's backs.

I would feel more comfortable if only descendants were settable, so
adding that restriction sounds like a good idea to me other than the
tree-climbing excercise and overhead involved.

> >> >> > +static void audit_log_set_containerid(struct task_struct *task, u64 oldcontainerid,
> >> >> > +                                     u64 containerid, int rc)
> >> >> > +{
> >> >> > +       struct audit_buffer *ab;
> >> >> > +       uid_t uid;
> >> >> > +       struct tty_struct *tty;
> >> >> > +
> >> >> > +       if (!audit_enabled)
> >> >> > +               return;
> >> >> > +
> >> >> > +       ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONTAINER);
> >> >> > +       if (!ab)
> >> >> > +               return;
> >> >> > +
> >> >> > +       uid = from_kuid(&init_user_ns, task_uid(current));
> >> >> > +       tty = audit_get_tty(current);
> >> >> > +
> >> >> > +       audit_log_format(ab, "op=set pid=%d uid=%u", task_tgid_nr(current), uid);
> >> >> > +       audit_log_task_context(ab);
> >> >> > +       audit_log_format(ab, " auid=%u tty=%s ses=%u opid=%d old-contid=%llu contid=%llu res=%d",
> >> >> > +                        from_kuid(&init_user_ns, audit_get_loginuid(current)),
> >> >> > +                        tty ? tty_name(tty) : "(none)", audit_get_sessionid(current),
> >> >> > +                        task_tgid_nr(task), oldcontainerid, containerid, !rc);
> >> >> > +
> >> >> > +       audit_put_tty(tty);
> >> >> > +       audit_log_end(ab);
> >> >> > +}
> >> >> > +
> >> >> > +/**
> >> >> > + * audit_set_containerid - set current task's audit_context containerid
> >> >> > + * @containerid: containerid value
> >> >> > + *
> >> >> > + * Returns 0 on success, -EPERM on permission failure.
> >> >> > + *
> >> >> > + * Called (set) from fs/proc/base.c::proc_containerid_write().
> >> >> > + */
> >> >> > +int audit_set_containerid(struct task_struct *task, u64 containerid)
> >> >> > +{
> >> >> > +       u64 oldcontainerid;
> >> >> > +       int rc;
> >> >> > +
> >> >> > +       oldcontainerid = audit_get_containerid(task);
> >> >> > +
> >> >> > +       rc = audit_set_containerid_perm(task, containerid);
> >> >> > +       if (!rc) {
> >> >> > +               task_lock(task);
> >> >> > +               task->containerid = containerid;
> >> >> > +               task_unlock(task);
> >> >> > +       }
> >> >> > +
> >> >> > +       audit_log_set_containerid(task, oldcontainerid, containerid, rc);
> >> >> > +       return rc;
> >> >>
> >> >> Why are audit_set_containerid_perm() and audit_log_containerid()
> >> >> separate functions?
> >> >
> >> > (I assume you mean audit_log_set_containerid()?)
> >>
> >> Yep.  My fingers got tired typing in that function name and decided a
> >> shortcut was necessary.
> >>
> >> > It seemed clearer that all the permission checking was in one function
> >> > and its return code could be used to report the outcome when logging the
> >> > (attempted) action.  This is the same structure as audit_set_loginuid()
> >> > and it made sense.
> >>
> >> When possible I really like it when the permission checks are in the
> >> same function as the code which does the work; it's less likely to get
> >> abused that way (you have to willfully bypass the access checks).  The
> >> exceptions might be if you wanted to reuse the access control code, or
> >> insert a modular access mechanism (e.g. LSMs).
> >
> > I don't follow how it could be abused.  The return code from the perm
> > check gates setting the value and is used in the success field in the
> > log.
> 
> If the permission checks are in the same function body as the code
> which does the work you have to either split the function, or rewrite
> it, if you want to bypass the permission checks.  It may be more of a
> style issue than an actual safety issue, but the comments about
> single-use functions in the same scope is the tie breaker.

Perhaps I'm just being quite dense, but I just don't follow what the
problem is and how you suggest fixing it.  A bunch of gotos to a label
such as "out:" to log the refused action?  That seems messy and
unstructured.

> paul moore

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Paul Moore April 26, 2018, 10:47 p.m.
On Tue, Apr 24, 2018 at 8:40 PM, Richard Guy Briggs <rgb@redhat.com> wrote:
> On 2018-04-24 15:01, Paul Moore wrote:
>> On Mon, Apr 23, 2018 at 10:02 PM, Richard Guy Briggs <rgb@redhat.com> wrote:
>> > On 2018-04-23 19:15, Paul Moore wrote:
>> >> On Sat, Apr 21, 2018 at 10:34 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
>> >> > On 2018-04-18 19:47, Paul Moore wrote:
>> >> >> On Fri, Mar 16, 2018 at 5:00 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
>> >> >> > Implement the proc fs write to set the audit container ID of a process,
>> >> >> > emitting an AUDIT_CONTAINER record to document the event.
>> >> >> >
>> >> >> > This is a write from the container orchestrator task to a proc entry of
>> >> >> > the form /proc/PID/containerid where PID is the process ID of the newly
>> >> >> > created task that is to become the first task in a container, or an
>> >> >> > additional task added to a container.
>> >> >> >
>> >> >> > The write expects up to a u64 value (unset: 18446744073709551615).
>> >> >> >
>> >> >> > This will produce a record such as this:
>> >> >> > type=CONTAINER msg=audit(1519903238.968:261): op=set pid=596 uid=0 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 auid=0 tty=pts0 ses=1 opid=596 old-contid=18446744073709551615 contid=123455 res=0
>> >> >> >
>> >> >> > The "op" field indicates an initial set.  The "pid" to "ses" fields are
>> >> >> > the orchestrator while the "opid" field is the object's PID, the process
>> >> >> > being "contained".  Old and new container ID values are given in the
>> >> >> > "contid" fields, while res indicates its success.
>> >> >> >
>> >> >> > It is not permitted to self-set, unset or re-set the container ID.  A
>> >> >> > child inherits its parent's container ID, but then can be set only once
>> >> >> > after.
>> >> >> >
>> >> >> > See: https://github.com/linux-audit/audit-kernel/issues/32
>> >> >> >
>> >> >> > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
>> >> >> > ---
>> >> >> >  fs/proc/base.c             | 37 ++++++++++++++++++++
>> >> >> >  include/linux/audit.h      | 16 +++++++++
>> >> >> >  include/linux/init_task.h  |  4 ++-
>> >> >> >  include/linux/sched.h      |  1 +
>> >> >> >  include/uapi/linux/audit.h |  2 ++
>> >> >> >  kernel/auditsc.c           | 84 ++++++++++++++++++++++++++++++++++++++++++++++
>> >> >> >  6 files changed, 143 insertions(+), 1 deletion(-)
>>
>> ...
>>
>> >> >> >  /* audit_rule_data supports filter rules with both integer and string
>> >> >> >   * fields.  It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and
>> >> >> > diff --git a/kernel/auditsc.c b/kernel/auditsc.c
>> >> >> > index 4e0a4ac..29c8482 100644
>> >> >> > --- a/kernel/auditsc.c
>> >> >> > +++ b/kernel/auditsc.c
>> >> >> > @@ -2073,6 +2073,90 @@ int audit_set_loginuid(kuid_t loginuid)
>> >> >> >         return rc;
>> >> >> >  }
>> >> >> >
>> >> >> > +static int audit_set_containerid_perm(struct task_struct *task, u64 containerid)
>> >> >> > +{
>> >> >> > +       struct task_struct *parent;
>> >> >> > +       u64 pcontainerid, ccontainerid;
>> >> >> > +
>> >> >> > +       /* Don't allow to set our own containerid */
>> >> >> > +       if (current == task)
>> >> >> > +               return -EPERM;
>> >> >>
>> >> >> Why not?  Is there some obvious security concern that I missing?
>> >> >
>> >> > We then lose the distinction in the AUDIT_CONTAINER record between the
>> >> > initiating PID and the target PID.  This was outlined in the proposal.
>> >>
>> >> I just went back and reread the v3 proposal and I still don't see a
>> >> good explanation of this.  Why is this bad?  What's the security
>> >> concern?
>> >
>> > I don't remember, specifically.  Maybe this has been addressed by the
>> > check for children/threads or identical parent container ID.  So, I'm
>> > reluctantly willing to remove that check for now.
>>
>> Okay.  For the record, if someone can explain to me why this
>> restriction saves us from some terrible situation I'm all for leaving
>> it.  I'm just opposed to restrictions without solid reasoning behind
>> them.
>>
>> >> > Having said that, I'm still not sure we have protected sufficiently from
>> >> > a child turning around and setting it's parent's as yet unset or
>> >> > inherited audit container ID.
>> >>
>> >> Yes, I believe we only want to let a task set the audit container for
>> >> it's children (or itself/threads if we decide to allow that, see
>> >> above).  There *has* to be a function to check to see if a task if a
>> >> child of a given task ... right? ... although this is likely to be a
>> >> pointer traversal and locking nightmare ... hmmm.
>> >
>> > Isn't that just (struct task_struct)parent == (struct
>> > task_struct)child->parent (or ->real_parent)?
>> >
>> > And now that I say that, it is covered by the following patch's child
>> > check, so as long as we keep that, we should be fine.
>>
>> I was thinking of checking not just current's immediate children, but
>> any of it's descendants as I believe that is what we want to limit,
>> yes?  I just worry that it isn't really practical to perform that
>> check.
>
> The child check I'm talking about prevents setting a task's audit
> container ID if it *has* any children or threads, so if it has children
> it is automatically disqualified and its grandchildren are irrelevant.
>
>> >> >> I ask because I suppose it might be possible for some container
>> >> >> runtime to do a fork, setup some of the environment and them exec the
>> >> >> container (before you answer the obvious "namespaces!" please remember
>> >> >> we're not trying to define containers).
>> >> >
>> >> > I don't think namespaces have any bearing on this concern since none are
>> >> > required.
>> >> >
>> >> >> > +       /* Don't allow the containerid to be unset */
>> >> >> > +       if (!cid_valid(containerid))
>> >> >> > +               return -EINVAL;
>> >> >> > +       /* if we don't have caps, reject */
>> >> >> > +       if (!capable(CAP_AUDIT_CONTROL))
>> >> >> > +               return -EPERM;
>> >> >> > +       /* if containerid is unset, allow */
>> >> >> > +       if (!audit_containerid_set(task))
>> >> >> > +               return 0;
>> >> >> > +       /* it is already set, and not inherited from the parent, reject */
>> >> >> > +       ccontainerid = audit_get_containerid(task);
>> >> >> > +       rcu_read_lock();
>> >> >> > +       parent = rcu_dereference(task->real_parent);
>> >> >> > +       rcu_read_unlock();
>> >> >> > +       task_lock(parent);
>> >> >> > +       pcontainerid = audit_get_containerid(parent);
>> >> >> > +       task_unlock(parent);
>> >> >> > +       if (ccontainerid != pcontainerid)
>> >> >> > +               return -EPERM;
>> >> >> > +       return 0;
>>
>> I'm looking at the parent checks again and I wonder if the logic above
>> is what we really want.  Maybe it is, but I'm not sure.
>>
>> Things I'm wondering about:
>>
>> * "ccontainerid" and "containerid" are too close in name, I kept
>> confusing myself when looking at this code.  Please change one.  Bonus
>> points if it is shorter.
>
> Would c_containerid and p_containerid be ok?  child_cid and parent_cid?

Either would be an improvement over ccontainerid/containerid.  I would
give a slight node to child_cid/parent_cid just for length reasons.

> I'd really like it to have the same root as the parameter handed in so
> teh code is easier to follow.  It would be nice to have that across
> caller to local, but that's challenging.

That's fine, but you have to admit that ccontainerid/containerid is
awkward and not easy to quickly differentiate :)

> I've been tempted to use contid or even cid everywhere instead of
> containerid.  Perhaps the longer name doesn't bother me because I
> like its uniqueness and I learned touch-typing in grade 9 and I like
> 100+ character wide terminals?  ;-)

I would definitely appreciate contid/cid or similar, but I don't care
too much either way.  As far as terminal width is concerned, please
make sure your code fits in 80 char terminals.

>> * What if the orchestrator wants to move the task to a new container?
>> Right now it looks like you can only do that once, then then the
>> task's audit container ID will no longer be the same as real_parent
>
> A task's audit container ID can be unset or inherited, and then set
> only once.  After that, if you want it moved to a new container you
> can't and your only option is to spawn another peer to that task or a
> child of it and set that new task's audit container ID.

Okay.  We've had some many discussions about this both on and off list
that I lose track on where we stand for certain things.  I think
preventing task movement is fine for the initial effort so long as we
don't prevent adding it in the future; I don't see anything (other
than the permission checks under discussion, which is fine) preventing
this.

> Currently, the method of detecting if its audit container ID has been
> set (rather than inherited) was to check its parent's audit container
> ID.

Yeah ... those are two different things.  I've been wondering if we
should introduce a set/inherited flag as simply checking the parent
task's audit container ID isn't quite the same; although it may be
"close enough" that it doesn't matter in practice.  However, I'm
beginning to think this parent/child relationship isn't really
important beyond the inheritance issue ... more on this below.

> The only reason to change this might be if the audit container ID
> were not inheritable, but then we lose the accountability of a task
> spawning another process and being able to leave its child's audit
> container ID unset and unaccountable to any existing container.  I think
> the relationship to the parent is crucial, and if something wants to
> change audit container ID it can, by spawning childrent and leaving a
> trail of container IDs in its parent processes.  (So what if a parent
> dies?)

The audit container ID *must* be inherited, I don't really think
anyone is questioning that.  What I'm wondering about is what we
accomplish by comparing the child's and parent's audit container ID?

I've thought about this a bit more and I think we are making this way
too complicated right now.  We basically have three rules for the
audit container ID which we need to follow:

1. Children inherit their parent's audit container ID; this includes
the magic "unset" audit container ID.
2. You can't change the audit container ID once set.
3. In order to set the audit container ID of a process you must have
CAP_AUDIT_CONTROL.

With that in mind, I think the permission checks would be something like this:

[SIDE NOTE: Audit Container ID in acronym form works out to "acid" ;) ]

  int perm(task, acid)
  {
      if (!task || !valid(acid))
         return -EINVAL;
      if (!capable(CAP_AUDIT_CONTROL))
         return -EPERM;
      if (task->acid != UNSET)
         return -EPERM;
      return 0;
  }

>> ... or does the orchestrator change that?  *Can* the orchestrator
>> change real_parent (I suspect the answer is "no")?
>
> I don't think the orchestrator is able to change real_parent.

I didn't think so either, but I didn't do an exhaustive check.

> I've forgotten why there is a ->parent and ->real_parent and how they can
> change.  One is for the wait signal.  I don't remember the purpose of
> the other.

I know ptrace makes use of real_parent when re-parenting the process
being ptrace'd.

> If the parent dies before the child, the child will be re-parented on
> its grandparent if the parent doesn't hang around zombified, if I
> understand correctly.  If anything, a parent dying would likely further
> restrict the ability to set a task's audit container ID because a parent
> with an identical ID could vanish.

All the more reason to go with the simplified approach above.  I think
the parent/child relationship is a bit of a distraction and a
complexity that isn't important (except for the inheritance of
course).

>> * I think the key is the relationship between current and task, not
>> between task and task->real_parent.  I believe what we really care
>> about is that task is a descendant of current.  We might also want to
>> allow current to change the audit container ID if it holds
>> CAP_AUDIT_CONTROL, regardless of it's relationship with task.
>
> Currently, a process with CAP_AUDIT_CONTROL can set the audit container
> ID of any task that hasn't got children or threads, isn't itself, and
> its audit container ID is inherited or unset.  This was to try to
> prevent games with parents and children scratching each other's backs.
>
> I would feel more comfortable if only descendants were settable, so
> adding that restriction sounds like a good idea to me other than the
> tree-climbing excercise and overhead involved.
>
>> >> >> > +static void audit_log_set_containerid(struct task_struct *task, u64 oldcontainerid,
>> >> >> > +                                     u64 containerid, int rc)
>> >> >> > +{
>> >> >> > +       struct audit_buffer *ab;
>> >> >> > +       uid_t uid;
>> >> >> > +       struct tty_struct *tty;
>> >> >> > +
>> >> >> > +       if (!audit_enabled)
>> >> >> > +               return;
>> >> >> > +
>> >> >> > +       ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONTAINER);
>> >> >> > +       if (!ab)
>> >> >> > +               return;
>> >> >> > +
>> >> >> > +       uid = from_kuid(&init_user_ns, task_uid(current));
>> >> >> > +       tty = audit_get_tty(current);
>> >> >> > +
>> >> >> > +       audit_log_format(ab, "op=set pid=%d uid=%u", task_tgid_nr(current), uid);
>> >> >> > +       audit_log_task_context(ab);
>> >> >> > +       audit_log_format(ab, " auid=%u tty=%s ses=%u opid=%d old-contid=%llu contid=%llu res=%d",
>> >> >> > +                        from_kuid(&init_user_ns, audit_get_loginuid(current)),
>> >> >> > +                        tty ? tty_name(tty) : "(none)", audit_get_sessionid(current),
>> >> >> > +                        task_tgid_nr(task), oldcontainerid, containerid, !rc);
>> >> >> > +
>> >> >> > +       audit_put_tty(tty);
>> >> >> > +       audit_log_end(ab);
>> >> >> > +}
>> >> >> > +
>> >> >> > +/**
>> >> >> > + * audit_set_containerid - set current task's audit_context containerid
>> >> >> > + * @containerid: containerid value
>> >> >> > + *
>> >> >> > + * Returns 0 on success, -EPERM on permission failure.
>> >> >> > + *
>> >> >> > + * Called (set) from fs/proc/base.c::proc_containerid_write().
>> >> >> > + */
>> >> >> > +int audit_set_containerid(struct task_struct *task, u64 containerid)
>> >> >> > +{
>> >> >> > +       u64 oldcontainerid;
>> >> >> > +       int rc;
>> >> >> > +
>> >> >> > +       oldcontainerid = audit_get_containerid(task);
>> >> >> > +
>> >> >> > +       rc = audit_set_containerid_perm(task, containerid);
>> >> >> > +       if (!rc) {
>> >> >> > +               task_lock(task);
>> >> >> > +               task->containerid = containerid;
>> >> >> > +               task_unlock(task);
>> >> >> > +       }
>> >> >> > +
>> >> >> > +       audit_log_set_containerid(task, oldcontainerid, containerid, rc);
>> >> >> > +       return rc;
>> >> >>
>> >> >> Why are audit_set_containerid_perm() and audit_log_containerid()
>> >> >> separate functions?
>> >> >
>> >> > (I assume you mean audit_log_set_containerid()?)
>> >>
>> >> Yep.  My fingers got tired typing in that function name and decided a
>> >> shortcut was necessary.
>> >>
>> >> > It seemed clearer that all the permission checking was in one function
>> >> > and its return code could be used to report the outcome when logging the
>> >> > (attempted) action.  This is the same structure as audit_set_loginuid()
>> >> > and it made sense.
>> >>
>> >> When possible I really like it when the permission checks are in the
>> >> same function as the code which does the work; it's less likely to get
>> >> abused that way (you have to willfully bypass the access checks).  The
>> >> exceptions might be if you wanted to reuse the access control code, or
>> >> insert a modular access mechanism (e.g. LSMs).
>> >
>> > I don't follow how it could be abused.  The return code from the perm
>> > check gates setting the value and is used in the success field in the
>> > log.
>>
>> If the permission checks are in the same function body as the code
>> which does the work you have to either split the function, or rewrite
>> it, if you want to bypass the permission checks.  It may be more of a
>> style issue than an actual safety issue, but the comments about
>> single-use functions in the same scope is the tie breaker.
>
> Perhaps I'm just being quite dense, but I just don't follow what the
> problem is and how you suggest fixing it.  A bunch of gotos to a label
> such as "out:" to log the refused action?  That seems messy and
> unstructured.

Fold audit_set_containerid_perm() and audit_log_set_containerid() into
their only caller, audit_set_containerid().
Richard Guy Briggs May 6, 2018, 4:51 p.m.
On 2018-04-18 19:47, Paul Moore wrote:
> On Fri, Mar 16, 2018 at 5:00 AM, Richard Guy Briggs <rgb@redhat.com> wrote:
> > Implement the proc fs write to set the audit container ID of a process,
> > emitting an AUDIT_CONTAINER record to document the event.
> >
> > This is a write from the container orchestrator task to a proc entry of
> > the form /proc/PID/containerid where PID is the process ID of the newly
> > created task that is to become the first task in a container, or an
> > additional task added to a container.
> >
> > The write expects up to a u64 value (unset: 18446744073709551615).
> >
> > This will produce a record such as this:
> > type=CONTAINER msg=audit(1519903238.968:261): op=set pid=596 uid=0 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 auid=0 tty=pts0 ses=1 opid=596 old-contid=18446744073709551615 contid=123455 res=0
> >
> > The "op" field indicates an initial set.  The "pid" to "ses" fields are
> > the orchestrator while the "opid" field is the object's PID, the process
> > being "contained".  Old and new container ID values are given in the
> > "contid" fields, while res indicates its success.
> >
> > It is not permitted to self-set, unset or re-set the container ID.  A
> > child inherits its parent's container ID, but then can be set only once
> > after.
> >
> > See: https://github.com/linux-audit/audit-kernel/issues/32
> >
> > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > ---
> >  fs/proc/base.c             | 37 ++++++++++++++++++++
> >  include/linux/audit.h      | 16 +++++++++
> >  include/linux/init_task.h  |  4 ++-
> >  include/linux/sched.h      |  1 +
> >  include/uapi/linux/audit.h |  2 ++
> >  kernel/auditsc.c           | 84 ++++++++++++++++++++++++++++++++++++++++++++++
> >  6 files changed, 143 insertions(+), 1 deletion(-)
> >
> > diff --git a/fs/proc/base.c b/fs/proc/base.c
> > index 60316b5..6ce4fbe 100644
> > --- a/fs/proc/base.c
> > +++ b/fs/proc/base.c
> > @@ -1299,6 +1299,41 @@ static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
> >         .read           = proc_sessionid_read,
> >         .llseek         = generic_file_llseek,
> >  };
> > +
> > +static ssize_t proc_containerid_write(struct file *file, const char __user *buf,
> > +                                  size_t count, loff_t *ppos)
> > +{
> > +       struct inode *inode = file_inode(file);
> > +       u64 containerid;
> > +       int rv;
> > +       struct task_struct *task = get_proc_task(inode);
> > +
> > +       if (!task)
> > +               return -ESRCH;
> > +       if (*ppos != 0) {
> > +               /* No partial writes. */
> > +               put_task_struct(task);
> > +               return -EINVAL;
> > +       }
> > +
> > +       rv = kstrtou64_from_user(buf, count, 10, &containerid);
> > +       if (rv < 0) {
> > +               put_task_struct(task);
> > +               return rv;
> > +       }
> > +
> > +       rv = audit_set_containerid(task, containerid);
> > +       put_task_struct(task);
> > +       if (rv < 0)
> > +               return rv;
> > +       return count;
> > +}
> > +
> > +static const struct file_operations proc_containerid_operations = {
> > +       .write          = proc_containerid_write,
> > +       .llseek         = generic_file_llseek,
> > +};
> > +
> >  #endif
> >
> >  #ifdef CONFIG_FAULT_INJECTION
> > @@ -2961,6 +2996,7 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
> >  #ifdef CONFIG_AUDITSYSCALL
> >         REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
> >         REG("sessionid",  S_IRUGO, proc_sessionid_operations),
> > +       REG("containerid", S_IWUSR, proc_containerid_operations),
> >  #endif
> >  #ifdef CONFIG_FAULT_INJECTION
> >         REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
> > @@ -3355,6 +3391,7 @@ static int proc_tid_comm_permission(struct inode *inode, int mask)
> >  #ifdef CONFIG_AUDITSYSCALL
> >         REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
> >         REG("sessionid",  S_IRUGO, proc_sessionid_operations),
> > +       REG("containerid", S_IWUSR, proc_containerid_operations),
> >  #endif
> >  #ifdef CONFIG_FAULT_INJECTION
> >         REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),

...

> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index d258826..1b82191 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -796,6 +796,7 @@ struct task_struct {
> >  #ifdef CONFIG_AUDITSYSCALL
> >         kuid_t                          loginuid;
> >         unsigned int                    sessionid;
> > +       u64                             containerid;
> 
> This one line addition to the task_struct scares me the most of
> anything in this patchset.  Why?  It's a field named "containerid" in
> a perhaps one of the most widely used core kernel structures; the
> possibilities for abuse are endless, and it's foolish to think we
> would ever be able to adequately police this.
> 
> Unfortunately, we can't add the field to audit_context as things
> currently stand because we don't always allocate an audit_context,
> it's dependent on the system's configuration, and we need to track the
> audit container ID for a given process, regardless of the audit
> configuration.  Pretty much the same reason why loginuid and sessionid
> are located directly in task_struct now.  As I stressed during the
> design phase, I really want to keep this as an *audit* container ID
> and not a general purpose kernel wide container ID.  If the kernel
> ever grows a general purpose container ID token, I'll be the first in
> line to convert the audit code, but I don't want audit to be that
> general purpose mechanism ... audit is hated enough as-is ;)
> 
> I think the right solution to this is to create another new struct,
> audit_task_info (or similar, the name really isn't that important),
> which would be stored as a pointer in task_struct and would replace
> the audit_context pointer, loginuid, sessionid, and the newly proposed
> containerid.  The new audit_task_info would always be allocated in the
> audit_alloc() function (please use kmem_cache), and the audit_context
> pointer included inside would continue to be allocated based on the
> existing conditions.  By keeping audit_task_info as a pointer inside
> task_struct we could hide the structure definition inside
> kernel/audit*.c and make it much more difficult for other subsystems
> to abuse it.[1]
> 
>   struct audit_task_info {
>     kuid_t loginuid;
>     unsigned int sessionid;
>     u64 containerid;
>     struct audit_context *ctx;
>   }
> 
> Actually, we might even want to consider storing audit_context in
> audit_task_info (no pointer), or making it a zero length array
> (ctx[0]) and going with a variable sized allocation of audit_task_info
> ... but all that could be done as a follow up optimization once we get
> the basic idea sorted.

I tried statically allocating struct audit_task_info (with a pointer to
struct audit_context) in addition to dynamically allocating struct
audit_task_info due to a bug I'd introduced while dynamically allocating
audit_task_info, so I now have proof-of-concepts for working static and
almost working dynamic allocated struct audit_task_info.

Statically allocating it required a new header file, so I'm not that
crazy about it, but it proved it works.

Dynamically allocating it isn't quite as clean as was hoped since
init/init_task.c still needs initializaiton values for loginuid and
sessionid which could be supplied by a statically allocated struct
audit_task_info and still needs to know the internals of that struct to
do so.  Dynamic allocation is also more disruptive initially, but in the
long run will be more stable to the rest of the kernel.

I'm not crazy about the idea of dynamically (or even statically)
allocating struct audit_task_info which includes allocated space for
struct audit_context since the latter is far larger than the former.

> [1] If for some reason allocating audit_task_info becomes too much
> overhead to bear (somewhat doubtful since we would only do it at task
> creation), we could do some ugly tricks to directly include an
> audit_task_struct chunk in task_struct but I'd like to avoid that if
> possible (and I think we can).

On allocation, I don't see too much of a problem.  When calling
audit_free() if there is no audit context it is pretty lightweight, but
gets heavier if we eliminate the inline audit_free() and rename
__audit_free() back to audit_free().  Having struct audit_task_info
directly in struct task_struct would be faster and also allow defaults
to be set in init/init_task.c (which has recently been populated from
include/linux/init_task.h).  I'm not sure this is enough of a reason to
avoid a pointer from task_struct.

(As an aside, converting allocation of audit_context could also benefit
from kmem_cache... and maybe even struct audit_names)

> >  #endif
> >         struct seccomp                  seccomp;
> 
> ...

> paul moore

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Steve Grubb May 17, 2018, 9 p.m.
On Fri, 16 Mar 2018 05:00:28 -0400
Richard Guy Briggs <rgb@redhat.com> wrote:

> Implement the proc fs write to set the audit container ID of a
> process, emitting an AUDIT_CONTAINER record to document the event.
> 
> This is a write from the container orchestrator task to a proc entry
> of the form /proc/PID/containerid where PID is the process ID of the
> newly created task that is to become the first task in a container,
> or an additional task added to a container.
> 
> The write expects up to a u64 value (unset: 18446744073709551615).
> 
> This will produce a record such as this:
> type=CONTAINER msg=audit(1519903238.968:261): op=set pid=596 uid=0
> subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 auid=0
> tty=pts0 ses=1 opid=596 old-contid=18446744073709551615 contid=123455
> res=0

The was one thing I was wondering about. Currently when we set the
loginuid, the record is AUDIT_LOGINUID. The corollary is that when we
set the container id, the event should be AUDIT_CONTAINERID or
AUDIT_CONTAINER_ID.

During syscall events, the path info is returned in a a record simply
called AUDIT_PATH, cwd info is returned in AUDIT_CWD. So, rather than
calling the record that gets attached to everything
AUDIT_CONTAINER_INFO, how about simply AUDIT_CONTAINER.


> The "op" field indicates an initial set.  The "pid" to "ses" fields
> are the orchestrator while the "opid" field is the object's PID, the
> process being "contained".  Old and new container ID values are given
> in the "contid" fields, while res indicates its success.
> 
> It is not permitted to self-set, unset or re-set the container ID.  A
> child inherits its parent's container ID, but then can be set only
> once after.
> 
> See: https://github.com/linux-audit/audit-kernel/issues/32
> 
> Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> ---
>  fs/proc/base.c             | 37 ++++++++++++++++++++
>  include/linux/audit.h      | 16 +++++++++
>  include/linux/init_task.h  |  4 ++-
>  include/linux/sched.h      |  1 +
>  include/uapi/linux/audit.h |  2 ++
>  kernel/auditsc.c           | 84
> ++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 143
> insertions(+), 1 deletion(-)
> 
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 60316b5..6ce4fbe 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -1299,6 +1299,41 @@ static ssize_t proc_sessionid_read(struct file
> * file, char __user * buf, .read		= proc_sessionid_read,
>  	.llseek		= generic_file_llseek,
>  };
> +
> +static ssize_t proc_containerid_write(struct file *file, const char
> __user *buf,
> +				   size_t count, loff_t *ppos)
> +{
> +	struct inode *inode = file_inode(file);
> +	u64 containerid;
> +	int rv;
> +	struct task_struct *task = get_proc_task(inode);
> +
> +	if (!task)
> +		return -ESRCH;
> +	if (*ppos != 0) {
> +		/* No partial writes. */
> +		put_task_struct(task);
> +		return -EINVAL;
> +	}
> +
> +	rv = kstrtou64_from_user(buf, count, 10, &containerid);
> +	if (rv < 0) {
> +		put_task_struct(task);
> +		return rv;
> +	}
> +
> +	rv = audit_set_containerid(task, containerid);
> +	put_task_struct(task);
> +	if (rv < 0)
> +		return rv;
> +	return count;
> +}
> +
> +static const struct file_operations proc_containerid_operations = {
> +	.write		= proc_containerid_write,
> +	.llseek		= generic_file_llseek,
> +};
> +
>  #endif
>  
>  #ifdef CONFIG_FAULT_INJECTION
> @@ -2961,6 +2996,7 @@ static int proc_pid_patch_state(struct seq_file
> *m, struct pid_namespace *ns, #ifdef CONFIG_AUDITSYSCALL
>  	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
>  	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
> +	REG("containerid", S_IWUSR, proc_containerid_operations),
>  #endif
>  #ifdef CONFIG_FAULT_INJECTION
>  	REG("make-it-fail", S_IRUGO|S_IWUSR,
> proc_fault_inject_operations), @@ -3355,6 +3391,7 @@ static int
> proc_tid_comm_permission(struct inode *inode, int mask) #ifdef
> CONFIG_AUDITSYSCALL REG("loginuid",  S_IWUSR|S_IRUGO,
> proc_loginuid_operations), REG("sessionid",  S_IRUGO,
> proc_sessionid_operations),
> +	REG("containerid", S_IWUSR, proc_containerid_operations),
>  #endif
>  #ifdef CONFIG_FAULT_INJECTION
>  	REG("make-it-fail", S_IRUGO|S_IWUSR,
> proc_fault_inject_operations), diff --git a/include/linux/audit.h
> b/include/linux/audit.h index af410d9..fe4ba3f 100644
> --- a/include/linux/audit.h
> +++ b/include/linux/audit.h
> @@ -29,6 +29,7 @@
>  
>  #define AUDIT_INO_UNSET ((unsigned long)-1)
>  #define AUDIT_DEV_UNSET ((dev_t)-1)
> +#define INVALID_CID AUDIT_CID_UNSET
>  
>  struct audit_sig_info {
>  	uid_t		uid;
> @@ -321,6 +322,7 @@ static inline void audit_ptrace(struct
> task_struct *t) extern int auditsc_get_stamp(struct audit_context
> *ctx, struct timespec64 *t, unsigned int *serial);
>  extern int audit_set_loginuid(kuid_t loginuid);
> +extern int audit_set_containerid(struct task_struct *tsk, u64
> containerid); 
>  static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
>  {
> @@ -332,6 +334,11 @@ static inline unsigned int
> audit_get_sessionid(struct task_struct *tsk) return tsk->sessionid;
>  }
>  
> +static inline u64 audit_get_containerid(struct task_struct *tsk)
> +{
> +	return tsk->containerid;
> +}
> +
>  extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
>  extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid,
> gid_t gid, umode_t mode); extern void __audit_bprm(struct
> linux_binprm *bprm); @@ -517,6 +524,10 @@ static inline unsigned int
> audit_get_sessionid(struct task_struct *tsk) {
>  	return -1;
>  }
> +static inline kuid_t audit_get_containerid(struct task_struct *tsk)
> +{
> +	return INVALID_CID;
> +}
>  static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
>  { }
>  static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t
> uid, @@ -581,6 +592,11 @@ static inline bool
> audit_loginuid_set(struct task_struct *tsk) return
> uid_valid(audit_get_loginuid(tsk)); }
>  
> +static inline bool audit_containerid_set(struct task_struct *tsk)
> +{
> +	return audit_get_containerid(tsk) != INVALID_CID;
> +}
> +
>  static inline void audit_log_string(struct audit_buffer *ab, const
> char *buf) {
>  	audit_log_n_string(ab, buf, strlen(buf));
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index 6a53262..046bd0a 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -18,6 +18,7 @@
>  #include <linux/sched/rt.h>
>  #include <linux/livepatch.h>
>  #include <linux/mm_types.h>
> +#include <linux/audit.h>
>  
>  #include <asm/thread_info.h>
>  
> @@ -120,7 +121,8 @@
>  #ifdef CONFIG_AUDITSYSCALL
>  #define INIT_IDS \
>  	.loginuid = INVALID_UID, \
> -	.sessionid = (unsigned int)-1,
> +	.sessionid = (unsigned int)-1, \
> +	.containerid = INVALID_CID,
>  #else
>  #define INIT_IDS
>  #endif
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index d258826..1b82191 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -796,6 +796,7 @@ struct task_struct {
>  #ifdef CONFIG_AUDITSYSCALL
>  	kuid_t				loginuid;
>  	unsigned int			sessionid;
> +	u64				containerid;
>  #endif
>  	struct seccomp			seccomp;
>  
> diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
> index 4e61a9e..921a71f 100644
> --- a/include/uapi/linux/audit.h
> +++ b/include/uapi/linux/audit.h
> @@ -71,6 +71,7 @@
>  #define AUDIT_TTY_SET		1017	/* Set TTY auditing
> status */ #define AUDIT_SET_FEATURE	1018	/* Turn an
> audit feature on or off */ #define AUDIT_GET_FEATURE
> 1019	/* Get which features are enabled */ +#define
> AUDIT_CONTAINER		1020	/* Define the container id
> and information */ #define AUDIT_FIRST_USER_MSG	1100	/*
> Userspace messages mostly uninteresting to kernel */ #define
> AUDIT_USER_AVC		1107	/* We filter this
> differently */ @@ -465,6 +466,7 @@ struct audit_tty_status { };
>  
>  #define AUDIT_UID_UNSET (unsigned int)-1
> +#define AUDIT_CID_UNSET ((u64)-1)
>  
>  /* audit_rule_data supports filter rules with both integer and string
>   * fields.  It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and
> diff --git a/kernel/auditsc.c b/kernel/auditsc.c
> index 4e0a4ac..29c8482 100644
> --- a/kernel/auditsc.c
> +++ b/kernel/auditsc.c
> @@ -2073,6 +2073,90 @@ int audit_set_loginuid(kuid_t loginuid)
>  	return rc;
>  }
>  
> +static int audit_set_containerid_perm(struct task_struct *task, u64
> containerid) +{
> +	struct task_struct *parent;
> +	u64 pcontainerid, ccontainerid;
> +
> +	/* Don't allow to set our own containerid */
> +	if (current == task)
> +		return -EPERM;
> +	/* Don't allow the containerid to be unset */
> +	if (!cid_valid(containerid))
> +		return -EINVAL;
> +	/* if we don't have caps, reject */
> +	if (!capable(CAP_AUDIT_CONTROL))
> +		return -EPERM;
> +	/* if containerid is unset, allow */
> +	if (!audit_containerid_set(task))
> +		return 0;
> +	/* it is already set, and not inherited from the parent,
> reject */
> +	ccontainerid = audit_get_containerid(task);
> +	rcu_read_lock();
> +	parent = rcu_dereference(task->real_parent);
> +	rcu_read_unlock();
> +	task_lock(parent);
> +	pcontainerid = audit_get_containerid(parent);
> +	task_unlock(parent);
> +	if (ccontainerid != pcontainerid)
> +		return -EPERM;
> +	return 0;
> +}
> +
> +static void audit_log_set_containerid(struct task_struct *task, u64
> oldcontainerid,
> +				      u64 containerid, int rc)
> +{
> +	struct audit_buffer *ab;
> +	uid_t uid;
> +	struct tty_struct *tty;
> +
> +	if (!audit_enabled)
> +		return;
> +
> +	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONTAINER);
> +	if (!ab)
> +		return;
> +
> +	uid = from_kuid(&init_user_ns, task_uid(current));
> +	tty = audit_get_tty(current);
> +
> +	audit_log_format(ab, "op=set pid=%d uid=%u",
> task_tgid_nr(current), uid);
> +	audit_log_task_context(ab);
> +	audit_log_format(ab, " auid=%u tty=%s ses=%u opid=%d
> old-contid=%llu contid=%llu res=%d",

The preferred ordering would be: op, opid, old-contid, contid, pid, uid,
tty, ses, subj, comm, exe, res. This groups the searchable fields
together using the most common ordering so that parsing is simple.

Thanks,
-Steve


> +			 from_kuid(&init_user_ns,
> audit_get_loginuid(current)),
> +			 tty ? tty_name(tty) : "(none)",
> audit_get_sessionid(current),
> +			 task_tgid_nr(task), oldcontainerid,
> containerid, !rc); +
> +	audit_put_tty(tty);
> +	audit_log_end(ab);
> +}
> +
> +/**
> + * audit_set_containerid - set current task's audit_context
> containerid
> + * @containerid: containerid value
> + *
> + * Returns 0 on success, -EPERM on permission failure.
> + *
> + * Called (set) from fs/proc/base.c::proc_containerid_write().
> + */
> +int audit_set_containerid(struct task_struct *task, u64 containerid)
> +{
> +	u64 oldcontainerid;
> +	int rc;
> +
> +	oldcontainerid = audit_get_containerid(task);
> +
> +	rc = audit_set_containerid_perm(task, containerid);
> +	if (!rc) {
> +		task_lock(task);
> +		task->containerid = containerid;
> +		task_unlock(task);
> +	}
> +
> +	audit_log_set_containerid(task, oldcontainerid, containerid,
> rc);
> +	return rc;
> +}
> +
>  /**
>   * __audit_mq_open - record audit data for a POSIX MQ open
>   * @oflag: open flag
Richard Guy Briggs May 17, 2018, 9:56 p.m.
On 2018-05-17 17:00, Steve Grubb wrote:
> On Fri, 16 Mar 2018 05:00:28 -0400
> Richard Guy Briggs <rgb@redhat.com> wrote:
> 
> > Implement the proc fs write to set the audit container ID of a
> > process, emitting an AUDIT_CONTAINER record to document the event.
> > 
> > This is a write from the container orchestrator task to a proc entry
> > of the form /proc/PID/containerid where PID is the process ID of the
> > newly created task that is to become the first task in a container,
> > or an additional task added to a container.
> > 
> > The write expects up to a u64 value (unset: 18446744073709551615).
> > 
> > This will produce a record such as this:
> > type=CONTAINER msg=audit(1519903238.968:261): op=set pid=596 uid=0
> > subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 auid=0
> > tty=pts0 ses=1 opid=596 old-contid=18446744073709551615 contid=123455
> > res=0
> 
> The was one thing I was wondering about. Currently when we set the
> loginuid, the record is AUDIT_LOGINUID. The corollary is that when we
> set the container id, the event should be AUDIT_CONTAINERID or
> AUDIT_CONTAINER_ID.

The record type is actually AUDIT_LOGIN.  The field type is
AUDIT_LOGINUID.  Given that correction, I think we're fine and could
potentially violently agree.  The existing naming is consistent.

> During syscall events, the path info is returned in a a record simply
> called AUDIT_PATH, cwd info is returned in AUDIT_CWD. So, rather than
> calling the record that gets attached to everything
> AUDIT_CONTAINER_INFO, how about simply AUDIT_CONTAINER.

Considering the container initiation record is different than the record
to document the container involved in an otherwise normal syscall, we
need two names.  I don't have a strong opinion what they are.

I'd prefer AUDIT_CONTAINER and AUDIT_CONTAINER_INFO so that the two are
different enough to be visually distinct while leaving
AUDIT_CONTAINERID for the field type in patch 4 ("audit: add containerid
filtering")

> > The "op" field indicates an initial set.  The "pid" to "ses" fields
> > are the orchestrator while the "opid" field is the object's PID, the
> > process being "contained".  Old and new container ID values are given
> > in the "contid" fields, while res indicates its success.
> > 
> > It is not permitted to self-set, unset or re-set the container ID.  A
> > child inherits its parent's container ID, but then can be set only
> > once after.
> > 
> > See: https://github.com/linux-audit/audit-kernel/issues/32
> > 
> > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > ---
> >  fs/proc/base.c             | 37 ++++++++++++++++++++
> >  include/linux/audit.h      | 16 +++++++++
> >  include/linux/init_task.h  |  4 ++-
> >  include/linux/sched.h      |  1 +
> >  include/uapi/linux/audit.h |  2 ++
> >  kernel/auditsc.c           | 84
> > ++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 143
> > insertions(+), 1 deletion(-)
> > 
> > diff --git a/fs/proc/base.c b/fs/proc/base.c
> > index 60316b5..6ce4fbe 100644
> > --- a/fs/proc/base.c
> > +++ b/fs/proc/base.c
> > @@ -1299,6 +1299,41 @@ static ssize_t proc_sessionid_read(struct file
> > * file, char __user * buf, .read		= proc_sessionid_read,
> >  	.llseek		= generic_file_llseek,
> >  };
> > +
> > +static ssize_t proc_containerid_write(struct file *file, const char
> > __user *buf,
> > +				   size_t count, loff_t *ppos)
> > +{
> > +	struct inode *inode = file_inode(file);
> > +	u64 containerid;
> > +	int rv;
> > +	struct task_struct *task = get_proc_task(inode);
> > +
> > +	if (!task)
> > +		return -ESRCH;
> > +	if (*ppos != 0) {
> > +		/* No partial writes. */
> > +		put_task_struct(task);
> > +		return -EINVAL;
> > +	}
> > +
> > +	rv = kstrtou64_from_user(buf, count, 10, &containerid);
> > +	if (rv < 0) {
> > +		put_task_struct(task);
> > +		return rv;
> > +	}
> > +
> > +	rv = audit_set_containerid(task, containerid);
> > +	put_task_struct(task);
> > +	if (rv < 0)
> > +		return rv;
> > +	return count;
> > +}
> > +
> > +static const struct file_operations proc_containerid_operations = {
> > +	.write		= proc_containerid_write,
> > +	.llseek		= generic_file_llseek,
> > +};
> > +
> >  #endif
> >  
> >  #ifdef CONFIG_FAULT_INJECTION
> > @@ -2961,6 +2996,7 @@ static int proc_pid_patch_state(struct seq_file
> > *m, struct pid_namespace *ns, #ifdef CONFIG_AUDITSYSCALL
> >  	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
> >  	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
> > +	REG("containerid", S_IWUSR, proc_containerid_operations),
> >  #endif
> >  #ifdef CONFIG_FAULT_INJECTION
> >  	REG("make-it-fail", S_IRUGO|S_IWUSR,
> > proc_fault_inject_operations), @@ -3355,6 +3391,7 @@ static int
> > proc_tid_comm_permission(struct inode *inode, int mask) #ifdef
> > CONFIG_AUDITSYSCALL REG("loginuid",  S_IWUSR|S_IRUGO,
> > proc_loginuid_operations), REG("sessionid",  S_IRUGO,
> > proc_sessionid_operations),
> > +	REG("containerid", S_IWUSR, proc_containerid_operations),
> >  #endif
> >  #ifdef CONFIG_FAULT_INJECTION
> >  	REG("make-it-fail", S_IRUGO|S_IWUSR,
> > proc_fault_inject_operations), diff --git a/include/linux/audit.h
> > b/include/linux/audit.h index af410d9..fe4ba3f 100644
> > --- a/include/linux/audit.h
> > +++ b/include/linux/audit.h
> > @@ -29,6 +29,7 @@
> >  
> >  #define AUDIT_INO_UNSET ((unsigned long)-1)
> >  #define AUDIT_DEV_UNSET ((dev_t)-1)
> > +#define INVALID_CID AUDIT_CID_UNSET
> >  
> >  struct audit_sig_info {
> >  	uid_t		uid;
> > @@ -321,6 +322,7 @@ static inline void audit_ptrace(struct
> > task_struct *t) extern int auditsc_get_stamp(struct audit_context
> > *ctx, struct timespec64 *t, unsigned int *serial);
> >  extern int audit_set_loginuid(kuid_t loginuid);
> > +extern int audit_set_containerid(struct task_struct *tsk, u64
> > containerid); 
> >  static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
> >  {
> > @@ -332,6 +334,11 @@ static inline unsigned int
> > audit_get_sessionid(struct task_struct *tsk) return tsk->sessionid;
> >  }
> >  
> > +static inline u64 audit_get_containerid(struct task_struct *tsk)
> > +{
> > +	return tsk->containerid;
> > +}
> > +
> >  extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
> >  extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid,
> > gid_t gid, umode_t mode); extern void __audit_bprm(struct
> > linux_binprm *bprm); @@ -517,6 +524,10 @@ static inline unsigned int
> > audit_get_sessionid(struct task_struct *tsk) {
> >  	return -1;
> >  }
> > +static inline kuid_t audit_get_containerid(struct task_struct *tsk)
> > +{
> > +	return INVALID_CID;
> > +}
> >  static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
> >  { }
> >  static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t
> > uid, @@ -581,6 +592,11 @@ static inline bool
> > audit_loginuid_set(struct task_struct *tsk) return
> > uid_valid(audit_get_loginuid(tsk)); }
> >  
> > +static inline bool audit_containerid_set(struct task_struct *tsk)
> > +{
> > +	return audit_get_containerid(tsk) != INVALID_CID;
> > +}
> > +
> >  static inline void audit_log_string(struct audit_buffer *ab, const
> > char *buf) {
> >  	audit_log_n_string(ab, buf, strlen(buf));
> > diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> > index 6a53262..046bd0a 100644
> > --- a/include/linux/init_task.h
> > +++ b/include/linux/init_task.h
> > @@ -18,6 +18,7 @@
> >  #include <linux/sched/rt.h>
> >  #include <linux/livepatch.h>
> >  #include <linux/mm_types.h>
> > +#include <linux/audit.h>
> >  
> >  #include <asm/thread_info.h>
> >  
> > @@ -120,7 +121,8 @@
> >  #ifdef CONFIG_AUDITSYSCALL
> >  #define INIT_IDS \
> >  	.loginuid = INVALID_UID, \
> > -	.sessionid = (unsigned int)-1,
> > +	.sessionid = (unsigned int)-1, \
> > +	.containerid = INVALID_CID,
> >  #else
> >  #define INIT_IDS
> >  #endif
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index d258826..1b82191 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -796,6 +796,7 @@ struct task_struct {
> >  #ifdef CONFIG_AUDITSYSCALL
> >  	kuid_t				loginuid;
> >  	unsigned int			sessionid;
> > +	u64				containerid;
> >  #endif
> >  	struct seccomp			seccomp;
> >  
> > diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
> > index 4e61a9e..921a71f 100644
> > --- a/include/uapi/linux/audit.h
> > +++ b/include/uapi/linux/audit.h
> > @@ -71,6 +71,7 @@
> >  #define AUDIT_TTY_SET		1017	/* Set TTY auditing
> > status */ #define AUDIT_SET_FEATURE	1018	/* Turn an
> > audit feature on or off */ #define AUDIT_GET_FEATURE
> > 1019	/* Get which features are enabled */ +#define
> > AUDIT_CONTAINER		1020	/* Define the container id
> > and information */ #define AUDIT_FIRST_USER_MSG	1100	/*
> > Userspace messages mostly uninteresting to kernel */ #define
> > AUDIT_USER_AVC		1107	/* We filter this
> > differently */ @@ -465,6 +466,7 @@ struct audit_tty_status { };
> >  
> >  #define AUDIT_UID_UNSET (unsigned int)-1
> > +#define AUDIT_CID_UNSET ((u64)-1)
> >  
> >  /* audit_rule_data supports filter rules with both integer and string
> >   * fields.  It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and
> > diff --git a/kernel/auditsc.c b/kernel/auditsc.c
> > index 4e0a4ac..29c8482 100644
> > --- a/kernel/auditsc.c
> > +++ b/kernel/auditsc.c
> > @@ -2073,6 +2073,90 @@ int audit_set_loginuid(kuid_t loginuid)
> >  	return rc;
> >  }
> >  
> > +static int audit_set_containerid_perm(struct task_struct *task, u64
> > containerid) +{
> > +	struct task_struct *parent;
> > +	u64 pcontainerid, ccontainerid;
> > +
> > +	/* Don't allow to set our own containerid */
> > +	if (current == task)
> > +		return -EPERM;
> > +	/* Don't allow the containerid to be unset */
> > +	if (!cid_valid(containerid))
> > +		return -EINVAL;
> > +	/* if we don't have caps, reject */
> > +	if (!capable(CAP_AUDIT_CONTROL))
> > +		return -EPERM;
> > +	/* if containerid is unset, allow */
> > +	if (!audit_containerid_set(task))
> > +		return 0;
> > +	/* it is already set, and not inherited from the parent,
> > reject */
> > +	ccontainerid = audit_get_containerid(task);
> > +	rcu_read_lock();
> > +	parent = rcu_dereference(task->real_parent);
> > +	rcu_read_unlock();
> > +	task_lock(parent);
> > +	pcontainerid = audit_get_containerid(parent);
> > +	task_unlock(parent);
> > +	if (ccontainerid != pcontainerid)
> > +		return -EPERM;
> > +	return 0;
> > +}
> > +
> > +static void audit_log_set_containerid(struct task_struct *task, u64
> > oldcontainerid,
> > +				      u64 containerid, int rc)
> > +{
> > +	struct audit_buffer *ab;
> > +	uid_t uid;
> > +	struct tty_struct *tty;
> > +
> > +	if (!audit_enabled)
> > +		return;
> > +
> > +	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONTAINER);
> > +	if (!ab)
> > +		return;
> > +
> > +	uid = from_kuid(&init_user_ns, task_uid(current));
> > +	tty = audit_get_tty(current);
> > +
> > +	audit_log_format(ab, "op=set pid=%d uid=%u",
> > task_tgid_nr(current), uid);
> > +	audit_log_task_context(ab);
> > +	audit_log_format(ab, " auid=%u tty=%s ses=%u opid=%d
> > old-contid=%llu contid=%llu res=%d",
> 
> The preferred ordering would be: op, opid, old-contid, contid, pid, uid,
> tty, ses, subj, comm, exe, res. This groups the searchable fields
> together using the most common ordering so that parsing is simple.

There has been a suggestion to make this a syscall-connected record, and
if that is the case, we'd simply drop all the fields that would be
duplicated by the syscall record.  Otherwise, I'll use your suggested
order.

As you may recall this suggestion was also made for the AUDIT_LOGIN
record.

> Thanks,
> -Steve
> 
> > +			 from_kuid(&init_user_ns,
> > audit_get_loginuid(current)),
> > +			 tty ? tty_name(tty) : "(none)",
> > audit_get_sessionid(current),
> > +			 task_tgid_nr(task), oldcontainerid,
> > containerid, !rc); +
> > +	audit_put_tty(tty);
> > +	audit_log_end(ab);
> > +}
> > +
> > +/**
> > + * audit_set_containerid - set current task's audit_context
> > containerid
> > + * @containerid: containerid value
> > + *
> > + * Returns 0 on success, -EPERM on permission failure.
> > + *
> > + * Called (set) from fs/proc/base.c::proc_containerid_write().
> > + */
> > +int audit_set_containerid(struct task_struct *task, u64 containerid)
> > +{
> > +	u64 oldcontainerid;
> > +	int rc;
> > +
> > +	oldcontainerid = audit_get_containerid(task);
> > +
> > +	rc = audit_set_containerid_perm(task, containerid);
> > +	if (!rc) {
> > +		task_lock(task);
> > +		task->containerid = containerid;
> > +		task_unlock(task);
> > +	}
> > +
> > +	audit_log_set_containerid(task, oldcontainerid, containerid,
> > rc);
> > +	return rc;
> > +}
> > +
> >  /**
> >   * __audit_mq_open - record audit data for a POSIX MQ open
> >   * @oflag: open flag
> 

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Steve Grubb May 18, 2018, 1:56 p.m.
On Thu, 17 May 2018 17:56:00 -0400
Richard Guy Briggs <rgb@redhat.com> wrote:

> > During syscall events, the path info is returned in a a record
> > simply called AUDIT_PATH, cwd info is returned in AUDIT_CWD. So,
> > rather than calling the record that gets attached to everything
> > AUDIT_CONTAINER_INFO, how about simply AUDIT_CONTAINER.  
> 
> Considering the container initiation record is different than the
> record to document the container involved in an otherwise normal
> syscall, we need two names.  I don't have a strong opinion what they
> are.
> 
> I'd prefer AUDIT_CONTAIN and AUDIT_CONTAINER_INFO so that the two
> are different enough to be visually distinct while leaving
> AUDIT_CONTAINERID for the field type in patch 4 ("audit: add
> containerid filtering")

How about AUDIT_CONTAINER for the auxiliary record? The one that starts
the container, I don't have a strong opinion on. Could be
AUDIT_CONTAINER_INIT, AUDIT_CONTAINER_START, AUDIT_CONTAINERID,
AUDIT_CONTAINER_ID, or something else. The API call that sets the ID
for filtering could be AUDIT_CID or AUDIT_CONTID if that helps decide
what the initial event might be. Normally, it should match the field
being filtered.

Best Regards,
-Steve
Richard Guy Briggs May 18, 2018, 3:21 p.m.
On 2018-05-18 09:56, Steve Grubb wrote:
> On Thu, 17 May 2018 17:56:00 -0400
> Richard Guy Briggs <rgb@redhat.com> wrote:
> 
> > > During syscall events, the path info is returned in a a record
> > > simply called AUDIT_PATH, cwd info is returned in AUDIT_CWD. So,
> > > rather than calling the record that gets attached to everything
> > > AUDIT_CONTAINER_INFO, how about simply AUDIT_CONTAINER.  
> > 
> > Considering the container initiation record is different than the
> > record to document the container involved in an otherwise normal
> > syscall, we need two names.  I don't have a strong opinion what they
> > are.
> > 
> > I'd prefer AUDIT_CONTAIN and AUDIT_CONTAINER_INFO so that the two
> > are different enough to be visually distinct while leaving
> > AUDIT_CONTAINERID for the field type in patch 4 ("audit: add
> > containerid filtering")

(Sorry, I had intended AUDIT_CONTAINER for the first in that paragraph
above.)

> How about AUDIT_CONTAINER for the auxiliary record? The one that starts
> the container, I don't have a strong opinion on. Could be
> AUDIT_CONTAINER_INIT, AUDIT_CONTAINER_START, AUDIT_CONTAINERID,
> AUDIT_CONTAINER_ID, or something else. The API call that sets the ID
> for filtering could be AUDIT_CID or AUDIT_CONTID if that helps decide
> what the initial event might be. Normally, it should match the field
> being filtered.

Ok, I had shortened the record field name to "contid=" to be unique
enough while not using too much netlink bandwidth.  I could have used
"cid=" but that could be unobvious or ambiguous.  I didn't want to use
the full "containerid=" due to that.  I suppose I could change the
field name macro to AUDIT_CONTID.

For the one that starts the container, I'd prefer to leave the name a
bit more general than "_INIT", "_START", so maybe I'll swap them around
and use AUDIT_CONTAINER_INFO for the startup record, and use
AUDIT_CONTAINER for the syscall auxiliary record.

Does that work?

> -Steve

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Steve Grubb May 18, 2018, 3:38 p.m.
On Fri, 18 May 2018 11:21:06 -0400
Richard Guy Briggs <rgb@redhat.com> wrote:

> On 2018-05-18 09:56, Steve Grubb wrote:
> > On Thu, 17 May 2018 17:56:00 -0400
> > Richard Guy Briggs <rgb@redhat.com> wrote:
> >   
> > > > During syscall events, the path info is returned in a a record
> > > > simply called AUDIT_PATH, cwd info is returned in AUDIT_CWD. So,
> > > > rather than calling the record that gets attached to everything
> > > > AUDIT_CONTAINER_INFO, how about simply AUDIT_CONTAINER.    
> > > 
> > > Considering the container initiation record is different than the
> > > record to document the container involved in an otherwise normal
> > > syscall, we need two names.  I don't have a strong opinion what
> > > they are.
> > > 
> > > I'd prefer AUDIT_CONTAIN and AUDIT_CONTAINER_INFO so that the two
> > > are different enough to be visually distinct while leaving
> > > AUDIT_CONTAINERID for the field type in patch 4 ("audit: add
> > > containerid filtering")  
> 
> (Sorry, I had intended AUDIT_CONTAINER for the first in that paragraph
> above.)
> 
> > How about AUDIT_CONTAINER for the auxiliary record? The one that
> > starts the container, I don't have a strong opinion on. Could be
> > AUDIT_CONTAINER_INIT, AUDIT_CONTAINER_START, AUDIT_CONTAINERID,
> > AUDIT_CONTAINER_ID, or something else. The API call that sets the ID
> > for filtering could be AUDIT_CID or AUDIT_CONTID if that helps
> > decide what the initial event might be. Normally, it should match
> > the field being filtered.  
> 
> Ok, I had shortened the record field name to "contid=" to be unique
> enough while not using too much netlink bandwidth.  I could have used
> "cid=" but that could be unobvious or ambiguous.  I didn't want to use
> the full "containerid=" due to that.  I suppose I could change the
> field name macro to AUDIT_CONTID.
> 
> For the one that starts the container, I'd prefer to leave the name a
> bit more general than "_INIT", "_START", so maybe I'll swap them
> around and use AUDIT_CONTAINER_INFO for the startup record, and use
> AUDIT_CONTAINER for the syscall auxiliary record.
> 
> Does that work?

I'll go along with that. Thanks. But making that swap frees up
AUDIT_CONTAINER_ID which could be the first event. But
AUDIT_CONTAINER_INFO is also fine with me.

Best Regards,
-Steve