[PATCHv3] inotify: Convert to using per-namespace limits

Submitted by Nikolay Borisov on Dec. 14, 2016, 1:56 p.m.

Details

Message ID 1481723793-6756-1-git-send-email-n.borisov.lkml@gmail.com
State New
Series "inotify: Convert to using per-namespace limits"
Headers show

Commit Message

Nikolay Borisov Dec. 14, 2016, 1:56 p.m.
This patchset converts inotify to using the newly introduced
per-userns sysctl infrastructure.

Currently the inotify instances/watches are being accounted in the
user_struct structure. This means that in setups where multiple
users in unprivileged containers map to the same underlying
real user (i.e. pointing to the same user_struct) the inotify limits
are going to be shared as well, allowing one user(or application) to exhaust
all others limits.

Fix this by switching the inotify sysctls to using the
per-namespace/per-user limits. This will allow the server admin to
set sensible global limits, which can further be tuned inside every
individual user namespace. Additionally, in order to preserve the
sysctl ABI make the existing inotify instances/watches sysctls
modify the values of the initial user namespace.

Signed-off-by: Nikolay Borisov <n.borisov.lkml@gmail.com>
Acked-by: Jan Kara <jack@suse.cz>
Acked-by: Serge Hallyn <serge@hallyn.com>
---

Okay, so here is another version, which should 
hopefully be free of slab corruptions. There was an issue
where in ucount.c the ifdef was checking the CONFIG_INOTIFY_USER_
(pay attention to the trailing _, this was clearly a mistake). This 
led to the user_table (and all duplicated from it tables) to not 
contain the inotify-related members. In my local testing I got 
kasan splats even during kernel boot, due to out-of-bound writes. 
Let's see how this version fares. 

 fs/notify/inotify/inotify.h          | 17 +++++++++++++++++
 fs/notify/inotify/inotify_fsnotify.c |  6 ++----
 fs/notify/inotify/inotify_user.c     | 34 +++++++++++++++++-----------------
 include/linux/fsnotify_backend.h     |  3 ++-
 include/linux/sched.h                |  4 ----
 include/linux/user_namespace.h       |  4 ++++
 kernel/ucount.c                      |  6 +++++-
 7 files changed, 47 insertions(+), 27 deletions(-)

Patch hide | download patch | download mbox

diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index ed855ef6f077..b5536f8ad3e0 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -30,3 +30,20 @@  extern int inotify_handle_event(struct fsnotify_group *group,
 				const unsigned char *file_name, u32 cookie);
 
 extern const struct fsnotify_ops inotify_fsnotify_ops;
+
+#ifdef CONFIG_INOTIFY_USER
+static void dec_inotify_instances(struct ucounts *ucounts)
+{
+	dec_ucount(ucounts, UCOUNT_INOTIFY_INSTANCES);
+}
+
+static struct ucounts *inc_inotify_watches(struct ucounts *ucounts)
+{
+	return inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_INOTIFY_WATCHES);
+}
+
+static void dec_inotify_watches(struct ucounts *ucounts)
+{
+	dec_ucount(ucounts, UCOUNT_INOTIFY_WATCHES);
+}
+#endif
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 2cd900c2c737..1e6b3cfc2cfd 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -165,10 +165,8 @@  static void inotify_free_group_priv(struct fsnotify_group *group)
 	/* ideally the idr is empty and we won't hit the BUG in the callback */
 	idr_for_each(&group->inotify_data.idr, idr_callback, group);
 	idr_destroy(&group->inotify_data.idr);
-	if (group->inotify_data.user) {
-		atomic_dec(&group->inotify_data.user->inotify_devs);
-		free_uid(group->inotify_data.user);
-	}
+	if (group->inotify_data.ucounts)
+		dec_inotify_instances(group->inotify_data.ucounts);
 }
 
 static void inotify_free_event(struct fsnotify_event *fsn_event)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 69d1ea3d292a..1cf41c623be1 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -44,10 +44,8 @@ 
 
 #include <asm/ioctls.h>
 
-/* these are configurable via /proc/sys/fs/inotify/ */
-static int inotify_max_user_instances __read_mostly;
+/* configurable via /proc/sys/fs/inotify/ */
 static int inotify_max_queued_events __read_mostly;
-static int inotify_max_user_watches __read_mostly;
 
 static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
 
@@ -60,7 +58,7 @@  static int zero;
 struct ctl_table inotify_table[] = {
 	{
 		.procname	= "max_user_instances",
-		.data		= &inotify_max_user_instances,
+		.data		= &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES],
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -68,7 +66,7 @@  struct ctl_table inotify_table[] = {
 	},
 	{
 		.procname	= "max_user_watches",
-		.data		= &inotify_max_user_watches,
+		.data		= &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES],
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -500,7 +498,7 @@  void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
 	/* remove this mark from the idr */
 	inotify_remove_from_idr(group, i_mark);
 
-	atomic_dec(&group->inotify_data.user->inotify_watches);
+	dec_inotify_watches(group->inotify_data.ucounts);
 }
 
 /* ding dong the mark is dead */
@@ -584,14 +582,17 @@  static int inotify_new_watch(struct fsnotify_group *group,
 	tmp_i_mark->fsn_mark.mask = mask;
 	tmp_i_mark->wd = -1;
 
-	ret = -ENOSPC;
-	if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
-		goto out_err;
-
 	ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark);
 	if (ret)
 		goto out_err;
 
+	/* increment the number of watches the user has */
+	if (!inc_inotify_watches(group->inotify_data.ucounts)) {
+		inotify_remove_from_idr(group, tmp_i_mark);
+		ret = -ENOSPC;
+		goto out_err;
+	}
+
 	/* we are on the idr, now get on the inode */
 	ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode,
 				       NULL, 0);
@@ -601,8 +602,6 @@  static int inotify_new_watch(struct fsnotify_group *group,
 		goto out_err;
 	}
 
-	/* increment the number of watches the user has */
-	atomic_inc(&group->inotify_data.user->inotify_watches);
 
 	/* return the watch descriptor for this new mark */
 	ret = tmp_i_mark->wd;
@@ -653,10 +652,11 @@  static struct fsnotify_group *inotify_new_group(unsigned int max_events)
 
 	spin_lock_init(&group->inotify_data.idr_lock);
 	idr_init(&group->inotify_data.idr);
-	group->inotify_data.user = get_current_user();
+	group->inotify_data.ucounts = inc_ucount(current_user_ns(),
+						 current_euid(),
+						 UCOUNT_INOTIFY_INSTANCES);
 
-	if (atomic_inc_return(&group->inotify_data.user->inotify_devs) >
-	    inotify_max_user_instances) {
+	if (!group->inotify_data.ucounts) {
 		fsnotify_destroy_group(group);
 		return ERR_PTR(-EMFILE);
 	}
@@ -819,8 +819,8 @@  static int __init inotify_user_setup(void)
 	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
 
 	inotify_max_queued_events = 16384;
-	inotify_max_user_instances = 128;
-	inotify_max_user_watches = 8192;
+	init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128;
+	init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = 8192;
 
 	return 0;
 }
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 79467b239fcf..251f2268baad 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -16,6 +16,7 @@ 
 #include <linux/spinlock.h>
 #include <linux/types.h>
 #include <linux/atomic.h>
+#include <linux/user_namespace.h>
 
 /*
  * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily
@@ -170,7 +171,7 @@  struct fsnotify_group {
 		struct inotify_group_private_data {
 			spinlock_t	idr_lock;
 			struct idr      idr;
-			struct user_struct      *user;
+			struct ucounts *ucounts;
 		} inotify_data;
 #endif
 #ifdef CONFIG_FANOTIFY
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e9c009dc3a4a..bba39f3d83c3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -844,10 +844,6 @@  struct user_struct {
 	atomic_t __count;	/* reference count */
 	atomic_t processes;	/* How many processes does this user have? */
 	atomic_t sigpending;	/* How many pending signals does this user have? */
-#ifdef CONFIG_INOTIFY_USER
-	atomic_t inotify_watches; /* How many inotify watches does this user have? */
-	atomic_t inotify_devs;	/* How many inotify devs does this user have opened? */
-#endif
 #ifdef CONFIG_FANOTIFY
 	atomic_t fanotify_listeners;
 #endif
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index eb209d4523f5..363e0e8082a9 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -32,6 +32,10 @@  enum ucount_type {
 	UCOUNT_NET_NAMESPACES,
 	UCOUNT_MNT_NAMESPACES,
 	UCOUNT_CGROUP_NAMESPACES,
+#ifdef CONFIG_INOTIFY_USER
+	UCOUNT_INOTIFY_INSTANCES,
+	UCOUNT_INOTIFY_WATCHES,
+#endif
 	UCOUNT_COUNTS,
 };
 
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 9d20d5dd298a..b4aaee935b3e 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -57,7 +57,7 @@  static struct ctl_table_root set_root = {
 
 static int zero = 0;
 static int int_max = INT_MAX;
-#define UCOUNT_ENTRY(name) 				\
+#define UCOUNT_ENTRY(name)				\
 	{						\
 		.procname	= name,			\
 		.maxlen		= sizeof(int),		\
@@ -74,6 +74,10 @@  static struct ctl_table user_table[] = {
 	UCOUNT_ENTRY("max_net_namespaces"),
 	UCOUNT_ENTRY("max_mnt_namespaces"),
 	UCOUNT_ENTRY("max_cgroup_namespaces"),
+#ifdef CONFIG_INOTIFY_USER
+	UCOUNT_ENTRY("max_inotify_instances"),
+	UCOUNT_ENTRY("max_inotify_watches"),
+#endif
 	{ }
 };
 #endif /* CONFIG_SYSCTL */

Comments

Eric W. Biederman Dec. 14, 2016, 10:29 p.m.
Nikolay Borisov <n.borisov.lkml@gmail.com> writes:

> This patchset converts inotify to using the newly introduced
> per-userns sysctl infrastructure.
>
> Currently the inotify instances/watches are being accounted in the
> user_struct structure. This means that in setups where multiple
> users in unprivileged containers map to the same underlying
> real user (i.e. pointing to the same user_struct) the inotify limits
> are going to be shared as well, allowing one user(or application) to exhaust
> all others limits.
>
> Fix this by switching the inotify sysctls to using the
> per-namespace/per-user limits. This will allow the server admin to
> set sensible global limits, which can further be tuned inside every
> individual user namespace. Additionally, in order to preserve the
> sysctl ABI make the existing inotify instances/watches sysctls
> modify the values of the initial user namespace.
>
> Signed-off-by: Nikolay Borisov <n.borisov.lkml@gmail.com>
> Acked-by: Jan Kara <jack@suse.cz>
> Acked-by: Serge Hallyn <serge@hallyn.com>
> ---
>
> Okay, so here is another version, which should 
> hopefully be free of slab corruptions. There was an issue
> where in ucount.c the ifdef was checking the CONFIG_INOTIFY_USER_
> (pay attention to the trailing _, this was clearly a mistake). This 
> led to the user_table (and all duplicated from it tables) to not 
> contain the inotify-related members. In my local testing I got 
> kasan splats even during kernel boot, due to out-of-bound writes. 
> Let's see how this version fares.

Thank you I will place this in my for-testing branch shortly and see how
it fares.

Eric
Eric W. Biederman Dec. 15, 2016, 12:37 a.m.
Nikolay Borisov <n.borisov.lkml@gmail.com> writes:

> This patchset converts inotify to using the newly introduced
> per-userns sysctl infrastructure.
>
> Currently the inotify instances/watches are being accounted in the
> user_struct structure. This means that in setups where multiple
> users in unprivileged containers map to the same underlying
> real user (i.e. pointing to the same user_struct) the inotify limits
> are going to be shared as well, allowing one user(or application) to exhaust
> all others limits.
>
> Fix this by switching the inotify sysctls to using the
> per-namespace/per-user limits. This will allow the server admin to
> set sensible global limits, which can further be tuned inside every
> individual user namespace. Additionally, in order to preserve the
> sysctl ABI make the existing inotify instances/watches sysctls
> modify the values of the initial user namespace.
>
> Signed-off-by: Nikolay Borisov <n.borisov.lkml@gmail.com>
> Acked-by: Jan Kara <jack@suse.cz>
> Acked-by: Serge Hallyn <serge@hallyn.com>
> ---
>
> Okay, so here is another version, which should 
> hopefully be free of slab corruptions. There was an issue
> where in ucount.c the ifdef was checking the CONFIG_INOTIFY_USER_
> (pay attention to the trailing _, this was clearly a mistake). This 
> led to the user_table (and all duplicated from it tables) to not 
> contain the inotify-related members. In my local testing I got 
> kasan splats even during kernel boot, due to out-of-bound writes. 
> Let's see how this version fares.

So there is one more thing that needs to be addressed with your patch.

In inotify.h the functions need to be marked static inline
rather than just static or else there a number of new compiler warnings.

I have addressed this for now, but if anything else comes up or if you
resend this patch I would appreciate it if you add the static inline
notations in your internal copy of the patch.

Thank you,
Eric Biederman


> diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
> index ed855ef6f077..b5536f8ad3e0 100644
> --- a/fs/notify/inotify/inotify.h
> +++ b/fs/notify/inotify/inotify.h
> @@ -30,3 +30,20 @@ extern int inotify_handle_event(struct fsnotify_group *group,
>  				const unsigned char *file_name, u32 cookie);
>  
>  extern const struct fsnotify_ops inotify_fsnotify_ops;
> +
> +#ifdef CONFIG_INOTIFY_USER
> +static void dec_inotify_instances(struct ucounts *ucounts)
> +{
> +	dec_ucount(ucounts, UCOUNT_INOTIFY_INSTANCES);
> +}
> +
> +static struct ucounts *inc_inotify_watches(struct ucounts *ucounts)
> +{
> +	return inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_INOTIFY_WATCHES);
> +}
> +
> +static void dec_inotify_watches(struct ucounts *ucounts)
> +{
> +	dec_ucount(ucounts, UCOUNT_INOTIFY_WATCHES);
> +}
> +#endif
Nikolay Borisov Dec. 15, 2016, 7:26 a.m.
On 15.12.2016 02:37, Eric W. Biederman wrote:
> Nikolay Borisov <n.borisov.lkml@gmail.com> writes:
> 
>> This patchset converts inotify to using the newly introduced
>> per-userns sysctl infrastructure.
>>
>> Currently the inotify instances/watches are being accounted in the
>> user_struct structure. This means that in setups where multiple
>> users in unprivileged containers map to the same underlying
>> real user (i.e. pointing to the same user_struct) the inotify limits
>> are going to be shared as well, allowing one user(or application) to exhaust
>> all others limits.
>>
>> Fix this by switching the inotify sysctls to using the
>> per-namespace/per-user limits. This will allow the server admin to
>> set sensible global limits, which can further be tuned inside every
>> individual user namespace. Additionally, in order to preserve the
>> sysctl ABI make the existing inotify instances/watches sysctls
>> modify the values of the initial user namespace.
>>
>> Signed-off-by: Nikolay Borisov <n.borisov.lkml@gmail.com>
>> Acked-by: Jan Kara <jack@suse.cz>
>> Acked-by: Serge Hallyn <serge@hallyn.com>
>> ---
>>
>> Okay, so here is another version, which should 
>> hopefully be free of slab corruptions. There was an issue
>> where in ucount.c the ifdef was checking the CONFIG_INOTIFY_USER_
>> (pay attention to the trailing _, this was clearly a mistake). This 
>> led to the user_table (and all duplicated from it tables) to not 
>> contain the inotify-related members. In my local testing I got 
>> kasan splats even during kernel boot, due to out-of-bound writes. 
>> Let's see how this version fares.
> 
> So there is one more thing that needs to be addressed with your patch.
> 
> In inotify.h the functions need to be marked static inline
> rather than just static or else there a number of new compiler warnings.
> 
> I have addressed this for now, but if anything else comes up or if you
> resend this patch I would appreciate it if you add the static inline
> notations in your internal copy of the patch.

Okay, I will keep this in mind. Btw, do you compile with W=1 to get
those warnings, since I don't get them when I just run plain make?


> 
> Thank you,
> Eric Biederman
> 
> 
>> diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
>> index ed855ef6f077..b5536f8ad3e0 100644
>> --- a/fs/notify/inotify/inotify.h
>> +++ b/fs/notify/inotify/inotify.h
>> @@ -30,3 +30,20 @@ extern int inotify_handle_event(struct fsnotify_group *group,
>>  				const unsigned char *file_name, u32 cookie);
>>  
>>  extern const struct fsnotify_ops inotify_fsnotify_ops;
>> +
>> +#ifdef CONFIG_INOTIFY_USER
>> +static void dec_inotify_instances(struct ucounts *ucounts)
>> +{
>> +	dec_ucount(ucounts, UCOUNT_INOTIFY_INSTANCES);
>> +}
>> +
>> +static struct ucounts *inc_inotify_watches(struct ucounts *ucounts)
>> +{
>> +	return inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_INOTIFY_WATCHES);
>> +}
>> +
>> +static void dec_inotify_watches(struct ucounts *ucounts)
>> +{
>> +	dec_ucount(ucounts, UCOUNT_INOTIFY_WATCHES);
>> +}
>> +#endif