[RH8,1/2] ve/fs/aio: aio_nr & aio_max_nr variables virtualization

Submitted by Alexander Mikhalitsyn on Dec. 3, 2020, 4:07 p.m.

Details

Message ID 20201203160725.10719-1-alexander.mikhalitsyn@virtuozzo.com
State New
Series "Series without cover letter"
Headers show

Commit Message

Alexander Mikhalitsyn Dec. 3, 2020, 4:07 p.m.
From: Stanislav Kinsburskiy <skinsbursky@virtuozzo.com>

Virtualization of kernel global aio_nr & aio_max_nr variables is required
to isolate containers and ve0 when allocating aio request/events resources.

Each ve and ve0 has its own aio_nr and aio_max_nr values. The function
ioctx_alloc tries to charge the appropriate aio_nr value selected by the ve context.

It's not possible to exhaust aio events resources of one ve from another ve.

Default per-CT aio_max_nr value == 0x10000, including CT0.

https://jira.sw.ru/browse/PSBM-29017

Signed-off-by: Andrey Ryabinin <aryabinin@odin.com>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>

Patch hide | download patch | download mbox

==============================

fs-aio-show-real-number-of-aio
------------------------------

fs/aio: show real number of aio events in fs.aio-nr sysctl

fs.aio-nr accounts number of aio events requested by user via io_setup()
syscall. The kernel usually creates more events than was requested.
CRIU doesn't care about the number of requested events, it cares only
about created events. So while restoring the process CRIU requests
in io_setup() the number of actually created events. This leads
to inconsistent value of fs.aio-nr after the restore.

Let's show in fs.aio-nr a number of created events, not requested.

https://jira.sw.ru/browse/PSBM-47209

Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Acked-by: Kirill Tkhai <ktkhai@virtuozzo.com>

+++
fs/aio-nr: fix decrement of aio-nr

Commit 280363c ("fs/aio: show real number of aio events in fs.aio-nr sysctl")
changed only incrementing of fs.aio-nr counter. It failed to update
decrement path which leads to constant growing of fs.aio-nr value.

Fixes commit 280363c ("fs/aio: show real number of aio events in fs.aio-nr
sysctl").

https://jira.sw.ru/browse/PSBM-47209

Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Acked-by: Kirill Tkhai <ktkhai@virtuozzo.com>

+++
Ported to VZ8:
ve->aio_nr is now incremented by ctx->nr_events (the number of actually
allocated io events), as in the mainstream kernel.
https://jira.sw.ru/browse/PSBM-123159

Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
---
 fs/aio.c            | 45 ++++++++++++++++++++++++---------------------
 include/linux/aio.h |  6 ++----
 include/linux/ve.h  |  6 ++++++
 kernel/sysctl.c     | 16 ++++++++--------
 kernel/ve/ve.c      |  7 +++++++
 5 files changed, 47 insertions(+), 33 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index e81c8583e055..492f1a8b7661 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -30,6 +30,7 @@ 
 #include <linux/slab.h>
 #include <linux/timer.h>
 #include <linux/aio.h>
+#include <linux/ve.h>
 #include <linux/highmem.h>
 #include <linux/workqueue.h>
 #include <linux/security.h>
@@ -155,6 +156,7 @@  struct kioctx {
 
 	struct page		*internal_pages[AIO_RING_PAGES];
 	struct file		*aio_ring_file;
+	struct ve_struct	*ve;
 
 	unsigned		id;
 };
@@ -187,12 +189,6 @@  struct aio_kiocb {
 	struct eventfd_ctx	*ki_eventfd;
 };
 
-/*------ sysctl variables----*/
-static DEFINE_SPINLOCK(aio_nr_lock);
-unsigned long aio_nr;		/* current system wide number of aio requests */
-unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
-/*----end sysctl variables---*/
-
 static struct kmem_cache	*kiocb_cachep;
 static struct kmem_cache	*kioctx_cachep;
 
@@ -555,12 +551,14 @@  static void free_ioctx(struct work_struct *work)
 {
 	struct kioctx *ctx = container_of(to_rcu_work(work), struct kioctx,
 					  free_rwork);
+	struct ve_struct *ve = ctx->ve;
 	pr_debug("freeing %p\n", ctx);
 
 	aio_free_ring(ctx);
 	free_percpu(ctx->cpu);
 	percpu_ref_exit(&ctx->reqs);
 	percpu_ref_exit(&ctx->users);
+	put_ve(ve);
 	kmem_cache_free(kioctx_cachep, ctx);
 }
 
@@ -657,14 +655,16 @@  static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 	}
 }
 
-static void aio_nr_sub(unsigned nr)
+static void aio_nr_sub(struct kioctx *ctx, unsigned nr)
 {
-	spin_lock(&aio_nr_lock);
-	if (WARN_ON(aio_nr - nr > aio_nr))
-		aio_nr = 0;
+	struct ve_struct *ve = ctx->ve;
+
+	spin_lock(&ve->aio_nr_lock);
+	if (WARN_ON(ve->aio_nr - nr > ve->aio_nr))
+		ve->aio_nr = 0;
 	else
-		aio_nr -= nr;
-	spin_unlock(&aio_nr_lock);
+		ve->aio_nr -= nr;
+	spin_unlock(&ve->aio_nr_lock);
 }
 
 /* ioctx_alloc
@@ -674,6 +674,7 @@  static struct kioctx *ioctx_alloc(unsigned nr_events)
 {
 	struct mm_struct *mm = current->mm;
 	struct kioctx *ctx;
+	struct ve_struct *ve = get_exec_env();
 	int err = -ENOMEM;
 
 	/*
@@ -700,7 +701,7 @@  static struct kioctx *ioctx_alloc(unsigned nr_events)
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (!nr_events || (unsigned long)max_reqs > aio_max_nr)
+	if (!nr_events || (unsigned long)max_reqs > ve->aio_max_nr)
 		return ERR_PTR(-EAGAIN);
 
 	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
@@ -708,6 +709,7 @@  static struct kioctx *ioctx_alloc(unsigned nr_events)
 		return ERR_PTR(-ENOMEM);
 
 	ctx->max_reqs = max_reqs;
+	ctx->ve = get_ve(ve);
 
 	spin_lock_init(&ctx->ctx_lock);
 	spin_lock_init(&ctx->completion_lock);
@@ -739,15 +741,15 @@  static struct kioctx *ioctx_alloc(unsigned nr_events)
 		ctx->req_batch = 1;
 
 	/* limit the number of system wide aios */
-	spin_lock(&aio_nr_lock);
-	if (aio_nr + ctx->max_reqs > aio_max_nr ||
-	    aio_nr + ctx->max_reqs < aio_nr) {
-		spin_unlock(&aio_nr_lock);
+	spin_lock(&ve->aio_nr_lock);
+	if (ve->aio_nr + ctx->max_reqs > ve->aio_max_nr ||
+	    ve->aio_nr + ctx->max_reqs < ve->aio_nr) {
+		spin_unlock(&ve->aio_nr_lock);
 		err = -EAGAIN;
 		goto err_ctx;
 	}
-	aio_nr += ctx->max_reqs;
-	spin_unlock(&aio_nr_lock);
+	ve->aio_nr += ctx->max_reqs;
+	spin_unlock(&ve->aio_nr_lock);
 
 	percpu_ref_get(&ctx->users);	/* io_setup() will drop this ref */
 	percpu_ref_get(&ctx->reqs);	/* free_ioctx_users() will drop this */
@@ -764,13 +766,14 @@  static struct kioctx *ioctx_alloc(unsigned nr_events)
 	return ctx;
 
 err_cleanup:
-	aio_nr_sub(ctx->max_reqs);
+	aio_nr_sub(ctx, ctx->max_reqs);
 err_ctx:
 	atomic_set(&ctx->dead, 1);
 	if (ctx->mmap_size)
 		vm_munmap(ctx->mmap_base, ctx->mmap_size);
 	aio_free_ring(ctx);
 err:
+	put_ve(ctx->ve);
 	mutex_unlock(&ctx->ring_lock);
 	free_percpu(ctx->cpu);
 	percpu_ref_exit(&ctx->reqs);
@@ -811,7 +814,7 @@  static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 	 * -EAGAIN with no ioctxs actually in use (as far as userspace
 	 *  could tell).
 	 */
-	aio_nr_sub(ctx->max_reqs);
+	aio_nr_sub(ctx, ctx->max_reqs);
 
 	if (ctx->mmap_size)
 		vm_munmap(ctx->mmap_base, ctx->mmap_size);
diff --git a/include/linux/aio.h b/include/linux/aio.h
index b83e68dd006f..4b7a331156ff 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -10,6 +10,8 @@  struct mm_struct;
 
 typedef int (kiocb_cancel_fn)(struct kiocb *);
 
+#define AIO_MAX_NR_DEFAULT	0x10000
+
 /* prototypes */
 #ifdef CONFIG_AIO
 extern void exit_aio(struct mm_struct *mm);
@@ -20,8 +22,4 @@  static inline void kiocb_set_cancel_fn(struct kiocb *req,
 				       kiocb_cancel_fn *cancel) { }
 #endif /* CONFIG_AIO */
 
-/* for sysctl: */
-extern unsigned long aio_nr;
-extern unsigned long aio_max_nr;
-
 #endif /* __LINUX__AIO_H */
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 7cb416f342e7..06dd3d9281e9 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -99,6 +99,12 @@  struct ve_struct {
 
 	struct list_head	devmnt_list;
 	struct mutex		devmnt_mutex;
+
+#ifdef CONFIG_AIO
+	spinlock_t		aio_nr_lock;
+	unsigned long		aio_nr;
+	unsigned long		aio_max_nr;
+#endif
 };
 
 struct ve_devmnt {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2f20e52a1489..fac5a32ce934 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1816,17 +1816,17 @@  static struct ctl_table fs_table[] = {
 #ifdef CONFIG_AIO
 	{
 		.procname	= "aio-nr",
-		.data		= &aio_nr,
-		.maxlen		= sizeof(aio_nr),
-		.mode		= 0444,
-		.proc_handler	= proc_doulongvec_minmax,
+		.data		= &ve0.aio_nr,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0444 | S_ISVTX,
+		.proc_handler	= proc_doulongvec_minmax_virtual,
 	},
 	{
 		.procname	= "aio-max-nr",
-		.data		= &aio_max_nr,
-		.maxlen		= sizeof(aio_max_nr),
-		.mode		= 0644,
-		.proc_handler	= proc_doulongvec_minmax,
+		.data		= &ve0.aio_max_nr,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= proc_doulongvec_minmax_virtual,
 	},
 #endif /* CONFIG_AIO */
 #ifdef CONFIG_INOTIFY_USER
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 29e98e6396dc..9485056dcef0 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -14,6 +14,7 @@ 
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/ve.h>
+#include <linux/aio.h>
 #include <linux/errno.h>
 #include <linux/rcupdate.h>
 #include <linux/init_task.h>
@@ -648,6 +649,12 @@  static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
 	INIT_LIST_HEAD(&ve->devmnt_list);
 	mutex_init(&ve->devmnt_mutex);
 
+#ifdef CONFIG_AIO
+	spin_lock_init(&ve->aio_nr_lock);
+	ve->aio_nr = 0;
+	ve->aio_max_nr = AIO_MAX_NR_DEFAULT;
+#endif
+
 	return &ve->css;
 
 err_vdso: