[RHEL7,COMMIT] net: Introduce net_sem for protection of pernet_list

Submitted by Konstantin Khorenko on May 27, 2020, 6:34 p.m.

Details

Message ID 202005271834.04RIYvEb015091@finist-ce7.sw.ru
State New
Series "Parallel per-net init/exit"
Headers show

Commit Message

Konstantin Khorenko May 27, 2020, 6:34 p.m.
The commit is pushed to "branch-rh7-3.10.0-1127.8.2.vz7.161.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1127.8.2.vz7.161.3
------>
commit 28872d426e6b5a9797349b49739411079d642dc6
Author: Kirill Tkhai <ktkhai@virtuozzo.com>
Date:   Wed May 27 21:34:57 2020 +0300

    net: Introduce net_sem for protection of pernet_list
    
    ms commit 1a57feb847c5
    
    Currently, the mutex is mostly used to protect pernet operations
    list. It orders setup_net() and cleanup_net() with parallel
    {un,}register_pernet_operations() calls, so ->exit{,batch} methods
    of the same pernet operations are executed for a dying net, as
    were used to call ->init methods, even after the net namespace
    is unlinked from net_namespace_list in cleanup_net().
    
    But there are several problems with scalability. The first one
    is that more than one net can't be created or destroyed
    at the same moment on the node. On big machines with many CPUs
    running many containers, this is very noticeable.
    
    The second one is that synchronize_rcu() is needed after the net
    is removed from net_namespace_list:
    
    Destroy net_ns:
    cleanup_net()
      mutex_lock(&net_mutex)
      list_del_rcu(&net->list)
      synchronize_rcu()                                  <--- Sleep there for ages
      list_for_each_entry_reverse(ops, &pernet_list, list)
        ops_exit_list(ops, &net_exit_list)
      list_for_each_entry_reverse(ops, &pernet_list, list)
        ops_free_list(ops, &net_exit_list)
      mutex_unlock(&net_mutex)
    
    This primitive is not fast, especially on systems with many processors
    and/or when preemptible RCU is enabled in the config. So, the whole time
    cleanup_net() is waiting for the RCU grace period, creation of new net
    namespaces is not possible; the tasks that attempt it sleep on the same mutex:
    
    Create net_ns:
    copy_net_ns()
      mutex_lock_killable(&net_mutex)                    <--- Sleep there for ages
    
    I observed 20-30 second hangs of "unshare -n" on an ordinary 8-CPU laptop
    with preemptible RCU enabled after a round of CRIU tests had finished.
    
    The solution is to convert net_mutex to an rw_semaphore and add fine-grained
    locks to the really small number of pernet_operations that actually need them.
    
    Then, pernet_operations::init/::exit methods, modifying the net-related data,
    will require down_read() locking only, while down_write() will be used
    for changing pernet_list (i.e., when modules are being loaded and unloaded).
    
    This gives a significant performance increase once the whole patch set is
    applied, as you can see here:
    
    %for i in {1..10000}; do unshare -n bash -c exit; done
    
    *before*
    real 1m40,377s
    user 0m9,672s
    sys 0m19,928s
    
    *after*
    real 0m17,007s
    user 0m5,311s
    sys 0m11,779s
    
    (5.8 times faster)
    
    This patch starts replacing net_mutex with net_sem. It adds the rw_semaphore,
    describes the variables it protects, and uses it where appropriate.
    net_mutex is still present, and later patches will remove it step by step.
    
    Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
    
    Acked-by: Andrei Vagin <avagin@virtuozzo.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>
    
    =====================
    Patchset description:
    
    Parallel per-net init/exit
    
    https://jira.sw.ru/browse/PSBM-104158
---
 include/linux/rtnetlink.h |  1 +
 net/core/net_namespace.c  | 41 +++++++++++++++++++++++++++--------------
 net/core/rtnetlink.c      |  4 ++--
 3 files changed, 30 insertions(+), 16 deletions(-)

Patch hide | download patch | download mbox

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 10c5fae799ee5..65dcc5859dd19 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -35,6 +35,7 @@  extern int rtnl_is_locked(void);
 
 extern wait_queue_head_t netdev_unregistering_wq;
 extern struct mutex net_mutex;
+extern struct rw_semaphore net_sem;
 
 #ifdef CONFIG_PROVE_LOCKING
 extern bool lockdep_rtnl_is_held(void);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index c461acce68bb5..f63e32495e493 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -32,6 +32,11 @@ 
 static LIST_HEAD(pernet_list);
 static struct list_head *first_device = &pernet_list;
 DEFINE_MUTEX(net_mutex);
+/*
+ * net_sem: protects: pernet_list, net_generic_ids,
+ * init_net_initialized and first_device pointer.
+ */
+DECLARE_RWSEM(net_sem);
 
 LIST_HEAD(net_namespace_list);
 EXPORT_SYMBOL_GPL(net_namespace_list);
@@ -302,7 +307,7 @@  static void dec_net_namespaces(struct ucounts *ucounts)
  */
 static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 {
-	/* Must be called with net_mutex held */
+	/* Must be called with net_sem held */
 	const struct pernet_operations *ops, *saved_ops;
 	int error = 0;
 	LIST_HEAD(net_exit_list);
@@ -421,12 +426,18 @@  struct net *copy_net_ns(unsigned long flags,
 	net->ucounts = ucounts;
 	get_user_ns(user_ns);
 
-	rv = mutex_lock_killable(&net_mutex);
+	rv = down_read_killable(&net_sem);
 	if (rv < 0)
 		goto put_userns;
 
+	rv = mutex_lock_killable(&net_mutex);
+	if (rv < 0)
+		goto up_read;
+
 	rv = setup_net(net, user_ns);
 	mutex_unlock(&net_mutex);
+up_read:
+	up_read(&net_sem);
 	if (rv < 0) {
 put_userns:
 		put_user_ns(user_ns);
@@ -482,6 +493,7 @@  static void cleanup_net(struct work_struct *work)
 	list_replace_init(&cleanup_list, &net_kill_list);
 	spin_unlock_irq(&cleanup_list_lock);
 
+	down_read(&net_sem);
 	mutex_lock(&net_mutex);
 
 	/* Don't let anyone else find us. */
@@ -521,6 +533,9 @@  static void cleanup_net(struct work_struct *work)
 	list_for_each_entry_reverse(ops, &pernet_list, list)
 		ops_free_list(ops, &net_exit_list);
 
+	mutex_unlock(&net_mutex);
+	up_read(&net_sem);
+
 	list_for_each_entry(net, &net_kill_list, cleanup_list) {
 		struct ve_struct *ve = net->owner_ve;
 
@@ -528,8 +543,6 @@  static void cleanup_net(struct work_struct *work)
 		put_ve(ve);
 	}
 
-	mutex_unlock(&net_mutex);
-
 	/* Ensure there are no outstanding rcu callbacks using this
 	 * network namespace.
 	 */
@@ -838,11 +851,11 @@  static int __init net_ns_init(void)
 
 	rcu_assign_pointer(init_net.gen, ng);
 
-	mutex_lock(&net_mutex);
+	down_write(&net_sem);
 	if (setup_net(&init_net, &init_user_ns))
 		panic("Could not setup the initial network namespace");
 
-	mutex_unlock(&net_mutex);
+	up_write(&net_sem);
 
 	register_pernet_subsys(&net_ns_ops);
 
@@ -972,9 +985,9 @@  static void unregister_pernet_operations(struct pernet_operations *ops)
 int register_pernet_subsys(struct pernet_operations *ops)
 {
 	int error;
-	mutex_lock(&net_mutex);
+	down_write(&net_sem);
 	error =  register_pernet_operations(first_device, ops);
-	mutex_unlock(&net_mutex);
+	up_write(&net_sem);
 	return error;
 }
 EXPORT_SYMBOL_GPL(register_pernet_subsys);
@@ -990,9 +1003,9 @@  EXPORT_SYMBOL_GPL(register_pernet_subsys);
  */
 void unregister_pernet_subsys(struct pernet_operations *ops)
 {
-	mutex_lock(&net_mutex);
+	down_write(&net_sem);
 	unregister_pernet_operations(ops);
-	mutex_unlock(&net_mutex);
+	up_write(&net_sem);
 }
 EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
 
@@ -1018,11 +1031,11 @@  EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
 int register_pernet_device(struct pernet_operations *ops)
 {
 	int error;
-	mutex_lock(&net_mutex);
+	down_write(&net_sem);
 	error = register_pernet_operations(&pernet_list, ops);
 	if (!error && (first_device == &pernet_list))
 		first_device = &ops->list;
-	mutex_unlock(&net_mutex);
+	up_write(&net_sem);
 	return error;
 }
 EXPORT_SYMBOL_GPL(register_pernet_device);
@@ -1038,11 +1051,11 @@  EXPORT_SYMBOL_GPL(register_pernet_device);
  */
 void unregister_pernet_device(struct pernet_operations *ops)
 {
-	mutex_lock(&net_mutex);
+	down_write(&net_sem);
 	if (&ops->list == first_device)
 		first_device = first_device->next;
 	unregister_pernet_operations(ops);
-	mutex_unlock(&net_mutex);
+	up_write(&net_sem);
 }
 EXPORT_SYMBOL_GPL(unregister_pernet_device);
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 93528356f87e1..4525cecdd94bc 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -504,11 +504,11 @@  static void rtnl_lock_unregistering_all(void)
 void rtnl_link_unregister(struct rtnl_link_ops *ops)
 {
 	/* Close the race with cleanup_net() */
-	mutex_lock(&net_mutex);
+	down_write(&net_sem);
 	rtnl_lock_unregistering_all();
 	__rtnl_link_unregister(ops);
 	rtnl_unlock();
-	mutex_unlock(&net_mutex);
+	up_write(&net_sem);
 }
 EXPORT_SYMBOL_GPL(rtnl_link_unregister);