[RHEL7,COMMIT] netfilter: drop/reaquire nfnl_lock on request_module() in nft_log module

Submitted by Vasily Averin on July 21, 2020, 8:59 a.m.

Details

Message ID 202007210859.06L8xchc026065@vvs.co7.work.ct
State New
Series "netfilter: drop/reaquire nfnl_lock on request_module() in nft_log module"
Headers show

Commit Message

Vasily Averin July 21, 2020, 8:59 a.m.
The commit is pushed to "branch-rh7-3.10.0-1127.10.1.vz7.162.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1127.10.1.vz7.162.11
------>
commit 19745d654b4e86439306e342fa20328dd9b23946
Author: Konstantin Khorenko <khorenko@virtuozzo.com>
Date:   Tue Jul 21 11:59:38 2020 +0300

    netfilter: drop/reaquire nfnl_lock on request_module() in nft_log module
    
    There could be a deadlock:
    
    1. "nft" process calls request_module() with
    nfnl_lock(NFNL_SUBSYS_NFTABLES) taken and waits for its completion:
    
    [UN]  PID: 7391   TASK: ffff939e154bc000  COMMAND: "nft"
    wait_for_completion_killable
     __call_usermodehelper_exec
      __request_module
       nf_logger_find_get
        nft_log_init                // ops->init()
         nf_tables_newexpr
          nf_tables_newrule         // called with nfnl_lock taken
           nfnetlink_rcv_batch      // takes nfnl_lock
            nfnetlink_rcv
             netlink_unicast
              netlink_sendmsg
    
    2. request_module() tries to load the module, but waits on "net_mutex":
    
    [UN]  PID: 8913   TASK: ffff939e140ec000  COMMAND: "modprobe"
    __mutex_lock_slowpath
     mutex_lock                     // mutex_lock(&net_mutex);
     (owner = 0xffff939e315f2000 == PID: 158  COMMAND: "kworker/u64:1")
       register_pernet_subsys
        init_module
         do_one_initcall
          load_module
           sys_init_module
    
    3. "kworker" holds net_mutex, but itself waits for nfnl_lock:
    
    [UN]  PID: 158    TASK: ffff939e315f2000  COMMAND: "kworker/u64:1"
    __mutex_lock_slowpath
     mutex_lock                     // nfnl_lock(NFNL_SUBSYS_NFTABLES)
     (owner = 0xffff939e154bc000  PID: 7391  COMMAND: "nft")
      nfnl_lock
       nft_unregister_afinfo
        nf_tables_inet_exit_net
         ops_exit_list
          cleanup_net
           process_one_work
    
    Reproducer:
    ===========
    [console 1]: export i=0; while true; do
            nft add table filter; \
            nft add chain filter input; \
            nft add rule filter input tcp dport 23 ct state new \
                    log prefix \"Connection to port 23:\" accept; \
            nft flush ruleset; \
            rmmod nft_log; rmmod nf_log_ipv4; rmmod nf_log_common; \
            i=$(($i+1)); echo $i; \
    done
    
    [console 2]: modprobe nf_tables; \
            export j=0; while true; do \
                    ip net add n1; \
                    ip net del n1; \
                    j=$(($j + 1)); echo $j; \
            done
    
    In mainstream this sutiation is impossible, it's fixed as a side effect
    by: f102d66b335a4 ("netfilter: nf_tables: use dedicated mutex to guard
    transactions"), but backporting it is quite heavy,
    
    so let's just drop/reacquire nfnl lock around request_module() like it's
    done in similar places.
    
    v2: be ready in nfnetlink_rcv_batch() that -ENOENT could be returned
    with nfnl lock dropped/reacquired like -EAGAIN => re-read ss.
    
    https://jira.sw.ru/browse/PSBM-105534
    https://bugzilla.redhat.com/show_bug.cgi?id=1858329
    Signed-off-by: Konstantin Khorenko <khorenko@virtuozzo.com>
    Reviewed-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
---
 include/net/netfilter/nf_log.h |  2 +-
 net/netfilter/nf_log.c         |  8 ++++---
 net/netfilter/nfnetlink.c      | 11 +++++++++-
 net/netfilter/nft_log.c        | 49 +++++++++++++++++++++++++++++++++++++++---
 net/netfilter/xt_LOG.c         |  2 +-
 net/netfilter/xt_TRACE.c       |  2 +-
 6 files changed, 64 insertions(+), 10 deletions(-)

Patch hide | download patch | download mbox

diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h
index 4b1b4c44e537b..2686caac8080e 100644
--- a/include/net/netfilter/nf_log.h
+++ b/include/net/netfilter/nf_log.h
@@ -64,7 +64,7 @@  int nf_log_bind_pf(struct net *net, u_int8_t pf,
 		   const struct nf_logger *logger);
 void nf_log_unbind_pf(struct net *net, u_int8_t pf);
 
-int nf_logger_find_get(int pf, enum nf_log_type type);
+int nf_logger_find_get_nolock(int pf, enum nf_log_type type);
 void nf_logger_put(int pf, enum nf_log_type type);
 void nf_logger_request_module(int pf, enum nf_log_type type);
 
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index ff07ee54bbdd3..dd34fd081a89d 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -19,7 +19,9 @@ 
 int sysctl_nf_log_all_netns __read_mostly;
 EXPORT_SYMBOL(sysctl_nf_log_all_netns);
 
-static struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly;
+struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly;
+EXPORT_SYMBOL(loggers);
+
 static DEFINE_MUTEX(nf_log_mutex);
 
 #define nft_log_dereference(logger) \
@@ -164,7 +166,7 @@  void nf_logger_request_module(int pf, enum nf_log_type type)
 }
 EXPORT_SYMBOL_GPL(nf_logger_request_module);
 
-int nf_logger_find_get(int pf, enum nf_log_type type)
+int nf_logger_find_get_nolock(int pf, enum nf_log_type type)
 {
 	struct nf_logger *logger;
 	int ret = -ENOENT;
@@ -184,7 +186,7 @@  int nf_logger_find_get(int pf, enum nf_log_type type)
 	rcu_read_unlock();
 	return ret;
 }
-EXPORT_SYMBOL_GPL(nf_logger_find_get);
+EXPORT_SYMBOL_GPL(nf_logger_find_get_nolock);
 
 void nf_logger_put(int pf, enum nf_log_type type)
 {
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 83259e7be0fbd..a48f185f7c1a1 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -440,7 +440,16 @@  static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
 	} else if (status == NFNL_BATCH_DONE) {
 		ss->commit(oskb);
 	} else {
-		ss->abort(oskb);
+		const struct nfnetlink_subsystem *ss2;
+
+		/* nf_logger_find_get_lock() can return -ENOENT after
+		 * nfnl_unlock drop/reacquire, so need to reread ss like in
+		 * -EGAIN case
+		 */
+		ss2 = rcu_dereference_protected(table[subsys_id].subsys,
+			lockdep_is_held(&table[subsys_id].mutex));
+		if (ss2 == ss)
+			ss->abort(oskb);
 	}
 
 	nfnl_err_deliver(&err_list, oskb);
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index d21fe91d50fc0..71e35feb1fe27 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -15,6 +15,7 @@ 
 #include <linux/netlink.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter/nf_tables.h>
+#include <linux/netfilter/nfnetlink.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_log.h>
 #include <linux/netdevice.h>
@@ -46,6 +47,48 @@  static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = {
 	[NFTA_LOG_FLAGS]	= { .type = NLA_U32 },
 };
 
+extern struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly;
+
+/*
+ * In "nft_log" module we need to drop nfnl lock while performing
+ * request_module(), calls to nf_logger_find_get() in other
+ * modules are done without nfnl_lock taken.
+ *
+ * nf_logger_find_get
+ *  nft_log_init
+ *   nf_tables_newexpr
+ *    nf_tables_newrule		// nc->call_batch
+ *				// called with nfnl_lock taken
+ *
+ *     nfnetlink_rcv_batch	// takes nfnl_lock(NFNL_SUBSYS_NFTABLES)
+ *      nfnetlink_rcv
+ *       netlink_unicast
+ *        netlink_sendmsg
+ */
+static int nf_logger_find_get_lock(int pf, enum nf_log_type type)
+{
+	struct nf_logger *logger;
+	int ret = 0;
+
+	logger = loggers[pf][type];
+	if (logger == NULL) {
+		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+		request_module("nf-logger-%u-%u", pf, type);
+		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		ret = -EAGAIN;
+	}
+
+	rcu_read_lock();
+	logger = rcu_dereference(loggers[pf][type]);
+
+	if ((logger == NULL) ||
+	    ((ret == 0) && !try_module_get(logger->me))) {
+		ret = -ENOENT;
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
 static int nft_log_init(const struct nft_ctx *ctx,
 			const struct nft_expr *expr,
 			const struct nlattr * const tb[])
@@ -99,11 +142,11 @@  static int nft_log_init(const struct nft_ctx *ctx,
 	}
 
 	if (ctx->afi->family == NFPROTO_INET) {
-		ret = nf_logger_find_get(NFPROTO_IPV4, li->type);
+		ret = nf_logger_find_get_lock(NFPROTO_IPV4, li->type);
 		if (ret < 0)
 			return ret;
 
-		ret = nf_logger_find_get(NFPROTO_IPV6, li->type);
+		ret = nf_logger_find_get_lock(NFPROTO_IPV6, li->type);
 		if (ret < 0) {
 			nf_logger_put(NFPROTO_IPV4, li->type);
 			return ret;
@@ -111,7 +154,7 @@  static int nft_log_init(const struct nft_ctx *ctx,
 		return 0;
 	}
 
-	return nf_logger_find_get(ctx->afi->family, li->type);
+	return nf_logger_find_get_lock(ctx->afi->family, li->type);
 }
 
 static void nft_log_destroy(const struct nft_ctx *ctx,
diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c
index eb31d73e31ebf..89059468e0f80 100644
--- a/net/netfilter/xt_LOG.c
+++ b/net/netfilter/xt_LOG.c
@@ -61,7 +61,7 @@  static int log_tg_check(const struct xt_tgchk_param *par)
 		return -EINVAL;
 	}
 
-	return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG);
+	return nf_logger_find_get_nolock(par->family, NF_LOG_TYPE_LOG);
 }
 
 static void log_tg_destroy(const struct xt_tgdtor_param *par)
diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c
index 858d189a13031..d0e77eea6b829 100644
--- a/net/netfilter/xt_TRACE.c
+++ b/net/netfilter/xt_TRACE.c
@@ -13,7 +13,7 @@  MODULE_ALIAS("ip6t_TRACE");
 
 static int trace_tg_check(const struct xt_tgchk_param *par)
 {
-	return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG);
+	return nf_logger_find_get_nolock(par->family, NF_LOG_TYPE_LOG);
 }
 
 static void trace_tg_destroy(const struct xt_tgdtor_param *par)