[RH7,v2] netfilter: drop/reaquire nfnl_lock on request_module() in nft_log module

Submitted by Konstantin Khorenko on July 17, 2020, 2:37 p.m.

Details

Message ID 20200717143739.11556-1-khorenko@virtuozzo.com
State New
Series "netfilter: drop/reaquire nfnl_lock on request_module() in nft_log module"
Headers show

Commit Message

Konstantin Khorenko July 17, 2020, 2:37 p.m.
There could be a deadlock:

1. "nft" process calls request_module() with
nfnl_lock(NFNL_SUBSYS_NFTABLES) taken and waits for its completion:

[UN]  PID: 7391   TASK: ffff939e154bc000  COMMAND: "nft"
wait_for_completion_killable
 __call_usermodehelper_exec
  __request_module
   nf_logger_find_get
    nft_log_init		// ops->init()
     nf_tables_newexpr
      nf_tables_newrule		// called with nfnl_lock taken
       nfnetlink_rcv_batch	// takes nfnl_lock
        nfnetlink_rcv
         netlink_unicast
          netlink_sendmsg

2. request_module() tries to load the module, but waits on "net_mutex":

[UN]  PID: 8913   TASK: ffff939e140ec000  COMMAND: "modprobe"
__mutex_lock_slowpath
 mutex_lock			// mutex_lock(&net_mutex);
 (owner = 0xffff939e315f2000 == PID: 158  COMMAND: "kworker/u64:1")
   register_pernet_subsys
    init_module
     do_one_initcall
      load_module
       sys_init_module

3. "kworker" holds net_mutex, but itself waits for nfnl_lock:

[UN]  PID: 158    TASK: ffff939e315f2000  COMMAND: "kworker/u64:1"
__mutex_lock_slowpath
 mutex_lock			// nfnl_lock(NFNL_SUBSYS_NFTABLES)
 (owner = 0xffff939e154bc000  PID: 7391  COMMAND: "nft")
  nfnl_lock
   nft_unregister_afinfo
    nf_tables_inet_exit_net
     ops_exit_list
      cleanup_net
       process_one_work

Reproducer:

Patch hide | download patch | download mbox

===========
[console 1]: export i=0; while true; do
	nft add table filter; \
	nft add chain filter input; \
	nft add rule filter input tcp dport 23 ct state new \
		log prefix \"Connection to port 23:\" accept; \
	nft flush ruleset; \
	rmmod nft_log; rmmod nf_log_ipv4; rmmod nf_log_common; \
	i=$(($i+1)); echo $i; \
done

[console 2]: modprobe nf_tables; \
	export j=0; while true; do \
		ip net add n1; \
		ip net del n1; \
		j=$(($j + 1)); echo $j; \
	done

In mainstream this sutiation is impossible, it's fixed as a side effect
by: f102d66b335a4 ("netfilter: nf_tables: use dedicated mutex to guard
transactions"), but backporting it is quite heavy,

so let's just drop/reacquire nfnl lock around request_module() like it's
done in similar places.

https://jira.sw.ru/browse/PSBM-105534

Signed-off-by: Konstantin Khorenko <khorenko@virtuozzo.com>

v2: be ready in nfnetlink_rcv_batch() that -ENOENT could be returned
with nfnl lock dropped/reacquired like -EAGAIN => re-read ss.
---
 include/net/netfilter/nf_log.h |  2 +-
 net/netfilter/nf_log.c         |  8 ++++---
 net/netfilter/nfnetlink.c      | 11 +++++++++-
 net/netfilter/nft_log.c        | 49 +++++++++++++++++++++++++++++++++++++++---
 net/netfilter/xt_LOG.c         |  2 +-
 net/netfilter/xt_TRACE.c       |  2 +-
 6 files changed, 64 insertions(+), 10 deletions(-)

diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h
index 4b1b4c44e537b..2686caac8080e 100644
--- a/include/net/netfilter/nf_log.h
+++ b/include/net/netfilter/nf_log.h
@@ -64,7 +64,7 @@  int nf_log_bind_pf(struct net *net, u_int8_t pf,
 		   const struct nf_logger *logger);
 void nf_log_unbind_pf(struct net *net, u_int8_t pf);
 
-int nf_logger_find_get(int pf, enum nf_log_type type);
+int nf_logger_find_get_nolock(int pf, enum nf_log_type type);
 void nf_logger_put(int pf, enum nf_log_type type);
 void nf_logger_request_module(int pf, enum nf_log_type type);
 
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index ff07ee54bbdd3..dd34fd081a89d 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -19,7 +19,9 @@ 
 int sysctl_nf_log_all_netns __read_mostly;
 EXPORT_SYMBOL(sysctl_nf_log_all_netns);
 
-static struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly;
+struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly;
+EXPORT_SYMBOL(loggers);
+
 static DEFINE_MUTEX(nf_log_mutex);
 
 #define nft_log_dereference(logger) \
@@ -164,7 +166,7 @@  void nf_logger_request_module(int pf, enum nf_log_type type)
 }
 EXPORT_SYMBOL_GPL(nf_logger_request_module);
 
-int nf_logger_find_get(int pf, enum nf_log_type type)
+int nf_logger_find_get_nolock(int pf, enum nf_log_type type)
 {
 	struct nf_logger *logger;
 	int ret = -ENOENT;
@@ -184,7 +186,7 @@  int nf_logger_find_get(int pf, enum nf_log_type type)
 	rcu_read_unlock();
 	return ret;
 }
-EXPORT_SYMBOL_GPL(nf_logger_find_get);
+EXPORT_SYMBOL_GPL(nf_logger_find_get_nolock);
 
 void nf_logger_put(int pf, enum nf_log_type type)
 {
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 83259e7be0fbd..a48f185f7c1a1 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -440,7 +440,16 @@  static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
 	} else if (status == NFNL_BATCH_DONE) {
 		ss->commit(oskb);
 	} else {
-		ss->abort(oskb);
+		const struct nfnetlink_subsystem *ss2;
+
+		/* nf_logger_find_get_lock() can return -ENOENT after
+		 * nfnl_unlock drop/reacquire, so need to reread ss like in
+		 * -EGAIN case
+		 */
+		ss2 = rcu_dereference_protected(table[subsys_id].subsys,
+			lockdep_is_held(&table[subsys_id].mutex));
+		if (ss2 == ss)
+			ss->abort(oskb);
 	}
 
 	nfnl_err_deliver(&err_list, oskb);
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index d21fe91d50fc0..71e35feb1fe27 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -15,6 +15,7 @@ 
 #include <linux/netlink.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter/nf_tables.h>
+#include <linux/netfilter/nfnetlink.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_log.h>
 #include <linux/netdevice.h>
@@ -46,6 +47,48 @@  static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = {
 	[NFTA_LOG_FLAGS]	= { .type = NLA_U32 },
 };
 
+extern struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly;
+
+/*
+ * In "nft_log" module we need to drop nfnl lock while performing
+ * request_module(), calls to nf_logger_find_get() in other
+ * modules are done without nfnl_lock taken.
+ *
+ * nf_logger_find_get
+ *  nft_log_init
+ *   nf_tables_newexpr
+ *    nf_tables_newrule		// nc->call_batch
+ *				// called with nfnl_lock taken
+ *
+ *     nfnetlink_rcv_batch	// takes nfnl_lock(NFNL_SUBSYS_NFTABLES)
+ *      nfnetlink_rcv
+ *       netlink_unicast
+ *        netlink_sendmsg
+ */
+static int nf_logger_find_get_lock(int pf, enum nf_log_type type)
+{
+	struct nf_logger *logger;
+	int ret = 0;
+
+	logger = loggers[pf][type];
+	if (logger == NULL) {
+		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+		request_module("nf-logger-%u-%u", pf, type);
+		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		ret = -EAGAIN;
+	}
+
+	rcu_read_lock();
+	logger = rcu_dereference(loggers[pf][type]);
+
+	if ((logger == NULL) ||
+	    ((ret == 0) && !try_module_get(logger->me))) {
+		ret = -ENOENT;
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
 static int nft_log_init(const struct nft_ctx *ctx,
 			const struct nft_expr *expr,
 			const struct nlattr * const tb[])
@@ -99,11 +142,11 @@  static int nft_log_init(const struct nft_ctx *ctx,
 	}
 
 	if (ctx->afi->family == NFPROTO_INET) {
-		ret = nf_logger_find_get(NFPROTO_IPV4, li->type);
+		ret = nf_logger_find_get_lock(NFPROTO_IPV4, li->type);
 		if (ret < 0)
 			return ret;
 
-		ret = nf_logger_find_get(NFPROTO_IPV6, li->type);
+		ret = nf_logger_find_get_lock(NFPROTO_IPV6, li->type);
 		if (ret < 0) {
 			nf_logger_put(NFPROTO_IPV4, li->type);
 			return ret;
@@ -111,7 +154,7 @@  static int nft_log_init(const struct nft_ctx *ctx,
 		return 0;
 	}
 
-	return nf_logger_find_get(ctx->afi->family, li->type);
+	return nf_logger_find_get_lock(ctx->afi->family, li->type);
 }
 
 static void nft_log_destroy(const struct nft_ctx *ctx,
diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c
index eb31d73e31ebf..89059468e0f80 100644
--- a/net/netfilter/xt_LOG.c
+++ b/net/netfilter/xt_LOG.c
@@ -61,7 +61,7 @@  static int log_tg_check(const struct xt_tgchk_param *par)
 		return -EINVAL;
 	}
 
-	return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG);
+	return nf_logger_find_get_nolock(par->family, NF_LOG_TYPE_LOG);
 }
 
 static void log_tg_destroy(const struct xt_tgdtor_param *par)
diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c
index 858d189a13031..d0e77eea6b829 100644
--- a/net/netfilter/xt_TRACE.c
+++ b/net/netfilter/xt_TRACE.c
@@ -13,7 +13,7 @@  MODULE_ALIAS("ip6t_TRACE");
 
 static int trace_tg_check(const struct xt_tgchk_param *par)
 {
-	return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG);
+	return nf_logger_find_get_nolock(par->family, NF_LOG_TYPE_LOG);
 }
 
 static void trace_tg_destroy(const struct xt_tgdtor_param *par)

Comments

Andrey Ryabinin July 17, 2020, 3:09 p.m.
On 7/17/20 5:37 PM, Konstantin Khorenko wrote:
> There could be a deadlock:
> 
> 1. "nft" process calls request_module() with
> nfnl_lock(NFNL_SUBSYS_NFTABLES) taken and waits for its completion:
> 
> [UN]  PID: 7391   TASK: ffff939e154bc000  COMMAND: "nft"
> wait_for_completion_killable
>  __call_usermodehelper_exec
>   __request_module
>    nf_logger_find_get
>     nft_log_init		// ops->init()
>      nf_tables_newexpr
>       nf_tables_newrule		// called with nfnl_lock taken
>        nfnetlink_rcv_batch	// takes nfnl_lock
>         nfnetlink_rcv
>          netlink_unicast
>           netlink_sendmsg
> 
> 2. request_module() tries to load the module, but waits on "net_mutex":
> 
> [UN]  PID: 8913   TASK: ffff939e140ec000  COMMAND: "modprobe"
> __mutex_lock_slowpath
>  mutex_lock			// mutex_lock(&net_mutex);
>  (owner = 0xffff939e315f2000 == PID: 158  COMMAND: "kworker/u64:1")
>    register_pernet_subsys
>     init_module
>      do_one_initcall
>       load_module
>        sys_init_module
> 
> 3. "kworker" holds net_mutex, but itself waits for nfnl_lock:
> 
> [UN]  PID: 158    TASK: ffff939e315f2000  COMMAND: "kworker/u64:1"
> __mutex_lock_slowpath
>  mutex_lock			// nfnl_lock(NFNL_SUBSYS_NFTABLES)
>  (owner = 0xffff939e154bc000  PID: 7391  COMMAND: "nft")
>   nfnl_lock
>    nft_unregister_afinfo
>     nf_tables_inet_exit_net
>      ops_exit_list
>       cleanup_net
>        process_one_work
> 
> Reproducer:
> ===========
> [console 1]: export i=0; while true; do
> 	nft add table filter; \
> 	nft add chain filter input; \
> 	nft add rule filter input tcp dport 23 ct state new \
> 		log prefix \"Connection to port 23:\" accept; \
> 	nft flush ruleset; \
> 	rmmod nft_log; rmmod nf_log_ipv4; rmmod nf_log_common; \
> 	i=$(($i+1)); echo $i; \
> done
> 
> [console 2]: modprobe nf_tables; \
> 	export j=0; while true; do \
> 		ip net add n1; \
> 		ip net del n1; \
> 		j=$(($j + 1)); echo $j; \
> 	done
> 
> In mainstream this sutiation is impossible, it's fixed as a side effect
> by: f102d66b335a4 ("netfilter: nf_tables: use dedicated mutex to guard
> transactions"), but backporting it is quite heavy,
> 
> so let's just drop/reacquire nfnl lock around request_module() like it's
> done in similar places.
> 
> https://jira.sw.ru/browse/PSBM-105534
> 
> Signed-off-by: Konstantin Khorenko <khorenko@virtuozzo.com>
> 

Reviewed-by: Andrey Ryabinin <aryabinin@virtuozzo.com>