Message ID | 20200716123731.9451-1-khorenko@virtuozzo.com |
---|---|
State | New |
Series | "netfilter: drop/reaquire nfnl_lock on request_module() in nft_log module" |
Headers | show |
=========== [console 1]: export i=0; while true; do nft add table filter; \ nft add chain filter input; \ nft add rule filter input tcp dport 23 ct state new \ log prefix \"Connection to port 23:\" accept; \ nft flush ruleset; \ rmmod nft_log; rmmod nf_log_ipv4; rmmod nf_log_common; \ i=$(($i+1)); echo $i; \ done [console 2]: modprobe nf_tables; \ export j=0; while true; do \ ip net add n1; \ ip net del n1; \ j=$(($j + 1)); echo $j; \ done In mainstream this sutiation is impossible, it's fixed as a side effect by: f102d66b335a4 ("netfilter: nf_tables: use dedicated mutex to guard transactions"), but backporting it is quite heavy, so let's just drop/reacquire nfnl lock around request_module() like it's done in similar places. https://jira.sw.ru/browse/PSBM-105534 Signed-off-by: Konstantin Khorenko <khorenko@virtuozzo.com> --- include/net/netfilter/nf_log.h | 2 +- net/netfilter/nf_log.c | 8 ++++--- net/netfilter/nft_log.c | 49 +++++++++++++++++++++++++++++++++++++++--- net/netfilter/xt_LOG.c | 2 +- net/netfilter/xt_TRACE.c | 2 +- 5 files changed, 54 insertions(+), 9 deletions(-) diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index 4b1b4c44e537b..2686caac8080e 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -64,7 +64,7 @@ int nf_log_bind_pf(struct net *net, u_int8_t pf, const struct nf_logger *logger); void nf_log_unbind_pf(struct net *net, u_int8_t pf); -int nf_logger_find_get(int pf, enum nf_log_type type); +int nf_logger_find_get_nolock(int pf, enum nf_log_type type); void nf_logger_put(int pf, enum nf_log_type type); void nf_logger_request_module(int pf, enum nf_log_type type); diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index ff07ee54bbdd3..dd34fd081a89d 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -19,7 +19,9 @@ int sysctl_nf_log_all_netns __read_mostly; EXPORT_SYMBOL(sysctl_nf_log_all_netns); -static struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly; +struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly; +EXPORT_SYMBOL(loggers); + static DEFINE_MUTEX(nf_log_mutex); #define nft_log_dereference(logger) \ @@ -164,7 +166,7 @@ void nf_logger_request_module(int pf, enum nf_log_type type) } EXPORT_SYMBOL_GPL(nf_logger_request_module); -int nf_logger_find_get(int pf, enum nf_log_type type) +int nf_logger_find_get_nolock(int pf, enum nf_log_type type) { struct nf_logger *logger; int ret = -ENOENT; @@ -184,7 +186,7 @@ int nf_logger_find_get(int pf, enum nf_log_type type) rcu_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(nf_logger_find_get); +EXPORT_SYMBOL_GPL(nf_logger_find_get_nolock); void nf_logger_put(int pf, enum nf_log_type type) { diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index d21fe91d50fc0..71e35feb1fe27 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -15,6 +15,7 @@ #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> +#include <linux/netfilter/nfnetlink.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_log.h> #include <linux/netdevice.h> @@ -46,6 +47,48 @@ static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = { [NFTA_LOG_FLAGS] = { .type = NLA_U32 }, }; +extern struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly; + +/* + * In "nft_log" module we need to drop nfnl lock while performing + * request_module(), calls to nf_logger_find_get() in other + * modules are done without nfnl_lock taken. + * + * nf_logger_find_get + * nft_log_init + * nf_tables_newexpr + * nf_tables_newrule // nc->call_batch + * // called with nfnl_lock taken + * + * nfnetlink_rcv_batch // takes nfnl_lock(NFNL_SUBSYS_NFTABLES) + * nfnetlink_rcv + * netlink_unicast + * netlink_sendmsg + */ +static int nf_logger_find_get_lock(int pf, enum nf_log_type type) +{ + struct nf_logger *logger; + int ret = 0; + + logger = loggers[pf][type]; + if (logger == NULL) { + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + request_module("nf-logger-%u-%u", pf, type); + nfnl_lock(NFNL_SUBSYS_NFTABLES); + ret = -EAGAIN; + } + + rcu_read_lock(); + logger = rcu_dereference(loggers[pf][type]); + + if ((logger == NULL) || + ((ret == 0) && !try_module_get(logger->me))) { + ret = -ENOENT; + } + rcu_read_unlock(); + return ret; +} + static int nft_log_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -99,11 +142,11 @@ static int nft_log_init(const struct nft_ctx *ctx, } if (ctx->afi->family == NFPROTO_INET) { - ret = nf_logger_find_get(NFPROTO_IPV4, li->type); + ret = nf_logger_find_get_lock(NFPROTO_IPV4, li->type); if (ret < 0) return ret; - ret = nf_logger_find_get(NFPROTO_IPV6, li->type); + ret = nf_logger_find_get_lock(NFPROTO_IPV6, li->type); if (ret < 0) { nf_logger_put(NFPROTO_IPV4, li->type); return ret; @@ -111,7 +154,7 @@ static int nft_log_init(const struct nft_ctx *ctx, return 0; } - return nf_logger_find_get(ctx->afi->family, li->type); + return nf_logger_find_get_lock(ctx->afi->family, li->type); } static void nft_log_destroy(const struct nft_ctx *ctx, diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c index eb31d73e31ebf..89059468e0f80 100644 --- a/net/netfilter/xt_LOG.c +++ b/net/netfilter/xt_LOG.c @@ -61,7 +61,7 @@ static int log_tg_check(const struct xt_tgchk_param *par) return -EINVAL; } - return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); + return nf_logger_find_get_nolock(par->family, NF_LOG_TYPE_LOG); } static void log_tg_destroy(const struct xt_tgdtor_param *par) diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c index 858d189a13031..d0e77eea6b829 100644 --- a/net/netfilter/xt_TRACE.c +++ b/net/netfilter/xt_TRACE.c @@ -13,7 +13,7 @@ MODULE_ALIAS("ip6t_TRACE"); static int trace_tg_check(const struct xt_tgchk_param *par) { - return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); + return nf_logger_find_get_nolock(par->family, NF_LOG_TYPE_LOG); } static void trace_tg_destroy(const struct xt_tgdtor_param *par)
On 7/16/20 3:37 PM, Konstantin Khorenko wrote: > > +extern struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly; > + > +/* > + * In "nft_log" module we need to drop nfnl lock while performing > + * request_module(), calls to nf_logger_find_get() in other > + * modules are done without nfnl_lock taken. > + * > + * nf_logger_find_get > + * nft_log_init > + * nf_tables_newexpr > + * nf_tables_newrule // nc->call_batch > + * // called with nfnl_lock taken > + * > + * nfnetlink_rcv_batch // takes nfnl_lock(NFNL_SUBSYS_NFTABLES) > + * nfnetlink_rcv > + * netlink_unicast > + * netlink_sendmsg > + */ > +static int nf_logger_find_get_lock(int pf, enum nf_log_type type) > +{ > + struct nf_logger *logger; > + int ret = 0; > + > + logger = loggers[pf][type]; > + if (logger == NULL) { > + nfnl_unlock(NFNL_SUBSYS_NFTABLES); nfnetlink_rcv_batch() takes nfnl_lock(subsys_id) lock How you can be sure that subsys_id is always NFNL_SUBSYS_NFTABLES ? > + request_module("nf-logger-%u-%u", pf, type); > + nfnl_lock(NFNL_SUBSYS_NFTABLES); > + ret = -EAGAIN; > + }
On 07/16/2020 04:55 PM, Andrey Ryabinin wrote: > On 7/16/20 3:37 PM, Konstantin Khorenko wrote: >> >> +extern struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly; >> + >> +/* >> + * In "nft_log" module we need to drop nfnl lock while performing >> + * request_module(), calls to nf_logger_find_get() in other >> + * modules are done without nfnl_lock taken. >> + * >> + * nf_logger_find_get >> + * nft_log_init >> + * nf_tables_newexpr >> + * nf_tables_newrule // nc->call_batch >> + * // called with nfnl_lock taken >> + * >> + * nfnetlink_rcv_batch // takes nfnl_lock(NFNL_SUBSYS_NFTABLES) >> + * nfnetlink_rcv >> + * netlink_unicast >> + * netlink_sendmsg >> + */ >> +static int nf_logger_find_get_lock(int pf, enum nf_log_type type) >> +{ >> + struct nf_logger *logger; >> + int ret = 0; >> + >> + logger = loggers[pf][type]; >> + if (logger == NULL) { >> + nfnl_unlock(NFNL_SUBSYS_NFTABLES); > > nfnetlink_rcv_batch() takes nfnl_lock(subsys_id) lock > How you can be sure that subsys_id is always NFNL_SUBSYS_NFTABLES ? 1) nf_logger_find_get_lock() is called in only 1 place - in nft_log_init() 2) nft_log_init() == nft_log_ops.init() == struct nft_expr_ops::init() struct nft_expr_ops::init() is called only in nf_tables_newexpr() => nft_log_init() is called only in nf_tables_newexpr() 3) nf_tables_newexpr() is called only in nf_tables_newrule() and in nft_expr_init() a) nft_expr_init() is called only in nft_dynset_init() nft_dynset_init() == nft_dynset_ops.init() == struct nft_expr_ops::init() struct nft_expr_ops::init() is called only in nf_tables_newexpr() => nft_dynset_init() is called only in nf_tables_newexpr() => goto 3) b) nf_tables_newrule() is the nf_tables_cb[NFT_MSG_NEWRULE].call_batch() struct nfnl_callback nf_tables_cb[NFT_MSG_MAX]; struct nfnl_callback::call_batch() is called only in nfnetlink_rcv_batch() (nc->call_batch()) => b.1) nf_tables_newrule() is called only in nfnetlink_rcv_batch() b.2) nfnetlink_rcv_batch() acquires nfnl_lock(NFNL_SUBSYS_NFTABLES ???) =========== /* Work around old nft using host byte order */ if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES) res_id = NFNL_SUBSYS_NFTABLES; else res_id = ntohs(nfgenmsg->res_id); nfnetlink_rcv_batch(skb, nlh, res_id); =========== Well, looks like i was not patient enough, i've proven that nfnl_lock() is always taken by above (i mean there is no path nf_logger_find_get() is called without nfnl_lock() is taken), but you are right, it can be not nfnl_lock(NFNL_SUBSYS_NFTABLES) but for some other subsys_id. Thank you for catching this, i will rework the patch. >> + request_module("nf-logger-%u-%u", pf, type); >> + nfnl_lock(NFNL_SUBSYS_NFTABLES); >> + ret = -EAGAIN; >> + } > . >
On 07/16/2020 07:47 PM, Konstantin Khorenko wrote: > On 07/16/2020 04:55 PM, Andrey Ryabinin wrote: >> On 7/16/20 3:37 PM, Konstantin Khorenko wrote: >>> >>> +extern struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly; >>> + >>> +/* >>> + * In "nft_log" module we need to drop nfnl lock while performing >>> + * request_module(), calls to nf_logger_find_get() in other >>> + * modules are done without nfnl_lock taken. >>> + * >>> + * nf_logger_find_get >>> + * nft_log_init >>> + * nf_tables_newexpr >>> + * nf_tables_newrule // nc->call_batch >>> + * // called with nfnl_lock taken >>> + * >>> + * nfnetlink_rcv_batch // takes nfnl_lock(NFNL_SUBSYS_NFTABLES) >>> + * nfnetlink_rcv >>> + * netlink_unicast >>> + * netlink_sendmsg >>> + */ >>> +static int nf_logger_find_get_lock(int pf, enum nf_log_type type) >>> +{ >>> + struct nf_logger *logger; >>> + int ret = 0; >>> + >>> + logger = loggers[pf][type]; >>> + if (logger == NULL) { >>> + nfnl_unlock(NFNL_SUBSYS_NFTABLES); >> >> nfnetlink_rcv_batch() takes nfnl_lock(subsys_id) lock >> How you can be sure that subsys_id is always NFNL_SUBSYS_NFTABLES ? > > 1) nf_logger_find_get_lock() is called in only 1 place - in nft_log_init() > 2) nft_log_init() == nft_log_ops.init() == struct nft_expr_ops::init() > struct nft_expr_ops::init() is called only in nf_tables_newexpr() > => nft_log_init() is called only in nf_tables_newexpr() > > 3) nf_tables_newexpr() is called only in nf_tables_newrule() and in nft_expr_init() > > a) nft_expr_init() is called only in nft_dynset_init() > nft_dynset_init() == nft_dynset_ops.init() == struct nft_expr_ops::init() > > struct nft_expr_ops::init() is called only in nf_tables_newexpr() > => nft_dynset_init() is called only in nf_tables_newexpr() > => goto 3) > > b) nf_tables_newrule() is the nf_tables_cb[NFT_MSG_NEWRULE].call_batch() > struct nfnl_callback nf_tables_cb[NFT_MSG_MAX]; > > struct nfnl_callback::call_batch() is called only in nfnetlink_rcv_batch() (nc->call_batch()) > => > b.1) nf_tables_newrule() is called only in nfnetlink_rcv_batch() > b.2) nfnetlink_rcv_batch() acquires nfnl_lock(NFNL_SUBSYS_NFTABLES ???) > =========== > /* Work around old nft using host byte order */ > if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES) > res_id = NFNL_SUBSYS_NFTABLES; > else > res_id = ntohs(nfgenmsg->res_id); > nfnetlink_rcv_batch(skb, nlh, res_id); > =========== > > Well, looks like i was not patient enough, > i've proven that nfnl_lock() is always taken by above > (i mean there is no path nf_logger_find_get() is called without nfnl_lock() is taken), > but you are right, it can be not nfnl_lock(NFNL_SUBSYS_NFTABLES) but for some other subsys_id. > > Thank you for catching this, i will rework the patch. Nope, i've recalled my checks! static const struct nfnetlink_subsystem nf_tables_subsys = { .name = "nf_tables", .subsys_id = NFNL_SUBSYS_NFTABLES, .cb = nf_tables_cb, ... static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { [NFT_MSG_NEWRULE] = { .call_batch = nf_tables_newrule, }, ... nfnetlink_find_client(u_int16_t type, const struct nfnetlink_subsystem *ss) { u_int8_t cb_id = NFNL_MSG_TYPE(type); return &ss->cb[cb_id]; } nfnetlink_rcv_batch(subsys_id) { ... nfnl_lock(subsys_id); ss = rcu_dereference_protected(table[subsys_id].subsys, ... ... /* We only accept a batch with messages for the same * subsystem. */ if (NFNL_SUBSYS_ID(type) != subsys_id) { err = -EINVAL; goto ack; } nc = nfnetlink_find_client(type, ss); ... nc->call_batch() If nc->call_batch() == nf_tables_newrule(), it means nc == nf_tables_cb[NFT_MSG_NEWRULE] nc == &ss->cb[cb_id] => ss == nf_tables_subsys and we know that nf_tables_subsys.subsys_id = NFNL_SUBSYS_NFTABLES static const struct nfnetlink_subsystem nf_tables_subsys = { .name = "nf_tables", .subsys_id = NFNL_SUBSYS_NFTABLES, .cb = nf_tables_cb, ... Proven? >>> + request_module("nf-logger-%u-%u", pf, type); >>> + nfnl_lock(NFNL_SUBSYS_NFTABLES); >>> + ret = -EAGAIN; >>> + } >> . >>
On 7/16/20 3:37 PM, Konstantin Khorenko wrote: > +static int nf_logger_find_get_lock(int pf, enum nf_log_type type) > +{ > + struct nf_logger *logger; > + int ret = 0; > + > + logger = loggers[pf][type]; > + if (logger == NULL) { > + nfnl_unlock(NFNL_SUBSYS_NFTABLES); > + request_module("nf-logger-%u-%u", pf, type); > + nfnl_lock(NFNL_SUBSYS_NFTABLES); > + ret = -EAGAIN; nfnetlink_rcv_batch() has special error path for EAGAIN: /* The lock was released to autoload some module, we * have to abort and start from scratch using the * original skb. */ if (err == -EAGAIN) { status |= NFNL_BATCH_REPLAY; goto done; } Yet, nf_logger_find_get_lock() don't return immediately with EAGAIN, but continues and potentially replaces EAGAIN with ENOENT. Why? > + } > + > + rcu_read_lock(); > + logger = rcu_dereference(loggers[pf][type]); > + > + if ((logger == NULL) || > + ((ret == 0) && !try_module_get(logger->me))) { > + ret = -ENOENT; > + } > + rcu_read_unlock(); > + return ret; > +} > +
On 7/16/20 8:21 PM, Konstantin Khorenko wrote: > If nc->call_batch() == nf_tables_newrule(), it means > nc == nf_tables_cb[NFT_MSG_NEWRULE] > > nc == &ss->cb[cb_id] > => ss == nf_tables_subsys > > and we know that nf_tables_subsys.subsys_id = NFNL_SUBSYS_NFTABLES > > > static const struct nfnetlink_subsystem nf_tables_subsys = { > .name = "nf_tables", > .subsys_id = NFNL_SUBSYS_NFTABLES, > .cb = nf_tables_cb, > ... > > > Proven? > Seems ok. FTR I was thinking about 'request_module_nowait(); return -EAGAIN;' as an alternative.
On 07/17/2020 03:39 PM, Andrey Ryabinin wrote: > > > On 7/16/20 3:37 PM, Konstantin Khorenko wrote: >> +static int nf_logger_find_get_lock(int pf, enum nf_log_type type) >> +{ >> + struct nf_logger *logger; >> + int ret = 0; >> + >> + logger = loggers[pf][type]; >> + if (logger == NULL) { >> + nfnl_unlock(NFNL_SUBSYS_NFTABLES); >> + request_module("nf-logger-%u-%u", pf, type); >> + nfnl_lock(NFNL_SUBSYS_NFTABLES); >> + ret = -EAGAIN; > > nfnetlink_rcv_batch() has special error path for EAGAIN: > /* The lock was released to autoload some module, we > * have to abort and start from scratch using the > * original skb. > */ > if (err == -EAGAIN) { > status |= NFNL_BATCH_REPLAY; > goto done; > } > > > Yet, nf_logger_find_get_lock() don't return immediately with EAGAIN, but continues and potentially replaces EAGAIN with ENOENT. > Why? Well, this is the original logic, i did not change it. The idea is: * if loggers[pf][type] is not registered, try to load the appropriate module // the module load could fail, we keep it in mind * next check loggers[pf][type] once again - and if it's empty again after our attempt to load the module (and thus to register desired loggers[pf][type]), then we don't need to return -EAGAIN, because - next module load attempt could also fail to register loggers[pf][type] and we get in a loop - we just don't want to try loading module more than once => in case loggers[pf][type] is still empty after single module load attempt => something is deadly wrong, return -ENOENT, no need to probe it once more - another case - try_module_get() fails - we also don't want to retry in this case >> + } >> + >> + rcu_read_lock(); >> + logger = rcu_dereference(loggers[pf][type]); >> + >> + if ((logger == NULL) || >> + ((ret == 0) && !try_module_get(logger->me))) { >> + ret = -ENOENT; >> + } >> + rcu_read_unlock(); >> + return ret; >> +} >> + > . >
On 07/17/2020 03:41 PM, Andrey Ryabinin wrote: > > > On 7/16/20 8:21 PM, Konstantin Khorenko wrote: > >> If nc->call_batch() == nf_tables_newrule(), it means >> nc == nf_tables_cb[NFT_MSG_NEWRULE] >> >> nc == &ss->cb[cb_id] >> => ss == nf_tables_subsys >> >> and we know that nf_tables_subsys.subsys_id = NFNL_SUBSYS_NFTABLES >> >> >> static const struct nfnetlink_subsystem nf_tables_subsys = { >> .name = "nf_tables", >> .subsys_id = NFNL_SUBSYS_NFTABLES, >> .cb = nf_tables_cb, >> ... >> >> >> Proven? >> > > Seems ok. > FTR I was thinking about 'request_module_nowait(); return -EAGAIN;' as an alternative. Ok, but how do we know when we can try checking again - and expect the loggers[pf][type] has been already registered? i mean, it's possible the handle -EAGAIN in netlink code and return again into nf_logger_find_get_lock() before modprobe loads the module we requested and thus loggers[pf][type] will be again empty.