[RHEL7,COMMIT] ms/netfilter: kill the fake untracked conntrack objects

Submitted by Konstantin Khorenko on Nov. 28, 2017, 10:45 a.m.

Details

Message ID 201711281045.vASAj4r2001312@finist_ce7.work
State New
Series "netfilter: kill the fake untracked conntrack objects"
Headers show

Commit Message

Konstantin Khorenko Nov. 28, 2017, 10:45 a.m.
The commit is pushed to "branch-rh7-3.10.0-693.1.1.vz7.37.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-693.1.1.vz7.37.30
------>
commit a33ee1b2c9b716b7d7dfeb312430b937cb3edfc9
Author: Stanislav Kinsburskiy <skinsbursky@virtuozzo.com>
Date:   Tue Nov 28 13:45:04 2017 +0300

    ms/netfilter: kill the fake untracked conntrack objects
    
    ms commit: cc41c84 ("netfilter: kill the fake untracked conntrack objects")
    
    resurrect an old patch from Pablo Neira to remove the untracked objects.
    
    Currently, there are four possible states of an skb wrt. conntrack.
    
    1. No conntrack attached, ct is NULL.
    2. Normal (kmem cache allocated) ct attached.
    3. a template (kmalloc'd), not in any hash tables at any point in time
    4. the 'untracked' conntrack, a percpu nf_conn object, tagged via
    IPS_UNTRACKED_BIT in ct->status.
    
    Untracked is supposed to be identical to case 1.  It exists only
    so users can check
    
    -m conntrack --ctstate UNTRACKED vs.
    -m conntrack --ctstate INVALID
    
    e.g. attempts to set connmark on INVALID or UNTRACKED conntracks is
    supposed to be a no-op.
    
    Thus currently we need to check
    ct == NULL || nf_ct_is_untracked(ct)
    
    in a lot of places in order to avoid altering untracked objects.
    
    The other consequence of the percpu untracked object is that all
    -j NOTRACK (and, later, kfree_skb of such skbs) result in an atomic op
    (inc/dec the untracked conntracks refcount).
    
    This adds a new kernel-private ctinfo state, IP_CT_UNTRACKED, to
    make the distinction instead.
    
    The (few) places that care about packet invalid (ct is NULL) vs.
    packet untracked now need to test ct == NULL vs. ctinfo ==
    IP_CT_UNTRACKED,
    but all other places can omit the nf_ct_is_untracked() check.
    
    https://jira.sw.ru/browse/PSBM-71153
    
    Signed-off-by: Florian Westphal <fw@strlen.de>
    Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
    Signed-off-by: Stanislav Kinsburskiy <skinsbursky@virtuozzo.com>
---
 include/net/ip_vs.h                                |  5 ++-
 include/net/netfilter/nf_conntrack.h               | 10 +-----
 include/uapi/linux/netfilter/nf_conntrack_common.h |  6 ++--
 net/ipv4/netfilter/nf_dup_ipv4.c                   |  5 ++-
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c     |  5 ++-
 net/ipv6/netfilter/nf_dup_ipv6.c                   |  5 ++-
 net/netfilter/nf_conntrack_core.c                  | 42 +++-------------------
 net/netfilter/nf_nat_core.c                        |  3 --
 net/netfilter/xt_CT.c                              | 21 +++++------
 net/netfilter/xt_conntrack.c                       | 13 ++++---
 net/netfilter/xt_state.c                           | 13 ++++---
 11 files changed, 41 insertions(+), 87 deletions(-)

Patch hide | download patch | download mbox

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index e1599ad..69d2a9a 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1540,9 +1540,8 @@  static inline void ip_vs_notrack(struct sk_buff *skb)
 
 	if (!ct || !nf_ct_is_untracked(ct)) {
 		nf_conntrack_put(skb->nfct);
-		skb->nfct = &nf_ct_untracked_get()->ct_general;
-		skb->nfctinfo = IP_CT_NEW;
-		nf_conntrack_get(skb->nfct);
+		skb->nfct = NULL;
+		skb->nfctinfo = IP_CT_UNTRACKED;
 	}
 #endif
 }
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index edb8911..b278973 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -232,14 +232,6 @@  extern s32 (*nf_ct_nat_offset)(const struct nf_conn *ct,
 			       enum ip_conntrack_dir dir,
 			       u32 seq);
 
-/* Fake conntrack entry for untracked connections */
-DECLARE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
-static inline struct nf_conn *nf_ct_untracked_get(void)
-{
-	return &__raw_get_cpu_var(nf_conntrack_untracked);
-}
-void nf_ct_untracked_status_or(unsigned long bits);
-
 /* Iterate over all conntracks: if iter returns true, it's deleted. */
 void nf_ct_iterate_cleanup(struct net *net,
 			   int (*iter)(struct nf_conn *i, void *data),
@@ -272,7 +264,7 @@  static inline int nf_ct_is_dying(struct nf_conn *ct)
 
 static inline int nf_ct_is_untracked(const struct nf_conn *ct)
 {
-	return test_bit(IPS_UNTRACKED_BIT, &ct->status);
+	return false;
 }
 
 /* Packet is received from loopback */
diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
index 6d074d1..6c18960 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -28,12 +28,14 @@  enum ip_conntrack_info {
 	/* only for userspace compatibility */
 #ifndef __KERNEL__
 	IP_CT_NEW_REPLY = IP_CT_NUMBER,
+#else
+	IP_CT_UNTRACKED = 7,
 #endif
 };
 
 #define NF_CT_STATE_INVALID_BIT			(1 << 0)
 #define NF_CT_STATE_BIT(ctinfo)			(1 << ((ctinfo) % IP_CT_IS_REPLY + 1))
-#define NF_CT_STATE_UNTRACKED_BIT		(1 << (IP_CT_NUMBER + 1))
+#define NF_CT_STATE_UNTRACKED_BIT		(1 << (IP_CT_UNTRACKED + 1))
 
 /* Bitset representing status of connection. */
 enum ip_conntrack_status {
@@ -90,7 +92,7 @@  enum ip_conntrack_status {
 	IPS_TEMPLATE_BIT = 11,
 	IPS_TEMPLATE = (1 << IPS_TEMPLATE_BIT),
 
-	/* Conntrack is a fake untracked entry */
+	/* Conntrack is a fake untracked entry.  Obsolete and not used anymore */
 	IPS_UNTRACKED_BIT = 12,
 	IPS_UNTRACKED = (1 << IPS_UNTRACKED_BIT),
 
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index 2d79e6e..3d51492 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -84,9 +84,8 @@  void nf_dup_ipv4(struct sk_buff *skb, unsigned int hooknum,
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 	/* Avoid counting cloned packets towards the original connection. */
 	nf_conntrack_put(skb->nfct);
-	skb->nfct     = &nf_ct_untracked_get()->ct_general;
-	skb->nfctinfo = IP_CT_NEW;
-	nf_conntrack_get(skb->nfct);
+	skb->nfct     = NULL;
+	skb->nfctinfo = IP_CT_UNTRACKED;
 #endif
 	/*
 	 * If we are in PREROUTING/INPUT, the checksum must be recalculated
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 148eab5..880d669 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -221,9 +221,8 @@  icmpv6_error(struct net *net, struct nf_conn *tmpl,
 	type = icmp6h->icmp6_type - 130;
 	if (type >= 0 && type < sizeof(noct_valid_new) &&
 	    noct_valid_new[type]) {
-		skb->nfct = &nf_ct_untracked_get()->ct_general;
-		skb->nfctinfo = IP_CT_NEW;
-		nf_conntrack_get(skb->nfct);
+		skb->nfct = NULL;
+		skb->nfctinfo = IP_CT_UNTRACKED;
 		return NF_ACCEPT;
 	}
 
diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c
index d2fc3b5..47379d1 100644
--- a/net/ipv6/netfilter/nf_dup_ipv6.c
+++ b/net/ipv6/netfilter/nf_dup_ipv6.c
@@ -73,9 +73,8 @@  void nf_dup_ipv6(struct sk_buff *skb, unsigned int hooknum,
 
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 	nf_conntrack_put(skb->nfct);
-	skb->nfct     = &nf_ct_untracked_get()->ct_general;
-	skb->nfctinfo = IP_CT_NEW;
-	nf_conntrack_get(skb->nfct);
+	skb->nfct     = NULL;
+	skb->nfctinfo = IP_CT_UNTRACKED;
 #endif
 	if (hooknum == NF_INET_PRE_ROUTING ||
 	    hooknum == NF_INET_LOCAL_IN) {
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index ab04166..37ba305 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -125,9 +125,6 @@  static void nf_conntrack_all_unlock(void)
 unsigned int nf_conntrack_htable_size __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 
-DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
-EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
-
 unsigned int nf_conntrack_hash_rnd __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd);
 
@@ -1143,10 +1140,11 @@  nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 	int set_reply = 0;
 	int ret;
 
-	if (skb->nfct) {
+	if (skb->nfct || skb->nfctinfo == IP_CT_UNTRACKED) {
 		/* Previously seen (loopback or untracked)?  Ignore. */
 		tmpl = (struct nf_conn *)skb->nfct;
-		if (!nf_ct_is_template(tmpl)) {
+		if ((tmpl && !nf_ct_is_template(tmpl)) ||
+		    skb->nfctinfo == IP_CT_UNTRACKED) {
 			NF_CT_STAT_INC_ATOMIC(net, ignore);
 			return NF_ACCEPT;
 		}
@@ -1520,18 +1518,6 @@  static void nf_ct_release_dying_list(struct net *net)
 	}
 }
 
-static int untrack_refs(void)
-{
-	int cnt = 0, cpu;
-
-	for_each_possible_cpu(cpu) {
-		struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
-
-		cnt += atomic_read(&ct->ct_general.use) - 1;
-	}
-	return cnt;
-}
-
 void nf_conntrack_cleanup_start(void)
 {
 	RCU_INIT_POINTER(ip_ct_attach, NULL);
@@ -1540,8 +1526,6 @@  void nf_conntrack_cleanup_start(void)
 void nf_conntrack_cleanup_end(void)
 {
 	RCU_INIT_POINTER(nf_ct_destroy, NULL);
-	while (untrack_refs() > 0)
-		schedule();
 
 #ifdef CONFIG_NF_CONNTRACK_ZONES
 	nf_ct_extend_unregister(&nf_ct_zone_extend);
@@ -1695,19 +1679,10 @@  EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
 module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
 		  &nf_conntrack_htable_size, 0600);
 
-void nf_ct_untracked_status_or(unsigned long bits)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		per_cpu(nf_conntrack_untracked, cpu).status |= bits;
-}
-EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
-
 int nf_conntrack_init_start(void)
 {
 	int max_factor = 8;
-	int i, ret, cpu;
+	int i, ret;
 
 	for (i = 0; i < CONNTRACK_LOCKS; i++)
 		spin_lock_init(&nf_conntrack_locks[i]);
@@ -1781,14 +1756,6 @@  int nf_conntrack_init_start(void)
 	if (ret < 0)
 		goto err_proto;
 
-	/* Set up fake conntrack: to never be deleted, not in any hashes */
-	for_each_possible_cpu(cpu) {
-		struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
-		write_pnet(&ct->ct_net, &init_net);
-		atomic_set(&ct->ct_general.use, 1);
-	}
-	/*  - and look it like as a confirmed connection */
-	nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
 	return 0;
 
 err_proto:
@@ -1834,6 +1801,7 @@  int nf_conntrack_init_net(struct net *net)
 	int ret = -ENOMEM;
 	int cpu;
 
+	BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
 	atomic_set(&net->ct.count, 0);
 	net->ct.max = init_net.ct.max;
 	seqcount_init(&net->ct.generation);
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 899167d..39ca516 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -881,9 +881,6 @@  static int __init nf_nat_init(void)
 
 	nf_ct_helper_expectfn_register(&follow_master_nat);
 
-	/* Initialize fake conntrack so that NAT will skip it */
-	nf_ct_untracked_status_or(IPS_NAT_DONE_MASK);
-
 	BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
 	RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook,
 			   nfnetlink_parse_nat_setup);
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 7bff162..999fb47 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -26,12 +26,14 @@  static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct)
 	if (skb->nfct != NULL)
 		return XT_CONTINUE;
 
-	/* special case the untracked ct : we want the percpu object */
-	if (!ct)
-		ct = nf_ct_untracked_get();
-	atomic_inc(&ct->ct_general.use);
-	skb->nfct = &ct->ct_general;
-	skb->nfctinfo = IP_CT_NEW;
+	if (ct) {
+		skb->nfct = &ct->ct_general;
+		skb->nfctinfo = IP_CT_NEW;
+		atomic_inc(&ct->ct_general.use);
+	} else {
+		skb->nfct = NULL;
+		skb->nfctinfo = IP_CT_UNTRACKED;
+	}
 
 	return XT_CONTINUE;
 }
@@ -332,7 +334,7 @@  static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par,
 	struct nf_conn *ct = info->ct;
 	struct nf_conn_help *help;
 
-	if (ct && !nf_ct_is_untracked(ct)) {
+	if (ct) {
 		help = nfct_help(ct);
 		if (help)
 			module_put(help->helper->me);
@@ -406,9 +408,8 @@  notrack_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	if (skb->nfct != NULL)
 		return XT_CONTINUE;
 
-	skb->nfct = &nf_ct_untracked_get()->ct_general;
-	skb->nfctinfo = IP_CT_NEW;
-	nf_conntrack_get(skb->nfct);
+	skb->nfct = NULL;
+	skb->nfctinfo = IP_CT_UNTRACKED;
 
 	return XT_CONTINUE;
 }
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index 4ad1a24..4329605 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -131,7 +131,7 @@  conntrack_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
 
 #define FWINV(bool, invflg) ((bool) ^ !!(sinfo->invflags & (invflg)))
 
-	if (ct == &nf_conntrack_untracked)
+	if (ctinfo == IP_CT_UNTRACKED)
 		statebit = XT_CONNTRACK_STATE_UNTRACKED;
 	else if (ct)
 		statebit = XT_CONNTRACK_STATE_BIT(ctinfo);
@@ -261,12 +261,11 @@  conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
 
 	ct = nf_ct_get(skb, &ctinfo);
 
-	if (ct) {
-		if (nf_ct_is_untracked(ct))
-			statebit = XT_CONNTRACK_STATE_UNTRACKED;
-		else
-			statebit = XT_CONNTRACK_STATE_BIT(ctinfo);
-	} else
+	if (ct)
+		statebit = XT_CONNTRACK_STATE_BIT(ctinfo);
+	else if (ctinfo == IP_CT_UNTRACKED)
+		statebit = XT_CONNTRACK_STATE_UNTRACKED;
+	else
 		statebit = XT_CONNTRACK_STATE_INVALID;
 
 	if (info->match_flags & XT_CONNTRACK_STATE) {
diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c
index eb5a50d..5a11bc7 100644
--- a/net/netfilter/xt_state.c
+++ b/net/netfilter/xt_state.c
@@ -28,14 +28,13 @@  state_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	unsigned int statebit;
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
 
-	if (!ct)
+	if (ct)
+		statebit = XT_STATE_BIT(ctinfo);
+	else if (ctinfo == IP_CT_UNTRACKED)
+		statebit = XT_STATE_UNTRACKED;
+	else
 		statebit = XT_STATE_INVALID;
-	else {
-		if (nf_ct_is_untracked(ct))
-			statebit = XT_STATE_UNTRACKED;
-		else
-			statebit = XT_STATE_BIT(ctinfo);
-	}
+
 	return (sinfo->statemask & statebit);
 }