==================================
Known issue:
There are places (most of cases) where dst is changed in
NF_INET_LOCAL_OUT hook, but our venet hook goes with NF_IP_PRI_LAST
priority and no other hook uses that priority.
So this is fine.
Nevertheless there are cases when dst is changed after this case, for
example in IPSec:
iptable_nat_ipv4_out()
nf_nat_ipv4_out()
nf_xfrm_me_harder()
{
...
dst = xfrm_lookup(net, dst, &fl, sk, 0);
if (IS_ERR(dst))
return PTR_ERR(dst);
skb_dst_drop(skb);
skb_dst_set(skb, dst);
...
}
{
.hook = iptable_nat_ipv4_out,
.owner = THIS_MODULE,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_NAT_SRC,
},
And dst is changed in some exotic cases even in some *_xmit() functions.
Well, may be for the sake of fairness we better move the vznetstat
accounting (and setting mark) code to venet_xmit()/veth_xmit() functions
as well, but honestly i do not make such big changes in vz7.
We live with this kind of accounting for a long time and it works well
for most usecases.
So i'd leave setting mark on the same place along with accounting.
Fixes: f4943221d710 ("vznetstat: Clear skb->mark on entering to VE's IP
stack")
https://jira.sw.ru/browse/PSBM-122082
Signed-off-by: Konstantin Khorenko <khorenko@virtuozzo.com>
Changes:
v2: * extented both comments in the code and commit message
* added similar hunks for ip6 handling
---
drivers/net/venetdev.c | 14 ++++++++++++++
drivers/net/veth.c | 9 +++++++++
kernel/ve/vznetstat/ip6_vznetstat.c | 20 ++++++++++++++++++++
kernel/ve/vznetstat/ip_vznetstat.c | 20 ++++++++++++++++++++
kernel/ve/vznetstat/vznetstat.c | 8 +++++---
5 files changed, 68 insertions(+), 3 deletions(-)
@@ -464,6 +464,20 @@ static int venet_xmit(struct sk_buff *skb, struct net_device *dev)
stats = venet_stats(dev, smp_processor_id());
ve = dev_net(dev)->owner_ve;
+ /*
+ * vzctl configures tc on Host for shaping basing on skb->marks
+ * which are set by venet_acct_mark() basing on VE src and dst
+ * ip address, thus vz specific marks are needed on Host only.
+ *
+ * On the other hand someone might set own marks inside a
+ * Container with xt_mark funtionality, thus vz specific marks
+ * are unexpected and undesirable inside a Container.
+ *
+ * => if the packet goes in VE0 -> VE direction, drop the skb
+ * mark used for vz traffic shaping.
+ */
+ if (ve_is_super(ve))
+ skb->mark = 0;
if (skb->protocol == __constant_htons(ETH_P_IP)) {
struct iphdr *iph;
@@ -149,6 +149,15 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
goto drop;
}
+ /*
+ * If skb is going from VE0 to VE via veth,
+ * drop the vz specific mark used for traffic shaping.
+ * See comment in venet_xmit() for details.
+ */
+ if (dev->features & NETIF_F_VENET &&
+ ve_is_super(dev_net(dev)->owner_ve))
+ skb->mark = 0;
+
if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
@@ -34,6 +34,17 @@ venet_acct_in_hook_v6(const struct nf_hook_ops *hook,
if (in->flags & IFF_LOOPBACK)
goto out;
+ /*
+ * For VE: let's account only traffic which crosses the coundary
+ * VE0->VE.
+ *
+ * For VE0: let's account all the traffic as it always was.
+ * There can be extra accounting in case of traffic between
+ * Docker Containers in VE0.
+ */
+ if (!ve_is_super(in->nd_net->owner_ve) &&
+ !(in->features & NETIF_F_VENET))
+ goto out;
venet_acct_classify_add_incoming(in->nd_net->owner_ve->stat, skb);
out:
@@ -54,6 +65,15 @@ venet_acct_out_hook_v6(const struct nf_hook_ops *hook,
goto out;
if (dst && (dst->dev->flags & IFF_LOOPBACK))
goto out;
+ /*
+ * Don't account and mark VE traffic if it does not cross VE/VE0
+ * boundary.
+ * For VE0 there can be extra accounting in case of traffic
+ * between Docker Containers in VE0.
+ */
+ if (!(ve_is_super(out->nd_net->owner_ve)) &&
+ dst && !(dst->dev->features & NETIF_F_VENET))
+ goto out;
skb->protocol = __constant_htons(ETH_P_IPV6);
venet_acct_classify_add_outgoing(out->nd_net->owner_ve->stat, skb);
@@ -43,6 +43,17 @@ static unsigned int venet_acct_in_hook(const struct nf_hook_ops *hook,
/* Skip loopback dev */
if (in == dev_net(in)->loopback_dev)
goto out;
+ /*
+ * For VE: let's account only traffic which crosses the coundary
+ * VE0->VE.
+ *
+ * For VE0: let's account all the traffic as it always was.
+ * There can be extra accounting in case of traffic between
+ * Docker Containers in VE0.
+ */
+ if (!ve_is_super(in->nd_net->owner_ve) &&
+ !(in->features & NETIF_F_VENET))
+ goto out;
#if VZNS_DEBUG
printk("%s: in %s, out %s, size %d, in->owner_env=%s\n",
@@ -92,6 +103,15 @@ static unsigned int venet_acct_out_hook(const struct nf_hook_ops *hook,
*/
if (dst && (dst->dev->flags & IFF_LOOPBACK))
goto out;
+ /*
+ * Don't account and mark VE traffic if it does not cross VE/VE0
+ * boundary.
+ * For VE0 there can be extra accounting in case of traffic
+ * between Docker Containers in VE0.
+ */
+ if (!(ve_is_super(out->nd_net->owner_ve)) &&
+ dst && !(dst->dev->features & NETIF_F_VENET))
+ goto out;
/* Paranoia */
if (unlikely(!pskb_may_pull(skb, sizeof(struct iphdr))))
@@ -726,10 +726,12 @@ void venet_acct_classify_add_incoming(struct venet_stat *stat, struct sk_buff *s
{
acct_one_skb(stat, skb, ACCT_IN, venet_acct_skb_size(skb));
/*
- * Every incomming skb must have zero mark, since here is its first
- * come into VE's IP stack.
+ * We do set mark on skb in venet acct "out" hook so it would be
+ * reasonable to drop redundant marks in venet acct "in" hook.
+ * But seems it's impossible to understand the direction of
+ * packet here (if it goes VE->VE0 or VE0->VE), thus mark is
+ * dropped in venet_xmit() and veth_xmit() instead.
*/
- venet_clear_mark(skb);
}
static inline void venet_acct_mark(struct venet_stat *stat,