From aea9d711f3d68c656ad31ab578ecfb0bb5cd7f97 Mon Sep 17 00:00:00 2001
From: Sven Wegener
Date: Wed, 9 Jun 2010 16:10:57 +0200
Subject: ipvs: Add missing locking during connection table hashing and unhashing

The code that hashes and unhashes connections from the connection table
is missing locking of the connection being modified, which opens up a
race condition and results in memory corruption when this race
condition is hit. Here is what happens in pretty verbose form:

CPU 0: An active connection is terminated and we schedule
       ip_vs_conn_expire() on this CPU to expire this connection.

CPU 1: IRQ assignment is changed to this CPU, but the expire timer
       stays scheduled on the other CPU.

CPU 1: A new connection from the same ip:port comes in right before the
       timer expires, we find the inactive connection in our connection
       table and get a reference to it.

CPU 1: We properly lock the connection in tcp_state_transition() and
       read the connection flags in set_tcp_state().

CPU 0: ip_vs_conn_expire() gets called, we unhash the connection from
       our connection table and remove the hashed flag in
       ip_vs_conn_unhash(), without proper locking!

CPU 1: While still holding proper locks we write the connection flags
       in set_tcp_state() and this sets the hashed flag again.

CPU 0: ip_vs_conn_expire() fails to expire the connection, because the
       other CPU has incremented the reference count. We try to
       re-insert the connection into our connection table, but this
       fails in ip_vs_conn_hash(), because the hashed flag has been set
       by the other CPU. We re-schedule execution of
       ip_vs_conn_expire(). Now this connection has the hashed flag
       set, but isn't actually hashed in our connection table and has a
       dangling list_head.

CPU 1: We drop the reference we held on the connection and schedule the
       expire timer for timing out the connection on this CPU. Further
       packets won't be able to find this connection in our connection
       table.

CPU 0: ip_vs_conn_expire() gets called again, we think it's already
       hashed, but the list_head is dangling and while removing the
       connection from our connection table we write to the memory
       location where this list_head points to.

The result will probably be a kernel oops at some other point in time.

This race condition is pretty subtle, but it can be triggered remotely.
It needs the IRQ assignment change or another circumstance where
packets coming from the same ip:port for the same service are being
processed on different CPUs. And it involves hitting the exact time at
which ip_vs_conn_expire() gets called. It can be avoided by making sure
that all packets from one connection are always processed on the same
CPU, and it can be made harder to exploit by changing the connection
timeouts to some custom values.

Signed-off-by: Sven Wegener
Cc: stable@kernel.org
Acked-by: Simon Horman
Signed-off-by: Patrick McHardy
---
 net/netfilter/ipvs/ip_vs_conn.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index d8f7e8ef67b4..ff04e9edbed6 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -162,6 +162,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
         hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
 
         ct_write_lock(hash);
+        spin_lock(&cp->lock);
 
         if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
                 list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
@@ -174,6 +175,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
                 ret = 0;
         }
 
+        spin_unlock(&cp->lock);
         ct_write_unlock(hash);
 
         return ret;
@@ -193,6 +195,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
         hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
 
         ct_write_lock(hash);
+        spin_lock(&cp->lock);
 
         if (cp->flags & IP_VS_CONN_F_HASHED) {
                 list_del(&cp->c_list);
@@ -202,6 +205,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
         } else
                 ret = 0;
 
+        spin_unlock(&cp->lock);
         ct_write_unlock(hash);
 
         return ret;
--
cgit v1.2.3
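The locking order the patch establishes -- the hash bucket's write lock first,
then the connection's own spin lock around the flag test and the list
manipulation -- can be sketched outside the kernel. Below is a minimal
userspace pthread analogue, not kernel code: every name in it (struct bucket,
struct conn, conn_hash(), CONN_F_HASHED) is invented for illustration, and the
unhash side would be symmetric.

/* Minimal pthread analogue of the fixed ip_vs_conn_hash(): the bucket-wide
 * lock is taken first, then the object's own lock, so that the HASHED flag
 * test and the list insertion cannot race with code that holds only the
 * object lock.  All names here are invented for illustration. */
#include <pthread.h>
#include <stdio.h>

#define CONN_F_HASHED 0x01

struct conn {
        pthread_mutex_t lock;   /* plays the role of cp->lock */
        unsigned int flags;     /* plays the role of cp->flags */
        struct conn *next;      /* single-linked stand-in for c_list */
};

struct bucket {
        pthread_mutex_t lock;   /* plays the role of ct_write_lock() */
        struct conn *head;
};

/* Insert 'c' into 'b' unless another thread has already hashed it.
 * Returns 1 on success, 0 if the entry was already hashed. */
static int conn_hash(struct bucket *b, struct conn *c)
{
        int ret = 0;

        pthread_mutex_lock(&b->lock);
        pthread_mutex_lock(&c->lock);           /* the lock the patch adds */
        if (!(c->flags & CONN_F_HASHED)) {
                c->flags |= CONN_F_HASHED;
                c->next = b->head;
                b->head = c;
                ret = 1;
        }
        pthread_mutex_unlock(&c->lock);
        pthread_mutex_unlock(&b->lock);
        return ret;
}

int main(void)
{
        struct bucket b = { PTHREAD_MUTEX_INITIALIZER, NULL };
        struct conn c = { PTHREAD_MUTEX_INITIALIZER, 0, NULL };

        printf("first hash:  %d\n", conn_hash(&b, &c));  /* 1: inserted */
        printf("second hash: %d\n", conn_hash(&b, &c));  /* 0: already hashed */
        return 0;
}

Built with something like "cc -pthread sketch.c", the second call reports 0,
mirroring how ip_vs_conn_hash() refuses to re-insert an already hashed entry.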
From fed396a585d8e1870b326f2e8e1888a72957abb8 Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Tue, 15 Jun 2010 21:43:07 -0700
Subject: bridge: Fix OOM crash in deliver_clone

The bridge multicast patches introduced an OOM crash in the forward
path, when deliver_clone fails to clone the skb.

Reported-by: Mark Wagner
Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller
---
 net/bridge/br_forward.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index a98ef1393097..a4e72a89e4ff 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -140,10 +140,10 @@ static int deliver_clone(const struct net_bridge_port *prev,
                          void (*__packet_hook)(const struct net_bridge_port *p,
                                                struct sk_buff *skb))
 {
+        struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
+
         skb = skb_clone(skb, GFP_ATOMIC);
         if (!skb) {
-                struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
-
                 dev->stats.tx_dropped++;
                 return -ENOMEM;
         }
--
cgit v1.2.3
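The underlying pattern of the fix is worth spelling out: read everything you
will need from a buffer before handing its only pointer to an operation that
may replace it with NULL. A minimal userspace sketch of that pattern follows;
the types and helpers in it (struct buf, clone_buf(), simulate_oom) are made
up for the example and only stand in for skb_clone(GFP_ATOMIC) failing under
memory pressure.

/* Userspace sketch of the bug pattern fixed above: deliver_clone() overwrote
 * its only skb pointer with the result of skb_clone(), and the failure path
 * then dereferenced that (now NULL) pointer to find the device for the
 * tx_dropped count.  The fix reads everything it needs from the original
 * buffer before the pointer is reused.  Types and names below are invented. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct counters { unsigned long tx_dropped; };

struct buf {
        struct counters *stats;  /* stand-in for the bridge device's stats */
        size_t len;
        char data[64];
};

static int simulate_oom;         /* flip to make clone_buf() fail, like an
                                  * atomic allocation under memory pressure */

static struct buf *clone_buf(const struct buf *b)
{
        struct buf *c;

        if (simulate_oom)
                return NULL;
        c = malloc(sizeof(*c));
        if (c)
                memcpy(c, b, sizeof(*c));
        return c;
}

static int deliver(struct buf *b)
{
        /* grab this while 'b' still points at the original buffer */
        struct counters *stats = b->stats;

        b = clone_buf(b);
        if (!b) {
                stats->tx_dropped++;    /* safe: does not touch the NULL 'b' */
                return -1;
        }
        /* ... hand the clone on ... */
        free(b);
        return 0;
}

int main(void)
{
        struct counters st = { 0 };
        struct buf b = { &st, 0, "" };
        int ret;

        simulate_oom = 1;
        ret = deliver(&b);
        printf("deliver: %d, tx_dropped: %lu\n", ret, st.tx_dropped);
        return 0;
}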
From fa68a7822780fdc1295f7efb7e4313e62b447e75 Mon Sep 17 00:00:00 2001
From: Tom Hughes
Date: Tue, 15 Jun 2010 22:24:28 +0000
Subject: Clear IFF_XMIT_DST_RELEASE for teql interfaces

https://bugzilla.kernel.org/show_bug.cgi?id=16183

The sch_teql module, which can be used to load balance over a set of
underlying interfaces, stopped working after 2.6.30 and has been broken
in all kernels since then for any underlying interface which requires
the addition of link level headers.

The problem is that the transmit routine relies on being able to access
the destination address in the skb in order to do address resolution
once it has decided which underlying interface it is going to transmit
through.

In 2.6.31 the IFF_XMIT_DST_RELEASE flag was introduced, and set by
default for all interfaces, which causes the destination address to be
released before the transmit routine for the interface is called.

The solution is to clear that flag for teql interfaces.

Signed-off-by: Tom Hughes
Acked-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/sched/sch_teql.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 3415b6ce1c0a..807643bdcbac 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -449,6 +449,7 @@ static __init void teql_master_setup(struct net_device *dev)
         dev->tx_queue_len = 100;
         dev->flags = IFF_NOARP;
         dev->hard_header_len = LL_MAX_HEADER;
+        dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
 }
 
 static LIST_HEAD(master_dev_list);
--
cgit v1.2.3

From 25442e06d20aaba7d7b16438078a562b3e4cf19b Mon Sep 17 00:00:00 2001
From: stephen hemminger
Date: Tue, 15 Jun 2010 06:14:12 +0000
Subject: bridge: fdb cleanup runs too often

It is common in end-node, non-STP bridges to set the forwarding delay
to zero, which causes the forwarding database cleanup to run every
clock tick. Change it to run only as soon as needed or at the next
ageing timer interval, whichever is sooner. Use the round_jiffies_up
macro rather than attempting to round up by adjusting the value.

Signed-off-by: Stephen Hemminger
Signed-off-by: David S. Miller
---
 net/bridge/br_fdb.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 26637439965b..b01dde35a69e 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -128,7 +128,7 @@ void br_fdb_cleanup(unsigned long _data)
 {
         struct net_bridge *br = (struct net_bridge *)_data;
         unsigned long delay = hold_time(br);
-        unsigned long next_timer = jiffies + br->forward_delay;
+        unsigned long next_timer = jiffies + br->ageing_time;
         int i;
 
         spin_lock_bh(&br->hash_lock);
@@ -149,9 +149,7 @@ void br_fdb_cleanup(unsigned long _data)
         }
         spin_unlock_bh(&br->hash_lock);
 
-        /* Add HZ/4 to ensure we round the jiffies upwards to be after the next
-         * timer, otherwise we might round down and will have no-op run. */
-        mod_timer(&br->gc_timer, round_jiffies(next_timer + HZ/4));
+        mod_timer(&br->gc_timer, round_jiffies_up(next_timer));
 }
 
 /* Completely flush all dynamic entries in forwarding database.*/
--
cgit v1.2.3
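The rescheduling logic above can be illustrated outside the kernel: the next
cleanup run starts out one full ageing interval away, is pulled in for every
entry that will expire sooner, and is then rounded up so the timer never fires
just before the first entry is actually due. The sketch below is a standalone
approximation; next_cleanup(), AGEING_TIME, TICK and the numbers are invented,
and round_up_to() merely stands in for round_jiffies_up().

/* Standalone sketch of the timer rearming: worst case is one ageing interval
 * from now; any entry due sooner pulls the deadline in; the result is rounded
 * *up* so a rounded-down timer cannot cause a no-op run. */
#include <stdio.h>

#define AGEING_TIME 300         /* seconds, like the bridge default */
#define TICK         10         /* pretend rounding granularity */

static unsigned long round_up_to(unsigned long t, unsigned long step)
{
        return ((t + step - 1) / step) * step;
}

static unsigned long next_cleanup(unsigned long now,
                                  const unsigned long *expiry, int n)
{
        unsigned long next = now + AGEING_TIME; /* worst case: full interval */
        int i;

        for (i = 0; i < n; i++) {
                if (expiry[i] <= now)
                        continue;       /* stale, would be deleted right now */
                if (expiry[i] < next)
                        next = expiry[i];       /* an entry is due sooner */
        }
        return round_up_to(next, TICK);
}

int main(void)
{
        unsigned long entries[] = { 1005, 1042, 990 }; /* 990 is already stale */

        printf("rearm at %lu\n", next_cleanup(1000, entries, 3)); /* -> 1010 */
        return 0;
}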
From 26cde9f7e2747b6d254b704594eed87ab959afa5 Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Tue, 15 Jun 2010 01:52:25 +0000
Subject: udp: Fix bogus UFO packet generation

It has been reported that the new UFO software fallback path fails
under certain conditions with NFS. I tracked the problem down to the
generation of UFO packets that are smaller than the MTU. The software
fallback path simply discards these packets.

This patch fixes the problem by not generating such packets on the UFO
path.

Signed-off-by: Herbert Xu
Reviewed-by: Michael S. Tsirkin
Signed-off-by: David S. Miller
---
 net/ipv4/ip_output.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9a4a6c96cb0d..041d41df1224 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -873,8 +873,10 @@ int ip_append_data(struct sock *sk,
             !exthdrlen)
                 csummode = CHECKSUM_PARTIAL;
 
+        skb = skb_peek_tail(&sk->sk_write_queue);
+
         inet->cork.length += length;
-        if (((length > mtu) || !skb_queue_empty(&sk->sk_write_queue)) &&
+        if (((length > mtu) || (skb && skb_is_gso(skb))) &&
             (sk->sk_protocol == IPPROTO_UDP) &&
             (rt->u.dst.dev->features & NETIF_F_UFO)) {
                 err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
@@ -892,7 +894,7 @@ int ip_append_data(struct sock *sk,
          * adding appropriate IP header.
          */
 
-        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
+        if (!skb)
                 goto alloc_new_skb;
 
         while (length > 0) {
@@ -1121,7 +1123,8 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
                 return -EINVAL;
 
         inet->cork.length += size;
-        if ((sk->sk_protocol == IPPROTO_UDP) &&
+        if ((size + skb->len > mtu) &&
+            (sk->sk_protocol == IPPROTO_UDP) &&
             (rt->u.dst.dev->features & NETIF_F_UFO)) {
                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
--
cgit v1.2.3
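The decision the patch changes in ip_append_data() boils down to a predicate:
take the UFO path only when the datagram is larger than the MTU, or when data
is being appended to a queue tail that is already a GSO packet. A small
userspace sketch of that predicate follows; all names in it (use_ufo_path(),
struct tail_skb) are invented for illustration.

/* Sub-MTU datagrams go down the normal path instead of becoming undersized
 * UFO packets that the software fallback would drop. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct tail_skb {
        bool is_gso;            /* stand-in for skb_is_gso(skb) */
};

static bool use_ufo_path(size_t length, size_t mtu,
                         const struct tail_skb *tail,
                         bool proto_is_udp, bool dev_has_ufo)
{
        if (!proto_is_udp || !dev_has_ufo)
                return false;
        /* the old test was "length > mtu || queue not empty", which let a
         * small write on a non-GSO queue produce a bogus sub-MTU UFO packet */
        return length > mtu || (tail && tail->is_gso);
}

int main(void)
{
        struct tail_skb gso_tail = { true };

        printf("%d\n", use_ufo_path(20000, 1500, NULL, true, true));    /* 1 */
        printf("%d\n", use_ufo_path(512, 1500, NULL, true, true));      /* 0 */
        printf("%d\n", use_ufo_path(512, 1500, &gso_tail, true, true)); /* 1 */
        return 0;
}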