summary | refs | log | tree | commit | diff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r-- net/ipv4/Kconfig 2
-rw-r--r-- net/ipv4/af_inet.c 21
-rw-r--r-- net/ipv4/fib_trie.c 6
-rw-r--r-- net/ipv4/route.c 60
-rw-r--r-- net/ipv4/tcp.c 47
-rw-r--r-- net/ipv4/tcp_input.c 118
6 files changed, 150 insertions, 104 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 9d26a3da37e..5b919f7b45d 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -408,7 +408,7 @@ config INET_XFRM_MODE_BEET
config INET_LRO
bool "Large Receive Offload (ipv4/tcp)"
-
+ default y
---help---
Support for Large Receive Offload (ipv4/tcp).
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 170689681aa..5abee4c9744 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1246,13 +1246,20 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
struct sk_buff **pp = NULL;
struct sk_buff *p;
struct iphdr *iph;
+ unsigned int hlen;
+ unsigned int off;
+ unsigned int id;
int flush = 1;
int proto;
- int id;
- iph = skb_gro_header(skb, sizeof(*iph));
- if (unlikely(!iph))
- goto out;
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*iph);
+ iph = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, hlen)) {
+ iph = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!iph))
+ goto out;
+ }
proto = iph->protocol & (MAX_INET_PROTOS - 1);
@@ -1267,9 +1274,9 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
goto out_unlock;
- flush = ntohs(iph->tot_len) != skb_gro_len(skb) ||
- iph->frag_off != htons(IP_DF);
- id = ntohs(iph->id);
+ id = ntohl(*(u32 *)&iph->id);
+ flush = (u16)((ntohl(*(u32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF));
+ id >>= 16;
for (p = *head; p; p = p->next) {
struct iphdr *iph2;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 9070d11058e..538d2a9a511 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -986,9 +986,12 @@ fib_find_node(struct trie *t, u32 key)
static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
{
int wasfull;
- t_key cindex, key = tn->key;
+ t_key cindex, key;
struct tnode *tp;
+ preempt_disable();
+ key = tn->key;
+
while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
@@ -1007,6 +1010,7 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
if (IS_TNODE(tn))
tn = (struct tnode *)resize(t, (struct tnode *)tn);
+ preempt_enable();
return (struct node *)tn;
}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index c4c60e9f068..28205e5bfa9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -784,8 +784,8 @@ static void rt_check_expire(void)
{
static unsigned int rover;
unsigned int i = rover, goal;
- struct rtable *rth, **rthp;
- unsigned long length = 0, samples = 0;
+ struct rtable *rth, *aux, **rthp;
+ unsigned long samples = 0;
unsigned long sum = 0, sum2 = 0;
u64 mult;
@@ -795,9 +795,9 @@ static void rt_check_expire(void)
goal = (unsigned int)mult;
if (goal > rt_hash_mask)
goal = rt_hash_mask + 1;
- length = 0;
for (; goal > 0; goal--) {
unsigned long tmo = ip_rt_gc_timeout;
+ unsigned long length;
i = (i + 1) & rt_hash_mask;
rthp = &rt_hash_table[i].chain;
@@ -809,8 +809,10 @@ static void rt_check_expire(void)
if (*rthp == NULL)
continue;
+ length = 0;
spin_lock_bh(rt_hash_lock_addr(i));
while ((rth = *rthp) != NULL) {
+ prefetch(rth->u.dst.rt_next);
if (rt_is_expired(rth)) {
*rthp = rth->u.dst.rt_next;
rt_free(rth);
@@ -819,33 +821,30 @@ static void rt_check_expire(void)
if (rth->u.dst.expires) {
/* Entry is expired even if it is in use */
if (time_before_eq(jiffies, rth->u.dst.expires)) {
+nofree:
tmo >>= 1;
rthp = &rth->u.dst.rt_next;
/*
- * Only bump our length if the hash
- * inputs on entries n and n+1 are not
- * the same, we only count entries on
+ * We only count entries on
* a chain with equal hash inputs once
* so that entries for different QOS
* levels, and other non-hash input
* attributes don't unfairly skew
* the length computation
*/
- if ((*rthp == NULL) ||
- !compare_hash_inputs(&(*rthp)->fl,
- &rth->fl))
- length += ONE;
+ for (aux = rt_hash_table[i].chain;;) {
+ if (aux == rth) {
+ length += ONE;
+ break;
+ }
+ if (compare_hash_inputs(&aux->fl, &rth->fl))
+ break;
+ aux = aux->u.dst.rt_next;
+ }
continue;
}
- } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
- tmo >>= 1;
- rthp = &rth->u.dst.rt_next;
- if ((*rthp == NULL) ||
- !compare_hash_inputs(&(*rthp)->fl,
- &rth->fl))
- length += ONE;
- continue;
- }
+ } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
+ goto nofree;
/* Cleanup aged off entries. */
*rthp = rth->u.dst.rt_next;
@@ -1068,7 +1067,6 @@ out: return 0;
static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
struct rtable *rth, **rthp;
- struct rtable *rthi;
unsigned long now;
struct rtable *cand, **candp;
u32 min_score;
@@ -1088,7 +1086,6 @@ restart:
}
rthp = &rt_hash_table[hash].chain;
- rthi = NULL;
spin_lock_bh(rt_hash_lock_addr(hash));
while ((rth = *rthp) != NULL) {
@@ -1134,17 +1131,6 @@ restart:
chain_length++;
rthp = &rth->u.dst.rt_next;
-
- /*
- * check to see if the next entry in the chain
- * contains the same hash input values as rt. If it does
- * This is where we will insert into the list, instead of
- * at the head. This groups entries that differ by aspects not
- * relvant to the hash function together, which we use to adjust
- * our chain length
- */
- if (*rthp && compare_hash_inputs(&(*rthp)->fl, &rt->fl))
- rthi = rth;
}
if (cand) {
@@ -1205,10 +1191,7 @@ restart:
}
}
- if (rthi)
- rt->u.dst.rt_next = rthi->u.dst.rt_next;
- else
- rt->u.dst.rt_next = rt_hash_table[hash].chain;
+ rt->u.dst.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
if (rt->u.dst.rt_next) {
@@ -1224,10 +1207,7 @@ restart:
* previous writes to rt are comitted to memory
* before making rt visible to other CPUS.
*/
- if (rthi)
- rcu_assign_pointer(rthi->u.dst.rt_next, rt);
- else
- rcu_assign_pointer(rt_hash_table[hash].chain, rt);
+ rcu_assign_pointer(rt_hash_table[hash].chain, rt);
spin_unlock_bh(rt_hash_lock_addr(hash));
*rp = rt;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7a0f0b27bf1..17b89c523f9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -439,12 +439,14 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
!tp->urg_data ||
before(tp->urg_seq, tp->copied_seq) ||
!before(tp->urg_seq, tp->rcv_nxt)) {
+ struct sk_buff *skb;
+
answ = tp->rcv_nxt - tp->copied_seq;
/* Subtract 1, if FIN is in queue. */
- if (answ && !skb_queue_empty(&sk->sk_receive_queue))
- answ -=
- tcp_hdr((struct sk_buff *)sk->sk_receive_queue.prev)->fin;
+ skb = skb_peek_tail(&sk->sk_receive_queue);
+ if (answ && skb)
+ answ -= tcp_hdr(skb)->fin;
} else
answ = tp->urg_seq - tp->copied_seq;
release_sock(sk);
@@ -1382,11 +1384,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
/* Next get a buffer. */
- skb = skb_peek(&sk->sk_receive_queue);
- do {
- if (!skb)
- break;
-
+ skb_queue_walk(&sk->sk_receive_queue, skb) {
/* Now that we have two receive queues this
* shouldn't happen.
*/
@@ -1403,8 +1401,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (tcp_hdr(skb)->fin)
goto found_fin_ok;
WARN_ON(!(flags & MSG_PEEK));
- skb = skb->next;
- } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
+ }
/* Well, if we have backlog, try to process it now yet. */
@@ -2518,20 +2515,30 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
unsigned int thlen;
unsigned int flags;
unsigned int mss = 1;
+ unsigned int hlen;
+ unsigned int off;
int flush = 1;
int i;
- th = skb_gro_header(skb, sizeof(*th));
- if (unlikely(!th))
- goto out;
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*th);
+ th = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, hlen)) {
+ th = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!th))
+ goto out;
+ }
thlen = th->doff * 4;
if (thlen < sizeof(*th))
goto out;
- th = skb_gro_header(skb, thlen);
- if (unlikely(!th))
- goto out;
+ hlen = off + thlen;
+ if (skb_gro_header_hard(skb, hlen)) {
+ th = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!th))
+ goto out;
+ }
skb_gro_pull(skb, thlen);
@@ -2544,7 +2551,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
th2 = tcp_hdr(p);
- if ((th->source ^ th2->source) | (th->dest ^ th2->dest)) {
+ if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
@@ -2559,14 +2566,14 @@ found:
flush |= flags & TCP_FLAG_CWR;
flush |= (flags ^ tcp_flag_word(th2)) &
~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH);
- flush |= (th->ack_seq ^ th2->ack_seq) | (th->window ^ th2->window);
- for (i = sizeof(*th); !flush && i < thlen; i += 4)
+ flush |= th->ack_seq ^ th2->ack_seq;
+ for (i = sizeof(*th); i < thlen; i += 4)
flush |= *(u32 *)((u8 *)th + i) ^
*(u32 *)((u8 *)th2 + i);
mss = skb_shinfo(p)->gso_size;
- flush |= (len > mss) | !len;
+ flush |= (len - 1) >= mss;
flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
if (flush || skb_gro_receive(head, skb)) {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index eeb8a92aa41..ba34a23c1bf 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4426,7 +4426,7 @@ drop:
}
__skb_queue_head(&tp->out_of_order_queue, skb);
} else {
- struct sk_buff *skb1 = tp->out_of_order_queue.prev;
+ struct sk_buff *skb1 = skb_peek_tail(&tp->out_of_order_queue);
u32 seq = TCP_SKB_CB(skb)->seq;
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -4443,15 +4443,18 @@ drop:
}
/* Find place to insert this segment. */
- do {
+ while (1) {
if (!after(TCP_SKB_CB(skb1)->seq, seq))
break;
- } while ((skb1 = skb1->prev) !=
- (struct sk_buff *)&tp->out_of_order_queue);
+ if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
+ skb1 = NULL;
+ break;
+ }
+ skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
+ }
/* Do skb overlap to previous one? */
- if (skb1 != (struct sk_buff *)&tp->out_of_order_queue &&
- before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
/* All the bits are present. Drop. */
__kfree_skb(skb);
@@ -4463,24 +4466,41 @@ drop:
tcp_dsack_set(sk, seq,
TCP_SKB_CB(skb1)->end_seq);
} else {
- skb1 = skb1->prev;
+ if (skb_queue_is_first(&tp->out_of_order_queue,
+ skb1))
+ skb1 = NULL;
+ else
+ skb1 = skb_queue_prev(
+ &tp->out_of_order_queue,
+ skb1);
}
}
- __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
+ if (!skb1)
+ __skb_queue_head(&tp->out_of_order_queue, skb);
+ else
+ __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
/* And clean segments covered by new one as whole. */
- while ((skb1 = skb->next) !=
- (struct sk_buff *)&tp->out_of_order_queue &&
- after(end_seq, TCP_SKB_CB(skb1)->seq)) {
- if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+ if (skb1 && !skb_queue_is_last(&tp->out_of_order_queue, skb1)) {
+ struct sk_buff *n;
+
+ skb1 = skb_queue_next(&tp->out_of_order_queue, skb1);
+ skb_queue_walk_from_safe(&tp->out_of_order_queue,
+ skb1, n) {
+ if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
+ break;
+ if (before(end_seq,
+ TCP_SKB_CB(skb1)->end_seq)) {
+ tcp_dsack_extend(sk,
+ TCP_SKB_CB(skb1)->seq,
+ end_seq);
+ break;
+ }
+ __skb_unlink(skb1, &tp->out_of_order_queue);
tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
- end_seq);
- break;
+ TCP_SKB_CB(skb1)->end_seq);
+ __kfree_skb(skb1);
}
- __skb_unlink(skb1, &tp->out_of_order_queue);
- tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
- TCP_SKB_CB(skb1)->end_seq);
- __kfree_skb(skb1);
}
add_sack:
@@ -4492,7 +4512,10 @@ add_sack:
static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
struct sk_buff_head *list)
{
- struct sk_buff *next = skb->next;
+ struct sk_buff *next = NULL;
+
+ if (!skb_queue_is_last(list, skb))
+ next = skb_queue_next(list, skb);
__skb_unlink(skb, list);
__kfree_skb(skb);
@@ -4503,6 +4526,9 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
/* Collapse contiguous sequence of skbs head..tail with
* sequence numbers start..end.
+ *
+ * If tail is NULL, this means until the end of the list.
+ *
* Segments with FIN/SYN are not collapsed (only because this
* simplifies code)
*/
@@ -4511,15 +4537,23 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
struct sk_buff *head, struct sk_buff *tail,
u32 start, u32 end)
{
- struct sk_buff *skb;
+ struct sk_buff *skb, *n;
+ bool end_of_skbs;
/* First, check that queue is collapsible and find
* the point where collapsing can be useful. */
- for (skb = head; skb != tail;) {
+ skb = head;
+restart:
+ end_of_skbs = true;
+ skb_queue_walk_from_safe(list, skb, n) {
+ if (skb == tail)
+ break;
/* No new bits? It is possible on ofo queue. */
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
skb = tcp_collapse_one(sk, skb, list);
- continue;
+ if (!skb)
+ break;
+ goto restart;
}
/* The first skb to collapse is:
@@ -4529,16 +4563,24 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
*/
if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin &&
(tcp_win_from_space(skb->truesize) > skb->len ||
- before(TCP_SKB_CB(skb)->seq, start) ||
- (skb->next != tail &&
- TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb->next)->seq)))
+ before(TCP_SKB_CB(skb)->seq, start))) {
+ end_of_skbs = false;
break;
+ }
+
+ if (!skb_queue_is_last(list, skb)) {
+ struct sk_buff *next = skb_queue_next(list, skb);
+ if (next != tail &&
+ TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
+ end_of_skbs = false;
+ break;
+ }
+ }
/* Decided to skip this, advance start seq. */
start = TCP_SKB_CB(skb)->end_seq;
- skb = skb->next;
}
- if (skb == tail || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin)
+ if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin)
return;
while (before(start, end)) {
@@ -4583,7 +4625,8 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
}
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
skb = tcp_collapse_one(sk, skb, list);
- if (skb == tail ||
+ if (!skb ||
+ skb == tail ||
tcp_hdr(skb)->syn ||
tcp_hdr(skb)->fin)
return;
@@ -4610,17 +4653,21 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
head = skb;
for (;;) {
- skb = skb->next;
+ struct sk_buff *next = NULL;
+
+ if (!skb_queue_is_last(&tp->out_of_order_queue, skb))
+ next = skb_queue_next(&tp->out_of_order_queue, skb);
+ skb = next;
/* Segment is terminated when we see gap or when
* we are at the end of all the queue. */
- if (skb == (struct sk_buff *)&tp->out_of_order_queue ||
+ if (!skb ||
after(TCP_SKB_CB(skb)->seq, end) ||
before(TCP_SKB_CB(skb)->end_seq, start)) {
tcp_collapse(sk, &tp->out_of_order_queue,
head, skb, start, end);
head = skb;
- if (skb == (struct sk_buff *)&tp->out_of_order_queue)
+ if (!skb)
break;
/* Start new segment */
start = TCP_SKB_CB(skb)->seq;
@@ -4681,10 +4728,11 @@ static int tcp_prune_queue(struct sock *sk)
tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
tcp_collapse_ofo_queue(sk);
- tcp_collapse(sk, &sk->sk_receive_queue,
- sk->sk_receive_queue.next,
- (struct sk_buff *)&sk->sk_receive_queue,
- tp->copied_seq, tp->rcv_nxt);
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ tcp_collapse(sk, &sk->sk_receive_queue,
+ skb_peek(&sk->sk_receive_queue),
+ NULL,
+ tp->copied_seq, tp->rcv_nxt);
sk_mem_reclaim(sk);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)