From 8f49c2703b33519aaaccc63f571b465b9d2b3a2d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 12 Nov 2010 13:35:00 -0800 Subject: tcp: Don't change unlocked socket state in tcp_v4_err(). Alexey Kuznetsov noticed a regression introduced by commit f1ecd5d9e7366609d640ff4040304ea197fbc618 ("Revert Backoff [v3]: Revert RTO on ICMP destination unreachable") The RTO and timer modification code added to tcp_v4_err() doesn't check sock_owned_by_user(), which if true means we don't have exclusive access to the socket and therefore cannot modify its critical state. Just skip this new code block if sock_owned_by_user() is true and eliminate the now superfluous sock_owned_by_user() code block contained within. Reported-by: Alexey Kuznetsov Signed-off-by: David S. Miller CC: Damian Lukowski Acked-by: Eric Dumazet --- net/ipv4/tcp_ipv4.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8f8527d4168..69ccbc1dde9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -415,6 +415,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) !icsk->icsk_backoff) break; + if (sock_owned_by_user(sk)) + break; + icsk->icsk_backoff--; inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) << icsk->icsk_backoff; @@ -429,11 +432,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (remaining) { inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, TCP_RTO_MAX); - } else if (sock_owned_by_user(sk)) { - /* RTO revert clocked out retransmission, - * but socket is locked. Will defer. */ - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - HZ/20, TCP_RTO_MAX); } else { /* RTO revert clocked out retransmission. 
* Will retransmit now */ -- cgit v1.2.3 From 8475ef9fd16cadbfc692f78e608d1941a340beb2 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 22 Nov 2010 03:26:12 +0000 Subject: netns: Don't leak others' openreq-s in proc The /proc/net/tcp leaks openreq sockets from other namespaces. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 69ccbc1dde9..e13da6de1fc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2043,7 +2043,9 @@ get_req: } get_sk: sk_nulls_for_each_from(sk, node) { - if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) { + if (!net_eq(sock_net(sk), net)) + continue; + if (sk->sk_family == st->family) { cur = sk; goto out; } -- cgit v1.2.3 From 582a72da9a41be9227dc931d728ae2906880a589 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 30 Nov 2010 11:53:55 -0800 Subject: inetpeer: Introduce inet_peer_address_t. Currently only the v4 aspect is used, but this will change. Signed-off-by: David S. 
Miller --- include/net/inetpeer.h | 10 +++++++++- net/ipv4/inetpeer.c | 16 ++++++++-------- net/ipv4/tcp_ipv4.c | 2 +- 3 files changed, 18 insertions(+), 10 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index fe239bfe5f7..d7e60792d76 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -13,10 +13,18 @@ #include #include +typedef struct { + union { + __be32 a4; + __be32 a6[4]; + }; + __u16 family; +} inet_peer_address_t; + struct inet_peer { /* group together avl_left,avl_right,v4daddr to speedup lookups */ struct inet_peer __rcu *avl_left, *avl_right; - __be32 v4daddr; /* peer's address */ + inet_peer_address_t daddr; __u32 avl_height; struct list_head unused; __u32 dtime; /* the time of last use of not diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index f9440084892..893f998efdb 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -63,7 +63,7 @@ * refcnt: atomically against modifications on other CPU; * usually under some other lock to prevent node disappearing * dtime: unused node list lock - * v4daddr: unchangeable + * daddr: unchangeable * ip_id_count: atomic value (no lock needed) */ @@ -165,9 +165,9 @@ static void unlink_from_unused(struct inet_peer *p) for (u = rcu_dereference_protected(_base->root, \ lockdep_is_held(&_base->lock)); \ u != peer_avl_empty; ) { \ - if (_daddr == u->v4daddr) \ + if (_daddr == u->daddr.a4) \ break; \ - if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \ + if ((__force __u32)_daddr < (__force __u32)u->daddr.a4) \ v = &u->avl_left; \ else \ v = &u->avl_right; \ @@ -191,7 +191,7 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr, struct inet_peer_base *base int count = 0; while (u != peer_avl_empty) { - if (daddr == u->v4daddr) { + if (daddr == u->daddr.a4) { /* Before taking a reference, check if this entry was * deleted, unlink_from_pool() sets refcnt=-1 to make * distinction between an unused entry (refcnt=0) and @@ -201,7 
+201,7 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr, struct inet_peer_base *base u = NULL; return u; } - if ((__force __u32)daddr < (__force __u32)u->v4daddr) + if ((__force __u32)daddr < (__force __u32)u->daddr.a4) u = rcu_dereference_bh(u->avl_left); else u = rcu_dereference_bh(u->avl_right); @@ -354,7 +354,7 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base) if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) { struct inet_peer __rcu **stack[PEER_MAXDEPTH]; struct inet_peer __rcu ***stackptr, ***delp; - if (lookup(p->v4daddr, stack, base) != p) + if (lookup(p->daddr.a4, stack, base) != p) BUG(); delp = stackptr - 1; /* *delp[0] == p */ if (p->avl_left == peer_avl_empty_rcu) { @@ -367,7 +367,7 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base) BUG_ON(rcu_dereference_protected(*stackptr[-1], lockdep_is_held(&base->lock)) != t); **--stackptr = t->avl_left; - /* t is removed, t->v4daddr > x->v4daddr for any + /* t is removed, t->daddr > x->daddr for any * x in p->avl_left subtree. * Put t in the old place of p. */ RCU_INIT_POINTER(*delp[0], t); @@ -479,7 +479,7 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create) } p = create ? 
kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL; if (p) { - p->v4daddr = daddr; + p->daddr.a4 = daddr; atomic_set(&p->refcnt, 1); atomic_set(&p->rid, 0); atomic_set(&p->ip_id_count, secure_ip_id(daddr)); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 69ccbc1dde9..b8bbf89409b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1347,7 +1347,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_death_row.sysctl_tw_recycle && (dst = inet_csk_route_req(sk, req)) != NULL && (peer = rt_get_peer((struct rtable *)dst)) != NULL && - peer->v4daddr == saddr) { + peer->daddr.a4 == saddr) { inet_peer_refcheck(peer); if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && (s32)(peer->tcp_ts - req->ts_recent) > -- cgit v1.2.3 From b534ecf1cd26f094497da6ae28a6ab64cdbe1617 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 30 Nov 2010 11:54:19 -0800 Subject: inetpeer: Make inet_getpeer() take an inet_peer_address_t pointer. And make an inet_getpeer_v4() helper, update callers. Signed-off-by: David S. 
Miller --- include/net/inetpeer.h | 11 ++++++++++- net/ipv4/inetpeer.c | 10 +++++----- net/ipv4/ip_fragment.c | 2 +- net/ipv4/route.c | 2 +- net/ipv4/tcp_ipv4.c | 4 ++-- 5 files changed, 19 insertions(+), 10 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index d7e60792d76..834f0456c87 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -50,7 +50,16 @@ struct inet_peer { void inet_initpeers(void) __init; /* can be called with or without local BH being disabled */ -struct inet_peer *inet_getpeer(__be32 daddr, int create); +struct inet_peer *inet_getpeer(inet_peer_address_t *daddr, int create); + +static inline struct inet_peer *inet_getpeer_v4(__be32 v4daddr, int create) +{ + inet_peer_address_t daddr; + + daddr.a4 = v4daddr; + daddr.family = AF_INET; + return inet_getpeer(&daddr, create); +} /* can be called from BH context or outside */ extern void inet_putpeer(struct inet_peer *p); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 893f998efdb..9aa76b8dd49 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -444,7 +444,7 @@ static struct inet_peer_base *family_to_base(int family) } /* Called with or without local BH being disabled. */ -struct inet_peer *inet_getpeer(__be32 daddr, int create) +struct inet_peer *inet_getpeer(inet_peer_address_t *daddr, int create) { struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; struct inet_peer_base *base = family_to_base(AF_INET); @@ -454,7 +454,7 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create) * Because of a concurrent writer, we might not find an existing entry. */ rcu_read_lock_bh(); - p = lookup_rcu_bh(daddr, base); + p = lookup_rcu_bh(daddr->a4, base); rcu_read_unlock_bh(); if (p) { @@ -469,7 +469,7 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create) * At least, nodes should be hot in our cache. 
*/ spin_lock_bh(&base->lock); - p = lookup(daddr, stack, base); + p = lookup(daddr->a4, stack, base); if (p != peer_avl_empty) { atomic_inc(&p->refcnt); spin_unlock_bh(&base->lock); @@ -479,10 +479,10 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create) } p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL; if (p) { - p->daddr.a4 = daddr; + p->daddr = *daddr; atomic_set(&p->refcnt, 1); atomic_set(&p->rid, 0); - atomic_set(&p->ip_id_count, secure_ip_id(daddr)); + atomic_set(&p->ip_id_count, secure_ip_id(daddr->a4)); p->tcp_ts_stamp = 0; INIT_LIST_HEAD(&p->unused); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 168440834ad..e6215bdd96c 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -141,7 +141,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a) qp->daddr = arg->iph->daddr; qp->user = arg->user; qp->peer = sysctl_ipfrag_max_dist ? - inet_getpeer(arg->iph->saddr, 1) : NULL; + inet_getpeer_v4(arg->iph->saddr, 1) : NULL; } static __inline__ void ip4_frag_free(struct inet_frag_queue *q) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ec2333fb637..3843c2dfde8 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1289,7 +1289,7 @@ void rt_bind_peer(struct rtable *rt, int create) { struct inet_peer *peer; - peer = inet_getpeer(rt->rt_dst, create); + peer = inet_getpeer_v4(rt->rt_dst, create); if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) inet_putpeer(peer); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b8bbf89409b..00285fcf678 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1778,7 +1778,7 @@ int tcp_v4_remember_stamp(struct sock *sk) int release_it = 0; if (!rt || rt->rt_dst != inet->inet_daddr) { - peer = inet_getpeer(inet->inet_daddr, 1); + peer = inet_getpeer_v4(inet->inet_daddr, 1); release_it = 1; } else { if (!rt->peer) @@ -1804,7 +1804,7 @@ EXPORT_SYMBOL(tcp_v4_remember_stamp); int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) { - struct 
inet_peer *peer = inet_getpeer(tw->tw_daddr, 1); + struct inet_peer *peer = inet_getpeer_v4(tw->tw_daddr, 1); if (peer) { const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); -- cgit v1.2.3 From 3f419d2d487821093ee46e898b5f8747f9edc9cd Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 29 Nov 2010 13:37:14 -0800 Subject: inet: Turn ->remember_stamp into ->get_peer in connection AF ops. Then we can make a completely generic tcp_remember_stamp() that uses ->get_peer() as a helper, minimizing the AF specific code and minimizing the eventual code duplication when we implement the ipv6 side of TW recycling. Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 2 +- include/net/tcp.h | 2 +- net/ipv4/tcp_ipv4.c | 35 ++++++++--------------------------- net/ipv4/tcp_minisocks.c | 31 ++++++++++++++++++++++++++++++- net/ipv6/tcp_ipv6.c | 8 ++++---- 5 files changed, 44 insertions(+), 34 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index e4f494b42e0..6c93a56cc95 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -43,7 +43,7 @@ struct inet_connection_sock_af_ops { struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst); - int (*remember_stamp)(struct sock *sk); + struct inet_peer *(*get_peer)(struct sock *sk, bool *release_it); u16 net_header_len; u16 sockaddr_len; int (*setsockopt)(struct sock *sk, int level, int optname, diff --git a/include/net/tcp.h b/include/net/tcp.h index e36c874c7fb..3e239641d4e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -312,7 +312,7 @@ extern void tcp_shutdown (struct sock *sk, int how); extern int tcp_v4_rcv(struct sk_buff *skb); -extern int tcp_v4_remember_stamp(struct sock *sk); +extern struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it); extern int tcp_v4_tw_remember_stamp(struct 
inet_timewait_sock *tw); extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 00285fcf678..0ddf819cfb5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1763,44 +1763,25 @@ do_time_wait: goto discard_it; } -/* VJ's idea. Save last timestamp seen from this destination - * and hold it at least for normal timewait interval to use for duplicate - * segment detection in subsequent connections, before they enter synchronized - * state. - */ - -int tcp_v4_remember_stamp(struct sock *sk) +struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it) { + struct rtable *rt = (struct rtable *) __sk_dst_get(sk); struct inet_sock *inet = inet_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); - struct rtable *rt = (struct rtable *)__sk_dst_get(sk); - struct inet_peer *peer = NULL; - int release_it = 0; + struct inet_peer *peer; if (!rt || rt->rt_dst != inet->inet_daddr) { peer = inet_getpeer_v4(inet->inet_daddr, 1); - release_it = 1; + *release_it = true; } else { if (!rt->peer) rt_bind_peer(rt, 1); peer = rt->peer; + *release_it = false; } - if (peer) { - if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || - ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && - peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { - peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; - peer->tcp_ts = tp->rx_opt.ts_recent; - } - if (release_it) - inet_putpeer(peer); - return 1; - } - - return 0; + return peer; } -EXPORT_SYMBOL(tcp_v4_remember_stamp); +EXPORT_SYMBOL(tcp_v4_get_peer); int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) { @@ -1828,7 +1809,7 @@ const struct inet_connection_sock_af_ops ipv4_specific = { .rebuild_header = inet_sk_rebuild_header, .conn_request = tcp_v4_conn_request, .syn_recv_sock = tcp_v4_syn_recv_sock, - .remember_stamp = tcp_v4_remember_stamp, + .get_peer = tcp_v4_get_peer, .net_header_len = sizeof(struct iphdr), .setsockopt = 
ip_setsockopt, .getsockopt = ip_getsockopt, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 43cf901d765..059082c873c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -49,6 +49,35 @@ struct inet_timewait_death_row tcp_death_row = { }; EXPORT_SYMBOL_GPL(tcp_death_row); +/* VJ's idea. Save last timestamp seen from this destination + * and hold it at least for normal timewait interval to use for duplicate + * segment detection in subsequent connections, before they enter synchronized + * state. + */ + +static int tcp_remember_stamp(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct inet_peer *peer; + bool release_it; + + peer = icsk->icsk_af_ops->get_peer(sk, &release_it); + if (peer) { + if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || + ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && + peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { + peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; + peer->tcp_ts = tp->rx_opt.ts_recent; + } + if (release_it) + inet_putpeer(peer); + return 1; + } + + return 0; +} + static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { if (seq == s_win) @@ -274,7 +303,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) int recycle_ok = 0; if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) - recycle_ok = icsk->icsk_af_ops->remember_stamp(sk); + recycle_ok = tcp_remember_stamp(sk); if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) tw = inet_twsk_alloc(sk, state); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7e41e2cbb85..e394d0029d8 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1818,10 +1818,10 @@ do_time_wait: goto discard_it; } -static int tcp_v6_remember_stamp(struct sock *sk) +struct inet_peer *tcp_v6_get_peer(struct sock *sk, bool *release_it) { /* Alas, not yet... 
*/ - return 0; + return NULL; } static const struct inet_connection_sock_af_ops ipv6_specific = { @@ -1830,7 +1830,7 @@ static const struct inet_connection_sock_af_ops ipv6_specific = { .rebuild_header = inet6_sk_rebuild_header, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, - .remember_stamp = tcp_v6_remember_stamp, + .get_peer = tcp_v6_get_peer, .net_header_len = sizeof(struct ipv6hdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, @@ -1862,7 +1862,7 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = { .rebuild_header = inet_sk_rebuild_header, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, - .remember_stamp = tcp_v4_remember_stamp, + .get_peer = tcp_v4_get_peer, .net_header_len = sizeof(struct iphdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, -- cgit v1.2.3 From ccb7c410ddc054b8c1ae780319bc98ae092d3854 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 1 Dec 2010 18:09:13 -0800 Subject: timewait_sock: Create and use getpeer op. The only thing AF-specific about remembering the timestamp for a time-wait TCP socket is getting the peer. Abstract that behind a new timewait_sock_ops vector. Support for real IPV6 sockets is not filled in yet, but curiously this makes timewait recycling start to work for v4-mapped ipv6 sockets. Signed-off-by: David S. 
Miller --- include/net/tcp.h | 1 + include/net/timewait_sock.h | 8 ++++++++ net/ipv4/tcp_ipv4.c | 33 +++++++++++---------------------- net/ipv4/tcp_minisocks.c | 32 ++++++++++++++++++++++++-------- net/ipv6/tcp_ipv6.c | 26 +++++++++++++++++++------- 5 files changed, 63 insertions(+), 37 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 3e239641d4e..4097320caa2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -313,6 +313,7 @@ extern void tcp_shutdown (struct sock *sk, int how); extern int tcp_v4_rcv(struct sk_buff *skb); extern struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it); +extern void *tcp_v4_tw_get_peer(struct sock *sk); extern int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw); extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size); diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h index 97c3b14da55..053b3cf2c66 100644 --- a/include/net/timewait_sock.h +++ b/include/net/timewait_sock.h @@ -21,6 +21,7 @@ struct timewait_sock_ops { int (*twsk_unique)(struct sock *sk, struct sock *sktw, void *twp); void (*twsk_destructor)(struct sock *sk); + void *(*twsk_getpeer)(struct sock *sk); }; static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp) @@ -39,4 +40,11 @@ static inline void twsk_destructor(struct sock *sk) sk->sk_prot->twsk_prot->twsk_destructor(sk); } +static inline void *twsk_getpeer(struct sock *sk) +{ + if (sk->sk_prot->twsk_prot->twsk_getpeer) + return sk->sk_prot->twsk_prot->twsk_getpeer(sk); + return NULL; +} + #endif /* _TIMEWAIT_SOCK_H */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0ddf819cfb5..dd555051ec8 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1210,12 +1210,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { }; #endif -static struct timewait_sock_ops tcp_timewait_sock_ops = { - .twsk_obj_size = sizeof(struct 
tcp_timewait_sock), - .twsk_unique = tcp_twsk_unique, - .twsk_destructor= tcp_twsk_destructor, -}; - int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct tcp_extend_values tmp_ext; @@ -1783,25 +1777,20 @@ struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it) } EXPORT_SYMBOL(tcp_v4_get_peer); -int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) +void *tcp_v4_tw_get_peer(struct sock *sk) { - struct inet_peer *peer = inet_getpeer_v4(tw->tw_daddr, 1); - - if (peer) { - const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); - - if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || - ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && - peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { - peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; - peer->tcp_ts = tcptw->tw_ts_recent; - } - inet_putpeer(peer); - return 1; - } + struct inet_timewait_sock *tw = inet_twsk(sk); - return 0; + return inet_getpeer_v4(tw->tw_daddr, 1); } +EXPORT_SYMBOL(tcp_v4_tw_get_peer); + +static struct timewait_sock_ops tcp_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct tcp_timewait_sock), + .twsk_unique = tcp_twsk_unique, + .twsk_destructor= tcp_twsk_destructor, + .twsk_getpeer = tcp_v4_tw_get_peer, +}; const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 059082c873c..3527b51d615 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -78,6 +78,27 @@ static int tcp_remember_stamp(struct sock *sk) return 0; } +static int tcp_tw_remember_stamp(struct inet_timewait_sock *tw) +{ + struct sock *sk = (struct sock *) tw; + struct inet_peer *peer; + + peer = twsk_getpeer(sk); + if (peer) { + const struct tcp_timewait_sock *tcptw = tcp_twsk(sk); + + if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || + ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && + peer->tcp_ts_stamp <= 
(u32)tcptw->tw_ts_recent_stamp)) { + peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; + peer->tcp_ts = tcptw->tw_ts_recent; + } + inet_putpeer(peer); + return 1; + } + return 0; +} + static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { if (seq == s_win) @@ -178,14 +199,9 @@ kill_with_rst: tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } - /* I am shamed, but failed to make it more elegant. - * Yes, it is direct reference to IP, which is impossible - * to generalize to IPv6. Taking into account that IPv6 - * do not understand recycling in any case, it not - * a big problem in practice. --ANK */ - if (tw->tw_family == AF_INET && - tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && - tcp_v4_tw_remember_stamp(tw)) + if (tcp_death_row.sysctl_tw_recycle && + tcptw->tw_ts_recent_stamp && + tcp_tw_remember_stamp(tw)) inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, TCP_TIMEWAIT_LEN); else diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e394d0029d8..5f73a1808e3 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -906,12 +906,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { }; #endif -static struct timewait_sock_ops tcp6_timewait_sock_ops = { - .twsk_obj_size = sizeof(struct tcp6_timewait_sock), - .twsk_unique = tcp_twsk_unique, - .twsk_destructor= tcp_twsk_destructor, -}; - static void __tcp_v6_send_check(struct sk_buff *skb, struct in6_addr *saddr, struct in6_addr *daddr) { @@ -1818,12 +1812,30 @@ do_time_wait: goto discard_it; } -struct inet_peer *tcp_v6_get_peer(struct sock *sk, bool *release_it) +static struct inet_peer *tcp_v6_get_peer(struct sock *sk, bool *release_it) +{ + /* Alas, not yet... */ + return NULL; +} + +static void *tcp_v6_tw_get_peer(struct sock *sk) { + struct inet_timewait_sock *tw = inet_twsk(sk); + + if (tw->tw_family == AF_INET) + return tcp_v4_tw_get_peer(sk); + /* Alas, not yet... 
*/ return NULL; } +static struct timewait_sock_ops tcp6_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct tcp6_timewait_sock), + .twsk_unique = tcp_twsk_unique, + .twsk_destructor= tcp_twsk_destructor, + .twsk_getpeer = tcp_v6_tw_get_peer, +}; + static const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, -- cgit v1.2.3 From 0dbaee3b37e118a96bb7b8eb0d9bbaeeb46264be Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 13 Dec 2010 12:52:14 -0800 Subject: net: Abstract default ADVMSS behind an accessor. Make all RTAX_ADVMSS metric accesses go through a new helper function, dst_metric_advmss(). Leave the actual default metric as "zero" in the real metric slot, and compute the actual default value dynamically via a new dst_ops AF specific callback. For stacked IPSEC routes, we use the advmss of the path which preserves existing behavior. Unlike ipv4/ipv6, DecNET ties the advmss to the mtu and thus updates advmss on pmtu updates. This inconsistency in advmss handling results in more raw metric accesses than I wish we ended up with. Signed-off-by: David S. 
Miller --- drivers/scsi/cxgbi/libcxgbi.c | 2 +- include/net/dst.h | 14 +++++++++++++- include/net/dst_ops.h | 1 + net/decnet/af_decnet.c | 4 ++-- net/decnet/dn_route.c | 22 ++++++++++++++++------ net/ipv4/route.c | 24 +++++++++++++++++------- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_output.c | 14 +++++++++----- net/ipv6/route.c | 16 +++++++--------- net/ipv6/tcp_ipv6.c | 2 +- net/xfrm/xfrm_policy.c | 7 +++++++ 11 files changed, 75 insertions(+), 33 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c index be5661707df..d2ad3d67672 100644 --- a/drivers/scsi/cxgbi/libcxgbi.c +++ b/drivers/scsi/cxgbi/libcxgbi.c @@ -825,7 +825,7 @@ unsigned int cxgbi_sock_select_mss(struct cxgbi_sock *csk, unsigned int pmtu) unsigned int idx; struct dst_entry *dst = csk->dst; - csk->advmss = dst_metric(dst, RTAX_ADVMSS); + csk->advmss = dst_metric_advmss(dst); if (csk->advmss > pmtu - 40) csk->advmss = pmtu - 40; diff --git a/include/net/dst.h b/include/net/dst.h index 755ac6c1aa0..03a1c3d52d8 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -112,10 +112,22 @@ dst_metric_raw(const struct dst_entry *dst, const int metric) static inline u32 dst_metric(const struct dst_entry *dst, const int metric) { - WARN_ON_ONCE(metric == RTAX_HOPLIMIT); + WARN_ON_ONCE(metric == RTAX_HOPLIMIT || + metric == RTAX_ADVMSS); return dst_metric_raw(dst, metric); } +static inline u32 +dst_metric_advmss(const struct dst_entry *dst) +{ + u32 advmss = dst_metric_raw(dst, RTAX_ADVMSS); + + if (!advmss) + advmss = dst->ops->default_advmss(dst); + + return advmss; +} + static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val) { dst->_metrics[metric-1] = val; diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 51665b3461b..15fb7af08c4 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -16,6 +16,7 @@ struct dst_ops { int (*gc)(struct dst_ops *ops); struct dst_entry * (*check)(struct dst_entry 
*, __u32 cookie); + unsigned int (*default_advmss)(const struct dst_entry *); void (*destroy)(struct dst_entry *); void (*ifdown)(struct dst_entry *, struct net_device *dev, int how); diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 0065e7e14af..2af15b15d1f 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -829,7 +829,7 @@ static int dn_confirm_accept(struct sock *sk, long *timeo, gfp_t allocation) return -EINVAL; scp->state = DN_CC; - scp->segsize_loc = dst_metric(__sk_dst_get(sk), RTAX_ADVMSS); + scp->segsize_loc = dst_metric_advmss(__sk_dst_get(sk)); dn_send_conn_conf(sk, allocation); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); @@ -958,7 +958,7 @@ static int __dn_connect(struct sock *sk, struct sockaddr_dn *addr, int addrlen, sk->sk_route_caps = sk->sk_dst_cache->dev->features; sock->state = SS_CONNECTING; scp->state = DN_CI; - scp->segsize_loc = dst_metric(sk->sk_dst_cache, RTAX_ADVMSS); + scp->segsize_loc = dst_metric_advmss(sk->sk_dst_cache); dn_nsp_send_conninit(sk, NSP_CI); err = -EINPROGRESS; diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index e2e926841fe..b8a5c0515be 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -110,6 +110,7 @@ static unsigned long dn_rt_deadline; static int dn_dst_gc(struct dst_ops *ops); static struct dst_entry *dn_dst_check(struct dst_entry *, __u32); +static unsigned int dn_dst_default_advmss(const struct dst_entry *dst); static struct dst_entry *dn_dst_negative_advice(struct dst_entry *); static void dn_dst_link_failure(struct sk_buff *); static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu); @@ -129,6 +130,7 @@ static struct dst_ops dn_dst_ops = { .gc_thresh = 128, .gc = dn_dst_gc, .check = dn_dst_check, + .default_advmss = dn_dst_default_advmss, .negative_advice = dn_dst_negative_advice, .link_failure = dn_dst_link_failure, .update_pmtu = dn_dst_update_pmtu, @@ -245,7 +247,8 @@ static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu) 
} if (!(dst_metric_locked(dst, RTAX_ADVMSS))) { u32 mss = mtu - DN_MAX_NSP_DATA_HEADER; - if (dst_metric(dst, RTAX_ADVMSS) > mss) + u32 existing_mss = dst_metric_raw(dst, RTAX_ADVMSS); + if (!existing_mss || existing_mss > mss) dst_metric_set(dst, RTAX_ADVMSS, mss); } } @@ -795,12 +798,17 @@ static int dn_rt_bug(struct sk_buff *skb) return NET_RX_DROP; } +static unsigned int dn_dst_default_advmss(const struct dst_entry *dst) +{ + return dn_mss_from_pmtu(dst->dev, dst_mtu(dst)); +} + static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res) { struct dn_fib_info *fi = res->fi; struct net_device *dev = rt->dst.dev; struct neighbour *n; - unsigned mss; + unsigned int metric; if (fi) { if (DN_FIB_RES_GW(*res) && @@ -820,10 +828,12 @@ static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res) if (dst_metric(&rt->dst, RTAX_MTU) == 0 || dst_metric(&rt->dst, RTAX_MTU) > rt->dst.dev->mtu) dst_metric_set(&rt->dst, RTAX_MTU, rt->dst.dev->mtu); - mss = dn_mss_from_pmtu(dev, dst_mtu(&rt->dst)); - if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0 || - dst_metric(&rt->dst, RTAX_ADVMSS) > mss) - dst_metric_set(&rt->dst, RTAX_ADVMSS, mss); + metric = dst_metric_raw(&rt->dst, RTAX_ADVMSS); + if (metric) { + unsigned int mss = dn_mss_from_pmtu(dev, dst_mtu(&rt->dst)); + if (metric > mss) + dst_metric_set(&rt->dst, RTAX_ADVMSS, mss); + } return 0; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 770f70427f0..80997333db0 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -139,6 +139,7 @@ static unsigned long expires_ljiffies; */ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); +static unsigned int ipv4_default_advmss(const struct dst_entry *dst); static void ipv4_dst_destroy(struct dst_entry *dst); static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); @@ -155,6 +156,7 @@ static struct dst_ops ipv4_dst_ops = { .protocol = cpu_to_be16(ETH_P_IP), .gc = 
rt_garbage_collect, .check = ipv4_dst_check, + .default_advmss = ipv4_default_advmss, .destroy = ipv4_dst_destroy, .ifdown = ipv4_dst_ifdown, .negative_advice = ipv4_negative_advice, @@ -383,8 +385,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) (__force u32)r->rt_gateway, r->rt_flags, atomic_read(&r->dst.__refcnt), r->dst.__use, 0, (__force u32)r->rt_src, - (dst_metric(&r->dst, RTAX_ADVMSS) ? - (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0), + dst_metric_advmss(&r->dst) + 40, dst_metric(&r->dst, RTAX_WINDOW), (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + dst_metric(&r->dst, RTAX_RTTVAR)), @@ -1798,6 +1799,19 @@ static void set_class_tag(struct rtable *rt, u32 tag) } #endif +static unsigned int ipv4_default_advmss(const struct dst_entry *dst) +{ + unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS); + + if (advmss == 0) { + advmss = max_t(unsigned int, dst->dev->mtu - 40, + ip_rt_min_advmss); + if (advmss > 65535 - 40) + advmss = 65535 - 40; + } + return advmss; +} + static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) { struct dst_entry *dst = &rt->dst; @@ -1823,11 +1837,7 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) if (dst_mtu(dst) > IP_MAX_MTU) dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU); - if (dst_metric(dst, RTAX_ADVMSS) == 0) - dst_metric_set(dst, RTAX_ADVMSS, - max_t(unsigned int, dst->dev->mtu - 40, - ip_rt_min_advmss)); - if (dst_metric(dst, RTAX_ADVMSS) > 65535 - 40) + if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40) dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40); #ifdef CONFIG_NET_CLS_ROUTE diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4fc3387aa99..f4011027543 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1436,7 +1436,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); - newtp->advmss = dst_metric(dst, RTAX_ADVMSS); + newtp->advmss = dst_metric_advmss(dst); if 
(tcp_sk(sk)->rx_opt.user_mss && tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 97041f24cd2..2d390669d40 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -119,9 +119,13 @@ static __u16 tcp_advertise_mss(struct sock *sk) struct dst_entry *dst = __sk_dst_get(sk); int mss = tp->advmss; - if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) { - mss = dst_metric(dst, RTAX_ADVMSS); - tp->advmss = mss; + if (dst) { + unsigned int metric = dst_metric_advmss(dst); + + if (metric < mss) { + mss = metric; + tp->advmss = mss; + } } return (__u16)mss; @@ -2422,7 +2426,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, skb_dst_set(skb, dst_clone(dst)); - mss = dst_metric(dst, RTAX_ADVMSS); + mss = dst_metric_advmss(dst); if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) mss = tp->rx_opt.user_mss; @@ -2556,7 +2560,7 @@ static void tcp_connect_init(struct sock *sk) if (!tp->window_clamp) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); - tp->advmss = dst_metric(dst, RTAX_ADVMSS); + tp->advmss = dst_metric_advmss(dst); if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) tp->advmss = tp->rx_opt.user_mss; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 98796b0dc2b..d9cb832be52 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -76,6 +76,7 @@ static struct rt6_info * ip6_rt_copy(struct rt6_info *ort); static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); +static unsigned int ip6_default_advmss(const struct dst_entry *dst); static struct dst_entry *ip6_negative_advice(struct dst_entry *); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, @@ -103,6 +104,7 @@ static struct dst_ops ip6_dst_ops_template = { .gc = ip6_dst_gc, .gc_thresh = 1024, .check = ip6_dst_check, + .default_advmss = ip6_default_advmss, .destroy = ip6_dst_destroy, .ifdown = 
ip6_dst_ifdown, .negative_advice = ip6_negative_advice, @@ -937,8 +939,12 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu) static int ipv6_get_mtu(struct net_device *dev); -static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu) +static unsigned int ip6_default_advmss(const struct dst_entry *dst) { + struct net_device *dev = dst->dev; + unsigned int mtu = dst_mtu(dst); + struct net *net = dev_net(dev); + mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) @@ -990,7 +996,6 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, atomic_set(&rt->dst.__refcnt, 1); dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255); dst_metric_set(&rt->dst, RTAX_MTU, ipv6_get_mtu(rt->rt6i_dev)); - dst_metric_set(&rt->dst, RTAX_ADVMSS, ipv6_advmss(net, dst_mtu(&rt->dst))); rt->dst.output = ip6_output; #if 0 /* there's no chance to use these for ndisc */ @@ -1312,8 +1317,6 @@ install_route: if (!dst_mtu(&rt->dst)) dst_metric_set(&rt->dst, RTAX_MTU, ipv6_get_mtu(dev)); - if (!dst_metric(&rt->dst, RTAX_ADVMSS)) - dst_metric_set(&rt->dst, RTAX_ADVMSS, ipv6_advmss(net, dst_mtu(&rt->dst))); rt->dst.dev = dev; rt->rt6i_idev = idev; rt->rt6i_table = table; @@ -1540,8 +1543,6 @@ void rt6_redirect(struct in6_addr *dest, struct in6_addr *src, nrt->rt6i_nexthop = neigh_clone(neigh); /* Reset pmtu, it may be better */ dst_metric_set(&nrt->dst, RTAX_MTU, ipv6_get_mtu(neigh->dev)); - dst_metric_set(&nrt->dst, RTAX_ADVMSS, ipv6_advmss(dev_net(neigh->dev), - dst_mtu(&nrt->dst))); if (ip6_ins_rt(nrt)) goto out; @@ -1971,7 +1972,6 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->rt6i_dev = net->loopback_dev; rt->rt6i_idev = idev; dst_metric_set(&rt->dst, RTAX_MTU, ipv6_get_mtu(rt->rt6i_dev)); - dst_metric_set(&rt->dst, RTAX_ADVMSS, ipv6_advmss(net, dst_mtu(&rt->dst))); dst_metric_set(&rt->dst, RTAX_HOPLIMIT, -1); rt->dst.obsolete = -1; @@ -2041,7 +2041,6 @@ static int rt6_mtu_change_route(struct rt6_info 
*rt, void *p_arg) { struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; struct inet6_dev *idev; - struct net *net = dev_net(arg->dev); /* In IPv6 pmtu discovery is not optional, so that RTAX_MTU lock cannot disable it. @@ -2073,7 +2072,6 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) (dst_mtu(&rt->dst) < arg->mtu && dst_mtu(&rt->dst) == idev->cnf.mtu6))) { dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); - dst_metric_set(&rt->dst, RTAX_ADVMSS, ipv6_advmss(net, arg->mtu)); } return 0; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index fee07689164..20aa95e3735 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1521,7 +1521,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); - newtp->advmss = dst_metric(dst, RTAX_ADVMSS); + newtp->advmss = dst_metric_advmss(dst); tcp_initialize_rcv_mss(newsk); newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 6e50ccd8c53..36936c8ae96 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2361,6 +2361,11 @@ static int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first, return 1; } +static unsigned int xfrm_default_advmss(const struct dst_entry *dst) +{ + return dst_metric_advmss(dst->path); +} + int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) { struct net *net; @@ -2378,6 +2383,8 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) dst_ops->kmem_cachep = xfrm_dst_cache; if (likely(dst_ops->check == NULL)) dst_ops->check = xfrm_dst_check; + if (likely(dst_ops->default_advmss == NULL)) + dst_ops->default_advmss = xfrm_default_advmss; if (likely(dst_ops->negative_advice == NULL)) dst_ops->negative_advice = xfrm_negative_advice; if (likely(dst_ops->link_failure == NULL)) -- cgit v1.2.3 From 1bde5ac49398a064c753bb490535cfad89e99a5f Mon Sep 17 00:00:00 2001 
From: Eric Dumazet Date: Thu, 23 Dec 2010 09:32:46 -0800 Subject: tcp: fix listening_get_next() Alexey Vlasov found /proc/net/tcp could sometimes loop and display millions of sockets in LISTEN state. In 2.6.29, when we converted TCP hash tables to RCU, we left two sk_next() calls in listening_get_next(). We must instead use sk_nulls_next() to properly detect an end of chain. Reported-by: Alexey Vlasov Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e13da6de1fc..d978bb2f748 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2030,7 +2030,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) get_req: req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; } - sk = sk_next(st->syn_wait_sk); + sk = sk_nulls_next(st->syn_wait_sk); st->state = TCP_SEQ_STATE_LISTENING; read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } else { @@ -2039,7 +2039,7 @@ get_req: if (reqsk_queue_len(&icsk->icsk_accept_queue)) goto start_req; read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - sk = sk_next(sk); + sk = sk_nulls_next(sk); } get_sk: sk_nulls_for_each_from(sk, node) { -- cgit v1.2.3