Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig | 32
-rw-r--r--  net/ipv4/af_inet.c | 143
-rw-r--r--  net/ipv4/ah4.c | 1
-rw-r--r--  net/ipv4/arp.c | 60
-rw-r--r--  net/ipv4/cipso_ipv4.c | 3
-rw-r--r--  net/ipv4/datagram.c | 4
-rw-r--r--  net/ipv4/devinet.c | 8
-rw-r--r--  net/ipv4/fib_frontend.c | 14
-rw-r--r--  net/ipv4/fib_hash.c | 1
-rw-r--r--  net/ipv4/fib_rules.c | 22
-rw-r--r--  net/ipv4/fib_semantics.c | 1
-rw-r--r--  net/ipv4/fib_trie.c | 11
-rw-r--r--  net/ipv4/icmp.c | 49
-rw-r--r--  net/ipv4/igmp.c | 37
-rw-r--r--  net/ipv4/inet_connection_sock.c | 31
-rw-r--r--  net/ipv4/inet_diag.c | 1
-rw-r--r--  net/ipv4/inet_fragment.c | 2
-rw-r--r--  net/ipv4/inet_hashtables.c | 6
-rw-r--r--  net/ipv4/inet_timewait_sock.c | 1
-rw-r--r--  net/ipv4/inetpeer.c | 244
-rw-r--r--  net/ipv4/ip_forward.c | 13
-rw-r--r--  net/ipv4/ip_fragment.c | 28
-rw-r--r--  net/ipv4/ip_gre.c | 29
-rw-r--r--  net/ipv4/ip_input.c | 35
-rw-r--r--  net/ipv4/ip_options.c | 11
-rw-r--r--  net/ipv4/ip_output.c | 121
-rw-r--r--  net/ipv4/ip_sockglue.c | 66
-rw-r--r--  net/ipv4/ipconfig.c | 10
-rw-r--r--  net/ipv4/ipip.c | 16
-rw-r--r--  net/ipv4/ipmr.c | 949
-rw-r--r--  net/ipv4/netfilter.c | 19
-rw-r--r--  net/ipv4/netfilter/arp_tables.c | 124
-rw-r--r--  net/ipv4/netfilter/arpt_mangle.c | 4
-rw-r--r--  net/ipv4/netfilter/arptable_filter.c | 1
-rw-r--r--  net/ipv4/netfilter/ip_queue.c | 62
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 276
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 128
-rw-r--r--  net/ipv4/netfilter/ipt_ECN.c | 23
-rw-r--r--  net/ipv4/netfilter/ipt_LOG.c | 73
-rw-r--r--  net/ipv4/netfilter/ipt_MASQUERADE.c | 18
-rw-r--r--  net/ipv4/netfilter/ipt_NETMAP.c | 22
-rw-r--r--  net/ipv4/netfilter/ipt_REDIRECT.c | 16
-rw-r--r--  net/ipv4/netfilter/ipt_REJECT.c | 32
-rw-r--r--  net/ipv4/netfilter/ipt_ULOG.c | 48
-rw-r--r--  net/ipv4/netfilter/ipt_addrtype.c | 28
-rw-r--r--  net/ipv4/netfilter/ipt_ah.c | 28
-rw-r--r--  net/ipv4/netfilter/ipt_ecn.c | 19
-rw-r--r--  net/ipv4/netfilter/iptable_filter.c | 3
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c | 1
-rw-r--r--  net/ipv4/netfilter/iptable_raw.c | 1
-rw-r--r--  net/ipv4/netfilter/iptable_security.c | 1
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 10
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | 7
-rw-r--r--  net/ipv4/netfilter/nf_defrag_ipv4.c | 5
-rw-r--r--  net/ipv4/netfilter/nf_nat_core.c | 30
-rw-r--r--  net/ipv4/netfilter/nf_nat_h323.c | 17
-rw-r--r--  net/ipv4/netfilter/nf_nat_helper.c | 1
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_common.c | 12
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_dccp.c | 6
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_gre.c | 12
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_icmp.c | 10
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_sctp.c | 6
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_tcp.c | 5
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_udp.c | 5
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_udplite.c | 6
-rw-r--r--  net/ipv4/netfilter/nf_nat_proto_unknown.c | 4
-rw-r--r--  net/ipv4/netfilter/nf_nat_rule.c | 32
-rw-r--r--  net/ipv4/netfilter/nf_nat_snmp_basic.c | 17
-rw-r--r--  net/ipv4/netfilter/nf_nat_standalone.c | 18
-rw-r--r--  net/ipv4/netfilter/nf_nat_tftp.c | 1
-rw-r--r--  net/ipv4/proc.c | 17
-rw-r--r--  net/ipv4/protocol.c | 3
-rw-r--r--  net/ipv4/raw.c | 27
-rw-r--r--  net/ipv4/route.c | 683
-rw-r--r--  net/ipv4/syncookies.c | 107
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 18
-rw-r--r--  net/ipv4/tcp.c | 199
-rw-r--r--  net/ipv4/tcp_cong.c | 1
-rw-r--r--  net/ipv4/tcp_hybla.c | 4
-rw-r--r--  net/ipv4/tcp_input.c | 40
-rw-r--r--  net/ipv4/tcp_ipv4.c | 227
-rw-r--r--  net/ipv4/tcp_minisocks.c | 11
-rw-r--r--  net/ipv4/tcp_output.c | 106
-rw-r--r--  net/ipv4/tcp_probe.c | 1
-rw-r--r--  net/ipv4/tcp_timer.c | 10
-rw-r--r--  net/ipv4/tunnel4.c | 3
-rw-r--r--  net/ipv4/udp.c | 61
-rw-r--r--  net/ipv4/udplite.c | 3
-rw-r--r--  net/ipv4/xfrm4_input.c | 8
-rw-r--r--  net/ipv4/xfrm4_mode_tunnel.c | 1
-rw-r--r--  net/ipv4/xfrm4_output.c | 2
-rw-r--r--  net/ipv4/xfrm4_policy.c | 26
92 files changed, 2627 insertions(+), 2011 deletions(-)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 0c94a1ac294..7c3a7d19124 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -250,6 +250,20 @@ config IP_MROUTE
<file:Documentation/networking/multicast.txt>. If you haven't heard
about it, you don't need it.
+config IP_MROUTE_MULTIPLE_TABLES
+ bool "IP: multicast policy routing"
+ depends on IP_MROUTE && IP_ADVANCED_ROUTER
+ select FIB_RULES
+ help
+ Normally, a multicast router runs a userspace daemon and decides
+ what to do with a multicast packet based on the source and
+ destination addresses. If you say Y here, the multicast router
+ will also be able to take interfaces and packet marks into
+ account and run multiple instances of userspace daemons
+ simultaneously, each one handling a single table.
+
+ If unsure, say N.
+
config IP_PIMSM_V1
bool "IP: PIM-SM version 1 support"
depends on IP_MROUTE
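
The new help text above describes one userspace daemon per table. As a rough
userspace sketch of how a daemon would bind itself to a single table,
assuming the MRT_TABLE socket option that accompanies this feature (the
option name and its availability are an assumption, not shown in this diff):

	#include <stdio.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <linux/mroute.h>

	int main(void)
	{
		int fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
		unsigned int table = 42;	/* hypothetical table id */
		int one = 1;

		if (fd < 0)
			return 1;
		/* Select the multicast table before becoming the mrouted
		 * socket; MRT_TABLE is assumed, it is not in this hunk. */
		if (setsockopt(fd, IPPROTO_IP, MRT_TABLE, &table, sizeof(table)) < 0)
			perror("MRT_TABLE");
		/* Register as the multicast routing daemon for that table. */
		if (setsockopt(fd, IPPROTO_IP, MRT_INIT, &one, sizeof(one)) < 0)
			perror("MRT_INIT");
		return 0;
	}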
@@ -289,7 +303,7 @@ config ARPD
If unsure, say N.
config SYN_COOKIES
- bool "IP: TCP syncookie support (disabled per default)"
+ bool "IP: TCP syncookie support"
---help---
Normal TCP/IP networking is open to an attack known as "SYN
flooding". This denial-of-service attack prevents legitimate remote
@@ -314,13 +328,13 @@ config SYN_COOKIES
server is really overloaded. If this happens frequently better turn
them off.
- If you say Y here, note that SYN cookies aren't enabled by default;
- you can enable them by saying Y to "/proc file system support" and
+ If you say Y here, you can disable SYN cookies at run time by
+ saying Y to "/proc file system support" and
"Sysctl support" below and executing the command
- echo 1 >/proc/sys/net/ipv4/tcp_syncookies
+ echo 0 > /proc/sys/net/ipv4/tcp_syncookies
- at boot time after the /proc file system has been mounted.
+ after the /proc file system has been mounted.
If unsure, say N.
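
With the revised wording above, SYN cookies are switched off (rather than
on) at run time through the same sysctl. A minimal sketch of that runtime
toggle, writing the /proc path quoted in the help text:

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/net/ipv4/tcp_syncookies", "w");

		if (!f) {
			perror("tcp_syncookies");
			return 1;
		}
		fputs("0\n", f);	/* 0 disables, 1 re-enables */
		return fclose(f) ? 1 : 0;
	}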
@@ -587,9 +601,15 @@ choice
config DEFAULT_HTCP
bool "Htcp" if TCP_CONG_HTCP=y
+ config DEFAULT_HYBLA
+ bool "Hybla" if TCP_CONG_HYBLA=y
+
config DEFAULT_VEGAS
bool "Vegas" if TCP_CONG_VEGAS=y
+ config DEFAULT_VENO
+ bool "Veno" if TCP_CONG_VENO=y
+
config DEFAULT_WESTWOOD
bool "Westwood" if TCP_CONG_WESTWOOD=y
@@ -610,8 +630,10 @@ config DEFAULT_TCP_CONG
default "bic" if DEFAULT_BIC
default "cubic" if DEFAULT_CUBIC
default "htcp" if DEFAULT_HTCP
+ default "hybla" if DEFAULT_HYBLA
default "vegas" if DEFAULT_VEGAS
default "westwood" if DEFAULT_WESTWOOD
+ default "veno" if DEFAULT_VENO
default "reno" if DEFAULT_RENO
default "cubic"
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 33b7dffa773..6a1100c25a9 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -86,6 +86,7 @@
#include <linux/poll.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
+#include <linux/slab.h>
#include <asm/uaccess.h>
#include <asm/system.h>
@@ -153,7 +154,7 @@ void inet_sock_destruct(struct sock *sk)
WARN_ON(sk->sk_forward_alloc);
kfree(inet->opt);
- dst_release(sk->sk_dst_cache);
+ dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
sk_refcnt_debug_dec(sk);
}
EXPORT_SYMBOL(inet_sock_destruct);
@@ -354,6 +355,8 @@ lookup_protocol:
inet = inet_sk(sk);
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
+ inet->nodefrag = 0;
+
if (SOCK_RAW == sock->type) {
inet->inet_num = protocol;
if (IPPROTO_RAW == protocol)
@@ -418,6 +421,8 @@ int inet_release(struct socket *sock)
if (sk) {
long timeout;
+ sock_rps_reset_flow(sk);
+
/* Applications forget to leave groups before exiting */
ip_mc_drop_socket(sk);
@@ -530,6 +535,8 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
{
struct sock *sk = sock->sk;
+ if (addr_len < sizeof(uaddr->sa_family))
+ return -EINVAL;
if (uaddr->sa_family == AF_UNSPEC)
return sk->sk_prot->disconnect(sk, flags);
@@ -543,7 +550,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
{
DEFINE_WAIT(wait);
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
/* Basic assumption: if someone sets sk->sk_err, he _must_
* change state of the socket from TCP_SYN_*.
@@ -556,9 +563,9 @@ static long inet_wait_for_connect(struct sock *sk, long timeo)
lock_sock(sk);
if (signal_pending(current) || !timeo)
break;
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
}
- finish_wait(sk->sk_sleep, &wait);
+ finish_wait(sk_sleep(sk), &wait);
return timeo;
}
@@ -573,6 +580,9 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int err;
long timeo;
+ if (addr_len < sizeof(uaddr->sa_family))
+ return -EINVAL;
+
lock_sock(sk);
if (uaddr->sa_family == AF_UNSPEC) {
@@ -714,29 +724,51 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
{
struct sock *sk = sock->sk;
+ sock_rps_record_flow(sk);
+
/* We may need to bind the socket. */
- if (!inet_sk(sk)->inet_num && inet_autobind(sk))
+ if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
+ inet_autobind(sk))
return -EAGAIN;
return sk->sk_prot->sendmsg(iocb, sk, msg, size);
}
EXPORT_SYMBOL(inet_sendmsg);
-
-static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
- size_t size, int flags)
+ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
+ size_t size, int flags)
{
struct sock *sk = sock->sk;
+ sock_rps_record_flow(sk);
+
/* We may need to bind the socket. */
- if (!inet_sk(sk)->inet_num && inet_autobind(sk))
+ if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
+ inet_autobind(sk))
return -EAGAIN;
if (sk->sk_prot->sendpage)
return sk->sk_prot->sendpage(sk, page, offset, size, flags);
return sock_no_sendpage(sock, page, offset, size, flags);
}
+EXPORT_SYMBOL(inet_sendpage);
+int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+ int addr_len = 0;
+ int err;
+
+ sock_rps_record_flow(sk);
+
+ err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
+ flags & ~MSG_DONTWAIT, &addr_len);
+ if (err >= 0)
+ msg->msg_namelen = addr_len;
+ return err;
+}
+EXPORT_SYMBOL(inet_recvmsg);
int inet_shutdown(struct socket *sock, int how)
{
@@ -865,10 +897,10 @@ const struct proto_ops inet_stream_ops = {
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
- .sendmsg = tcp_sendmsg,
- .recvmsg = sock_common_recvmsg,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = tcp_sendpage,
+ .sendpage = inet_sendpage,
.splice_read = tcp_splice_read,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
@@ -893,7 +925,7 @@ const struct proto_ops inet_dgram_ops = {
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
- .recvmsg = sock_common_recvmsg,
+ .recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
#ifdef CONFIG_COMPAT
@@ -923,7 +955,7 @@ static const struct proto_ops inet_sockraw_ops = {
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
- .recvmsg = sock_common_recvmsg,
+ .recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
#ifdef CONFIG_COMPAT
@@ -1073,7 +1105,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
if (err)
return err;
- sk_setup_caps(sk, &rt->u.dst);
+ sk_setup_caps(sk, &rt->dst);
new_saddr = rt->rt_src;
@@ -1139,7 +1171,7 @@ int inet_sk_rebuild_header(struct sock *sk)
err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0);
}
if (!err)
- sk_setup_caps(sk, &rt->u.dst);
+ sk_setup_caps(sk, &rt->dst);
else {
/* Routing failed... */
sk->sk_route_caps = 0;
@@ -1296,8 +1328,8 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
goto out_unlock;
- id = ntohl(*(u32 *)&iph->id);
- flush = (u16)((ntohl(*(u32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF));
+ id = ntohl(*(__be32 *)&iph->id);
+ flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF));
id >>= 16;
for (p = *head; p; p = p->next) {
@@ -1310,8 +1342,8 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
if ((iph->protocol ^ iph2->protocol) |
(iph->tos ^ iph2->tos) |
- (iph->saddr ^ iph2->saddr) |
- (iph->daddr ^ iph2->daddr)) {
+ ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
+ ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
@@ -1398,13 +1430,49 @@ unsigned long snmp_fold_field(void __percpu *mib[], int offt)
}
EXPORT_SYMBOL_GPL(snmp_fold_field);
-int snmp_mib_init(void __percpu *ptr[2], size_t mibsize)
+#if BITS_PER_LONG==32
+
+u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
+{
+ u64 res = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ void *bhptr, *userptr;
+ struct u64_stats_sync *syncp;
+ u64 v_bh, v_user;
+ unsigned int start;
+
+ /* first mib used by softirq context, we must use _bh() accessors */
+ bhptr = per_cpu_ptr(SNMP_STAT_BHPTR(mib), cpu);
+ syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
+ do {
+ start = u64_stats_fetch_begin_bh(syncp);
+ v_bh = *(((u64 *) bhptr) + offt);
+ } while (u64_stats_fetch_retry_bh(syncp, start));
+
+ /* second mib used in USER context */
+ userptr = per_cpu_ptr(SNMP_STAT_USRPTR(mib), cpu);
+ syncp = (struct u64_stats_sync *)(userptr + syncp_offset);
+ do {
+ start = u64_stats_fetch_begin(syncp);
+ v_user = *(((u64 *) userptr) + offt);
+ } while (u64_stats_fetch_retry(syncp, start));
+
+ res += v_bh + v_user;
+ }
+ return res;
+}
+EXPORT_SYMBOL_GPL(snmp_fold_field64);
+#endif
+
+int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
{
BUG_ON(ptr == NULL);
- ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
+ ptr[0] = __alloc_percpu(mibsize, align);
if (!ptr[0])
goto err0;
- ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
+ ptr[1] = __alloc_percpu(mibsize, align);
if (!ptr[1])
goto err1;
return 0;
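
snmp_fold_field64() above is the read side of the u64_stats_sync pattern:
on 32-bit kernels it retries on the per-cpu sequence counter until it has
read an untorn 64-bit value. For context, a sketch of the matching writer
side (the struct and function names are illustrative, not part of this patch):

	#include <linux/u64_stats_sync.h>

	struct demo_stats {
		u64			packets;
		struct u64_stats_sync	syncp;
	};

	/* Writers bump the counter inside a begin/end pair so that a
	 * 32-bit reader spinning on the sequence count never observes
	 * a half-updated 64-bit value. */
	static void demo_stats_inc(struct demo_stats *s)
	{
		u64_stats_update_begin(&s->syncp);
		s->packets++;
		u64_stats_update_end(&s->syncp);
	}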
@@ -1461,25 +1529,32 @@ static const struct net_protocol icmp_protocol = {
static __net_init int ipv4_mib_init_net(struct net *net)
{
if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics,
- sizeof(struct tcp_mib)) < 0)
+ sizeof(struct tcp_mib),
+ __alignof__(struct tcp_mib)) < 0)
goto err_tcp_mib;
if (snmp_mib_init((void __percpu **)net->mib.ip_statistics,
- sizeof(struct ipstats_mib)) < 0)
+ sizeof(struct ipstats_mib),
+ __alignof__(struct ipstats_mib)) < 0)
goto err_ip_mib;
if (snmp_mib_init((void __percpu **)net->mib.net_statistics,
- sizeof(struct linux_mib)) < 0)
+ sizeof(struct linux_mib),
+ __alignof__(struct linux_mib)) < 0)
goto err_net_mib;
if (snmp_mib_init((void __percpu **)net->mib.udp_statistics,
- sizeof(struct udp_mib)) < 0)
+ sizeof(struct udp_mib),
+ __alignof__(struct udp_mib)) < 0)
goto err_udp_mib;
if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics,
- sizeof(struct udp_mib)) < 0)
+ sizeof(struct udp_mib),
+ __alignof__(struct udp_mib)) < 0)
goto err_udplite_mib;
if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics,
- sizeof(struct icmp_mib)) < 0)
+ sizeof(struct icmp_mib),
+ __alignof__(struct icmp_mib)) < 0)
goto err_icmp_mib;
if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics,
- sizeof(struct icmpmsg_mib)) < 0)
+ sizeof(struct icmpmsg_mib),
+ __alignof__(struct icmpmsg_mib)) < 0)
goto err_icmpmsg_mib;
tcp_mib_init(net);
@@ -1546,9 +1621,13 @@ static int __init inet_init(void)
BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
+ sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
+ if (!sysctl_local_reserved_ports)
+ goto out;
+
rc = proto_register(&tcp_prot, 1);
if (rc)
- goto out;
+ goto out_free_reserved_ports;
rc = proto_register(&udp_prot, 1);
if (rc)
@@ -1647,6 +1726,8 @@ out_unregister_udp_proto:
proto_unregister(&udp_prot);
out_unregister_tcp_proto:
proto_unregister(&tcp_prot);
+out_free_reserved_ports:
+ kfree(sysctl_local_reserved_ports);
goto out;
}
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 987b47dc69a..880a5ec6dce 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -1,6 +1,7 @@
#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/ah.h>
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index c4dd1354280..96c1955b3e2 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -98,6 +98,7 @@
#include <linux/net.h>
#include <linux/rcupdate.h>
#include <linux/jhash.h>
+#include <linux/slab.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
@@ -115,6 +116,7 @@
#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
#include <net/atmclip.h>
struct neigh_table *clip_tbl_hook;
+EXPORT_SYMBOL(clip_tbl_hook);
#endif
#include <asm/system.h>
@@ -168,6 +170,7 @@ const struct neigh_ops arp_broken_ops = {
.hh_output = dev_queue_xmit,
.queue_xmit = dev_queue_xmit,
};
+EXPORT_SYMBOL(arp_broken_ops);
struct neigh_table arp_tbl = {
.family = AF_INET,
@@ -197,6 +200,7 @@ struct neigh_table arp_tbl = {
.gc_thresh2 = 512,
.gc_thresh3 = 1024,
};
+EXPORT_SYMBOL(arp_tbl);
int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
{
@@ -332,11 +336,14 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
struct net_device *dev = neigh->dev;
__be32 target = *(__be32*)neigh->primary_key;
int probes = atomic_read(&neigh->probes);
- struct in_device *in_dev = in_dev_get(dev);
+ struct in_device *in_dev;
- if (!in_dev)
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev) {
+ rcu_read_unlock();
return;
-
+ }
switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
default:
case 0: /* By default announce any local IP */
@@ -357,9 +364,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
case 2: /* Avoid secondary IPs, get a primary/preferred one */
break;
}
+ rcu_read_unlock();
- if (in_dev)
- in_dev_put(in_dev);
if (!saddr)
saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
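
The hunk above is one instance of a conversion repeated throughout this
file: a refcounted in_dev_get()/in_dev_put() pair becomes an RCU read-side
section. The general shape, as a sketch:

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(dev);
	if (in_dev) {
		/* read-only use of in_dev; no refcount is taken, so the
		 * pointer must not be used after rcu_read_unlock() */
	}
	rcu_read_unlock();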
@@ -426,7 +432,7 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
if (ip_route_output_key(net, &rt, &fl) < 0)
return 1;
- if (rt->u.dst.dev != dev) {
+ if (rt->dst.dev != dev) {
NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER);
flag = 1;
}
@@ -496,6 +502,7 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
kfree_skb(skb);
return 1;
}
+EXPORT_SYMBOL(arp_find);
/* END OF OBSOLETE FUNCTIONS */
@@ -531,7 +538,7 @@ static inline int arp_fwd_proxy(struct in_device *in_dev,
struct in_device *out_dev;
int imi, omi = -1;
- if (rt->u.dst.dev == dev)
+ if (rt->dst.dev == dev)
return 0;
if (!IN_DEV_PROXY_ARP(in_dev))
@@ -544,10 +551,10 @@ static inline int arp_fwd_proxy(struct in_device *in_dev,
/* place to check for proxy_arp for routes */
- if ((out_dev = in_dev_get(rt->u.dst.dev)) != NULL) {
+ out_dev = __in_dev_get_rcu(rt->dst.dev);
+ if (out_dev)
omi = IN_DEV_MEDIUM_ID(out_dev);
- in_dev_put(out_dev);
- }
+
return (omi != imi && omi != -1);
}
@@ -575,7 +582,7 @@ static inline int arp_fwd_pvlan(struct in_device *in_dev,
__be32 sip, __be32 tip)
{
/* Private VLAN is only concerned about the same ethernet segment */
- if (rt->u.dst.dev != dev)
+ if (rt->dst.dev != dev)
return 0;
/* Don't reply on self probes (often done by windowz boxes)*/
@@ -660,13 +667,13 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
#endif
#endif
-#ifdef CONFIG_FDDI
+#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE)
case ARPHRD_FDDI:
arp->ar_hrd = htons(ARPHRD_ETHER);
arp->ar_pro = htons(ETH_P_IP);
break;
#endif
-#ifdef CONFIG_TR
+#if defined(CONFIG_TR) || defined(CONFIG_TR_MODULE)
case ARPHRD_IEEE802_TR:
arp->ar_hrd = htons(ARPHRD_IEEE802);
arp->ar_pro = htons(ETH_P_IP);
@@ -697,6 +704,7 @@ out:
kfree_skb(skb);
return NULL;
}
+EXPORT_SYMBOL(arp_create);
/*
* Send an arp packet.
@@ -706,6 +714,7 @@ void arp_xmit(struct sk_buff *skb)
/* Send it off, maybe filter it using firewalling first. */
NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit);
}
+EXPORT_SYMBOL(arp_xmit);
/*
* Create and send an arp packet.
@@ -732,6 +741,7 @@ void arp_send(int type, int ptype, __be32 dest_ip,
arp_xmit(skb);
}
+EXPORT_SYMBOL(arp_send);
/*
* Process an arp request.
@@ -740,7 +750,7 @@ void arp_send(int type, int ptype, __be32 dest_ip,
static int arp_process(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
- struct in_device *in_dev = in_dev_get(dev);
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
struct arphdr *arp;
unsigned char *arp_ptr;
struct rtable *rt;
@@ -853,7 +863,7 @@ static int arp_process(struct sk_buff *skb)
}
if (arp->ar_op == htons(ARPOP_REQUEST) &&
- ip_route_input(skb, tip, sip, 0, dev) == 0) {
+ ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
rt = skb_rtable(skb);
addr_type = rt->rt_type;
@@ -889,7 +899,6 @@ static int arp_process(struct sk_buff *skb)
arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
} else {
pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb);
- in_dev_put(in_dev);
return 0;
}
goto out;
@@ -935,8 +944,6 @@ static int arp_process(struct sk_buff *skb)
}
out:
- if (in_dev)
- in_dev_put(in_dev);
consume_skb(skb);
return 0;
}
@@ -1044,13 +1051,13 @@ static int arp_req_set(struct net *net, struct arpreq *r,
struct rtable * rt;
if ((err = ip_route_output_key(net, &rt, &fl)) != 0)
return err;
- dev = rt->u.dst.dev;
+ dev = rt->dst.dev;
ip_rt_put(rt);
if (!dev)
return -EINVAL;
}
switch (dev->type) {
-#ifdef CONFIG_FDDI
+#if defined(CONFIG_FDDI) || defined(CONFIG_FDDI_MODULE)
case ARPHRD_FDDI:
/*
* According to RFC 1390, FDDI devices should accept ARP
@@ -1151,7 +1158,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
struct rtable * rt;
if ((err = ip_route_output_key(net, &rt, &fl)) != 0)
return err;
- dev = rt->u.dst.dev;
+ dev = rt->dst.dev;
ip_rt_put(rt);
if (!dev)
return -EINVAL;
@@ -1452,14 +1459,3 @@ static int __init arp_proc_init(void)
}
#endif /* CONFIG_PROC_FS */
-
-EXPORT_SYMBOL(arp_broken_ops);
-EXPORT_SYMBOL(arp_find);
-EXPORT_SYMBOL(arp_create);
-EXPORT_SYMBOL(arp_xmit);
-EXPORT_SYMBOL(arp_send);
-EXPORT_SYMBOL(arp_tbl);
-
-#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
-EXPORT_SYMBOL(clip_tbl_hook);
-#endif
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 1e029dc7545..3a92a76ae41 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -44,6 +44,7 @@
#include <linux/string.h>
#include <linux/jhash.h>
#include <linux/audit.h>
+#include <linux/slab.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/tcp.h>
@@ -289,8 +290,6 @@ void cipso_v4_cache_invalidate(void)
cipso_v4_cache[iter].size = 0;
spin_unlock_bh(&cipso_v4_cache[iter].lock);
}
-
- return;
}
/**
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index fb2465811b4..f0550941df7 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -69,9 +69,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
sk->sk_state = TCP_ESTABLISHED;
inet->inet_id = jiffies;
- sk_dst_set(sk, &rt->u.dst);
+ sk_dst_set(sk, &rt->dst);
return(0);
}
-
EXPORT_SYMBOL(ip4_datagram_connect);
-
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 51ca946e339..da14c49284f 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -50,6 +50,7 @@
#include <linux/notifier.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
+#include <linux/slab.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
@@ -1080,6 +1081,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
}
ip_mc_up(in_dev);
/* fall through */
+ case NETDEV_NOTIFY_PEERS:
case NETDEV_CHANGEADDR:
/* Send gratuitous ARP to notify of link change */
if (IN_DEV_ARP_NOTIFY(in_dev)) {
@@ -1095,10 +1097,10 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
case NETDEV_DOWN:
ip_mc_down(in_dev);
break;
- case NETDEV_BONDING_OLDTYPE:
+ case NETDEV_PRE_TYPE_CHANGE:
ip_mc_unmap(in_dev);
break;
- case NETDEV_BONDING_NEWTYPE:
+ case NETDEV_POST_TYPE_CHANGE:
ip_mc_remap(in_dev);
break;
case NETDEV_CHANGEMTU:
@@ -1194,7 +1196,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
hlist_for_each_entry_rcu(dev, node, head, index_hlist) {
if (idx < s_idx)
goto cont;
- if (idx > s_idx)
+ if (h > s_h || idx > s_idx)
s_ip_idx = 0;
in_dev = __in_dev_get_rcu(dev);
if (!in_dev)
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 9b3e28ed524..a4396891835 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -34,6 +34,7 @@
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/list.h>
+#include <linux/slab.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -174,6 +175,7 @@ out:
fib_res_put(&res);
return dev;
}
+EXPORT_SYMBOL(ip_dev_find);
/*
* Find address type as if only "dev" was present in the system. If
@@ -213,12 +215,14 @@ unsigned int inet_addr_type(struct net *net, __be32 addr)
{
return __inet_dev_addr_type(net, NULL, addr);
}
+EXPORT_SYMBOL(inet_addr_type);
unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
__be32 addr)
{
return __inet_dev_addr_type(net, dev, addr);
}
+EXPORT_SYMBOL(inet_dev_addr_type);
/* Given (packet source, input interface) and optional (dst, oif, tos):
- (main) check, that source is valid i.e. not broadcast or our local
@@ -283,7 +287,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
if (no_addr)
goto last_resort;
if (rpf == 1)
- goto e_inval;
+ goto e_rpf;
fl.oif = dev->ifindex;
ret = 0;
@@ -298,7 +302,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
last_resort:
if (rpf)
- goto e_inval;
+ goto e_rpf;
*spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
*itag = 0;
return 0;
@@ -307,6 +311,8 @@ e_inval_res:
fib_res_put(&res);
e_inval:
return -EINVAL;
+e_rpf:
+ return -EXDEV;
}
static inline __be32 sk_extract_addr(struct sockaddr *addr)
@@ -1074,7 +1080,3 @@ void __init ip_fib_init(void)
fib_hash_init();
}
-
-EXPORT_SYMBOL(inet_addr_type);
-EXPORT_SYMBOL(inet_dev_addr_type);
-EXPORT_SYMBOL(ip_dev_find);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 14972017b9c..4ed7e0dea1b 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -32,6 +32,7 @@
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
+#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/ip.h>
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index ca2d07b1c70..76daeb5ff56 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -213,7 +213,6 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
{
struct fib4_rule *rule4 = (struct fib4_rule *) rule;
- frh->family = AF_INET;
frh->dst_len = rule4->dst_len;
frh->src_len = rule4->src_len;
frh->tos = rule4->tos;
@@ -234,23 +233,6 @@ nla_put_failure:
return -ENOBUFS;
}
-static u32 fib4_rule_default_pref(struct fib_rules_ops *ops)
-{
- struct list_head *pos;
- struct fib_rule *rule;
-
- if (!list_empty(&ops->rules_list)) {
- pos = ops->rules_list.next;
- if (pos->next != &ops->rules_list) {
- rule = list_entry(pos->next, struct fib_rule, list);
- if (rule->pref)
- return rule->pref - 1;
- }
- }
-
- return 0;
-}
-
static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
{
return nla_total_size(4) /* dst */
@@ -263,7 +245,7 @@ static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
rt_cache_flush(ops->fro_net, -1);
}
-static struct fib_rules_ops fib4_rules_ops_template = {
+static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = {
.family = AF_INET,
.rule_size = sizeof(struct fib4_rule),
.addr_size = sizeof(u32),
@@ -272,7 +254,7 @@ static struct fib_rules_ops fib4_rules_ops_template = {
.configure = fib4_rule_configure,
.compare = fib4_rule_compare,
.fill = fib4_rule_fill,
- .default_pref = fib4_rule_default_pref,
+ .default_pref = fib_default_rule_pref,
.nlmsg_payload = fib4_rule_nlmsg_payload,
.flush_cache = fib4_rule_flush_cache,
.nlgroup = RTNLGRP_IPV4_RULE,
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 1af0ea0fb6a..20f09c5b31e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -32,6 +32,7 @@
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/init.h>
+#include <linux/slab.h>
#include <net/arp.h>
#include <net/ip.h>
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index af5d8979286..79d057a939b 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -71,6 +71,7 @@
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
+#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -208,7 +209,9 @@ static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
{
struct node *ret = tnode_get_child(tn, i);
- return rcu_dereference(ret);
+ return rcu_dereference_check(ret,
+ rcu_read_lock_held() ||
+ lockdep_rtnl_is_held());
}
static inline int tnode_child_length(const struct tnode *tn)
@@ -961,7 +964,9 @@ fib_find_node(struct trie *t, u32 key)
struct node *n;
pos = 0;
- n = rcu_dereference(t->trie);
+ n = rcu_dereference_check(t->trie,
+ rcu_read_lock_held() ||
+ lockdep_rtnl_is_held());
while (n != NULL && NODE_TYPE(n) == T_TNODE) {
tn = (struct tnode *) n;
@@ -1017,8 +1022,6 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
rcu_assign_pointer(t->trie, (struct node *)tn);
tnode_free_flush();
-
- return;
}
/* only used from updater-side */
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 4b4c2bcd15d..a0d847c7cba 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -74,6 +74,7 @@
#include <linux/netdevice.h>
#include <linux/string.h>
#include <linux/netfilter_ipv4.h>
+#include <linux/slab.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/route.h>
@@ -180,6 +181,7 @@ const struct icmp_err icmp_err_convert[] = {
.fatal = 1,
},
};
+EXPORT_SYMBOL(icmp_err_convert);
/*
* ICMP control array. This specifies what to do with each ICMP.
@@ -266,11 +268,12 @@ int xrlim_allow(struct dst_entry *dst, int timeout)
dst->rate_tokens = token;
return rc;
}
+EXPORT_SYMBOL(xrlim_allow);
static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
int type, int code)
{
- struct dst_entry *dst = &rt->u.dst;
+ struct dst_entry *dst = &rt->dst;
int rc = 1;
if (type > NR_ICMP_TYPES)
@@ -326,13 +329,14 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
struct sock *sk;
struct sk_buff *skb;
- sk = icmp_sk(dev_net((*rt)->u.dst.dev));
+ sk = icmp_sk(dev_net((*rt)->dst.dev));
if (ip_append_data(sk, icmp_glue_bits, icmp_param,
icmp_param->data_len+icmp_param->head_len,
icmp_param->head_len,
- ipc, rt, MSG_DONTWAIT) < 0)
+ ipc, rt, MSG_DONTWAIT) < 0) {
+ ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_OUTERRORS);
ip_flush_pending_frames(sk);
- else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
+ } else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
struct icmphdr *icmph = icmp_hdr(skb);
__wsum csum = 0;
struct sk_buff *skb1;
@@ -357,7 +361,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
{
struct ipcm_cookie ipc;
struct rtable *rt = skb_rtable(skb);
- struct net *net = dev_net(rt->u.dst.dev);
+ struct net *net = dev_net(rt->dst.dev);
struct sock *sk;
struct inet_sock *inet;
__be32 daddr;
@@ -425,7 +429,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
if (!rt)
goto out;
- net = dev_net(rt->u.dst.dev);
+ net = dev_net(rt->dst.dev);
/*
* Find the original header. It is expected to be valid, of course.
@@ -585,20 +589,20 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
err = __ip_route_output_key(net, &rt2, &fl);
else {
struct flowi fl2 = {};
- struct dst_entry *odst;
+ unsigned long orefdst;
fl2.fl4_dst = fl.fl4_src;
if (ip_route_output_key(net, &rt2, &fl2))
goto relookup_failed;
/* Ugh! */
- odst = skb_dst(skb_in);
+ orefdst = skb_in->_skb_refdst; /* save old refdst */
err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src,
- RT_TOS(tos), rt2->u.dst.dev);
+ RT_TOS(tos), rt2->dst.dev);
- dst_release(&rt2->u.dst);
+ dst_release(&rt2->dst);
rt2 = skb_rtable(skb_in);
- skb_dst_set(skb_in, odst);
+ skb_in->_skb_refdst = orefdst; /* restore old refdst */
}
if (err)
@@ -608,7 +612,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
XFRM_LOOKUP_ICMP);
switch (err) {
case 0:
- dst_release(&rt->u.dst);
+ dst_release(&rt->dst);
rt = rt2;
break;
case -EPERM:
@@ -627,7 +631,7 @@ route_done:
/* RFC says return as much as we can without exceeding 576 bytes. */
- room = dst_mtu(&rt->u.dst);
+ room = dst_mtu(&rt->dst);
if (room > 576)
room = 576;
room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen;
@@ -645,6 +649,7 @@ out_unlock:
icmp_xmit_unlock(sk);
out:;
}
+EXPORT_SYMBOL(icmp_send);
/*
@@ -923,6 +928,7 @@ static void icmp_address(struct sk_buff *skb)
/*
* RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain
* loudly if an inconsistency is found.
+ * called with rcu_read_lock()
*/
static void icmp_address_reply(struct sk_buff *skb)
@@ -933,12 +939,12 @@ static void icmp_address_reply(struct sk_buff *skb)
struct in_ifaddr *ifa;
if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC))
- goto out;
+ return;
- in_dev = in_dev_get(dev);
+ in_dev = __in_dev_get_rcu(dev);
if (!in_dev)
- goto out;
- rcu_read_lock();
+ return;
+
if (in_dev->ifa_list &&
IN_DEV_LOG_MARTIANS(in_dev) &&
IN_DEV_FORWARD(in_dev)) {
@@ -956,9 +962,6 @@ static void icmp_address_reply(struct sk_buff *skb)
mp, dev->name, &rt->rt_src);
}
}
- rcu_read_unlock();
- in_dev_put(in_dev);
-out:;
}
static void icmp_discard(struct sk_buff *skb)
@@ -972,7 +975,7 @@ int icmp_rcv(struct sk_buff *skb)
{
struct icmphdr *icmph;
struct rtable *rt = skb_rtable(skb);
- struct net *net = dev_net(rt->u.dst.dev);
+ struct net *net = dev_net(rt->dst.dev);
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
struct sec_path *sp = skb_sec_path(skb);
@@ -1214,7 +1217,3 @@ int __init icmp_init(void)
{
return register_pernet_subsys(&icmp_sk_ops);
}
-
-EXPORT_SYMBOL(icmp_err_convert);
-EXPORT_SYMBOL(icmp_send);
-EXPORT_SYMBOL(xrlim_allow);
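
The exports are now declared right after their definitions instead of being
collected at the bottom of the file. The usual call shape of icmp_send(),
taken from the ip_forward() hunk later in this diff:

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
		  htonl(dst_mtu(&rt->dst)));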
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 63bf298ca10..a1ad0e7180d 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -71,6 +71,7 @@
*/
#include <linux/module.h>
+#include <linux/slab.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/types.h>
@@ -311,7 +312,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
return NULL;
}
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
skb->dev = dev;
skb_reserve(skb, LL_RESERVED_SPACE(dev));
@@ -329,7 +330,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
pip->saddr = rt->rt_src;
pip->protocol = IPPROTO_IGMP;
pip->tot_len = 0; /* filled in later */
- ip_select_ident(pip, &rt->u.dst, NULL);
+ ip_select_ident(pip, &rt->dst, NULL);
((u8*)&pip[1])[0] = IPOPT_RA;
((u8*)&pip[1])[1] = 4;
((u8*)&pip[1])[2] = 0;
@@ -659,7 +660,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
return -1;
}
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
skb_reserve(skb, LL_RESERVED_SPACE(dev));
@@ -675,7 +676,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
iph->daddr = dst;
iph->saddr = rt->rt_src;
iph->protocol = IPPROTO_IGMP;
- ip_select_ident(iph, &rt->u.dst, NULL);
+ ip_select_ident(iph, &rt->dst, NULL);
((u8*)&iph[1])[0] = IPOPT_RA;
((u8*)&iph[1])[1] = 4;
((u8*)&iph[1])[2] = 0;
@@ -915,18 +916,19 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
read_unlock(&in_dev->mc_list_lock);
}
+/* called in rcu_read_lock() section */
int igmp_rcv(struct sk_buff *skb)
{
/* This basically follows the spec line by line -- see RFC1112 */
struct igmphdr *ih;
- struct in_device *in_dev = in_dev_get(skb->dev);
+ struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
int len = skb->len;
if (in_dev == NULL)
goto drop;
if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
- goto drop_ref;
+ goto drop;
switch (skb->ip_summed) {
case CHECKSUM_COMPLETE:
@@ -936,7 +938,7 @@ int igmp_rcv(struct sk_buff *skb)
case CHECKSUM_NONE:
skb->csum = 0;
if (__skb_checksum_complete(skb))
- goto drop_ref;
+ goto drop;
}
ih = igmp_hdr(skb);
@@ -956,7 +958,6 @@ int igmp_rcv(struct sk_buff *skb)
break;
case IGMP_PIM:
#ifdef CONFIG_IP_PIMSM_V1
- in_dev_put(in_dev);
return pim_rcv_v1(skb);
#endif
case IGMPV3_HOST_MEMBERSHIP_REPORT:
@@ -970,8 +971,6 @@ int igmp_rcv(struct sk_buff *skb)
break;
}
-drop_ref:
- in_dev_put(in_dev);
drop:
kfree_skb(skb);
return 0;
@@ -997,7 +996,7 @@ static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr)
--ANK
*/
if (arp_mc_map(addr, buf, dev, 0) == 0)
- dev_mc_add(dev, buf, dev->addr_len, 0);
+ dev_mc_add(dev, buf);
}
/*
@@ -1010,7 +1009,7 @@ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr)
struct net_device *dev = in_dev->dev;
if (arp_mc_map(addr, buf, dev, 0) == 0)
- dev_mc_delete(dev, buf, dev->addr_len, 0);
+ dev_mc_del(dev, buf);
}
#ifdef CONFIG_IP_MULTICAST
@@ -1245,6 +1244,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
out:
return;
}
+EXPORT_SYMBOL(ip_mc_inc_group);
/*
* Resend IGMP JOIN report; used for bonding.
@@ -1267,6 +1267,7 @@ void ip_mc_rejoin_group(struct ip_mc_list *im)
igmp_ifc_event(in_dev);
#endif
}
+EXPORT_SYMBOL(ip_mc_rejoin_group);
/*
* A socket has left a multicast group on device dev
@@ -1297,6 +1298,7 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
}
}
}
+EXPORT_SYMBOL(ip_mc_dec_group);
/* Device changing type */
@@ -1426,7 +1428,7 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
}
if (!dev && !ip_route_output_key(net, &rt, &fl)) {
- dev = rt->u.dst.dev;
+ dev = rt->dst.dev;
ip_rt_put(rt);
}
if (dev) {
@@ -1645,8 +1647,7 @@ static int sf_setstate(struct ip_mc_list *pmc)
if (dpsf->sf_inaddr == psf->sf_inaddr)
break;
if (!dpsf) {
- dpsf = (struct ip_sf_list *)
- kmalloc(sizeof(*dpsf), GFP_ATOMIC);
+ dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC);
if (!dpsf)
continue;
*dpsf = *psf;
@@ -1806,6 +1807,7 @@ done:
rtnl_unlock();
return err;
}
+EXPORT_SYMBOL(ip_mc_join_group);
static void ip_sf_socklist_reclaim(struct rcu_head *rp)
{
@@ -2678,8 +2680,3 @@ int __init igmp_mc_proc_init(void)
return register_pernet_subsys(&igmp_net_ops);
}
#endif
-
-EXPORT_SYMBOL(ip_mc_dec_group);
-EXPORT_SYMBOL(ip_mc_inc_group);
-EXPORT_SYMBOL(ip_mc_join_group);
-EXPORT_SYMBOL(ip_mc_rejoin_group);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 8da6429269d..7174370b119 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -37,6 +37,9 @@ struct local_ports sysctl_local_ports __read_mostly = {
.range = { 32768, 61000 },
};
+unsigned long *sysctl_local_reserved_ports;
+EXPORT_SYMBOL(sysctl_local_reserved_ports);
+
void inet_get_local_port_range(int *low, int *high)
{
unsigned seq;
@@ -81,7 +84,6 @@ int inet_csk_bind_conflict(const struct sock *sk,
}
return node != NULL;
}
-
EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
/* Obtain a reference to a local port for the given sock,
@@ -108,6 +110,8 @@ again:
smallest_size = -1;
do {
+ if (inet_is_reserved_local_port(rover))
+ goto next_nolock;
head = &hashinfo->bhash[inet_bhashfn(net, rover,
hashinfo->bhash_size)];
spin_lock(&head->lock);
@@ -130,6 +134,7 @@ again:
break;
next:
spin_unlock(&head->lock);
+ next_nolock:
if (++rover > high)
rover = low;
} while (--remaining > 0);
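
inet_is_reserved_local_port() gates the autobind search above. A sketch of
what the helper amounts to, assuming it is a plain bitmap test against the
65536-bit sysctl_local_reserved_ports map allocated in inet_init() earlier
in this diff:

	static inline int inet_is_reserved_local_port(int port)
	{
		return test_bit(port, sysctl_local_reserved_ports);
	}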
@@ -206,7 +211,6 @@ fail:
local_bh_enable();
return ret;
}
-
EXPORT_SYMBOL_GPL(inet_csk_get_port);
/*
@@ -234,7 +238,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
* having to remove and re-insert us on the wait queue.
*/
for (;;) {
- prepare_to_wait_exclusive(sk->sk_sleep, &wait,
+ prepare_to_wait_exclusive(sk_sleep(sk), &wait,
TASK_INTERRUPTIBLE);
release_sock(sk);
if (reqsk_queue_empty(&icsk->icsk_accept_queue))
@@ -253,7 +257,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
if (!timeo)
break;
}
- finish_wait(sk->sk_sleep, &wait);
+ finish_wait(sk_sleep(sk), &wait);
return err;
}
@@ -299,7 +303,6 @@ out_err:
*err = error;
goto out;
}
-
EXPORT_SYMBOL(inet_csk_accept);
/*
@@ -321,7 +324,6 @@ void inet_csk_init_xmit_timers(struct sock *sk,
setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk);
icsk->icsk_pending = icsk->icsk_ack.pending = 0;
}
-
EXPORT_SYMBOL(inet_csk_init_xmit_timers);
void inet_csk_clear_xmit_timers(struct sock *sk)
@@ -334,21 +336,18 @@ void inet_csk_clear_xmit_timers(struct sock *sk)
sk_stop_timer(sk, &icsk->icsk_delack_timer);
sk_stop_timer(sk, &sk->sk_timer);
}
-
EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
void inet_csk_delete_keepalive_timer(struct sock *sk)
{
sk_stop_timer(sk, &sk->sk_timer);
}
-
EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
{
sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
}
-
EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
struct dst_entry *inet_csk_route_req(struct sock *sk,
@@ -377,7 +376,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
goto no_route;
if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
goto route_err;
- return &rt->u.dst;
+ return &rt->dst;
route_err:
ip_rt_put(rt);
@@ -385,7 +384,6 @@ no_route:
IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
return NULL;
}
-
EXPORT_SYMBOL_GPL(inet_csk_route_req);
static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
@@ -427,7 +425,6 @@ struct request_sock *inet_csk_search_req(const struct sock *sk,
return req;
}
-
EXPORT_SYMBOL_GPL(inet_csk_search_req);
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
@@ -441,11 +438,11 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
inet_csk_reqsk_queue_added(sk, timeout);
}
+EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
/* Only thing we need from tcp.h */
extern int sysctl_tcp_synack_retries;
-EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
/* Decide when to expire the request and when to resend SYN-ACK */
static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
@@ -563,7 +560,6 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
if (lopt->qlen)
inet_csk_reset_keepalive_timer(parent, interval);
}
-
EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
@@ -593,7 +589,6 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
}
return newsk;
}
-
EXPORT_SYMBOL_GPL(inet_csk_clone);
/*
@@ -624,7 +619,6 @@ void inet_csk_destroy_sock(struct sock *sk)
percpu_counter_dec(sk->sk_prot->orphan_count);
sock_put(sk);
}
-
EXPORT_SYMBOL(inet_csk_destroy_sock);
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
@@ -659,7 +653,6 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
__reqsk_queue_destroy(&icsk->icsk_accept_queue);
return -EADDRINUSE;
}
-
EXPORT_SYMBOL_GPL(inet_csk_listen_start);
/*
@@ -714,7 +707,6 @@ void inet_csk_listen_stop(struct sock *sk)
}
WARN_ON(sk->sk_ack_backlog);
}
-
EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
@@ -726,7 +718,6 @@ void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
sin->sin_addr.s_addr = inet->inet_daddr;
sin->sin_port = inet->inet_dport;
}
-
EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
#ifdef CONFIG_COMPAT
@@ -741,7 +732,6 @@ int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
return icsk->icsk_af_ops->getsockopt(sk, level, optname,
optval, optlen);
}
-
EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt);
int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
@@ -755,6 +745,5 @@ int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
return icsk->icsk_af_ops->setsockopt(sk, level, optname,
optval, optlen);
}
-
EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
#endif
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 1aaa8110d84..e5fa2ddce32 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -14,6 +14,7 @@
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/random.h>
+#include <linux/slab.h>
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/time.h>
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index eaf3e2c8646..5ff2a51b6d0 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -19,6 +19,7 @@
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
+#include <linux/slab.h>
#include <net/inet_frag.h>
@@ -113,7 +114,6 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
fq->last_in |= INET_FRAG_COMPLETE;
}
}
-
EXPORT_SYMBOL(inet_frag_kill);
static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 2b79377b468..fb7ad5a21ff 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -99,7 +99,6 @@ void inet_put_port(struct sock *sk)
__inet_put_port(sk);
local_bh_enable();
}
-
EXPORT_SYMBOL(inet_put_port);
void __inet_inherit_port(struct sock *sk, struct sock *child)
@@ -116,7 +115,6 @@ void __inet_inherit_port(struct sock *sk, struct sock *child)
inet_csk(child)->icsk_bind_hash = tb;
spin_unlock(&head->lock);
}
-
EXPORT_SYMBOL_GPL(__inet_inherit_port);
static inline int compute_score(struct sock *sk, struct net *net,
@@ -456,6 +454,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
local_bh_disable();
for (i = 1; i <= remaining; i++) {
port = low + (i + offset) % remaining;
+ if (inet_is_reserved_local_port(port))
+ continue;
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
spin_lock(&head->lock);
@@ -544,7 +544,6 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row,
return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
__inet_check_established, __inet_hash_nolisten);
}
-
EXPORT_SYMBOL_GPL(inet_hash_connect);
void inet_hashinfo_init(struct inet_hashinfo *h)
@@ -558,5 +557,4 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
i + LISTENING_NULLS_BASE);
}
}
-
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index cc94cc2d8b2..c5af909cf70 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -10,6 +10,7 @@
#include <linux/kernel.h>
#include <linux/kmemcheck.h>
+#include <linux/slab.h>
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 6bcfe52a9c8..9ffa24b9a80 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -51,8 +51,8 @@
* lookups performed with disabled BHs.
*
* Serialisation issues.
- * 1. Nodes may appear in the tree only with the pool write lock held.
- * 2. Nodes may disappear from the tree only with the pool write lock held
+ * 1. Nodes may appear in the tree only with the pool lock held.
+ * 2. Nodes may disappear from the tree only with the pool lock held
* AND reference count being 0.
* 3. Nodes appears and disappears from unused node list only under
* "inet_peer_unused_lock".
@@ -64,23 +64,31 @@
* usually under some other lock to prevent node disappearing
* dtime: unused node list lock
* v4daddr: unchangeable
- * ip_id_count: idlock
+ * ip_id_count: atomic value (no lock needed)
*/
static struct kmem_cache *peer_cachep __read_mostly;
#define node_height(x) x->avl_height
-static struct inet_peer peer_fake_node = {
- .avl_left = &peer_fake_node,
- .avl_right = &peer_fake_node,
+
+#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
+static const struct inet_peer peer_fake_node = {
+ .avl_left = peer_avl_empty,
+ .avl_right = peer_avl_empty,
.avl_height = 0
};
-#define peer_avl_empty (&peer_fake_node)
-static struct inet_peer *peer_root = peer_avl_empty;
-static DEFINE_RWLOCK(peer_pool_lock);
+
+static struct {
+ struct inet_peer *root;
+ spinlock_t lock;
+ int total;
+} peers = {
+ .root = peer_avl_empty,
+ .lock = __SPIN_LOCK_UNLOCKED(peers.lock),
+ .total = 0,
+};
#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
-static int peer_total;
/* Exported for sysctl_net_ipv4. */
int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more
* aggressively at this stage */
@@ -89,8 +97,13 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min
int inet_peer_gc_mintime __read_mostly = 10 * HZ;
int inet_peer_gc_maxtime __read_mostly = 120 * HZ;
-static LIST_HEAD(unused_peers);
-static DEFINE_SPINLOCK(inet_peer_unused_lock);
+static struct {
+ struct list_head list;
+ spinlock_t lock;
+} unused_peers = {
+ .list = LIST_HEAD_INIT(unused_peers.list),
+ .lock = __SPIN_LOCK_UNLOCKED(unused_peers.lock),
+};
static void peer_check_expire(unsigned long dummy);
static DEFINE_TIMER(peer_periodic_timer, peer_check_expire, 0, 0);
@@ -116,7 +129,7 @@ void __init inet_initpeers(void)
peer_cachep = kmem_cache_create("inet_peer_cache",
sizeof(struct inet_peer),
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
NULL);
/* All the timers, started at system startup tend
@@ -131,38 +144,69 @@ void __init inet_initpeers(void)
/* Called with or without local BH being disabled. */
static void unlink_from_unused(struct inet_peer *p)
{
- spin_lock_bh(&inet_peer_unused_lock);
- list_del_init(&p->unused);
- spin_unlock_bh(&inet_peer_unused_lock);
+ if (!list_empty(&p->unused)) {
+ spin_lock_bh(&unused_peers.lock);
+ list_del_init(&p->unused);
+ spin_unlock_bh(&unused_peers.lock);
+ }
}
/*
* Called with local BH disabled and the pool lock held.
- * _stack is known to be NULL or not at compile time,
- * so compiler will optimize the if (_stack) tests.
*/
#define lookup(_daddr, _stack) \
({ \
struct inet_peer *u, **v; \
- if (_stack != NULL) { \
- stackptr = _stack; \
- *stackptr++ = &peer_root; \
- } \
- for (u = peer_root; u != peer_avl_empty; ) { \
+ \
+ stackptr = _stack; \
+ *stackptr++ = &peers.root; \
+ for (u = peers.root; u != peer_avl_empty; ) { \
if (_daddr == u->v4daddr) \
break; \
if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \
v = &u->avl_left; \
else \
v = &u->avl_right; \
- if (_stack != NULL) \
- *stackptr++ = v; \
+ *stackptr++ = v; \
u = *v; \
} \
u; \
})
-/* Called with local BH disabled and the pool write lock held. */
+/*
+ * Called with rcu_read_lock_bh()
+ * Because we hold no lock against a writer, it's quite possible we fall
+ * into an endless loop.
+ * But every pointer we follow is guaranteed to be valid thanks to RCU.
+ * We exit from this function if the number of links exceeds PEER_MAXDEPTH.
+ */
+static struct inet_peer *lookup_rcu_bh(__be32 daddr)
+{
+ struct inet_peer *u = rcu_dereference_bh(peers.root);
+ int count = 0;
+
+ while (u != peer_avl_empty) {
+ if (daddr == u->v4daddr) {
+ /* Before taking a reference, check if this entry was
+ * deleted; unlink_from_pool() sets refcnt=-1 to make
+ * a distinction between an unused entry (refcnt=0) and
+ * a freed one.
+ */
+ if (unlikely(!atomic_add_unless(&u->refcnt, 1, -1)))
+ u = NULL;
+ return u;
+ }
+ if ((__force __u32)daddr < (__force __u32)u->v4daddr)
+ u = rcu_dereference_bh(u->avl_left);
+ else
+ u = rcu_dereference_bh(u->avl_right);
+ if (unlikely(++count == PEER_MAXDEPTH))
+ break;
+ }
+ return NULL;
+}
+
+/* Called with local BH disabled and the pool lock held. */
#define lookup_rightempty(start) \
({ \
struct inet_peer *u, **v; \
@@ -176,9 +220,10 @@ static void unlink_from_unused(struct inet_peer *p)
u; \
})
-/* Called with local BH disabled and the pool write lock held.
+/* Called with local BH disabled and the pool lock held.
* Variable names are the proof of operation correctness.
- * Look into mm/map_avl.c for more detail description of the ideas. */
+ * Look into mm/map_avl.c for a more detailed description of the ideas.
+ */
static void peer_avl_rebalance(struct inet_peer **stack[],
struct inet_peer ***stackend)
{
@@ -254,15 +299,21 @@ static void peer_avl_rebalance(struct inet_peer **stack[],
}
}
-/* Called with local BH disabled and the pool write lock held. */
+/* Called with local BH disabled and the pool lock held. */
#define link_to_pool(n) \
do { \
n->avl_height = 1; \
n->avl_left = peer_avl_empty; \
n->avl_right = peer_avl_empty; \
+ smp_wmb(); /* lockless readers can catch us now */ \
**--stackptr = n; \
peer_avl_rebalance(stack, stackptr); \
-} while(0)
+} while (0)
+
+static void inetpeer_free_rcu(struct rcu_head *head)
+{
+ kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
+}
/* May be called with local BH enabled. */
static void unlink_from_pool(struct inet_peer *p)
@@ -271,13 +322,14 @@ static void unlink_from_pool(struct inet_peer *p)
do_free = 0;
- write_lock_bh(&peer_pool_lock);
+ spin_lock_bh(&peers.lock);
/* Check the reference counter. It was artificially incremented by 1
- * in cleanup() function to prevent sudden disappearing. If the
- * reference count is still 1 then the node is referenced only as `p'
- * here and from the pool. So under the exclusive pool lock it's safe
- * to remove the node and free it later. */
- if (atomic_read(&p->refcnt) == 1) {
+ * in cleanup() function to prevent sudden disappearing. If we can
+ * atomically (because of lockless readers) take this last reference,
+ * it's safe to remove the node and free it later.
+ * We use refcnt=-1 to alert lockless readers this entry is deleted.
+ */
+ if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) {
struct inet_peer **stack[PEER_MAXDEPTH];
struct inet_peer ***stackptr, ***delp;
if (lookup(p->v4daddr, stack) != p)
@@ -303,20 +355,21 @@ static void unlink_from_pool(struct inet_peer *p)
delp[1] = &t->avl_left; /* was &p->avl_left */
}
peer_avl_rebalance(stack, stackptr);
- peer_total--;
+ peers.total--;
do_free = 1;
}
- write_unlock_bh(&peer_pool_lock);
+ spin_unlock_bh(&peers.lock);
if (do_free)
- kmem_cache_free(peer_cachep, p);
+ call_rcu_bh(&p->rcu, inetpeer_free_rcu);
else
/* The node is used again. Decrease the reference counter
* back. The loop "cleanup -> unlink_from_unused
* -> unlink_from_pool -> putpeer -> link_to_unused
* -> cleanup (for the same node)"
* doesn't really exist because the entry will have a
- * recent deletion time and will not be cleaned again soon. */
+ * recent deletion time and will not be cleaned again soon.
+ */
inet_putpeer(p);
}
@@ -326,16 +379,16 @@ static int cleanup_once(unsigned long ttl)
struct inet_peer *p = NULL;
/* Remove the first entry from the list of unused nodes. */
- spin_lock_bh(&inet_peer_unused_lock);
- if (!list_empty(&unused_peers)) {
+ spin_lock_bh(&unused_peers.lock);
+ if (!list_empty(&unused_peers.list)) {
__u32 delta;
- p = list_first_entry(&unused_peers, struct inet_peer, unused);
+ p = list_first_entry(&unused_peers.list, struct inet_peer, unused);
delta = (__u32)jiffies - p->dtime;
if (delta < ttl) {
/* Do not prune fresh entries. */
- spin_unlock_bh(&inet_peer_unused_lock);
+ spin_unlock_bh(&unused_peers.lock);
return -1;
}
@@ -345,7 +398,7 @@ static int cleanup_once(unsigned long ttl)
* before unlink_from_pool() call. */
atomic_inc(&p->refcnt);
}
- spin_unlock_bh(&inet_peer_unused_lock);
+ spin_unlock_bh(&unused_peers.lock);
if (p == NULL)
/* It means that the total number of USED entries has
@@ -360,62 +413,56 @@ static int cleanup_once(unsigned long ttl)
/* Called with or without local BH being disabled. */
struct inet_peer *inet_getpeer(__be32 daddr, int create)
{
- struct inet_peer *p, *n;
+ struct inet_peer *p;
struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr;
- /* Look up for the address quickly. */
- read_lock_bh(&peer_pool_lock);
- p = lookup(daddr, NULL);
- if (p != peer_avl_empty)
- atomic_inc(&p->refcnt);
- read_unlock_bh(&peer_pool_lock);
+ /* Look up for the address quickly, lockless.
+ * Because of a concurrent writer, we might not find an existing entry.
+ */
+ rcu_read_lock_bh();
+ p = lookup_rcu_bh(daddr);
+ rcu_read_unlock_bh();
+
+ if (p) {
+ /* The existing node has been found.
+ * Remove the entry from unused list if it was there.
+ */
+ unlink_from_unused(p);
+ return p;
+ }
+ /* retry an exact lookup, taking the lock before.
+ * At least, nodes should be hot in our cache.
+ */
+ spin_lock_bh(&peers.lock);
+ p = lookup(daddr, stack);
if (p != peer_avl_empty) {
- /* The existing node has been found. */
+ atomic_inc(&p->refcnt);
+ spin_unlock_bh(&peers.lock);
/* Remove the entry from unused list if it was there. */
unlink_from_unused(p);
return p;
}
+ p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
+ if (p) {
+ p->v4daddr = daddr;
+ atomic_set(&p->refcnt, 1);
+ atomic_set(&p->rid, 0);
+ atomic_set(&p->ip_id_count, secure_ip_id(daddr));
+ p->tcp_ts_stamp = 0;
+ INIT_LIST_HEAD(&p->unused);
+
+
+ /* Link the node. */
+ link_to_pool(p);
+ peers.total++;
+ }
+ spin_unlock_bh(&peers.lock);
- if (!create)
- return NULL;
-
- /* Allocate the space outside the locked region. */
- n = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
- if (n == NULL)
- return NULL;
- n->v4daddr = daddr;
- atomic_set(&n->refcnt, 1);
- atomic_set(&n->rid, 0);
- atomic_set(&n->ip_id_count, secure_ip_id(daddr));
- n->tcp_ts_stamp = 0;
-
- write_lock_bh(&peer_pool_lock);
- /* Check if an entry has suddenly appeared. */
- p = lookup(daddr, stack);
- if (p != peer_avl_empty)
- goto out_free;
-
- /* Link the node. */
- link_to_pool(n);
- INIT_LIST_HEAD(&n->unused);
- peer_total++;
- write_unlock_bh(&peer_pool_lock);
-
- if (peer_total >= inet_peer_threshold)
+ if (peers.total >= inet_peer_threshold)
/* Remove one less-recently-used entry. */
cleanup_once(0);
- return n;
-
-out_free:
- /* The appropriate node is already in the pool. */
- atomic_inc(&p->refcnt);
- write_unlock_bh(&peer_pool_lock);
- /* Remove the entry from unused list if it was there. */
- unlink_from_unused(p);
- /* Free preallocated the preallocated node. */
- kmem_cache_free(peer_cachep, n);
return p;
}
@@ -425,12 +472,12 @@ static void peer_check_expire(unsigned long dummy)
unsigned long now = jiffies;
int ttl;
- if (peer_total >= inet_peer_threshold)
+ if (peers.total >= inet_peer_threshold)
ttl = inet_peer_minttl;
else
ttl = inet_peer_maxttl
- (inet_peer_maxttl - inet_peer_minttl) / HZ *
- peer_total / inet_peer_threshold * HZ;
+ peers.total / inet_peer_threshold * HZ;
while (!cleanup_once(ttl)) {
if (jiffies != now)
break;
@@ -439,22 +486,25 @@ static void peer_check_expire(unsigned long dummy)
/* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
* interval depending on the total number of entries (more entries,
* less interval). */
- if (peer_total >= inet_peer_threshold)
+ if (peers.total >= inet_peer_threshold)
peer_periodic_timer.expires = jiffies + inet_peer_gc_mintime;
else
peer_periodic_timer.expires = jiffies
+ inet_peer_gc_maxtime
- (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
- peer_total / inet_peer_threshold * HZ;
+ peers.total / inet_peer_threshold * HZ;
add_timer(&peer_periodic_timer);
}
void inet_putpeer(struct inet_peer *p)
{
- spin_lock_bh(&inet_peer_unused_lock);
- if (atomic_dec_and_test(&p->refcnt)) {
- list_add_tail(&p->unused, &unused_peers);
+ local_bh_disable();
+
+ if (atomic_dec_and_lock(&p->refcnt, &unused_peers.lock)) {
+ list_add_tail(&p->unused, &unused_peers.list);
p->dtime = (__u32)jiffies;
+ spin_unlock(&unused_peers.lock);
}
- spin_unlock_bh(&inet_peer_unused_lock);
+
+ local_bh_enable();
}
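
The inetpeer conversion above follows a common pattern: an optimistic lockless lookup (here via RCU), falling back to an exact lookup and possible insert under peers.lock when the fast path finds nothing. Below is a minimal userspace analogue of that pattern, sketched with C11 atomics and a flat list instead of the kernel's AVL tree; unlike the kernel it never frees nodes, so it needs no RCU grace periods, and all names are illustrative:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct peer {
	struct peer *_Atomic next;
	unsigned int daddr;
	atomic_int refcnt;
};

static struct peer *_Atomic pool;
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

/* Fast path: walk without the lock.  Nodes are never freed in this
 * sketch, so a racing reader can at worst miss a brand-new entry. */
static struct peer *lookup_lockless(unsigned int daddr)
{
	for (struct peer *p = atomic_load(&pool); p; p = atomic_load(&p->next))
		if (p->daddr == daddr) {
			atomic_fetch_add(&p->refcnt, 1);
			return p;
		}
	return NULL;
}

/* Slow path: take the lock, retry the exact lookup, insert if absent. */
static struct peer *getpeer(unsigned int daddr)
{
	struct peer *p = lookup_lockless(daddr);

	if (p)
		return p;
	pthread_mutex_lock(&pool_lock);
	for (p = atomic_load(&pool); p; p = atomic_load(&p->next))
		if (p->daddr == daddr) {
			atomic_fetch_add(&p->refcnt, 1);
			goto out;
		}
	p = calloc(1, sizeof(*p));
	if (p) {
		p->daddr = daddr;
		atomic_init(&p->refcnt, 1);
		atomic_store(&p->next, atomic_load(&pool));
		atomic_store(&pool, p);	/* publish the fully built node */
	}
out:
	pthread_mutex_unlock(&pool_lock);
	return p;
}

int main(void)
{
	struct peer *a = getpeer(0x0a000001);
	struct peer *b = getpeer(0x0a000001);

	printf("same node: %s refcnt: %d\n",
	       a == b ? "yes" : "no", atomic_load(&a->refcnt));
	return 0;
}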
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index a2991bc8e32..99461f09320 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -25,6 +25,7 @@
#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/netdevice.h>
+#include <linux/slab.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp.h>
@@ -86,16 +87,16 @@ int ip_forward(struct sk_buff *skb)
if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
goto sr_failed;
- if (unlikely(skb->len > dst_mtu(&rt->u.dst) && !skb_is_gso(skb) &&
+ if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
(ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
- IP_INC_STATS(dev_net(rt->u.dst.dev), IPSTATS_MIB_FRAGFAILS);
+ IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(dst_mtu(&rt->u.dst)));
+ htonl(dst_mtu(&rt->dst)));
goto drop;
}
/* We are about to mangle packet. Copy it! */
- if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))
+ if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
goto drop;
iph = ip_hdr(skb);
@@ -111,8 +112,8 @@ int ip_forward(struct sk_buff *skb)
skb->priority = rt_tos2priority(iph->tos);
- return NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, rt->u.dst.dev,
- ip_forward_finish);
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
+ rt->dst.dev, ip_forward_finish);
sr_failed:
/*
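
For reference, the forwarding-path MTU test touched above generates ICMP_FRAG_NEEDED only when all four conditions line up: the packet exceeds the route MTU, is not GSO, carries DF, and skb->local_df is unset. A tiny sketch of the same predicate, with simplified stand-ins for the skb and rtable fields:

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the fields tested in ip_forward(). */
struct pkt {
	unsigned int len;
	bool gso;	/* skb_is_gso(skb) */
	bool df;	/* IP_DF set in the IP header */
	bool local_df;	/* skb->local_df: local fragmentation allowed */
};

static bool needs_frag_needed(const struct pkt *p, unsigned int mtu)
{
	return p->len > mtu && !p->gso && p->df && !p->local_df;
}

int main(void)
{
	struct pkt p = { .len = 1600, .gso = false, .df = true, .local_df = false };

	printf("send ICMP_FRAG_NEEDED: %d\n", needs_frag_needed(&p, 1500)); /* 1 */
	return 0;
}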
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b59430bc041..b7c41654dde 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -32,6 +32,7 @@
#include <linux/netdevice.h>
#include <linux/jhash.h>
#include <linux/random.h>
+#include <linux/slab.h>
#include <net/route.h>
#include <net/dst.h>
#include <net/sock.h>
@@ -123,11 +124,8 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a)
}
/* Memory Tracking Functions. */
-static __inline__ void frag_kfree_skb(struct netns_frags *nf,
- struct sk_buff *skb, int *work)
+static void frag_kfree_skb(struct netns_frags *nf, struct sk_buff *skb)
{
- if (work)
- *work -= skb->truesize;
atomic_sub(skb->truesize, &nf->mem);
kfree_skb(skb);
}
@@ -308,7 +306,7 @@ static int ip_frag_reinit(struct ipq *qp)
fp = qp->q.fragments;
do {
struct sk_buff *xp = fp->next;
- frag_kfree_skb(qp->q.net, fp, NULL);
+ frag_kfree_skb(qp->q.net, fp);
fp = xp;
} while (fp);
@@ -316,6 +314,7 @@ static int ip_frag_reinit(struct ipq *qp)
qp->q.len = 0;
qp->q.meat = 0;
qp->q.fragments = NULL;
+ qp->q.fragments_tail = NULL;
qp->iif = 0;
return 0;
@@ -388,6 +387,11 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
* in the chain of fragments so far. We must know where to put
* this fragment, right?
*/
+ prev = qp->q.fragments_tail;
+ if (!prev || FRAG_CB(prev)->offset < offset) {
+ next = NULL;
+ goto found;
+ }
prev = NULL;
for (next = qp->q.fragments; next != NULL; next = next->next) {
if (FRAG_CB(next)->offset >= offset)
@@ -395,6 +399,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
prev = next;
}
+found:
/* We found where to put this one. Check for overlap with
* preceding fragment, and, if needed, align things so that
* any overlaps are eliminated.
@@ -445,7 +450,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
qp->q.fragments = next;
qp->q.meat -= free_it->len;
- frag_kfree_skb(qp->q.net, free_it, NULL);
+ frag_kfree_skb(qp->q.net, free_it);
}
}
@@ -453,6 +458,8 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
/* Insert this fragment in the chain of fragments. */
skb->next = next;
+ if (!next)
+ qp->q.fragments_tail = skb;
if (prev)
prev->next = skb;
else
@@ -506,6 +513,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
goto out_nomem;
fp->next = head->next;
+ if (!fp->next)
+ qp->q.fragments_tail = fp;
prev->next = fp;
skb_morph(head, qp->q.fragments);
@@ -555,7 +564,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
skb_shinfo(head)->frag_list = head->next;
skb_push(head, head->data - skb_network_header(head));
- atomic_sub(head->truesize, &qp->q.net->mem);
for (fp=head->next; fp; fp = fp->next) {
head->data_len += fp->len;
@@ -565,8 +573,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
else if (head->ip_summed == CHECKSUM_COMPLETE)
head->csum = csum_add(head->csum, fp->csum);
head->truesize += fp->truesize;
- atomic_sub(fp->truesize, &qp->q.net->mem);
}
+ atomic_sub(head->truesize, &qp->q.net->mem);
head->next = NULL;
head->dev = dev;
@@ -577,6 +585,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
iph->tot_len = htons(len);
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
qp->q.fragments = NULL;
+ qp->q.fragments_tail = NULL;
return 0;
out_nomem:
@@ -623,6 +632,7 @@ int ip_defrag(struct sk_buff *skb, u32 user)
kfree_skb(skb);
return -ENOMEM;
}
+EXPORT_SYMBOL(ip_defrag);
#ifdef CONFIG_SYSCTL
static int zero;
@@ -776,5 +786,3 @@ void __init ipfrag_init(void)
ip4_frags.secret_interval = 10 * 60 * HZ;
inet_frags_init(&ip4_frags);
}
-
-EXPORT_SYMBOL(ip_defrag);
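
The new fragments_tail pointer gives reassembly O(1) appends in the common in-order case: if the arriving fragment's offset lies beyond the current tail, it is linked there directly instead of walking the whole chain from the head. A self-contained sketch of that fast path on a plain singly linked list (simplified types, and without the kernel's overlap trimming):

#include <stdio.h>
#include <stdlib.h>

struct frag {
	struct frag *next;
	unsigned int offset;
};

struct queue {
	struct frag *head;
	struct frag *tail;	/* analogue of qp->q.fragments_tail */
};

static void insert_frag(struct queue *q, struct frag *f)
{
	struct frag *prev = q->tail, *next;

	/* Fast path: in-order fragments go straight to the tail. */
	if (!prev || prev->offset < f->offset) {
		next = NULL;
		goto found;
	}
	/* Slow path: walk from the head to find the insertion point. */
	prev = NULL;
	for (next = q->head; next; next = next->next) {
		if (next->offset >= f->offset)
			break;
		prev = next;
	}
found:
	f->next = next;
	if (!next)
		q->tail = f;	/* new last element */
	if (prev)
		prev->next = f;
	else
		q->head = f;
}

int main(void)
{
	struct queue q = { NULL, NULL };
	unsigned int offs[] = { 0, 1480, 2960, 740 };	/* one out-of-order arrival */

	for (size_t i = 0; i < sizeof(offs) / sizeof(offs[0]); i++) {
		struct frag *f = malloc(sizeof(*f));
		f->offset = offs[i];
		insert_frag(&q, f);
	}
	for (struct frag *f = q.head; f; f = f->next)
		printf("%u ", f->offset);	/* 0 740 1480 2960 */
	printf("\n");
	return 0;
}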
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f47c9f76754..945b20a5ad5 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
+#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
@@ -501,7 +502,6 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
t->err_time = jiffies;
out:
rcu_read_unlock();
- return;
}
static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
@@ -537,7 +537,6 @@ static int ipgre_rcv(struct sk_buff *skb)
struct ip_tunnel *tunnel;
int offset = 4;
__be16 gre_proto;
- unsigned int len;
if (!pskb_may_pull(skb, 16))
goto drop_nolock;
@@ -628,8 +627,6 @@ static int ipgre_rcv(struct sk_buff *skb)
tunnel->i_seqno = seqno + 1;
}
- len = skb->len;
-
/* Warning: All skb pointers will be invalidated! */
if (tunnel->dev->type == ARPHRD_ETHER) {
if (!pskb_may_pull(skb, ETH_HLEN)) {
@@ -643,11 +640,7 @@ static int ipgre_rcv(struct sk_buff *skb)
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
}
- stats->rx_packets++;
- stats->rx_bytes += len;
- skb->dev = tunnel->dev;
- skb_dst_drop(skb);
- nf_reset(skb);
+ skb_tunnel_rx(skb, tunnel->dev);
skb_reset_network_header(skb);
ipgre_ecn_decapsulate(iph, skb);
@@ -738,6 +731,8 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
tos = 0;
if (skb->protocol == htons(ETH_P_IP))
tos = old_iph->tos;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
}
{
@@ -752,7 +747,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
goto tx_error;
}
}
- tdev = rt->u.dst.dev;
+ tdev = rt->dst.dev;
if (tdev == dev) {
ip_rt_put(rt);
@@ -762,7 +757,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
df = tiph->frag_off;
if (df)
- mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
+ mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
else
mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
@@ -810,11 +805,13 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
tunnel->err_count = 0;
}
- max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
+ max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
(skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
+ if (max_headroom > dev->needed_headroom)
+ dev->needed_headroom = max_headroom;
if (!new_skb) {
ip_rt_put(rt);
txq->tx_dropped++;
@@ -835,7 +832,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
IPSKB_REROUTED);
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
/*
* Push down and install the IPIP header.
@@ -858,7 +855,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
#endif
else
- iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
+ iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT);
}
((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
@@ -920,7 +917,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
.proto = IPPROTO_GRE };
struct rtable *rt;
if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
- tdev = rt->u.dst.dev;
+ tdev = rt->dst.dev;
ip_rt_put(rt);
}
@@ -1179,7 +1176,7 @@ static int ipgre_open(struct net_device *dev)
struct rtable *rt;
if (ip_route_output_key(dev_net(dev), &rt, &fl))
return -EADDRNOTAVAIL;
- dev = rt->u.dst.dev;
+ dev = rt->dst.dev;
ip_rt_put(rt);
if (__in_dev_get_rtnl(dev) == NULL)
return -EADDRNOTAVAIL;
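
One behavioral change above: when GRE carries IPv6, the outer ToS is now inherited from the inner traffic class via ipv6_get_dsfield() instead of staying zero. The DS field spans the low nibble of the first IPv6 header byte and the high nibble of the second; a small sketch extracting it from raw header bytes:

#include <stdint.h>
#include <stdio.h>

/* Return the DS field (traffic class) from a raw IPv6 header.
 * Byte 0 is version(4) | tc[7:4]; byte 1 is tc[3:0] | flowlabel[19:16]. */
static uint8_t ipv6_dsfield(const uint8_t *hdr)
{
	return (uint8_t)((hdr[0] << 4) | (hdr[1] >> 4));
}

int main(void)
{
	/* version 6, traffic class 0xb8 (DSCP EF), flow label 0 */
	uint8_t hdr[2] = { 0x6b, 0x80 };

	printf("dsfield = 0x%02x\n", ipv6_dsfield(hdr));	/* 0xb8 */
	return 0;
}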
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index c29de9879fd..d859bcc26cb 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -119,6 +119,7 @@
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
+#include <linux/slab.h>
#include <linux/net.h>
#include <linux/socket.h>
@@ -145,7 +146,7 @@
#include <linux/netlink.h>
/*
- * Process Router Attention IP option
+ * Process Router Attention IP option (RFC 2113)
*/
int ip_call_ra_chain(struct sk_buff *skb)
{
@@ -154,8 +155,7 @@ int ip_call_ra_chain(struct sk_buff *skb)
struct sock *last = NULL;
struct net_device *dev = skb->dev;
- read_lock(&ip_ra_lock);
- for (ra = ip_ra_chain; ra; ra = ra->next) {
+ for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {
struct sock *sk = ra->sk;
/* If socket is bound to an interface, only report
@@ -166,10 +166,8 @@ int ip_call_ra_chain(struct sk_buff *skb)
sk->sk_bound_dev_if == dev->ifindex) &&
net_eq(sock_net(sk), dev_net(dev))) {
if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
- if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) {
- read_unlock(&ip_ra_lock);
+ if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN))
return 1;
- }
}
if (last) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
@@ -182,10 +180,8 @@ int ip_call_ra_chain(struct sk_buff *skb)
if (last) {
raw_rcv(last, skb);
- read_unlock(&ip_ra_lock);
return 1;
}
- read_unlock(&ip_ra_lock);
return 0;
}
@@ -265,7 +261,7 @@ int ip_local_deliver(struct sk_buff *skb)
return 0;
}
- return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
ip_local_deliver_finish);
}
@@ -297,18 +293,16 @@ static inline int ip_rcv_options(struct sk_buff *skb)
}
if (unlikely(opt->srr)) {
- struct in_device *in_dev = in_dev_get(dev);
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+
if (in_dev) {
if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
if (IN_DEV_LOG_MARTIANS(in_dev) &&
net_ratelimit())
printk(KERN_INFO "source route option %pI4 -> %pI4\n",
&iph->saddr, &iph->daddr);
- in_dev_put(in_dev);
goto drop;
}
-
- in_dev_put(in_dev);
}
if (ip_options_rcv_srr(skb))
@@ -330,8 +324,8 @@ static int ip_rcv_finish(struct sk_buff *skb)
* how the packet travels inside Linux networking.
*/
if (skb_dst(skb) == NULL) {
- int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
- skb->dev);
+ int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+ iph->tos, skb->dev);
if (unlikely(err)) {
if (err == -EHOSTUNREACH)
IP_INC_STATS_BH(dev_net(skb->dev),
@@ -339,13 +333,16 @@ static int ip_rcv_finish(struct sk_buff *skb)
else if (err == -ENETUNREACH)
IP_INC_STATS_BH(dev_net(skb->dev),
IPSTATS_MIB_INNOROUTES);
+ else if (err == -EXDEV)
+ NET_INC_STATS_BH(dev_net(skb->dev),
+ LINUX_MIB_IPRPFILTER);
goto drop;
}
}
#ifdef CONFIG_NET_CLS_ROUTE
if (unlikely(skb_dst(skb)->tclassid)) {
- struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id());
+ struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
u32 idx = skb_dst(skb)->tclassid;
st[idx&0xFF].o_packets++;
st[idx&0xFF].o_bytes += skb->len;
@@ -359,10 +356,10 @@ static int ip_rcv_finish(struct sk_buff *skb)
rt = skb_rtable(skb);
if (rt->rt_type == RTN_MULTICAST) {
- IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCAST,
+ IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
- IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INBCAST,
+ IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
skb->len);
return dst_input(skb);
@@ -443,7 +440,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
/* Must drop socket now because of tproxy. */
skb_orphan(skb);
- return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL,
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
ip_rcv_finish);
inhdr_error:
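
ip_call_ra_chain() now walks the Router Alert socket list under RCU instead of ip_ra_lock. The option it serves (RFC 2113) is a fixed 4-byte IP option: type 148, length 4, value 0, meaning "routers should examine this packet". A sketch that scans an IPv4 options area for it, assuming the caller passes just the options bytes:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define IPOPT_END	0
#define IPOPT_NOOP	1
#define IPOPT_RA	148	/* Router Alert, RFC 2113 */

/* Scan the options bytes of an IPv4 header for a Router Alert option. */
static bool has_router_alert(const uint8_t *opt, size_t optlen)
{
	size_t i = 0;

	while (i < optlen) {
		uint8_t type = opt[i];

		if (type == IPOPT_END)
			break;
		if (type == IPOPT_NOOP) {
			i++;
			continue;
		}
		if (i + 1 >= optlen || opt[i + 1] < 2)
			break;	/* malformed length */
		if (type == IPOPT_RA && opt[i + 1] == 4)
			return true;
		i += opt[i + 1];
	}
	return false;
}

int main(void)
{
	const uint8_t opts[] = { 148, 4, 0, 0 };	/* RA option, value 0 */

	printf("router alert: %d\n", has_router_alert(opts, sizeof(opts)));
	return 0;
}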
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 94bf105ef3c..ba9836c488e 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -11,6 +11,7 @@
#include <linux/capability.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/types.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
@@ -237,7 +238,6 @@ void ip_options_fragment(struct sk_buff * skb)
opt->rr_needaddr = 0;
opt->ts_needaddr = 0;
opt->ts_needtime = 0;
- return;
}
/*
@@ -600,6 +600,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
unsigned char *optptr = skb_network_header(skb) + opt->srr;
struct rtable *rt = skb_rtable(skb);
struct rtable *rt2;
+ unsigned long orefdst;
int err;
if (!opt->srr)
@@ -623,16 +624,16 @@ int ip_options_rcv_srr(struct sk_buff *skb)
}
memcpy(&nexthop, &optptr[srrptr-1], 4);
- rt = skb_rtable(skb);
+ orefdst = skb->_skb_refdst;
skb_dst_set(skb, NULL);
err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
rt2 = skb_rtable(skb);
if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
- ip_rt_put(rt2);
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_drop(skb);
+ skb->_skb_refdst = orefdst;
return -EINVAL;
}
- ip_rt_put(rt);
+ refdst_drop(orefdst);
if (rt2->rt_type != RTN_LOCAL)
break;
/* Superfast 8) loopback forward */
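
The SRR hunk above also changes how the original dst survives the nested re-route: the raw skb->_skb_refdst word is saved and restored rather than holding an extra refcount. For reference, the source-route option that ip_options_rcv_srr() consumes is (type, len, ptr) followed by 4-byte addresses, with the 1-based ptr naming the next slot; a minimal next-hop extraction, assuming a well-formed option as the kernel has already validated it by this point:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Extract the next hop from an SRR option: opt[0]=type, opt[1]=len,
 * opt[2]=ptr (1-based offset of the next address slot). */
static int srr_next_hop(const uint8_t *opt, uint32_t *nexthop)
{
	uint8_t len = opt[1], ptr = opt[2];

	if (ptr < 4 || ptr + 3 > len)
		return -1;		/* route is exhausted */
	memcpy(nexthop, &opt[ptr - 1], 4);	/* network byte order */
	return 0;
}

int main(void)
{
	/* type 131 (LSRR), len 7, ptr 4, one address: 10.0.0.1 */
	const uint8_t opt[] = { 131, 7, 4, 10, 0, 0, 1 };
	uint32_t nh;

	if (srr_next_hop(opt, &nh) == 0) {
		uint8_t *b = (uint8_t *)&nh;
		printf("next hop: %u.%u.%u.%u\n", b[0], b[1], b[2], b[3]);
	}
	return 0;
}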
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 3451799e3db..04b69896df5 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -51,6 +51,7 @@
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
+#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/sockios.h>
@@ -88,6 +89,7 @@ __inline__ void ip_send_check(struct iphdr *iph)
iph->check = 0;
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
+EXPORT_SYMBOL(ip_send_check);
int __ip_local_out(struct sk_buff *skb)
{
@@ -95,8 +97,8 @@ int __ip_local_out(struct sk_buff *skb)
iph->tot_len = htons(skb->len);
ip_send_check(iph);
- return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
- dst_output);
+ return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
+ skb_dst(skb)->dev, dst_output);
}
int ip_local_out(struct sk_buff *skb)
@@ -119,7 +121,7 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb)
newskb->pkt_type = PACKET_LOOPBACK;
newskb->ip_summed = CHECKSUM_UNNECESSARY;
WARN_ON(!skb_dst(newskb));
- netif_rx(newskb);
+ netif_rx_ni(newskb);
return 0;
}
@@ -150,15 +152,15 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
iph->version = 4;
iph->ihl = 5;
iph->tos = inet->tos;
- if (ip_dont_fragment(sk, &rt->u.dst))
+ if (ip_dont_fragment(sk, &rt->dst))
iph->frag_off = htons(IP_DF);
else
iph->frag_off = 0;
- iph->ttl = ip_select_ttl(inet, &rt->u.dst);
+ iph->ttl = ip_select_ttl(inet, &rt->dst);
iph->daddr = rt->rt_dst;
iph->saddr = rt->rt_src;
iph->protocol = sk->sk_protocol;
- ip_select_ident(iph, &rt->u.dst, sk);
+ ip_select_ident(iph, &rt->dst, sk);
if (opt && opt->optlen) {
iph->ihl += opt->optlen>>2;
@@ -171,7 +173,6 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
/* Send it out. */
return ip_local_out(skb);
}
-
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
static inline int ip_finish_output2(struct sk_buff *skb)
@@ -239,7 +240,7 @@ int ip_mc_output(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
struct rtable *rt = skb_rtable(skb);
- struct net_device *dev = rt->u.dst.dev;
+ struct net_device *dev = rt->dst.dev;
/*
* If the indicated interface is up and running, send the packet.
@@ -271,8 +272,8 @@ int ip_mc_output(struct sk_buff *skb)
) {
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
- NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb,
- NULL, newskb->dev,
+ NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ newskb, NULL, newskb->dev,
ip_dev_loopback_xmit);
}
@@ -287,12 +288,12 @@ int ip_mc_output(struct sk_buff *skb)
if (rt->rt_flags&RTCF_BROADCAST) {
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
- NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, NULL,
- newskb->dev, ip_dev_loopback_xmit);
+ NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
+ NULL, newskb->dev, ip_dev_loopback_xmit);
}
- return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
- ip_finish_output,
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
+ skb->dev, ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
@@ -305,22 +306,24 @@ int ip_output(struct sk_buff *skb)
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
- return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev,
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
-int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
+int ip_queue_xmit(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(sk);
struct ip_options *opt = inet->opt;
struct rtable *rt;
struct iphdr *iph;
+ int res;
/* Skip all of this if the packet is already routed,
* f.e. by something like SCTP.
*/
+ rcu_read_lock();
rt = skb_rtable(skb);
if (rt != NULL)
goto packet_routed;
@@ -356,9 +359,9 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
goto no_route;
}
- sk_setup_caps(sk, &rt->u.dst);
+ sk_setup_caps(sk, &rt->dst);
}
- skb_dst_set(skb, dst_clone(&rt->u.dst));
+ skb_dst_set_noref(skb, &rt->dst);
packet_routed:
if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
@@ -369,11 +372,11 @@ packet_routed:
skb_reset_network_header(skb);
iph = ip_hdr(skb);
*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
- if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
+ if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
iph->frag_off = htons(IP_DF);
else
iph->frag_off = 0;
- iph->ttl = ip_select_ttl(inet, &rt->u.dst);
+ iph->ttl = ip_select_ttl(inet, &rt->dst);
iph->protocol = sk->sk_protocol;
iph->saddr = rt->rt_src;
iph->daddr = rt->rt_dst;
@@ -384,19 +387,23 @@ packet_routed:
ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
}
- ip_select_ident_more(iph, &rt->u.dst, sk,
+ ip_select_ident_more(iph, &rt->dst, sk,
(skb_shinfo(skb)->gso_segs ?: 1) - 1);
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
- return ip_local_out(skb);
+ res = ip_local_out(skb);
+ rcu_read_unlock();
+ return res;
no_route:
+ rcu_read_unlock();
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
kfree_skb(skb);
return -EHOSTUNREACH;
}
+EXPORT_SYMBOL(ip_queue_xmit);
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
@@ -405,7 +412,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->priority = from->priority;
to->protocol = from->protocol;
skb_dst_drop(to);
- skb_dst_set(to, dst_clone(skb_dst(from)));
+ skb_dst_copy(to, from);
to->dev = from->dev;
to->mark = from->mark;
@@ -436,17 +443,16 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
struct iphdr *iph;
- int raw = 0;
int ptr;
struct net_device *dev;
struct sk_buff *skb2;
- unsigned int mtu, hlen, left, len, ll_rs, pad;
+ unsigned int mtu, hlen, left, len, ll_rs;
int offset;
__be16 not_last_frag;
struct rtable *rt = skb_rtable(skb);
int err = 0;
- dev = rt->u.dst.dev;
+ dev = rt->dst.dev;
/*
* Point into the IP datagram header.
@@ -467,7 +473,11 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
*/
hlen = iph->ihl * 4;
- mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */
+ mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
+#ifdef CONFIG_BRIDGE_NETFILTER
+ if (skb->nf_bridge)
+ mtu -= nf_bridge_mtu_reduction(skb);
+#endif
IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
/* When frag_list is given, use it. First, check its validity:
@@ -570,14 +580,12 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
slow_path:
left = skb->len - hlen; /* Space per frame */
- ptr = raw + hlen; /* Where to start from */
+ ptr = hlen; /* Where to start from */
/* for bridged IP traffic encapsulated inside f.e. a vlan header,
* we need to make room for the encapsulating header
*/
- pad = nf_bridge_pad(skb);
- ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
- mtu -= pad;
+ ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
/*
* Fragment the datagram.
@@ -687,7 +695,6 @@ fail:
IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
return err;
}
-
EXPORT_SYMBOL(ip_fragment);
int
@@ -706,6 +713,7 @@ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk
}
return 0;
}
+EXPORT_SYMBOL(ip_generic_getfrag);
static inline __wsum
csum_page(struct page *page, int offset, int copy)
@@ -823,13 +831,13 @@ int ip_append_data(struct sock *sk,
*/
*rtp = NULL;
inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
- rt->u.dst.dev->mtu :
- dst_mtu(rt->u.dst.path);
- inet->cork.dst = &rt->u.dst;
+ rt->dst.dev->mtu :
+ dst_mtu(rt->dst.path);
+ inet->cork.dst = &rt->dst;
inet->cork.length = 0;
sk->sk_sndmsg_page = NULL;
sk->sk_sndmsg_off = 0;
- if ((exthdrlen = rt->u.dst.header_len) != 0) {
+ if ((exthdrlen = rt->dst.header_len) != 0) {
length += exthdrlen;
transhdrlen += exthdrlen;
}
@@ -842,7 +850,7 @@ int ip_append_data(struct sock *sk,
exthdrlen = 0;
mtu = inet->cork.fragsize;
}
- hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
+ hh_len = LL_RESERVED_SPACE(rt->dst.dev);
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
@@ -859,14 +867,16 @@ int ip_append_data(struct sock *sk,
*/
if (transhdrlen &&
length + fragheaderlen <= mtu &&
- rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
+ rt->dst.dev->features & NETIF_F_V4_CSUM &&
!exthdrlen)
csummode = CHECKSUM_PARTIAL;
+ skb = skb_peek_tail(&sk->sk_write_queue);
+
inet->cork.length += length;
- if (((length> mtu) || !skb_queue_empty(&sk->sk_write_queue)) &&
+ if (((length > mtu) || (skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
- (rt->u.dst.dev->features & NETIF_F_UFO)) {
+ (rt->dst.dev->features & NETIF_F_UFO)) {
err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
fragheaderlen, transhdrlen, mtu,
flags);
@@ -882,7 +892,7 @@ int ip_append_data(struct sock *sk,
* adding appropriate IP header.
*/
- if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
+ if (!skb)
goto alloc_new_skb;
while (length > 0) {
@@ -914,7 +924,7 @@ alloc_new_skb:
fraglen = datalen + fragheaderlen;
if ((flags & MSG_MORE) &&
- !(rt->u.dst.dev->features&NETIF_F_SG))
+ !(rt->dst.dev->features&NETIF_F_SG))
alloclen = mtu;
else
alloclen = datalen + fragheaderlen;
@@ -925,7 +935,7 @@ alloc_new_skb:
* the last.
*/
if (datalen == length + fraggap)
- alloclen += rt->u.dst.trailer_len;
+ alloclen += rt->dst.trailer_len;
if (transhdrlen) {
skb = sock_alloc_send_skb(sk,
@@ -998,7 +1008,7 @@ alloc_new_skb:
if (copy > length)
copy = length;
- if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
+ if (!(rt->dst.dev->features&NETIF_F_SG)) {
unsigned int off;
off = skb->len;
@@ -1093,10 +1103,10 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
if (inet->cork.flags & IPCORK_OPT)
opt = inet->cork.opt;
- if (!(rt->u.dst.dev->features&NETIF_F_SG))
+ if (!(rt->dst.dev->features&NETIF_F_SG))
return -EOPNOTSUPP;
- hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
+ hh_len = LL_RESERVED_SPACE(rt->dst.dev);
mtu = inet->cork.fragsize;
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
@@ -1111,8 +1121,9 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
return -EINVAL;
inet->cork.length += size;
- if ((sk->sk_protocol == IPPROTO_UDP) &&
- (rt->u.dst.dev->features & NETIF_F_UFO)) {
+ if ((size + skb->len > mtu) &&
+ (sk->sk_protocol == IPPROTO_UDP) &&
+ (rt->dst.dev->features & NETIF_F_UFO)) {
skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
}
@@ -1264,8 +1275,8 @@ int ip_push_pending_frames(struct sock *sk)
* If local_df is set too, we still allow to fragment this frame
* locally. */
if (inet->pmtudisc >= IP_PMTUDISC_DO ||
- (skb->len <= dst_mtu(&rt->u.dst) &&
- ip_dont_fragment(sk, &rt->u.dst)))
+ (skb->len <= dst_mtu(&rt->dst) &&
+ ip_dont_fragment(sk, &rt->dst)))
df = htons(IP_DF);
if (inet->cork.flags & IPCORK_OPT)
@@ -1274,7 +1285,7 @@ int ip_push_pending_frames(struct sock *sk)
if (rt->rt_type == RTN_MULTICAST)
ttl = inet->mc_ttl;
else
- ttl = ip_select_ttl(inet, &rt->u.dst);
+ ttl = ip_select_ttl(inet, &rt->dst);
iph = (struct iphdr *)skb->data;
iph->version = 4;
@@ -1285,7 +1296,7 @@ int ip_push_pending_frames(struct sock *sk)
}
iph->tos = inet->tos;
iph->frag_off = df;
- ip_select_ident(iph, &rt->u.dst, sk);
+ ip_select_ident(iph, &rt->dst, sk);
iph->ttl = ttl;
iph->protocol = sk->sk_protocol;
iph->saddr = rt->rt_src;
@@ -1298,7 +1309,7 @@ int ip_push_pending_frames(struct sock *sk)
* on dst refcount
*/
inet->cork.dst = NULL;
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
if (iph->protocol == IPPROTO_ICMP)
icmp_out_count(net, ((struct icmphdr *)
@@ -1435,7 +1446,3 @@ void __init ip_init(void)
igmp_mc_proc_init();
#endif
}
-
-EXPORT_SYMBOL(ip_generic_getfrag);
-EXPORT_SYMBOL(ip_queue_xmit);
-EXPORT_SYMBOL(ip_send_check);
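
ip_queue_xmit() above drops its ipfragok parameter in favor of skb->local_df, so DF is set only when ip_dont_fragment() agrees and local_df is clear. A sketch of the combined decision; dont_fragment() here is only an approximation of the kernel helper (PMTU discovery forced on, or wanted with an unlocked route MTU), and 0x4000 is IP_DF in host order:

#include <stdbool.h>
#include <stdio.h>

enum { IP_PMTUDISC_DONT, IP_PMTUDISC_WANT, IP_PMTUDISC_DO, IP_PMTUDISC_PROBE };

/* Roughly what ip_dont_fragment() tests: PMTU discovery is forced on,
 * or it is wanted and the route does not lock its MTU metric. */
static bool dont_fragment(int pmtudisc, bool mtu_locked)
{
	return pmtudisc == IP_PMTUDISC_DO ||
	       (pmtudisc == IP_PMTUDISC_WANT && !mtu_locked);
}

/* The post-patch rule in ip_queue_xmit(): DF is set unless the caller
 * marked the skb as locally fragmentable (skb->local_df). */
static unsigned short frag_off(int pmtudisc, bool mtu_locked, bool local_df)
{
	return (dont_fragment(pmtudisc, mtu_locked) && !local_df) ? 0x4000 : 0;
}

int main(void)
{
	printf("DF: 0x%04x\n", frag_off(IP_PMTUDISC_WANT, false, false)); /* 0x4000 */
	printf("DF: 0x%04x\n", frag_off(IP_PMTUDISC_WANT, false, true));  /* 0x0000 */
	return 0;
}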
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 644dc43a55d..6c40a8c46e7 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -23,6 +23,7 @@
#include <linux/icmp.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
+#include <linux/slab.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
@@ -238,7 +239,16 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc)
sent to multicast group to reach destination designated router.
*/
struct ip_ra_chain *ip_ra_chain;
-DEFINE_RWLOCK(ip_ra_lock);
+static DEFINE_SPINLOCK(ip_ra_lock);
+
+static void ip_ra_destroy_rcu(struct rcu_head *head)
+{
+ struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu);
+
+ sock_put(ra->saved_sk);
+ kfree(ra);
+}
int ip_ra_control(struct sock *sk, unsigned char on,
void (*destructor)(struct sock *))
@@ -250,35 +260,42 @@ int ip_ra_control(struct sock *sk, unsigned char on,
new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
- write_lock_bh(&ip_ra_lock);
+ spin_lock_bh(&ip_ra_lock);
for (rap = &ip_ra_chain; (ra = *rap) != NULL; rap = &ra->next) {
if (ra->sk == sk) {
if (on) {
- write_unlock_bh(&ip_ra_lock);
+ spin_unlock_bh(&ip_ra_lock);
kfree(new_ra);
return -EADDRINUSE;
}
- *rap = ra->next;
- write_unlock_bh(&ip_ra_lock);
+ /* don't let ip_call_ra_chain() use sk again */
+ ra->sk = NULL;
+ rcu_assign_pointer(*rap, ra->next);
+ spin_unlock_bh(&ip_ra_lock);
if (ra->destructor)
ra->destructor(sk);
- sock_put(sk);
- kfree(ra);
+ /*
+ * Delay sock_put(sk) and kfree(ra) until after one RCU
+ * grace period. This guarantees ip_call_ra_chain() doesn't
+ * need to mess with socket refcounts.
+ */
+ ra->saved_sk = sk;
+ call_rcu(&ra->rcu, ip_ra_destroy_rcu);
return 0;
}
}
if (new_ra == NULL) {
- write_unlock_bh(&ip_ra_lock);
+ spin_unlock_bh(&ip_ra_lock);
return -ENOBUFS;
}
new_ra->sk = sk;
new_ra->destructor = destructor;
new_ra->next = ra;
- *rap = new_ra;
+ rcu_assign_pointer(*rap, new_ra);
sock_hold(sk);
- write_unlock_bh(&ip_ra_lock);
+ spin_unlock_bh(&ip_ra_lock);
return 0;
}
@@ -286,12 +303,8 @@ int ip_ra_control(struct sock *sk, unsigned char on,
void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
__be16 port, u32 info, u8 *payload)
{
- struct inet_sock *inet = inet_sk(sk);
struct sock_exterr_skb *serr;
- if (!inet->recverr)
- return;
-
skb = skb_clone(skb, GFP_ATOMIC);
if (!skb)
return;
@@ -452,7 +465,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
(1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
(1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
(1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) |
- (1<<IP_MINTTL))) ||
+ (1<<IP_MINTTL) | (1<<IP_NODEFRAG))) ||
optname == IP_MULTICAST_TTL ||
optname == IP_MULTICAST_ALL ||
optname == IP_MULTICAST_LOOP ||
@@ -575,6 +588,13 @@ static int do_ip_setsockopt(struct sock *sk, int level,
}
inet->hdrincl = val ? 1 : 0;
break;
+ case IP_NODEFRAG:
+ if (sk->sk_type != SOCK_RAW) {
+ err = -ENOPROTOOPT;
+ break;
+ }
+ inet->nodefrag = val ? 1 : 0;
+ break;
case IP_MTU_DISCOVER:
if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE)
goto e_inval;
@@ -957,6 +977,22 @@ e_inval:
return -EINVAL;
}
+/**
+ * ip_queue_rcv_skb - Queue an skb into sock receive queue
+ * @sk: socket
+ * @skb: buffer
+ *
+ * Queues an skb into the socket receive queue. If the IP_CMSG_PKTINFO
+ * option is not set, we drop the skb dst entry now, while its cache
+ * line is hot.
+ */
+int ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ if (!(inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO))
+ skb_dst_drop(skb);
+ return sock_queue_rcv_skb(sk, skb);
+}
+EXPORT_SYMBOL(ip_queue_rcv_skb);
+
int ip_setsockopt(struct sock *sk, int level,
int optname, char __user *optval, unsigned int optlen)
{
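
The new IP_NODEFRAG option above is accepted only on SOCK_RAW and sets inet->nodefrag, letting a raw socket see fragments before reassembly. A usage sketch; it needs CAP_NET_RAW, and the IP_NODEFRAG define (22 in linux/in.h) only exists on kernels that carry this patch:

#include <stdio.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef IP_NODEFRAG
#define IP_NODEFRAG 22	/* from linux/in.h on patched kernels */
#endif

int main(void)
{
	int one = 1;
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_UDP);	/* needs root */

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (setsockopt(fd, IPPROTO_IP, IP_NODEFRAG, &one, sizeof(one)) < 0)
		perror("setsockopt(IP_NODEFRAG)");	/* ENOPROTOOPT if not raw */
	close(fd);
	return 0;
}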
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 67890928164..3a6e1ec5e9a 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -53,6 +53,7 @@
#include <linux/root_dev.h>
#include <linux/delay.h>
#include <linux/nfs_fs.h>
+#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/arp.h>
#include <net/ip.h>
@@ -664,6 +665,13 @@ ic_dhcp_init_options(u8 *options)
memcpy(e, ic_req_params, sizeof(ic_req_params));
e += sizeof(ic_req_params);
+ if (ic_host_name_set) {
+ *e++ = 12; /* host-name */
+ len = strlen(utsname()->nodename);
+ *e++ = len;
+ memcpy(e, utsname()->nodename, len);
+ e += len;
+ }
if (*vendor_class_identifier) {
printk(KERN_INFO "DHCP: sending class identifier \"%s\"\n",
vendor_class_identifier);
@@ -975,7 +983,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
/* Is it a reply for the device we are configuring? */
if (b->xid != ic_dev_xid) {
if (net_ratelimit())
- printk(KERN_ERR "DHCP/BOOTP: Ignoring delayed packet \n");
+ printk(KERN_ERR "DHCP/BOOTP: Ignoring delayed packet\n");
goto drop_unlock;
}
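
The ipconfig hunk appends DHCP option 12 (host-name) in the same cursor-bumping style as the surrounding options: one tag byte, one length byte, then the nodename. A standalone sketch of the same TLV append (the kernel relies on nodename being short; the length cap here is an added safety):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Append DHCP option 12 (host-name) at *e, returning the new cursor.
 * Mirrors the pointer-bumping style of ic_dhcp_init_options(). */
static uint8_t *put_host_name(uint8_t *e, const char *name)
{
	size_t len = strlen(name);

	if (len > 255)		/* option length is a single byte */
		len = 255;
	*e++ = 12;		/* host-name option tag */
	*e++ = (uint8_t)len;
	memcpy(e, name, len);
	return e + len;
}

int main(void)
{
	uint8_t options[64];
	uint8_t *end = put_host_name(options, "testbox");

	printf("wrote %zu bytes: tag=%u len=%u\n",
	       (size_t)(end - options), options[0], options[1]);
	return 0;
}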
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 2f302d3ac9a..ec036731a70 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -95,6 +95,7 @@
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
+#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
@@ -373,11 +374,8 @@ static int ipip_rcv(struct sk_buff *skb)
skb->protocol = htons(ETH_P_IP);
skb->pkt_type = PACKET_HOST;
- tunnel->dev->stats.rx_packets++;
- tunnel->dev->stats.rx_bytes += skb->len;
- skb->dev = tunnel->dev;
- skb_dst_drop(skb);
- nf_reset(skb);
+ skb_tunnel_rx(skb, tunnel->dev);
+
ipip_ecn_decapsulate(iph, skb);
netif_rx(skb);
rcu_read_unlock();
@@ -437,7 +435,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
goto tx_error_icmp;
}
}
- tdev = rt->u.dst.dev;
+ tdev = rt->dst.dev;
if (tdev == dev) {
ip_rt_put(rt);
@@ -448,7 +446,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
df |= old_iph->frag_off & htons(IP_DF);
if (df) {
- mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
+ mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
if (mtu < 68) {
stats->collisions++;
@@ -505,7 +503,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
IPSKB_REROUTED);
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
/*
* Push down and install the IPIP header.
@@ -554,7 +552,7 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
.proto = IPPROTO_IPIP };
struct rtable *rt;
if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
- tdev = rt->u.dst.dev;
+ tdev = rt->dst.dev;
ip_rt_put(rt);
}
dev->flags |= IFF_POINTOPOINT;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 8582e12e4a6..179fcab866f 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -22,7 +22,7 @@
* overflow.
* Carlos Picoto : PIMv1 Support
* Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header
- * Relax this requrement to work with older peers.
+ * Relax this requirement to work with older peers.
*
*/
@@ -47,6 +47,7 @@
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
+#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -62,11 +63,40 @@
#include <net/ipip.h>
#include <net/checksum.h>
#include <net/netlink.h>
+#include <net/fib_rules.h>
#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
#define CONFIG_IP_PIMSM 1
#endif
+struct mr_table {
+ struct list_head list;
+#ifdef CONFIG_NET_NS
+ struct net *net;
+#endif
+ u32 id;
+ struct sock *mroute_sk;
+ struct timer_list ipmr_expire_timer;
+ struct list_head mfc_unres_queue;
+ struct list_head mfc_cache_array[MFC_LINES];
+ struct vif_device vif_table[MAXVIFS];
+ int maxvif;
+ atomic_t cache_resolve_queue_len;
+ int mroute_do_assert;
+ int mroute_do_pim;
+#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
+ int mroute_reg_vif_num;
+#endif
+};
+
+struct ipmr_rule {
+ struct fib_rule common;
+};
+
+struct ipmr_result {
+ struct mr_table *mrt;
+};
+
/* Big lock, protecting vif table, mrt cache and mroute socket state.
Note that the changes are semaphored via rtnl_lock.
*/
@@ -77,9 +107,7 @@ static DEFINE_RWLOCK(mrt_lock);
* Multicast router control variables
*/
-#define VIF_EXISTS(_net, _idx) ((_net)->ipv4.vif_table[_idx].dev != NULL)
-
-static struct mfc_cache *mfc_unres_queue; /* Queue of unresolved entries */
+#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)
/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);
@@ -94,12 +122,217 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
static struct kmem_cache *mrt_cachep __read_mostly;
-static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
-static int ipmr_cache_report(struct net *net,
+static struct mr_table *ipmr_new_table(struct net *net, u32 id);
+static int ip_mr_forward(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, struct mfc_cache *cache,
+ int local);
+static int ipmr_cache_report(struct mr_table *mrt,
struct sk_buff *pkt, vifi_t vifi, int assert);
-static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
+static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+ struct mfc_cache *c, struct rtmsg *rtm);
+static void ipmr_expire_process(unsigned long arg);
+
+#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+#define ipmr_for_each_table(mrt, net) \
+ list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)
+
+static struct mr_table *ipmr_get_table(struct net *net, u32 id)
+{
+ struct mr_table *mrt;
+
+ ipmr_for_each_table(mrt, net) {
+ if (mrt->id == id)
+ return mrt;
+ }
+ return NULL;
+}
+
+static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
+ struct mr_table **mrt)
+{
+ struct ipmr_result res;
+ struct fib_lookup_arg arg = { .result = &res, };
+ int err;
+
+ err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg);
+ if (err < 0)
+ return err;
+ *mrt = res.mrt;
+ return 0;
+}
+
+static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
+{
+ struct ipmr_result *res = arg->result;
+ struct mr_table *mrt;
-static struct timer_list ipmr_expire_timer;
+ switch (rule->action) {
+ case FR_ACT_TO_TBL:
+ break;
+ case FR_ACT_UNREACHABLE:
+ return -ENETUNREACH;
+ case FR_ACT_PROHIBIT:
+ return -EACCES;
+ case FR_ACT_BLACKHOLE:
+ default:
+ return -EINVAL;
+ }
+
+ mrt = ipmr_get_table(rule->fr_net, rule->table);
+ if (mrt == NULL)
+ return -EAGAIN;
+ res->mrt = mrt;
+ return 0;
+}
+
+static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
+{
+ return 1;
+}
+
+static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
+ FRA_GENERIC_POLICY,
+};
+
+static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
+ struct fib_rule_hdr *frh, struct nlattr **tb)
+{
+ return 0;
+}
+
+static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
+ struct nlattr **tb)
+{
+ return 1;
+}
+
+static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
+ struct fib_rule_hdr *frh)
+{
+ frh->dst_len = 0;
+ frh->src_len = 0;
+ frh->tos = 0;
+ return 0;
+}
+
+static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = {
+ .family = RTNL_FAMILY_IPMR,
+ .rule_size = sizeof(struct ipmr_rule),
+ .addr_size = sizeof(u32),
+ .action = ipmr_rule_action,
+ .match = ipmr_rule_match,
+ .configure = ipmr_rule_configure,
+ .compare = ipmr_rule_compare,
+ .default_pref = fib_default_rule_pref,
+ .fill = ipmr_rule_fill,
+ .nlgroup = RTNLGRP_IPV4_RULE,
+ .policy = ipmr_rule_policy,
+ .owner = THIS_MODULE,
+};
+
+static int __net_init ipmr_rules_init(struct net *net)
+{
+ struct fib_rules_ops *ops;
+ struct mr_table *mrt;
+ int err;
+
+ ops = fib_rules_register(&ipmr_rules_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+
+ INIT_LIST_HEAD(&net->ipv4.mr_tables);
+
+ mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
+ if (mrt == NULL) {
+ err = -ENOMEM;
+ goto err1;
+ }
+
+ err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0);
+ if (err < 0)
+ goto err2;
+
+ net->ipv4.mr_rules_ops = ops;
+ return 0;
+
+err2:
+ kfree(mrt);
+err1:
+ fib_rules_unregister(ops);
+ return err;
+}
+
+static void __net_exit ipmr_rules_exit(struct net *net)
+{
+ struct mr_table *mrt, *next;
+
+ list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
+ list_del(&mrt->list);
+ kfree(mrt);
+ }
+ fib_rules_unregister(net->ipv4.mr_rules_ops);
+}
+#else
+#define ipmr_for_each_table(mrt, net) \
+ for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
+
+static struct mr_table *ipmr_get_table(struct net *net, u32 id)
+{
+ return net->ipv4.mrt;
+}
+
+static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
+ struct mr_table **mrt)
+{
+ *mrt = net->ipv4.mrt;
+ return 0;
+}
+
+static int __net_init ipmr_rules_init(struct net *net)
+{
+ net->ipv4.mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
+ return net->ipv4.mrt ? 0 : -ENOMEM;
+}
+
+static void __net_exit ipmr_rules_exit(struct net *net)
+{
+ kfree(net->ipv4.mrt);
+}
+#endif
+
+static struct mr_table *ipmr_new_table(struct net *net, u32 id)
+{
+ struct mr_table *mrt;
+ unsigned int i;
+
+ mrt = ipmr_get_table(net, id);
+ if (mrt != NULL)
+ return mrt;
+
+ mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
+ if (mrt == NULL)
+ return NULL;
+ write_pnet(&mrt->net, net);
+ mrt->id = id;
+
+ /* Forwarding cache */
+ for (i = 0; i < MFC_LINES; i++)
+ INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);
+
+ INIT_LIST_HEAD(&mrt->mfc_unres_queue);
+
+ setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
+ (unsigned long)mrt);
+
+#ifdef CONFIG_IP_PIMSM
+ mrt->mroute_reg_vif_num = -1;
+#endif
+#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+ list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
+#endif
+ return mrt;
+}
/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
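
ipmr_new_table() above gives each netns a list of mr_table instances with find-or-create semantics keyed by table id; with CONFIG_IP_MROUTE_MULTIPLE_TABLES off, the list degenerates to the single net->ipv4.mrt. A userspace analogue of the find-or-create lookup (RT_TABLE_DEFAULT is 253):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Userspace analogue of the per-netns mr_table list. */
struct mr_tbl {
	struct mr_tbl *next;
	uint32_t id;
};

static struct mr_tbl *tables;

static struct mr_tbl *get_table(uint32_t id)
{
	for (struct mr_tbl *t = tables; t; t = t->next)
		if (t->id == id)
			return t;
	return NULL;
}

static struct mr_tbl *new_table(uint32_t id)
{
	struct mr_tbl *t = get_table(id);

	if (t)			/* find-or-create, like ipmr_new_table() */
		return t;
	t = calloc(1, sizeof(*t));
	if (!t)
		return NULL;
	t->id = id;
	t->next = tables;
	tables = t;
	return t;
}

int main(void)
{
	struct mr_tbl *def = new_table(253);	/* RT_TABLE_DEFAULT */

	printf("same table: %d\n", new_table(253) == def);	/* 1 */
	return 0;
}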
@@ -200,12 +433,24 @@ failure:
static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct net *net = dev_net(dev);
+ struct mr_table *mrt;
+ struct flowi fl = {
+ .oif = dev->ifindex,
+ .iif = skb->skb_iif,
+ .mark = skb->mark,
+ };
+ int err;
+
+ err = ipmr_fib_lookup(net, &fl, &mrt);
+ if (err < 0) {
+ kfree_skb(skb);
+ return err;
+ }
read_lock(&mrt_lock);
dev->stats.tx_bytes += skb->len;
dev->stats.tx_packets++;
- ipmr_cache_report(net, skb, net->ipv4.mroute_reg_vif_num,
- IGMPMSG_WHOLEPKT);
+ ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
read_unlock(&mrt_lock);
kfree_skb(skb);
return NETDEV_TX_OK;
@@ -225,12 +470,18 @@ static void reg_vif_setup(struct net_device *dev)
dev->features |= NETIF_F_NETNS_LOCAL;
}
-static struct net_device *ipmr_reg_vif(struct net *net)
+static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
{
struct net_device *dev;
struct in_device *in_dev;
+ char name[IFNAMSIZ];
+
+ if (mrt->id == RT_TABLE_DEFAULT)
+ sprintf(name, "pimreg");
+ else
+ sprintf(name, "pimreg%u", mrt->id);
- dev = alloc_netdev(0, "pimreg", reg_vif_setup);
+ dev = alloc_netdev(0, name, reg_vif_setup);
if (dev == NULL)
return NULL;
@@ -275,17 +526,17 @@ failure:
* @notify: Set to 1, if the caller is a notifier_call
*/
-static int vif_delete(struct net *net, int vifi, int notify,
+static int vif_delete(struct mr_table *mrt, int vifi, int notify,
struct list_head *head)
{
struct vif_device *v;
struct net_device *dev;
struct in_device *in_dev;
- if (vifi < 0 || vifi >= net->ipv4.maxvif)
+ if (vifi < 0 || vifi >= mrt->maxvif)
return -EADDRNOTAVAIL;
- v = &net->ipv4.vif_table[vifi];
+ v = &mrt->vif_table[vifi];
write_lock_bh(&mrt_lock);
dev = v->dev;
@@ -297,17 +548,17 @@ static int vif_delete(struct net *net, int vifi, int notify,
}
#ifdef CONFIG_IP_PIMSM
- if (vifi == net->ipv4.mroute_reg_vif_num)
- net->ipv4.mroute_reg_vif_num = -1;
+ if (vifi == mrt->mroute_reg_vif_num)
+ mrt->mroute_reg_vif_num = -1;
#endif
- if (vifi+1 == net->ipv4.maxvif) {
+ if (vifi+1 == mrt->maxvif) {
int tmp;
for (tmp=vifi-1; tmp>=0; tmp--) {
- if (VIF_EXISTS(net, tmp))
+ if (VIF_EXISTS(mrt, tmp))
break;
}
- net->ipv4.maxvif = tmp+1;
+ mrt->maxvif = tmp+1;
}
write_unlock_bh(&mrt_lock);
@@ -328,7 +579,6 @@ static int vif_delete(struct net *net, int vifi, int notify,
static inline void ipmr_cache_free(struct mfc_cache *c)
{
- release_net(mfc_net(c));
kmem_cache_free(mrt_cachep, c);
}
@@ -336,13 +586,13 @@ static inline void ipmr_cache_free(struct mfc_cache *c)
and reporting error to netlink readers.
*/
-static void ipmr_destroy_unres(struct mfc_cache *c)
+static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
{
+ struct net *net = read_pnet(&mrt->net);
struct sk_buff *skb;
struct nlmsgerr *e;
- struct net *net = mfc_net(c);
- atomic_dec(&net->ipv4.cache_resolve_queue_len);
+ atomic_dec(&mrt->cache_resolve_queue_len);
while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
if (ip_hdr(skb)->version == 0) {
@@ -363,42 +613,40 @@ static void ipmr_destroy_unres(struct mfc_cache *c)
}
-/* Single timer process for all the unresolved queue. */
+/* Timer process for the unresolved queue. */
-static void ipmr_expire_process(unsigned long dummy)
+static void ipmr_expire_process(unsigned long arg)
{
+ struct mr_table *mrt = (struct mr_table *)arg;
unsigned long now;
unsigned long expires;
- struct mfc_cache *c, **cp;
+ struct mfc_cache *c, *next;
if (!spin_trylock(&mfc_unres_lock)) {
- mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
+ mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10);
return;
}
- if (mfc_unres_queue == NULL)
+ if (list_empty(&mrt->mfc_unres_queue))
goto out;
now = jiffies;
expires = 10*HZ;
- cp = &mfc_unres_queue;
- while ((c=*cp) != NULL) {
+ list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
if (time_after(c->mfc_un.unres.expires, now)) {
unsigned long interval = c->mfc_un.unres.expires - now;
if (interval < expires)
expires = interval;
- cp = &c->next;
continue;
}
- *cp = c->next;
-
- ipmr_destroy_unres(c);
+ list_del(&c->list);
+ ipmr_destroy_unres(mrt, c);
}
- if (mfc_unres_queue != NULL)
- mod_timer(&ipmr_expire_timer, jiffies + expires);
+ if (!list_empty(&mrt->mfc_unres_queue))
+ mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
out:
spin_unlock(&mfc_unres_lock);
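
The conversions above replace the hand-rolled singly linked chains and their pointer-to-pointer (**cp) deletion idiom with list_head plus list_for_each_entry_safe(), whose defining trick is caching the next pointer before the current node can be freed. A self-contained sketch of that safe-deletion pattern on a plain list:

#include <stdio.h>
#include <stdlib.h>

struct entry {
	struct entry *next;
	unsigned long expires;
};

/* Cache the next pointer before the current node may be freed, so
 * deletion during iteration is safe; the link pointer stands in for
 * the old **cp bookkeeping. */
static struct entry *expire(struct entry *head, unsigned long now)
{
	struct entry *c = head, *next, **link = &head;

	while (c) {
		next = c->next;		/* saved before any free() */
		if (c->expires <= now) {
			*link = next;	/* unlink ... */
			free(c);	/* ... then destroy */
		} else {
			link = &c->next;
		}
		c = next;
	}
	return head;
}

int main(void)
{
	struct entry *head = NULL;
	unsigned long exp[] = { 5, 15, 10 };

	for (int i = 2; i >= 0; i--) {
		struct entry *e = malloc(sizeof(*e));
		e->expires = exp[i];
		e->next = head;
		head = e;
	}
	head = expire(head, 10);	/* drops the 5 and 10 entries */
	for (struct entry *e = head; e; e = e->next)
		printf("left: %lu\n", e->expires);	/* left: 15 */
	return 0;
}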
@@ -406,17 +654,17 @@ out:
/* Fill oifs list. It is called under write locked mrt_lock. */
-static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
+static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache,
+ unsigned char *ttls)
{
int vifi;
- struct net *net = mfc_net(cache);
cache->mfc_un.res.minvif = MAXVIFS;
cache->mfc_un.res.maxvif = 0;
memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
- for (vifi = 0; vifi < net->ipv4.maxvif; vifi++) {
- if (VIF_EXISTS(net, vifi) &&
+ for (vifi = 0; vifi < mrt->maxvif; vifi++) {
+ if (VIF_EXISTS(mrt, vifi) &&
ttls[vifi] && ttls[vifi] < 255) {
cache->mfc_un.res.ttls[vifi] = ttls[vifi];
if (cache->mfc_un.res.minvif > vifi)
@@ -427,16 +675,17 @@ static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
}
}
-static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock)
+static int vif_add(struct net *net, struct mr_table *mrt,
+ struct vifctl *vifc, int mrtsock)
{
int vifi = vifc->vifc_vifi;
- struct vif_device *v = &net->ipv4.vif_table[vifi];
+ struct vif_device *v = &mrt->vif_table[vifi];
struct net_device *dev;
struct in_device *in_dev;
int err;
/* Is vif busy ? */
- if (VIF_EXISTS(net, vifi))
+ if (VIF_EXISTS(mrt, vifi))
return -EADDRINUSE;
switch (vifc->vifc_flags) {
@@ -446,9 +695,9 @@ static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock)
* Special Purpose VIF in PIM
* All the packets will be sent to the daemon
*/
- if (net->ipv4.mroute_reg_vif_num >= 0)
+ if (mrt->mroute_reg_vif_num >= 0)
return -EADDRINUSE;
- dev = ipmr_reg_vif(net);
+ dev = ipmr_reg_vif(net, mrt);
if (!dev)
return -ENOBUFS;
err = dev_set_allmulti(dev, 1);
@@ -524,49 +773,47 @@ static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock)
v->dev = dev;
#ifdef CONFIG_IP_PIMSM
if (v->flags&VIFF_REGISTER)
- net->ipv4.mroute_reg_vif_num = vifi;
+ mrt->mroute_reg_vif_num = vifi;
#endif
- if (vifi+1 > net->ipv4.maxvif)
- net->ipv4.maxvif = vifi+1;
+ if (vifi+1 > mrt->maxvif)
+ mrt->maxvif = vifi+1;
write_unlock_bh(&mrt_lock);
return 0;
}
-static struct mfc_cache *ipmr_cache_find(struct net *net,
+static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
__be32 origin,
__be32 mcastgrp)
{
int line = MFC_HASH(mcastgrp, origin);
struct mfc_cache *c;
- for (c = net->ipv4.mfc_cache_array[line]; c; c = c->next) {
- if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
- break;
+ list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
+ if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
+ return c;
}
- return c;
+ return NULL;
}
/*
* Allocate a multicast cache entry
*/
-static struct mfc_cache *ipmr_cache_alloc(struct net *net)
+static struct mfc_cache *ipmr_cache_alloc(void)
{
struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
if (c == NULL)
return NULL;
c->mfc_un.res.minvif = MAXVIFS;
- mfc_net_set(c, net);
return c;
}
-static struct mfc_cache *ipmr_cache_alloc_unres(struct net *net)
+static struct mfc_cache *ipmr_cache_alloc_unres(void)
{
struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
if (c == NULL)
return NULL;
skb_queue_head_init(&c->mfc_un.unres.unresolved);
c->mfc_un.unres.expires = jiffies + 10*HZ;
- mfc_net_set(c, net);
return c;
}
@@ -574,7 +821,8 @@ static struct mfc_cache *ipmr_cache_alloc_unres(struct net *net)
* A cache entry has gone into a resolved state from queued
*/
-static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
+static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
+ struct mfc_cache *uc, struct mfc_cache *c)
{
struct sk_buff *skb;
struct nlmsgerr *e;
@@ -587,7 +835,7 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
if (ip_hdr(skb)->version == 0) {
struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
- if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
+ if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
nlh->nlmsg_len = (skb_tail_pointer(skb) -
(u8 *)nlh);
} else {
@@ -599,9 +847,9 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
memset(&e->msg, 0, sizeof(e->msg));
}
- rtnl_unicast(skb, mfc_net(c), NETLINK_CB(skb).pid);
+ rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
} else
- ip_mr_forward(skb, c, 0);
+ ip_mr_forward(net, mrt, skb, c, 0);
}
}
@@ -612,7 +860,7 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
* Called under mrt_lock.
*/
-static int ipmr_cache_report(struct net *net,
+static int ipmr_cache_report(struct mr_table *mrt,
struct sk_buff *pkt, vifi_t vifi, int assert)
{
struct sk_buff *skb;
@@ -645,7 +893,7 @@ static int ipmr_cache_report(struct net *net,
memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
msg->im_msgtype = IGMPMSG_WHOLEPKT;
msg->im_mbz = 0;
- msg->im_vif = net->ipv4.mroute_reg_vif_num;
+ msg->im_vif = mrt->mroute_reg_vif_num;
ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
sizeof(struct iphdr));
@@ -677,7 +925,7 @@ static int ipmr_cache_report(struct net *net,
skb->transport_header = skb->network_header;
}
- if (net->ipv4.mroute_sk == NULL) {
+ if (mrt->mroute_sk == NULL) {
kfree_skb(skb);
return -EINVAL;
}
@@ -685,7 +933,7 @@ static int ipmr_cache_report(struct net *net,
/*
* Deliver to mrouted
*/
- ret = sock_queue_rcv_skb(net->ipv4.mroute_sk, skb);
+ ret = sock_queue_rcv_skb(mrt->mroute_sk, skb);
if (ret < 0) {
if (net_ratelimit())
printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
@@ -700,27 +948,29 @@ static int ipmr_cache_report(struct net *net,
*/
static int
-ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
+ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
{
+ bool found = false;
int err;
struct mfc_cache *c;
const struct iphdr *iph = ip_hdr(skb);
spin_lock_bh(&mfc_unres_lock);
- for (c=mfc_unres_queue; c; c=c->next) {
- if (net_eq(mfc_net(c), net) &&
- c->mfc_mcastgrp == iph->daddr &&
- c->mfc_origin == iph->saddr)
+ list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
+ if (c->mfc_mcastgrp == iph->daddr &&
+ c->mfc_origin == iph->saddr) {
+ found = true;
break;
+ }
}
- if (c == NULL) {
+ if (!found) {
/*
* Create a new entry if allowable
*/
- if (atomic_read(&net->ipv4.cache_resolve_queue_len) >= 10 ||
- (c = ipmr_cache_alloc_unres(net)) == NULL) {
+ if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
+ (c = ipmr_cache_alloc_unres()) == NULL) {
spin_unlock_bh(&mfc_unres_lock);
kfree_skb(skb);
@@ -737,7 +987,7 @@ ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
/*
* Reflect first query at mrouted.
*/
- err = ipmr_cache_report(net, skb, vifi, IGMPMSG_NOCACHE);
+ err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
if (err < 0) {
/* If the report failed throw the cache entry
out - Brad Parker
@@ -749,11 +999,11 @@ ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
return err;
}
- atomic_inc(&net->ipv4.cache_resolve_queue_len);
- c->next = mfc_unres_queue;
- mfc_unres_queue = c;
+ atomic_inc(&mrt->cache_resolve_queue_len);
+ list_add(&c->list, &mrt->mfc_unres_queue);
- mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
+ if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
+ mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
}
/*
@@ -775,19 +1025,18 @@ ipmr_cache_unresolved(struct net *net, vifi_t vifi, struct sk_buff *skb)
* MFC cache manipulation by user space mroute daemon
*/
-static int ipmr_mfc_delete(struct net *net, struct mfcctl *mfc)
+static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
{
int line;
- struct mfc_cache *c, **cp;
+ struct mfc_cache *c, *next;
line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
- for (cp = &net->ipv4.mfc_cache_array[line];
- (c = *cp) != NULL; cp = &c->next) {
+ list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
write_lock_bh(&mrt_lock);
- *cp = c->next;
+ list_del(&c->list);
write_unlock_bh(&mrt_lock);
ipmr_cache_free(c);
@@ -797,24 +1046,30 @@ static int ipmr_mfc_delete(struct net *net, struct mfcctl *mfc)
return -ENOENT;
}
-static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
+static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
+ struct mfcctl *mfc, int mrtsock)
{
+ bool found = false;
int line;
- struct mfc_cache *uc, *c, **cp;
+ struct mfc_cache *uc, *c;
+
+ if (mfc->mfcc_parent >= MAXVIFS)
+ return -ENFILE;
line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
- for (cp = &net->ipv4.mfc_cache_array[line];
- (c = *cp) != NULL; cp = &c->next) {
+ list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
- c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
+ c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
+ found = true;
break;
+ }
}
- if (c != NULL) {
+ if (found) {
write_lock_bh(&mrt_lock);
c->mfc_parent = mfc->mfcc_parent;
- ipmr_update_thresholds(c, mfc->mfcc_ttls);
+ ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
if (!mrtsock)
c->mfc_flags |= MFC_STATIC;
write_unlock_bh(&mrt_lock);
@@ -824,43 +1079,42 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
return -EINVAL;
- c = ipmr_cache_alloc(net);
+ c = ipmr_cache_alloc();
if (c == NULL)
return -ENOMEM;
c->mfc_origin = mfc->mfcc_origin.s_addr;
c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
c->mfc_parent = mfc->mfcc_parent;
- ipmr_update_thresholds(c, mfc->mfcc_ttls);
+ ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
if (!mrtsock)
c->mfc_flags |= MFC_STATIC;
write_lock_bh(&mrt_lock);
- c->next = net->ipv4.mfc_cache_array[line];
- net->ipv4.mfc_cache_array[line] = c;
+ list_add(&c->list, &mrt->mfc_cache_array[line]);
write_unlock_bh(&mrt_lock);
/*
* Check to see if we resolved a queued list. If so we
* need to send on the frames and tidy up.
*/
+ found = false;
spin_lock_bh(&mfc_unres_lock);
- for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
- cp = &uc->next) {
- if (net_eq(mfc_net(uc), net) &&
- uc->mfc_origin == c->mfc_origin &&
+ list_for_each_entry(uc, &mrt->mfc_unres_queue, list) {
+ if (uc->mfc_origin == c->mfc_origin &&
uc->mfc_mcastgrp == c->mfc_mcastgrp) {
- *cp = uc->next;
- atomic_dec(&net->ipv4.cache_resolve_queue_len);
+ list_del(&uc->list);
+ atomic_dec(&mrt->cache_resolve_queue_len);
+ found = true;
break;
}
}
- if (mfc_unres_queue == NULL)
- del_timer(&ipmr_expire_timer);
+ if (list_empty(&mrt->mfc_unres_queue))
+ del_timer(&mrt->ipmr_expire_timer);
spin_unlock_bh(&mfc_unres_lock);
- if (uc) {
- ipmr_cache_resolve(uc, c);
+ if (found) {
+ ipmr_cache_resolve(net, mrt, uc, c);
ipmr_cache_free(uc);
}
return 0;
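The found flag introduced above is load-bearing: when list_for_each_entry() traverses the whole list without a match, the cursor ends up as a bogus pointer computed from the list head, unlike the old open-coded walk where uc simply became NULL. A minimal sketch of the pattern, with a hypothetical item type:

	#include <linux/list.h>

	struct item {
		int key;
		struct list_head list;
	};

	/* Never dereference the cursor after a full traversal; it does
	 * not point at a real item then. */
	static struct item *find_item(struct list_head *head, int key)
	{
		struct item *it;
		bool found = false;

		list_for_each_entry(it, head, list) {
			if (it->key == key) {
				found = true;
				break;
			}
		}
		return found ? it : NULL;
	}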
@@ -870,53 +1124,41 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
* Close the multicast socket, and clear the vif tables etc
*/
-static void mroute_clean_tables(struct net *net)
+static void mroute_clean_tables(struct mr_table *mrt)
{
int i;
LIST_HEAD(list);
+ struct mfc_cache *c, *next;
/*
* Shut down all active vif entries
*/
- for (i = 0; i < net->ipv4.maxvif; i++) {
- if (!(net->ipv4.vif_table[i].flags&VIFF_STATIC))
- vif_delete(net, i, 0, &list);
+ for (i = 0; i < mrt->maxvif; i++) {
+ if (!(mrt->vif_table[i].flags&VIFF_STATIC))
+ vif_delete(mrt, i, 0, &list);
}
unregister_netdevice_many(&list);
/*
* Wipe the cache
*/
- for (i=0; i<MFC_LINES; i++) {
- struct mfc_cache *c, **cp;
-
- cp = &net->ipv4.mfc_cache_array[i];
- while ((c = *cp) != NULL) {
- if (c->mfc_flags&MFC_STATIC) {
- cp = &c->next;
+ for (i = 0; i < MFC_LINES; i++) {
+ list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
+ if (c->mfc_flags&MFC_STATIC)
continue;
- }
write_lock_bh(&mrt_lock);
- *cp = c->next;
+ list_del(&c->list);
write_unlock_bh(&mrt_lock);
ipmr_cache_free(c);
}
}
- if (atomic_read(&net->ipv4.cache_resolve_queue_len) != 0) {
- struct mfc_cache *c, **cp;
-
+ if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
spin_lock_bh(&mfc_unres_lock);
- cp = &mfc_unres_queue;
- while ((c = *cp) != NULL) {
- if (!net_eq(mfc_net(c), net)) {
- cp = &c->next;
- continue;
- }
- *cp = c->next;
-
- ipmr_destroy_unres(c);
+ list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
+ list_del(&c->list);
+ ipmr_destroy_unres(mrt, c);
}
spin_unlock_bh(&mfc_unres_lock);
}
@@ -925,16 +1167,19 @@ static void mroute_clean_tables(struct net *net)
static void mrtsock_destruct(struct sock *sk)
{
struct net *net = sock_net(sk);
+ struct mr_table *mrt;
rtnl_lock();
- if (sk == net->ipv4.mroute_sk) {
- IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
+ ipmr_for_each_table(mrt, net) {
+ if (sk == mrt->mroute_sk) {
+ IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
- write_lock_bh(&mrt_lock);
- net->ipv4.mroute_sk = NULL;
- write_unlock_bh(&mrt_lock);
+ write_lock_bh(&mrt_lock);
+ mrt->mroute_sk = NULL;
+ write_unlock_bh(&mrt_lock);
- mroute_clean_tables(net);
+ mroute_clean_tables(mrt);
+ }
}
rtnl_unlock();
}
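mrtsock_destruct() now has to scan every table in the namespace, since the dying socket may be the daemon socket of any of them. The ipmr_for_each_table() helper is added earlier in this patch (outside this excerpt); it is approximately the following, degrading to a single iteration over the one default table when multiple-table support is compiled out:

	#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
	#define ipmr_for_each_table(mrt, net) \
		list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)
	#else
	#define ipmr_for_each_table(mrt, net) \
		for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
	#endif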
@@ -952,9 +1197,14 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
struct vifctl vif;
struct mfcctl mfc;
struct net *net = sock_net(sk);
+ struct mr_table *mrt;
+
+ mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+ if (mrt == NULL)
+ return -ENOENT;
if (optname != MRT_INIT) {
- if (sk != net->ipv4.mroute_sk && !capable(CAP_NET_ADMIN))
+ if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN))
return -EACCES;
}
@@ -967,7 +1217,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
return -ENOPROTOOPT;
rtnl_lock();
- if (net->ipv4.mroute_sk) {
+ if (mrt->mroute_sk) {
rtnl_unlock();
return -EADDRINUSE;
}
@@ -975,7 +1225,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
ret = ip_ra_control(sk, 1, mrtsock_destruct);
if (ret == 0) {
write_lock_bh(&mrt_lock);
- net->ipv4.mroute_sk = sk;
+ mrt->mroute_sk = sk;
write_unlock_bh(&mrt_lock);
IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
@@ -983,7 +1233,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
rtnl_unlock();
return ret;
case MRT_DONE:
- if (sk != net->ipv4.mroute_sk)
+ if (sk != mrt->mroute_sk)
return -EACCES;
return ip_ra_control(sk, 0, NULL);
case MRT_ADD_VIF:
@@ -996,9 +1246,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
return -ENFILE;
rtnl_lock();
if (optname == MRT_ADD_VIF) {
- ret = vif_add(net, &vif, sk == net->ipv4.mroute_sk);
+ ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk);
} else {
- ret = vif_delete(net, vif.vifc_vifi, 0, NULL);
+ ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
}
rtnl_unlock();
return ret;
@@ -1015,9 +1265,9 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
return -EFAULT;
rtnl_lock();
if (optname == MRT_DEL_MFC)
- ret = ipmr_mfc_delete(net, &mfc);
+ ret = ipmr_mfc_delete(mrt, &mfc);
else
- ret = ipmr_mfc_add(net, &mfc, sk == net->ipv4.mroute_sk);
+ ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk);
rtnl_unlock();
return ret;
/*
@@ -1028,7 +1278,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
int v;
if (get_user(v,(int __user *)optval))
return -EFAULT;
- net->ipv4.mroute_do_assert = (v) ? 1 : 0;
+ mrt->mroute_do_assert = (v) ? 1 : 0;
return 0;
}
#ifdef CONFIG_IP_PIMSM
@@ -1042,14 +1292,35 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
rtnl_lock();
ret = 0;
- if (v != net->ipv4.mroute_do_pim) {
- net->ipv4.mroute_do_pim = v;
- net->ipv4.mroute_do_assert = v;
+ if (v != mrt->mroute_do_pim) {
+ mrt->mroute_do_pim = v;
+ mrt->mroute_do_assert = v;
}
rtnl_unlock();
return ret;
}
#endif
+#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+ case MRT_TABLE:
+ {
+ u32 v;
+
+ if (optlen != sizeof(u32))
+ return -EINVAL;
+ if (get_user(v, (u32 __user *)optval))
+ return -EFAULT;
+ if (sk == mrt->mroute_sk)
+ return -EBUSY;
+
+ rtnl_lock();
+ ret = 0;
+ if (!ipmr_new_table(net, v))
+ ret = -ENOMEM;
+ raw_sk(sk)->ipmr_table = v;
+ rtnl_unlock();
+ return ret;
+ }
+#endif
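Note the ordering this case imposes on userspace: MRT_TABLE must be issued before MRT_INIT, because once the socket is registered as a table's mroute_sk the -EBUSY check above refuses to retarget it. A hypothetical daemon-side call sequence (error handling omitted):

	#include <stdint.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <linux/mroute.h>

	static int open_mroute_sock(uint32_t table)
	{
		int fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
		int on = 1;

		/* Pick the table first, then become its daemon socket. */
		setsockopt(fd, IPPROTO_IP, MRT_TABLE, &table, sizeof(table));
		setsockopt(fd, IPPROTO_IP, MRT_INIT, &on, sizeof(on));
		return fd;
	}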
/*
* Spurious command, or MRT_VERSION which you cannot
* set.
@@ -1068,6 +1339,11 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
int olr;
int val;
struct net *net = sock_net(sk);
+ struct mr_table *mrt;
+
+ mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+ if (mrt == NULL)
+ return -ENOENT;
if (optname != MRT_VERSION &&
#ifdef CONFIG_IP_PIMSM
@@ -1089,10 +1365,10 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
val = 0x0305;
#ifdef CONFIG_IP_PIMSM
else if (optname == MRT_PIM)
- val = net->ipv4.mroute_do_pim;
+ val = mrt->mroute_do_pim;
#endif
else
- val = net->ipv4.mroute_do_assert;
+ val = mrt->mroute_do_assert;
if (copy_to_user(optval, &val, olr))
return -EFAULT;
return 0;
@@ -1109,16 +1385,21 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
struct vif_device *vif;
struct mfc_cache *c;
struct net *net = sock_net(sk);
+ struct mr_table *mrt;
+
+ mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+ if (mrt == NULL)
+ return -ENOENT;
switch (cmd) {
case SIOCGETVIFCNT:
if (copy_from_user(&vr, arg, sizeof(vr)))
return -EFAULT;
- if (vr.vifi >= net->ipv4.maxvif)
+ if (vr.vifi >= mrt->maxvif)
return -EINVAL;
read_lock(&mrt_lock);
- vif = &net->ipv4.vif_table[vr.vifi];
- if (VIF_EXISTS(net, vr.vifi)) {
+ vif = &mrt->vif_table[vr.vifi];
+ if (VIF_EXISTS(mrt, vr.vifi)) {
vr.icount = vif->pkt_in;
vr.ocount = vif->pkt_out;
vr.ibytes = vif->bytes_in;
@@ -1136,7 +1417,7 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
return -EFAULT;
read_lock(&mrt_lock);
- c = ipmr_cache_find(net, sr.src.s_addr, sr.grp.s_addr);
+ c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
if (c) {
sr.pktcnt = c->mfc_un.res.pkt;
sr.bytecnt = c->mfc_un.res.bytes;
@@ -1159,16 +1440,20 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
{
struct net_device *dev = ptr;
struct net *net = dev_net(dev);
+ struct mr_table *mrt;
struct vif_device *v;
int ct;
LIST_HEAD(list);
if (event != NETDEV_UNREGISTER)
return NOTIFY_DONE;
- v = &net->ipv4.vif_table[0];
- for (ct = 0; ct < net->ipv4.maxvif; ct++, v++) {
- if (v->dev == dev)
- vif_delete(net, ct, 1, &list);
+
+ ipmr_for_each_table(mrt, net) {
+ v = &mrt->vif_table[0];
+ for (ct = 0; ct < mrt->maxvif; ct++, v++) {
+ if (v->dev == dev)
+ vif_delete(mrt, ct, 1, &list);
+ }
}
unregister_netdevice_many(&list);
return NOTIFY_DONE;
@@ -1227,11 +1512,11 @@ static inline int ipmr_forward_finish(struct sk_buff *skb)
* Processing handlers for ipmr_forward
*/
-static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
+static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, struct mfc_cache *c, int vifi)
{
- struct net *net = mfc_net(c);
const struct iphdr *iph = ip_hdr(skb);
- struct vif_device *vif = &net->ipv4.vif_table[vifi];
+ struct vif_device *vif = &mrt->vif_table[vifi];
struct net_device *dev;
struct rtable *rt;
int encap = 0;
@@ -1245,7 +1530,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
vif->bytes_out += skb->len;
vif->dev->stats.tx_bytes += skb->len;
vif->dev->stats.tx_packets++;
- ipmr_cache_report(net, skb, vifi, IGMPMSG_WHOLEPKT);
+ ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
goto out_free;
}
#endif
@@ -1270,9 +1555,9 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
goto out_free;
}
- dev = rt->u.dst.dev;
+ dev = rt->dst.dev;
- if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
+ if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
/* Do not fragment multicasts. Alas, IPv4 does not
allow us to send ICMP, so such packets will disappear
into a blackhole.
@@ -1283,7 +1568,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
goto out_free;
}
- encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
+ encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;
if (skb_cow(skb, encap)) {
ip_rt_put(rt);
@@ -1294,7 +1579,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
vif->bytes_out += skb->len;
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
ip_decrease_ttl(ip_hdr(skb));
/* FIXME: forward and output firewalls used to be called here.
@@ -1319,21 +1604,20 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
* not mrouter) cannot join to more than one interface - it will
* result in receiving multiple packets.
*/
- NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev,
+ NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, dev,
ipmr_forward_finish);
return;
out_free:
kfree_skb(skb);
- return;
}
-static int ipmr_find_vif(struct net_device *dev)
+static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
{
- struct net *net = dev_net(dev);
int ct;
- for (ct = net->ipv4.maxvif-1; ct >= 0; ct--) {
- if (net->ipv4.vif_table[ct].dev == dev)
+
+ for (ct = mrt->maxvif-1; ct >= 0; ct--) {
+ if (mrt->vif_table[ct].dev == dev)
break;
}
return ct;
@@ -1341,11 +1625,12 @@ static int ipmr_find_vif(struct net_device *dev)
/* "local" means that we should preserve one skb (for local delivery) */
-static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
+static int ip_mr_forward(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, struct mfc_cache *cache,
+ int local)
{
int psend = -1;
int vif, ct;
- struct net *net = mfc_net(cache);
vif = cache->mfc_parent;
cache->mfc_un.res.pkt++;
@@ -1354,7 +1639,7 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local
/*
* Wrong interface: drop packet and (maybe) send PIM assert.
*/
- if (net->ipv4.vif_table[vif].dev != skb->dev) {
+ if (mrt->vif_table[vif].dev != skb->dev) {
int true_vifi;
if (skb_rtable(skb)->fl.iif == 0) {
@@ -1373,26 +1658,26 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local
}
cache->mfc_un.res.wrong_if++;
- true_vifi = ipmr_find_vif(skb->dev);
+ true_vifi = ipmr_find_vif(mrt, skb->dev);
- if (true_vifi >= 0 && net->ipv4.mroute_do_assert &&
+ if (true_vifi >= 0 && mrt->mroute_do_assert &&
/* pimsm uses asserts, when switching from RPT to SPT,
so that we cannot check that packet arrived on an oif.
It is bad, but otherwise we would need to move pretty
large chunk of pimd to kernel. Ough... --ANK
*/
- (net->ipv4.mroute_do_pim ||
+ (mrt->mroute_do_pim ||
cache->mfc_un.res.ttls[true_vifi] < 255) &&
time_after(jiffies,
cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
cache->mfc_un.res.last_assert = jiffies;
- ipmr_cache_report(net, skb, true_vifi, IGMPMSG_WRONGVIF);
+ ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
}
goto dont_forward;
}
- net->ipv4.vif_table[vif].pkt_in++;
- net->ipv4.vif_table[vif].bytes_in += skb->len;
+ mrt->vif_table[vif].pkt_in++;
+ mrt->vif_table[vif].bytes_in += skb->len;
/*
* Forward the frame
@@ -1402,7 +1687,8 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local
if (psend != -1) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2)
- ipmr_queue_xmit(skb2, cache, psend);
+ ipmr_queue_xmit(net, mrt, skb2, cache,
+ psend);
}
psend = ct;
}
@@ -1411,9 +1697,9 @@ static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local
if (local) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2)
- ipmr_queue_xmit(skb2, cache, psend);
+ ipmr_queue_xmit(net, mrt, skb2, cache, psend);
} else {
- ipmr_queue_xmit(skb, cache, psend);
+ ipmr_queue_xmit(net, mrt, skb, cache, psend);
return 0;
}
}
@@ -1434,6 +1720,8 @@ int ip_mr_input(struct sk_buff *skb)
struct mfc_cache *cache;
struct net *net = dev_net(skb->dev);
int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
+ struct mr_table *mrt;
+ int err;
/* Packet is looped back after forwarding; it should not be
forwarded a second time, but it can still be delivered locally.
@@ -1441,6 +1729,12 @@ int ip_mr_input(struct sk_buff *skb)
if (IPCB(skb)->flags&IPSKB_FORWARDED)
goto dont_forward;
+ err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt);
+ if (err < 0) {
+ kfree_skb(skb);
+ return err;
+ }
+
if (!local) {
if (IPCB(skb)->opt.router_alert) {
if (ip_call_ra_chain(skb))
@@ -1453,9 +1747,9 @@ int ip_mr_input(struct sk_buff *skb)
that we can forward NO IGMP messages.
*/
read_lock(&mrt_lock);
- if (net->ipv4.mroute_sk) {
+ if (mrt->mroute_sk) {
nf_reset(skb);
- raw_rcv(net->ipv4.mroute_sk, skb);
+ raw_rcv(mrt->mroute_sk, skb);
read_unlock(&mrt_lock);
return 0;
}
@@ -1464,7 +1758,7 @@ int ip_mr_input(struct sk_buff *skb)
}
read_lock(&mrt_lock);
- cache = ipmr_cache_find(net, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
+ cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
/*
* No usable cache entry
@@ -1482,19 +1776,19 @@ int ip_mr_input(struct sk_buff *skb)
skb = skb2;
}
- vif = ipmr_find_vif(skb->dev);
+ vif = ipmr_find_vif(mrt, skb->dev);
if (vif >= 0) {
- int err = ipmr_cache_unresolved(net, vif, skb);
+ int err2 = ipmr_cache_unresolved(mrt, vif, skb);
read_unlock(&mrt_lock);
- return err;
+ return err2;
}
read_unlock(&mrt_lock);
kfree_skb(skb);
return -ENODEV;
}
- ip_mr_forward(skb, cache, local);
+ ip_mr_forward(net, mrt, skb, cache, local);
read_unlock(&mrt_lock);
@@ -1511,11 +1805,11 @@ dont_forward:
}
#ifdef CONFIG_IP_PIMSM
-static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
+static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
+ unsigned int pimlen)
{
struct net_device *reg_dev = NULL;
struct iphdr *encap;
- struct net *net = dev_net(skb->dev);
encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
/*
@@ -1530,8 +1824,8 @@ static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
return 1;
read_lock(&mrt_lock);
- if (net->ipv4.mroute_reg_vif_num >= 0)
- reg_dev = net->ipv4.vif_table[net->ipv4.mroute_reg_vif_num].dev;
+ if (mrt->mroute_reg_vif_num >= 0)
+ reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
if (reg_dev)
dev_hold(reg_dev);
read_unlock(&mrt_lock);
@@ -1542,14 +1836,12 @@ static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
skb->mac_header = skb->network_header;
skb_pull(skb, (u8*)encap - skb->data);
skb_reset_network_header(skb);
- skb->dev = reg_dev;
skb->protocol = htons(ETH_P_IP);
skb->ip_summed = 0;
skb->pkt_type = PACKET_HOST;
- skb_dst_drop(skb);
- reg_dev->stats.rx_bytes += skb->len;
- reg_dev->stats.rx_packets++;
- nf_reset(skb);
+
+ skb_tunnel_rx(skb, reg_dev);
+
netif_rx(skb);
dev_put(reg_dev);
@@ -1566,17 +1858,21 @@ int pim_rcv_v1(struct sk_buff * skb)
{
struct igmphdr *pim;
struct net *net = dev_net(skb->dev);
+ struct mr_table *mrt;
if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
goto drop;
pim = igmp_hdr(skb);
- if (!net->ipv4.mroute_do_pim ||
+ if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
+ goto drop;
+
+ if (!mrt->mroute_do_pim ||
pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
goto drop;
- if (__pim_rcv(skb, sizeof(*pim))) {
+ if (__pim_rcv(mrt, skb, sizeof(*pim))) {
drop:
kfree_skb(skb);
}
@@ -1588,6 +1884,8 @@ drop:
static int pim_rcv(struct sk_buff * skb)
{
struct pimreghdr *pim;
+ struct net *net = dev_net(skb->dev);
+ struct mr_table *mrt;
if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
goto drop;
@@ -1599,7 +1897,10 @@ static int pim_rcv(struct sk_buff * skb)
csum_fold(skb_checksum(skb, 0, skb->len, 0))))
goto drop;
- if (__pim_rcv(skb, sizeof(*pim))) {
+ if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
+ goto drop;
+
+ if (__pim_rcv(mrt, skb, sizeof(*pim))) {
drop:
kfree_skb(skb);
}
@@ -1607,29 +1908,31 @@ drop:
}
#endif
-static int
-ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
+static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+ struct mfc_cache *c, struct rtmsg *rtm)
{
int ct;
struct rtnexthop *nhp;
- struct net *net = mfc_net(c);
- struct net_device *dev = net->ipv4.vif_table[c->mfc_parent].dev;
u8 *b = skb_tail_pointer(skb);
struct rtattr *mp_head;
- if (dev)
- RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
+ /* If cache is unresolved, don't try to parse IIF and OIF */
+ if (c->mfc_parent >= MAXVIFS)
+ return -ENOENT;
+
+ if (VIF_EXISTS(mrt, c->mfc_parent))
+ RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex);
mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
- if (c->mfc_un.res.ttls[ct] < 255) {
+ if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
goto rtattr_failure;
nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
nhp->rtnh_flags = 0;
nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
- nhp->rtnh_ifindex = net->ipv4.vif_table[ct].dev->ifindex;
+ nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
nhp->rtnh_len = sizeof(*nhp);
}
}
@@ -1647,11 +1950,16 @@ int ipmr_get_route(struct net *net,
struct sk_buff *skb, struct rtmsg *rtm, int nowait)
{
int err;
+ struct mr_table *mrt;
struct mfc_cache *cache;
struct rtable *rt = skb_rtable(skb);
+ mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
+ if (mrt == NULL)
+ return -ENOENT;
+
read_lock(&mrt_lock);
- cache = ipmr_cache_find(net, rt->rt_src, rt->rt_dst);
+ cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst);
if (cache == NULL) {
struct sk_buff *skb2;
@@ -1665,7 +1973,7 @@ int ipmr_get_route(struct net *net,
}
dev = skb->dev;
- if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
+ if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) {
read_unlock(&mrt_lock);
return -ENODEV;
}
@@ -1682,24 +1990,107 @@ int ipmr_get_route(struct net *net,
iph->saddr = rt->rt_src;
iph->daddr = rt->rt_dst;
iph->version = 0;
- err = ipmr_cache_unresolved(net, vif, skb2);
+ err = ipmr_cache_unresolved(mrt, vif, skb2);
read_unlock(&mrt_lock);
return err;
}
if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
cache->mfc_flags |= MFC_NOTIFY;
- err = ipmr_fill_mroute(skb, cache, rtm);
+ err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
read_unlock(&mrt_lock);
return err;
}
+static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+ u32 pid, u32 seq, struct mfc_cache *c)
+{
+ struct nlmsghdr *nlh;
+ struct rtmsg *rtm;
+
+ nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ rtm = nlmsg_data(nlh);
+ rtm->rtm_family = RTNL_FAMILY_IPMR;
+ rtm->rtm_dst_len = 32;
+ rtm->rtm_src_len = 32;
+ rtm->rtm_tos = 0;
+ rtm->rtm_table = mrt->id;
+ NLA_PUT_U32(skb, RTA_TABLE, mrt->id);
+ rtm->rtm_type = RTN_MULTICAST;
+ rtm->rtm_scope = RT_SCOPE_UNIVERSE;
+ rtm->rtm_protocol = RTPROT_UNSPEC;
+ rtm->rtm_flags = 0;
+
+ NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin);
+ NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp);
+
+ if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0)
+ goto nla_put_failure;
+
+ return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
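The NLA_PUT*() calls above can be issued without checking a return value because they are goto-style macros: on insufficient skb tailroom they jump to the nla_put_failure label, and nlmsg_cancel() unwinds the half-built message. Their shape in this era of the netlink API is approximately:

	#define NLA_PUT_U32(skb, attrtype, value)			\
		do {							\
			if (nla_put_u32(skb, attrtype, value) < 0)	\
				goto nla_put_failure;			\
		} while (0)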
+
+static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct mr_table *mrt;
+ struct mfc_cache *mfc;
+ unsigned int t = 0, s_t;
+ unsigned int h = 0, s_h;
+ unsigned int e = 0, s_e;
+
+ s_t = cb->args[0];
+ s_h = cb->args[1];
+ s_e = cb->args[2];
+
+ read_lock(&mrt_lock);
+ ipmr_for_each_table(mrt, net) {
+ if (t < s_t)
+ goto next_table;
+ if (t > s_t)
+ s_h = 0;
+ for (h = s_h; h < MFC_LINES; h++) {
+ list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) {
+ if (e < s_e)
+ goto next_entry;
+ if (ipmr_fill_mroute(mrt, skb,
+ NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq,
+ mfc) < 0)
+ goto done;
+next_entry:
+ e++;
+ }
+ e = s_e = 0;
+ }
+ s_h = 0;
+next_table:
+ t++;
+ }
+done:
+ read_unlock(&mrt_lock);
+
+ cb->args[2] = e;
+ cb->args[1] = h;
+ cb->args[0] = t;
+
+ return skb->len;
+}
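ipmr_rtm_dumproute() follows the standard resumable-dump shape: cb->args[] persists across invocations of the callback, so when the skb fills up the function records how far it got (table t, hash chain h, entry e) and the next invocation fast-forwards to that point and retries the entry that did not fit. A stripped-down single-level version of the pattern, with obj_list and fill_one() as hypothetical stand-ins:

	static int dump_objs(struct sk_buff *skb, struct netlink_callback *cb)
	{
		unsigned int idx = 0, s_idx = cb->args[0];
		struct obj *o;

		list_for_each_entry(o, &obj_list, list) {
			if (idx < s_idx)
				goto next;	/* sent on an earlier call */
			if (fill_one(skb, o) < 0)
				break;		/* skb full: retry idx next call */
	next:
			idx++;
		}
		cb->args[0] = idx;
		return skb->len;
	}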
+
#ifdef CONFIG_PROC_FS
/*
* The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif
*/
struct ipmr_vif_iter {
struct seq_net_private p;
+ struct mr_table *mrt;
int ct;
};
@@ -1707,11 +2098,13 @@ static struct vif_device *ipmr_vif_seq_idx(struct net *net,
struct ipmr_vif_iter *iter,
loff_t pos)
{
- for (iter->ct = 0; iter->ct < net->ipv4.maxvif; ++iter->ct) {
- if (!VIF_EXISTS(net, iter->ct))
+ struct mr_table *mrt = iter->mrt;
+
+ for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
+ if (!VIF_EXISTS(mrt, iter->ct))
continue;
if (pos-- == 0)
- return &net->ipv4.vif_table[iter->ct];
+ return &mrt->vif_table[iter->ct];
}
return NULL;
}
@@ -1719,7 +2112,15 @@ static struct vif_device *ipmr_vif_seq_idx(struct net *net,
static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(mrt_lock)
{
+ struct ipmr_vif_iter *iter = seq->private;
struct net *net = seq_file_net(seq);
+ struct mr_table *mrt;
+
+ mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
+ if (mrt == NULL)
+ return ERR_PTR(-ENOENT);
+
+ iter->mrt = mrt;
read_lock(&mrt_lock);
return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1)
@@ -1730,15 +2131,16 @@ static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct ipmr_vif_iter *iter = seq->private;
struct net *net = seq_file_net(seq);
+ struct mr_table *mrt = iter->mrt;
++*pos;
if (v == SEQ_START_TOKEN)
return ipmr_vif_seq_idx(net, iter, 0);
- while (++iter->ct < net->ipv4.maxvif) {
- if (!VIF_EXISTS(net, iter->ct))
+ while (++iter->ct < mrt->maxvif) {
+ if (!VIF_EXISTS(mrt, iter->ct))
continue;
- return &net->ipv4.vif_table[iter->ct];
+ return &mrt->vif_table[iter->ct];
}
return NULL;
}
@@ -1751,7 +2153,8 @@ static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
{
- struct net *net = seq_file_net(seq);
+ struct ipmr_vif_iter *iter = seq->private;
+ struct mr_table *mrt = iter->mrt;
if (v == SEQ_START_TOKEN) {
seq_puts(seq,
@@ -1762,7 +2165,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
seq_printf(seq,
"%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
- vif - net->ipv4.vif_table,
+ vif - mrt->vif_table,
name, vif->bytes_in, vif->pkt_in,
vif->bytes_out, vif->pkt_out,
vif->flags, vif->local, vif->remote);
@@ -1793,7 +2196,8 @@ static const struct file_operations ipmr_vif_fops = {
struct ipmr_mfc_iter {
struct seq_net_private p;
- struct mfc_cache **cache;
+ struct mr_table *mrt;
+ struct list_head *cache;
int ct;
};
@@ -1801,22 +2205,22 @@ struct ipmr_mfc_iter {
static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
struct ipmr_mfc_iter *it, loff_t pos)
{
+ struct mr_table *mrt = it->mrt;
struct mfc_cache *mfc;
- it->cache = net->ipv4.mfc_cache_array;
read_lock(&mrt_lock);
- for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
- for (mfc = net->ipv4.mfc_cache_array[it->ct];
- mfc; mfc = mfc->next)
+ for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
+ it->cache = &mrt->mfc_cache_array[it->ct];
+ list_for_each_entry(mfc, it->cache, list)
if (pos-- == 0)
return mfc;
+ }
read_unlock(&mrt_lock);
- it->cache = &mfc_unres_queue;
spin_lock_bh(&mfc_unres_lock);
- for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
- if (net_eq(mfc_net(mfc), net) &&
- pos-- == 0)
+ it->cache = &mrt->mfc_unres_queue;
+ list_for_each_entry(mfc, it->cache, list)
+ if (pos-- == 0)
return mfc;
spin_unlock_bh(&mfc_unres_lock);
@@ -1829,7 +2233,13 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
{
struct ipmr_mfc_iter *it = seq->private;
struct net *net = seq_file_net(seq);
+ struct mr_table *mrt;
+ mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
+ if (mrt == NULL)
+ return ERR_PTR(-ENOENT);
+
+ it->mrt = mrt;
it->cache = NULL;
it->ct = 0;
return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
@@ -1841,37 +2251,36 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
struct mfc_cache *mfc = v;
struct ipmr_mfc_iter *it = seq->private;
struct net *net = seq_file_net(seq);
+ struct mr_table *mrt = it->mrt;
++*pos;
if (v == SEQ_START_TOKEN)
return ipmr_mfc_seq_idx(net, seq->private, 0);
- if (mfc->next)
- return mfc->next;
+ if (mfc->list.next != it->cache)
+ return list_entry(mfc->list.next, struct mfc_cache, list);
- if (it->cache == &mfc_unres_queue)
+ if (it->cache == &mrt->mfc_unres_queue)
goto end_of_list;
- BUG_ON(it->cache != net->ipv4.mfc_cache_array);
+ BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);
while (++it->ct < MFC_LINES) {
- mfc = net->ipv4.mfc_cache_array[it->ct];
- if (mfc)
- return mfc;
+ it->cache = &mrt->mfc_cache_array[it->ct];
+ if (list_empty(it->cache))
+ continue;
+ return list_first_entry(it->cache, struct mfc_cache, list);
}
/* exhausted cache_array, show unresolved */
read_unlock(&mrt_lock);
- it->cache = &mfc_unres_queue;
+ it->cache = &mrt->mfc_unres_queue;
it->ct = 0;
spin_lock_bh(&mfc_unres_lock);
- mfc = mfc_unres_queue;
- while (mfc && !net_eq(mfc_net(mfc), net))
- mfc = mfc->next;
- if (mfc)
- return mfc;
+ if (!list_empty(it->cache))
+ return list_first_entry(it->cache, struct mfc_cache, list);
end_of_list:
spin_unlock_bh(&mfc_unres_lock);
@@ -1883,18 +2292,17 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
{
struct ipmr_mfc_iter *it = seq->private;
- struct net *net = seq_file_net(seq);
+ struct mr_table *mrt = it->mrt;
- if (it->cache == &mfc_unres_queue)
+ if (it->cache == &mrt->mfc_unres_queue)
spin_unlock_bh(&mfc_unres_lock);
- else if (it->cache == net->ipv4.mfc_cache_array)
+ else if (it->cache == &mrt->mfc_cache_array[it->ct])
read_unlock(&mrt_lock);
}
static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
{
int n;
- struct net *net = seq_file_net(seq);
if (v == SEQ_START_TOKEN) {
seq_puts(seq,
@@ -1902,20 +2310,21 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
} else {
const struct mfc_cache *mfc = v;
const struct ipmr_mfc_iter *it = seq->private;
+ const struct mr_table *mrt = it->mrt;
- seq_printf(seq, "%08lX %08lX %-3hd",
- (unsigned long) mfc->mfc_mcastgrp,
- (unsigned long) mfc->mfc_origin,
+ seq_printf(seq, "%08X %08X %-3hd",
+ (__force u32) mfc->mfc_mcastgrp,
+ (__force u32) mfc->mfc_origin,
mfc->mfc_parent);
- if (it->cache != &mfc_unres_queue) {
+ if (it->cache != &mrt->mfc_unres_queue) {
seq_printf(seq, " %8lu %8lu %8lu",
mfc->mfc_un.res.pkt,
mfc->mfc_un.res.bytes,
mfc->mfc_un.res.wrong_if);
for (n = mfc->mfc_un.res.minvif;
n < mfc->mfc_un.res.maxvif; n++ ) {
- if (VIF_EXISTS(net, n) &&
+ if (VIF_EXISTS(mrt, n) &&
mfc->mfc_un.res.ttls[n] < 255)
seq_printf(seq,
" %2d:%-3d",
@@ -1967,27 +2376,11 @@ static const struct net_protocol pim_protocol = {
*/
static int __net_init ipmr_net_init(struct net *net)
{
- int err = 0;
+ int err;
- net->ipv4.vif_table = kcalloc(MAXVIFS, sizeof(struct vif_device),
- GFP_KERNEL);
- if (!net->ipv4.vif_table) {
- err = -ENOMEM;
+ err = ipmr_rules_init(net);
+ if (err < 0)
goto fail;
- }
-
- /* Forwarding cache */
- net->ipv4.mfc_cache_array = kcalloc(MFC_LINES,
- sizeof(struct mfc_cache *),
- GFP_KERNEL);
- if (!net->ipv4.mfc_cache_array) {
- err = -ENOMEM;
- goto fail_mfc_cache;
- }
-
-#ifdef CONFIG_IP_PIMSM
- net->ipv4.mroute_reg_vif_num = -1;
-#endif
#ifdef CONFIG_PROC_FS
err = -ENOMEM;
@@ -2002,10 +2395,8 @@ static int __net_init ipmr_net_init(struct net *net)
proc_cache_fail:
proc_net_remove(net, "ip_mr_vif");
proc_vif_fail:
- kfree(net->ipv4.mfc_cache_array);
+ ipmr_rules_exit(net);
#endif
-fail_mfc_cache:
- kfree(net->ipv4.vif_table);
fail:
return err;
}
@@ -2016,8 +2407,7 @@ static void __net_exit ipmr_net_exit(struct net *net)
proc_net_remove(net, "ip_mr_cache");
proc_net_remove(net, "ip_mr_vif");
#endif
- kfree(net->ipv4.mfc_cache_array);
- kfree(net->ipv4.vif_table);
+ ipmr_rules_exit(net);
}
static struct pernet_operations ipmr_net_ops = {
@@ -2040,7 +2430,6 @@ int __init ip_mr_init(void)
if (err)
goto reg_pernet_fail;
- setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
err = register_netdevice_notifier(&ip_mr_notifier);
if (err)
goto reg_notif_fail;
@@ -2051,6 +2440,7 @@ int __init ip_mr_init(void)
goto add_proto_fail;
}
#endif
+ rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, NULL, ipmr_rtm_dumproute);
return 0;
#ifdef CONFIG_IP_PIMSM_V2
@@ -2058,7 +2448,6 @@ add_proto_fail:
unregister_netdevice_notifier(&ip_mr_notifier);
#endif
reg_notif_fail:
- del_timer(&ipmr_expire_timer);
unregister_pernet_subsys(&ipmr_net_ops);
reg_pernet_fail:
kmem_cache_destroy(mrt_cachep);
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index c14623fc4d5..d88a46c54fd 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -4,6 +4,7 @@
#include <linux/netfilter_ipv4.h>
#include <linux/ip.h>
#include <linux/skbuff.h>
+#include <linux/gfp.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <net/ip.h>
@@ -16,7 +17,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
struct flowi fl = {};
- struct dst_entry *odst;
+ unsigned long orefdst;
unsigned int hh_len;
unsigned int type;
@@ -42,7 +43,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
/* Drop old route. */
skb_dst_drop(skb);
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
} else {
/* non-local src, find valid iif to satisfy
* rp-filter when calling ip_route_input. */
@@ -50,14 +51,14 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
if (ip_route_output_key(net, &rt, &fl) != 0)
return -1;
- odst = skb_dst(skb);
+ orefdst = skb->_skb_refdst;
if (ip_route_input(skb, iph->daddr, iph->saddr,
- RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
- dst_release(&rt->u.dst);
+ RT_TOS(iph->tos), rt->dst.dev) != 0) {
+ dst_release(&rt->dst);
return -1;
}
- dst_release(&rt->u.dst);
- dst_release(odst);
+ dst_release(&rt->dst);
+ refdst_drop(orefdst);
}
if (skb_dst(skb)->error)
@@ -211,9 +212,7 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol,
skb->len - dataoff, 0);
skb->ip_summed = CHECKSUM_NONE;
- csum = __skb_checksum_complete_head(skb, dataoff + len);
- if (!csum)
- skb->ip_summed = CHECKSUM_UNNECESSARY;
+ return __skb_checksum_complete_head(skb, dataoff + len);
}
return csum;
}
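The switch from saving a struct dst_entry * to saving the raw skb->_skb_refdst matters because an skb may now carry a noref dst, flagged in the low bit of that word: round-tripping through skb_dst()/dst_release() would lose the flag and drop a reference that was never taken. refdst_drop() releases only if a reference is actually held. A sketch of the save/restore idiom under that assumption (reroute() is a hypothetical stand-in for the ip_route_input() call above):

	unsigned long orefdst = skb->_skb_refdst; /* raw word, keeps noref bit */

	if (reroute(skb) != 0)
		return -1;	/* failure: skb->_skb_refdst is untouched */

	/* Success: a new dst was installed; drop the old one, honouring
	 * the noref bit that a plain dst_release() would not see. */
	refdst_drop(orefdst);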
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index f07d77f6575..6bccba31d13 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -49,12 +49,7 @@ MODULE_DESCRIPTION("arptables core");
#endif
#ifdef CONFIG_NETFILTER_DEBUG
-#define ARP_NF_ASSERT(x) \
-do { \
- if (!(x)) \
- printk("ARP_NF_ASSERT: %s:%s:%u\n", \
- __func__, __FILE__, __LINE__); \
-} while(0)
+#define ARP_NF_ASSERT(x) WARN_ON(!(x))
#else
#define ARP_NF_ASSERT(x)
#endif
@@ -224,10 +219,10 @@ static inline int arp_checkentry(const struct arpt_arp *arp)
}
static unsigned int
-arpt_error(struct sk_buff *skb, const struct xt_target_param *par)
+arpt_error(struct sk_buff *skb, const struct xt_action_param *par)
{
if (net_ratelimit())
- printk("arp_tables: error: '%s'\n",
+ pr_err("arp_tables: error: '%s'\n",
(const char *)par->targinfo);
return NF_DROP;
@@ -260,12 +255,11 @@ unsigned int arpt_do_table(struct sk_buff *skb,
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
unsigned int verdict = NF_DROP;
const struct arphdr *arp;
- bool hotdrop = false;
struct arpt_entry *e, *back;
const char *indev, *outdev;
void *table_base;
const struct xt_table_info *private;
- struct xt_target_param tgpar;
+ struct xt_action_param acpar;
if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
return NF_DROP;
@@ -280,24 +274,22 @@ unsigned int arpt_do_table(struct sk_buff *skb,
e = get_entry(table_base, private->hook_entry[hook]);
back = get_entry(table_base, private->underflow[hook]);
- tgpar.in = in;
- tgpar.out = out;
- tgpar.hooknum = hook;
- tgpar.family = NFPROTO_ARP;
+ acpar.in = in;
+ acpar.out = out;
+ acpar.hooknum = hook;
+ acpar.family = NFPROTO_ARP;
+ acpar.hotdrop = false;
arp = arp_hdr(skb);
do {
const struct arpt_entry_target *t;
- int hdr_len;
if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
e = arpt_next_entry(e);
continue;
}
- hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
- (2 * skb->dev->addr_len);
- ADD_COUNTER(e->counters, hdr_len, 1);
+ ADD_COUNTER(e->counters, arp_hdr_len(skb->dev), 1);
t = arpt_get_target_c(e);
@@ -333,9 +325,9 @@ unsigned int arpt_do_table(struct sk_buff *skb,
/* Targets which reenter must return
* abs. verdicts
*/
- tgpar.target = t->u.kernel.target;
- tgpar.targinfo = t->data;
- verdict = t->u.kernel.target->target(skb, &tgpar);
+ acpar.target = t->u.kernel.target;
+ acpar.targinfo = t->data;
+ verdict = t->u.kernel.target->target(skb, &acpar);
/* Target might have changed stuff. */
arp = arp_hdr(skb);
@@ -345,10 +337,10 @@ unsigned int arpt_do_table(struct sk_buff *skb,
else
/* Verdict */
break;
- } while (!hotdrop);
+ } while (!acpar.hotdrop);
xt_info_rdunlock_bh();
- if (hotdrop)
+ if (acpar.hotdrop)
return NF_DROP;
else
return verdict;
@@ -390,7 +382,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
int visited = e->comefrom & (1 << hook);
if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) {
- printk("arptables: loop hook %u pos %u %08X.\n",
+ pr_notice("arptables: loop hook %u pos %u %08X.\n",
hook, pos, e->comefrom);
return 0;
}
@@ -523,13 +515,11 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
return ret;
t = arpt_get_target(e);
- target = try_then_request_module(xt_find_target(NFPROTO_ARP,
- t->u.user.name,
- t->u.user.revision),
- "arpt_%s", t->u.user.name);
- if (IS_ERR(target) || !target) {
+ target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
+ t->u.user.revision);
+ if (IS_ERR(target)) {
duprintf("find_check_entry: `%s' not found\n", t->u.user.name);
- ret = target ? PTR_ERR(target) : -ENOENT;
+ ret = PTR_ERR(target);
goto out;
}
t->u.kernel.target = target;
@@ -651,6 +641,9 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
if (ret != 0)
break;
++i;
+ if (strcmp(arpt_get_target(iter)->u.user.name,
+ XT_ERROR_TARGET) == 0)
+ ++newinfo->stacksize;
}
duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
if (ret != 0)
@@ -717,7 +710,7 @@ static void get_counters(const struct xt_table_info *t,
struct arpt_entry *iter;
unsigned int cpu;
unsigned int i;
- unsigned int curcpu;
+ unsigned int curcpu = get_cpu();
/* Instead of clearing (by a previous call to memset())
* the counters and using adds, we set the counters
@@ -727,14 +720,16 @@ static void get_counters(const struct xt_table_info *t,
* if new softirq were to run and call ipt_do_table
*/
local_bh_disable();
- curcpu = smp_processor_id();
-
i = 0;
xt_entry_foreach(iter, t->entries[curcpu], t->size) {
SET_COUNTER(counters[i], iter->counters.bcnt,
iter->counters.pcnt);
++i;
}
+ local_bh_enable();
+ /* When processing counters from the other cpus, we can leave the
+ * bottom halves enabled (preemption is still disabled).
+ */
for_each_possible_cpu(cpu) {
if (cpu == curcpu)
@@ -748,7 +743,7 @@ static void get_counters(const struct xt_table_info *t,
}
xt_info_wrunlock(cpu);
}
- local_bh_enable();
+ put_cpu();
}
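The rewritten get_counters() narrows each protection to what it actually guards: bottom halves only have to stay off while reading this cpu's counters (a local softirq could otherwise update them mid-read), while other cpus' counters are serialized by xt_info_wrlock(); get_cpu()/put_cpu() keeps preemption off so curcpu stays valid throughout. The recipe, with hypothetical read helpers:

	unsigned int curcpu = get_cpu();	/* disables preemption */

	local_bh_disable();
	read_local_counters(curcpu);		/* racing softirq shut out */
	local_bh_enable();

	for_each_possible_cpu(cpu) {
		if (cpu == curcpu)
			continue;
		xt_info_wrlock(cpu);		/* excludes that cpu's writers */
		read_remote_counters(cpu);
		xt_info_wrunlock(cpu);
	}
	put_cpu();				/* re-enables preemption */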
static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -762,7 +757,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
* about).
*/
countersize = sizeof(struct xt_counters) * private->number;
- counters = vmalloc_node(countersize, numa_node_id());
+ counters = vmalloc(countersize);
if (counters == NULL)
return ERR_PTR(-ENOMEM);
@@ -1009,8 +1004,7 @@ static int __do_replace(struct net *net, const char *name,
struct arpt_entry *iter;
ret = 0;
- counters = vmalloc_node(num_counters * sizeof(struct xt_counters),
- numa_node_id());
+ counters = vmalloc(num_counters * sizeof(struct xt_counters));
if (!counters) {
ret = -ENOMEM;
goto out;
@@ -1163,7 +1157,7 @@ static int do_add_counters(struct net *net, const void __user *user,
if (len != size + num_counters * sizeof(struct xt_counters))
return -EINVAL;
- paddc = vmalloc_node(len - size, numa_node_id());
+ paddc = vmalloc(len - size);
if (!paddc)
return -ENOMEM;
@@ -1252,14 +1246,12 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
entry_offset = (void *)e - (void *)base;
t = compat_arpt_get_target(e);
- target = try_then_request_module(xt_find_target(NFPROTO_ARP,
- t->u.user.name,
- t->u.user.revision),
- "arpt_%s", t->u.user.name);
- if (IS_ERR(target) || !target) {
+ target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
+ t->u.user.revision);
+ if (IS_ERR(target)) {
duprintf("check_compat_entry_size_and_hooks: `%s' not found\n",
t->u.user.name);
- ret = target ? PTR_ERR(target) : -ENOENT;
+ ret = PTR_ERR(target);
goto out;
}
t->u.kernel.target = target;
@@ -1778,8 +1770,7 @@ struct xt_table *arpt_register_table(struct net *net,
{
int ret;
struct xt_table_info *newinfo;
- struct xt_table_info bootstrap
- = { 0, 0, 0, { 0 }, { 0 }, { } };
+ struct xt_table_info bootstrap = {0};
void *loc_cpu_entry;
struct xt_table *new_table;
@@ -1830,22 +1821,23 @@ void arpt_unregister_table(struct xt_table *table)
}
/* The built-in targets: standard (NULL) and error. */
-static struct xt_target arpt_standard_target __read_mostly = {
- .name = ARPT_STANDARD_TARGET,
- .targetsize = sizeof(int),
- .family = NFPROTO_ARP,
+static struct xt_target arpt_builtin_tg[] __read_mostly = {
+ {
+ .name = ARPT_STANDARD_TARGET,
+ .targetsize = sizeof(int),
+ .family = NFPROTO_ARP,
#ifdef CONFIG_COMPAT
- .compatsize = sizeof(compat_int_t),
- .compat_from_user = compat_standard_from_user,
- .compat_to_user = compat_standard_to_user,
+ .compatsize = sizeof(compat_int_t),
+ .compat_from_user = compat_standard_from_user,
+ .compat_to_user = compat_standard_to_user,
#endif
-};
-
-static struct xt_target arpt_error_target __read_mostly = {
- .name = ARPT_ERROR_TARGET,
- .target = arpt_error,
- .targetsize = ARPT_FUNCTION_MAXNAMELEN,
- .family = NFPROTO_ARP,
+ },
+ {
+ .name = ARPT_ERROR_TARGET,
+ .target = arpt_error,
+ .targetsize = ARPT_FUNCTION_MAXNAMELEN,
+ .family = NFPROTO_ARP,
+ },
};
static struct nf_sockopt_ops arpt_sockopts = {
@@ -1889,12 +1881,9 @@ static int __init arp_tables_init(void)
goto err1;
/* No one else will be downing sem now, so we won't sleep */
- ret = xt_register_target(&arpt_standard_target);
+ ret = xt_register_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
if (ret < 0)
goto err2;
- ret = xt_register_target(&arpt_error_target);
- if (ret < 0)
- goto err3;
/* Register setsockopt */
ret = nf_register_sockopt(&arpt_sockopts);
@@ -1905,9 +1894,7 @@ static int __init arp_tables_init(void)
return 0;
err4:
- xt_unregister_target(&arpt_error_target);
-err3:
- xt_unregister_target(&arpt_standard_target);
+ xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
err2:
unregister_pernet_subsys(&arp_tables_net_ops);
err1:
@@ -1917,8 +1904,7 @@ err1:
static void __exit arp_tables_fini(void)
{
nf_unregister_sockopt(&arpt_sockopts);
- xt_unregister_target(&arpt_error_target);
- xt_unregister_target(&arpt_standard_target);
+ xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg));
unregister_pernet_subsys(&arp_tables_net_ops);
}
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index b0d5b1d0a76..e1be7dd1171 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -9,7 +9,7 @@ MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
MODULE_DESCRIPTION("arptables arp payload mangle target");
static unsigned int
-target(struct sk_buff *skb, const struct xt_target_param *par)
+target(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct arpt_mangle *mangle = par->targinfo;
const struct arphdr *arp;
@@ -54,7 +54,7 @@ target(struct sk_buff *skb, const struct xt_target_param *par)
return mangle->target;
}
-static bool checkentry(const struct xt_tgchk_param *par)
+static int checkentry(const struct xt_tgchk_param *par)
{
const struct arpt_mangle *mangle = par->targinfo;
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index bfe26f32b93..79ca5e70d49 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_arp/arp_tables.h>
+#include <linux/slab.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index 2855f1f38cb..d2c1311cb28 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -26,6 +26,7 @@
#include <linux/security.h>
#include <linux/net.h>
#include <linux/mutex.h>
+#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/route.h>
@@ -41,7 +42,7 @@ typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long);
static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE;
static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT;
-static DEFINE_RWLOCK(queue_lock);
+static DEFINE_SPINLOCK(queue_lock);
static int peer_pid __read_mostly;
static unsigned int copy_range __read_mostly;
static unsigned int queue_total;
@@ -71,10 +72,10 @@ __ipq_set_mode(unsigned char mode, unsigned int range)
break;
case IPQ_COPY_PACKET:
- copy_mode = mode;
+ if (range > 0xFFFF)
+ range = 0xFFFF;
copy_range = range;
- if (copy_range > 0xFFFF)
- copy_range = 0xFFFF;
+ copy_mode = mode;
break;
default:
@@ -100,7 +101,7 @@ ipq_find_dequeue_entry(unsigned long id)
{
struct nf_queue_entry *entry = NULL, *i;
- write_lock_bh(&queue_lock);
+ spin_lock_bh(&queue_lock);
list_for_each_entry(i, &queue_list, list) {
if ((unsigned long)i == id) {
@@ -114,7 +115,7 @@ ipq_find_dequeue_entry(unsigned long id)
queue_total--;
}
- write_unlock_bh(&queue_lock);
+ spin_unlock_bh(&queue_lock);
return entry;
}
@@ -135,9 +136,9 @@ __ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
static void
ipq_flush(ipq_cmpfn cmpfn, unsigned long data)
{
- write_lock_bh(&queue_lock);
+ spin_lock_bh(&queue_lock);
__ipq_flush(cmpfn, data);
- write_unlock_bh(&queue_lock);
+ spin_unlock_bh(&queue_lock);
}
static struct sk_buff *
@@ -151,37 +152,29 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
struct nlmsghdr *nlh;
struct timeval tv;
- read_lock_bh(&queue_lock);
-
- switch (copy_mode) {
+ switch (ACCESS_ONCE(copy_mode)) {
case IPQ_COPY_META:
case IPQ_COPY_NONE:
size = NLMSG_SPACE(sizeof(*pmsg));
break;
case IPQ_COPY_PACKET:
- if ((entry->skb->ip_summed == CHECKSUM_PARTIAL ||
- entry->skb->ip_summed == CHECKSUM_COMPLETE) &&
- (*errp = skb_checksum_help(entry->skb))) {
- read_unlock_bh(&queue_lock);
+ if (entry->skb->ip_summed == CHECKSUM_PARTIAL &&
+ (*errp = skb_checksum_help(entry->skb)))
return NULL;
- }
- if (copy_range == 0 || copy_range > entry->skb->len)
+
+ data_len = ACCESS_ONCE(copy_range);
+ if (data_len == 0 || data_len > entry->skb->len)
data_len = entry->skb->len;
- else
- data_len = copy_range;
size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
break;
default:
*errp = -EINVAL;
- read_unlock_bh(&queue_lock);
return NULL;
}
- read_unlock_bh(&queue_lock);
-
skb = alloc_skb(size, GFP_ATOMIC);
if (!skb)
goto nlmsg_failure;
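Dropping queue_lock from this path works because the reader needs only one self-consistent snapshot of each variable, and ACCESS_ONCE() forbids the compiler from re-reading (or splitting) the load. Note the reordering in __ipq_set_mode() above, which now publishes copy_range before copy_mode so a reader that sees IPQ_COPY_PACKET also sees a clamped range. Sketch of the reader side:

	unsigned char mode  = ACCESS_ONCE(copy_mode);	/* one load, no refetch */
	unsigned int  range = ACCESS_ONCE(copy_range);

	/* Use only these snapshots from here on: re-reading the globals
	 * mid-function could mix the mode of one update with the range
	 * of another. */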
@@ -242,7 +235,7 @@ ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
if (nskb == NULL)
return status;
- write_lock_bh(&queue_lock);
+ spin_lock_bh(&queue_lock);
if (!peer_pid)
goto err_out_free_nskb;
@@ -266,14 +259,14 @@ ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
__ipq_enqueue_entry(entry);
- write_unlock_bh(&queue_lock);
+ spin_unlock_bh(&queue_lock);
return status;
err_out_free_nskb:
kfree_skb(nskb);
err_out_unlock:
- write_unlock_bh(&queue_lock);
+ spin_unlock_bh(&queue_lock);
return status;
}
@@ -342,9 +335,9 @@ ipq_set_mode(unsigned char mode, unsigned int range)
{
int status;
- write_lock_bh(&queue_lock);
+ spin_lock_bh(&queue_lock);
status = __ipq_set_mode(mode, range);
- write_unlock_bh(&queue_lock);
+ spin_unlock_bh(&queue_lock);
return status;
}
@@ -440,11 +433,11 @@ __ipq_rcv_skb(struct sk_buff *skb)
if (security_netlink_recv(skb, CAP_NET_ADMIN))
RCV_SKB_FAIL(-EPERM);
- write_lock_bh(&queue_lock);
+ spin_lock_bh(&queue_lock);
if (peer_pid) {
if (peer_pid != pid) {
- write_unlock_bh(&queue_lock);
+ spin_unlock_bh(&queue_lock);
RCV_SKB_FAIL(-EBUSY);
}
} else {
@@ -452,7 +445,7 @@ __ipq_rcv_skb(struct sk_buff *skb)
peer_pid = pid;
}
- write_unlock_bh(&queue_lock);
+ spin_unlock_bh(&queue_lock);
status = ipq_receive_peer(NLMSG_DATA(nlh), type,
nlmsglen - NLMSG_LENGTH(0));
@@ -461,7 +454,6 @@ __ipq_rcv_skb(struct sk_buff *skb)
if (flags & NLM_F_ACK)
netlink_ack(skb, nlh, 0);
- return;
}
static void
@@ -498,10 +490,10 @@ ipq_rcv_nl_event(struct notifier_block *this,
struct netlink_notify *n = ptr;
if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) {
- write_lock_bh(&queue_lock);
+ spin_lock_bh(&queue_lock);
if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid))
__ipq_reset();
- write_unlock_bh(&queue_lock);
+ spin_unlock_bh(&queue_lock);
}
return NOTIFY_DONE;
}
@@ -528,7 +520,7 @@ static ctl_table ipq_table[] = {
#ifdef CONFIG_PROC_FS
static int ip_queue_show(struct seq_file *m, void *v)
{
- read_lock_bh(&queue_lock);
+ spin_lock_bh(&queue_lock);
seq_printf(m,
"Peer PID : %d\n"
@@ -546,7 +538,7 @@ static int ip_queue_show(struct seq_file *m, void *v)
queue_dropped,
queue_user_dropped);
- read_unlock_bh(&queue_lock);
+ spin_unlock_bh(&queue_lock);
return 0;
}
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index b29c66df8d1..c439721b165 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -39,24 +39,19 @@ MODULE_DESCRIPTION("IPv4 packet filter");
/*#define DEBUG_IP_FIREWALL_USER*/
#ifdef DEBUG_IP_FIREWALL
-#define dprintf(format, args...) printk(format , ## args)
+#define dprintf(format, args...) pr_info(format , ## args)
#else
#define dprintf(format, args...)
#endif
#ifdef DEBUG_IP_FIREWALL_USER
-#define duprintf(format, args...) printk(format , ## args)
+#define duprintf(format, args...) pr_info(format , ## args)
#else
#define duprintf(format, args...)
#endif
#ifdef CONFIG_NETFILTER_DEBUG
-#define IP_NF_ASSERT(x) \
-do { \
- if (!(x)) \
- printk("IP_NF_ASSERT: %s:%s:%u\n", \
- __func__, __FILE__, __LINE__); \
-} while(0)
+#define IP_NF_ASSERT(x) WARN_ON(!(x))
#else
#define IP_NF_ASSERT(x)
#endif
@@ -165,30 +160,14 @@ ip_checkentry(const struct ipt_ip *ip)
}
static unsigned int
-ipt_error(struct sk_buff *skb, const struct xt_target_param *par)
+ipt_error(struct sk_buff *skb, const struct xt_action_param *par)
{
if (net_ratelimit())
- printk("ip_tables: error: `%s'\n",
- (const char *)par->targinfo);
+ pr_info("error: `%s'\n", (const char *)par->targinfo);
return NF_DROP;
}
-/* Performance critical - called for every packet */
-static inline bool
-do_match(const struct ipt_entry_match *m, const struct sk_buff *skb,
- struct xt_match_param *par)
-{
- par->match = m->u.kernel.match;
- par->matchinfo = m->data;
-
- /* Stop iteration if it doesn't match */
- if (!m->u.kernel.match->match(skb, par))
- return true;
- else
- return false;
-}
-
/* Performance critical */
static inline struct ipt_entry *
get_entry(const void *base, unsigned int offset)
@@ -322,19 +301,16 @@ ipt_do_table(struct sk_buff *skb,
const struct net_device *out,
struct xt_table *table)
{
-#define tb_comefrom ((struct ipt_entry *)table_base)->comefrom
-
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
const struct iphdr *ip;
- bool hotdrop = false;
/* Initializing verdict to NF_DROP keeps gcc happy. */
unsigned int verdict = NF_DROP;
const char *indev, *outdev;
const void *table_base;
- struct ipt_entry *e, *back;
+ struct ipt_entry *e, **jumpstack;
+ unsigned int *stackptr, origptr, cpu;
const struct xt_table_info *private;
- struct xt_match_param mtpar;
- struct xt_target_param tgpar;
+ struct xt_action_param acpar;
/* Initialization */
ip = ip_hdr(skb);
@@ -346,42 +322,49 @@ ipt_do_table(struct sk_buff *skb,
* things we don't know, ie. tcp syn flag or ports). If the
* rule is also a fragment-specific rule, non-fragments won't
* match it. */
- mtpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
- mtpar.thoff = ip_hdrlen(skb);
- mtpar.hotdrop = &hotdrop;
- mtpar.in = tgpar.in = in;
- mtpar.out = tgpar.out = out;
- mtpar.family = tgpar.family = NFPROTO_IPV4;
- mtpar.hooknum = tgpar.hooknum = hook;
+ acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
+ acpar.thoff = ip_hdrlen(skb);
+ acpar.hotdrop = false;
+ acpar.in = in;
+ acpar.out = out;
+ acpar.family = NFPROTO_IPV4;
+ acpar.hooknum = hook;
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
xt_info_rdlock_bh();
private = table->private;
- table_base = private->entries[smp_processor_id()];
+ cpu = smp_processor_id();
+ table_base = private->entries[cpu];
+ jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
+ stackptr = per_cpu_ptr(private->stackptr, cpu);
+ origptr = *stackptr;
e = get_entry(table_base, private->hook_entry[hook]);
- /* For return from builtin chain */
- back = get_entry(table_base, private->underflow[hook]);
+ pr_debug("Entering %s(hook %u); sp at %u (UF %p)\n",
+ table->name, hook, origptr,
+ get_entry(table_base, private->underflow[hook]));
do {
const struct ipt_entry_target *t;
const struct xt_entry_match *ematch;
IP_NF_ASSERT(e);
- IP_NF_ASSERT(back);
if (!ip_packet_match(ip, indev, outdev,
- &e->ip, mtpar.fragoff)) {
+ &e->ip, acpar.fragoff)) {
no_match:
e = ipt_next_entry(e);
continue;
}
- xt_ematch_foreach(ematch, e)
- if (do_match(ematch, skb, &mtpar) != 0)
+ xt_ematch_foreach(ematch, e) {
+ acpar.match = ematch->u.kernel.match;
+ acpar.matchinfo = ematch->data;
+ if (!acpar.match->match(skb, &acpar))
goto no_match;
+ }
- ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
+ ADD_COUNTER(e->counters, skb->len, 1);
t = ipt_get_target(e);
IP_NF_ASSERT(t->u.kernel.target);
@@ -404,41 +387,38 @@ ipt_do_table(struct sk_buff *skb,
verdict = (unsigned)(-v) - 1;
break;
}
- e = back;
- back = get_entry(table_base, back->comefrom);
+ if (*stackptr == 0) {
+ e = get_entry(table_base,
+ private->underflow[hook]);
+ pr_debug("Underflow (this is normal) "
+ "to %p\n", e);
+ } else {
+ e = jumpstack[--*stackptr];
+ pr_debug("Pulled %p out from pos %u\n",
+ e, *stackptr);
+ e = ipt_next_entry(e);
+ }
continue;
}
if (table_base + v != ipt_next_entry(e) &&
!(e->ip.flags & IPT_F_GOTO)) {
- /* Save old back ptr in next entry */
- struct ipt_entry *next = ipt_next_entry(e);
- next->comefrom = (void *)back - table_base;
- /* set back pointer to next entry */
- back = next;
+ if (*stackptr >= private->stacksize) {
+ verdict = NF_DROP;
+ break;
+ }
+ jumpstack[(*stackptr)++] = e;
+ pr_debug("Pushed %p into pos %u\n",
+ e, *stackptr - 1);
}
e = get_entry(table_base, v);
continue;
}
- /* Targets which reenter must return
- abs. verdicts */
- tgpar.target = t->u.kernel.target;
- tgpar.targinfo = t->data;
-
+ acpar.target = t->u.kernel.target;
+ acpar.targinfo = t->data;
-#ifdef CONFIG_NETFILTER_DEBUG
- tb_comefrom = 0xeeeeeeec;
-#endif
- verdict = t->u.kernel.target->target(skb, &tgpar);
-#ifdef CONFIG_NETFILTER_DEBUG
- if (tb_comefrom != 0xeeeeeeec && verdict == IPT_CONTINUE) {
- printk("Target %s reentered!\n",
- t->u.kernel.target->name);
- verdict = NF_DROP;
- }
- tb_comefrom = 0x57acc001;
-#endif
+ verdict = t->u.kernel.target->target(skb, &acpar);
/* Target might have changed stuff. */
ip = ip_hdr(skb);
if (verdict == IPT_CONTINUE)
@@ -446,18 +426,18 @@ ipt_do_table(struct sk_buff *skb,
else
/* Verdict */
break;
- } while (!hotdrop);
+ } while (!acpar.hotdrop);
xt_info_rdunlock_bh();
-
+ pr_debug("Exiting %s; resetting sp from %u to %u\n",
+ __func__, *stackptr, origptr);
+ *stackptr = origptr;
#ifdef DEBUG_ALLOW_ALL
return NF_ACCEPT;
#else
- if (hotdrop)
+ if (acpar.hotdrop)
return NF_DROP;
else return verdict;
#endif
-
-#undef tb_comefrom
}
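The net effect of this hunk: the old scheme threaded return addresses through ->comefrom in the shared rule blob, which is why targets that re-entered the table were forbidden; the new scheme keeps an explicit per-cpu stack of saved positions, sized at translate time via newinfo->stacksize. Condensed, the jump handling now reads (underflow_e standing for the entry fetched via private->underflow[hook]):

	if (v < 0) {				/* RETURN from a user chain */
		if (*stackptr == 0)
			e = underflow_e;	/* back in the built-in chain */
		else
			e = ipt_next_entry(jumpstack[--*stackptr]);
	} else {				/* jump into a user chain */
		if (*stackptr >= private->stacksize)
			return NF_DROP;		/* would overflow: drop */
		jumpstack[(*stackptr)++] = e;	/* remember the call site */
		e = get_entry(table_base, v);
	}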
/* Figures out from what hook each rule can be called: returns 0 if
@@ -486,7 +466,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
int visited = e->comefrom & (1 << hook);
if (e->comefrom & (1 << NF_INET_NUMHOOKS)) {
- printk("iptables: loop hook %u pos %u %08X.\n",
+ pr_err("iptables: loop hook %u pos %u %08X.\n",
hook, pos, e->comefrom);
return 0;
}
@@ -591,7 +571,7 @@ check_entry(const struct ipt_entry *e, const char *name)
const struct ipt_entry_target *t;
if (!ip_checkentry(&e->ip)) {
- duprintf("ip_tables: ip check failed %p %s.\n", e, name);
+ duprintf("ip check failed %p %s.\n", e, par->match->name);
return -EINVAL;
}
@@ -618,8 +598,7 @@ check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
ret = xt_check_match(par, m->u.match_size - sizeof(*m),
ip->proto, ip->invflags & IPT_INV_PROTO);
if (ret < 0) {
- duprintf("ip_tables: check failed for `%s'.\n",
- par.match->name);
+ duprintf("check failed for `%s'.\n", par->match->name);
return ret;
}
return 0;
@@ -631,12 +610,11 @@ find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
struct xt_match *match;
int ret;
- match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name,
- m->u.user.revision),
- "ipt_%s", m->u.user.name);
- if (IS_ERR(match) || !match) {
+ match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name,
+ m->u.user.revision);
+ if (IS_ERR(match)) {
duprintf("find_check_match: `%s' not found\n", m->u.user.name);
- return match ? PTR_ERR(match) : -ENOENT;
+ return PTR_ERR(match);
}
m->u.kernel.match = match;
@@ -667,7 +645,7 @@ static int check_target(struct ipt_entry *e, struct net *net, const char *name)
ret = xt_check_target(&par, t->u.target_size - sizeof(*t),
e->ip.proto, e->ip.invflags & IPT_INV_PROTO);
if (ret < 0) {
- duprintf("ip_tables: check failed for `%s'.\n",
+ duprintf("check failed for `%s'.\n",
t->u.kernel.target->name);
return ret;
}
@@ -703,13 +681,11 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
}
t = ipt_get_target(e);
- target = try_then_request_module(xt_find_target(AF_INET,
- t->u.user.name,
- t->u.user.revision),
- "ipt_%s", t->u.user.name);
- if (IS_ERR(target) || !target) {
+ target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name,
+ t->u.user.revision);
+ if (IS_ERR(target)) {
duprintf("find_check_entry: `%s' not found\n", t->u.user.name);
- ret = target ? PTR_ERR(target) : -ENOENT;
+ ret = PTR_ERR(target);
goto cleanup_matches;
}
t->u.kernel.target = target;
@@ -843,6 +819,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
if (ret != 0)
return ret;
++i;
+ if (strcmp(ipt_get_target(iter)->u.user.name,
+ XT_ERROR_TARGET) == 0)
+ ++newinfo->stacksize;
}
if (i != repl->num_entries) {
@@ -905,7 +884,7 @@ get_counters(const struct xt_table_info *t,
struct ipt_entry *iter;
unsigned int cpu;
unsigned int i;
- unsigned int curcpu;
+ unsigned int curcpu = get_cpu();
/* Instead of clearing (by a previous call to memset())
* the counters and using adds, we set the counters
@@ -915,14 +894,16 @@ get_counters(const struct xt_table_info *t,
* if new softirq were to run and call ipt_do_table
*/
local_bh_disable();
- curcpu = smp_processor_id();
-
i = 0;
xt_entry_foreach(iter, t->entries[curcpu], t->size) {
SET_COUNTER(counters[i], iter->counters.bcnt,
iter->counters.pcnt);
++i;
}
+ local_bh_enable();
+ /* When processing counters from the other cpus, we can leave the
+ * bottom halves enabled (preemption is still disabled).
+ */
for_each_possible_cpu(cpu) {
if (cpu == curcpu)
@@ -936,7 +917,7 @@ get_counters(const struct xt_table_info *t,
}
xt_info_wrunlock(cpu);
}
- local_bh_enable();
+ put_cpu();
}
static struct xt_counters *alloc_counters(const struct xt_table *table)
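
The rewritten get_counters() narrows the bottom-half-off window to the local cpu's snapshot, where a softirq could otherwise re-enter ipt_do_table() and update counters underneath us; counters on other cpus are instead serialized by that cpu's xt_info write lock, and get_cpu()/put_cpu() pin the task so curcpu stays valid throughout. A sketch of the resulting shape, assuming the surrounding kernel context:

        unsigned int curcpu = get_cpu();        /* disables preemption */

        local_bh_disable();
        /* local cpu: keep softirqs off while snapshotting */
        xt_entry_foreach(iter, t->entries[curcpu], t->size)
                SET_COUNTER(counters[i++], iter->counters.bcnt,
                            iter->counters.pcnt);
        local_bh_enable();

        for_each_possible_cpu(cpu) {
                if (cpu == curcpu)
                        continue;
                xt_info_wrlock(cpu);    /* excludes that cpu's updaters */
                /* ... ADD_COUNTER() walk, as in the hunk above ... */
                xt_info_wrunlock(cpu);
        }
        put_cpu();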
@@ -949,7 +930,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
(other than comefrom, which userspace doesn't care
about). */
countersize = sizeof(struct xt_counters) * private->number;
- counters = vmalloc_node(countersize, numa_node_id());
+ counters = vmalloc(countersize);
if (counters == NULL)
return ERR_PTR(-ENOMEM);
@@ -1311,7 +1292,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
if (ret != 0)
goto free_newinfo;
- duprintf("ip_tables: Translated table\n");
+ duprintf("Translated table\n");
ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
tmp.num_counters, tmp.counters);
@@ -1373,7 +1354,7 @@ do_add_counters(struct net *net, const void __user *user,
if (len != size + num_counters * sizeof(struct xt_counters))
return -EINVAL;
- paddc = vmalloc_node(len - size, numa_node_id());
+ paddc = vmalloc(len - size);
if (!paddc)
return -ENOMEM;
@@ -1476,13 +1457,12 @@ compat_find_calc_match(struct ipt_entry_match *m,
{
struct xt_match *match;
- match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name,
- m->u.user.revision),
- "ipt_%s", m->u.user.name);
- if (IS_ERR(match) || !match) {
+ match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name,
+ m->u.user.revision);
+ if (IS_ERR(match)) {
duprintf("compat_check_calc_match: `%s' not found\n",
m->u.user.name);
- return match ? PTR_ERR(match) : -ENOENT;
+ return PTR_ERR(match);
}
m->u.kernel.match = match;
*size += xt_compat_match_offset(match);
@@ -1549,14 +1529,12 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
}
t = compat_ipt_get_target(e);
- target = try_then_request_module(xt_find_target(AF_INET,
- t->u.user.name,
- t->u.user.revision),
- "ipt_%s", t->u.user.name);
- if (IS_ERR(target) || !target) {
+ target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name,
+ t->u.user.revision);
+ if (IS_ERR(target)) {
duprintf("check_compat_entry_size_and_hooks: `%s' not found\n",
t->u.user.name);
- ret = target ? PTR_ERR(target) : -ENOENT;
+ ret = PTR_ERR(target);
goto release_matches;
}
t->u.kernel.target = target;
@@ -2094,8 +2072,7 @@ struct xt_table *ipt_register_table(struct net *net,
{
int ret;
struct xt_table_info *newinfo;
- struct xt_table_info bootstrap
- = { 0, 0, 0, { 0 }, { 0 }, { } };
+ struct xt_table_info bootstrap = {0};
void *loc_cpu_entry;
struct xt_table *new_table;
@@ -2157,7 +2134,7 @@ icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
}
static bool
-icmp_match(const struct sk_buff *skb, const struct xt_match_param *par)
+icmp_match(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct icmphdr *ic;
struct icmphdr _icmph;
@@ -2173,7 +2150,7 @@ icmp_match(const struct sk_buff *skb, const struct xt_match_param *par)
* can't. Hence, no choice but to drop.
*/
duprintf("Dropping evil ICMP tinygram.\n");
- *par->hotdrop = true;
+ par->hotdrop = true;
return false;
}
@@ -2184,31 +2161,31 @@ icmp_match(const struct sk_buff *skb, const struct xt_match_param *par)
!!(icmpinfo->invflags&IPT_ICMP_INV));
}
-static bool icmp_checkentry(const struct xt_mtchk_param *par)
+static int icmp_checkentry(const struct xt_mtchk_param *par)
{
const struct ipt_icmp *icmpinfo = par->matchinfo;
/* Must specify no unknown invflags */
- return !(icmpinfo->invflags & ~IPT_ICMP_INV);
+ return (icmpinfo->invflags & ~IPT_ICMP_INV) ? -EINVAL : 0;
}
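
Both hunks track a pair of xtables API changes made in this series: the per-invocation state (hotdrop among it) moved into struct xt_action_param, which match functions now receive non-const so they can set par->hotdrop directly, and ->checkentry switched from bool to an errno-style int. A hedged sketch of a match written against the new API; struct foohdr, struct ipt_foo and FOO_INV_MASK are hypothetical stand-ins:

        static bool foo_mt(const struct sk_buff *skb,
                           struct xt_action_param *par)
        {
                struct foohdr _fh;
                const struct foohdr *fh;

                fh = skb_header_pointer(skb, par->thoff, sizeof(_fh), &_fh);
                if (fh == NULL) {
                        par->hotdrop = true;    /* was *par->hotdrop */
                        return false;
                }
                return true;
        }

        static int foo_mt_check(const struct xt_mtchk_param *par)
        {
                const struct ipt_foo *info = par->matchinfo;

                /* 0 on success, negative errno instead of true/false */
                return (info->invflags & ~FOO_INV_MASK) ? -EINVAL : 0;
        }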
-/* The built-in targets: standard (NULL) and error. */
-static struct xt_target ipt_standard_target __read_mostly = {
- .name = IPT_STANDARD_TARGET,
- .targetsize = sizeof(int),
- .family = NFPROTO_IPV4,
+static struct xt_target ipt_builtin_tg[] __read_mostly = {
+ {
+ .name = IPT_STANDARD_TARGET,
+ .targetsize = sizeof(int),
+ .family = NFPROTO_IPV4,
#ifdef CONFIG_COMPAT
- .compatsize = sizeof(compat_int_t),
- .compat_from_user = compat_standard_from_user,
- .compat_to_user = compat_standard_to_user,
+ .compatsize = sizeof(compat_int_t),
+ .compat_from_user = compat_standard_from_user,
+ .compat_to_user = compat_standard_to_user,
#endif
-};
-
-static struct xt_target ipt_error_target __read_mostly = {
- .name = IPT_ERROR_TARGET,
- .target = ipt_error,
- .targetsize = IPT_FUNCTION_MAXNAMELEN,
- .family = NFPROTO_IPV4,
+ },
+ {
+ .name = IPT_ERROR_TARGET,
+ .target = ipt_error,
+ .targetsize = IPT_FUNCTION_MAXNAMELEN,
+ .family = NFPROTO_IPV4,
+ },
};
static struct nf_sockopt_ops ipt_sockopts = {
@@ -2228,13 +2205,15 @@ static struct nf_sockopt_ops ipt_sockopts = {
.owner = THIS_MODULE,
};
-static struct xt_match icmp_matchstruct __read_mostly = {
- .name = "icmp",
- .match = icmp_match,
- .matchsize = sizeof(struct ipt_icmp),
- .checkentry = icmp_checkentry,
- .proto = IPPROTO_ICMP,
- .family = NFPROTO_IPV4,
+static struct xt_match ipt_builtin_mt[] __read_mostly = {
+ {
+ .name = "icmp",
+ .match = icmp_match,
+ .matchsize = sizeof(struct ipt_icmp),
+ .checkentry = icmp_checkentry,
+ .proto = IPPROTO_ICMP,
+ .family = NFPROTO_IPV4,
+ },
};
static int __net_init ip_tables_net_init(struct net *net)
@@ -2261,13 +2240,10 @@ static int __init ip_tables_init(void)
goto err1;
/* No one else will be downing sem now, so we won't sleep */
- ret = xt_register_target(&ipt_standard_target);
+ ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
if (ret < 0)
goto err2;
- ret = xt_register_target(&ipt_error_target);
- if (ret < 0)
- goto err3;
- ret = xt_register_match(&icmp_matchstruct);
+ ret = xt_register_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
if (ret < 0)
goto err4;
@@ -2276,15 +2252,13 @@ static int __init ip_tables_init(void)
if (ret < 0)
goto err5;
- printk(KERN_INFO "ip_tables: (C) 2000-2006 Netfilter Core Team\n");
+ pr_info("(C) 2000-2006 Netfilter Core Team\n");
return 0;
err5:
- xt_unregister_match(&icmp_matchstruct);
+ xt_unregister_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
err4:
- xt_unregister_target(&ipt_error_target);
-err3:
- xt_unregister_target(&ipt_standard_target);
+ xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
err2:
unregister_pernet_subsys(&ip_tables_net_ops);
err1:
@@ -2295,10 +2269,8 @@ static void __exit ip_tables_fini(void)
{
nf_unregister_sockopt(&ipt_sockopts);
- xt_unregister_match(&icmp_matchstruct);
- xt_unregister_target(&ipt_error_target);
- xt_unregister_target(&ipt_standard_target);
-
+ xt_unregister_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
+ xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
unregister_pernet_subsys(&ip_tables_net_ops);
}
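
Folding the standard and error targets into one array lets init/exit use the xt_register_targets()/xt_unregister_targets() pair (and the *_matches equivalents), which also simplifies the unwind: if registration fails partway through an array, the helper itself unregisters the entries it had already added, so one error label per array suffices. A sketch of the init tail under that assumption:

        ret = xt_register_targets(ipt_builtin_tg,
                                  ARRAY_SIZE(ipt_builtin_tg));
        if (ret < 0)            /* nothing from the array stays registered */
                goto err2;
        ret = xt_register_matches(ipt_builtin_mt,
                                  ARRAY_SIZE(ipt_builtin_mt));
        if (ret < 0)
                goto err4;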
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 0886f96c736..3a43cf36db8 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -9,11 +9,13 @@
* published by the Free Software Foundation.
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/jhash.h>
#include <linux/bitops.h>
#include <linux/skbuff.h>
+#include <linux/slab.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/udp.h>
@@ -51,12 +53,13 @@ struct clusterip_config {
#endif
enum clusterip_hashmode hash_mode; /* which hashing mode */
u_int32_t hash_initval; /* hash initialization */
+ struct rcu_head rcu;
};
static LIST_HEAD(clusterip_configs);
/* clusterip_lock protects the clusterip_configs list */
-static DEFINE_RWLOCK(clusterip_lock);
+static DEFINE_SPINLOCK(clusterip_lock);
#ifdef CONFIG_PROC_FS
static const struct file_operations clusterip_proc_fops;
@@ -69,11 +72,17 @@ clusterip_config_get(struct clusterip_config *c)
atomic_inc(&c->refcount);
}
+
+static void clusterip_config_rcu_free(struct rcu_head *head)
+{
+ kfree(container_of(head, struct clusterip_config, rcu));
+}
+
static inline void
clusterip_config_put(struct clusterip_config *c)
{
if (atomic_dec_and_test(&c->refcount))
- kfree(c);
+ call_rcu_bh(&c->rcu, clusterip_config_rcu_free);
}
/* decrease the count of entries using/referencing this config. If last
@@ -82,12 +91,13 @@ clusterip_config_put(struct clusterip_config *c)
static inline void
clusterip_config_entry_put(struct clusterip_config *c)
{
- write_lock_bh(&clusterip_lock);
- if (atomic_dec_and_test(&c->entries)) {
- list_del(&c->list);
- write_unlock_bh(&clusterip_lock);
+ local_bh_disable();
+ if (atomic_dec_and_lock(&c->entries, &clusterip_lock)) {
+ list_del_rcu(&c->list);
+ spin_unlock(&clusterip_lock);
+ local_bh_enable();
- dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0);
+ dev_mc_del(c->dev, c->clustermac);
dev_put(c->dev);
/* In case anyone still accesses the file, the open/close
@@ -98,7 +108,7 @@ clusterip_config_entry_put(struct clusterip_config *c)
#endif
return;
}
- write_unlock_bh(&clusterip_lock);
+ local_bh_enable();
}
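
clusterip_config_entry_put() now pairs atomic_dec_and_lock() with RCU list removal: the spinlock is taken only on the final put, and the unlinked object must not be kfree()d until readers walking the list under rcu_read_lock_bh() have finished, hence the call_rcu_bh() in clusterip_config_put() above. A generic sketch of the put side of this pattern with a hypothetical struct obj (CLUSTERIP itself splits the roles across two counters, entries and refcount):

        static LIST_HEAD(obj_list);
        static DEFINE_SPINLOCK(obj_lock);

        struct obj {
                __be32 key;
                struct list_head list;
                atomic_t refcount;
                struct rcu_head rcu;
        };

        static void obj_rcu_free(struct rcu_head *head)
        {
                kfree(container_of(head, struct obj, rcu));
        }

        static void obj_put(struct obj *o)
        {
                /* take the lock only when the count really hits zero */
                if (atomic_dec_and_lock(&o->refcount, &obj_lock)) {
                        list_del_rcu(&o->list);
                        spin_unlock(&obj_lock);
                        call_rcu_bh(&o->rcu, obj_rcu_free); /* deferred */
                }
        }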
static struct clusterip_config *
@@ -106,7 +116,7 @@ __clusterip_config_find(__be32 clusterip)
{
struct clusterip_config *c;
- list_for_each_entry(c, &clusterip_configs, list) {
+ list_for_each_entry_rcu(c, &clusterip_configs, list) {
if (c->clusterip == clusterip)
return c;
}
@@ -119,16 +129,15 @@ clusterip_config_find_get(__be32 clusterip, int entry)
{
struct clusterip_config *c;
- read_lock_bh(&clusterip_lock);
+ rcu_read_lock_bh();
c = __clusterip_config_find(clusterip);
- if (!c) {
- read_unlock_bh(&clusterip_lock);
- return NULL;
+ if (c) {
+ if (unlikely(!atomic_inc_not_zero(&c->refcount)))
+ c = NULL;
+ else if (entry)
+ atomic_inc(&c->entries);
}
- atomic_inc(&c->refcount);
- if (entry)
- atomic_inc(&c->entries);
- read_unlock_bh(&clusterip_lock);
+ rcu_read_unlock_bh();
return c;
}
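
The lookup side is the mirror image: a reader traversing the list under rcu_read_lock_bh() can race with the final put, so a plain atomic_inc() is no longer safe; atomic_inc_not_zero() refuses to resurrect an object whose count already hit zero and whose RCU-deferred free is pending. Continuing the hypothetical obj sketch from above:

        static struct obj *obj_find_get(__be32 key)
        {
                struct obj *o, *found = NULL;

                rcu_read_lock_bh();
                list_for_each_entry_rcu(o, &obj_list, list) {
                        if (o->key != key)
                                continue;
                        /* take a reference only while the count is live */
                        if (atomic_inc_not_zero(&o->refcount))
                                found = o;
                        break;
                }
                rcu_read_unlock_bh();
                return found;   /* NULL if absent or already dying */
        }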
@@ -179,9 +188,9 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
}
#endif
- write_lock_bh(&clusterip_lock);
- list_add(&c->list, &clusterip_configs);
- write_unlock_bh(&clusterip_lock);
+ spin_lock_bh(&clusterip_lock);
+ list_add_rcu(&c->list, &clusterip_configs);
+ spin_unlock_bh(&clusterip_lock);
return c;
}
@@ -238,8 +247,7 @@ clusterip_hashfn(const struct sk_buff *skb,
break;
default:
if (net_ratelimit())
- printk(KERN_NOTICE "CLUSTERIP: unknown protocol `%u'\n",
- iph->protocol);
+ pr_info("unknown protocol %u\n", iph->protocol);
sport = dport = 0;
}
@@ -261,7 +269,7 @@ clusterip_hashfn(const struct sk_buff *skb,
hashval = 0;
/* This cannot happen, unless the check function wasn't called
* at rule load time */
- printk("CLUSTERIP: unknown mode `%u'\n", config->hash_mode);
+ pr_info("unknown mode %u\n", config->hash_mode);
BUG();
break;
}
@@ -281,7 +289,7 @@ clusterip_responsible(const struct clusterip_config *config, u_int32_t hash)
***********************************************************************/
static unsigned int
-clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par)
+clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
struct nf_conn *ct;
@@ -294,7 +302,7 @@ clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par)
ct = nf_ct_get(skb, &ctinfo);
if (ct == NULL) {
- printk(KERN_ERR "CLUSTERIP: no conntrack!\n");
+ pr_info("no conntrack!\n");
/* FIXME: need to drop invalid ones, since replies
* to outgoing connections of other nodes will be
* marked as INVALID */
@@ -347,25 +355,24 @@ clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par)
return XT_CONTINUE;
}
-static bool clusterip_tg_check(const struct xt_tgchk_param *par)
+static int clusterip_tg_check(const struct xt_tgchk_param *par)
{
struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
const struct ipt_entry *e = par->entryinfo;
-
struct clusterip_config *config;
+ int ret;
if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT &&
cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) {
- printk(KERN_WARNING "CLUSTERIP: unknown mode `%u'\n",
- cipinfo->hash_mode);
- return false;
+ pr_info("unknown mode %u\n", cipinfo->hash_mode);
+ return -EINVAL;
}
if (e->ip.dmsk.s_addr != htonl(0xffffffff) ||
e->ip.dst.s_addr == 0) {
- printk(KERN_ERR "CLUSTERIP: Please specify destination IP\n");
- return false;
+ pr_info("Please specify destination IP\n");
+ return -EINVAL;
}
/* FIXME: further sanity checks */
@@ -373,41 +380,41 @@ static bool clusterip_tg_check(const struct xt_tgchk_param *par)
config = clusterip_config_find_get(e->ip.dst.s_addr, 1);
if (!config) {
if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
- printk(KERN_WARNING "CLUSTERIP: no config found for %pI4, need 'new'\n", &e->ip.dst.s_addr);
- return false;
+ pr_info("no config found for %pI4, need 'new'\n",
+ &e->ip.dst.s_addr);
+ return -EINVAL;
} else {
struct net_device *dev;
if (e->ip.iniface[0] == '\0') {
- printk(KERN_WARNING "CLUSTERIP: Please specify an interface name\n");
- return false;
+ pr_info("Please specify an interface name\n");
+ return -EINVAL;
}
dev = dev_get_by_name(&init_net, e->ip.iniface);
if (!dev) {
- printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface);
- return false;
+ pr_info("no such interface %s\n",
+ e->ip.iniface);
+ return -ENOENT;
}
config = clusterip_config_init(cipinfo,
e->ip.dst.s_addr, dev);
if (!config) {
- printk(KERN_WARNING "CLUSTERIP: cannot allocate config\n");
+ pr_info("cannot allocate config\n");
dev_put(dev);
- return false;
+ return -ENOMEM;
}
- dev_mc_add(config->dev,config->clustermac, ETH_ALEN, 0);
+ dev_mc_add(config->dev, config->clustermac);
}
}
cipinfo->config = config;
- if (nf_ct_l3proto_try_module_get(par->target->family) < 0) {
- printk(KERN_WARNING "can't load conntrack support for "
- "proto=%u\n", par->target->family);
- return false;
- }
-
- return true;
+ ret = nf_ct_l3proto_try_module_get(par->family);
+ if (ret < 0)
+ pr_info("cannot load conntrack support for proto=%u\n",
+ par->family);
+ return ret;
}
/* drop reference count of cluster config when rule is deleted */
@@ -421,7 +428,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
clusterip_config_put(cipinfo->config);
- nf_ct_l3proto_module_put(par->target->family);
+ nf_ct_l3proto_module_put(par->family);
}
#ifdef CONFIG_COMPAT
@@ -462,7 +469,7 @@ struct arp_payload {
__be32 src_ip;
u_int8_t dst_hw[ETH_ALEN];
__be32 dst_ip;
-} __attribute__ ((packed));
+} __packed;
#ifdef DEBUG
static void arp_print(struct arp_payload *payload)
@@ -478,8 +485,8 @@ static void arp_print(struct arp_payload *payload)
}
hbuffer[--k]='\0';
- printk("src %pI4@%s, dst %pI4\n",
- &payload->src_ip, hbuffer, &payload->dst_ip);
+ pr_debug("src %pI4@%s, dst %pI4\n",
+ &payload->src_ip, hbuffer, &payload->dst_ip);
}
#endif
@@ -518,7 +525,7 @@ arp_mangle(unsigned int hook,
* this wouldn't work, since we didn't subscribe the mcast group on
* other interfaces */
if (c->dev != out) {
- pr_debug("CLUSTERIP: not mangling arp reply on different "
+ pr_debug("not mangling arp reply on different "
"interface: cip'%s'-skb'%s'\n",
c->dev->name, out->name);
clusterip_config_put(c);
@@ -529,7 +536,7 @@ arp_mangle(unsigned int hook,
memcpy(payload->src_hw, c->clustermac, arp->ar_hln);
#ifdef DEBUG
- pr_debug(KERN_DEBUG "CLUSTERIP mangled arp reply: ");
+ pr_debug("mangled arp reply: ");
arp_print(payload);
#endif
@@ -600,7 +607,8 @@ static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos)
static void clusterip_seq_stop(struct seq_file *s, void *v)
{
- kfree(v);
+ if (!IS_ERR(v))
+ kfree(v);
}
static int clusterip_seq_show(struct seq_file *s, void *v)
@@ -705,13 +713,13 @@ static int __init clusterip_tg_init(void)
#ifdef CONFIG_PROC_FS
clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net);
if (!clusterip_procdir) {
- printk(KERN_ERR "CLUSTERIP: Unable to proc dir entry\n");
+ pr_err("Unable to proc dir entry\n");
ret = -ENOMEM;
goto cleanup_hook;
}
#endif /* CONFIG_PROC_FS */
- printk(KERN_NOTICE "ClusterIP Version %s loaded successfully\n",
+ pr_info("ClusterIP Version %s loaded successfully\n",
CLUSTERIP_VERSION);
return 0;
@@ -726,13 +734,15 @@ cleanup_target:
static void __exit clusterip_tg_exit(void)
{
- printk(KERN_NOTICE "ClusterIP Version %s unloading\n",
- CLUSTERIP_VERSION);
+ pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION);
#ifdef CONFIG_PROC_FS
remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent);
#endif
nf_unregister_hook(&cip_arp_ops);
xt_unregister_target(&clusterip_tg_reg);
+
+ /* Wait for outstanding call_rcu_bh() callbacks (clusterip_config_rcu_free) to finish */
+ rcu_barrier_bh();
}
module_init(clusterip_tg_init);
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index ea5cea2415c..4bf3dc49ad1 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -6,7 +6,7 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/in.h>
#include <linux/module.h>
#include <linux/skbuff.h>
@@ -77,7 +77,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
}
static unsigned int
-ecn_tg(struct sk_buff *skb, const struct xt_target_param *par)
+ecn_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ipt_ECN_info *einfo = par->targinfo;
@@ -93,28 +93,25 @@ ecn_tg(struct sk_buff *skb, const struct xt_target_param *par)
return XT_CONTINUE;
}
-static bool ecn_tg_check(const struct xt_tgchk_param *par)
+static int ecn_tg_check(const struct xt_tgchk_param *par)
{
const struct ipt_ECN_info *einfo = par->targinfo;
const struct ipt_entry *e = par->entryinfo;
if (einfo->operation & IPT_ECN_OP_MASK) {
- printk(KERN_WARNING "ECN: unsupported ECN operation %x\n",
- einfo->operation);
- return false;
+ pr_info("unsupported ECN operation %x\n", einfo->operation);
+ return -EINVAL;
}
if (einfo->ip_ect & ~IPT_ECN_IP_MASK) {
- printk(KERN_WARNING "ECN: new ECT codepoint %x out of mask\n",
- einfo->ip_ect);
- return false;
+ pr_info("new ECT codepoint %x out of mask\n", einfo->ip_ect);
+ return -EINVAL;
}
if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR)) &&
(e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO))) {
- printk(KERN_WARNING "ECN: cannot use TCP operations on a "
- "non-tcp rule\n");
- return false;
+ pr_info("cannot use TCP operations on a non-tcp rule\n");
+ return -EINVAL;
}
- return true;
+ return 0;
}
static struct xt_target ecn_tg_reg __read_mostly = {
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index ee128efa1c8..915fc17d7ce 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -9,10 +9,11 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/skbuff.h>
+#include <linux/if_arp.h>
#include <linux/ip.h>
#include <net/icmp.h>
#include <net/udp.h>
@@ -363,11 +364,47 @@ static void dump_packet(const struct nf_loginfo *info,
/* maxlen = 230+ 91 + 230 + 252 = 803 */
}
+static void dump_mac_header(const struct nf_loginfo *info,
+ const struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ unsigned int logflags = 0;
+
+ if (info->type == NF_LOG_TYPE_LOG)
+ logflags = info->u.log.logflags;
+
+ if (!(logflags & IPT_LOG_MACDECODE))
+ goto fallback;
+
+ switch (dev->type) {
+ case ARPHRD_ETHER:
+ printk("MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+ eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+ ntohs(eth_hdr(skb)->h_proto));
+ return;
+ default:
+ break;
+ }
+
+fallback:
+ printk("MAC=");
+ if (dev->hard_header_len &&
+ skb->mac_header != skb->network_header) {
+ const unsigned char *p = skb_mac_header(skb);
+ unsigned int i;
+
+ printk("%02x", *p++);
+ for (i = 1; i < dev->hard_header_len; i++, p++)
+ printk(":%02x", *p);
+ }
+ printk(" ");
+}
+
static struct nf_loginfo default_loginfo = {
.type = NF_LOG_TYPE_LOG,
.u = {
.log = {
- .level = 0,
+ .level = 5,
.logflags = NF_LOG_MASK,
},
},
@@ -404,20 +441,9 @@ ipt_log_packet(u_int8_t pf,
}
#endif
- if (in && !out) {
- /* MAC logging for input chain only. */
- printk("MAC=");
- if (skb->dev && skb->dev->hard_header_len &&
- skb->mac_header != skb->network_header) {
- int i;
- const unsigned char *p = skb_mac_header(skb);
- for (i = 0; i < skb->dev->hard_header_len; i++,p++)
- printk("%02x%c", *p,
- i==skb->dev->hard_header_len - 1
- ? ' ':':');
- } else
- printk(" ");
- }
+ /* MAC logging for input path only. */
+ if (in && !out)
+ dump_mac_header(loginfo, skb);
dump_packet(loginfo, skb, 0);
printk("\n");
@@ -425,7 +451,7 @@ ipt_log_packet(u_int8_t pf,
}
static unsigned int
-log_tg(struct sk_buff *skb, const struct xt_target_param *par)
+log_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ipt_log_info *loginfo = par->targinfo;
struct nf_loginfo li;
@@ -439,20 +465,19 @@ log_tg(struct sk_buff *skb, const struct xt_target_param *par)
return XT_CONTINUE;
}
-static bool log_tg_check(const struct xt_tgchk_param *par)
+static int log_tg_check(const struct xt_tgchk_param *par)
{
const struct ipt_log_info *loginfo = par->targinfo;
if (loginfo->level >= 8) {
- pr_debug("LOG: level %u >= 8\n", loginfo->level);
- return false;
+ pr_debug("level %u >= 8\n", loginfo->level);
+ return -EINVAL;
}
if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') {
- pr_debug("LOG: prefix term %i\n",
- loginfo->prefix[sizeof(loginfo->prefix)-1]);
- return false;
+ pr_debug("prefix is not null-terminated\n");
+ return -EINVAL;
}
- return true;
+ return 0;
}
static struct xt_target log_tg_reg __read_mostly = {
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 650b54042b0..d2ed9dc74eb 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -8,7 +8,7 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/types.h>
#include <linux/inetdevice.h>
#include <linux/ip.h>
@@ -28,23 +28,23 @@ MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("Xtables: automatic-address SNAT");
/* FIXME: Multiple targets. --RR */
-static bool masquerade_tg_check(const struct xt_tgchk_param *par)
+static int masquerade_tg_check(const struct xt_tgchk_param *par)
{
const struct nf_nat_multi_range_compat *mr = par->targinfo;
if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
- pr_debug("masquerade_check: bad MAP_IPS.\n");
- return false;
+ pr_debug("bad MAP_IPS.\n");
+ return -EINVAL;
}
if (mr->rangesize != 1) {
- pr_debug("masquerade_check: bad rangesize %u\n", mr->rangesize);
- return false;
+ pr_debug("bad rangesize %u\n", mr->rangesize);
+ return -EINVAL;
}
- return true;
+ return 0;
}
static unsigned int
-masquerade_tg(struct sk_buff *skb, const struct xt_target_param *par)
+masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
struct nf_conn *ct;
struct nf_conn_nat *nat;
@@ -72,7 +72,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_target_param *par)
rt = skb_rtable(skb);
newsrc = inet_select_addr(par->out, rt->rt_gateway, RT_SCOPE_UNIVERSE);
if (!newsrc) {
- printk("MASQUERADE: %s ate my IP address\n", par->out->name);
+ pr_info("%s ate my IP address\n", par->out->name);
return NF_DROP;
}
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
index 7c29582d4ec..6cdb298f103 100644
--- a/net/ipv4/netfilter/ipt_NETMAP.c
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -9,7 +9,7 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/ip.h>
#include <linux/module.h>
#include <linux/netdevice.h>
@@ -22,23 +22,23 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>");
MODULE_DESCRIPTION("Xtables: 1:1 NAT mapping of IPv4 subnets");
-static bool netmap_tg_check(const struct xt_tgchk_param *par)
+static int netmap_tg_check(const struct xt_tgchk_param *par)
{
const struct nf_nat_multi_range_compat *mr = par->targinfo;
if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) {
- pr_debug("NETMAP:check: bad MAP_IPS.\n");
- return false;
+ pr_debug("bad MAP_IPS.\n");
+ return -EINVAL;
}
if (mr->rangesize != 1) {
- pr_debug("NETMAP:check: bad rangesize %u.\n", mr->rangesize);
- return false;
+ pr_debug("bad rangesize %u.\n", mr->rangesize);
+ return -EINVAL;
}
- return true;
+ return 0;
}
static unsigned int
-netmap_tg(struct sk_buff *skb, const struct xt_target_param *par)
+netmap_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
@@ -48,7 +48,8 @@ netmap_tg(struct sk_buff *skb, const struct xt_target_param *par)
NF_CT_ASSERT(par->hooknum == NF_INET_PRE_ROUTING ||
par->hooknum == NF_INET_POST_ROUTING ||
- par->hooknum == NF_INET_LOCAL_OUT);
+ par->hooknum == NF_INET_LOCAL_OUT ||
+ par->hooknum == NF_INET_LOCAL_IN);
ct = nf_ct_get(skb, &ctinfo);
netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
@@ -77,7 +78,8 @@ static struct xt_target netmap_tg_reg __read_mostly = {
.table = "nat",
.hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_POST_ROUTING) |
- (1 << NF_INET_LOCAL_OUT),
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_LOCAL_IN),
.checkentry = netmap_tg_check,
.me = THIS_MODULE
};
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
index 698e5e78685..18a0656505a 100644
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -6,7 +6,7 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/types.h>
#include <linux/ip.h>
#include <linux/timer.h>
@@ -26,23 +26,23 @@ MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("Xtables: Connection redirection to localhost");
/* FIXME: Take multiple ranges --RR */
-static bool redirect_tg_check(const struct xt_tgchk_param *par)
+static int redirect_tg_check(const struct xt_tgchk_param *par)
{
const struct nf_nat_multi_range_compat *mr = par->targinfo;
if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
- pr_debug("redirect_check: bad MAP_IPS.\n");
- return false;
+ pr_debug("bad MAP_IPS.\n");
+ return -EINVAL;
}
if (mr->rangesize != 1) {
- pr_debug("redirect_check: bad rangesize %u.\n", mr->rangesize);
- return false;
+ pr_debug("bad rangesize %u.\n", mr->rangesize);
+ return -EINVAL;
}
- return true;
+ return 0;
}
static unsigned int
-redirect_tg(struct sk_buff *skb, const struct xt_target_param *par)
+redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 5113b8f1a37..b254dafaf42 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -9,9 +9,10 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/skbuff.h>
+#include <linux/slab.h>
#include <linux/ip.h>
#include <linux/udp.h>
#include <linux/icmp.h>
@@ -94,10 +95,11 @@ static void send_reset(struct sk_buff *oldskb, int hook)
}
tcph->rst = 1;
- tcph->check = tcp_v4_check(sizeof(struct tcphdr),
- niph->saddr, niph->daddr,
- csum_partial(tcph,
- sizeof(struct tcphdr), 0));
+ tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr,
+ niph->daddr, 0);
+ nskb->ip_summed = CHECKSUM_PARTIAL;
+ nskb->csum_start = (unsigned char *)tcph - nskb->head;
+ nskb->csum_offset = offsetof(struct tcphdr, check);
addr_type = RTN_UNSPEC;
if (hook != NF_INET_FORWARD
@@ -108,13 +110,12 @@ static void send_reset(struct sk_buff *oldskb, int hook)
addr_type = RTN_LOCAL;
/* ip_route_me_harder expects skb->dst to be set */
- skb_dst_set(nskb, dst_clone(skb_dst(oldskb)));
+ skb_dst_set_noref(nskb, skb_dst(oldskb));
if (ip_route_me_harder(nskb, addr_type))
goto free_nskb;
niph->ttl = dst_metric(skb_dst(nskb), RTAX_HOPLIMIT);
- nskb->ip_summed = CHECKSUM_NONE;
/* "Never happens" */
if (nskb->len > dst_mtu(skb_dst(nskb)))
@@ -135,13 +136,10 @@ static inline void send_unreach(struct sk_buff *skb_in, int code)
}
static unsigned int
-reject_tg(struct sk_buff *skb, const struct xt_target_param *par)
+reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ipt_reject_info *reject = par->targinfo;
- /* WARNING: This code causes reentry within iptables.
- This means that the iptables jump stack is now crap. We
- must return an absolute verdict. --RR */
switch (reject->with) {
case IPT_ICMP_NET_UNREACHABLE:
send_unreach(skb, ICMP_NET_UNREACH);
@@ -174,23 +172,23 @@ reject_tg(struct sk_buff *skb, const struct xt_target_param *par)
return NF_DROP;
}
-static bool reject_tg_check(const struct xt_tgchk_param *par)
+static int reject_tg_check(const struct xt_tgchk_param *par)
{
const struct ipt_reject_info *rejinfo = par->targinfo;
const struct ipt_entry *e = par->entryinfo;
if (rejinfo->with == IPT_ICMP_ECHOREPLY) {
- printk("ipt_REJECT: ECHOREPLY no longer supported.\n");
- return false;
+ pr_info("ECHOREPLY no longer supported.\n");
+ return -EINVAL;
} else if (rejinfo->with == IPT_TCP_RESET) {
/* Must specify that it's a TCP packet */
if (e->ip.proto != IPPROTO_TCP ||
(e->ip.invflags & XT_INV_PROTO)) {
- printk("ipt_REJECT: TCP_RESET invalid for non-tcp\n");
- return false;
+ pr_info("TCP_RESET invalid for non-tcp\n");
+ return -EINVAL;
}
}
- return true;
+ return 0;
}
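
The send_reset() hunk above stops computing a full TCP checksum for the generated RST and defers to the checksum-offload machinery: the check field is seeded with the complemented pseudo-header sum and the skb marked CHECKSUM_PARTIAL, so either the NIC or skb_checksum_help() folds in the rest. The same lines, annotated as a sketch (kernel context assumed):

        /* seed with the bitwise-NOT pseudo-header sum, per the
         * CHECKSUM_PARTIAL convention */
        tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
                                    niph->saddr, niph->daddr, 0);
        nskb->ip_summed = CHECKSUM_PARTIAL;
        /* where summing starts ... */
        nskb->csum_start = (unsigned char *)tcph - nskb->head;
        /* ... and where the result is stored */
        nskb->csum_offset = offsetof(struct tcphdr, check);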
static struct xt_target reject_tg_reg __read_mostly = {
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 09a5d3f7cc4..446e0f467a1 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -29,10 +29,11 @@
* Specify, after how many hundredths of a second the queue should be
* flushed even if it is not full yet.
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/socket.h>
+#include <linux/slab.h>
#include <linux/skbuff.h>
#include <linux/kernel.h>
#include <linux/timer.h>
@@ -56,8 +57,6 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG);
#define ULOG_NL_EVENT 111 /* Harald's favorite number */
#define ULOG_MAXNLGROUPS 32 /* number of nlgroups */
-#define PRINTR(format, args...) do { if (net_ratelimit()) printk(format , ## args); } while (0)
-
static unsigned int nlbufsiz = NLMSG_GOODSIZE;
module_param(nlbufsiz, uint, 0400);
MODULE_PARM_DESC(nlbufsiz, "netlink buffer size");
@@ -90,12 +89,12 @@ static void ulog_send(unsigned int nlgroupnum)
ulog_buff_t *ub = &ulog_buffers[nlgroupnum];
if (timer_pending(&ub->timer)) {
- pr_debug("ipt_ULOG: ulog_send: timer was pending, deleting\n");
+ pr_debug("ulog_send: timer was pending, deleting\n");
del_timer(&ub->timer);
}
if (!ub->skb) {
- pr_debug("ipt_ULOG: ulog_send: nothing to send\n");
+ pr_debug("ulog_send: nothing to send\n");
return;
}
@@ -104,7 +103,7 @@ static void ulog_send(unsigned int nlgroupnum)
ub->lastnlh->nlmsg_type = NLMSG_DONE;
NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1;
- pr_debug("ipt_ULOG: throwing %d packets to netlink group %u\n",
+ pr_debug("throwing %d packets to netlink group %u\n",
ub->qlen, nlgroupnum + 1);
netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC);
@@ -117,7 +116,7 @@ static void ulog_send(unsigned int nlgroupnum)
/* timer function to flush queue in flushtimeout time */
static void ulog_timer(unsigned long data)
{
- pr_debug("ipt_ULOG: timer function called, calling ulog_send\n");
+ pr_debug("timer function called, calling ulog_send\n");
/* lock to protect against somebody modifying our structure
* from ipt_ulog_target at the same time */
@@ -138,7 +137,7 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size)
n = max(size, nlbufsiz);
skb = alloc_skb(n, GFP_ATOMIC);
if (!skb) {
- PRINTR("ipt_ULOG: can't alloc whole buffer %ub!\n", n);
+ pr_debug("cannot alloc whole buffer %ub!\n", n);
if (n > size) {
/* try to allocate only as much as we need for
@@ -146,8 +145,7 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size)
skb = alloc_skb(size, GFP_ATOMIC);
if (!skb)
- PRINTR("ipt_ULOG: can't even allocate %ub\n",
- size);
+ pr_debug("cannot even allocate %ub\n", size);
}
}
@@ -198,8 +196,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
goto alloc_failure;
}
- pr_debug("ipt_ULOG: qlen %d, qthreshold %Zu\n", ub->qlen,
- loginfo->qthreshold);
+ pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold);
/* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */
nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
@@ -272,16 +269,14 @@ static void ipt_ulog_packet(unsigned int hooknum,
return;
nlmsg_failure:
- PRINTR("ipt_ULOG: error during NLMSG_PUT\n");
-
+ pr_debug("error during NLMSG_PUT\n");
alloc_failure:
- PRINTR("ipt_ULOG: Error building netlink message\n");
-
+ pr_debug("Error building netlink message\n");
spin_unlock_bh(&ulog_lock);
}
static unsigned int
-ulog_tg(struct sk_buff *skb, const struct xt_target_param *par)
+ulog_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
ipt_ulog_packet(par->hooknum, skb, par->in, par->out,
par->targinfo, NULL);
@@ -313,21 +308,20 @@ static void ipt_logfn(u_int8_t pf,
ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
}
-static bool ulog_tg_check(const struct xt_tgchk_param *par)
+static int ulog_tg_check(const struct xt_tgchk_param *par)
{
const struct ipt_ulog_info *loginfo = par->targinfo;
if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') {
- pr_debug("ipt_ULOG: prefix term %i\n",
- loginfo->prefix[sizeof(loginfo->prefix) - 1]);
- return false;
+ pr_debug("prefix not null-terminated\n");
+ return -EINVAL;
}
if (loginfo->qthreshold > ULOG_MAX_QLEN) {
- pr_debug("ipt_ULOG: queue threshold %Zu > MAX_QLEN\n",
+ pr_debug("queue threshold %Zu > MAX_QLEN\n",
loginfo->qthreshold);
- return false;
+ return -EINVAL;
}
- return true;
+ return 0;
}
#ifdef CONFIG_COMPAT
@@ -389,10 +383,10 @@ static int __init ulog_tg_init(void)
{
int ret, i;
- pr_debug("ipt_ULOG: init module\n");
+ pr_debug("init module\n");
if (nlbufsiz > 128*1024) {
- printk("Netlink buffer has to be <= 128kB\n");
+ pr_warning("Netlink buffer has to be <= 128kB\n");
return -EINVAL;
}
@@ -422,7 +416,7 @@ static void __exit ulog_tg_exit(void)
ulog_buff_t *ub;
int i;
- pr_debug("ipt_ULOG: cleanup_module\n");
+ pr_debug("cleanup_module\n");
if (nflog)
nf_log_unregister(&ipt_ulog_logger);
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
index 3b216be3bc9..db8bff0fb86 100644
--- a/net/ipv4/netfilter/ipt_addrtype.c
+++ b/net/ipv4/netfilter/ipt_addrtype.c
@@ -8,7 +8,7 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/skbuff.h>
@@ -30,7 +30,7 @@ static inline bool match_type(struct net *net, const struct net_device *dev,
}
static bool
-addrtype_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
+addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
{
struct net *net = dev_net(par->in ? par->in : par->out);
const struct ipt_addrtype_info *info = par->matchinfo;
@@ -48,7 +48,7 @@ addrtype_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
}
static bool
-addrtype_mt_v1(const struct sk_buff *skb, const struct xt_match_param *par)
+addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
{
struct net *net = dev_net(par->in ? par->in : par->out);
const struct ipt_addrtype_info_v1 *info = par->matchinfo;
@@ -70,34 +70,34 @@ addrtype_mt_v1(const struct sk_buff *skb, const struct xt_match_param *par)
return ret;
}
-static bool addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
+static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par)
{
struct ipt_addrtype_info_v1 *info = par->matchinfo;
if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN &&
info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
- printk(KERN_ERR "ipt_addrtype: both incoming and outgoing "
- "interface limitation cannot be selected\n");
- return false;
+ pr_info("both incoming and outgoing "
+ "interface limitation cannot be selected\n");
+ return -EINVAL;
}
if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN)) &&
info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) {
- printk(KERN_ERR "ipt_addrtype: output interface limitation "
- "not valid in PRE_ROUTING and INPUT\n");
- return false;
+ pr_info("output interface limitation "
+ "not valid in PREROUTING and INPUT\n");
+ return -EINVAL;
}
if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) |
(1 << NF_INET_LOCAL_OUT)) &&
info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) {
- printk(KERN_ERR "ipt_addrtype: input interface limitation "
- "not valid in POST_ROUTING and OUTPUT\n");
- return false;
+ pr_info("input interface limitation "
+ "not valid in POSTROUTING and OUTPUT\n");
+ return -EINVAL;
}
- return true;
+ return 0;
}
static struct xt_match addrtype_mt_reg[] __read_mostly = {
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
index 0104c0b399d..14a2aa8b8a1 100644
--- a/net/ipv4/netfilter/ipt_ah.c
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -5,7 +5,7 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/in.h>
#include <linux/module.h>
#include <linux/skbuff.h>
@@ -18,25 +18,19 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>");
MODULE_DESCRIPTION("Xtables: IPv4 IPsec-AH SPI match");
-#ifdef DEBUG_CONNTRACK
-#define duprintf(format, args...) printk(format , ## args)
-#else
-#define duprintf(format, args...)
-#endif
-
/* Returns 1 if the spi is matched by the range, 0 otherwise */
static inline bool
spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
{
bool r;
- duprintf("ah spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ',
- min,spi,max);
+ pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n",
+ invert ? '!' : ' ', min, spi, max);
r=(spi >= min && spi <= max) ^ invert;
- duprintf(" result %s\n",r? "PASS" : "FAILED");
+ pr_debug(" result %s\n", r ? "PASS" : "FAILED");
return r;
}
-static bool ah_mt(const struct sk_buff *skb, const struct xt_match_param *par)
+static bool ah_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
struct ip_auth_hdr _ahdr;
const struct ip_auth_hdr *ah;
@@ -51,8 +45,8 @@ static bool ah_mt(const struct sk_buff *skb, const struct xt_match_param *par)
/* We've been asked to examine this packet, and we
* can't. Hence, no choice but to drop.
*/
- duprintf("Dropping evil AH tinygram.\n");
- *par->hotdrop = true;
+ pr_debug("Dropping evil AH tinygram.\n");
+ par->hotdrop = true;
return 0;
}
@@ -61,16 +55,16 @@ static bool ah_mt(const struct sk_buff *skb, const struct xt_match_param *par)
!!(ahinfo->invflags & IPT_AH_INV_SPI));
}
-static bool ah_mt_check(const struct xt_mtchk_param *par)
+static int ah_mt_check(const struct xt_mtchk_param *par)
{
const struct ipt_ah *ahinfo = par->matchinfo;
/* Must specify no unknown invflags */
if (ahinfo->invflags & ~IPT_AH_INV_MASK) {
- duprintf("ipt_ah: unknown flags %X\n", ahinfo->invflags);
- return false;
+ pr_debug("unknown flags %X\n", ahinfo->invflags);
+ return -EINVAL;
}
- return true;
+ return 0;
}
static struct xt_match ah_mt_reg __read_mostly = {
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
index 2a1e56b7190..af6e9c77834 100644
--- a/net/ipv4/netfilter/ipt_ecn.c
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -6,7 +6,7 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/in.h>
#include <linux/ip.h>
#include <net/ip.h>
@@ -67,7 +67,7 @@ static inline bool match_tcp(const struct sk_buff *skb,
return true;
}
-static bool ecn_mt(const struct sk_buff *skb, const struct xt_match_param *par)
+static bool ecn_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct ipt_ecn_info *info = par->matchinfo;
@@ -78,32 +78,31 @@ static bool ecn_mt(const struct sk_buff *skb, const struct xt_match_param *par)
if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) {
if (ip_hdr(skb)->protocol != IPPROTO_TCP)
return false;
- if (!match_tcp(skb, info, par->hotdrop))
+ if (!match_tcp(skb, info, &par->hotdrop))
return false;
}
return true;
}
-static bool ecn_mt_check(const struct xt_mtchk_param *par)
+static int ecn_mt_check(const struct xt_mtchk_param *par)
{
const struct ipt_ecn_info *info = par->matchinfo;
const struct ipt_ip *ip = par->entryinfo;
if (info->operation & IPT_ECN_OP_MATCH_MASK)
- return false;
+ return -EINVAL;
if (info->invert & IPT_ECN_OP_MATCH_MASK)
- return false;
+ return -EINVAL;
if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) &&
ip->proto != IPPROTO_TCP) {
- printk(KERN_WARNING "ipt_ecn: can't match TCP bits in rule for"
- " non-tcp packets\n");
- return false;
+ pr_info("cannot match TCP bits in rule for non-tcp packets\n");
+ return -EINVAL;
}
- return true;
+ return 0;
}
static struct xt_match ecn_mt_reg __read_mostly = {
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index c8dc9800d62..c37641e819f 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/slab.h>
#include <net/ip.h>
MODULE_LICENSE("GPL");
@@ -88,7 +89,7 @@ static int __init iptable_filter_init(void)
int ret;
if (forward < 0 || forward > NF_MAX_VERDICT) {
- printk("iptables forward must be 0 or 1\n");
+ pr_err("iptables forward must be 0 or 1\n");
return -EINVAL;
}
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index b9b83464cbf..294a2a32f29 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -12,6 +12,7 @@
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
+#include <linux/slab.h>
#include <net/sock.h>
#include <net/route.h>
#include <linux/ip.h>
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 06fb9d11953..07fb710cd72 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -5,6 +5,7 @@
*/
#include <linux/module.h>
#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/slab.h>
#include <net/ip.h>
#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index cce2f64e6f2..be45bdc4c60 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -17,6 +17,7 @@
*/
#include <linux/module.h>
#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/slab.h>
#include <net/ip.h>
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 2bb1f87051c..5a03c02af99 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -382,32 +382,32 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4);
if (ret < 0) {
- printk("nf_conntrack_ipv4: can't register tcp.\n");
+ pr_err("nf_conntrack_ipv4: can't register tcp.\n");
goto cleanup_sockopt;
}
ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4);
if (ret < 0) {
- printk("nf_conntrack_ipv4: can't register udp.\n");
+ pr_err("nf_conntrack_ipv4: can't register udp.\n");
goto cleanup_tcp;
}
ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp);
if (ret < 0) {
- printk("nf_conntrack_ipv4: can't register icmp.\n");
+ pr_err("nf_conntrack_ipv4: can't register icmp.\n");
goto cleanup_udp;
}
ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4);
if (ret < 0) {
- printk("nf_conntrack_ipv4: can't register ipv4\n");
+ pr_err("nf_conntrack_ipv4: can't register ipv4\n");
goto cleanup_icmp;
}
ret = nf_register_hooks(ipv4_conntrack_ops,
ARRAY_SIZE(ipv4_conntrack_ops));
if (ret < 0) {
- printk("nf_conntrack_ipv4: can't register hooks.\n");
+ pr_err("nf_conntrack_ipv4: can't register hooks.\n");
goto cleanup_ipv4;
}
#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 2fb7b76da94..244f7cb08d6 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -336,12 +336,12 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v)
const struct ip_conntrack_stat *st = v;
if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete\n");
+ seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n");
return 0;
}
seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
- "%08x %08x %08x %08x %08x %08x %08x %08x \n",
+ "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
nr_conntracks,
st->searched,
st->found,
@@ -358,7 +358,8 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v)
st->expect_new,
st->expect_create,
- st->expect_delete
+ st->expect_delete,
+ st->search_restart
);
return 0;
}
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index cb763ae9ed9..eab8de32f20 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -66,6 +66,11 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
+ struct inet_sock *inet = inet_sk(skb->sk);
+
+ if (inet && inet->nodefrag)
+ return NF_ACCEPT;
+
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE)
/* Previously seen (loopback)? Ignore. Do this before
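
The early return added above lets a socket opt out of conntrack defragmentation for its own traffic, which matters to raw-socket tools that want fragments passed through untouched. A hypothetical userspace sketch, assuming the IP_NODEFRAG socket option wired up by the companion ip_sockglue.c change (its value, 22, is an assumption here and should come from the exported headers):

        #include <stdio.h>
        #include <sys/socket.h>
        #include <netinet/in.h>

        #ifndef IP_NODEFRAG
        #define IP_NODEFRAG 22  /* assumption: per the companion patch */
        #endif

        int main(void)
        {
                int fd = socket(AF_INET, SOCK_RAW, IPPROTO_UDP);
                int one = 1;

                /* skip nf_defrag for packets sent on this socket */
                if (setsockopt(fd, IPPROTO_IP, IP_NODEFRAG,
                               &one, sizeof(one)) < 0)
                        perror("setsockopt(IP_NODEFRAG)");
                return 0;
        }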
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 4595281c286..8c8632d9b93 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -12,6 +12,7 @@
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/skbuff.h>
+#include <linux/gfp.h>
#include <net/checksum.h>
#include <net/icmp.h>
#include <net/ip.h>
@@ -260,14 +261,9 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
rcu_read_lock();
proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
- /* Change protocol info to have some randomization */
- if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) {
- proto->unique_tuple(tuple, range, maniptype, ct);
- goto out;
- }
-
/* Only bother mapping if it's not already in range and unique */
- if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) ||
+ if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM) &&
+ (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) ||
proto->in_range(tuple, maniptype, &range->min, &range->max)) &&
!nf_nat_used_tuple(tuple, ct))
goto out;
@@ -439,7 +435,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
return 0;
- inside = (void *)skb->data + ip_hdrlen(skb);
+ inside = (void *)skb->data + hdrlen;
/* We're actually going to mangle it beyond trivial checksum
adjustment, so make sure the current checksum is correct. */
@@ -469,12 +465,10 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
/* rcu_read_lock()ed by nf_hook_slow */
l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol);
- if (!nf_ct_get_tuple(skb,
- ip_hdrlen(skb) + sizeof(struct icmphdr),
- (ip_hdrlen(skb) +
+ if (!nf_ct_get_tuple(skb, hdrlen + sizeof(struct icmphdr),
+ (hdrlen +
sizeof(struct icmphdr) + inside->ip.ihl * 4),
- (u_int16_t)AF_INET,
- inside->ip.protocol,
+ (u_int16_t)AF_INET, inside->ip.protocol,
&inner, l3proto, l4proto))
return 0;
@@ -483,15 +477,13 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
pass all hooks (locally-generated ICMP). Consider incoming
packet: PREROUTING (DST manip), routing produces ICMP, goes
through POSTROUTING (which must correct the DST manip). */
- if (!manip_pkt(inside->ip.protocol, skb,
- ip_hdrlen(skb) + sizeof(inside->icmp),
- &ct->tuplehash[!dir].tuple,
- !manip))
+ if (!manip_pkt(inside->ip.protocol, skb, hdrlen + sizeof(inside->icmp),
+ &ct->tuplehash[!dir].tuple, !manip))
return 0;
if (skb->ip_summed != CHECKSUM_PARTIAL) {
/* Reloading "inside" here since manip_pkt inner. */
- inside = (void *)skb->data + ip_hdrlen(skb);
+ inside = (void *)skb->data + hdrlen;
inside->icmp.checksum = 0;
inside->icmp.checksum =
csum_fold(skb_checksum(skb, hdrlen,
@@ -741,7 +733,7 @@ static int __init nf_nat_init(void)
spin_unlock_bh(&nf_nat_lock);
/* Initialize fake conntrack so that NAT will skip it */
- nf_conntrack_untracked.status |= IPS_NAT_DONE_MASK;
+ nf_ct_untracked_status_or(IPS_NAT_DONE_MASK);
l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET);
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 7e8e6fc7541..5045196d853 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -10,7 +10,6 @@
*/
#include <linux/module.h>
-#include <linux/moduleparam.h>
#include <linux/tcp.h>
#include <net/tcp.h>
@@ -44,7 +43,7 @@ static int set_addr(struct sk_buff *skb,
addroff, sizeof(buf),
(char *) &buf, sizeof(buf))) {
if (net_ratelimit())
- printk("nf_nat_h323: nf_nat_mangle_tcp_packet"
+ pr_notice("nf_nat_h323: nf_nat_mangle_tcp_packet"
" error\n");
return -1;
}
@@ -60,7 +59,7 @@ static int set_addr(struct sk_buff *skb,
addroff, sizeof(buf),
(char *) &buf, sizeof(buf))) {
if (net_ratelimit())
- printk("nf_nat_h323: nf_nat_mangle_udp_packet"
+ pr_notice("nf_nat_h323: nf_nat_mangle_udp_packet"
" error\n");
return -1;
}
@@ -216,7 +215,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
/* Run out of expectations */
if (i >= H323_RTP_CHANNEL_MAX) {
if (net_ratelimit())
- printk("nf_nat_h323: out of expectations\n");
+ pr_notice("nf_nat_h323: out of expectations\n");
return 0;
}
@@ -235,7 +234,7 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
if (nated_port == 0) { /* No port available */
if (net_ratelimit())
- printk("nf_nat_h323: out of RTP ports\n");
+ pr_notice("nf_nat_h323: out of RTP ports\n");
return 0;
}
@@ -292,7 +291,7 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
if (nated_port == 0) { /* No port available */
if (net_ratelimit())
- printk("nf_nat_h323: out of TCP ports\n");
+ pr_notice("nf_nat_h323: out of TCP ports\n");
return 0;
}
@@ -342,7 +341,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
if (nated_port == 0) { /* No port available */
if (net_ratelimit())
- printk("nf_nat_q931: out of TCP ports\n");
+ pr_notice("nf_nat_q931: out of TCP ports\n");
return 0;
}
@@ -426,7 +425,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
if (nated_port == 0) { /* No port available */
if (net_ratelimit())
- printk("nf_nat_ras: out of TCP ports\n");
+ pr_notice("nf_nat_ras: out of TCP ports\n");
return 0;
}
@@ -508,7 +507,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
if (nated_port == 0) { /* No port available */
if (net_ratelimit())
- printk("nf_nat_q931: out of TCP ports\n");
+ pr_notice("nf_nat_q931: out of TCP ports\n");
return 0;
}
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index 4b6af4bb1f5..4a0c6b548ee 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -8,6 +8,7 @@
* published by the Free Software Foundation.
*/
#include <linux/module.h>
+#include <linux/gfp.h>
#include <linux/kmod.h>
#include <linux/types.h>
#include <linux/timer.h>
diff --git a/net/ipv4/netfilter/nf_nat_proto_common.c b/net/ipv4/netfilter/nf_nat_proto_common.c
index 6c4f11f5144..3e61faf23a9 100644
--- a/net/ipv4/netfilter/nf_nat_proto_common.c
+++ b/net/ipv4/netfilter/nf_nat_proto_common.c
@@ -34,7 +34,7 @@ bool nf_nat_proto_in_range(const struct nf_conntrack_tuple *tuple,
}
EXPORT_SYMBOL_GPL(nf_nat_proto_in_range);
-bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
+void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct,
@@ -53,7 +53,7 @@ bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
/* If it's dst rewrite, can't change port */
if (maniptype == IP_NAT_MANIP_DST)
- return false;
+ return;
if (ntohs(*portptr) < 1024) {
/* Loose convention: >> 512 is credential passing */
@@ -81,15 +81,15 @@ bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple,
else
off = *rover;
- for (i = 0; i < range_size; i++, off++) {
+ for (i = 0; ; ++off) {
*portptr = htons(min + off % range_size);
- if (nf_nat_used_tuple(tuple, ct))
+ if (++i != range_size && nf_nat_used_tuple(tuple, ct))
continue;
if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM))
*rover = off;
- return true;
+ return;
}
- return false;
+ return;
}
EXPORT_SYMBOL_GPL(nf_nat_proto_unique_tuple);
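
The loop rewrite in nf_nat_proto_unique_tuple(), mirrored in the GRE and ICMP helpers below, changes the give-up behaviour: instead of returning false after range_size failed probes, the last candidate is kept even if nf_nat_used_tuple() reports a clash, since NAT can still operate on a non-unique tuple and ->unique_tuple() callers no longer consume a return value. The control flow, annotated as a sketch:

        for (i = 0; ; ++off) {
                *portptr = htons(min + off % range_size);
                /* on the final probe (++i == range_size) the used-tuple
                 * test is skipped, so this port is kept regardless */
                if (++i != range_size && nf_nat_used_tuple(tuple, ct))
                        continue;
                if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM))
                        *rover = off;   /* remember where to resume */
                return;
        }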
diff --git a/net/ipv4/netfilter/nf_nat_proto_dccp.c b/net/ipv4/netfilter/nf_nat_proto_dccp.c
index 22485ce306d..570faf2667b 100644
--- a/net/ipv4/netfilter/nf_nat_proto_dccp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_dccp.c
@@ -22,14 +22,14 @@
static u_int16_t dccp_port_rover;
-static bool
+static void
dccp_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
- return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
- &dccp_port_rover);
+ nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
+ &dccp_port_rover);
}
static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index d7e89201351..bc8d83a31c7 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -37,7 +37,7 @@ MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
/* generate unique tuple ... */
-static bool
+static void
gre_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype,
@@ -50,7 +50,7 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
/* If there is no master conntrack we are not PPTP,
do not change tuples */
if (!ct->master)
- return false;
+ return;
if (maniptype == IP_NAT_MANIP_SRC)
keyptr = &tuple->src.u.gre.key;
@@ -68,14 +68,14 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple,
pr_debug("min = %u, range_size = %u\n", min, range_size);
- for (i = 0; i < range_size; i++, key++) {
+ for (i = 0; ; ++key) {
*keyptr = htons(min + key % range_size);
- if (!nf_nat_used_tuple(tuple, ct))
- return true;
+ if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
+ return;
}
pr_debug("%p: no NAT mapping\n", ct);
- return false;
+ return;
}
/* manipulate a GRE packet according to maniptype */
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index 19a8b0b07d8..5744c3ec847 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -27,7 +27,7 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple,
ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
}
-static bool
+static void
icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype,
@@ -42,13 +42,13 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple,
if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED))
range_size = 0xFFFF;
- for (i = 0; i < range_size; i++, id++) {
+ for (i = 0; ; ++id) {
tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) +
(id % range_size));
- if (!nf_nat_used_tuple(tuple, ct))
- return true;
+ if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
+ return;
}
- return false;
+ return;
}
static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c
index 3fc598eeeb1..756331d4266 100644
--- a/net/ipv4/netfilter/nf_nat_proto_sctp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_sctp.c
@@ -16,14 +16,14 @@
static u_int16_t nf_sctp_port_rover;
-static bool
+static void
sctp_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
- return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
- &nf_sctp_port_rover);
+ nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
+ &nf_sctp_port_rover);
}
static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c
index 399e2cfa263..aa460a595d5 100644
--- a/net/ipv4/netfilter/nf_nat_proto_tcp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_tcp.c
@@ -20,14 +20,13 @@
static u_int16_t tcp_port_rover;
-static bool
+static void
tcp_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
- return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
- &tcp_port_rover);
+ nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &tcp_port_rover);
}
static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c
index 9e61c79492e..dfe65c7e292 100644
--- a/net/ipv4/netfilter/nf_nat_proto_udp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_udp.c
@@ -19,14 +19,13 @@
static u_int16_t udp_port_rover;
-static bool
+static void
udp_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
- return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
- &udp_port_rover);
+ nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &udp_port_rover);
}
static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_udplite.c b/net/ipv4/netfilter/nf_nat_proto_udplite.c
index 440a229bbd8..3cc8c8af39e 100644
--- a/net/ipv4/netfilter/nf_nat_proto_udplite.c
+++ b/net/ipv4/netfilter/nf_nat_proto_udplite.c
@@ -18,14 +18,14 @@
static u_int16_t udplite_port_rover;
-static bool
+static void
udplite_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
- return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
- &udplite_port_rover);
+ nf_nat_proto_unique_tuple(tuple, range, maniptype, ct,
+ &udplite_port_rover);
}
static bool
diff --git a/net/ipv4/netfilter/nf_nat_proto_unknown.c b/net/ipv4/netfilter/nf_nat_proto_unknown.c
index 14381c62ace..a50f2bc1c73 100644
--- a/net/ipv4/netfilter/nf_nat_proto_unknown.c
+++ b/net/ipv4/netfilter/nf_nat_proto_unknown.c
@@ -26,14 +26,14 @@ static bool unknown_in_range(const struct nf_conntrack_tuple *tuple,
return true;
}
-static bool unknown_unique_tuple(struct nf_conntrack_tuple *tuple,
+static void unknown_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype,
const struct nf_conn *ct)
{
/* Sorry: we can't help you; if it's not unique, we can't frob
anything. */
- return false;
+ return;
}
static bool
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index ab74cc0535e..ebbd319f62f 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -7,6 +7,7 @@
*/
/* Everything about the rules for NAT. */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/types.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
@@ -15,6 +16,7 @@
#include <linux/kmod.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
+#include <linux/slab.h>
#include <net/checksum.h>
#include <net/route.h>
#include <linux/bitops.h>
@@ -26,7 +28,8 @@
#define NAT_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
(1 << NF_INET_POST_ROUTING) | \
- (1 << NF_INET_LOCAL_OUT))
+ (1 << NF_INET_LOCAL_OUT) | \
+ (1 << NF_INET_LOCAL_IN))
static const struct xt_table nat_table = {
.name = "nat",
@@ -37,13 +40,14 @@ static const struct xt_table nat_table = {
/* Source NAT */
static unsigned int
-ipt_snat_target(struct sk_buff *skb, const struct xt_target_param *par)
+ipt_snat_target(struct sk_buff *skb, const struct xt_action_param *par)
{
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
const struct nf_nat_multi_range_compat *mr = par->targinfo;
- NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING);
+ NF_CT_ASSERT(par->hooknum == NF_INET_POST_ROUTING ||
+ par->hooknum == NF_INET_LOCAL_IN);
ct = nf_ct_get(skb, &ctinfo);
@@ -56,7 +60,7 @@ ipt_snat_target(struct sk_buff *skb, const struct xt_target_param *par)
}
static unsigned int
-ipt_dnat_target(struct sk_buff *skb, const struct xt_target_param *par)
+ipt_dnat_target(struct sk_buff *skb, const struct xt_action_param *par)
{
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
@@ -73,31 +77,31 @@ ipt_dnat_target(struct sk_buff *skb, const struct xt_target_param *par)
return nf_nat_setup_info(ct, &mr->range[0], IP_NAT_MANIP_DST);
}
-static bool ipt_snat_checkentry(const struct xt_tgchk_param *par)
+static int ipt_snat_checkentry(const struct xt_tgchk_param *par)
{
const struct nf_nat_multi_range_compat *mr = par->targinfo;
/* Must be a valid range */
if (mr->rangesize != 1) {
- printk("SNAT: multiple ranges no longer supported\n");
- return false;
+ pr_info("SNAT: multiple ranges no longer supported\n");
+ return -EINVAL;
}
- return true;
+ return 0;
}
-static bool ipt_dnat_checkentry(const struct xt_tgchk_param *par)
+static int ipt_dnat_checkentry(const struct xt_tgchk_param *par)
{
const struct nf_nat_multi_range_compat *mr = par->targinfo;
/* Must be a valid range */
if (mr->rangesize != 1) {
- printk("DNAT: multiple ranges no longer supported\n");
- return false;
+ pr_info("DNAT: multiple ranges no longer supported\n");
+ return -EINVAL;
}
- return true;
+ return 0;
}
-unsigned int
+static unsigned int
alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
{
/* Force range to this IP; let proto decide mapping for
@@ -139,7 +143,7 @@ static struct xt_target ipt_snat_reg __read_mostly = {
.target = ipt_snat_target,
.targetsize = sizeof(struct nf_nat_multi_range_compat),
.table = "nat",
- .hooks = 1 << NF_INET_POST_ROUTING,
+ .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN),
.checkentry = ipt_snat_checkentry,
.family = AF_INET,
};
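The checkentry conversion above follows the xtables move from bool verdicts to errno-style returns: 0 for success, a negative errno for rejection, so the precise cause can propagate to userspace. A hedged userspace sketch of the convention, using an invented config struct rather than the real xtables ABI:

    #include <errno.h>
    #include <stdio.h>

    struct range_cfg { unsigned int rangesize; };   /* hypothetical */

    static int check_range(const struct range_cfg *cfg)
    {
        if (cfg->rangesize != 1) {
            fprintf(stderr, "multiple ranges no longer supported\n");
            return -EINVAL;            /* was: return false */
        }
        return 0;                      /* was: return true */
    }

    int main(void)
    {
        struct range_cfg bad = { .rangesize = 2 };
        struct range_cfg ok  = { .rangesize = 1 };

        printf("bad: %d, ok: %d\n", check_range(&bad), check_range(&ok));
        return 0;
    }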
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index 0b9c7ce3d6c..1679e2c0963 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -43,6 +43,7 @@
#include <linux/moduleparam.h>
#include <linux/types.h>
#include <linux/kernel.h>
+#include <linux/slab.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/udp.h>
@@ -400,7 +401,7 @@ static unsigned char asn1_octets_decode(struct asn1_ctx *ctx,
*octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC);
if (*octets == NULL) {
if (net_ratelimit())
- printk("OOM in bsalg (%d)\n", __LINE__);
+ pr_notice("OOM in bsalg (%d)\n", __LINE__);
return 0;
}
@@ -451,7 +452,7 @@ static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,
*oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC);
if (*oid == NULL) {
if (net_ratelimit())
- printk("OOM in bsalg (%d)\n", __LINE__);
+ pr_notice("OOM in bsalg (%d)\n", __LINE__);
return 0;
}
@@ -728,7 +729,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
if (*obj == NULL) {
kfree(id);
if (net_ratelimit())
- printk("OOM in bsalg (%d)\n", __LINE__);
+ pr_notice("OOM in bsalg (%d)\n", __LINE__);
return 0;
}
(*obj)->syntax.l[0] = l;
@@ -745,7 +746,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
kfree(p);
kfree(id);
if (net_ratelimit())
- printk("OOM in bsalg (%d)\n", __LINE__);
+ pr_notice("OOM in bsalg (%d)\n", __LINE__);
return 0;
}
memcpy((*obj)->syntax.c, p, len);
@@ -760,7 +761,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
if (*obj == NULL) {
kfree(id);
if (net_ratelimit())
- printk("OOM in bsalg (%d)\n", __LINE__);
+ pr_notice("OOM in bsalg (%d)\n", __LINE__);
return 0;
}
if (!asn1_null_decode(ctx, end)) {
@@ -781,7 +782,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
kfree(lp);
kfree(id);
if (net_ratelimit())
- printk("OOM in bsalg (%d)\n", __LINE__);
+ pr_notice("OOM in bsalg (%d)\n", __LINE__);
return 0;
}
memcpy((*obj)->syntax.ul, lp, len);
@@ -802,7 +803,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
kfree(p);
kfree(id);
if (net_ratelimit())
- printk("OOM in bsalg (%d)\n", __LINE__);
+ pr_notice("OOM in bsalg (%d)\n", __LINE__);
return 0;
}
memcpy((*obj)->syntax.uc, p, len);
@@ -820,7 +821,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
if (*obj == NULL) {
kfree(id);
if (net_ratelimit())
- printk("OOM in bsalg (%d)\n", __LINE__);
+ pr_notice("OOM in bsalg (%d)\n", __LINE__);
return 0;
}
(*obj)->syntax.ul[0] = ul;
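Every OOM message above stays gated by net_ratelimit() while being demoted from bare printk() to pr_notice(). A userspace sketch of that gate as a simple token bucket; the burst size and one-second window below are invented for illustration and are not the kernel's actual net_ratelimit() parameters:

    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    static bool ratelimit(void)
    {
        static time_t window;
        static int tokens;
        time_t now = time(NULL);

        if (now != window) {           /* new 1s window: refill the burst */
            window = now;
            tokens = 10;
        }
        return tokens-- > 0;
    }

    int main(void)
    {
        for (int i = 0; i < 100; i++)
            if (ratelimit())
                printf("OOM in bsalg (%d)\n", i);   /* at most 10 lines */
        return 0;
    }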
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index 5678e9562c1..95481fee8bd 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -7,6 +7,7 @@
*/
#include <linux/types.h>
#include <linux/icmp.h>
+#include <linux/gfp.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
@@ -97,7 +98,7 @@ nf_nat_fn(unsigned int hooknum,
return NF_ACCEPT;
/* Don't try to NAT if this packet is not conntracked */
- if (ct == &nf_conntrack_untracked)
+ if (nf_ct_is_untracked(ct))
return NF_ACCEPT;
nat = nfct_nat(ct);
@@ -130,16 +131,9 @@ nf_nat_fn(unsigned int hooknum,
if (!nf_nat_initialized(ct, maniptype)) {
unsigned int ret;
- if (hooknum == NF_INET_LOCAL_IN)
- /* LOCAL_IN hook doesn't have a chain! */
- ret = alloc_null_binding(ct, hooknum);
- else
- ret = nf_nat_rule_find(skb, hooknum, in, out,
- ct);
-
- if (ret != NF_ACCEPT) {
+ ret = nf_nat_rule_find(skb, hooknum, in, out, ct);
+ if (ret != NF_ACCEPT)
return ret;
- }
} else
pr_debug("Already setup manip %s for ct %p\n",
maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST",
@@ -293,12 +287,12 @@ static int __init nf_nat_standalone_init(void)
#endif
ret = nf_nat_rule_init();
if (ret < 0) {
- printk("nf_nat_init: can't setup rules.\n");
+ pr_err("nf_nat_init: can't setup rules.\n");
goto cleanup_decode_session;
}
ret = nf_register_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops));
if (ret < 0) {
- printk("nf_nat_init: can't register hooks.\n");
+ pr_err("nf_nat_init: can't register hooks.\n");
goto cleanup_rule_init;
}
return ret;
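With the nat table now valid on LOCAL_IN (see the NAT_VALID_HOOKS hunk in nf_nat_rule.c above), nf_nat_fn() can drop its alloc_null_binding() special case and send every hook through the same rule lookup. A toy sketch of the unified dispatch, with stand-in names rather than the netfilter API:

    #include <stdio.h>

    enum hook { PRE_ROUTING, LOCAL_IN, POST_ROUTING, LOCAL_OUT };

    static int rule_find(enum hook h)    /* stands in for nf_nat_rule_find */
    {
        printf("rule lookup on hook %d\n", h);
        return 0;                        /* NF_ACCEPT */
    }

    int main(void)
    {
        /* the old code special-cased LOCAL_IN with a null binding;
         * the unified path consults the table on every hook. */
        for (enum hook h = PRE_ROUTING; h <= LOCAL_OUT; h++)
            rule_find(h);
        return 0;
    }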
diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c
index b096e81500a..7274a43c7a1 100644
--- a/net/ipv4/netfilter/nf_nat_tftp.c
+++ b/net/ipv4/netfilter/nf_nat_tftp.c
@@ -6,7 +6,6 @@
*/
#include <linux/module.h>
-#include <linux/moduleparam.h>
#include <linux/udp.h>
#include <net/netfilter/nf_nat_helper.h>
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 4f1f337f433..4ae1f203f7c 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -251,6 +251,8 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
+ SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
+ SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
SNMP_MIB_SENTINEL
};
@@ -341,10 +343,12 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2,
sysctl_ip_default_ttl);
+ BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
- seq_printf(seq, " %lu",
- snmp_fold_field((void __percpu **)net->mib.ip_statistics,
- snmp4_ipstats_list[i].entry));
+ seq_printf(seq, " %llu",
+ snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
+ snmp4_ipstats_list[i].entry,
+ offsetof(struct ipstats_mib, syncp)));
icmp_put(seq); /* RFC 2011 compatibility */
icmpmsg_put(seq);
@@ -430,9 +434,10 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
seq_puts(seq, "\nIpExt:");
for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
- seq_printf(seq, " %lu",
- snmp_fold_field((void __percpu **)net->mib.ip_statistics,
- snmp4_ipextstats_list[i].entry));
+ seq_printf(seq, " %llu",
+ snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
+ snmp4_ipextstats_list[i].entry,
+ offsetof(struct ipstats_mib, syncp)));
seq_putc(seq, '\n');
return 0;
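The /proc/net/snmp hunks switch from snmp_fold_field() to snmp_fold_field64(), which sums a 64-bit MIB entry across per-CPU copies and uses the ipstats_mib syncp to reread any copy caught mid-update on 32-bit hosts. A simplified single-threaded sketch of that fold; the seq field stands in for u64_stats_sync and is not the kernel API:

    #include <stdint.h>
    #include <stdio.h>

    #define NCPUS 4

    struct mib {
        uint64_t mibs[2];
        unsigned int seq;          /* even = stable, odd = write in flight */
    };

    static uint64_t fold_field64(const struct mib percpu[NCPUS], int entry)
    {
        uint64_t sum = 0;

        for (int cpu = 0; cpu < NCPUS; cpu++) {
            unsigned int start;
            uint64_t v;

            do {                           /* retry while a write races us */
                start = percpu[cpu].seq;
                v = percpu[cpu].mibs[entry];
            } while (start & 1 || start != percpu[cpu].seq);
            sum += v;
        }
        return sum;
    }

    int main(void)
    {
        struct mib stats[NCPUS] = {
            { { 10, 0 }, 0 }, { { 20, 0 }, 0 },
            { { 30, 0 }, 2 }, { { 40, 0 }, 4 },
        };

        printf("InReceives = %llu\n",
               (unsigned long long)fold_field64(stats, 0));   /* 100 */
        return 0;
    }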
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 542f22fc98b..f2d29735140 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -52,6 +52,7 @@ int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
return ret;
}
+EXPORT_SYMBOL(inet_add_protocol);
/*
* Remove a protocol from the hash tables.
@@ -76,6 +77,4 @@ int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
return ret;
}
-
-EXPORT_SYMBOL(inet_add_protocol);
EXPORT_SYMBOL(inet_del_protocol);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index ce154b47f1d..009a7b2aa1e 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -60,7 +60,6 @@
#include <net/net_namespace.h>
#include <net/dst.h>
#include <net/sock.h>
-#include <linux/gfp.h>
#include <linux/ip.h>
#include <linux/net.h>
#include <net/ip.h>
@@ -291,7 +290,7 @@ static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
{
/* Charge it to the socket. */
- if (sock_queue_rcv_skb(sk, skb) < 0) {
+ if (ip_queue_rcv_skb(sk, skb) < 0) {
kfree_skb(skb);
return NET_RX_DROP;
}
@@ -315,7 +314,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
}
static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
- struct rtable *rt,
+ struct rtable **rtp,
unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
@@ -324,25 +323,27 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
struct sk_buff *skb;
unsigned int iphlen;
int err;
+ struct rtable *rt = *rtp;
- if (length > rt->u.dst.dev->mtu) {
+ if (length > rt->dst.dev->mtu) {
ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
- rt->u.dst.dev->mtu);
+ rt->dst.dev->mtu);
return -EMSGSIZE;
}
if (flags&MSG_PROBE)
goto out;
skb = sock_alloc_send_skb(sk,
- length + LL_ALLOCATED_SPACE(rt->u.dst.dev) + 15,
+ length + LL_ALLOCATED_SPACE(rt->dst.dev) + 15,
flags & MSG_DONTWAIT, &err);
if (skb == NULL)
goto error;
- skb_reserve(skb, LL_RESERVED_SPACE(rt->u.dst.dev));
+ skb_reserve(skb, LL_RESERVED_SPACE(rt->dst.dev));
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
- skb_dst_set(skb, dst_clone(&rt->u.dst));
+ skb_dst_set(skb, &rt->dst);
+ *rtp = NULL;
skb_reset_network_header(skb);
iph = ip_hdr(skb);
@@ -374,7 +375,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
iph->check = 0;
iph->tot_len = htons(length);
if (!iph->id)
- ip_select_ident(iph, &rt->u.dst, NULL);
+ ip_select_ident(iph, &rt->dst, NULL);
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
@@ -382,8 +383,8 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
icmp_out_count(net, ((struct icmphdr *)
skb_transport_header(skb))->type);
- err = NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
- dst_output);
+ err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
+ rt->dst.dev, dst_output);
if (err > 0)
err = net_xmit_errno(err);
if (err)
@@ -577,7 +578,7 @@ back_from_confirm:
if (inet->hdrincl)
err = raw_send_hdrinc(sk, msg->msg_iov, len,
- rt, msg->msg_flags);
+ &rt, msg->msg_flags);
else {
if (!ipc.addr)
@@ -605,7 +606,7 @@ out:
return len;
do_confirm:
- dst_confirm(&rt->u.dst);
+ dst_confirm(&rt->dst);
if (!(msg->msg_flags & MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
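raw_send_hdrinc() now takes struct rtable **rtp and steals the caller's route reference (skb_dst_set(skb, &rt->dst); *rtp = NULL;) instead of cloning it, avoiding an atomic refcount round-trip per packet. A userspace sketch of that ownership transfer, with an invented refcounted type:

    #include <stdio.h>
    #include <stdlib.h>

    struct route { int refcnt; };

    static void route_put(struct route *rt)
    {
        if (rt && --rt->refcnt == 0)
            free(rt);
    }

    /* consumer steals the caller's reference: no get/put pair needed */
    static void send_with(struct route **rtp)
    {
        struct route *rt = *rtp;

        printf("sending via route %p (refcnt %d)\n", (void *)rt, rt->refcnt);
        *rtp = NULL;          /* caller no longer owns it */
        route_put(rt);        /* consumer drops it when done */
    }

    int main(void)
    {
        struct route *rt = malloc(sizeof(*rt));

        rt->refcnt = 1;
        send_with(&rt);
        route_put(rt);        /* safe: rt is NULL and route_put(NULL) is a no-op */
        return 0;
    }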
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a770df2493d..3f56b6e6c6a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -90,6 +90,7 @@
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
+#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
@@ -128,7 +129,6 @@ static int ip_rt_gc_elasticity __read_mostly = 8;
static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly = 256;
-static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
static int rt_chain_length_max __read_mostly = 20;
static struct delayed_work expires_work;
@@ -253,14 +253,12 @@ static unsigned rt_hash_mask __read_mostly;
static unsigned int rt_hash_log __read_mostly;
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
-#define RT_CACHE_STAT_INC(field) \
- (__raw_get_cpu_var(rt_cache_stat).field++)
+#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
- int genid)
+ int genid)
{
- return jhash_3words((__force u32)(__be32)(daddr),
- (__force u32)(__be32)(saddr),
+ return jhash_3words((__force u32)daddr, (__force u32)saddr,
idx, genid)
& rt_hash_mask;
}
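RT_CACHE_STAT_INC is rewritten from open-coded __raw_get_cpu_var() arithmetic to __this_cpu_inc(), which bumps the current CPU's copy in one step. In the userspace sketch below, _Thread_local storage stands in for per-CPU data:

    #include <stdio.h>

    struct rt_cache_stat { unsigned long in_hit, in_slow_tot; };

    static _Thread_local struct rt_cache_stat rt_cache_stat;

    #define RT_CACHE_STAT_INC(field) (rt_cache_stat.field++)

    int main(void)
    {
        RT_CACHE_STAT_INC(in_hit);
        RT_CACHE_STAT_INC(in_hit);
        printf("in_hit = %lu\n", rt_cache_stat.in_hit);   /* 2 */
        return 0;
    }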
@@ -288,10 +286,10 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
rcu_read_lock_bh();
r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
while (r) {
- if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
+ if (dev_net(r->dst.dev) == seq_file_net(seq) &&
r->rt_genid == st->genid)
return r;
- r = rcu_dereference_bh(r->u.dst.rt_next);
+ r = rcu_dereference_bh(r->dst.rt_next);
}
rcu_read_unlock_bh();
}
@@ -303,7 +301,7 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq,
{
struct rt_cache_iter_state *st = seq->private;
- r = r->u.dst.rt_next;
+ r = r->dst.rt_next;
while (!r) {
rcu_read_unlock_bh();
do {
@@ -321,7 +319,7 @@ static struct rtable *rt_cache_get_next(struct seq_file *seq,
{
struct rt_cache_iter_state *st = seq->private;
while ((r = __rt_cache_get_next(seq, r)) != NULL) {
- if (dev_net(r->u.dst.dev) != seq_file_net(seq))
+ if (dev_net(r->dst.dev) != seq_file_net(seq))
continue;
if (r->rt_genid == st->genid)
break;
@@ -377,20 +375,21 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
struct rtable *r = v;
int len;
- seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
- "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
- r->u.dst.dev ? r->u.dst.dev->name : "*",
- (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
- r->rt_flags, atomic_read(&r->u.dst.__refcnt),
- r->u.dst.__use, 0, (unsigned long)r->rt_src,
- (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
- (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
- dst_metric(&r->u.dst, RTAX_WINDOW),
- (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
- dst_metric(&r->u.dst, RTAX_RTTVAR)),
+ seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
+ "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
+ r->dst.dev ? r->dst.dev->name : "*",
+ (__force u32)r->rt_dst,
+ (__force u32)r->rt_gateway,
+ r->rt_flags, atomic_read(&r->dst.__refcnt),
+ r->dst.__use, 0, (__force u32)r->rt_src,
+ (dst_metric(&r->dst, RTAX_ADVMSS) ?
+ (int)dst_metric(&r->dst, RTAX_ADVMSS) + 40 : 0),
+ dst_metric(&r->dst, RTAX_WINDOW),
+ (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
+ dst_metric(&r->dst, RTAX_RTTVAR)),
r->fl.fl4_tos,
- r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
- r->u.dst.hh ? (r->u.dst.hh->hh_output ==
+ r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
+ r->dst.hh ? (r->dst.hh->hh_output ==
dev_queue_xmit) : 0,
r->rt_spec_dst, &len);
@@ -609,13 +608,13 @@ static inline int ip_rt_proc_init(void)
static inline void rt_free(struct rtable *rt)
{
- call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
+ call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
static inline void rt_drop(struct rtable *rt)
{
ip_rt_put(rt);
- call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
+ call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
static inline int rt_fast_clean(struct rtable *rth)
@@ -623,13 +622,13 @@ static inline int rt_fast_clean(struct rtable *rth)
/* Kill broadcast/multicast entries very aggressively, if they
collide in hash table with more useful entries */
return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
- rth->fl.iif && rth->u.dst.rt_next;
+ rth->fl.iif && rth->dst.rt_next;
}
static inline int rt_valuable(struct rtable *rth)
{
return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
- rth->u.dst.expires;
+ rth->dst.expires;
}
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -637,15 +636,15 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t
unsigned long age;
int ret = 0;
- if (atomic_read(&rth->u.dst.__refcnt))
+ if (atomic_read(&rth->dst.__refcnt))
goto out;
ret = 1;
- if (rth->u.dst.expires &&
- time_after_eq(jiffies, rth->u.dst.expires))
+ if (rth->dst.expires &&
+ time_after_eq(jiffies, rth->dst.expires))
goto out;
- age = jiffies - rth->u.dst.lastuse;
+ age = jiffies - rth->dst.lastuse;
ret = 0;
if ((age <= tmo1 && !rt_fast_clean(rth)) ||
(age <= tmo2 && rt_valuable(rth)))
@@ -661,7 +660,7 @@ out: return ret;
*/
static inline u32 rt_score(struct rtable *rt)
{
- u32 score = jiffies - rt->u.dst.lastuse;
+ u32 score = jiffies - rt->dst.lastuse;
score = ~score & ~(3<<30);
@@ -684,30 +683,29 @@ static inline bool rt_caching(const struct net *net)
static inline bool compare_hash_inputs(const struct flowi *fl1,
const struct flowi *fl2)
{
- return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
- (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
+ return ((((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
+ ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
(fl1->iif ^ fl2->iif)) == 0);
}
static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
- return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
- (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
+ return (((__force u32)fl1->nl_u.ip4_u.daddr ^ (__force u32)fl2->nl_u.ip4_u.daddr) |
+ ((__force u32)fl1->nl_u.ip4_u.saddr ^ (__force u32)fl2->nl_u.ip4_u.saddr) |
(fl1->mark ^ fl2->mark) |
- (*(u16 *)&fl1->nl_u.ip4_u.tos ^
- *(u16 *)&fl2->nl_u.ip4_u.tos) |
+ (*(u16 *)&fl1->nl_u.ip4_u.tos ^ *(u16 *)&fl2->nl_u.ip4_u.tos) |
(fl1->oif ^ fl2->oif) |
(fl1->iif ^ fl2->iif)) == 0;
}
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
- return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
+ return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}
static inline int rt_is_expired(struct rtable *rth)
{
- return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
+ return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
/*
@@ -736,7 +734,7 @@ static void rt_do_flush(int process_context)
rth = rt_hash_table[i].chain;
/* defer releasing the head of the list after spin_unlock */
- for (tail = rth; tail; tail = tail->u.dst.rt_next)
+ for (tail = rth; tail; tail = tail->dst.rt_next)
if (!rt_is_expired(tail))
break;
if (rth != tail)
@@ -745,9 +743,9 @@ static void rt_do_flush(int process_context)
/* call rt_free on entries after the tail requiring flush */
prev = &rt_hash_table[i].chain;
for (p = *prev; p; p = next) {
- next = p->u.dst.rt_next;
+ next = p->dst.rt_next;
if (!rt_is_expired(p)) {
- prev = &p->u.dst.rt_next;
+ prev = &p->dst.rt_next;
} else {
*prev = next;
rt_free(p);
@@ -762,7 +760,7 @@ static void rt_do_flush(int process_context)
spin_unlock_bh(rt_hash_lock_addr(i));
for (; rth != tail; rth = next) {
- next = rth->u.dst.rt_next;
+ next = rth->dst.rt_next;
rt_free(rth);
}
}
@@ -793,7 +791,7 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
while (aux != rth) {
if (compare_hash_inputs(&aux->fl, &rth->fl))
return 0;
- aux = aux->u.dst.rt_next;
+ aux = aux->dst.rt_next;
}
return ONE;
}
@@ -833,18 +831,18 @@ static void rt_check_expire(void)
length = 0;
spin_lock_bh(rt_hash_lock_addr(i));
while ((rth = *rthp) != NULL) {
- prefetch(rth->u.dst.rt_next);
+ prefetch(rth->dst.rt_next);
if (rt_is_expired(rth)) {
- *rthp = rth->u.dst.rt_next;
+ *rthp = rth->dst.rt_next;
rt_free(rth);
continue;
}
- if (rth->u.dst.expires) {
+ if (rth->dst.expires) {
/* Entry is expired even if it is in use */
- if (time_before_eq(jiffies, rth->u.dst.expires)) {
+ if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
tmo >>= 1;
- rthp = &rth->u.dst.rt_next;
+ rthp = &rth->dst.rt_next;
/*
* We only count entries on
* a chain with equal hash inputs once
@@ -860,7 +858,7 @@ nofree:
goto nofree;
/* Cleanup aged off entries. */
- *rthp = rth->u.dst.rt_next;
+ *rthp = rth->dst.rt_next;
rt_free(rth);
}
spin_unlock_bh(rt_hash_lock_addr(i));
@@ -918,32 +916,11 @@ void rt_cache_flush_batch(void)
rt_do_flush(!in_softirq());
}
-/*
- * We change rt_genid and let gc do the cleanup
- */
-static void rt_secret_rebuild(unsigned long __net)
-{
- struct net *net = (struct net *)__net;
- rt_cache_invalidate(net);
- mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
-}
-
-static void rt_secret_rebuild_oneshot(struct net *net)
-{
- del_timer_sync(&net->ipv4.rt_secret_timer);
- rt_cache_invalidate(net);
- if (ip_rt_secret_interval)
- mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
-}
-
static void rt_emergency_hash_rebuild(struct net *net)
{
- if (net_ratelimit()) {
+ if (net_ratelimit())
printk(KERN_WARNING "Route hash chain too long!\n");
- printk(KERN_WARNING "Adjust your secret_interval!\n");
- }
-
- rt_secret_rebuild_oneshot(net);
+ rt_cache_invalidate(net);
}
/*
@@ -1022,10 +999,10 @@ static int rt_garbage_collect(struct dst_ops *ops)
if (!rt_is_expired(rth) &&
!rt_may_expire(rth, tmo, expire)) {
tmo >>= 1;
- rthp = &rth->u.dst.rt_next;
+ rthp = &rth->dst.rt_next;
continue;
}
- *rthp = rth->u.dst.rt_next;
+ *rthp = rth->dst.rt_next;
rt_free(rth);
goal--;
}
@@ -1091,13 +1068,13 @@ static int slow_chain_length(const struct rtable *head)
while (rth) {
length += has_noalias(head, rth);
- rth = rth->u.dst.rt_next;
+ rth = rth->dst.rt_next;
}
return length >> FRACT_BITS;
}
static int rt_intern_hash(unsigned hash, struct rtable *rt,
- struct rtable **rp, struct sk_buff *skb)
+ struct rtable **rp, struct sk_buff *skb, int ifindex)
{
struct rtable *rth, **rthp;
unsigned long now;
@@ -1113,7 +1090,7 @@ restart:
candp = NULL;
now = jiffies;
- if (!rt_caching(dev_net(rt->u.dst.dev))) {
+ if (!rt_caching(dev_net(rt->dst.dev))) {
/*
* If we're not caching, just tell the caller we
* were successful and don't touch the route. The
@@ -1131,7 +1108,7 @@ restart:
*/
if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
- int err = arp_bind_neighbour(&rt->u.dst);
+ int err = arp_bind_neighbour(&rt->dst);
if (err) {
if (net_ratelimit())
printk(KERN_WARNING
@@ -1150,19 +1127,19 @@ restart:
spin_lock_bh(rt_hash_lock_addr(hash));
while ((rth = *rthp) != NULL) {
if (rt_is_expired(rth)) {
- *rthp = rth->u.dst.rt_next;
+ *rthp = rth->dst.rt_next;
rt_free(rth);
continue;
}
if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
/* Put it first */
- *rthp = rth->u.dst.rt_next;
+ *rthp = rth->dst.rt_next;
/*
* Since lookup is lockfree, the deletion
* must be visible to another weakly ordered CPU before
* the insertion at the start of the hash chain.
*/
- rcu_assign_pointer(rth->u.dst.rt_next,
+ rcu_assign_pointer(rth->dst.rt_next,
rt_hash_table[hash].chain);
/*
* Since lookup is lockfree, the update writes
@@ -1170,18 +1147,18 @@ restart:
*/
rcu_assign_pointer(rt_hash_table[hash].chain, rth);
- dst_use(&rth->u.dst, now);
+ dst_use(&rth->dst, now);
spin_unlock_bh(rt_hash_lock_addr(hash));
rt_drop(rt);
if (rp)
*rp = rth;
else
- skb_dst_set(skb, &rth->u.dst);
+ skb_dst_set(skb, &rth->dst);
return 0;
}
- if (!atomic_read(&rth->u.dst.__refcnt)) {
+ if (!atomic_read(&rth->dst.__refcnt)) {
u32 score = rt_score(rth);
if (score <= min_score) {
@@ -1193,7 +1170,7 @@ restart:
chain_length++;
- rthp = &rth->u.dst.rt_next;
+ rthp = &rth->dst.rt_next;
}
if (cand) {
@@ -1204,19 +1181,24 @@ restart:
* only 2 entries per bucket. We will see.
*/
if (chain_length > ip_rt_gc_elasticity) {
- *candp = cand->u.dst.rt_next;
+ *candp = cand->dst.rt_next;
rt_free(cand);
}
} else {
if (chain_length > rt_chain_length_max &&
slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
- struct net *net = dev_net(rt->u.dst.dev);
+ struct net *net = dev_net(rt->dst.dev);
int num = ++net->ipv4.current_rt_cache_rebuild_count;
- if (!rt_caching(dev_net(rt->u.dst.dev))) {
+ if (!rt_caching(net)) {
printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
- rt->u.dst.dev->name, num);
+ rt->dst.dev->name, num);
}
- rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
+ rt_emergency_hash_rebuild(net);
+ spin_unlock_bh(rt_hash_lock_addr(hash));
+
+ hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
+ ifindex, rt_genid(net));
+ goto restart;
}
}
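The new tail of this hunk fixes a subtle bug: rt_emergency_hash_rebuild() bumps the cache generation, which feeds rt_hash(), so the function must drop the chain lock, recompute the bucket from the caller-supplied ifindex, and goto restart rather than insert into a now-stale chain. A toy model of that retry, with invented sizes and thresholds:

    #include <stdio.h>

    #define NBUCKETS 8

    static unsigned int genid;
    static int chain_len[NBUCKETS];

    static unsigned int rt_hash(unsigned int key)
    {
        return (key ^ genid * 2654435761u) % NBUCKETS;  /* genid-salted */
    }

    static void invalidate_cache(void)   /* rt_cache_invalidate() analogue */
    {
        genid++;
        for (int i = 0; i < NBUCKETS; i++)
            chain_len[i] = 0;
    }

    static void intern(unsigned int key)
    {
        unsigned int hash = rt_hash(key);

    restart:
        if (chain_len[hash] > 3) {       /* chain too long: rebuild */
            invalidate_cache();
            hash = rt_hash(key);         /* genid changed: recompute */
            goto restart;
        }
        chain_len[hash]++;
    }

    int main(void)
    {
        for (unsigned int k = 0; k < 40; k++)
            intern(k & 7);               /* hammer a few buckets */
        printf("genid after inserts: %u\n", genid);
        return 0;
    }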
@@ -1224,7 +1206,7 @@ restart:
route or unicast forwarding path.
*/
if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
- int err = arp_bind_neighbour(&rt->u.dst);
+ int err = arp_bind_neighbour(&rt->dst);
if (err) {
spin_unlock_bh(rt_hash_lock_addr(hash));
@@ -1255,14 +1237,14 @@ restart:
}
}
- rt->u.dst.rt_next = rt_hash_table[hash].chain;
+ rt->dst.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
- if (rt->u.dst.rt_next) {
+ if (rt->dst.rt_next) {
struct rtable *trt;
printk(KERN_DEBUG "rt_cache @%02x: %pI4",
hash, &rt->rt_dst);
- for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
+ for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
printk(" . %pI4", &trt->rt_dst);
printk("\n");
}
@@ -1280,7 +1262,7 @@ skip_hashing:
if (rp)
*rp = rt;
else
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
return 0;
}
@@ -1342,6 +1324,7 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
ip_select_fb_ident(iph);
}
+EXPORT_SYMBOL(__ip_select_ident);
static void rt_del(unsigned hash, struct rtable *rt)
{
@@ -1352,20 +1335,21 @@ static void rt_del(unsigned hash, struct rtable *rt)
ip_rt_put(rt);
while ((aux = *rthp) != NULL) {
if (aux == rt || rt_is_expired(aux)) {
- *rthp = aux->u.dst.rt_next;
+ *rthp = aux->dst.rt_next;
rt_free(aux);
continue;
}
- rthp = &aux->u.dst.rt_next;
+ rthp = &aux->dst.rt_next;
}
spin_unlock_bh(rt_hash_lock_addr(hash));
}
+/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
__be32 saddr, struct net_device *dev)
{
int i, k;
- struct in_device *in_dev = in_dev_get(dev);
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
struct rtable *rth, **rthp;
__be32 skeys[2] = { saddr, 0 };
int ikeys[2] = { dev->ifindex, 0 };
@@ -1401,7 +1385,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
rthp=&rt_hash_table[hash].chain;
- rcu_read_lock();
while ((rth = rcu_dereference(*rthp)) != NULL) {
struct rtable *rt;
@@ -1410,44 +1393,42 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
rth->fl.oif != ikeys[k] ||
rth->fl.iif != 0 ||
rt_is_expired(rth) ||
- !net_eq(dev_net(rth->u.dst.dev), net)) {
- rthp = &rth->u.dst.rt_next;
+ !net_eq(dev_net(rth->dst.dev), net)) {
+ rthp = &rth->dst.rt_next;
continue;
}
if (rth->rt_dst != daddr ||
rth->rt_src != saddr ||
- rth->u.dst.error ||
+ rth->dst.error ||
rth->rt_gateway != old_gw ||
- rth->u.dst.dev != dev)
+ rth->dst.dev != dev)
break;
- dst_hold(&rth->u.dst);
- rcu_read_unlock();
+ dst_hold(&rth->dst);
rt = dst_alloc(&ipv4_dst_ops);
if (rt == NULL) {
ip_rt_put(rth);
- in_dev_put(in_dev);
return;
}
/* Copy all the information. */
*rt = *rth;
- rt->u.dst.__use = 1;
- atomic_set(&rt->u.dst.__refcnt, 1);
- rt->u.dst.child = NULL;
- if (rt->u.dst.dev)
- dev_hold(rt->u.dst.dev);
+ rt->dst.__use = 1;
+ atomic_set(&rt->dst.__refcnt, 1);
+ rt->dst.child = NULL;
+ if (rt->dst.dev)
+ dev_hold(rt->dst.dev);
if (rt->idev)
in_dev_hold(rt->idev);
- rt->u.dst.obsolete = 0;
- rt->u.dst.lastuse = jiffies;
- rt->u.dst.path = &rt->u.dst;
- rt->u.dst.neighbour = NULL;
- rt->u.dst.hh = NULL;
+ rt->dst.obsolete = -1;
+ rt->dst.lastuse = jiffies;
+ rt->dst.path = &rt->dst;
+ rt->dst.neighbour = NULL;
+ rt->dst.hh = NULL;
#ifdef CONFIG_XFRM
- rt->u.dst.xfrm = NULL;
+ rt->dst.xfrm = NULL;
#endif
rt->rt_genid = rt_genid(net);
rt->rt_flags |= RTCF_REDIRECTED;
@@ -1456,37 +1437,35 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
rt->rt_gateway = new_gw;
/* Redirect received -> path was valid */
- dst_confirm(&rth->u.dst);
+ dst_confirm(&rth->dst);
if (rt->peer)
atomic_inc(&rt->peer->refcnt);
- if (arp_bind_neighbour(&rt->u.dst) ||
- !(rt->u.dst.neighbour->nud_state &
+ if (arp_bind_neighbour(&rt->dst) ||
+ !(rt->dst.neighbour->nud_state &
NUD_VALID)) {
- if (rt->u.dst.neighbour)
- neigh_event_send(rt->u.dst.neighbour, NULL);
+ if (rt->dst.neighbour)
+ neigh_event_send(rt->dst.neighbour, NULL);
ip_rt_put(rth);
rt_drop(rt);
goto do_next;
}
- netevent.old = &rth->u.dst;
- netevent.new = &rt->u.dst;
+ netevent.old = &rth->dst;
+ netevent.new = &rt->dst;
call_netevent_notifiers(NETEVENT_REDIRECT,
&netevent);
rt_del(hash, rth);
- if (!rt_intern_hash(hash, rt, &rt, NULL))
+ if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
ip_rt_put(rt);
goto do_next;
}
- rcu_read_unlock();
do_next:
;
}
}
- in_dev_put(in_dev);
return;
reject_redirect:
@@ -1497,7 +1476,7 @@ reject_redirect:
&old_gw, dev->name, &new_gw,
&saddr, &daddr);
#endif
- in_dev_put(in_dev);
+ ;
}
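ip_rt_redirect() is converted to run entirely inside the caller's RCU read-side section: in_dev_get()/in_dev_put() become __in_dev_get_rcu(), and the internal rcu_read_lock()/unlock() pairs disappear. A userspace sketch of that caller-holds-the-lock discipline, with a pthread rwlock standing in for RCU (link with -lpthread):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t rcu = PTHREAD_RWLOCK_INITIALIZER;

    /* callee: assumes the caller already holds the read side */
    static void handle_redirect(void)
    {
        printf("rewriting cached route under caller's read lock\n");
    }

    int main(void)
    {
        pthread_rwlock_rdlock(&rcu);    /* caller enters the read section */
        handle_redirect();              /* no lock/unlock inside anymore */
        pthread_rwlock_unlock(&rcu);
        return 0;
    }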
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
@@ -1506,11 +1485,12 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
struct dst_entry *ret = dst;
if (rt) {
- if (dst->obsolete) {
+ if (dst->obsolete > 0) {
ip_rt_put(rt);
ret = NULL;
} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
- rt->u.dst.expires) {
+ (rt->dst.expires &&
+ time_after_eq(jiffies, rt->dst.expires))) {
unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
rt->fl.oif,
rt_genid(dev_net(dst->dev)));
@@ -1548,7 +1528,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
int log_martians;
rcu_read_lock();
- in_dev = __in_dev_get_rcu(rt->u.dst.dev);
+ in_dev = __in_dev_get_rcu(rt->dst.dev);
if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
rcu_read_unlock();
return;
@@ -1559,30 +1539,30 @@ void ip_rt_send_redirect(struct sk_buff *skb)
/* No redirected packets during ip_rt_redirect_silence;
* reset the algorithm.
*/
- if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
- rt->u.dst.rate_tokens = 0;
+ if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
+ rt->dst.rate_tokens = 0;
/* Too many ignored redirects; do not send anything
- * set u.dst.rate_last to the last seen redirected packet.
+ * set dst.rate_last to the last seen redirected packet.
*/
- if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
- rt->u.dst.rate_last = jiffies;
+ if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
+ rt->dst.rate_last = jiffies;
return;
}
/* Check for load limit; set rate_last to the latest sent
* redirect.
*/
- if (rt->u.dst.rate_tokens == 0 ||
+ if (rt->dst.rate_tokens == 0 ||
time_after(jiffies,
- (rt->u.dst.rate_last +
- (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
+ (rt->dst.rate_last +
+ (ip_rt_redirect_load << rt->dst.rate_tokens)))) {
icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
- rt->u.dst.rate_last = jiffies;
- ++rt->u.dst.rate_tokens;
+ rt->dst.rate_last = jiffies;
+ ++rt->dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
if (log_martians &&
- rt->u.dst.rate_tokens == ip_rt_redirect_number &&
+ rt->dst.rate_tokens == ip_rt_redirect_number &&
net_ratelimit())
printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
&rt->rt_src, rt->rt_iif,
@@ -1597,7 +1577,7 @@ static int ip_error(struct sk_buff *skb)
unsigned long now;
int code;
- switch (rt->u.dst.error) {
+ switch (rt->dst.error) {
case EINVAL:
default:
goto out;
@@ -1606,7 +1586,7 @@ static int ip_error(struct sk_buff *skb)
break;
case ENETUNREACH:
code = ICMP_NET_UNREACH;
- IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
+ IP_INC_STATS_BH(dev_net(rt->dst.dev),
IPSTATS_MIB_INNOROUTES);
break;
case EACCES:
@@ -1615,12 +1595,12 @@ static int ip_error(struct sk_buff *skb)
}
now = jiffies;
- rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
- if (rt->u.dst.rate_tokens > ip_rt_error_burst)
- rt->u.dst.rate_tokens = ip_rt_error_burst;
- rt->u.dst.rate_last = now;
- if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
- rt->u.dst.rate_tokens -= ip_rt_error_cost;
+ rt->dst.rate_tokens += now - rt->dst.rate_last;
+ if (rt->dst.rate_tokens > ip_rt_error_burst)
+ rt->dst.rate_tokens = ip_rt_error_burst;
+ rt->dst.rate_last = now;
+ if (rt->dst.rate_tokens >= ip_rt_error_cost) {
+ rt->dst.rate_tokens -= ip_rt_error_cost;
icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
}
@@ -1665,7 +1645,7 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
rcu_read_lock();
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
- rth = rcu_dereference(rth->u.dst.rt_next)) {
+ rth = rcu_dereference(rth->dst.rt_next)) {
unsigned short mtu = new_mtu;
if (rth->fl.fl4_dst != daddr ||
@@ -1674,8 +1654,8 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
rth->rt_src != iph->saddr ||
rth->fl.oif != ikeys[k] ||
rth->fl.iif != 0 ||
- dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
- !net_eq(dev_net(rth->u.dst.dev), net) ||
+ dst_metric_locked(&rth->dst, RTAX_MTU) ||
+ !net_eq(dev_net(rth->dst.dev), net) ||
rt_is_expired(rth))
continue;
@@ -1683,22 +1663,22 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
/* BSD 4.2 compatibility hack :-( */
if (mtu == 0 &&
- old_mtu >= dst_mtu(&rth->u.dst) &&
+ old_mtu >= dst_mtu(&rth->dst) &&
old_mtu >= 68 + (iph->ihl << 2))
old_mtu -= iph->ihl << 2;
mtu = guess_mtu(old_mtu);
}
- if (mtu <= dst_mtu(&rth->u.dst)) {
- if (mtu < dst_mtu(&rth->u.dst)) {
- dst_confirm(&rth->u.dst);
+ if (mtu <= dst_mtu(&rth->dst)) {
+ if (mtu < dst_mtu(&rth->dst)) {
+ dst_confirm(&rth->dst);
if (mtu < ip_rt_min_pmtu) {
mtu = ip_rt_min_pmtu;
- rth->u.dst.metrics[RTAX_LOCK-1] |=
+ rth->dst.metrics[RTAX_LOCK-1] |=
(1 << RTAX_MTU);
}
- rth->u.dst.metrics[RTAX_MTU-1] = mtu;
- dst_set_expires(&rth->u.dst,
+ rth->dst.metrics[RTAX_MTU-1] = mtu;
+ dst_set_expires(&rth->dst,
ip_rt_mtu_expires);
}
est_mtu = mtu;
@@ -1726,7 +1706,9 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
- return NULL;
+ if (rt_is_expired((struct rtable *)dst))
+ return NULL;
+ return dst;
}
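ipv4_dst_check() previously returned NULL unconditionally, forcing a relookup on every check; it now reports a cached route as valid until rt_is_expired() says its generation is stale. A minimal sketch of generation-based validation, with invented types:

    #include <stdio.h>

    static unsigned int cache_genid;

    struct dst_entry { unsigned int genid; };

    static struct dst_entry *dst_check(struct dst_entry *dst)
    {
        return dst->genid == cache_genid ? dst : NULL;  /* NULL = relookup */
    }

    int main(void)
    {
        struct dst_entry d = { .genid = cache_genid };

        printf("valid: %s\n", dst_check(&d) ? "yes" : "no");
        cache_genid++;                  /* flush: everything expires at once */
        printf("after flush: %s\n", dst_check(&d) ? "yes" : "no");
        return 0;
    }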
static void ipv4_dst_destroy(struct dst_entry *dst)
@@ -1769,7 +1751,7 @@ static void ipv4_link_failure(struct sk_buff *skb)
rt = skb_rtable(skb);
if (rt)
- dst_set_expires(&rt->u.dst, 0);
+ dst_set_expires(&rt->dst, 0);
}
static int ip_rt_bug(struct sk_buff *skb)
@@ -1797,11 +1779,11 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
if (rt->fl.iif == 0)
src = rt->rt_src;
- else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
+ else if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) {
src = FIB_RES_PREFSRC(res);
fib_res_put(&res);
} else
- src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
+ src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
RT_SCOPE_UNIVERSE);
memcpy(addr, &src, 4);
}
@@ -1809,10 +1791,10 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
- if (!(rt->u.dst.tclassid & 0xFFFF))
- rt->u.dst.tclassid |= tag & 0xFFFF;
- if (!(rt->u.dst.tclassid & 0xFFFF0000))
- rt->u.dst.tclassid |= tag & 0xFFFF0000;
+ if (!(rt->dst.tclassid & 0xFFFF))
+ rt->dst.tclassid |= tag & 0xFFFF;
+ if (!(rt->dst.tclassid & 0xFFFF0000))
+ rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
@@ -1824,30 +1806,30 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
if (FIB_RES_GW(*res) &&
FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
rt->rt_gateway = FIB_RES_GW(*res);
- memcpy(rt->u.dst.metrics, fi->fib_metrics,
- sizeof(rt->u.dst.metrics));
+ memcpy(rt->dst.metrics, fi->fib_metrics,
+ sizeof(rt->dst.metrics));
if (fi->fib_mtu == 0) {
- rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
- if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
+ rt->dst.metrics[RTAX_MTU-1] = rt->dst.dev->mtu;
+ if (dst_metric_locked(&rt->dst, RTAX_MTU) &&
rt->rt_gateway != rt->rt_dst &&
- rt->u.dst.dev->mtu > 576)
- rt->u.dst.metrics[RTAX_MTU-1] = 576;
+ rt->dst.dev->mtu > 576)
+ rt->dst.metrics[RTAX_MTU-1] = 576;
}
#ifdef CONFIG_NET_CLS_ROUTE
- rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
+ rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
} else
- rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
-
- if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
- rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
- if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
- rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
- if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
- rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
+ rt->dst.metrics[RTAX_MTU-1]= rt->dst.dev->mtu;
+
+ if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
+ rt->dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
+ if (dst_mtu(&rt->dst) > IP_MAX_MTU)
+ rt->dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
+ if (dst_metric(&rt->dst, RTAX_ADVMSS) == 0)
+ rt->dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->dst.dev->mtu - 40,
ip_rt_min_advmss);
- if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
- rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
+ if (dst_metric(&rt->dst, RTAX_ADVMSS) > 65535 - 40)
+ rt->dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
@@ -1858,14 +1840,16 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
rt->rt_type = res->type;
}
+/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev, int our)
{
- unsigned hash;
+ unsigned int hash;
struct rtable *rth;
__be32 spec_dst;
- struct in_device *in_dev = in_dev_get(dev);
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
u32 itag = 0;
+ int err;
/* Primary sanity checks. */
@@ -1880,20 +1864,23 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (!ipv4_is_local_multicast(daddr))
goto e_inval;
spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
- } else if (fib_validate_source(saddr, 0, tos, 0,
- dev, &spec_dst, &itag, 0) < 0)
- goto e_inval;
-
+ } else {
+ err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
+ &itag, 0);
+ if (err < 0)
+ goto e_err;
+ }
rth = dst_alloc(&ipv4_dst_ops);
if (!rth)
goto e_nobufs;
- rth->u.dst.output= ip_rt_bug;
+ rth->dst.output = ip_rt_bug;
+ rth->dst.obsolete = -1;
- atomic_set(&rth->u.dst.__refcnt, 1);
- rth->u.dst.flags= DST_HOST;
+ atomic_set(&rth->dst.__refcnt, 1);
+ rth->dst.flags= DST_HOST;
if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->u.dst.flags |= DST_NOPOLICY;
+ rth->dst.flags |= DST_NOPOLICY;
rth->fl.fl4_dst = daddr;
rth->rt_dst = daddr;
rth->fl.fl4_tos = tos;
@@ -1901,13 +1888,13 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
rth->fl.fl4_src = saddr;
rth->rt_src = saddr;
#ifdef CONFIG_NET_CLS_ROUTE
- rth->u.dst.tclassid = itag;
+ rth->dst.tclassid = itag;
#endif
rth->rt_iif =
rth->fl.iif = dev->ifindex;
- rth->u.dst.dev = init_net.loopback_dev;
- dev_hold(rth->u.dst.dev);
- rth->idev = in_dev_get(rth->u.dst.dev);
+ rth->dst.dev = init_net.loopback_dev;
+ dev_hold(rth->dst.dev);
+ rth->idev = in_dev_get(rth->dst.dev);
rth->fl.oif = 0;
rth->rt_gateway = daddr;
rth->rt_spec_dst= spec_dst;
@@ -1915,27 +1902,25 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
rth->rt_flags = RTCF_MULTICAST;
rth->rt_type = RTN_MULTICAST;
if (our) {
- rth->u.dst.input= ip_local_deliver;
+ rth->dst.input= ip_local_deliver;
rth->rt_flags |= RTCF_LOCAL;
}
#ifdef CONFIG_IP_MROUTE
if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
- rth->u.dst.input = ip_mr_input;
+ rth->dst.input = ip_mr_input;
#endif
RT_CACHE_STAT_INC(in_slow_mc);
- in_dev_put(in_dev);
hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
- return rt_intern_hash(hash, rth, NULL, skb);
+ return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
e_nobufs:
- in_dev_put(in_dev);
return -ENOBUFS;
-
e_inval:
- in_dev_put(in_dev);
return -EINVAL;
+e_err:
+ return err;
}
@@ -1969,22 +1954,22 @@ static void ip_handle_martian_source(struct net_device *dev,
#endif
}
+/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
struct fib_result *res,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos,
struct rtable **result)
{
-
struct rtable *rth;
int err;
struct in_device *out_dev;
- unsigned flags = 0;
+ unsigned int flags = 0;
__be32 spec_dst;
u32 itag;
/* get a working reference to the output device */
- out_dev = in_dev_get(FIB_RES_DEV(*res));
+ out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
if (out_dev == NULL) {
if (net_ratelimit())
printk(KERN_CRIT "Bug in ip_route_input" \
@@ -1999,7 +1984,6 @@ static int __mkroute_input(struct sk_buff *skb,
ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
saddr);
- err = -EINVAL;
goto cleanup;
}
@@ -2033,12 +2017,12 @@ static int __mkroute_input(struct sk_buff *skb,
goto cleanup;
}
- atomic_set(&rth->u.dst.__refcnt, 1);
- rth->u.dst.flags= DST_HOST;
+ atomic_set(&rth->dst.__refcnt, 1);
+ rth->dst.flags= DST_HOST;
if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->u.dst.flags |= DST_NOPOLICY;
+ rth->dst.flags |= DST_NOPOLICY;
if (IN_DEV_CONF_GET(out_dev, NOXFRM))
- rth->u.dst.flags |= DST_NOXFRM;
+ rth->dst.flags |= DST_NOXFRM;
rth->fl.fl4_dst = daddr;
rth->rt_dst = daddr;
rth->fl.fl4_tos = tos;
@@ -2048,15 +2032,16 @@ static int __mkroute_input(struct sk_buff *skb,
rth->rt_gateway = daddr;
rth->rt_iif =
rth->fl.iif = in_dev->dev->ifindex;
- rth->u.dst.dev = (out_dev)->dev;
- dev_hold(rth->u.dst.dev);
- rth->idev = in_dev_get(rth->u.dst.dev);
+ rth->dst.dev = (out_dev)->dev;
+ dev_hold(rth->dst.dev);
+ rth->idev = in_dev_get(rth->dst.dev);
rth->fl.oif = 0;
rth->rt_spec_dst= spec_dst;
- rth->u.dst.input = ip_forward;
- rth->u.dst.output = ip_output;
- rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
+ rth->dst.obsolete = -1;
+ rth->dst.input = ip_forward;
+ rth->dst.output = ip_output;
+ rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
rt_set_nexthop(rth, res, itag);
@@ -2065,8 +2050,6 @@ static int __mkroute_input(struct sk_buff *skb,
*result = rth;
err = 0;
cleanup:
- /* release the working reference to the output device */
- in_dev_put(out_dev);
return err;
}
@@ -2092,8 +2075,8 @@ static int ip_mkroute_input(struct sk_buff *skb,
/* put it into the cache */
hash = rt_hash(daddr, saddr, fl->iif,
- rt_genid(dev_net(rth->u.dst.dev)));
- return rt_intern_hash(hash, rth, NULL, skb);
+ rt_genid(dev_net(rth->dst.dev)));
+ return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
}
/*
@@ -2110,7 +2093,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev)
{
struct fib_result res;
- struct in_device *in_dev = in_dev_get(dev);
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
struct flowi fl = { .nl_u = { .ip4_u =
{ .daddr = daddr,
.saddr = saddr,
@@ -2170,13 +2153,12 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
goto brd_input;
if (res.type == RTN_LOCAL) {
- int result;
- result = fib_validate_source(saddr, daddr, tos,
+ err = fib_validate_source(saddr, daddr, tos,
net->loopback_dev->ifindex,
dev, &spec_dst, &itag, skb->mark);
- if (result < 0)
- goto martian_source;
- if (result)
+ if (err < 0)
+ goto martian_source_keep_err;
+ if (err)
flags |= RTCF_DIRECTSRC;
spec_dst = daddr;
goto local_input;
@@ -2189,7 +2171,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
done:
- in_dev_put(in_dev);
if (free_res)
fib_res_put(&res);
out: return err;
@@ -2204,7 +2185,7 @@ brd_input:
err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
&itag, skb->mark);
if (err < 0)
- goto martian_source;
+ goto martian_source_keep_err;
if (err)
flags |= RTCF_DIRECTSRC;
}
@@ -2217,13 +2198,14 @@ local_input:
if (!rth)
goto e_nobufs;
- rth->u.dst.output= ip_rt_bug;
+ rth->dst.output= ip_rt_bug;
+ rth->dst.obsolete = -1;
rth->rt_genid = rt_genid(net);
- atomic_set(&rth->u.dst.__refcnt, 1);
- rth->u.dst.flags= DST_HOST;
+ atomic_set(&rth->dst.__refcnt, 1);
+ rth->dst.flags= DST_HOST;
if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->u.dst.flags |= DST_NOPOLICY;
+ rth->dst.flags |= DST_NOPOLICY;
rth->fl.fl4_dst = daddr;
rth->rt_dst = daddr;
rth->fl.fl4_tos = tos;
@@ -2231,25 +2213,25 @@ local_input:
rth->fl.fl4_src = saddr;
rth->rt_src = saddr;
#ifdef CONFIG_NET_CLS_ROUTE
- rth->u.dst.tclassid = itag;
+ rth->dst.tclassid = itag;
#endif
rth->rt_iif =
rth->fl.iif = dev->ifindex;
- rth->u.dst.dev = net->loopback_dev;
- dev_hold(rth->u.dst.dev);
- rth->idev = in_dev_get(rth->u.dst.dev);
+ rth->dst.dev = net->loopback_dev;
+ dev_hold(rth->dst.dev);
+ rth->idev = in_dev_get(rth->dst.dev);
rth->rt_gateway = daddr;
rth->rt_spec_dst= spec_dst;
- rth->u.dst.input= ip_local_deliver;
+ rth->dst.input= ip_local_deliver;
rth->rt_flags = flags|RTCF_LOCAL;
if (res.type == RTN_UNREACHABLE) {
- rth->u.dst.input= ip_error;
- rth->u.dst.error= -err;
+ rth->dst.input= ip_error;
+ rth->dst.error= -err;
rth->rt_flags &= ~RTCF_LOCAL;
}
rth->rt_type = res.type;
hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
- err = rt_intern_hash(hash, rth, NULL, skb);
+ err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
goto done;
no_route:
@@ -2284,46 +2266,54 @@ e_nobufs:
goto done;
martian_source:
+ err = -EINVAL;
+martian_source_keep_err:
ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
- goto e_inval;
+ goto done;
}
-int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
- u8 tos, struct net_device *dev)
+int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev, bool noref)
{
struct rtable * rth;
unsigned hash;
int iif = dev->ifindex;
struct net *net;
+ int res;
net = dev_net(dev);
+ rcu_read_lock();
+
if (!rt_caching(net))
goto skip_cache;
tos &= IPTOS_RT_MASK;
hash = rt_hash(daddr, saddr, iif, rt_genid(net));
- rcu_read_lock();
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
- rth = rcu_dereference(rth->u.dst.rt_next)) {
- if (((rth->fl.fl4_dst ^ daddr) |
- (rth->fl.fl4_src ^ saddr) |
+ rth = rcu_dereference(rth->dst.rt_next)) {
+ if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
+ ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
(rth->fl.iif ^ iif) |
rth->fl.oif |
(rth->fl.fl4_tos ^ tos)) == 0 &&
rth->fl.mark == skb->mark &&
- net_eq(dev_net(rth->u.dst.dev), net) &&
+ net_eq(dev_net(rth->dst.dev), net) &&
!rt_is_expired(rth)) {
- dst_use(&rth->u.dst, jiffies);
+ if (noref) {
+ dst_use_noref(&rth->dst, jiffies);
+ skb_dst_set_noref(skb, &rth->dst);
+ } else {
+ dst_use(&rth->dst, jiffies);
+ skb_dst_set(skb, &rth->dst);
+ }
RT_CACHE_STAT_INC(in_hit);
rcu_read_unlock();
- skb_dst_set(skb, &rth->u.dst);
return 0;
}
RT_CACHE_STAT_INC(in_hlist_search);
}
- rcu_read_unlock();
skip_cache:
/* Multicast recognition logic is moved from route cache to here.
@@ -2338,12 +2328,11 @@ skip_cache:
route cache entry is created eventually.
*/
if (ipv4_is_multicast(daddr)) {
- struct in_device *in_dev;
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
- rcu_read_lock();
- if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
+ if (in_dev) {
int our = ip_check_mc(in_dev, daddr, saddr,
- ip_hdr(skb)->protocol);
+ ip_hdr(skb)->protocol);
if (our
#ifdef CONFIG_IP_MROUTE
||
@@ -2351,16 +2340,20 @@ skip_cache:
IN_DEV_MFORWARD(in_dev))
#endif
) {
+ int res = ip_route_input_mc(skb, daddr, saddr,
+ tos, dev, our);
rcu_read_unlock();
- return ip_route_input_mc(skb, daddr, saddr,
- tos, dev, our);
+ return res;
}
}
rcu_read_unlock();
return -EINVAL;
}
- return ip_route_input_slow(skb, daddr, saddr, tos, dev);
+ res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
+ rcu_read_unlock();
+ return res;
}
+EXPORT_SYMBOL(ip_route_input_common);
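ip_route_input_common() adds a noref flavour for callers already inside rcu_read_lock(): the fast path refreshes the LRU stamp with dst_use_noref()/skb_dst_set_noref() and skips the atomic refcount that dst_use() pays. A userspace sketch of the split; the route struct and flag below are illustrative, and a presumed ip_route_input() wrapper would pass noref = false to keep the old behaviour:

    #include <stdbool.h>
    #include <stdio.h>

    struct route { int refcnt; long lastuse; };

    static void route_lookup(struct route *rt, long now, bool noref)
    {
        rt->lastuse = now;      /* both paths refresh the LRU stamp */
        if (!noref)
            rt->refcnt++;       /* only long-lived holders pay the atomic */
    }

    int main(void)
    {
        struct route rt = { .refcnt = 1 };

        route_lookup(&rt, 100, true);    /* RCU fast path: refcnt unchanged */
        route_lookup(&rt, 200, false);   /* slow path: takes a reference */
        printf("refcnt = %d, lastuse = %ld\n", rt.refcnt, rt.lastuse);
        return 0;
    }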
static int __mkroute_output(struct rtable **result,
struct fib_result *res,
@@ -2420,12 +2413,12 @@ static int __mkroute_output(struct rtable **result,
goto cleanup;
}
- atomic_set(&rth->u.dst.__refcnt, 1);
- rth->u.dst.flags= DST_HOST;
+ atomic_set(&rth->dst.__refcnt, 1);
+ rth->dst.flags= DST_HOST;
if (IN_DEV_CONF_GET(in_dev, NOXFRM))
- rth->u.dst.flags |= DST_NOXFRM;
+ rth->dst.flags |= DST_NOXFRM;
if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->u.dst.flags |= DST_NOPOLICY;
+ rth->dst.flags |= DST_NOPOLICY;
rth->fl.fl4_dst = oldflp->fl4_dst;
rth->fl.fl4_tos = tos;
@@ -2437,34 +2430,35 @@ static int __mkroute_output(struct rtable **result,
rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
/* get references to the devices that are to be held by the routing
cache entry */
- rth->u.dst.dev = dev_out;
+ rth->dst.dev = dev_out;
dev_hold(dev_out);
rth->idev = in_dev_get(dev_out);
rth->rt_gateway = fl->fl4_dst;
rth->rt_spec_dst= fl->fl4_src;
- rth->u.dst.output=ip_output;
+ rth->dst.output=ip_output;
+ rth->dst.obsolete = -1;
rth->rt_genid = rt_genid(dev_net(dev_out));
RT_CACHE_STAT_INC(out_slow_tot);
if (flags & RTCF_LOCAL) {
- rth->u.dst.input = ip_local_deliver;
+ rth->dst.input = ip_local_deliver;
rth->rt_spec_dst = fl->fl4_dst;
}
if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
rth->rt_spec_dst = fl->fl4_src;
if (flags & RTCF_LOCAL &&
!(dev_out->flags & IFF_LOOPBACK)) {
- rth->u.dst.output = ip_mc_output;
+ rth->dst.output = ip_mc_output;
RT_CACHE_STAT_INC(out_slow_mc);
}
#ifdef CONFIG_IP_MROUTE
if (res->type == RTN_MULTICAST) {
if (IN_DEV_MFORWARD(in_dev) &&
!ipv4_is_local_multicast(oldflp->fl4_dst)) {
- rth->u.dst.input = ip_mr_input;
- rth->u.dst.output = ip_mc_output;
+ rth->dst.input = ip_mr_input;
+ rth->dst.output = ip_mc_output;
}
}
#endif
@@ -2495,7 +2489,7 @@ static int ip_mkroute_output(struct rtable **rp,
if (err == 0) {
hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
rt_genid(dev_net(dev_out)));
- err = rt_intern_hash(hash, rth, rp, NULL);
+ err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
}
return err;
@@ -2719,7 +2713,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
rcu_read_lock_bh();
for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
- rth = rcu_dereference_bh(rth->u.dst.rt_next)) {
+ rth = rcu_dereference_bh(rth->dst.rt_next)) {
if (rth->fl.fl4_dst == flp->fl4_dst &&
rth->fl.fl4_src == flp->fl4_src &&
rth->fl.iif == 0 &&
@@ -2727,9 +2721,9 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
rth->fl.mark == flp->mark &&
!((rth->fl.fl4_tos ^ flp->fl4_tos) &
(IPTOS_RT_MASK | RTO_ONLINK)) &&
- net_eq(dev_net(rth->u.dst.dev), net) &&
+ net_eq(dev_net(rth->dst.dev), net) &&
!rt_is_expired(rth)) {
- dst_use(&rth->u.dst, jiffies);
+ dst_use(&rth->dst, jiffies);
RT_CACHE_STAT_INC(out_hit);
rcu_read_unlock_bh();
*rp = rth;
@@ -2742,7 +2736,6 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
slow_output:
return ip_route_output_slow(net, rp, flp);
}
-
EXPORT_SYMBOL_GPL(__ip_route_output_key);
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -2766,15 +2759,15 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
dst_alloc(&ipv4_dst_blackhole_ops);
if (rt) {
- struct dst_entry *new = &rt->u.dst;
+ struct dst_entry *new = &rt->dst;
atomic_set(&new->__refcnt, 1);
new->__use = 1;
new->input = dst_discard;
new->output = dst_discard;
- memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
+ memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
- new->dev = ort->u.dst.dev;
+ new->dev = ort->dst.dev;
if (new->dev)
dev_hold(new->dev);
@@ -2798,7 +2791,7 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
dst_free(new);
}
- dst_release(&(*rp)->u.dst);
+ dst_release(&(*rp)->dst);
*rp = rt;
return (rt ? 0 : -ENOMEM);
}
@@ -2826,13 +2819,13 @@ int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
return 0;
}
-
EXPORT_SYMBOL_GPL(ip_route_output_flow);
int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
return ip_route_output_flow(net, rp, flp, NULL, 0);
}
+EXPORT_SYMBOL(ip_route_output_key);
static int rt_fill_info(struct net *net,
struct sk_buff *skb, u32 pid, u32 seq, int event,
@@ -2868,11 +2861,11 @@ static int rt_fill_info(struct net *net,
r->rtm_src_len = 32;
NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
}
- if (rt->u.dst.dev)
- NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
+ if (rt->dst.dev)
+ NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
- if (rt->u.dst.tclassid)
- NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
+ if (rt->dst.tclassid)
+ NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
if (rt->fl.iif)
NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
@@ -2882,12 +2875,16 @@ static int rt_fill_info(struct net *net,
if (rt->rt_dst != rt->rt_gateway)
NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
- if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
+ if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
goto nla_put_failure;
- error = rt->u.dst.error;
- expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
+ if (rt->fl.mark)
+ NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
+
+ error = rt->dst.error;
+ expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
if (rt->peer) {
+ inet_peer_refcheck(rt->peer);
id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
if (rt->peer->tcp_ts_stamp) {
ts = rt->peer->tcp_ts;
@@ -2918,7 +2915,7 @@ static int rt_fill_info(struct net *net,
NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
}
- if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
+ if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
expires, error) < 0)
goto nla_put_failure;
@@ -2939,6 +2936,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
__be32 src = 0;
u32 iif;
int err;
+ int mark;
struct sk_buff *skb;
err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
@@ -2966,6 +2964,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
+ mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
if (iif) {
struct net_device *dev;
@@ -2978,13 +2977,14 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
skb->protocol = htons(ETH_P_IP);
skb->dev = dev;
+ skb->mark = mark;
local_bh_disable();
err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
local_bh_enable();
rt = skb_rtable(skb);
- if (err == 0 && rt->u.dst.error)
- err = -rt->u.dst.error;
+ if (err == 0 && rt->dst.error)
+ err = -rt->dst.error;
} else {
struct flowi fl = {
.nl_u = {
@@ -2995,6 +2995,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
},
},
.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
+ .mark = mark,
};
err = ip_route_output_key(net, &rt, &fl);
}
@@ -3002,7 +3003,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
if (err)
goto errout_free;
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_set(skb, &rt->dst);
if (rtm->rtm_flags & RTM_F_NOTIFY)
rt->rt_flags |= RTCF_NOTIFY;
@@ -3038,12 +3039,12 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
continue;
rcu_read_lock_bh();
for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
- rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) {
- if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
+ rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
+ if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
continue;
if (rt_is_expired(rt))
continue;
- skb_dst_set(skb, dst_clone(&rt->u.dst));
+ skb_dst_set_noref(skb, &rt->dst);
if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq, RTM_NEWROUTE,
1, NLM_F_MULTI) <= 0) {
@@ -3089,48 +3090,6 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
return -EINVAL;
}
-static void rt_secret_reschedule(int old)
-{
- struct net *net;
- int new = ip_rt_secret_interval;
- int diff = new - old;
-
- if (!diff)
- return;
-
- rtnl_lock();
- for_each_net(net) {
- int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
- long time;
-
- if (!new)
- continue;
-
- if (deleted) {
- time = net->ipv4.rt_secret_timer.expires - jiffies;
-
- if (time <= 0 || (time += diff) <= 0)
- time = 0;
- } else
- time = new;
-
- mod_timer(&net->ipv4.rt_secret_timer, jiffies + time);
- }
- rtnl_unlock();
-}
-
-static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
-{
- int old = ip_rt_secret_interval;
- int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
-
- rt_secret_reschedule(old);
-
- return ret;
-}
-
static ctl_table ipv4_route_table[] = {
{
.procname = "gc_thresh",
@@ -3239,13 +3198,6 @@ static ctl_table ipv4_route_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
- .procname = "secret_interval",
- .data = &ip_rt_secret_interval,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = ipv4_sysctl_rt_secret_interval,
- },
{ }
};
@@ -3324,34 +3276,15 @@ static __net_initdata struct pernet_operations sysctl_route_ops = {
};
#endif
-
-static __net_init int rt_secret_timer_init(struct net *net)
+static __net_init int rt_genid_init(struct net *net)
{
- atomic_set(&net->ipv4.rt_genid,
- (int) ((num_physpages ^ (num_physpages>>8)) ^
- (jiffies ^ (jiffies >> 7))));
-
- net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
- net->ipv4.rt_secret_timer.data = (unsigned long)net;
- init_timer_deferrable(&net->ipv4.rt_secret_timer);
-
- if (ip_rt_secret_interval) {
- net->ipv4.rt_secret_timer.expires =
- jiffies + net_random() % ip_rt_secret_interval +
- ip_rt_secret_interval;
- add_timer(&net->ipv4.rt_secret_timer);
- }
+ get_random_bytes(&net->ipv4.rt_genid,
+ sizeof(net->ipv4.rt_genid));
return 0;
}
-static __net_exit void rt_secret_timer_exit(struct net *net)
-{
- del_timer_sync(&net->ipv4.rt_secret_timer);
-}
-
-static __net_initdata struct pernet_operations rt_secret_timer_ops = {
- .init = rt_secret_timer_init,
- .exit = rt_secret_timer_exit,
+static __net_initdata struct pernet_operations rt_genid_ops = {
+ .init = rt_genid_init,
};
@@ -3412,9 +3345,6 @@ int __init ip_rt_init(void)
schedule_delayed_work(&expires_work,
net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
- if (register_pernet_subsys(&rt_secret_timer_ops))
- printk(KERN_ERR "Unable to setup rt_secret_timer\n");
-
if (ip_rt_proc_init())
printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
@@ -3426,6 +3356,7 @@ int __init ip_rt_init(void)
#ifdef CONFIG_SYSCTL
register_pernet_subsys(&sysctl_route_ops);
#endif
+ register_pernet_subsys(&rt_genid_ops);
return rc;
}
@@ -3439,7 +3370,3 @@ void __init ip_static_sysctl_init(void)
register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif
-
-EXPORT_SYMBOL(__ip_select_ident);
-EXPORT_SYMBOL(ip_route_input);
-EXPORT_SYMBOL(ip_route_output_key);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 5c24db4a3c9..650cace2180 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -18,8 +18,8 @@
#include <net/tcp.h>
#include <net/route.h>
-/* Timestamps: lowest 9 bits store TCP options */
-#define TSBITS 9
+/* Timestamps: lowest bits store TCP options */
+#define TSBITS 6
#define TSMASK (((__u32)1 << TSBITS) - 1)
extern int sysctl_tcp_syncookies;
@@ -58,7 +58,7 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
/*
* when syncookies are in effect and tcp timestamps are enabled we encode
- * tcp options in the lowest 9 bits of the timestamp value that will be
+ * tcp options in the lower bits of the timestamp value that will be
* sent in the syn-ack.
* Since subsequent timestamps use the normal tcp_time_stamp value, we
* must make sure that the resulting initial timestamp is <= tcp_time_stamp.
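/*
 * Sketch (not from the patch) of the new 6-bit layout used by the encode
 * hunk below; "ts_now" stands in for the current tcp_time_stamp:
 *
 *	options = wscale_ok ? snd_wscale : 0xf;    bits 0-3 (0xf = no wscale)
 *	options |= sack_ok << 4;                   bit 4
 *	options |= ecn_ok << 5;                    bit 5
 *	ts = (ts_now & ~TSMASK) | options;         TSMASK == (1 << 6) - 1
 */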
@@ -70,11 +70,10 @@ __u32 cookie_init_timestamp(struct request_sock *req)
u32 options = 0;
ireq = inet_rsk(req);
- if (ireq->wscale_ok) {
- options = ireq->snd_wscale;
- options |= ireq->rcv_wscale << 4;
- }
- options |= ireq->sack_ok << 8;
+
+ options = ireq->wscale_ok ? ireq->snd_wscale : 0xf;
+ options |= ireq->sack_ok << 4;
+ options |= ireq->ecn_ok << 5;
ts = ts_now & ~TSMASK;
ts |= options;
@@ -138,23 +137,23 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
}
/*
- * This table has to be sorted and terminated with (__u16)-1.
- * XXX generate a better table.
- * Unresolved Issues: HIPPI with a 64k MSS is not well supported.
+ * MSS Values are taken from the 2009 paper
+ * 'Measuring TCP Maximum Segment Size' by S. Alcock and R. Nelson:
+ * - values 1440 to 1460 accounted for 80% of observed mss values
+ * - values outside the 536-1460 range are rare (<0.2%).
+ *
+ * Table must be sorted.
*/
static __u16 const msstab[] = {
- 64 - 1,
- 256 - 1,
- 512 - 1,
- 536 - 1,
- 1024 - 1,
- 1440 - 1,
- 1460 - 1,
- 4312 - 1,
- (__u16)-1
+ 64,
+ 512,
+ 536,
+ 1024,
+ 1440,
+ 1460,
+ 4312,
+ 8960,
};
-/* The number doesn't include the -1 terminator */
-#define NUM_MSS (ARRAY_SIZE(msstab) - 1)
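/*
 * Worked example (not from the patch) for the new lookup loop below: a
 * peer advertising mss = 1400 is mapped to the largest table value that
 * does not exceed it, i.e. 8960, 4312, 1460 and 1440 are rejected and
 * mssind stops at 3, so *mssp = msstab[3] = 1024.
 */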
/*
* Generate a syncookie. mssp points to the mss, which is returned
@@ -169,10 +168,10 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
tcp_synq_overflow(sk);
- /* XXX sort msstab[] by probability? Binary search? */
- for (mssind = 0; mss > msstab[mssind + 1]; mssind++)
- ;
- *mssp = msstab[mssind] + 1;
+ for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--)
+ if (mss >= msstab[mssind])
+ break;
+ *mssp = msstab[mssind];
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
@@ -202,7 +201,7 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
jiffies / (HZ * 60),
COUNTER_TRIES);
- return mssind < NUM_MSS ? msstab[mssind] + 1 : 0;
+ return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
}
static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
@@ -227,26 +226,38 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
* additional tcp options in the timestamp.
* This extracts these options from the timestamp echo.
*
- * The lowest 4 bits are for snd_wscale
- * The next 4 lsb are for rcv_wscale
- * The next lsb is for sack_ok
+ * The lowest 4 bits store snd_wscale.
+ * The next 2 bits indicate SACK and ECN support.
+ *
+ * Return false if we decode an option that should not be present.
*/
-void cookie_check_timestamp(struct tcp_options_received *tcp_opt)
+bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool *ecn_ok)
{
- /* echoed timestamp, 9 lowest bits contain options */
+ /* echoed timestamp, lowest bits contain options */
u32 options = tcp_opt->rcv_tsecr & TSMASK;
- tcp_opt->snd_wscale = options & 0xf;
- options >>= 4;
- tcp_opt->rcv_wscale = options & 0xf;
+ if (!tcp_opt->saw_tstamp) {
+ tcp_clear_options(tcp_opt);
+ return true;
+ }
+
+ if (!sysctl_tcp_timestamps)
+ return false;
tcp_opt->sack_ok = (options >> 4) & 0x1;
+ *ecn_ok = (options >> 5) & 1;
+ if (*ecn_ok && !sysctl_tcp_ecn)
+ return false;
+
+ if (tcp_opt->sack_ok && !sysctl_tcp_sack)
+ return false;
- if (tcp_opt->sack_ok)
- tcp_sack_reset(tcp_opt);
+ if ((options & 0xf) == 0xf)
+ return true; /* no window scaling */
- if (tcp_opt->snd_wscale || tcp_opt->rcv_wscale)
- tcp_opt->wscale_ok = 1;
+ tcp_opt->wscale_ok = 1;
+ tcp_opt->snd_wscale = options & 0xf;
+ return sysctl_tcp_window_scaling != 0;
}
EXPORT_SYMBOL(cookie_check_timestamp);
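/*
 * Caller sketch (an assumption, mirroring cookie_v4_check() below): the
 * new bool return lets callers drop connections whose echoed options no
 * longer match the running sysctls:
 *
 *	bool ecn_ok;
 *
 *	if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
 *		goto out;		cookie options not acceptable
 *	ireq->ecn_ok = ecn_ok;
 */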
@@ -265,8 +276,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
int mss;
struct rtable *rt;
__u8 rcv_wscale;
+ bool ecn_ok;
- if (!sysctl_tcp_syncookies || !th->ack)
+ if (!sysctl_tcp_syncookies || !th->ack || th->rst)
goto out;
if (tcp_synq_no_recent_overflow(sk) ||
@@ -281,8 +293,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
memset(&tcp_opt, 0, sizeof(tcp_opt));
tcp_parse_options(skb, &tcp_opt, &hash_location, 0);
- if (tcp_opt.saw_tstamp)
- cookie_check_timestamp(&tcp_opt);
+ if (!cookie_check_timestamp(&tcp_opt, &ecn_ok))
+ goto out;
ret = NULL;
req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */
@@ -298,9 +310,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
ireq->rmt_port = th->source;
ireq->loc_addr = ip_hdr(skb)->daddr;
ireq->rmt_addr = ip_hdr(skb)->saddr;
- ireq->ecn_ok = 0;
+ ireq->ecn_ok = ecn_ok;
ireq->snd_wscale = tcp_opt.snd_wscale;
- ireq->rcv_wscale = tcp_opt.rcv_wscale;
ireq->sack_ok = tcp_opt.sack_ok;
ireq->wscale_ok = tcp_opt.wscale_ok;
ireq->tstamp_ok = tcp_opt.saw_tstamp;
@@ -347,22 +358,22 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
{ .sport = th->dest,
.dport = th->source } } };
security_req_classify_flow(req, &fl);
- if (ip_route_output_key(&init_net, &rt, &fl)) {
+ if (ip_route_output_key(sock_net(sk), &rt, &fl)) {
reqsk_free(req);
goto out;
}
}
/* Try to redo what tcp_v4_send_synack did. */
- req->window_clamp = tp->window_clamp ? :dst_metric(&rt->u.dst, RTAX_WINDOW);
+ req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
tcp_select_initial_window(tcp_full_space(sk), req->mss,
&req->rcv_wnd, &req->window_clamp,
ireq->wscale_ok, &rcv_wscale,
- dst_metric(&rt->u.dst, RTAX_INITRWND));
+ dst_metric(&rt->dst, RTAX_INITRWND));
ireq->rcv_wscale = rcv_wscale;
- ret = get_cookie_sock(sk, skb, req, &rt->u.dst);
+ ret = get_cookie_sock(sk, skb, req, &rt->dst);
out: return ret;
}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index c1bc074f61b..d96c1da4b17 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -12,6 +12,7 @@
#include <linux/inetdevice.h>
#include <linux/seqlock.h>
#include <linux/init.h>
+#include <linux/slab.h>
#include <net/snmp.h>
#include <net/icmp.h>
#include <net/ip.h>
@@ -298,6 +299,13 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = ipv4_local_port_range,
},
+ {
+ .procname = "ip_local_reserved_ports",
+ .data = NULL, /* initialized in sysctl_ipv4_init */
+ .maxlen = 65536,
+ .mode = 0644,
+ .proc_handler = proc_do_large_bitmap,
+ },
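/*
 * Usage note (not from the patch): proc_do_large_bitmap parses and prints
 * comma-separated port ranges, e.g.
 *
 *	echo "8080,50000-50100" > /proc/sys/net/ipv4/ip_local_reserved_ports
 *
 * .data stays NULL here and is pointed at sysctl_local_reserved_ports in
 * sysctl_ipv4_init() further down.
 */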
#ifdef CONFIG_IP_MULTICAST
{
.procname = "igmp_max_memberships",
@@ -735,6 +743,16 @@ static __net_initdata struct pernet_operations ipv4_sysctl_ops = {
static __init int sysctl_ipv4_init(void)
{
struct ctl_table_header *hdr;
+ struct ctl_table *i;
+
+ for (i = ipv4_table; i->procname; i++) {
+ if (strcmp(i->procname, "ip_local_reserved_ports") == 0) {
+ i->data = sysctl_local_reserved_ports;
+ break;
+ }
+ }
+ if (!i->procname)
+ return -EINVAL;
hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table);
if (hdr == NULL)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5901010fad5..176e11aaea7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -265,6 +265,7 @@
#include <linux/err.h>
#include <linux/crypto.h>
#include <linux/time.h>
+#include <linux/slab.h>
#include <net/icmp.h>
#include <net/tcp.h>
@@ -314,7 +315,6 @@ struct tcp_splice_state {
* is strict, actions are advisory and have some latency.
*/
int tcp_memory_pressure __read_mostly;
-
EXPORT_SYMBOL(tcp_memory_pressure);
void tcp_enter_memory_pressure(struct sock *sk)
@@ -324,7 +324,6 @@ void tcp_enter_memory_pressure(struct sock *sk)
tcp_memory_pressure = 1;
}
}
-
EXPORT_SYMBOL(tcp_enter_memory_pressure);
/* Convert seconds to retransmits based on initial and max timeout */
@@ -377,7 +376,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
struct sock *sk = sock->sk;
struct tcp_sock *tp = tcp_sk(sk);
- sock_poll_wait(file, sk->sk_sleep, wait);
+ sock_poll_wait(file, sk_sleep(sk), wait);
if (sk->sk_state == TCP_LISTEN)
return inet_csk_listen_poll(sk);
@@ -429,7 +428,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (tp->urg_seq == tp->copied_seq &&
!sock_flag(sk, SOCK_URGINLINE) &&
tp->urg_data)
- target--;
+ target++;
/* Potential race condition. If read of tp below will
* escape above sk->sk_state, we can be illegally awaken
@@ -459,6 +458,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
}
return mask;
}
+EXPORT_SYMBOL(tcp_poll);
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
@@ -507,10 +507,11 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
return put_user(answ, (int __user *)arg);
}
+EXPORT_SYMBOL(tcp_ioctl);
static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
- TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+ TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
tp->pushed_seq = tp->write_seq;
}
@@ -526,7 +527,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
skb->csum = 0;
tcb->seq = tcb->end_seq = tp->write_seq;
- tcb->flags = TCPCB_FLAG_ACK;
+ tcb->flags = TCPHDR_ACK;
tcb->sacked = 0;
skb_header_release(skb);
tcp_add_write_queue_tail(sk, skb);
@@ -607,6 +608,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
ssize_t spliced;
int ret;
+ sock_rps_record_flow(sk);
/*
* We can't seek on a socket input
*/
@@ -674,6 +676,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
return ret;
}
+EXPORT_SYMBOL(tcp_splice_read);
struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
{
@@ -814,7 +817,7 @@ new_segment:
skb_shinfo(skb)->gso_segs = 0;
if (!copied)
- TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
+ TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
copied += copy;
poffset += copy;
@@ -855,15 +858,15 @@ out_err:
return sk_stream_error(sk, flags, err);
}
-ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
- size_t size, int flags)
+int tcp_sendpage(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
{
ssize_t res;
- struct sock *sk = sock->sk;
if (!(sk->sk_route_caps & NETIF_F_SG) ||
!(sk->sk_route_caps & NETIF_F_ALL_CSUM))
- return sock_no_sendpage(sock, page, offset, size, flags);
+ return sock_no_sendpage(sk->sk_socket, page, offset, size,
+ flags);
lock_sock(sk);
TCP_CHECK_TIMER(sk);
@@ -872,6 +875,7 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
release_sock(sk);
return res;
}
+EXPORT_SYMBOL(tcp_sendpage);
#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
#define TCP_OFF(sk) (sk->sk_sndmsg_off)
@@ -896,10 +900,9 @@ static inline int select_size(struct sock *sk, int sg)
return tmp;
}
-int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t size)
{
- struct sock *sk = sock->sk;
struct iovec *iov;
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -1060,7 +1063,7 @@ new_segment:
}
if (!copied)
- TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
+ TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
tp->write_seq += copy;
TCP_SKB_CB(skb)->end_seq += copy;
@@ -1120,6 +1123,7 @@ out_err:
release_sock(sk);
return err;
}
+EXPORT_SYMBOL(tcp_sendmsg);
/*
* Handle reading urgent data. BSD has very simple semantics for
@@ -1254,6 +1258,39 @@ static void tcp_prequeue_process(struct sock *sk)
tp->ucopy.memory = 0;
}
+#ifdef CONFIG_NET_DMA
+static void tcp_service_net_dma(struct sock *sk, bool wait)
+{
+ dma_cookie_t done, used;
+ dma_cookie_t last_issued;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tp->ucopy.dma_chan)
+ return;
+
+ last_issued = tp->ucopy.dma_cookie;
+ dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+
+ do {
+ if (dma_async_memcpy_complete(tp->ucopy.dma_chan,
+ last_issued, &done,
+ &used) == DMA_SUCCESS) {
+ /* Safe to free early-copied skbs now */
+ __skb_queue_purge(&sk->sk_async_wait_queue);
+ break;
+ } else {
+ struct sk_buff *skb;
+ while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
+ (dma_async_is_complete(skb->dma_cookie, done,
+ used) == DMA_SUCCESS)) {
+ __skb_dequeue(&sk->sk_async_wait_queue);
+ kfree_skb(skb);
+ }
+ }
+ } while (wait);
+}
+#endif
+
static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
struct sk_buff *skb;
@@ -1335,6 +1372,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
sk_eat_skb(sk, skb, 0);
if (!desc->count)
break;
+ tp->copied_seq = seq;
}
tp->copied_seq = seq;
@@ -1345,6 +1383,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
tcp_cleanup_rbuf(sk, copied);
return copied;
}
+EXPORT_SYMBOL(tcp_read_sock);
/*
* This routine copies from a sock struct into the user buffer.
@@ -1546,6 +1585,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
/* __ Set realtime policy in scheduler __ */
}
+#ifdef CONFIG_NET_DMA
+ if (tp->ucopy.dma_chan)
+ dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+#endif
if (copied >= target) {
/* Do not sleep, just process backlog. */
release_sock(sk);
@@ -1554,6 +1597,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
sk_wait_data(sk, &timeo);
#ifdef CONFIG_NET_DMA
+ tcp_service_net_dma(sk, false); /* Don't block */
tp->ucopy.wakeup = 0;
#endif
@@ -1633,6 +1677,9 @@ do_prequeue:
copied = -EFAULT;
break;
}
+
+ dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+
if ((offset + used) == skb->len)
copied_early = 1;
@@ -1702,27 +1749,9 @@ skip_copy:
}
#ifdef CONFIG_NET_DMA
- if (tp->ucopy.dma_chan) {
- dma_cookie_t done, used;
-
- dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
-
- while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
- tp->ucopy.dma_cookie, &done,
- &used) == DMA_IN_PROGRESS) {
- /* do partial cleanup of sk_async_wait_queue */
- while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
- (dma_async_is_complete(skb->dma_cookie, done,
- used) == DMA_SUCCESS)) {
- __skb_dequeue(&sk->sk_async_wait_queue);
- kfree_skb(skb);
- }
- }
+ tcp_service_net_dma(sk, true); /* Wait for queue to drain */
+ tp->ucopy.dma_chan = NULL;
- /* Safe to free early-copied skbs now */
- __skb_queue_purge(&sk->sk_async_wait_queue);
- tp->ucopy.dma_chan = NULL;
- }
if (tp->ucopy.pinned_list) {
dma_unpin_iovec_pages(tp->ucopy.pinned_list);
tp->ucopy.pinned_list = NULL;
@@ -1749,6 +1778,7 @@ recv_urg:
err = tcp_recv_urg(sk, msg, len, flags);
goto out;
}
+EXPORT_SYMBOL(tcp_recvmsg);
void tcp_set_state(struct sock *sk, int state)
{
@@ -1841,6 +1871,7 @@ void tcp_shutdown(struct sock *sk, int how)
tcp_send_fin(sk);
}
}
+EXPORT_SYMBOL(tcp_shutdown);
void tcp_close(struct sock *sk, long timeout)
{
@@ -1873,6 +1904,10 @@ void tcp_close(struct sock *sk, long timeout)
sk_mem_reclaim(sk);
+	/* If the socket has already been reset (e.g. in tcp_reset()), kill it. */
+ if (sk->sk_state == TCP_CLOSE)
+ goto adjudge_to_death;
+
/* As outlined in RFC 2525, section 2.17, we send a RST here because
* data was lost. To witness the awful effects of the old behavior of
* always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
@@ -2000,6 +2035,7 @@ out:
local_bh_enable();
sock_put(sk);
}
+EXPORT_SYMBOL(tcp_close);
/* These states need RST on ABORT according to RFC793 */
@@ -2073,6 +2109,7 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_error_report(sk);
return err;
}
+EXPORT_SYMBOL(tcp_disconnect);
/*
* Socket option code for TCP.
@@ -2150,6 +2187,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
GFP_KERNEL);
if (cvp == NULL)
return -ENOMEM;
+
+ kref_init(&cvp->kref);
}
lock_sock(sk);
tp->rx_opt.cookie_in_always =
@@ -2164,12 +2203,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
*/
kref_put(&tp->cookie_values->kref,
tcp_cookie_values_release);
- kref_init(&cvp->kref);
- tp->cookie_values = cvp;
} else {
cvp = tp->cookie_values;
}
}
+
if (cvp != NULL) {
cvp->cookie_desired = ctd.tcpct_cookie_desired;
@@ -2183,6 +2221,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
cvp->s_data_desired = ctd.tcpct_s_data_desired;
cvp->s_data_constant = 0; /* false */
}
+
+ tp->cookie_values = cvp;
}
release_sock(sk);
return err;
@@ -2190,7 +2230,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
default:
/* fallthru */
break;
- };
+ }
if (optlen < sizeof(int))
return -EINVAL;
@@ -2273,7 +2313,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
if (sock_flag(sk, SOCK_KEEPOPEN) &&
!((1 << sk->sk_state) &
(TCPF_CLOSE | TCPF_LISTEN))) {
- __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
+ u32 elapsed = keepalive_time_elapsed(tp);
if (tp->keepalive_time > elapsed)
elapsed = tp->keepalive_time - elapsed;
else
@@ -2371,6 +2411,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
optval, optlen);
return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}
+EXPORT_SYMBOL(tcp_setsockopt);
#ifdef CONFIG_COMPAT
int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
@@ -2381,7 +2422,6 @@ int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
optval, optlen);
return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}
-
EXPORT_SYMBOL(compat_tcp_setsockopt);
#endif
@@ -2447,7 +2487,6 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_total_retrans = tp->total_retrans;
}
-
EXPORT_SYMBOL_GPL(tcp_get_info);
static int do_tcp_getsockopt(struct sock *sk, int level,
@@ -2565,6 +2604,12 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return -EFAULT;
return 0;
}
+ case TCP_THIN_LINEAR_TIMEOUTS:
+ val = tp->thin_lto;
+ break;
+ case TCP_THIN_DUPACK:
+ val = tp->thin_dupack;
+ break;
default:
return -ENOPROTOOPT;
}
@@ -2586,6 +2631,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
optval, optlen);
return do_tcp_getsockopt(sk, level, optname, optval, optlen);
}
+EXPORT_SYMBOL(tcp_getsockopt);
#ifdef CONFIG_COMPAT
int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
@@ -2596,7 +2642,6 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
optval, optlen);
return do_tcp_getsockopt(sk, level, optname, optval, optlen);
}
-
EXPORT_SYMBOL(compat_tcp_getsockopt);
#endif
@@ -2696,7 +2741,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
struct tcphdr *th2;
unsigned int len;
unsigned int thlen;
- unsigned int flags;
+ __be32 flags;
unsigned int mss = 1;
unsigned int hlen;
unsigned int off;
@@ -2746,10 +2791,10 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
found:
flush = NAPI_GRO_CB(p)->flush;
- flush |= flags & TCP_FLAG_CWR;
- flush |= (flags ^ tcp_flag_word(th2)) &
- ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH);
- flush |= th->ack_seq ^ th2->ack_seq;
+ flush |= (__force int)(flags & TCP_FLAG_CWR);
+ flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
+ ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
+ flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
for (i = sizeof(*th); i < thlen; i += 4)
flush |= *(u32 *)((u8 *)th + i) ^
*(u32 *)((u8 *)th2 + i);
@@ -2770,8 +2815,9 @@ found:
out_check_final:
flush = len < mss;
- flush |= flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST |
- TCP_FLAG_SYN | TCP_FLAG_FIN);
+ flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
+ TCP_FLAG_RST | TCP_FLAG_SYN |
+ TCP_FLAG_FIN));
if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
pp = head;
@@ -2814,7 +2860,6 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool * __percpu *pool)
if (p->md5_desc.tfm)
crypto_free_hash(p->md5_desc.tfm);
kfree(p);
- p = NULL;
}
}
free_percpu(pool);
@@ -2833,7 +2878,6 @@ void tcp_free_md5sig_pool(void)
if (pool)
__tcp_free_md5sig_pool(pool);
}
-
EXPORT_SYMBOL(tcp_free_md5sig_pool);
static struct tcp_md5sig_pool * __percpu *
@@ -2909,28 +2953,42 @@ retry:
}
return pool;
}
-
EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
-struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu)
+
+/**
+ * tcp_get_md5sig_pool - get md5sig_pool for this user
+ *
+ * We use a percpu structure, so if we succeed, we exit with preemption
+ * and BH disabled, to make sure another thread or softirq handler
+ * won't try to get the same context.
+ */
+struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
{
struct tcp_md5sig_pool * __percpu *p;
- spin_lock_bh(&tcp_md5sig_pool_lock);
+
+ local_bh_disable();
+
+ spin_lock(&tcp_md5sig_pool_lock);
p = tcp_md5sig_pool;
if (p)
tcp_md5sig_users++;
- spin_unlock_bh(&tcp_md5sig_pool_lock);
- return (p ? *per_cpu_ptr(p, cpu) : NULL);
-}
+ spin_unlock(&tcp_md5sig_pool_lock);
-EXPORT_SYMBOL(__tcp_get_md5sig_pool);
+ if (p)
+ return *this_cpu_ptr(p);
-void __tcp_put_md5sig_pool(void)
+ local_bh_enable();
+ return NULL;
+}
+EXPORT_SYMBOL(tcp_get_md5sig_pool);
+
+void tcp_put_md5sig_pool(void)
{
+ local_bh_enable();
tcp_free_md5sig_pool();
}
-
-EXPORT_SYMBOL(__tcp_put_md5sig_pool);
+EXPORT_SYMBOL(tcp_put_md5sig_pool);
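/*
 * Pairing sketch (an assumption, following the comment above): a
 * successful get returns with BH disabled, so it must be matched by a
 * put on the same CPU:
 *
 *	struct tcp_md5sig_pool *hp = tcp_get_md5sig_pool();
 *
 *	if (hp) {
 *		... hash via hp->md5_desc ...
 *		tcp_put_md5sig_pool();	re-enables BH
 *	}
 */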
int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
struct tcphdr *th)
@@ -2946,7 +3004,6 @@ int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
th->check = old_checksum;
return err;
}
-
EXPORT_SYMBOL(tcp_md5_hash_header);
int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
@@ -2959,6 +3016,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
const unsigned head_data_len = skb_headlen(skb) > header_len ?
skb_headlen(skb) - header_len : 0;
const struct skb_shared_info *shi = skb_shinfo(skb);
+ struct sk_buff *frag_iter;
sg_init_table(&sg, 1);
@@ -2973,9 +3031,12 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
return 1;
}
+ skb_walk_frags(skb, frag_iter)
+ if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
+ return 1;
+
return 0;
}
-
EXPORT_SYMBOL(tcp_md5_hash_skb_data);
int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
@@ -2985,7 +3046,6 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
sg_init_one(&sg, key->key, key->keylen);
return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
}
-
EXPORT_SYMBOL(tcp_md5_hash_key);
#endif
@@ -3257,16 +3317,3 @@ void __init tcp_init(void)
tcp_secret_retiring = &tcp_secret_two;
tcp_secret_secondary = &tcp_secret_two;
}
-
-EXPORT_SYMBOL(tcp_close);
-EXPORT_SYMBOL(tcp_disconnect);
-EXPORT_SYMBOL(tcp_getsockopt);
-EXPORT_SYMBOL(tcp_ioctl);
-EXPORT_SYMBOL(tcp_poll);
-EXPORT_SYMBOL(tcp_read_sock);
-EXPORT_SYMBOL(tcp_recvmsg);
-EXPORT_SYMBOL(tcp_sendmsg);
-EXPORT_SYMBOL(tcp_splice_read);
-EXPORT_SYMBOL(tcp_sendpage);
-EXPORT_SYMBOL(tcp_setsockopt);
-EXPORT_SYMBOL(tcp_shutdown);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 6428b342b16..0ec9bd0ae94 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -10,6 +10,7 @@
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/list.h>
+#include <linux/gfp.h>
#include <net/tcp.h>
int sysctl_tcp_max_ssthresh = 0;
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index c209e054a63..377bc934937 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -126,8 +126,8 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
* calculate 2^fract in a <<7 value.
*/
is_slowstart = 1;
- increment = ((1 << ca->rho) * hybla_fraction(rho_fractions))
- - 128;
+ increment = ((1 << min(ca->rho, 16U)) *
+ hybla_fraction(rho_fractions)) - 128;
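		/*
		 * Assumption about intent: rho scales with RTT, and clamping
		 * it to 16 keeps (1 << rho) * fraction well inside u32 range
		 * instead of shifting past the width of the type.
		 */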
} else {
/*
* congestion avoidance
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 788851ca8c5..3c426cb318e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -62,6 +62,7 @@
*/
#include <linux/mm.h>
+#include <linux/slab.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/kernel.h>
@@ -77,10 +78,13 @@ int sysctl_tcp_window_scaling __read_mostly = 1;
int sysctl_tcp_sack __read_mostly = 1;
int sysctl_tcp_fack __read_mostly = 1;
int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
+EXPORT_SYMBOL(sysctl_tcp_reordering);
int sysctl_tcp_ecn __read_mostly = 2;
+EXPORT_SYMBOL(sysctl_tcp_ecn);
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
int sysctl_tcp_adv_win_scale __read_mostly = 2;
+EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
@@ -418,6 +422,7 @@ void tcp_initialize_rcv_mss(struct sock *sk)
inet_csk(sk)->icsk_ack.rcv_mss = hint;
}
+EXPORT_SYMBOL(tcp_initialize_rcv_mss);
/* Receiver "autotuning" code.
*
@@ -2511,6 +2516,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
int err;
unsigned int mss;
+ if (packets == 0)
+ return;
+
WARN_ON(packets > tp->packets_out);
if (tp->lost_skb_hint) {
skb = tp->lost_skb_hint;
@@ -2635,7 +2643,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
if (sk->sk_family == AF_INET) {
printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
msg,
- &inet->daddr, ntohs(inet->dport),
+ &inet->inet_daddr, ntohs(inet->inet_dport),
tp->snd_cwnd, tcp_left_out(tp),
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
@@ -2645,7 +2653,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
struct ipv6_pinfo *np = inet6_sk(sk);
printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
msg,
- &np->daddr, ntohs(inet->dport),
+ &np->daddr, ntohs(inet->inet_dport),
tp->snd_cwnd, tcp_left_out(tp),
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
@@ -2934,6 +2942,7 @@ void tcp_simple_retransmit(struct sock *sk)
}
tcp_xmit_retransmit_queue(sk);
}
+EXPORT_SYMBOL(tcp_simple_retransmit);
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
@@ -3282,7 +3291,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
* connection startup slow start one packet too
* quickly. This is severely frowned upon behavior.
*/
- if (!(scb->flags & TCPCB_FLAG_SYN)) {
+ if (!(scb->flags & TCPHDR_SYN)) {
flag |= FLAG_DATA_ACKED;
} else {
flag |= FLAG_SYN_ACKED;
@@ -3706,7 +3715,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
}
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
- dst_confirm(sk->sk_dst_cache);
+ dst_confirm(__sk_dst_get(sk));
return 1;
@@ -3841,18 +3850,20 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
/* 16-bit multiple */
opt_rx->cookie_plus = opsize;
*hvpp = ptr;
+ break;
default:
/* ignore option */
break;
- };
+ }
break;
- };
+ }
ptr += opsize-2;
length -= opsize;
}
}
}
+EXPORT_SYMBOL(tcp_parse_options);
static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
{
@@ -3926,6 +3937,7 @@ u8 *tcp_parse_md5sig_option(struct tcphdr *th)
}
return NULL;
}
+EXPORT_SYMBOL(tcp_parse_md5sig_option);
#endif
static inline void tcp_store_ts_recent(struct tcp_sock *tp)
@@ -4315,7 +4327,7 @@ static void tcp_ofo_queue(struct sock *sk)
}
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
- SOCK_DEBUG(sk, "ofo packet was already received \n");
+ SOCK_DEBUG(sk, "ofo packet was already received\n");
__skb_unlink(skb, &tp->out_of_order_queue);
__kfree_skb(skb);
continue;
@@ -4363,6 +4375,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
goto drop;
+ skb_dst_drop(skb);
__skb_pull(skb, th->doff * 4);
TCP_ECN_accept_cwr(tp, skb);
@@ -5426,6 +5439,7 @@ discard:
__kfree_skb(skb);
return 0;
}
+EXPORT_SYMBOL(tcp_rcv_established);
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
@@ -5829,7 +5843,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
if (tp->snd_una == tp->write_seq) {
tcp_set_state(sk, TCP_FIN_WAIT2);
sk->sk_shutdown |= SEND_SHUTDOWN;
- dst_confirm(sk->sk_dst_cache);
+ dst_confirm(__sk_dst_get(sk));
if (!sock_flag(sk, SOCK_DEAD))
/* Wake up lingering close() */
@@ -5925,14 +5939,4 @@ discard:
}
return 0;
}
-
-EXPORT_SYMBOL(sysctl_tcp_ecn);
-EXPORT_SYMBOL(sysctl_tcp_reordering);
-EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
-EXPORT_SYMBOL(tcp_parse_options);
-#ifdef CONFIG_TCP_MD5SIG
-EXPORT_SYMBOL(tcp_parse_md5sig_option);
-#endif
-EXPORT_SYMBOL(tcp_rcv_established);
EXPORT_SYMBOL(tcp_rcv_state_process);
-EXPORT_SYMBOL(tcp_initialize_rcv_mss);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 70df40980a8..020766292bb 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -60,6 +60,7 @@
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
+#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
@@ -83,6 +84,7 @@
int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
+EXPORT_SYMBOL(sysctl_tcp_low_latency);
#ifdef CONFIG_TCP_MD5SIG
@@ -99,6 +101,7 @@ struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
#endif
struct inet_hashinfo tcp_hashinfo;
+EXPORT_SYMBOL(tcp_hashinfo);
static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
{
@@ -138,7 +141,6 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
return 0;
}
-
EXPORT_SYMBOL_GPL(tcp_twsk_unique);
/* This will initiate an outgoing connection. */
@@ -203,10 +205,12 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
* TIME-WAIT * and initialize rx_opt.ts_recent from it,
* when trying new connection.
*/
- if (peer != NULL &&
- (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
- tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
- tp->rx_opt.ts_recent = peer->tcp_ts;
+ if (peer) {
+ inet_peer_refcheck(peer);
+ if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
+ tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
+ tp->rx_opt.ts_recent = peer->tcp_ts;
+ }
}
}
@@ -236,7 +240,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
/* OK, now commit destination to socket. */
sk->sk_gso_type = SKB_GSO_TCPV4;
- sk_setup_caps(sk, &rt->u.dst);
+ sk_setup_caps(sk, &rt->dst);
if (!tp->write_seq)
tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
@@ -264,6 +268,7 @@ failure:
inet->inet_dport = 0;
return err;
}
+EXPORT_SYMBOL(tcp_v4_connect);
/*
* This routine does path mtu discovery as defined in RFC1191.
@@ -370,6 +375,11 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
if (sk->sk_state == TCP_CLOSE)
goto out;
+ if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+ goto out;
+ }
+
icsk = inet_csk(sk);
tp = tcp_sk(sk);
seq = ntohl(th->seq);
@@ -513,26 +523,32 @@ out:
sock_put(sk);
}
-/* This routine computes an IPv4 TCP checksum. */
-void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
+static void __tcp_v4_send_check(struct sk_buff *skb,
+ __be32 saddr, __be32 daddr)
{
- struct inet_sock *inet = inet_sk(sk);
struct tcphdr *th = tcp_hdr(skb);
if (skb->ip_summed == CHECKSUM_PARTIAL) {
- th->check = ~tcp_v4_check(len, inet->inet_saddr,
- inet->inet_daddr, 0);
+ th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
skb->csum_start = skb_transport_header(skb) - skb->head;
skb->csum_offset = offsetof(struct tcphdr, check);
} else {
- th->check = tcp_v4_check(len, inet->inet_saddr,
- inet->inet_daddr,
+ th->check = tcp_v4_check(skb->len, saddr, daddr,
csum_partial(th,
th->doff << 2,
skb->csum));
}
}
+/* This routine computes an IPv4 TCP checksum. */
+void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
+}
+EXPORT_SYMBOL(tcp_v4_send_check);
+
int tcp_v4_gso_send_check(struct sk_buff *skb)
{
const struct iphdr *iph;
@@ -545,10 +561,8 @@ int tcp_v4_gso_send_check(struct sk_buff *skb)
th = tcp_hdr(skb);
th->check = 0;
- th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
- skb->csum_start = skb_transport_header(skb) - skb->head;
- skb->csum_offset = offsetof(struct tcphdr, check);
skb->ip_summed = CHECKSUM_PARTIAL;
+ __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
return 0;
}
@@ -757,13 +771,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
skb = tcp_make_synack(sk, dst, req, rvp);
if (skb) {
- struct tcphdr *th = tcp_hdr(skb);
-
- th->check = tcp_v4_check(skb->len,
- ireq->loc_addr,
- ireq->rmt_addr,
- csum_partial(th, skb->len,
- skb->csum));
+ __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
ireq->rmt_addr,
@@ -790,19 +798,20 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
kfree(inet_rsk(req)->opt);
}
-#ifdef CONFIG_SYN_COOKIES
-static void syn_flood_warning(struct sk_buff *skb)
+static void syn_flood_warning(const struct sk_buff *skb)
{
- static unsigned long warntime;
+ const char *msg;
- if (time_after(jiffies, (warntime + HZ * 60))) {
- warntime = jiffies;
- printk(KERN_INFO
- "possible SYN flooding on port %d. Sending cookies.\n",
- ntohs(tcp_hdr(skb)->dest));
- }
-}
+#ifdef CONFIG_SYN_COOKIES
+ if (sysctl_tcp_syncookies)
+ msg = "Sending cookies";
+ else
#endif
+ msg = "Dropping request";
+
+ pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
+ ntohs(tcp_hdr(skb)->dest), msg);
+}
/*
* Save and compile IPv4 options into the request_sock if needed.
@@ -854,7 +863,6 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
{
return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
}
-
EXPORT_SYMBOL(tcp_v4_md5_lookup);
static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
@@ -888,7 +896,7 @@ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
kfree(newkey);
return -ENOMEM;
}
- sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+ sk_nocaps_add(sk, NETIF_F_GSO_MASK);
}
if (tcp_alloc_md5sig_pool(sk) == NULL) {
kfree(newkey);
@@ -921,7 +929,6 @@ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
}
return 0;
}
-
EXPORT_SYMBOL(tcp_v4_md5_do_add);
static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
@@ -959,7 +966,6 @@ int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
}
return -ENOENT;
}
-
EXPORT_SYMBOL(tcp_v4_md5_do_del);
static void tcp_v4_clear_md5_list(struct sock *sk)
@@ -1018,7 +1024,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
return -EINVAL;
tp->md5sig_info = p;
- sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+ sk_nocaps_add(sk, NETIF_F_GSO_MASK);
}
newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
@@ -1132,7 +1138,6 @@ clear_hash_noput:
memset(md5_hash, 0, 16);
return 1;
}
-
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
@@ -1240,6 +1245,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
* evidently real one.
*/
if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
+ if (net_ratelimit())
+ syn_flood_warning(skb);
#ifdef CONFIG_SYN_COOKIES
if (sysctl_tcp_syncookies) {
want_cookie = 1;
@@ -1283,8 +1290,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
goto drop_and_release;
/* Secret recipe starts with IP addresses */
- *mess++ ^= daddr;
- *mess++ ^= saddr;
+ *mess++ ^= (__force u32)daddr;
+ *mess++ ^= (__force u32)saddr;
/* plus variable length Initiator Cookie */
c = (u8 *)mess;
@@ -1320,15 +1327,12 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
if (security_inet_conn_request(sk, skb, req))
goto drop_and_free;
- if (!want_cookie)
+ if (!want_cookie || tmp_opt.tstamp_ok)
TCP_ECN_create_request(req, tcp_hdr(skb));
if (want_cookie) {
-#ifdef CONFIG_SYN_COOKIES
- syn_flood_warning(skb);
- req->cookie_ts = tmp_opt.tstamp_ok;
-#endif
isn = cookie_v4_init_sequence(sk, skb, &req->mss);
+ req->cookie_ts = tmp_opt.tstamp_ok;
} else if (!isn) {
struct inet_peer *peer = NULL;
@@ -1346,6 +1350,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
(dst = inet_csk_route_req(sk, req)) != NULL &&
(peer = rt_get_peer((struct rtable *)dst)) != NULL &&
peer->v4daddr == saddr) {
+ inet_peer_refcheck(peer);
if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
(s32)(peer->tcp_ts - req->ts_recent) >
TCP_PAWS_WINDOW) {
@@ -1390,6 +1395,7 @@ drop_and_free:
drop:
return 0;
}
+EXPORT_SYMBOL(tcp_v4_conn_request);
/*
@@ -1459,7 +1465,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
if (newkey != NULL)
tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
newkey, key->keylen);
- newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+ sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
}
#endif
@@ -1475,6 +1481,7 @@ exit:
dst_release(dst);
return NULL;
}
+EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
@@ -1501,7 +1508,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
}
#ifdef CONFIG_SYN_COOKIES
- if (!th->rst && !th->syn && th->ack)
+ if (!th->syn)
sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
return sk;
@@ -1552,6 +1559,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
#endif
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
+ sock_rps_save_rxhash(sk, skb->rxhash);
TCP_CHECK_TIMER(sk);
if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
rsk = sk;
@@ -1576,7 +1584,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
}
return 0;
}
- }
+ } else
+ sock_rps_save_rxhash(sk, skb->rxhash);
+
TCP_CHECK_TIMER(sk);
if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
@@ -1601,6 +1611,7 @@ csum_err:
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
+EXPORT_SYMBOL(tcp_v4_do_rcv);
/*
* From tcp_input.c
@@ -1787,6 +1798,7 @@ int tcp_v4_remember_stamp(struct sock *sk)
return 0;
}
+EXPORT_SYMBOL(tcp_v4_remember_stamp);
int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
{
@@ -1826,6 +1838,7 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
.compat_getsockopt = compat_ip_getsockopt,
#endif
};
+EXPORT_SYMBOL(ipv4_specific);
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
@@ -1954,7 +1967,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
percpu_counter_dec(&tcp_sockets_allocated);
}
-
EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
@@ -1972,6 +1984,11 @@ static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}
+/*
+ * Get the next listener socket after cur. If cur is NULL, get the first
+ * socket starting from the bucket given in st->bucket; when st->bucket is
+ * zero the very first socket in the hash table is returned.
+ */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
struct inet_connection_sock *icsk;
@@ -1982,14 +1999,15 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
struct net *net = seq_file_net(seq);
if (!sk) {
- st->bucket = 0;
- ilb = &tcp_hashinfo.listening_hash[0];
+ ilb = &tcp_hashinfo.listening_hash[st->bucket];
spin_lock_bh(&ilb->lock);
sk = sk_nulls_head(&ilb->head);
+ st->offset = 0;
goto get_sk;
}
ilb = &tcp_hashinfo.listening_hash[st->bucket];
++st->num;
+ ++st->offset;
if (st->state == TCP_SEQ_STATE_OPENREQ) {
struct request_sock *req = cur;
@@ -2004,6 +2022,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
}
req = req->dl_next;
}
+ st->offset = 0;
if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
break;
get_req:
@@ -2039,6 +2058,7 @@ start_req:
read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
}
spin_unlock_bh(&ilb->lock);
+ st->offset = 0;
if (++st->bucket < INET_LHTABLE_SIZE) {
ilb = &tcp_hashinfo.listening_hash[st->bucket];
spin_lock_bh(&ilb->lock);
@@ -2052,7 +2072,12 @@ out:
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
- void *rc = listening_get_next(seq, NULL);
+ struct tcp_iter_state *st = seq->private;
+ void *rc;
+
+ st->bucket = 0;
+ st->offset = 0;
+ rc = listening_get_next(seq, NULL);
while (rc && *pos) {
rc = listening_get_next(seq, rc);
@@ -2067,13 +2092,18 @@ static inline int empty_bucket(struct tcp_iter_state *st)
hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
}
+/*
+ * Get the first established socket starting from the bucket given in st->bucket.
+ * If st->bucket is zero, the very first socket in the hash is returned.
+ */
static void *established_get_first(struct seq_file *seq)
{
struct tcp_iter_state *st = seq->private;
struct net *net = seq_file_net(seq);
void *rc = NULL;
- for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
+ st->offset = 0;
+ for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
struct sock *sk;
struct hlist_nulls_node *node;
struct inet_timewait_sock *tw;
@@ -2118,6 +2148,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
struct net *net = seq_file_net(seq);
++st->num;
+ ++st->offset;
if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
tw = cur;
@@ -2134,6 +2165,7 @@ get_tw:
st->state = TCP_SEQ_STATE_ESTABLISHED;
/* Look for next non empty bucket */
+ st->offset = 0;
while (++st->bucket <= tcp_hashinfo.ehash_mask &&
empty_bucket(st))
;
@@ -2161,7 +2193,11 @@ out:
static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
- void *rc = established_get_first(seq);
+ struct tcp_iter_state *st = seq->private;
+ void *rc;
+
+ st->bucket = 0;
+ rc = established_get_first(seq);
while (rc && pos) {
rc = established_get_next(seq, rc);
@@ -2186,24 +2222,72 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
return rc;
}
+static void *tcp_seek_last_pos(struct seq_file *seq)
+{
+ struct tcp_iter_state *st = seq->private;
+ int offset = st->offset;
+ int orig_num = st->num;
+ void *rc = NULL;
+
+ switch (st->state) {
+ case TCP_SEQ_STATE_OPENREQ:
+ case TCP_SEQ_STATE_LISTENING:
+ if (st->bucket >= INET_LHTABLE_SIZE)
+ break;
+ st->state = TCP_SEQ_STATE_LISTENING;
+ rc = listening_get_next(seq, NULL);
+ while (offset-- && rc)
+ rc = listening_get_next(seq, rc);
+ if (rc)
+ break;
+ st->bucket = 0;
+ /* Fallthrough */
+ case TCP_SEQ_STATE_ESTABLISHED:
+ case TCP_SEQ_STATE_TIME_WAIT:
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
+ if (st->bucket > tcp_hashinfo.ehash_mask)
+ break;
+ rc = established_get_first(seq);
+ while (offset-- && rc)
+ rc = established_get_next(seq, rc);
+ }
+
+ st->num = orig_num;
+
+ return rc;
+}
+
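/*
 * Resume sketch (an assumption about the intent of last_pos): on a
 * sequential read, tcp_seq_start() below sees *pos == st->last_pos and
 * calls tcp_seek_last_pos(), which re-walks only the saved bucket from
 * st->offset rather than iterating st->num entries from the first bucket,
 * avoiding quadratic cost when /proc/net/tcp is read in small chunks.
 */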
static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
struct tcp_iter_state *st = seq->private;
+ void *rc;
+
+ if (*pos && *pos == st->last_pos) {
+ rc = tcp_seek_last_pos(seq);
+ if (rc)
+ goto out;
+ }
+
st->state = TCP_SEQ_STATE_LISTENING;
st->num = 0;
- return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+ st->bucket = 0;
+ st->offset = 0;
+ rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+
+out:
+ st->last_pos = *pos;
+ return rc;
}
static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
+ struct tcp_iter_state *st = seq->private;
void *rc = NULL;
- struct tcp_iter_state *st;
if (v == SEQ_START_TOKEN) {
rc = tcp_get_idx(seq, 0);
goto out;
}
- st = seq->private;
switch (st->state) {
case TCP_SEQ_STATE_OPENREQ:
@@ -2211,6 +2295,8 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
rc = listening_get_next(seq, v);
if (!rc) {
st->state = TCP_SEQ_STATE_ESTABLISHED;
+ st->bucket = 0;
+ st->offset = 0;
rc = established_get_first(seq);
}
break;
@@ -2221,6 +2307,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
out:
++*pos;
+ st->last_pos = *pos;
return rc;
}
@@ -2259,6 +2346,7 @@ static int tcp_seq_open(struct inode *inode, struct file *file)
s = ((struct seq_file *)file->private_data)->private;
s->family = afinfo->family;
+ s->last_pos = 0;
return 0;
}
@@ -2282,11 +2370,13 @@ int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
rc = -ENOMEM;
return rc;
}
+EXPORT_SYMBOL(tcp_proc_register);
void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
proc_net_remove(net, afinfo->name);
}
+EXPORT_SYMBOL(tcp_proc_unregister);
static void get_openreq4(struct sock *sk, struct request_sock *req,
struct seq_file *f, int i, int uid, int *len)
@@ -2510,6 +2600,8 @@ struct proto tcp_prot = {
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.recvmsg = tcp_recvmsg,
+ .sendmsg = tcp_sendmsg,
+ .sendpage = tcp_sendpage,
.backlog_rcv = tcp_v4_do_rcv,
.hash = inet_hash,
.unhash = inet_unhash,
@@ -2528,11 +2620,13 @@ struct proto tcp_prot = {
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
+ .no_autobind = true,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
};
+EXPORT_SYMBOL(tcp_prot);
static int __net_init tcp_sk_init(struct net *net)
@@ -2563,20 +2657,3 @@ void __init tcp_v4_init(void)
if (register_pernet_subsys(&tcp_sk_ops))
panic("Failed to create the TCP control socket.\n");
}
-
-EXPORT_SYMBOL(ipv4_specific);
-EXPORT_SYMBOL(tcp_hashinfo);
-EXPORT_SYMBOL(tcp_prot);
-EXPORT_SYMBOL(tcp_v4_conn_request);
-EXPORT_SYMBOL(tcp_v4_connect);
-EXPORT_SYMBOL(tcp_v4_do_rcv);
-EXPORT_SYMBOL(tcp_v4_remember_stamp);
-EXPORT_SYMBOL(tcp_v4_send_check);
-EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
-
-#ifdef CONFIG_PROC_FS
-EXPORT_SYMBOL(tcp_proc_register);
-EXPORT_SYMBOL(tcp_proc_unregister);
-#endif
-EXPORT_SYMBOL(sysctl_tcp_low_latency);
-
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 4199bc6915c..f25b56cb85c 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -20,6 +20,7 @@
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/workqueue.h>
#include <net/tcp.h>
@@ -46,7 +47,6 @@ struct inet_timewait_death_row tcp_death_row = {
.twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
(unsigned long)&tcp_death_row),
};
-
EXPORT_SYMBOL_GPL(tcp_death_row);
static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
@@ -261,6 +261,7 @@ kill:
inet_twsk_put(tw);
return TCP_TW_SUCCESS;
}
+EXPORT_SYMBOL(tcp_timewait_state_process);
/*
* Move a socket to time-wait or dead fin-wait-2 state.
@@ -361,7 +362,6 @@ void tcp_twsk_destructor(struct sock *sk)
tcp_free_md5sig_pool();
#endif
}
-
EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
@@ -509,6 +509,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
}
return newsk;
}
+EXPORT_SYMBOL(tcp_create_openreq_child);
/*
* Process an incoming packet for SYN_RECV sockets represented
@@ -671,6 +672,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
inet_rsk(req)->acked = 1;
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
return NULL;
}
@@ -704,6 +706,7 @@ embryonic_reset:
inet_csk_reqsk_queue_drop(sk, req, prev);
return NULL;
}
+EXPORT_SYMBOL(tcp_check_req);
/*
* Queue segment on the new socket if the new socket is active,
@@ -735,8 +738,4 @@ int tcp_child_process(struct sock *parent, struct sock *child,
sock_put(child);
return ret;
}
-
-EXPORT_SYMBOL(tcp_check_req);
EXPORT_SYMBOL(tcp_child_process);
-EXPORT_SYMBOL(tcp_create_openreq_child);
-EXPORT_SYMBOL(tcp_timewait_state_process);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f181b78f238..de3bd845858 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -37,6 +37,7 @@
#include <net/tcp.h>
#include <linux/compiler.h>
+#include <linux/gfp.h>
#include <linux/module.h>
/* People can turn this off for buggy TCP's found in printers etc. */
@@ -246,6 +247,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
/* Set the clamp no higher than max representable value */
(*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
}
+EXPORT_SYMBOL(tcp_select_initial_window);
/* Chose a new window to advertise, update state in tcp_sock for the
* socket, and return result with RFC1323 scaling applied. The return
@@ -293,9 +295,9 @@ static u16 tcp_select_window(struct sock *sk)
/* Packet ECN state for a SYN-ACK */
static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb)
{
- TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR;
+ TCP_SKB_CB(skb)->flags &= ~TCPHDR_CWR;
if (!(tp->ecn_flags & TCP_ECN_OK))
- TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE;
+ TCP_SKB_CB(skb)->flags &= ~TCPHDR_ECE;
}
/* Packet ECN state for a SYN. */
@@ -305,7 +307,7 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
tp->ecn_flags = 0;
if (sysctl_tcp_ecn == 1) {
- TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE | TCPCB_FLAG_CWR;
+ TCP_SKB_CB(skb)->flags |= TCPHDR_ECE | TCPHDR_CWR;
tp->ecn_flags = TCP_ECN_OK;
}
}
@@ -349,6 +351,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
*/
static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
{
+ skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum = 0;
TCP_SKB_CB(skb)->flags = flags;
@@ -359,7 +362,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
skb_shinfo(skb)->gso_type = 0;
TCP_SKB_CB(skb)->seq = seq;
- if (flags & (TCPCB_FLAG_SYN | TCPCB_FLAG_FIN))
+ if (flags & (TCPHDR_SYN | TCPHDR_FIN))
seq++;
TCP_SKB_CB(skb)->end_seq = seq;
}
@@ -666,7 +669,6 @@ static unsigned tcp_synack_options(struct sock *sk,
u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ?
xvp->cookie_plus :
0;
- bool doing_ts = ireq->tstamp_ok;
#ifdef CONFIG_TCP_MD5SIG
*md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
@@ -679,7 +681,7 @@ static unsigned tcp_synack_options(struct sock *sk,
* rather than TS in order to fit in better with old,
* buggy kernels, but that was deemed to be unnecessary.
*/
- doing_ts &= !ireq->sack_ok;
+ ireq->tstamp_ok &= !ireq->sack_ok;
}
#else
*md5 = NULL;
@@ -694,7 +696,7 @@ static unsigned tcp_synack_options(struct sock *sk,
opts->options |= OPTION_WSCALE;
remaining -= TCPOLEN_WSCALE_ALIGNED;
}
- if (likely(doing_ts)) {
+ if (likely(ireq->tstamp_ok)) {
opts->options |= OPTION_TS;
opts->tsval = TCP_SKB_CB(skb)->when;
opts->tsecr = req->ts_recent;
@@ -702,7 +704,7 @@ static unsigned tcp_synack_options(struct sock *sk,
}
if (likely(ireq->sack_ok)) {
opts->options |= OPTION_SACK_ADVERTISE;
- if (unlikely(!doing_ts))
+ if (unlikely(!ireq->tstamp_ok))
remaining -= TCPOLEN_SACKPERM_ALIGNED;
}
@@ -710,7 +712,7 @@ static unsigned tcp_synack_options(struct sock *sk,
* If the <SYN> options fit, the same options should fit now!
*/
if (*md5 == NULL &&
- doing_ts &&
+ ireq->tstamp_ok &&
cookie_plus > TCPOLEN_COOKIE_BASE) {
int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */
@@ -819,7 +821,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
tcb = TCP_SKB_CB(skb);
memset(&opts, 0, sizeof(opts));
- if (unlikely(tcb->flags & TCPCB_FLAG_SYN))
+ if (unlikely(tcb->flags & TCPHDR_SYN))
tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
else
tcp_options_size = tcp_established_options(sk, skb, &opts,
@@ -842,7 +844,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
*(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
tcb->flags);
- if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
+ if (unlikely(tcb->flags & TCPHDR_SYN)) {
/* RFC1323: The window in SYN & SYN/ACK segments
* is never scaled.
*/
@@ -859,36 +861,37 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
th->urg_ptr = htons(tp->snd_up - tcb->seq);
th->urg = 1;
} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
- th->urg_ptr = 0xFFFF;
+ th->urg_ptr = htons(0xFFFF);
th->urg = 1;
}
}
tcp_options_write((__be32 *)(th + 1), tp, &opts);
- if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0))
+ if (likely((tcb->flags & TCPHDR_SYN) == 0))
TCP_ECN_send(sk, skb, tcp_header_size);
#ifdef CONFIG_TCP_MD5SIG
/* Calculate the MD5 hash, as we have all we need now */
if (md5) {
- sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+ sk_nocaps_add(sk, NETIF_F_GSO_MASK);
tp->af_specific->calc_md5_hash(opts.hash_location,
md5, sk, NULL, skb);
}
#endif
- icsk->icsk_af_ops->send_check(sk, skb->len, skb);
+ icsk->icsk_af_ops->send_check(sk, skb);
- if (likely(tcb->flags & TCPCB_FLAG_ACK))
+ if (likely(tcb->flags & TCPHDR_ACK))
tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
if (skb->len != tcp_header_size)
tcp_event_data_sent(tp, skb, sk);
if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
- TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
+ TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
+ tcp_skb_pcount(skb));
- err = icsk->icsk_af_ops->queue_xmit(skb, 0);
+ err = icsk->icsk_af_ops->queue_xmit(skb);
if (likely(err <= 0))
return err;
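
Several small correctness fixes land in this hunk: th->urg_ptr is a __be16, so the all-ones sentinel is now written through htons() (a no-op on the wire for 0xFFFF, but it satisfies sparse's endianness checking); the MD5 path disables GSO via sk_nocaps_add(), which also records the mask so later route-capability updates cannot re-enable it; send_check() drops its redundant length argument; and TCP_MIB_OUTSEGS is bumped by tcp_skb_pcount(skb) rather than 1, since a GSO skb leaves the stack as several wire segments. The pcount helper is roughly:

	/* From include/net/tcp.h: number of wire segments this skb
	 * will become once GSO has split it. */
	static inline int tcp_skb_pcount(const struct sk_buff *skb)
	{
		return skb_shinfo(skb)->gso_segs;
	}
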
@@ -1021,7 +1024,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
/* PSH and FIN should only be set in the second packet. */
flags = TCP_SKB_CB(skb)->flags;
- TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
+ TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
TCP_SKB_CB(buff)->flags = flags;
TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
@@ -1187,6 +1190,7 @@ void tcp_mtup_init(struct sock *sk)
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
icsk->icsk_mtup.probe_size = 0;
}
+EXPORT_SYMBOL(tcp_mtup_init);
/* This function synchronize snd mss to current pmtu/exthdr set.
@@ -1230,6 +1234,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
return mss_now;
}
+EXPORT_SYMBOL(tcp_sync_mss);
/* Compute the current effective MSS, taking SACKs and IP options,
* and even PMTU discovery events into account.
@@ -1326,8 +1331,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
u32 in_flight, cwnd;
/* Don't be strict about the congestion window for the final FIN. */
- if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
- tcp_skb_pcount(skb) == 1)
+ if ((TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && tcp_skb_pcount(skb) == 1)
return 1;
in_flight = tcp_packets_in_flight(tp);
@@ -1396,7 +1400,7 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
* Nagle can be ignored during F-RTO too (see RFC4138).
*/
if (tcp_urg_mode(tp) || (tp->frto_counter == 2) ||
- (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
+ (TCP_SKB_CB(skb)->flags & TCPHDR_FIN))
return 1;
if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
@@ -1459,7 +1463,7 @@ int tcp_may_send_now(struct sock *sk)
* packet has never been sent out before (and thus is not cloned).
*/
static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
- unsigned int mss_now)
+ unsigned int mss_now, gfp_t gfp)
{
struct sk_buff *buff;
int nlen = skb->len - len;
@@ -1469,7 +1473,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
if (skb->len != skb->data_len)
return tcp_fragment(sk, skb, len, mss_now);
- buff = sk_stream_alloc_skb(sk, 0, GFP_ATOMIC);
+ buff = sk_stream_alloc_skb(sk, 0, gfp);
if (unlikely(buff == NULL))
return -ENOMEM;
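
tso_fragment() now takes the allocation context from its caller instead of hard-coding GFP_ATOMIC, so transmits initiated from process context can use less constrained flags. A hedged sketch of how the context is threaded down (the exact call sites vary across this series):

	/* Softirq/timer paths must not sleep: */
	tcp_write_xmit(sk, mss_now, nonagle, 0, GFP_ATOMIC);

	/* The sendmsg push path may use the socket's allocation mode: */
	tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
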
@@ -1485,7 +1489,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
/* PSH and FIN should only be set in the second packet. */
flags = TCP_SKB_CB(skb)->flags;
- TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
+ TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
TCP_SKB_CB(buff)->flags = flags;
/* This packet was never sent out yet, so no SACK bits. */
@@ -1516,7 +1520,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
const struct inet_connection_sock *icsk = inet_csk(sk);
u32 send_win, cong_win, limit, in_flight;
- if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
+ if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)
goto send_now;
if (icsk->icsk_ca_state != TCP_CA_Open)
@@ -1642,7 +1646,7 @@ static int tcp_mtu_probe(struct sock *sk)
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
- TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
+ TCP_SKB_CB(nskb)->flags = TCPHDR_ACK;
TCP_SKB_CB(nskb)->sacked = 0;
nskb->csum = 0;
nskb->ip_summed = skb->ip_summed;
@@ -1667,7 +1671,7 @@ static int tcp_mtu_probe(struct sock *sk)
sk_wmem_free_skb(sk, skb);
} else {
TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
- ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
+ ~(TCPHDR_FIN|TCPHDR_PSH);
if (!skb_shinfo(skb)->nr_frags) {
skb_pull(skb, copy);
if (skb->ip_summed != CHECKSUM_PARTIAL)
@@ -1767,7 +1771,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
cwnd_quota);
if (skb->len > limit &&
- unlikely(tso_fragment(sk, skb, limit, mss_now)))
+ unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
@@ -2018,7 +2022,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
if (!sysctl_tcp_retrans_collapse)
return;
- if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)
+ if (TCP_SKB_CB(skb)->flags & TCPHDR_SYN)
return;
tcp_for_write_queue_from_safe(skb, tmp, sk) {
@@ -2110,7 +2114,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
* since it is cheap to do so and saves bytes on the network.
*/
if (skb->len > 0 &&
- (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
+ (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) &&
tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
if (!pskb_trim(skb, 0)) {
/* Reuse, even though it does some unnecessary work */
@@ -2206,6 +2210,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
int mib_idx;
int fwd_rexmitting = 0;
+ if (!tp->packets_out)
+ return;
+
if (!tp->lost_out)
tp->retransmit_high = tp->snd_una;
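
The new early return makes tcp_xmit_retransmit_queue() safe to call when nothing is in flight: with tp->packets_out == 0 the write queue may be empty, and the retransmit walk below starts from a queue head it assumes to exist. Sketched:

	if (!tp->packets_out)
		return;		/* nothing in flight, queue may be empty */
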
@@ -2299,7 +2306,7 @@ void tcp_send_fin(struct sock *sk)
mss_now = tcp_current_mss(sk);
if (tcp_send_head(sk) != NULL) {
- TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
+ TCP_SKB_CB(skb)->flags |= TCPHDR_FIN;
TCP_SKB_CB(skb)->end_seq++;
tp->write_seq++;
} else {
@@ -2316,7 +2323,7 @@ void tcp_send_fin(struct sock *sk)
skb_reserve(skb, MAX_TCP_HEADER);
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
tcp_init_nondata_skb(skb, tp->write_seq,
- TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
+ TCPHDR_ACK | TCPHDR_FIN);
tcp_queue_skb(sk, skb);
}
__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
@@ -2341,7 +2348,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
/* Reserve space for headers and prepare control bits. */
skb_reserve(skb, MAX_TCP_HEADER);
tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
- TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
+ TCPHDR_ACK | TCPHDR_RST);
/* Send it off. */
TCP_SKB_CB(skb)->when = tcp_time_stamp;
if (tcp_transmit_skb(sk, skb, 0, priority))
@@ -2361,11 +2368,11 @@ int tcp_send_synack(struct sock *sk)
struct sk_buff *skb;
skb = tcp_write_queue_head(sk);
- if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) {
+ if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPHDR_SYN)) {
printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
return -EFAULT;
}
- if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_ACK)) {
+ if (!(TCP_SKB_CB(skb)->flags & TCPHDR_ACK)) {
if (skb_cloned(skb)) {
struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
if (nskb == NULL)
@@ -2379,7 +2386,7 @@ int tcp_send_synack(struct sock *sk)
skb = nskb;
}
- TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
+ TCP_SKB_CB(skb)->flags |= TCPHDR_ACK;
TCP_ECN_send_synack(tcp_sk(sk), skb);
}
TCP_SKB_CB(skb)->when = tcp_time_stamp;
@@ -2458,7 +2465,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
* not even correctly set)
*/
tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
- TCPCB_FLAG_SYN | TCPCB_FLAG_ACK);
+ TCPHDR_SYN | TCPHDR_ACK);
if (OPTION_COOKIE_EXTENSION & opts.options) {
if (s_data_desired) {
@@ -2483,7 +2490,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
*tail-- ^= TCP_SKB_CB(skb)->seq + 1;
/* recommended */
- *tail-- ^= ((th->dest << 16) | th->source);
+ *tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source);
*tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */
sha_transform((__u32 *)&xvp->cookie_bakery[0],
@@ -2501,7 +2508,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
th->window = htons(min(req->rcv_wnd, 65535U));
tcp_options_write((__be32 *)(th + 1), tp, &opts);
th->doff = (tcp_header_size >> 2);
- TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
+ TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb));
#ifdef CONFIG_TCP_MD5SIG
/* Okay, we have all we need - do the md5 hash if needed */
@@ -2513,6 +2520,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
return skb;
}
+EXPORT_SYMBOL(tcp_make_synack);
/* Do all connect socket setups that can be done AF independent. */
static void tcp_connect_init(struct sock *sk)
@@ -2590,7 +2598,7 @@ int tcp_connect(struct sock *sk)
skb_reserve(buff, MAX_TCP_HEADER);
tp->snd_nxt = tp->write_seq;
- tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN);
+ tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
TCP_ECN_send_syn(sk, buff);
/* Send it off. */
@@ -2615,6 +2623,7 @@ int tcp_connect(struct sock *sk)
inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
return 0;
}
+EXPORT_SYMBOL(tcp_connect);
/* Send out a delayed ack, the caller does the policy checking
* to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
@@ -2696,7 +2705,7 @@ void tcp_send_ack(struct sock *sk)
/* Reserve space for headers and prepare control bits. */
skb_reserve(buff, MAX_TCP_HEADER);
- tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPCB_FLAG_ACK);
+ tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
/* Send it off, this clears delayed acks for us. */
TCP_SKB_CB(buff)->when = tcp_time_stamp;
@@ -2730,7 +2739,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
* end to send an ack. Don't queue or clone SKB, just
* send it.
*/
- tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPCB_FLAG_ACK);
+ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
TCP_SKB_CB(skb)->when = tcp_time_stamp;
return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
}
@@ -2760,13 +2769,13 @@ int tcp_write_wakeup(struct sock *sk)
if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
skb->len > mss) {
seg_size = min(seg_size, mss);
- TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+ TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
if (tcp_fragment(sk, skb, seg_size, mss))
return -1;
} else if (!tcp_skb_pcount(skb))
tcp_set_skb_tso_segs(sk, skb, mss);
- TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+ TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
TCP_SKB_CB(skb)->when = tcp_time_stamp;
err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
if (!err)
@@ -2819,10 +2828,3 @@ void tcp_send_probe0(struct sock *sk)
TCP_RTO_MAX);
}
}
-
-EXPORT_SYMBOL(tcp_select_initial_window);
-EXPORT_SYMBOL(tcp_connect);
-EXPORT_SYMBOL(tcp_make_synack);
-EXPORT_SYMBOL(tcp_simple_retransmit);
-EXPORT_SYMBOL(tcp_sync_mss);
-EXPORT_SYMBOL(tcp_mtup_init);
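
The block of exports at the bottom of the file is dissolved; current kernel style places each EXPORT_SYMBOL() immediately after the function it exports, which keeps the export visible at the definition and avoids stale entries when functions move between files. Illustrated with a hypothetical helper:

	int my_helper(int x)		/* hypothetical example */
	{
		return x + 1;
	}
	EXPORT_SYMBOL(my_helper);
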
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 9bc805df95d..f8efada580e 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -22,6 +22,7 @@
#include <linux/kprobes.h>
#include <linux/socket.h>
#include <linux/tcp.h>
+#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/module.h>
#include <linux/ktime.h>
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b2e6bbccaee..808bb920c9f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -19,6 +19,7 @@
*/
#include <linux/module.h>
+#include <linux/gfp.h>
#include <net/tcp.h>
int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
@@ -40,7 +41,6 @@ void tcp_init_xmit_timers(struct sock *sk)
inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
&tcp_keepalive_timer);
}
-
EXPORT_SYMBOL(tcp_init_xmit_timers);
static void tcp_write_err(struct sock *sk)
@@ -171,14 +171,14 @@ static int tcp_write_timeout(struct sock *sk)
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
if (icsk->icsk_retransmits)
- dst_negative_advice(&sk->sk_dst_cache, sk);
+ dst_negative_advice(sk);
retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
} else {
if (retransmits_timed_out(sk, sysctl_tcp_retries1)) {
/* Black hole detection */
tcp_mtu_probing(icsk, sk);
- dst_negative_advice(&sk->sk_dst_cache, sk);
+ dst_negative_advice(sk);
}
retry_until = sysctl_tcp_retries2;
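
dst_negative_advice() now takes the socket itself rather than a pointer to its dst slot: sk->sk_dst_cache became RCU-protected in this series, so the helper must replace the entry with the proper primitives instead of letting callers poke the field directly. Roughly, per include/net/sock.h here (a sketch, not the verbatim source):

	static inline void dst_negative_advice(struct sock *sk)
	{
		struct dst_entry *ndst, *dst = __sk_dst_get(sk);

		if (dst && dst->ops->negative_advice) {
			ndst = dst->ops->negative_advice(dst);
			if (ndst != dst) {
				rcu_assign_pointer(sk->sk_dst_cache, ndst);
				sk_reset_txq(sk);
			}
		}
	}
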
@@ -516,7 +516,7 @@ static void tcp_keepalive_timer (unsigned long data)
struct sock *sk = (struct sock *) data;
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- __u32 elapsed;
+ u32 elapsed;
/* Only process if socket is not in use. */
bh_lock_sock(sk);
@@ -553,7 +553,7 @@ static void tcp_keepalive_timer (unsigned long data)
if (tp->packets_out || tcp_send_head(sk))
goto resched;
- elapsed = tcp_time_stamp - tp->rcv_tstamp;
+ elapsed = keepalive_time_elapsed(tp);
if (elapsed >= keepalive_time_when(tp)) {
if (icsk->icsk_probes_out >= keepalive_probes(tp)) {
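
keepalive_time_elapsed() replaces the open-coded tcp_time_stamp - tp->rcv_tstamp so that idle time also accounts for the last ACK arrival; a connection receiving only ACKs (a pure sender, say) previously looked idle and could be probed or reset too early. A sketch of the helper, assumed to match include/net/tcp.h in this series:

	static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp)
	{
		const struct inet_connection_sock *icsk = &tp->inet_conn;

		return min_t(u32, tcp_time_stamp - icsk->icsk_ack.lrcvtime,
				  tcp_time_stamp - tp->rcv_tstamp);
	}
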
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index 3959e0ca456..59186ca7808 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -8,6 +8,7 @@
#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
+#include <linux/slab.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -47,7 +48,6 @@ err:
return ret;
}
-
EXPORT_SYMBOL(xfrm4_tunnel_register);
int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
@@ -71,7 +71,6 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
return ret;
}
-
EXPORT_SYMBOL(xfrm4_tunnel_deregister);
static int tunnel4_rcv(struct sk_buff *skb)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7af756d0f93..32e0bef60d0 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -95,6 +95,7 @@
#include <linux/mm.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
+#include <linux/slab.h>
#include <net/tcp_states.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
@@ -232,7 +233,8 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
*/
do {
if (low <= snum && snum <= high &&
- !test_bit(snum >> udptable->log, bitmap))
+ !test_bit(snum >> udptable->log, bitmap) &&
+ !inet_is_reserved_local_port(snum))
goto found;
snum += rand;
} while (snum != first);
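
The extra test keeps port autobinding away from administratively reserved ports (the net.ipv4.ip_local_reserved_ports sysctl); an explicit bind to a reserved port is still permitted, since the check only guards the search loop. The helper is essentially a bitmap test (include/net/ip.h):

	static inline int inet_is_reserved_local_port(int port)
	{
		return test_bit(port, sysctl_local_reserved_ports);
	}
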
@@ -306,13 +308,13 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr,
unsigned int port)
{
- return jhash_1word(saddr, net_hash_mix(net)) ^ port;
+ return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port;
}
int udp_v4_get_port(struct sock *sk, unsigned short snum)
{
unsigned int hash2_nulladdr =
- udp4_portaddr_hash(sock_net(sk), INADDR_ANY, snum);
+ udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
unsigned int hash2_partial =
udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
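
Two sparse fixes here: jhash_1word() takes a host-order u32 while saddr is __be32, so the reinterpretation is annotated with (__force u32); the hash only needs insert and lookup to feed it the same representation, not a particular byte order. Likewise INADDR_ANY is a host-order constant and is converted with htonl() before use as a __be32 hash input (a no-op for the all-zero value, but it keeps the types honest):

	static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr,
					       unsigned int port)
	{
		return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port;
	}
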
@@ -465,14 +467,14 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
daddr, hnum, dif,
hslot2, slot2);
if (!result) {
- hash2 = udp4_portaddr_hash(net, INADDR_ANY, hnum);
+ hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
slot2 = hash2 & udptable->mask;
hslot2 = &udptable->hash2[slot2];
if (hslot->count < hslot2->count)
goto begin;
- result = udp4_lib_lookup2(net, INADDR_ANY, sport,
- daddr, hnum, dif,
+ result = udp4_lib_lookup2(net, saddr, sport,
+ htonl(INADDR_ANY), hnum, dif,
hslot2, slot2);
}
rcu_read_unlock();
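
The second-pass lookup had its arguments transposed: the fallback is meant to consider sockets bound to the wildcard address, so both the hash and the compare key must use INADDR_ANY as the destination, while the packet's real source address is still passed through so connected sockets score correctly. The fixed call shape:

	/* fall back to sockets bound to 0.0.0.0:port */
	hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
	result = udp4_lib_lookup2(net, saddr, sport,
				  htonl(INADDR_ANY), hnum, dif,
				  hslot2, slot2);
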
@@ -631,9 +633,9 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
if (!inet->recverr) {
if (!harderr || sk->sk_state != TCP_ESTABLISHED)
goto out;
- } else {
+ } else
ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1));
- }
+
sk->sk_err = err;
sk->sk_error_report(sk);
out:
@@ -912,7 +914,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
!sock_flag(sk, SOCK_BROADCAST))
goto out;
if (connected)
- sk_dst_set(sk, dst_clone(&rt->u.dst));
+ sk_dst_set(sk, dst_clone(&rt->dst));
}
if (msg->msg_flags&MSG_CONFIRM)
@@ -976,7 +978,7 @@ out:
return err;
do_confirm:
- dst_confirm(&rt->u.dst);
+ dst_confirm(&rt->dst);
if (!(msg->msg_flags&MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
@@ -1061,10 +1063,11 @@ static unsigned int first_packet_length(struct sock *sk)
spin_unlock_bh(&rcvq->lock);
if (!skb_queue_empty(&list_kill)) {
- lock_sock(sk);
+ bool slow = lock_sock_fast(sk);
+
__skb_queue_purge(&list_kill);
sk_mem_reclaim_partial(sk);
- release_sock(sk);
+ unlock_sock_fast(sk, slow);
}
return res;
}
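
lock_sock_fast() is the new lightweight variant of lock_sock(): when no other context owns the socket and the backlog is empty, it keeps only the spinlock and returns false; otherwise it degrades to the full slow path and returns true, and unlock_sock_fast() releases whichever was taken. The pattern used throughout this file:

	bool slow = lock_sock_fast(sk);
	/* short, non-sleeping critical section */
	unlock_sock_fast(sk, slow);

This avoids the backlog-processing overhead of release_sock() on hot paths such as the checksum-error drop below.
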
@@ -1121,6 +1124,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
int peeked;
int err;
int is_udplite = IS_UDPLITE(sk);
+ bool slow;
/*
* Check any passed addresses
@@ -1195,10 +1199,10 @@ out:
return err;
csum_copy_err:
- lock_sock(sk);
+ slow = lock_sock_fast(sk);
if (!skb_kill_datagram(sk, skb, flags))
UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
- release_sock(sk);
+ unlock_sock_fast(sk, slow);
if (noblock)
return -EAGAIN;
@@ -1216,6 +1220,7 @@ int udp_disconnect(struct sock *sk, int flags)
sk->sk_state = TCP_CLOSE;
inet->inet_daddr = 0;
inet->inet_dport = 0;
+ sock_rps_save_rxhash(sk, 0);
sk->sk_bound_dev_if = 0;
if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
inet_reset_saddr(sk);
@@ -1257,8 +1262,12 @@ EXPORT_SYMBOL(udp_lib_unhash);
static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
- int rc = sock_queue_rcv_skb(sk, skb);
+ int rc;
+
+ if (inet_sk(sk)->inet_daddr)
+ sock_rps_save_rxhash(sk, skb->rxhash);
+ rc = ip_queue_rcv_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
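
For connected UDP sockets the receive path now records the packet's receive hash so Receive Packet Steering can steer subsequent packets of the flow to the same CPU; udp_disconnect() clears it again (the sock_rps_save_rxhash(sk, 0) above) so a stale flow hash does not outlive the association. Roughly, the helper is (a sketch, compiled away without CONFIG_RPS):

	static inline void sock_rps_save_rxhash(struct sock *sk, u32 rxhash)
	{
	#ifdef CONFIG_RPS
		if (unlikely(sk->sk_rxhash != rxhash)) {
			sock_rps_reset_flow(sk);
			sk->sk_rxhash = rxhash;
		}
	#endif
	}
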
@@ -1366,6 +1375,10 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
goto drop;
}
+
+ if (sk_rcvqueues_full(sk, skb))
+ goto drop;
+
rc = 0;
bh_lock_sock(sk);
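
sk_rcvqueues_full() lets the caller drop before taking the socket lock once the receive queue plus backlog already exceed the receive buffer, rather than queueing to the backlog and discarding later. Approximately, per include/net/sock.h in this series:

	static inline bool sk_rcvqueues_full(const struct sock *sk,
					     const struct sk_buff *skb)
	{
		unsigned int qsize = sk->sk_backlog.len +
				     atomic_read(&sk->sk_rmem_alloc);

		return qsize > sk->sk_rcvbuf;
	}
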
@@ -1526,6 +1539,9 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
uh = udp_hdr(skb);
ulen = ntohs(uh->len);
+ saddr = ip_hdr(skb)->saddr;
+ daddr = ip_hdr(skb)->daddr;
+
if (ulen > skb->len)
goto short_packet;
@@ -1539,9 +1555,6 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
if (udp4_csum_init(skb, uh, proto))
goto csum_error;
- saddr = ip_hdr(skb)->saddr;
- daddr = ip_hdr(skb)->daddr;
-
if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
return __udp4_lib_mcast_deliver(net, skb, uh,
saddr, daddr, udptable);
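
Reading saddr/daddr is hoisted above the length and checksum validation because the short_packet and csum_error labels further down (outside this hunk) print both addresses; with the old placement, an early goto reached those printks with the variables uninitialized. The shape after the fix:

	saddr = ip_hdr(skb)->saddr;
	daddr = ip_hdr(skb)->daddr;

	if (ulen > skb->len)
		goto short_packet;	/* logs saddr/daddr */
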
@@ -1614,9 +1627,9 @@ int udp_rcv(struct sk_buff *skb)
void udp_destroy_sock(struct sock *sk)
{
- lock_sock(sk);
+ bool slow = lock_sock_fast(sk);
udp_flush_pending_frames(sk);
- release_sock(sk);
+ unlock_sock_fast(sk, slow);
}
/*
@@ -1675,8 +1688,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
return -ENOPROTOOPT;
if (val != 0 && val < 8) /* Illegal coverage: use default (8) */
val = 8;
- else if (val > USHORT_MAX)
- val = USHORT_MAX;
+ else if (val > USHRT_MAX)
+ val = USHRT_MAX;
up->pcslen = val;
up->pcflag |= UDPLITE_SEND_CC;
break;
@@ -1689,8 +1702,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
return -ENOPROTOOPT;
if (val != 0 && val < 8) /* Avoid silly minimal values. */
val = 8;
- else if (val > USHORT_MAX)
- val = USHORT_MAX;
+ else if (val > USHRT_MAX)
+ val = USHRT_MAX;
up->pcrlen = val;
up->pcflag |= UDPLITE_RECV_CC;
break;
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 6610bf76369..ab76aa928fa 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -58,6 +58,7 @@ struct proto udplite_prot = {
.compat_getsockopt = compat_udp_getsockopt,
#endif
};
+EXPORT_SYMBOL(udplite_prot);
static struct inet_protosw udplite4_protosw = {
.type = SOCK_DGRAM,
@@ -127,5 +128,3 @@ out_unregister_proto:
out_register_err:
printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__);
}
-
-EXPORT_SYMBOL(udplite_prot);
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index f9f922a0ba8..06814b6216d 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -9,6 +9,7 @@
*
*/
+#include <linux/slab.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/netfilter.h>
@@ -26,8 +27,8 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
if (skb_dst(skb) == NULL) {
const struct iphdr *iph = ip_hdr(skb);
- if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
- skb->dev))
+ if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
+ iph->tos, skb->dev))
goto drop;
}
return dst_input(skb);
@@ -60,7 +61,7 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
iph->tot_len = htons(skb->len);
ip_send_check(iph);
- NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, skb->dev, NULL,
+ NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, skb->dev, NULL,
xfrm4_rcv_encap_finish);
return 0;
}
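
NF_HOOK() callers switch from the socket address-family constants to netfilter's own NFPROTO_* namespace. For IPv4 the value is numerically identical to PF_INET, but the namespaces diverge elsewhere (NFPROTO_ARP is 3, which in socket terms is AF_AX25), so mixing them is a latent bug. The relevant constants from linux/netfilter.h:

	enum {
		NFPROTO_UNSPEC =  0,
		NFPROTO_IPV4   =  2,
		NFPROTO_ARP    =  3,
		NFPROTO_BRIDGE =  7,
		NFPROTO_IPV6   = 10,
		NFPROTO_DECNET = 12,
	};
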
@@ -162,5 +163,4 @@ int xfrm4_rcv(struct sk_buff *skb)
{
return xfrm4_rcv_spi(skb, ip_hdr(skb)->protocol, 0);
}
-
EXPORT_SYMBOL(xfrm4_rcv);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 3444f3b34ec..6f368413eb0 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -4,6 +4,7 @@
* Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
*/
+#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index c908bd99bcb..571aa96a175 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -86,7 +86,7 @@ static int xfrm4_output_finish(struct sk_buff *skb)
int xfrm4_output(struct sk_buff *skb)
{
- return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb,
+ return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb,
NULL, skb_dst(skb)->dev, xfrm4_output_finish,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index e4a1483fba7..869078d4eeb 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -37,7 +37,7 @@ static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
fl.fl4_src = saddr->a4;
err = __ip_route_output_key(net, &rt, &fl);
- dst = &rt->u.dst;
+ dst = &rt->dst;
if (err)
dst = ERR_PTR(err);
return dst;
@@ -59,27 +59,6 @@ static int xfrm4_get_saddr(struct net *net,
return 0;
}
-static struct dst_entry *
-__xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
-{
- struct dst_entry *dst;
-
- read_lock_bh(&policy->lock);
- for (dst = policy->bundles; dst; dst = dst->next) {
- struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
- if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/
- xdst->u.rt.fl.fl4_dst == fl->fl4_dst &&
- xdst->u.rt.fl.fl4_src == fl->fl4_src &&
- xdst->u.rt.fl.fl4_tos == fl->fl4_tos &&
- xfrm_bundle_ok(policy, xdst, fl, AF_INET, 0)) {
- dst_clone(dst);
- break;
- }
- }
- read_unlock_bh(&policy->lock);
- return dst;
-}
-
static int xfrm4_get_tos(struct flowi *fl)
{
return fl->fl4_tos;
@@ -129,6 +108,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
memset(fl, 0, sizeof(struct flowi));
+ fl->mark = skb->mark;
+
if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) {
switch (iph->protocol) {
case IPPROTO_UDP:
@@ -259,7 +240,6 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
.dst_ops = &xfrm4_dst_ops,
.dst_lookup = xfrm4_dst_lookup,
.get_saddr = xfrm4_get_saddr,
- .find_bundle = __xfrm4_find_bundle,
.decode_session = _decode_session4,
.get_tos = xfrm4_get_tos,
.init_path = xfrm4_init_path,