summaryrefslogtreecommitdiff
path: root/net/ipv4/fou.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-12-11 14:27:06 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2014-12-11 14:27:06 -0800
commit70e71ca0af244f48a5dcf56dc435243792e3a495 (patch)
treef7d9c4c4d9a857a00043e9bf6aa2d6f533a34778 /net/ipv4/fou.c
parentbae41e45b7400496b9bf0c70c6004419d9987819 (diff)
parent00c83b01d58068dfeb2e1351cca6fccf2a83fa8f (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: 1) New offloading infrastructure and example 'rocker' driver for offloading of switching and routing to hardware. This work was done by a large group of dedicated individuals, not limited to: Scott Feldman, Jiri Pirko, Thomas Graf, John Fastabend, Jamal Hadi Salim, Andy Gospodarek, Florian Fainelli, Roopa Prabhu 2) Start making the networking operate on IOV iterators instead of modifying iov objects in-situ during transfers. Thanks to Al Viro and Herbert Xu. 3) A set of new netlink interfaces for the TIPC stack, from Richard Alpe. 4) Remove unnecessary looping during ipv6 routing lookups, from Martin KaFai Lau. 5) Add PAUSE frame generation support to gianfar driver, from Matei Pavaluca. 6) Allow for larger reordering levels in TCP, which are easily achievable in the real world right now, from Eric Dumazet. 7) Add a variable of napi_schedule that doesn't need to disable cpu interrupts, from Eric Dumazet. 8) Use a doubly linked list to optimize neigh_parms_release(), from Nicolas Dichtel. 9) Various enhancements to the kernel BPF verifier, and allow eBPF programs to actually be attached to sockets. From Alexei Starovoitov. 10) Support TSO/LSO in sunvnet driver, from David L Stevens. 11) Allow controlling ECN usage via routing metrics, from Florian Westphal. 12) Remote checksum offload, from Tom Herbert. 13) Add split-header receive, BQL, and xmit_more support to amd-xgbe driver, from Thomas Lendacky. 14) Add MPLS support to openvswitch, from Simon Horman. 15) Support wildcard tunnel endpoints in ipv6 tunnels, from Steffen Klassert. 16) Do gro flushes on a per-device basis using a timer, from Eric Dumazet. This tries to resolve the conflicting goals between the desired handling of bulk vs. RPC-like traffic. 17) Allow userspace to ask for the CPU upon what a packet was received/steered, via SO_INCOMING_CPU. From Eric Dumazet. 18) Limit GSO packets to half the current congestion window, from Eric Dumazet. 19) Add a generic helper so that all drivers set their RSS keys in a consistent way, from Eric Dumazet. 20) Add xmit_more support to enic driver, from Govindarajulu Varadarajan. 21) Add VLAN packet scheduler action, from Jiri Pirko. 22) Support configurable RSS hash functions via ethtool, from Eyal Perry. * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1820 commits) Fix race condition between vxlan_sock_add and vxlan_sock_release net/macb: fix compilation warning for print_hex_dump() called with skb->mac_header net/mlx4: Add support for A0 steering net/mlx4: Refactor QUERY_PORT net/mlx4_core: Add explicit error message when rule doesn't meet configuration net/mlx4: Add A0 hybrid steering net/mlx4: Add mlx4_bitmap zone allocator net/mlx4: Add a check if there are too many reserved QPs net/mlx4: Change QP allocation scheme net/mlx4_core: Use tasklet for user-space CQ completion events net/mlx4_core: Mask out host side virtualization features for guests net/mlx4_en: Set csum level for encapsulated packets be2net: Export tunnel offloads only when a VxLAN tunnel is created gianfar: Fix dma check map error when DMA_API_DEBUG is enabled cxgb4/csiostor: Don't use MASTER_MUST for fw_hello call net: fec: only enable mdio interrupt before phy device link up net: fec: clear all interrupt events to support i.MX6SX net: fec: reset fep link status in suspend function net: sock: fix access via invalid file descriptor net: introduce helper macro for_each_cmsghdr ...
Diffstat (limited to 'net/ipv4/fou.c')
-rw-r--r--net/ipv4/fou.c423
1 files changed, 379 insertions, 44 deletions
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 606c520ffd5a..b986298a7ba3 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -38,21 +38,17 @@ static inline struct fou *fou_from_sock(struct sock *sk)
return sk->sk_user_data;
}
-static int fou_udp_encap_recv_deliver(struct sk_buff *skb,
- u8 protocol, size_t len)
+static void fou_recv_pull(struct sk_buff *skb, size_t len)
{
struct iphdr *iph = ip_hdr(skb);
/* Remove 'len' bytes from the packet (UDP header and
- * FOU header if present), modify the protocol to the one
- * we found, and then call rcv_encap.
+ * FOU header if present).
*/
iph->tot_len = htons(ntohs(iph->tot_len) - len);
__skb_pull(skb, len);
skb_postpull_rcsum(skb, udp_hdr(skb), len);
skb_reset_transport_header(skb);
-
- return -protocol;
}
static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
@@ -62,16 +58,56 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
if (!fou)
return 1;
- return fou_udp_encap_recv_deliver(skb, fou->protocol,
- sizeof(struct udphdr));
+ fou_recv_pull(skb, sizeof(struct udphdr));
+
+ return -fou->protocol;
+}
+
+static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr,
+ void *data, size_t hdrlen, u8 ipproto)
+{
+ __be16 *pd = data;
+ size_t start = ntohs(pd[0]);
+ size_t offset = ntohs(pd[1]);
+ size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
+ __wsum delta;
+
+ if (skb->remcsum_offload) {
+ /* Already processed in GRO path */
+ skb->remcsum_offload = 0;
+ return guehdr;
+ }
+
+ if (!pskb_may_pull(skb, plen))
+ return NULL;
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+ if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE))
+ __skb_checksum_complete(skb);
+
+ delta = remcsum_adjust((void *)guehdr + hdrlen,
+ skb->csum, start, offset);
+
+ /* Adjust skb->csum since we changed the packet */
+ skb->csum = csum_add(skb->csum, delta);
+
+ return guehdr;
+}
+
+static int gue_control_message(struct sk_buff *skb, struct guehdr *guehdr)
+{
+ /* No support yet */
+ kfree_skb(skb);
+ return 0;
}
static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
{
struct fou *fou = fou_from_sock(sk);
- size_t len;
+ size_t len, optlen, hdrlen;
struct guehdr *guehdr;
- struct udphdr *uh;
+ void *data;
+ u16 doffset = 0;
if (!fou)
return 1;
@@ -80,25 +116,58 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
if (!pskb_may_pull(skb, len))
goto drop;
- uh = udp_hdr(skb);
- guehdr = (struct guehdr *)&uh[1];
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+ optlen = guehdr->hlen << 2;
+ len += optlen;
- len += guehdr->hlen << 2;
if (!pskb_may_pull(skb, len))
goto drop;
- uh = udp_hdr(skb);
- guehdr = (struct guehdr *)&uh[1];
+ /* guehdr may change after pull */
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
- if (guehdr->version != 0)
- goto drop;
+ hdrlen = sizeof(struct guehdr) + optlen;
- if (guehdr->flags) {
- /* No support yet */
+ if (guehdr->version != 0 || validate_gue_flags(guehdr, optlen))
goto drop;
+
+ hdrlen = sizeof(struct guehdr) + optlen;
+
+ ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+
+ /* Pull csum through the guehdr now . This can be used if
+ * there is a remote checksum offload.
+ */
+ skb_postpull_rcsum(skb, udp_hdr(skb), len);
+
+ data = &guehdr[1];
+
+ if (guehdr->flags & GUE_FLAG_PRIV) {
+ __be32 flags = *(__be32 *)(data + doffset);
+
+ doffset += GUE_LEN_PRIV;
+
+ if (flags & GUE_PFLAG_REMCSUM) {
+ guehdr = gue_remcsum(skb, guehdr, data + doffset,
+ hdrlen, guehdr->proto_ctype);
+ if (!guehdr)
+ goto drop;
+
+ data = &guehdr[1];
+
+ doffset += GUE_PLEN_REMCSUM;
+ }
}
- return fou_udp_encap_recv_deliver(skb, guehdr->next_hdr, len);
+ if (unlikely(guehdr->control))
+ return gue_control_message(skb, guehdr);
+
+ __skb_pull(skb, sizeof(struct udphdr) + hdrlen);
+ skb_reset_transport_header(skb);
+
+ return -guehdr->proto_ctype;
+
drop:
kfree_skb(skb);
return 0;
@@ -149,6 +218,41 @@ out_unlock:
return err;
}
+static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
+ struct guehdr *guehdr, void *data,
+ size_t hdrlen, u8 ipproto)
+{
+ __be16 *pd = data;
+ size_t start = ntohs(pd[0]);
+ size_t offset = ntohs(pd[1]);
+ size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
+ __wsum delta;
+
+ if (skb->remcsum_offload)
+ return guehdr;
+
+ if (!NAPI_GRO_CB(skb)->csum_valid)
+ return NULL;
+
+ /* Pull checksum that will be written */
+ if (skb_gro_header_hard(skb, off + plen)) {
+ guehdr = skb_gro_header_slow(skb, off + plen, off);
+ if (!guehdr)
+ return NULL;
+ }
+
+ delta = remcsum_adjust((void *)guehdr + hdrlen,
+ NAPI_GRO_CB(skb)->csum, start, offset);
+
+ /* Adjust skb->csum since we changed the packet */
+ skb->csum = csum_add(skb->csum, delta);
+ NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta);
+
+ skb->remcsum_offload = 1;
+
+ return guehdr;
+}
+
static struct sk_buff **gue_gro_receive(struct sk_buff **head,
struct sk_buff *skb)
{
@@ -156,38 +260,64 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
const struct net_offload *ops;
struct sk_buff **pp = NULL;
struct sk_buff *p;
- u8 proto;
struct guehdr *guehdr;
- unsigned int hlen, guehlen;
- unsigned int off;
+ size_t len, optlen, hdrlen, off;
+ void *data;
+ u16 doffset = 0;
int flush = 1;
off = skb_gro_offset(skb);
- hlen = off + sizeof(*guehdr);
+ len = off + sizeof(*guehdr);
+
guehdr = skb_gro_header_fast(skb, off);
- if (skb_gro_header_hard(skb, hlen)) {
- guehdr = skb_gro_header_slow(skb, hlen, off);
+ if (skb_gro_header_hard(skb, len)) {
+ guehdr = skb_gro_header_slow(skb, len, off);
if (unlikely(!guehdr))
goto out;
}
- proto = guehdr->next_hdr;
+ optlen = guehdr->hlen << 2;
+ len += optlen;
- rcu_read_lock();
- offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
- ops = rcu_dereference(offloads[proto]);
- if (WARN_ON(!ops || !ops->callbacks.gro_receive))
- goto out_unlock;
+ if (skb_gro_header_hard(skb, len)) {
+ guehdr = skb_gro_header_slow(skb, len, off);
+ if (unlikely(!guehdr))
+ goto out;
+ }
- guehlen = sizeof(*guehdr) + (guehdr->hlen << 2);
+ if (unlikely(guehdr->control) || guehdr->version != 0 ||
+ validate_gue_flags(guehdr, optlen))
+ goto out;
- hlen = off + guehlen;
- if (skb_gro_header_hard(skb, hlen)) {
- guehdr = skb_gro_header_slow(skb, hlen, off);
- if (unlikely(!guehdr))
- goto out_unlock;
+ hdrlen = sizeof(*guehdr) + optlen;
+
+ /* Adjust NAPI_GRO_CB(skb)->csum to account for guehdr,
+ * this is needed if there is a remote checkcsum offload.
+ */
+ skb_gro_postpull_rcsum(skb, guehdr, hdrlen);
+
+ data = &guehdr[1];
+
+ if (guehdr->flags & GUE_FLAG_PRIV) {
+ __be32 flags = *(__be32 *)(data + doffset);
+
+ doffset += GUE_LEN_PRIV;
+
+ if (flags & GUE_PFLAG_REMCSUM) {
+ guehdr = gue_gro_remcsum(skb, off, guehdr,
+ data + doffset, hdrlen,
+ guehdr->proto_ctype);
+ if (!guehdr)
+ goto out;
+
+ data = &guehdr[1];
+
+ doffset += GUE_PLEN_REMCSUM;
+ }
}
+ skb_gro_pull(skb, hdrlen);
+
flush = 0;
for (p = *head; p; p = p->next) {
@@ -199,7 +329,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
guehdr2 = (struct guehdr *)(p->data + off);
/* Compare base GUE header to be equal (covers
- * hlen, version, next_hdr, and flags.
+ * hlen, version, proto_ctype, and flags.
*/
if (guehdr->word != guehdr2->word) {
NAPI_GRO_CB(p)->same_flow = 0;
@@ -214,10 +344,11 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
}
}
- skb_gro_pull(skb, guehlen);
-
- /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/
- skb_gro_postpull_rcsum(skb, guehdr, guehlen);
+ rcu_read_lock();
+ offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
+ ops = rcu_dereference(offloads[guehdr->proto_ctype]);
+ if (WARN_ON(!ops || !ops->callbacks.gro_receive))
+ goto out_unlock;
pp = ops->callbacks.gro_receive(head, skb);
@@ -238,7 +369,7 @@ static int gue_gro_complete(struct sk_buff *skb, int nhoff)
u8 proto;
int err = -ENOENT;
- proto = guehdr->next_hdr;
+ proto = guehdr->proto_ctype;
guehlen = sizeof(*guehdr) + (guehdr->hlen << 2);
@@ -489,6 +620,200 @@ static const struct genl_ops fou_nl_ops[] = {
},
};
+size_t fou_encap_hlen(struct ip_tunnel_encap *e)
+{
+ return sizeof(struct udphdr);
+}
+EXPORT_SYMBOL(fou_encap_hlen);
+
+size_t gue_encap_hlen(struct ip_tunnel_encap *e)
+{
+ size_t len;
+ bool need_priv = false;
+
+ len = sizeof(struct udphdr) + sizeof(struct guehdr);
+
+ if (e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) {
+ len += GUE_PLEN_REMCSUM;
+ need_priv = true;
+ }
+
+ len += need_priv ? GUE_LEN_PRIV : 0;
+
+ return len;
+}
+EXPORT_SYMBOL(gue_encap_hlen);
+
+static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ struct flowi4 *fl4, u8 *protocol, __be16 sport)
+{
+ struct udphdr *uh;
+
+ skb_push(skb, sizeof(struct udphdr));
+ skb_reset_transport_header(skb);
+
+ uh = udp_hdr(skb);
+
+ uh->dest = e->dport;
+ uh->source = sport;
+ uh->len = htons(skb->len);
+ uh->check = 0;
+ udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb,
+ fl4->saddr, fl4->daddr, skb->len);
+
+ *protocol = IPPROTO_UDP;
+}
+
+int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, struct flowi4 *fl4)
+{
+ bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM);
+ int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+ __be16 sport;
+
+ skb = iptunnel_handle_offloads(skb, csum, type);
+
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
+ skb, 0, 0, false);
+ fou_build_udp(skb, e, fl4, protocol, sport);
+
+ return 0;
+}
+EXPORT_SYMBOL(fou_build_header);
+
+int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, struct flowi4 *fl4)
+{
+ bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM);
+ int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+ struct guehdr *guehdr;
+ size_t hdrlen, optlen = 0;
+ __be16 sport;
+ void *data;
+ bool need_priv = false;
+
+ if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) &&
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ csum = false;
+ optlen += GUE_PLEN_REMCSUM;
+ type |= SKB_GSO_TUNNEL_REMCSUM;
+ need_priv = true;
+ }
+
+ optlen += need_priv ? GUE_LEN_PRIV : 0;
+
+ skb = iptunnel_handle_offloads(skb, csum, type);
+
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ /* Get source port (based on flow hash) before skb_push */
+ sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
+ skb, 0, 0, false);
+
+ hdrlen = sizeof(struct guehdr) + optlen;
+
+ skb_push(skb, hdrlen);
+
+ guehdr = (struct guehdr *)skb->data;
+
+ guehdr->control = 0;
+ guehdr->version = 0;
+ guehdr->hlen = optlen >> 2;
+ guehdr->flags = 0;
+ guehdr->proto_ctype = *protocol;
+
+ data = &guehdr[1];
+
+ if (need_priv) {
+ __be32 *flags = data;
+
+ guehdr->flags |= GUE_FLAG_PRIV;
+ *flags = 0;
+ data += GUE_LEN_PRIV;
+
+ if (type & SKB_GSO_TUNNEL_REMCSUM) {
+ u16 csum_start = skb_checksum_start_offset(skb);
+ __be16 *pd = data;
+
+ if (csum_start < hdrlen)
+ return -EINVAL;
+
+ csum_start -= hdrlen;
+ pd[0] = htons(csum_start);
+ pd[1] = htons(csum_start + skb->csum_offset);
+
+ if (!skb_is_gso(skb)) {
+ skb->ip_summed = CHECKSUM_NONE;
+ skb->encapsulation = 0;
+ }
+
+ *flags |= GUE_PFLAG_REMCSUM;
+ data += GUE_PLEN_REMCSUM;
+ }
+
+ }
+
+ fou_build_udp(skb, e, fl4, protocol, sport);
+
+ return 0;
+}
+EXPORT_SYMBOL(gue_build_header);
+
+#ifdef CONFIG_NET_FOU_IP_TUNNELS
+
+static const struct ip_tunnel_encap_ops __read_mostly fou_iptun_ops = {
+ .encap_hlen = fou_encap_hlen,
+ .build_header = fou_build_header,
+};
+
+static const struct ip_tunnel_encap_ops __read_mostly gue_iptun_ops = {
+ .encap_hlen = gue_encap_hlen,
+ .build_header = gue_build_header,
+};
+
+static int ip_tunnel_encap_add_fou_ops(void)
+{
+ int ret;
+
+ ret = ip_tunnel_encap_add_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
+ if (ret < 0) {
+ pr_err("can't add fou ops\n");
+ return ret;
+ }
+
+ ret = ip_tunnel_encap_add_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE);
+ if (ret < 0) {
+ pr_err("can't add gue ops\n");
+ ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void ip_tunnel_encap_del_fou_ops(void)
+{
+ ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
+ ip_tunnel_encap_del_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE);
+}
+
+#else
+
+static int ip_tunnel_encap_add_fou_ops(void)
+{
+ return 0;
+}
+
+static void ip_tunnel_encap_del_fou_ops(void)
+{
+}
+
+#endif
+
static int __init fou_init(void)
{
int ret;
@@ -496,6 +821,14 @@ static int __init fou_init(void)
ret = genl_register_family_with_ops(&fou_nl_family,
fou_nl_ops);
+ if (ret < 0)
+ goto exit;
+
+ ret = ip_tunnel_encap_add_fou_ops();
+ if (ret < 0)
+ genl_unregister_family(&fou_nl_family);
+
+exit:
return ret;
}
@@ -503,6 +836,8 @@ static void __exit fou_fini(void)
{
struct fou *fou, *next;
+ ip_tunnel_encap_del_fou_ops();
+
genl_unregister_family(&fou_nl_family);
/* Close all the FOU sockets */