From d035f19f59c5bca2fda2faa43b5e9fe09dfb7884 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 29 May 2019 13:25:36 +0200 Subject: netfilter: nf_conntrack: allow to register bridge support This patch adds infrastructure to register and to unregister bridge support for the conntrack module via nf_ct_bridge_register() and nf_ct_bridge_unregister(). Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_proto.c | 61 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 3 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 37bb530d848f..3813cb551df9 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -442,12 +443,14 @@ static int nf_ct_tcp_fixup(struct nf_conn *ct, void *_nfproto) return 0; } +static struct nf_ct_bridge_info *nf_ct_bridge_info; + static int nf_ct_netns_do_get(struct net *net, u8 nfproto) { struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); - bool fixup_needed = false; + bool fixup_needed = false, retry = true; int err = 0; - +retry: mutex_lock(&nf_ct_proto_mutex); switch (nfproto) { @@ -487,6 +490,32 @@ static int nf_ct_netns_do_get(struct net *net, u8 nfproto) fixup_needed = true; break; #endif + case NFPROTO_BRIDGE: + if (!nf_ct_bridge_info) { + if (!retry) { + err = -EPROTO; + goto out_unlock; + } + mutex_unlock(&nf_ct_proto_mutex); + request_module("nf_conntrack_bridge"); + retry = false; + goto retry; + } + if (!try_module_get(nf_ct_bridge_info->me)) { + err = -EPROTO; + goto out_unlock; + } + cnet->users_bridge++; + if (cnet->users_bridge > 1) + goto out_unlock; + + err = nf_register_net_hooks(net, nf_ct_bridge_info->ops, + nf_ct_bridge_info->ops_size); + if (err) + cnet->users_bridge = 0; + else + fixup_needed = true; + break; default: err = -EPROTO; break; @@ -519,8 +548,16 @@ static void nf_ct_netns_do_put(struct net *net, u8 nfproto) ARRAY_SIZE(ipv6_conntrack_ops)); break; #endif + case NFPROTO_BRIDGE: + if (!nf_ct_bridge_info) + break; + if (cnet->users_bridge && (--cnet->users_bridge == 0)) + nf_unregister_net_hooks(net, nf_ct_bridge_info->ops, + nf_ct_bridge_info->ops_size); + + module_put(nf_ct_bridge_info->me); + break; } - mutex_unlock(&nf_ct_proto_mutex); } @@ -560,6 +597,24 @@ void nf_ct_netns_put(struct net *net, uint8_t nfproto) } EXPORT_SYMBOL_GPL(nf_ct_netns_put); +void nf_ct_bridge_register(struct nf_ct_bridge_info *info) +{ + WARN_ON(nf_ct_bridge_info); + mutex_lock(&nf_ct_proto_mutex); + nf_ct_bridge_info = info; + mutex_unlock(&nf_ct_proto_mutex); +} +EXPORT_SYMBOL_GPL(nf_ct_bridge_register); + +void nf_ct_bridge_unregister(struct nf_ct_bridge_info *info) +{ + WARN_ON(!nf_ct_bridge_info); + mutex_lock(&nf_ct_proto_mutex); + nf_ct_bridge_info = NULL; + mutex_unlock(&nf_ct_proto_mutex); +} +EXPORT_SYMBOL_GPL(nf_ct_bridge_unregister); + int nf_conntrack_proto_init(void) { int ret; -- cgit v1.2.3 From 3c171f496ef57774f8e5d509923372549734877f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 29 May 2019 13:25:37 +0200 Subject: netfilter: bridge: add connection tracking system This patch adds basic connection tracking support for the bridge, including initial IPv4 support. This patch register two hooks to deal with the bridge forwarding path, one from the bridge prerouting hook to call nf_conntrack_in(); and another from the bridge postrouting hook to confirm the entry. The conntrack bridge prerouting hook defragments packets before passing them to nf_conntrack_in() to look up for an existing entry, otherwise a new entry is allocated and it is attached to the skbuff. The conntrack bridge postrouting hook confirms new conntrack entries, ie. if this is the first packet seen, then it adds the entry to the hashtable and (if needed) it refragments the skbuff into the original fragments, leaving the geometry as is if possible. Exceptions are linearized skbuffs, eg. skbuffs that are passed up to nfqueue and conntrack helpers, as well as cloned skbuff for the local delivery (eg. tcpdump), also in case of bridge port flooding (cloned skbuff too). The packet defragmentation is done through the ip_defrag() call. This forces us to save the bridge control buffer, reset the IP control buffer area and then restore it after call. This function also bumps the IP fragmentation statistics, it would be probably desiderable to have independent statistics for the bridge defragmentation/refragmentation. The maximum fragment length is stored in the control buffer and it is used to refragment the skbuff from the postrouting path. The new fraglist splitter and fragment transformer APIs are used to implement the bridge refragmentation code. The br_ip_fragment() function drops the packet in case the maximum fragment size seen is larger than the output port MTU. This patchset follows the principle that conntrack should not drop packets, so users can do it through policy via invalid state matching. Like br_netfilter, there is no refragmentation for packets that are passed up for local delivery, ie. prerouting -> input path. There are calls to nf_reset() already in several spots in the stack since time ago already, eg. af_packet, that show that skbuff fraglist handling from the netif_rx path is supported already. The helpers are called from the postrouting hook, before confirmation, from there we may see packet floods to bridge ports. Then, although unlikely, this may result in exercising the helpers many times for each clone. It would be good to explore how to pass all the packets in a list to the conntrack hook to do this handle only once for this case. Thanks to Florian Westphal for handing me over an initial patchset version to add support for conntrack bridge. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_conntrack_bridge.h | 7 + include/net/netfilter/nf_conntrack_core.h | 3 + net/bridge/br_device.c | 1 + net/bridge/br_private.h | 1 + net/bridge/netfilter/Kconfig | 14 ++ net/bridge/netfilter/Makefile | 3 + net/bridge/netfilter/nf_conntrack_bridge.c | 378 ++++++++++++++++++++++++++++ net/netfilter/nf_conntrack_proto.c | 7 +- 8 files changed, 410 insertions(+), 4 deletions(-) create mode 100644 net/bridge/netfilter/nf_conntrack_bridge.c (limited to 'net/netfilter') diff --git a/include/net/netfilter/nf_conntrack_bridge.h b/include/net/netfilter/nf_conntrack_bridge.h index 3be1642e04f7..9a5514d5bc51 100644 --- a/include/net/netfilter/nf_conntrack_bridge.h +++ b/include/net/netfilter/nf_conntrack_bridge.h @@ -10,4 +10,11 @@ struct nf_ct_bridge_info { void nf_ct_bridge_register(struct nf_ct_bridge_info *info); void nf_ct_bridge_unregister(struct nf_ct_bridge_info *info); +struct nf_ct_bridge_frag_data { + char mac[ETH_HLEN]; + bool vlan_present; + u16 vlan_tci; + __be16 vlan_proto; +}; + #endif diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index ae41e92251dd..de10faf2ce91 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -64,6 +64,9 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb) return ret; } +unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo); + void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l4proto *proto); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 013323b6dbe4..693aefad7f8a 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -56,6 +56,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) br_switchdev_frame_unmark(skb); BR_INPUT_SKB_CB(skb)->brdev = dev; + BR_INPUT_SKB_CB(skb)->frag_max_size = 0; skb_reset_mac_header(skb); eth = eth_hdr(skb); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 334a8c496b50..68561741e827 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -425,6 +425,7 @@ struct net_bridge { struct br_input_skb_cb { struct net_device *brdev; + u16 frag_max_size; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING u8 igmp; u8 mrouters_only:1; diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index c3ad90c43801..f4fb0b9b927d 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -19,6 +19,20 @@ config NF_LOG_BRIDGE tristate "Bridge packet logging" select NF_LOG_COMMON +config NF_CONNTRACK_BRIDGE + tristate "IPv4/IPV6 bridge connection tracking support" + depends on NF_CONNTRACK + default n + help + Connection tracking keeps a record of what packets have passed + through your machine, in order to figure out how they are related + into connections. This is used to enhance packet filtering via + stateful policies. Enable this if you want native tracking from + the bridge. This provides a replacement for the `br_netfilter' + infrastructure. + + To compile it as a module, choose M here. If unsure, say N. + endif # NF_TABLES_BRIDGE menuconfig BRIDGE_NF_EBTABLES diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile index 9b868861f21a..9d7767322a64 100644 --- a/net/bridge/netfilter/Makefile +++ b/net/bridge/netfilter/Makefile @@ -5,6 +5,9 @@ obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o +# connection tracking +obj-$(CONFIG_NF_CONNTRACK_BRIDGE) += nf_conntrack_bridge.o + # packet logging obj-$(CONFIG_NF_LOG_BRIDGE) += nf_log_bridge.o diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c new file mode 100644 index 000000000000..2571528ed582 --- /dev/null +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -0,0 +1,378 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include "../br_private.h" + +/* Best effort variant of ip_do_fragment which preserves geometry, unless skbuff + * has been linearized or cloned. + */ +static int nf_br_ip_fragment(struct net *net, struct sock *sk, + struct sk_buff *skb, + struct nf_ct_bridge_frag_data *data, + int (*output)(struct net *, struct sock *sk, + const struct nf_ct_bridge_frag_data *data, + struct sk_buff *)) +{ + int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; + unsigned int hlen, ll_rs, mtu; + struct ip_frag_state state; + struct iphdr *iph; + int err; + + /* for offloaded checksums cleanup checksum before fragmentation */ + if (skb->ip_summed == CHECKSUM_PARTIAL && + (err = skb_checksum_help(skb))) + goto blackhole; + + iph = ip_hdr(skb); + + /* + * Setup starting values + */ + + hlen = iph->ihl * 4; + frag_max_size -= hlen; + ll_rs = LL_RESERVED_SPACE(skb->dev); + mtu = skb->dev->mtu; + + if (skb_has_frag_list(skb)) { + unsigned int first_len = skb_pagelen(skb); + struct ip_fraglist_iter iter; + struct sk_buff *frag; + + if (first_len - hlen > mtu || + skb_headroom(skb) < ll_rs) + goto blackhole; + + if (skb_cloned(skb)) + goto slow_path; + + skb_walk_frags(skb, frag) { + if (frag->len > mtu || + skb_headroom(frag) < hlen + ll_rs) + goto blackhole; + + if (skb_shared(frag)) + goto slow_path; + } + + ip_fraglist_init(skb, iph, hlen, &iter); + + for (;;) { + if (iter.frag) + ip_fraglist_prepare(skb, &iter); + + err = output(net, sk, data, skb); + if (err || !iter.frag) + break; + + skb = ip_fraglist_next(&iter); + } + return err; + } +slow_path: + /* This is a linearized skbuff, the original geometry is lost for us. + * This may also be a clone skbuff, we could preserve the geometry for + * the copies but probably not worth the effort. + */ + ip_frag_init(skb, hlen, ll_rs, frag_max_size, &state); + + while (state.left > 0) { + struct sk_buff *skb2; + + skb2 = ip_frag_next(skb, &state); + if (IS_ERR(skb2)) { + err = PTR_ERR(skb2); + goto blackhole; + } + + err = output(net, sk, data, skb2); + if (err) + goto blackhole; + } + consume_skb(skb); + return err; + +blackhole: + kfree_skb(skb); + return 0; +} + +/* ip_defrag() expects IPCB() in place. */ +static void br_skb_cb_save(struct sk_buff *skb, struct br_input_skb_cb *cb, + size_t inet_skb_parm_size) +{ + memcpy(cb, skb->cb, sizeof(*cb)); + memset(skb->cb, 0, inet_skb_parm_size); +} + +static void br_skb_cb_restore(struct sk_buff *skb, + const struct br_input_skb_cb *cb, + u16 fragsz) +{ + memcpy(skb->cb, cb, sizeof(*cb)); + BR_INPUT_SKB_CB(skb)->frag_max_size = fragsz; +} + +static unsigned int nf_ct_br_defrag4(struct sk_buff *skb, + const struct nf_hook_state *state) +{ + u16 zone_id = NF_CT_DEFAULT_ZONE_ID; + enum ip_conntrack_info ctinfo; + struct br_input_skb_cb cb; + const struct nf_conn *ct; + int err; + + if (!ip_is_fragment(ip_hdr(skb))) + return NF_ACCEPT; + + ct = nf_ct_get(skb, &ctinfo); + if (ct) + zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo)); + + br_skb_cb_save(skb, &cb, sizeof(struct inet_skb_parm)); + local_bh_disable(); + err = ip_defrag(state->net, skb, + IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id); + local_bh_enable(); + if (!err) { + br_skb_cb_restore(skb, &cb, IPCB(skb)->frag_max_size); + skb->ignore_df = 1; + return NF_ACCEPT; + } + + return NF_STOLEN; +} + +static int nf_ct_br_ip_check(const struct sk_buff *skb) +{ + const struct iphdr *iph; + int nhoff, len; + + nhoff = skb_network_offset(skb); + iph = ip_hdr(skb); + if (iph->ihl < 5 || + iph->version != 4) + return -1; + + len = ntohs(iph->tot_len); + if (skb->len < nhoff + len || + len < (iph->ihl * 4)) + return -1; + + return 0; +} + +static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nf_hook_state bridge_state = *state; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + u32 len; + int ret; + + ct = nf_ct_get(skb, &ctinfo); + if ((ct && !nf_ct_is_template(ct)) || + ctinfo == IP_CT_UNTRACKED) + return NF_ACCEPT; + + switch (skb->protocol) { + case htons(ETH_P_IP): + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + return NF_ACCEPT; + + len = ntohs(ip_hdr(skb)->tot_len); + if (pskb_trim_rcsum(skb, len)) + return NF_ACCEPT; + + if (nf_ct_br_ip_check(skb)) + return NF_ACCEPT; + + bridge_state.pf = NFPROTO_IPV4; + ret = nf_ct_br_defrag4(skb, &bridge_state); + break; + case htons(ETH_P_IPV6): + /* fall through */ + default: + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); + return NF_ACCEPT; + } + + if (ret != NF_ACCEPT) + return ret; + + return nf_conntrack_in(skb, &bridge_state); +} + +static void nf_ct_bridge_frag_save(struct sk_buff *skb, + struct nf_ct_bridge_frag_data *data) +{ + if (skb_vlan_tag_present(skb)) { + data->vlan_present = true; + data->vlan_tci = skb->vlan_tci; + data->vlan_proto = skb->vlan_proto; + } else { + data->vlan_present = false; + } + skb_copy_from_linear_data_offset(skb, -ETH_HLEN, data->mac, ETH_HLEN); +} + +static unsigned int +nf_ct_bridge_refrag(struct sk_buff *skb, const struct nf_hook_state *state, + int (*output)(struct net *, struct sock *sk, + const struct nf_ct_bridge_frag_data *data, + struct sk_buff *)) +{ + struct nf_ct_bridge_frag_data data; + + if (!BR_INPUT_SKB_CB(skb)->frag_max_size) + return NF_ACCEPT; + + nf_ct_bridge_frag_save(skb, &data); + switch (skb->protocol) { + case htons(ETH_P_IP): + nf_br_ip_fragment(state->net, state->sk, skb, &data, output); + break; + case htons(ETH_P_IPV6): + return NF_ACCEPT; + default: + WARN_ON_ONCE(1); + return NF_DROP; + } + + return NF_STOLEN; +} + +/* Actually only slow path refragmentation needs this. */ +static int nf_ct_bridge_frag_restore(struct sk_buff *skb, + const struct nf_ct_bridge_frag_data *data) +{ + int err; + + err = skb_cow_head(skb, ETH_HLEN); + if (err) { + kfree_skb(skb); + return -ENOMEM; + } + if (data->vlan_present) + __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci); + + skb_copy_to_linear_data_offset(skb, -ETH_HLEN, data->mac, ETH_HLEN); + skb_reset_mac_header(skb); + + return 0; +} + +static int nf_ct_bridge_refrag_post(struct net *net, struct sock *sk, + const struct nf_ct_bridge_frag_data *data, + struct sk_buff *skb) +{ + int err; + + err = nf_ct_bridge_frag_restore(skb, data); + if (err < 0) + return err; + + return br_dev_queue_push_xmit(net, sk, skb); +} + +static unsigned int nf_ct_bridge_confirm(struct sk_buff *skb) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + int protoff; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + return nf_conntrack_confirm(skb); + + switch (skb->protocol) { + case htons(ETH_P_IP): + protoff = skb_network_offset(skb) + ip_hdrlen(skb); + break; + case htons(ETH_P_IPV6): { + unsigned char pnum = ipv6_hdr(skb)->nexthdr; + __be16 frag_off; + + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, + &frag_off); + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) + return nf_conntrack_confirm(skb); + } + break; + default: + return NF_ACCEPT; + } + return nf_confirm(skb, protoff, ct, ctinfo); +} + +static unsigned int nf_ct_bridge_post(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + int ret; + + ret = nf_ct_bridge_confirm(skb); + if (ret != NF_ACCEPT) + return ret; + + return nf_ct_bridge_refrag(skb, state, nf_ct_bridge_refrag_post); +} + +static struct nf_hook_ops nf_ct_bridge_hook_ops[] __read_mostly = { + { + .hook = nf_ct_bridge_pre, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK, + }, + { + .hook = nf_ct_bridge_post, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM, + }, +}; + +static struct nf_ct_bridge_info bridge_info = { + .ops = nf_ct_bridge_hook_ops, + .ops_size = ARRAY_SIZE(nf_ct_bridge_hook_ops), + .me = THIS_MODULE, +}; + +static int __init nf_conntrack_l3proto_bridge_init(void) +{ + nf_ct_bridge_register(&bridge_info); + + return 0; +} + +static void __exit nf_conntrack_l3proto_bridge_fini(void) +{ + nf_ct_bridge_unregister(&bridge_info); +} + +module_init(nf_conntrack_l3proto_bridge_init); +module_exit(nf_conntrack_l3proto_bridge_fini); + +MODULE_ALIAS("nf_conntrack-" __stringify(AF_BRIDGE)); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 3813cb551df9..7e2e8b8d6ebe 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -121,10 +121,8 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto) }; EXPORT_SYMBOL_GPL(nf_ct_l4proto_find); -static unsigned int nf_confirm(struct sk_buff *skb, - unsigned int protoff, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) +unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) { const struct nf_conn_help *help; @@ -155,6 +153,7 @@ static unsigned int nf_confirm(struct sk_buff *skb, /* We've seen it coming out the other side: confirm it */ return nf_conntrack_confirm(skb); } +EXPORT_SYMBOL_GPL(nf_confirm); static unsigned int ipv4_confirm(void *priv, struct sk_buff *skb, -- cgit v1.2.3 From af9573be674e6aed893ad649c76d55c39a2f6420 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 29 May 2019 13:25:39 +0200 Subject: netfilter: nf_conntrack_bridge: register inet conntrack for bridge This patch enables IPv4 and IPv6 conntrack from the bridge to deal with local traffic. Hence, packets that are passed up to the local input path are confirmed later on from the {ipv4,ipv6}_confirm() hooks. For packets leaving the IP stack (ie. output path), fragmentation occurs after the inet postrouting hook. Therefore, the bridge local out and postrouting bridge hooks see fragments with conntrack objects, which is inconsistent. In this case, we could defragment again from the bridge output hook, but this is expensive. The recommended filtering spot for outgoing locally generated traffic leaving through the bridge interface is to use the classic IPv4/IPv6 output hook, which comes earlier. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_proto.c | 58 +++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 16 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 7e2e8b8d6ebe..a0560d175a7f 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -560,38 +560,64 @@ static void nf_ct_netns_do_put(struct net *net, u8 nfproto) mutex_unlock(&nf_ct_proto_mutex); } -int nf_ct_netns_get(struct net *net, u8 nfproto) +static int nf_ct_netns_inet_get(struct net *net) { int err; - if (nfproto == NFPROTO_INET) { - err = nf_ct_netns_do_get(net, NFPROTO_IPV4); - if (err < 0) - goto err1; - err = nf_ct_netns_do_get(net, NFPROTO_IPV6); - if (err < 0) - goto err2; - } else { - err = nf_ct_netns_do_get(net, nfproto); - if (err < 0) - goto err1; - } - return 0; + err = nf_ct_netns_do_get(net, NFPROTO_IPV4); + if (err < 0) + goto err1; + err = nf_ct_netns_do_get(net, NFPROTO_IPV6); + if (err < 0) + goto err2; + return err; err2: nf_ct_netns_put(net, NFPROTO_IPV4); err1: return err; } + +int nf_ct_netns_get(struct net *net, u8 nfproto) +{ + int err; + + switch (nfproto) { + case NFPROTO_INET: + err = nf_ct_netns_inet_get(net); + break; + case NFPROTO_BRIDGE: + err = nf_ct_netns_do_get(net, NFPROTO_BRIDGE); + if (err < 0) + return err; + + err = nf_ct_netns_inet_get(net); + if (err < 0) { + nf_ct_netns_put(net, NFPROTO_BRIDGE); + return err; + } + break; + default: + err = nf_ct_netns_do_get(net, nfproto); + break; + } + return err; +} EXPORT_SYMBOL_GPL(nf_ct_netns_get); void nf_ct_netns_put(struct net *net, uint8_t nfproto) { - if (nfproto == NFPROTO_INET) { + switch (nfproto) { + case NFPROTO_BRIDGE: + nf_ct_netns_do_put(net, NFPROTO_BRIDGE); + /* fall through */ + case NFPROTO_INET: nf_ct_netns_do_put(net, NFPROTO_IPV4); nf_ct_netns_do_put(net, NFPROTO_IPV6); - } else { + break; + default: nf_ct_netns_do_put(net, nfproto); + break; } } EXPORT_SYMBOL_GPL(nf_ct_netns_put); -- cgit v1.2.3 From 1da40ab6caf924633116582c4c86939c486f20db Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 5 May 2019 15:14:38 +0300 Subject: ipvs: allow rs_table to contain different real server types Before now rs_table was used only for NAT real servers. Change it to allow TUN real severs from different types, possibly hashed with different port key. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 3 +++ net/netfilter/ipvs/ip_vs_ctl.c | 43 ++++++++++++++++++++++++++++++++++-------- 2 files changed, 38 insertions(+), 8 deletions(-) (limited to 'net/netfilter') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 2ac40135b576..9a8ac8997e34 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1497,6 +1497,9 @@ static inline int ip_vs_todrop(struct netns_ipvs *ipvs) static inline int ip_vs_todrop(struct netns_ipvs *ipvs) { return 0; } #endif +#define IP_VS_DFWD_METHOD(dest) (atomic_read(&(dest)->conn_flags) & \ + IP_VS_CONN_F_FWD_MASK) + /* ip_vs_fwd_tag returns the forwarding tag of the connection */ #define IP_VS_FWD_METHOD(cp) (cp->flags & IP_VS_CONN_F_FWD_MASK) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 0e887159425c..30b1a9f9c2e3 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -515,15 +515,36 @@ static inline unsigned int ip_vs_rs_hashkey(int af, static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) { unsigned int hash; + __be16 port; if (dest->in_rs_table) return; + switch (IP_VS_DFWD_METHOD(dest)) { + case IP_VS_CONN_F_MASQ: + port = dest->port; + break; + case IP_VS_CONN_F_TUNNEL: + switch (dest->tun_type) { + case IP_VS_CONN_F_TUNNEL_TYPE_GUE: + port = dest->tun_port; + break; + case IP_VS_CONN_F_TUNNEL_TYPE_IPIP: + port = 0; + break; + default: + return; + } + break; + default: + return; + } + /* * Hash by proto,addr,port, * which are the parameters of the real service. */ - hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); + hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port); hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]); dest->in_rs_table = 1; @@ -555,7 +576,8 @@ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, if (dest->port == dport && dest->af == af && ip_vs_addr_equal(af, &dest->addr, daddr) && - (dest->protocol == protocol || dest->vfwmark)) { + (dest->protocol == protocol || dest->vfwmark) && + IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { /* HIT */ return true; } @@ -585,7 +607,8 @@ struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, if (dest->port == dport && dest->af == af && ip_vs_addr_equal(af, &dest->addr, daddr) && - (dest->protocol == protocol || dest->vfwmark)) { + (dest->protocol == protocol || dest->vfwmark) && + IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { /* HIT */ return dest; } @@ -831,6 +854,13 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; conn_flags |= IP_VS_CONN_F_INACTIVE; + /* Need to rehash? */ + if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) != + IP_VS_DFWD_METHOD(dest) || + udest->tun_type != dest->tun_type || + udest->tun_port != dest->tun_port) + ip_vs_rs_unhash(dest); + /* set the tunnel info */ dest->tun_type = udest->tun_type; dest->tun_port = udest->tun_port; @@ -839,16 +869,13 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { conn_flags |= IP_VS_CONN_F_NOOUTPUT; } else { - /* - * Put the real service in rs_table if not present. - * For now only for NAT! - */ - ip_vs_rs_hash(ipvs, dest); /* FTP-NAT requires conntrack for mangling */ if (svc->port == FTPPORT) ip_vs_register_conntrack(svc); } atomic_set(&dest->conn_flags, conn_flags); + /* Put the real service in rs_table if not present. */ + ip_vs_rs_hash(ipvs, dest); /* bind the service */ old_svc = rcu_dereference_protected(dest->svc, 1); -- cgit v1.2.3 From 2aa3c9f48bc28ca0effd9877e010ad54c8a630e5 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 5 May 2019 15:14:39 +0300 Subject: ipvs: add function to find tunnels Add ip_vs_find_tunnel() to match tunnel headers by family, address and optional port. Use it to properly find the tunnel real server used in received ICMP errors. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 3 +++ net/netfilter/ipvs/ip_vs_core.c | 8 ++++++++ net/netfilter/ipvs/ip_vs_ctl.c | 29 +++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+) (limited to 'net/netfilter') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 9a8ac8997e34..b01a94ebfc0e 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1404,6 +1404,9 @@ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, struct ip_vs_dest * ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport); +struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af, + const union nf_inet_addr *daddr, + __be16 tun_port); int ip_vs_use_count_inc(void); void ip_vs_use_count_dec(void); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 14457551bcb4..4447ee512b88 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1598,6 +1598,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, struct ip_vs_proto_data *pd; unsigned int offset, offset2, ihl, verdict; bool ipip, new_cp = false; + union nf_inet_addr *raddr; *related = 1; @@ -1636,15 +1637,22 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ + raddr = (union nf_inet_addr *)&cih->daddr; /* Special case for errors for IPIP packets */ ipip = false; if (cih->protocol == IPPROTO_IPIP) { + struct ip_vs_dest *dest; + if (unlikely(cih->frag_off & htons(IP_OFFSET))) return NF_ACCEPT; /* Error for our IPIP must arrive at LOCAL_IN */ if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL)) return NF_ACCEPT; + dest = ip_vs_find_tunnel(ipvs, AF_INET, raddr, 0); + /* Only for known tunnel */ + if (!dest || dest->tun_type != IP_VS_CONN_F_TUNNEL_TYPE_IPIP) + return NF_ACCEPT; offset += cih->ihl * 4; cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); if (cih == NULL) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 30b1a9f9c2e3..d5847e06350f 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -617,6 +617,35 @@ struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, return NULL; } +/* Find real service record by . + * In case of multiple records with the same , only + * the first found record is returned. + * + * To be called under RCU lock. + */ +struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af, + const union nf_inet_addr *daddr, + __be16 tun_port) +{ + struct ip_vs_dest *dest; + unsigned int hash; + + /* Check for "full" addressed entries */ + hash = ip_vs_rs_hashkey(af, daddr, tun_port); + + hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { + if (dest->tun_port == tun_port && + dest->af == af && + ip_vs_addr_equal(af, &dest->addr, daddr) && + IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_TUNNEL) { + /* HIT */ + return dest; + } + } + + return NULL; +} + /* Lookup destination by {addr,port} in the given service * Called under RCU lock. */ -- cgit v1.2.3 From 508f744c0de38f517a94cc2d0bf8e118271b9645 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 5 May 2019 15:14:40 +0300 Subject: ipvs: strip udp tunnel headers from icmp errors Recognize UDP tunnels in received ICMP errors and properly strip the tunnel headers. GUE is what we have for now. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_core.c | 60 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'net/netfilter') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 4447ee512b88..d1d7b2483fd7 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -39,6 +39,7 @@ #include #include #include /* for icmp_send */ +#include #include #include #include /* net_generic() */ @@ -1579,6 +1580,41 @@ ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, return 1; } +/* Check the UDP tunnel and return its header length */ +static int ipvs_udp_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, + unsigned int offset, __u16 af, + const union nf_inet_addr *daddr, __u8 *proto) +{ + struct udphdr _udph, *udph; + struct ip_vs_dest *dest; + + udph = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); + if (!udph) + goto unk; + offset += sizeof(struct udphdr); + dest = ip_vs_find_tunnel(ipvs, af, daddr, udph->dest); + if (!dest) + goto unk; + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + struct guehdr _gueh, *gueh; + + gueh = skb_header_pointer(skb, offset, sizeof(_gueh), &_gueh); + if (!gueh) + goto unk; + if (gueh->control != 0 || gueh->version != 0) + goto unk; + /* Later we can support also IPPROTO_IPV6 */ + if (gueh->proto_ctype != IPPROTO_IPIP) + goto unk; + *proto = gueh->proto_ctype; + return sizeof(struct udphdr) + sizeof(struct guehdr) + + (gueh->hlen << 2); + } + +unk: + return 0; +} + /* * Handle ICMP messages in the outside-to-inside direction (incoming). * Find any that might be relevant, check against existing connections, @@ -1658,6 +1694,30 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ ipip = true; + } else if (cih->protocol == IPPROTO_UDP && /* Can be UDP encap */ + /* Error for our tunnel must arrive at LOCAL_IN */ + (skb_rtable(skb)->rt_flags & RTCF_LOCAL)) { + __u8 iproto; + int ulen; + + /* Non-first fragment has no UDP header */ + if (unlikely(cih->frag_off & htons(IP_OFFSET))) + return NF_ACCEPT; + offset2 = offset + cih->ihl * 4; + ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET, raddr, + &iproto); + if (ulen > 0) { + /* Skip IP and UDP tunnel headers */ + offset = offset2 + ulen; + /* Now we should be at the original IP header */ + cih = skb_header_pointer(skb, offset, sizeof(_ciph), + &_ciph); + if (cih && cih->version == 4 && cih->ihl >= 5 && + iproto == IPPROTO_IPIP) + ipip = true; + else + return NF_ACCEPT; + } } pd = ip_vs_proto_data_get(ipvs, cih->protocol); -- cgit v1.2.3 From ea6cc2fd8a2b89ab6dcd096ba6dbc1ecbdf26564 Mon Sep 17 00:00:00 2001 From: Lukasz Pawelczyk Date: Fri, 10 May 2019 13:46:22 +0200 Subject: netfilter: xt_owner: Add supplementary groups option The XT_OWNER_SUPPL_GROUPS flag causes GIDs specified with XT_OWNER_GID to be also checked in the supplementary groups of a process. f_cred->group_info cannot be modified during its lifetime and f_cred holds a reference to it so it's safe to use. Signed-off-by: Lukasz Pawelczyk Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_owner.h | 7 ++++--- net/netfilter/xt_owner.c | 23 ++++++++++++++++++++--- 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'net/netfilter') diff --git a/include/uapi/linux/netfilter/xt_owner.h b/include/uapi/linux/netfilter/xt_owner.h index fa3ad84957d5..9e98c09eda32 100644 --- a/include/uapi/linux/netfilter/xt_owner.h +++ b/include/uapi/linux/netfilter/xt_owner.h @@ -5,9 +5,10 @@ #include enum { - XT_OWNER_UID = 1 << 0, - XT_OWNER_GID = 1 << 1, - XT_OWNER_SOCKET = 1 << 2, + XT_OWNER_UID = 1 << 0, + XT_OWNER_GID = 1 << 1, + XT_OWNER_SOCKET = 1 << 2, + XT_OWNER_SUPPL_GROUPS = 1 << 3, }; struct xt_owner_match_info { diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 46686fb73784..a8784502aca6 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -91,11 +91,28 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) } if (info->match & XT_OWNER_GID) { + unsigned int i, match = false; kgid_t gid_min = make_kgid(net->user_ns, info->gid_min); kgid_t gid_max = make_kgid(net->user_ns, info->gid_max); - if ((gid_gte(filp->f_cred->fsgid, gid_min) && - gid_lte(filp->f_cred->fsgid, gid_max)) ^ - !(info->invert & XT_OWNER_GID)) + struct group_info *gi = filp->f_cred->group_info; + + if (gid_gte(filp->f_cred->fsgid, gid_min) && + gid_lte(filp->f_cred->fsgid, gid_max)) + match = true; + + if (!match && (info->match & XT_OWNER_SUPPL_GROUPS) && gi) { + for (i = 0; i < gi->ngroups; ++i) { + kgid_t group = gi->gid[i]; + + if (gid_gte(group, gid_min) && + gid_lte(group, gid_max)) { + match = true; + break; + } + } + } + + if (match ^ !(info->invert & XT_OWNER_GID)) return false; } -- cgit v1.2.3 From 5e2ad02e9001fd99cae3c14e52f67bb976e9bee3 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 16 May 2019 04:02:31 +0900 Subject: netfilter: nf_flow_table: remove unnecessary variable in flow_offload_tuple The oifidx in the struct flow_offload_tuple is not used anymore. Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 2 -- net/netfilter/nf_flow_table_core.c | 1 - 2 files changed, 3 deletions(-) (limited to 'net/netfilter') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 3e370cb36263..d8c187936bec 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -53,8 +53,6 @@ struct flow_offload_tuple { u8 l4proto; u8 dir; - int oifidx; - u16 mtu; struct dst_entry *dst_cache; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 948b4ebbe3fb..e3d797252a98 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -53,7 +53,6 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, ft->dst_port = ctt->dst.u.tcp.port; ft->iifidx = other_dst->dev->ifindex; - ft->oifidx = dst->dev->ifindex; ft->dst_cache = dst; } -- cgit v1.2.3 From 53315ac660b09f8128cd086e3ea0ed5ed5081d55 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 22 May 2019 23:35:11 +0200 Subject: netfilter: nf_tables: free base chain counters from worker No need to use synchronize_rcu() here, just swap the two pointers and have the release occur from work queue after commit has completed. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 28241e82fd15..2fed78b19abe 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1449,25 +1449,18 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) return newstats; } -static void nft_chain_stats_replace(struct net *net, - struct nft_base_chain *chain, - struct nft_stats __percpu *newstats) +static void nft_chain_stats_replace(struct nft_trans *trans) { - struct nft_stats __percpu *oldstats; + struct nft_base_chain *chain = nft_base_chain(trans->ctx.chain); - if (newstats == NULL) + if (!nft_trans_chain_stats(trans)) return; - if (rcu_access_pointer(chain->stats)) { - oldstats = rcu_dereference_protected(chain->stats, - lockdep_commit_lock_is_held(net)); - rcu_assign_pointer(chain->stats, newstats); - synchronize_rcu(); - free_percpu(oldstats); - } else { - rcu_assign_pointer(chain->stats, newstats); + rcu_swap_protected(chain->stats, nft_trans_chain_stats(trans), + lockdep_commit_lock_is_held(trans->ctx.net)); + + if (!nft_trans_chain_stats(trans)) static_branch_inc(&nft_counters_enabled); - } } static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) @@ -6360,9 +6353,9 @@ static void nft_chain_commit_update(struct nft_trans *trans) if (!nft_is_base_chain(trans->ctx.chain)) return; + nft_chain_stats_replace(trans); + basechain = nft_base_chain(trans->ctx.chain); - nft_chain_stats_replace(trans->ctx.net, basechain, - nft_trans_chain_stats(trans)); switch (nft_trans_chain_policy(trans)) { case NF_DROP: @@ -6379,6 +6372,7 @@ static void nft_commit_release(struct nft_trans *trans) nf_tables_table_destroy(&trans->ctx); break; case NFT_MSG_NEWCHAIN: + free_percpu(nft_trans_chain_stats(trans)); kfree(nft_trans_chain_name(trans)); break; case NFT_MSG_DELCHAIN: -- cgit v1.2.3 From ec0974df357f94385070c242ee0a280f4e3cc12d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 23 May 2019 15:44:06 +0200 Subject: netfilter: ipvs: prefer skb_ensure_writable It does the same thing, use it instead so we can remove skb_make_writable. Signed-off-by: Florian Westphal Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_app.c | 4 ++-- net/netfilter/ipvs/ip_vs_core.c | 4 ++-- net/netfilter/ipvs/ip_vs_ftp.c | 4 ++-- net/netfilter/ipvs/ip_vs_proto_sctp.c | 4 ++-- net/netfilter/ipvs/ip_vs_proto_tcp.c | 4 ++-- net/netfilter/ipvs/ip_vs_proto_udp.c | 4 ++-- net/netfilter/ipvs/ip_vs_xmit.c | 12 ++++++------ 7 files changed, 18 insertions(+), 18 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 7588aeaa605f..ba34ac25ee7b 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -363,7 +363,7 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, struct tcphdr *th; __u32 seq; - if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) + if (skb_ensure_writable(skb, tcp_offset + sizeof(*th))) return 0; th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); @@ -440,7 +440,7 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, struct tcphdr *th; __u32 seq; - if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) + if (skb_ensure_writable(skb, tcp_offset + sizeof(*th))) return 0; th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index d1d7b2483fd7..90adca9a5510 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -898,7 +898,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb, if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || IPPROTO_SCTP == protocol) offset += 2 * sizeof(__u16); - if (!skb_make_writable(skb, offset)) + if (skb_ensure_writable(skb, offset)) goto out; #ifdef CONFIG_IP_VS_IPV6 @@ -1288,7 +1288,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet"); - if (!skb_make_writable(skb, iph->len)) + if (skb_ensure_writable(skb, iph->len)) goto drop; /* mangle the packet */ diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index fe69d46ff779..5cbefa927f09 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -273,7 +273,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, return 1; /* Linear packets are much easier to deal with. */ - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return 0; if (cp->app_data == (void *) IP_VS_FTP_PASV) { @@ -439,7 +439,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, return 1; /* Linear packets are much easier to deal with. */ - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return 0; data = data_start = ip_vs_ftp_data_ptr(skb, ipvsh); diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index b58ddb7dffd1..a0921adc31a9 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -101,7 +101,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, #endif /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, sctphoff + sizeof(*sctph))) + if (skb_ensure_writable(skb, sctphoff + sizeof(*sctph))) return 0; if (unlikely(cp->app != NULL)) { @@ -148,7 +148,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, #endif /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, sctphoff + sizeof(*sctph))) + if (skb_ensure_writable(skb, sctphoff + sizeof(*sctph))) return 0; if (unlikely(cp->app != NULL)) { diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 00ce07dda980..089ee592a955 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -163,7 +163,7 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) + if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph))) return 0; if (unlikely(cp->app != NULL)) { @@ -241,7 +241,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) + if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph))) return 0; if (unlikely(cp->app != NULL)) { diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 92c078abcb3e..de366aa3c03b 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -153,7 +153,7 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, oldlen = skb->len - udphoff; /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, udphoff+sizeof(*udph))) + if (skb_ensure_writable(skb, udphoff + sizeof(*udph))) return 0; if (unlikely(cp->app != NULL)) { @@ -236,7 +236,7 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, oldlen = skb->len - udphoff; /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, udphoff+sizeof(*udph))) + if (skb_ensure_writable(skb, udphoff + sizeof(*udph))) return 0; if (unlikely(cp->app != NULL)) { diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 8d6f94b67772..0b41d0504429 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -279,7 +279,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, } /* don't propagate ttl change to cloned packets */ - if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) + if (skb_ensure_writable(skb, sizeof(struct ipv6hdr))) return false; ipv6_hdr(skb)->hop_limit--; @@ -294,7 +294,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, } /* don't propagate ttl change to cloned packets */ - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) return false; /* Decrease ttl */ @@ -796,7 +796,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, } /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) @@ -885,7 +885,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, } /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) + if (skb_ensure_writable(skb, sizeof(struct ipv6hdr))) goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) @@ -1404,7 +1404,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, } /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, offset)) + if (skb_ensure_writable(skb, offset)) goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) @@ -1493,7 +1493,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, } /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, offset)) + if (skb_ensure_writable(skb, offset)) goto tx_error; if (skb_cow(skb, rt->dst.dev->hard_header_len)) -- cgit v1.2.3 From 86f045385462597e96f5481198a6c60d18d109ca Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 23 May 2019 15:44:07 +0200 Subject: netfilter: conntrack, nat: prefer skb_ensure_writable like previous patches -- convert conntrack to use the core helper. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_sctp.c | 2 +- net/netfilter/nf_conntrack_seqadj.c | 4 ++-- net/netfilter/nf_nat_helper.c | 4 ++-- net/netfilter/nf_nat_proto.c | 24 ++++++++++++------------ 4 files changed, 17 insertions(+), 17 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 5b8dde266412..07c5208a4ea0 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -339,7 +339,7 @@ static bool sctp_error(struct sk_buff *skb, if (state->hook == NF_INET_PRE_ROUTING && state->net->ct.sysctl_checksum && skb->ip_summed == CHECKSUM_NONE) { - if (!skb_make_writable(skb, dataoff + sizeof(struct sctphdr))) { + if (skb_ensure_writable(skb, dataoff + sizeof(*sh))) { logmsg = "nf_ct_sctp: failed to read header "; goto out_invalid; } diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c index dc21a43cd145..3066449f8bd8 100644 --- a/net/netfilter/nf_conntrack_seqadj.c +++ b/net/netfilter/nf_conntrack_seqadj.c @@ -126,7 +126,7 @@ static unsigned int nf_ct_sack_adjust(struct sk_buff *skb, optoff = protoff + sizeof(struct tcphdr); optend = protoff + tcph->doff * 4; - if (!skb_make_writable(skb, optend)) + if (skb_ensure_writable(skb, optend)) return 0; tcph = (void *)skb->data + protoff; @@ -176,7 +176,7 @@ int nf_ct_seq_adjust(struct sk_buff *skb, this_way = &seqadj->seq[dir]; other_way = &seqadj->seq[!dir]; - if (!skb_make_writable(skb, protoff + sizeof(*tcph))) + if (skb_ensure_writable(skb, protoff + sizeof(*tcph))) return 0; tcph = (void *)skb->data + protoff; diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index ccc06f7539d7..03e8e2d79375 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -98,7 +98,7 @@ bool __nf_nat_mangle_tcp_packet(struct sk_buff *skb, struct tcphdr *tcph; int oldlen, datalen; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return false; if (rep_len > match_len && @@ -148,7 +148,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb, struct udphdr *udph; int datalen, oldlen; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return false; if (rep_len > match_len && diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index 84f5c90a7f21..04a6c1ac2526 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -73,7 +73,7 @@ static bool udp_manip_pkt(struct sk_buff *skb, struct udphdr *hdr; bool do_csum; - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct udphdr *)(skb->data + hdroff); @@ -91,7 +91,7 @@ static bool udplite_manip_pkt(struct sk_buff *skb, #ifdef CONFIG_NF_CT_PROTO_UDPLITE struct udphdr *hdr; - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct udphdr *)(skb->data + hdroff); @@ -117,7 +117,7 @@ sctp_manip_pkt(struct sk_buff *skb, if (skb->len >= hdroff + sizeof(*hdr)) hdrsize = sizeof(*hdr); - if (!skb_make_writable(skb, hdroff + hdrsize)) + if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct sctphdr *)(skb->data + hdroff); @@ -158,7 +158,7 @@ tcp_manip_pkt(struct sk_buff *skb, if (skb->len >= hdroff + sizeof(struct tcphdr)) hdrsize = sizeof(struct tcphdr); - if (!skb_make_writable(skb, hdroff + hdrsize)) + if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct tcphdr *)(skb->data + hdroff); @@ -198,7 +198,7 @@ dccp_manip_pkt(struct sk_buff *skb, if (skb->len >= hdroff + sizeof(struct dccp_hdr)) hdrsize = sizeof(struct dccp_hdr); - if (!skb_make_writable(skb, hdroff + hdrsize)) + if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct dccp_hdr *)(skb->data + hdroff); @@ -232,7 +232,7 @@ icmp_manip_pkt(struct sk_buff *skb, { struct icmphdr *hdr; - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct icmphdr *)(skb->data + hdroff); @@ -250,7 +250,7 @@ icmpv6_manip_pkt(struct sk_buff *skb, { struct icmp6hdr *hdr; - if (!skb_make_writable(skb, hdroff + sizeof(*hdr))) + if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct icmp6hdr *)(skb->data + hdroff); @@ -278,7 +278,7 @@ gre_manip_pkt(struct sk_buff *skb, /* pgreh includes two optional 32bit fields which are not required * to be there. That's where the magic '8' comes from */ - if (!skb_make_writable(skb, hdroff + sizeof(*pgreh) - 8)) + if (skb_ensure_writable(skb, hdroff + sizeof(*pgreh) - 8)) return false; greh = (void *)skb->data + hdroff; @@ -350,7 +350,7 @@ static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb, struct iphdr *iph; unsigned int hdroff; - if (!skb_make_writable(skb, iphdroff + sizeof(*iph))) + if (skb_ensure_writable(skb, iphdroff + sizeof(*iph))) return false; iph = (void *)skb->data + iphdroff; @@ -381,7 +381,7 @@ static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb, int hdroff; u8 nexthdr; - if (!skb_make_writable(skb, iphdroff + sizeof(*ipv6h))) + if (skb_ensure_writable(skb, iphdroff + sizeof(*ipv6h))) return false; ipv6h = (void *)skb->data + iphdroff; @@ -565,7 +565,7 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); - if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) + if (skb_ensure_writable(skb, hdrlen + sizeof(*inside))) return 0; if (nf_ip_checksum(skb, hooknum, hdrlen, 0)) return 0; @@ -787,7 +787,7 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); - if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) + if (skb_ensure_writable(skb, hdrlen + sizeof(*inside))) return 0; if (nf_ip6_checksum(skb, hooknum, hdrlen, IPPROTO_ICMPV6)) return 0; -- cgit v1.2.3 From 3862c6a91a431337ead5685d647b83f5a82f7705 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 23 May 2019 15:44:08 +0200 Subject: netfilter: ipv4: prefer skb_ensure_writable .. so skb_make_writable can be removed soon. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arpt_mangle.c | 2 +- net/ipv4/netfilter/ipt_ECN.c | 4 ++-- net/ipv4/netfilter/nf_nat_h323.c | 2 +- net/ipv4/netfilter/nf_nat_snmp_basic_main.c | 2 +- net/netfilter/nf_nat_sip.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net/netfilter') diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c index 87ca2c42359b..a4e07e5e9c11 100644 --- a/net/ipv4/netfilter/arpt_mangle.c +++ b/net/ipv4/netfilter/arpt_mangle.c @@ -17,7 +17,7 @@ target(struct sk_buff *skb, const struct xt_action_param *par) unsigned char *arpptr; int pln, hln; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return NF_DROP; arp = arp_hdr(skb); diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index aaaf9a81fbc9..9f6751893660 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -32,7 +32,7 @@ set_ect_ip(struct sk_buff *skb, const struct ipt_ECN_info *einfo) if ((iph->tos & IPT_ECN_IP_MASK) != (einfo->ip_ect & IPT_ECN_IP_MASK)) { __u8 oldtos; - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) return false; iph = ip_hdr(skb); oldtos = iph->tos; @@ -61,7 +61,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo) tcph->cwr == einfo->proto.tcp.cwr)) return true; - if (!skb_make_writable(skb, ip_hdrlen(skb) + sizeof(*tcph))) + if (skb_ensure_writable(skb, ip_hdrlen(skb) + sizeof(*tcph))) return false; tcph = (void *)ip_hdr(skb) + ip_hdrlen(skb); diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 7875c98072eb..15f2b2604890 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -59,7 +59,7 @@ static int set_addr(struct sk_buff *skb, unsigned int protoff, net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n"); return -1; } - /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy + /* nf_nat_mangle_udp_packet uses skb_ensure_writable() to copy * or pull everything in a linear buffer, so we can safely * use the skb pointers now */ *data = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr); diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c index 657d2dcec3cc..717b726504fe 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c @@ -186,7 +186,7 @@ static int help(struct sk_buff *skb, unsigned int protoff, return NF_DROP; } - if (!skb_make_writable(skb, skb->len)) { + if (skb_ensure_writable(skb, skb->len)) { nf_ct_helper_log(skb, ct, "cannot mangle packet"); return NF_DROP; } diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c index 464387b3600f..07805bf4d62a 100644 --- a/net/netfilter/nf_nat_sip.c +++ b/net/netfilter/nf_nat_sip.c @@ -285,7 +285,7 @@ next: if (dir == IP_CT_DIR_REPLY && ct_sip_info->forced_dport) { struct udphdr *uh; - if (!skb_make_writable(skb, skb->len)) { + if (skb_ensure_writable(skb, skb->len)) { nf_ct_helper_log(skb, ct, "cannot mangle packet"); return NF_DROP; } -- cgit v1.2.3 From 7418ee4c8810e4ad74fb05a8b8d4cf406738ac81 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 23 May 2019 15:44:09 +0200 Subject: netfilter: nf_tables: prefer skb_ensure_writable .. so skb_make_writable can be removed. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_exthdr.c | 3 ++- net/netfilter/nft_payload.c | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index a940c9fd9045..45c8a6c07783 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -156,7 +156,8 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, if (i + optl > tcphdr_len || priv->len + priv->offset > optl) return; - if (!skb_make_writable(pkt->skb, pkt->xt.thoff + i + priv->len)) + if (skb_ensure_writable(pkt->skb, + pkt->xt.thoff + i + priv->len)) return; tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 54e15de4b79a..1465b7d6d2b0 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -243,7 +243,7 @@ static int nft_payload_l4csum_update(const struct nft_pktinfo *pkt, tsum)); } - if (!skb_make_writable(skb, l4csum_offset + sizeof(sum)) || + if (skb_ensure_writable(skb, l4csum_offset + sizeof(sum)) || skb_store_bits(skb, l4csum_offset, &sum, sizeof(sum)) < 0) return -1; @@ -259,7 +259,7 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src, return -1; nft_csum_replace(&sum, fsum, tsum); - if (!skb_make_writable(skb, csum_offset + sizeof(sum)) || + if (skb_ensure_writable(skb, csum_offset + sizeof(sum)) || skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) return -1; @@ -312,7 +312,7 @@ static void nft_payload_set_eval(const struct nft_expr *expr, goto err; } - if (!skb_make_writable(skb, max(offset + priv->len, 0)) || + if (skb_ensure_writable(skb, max(offset + priv->len, 0)) || skb_store_bits(skb, offset, src, priv->len) < 0) goto err; -- cgit v1.2.3 From 8e03707f118cd2f8dc993b01dd15e78da147a29e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 23 May 2019 15:44:10 +0200 Subject: netfilter: xt_HL: prefer skb_ensure_writable Also, make the argument to be only the needed size of the header we're altering, no need to pull in the full packet into linear area. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_HL.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/xt_HL.c b/net/netfilter/xt_HL.c index 4653b071bed4..a37b8824221f 100644 --- a/net/netfilter/xt_HL.c +++ b/net/netfilter/xt_HL.c @@ -32,7 +32,7 @@ ttl_tg(struct sk_buff *skb, const struct xt_action_param *par) const struct ipt_TTL_info *info = par->targinfo; int new_ttl; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, sizeof(*iph))) return NF_DROP; iph = ip_hdr(skb); @@ -72,7 +72,7 @@ hl_tg6(struct sk_buff *skb, const struct xt_action_param *par) const struct ip6t_HL_info *info = par->targinfo; int new_hl; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, sizeof(*ip6h))) return NF_DROP; ip6h = ipv6_hdr(skb); -- cgit v1.2.3 From fb2eb1c131f8526905a83bd98fd2d4e7c9f950a0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 23 May 2019 15:44:11 +0200 Subject: netfilter: tcpmss, optstrip: prefer skb_ensure_writable This also changes optstrip to only make the tcp header writeable rather than the entire packet. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_TCPMSS.c | 2 +- net/netfilter/xt_TCPOPTSTRIP.c | 28 +++++++++++++--------------- 2 files changed, 14 insertions(+), 16 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 98efb202f8b4..3e24443ab81c 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -89,7 +89,7 @@ tcpmss_mangle_packet(struct sk_buff *skb, if (par->fragoff != 0) return 0; - if (!skb_make_writable(skb, skb->len)) + if (skb_ensure_writable(skb, skb->len)) return -1; len = skb->len - tcphoff; diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c index eb92bffff11c..5a274813076a 100644 --- a/net/netfilter/xt_TCPOPTSTRIP.c +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -31,33 +31,33 @@ static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset) static unsigned int tcpoptstrip_mangle_packet(struct sk_buff *skb, const struct xt_action_param *par, - unsigned int tcphoff, unsigned int minlen) + unsigned int tcphoff) { const struct xt_tcpoptstrip_target_info *info = par->targinfo; + struct tcphdr *tcph, _th; unsigned int optl, i, j; - struct tcphdr *tcph; u_int16_t n, o; u_int8_t *opt; - int len, tcp_hdrlen; + int tcp_hdrlen; /* This is a fragment, no TCP header is available */ if (par->fragoff != 0) return XT_CONTINUE; - if (!skb_make_writable(skb, skb->len)) + tcph = skb_header_pointer(skb, tcphoff, sizeof(_th), &_th); + if (!tcph) return NF_DROP; - len = skb->len - tcphoff; - if (len < (int)sizeof(struct tcphdr)) - return NF_DROP; - - tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); tcp_hdrlen = tcph->doff * 4; + if (tcp_hdrlen < sizeof(struct tcphdr)) + return NF_DROP; - if (len < tcp_hdrlen) + if (skb_ensure_writable(skb, tcphoff + tcp_hdrlen)) return NF_DROP; - opt = (u_int8_t *)tcph; + /* must reload tcph, might have been moved */ + tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); + opt = (u8 *)tcph; /* * Walk through all TCP options - if we find some option to remove, @@ -91,8 +91,7 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb, static unsigned int tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par) { - return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb), - sizeof(struct iphdr) + sizeof(struct tcphdr)); + return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb)); } #if IS_ENABLED(CONFIG_IP6_NF_MANGLE) @@ -109,8 +108,7 @@ tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par) if (tcphoff < 0) return NF_DROP; - return tcpoptstrip_mangle_packet(skb, par, tcphoff, - sizeof(*ipv6h) + sizeof(struct tcphdr)); + return tcpoptstrip_mangle_packet(skb, par, tcphoff); } #endif -- cgit v1.2.3 From 2cf6bffc49dae26edd12af6b57c8c780590380bf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 23 May 2019 15:44:12 +0200 Subject: netfilter: replace skb_make_writable with skb_ensure_writable This converts all remaining users and then removes skb_make_writable. Suggested-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 5 ----- net/netfilter/core.c | 22 ---------------------- net/netfilter/nf_synproxy_core.c | 2 +- net/netfilter/nfnetlink_queue.c | 2 +- net/netfilter/xt_DSCP.c | 8 ++++---- 5 files changed, 6 insertions(+), 33 deletions(-) (limited to 'net/netfilter') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 996bc247ef6e..049aeb40fa35 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -336,11 +336,6 @@ int compat_nf_getsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, int *len); #endif -/* Call this before modifying an existing packet: ensures it is - modifiable and linear to the point you care about (writable_len). - Returns true or false. */ -int skb_make_writable(struct sk_buff *skb, unsigned int writable_len); - struct flowi; struct nf_queue_entry; diff --git a/net/netfilter/core.c b/net/netfilter/core.c index b96fd3f54705..817a9e5d16e4 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -536,28 +536,6 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, } EXPORT_SYMBOL(nf_hook_slow); - -int skb_make_writable(struct sk_buff *skb, unsigned int writable_len) -{ - if (writable_len > skb->len) - return 0; - - /* Not exclusive use of packet? Must copy. */ - if (!skb_cloned(skb)) { - if (writable_len <= skb_headlen(skb)) - return 1; - } else if (skb_clone_writable(skb, writable_len)) - return 1; - - if (writable_len <= skb_headlen(skb)) - writable_len = 0; - else - writable_len -= skb_headlen(skb); - - return !!__pskb_pull_tail(skb, writable_len); -} -EXPORT_SYMBOL(skb_make_writable); - /* This needs to be compiled in any case to avoid dependencies between the * nfnetlink_queue code and nf_conntrack. */ diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 8ff4d22f10b2..3d58a9e93e5a 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -196,7 +196,7 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, optoff = protoff + sizeof(struct tcphdr); optend = protoff + th->doff * 4; - if (!skb_make_writable(skb, optend)) + if (skb_ensure_writable(skb, optend)) return 0; while (optoff < optend) { diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 27dac47b29c2..831f57008d78 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -863,7 +863,7 @@ nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff) } skb_put(e->skb, diff); } - if (!skb_make_writable(e->skb, data_len)) + if (skb_ensure_writable(e->skb, data_len)) return -ENOMEM; skb_copy_to_linear_data(e->skb, data, data_len); e->skb->ip_summed = CHECKSUM_NONE; diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c index 098ed851b7a7..30d554d6c213 100644 --- a/net/netfilter/xt_DSCP.c +++ b/net/netfilter/xt_DSCP.c @@ -34,7 +34,7 @@ dscp_tg(struct sk_buff *skb, const struct xt_action_param *par) u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; ipv4_change_dsfield(ip_hdr(skb), @@ -52,7 +52,7 @@ dscp_tg6(struct sk_buff *skb, const struct xt_action_param *par) u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { - if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) + if (skb_ensure_writable(skb, sizeof(struct ipv6hdr))) return NF_DROP; ipv6_change_dsfield(ipv6_hdr(skb), @@ -82,7 +82,7 @@ tos_tg(struct sk_buff *skb, const struct xt_action_param *par) nv = (orig & ~info->tos_mask) ^ info->tos_value; if (orig != nv) { - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; iph = ip_hdr(skb); ipv4_change_dsfield(iph, 0, nv); @@ -102,7 +102,7 @@ tos_tg6(struct sk_buff *skb, const struct xt_action_param *par) nv = (orig & ~info->tos_mask) ^ info->tos_value; if (orig != nv) { - if (!skb_make_writable(skb, sizeof(struct iphdr))) + if (skb_ensure_writable(skb, sizeof(struct iphdr))) return NF_DROP; iph = ipv6_hdr(skb); ipv6_change_dsfield(iph, 0, nv); -- cgit v1.2.3 From 29930e314da3833437a2ddc7b17f6a954f38d8fb Mon Sep 17 00:00:00 2001 From: Jacky Hu Date: Thu, 30 May 2019 08:16:40 +0800 Subject: ipvs: add checksum support for gue encapsulation Add checksum support for gue encapsulation with the tun_flags parameter, which could be one of the values below: IP_VS_TUNNEL_ENCAP_FLAG_NOCSUM IP_VS_TUNNEL_ENCAP_FLAG_CSUM IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM Signed-off-by: Jacky Hu Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 2 + include/uapi/linux/ip_vs.h | 7 ++ net/netfilter/ipvs/ip_vs_ctl.c | 11 +++- net/netfilter/ipvs/ip_vs_xmit.c | 143 +++++++++++++++++++++++++++++++++++----- 4 files changed, 146 insertions(+), 17 deletions(-) (limited to 'net/netfilter') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index b01a94ebfc0e..cb1ad0cc5c7b 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -603,6 +603,7 @@ struct ip_vs_dest_user_kern { u16 tun_type; /* tunnel type */ __be16 tun_port; /* tunnel port */ + u16 tun_flags; /* tunnel flags */ }; @@ -665,6 +666,7 @@ struct ip_vs_dest { atomic_t last_weight; /* server latest weight */ __u16 tun_type; /* tunnel type */ __be16 tun_port; /* tunnel port */ + __u16 tun_flags; /* tunnel flags */ refcount_t refcnt; /* reference counter */ struct ip_vs_stats stats; /* statistics */ diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h index e34f436fc79d..e4f18061a4fd 100644 --- a/include/uapi/linux/ip_vs.h +++ b/include/uapi/linux/ip_vs.h @@ -131,6 +131,11 @@ enum { IP_VS_CONN_F_TUNNEL_TYPE_MAX, }; +/* Tunnel encapsulation flags */ +#define IP_VS_TUNNEL_ENCAP_FLAG_NOCSUM (0) +#define IP_VS_TUNNEL_ENCAP_FLAG_CSUM (1 << 0) +#define IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM (1 << 1) + /* * The struct ip_vs_service_user and struct ip_vs_dest_user are * used to set IPVS rules through setsockopt. @@ -403,6 +408,8 @@ enum { IPVS_DEST_ATTR_TUN_PORT, /* tunnel port */ + IPVS_DEST_ATTR_TUN_FLAGS, /* tunnel flags */ + __IPVS_DEST_ATTR_MAX, }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index d5847e06350f..ad19ac08622f 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -893,6 +893,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, /* set the tunnel info */ dest->tun_type = udest->tun_type; dest->tun_port = udest->tun_port; + dest->tun_flags = udest->tun_flags; /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { @@ -2967,6 +2968,7 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 }, [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 }, [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 }, + [IPVS_DEST_ATTR_TUN_FLAGS] = { .type = NLA_U16 }, }; static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, @@ -3273,6 +3275,8 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) dest->tun_type) || nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT, dest->tun_port) || + nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS, + dest->tun_flags) || nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, @@ -3393,7 +3397,8 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, /* If a full entry was requested, check for the additional fields */ if (full_entry) { struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, - *nla_l_thresh, *nla_tun_type, *nla_tun_port; + *nla_l_thresh, *nla_tun_type, *nla_tun_port, + *nla_tun_flags; nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; @@ -3401,6 +3406,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE]; nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT]; + nla_tun_flags = attrs[IPVS_DEST_ATTR_TUN_FLAGS]; if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) return -EINVAL; @@ -3416,6 +3422,9 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, if (nla_tun_port) udest->tun_port = nla_get_be16(nla_tun_port); + + if (nla_tun_flags) + udest->tun_flags = nla_get_u16(nla_tun_flags); } return 0; diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 0b41d0504429..af3379d5e5bc 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -385,8 +386,13 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (!dest) goto err_put; - if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); + if ((dest->tun_flags & + IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) + mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } if (mtu < 68) { IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); goto err_put; @@ -540,8 +546,13 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); if (!dest) goto err_put; - if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); + if ((dest->tun_flags & + IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) + mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } if (mtu < IPV6_MIN_MTU) { IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, IPV6_MIN_MTU); @@ -1006,17 +1017,56 @@ ipvs_gue_encap(struct net *net, struct sk_buff *skb, __be16 sport = udp_flow_src_port(net, skb, 0, 0, false); struct udphdr *udph; /* Our new UDP header */ struct guehdr *gueh; /* Our new GUE header */ + size_t hdrlen, optlen = 0; + void *data; + bool need_priv = false; + + if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + need_priv = true; + } - skb_push(skb, sizeof(struct guehdr)); + hdrlen = sizeof(struct guehdr) + optlen; + + skb_push(skb, hdrlen); gueh = (struct guehdr *)skb->data; gueh->control = 0; gueh->version = 0; - gueh->hlen = 0; + gueh->hlen = optlen >> 2; gueh->flags = 0; gueh->proto_ctype = *next_protocol; + data = &gueh[1]; + + if (need_priv) { + __be32 *flags = data; + u16 csum_start = skb_checksum_start_offset(skb); + __be16 *pd; + + gueh->flags |= GUE_FLAG_PRIV; + *flags = 0; + data += GUE_LEN_PRIV; + + if (csum_start < hdrlen) + return -EINVAL; + + csum_start -= hdrlen; + pd = data; + pd[0] = htons(csum_start); + pd[1] = htons(csum_start + skb->csum_offset); + + if (!skb_is_gso(skb)) { + skb->ip_summed = CHECKSUM_NONE; + skb->encapsulation = 0; + } + + *flags |= GUE_PFLAG_REMCSUM; + data += GUE_PLEN_REMCSUM; + } + skb_push(skb, sizeof(struct udphdr)); skb_reset_transport_header(skb); @@ -1070,6 +1120,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, unsigned int max_headroom; /* The extra header space needed */ int ret, local; int tun_type, gso_type; + int tun_flags; EnterFunction(10); @@ -1092,9 +1143,19 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); tun_type = cp->dest->tun_type; + tun_flags = cp->dest->tun_flags; - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + size_t gue_hdrlen, gue_optlen = 0; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } + gue_hdrlen = sizeof(struct guehdr) + gue_optlen; + + max_headroom += sizeof(struct udphdr) + gue_hdrlen; + } /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */ dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL; @@ -1105,8 +1166,17 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; gso_type = __tun_gso_type_mask(AF_INET, cp->af); - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - gso_type |= SKB_GSO_UDP_TUNNEL; + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; + else + gso_type |= SKB_GSO_UDP_TUNNEL; + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + gso_type |= SKB_GSO_TUNNEL_REMCSUM; + } + } if (iptunnel_handle_offloads(skb, gso_type)) goto tx_error; @@ -1115,8 +1185,19 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb_set_inner_ipproto(skb, next_protocol); - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - ipvs_gue_encap(net, skb, cp, &next_protocol); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + bool check = false; + + if (ipvs_gue_encap(net, skb, cp, &next_protocol)) + goto tx_error; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + check = true; + + udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len); + } + skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); @@ -1174,6 +1255,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, unsigned int max_headroom; /* The extra header space needed */ int ret, local; int tun_type, gso_type; + int tun_flags; EnterFunction(10); @@ -1197,9 +1279,19 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); tun_type = cp->dest->tun_type; + tun_flags = cp->dest->tun_flags; - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + size_t gue_hdrlen, gue_optlen = 0; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } + gue_hdrlen = sizeof(struct guehdr) + gue_optlen; + + max_headroom += sizeof(struct udphdr) + gue_hdrlen; + } skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, &next_protocol, &payload_len, @@ -1208,8 +1300,17 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; gso_type = __tun_gso_type_mask(AF_INET6, cp->af); - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - gso_type |= SKB_GSO_UDP_TUNNEL; + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; + else + gso_type |= SKB_GSO_UDP_TUNNEL; + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + gso_type |= SKB_GSO_TUNNEL_REMCSUM; + } + } if (iptunnel_handle_offloads(skb, gso_type)) goto tx_error; @@ -1218,8 +1319,18 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb_set_inner_ipproto(skb, next_protocol); - if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) - ipvs_gue_encap(net, skb, cp, &next_protocol); + if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { + bool check = false; + + if (ipvs_gue_encap(net, skb, cp, &next_protocol)) + goto tx_error; + + if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || + (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) + check = true; + + udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len); + } skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); -- cgit v1.2.3 From b8d19572367bb019f77bbc921ef6bf965f1c8b22 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 31 May 2019 18:27:06 +0200 Subject: netfilter: use in_dev_for_each_ifa_rcu Netfilter hooks are always running under rcu read lock, use the new iterator macro so sparse won't complain once we add proper __rcu annotations. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/netfilter/nf_tproxy_ipv4.c | 9 +++++++-- net/netfilter/nf_conntrack_broadcast.c | 9 +++++++-- net/netfilter/nfnetlink_osf.c | 5 ++--- 3 files changed, 16 insertions(+), 7 deletions(-) (limited to 'net/netfilter') diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c index 164714104965..40c93b3bd731 100644 --- a/net/ipv4/netfilter/nf_tproxy_ipv4.c +++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c @@ -53,6 +53,7 @@ EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait4); __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) { + const struct in_ifaddr *ifa; struct in_device *indev; __be32 laddr; @@ -61,10 +62,14 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) laddr = 0; indev = __in_dev_get_rcu(skb->dev); - for_primary_ifa(indev) { + + in_dev_for_each_ifa_rcu(ifa, indev) { + if (ifa->ifa_flags & IFA_F_SECONDARY) + continue; + laddr = ifa->ifa_local; break; - } endfor_ifa(indev); + } return laddr ? laddr : daddr; } diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index 5423b197d98a..a5dbc3676a4f 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -41,12 +41,17 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, in_dev = __in_dev_get_rcu(rt->dst.dev); if (in_dev != NULL) { - for_primary_ifa(in_dev) { + const struct in_ifaddr *ifa; + + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (ifa->ifa_flags & IFA_F_SECONDARY) + continue; + if (ifa->ifa_broadcast == iph->daddr) { mask = ifa->ifa_mask; break; } - } endfor_ifa(in_dev); + } } if (mask == 0) diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index f42326b40d6f..9f5dea0064ea 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -33,6 +33,7 @@ static inline int nf_osf_ttl(const struct sk_buff *skb, { struct in_device *in_dev = __in_dev_get_rcu(skb->dev); const struct iphdr *ip = ip_hdr(skb); + const struct in_ifaddr *ifa; int ret = 0; if (ttl_check == NF_OSF_TTL_TRUE) @@ -42,15 +43,13 @@ static inline int nf_osf_ttl(const struct sk_buff *skb, else if (ip->ttl <= f_ttl) return 1; - for_ifa(in_dev) { + in_dev_for_each_ifa_rcu(ifa, in_dev) { if (inet_ifa_match(ip->saddr, ifa)) { ret = (ip->ttl == f_ttl); break; } } - endfor_ifa(in_dev); - return ret; } -- cgit v1.2.3 From 2638eb8b50cfc16240e0bb080b9afbf541a9b39d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 31 May 2019 18:27:09 +0200 Subject: net: ipv4: provide __rcu annotation for ifa_list ifa_list is protected by rcu, yet code doesn't reflect this. Add the __rcu annotations and fix up all places that are now reported by sparse. I've done this in the same commit to not add intermediate patches that result in new warnings. Reported-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- drivers/infiniband/hw/i40iw/i40iw_utils.c | 12 ++-- drivers/infiniband/hw/nes/nes.c | 8 ++- drivers/infiniband/hw/usnic/usnic_ib_main.c | 15 +++-- drivers/net/ethernet/via/via-velocity.h | 2 +- drivers/net/plip/plip.c | 4 +- drivers/net/vmxnet3/vmxnet3_drv.c | 19 ++++-- drivers/net/wireless/ath/ath6kl/cfg80211.c | 4 +- drivers/net/wireless/marvell/mwifiex/cfg80211.c | 2 +- drivers/staging/isdn/hysdn/hysdn_net.c | 6 +- include/linux/inetdevice.h | 21 ++---- net/core/netpoll.c | 10 ++- net/core/pktgen.c | 8 ++- net/ipv4/devinet.c | 88 ++++++++++++++++--------- net/mac80211/main.c | 4 +- net/netfilter/nf_nat_redirect.c | 12 ++-- 15 files changed, 134 insertions(+), 81 deletions(-) (limited to 'net/netfilter') diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c b/drivers/infiniband/hw/i40iw/i40iw_utils.c index 337410f40860..016524683e17 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_utils.c +++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c @@ -174,10 +174,14 @@ int i40iw_inetaddr_event(struct notifier_block *notifier, rcu_read_lock(); in = __in_dev_get_rcu(upper_dev); - if (!in->ifa_list) - local_ipaddr = 0; - else - local_ipaddr = ntohl(in->ifa_list->ifa_address); + local_ipaddr = 0; + if (in) { + struct in_ifaddr *ifa; + + ifa = rcu_dereference(in->ifa_list); + if (ifa) + local_ipaddr = ntohl(ifa->ifa_address); + } rcu_read_unlock(); } else { diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c index e00add6d78ec..29b324726ea6 100644 --- a/drivers/infiniband/hw/nes/nes.c +++ b/drivers/infiniband/hw/nes/nes.c @@ -183,7 +183,13 @@ static int nes_inetaddr_event(struct notifier_block *notifier, rcu_read_lock(); in = __in_dev_get_rcu(upper_dev); - nesvnic->local_ipaddr = in->ifa_list->ifa_address; + if (in) { + struct in_ifaddr *ifa; + + ifa = rcu_dereference(in->ifa_list); + if (ifa) + nesvnic->local_ipaddr = ifa->ifa_address; + } rcu_read_unlock(); } else { nesvnic->local_ipaddr = ifa->ifa_address; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index d88d9f8a7f9a..34c1f9d6c915 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -427,11 +427,16 @@ static void *usnic_ib_device_add(struct pci_dev *dev) if (netif_carrier_ok(us_ibdev->netdev)) usnic_fwd_carrier_up(us_ibdev->ufdev); - ind = in_dev_get(netdev); - if (ind->ifa_list) - usnic_fwd_add_ipaddr(us_ibdev->ufdev, - ind->ifa_list->ifa_address); - in_dev_put(ind); + rcu_read_lock(); + ind = __in_dev_get_rcu(netdev); + if (ind) { + const struct in_ifaddr *ifa; + + ifa = rcu_dereference(ind->ifa_list); + if (ifa) + usnic_fwd_add_ipaddr(us_ibdev->ufdev, ifa->ifa_address); + } + rcu_read_unlock(); usnic_mac_ip_to_gid(us_ibdev->netdev->perm_addr, us_ibdev->ufdev->inaddr, &gid.raw[0]); diff --git a/drivers/net/ethernet/via/via-velocity.h b/drivers/net/ethernet/via/via-velocity.h index c0ecc6c7b5e0..cdfe7809e3c1 100644 --- a/drivers/net/ethernet/via/via-velocity.h +++ b/drivers/net/ethernet/via/via-velocity.h @@ -1509,7 +1509,7 @@ static inline int velocity_get_ip(struct velocity_info *vptr) rcu_read_lock(); in_dev = __in_dev_get_rcu(vptr->netdev); if (in_dev != NULL) { - ifa = (struct in_ifaddr *) in_dev->ifa_list; + ifa = rcu_dereference(in_dev->ifa_list); if (ifa != NULL) { memcpy(vptr->ip_addr, &ifa->ifa_address, 4); res = 0; diff --git a/drivers/net/plip/plip.c b/drivers/net/plip/plip.c index feb92ecd1880..3e3ac2e496a1 100644 --- a/drivers/net/plip/plip.c +++ b/drivers/net/plip/plip.c @@ -1012,7 +1012,7 @@ plip_rewrite_address(const struct net_device *dev, struct ethhdr *eth) in_dev = __in_dev_get_rcu(dev); if (in_dev) { /* Any address will do - we take the first */ - const struct in_ifaddr *ifa = in_dev->ifa_list; + const struct in_ifaddr *ifa = rcu_dereference(in_dev->ifa_list); if (ifa) { memcpy(eth->h_source, dev->dev_addr, ETH_ALEN); memset(eth->h_dest, 0xfc, 2); @@ -1107,7 +1107,7 @@ plip_open(struct net_device *dev) /* Any address will do - we take the first. We already have the first two bytes filled with 0xfc, from plip_init_dev(). */ - struct in_ifaddr *ifa=in_dev->ifa_list; + const struct in_ifaddr *ifa = rcu_dereference(in_dev->ifa_list); if (ifa != NULL) { memcpy(dev->dev_addr+2, &ifa->ifa_local, 4); } diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 89984fcab01e..1b2a18ea855c 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -3651,13 +3651,19 @@ vmxnet3_suspend(struct device *device) } if (adapter->wol & WAKE_ARP) { - in_dev = in_dev_get(netdev); - if (!in_dev) + rcu_read_lock(); + + in_dev = __in_dev_get_rcu(netdev); + if (!in_dev) { + rcu_read_unlock(); goto skip_arp; + } - ifa = (struct in_ifaddr *)in_dev->ifa_list; - if (!ifa) + ifa = rcu_dereference(in_dev->ifa_list); + if (!ifa) { + rcu_read_unlock(); goto skip_arp; + } pmConf->filters[i].patternSize = ETH_HLEN + /* Ethernet header*/ sizeof(struct arphdr) + /* ARP header */ @@ -3677,7 +3683,9 @@ vmxnet3_suspend(struct device *device) /* The Unicast IPv4 address in 'tip' field. */ arpreq += 2 * ETH_ALEN + sizeof(u32); - *(u32 *)arpreq = ifa->ifa_address; + *(__be32 *)arpreq = ifa->ifa_address; + + rcu_read_unlock(); /* The mask for the relevant bits. */ pmConf->filters[i].mask[0] = 0x00; @@ -3686,7 +3694,6 @@ vmxnet3_suspend(struct device *device) pmConf->filters[i].mask[3] = 0x00; pmConf->filters[i].mask[4] = 0xC0; /* IPv4 TIP */ pmConf->filters[i].mask[5] = 0x03; /* IPv4 TIP */ - in_dev_put(in_dev); pmConf->wakeUpEvents |= VMXNET3_PM_WAKEUP_FILTER; i++; diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c index 5477a014e1fb..37cf602d8adf 100644 --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c @@ -2194,13 +2194,13 @@ static int ath6kl_wow_suspend_vif(struct ath6kl_vif *vif, if (!in_dev) return 0; - ifa = in_dev->ifa_list; + ifa = rtnl_dereference(in_dev->ifa_list); memset(&ips, 0, sizeof(ips)); /* Configure IP addr only if IP address count < MAX_IP_ADDRS */ while (index < MAX_IP_ADDRS && ifa) { ips[index] = ifa->ifa_local; - ifa = ifa->ifa_next; + ifa = rtnl_dereference(ifa->ifa_next); index++; } diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c index e11a4bb67172..5a7cdb981789 100644 --- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c +++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c @@ -3268,7 +3268,7 @@ static void mwifiex_set_auto_arp_mef_entry(struct mwifiex_private *priv, in_dev = __in_dev_get_rtnl(adapter->priv[i]->netdev); if (!in_dev) continue; - ifa = in_dev->ifa_list; + ifa = rtnl_dereference(in_dev->ifa_list); if (!ifa || !ifa->ifa_local) continue; ips[i] = ifa->ifa_local; diff --git a/drivers/staging/isdn/hysdn/hysdn_net.c b/drivers/staging/isdn/hysdn/hysdn_net.c index 8e9c34f33d86..bea37ae30ebb 100644 --- a/drivers/staging/isdn/hysdn/hysdn_net.c +++ b/drivers/staging/isdn/hysdn/hysdn_net.c @@ -70,9 +70,13 @@ net_open(struct net_device *dev) for (i = 0; i < ETH_ALEN; i++) dev->dev_addr[i] = 0xfc; if ((in_dev = dev->ip_ptr) != NULL) { - struct in_ifaddr *ifa = in_dev->ifa_list; + const struct in_ifaddr *ifa; + + rcu_read_lock(); + ifa = rcu_dereference(in_dev->ifa_list); if (ifa != NULL) memcpy(dev->dev_addr + (ETH_ALEN - sizeof(ifa->ifa_local)), &ifa->ifa_local, sizeof(ifa->ifa_local)); + rcu_read_unlock(); } } else memcpy(dev->dev_addr, card->mac_addr, ETH_ALEN); diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index d5d05503a04b..3515ca64e638 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -26,7 +26,7 @@ struct in_device { struct net_device *dev; refcount_t refcnt; int dead; - struct in_ifaddr *ifa_list; /* IP ifaddr chain */ + struct in_ifaddr __rcu *ifa_list;/* IP ifaddr chain */ struct ip_mc_list __rcu *mc_list; /* IP multicast filter chain */ struct ip_mc_list __rcu * __rcu *mc_hash; @@ -136,7 +136,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev) struct in_ifaddr { struct hlist_node hash; - struct in_ifaddr *ifa_next; + struct in_ifaddr __rcu *ifa_next; struct in_device *ifa_dev; struct rcu_head rcu_head; __be32 ifa_local; @@ -206,22 +206,13 @@ static __inline__ bool bad_mask(__be32 mask, __be32 addr) return false; } -#define for_primary_ifa(in_dev) { struct in_ifaddr *ifa; \ - for (ifa = (in_dev)->ifa_list; ifa && !(ifa->ifa_flags&IFA_F_SECONDARY); ifa = ifa->ifa_next) - -#define for_ifa(in_dev) { struct in_ifaddr *ifa; \ - for (ifa = (in_dev)->ifa_list; ifa; ifa = ifa->ifa_next) - - -#define endfor_ifa(in_dev) } - #define in_dev_for_each_ifa_rtnl(ifa, in_dev) \ - for (ifa = (in_dev)->ifa_list; ifa; \ - ifa = ifa->ifa_next) + for (ifa = rtnl_dereference((in_dev)->ifa_list); ifa; \ + ifa = rtnl_dereference(ifa->ifa_next)) #define in_dev_for_each_ifa_rcu(ifa, in_dev) \ - for (ifa = (in_dev)->ifa_list; ifa; \ - ifa = ifa->ifa_next) + for (ifa = rcu_dereference((in_dev)->ifa_list); ifa; \ + ifa = rcu_dereference(ifa->ifa_next)) static inline struct in_device *__in_dev_get_rcu(const struct net_device *dev) { diff --git a/net/core/netpoll.c b/net/core/netpoll.c index dd8b1a460d64..2cf27da1baeb 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -696,16 +696,22 @@ int netpoll_setup(struct netpoll *np) if (!np->local_ip.ip) { if (!np->ipv6) { + const struct in_ifaddr *ifa; + in_dev = __in_dev_get_rtnl(ndev); + if (!in_dev) + goto put_noaddr; - if (!in_dev || !in_dev->ifa_list) { + ifa = rtnl_dereference(in_dev->ifa_list); + if (!ifa) { +put_noaddr: np_err(np, "no IP address for %s, aborting\n", np->dev_name); err = -EDESTADDRREQ; goto put; } - np->local_ip.ip = in_dev->ifa_list->ifa_local; + np->local_ip.ip = ifa->ifa_local; np_info(np, "local IP %pI4\n", &np->local_ip.ip); } else { #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 319ad5490fb3..4cd120dc30ad 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2125,9 +2125,11 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) rcu_read_lock(); in_dev = __in_dev_get_rcu(pkt_dev->odev); if (in_dev) { - if (in_dev->ifa_list) { - pkt_dev->saddr_min = - in_dev->ifa_list->ifa_address; + const struct in_ifaddr *ifa; + + ifa = rcu_dereference(in_dev->ifa_list); + if (ifa) { + pkt_dev->saddr_min = ifa->ifa_address; pkt_dev->saddr_max = pkt_dev->saddr_min; } } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index b45421b2b734..ebaea05b4033 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -194,7 +194,8 @@ static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); static BLOCKING_NOTIFIER_HEAD(inetaddr_validator_chain); -static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, +static void inet_del_ifa(struct in_device *in_dev, + struct in_ifaddr __rcu **ifap, int destroy); #ifdef CONFIG_SYSCTL static int devinet_sysctl_register(struct in_device *idev); @@ -300,8 +301,8 @@ static void in_dev_rcu_put(struct rcu_head *head) static void inetdev_destroy(struct in_device *in_dev) { - struct in_ifaddr *ifa; struct net_device *dev; + struct in_ifaddr *ifa; ASSERT_RTNL(); @@ -311,7 +312,7 @@ static void inetdev_destroy(struct in_device *in_dev) ip_mc_destroy_dev(in_dev); - while ((ifa = in_dev->ifa_list) != NULL) { + while ((ifa = rtnl_dereference(in_dev->ifa_list)) != NULL) { inet_del_ifa(in_dev, &in_dev->ifa_list, 0); inet_free_ifa(ifa); } @@ -342,17 +343,20 @@ int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b) return 0; } -static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, - int destroy, struct nlmsghdr *nlh, u32 portid) +static void __inet_del_ifa(struct in_device *in_dev, + struct in_ifaddr __rcu **ifap, + int destroy, struct nlmsghdr *nlh, u32 portid) { struct in_ifaddr *promote = NULL; - struct in_ifaddr *ifa, *ifa1 = *ifap; - struct in_ifaddr *last_prim = in_dev->ifa_list; + struct in_ifaddr *ifa, *ifa1; + struct in_ifaddr *last_prim; struct in_ifaddr *prev_prom = NULL; int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev); ASSERT_RTNL(); + ifa1 = rtnl_dereference(*ifap); + last_prim = rtnl_dereference(in_dev->ifa_list); if (in_dev->dead) goto no_promotions; @@ -361,9 +365,9 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, **/ if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) { - struct in_ifaddr **ifap1 = &ifa1->ifa_next; + struct in_ifaddr __rcu **ifap1 = &ifa1->ifa_next; - while ((ifa = *ifap1) != NULL) { + while ((ifa = rtnl_dereference(*ifap1)) != NULL) { if (!(ifa->ifa_flags & IFA_F_SECONDARY) && ifa1->ifa_scope <= ifa->ifa_scope) last_prim = ifa; @@ -396,7 +400,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, * and later to add them back with new prefsrc. Do this * while all addresses are on the device list. */ - for (ifa = promote; ifa; ifa = ifa->ifa_next) { + for (ifa = promote; ifa; ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) fib_del_ifaddr(ifa, ifa1); @@ -422,19 +426,24 @@ no_promotions: blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); if (promote) { - struct in_ifaddr *next_sec = promote->ifa_next; + struct in_ifaddr *next_sec; + next_sec = rtnl_dereference(promote->ifa_next); if (prev_prom) { - prev_prom->ifa_next = promote->ifa_next; - promote->ifa_next = last_prim->ifa_next; - last_prim->ifa_next = promote; + struct in_ifaddr *last_sec; + + last_sec = rtnl_dereference(last_prim->ifa_next); + rcu_assign_pointer(prev_prom->ifa_next, next_sec); + rcu_assign_pointer(promote->ifa_next, last_sec); + rcu_assign_pointer(last_prim->ifa_next, promote); } promote->ifa_flags &= ~IFA_F_SECONDARY; rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote); - for (ifa = next_sec; ifa; ifa = ifa->ifa_next) { + for (ifa = next_sec; ifa; + ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) continue; @@ -446,7 +455,8 @@ no_promotions: inet_free_ifa(ifa1); } -static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, +static void inet_del_ifa(struct in_device *in_dev, + struct in_ifaddr __rcu **ifap, int destroy) { __inet_del_ifa(in_dev, ifap, destroy, NULL, 0); @@ -459,9 +469,10 @@ static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime); static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid, struct netlink_ext_ack *extack) { + struct in_ifaddr __rcu **last_primary, **ifap; struct in_device *in_dev = ifa->ifa_dev; - struct in_ifaddr *ifa1, **ifap, **last_primary; struct in_validator_info ivi; + struct in_ifaddr *ifa1; int ret; ASSERT_RTNL(); @@ -474,8 +485,10 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, ifa->ifa_flags &= ~IFA_F_SECONDARY; last_primary = &in_dev->ifa_list; - for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL; - ifap = &ifa1->ifa_next) { + ifap = &in_dev->ifa_list; + ifa1 = rtnl_dereference(*ifap); + + while (ifa1) { if (!(ifa1->ifa_flags & IFA_F_SECONDARY) && ifa->ifa_scope <= ifa1->ifa_scope) last_primary = &ifa1->ifa_next; @@ -491,6 +504,9 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, } ifa->ifa_flags |= IFA_F_SECONDARY; } + + ifap = &ifa1->ifa_next; + ifa1 = rtnl_dereference(*ifap); } /* Allow any devices that wish to register ifaddr validtors to weigh @@ -516,8 +532,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, ifap = last_primary; } - ifa->ifa_next = *ifap; - *ifap = ifa; + rcu_assign_pointer(ifa->ifa_next, *ifap); + rcu_assign_pointer(*ifap, ifa); inet_hash_insert(dev_net(in_dev->dev), ifa); @@ -617,10 +633,12 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); + struct in_ifaddr __rcu **ifap; struct nlattr *tb[IFA_MAX+1]; struct in_device *in_dev; struct ifaddrmsg *ifm; - struct in_ifaddr *ifa, **ifap; + struct in_ifaddr *ifa; + int err = -EINVAL; ASSERT_RTNL(); @@ -637,7 +655,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + for (ifap = &in_dev->ifa_list; (ifa = rtnl_dereference(*ifap)) != NULL; ifap = &ifa->ifa_next) { if (tb[IFA_LOCAL] && ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL])) @@ -725,15 +743,20 @@ static void check_lifetime(struct work_struct *work) if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME && age >= ifa->ifa_valid_lft) { - struct in_ifaddr **ifap; - - for (ifap = &ifa->ifa_dev->ifa_list; - *ifap != NULL; ifap = &(*ifap)->ifa_next) { - if (*ifap == ifa) { + struct in_ifaddr __rcu **ifap; + struct in_ifaddr *tmp; + + ifap = &ifa->ifa_dev->ifa_list; + tmp = rtnl_dereference(*ifap); + while (tmp) { + tmp = rtnl_dereference(tmp->ifa_next); + if (rtnl_dereference(*ifap) == ifa) { inet_del_ifa(ifa->ifa_dev, ifap, 1); break; } + ifap = &tmp->ifa_next; + tmp = rtnl_dereference(*ifap); } } else if (ifa->ifa_preferred_lft != INFINITY_LIFE_TIME && @@ -977,8 +1000,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) { struct sockaddr_in sin_orig; struct sockaddr_in *sin = (struct sockaddr_in *)&ifr->ifr_addr; + struct in_ifaddr __rcu **ifap = NULL; struct in_device *in_dev; - struct in_ifaddr **ifap = NULL; struct in_ifaddr *ifa = NULL; struct net_device *dev; char *colon; @@ -1049,7 +1072,9 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) /* note: we only do this for a limited set of ioctls and only if the original address family was AF_INET. This is checked above. */ - for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + + for (ifap = &in_dev->ifa_list; + (ifa = rtnl_dereference(*ifap)) != NULL; ifap = &ifa->ifa_next) { if (!strcmp(ifr->ifr_name, ifa->ifa_label) && sin_orig.sin_addr.s_addr == @@ -1062,7 +1087,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) 4.3BSD-style and passed in junk so we fall back to comparing just the label */ if (!ifa) { - for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + for (ifap = &in_dev->ifa_list; + (ifa = rtnl_dereference(*ifap)) != NULL; ifap = &ifa->ifa_next) if (!strcmp(ifr->ifr_name, ifa->ifa_label)) break; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 2b608044ae23..1f11907dc528 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -354,11 +354,11 @@ static int ieee80211_ifa_changed(struct notifier_block *nb, sdata_lock(sdata); /* Copy the addresses to the bss_conf list */ - ifa = idev->ifa_list; + ifa = rtnl_dereference(idev->ifa_list); while (ifa) { if (c < IEEE80211_BSS_ARP_ADDR_LIST_LEN) bss_conf->arp_addr_list[c] = ifa->ifa_address; - ifa = ifa->ifa_next; + ifa = rtnl_dereference(ifa->ifa_next); c++; } diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c index 78a9e6454ff3..8598e80968e0 100644 --- a/net/netfilter/nf_nat_redirect.c +++ b/net/netfilter/nf_nat_redirect.c @@ -47,15 +47,17 @@ nf_nat_redirect_ipv4(struct sk_buff *skb, if (hooknum == NF_INET_LOCAL_OUT) { newdst = htonl(0x7F000001); } else { - struct in_device *indev; - struct in_ifaddr *ifa; + const struct in_device *indev; newdst = 0; indev = __in_dev_get_rcu(skb->dev); - if (indev && indev->ifa_list) { - ifa = indev->ifa_list; - newdst = ifa->ifa_local; + if (indev) { + const struct in_ifaddr *ifa; + + ifa = rcu_dereference(indev->ifa_list); + if (ifa) + newdst = ifa->ifa_local; } if (!newdst) -- cgit v1.2.3 From 24c509b2e2f1661ce9500fc7e32647113e62d7e3 Mon Sep 17 00:00:00 2001 From: Florent Fourcot Date: Mon, 10 Jun 2019 12:28:58 +0200 Subject: netfilter: ipset: remove useless memset() calls One of the memset call is buggy: it does not erase full array, but only pointer size. Moreover, after a check, first step of nla_parse_nested/nla_parse is to erase tb array as well. We can remove both calls safely. Signed-off-by: Florent Fourcot Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 3f4a4936f63c..faddcf398b73 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1599,7 +1599,6 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, int nla_rem; nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) { - memset(tb, 0, sizeof(tb)); if (nla_type(nla) != IPSET_ATTR_DATA || !flag_nested(nla) || nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) @@ -1651,7 +1650,6 @@ static int ip_set_udel(struct net *net, struct sock *ctnl, struct sk_buff *skb, int nla_rem; nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) { - memset(tb, 0, sizeof(*tb)); if (nla_type(nla) != IPSET_ATTR_DATA || !flag_nested(nla) || nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) -- cgit v1.2.3 From f0cb839084efdf99328bed393f12f2e5c258ce8d Mon Sep 17 00:00:00 2001 From: Florent Fourcot Date: Mon, 10 Jun 2019 12:42:56 +0200 Subject: netfilter: ipset: merge uadd and udel functions Both functions are using exactly the same code, except the command value passed to call_ad function. Signed-off-by: Florent Fourcot Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_core.c | 71 +++++++++++---------------------------- 1 file changed, 20 insertions(+), 51 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index faddcf398b73..2ad609900b22 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1561,10 +1561,12 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, return ret; } -static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_ad(struct net *net, struct sock *ctnl, + struct sk_buff *skb, + enum ipset_adt adt, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[], + struct netlink_ext_ack *extack) { struct ip_set_net *inst = ip_set_pernet(net); struct ip_set *set; @@ -1593,7 +1595,7 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, if (attr[IPSET_ATTR_DATA]) { if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags, + ret = call_ad(ctnl, skb, set, tb, adt, flags, use_lineno); } else { int nla_rem; @@ -1603,7 +1605,7 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, !flag_nested(nla) || nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, + ret = call_ad(ctnl, skb, set, tb, adt, flags, use_lineno); if (ret < 0) return ret; @@ -1612,55 +1614,22 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, return ret; } -static int ip_set_udel(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int ip_set_uadd(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const attr[], struct netlink_ext_ack *extack) { - struct ip_set_net *inst = ip_set_pernet(net); - struct ip_set *set; - struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; - const struct nlattr *nla; - u32 flags = flag_exist(nlh); - bool use_lineno; - int ret = 0; - - if (unlikely(protocol_min_failed(attr) || - !attr[IPSET_ATTR_SETNAME] || - !((attr[IPSET_ATTR_DATA] != NULL) ^ - (attr[IPSET_ATTR_ADT] != NULL)) || - (attr[IPSET_ATTR_DATA] && - !flag_nested(attr[IPSET_ATTR_DATA])) || - (attr[IPSET_ATTR_ADT] && - (!flag_nested(attr[IPSET_ATTR_ADT]) || - !attr[IPSET_ATTR_LINENO])))) - return -IPSET_ERR_PROTOCOL; - - set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME])); - if (!set) - return -ENOENT; - - use_lineno = !!attr[IPSET_ATTR_LINENO]; - if (attr[IPSET_ATTR_DATA]) { - if (nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) - return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, flags, - use_lineno); - } else { - int nla_rem; + return ip_set_ad(net, ctnl, skb, + IPSET_ADD, nlh, attr, extack); +} - nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) { - if (nla_type(nla) != IPSET_ATTR_DATA || - !flag_nested(nla) || - nla_parse_nested_deprecated(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) - return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, - flags, use_lineno); - if (ret < 0) - return ret; - } - } - return ret; +static int ip_set_udel(struct net *net, struct sock *ctnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const attr[], + struct netlink_ext_ack *extack) +{ + return ip_set_ad(net, ctnl, skb, + IPSET_DEL, nlh, attr, extack); } static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, -- cgit v1.2.3 From f4f5748bfec94cf418e49bf05f0c81a1b9ebc950 Mon Sep 17 00:00:00 2001 From: Aditya Pakki Date: Mon, 10 Jun 2019 12:47:37 +0200 Subject: netfilter: ipset: fix a missing check of nla_parse When nla_parse fails, we should not use the results (the first argument). The fix checks if it fails, and if so, returns its error code upstream. Signed-off-by: Aditya Pakki Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_core.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 2ad609900b22..d0f4c627ff91 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1544,10 +1544,14 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, memcpy(&errmsg->msg, nlh, nlh->nlmsg_len); cmdattr = (void *)&errmsg->msg + min_len; - nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, cmdattr, - nlh->nlmsg_len - min_len, - ip_set_adt_policy, NULL); + ret = nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, cmdattr, + nlh->nlmsg_len - min_len, + ip_set_adt_policy, NULL); + if (ret) { + nlmsg_free(skb2); + return ret; + } errline = nla_data(cda[IPSET_ATTR_LINENO]); *errline = lineno; -- cgit v1.2.3 From 13c6ba1f855415cf3b9c58ea926ae8858050ec1c Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 10 Jun 2019 12:50:00 +0200 Subject: netfilter: ipset: Fix the last missing check of nla_parse_deprecated() In dump_init() the outdated comment was incorrect and we had a missing validation check of nla_parse_deprecated(). Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index d0f4c627ff91..039892cd2b7d 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1293,11 +1293,13 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst) struct nlattr *attr = (void *)nlh + min_len; u32 dump_type; ip_set_id_t index; + int ret; - /* Second pass, so parser can't fail */ - nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, attr, - nlh->nlmsg_len - min_len, ip_set_setname_policy, - NULL); + ret = nla_parse_deprecated(cda, IPSET_ATTR_CMD_MAX, attr, + nlh->nlmsg_len - min_len, + ip_set_setname_policy, NULL); + if (ret) + return ret; cb->args[IPSET_CB_PROTO] = nla_get_u8(cda[IPSET_ATTR_PROTOCOL]); if (cda[IPSET_ATTR_SETNAME]) { -- cgit v1.2.3 From b1732e1638925a2b60b5b453b25f59bf4e79e010 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 10 Jun 2019 12:58:26 +0200 Subject: netfilter: ipset: Fix error path in set_target_v3_checkentry() Fix error path and release the references properly. Signed-off-by: Jozsef Kadlecsik --- net/netfilter/xt_set.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index bf2890b13212..cf67bbe07dc2 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -439,6 +439,7 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) { const struct xt_set_info_target_v3 *info = par->targinfo; ip_set_id_t index; + int ret = 0; if (info->add_set.index != IPSET_INVALID_ID) { index = ip_set_nfnl_get_byindex(par->net, @@ -456,17 +457,16 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) if (index == IPSET_INVALID_ID) { pr_info_ratelimited("Cannot find del_set index %u as target\n", info->del_set.index); - if (info->add_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, - info->add_set.index); - return -ENOENT; + ret = -ENOENT; + goto cleanup_add; } } if (info->map_set.index != IPSET_INVALID_ID) { if (strncmp(par->table, "mangle", 7)) { pr_info_ratelimited("--map-set only usable from mangle table\n"); - return -EINVAL; + ret = -EINVAL; + goto cleanup_del; } if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) | (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) && @@ -474,20 +474,16 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) 1 << NF_INET_LOCAL_OUT | 1 << NF_INET_POST_ROUTING))) { pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n"); - return -EINVAL; + ret = -EINVAL; + goto cleanup_del; } index = ip_set_nfnl_get_byindex(par->net, info->map_set.index); if (index == IPSET_INVALID_ID) { pr_info_ratelimited("Cannot find map_set index %u as target\n", info->map_set.index); - if (info->add_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, - info->add_set.index); - if (info->del_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, - info->del_set.index); - return -ENOENT; + ret = -ENOENT; + goto cleanup_del; } } @@ -495,16 +491,21 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) info->del_set.dim > IPSET_DIM_MAX || info->map_set.dim > IPSET_DIM_MAX) { pr_info_ratelimited("SET target dimension over the limit!\n"); - if (info->add_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, info->add_set.index); - if (info->del_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, info->del_set.index); - if (info->map_set.index != IPSET_INVALID_ID) - ip_set_nfnl_put(par->net, info->map_set.index); - return -ERANGE; + ret = -ERANGE; + goto cleanup_mark; } return 0; +cleanup_mark: + if (info->map_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->map_set.index); +cleanup_del: + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->del_set.index); +cleanup_add: + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->add_set.index); + return ret; } static void -- cgit v1.2.3 From 11921796f4799ca9c61c4b22cc54d84aa69f8a35 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Sun, 26 May 2019 23:14:06 +0200 Subject: ipset: Fix memory accounting for hash types on resize If a fresh array block is allocated during resize, the current in-memory set size should be increased by the size of the block, not replaced by it. Before the fix, adding entries to a hash set type, leading to a table resize, caused an inconsistent memory size to be reported. This becomes more obvious when swapping sets with similar sizes: # cat hash_ip_size.sh #!/bin/sh FAIL_RETRIES=10 tries=0 while [ ${tries} -lt ${FAIL_RETRIES} ]; do ipset create t1 hash:ip for i in `seq 1 4345`; do ipset add t1 1.2.$((i / 255)).$((i % 255)) done t1_init="$(ipset list t1|sed -n 's/Size in memory: \(.*\)/\1/p')" ipset create t2 hash:ip for i in `seq 1 4360`; do ipset add t2 1.2.$((i / 255)).$((i % 255)) done t2_init="$(ipset list t2|sed -n 's/Size in memory: \(.*\)/\1/p')" ipset swap t1 t2 t1_swap="$(ipset list t1|sed -n 's/Size in memory: \(.*\)/\1/p')" t2_swap="$(ipset list t2|sed -n 's/Size in memory: \(.*\)/\1/p')" ipset destroy t1 ipset destroy t2 tries=$((tries + 1)) if [ ${t1_init} -lt 10000 ] || [ ${t2_init} -lt 10000 ]; then echo "FAIL after ${tries} tries:" echo "T1 size ${t1_init}, after swap ${t1_swap}" echo "T2 size ${t2_init}, after swap ${t2_swap}" exit 1 fi done echo "PASS" # echo -n 'func hash_ip4_resize +p' > /sys/kernel/debug/dynamic_debug/control # ./hash_ip_size.sh [ 2035.018673] attempt to resize set t1 from 10 to 11, t 00000000fe6551fa [ 2035.078583] set t1 resized from 10 (00000000fe6551fa) to 11 (00000000172a0163) [ 2035.080353] Table destroy by resize 00000000fe6551fa FAIL after 4 tries: T1 size 9064, after swap 71128 T2 size 71128, after swap 9064 Reported-by: NOYB Fixes: 9e41f26a505c ("netfilter: ipset: Count non-static extension memory for userspace") Signed-off-by: Stefano Brivio Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_hash_gen.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/netfilter') diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 01d51f775f12..623e0d675725 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -625,7 +625,7 @@ retry: goto cleanup; } m->size = AHASH_INIT_SIZE; - extsize = ext_size(AHASH_INIT_SIZE, dsize); + extsize += ext_size(AHASH_INIT_SIZE, dsize); RCU_INIT_POINTER(hbucket(t, key), m); } else if (m->pos >= m->size) { struct hbucket *ht; -- cgit v1.2.3 From fe03d4745675cbd678cb8c50d951df0abafdcaee Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Mon, 10 Jun 2019 13:00:24 +0200 Subject: Update my email address It's better to use my kadlec@netfilter.org email address in the source code. I might not be able to use kadlec@blackhole.kfki.hu in the future. Signed-off-by: Jozsef Kadlecsik Signed-off-by: Jozsef Kadlecsik --- CREDITS | 2 +- MAINTAINERS | 2 +- include/linux/jhash.h | 2 +- include/linux/netfilter/ipset/ip_set.h | 2 +- include/linux/netfilter/ipset/ip_set_counter.h | 2 +- include/linux/netfilter/ipset/ip_set_skbinfo.h | 2 +- include/linux/netfilter/ipset/ip_set_timeout.h | 2 +- include/uapi/linux/netfilter/ipset/ip_set.h | 2 +- net/ipv4/netfilter/iptable_raw.c | 2 +- net/ipv4/netfilter/nf_nat_h323.c | 2 +- net/ipv6/netfilter/ip6table_raw.c | 2 +- net/netfilter/ipset/ip_set_bitmap_gen.h | 2 +- net/netfilter/ipset/ip_set_bitmap_ip.c | 4 ++-- net/netfilter/ipset/ip_set_bitmap_ipmac.c | 4 ++-- net/netfilter/ipset/ip_set_bitmap_port.c | 4 ++-- net/netfilter/ipset/ip_set_core.c | 4 ++-- net/netfilter/ipset/ip_set_getport.c | 2 +- net/netfilter/ipset/ip_set_hash_gen.h | 2 +- net/netfilter/ipset/ip_set_hash_ip.c | 4 ++-- net/netfilter/ipset/ip_set_hash_ipmark.c | 2 +- net/netfilter/ipset/ip_set_hash_ipport.c | 4 ++-- net/netfilter/ipset/ip_set_hash_ipportip.c | 4 ++-- net/netfilter/ipset/ip_set_hash_ipportnet.c | 4 ++-- net/netfilter/ipset/ip_set_hash_mac.c | 4 ++-- net/netfilter/ipset/ip_set_hash_net.c | 4 ++-- net/netfilter/ipset/ip_set_hash_netiface.c | 4 ++-- net/netfilter/ipset/ip_set_hash_netnet.c | 2 +- net/netfilter/ipset/ip_set_hash_netport.c | 4 ++-- net/netfilter/ipset/ip_set_hash_netportnet.c | 2 +- net/netfilter/ipset/ip_set_list_set.c | 4 ++-- net/netfilter/nf_conntrack_h323_main.c | 2 +- net/netfilter/nf_conntrack_proto_tcp.c | 2 +- net/netfilter/xt_iprange.c | 4 ++-- net/netfilter/xt_set.c | 4 ++-- 34 files changed, 49 insertions(+), 49 deletions(-) (limited to 'net/netfilter') diff --git a/CREDITS b/CREDITS index 8e0342620a06..4200f4f91a16 100644 --- a/CREDITS +++ b/CREDITS @@ -1800,7 +1800,7 @@ S: 2300 Copenhagen S. S: Denmark N: Jozsef Kadlecsik -E: kadlec@blackhole.kfki.hu +E: kadlec@netfilter.org P: 1024D/470DB964 4CB3 1A05 713E 9BF7 FAC5 5809 DD8C B7B1 470D B964 D: netfilter: TCP window tracking code D: netfilter: raw table diff --git a/MAINTAINERS b/MAINTAINERS index fcbd648b960e..4c65ce86fc9e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10858,7 +10858,7 @@ F: drivers/net/ethernet/neterion/ NETFILTER M: Pablo Neira Ayuso -M: Jozsef Kadlecsik +M: Jozsef Kadlecsik M: Florian Westphal L: netfilter-devel@vger.kernel.org L: coreteam@netfilter.org diff --git a/include/linux/jhash.h b/include/linux/jhash.h index 8037850f3104..ba2f6a9776b6 100644 --- a/include/linux/jhash.h +++ b/include/linux/jhash.h @@ -17,7 +17,7 @@ * if SELF_TEST is defined. You can use this free for any purpose. It's in * the public domain. It has no warranty. * - * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@blackhole.kfki.hu) + * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@netfilter.org) * * I've modified Bob's hash to be useful in the Linux kernel, and * any bugs present are my fault. diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index e499d170f12d..f5c6e7cd6469 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -1,7 +1,7 @@ /* Copyright (C) 2000-2002 Joakim Axelsson * Patrick Schaaf * Martin Josefsson - * Copyright (C) 2003-2013 Jozsef Kadlecsik + * Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/include/linux/netfilter/ipset/ip_set_counter.h b/include/linux/netfilter/ipset/ip_set_counter.h index 3d33a2c3f39f..305aeda2a899 100644 --- a/include/linux/netfilter/ipset/ip_set_counter.h +++ b/include/linux/netfilter/ipset/ip_set_counter.h @@ -1,7 +1,7 @@ #ifndef _IP_SET_COUNTER_H #define _IP_SET_COUNTER_H -/* Copyright (C) 2015 Jozsef Kadlecsik +/* Copyright (C) 2015 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/include/linux/netfilter/ipset/ip_set_skbinfo.h b/include/linux/netfilter/ipset/ip_set_skbinfo.h index 29d7ef2bc3fa..fac57ef854c2 100644 --- a/include/linux/netfilter/ipset/ip_set_skbinfo.h +++ b/include/linux/netfilter/ipset/ip_set_skbinfo.h @@ -1,7 +1,7 @@ #ifndef _IP_SET_SKBINFO_H #define _IP_SET_SKBINFO_H -/* Copyright (C) 2015 Jozsef Kadlecsik +/* Copyright (C) 2015 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/include/linux/netfilter/ipset/ip_set_timeout.h b/include/linux/netfilter/ipset/ip_set_timeout.h index 8ce271e187b6..dc74150f3432 100644 --- a/include/linux/netfilter/ipset/ip_set_timeout.h +++ b/include/linux/netfilter/ipset/ip_set_timeout.h @@ -1,7 +1,7 @@ #ifndef _IP_SET_TIMEOUT_H #define _IP_SET_TIMEOUT_H -/* Copyright (C) 2003-2013 Jozsef Kadlecsik +/* Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index ea69ca21ff23..eea166c52c36 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -2,7 +2,7 @@ /* Copyright (C) 2000-2002 Joakim Axelsson * Patrick Schaaf * Martin Josefsson - * Copyright (C) 2003-2011 Jozsef Kadlecsik + * Copyright (C) 2003-2011 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 6eefde5bc468..69697eb4bfc6 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -2,7 +2,7 @@ /* * 'raw' table, which is the very first hooked in at PRE_ROUTING and LOCAL_OUT . * - * Copyright (C) 2003 Jozsef Kadlecsik + * Copyright (C) 2003 Jozsef Kadlecsik */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 15f2b2604890..076b6b29d66d 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -7,7 +7,7 @@ * This source code is licensed under General Public License version 2. * * Based on the 'brute force' H.323 NAT module by - * Jozsef Kadlecsik + * Jozsef Kadlecsik */ #include diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 3f7d4691c423..a22100b1cf2c 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -2,7 +2,7 @@ /* * IPv6 raw table, a port of the IPv4 raw table to IPv6 * - * Copyright (C) 2003 Jozsef Kadlecsik + * Copyright (C) 2003 Jozsef Kadlecsik */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 38ef2ea838cb..29c1e9a50601 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2013 Jozsef Kadlecsik +/* Copyright (C) 2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 488d6d05c65c..5a66c5499700 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -1,6 +1,6 @@ /* Copyright (C) 2000-2002 Joakim Axelsson * Patrick Schaaf - * Copyright (C) 2003-2013 Jozsef Kadlecsik + * Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -31,7 +31,7 @@ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("bitmap:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:ip"); diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 980000fc3b50..ec7a8b12642c 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -1,7 +1,7 @@ /* Copyright (C) 2000-2002 Joakim Axelsson * Patrick Schaaf * Martin Josefsson - * Copyright (C) 2003-2013 Jozsef Kadlecsik + * Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -31,7 +31,7 @@ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:ip,mac"); diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index b561ca8b3659..18275ec4924c 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2013 Jozsef Kadlecsik +/* Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -26,7 +26,7 @@ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("bitmap:port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:port"); diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 039892cd2b7d..18430ad2fdf2 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1,6 +1,6 @@ /* Copyright (C) 2000-2002 Joakim Axelsson * Patrick Schaaf - * Copyright (C) 2003-2013 Jozsef Kadlecsik + * Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -51,7 +51,7 @@ static unsigned int max_sets; module_param(max_sets, int, 0600); MODULE_PARM_DESC(max_sets, "maximal number of sets"); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); MODULE_DESCRIPTION("core IP set support"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c index 3f09cdb42562..dc7b46b41354 100644 --- a/net/netfilter/ipset/ip_set_getport.c +++ b/net/netfilter/ipset/ip_set_getport.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2011 Jozsef Kadlecsik +/* Copyright (C) 2003-2011 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 623e0d675725..07ef941130a6 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2013 Jozsef Kadlecsik +/* Copyright (C) 2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 613eb212cb48..7b82bf1104ce 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2013 Jozsef Kadlecsik +/* Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -30,7 +30,7 @@ #define IPSET_TYPE_REV_MAX 4 /* skbinfo support */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("hash:ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip"); diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index f3ba8348cf9d..7d468f98a252 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2013 Jozsef Kadlecsik +/* Copyright (C) 2003-2013 Jozsef Kadlecsik * Copyright (C) 2013 Smoothwall Ltd. * * This program is free software; you can redistribute it and/or modify diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index ddb8039ec1d2..d358ee69d04b 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2013 Jozsef Kadlecsik +/* Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -32,7 +32,7 @@ #define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("hash:ip,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port"); diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index a7f4d7a85420..0a304785f912 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2013 Jozsef Kadlecsik +/* Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -32,7 +32,7 @@ #define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("hash:ip,port,ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port,ip"); diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 88b83d6d3084..245f7d714870 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2013 Jozsef Kadlecsik +/* Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -34,7 +34,7 @@ #define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("hash:ip,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port,net"); diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index 4fe5f243d0a3..3d1fc71dac38 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2014 Jozsef Kadlecsik +/* Copyright (C) 2014 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -23,7 +23,7 @@ #define IPSET_TYPE_REV_MAX 0 MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("hash:mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:mac"); diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 5449e23af13a..470701fda231 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2013 Jozsef Kadlecsik +/* Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -31,7 +31,7 @@ #define IPSET_TYPE_REV_MAX 6 /* skbinfo mapping support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("hash:net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net"); diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index f5164c1efce2..1df8656ad84d 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011-2013 Jozsef Kadlecsik +/* Copyright (C) 2011-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -32,7 +32,7 @@ #define IPSET_TYPE_REV_MAX 6 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net,iface"); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 5a2b923bd81f..e0553be89600 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2013 Jozsef Kadlecsik +/* Copyright (C) 2003-2013 Jozsef Kadlecsik * Copyright (C) 2013 Oliver Smith * * This program is free software; you can redistribute it and/or modify diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 1a187be9ebc8..943d55d76fcf 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2013 Jozsef Kadlecsik +/* Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -33,7 +33,7 @@ #define IPSET_TYPE_REV_MAX 7 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("hash:net,port", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net,port"); diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 613e18e720a4..afaff99e578c 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2013 Jozsef Kadlecsik +/* Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 4f894165cdcd..ed4360072f64 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2008-2013 Jozsef Kadlecsik +/* Copyright (C) 2008-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -22,7 +22,7 @@ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); IP_SET_MODULE_DESC("list:set", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_list:set"); diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 12de40390e97..1ff66e070cb2 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -7,7 +7,7 @@ * This source code is licensed under General Public License version 2. * * Based on the 'brute force' H.323 connection tracking module by - * Jozsef Kadlecsik + * Jozsef Kadlecsik * * For more information, please see http://nath323.sourceforge.net/ */ diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 7ba01d8ee165..60b68400435d 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1,6 +1,6 @@ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team - * (C) 2002-2013 Jozsef Kadlecsik + * (C) 2002-2013 Jozsef Kadlecsik * (C) 2006-2012 Patrick McHardy * * This program is free software; you can redistribute it and/or modify diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c index b46626cddd93..4ab4155706d7 100644 --- a/net/netfilter/xt_iprange.c +++ b/net/netfilter/xt_iprange.c @@ -1,7 +1,7 @@ /* * xt_iprange - Netfilter module to match IP address ranges * - * (C) 2003 Jozsef Kadlecsik + * (C) 2003 Jozsef Kadlecsik * (C) CC Computer Consultants GmbH, 2008 * * This program is free software; you can redistribute it and/or modify @@ -133,7 +133,7 @@ static void __exit iprange_mt_exit(void) module_init(iprange_mt_init); module_exit(iprange_mt_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); MODULE_AUTHOR("Jan Engelhardt "); MODULE_DESCRIPTION("Xtables: arbitrary IPv4 range matching"); MODULE_ALIAS("ipt_iprange"); diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index cf67bbe07dc2..f025c51ba375 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -1,7 +1,7 @@ /* Copyright (C) 2000-2002 Joakim Axelsson * Patrick Schaaf * Martin Josefsson - * Copyright (C) 2003-2013 Jozsef Kadlecsik + * Copyright (C) 2003-2013 Jozsef Kadlecsik * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -21,7 +21,7 @@ #include MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_AUTHOR("Jozsef Kadlecsik "); MODULE_DESCRIPTION("Xtables: IP set match and target module"); MODULE_ALIAS("xt_SET"); MODULE_ALIAS("ipt_set"); -- cgit v1.2.3 From 857b46027d6f91150797295752581b7155b9d0e1 Mon Sep 17 00:00:00 2001 From: Stéphane Veyret Date: Sat, 25 May 2019 15:30:58 +0200 Subject: netfilter: nft_ct: add ct expectations support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch allows to add, list and delete expectations via nft objref infrastructure and assigning these expectations via nft rule. This allows manual port triggering when no helper is defined to manage a specific protocol. For example, if I have an online game which protocol is based on initial connection to TCP port 9753 of the server, and where the server opens a connection to port 9876, I can set rules as follow: table ip filter { ct expectation mygame { protocol udp; dport 9876; timeout 2m; size 1; } chain input { type filter hook input priority 0; policy drop; tcp dport 9753 ct expectation set "mygame"; } chain output { type filter hook output priority 0; policy drop; udp dport 9876 ct status expected accept; } } Signed-off-by: Stéphane Veyret Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 14 +++- net/netfilter/nft_ct.c | 138 ++++++++++++++++++++++++++++++- 2 files changed, 149 insertions(+), 3 deletions(-) (limited to 'net/netfilter') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 505393c6e959..31a6b8f7ff73 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1445,6 +1445,17 @@ enum nft_ct_timeout_timeout_attributes { }; #define NFTA_CT_TIMEOUT_MAX (__NFTA_CT_TIMEOUT_MAX - 1) +enum nft_ct_expectation_attributes { + NFTA_CT_EXPECT_UNSPEC, + NFTA_CT_EXPECT_L3PROTO, + NFTA_CT_EXPECT_L4PROTO, + NFTA_CT_EXPECT_DPORT, + NFTA_CT_EXPECT_TIMEOUT, + NFTA_CT_EXPECT_SIZE, + __NFTA_CT_EXPECT_MAX, +}; +#define NFTA_CT_EXPECT_MAX (__NFTA_CT_EXPECT_MAX - 1) + #define NFT_OBJECT_UNSPEC 0 #define NFT_OBJECT_COUNTER 1 #define NFT_OBJECT_QUOTA 2 @@ -1454,7 +1465,8 @@ enum nft_ct_timeout_timeout_attributes { #define NFT_OBJECT_TUNNEL 6 #define NFT_OBJECT_CT_TIMEOUT 7 #define NFT_OBJECT_SECMARK 8 -#define __NFT_OBJECT_MAX 9 +#define NFT_OBJECT_CT_EXPECT 9 +#define __NFT_OBJECT_MAX 10 #define NFT_OBJECT_MAX (__NFT_OBJECT_MAX - 1) /** diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index f043936763f3..06b52c894573 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -24,6 +24,7 @@ #include #include #include +#include struct nft_ct { enum nft_ct_keys key:8; @@ -1156,6 +1157,131 @@ static struct nft_object_type nft_ct_helper_obj_type __read_mostly = { .owner = THIS_MODULE, }; +struct nft_ct_expect_obj { + u16 l3num; + __be16 dport; + u8 l4proto; + u8 size; + u32 timeout; +}; + +static int nft_ct_expect_obj_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct nft_object *obj) +{ + struct nft_ct_expect_obj *priv = nft_obj_data(obj); + + if (!tb[NFTA_CT_EXPECT_L4PROTO] || + !tb[NFTA_CT_EXPECT_DPORT] || + !tb[NFTA_CT_EXPECT_TIMEOUT] || + !tb[NFTA_CT_EXPECT_SIZE]) + return -EINVAL; + + priv->l3num = ctx->family; + if (tb[NFTA_CT_EXPECT_L3PROTO]) + priv->l3num = ntohs(nla_get_be16(tb[NFTA_CT_EXPECT_L3PROTO])); + + priv->l4proto = nla_get_u8(tb[NFTA_CT_EXPECT_L4PROTO]); + priv->dport = nla_get_be16(tb[NFTA_CT_EXPECT_DPORT]); + priv->timeout = nla_get_u32(tb[NFTA_CT_EXPECT_TIMEOUT]); + priv->size = nla_get_u8(tb[NFTA_CT_EXPECT_SIZE]); + + return nf_ct_netns_get(ctx->net, ctx->family); +} + +static void nft_ct_expect_obj_destroy(const struct nft_ctx *ctx, + struct nft_object *obj) +{ + nf_ct_netns_put(ctx->net, ctx->family); +} + +static int nft_ct_expect_obj_dump(struct sk_buff *skb, + struct nft_object *obj, bool reset) +{ + const struct nft_ct_expect_obj *priv = nft_obj_data(obj); + + if (nla_put_be16(skb, NFTA_CT_EXPECT_L3PROTO, htons(priv->l3num)) || + nla_put_u8(skb, NFTA_CT_EXPECT_L4PROTO, priv->l4proto) || + nla_put_be16(skb, NFTA_CT_EXPECT_DPORT, priv->dport) || + nla_put_u32(skb, NFTA_CT_EXPECT_TIMEOUT, priv->timeout) || + nla_put_u8(skb, NFTA_CT_EXPECT_SIZE, priv->size)) + return -1; + + return 0; +} + +static void nft_ct_expect_obj_eval(struct nft_object *obj, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_ct_expect_obj *priv = nft_obj_data(obj); + struct nf_conntrack_expect *exp; + enum ip_conntrack_info ctinfo; + struct nf_conn_help *help; + enum ip_conntrack_dir dir; + u16 l3num = priv->l3num; + struct nf_conn *ct; + + ct = nf_ct_get(pkt->skb, &ctinfo); + if (!ct || ctinfo == IP_CT_UNTRACKED) { + regs->verdict.code = NFT_BREAK; + return; + } + dir = CTINFO2DIR(ctinfo); + + help = nfct_help(ct); + if (!help) + help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); + + if (help->expecting[NF_CT_EXPECT_CLASS_DEFAULT] >= priv->size) { + regs->verdict.code = NFT_BREAK; + return; + } + if (l3num == NFPROTO_INET) + l3num = nf_ct_l3num(ct); + + exp = nf_ct_expect_alloc(ct); + if (exp == NULL) { + regs->verdict.code = NF_DROP; + return; + } + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, l3num, + &ct->tuplehash[!dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + priv->l4proto, NULL, &priv->dport); + exp->timeout.expires = jiffies + priv->timeout * HZ; + + if (nf_ct_expect_related(exp) != 0) + regs->verdict.code = NF_DROP; +} + +static const struct nla_policy nft_ct_expect_policy[NFTA_CT_EXPECT_MAX + 1] = { + [NFTA_CT_EXPECT_L3PROTO] = { .type = NLA_U16 }, + [NFTA_CT_EXPECT_L4PROTO] = { .type = NLA_U8 }, + [NFTA_CT_EXPECT_DPORT] = { .type = NLA_U16 }, + [NFTA_CT_EXPECT_TIMEOUT] = { .type = NLA_U32 }, + [NFTA_CT_EXPECT_SIZE] = { .type = NLA_U8 }, +}; + +static struct nft_object_type nft_ct_expect_obj_type; + +static const struct nft_object_ops nft_ct_expect_obj_ops = { + .type = &nft_ct_expect_obj_type, + .size = sizeof(struct nft_ct_expect_obj), + .eval = nft_ct_expect_obj_eval, + .init = nft_ct_expect_obj_init, + .destroy = nft_ct_expect_obj_destroy, + .dump = nft_ct_expect_obj_dump, +}; + +static struct nft_object_type nft_ct_expect_obj_type __read_mostly = { + .type = NFT_OBJECT_CT_EXPECT, + .ops = &nft_ct_expect_obj_ops, + .maxattr = NFTA_CT_EXPECT_MAX, + .policy = nft_ct_expect_policy, + .owner = THIS_MODULE, +}; + static int __init nft_ct_module_init(void) { int err; @@ -1173,17 +1299,23 @@ static int __init nft_ct_module_init(void) err = nft_register_obj(&nft_ct_helper_obj_type); if (err < 0) goto err2; + + err = nft_register_obj(&nft_ct_expect_obj_type); + if (err < 0) + goto err3; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT err = nft_register_obj(&nft_ct_timeout_obj_type); if (err < 0) - goto err3; + goto err4; #endif return 0; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT +err4: + nft_unregister_obj(&nft_ct_expect_obj_type); +#endif err3: nft_unregister_obj(&nft_ct_helper_obj_type); -#endif err2: nft_unregister_expr(&nft_notrack_type); err1: @@ -1196,6 +1328,7 @@ static void __exit nft_ct_module_exit(void) #ifdef CONFIG_NF_CONNTRACK_TIMEOUT nft_unregister_obj(&nft_ct_timeout_obj_type); #endif + nft_unregister_obj(&nft_ct_expect_obj_type); nft_unregister_obj(&nft_ct_helper_obj_type); nft_unregister_expr(&nft_notrack_type); nft_unregister_expr(&nft_ct_type); @@ -1210,3 +1343,4 @@ MODULE_ALIAS_NFT_EXPR("ct"); MODULE_ALIAS_NFT_EXPR("notrack"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_TIMEOUT); +MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_EXPECT); -- cgit v1.2.3 From 87e389b4c20091b562bd65d90272f9d7c67eb437 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 4 Jun 2019 14:14:04 +0200 Subject: netfilter: conntrack: small conntrack lookup optimization ____nf_conntrack_find() performs checks on the conntrack objects in this order: 1. if (nf_ct_is_expired(ct)) This fetches ct->timeout, in third cache line. The hnnode that is used to store the list pointers resides in the first (origin) or second (reply tuple) cache lines. This test rarely passes, but its necessary to reap obsolete entries. 2. if (nf_ct_is_dying(ct)) This fetches ct->status, also in third cache line. The test is useless, and can be removed: Consider: cpu0 cpu1 ct = ____nf_conntrack_find() atomic_inc_not_zero(ct) -> ok nf_ct_key_equal -> ok is_dying -> DYING bit not set, ok set_bit(ct, DYING); ... unhash ... etc. return ct -> returning a ct with dying bit set, despite having a test for it. This (unlikely) case is fine - refcount prevents ct from getting free'd. 3. if (nf_ct_key_equal(h, tuple, zone, net)) nf_ct_key_equal checks in following order: 1. Tuple equal (first or second cacheline) 2. Zone equal (third cacheline) 3. confirmed bit set (->status, third cacheline) 4. net namespace match (third cacheline). Swapping "timeout" and "cpu" places timeout in the first cacheline. This has two advantages: 1. For a conntrack that won't even match the original tuple, we will now only fetch the first and maybe the second cacheline instead of always accessing the 3rd one as well. 2. in case of TCP ct->timeout changes frequently because we reduce/increase it when there are packets outstanding in the network. The first cacheline contains both the reference count and the ct spinlock, i.e. moving timeout there avoids writes to 3rd cacheline. The restart sequence in __nf_conntrack_find() is removed, if we found a candidate, but then fail to increment the refcount or discover the tuple has changed (object recycling), just pretend we did not find an entry. A second lookup won't find anything until another CPU adds a new conntrack with identical tuple into the hash table, which is very unlikely. We have the confirmation-time checks (when we hold hash lock) that deal with identical entries and even perform clash resolution in some cases. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 7 +++---- net/netfilter/nf_conntrack_core.c | 25 +++++++++++++------------ 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'net/netfilter') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 5cb19ce454d1..c86657d99630 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -70,7 +70,8 @@ struct nf_conn { struct nf_conntrack ct_general; spinlock_t lock; - u16 cpu; + /* jiffies32 when this ct is considered dead */ + u32 timeout; #ifdef CONFIG_NF_CONNTRACK_ZONES struct nf_conntrack_zone zone; @@ -82,9 +83,7 @@ struct nf_conn { /* Have we seen traffic both ways yet? (bitset) */ unsigned long status; - /* jiffies32 when this ct is considered dead */ - u32 timeout; - + u16 cpu; possible_net_t ct_net; #if IS_ENABLED(CONFIG_NF_NAT) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 2a714527cde1..2855a2e39fc4 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -752,9 +752,6 @@ begin: continue; } - if (nf_ct_is_dying(ct)) - continue; - if (nf_ct_key_equal(h, tuple, zone, net)) return h; } @@ -780,20 +777,24 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, struct nf_conn *ct; rcu_read_lock(); -begin: + h = ____nf_conntrack_find(net, zone, tuple, hash); if (h) { + /* We have a candidate that matches the tuple we're interested + * in, try to obtain a reference and re-check tuple + */ ct = nf_ct_tuplehash_to_ctrack(h); - if (unlikely(nf_ct_is_dying(ct) || - !atomic_inc_not_zero(&ct->ct_general.use))) - h = NULL; - else { - if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) { - nf_ct_put(ct); - goto begin; - } + if (likely(atomic_inc_not_zero(&ct->ct_general.use))) { + if (likely(nf_ct_key_equal(h, tuple, zone, net))) + goto found; + + /* TYPESAFE_BY_RCU recycled the candidate */ + nf_ct_put(ct); } + + h = NULL; } +found: rcu_read_unlock(); return h; -- cgit v1.2.3 From 9911c1139fd072594ac259c2ce055b004ca92f49 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 7 Jun 2019 16:37:30 +0200 Subject: netfilter: xt_owner: bail out with EINVAL in case of unsupported flags Reject flags that are not supported with EINVAL. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_owner.h | 5 +++++ net/netfilter/xt_owner.c | 3 +++ 2 files changed, 8 insertions(+) (limited to 'net/netfilter') diff --git a/include/uapi/linux/netfilter/xt_owner.h b/include/uapi/linux/netfilter/xt_owner.h index 9e98c09eda32..5108df4d0313 100644 --- a/include/uapi/linux/netfilter/xt_owner.h +++ b/include/uapi/linux/netfilter/xt_owner.h @@ -11,6 +11,11 @@ enum { XT_OWNER_SUPPL_GROUPS = 1 << 3, }; +#define XT_OWNER_MASK (XT_OWNER_UID | \ + XT_OWNER_GID | \ + XT_OWNER_SOCKET | \ + XT_OWNER_SUPPL_GROUPS) + struct xt_owner_match_info { __u32 uid_min, uid_max; __u32 gid_min, gid_max; diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index a8784502aca6..ee597fdc5db7 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -25,6 +25,9 @@ static int owner_check(const struct xt_mtchk_param *par) struct xt_owner_match_info *info = par->matchinfo; struct net *net = par->net; + if (info->match & ~XT_OWNER_MASK) + return -EINVAL; + /* Only allow the common case where the userns of the writer * matches the userns of the network namespace. */ -- cgit v1.2.3 From d7f9b2f18eaef74b4f948c7e24e3a8f796f0c90d Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Fri, 7 Jun 2019 02:36:07 +0200 Subject: netfilter: synproxy: extract SYNPROXY infrastructure from {ipt, ip6t}_SYNPROXY Add common functions into nf_synproxy_core.c to prepare for nftables support. The prototypes of the functions used by {ipt, ip6t}_SYNPROXY are in the new file nf_synproxy.h Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_synproxy.h | 13 +- include/net/netfilter/nf_synproxy.h | 44 ++ net/ipv4/netfilter/ipt_SYNPROXY.c | 394 +---------- net/ipv6/netfilter/ip6t_SYNPROXY.c | 420 +----------- net/netfilter/nf_synproxy_core.c | 896 ++++++++++++++++++++++++-- 5 files changed, 920 insertions(+), 847 deletions(-) create mode 100644 include/net/netfilter/nf_synproxy.h (limited to 'net/netfilter') diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h index 2c7559a54092..c5659dcf5b1a 100644 --- a/include/net/netfilter/nf_conntrack_synproxy.h +++ b/include/net/netfilter/nf_conntrack_synproxy.h @@ -72,21 +72,12 @@ struct synproxy_options { }; struct tcphdr; -struct xt_synproxy_info; +struct nf_synproxy_info; bool synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, const struct tcphdr *th, struct synproxy_options *opts); -unsigned int synproxy_options_size(const struct synproxy_options *opts); -void synproxy_build_options(struct tcphdr *th, - const struct synproxy_options *opts); -void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info, +void synproxy_init_timestamp_cookie(const struct nf_synproxy_info *info, struct synproxy_options *opts); -void synproxy_check_timestamp_cookie(struct synproxy_options *opts); - -unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff, - struct tcphdr *th, struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - const struct nf_conn_synproxy *synproxy); #endif /* _NF_CONNTRACK_SYNPROXY_H */ diff --git a/include/net/netfilter/nf_synproxy.h b/include/net/netfilter/nf_synproxy.h new file mode 100644 index 000000000000..3e8b3f03b687 --- /dev/null +++ b/include/net/netfilter/nf_synproxy.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NF_SYNPROXY_SHARED_H +#define _NF_SYNPROXY_SHARED_H + +#include +#include +#include +#include +#include + +#include +#include + +void synproxy_send_client_synack(struct net *net, const struct sk_buff *skb, + const struct tcphdr *th, + const struct synproxy_options *opts); + +bool synproxy_recv_client_ack(struct net *net, + const struct sk_buff *skb, + const struct tcphdr *th, + struct synproxy_options *opts, u32 recv_seq); + +unsigned int ipv4_synproxy_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *nhs); +int nf_synproxy_ipv4_init(struct synproxy_net *snet, struct net *net); +void nf_synproxy_ipv4_fini(struct synproxy_net *snet, struct net *net); + +#if IS_ENABLED(CONFIG_IPV6) +void synproxy_send_client_synack_ipv6(struct net *net, + const struct sk_buff *skb, + const struct tcphdr *th, + const struct synproxy_options *opts); + +bool synproxy_recv_client_ack_ipv6(struct net *net, const struct sk_buff *skb, + const struct tcphdr *th, + struct synproxy_options *opts, u32 recv_seq); + +unsigned int ipv6_synproxy_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *nhs); +int nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net); +void nf_synproxy_ipv6_fini(struct synproxy_net *snet, struct net *net); +#endif /* CONFIG_IPV6 */ + +#endif /* _NF_SYNPROXY_SHARED_H */ diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 690b17ef6a44..7f7979734fb4 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -6,258 +6,11 @@ * published by the Free Software Foundation. */ -#include -#include -#include - #include #include #include -#include -#include -#include -#include - -static struct iphdr * -synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr, - __be32 daddr) -{ - struct iphdr *iph; - - skb_reset_network_header(skb); - iph = skb_put(skb, sizeof(*iph)); - iph->version = 4; - iph->ihl = sizeof(*iph) / 4; - iph->tos = 0; - iph->id = 0; - iph->frag_off = htons(IP_DF); - iph->ttl = net->ipv4.sysctl_ip_default_ttl; - iph->protocol = IPPROTO_TCP; - iph->check = 0; - iph->saddr = saddr; - iph->daddr = daddr; - - return iph; -} - -static void -synproxy_send_tcp(struct net *net, - const struct sk_buff *skb, struct sk_buff *nskb, - struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, - struct iphdr *niph, struct tcphdr *nth, - unsigned int tcp_hdr_size) -{ - nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); - nskb->ip_summed = CHECKSUM_PARTIAL; - nskb->csum_start = (unsigned char *)nth - nskb->head; - nskb->csum_offset = offsetof(struct tcphdr, check); - - skb_dst_set_noref(nskb, skb_dst(skb)); - nskb->protocol = htons(ETH_P_IP); - if (ip_route_me_harder(net, nskb, RTN_UNSPEC)) - goto free_nskb; - - if (nfct) { - nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); - nf_conntrack_get(nfct); - } - - ip_local_out(net, nskb->sk, nskb); - return; - -free_nskb: - kfree_skb(nskb); -} - -static void -synproxy_send_client_synack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct iphdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - u16 mss = opts->mss; - - iph = ip_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->dest; - nth->dest = th->source; - nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss)); - nth->ack_seq = htonl(ntohl(th->seq) + 1); - tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; - if (opts->options & XT_SYNPROXY_OPT_ECN) - tcp_flag_word(nth) |= TCP_FLAG_ECE; - nth->doff = tcp_hdr_size / 4; - nth->window = 0; - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), - IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_server_syn(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts, u32 recv_seq) -{ - struct synproxy_net *snet = synproxy_pernet(net); - struct sk_buff *nskb; - struct iphdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ip_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->source; - nth->dest = th->dest; - nth->seq = htonl(recv_seq - 1); - /* ack_seq is used to relay our ISN to the synproxy hook to initialize - * sequence number translation once a connection tracking entry exists. - */ - nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); - tcp_flag_word(nth) = TCP_FLAG_SYN; - if (opts->options & XT_SYNPROXY_OPT_ECN) - tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; - nth->doff = tcp_hdr_size / 4; - nth->window = th->window; - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, - niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_server_ack(struct net *net, - const struct ip_ct_tcp *state, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct iphdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ip_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->dest; - nth->dest = th->source; - nth->seq = htonl(ntohl(th->ack_seq)); - nth->ack_seq = htonl(ntohl(th->seq) + 1); - tcp_flag_word(nth) = TCP_FLAG_ACK; - nth->doff = tcp_hdr_size / 4; - nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_client_ack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct iphdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ip_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->source; - nth->dest = th->dest; - nth->seq = htonl(ntohl(th->seq) + 1); - nth->ack_seq = th->ack_seq; - tcp_flag_word(nth) = TCP_FLAG_ACK; - nth->doff = tcp_hdr_size / 4; - nth->window = htons(ntohs(th->window) >> opts->wscale); - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), - IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); -} - -static bool -synproxy_recv_client_ack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - struct synproxy_options *opts, u32 recv_seq) -{ - struct synproxy_net *snet = synproxy_pernet(net); - int mss; - - mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1); - if (mss == 0) { - this_cpu_inc(snet->stats->cookie_invalid); - return false; - } - - this_cpu_inc(snet->stats->cookie_valid); - opts->mss = mss; - opts->options |= XT_SYNPROXY_OPT_MSS; - - if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) - synproxy_check_timestamp_cookie(opts); - - synproxy_send_server_syn(net, skb, th, opts, recv_seq); - return true; -} +#include static unsigned int synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) @@ -309,135 +62,6 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static unsigned int ipv4_synproxy_hook(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *nhs) -{ - struct net *net = nhs->net; - struct synproxy_net *snet = synproxy_pernet(net); - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - struct nf_conn_synproxy *synproxy; - struct synproxy_options opts = {}; - const struct ip_ct_tcp *state; - struct tcphdr *th, _th; - unsigned int thoff; - - ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) - return NF_ACCEPT; - - synproxy = nfct_synproxy(ct); - if (synproxy == NULL) - return NF_ACCEPT; - - if (nf_is_loopback_packet(skb) || - ip_hdr(skb)->protocol != IPPROTO_TCP) - return NF_ACCEPT; - - thoff = ip_hdrlen(skb); - th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); - if (th == NULL) - return NF_DROP; - - state = &ct->proto.tcp; - switch (state->state) { - case TCP_CONNTRACK_CLOSE: - if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - - ntohl(th->seq) + 1); - break; - } - - if (!th->syn || th->ack || - CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) - break; - - /* Reopened connection - reset the sequence number and timestamp - * adjustments, they will get initialized once the connection is - * reestablished. - */ - nf_ct_seqadj_init(ct, ctinfo, 0); - synproxy->tsoff = 0; - this_cpu_inc(snet->stats->conn_reopened); - - /* fall through */ - case TCP_CONNTRACK_SYN_SENT: - if (!synproxy_parse_options(skb, thoff, th, &opts)) - return NF_DROP; - - if (!th->syn && th->ack && - CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { - /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, - * therefore we need to add 1 to make the SYN sequence - * number match the one of first SYN. - */ - if (synproxy_recv_client_ack(net, skb, th, &opts, - ntohl(th->seq) + 1)) { - this_cpu_inc(snet->stats->cookie_retrans); - consume_skb(skb); - return NF_STOLEN; - } else { - return NF_DROP; - } - } - - synproxy->isn = ntohl(th->ack_seq); - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) - synproxy->its = opts.tsecr; - - nf_conntrack_event_cache(IPCT_SYNPROXY, ct); - break; - case TCP_CONNTRACK_SYN_RECV: - if (!th->syn || !th->ack) - break; - - if (!synproxy_parse_options(skb, thoff, th, &opts)) - return NF_DROP; - - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) { - synproxy->tsoff = opts.tsval - synproxy->its; - nf_conntrack_event_cache(IPCT_SYNPROXY, ct); - } - - opts.options &= ~(XT_SYNPROXY_OPT_MSS | - XT_SYNPROXY_OPT_WSCALE | - XT_SYNPROXY_OPT_SACK_PERM); - - swap(opts.tsval, opts.tsecr); - synproxy_send_server_ack(net, state, skb, th, &opts); - - nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); - nf_conntrack_event_cache(IPCT_SEQADJ, ct); - - swap(opts.tsval, opts.tsecr); - synproxy_send_client_ack(net, skb, th, &opts); - - consume_skb(skb); - return NF_STOLEN; - default: - break; - } - - synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); - return NF_ACCEPT; -} - -static const struct nf_hook_ops ipv4_synproxy_ops[] = { - { - .hook = ipv4_synproxy_hook, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, - { - .hook = ipv4_synproxy_hook, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, -}; - static int synproxy_tg4_check(const struct xt_tgchk_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); @@ -452,13 +76,10 @@ static int synproxy_tg4_check(const struct xt_tgchk_param *par) if (err) return err; - if (snet->hook_ref4 == 0) { - err = nf_register_net_hooks(par->net, ipv4_synproxy_ops, - ARRAY_SIZE(ipv4_synproxy_ops)); - if (err) { - nf_ct_netns_put(par->net, par->family); - return err; - } + err = nf_synproxy_ipv4_init(snet, par->net); + if (err) { + nf_ct_netns_put(par->net, par->family); + return err; } snet->hook_ref4++; @@ -469,10 +90,7 @@ static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); - snet->hook_ref4--; - if (snet->hook_ref4 == 0) - nf_unregister_net_hooks(par->net, ipv4_synproxy_ops, - ARRAY_SIZE(ipv4_synproxy_ops)); + nf_synproxy_ipv4_fini(snet, par->net); nf_ct_netns_put(par->net, par->family); } diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index cb6d42b03cb5..55a9b92d0a1f 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -6,272 +6,11 @@ * published by the Free Software Foundation. */ -#include -#include -#include -#include -#include - #include #include #include -#include -#include -#include -#include - -static struct ipv6hdr * -synproxy_build_ip(struct net *net, struct sk_buff *skb, - const struct in6_addr *saddr, - const struct in6_addr *daddr) -{ - struct ipv6hdr *iph; - - skb_reset_network_header(skb); - iph = skb_put(skb, sizeof(*iph)); - ip6_flow_hdr(iph, 0, 0); - iph->hop_limit = net->ipv6.devconf_all->hop_limit; - iph->nexthdr = IPPROTO_TCP; - iph->saddr = *saddr; - iph->daddr = *daddr; - - return iph; -} - -static void -synproxy_send_tcp(struct net *net, - const struct sk_buff *skb, struct sk_buff *nskb, - struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, - struct ipv6hdr *niph, struct tcphdr *nth, - unsigned int tcp_hdr_size) -{ - struct dst_entry *dst; - struct flowi6 fl6; - - nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0); - nskb->ip_summed = CHECKSUM_PARTIAL; - nskb->csum_start = (unsigned char *)nth - nskb->head; - nskb->csum_offset = offsetof(struct tcphdr, check); - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_TCP; - fl6.saddr = niph->saddr; - fl6.daddr = niph->daddr; - fl6.fl6_sport = nth->source; - fl6.fl6_dport = nth->dest; - security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi(&fl6)); - dst = ip6_route_output(net, NULL, &fl6); - if (dst->error) { - dst_release(dst); - goto free_nskb; - } - dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); - if (IS_ERR(dst)) - goto free_nskb; - - skb_dst_set(nskb, dst); - - if (nfct) { - nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); - nf_conntrack_get(nfct); - } - - ip6_local_out(net, nskb->sk, nskb); - return; - -free_nskb: - kfree_skb(nskb); -} - -static void -synproxy_send_client_synack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - u16 mss = opts->mss; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->dest; - nth->dest = th->source; - nth->seq = htonl(__cookie_v6_init_sequence(iph, th, &mss)); - nth->ack_seq = htonl(ntohl(th->seq) + 1); - tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; - if (opts->options & XT_SYNPROXY_OPT_ECN) - tcp_flag_word(nth) |= TCP_FLAG_ECE; - nth->doff = tcp_hdr_size / 4; - nth->window = 0; - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), - IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); -} -static void -synproxy_send_server_syn(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts, u32 recv_seq) -{ - struct synproxy_net *snet = synproxy_pernet(net); - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->source; - nth->dest = th->dest; - nth->seq = htonl(recv_seq - 1); - /* ack_seq is used to relay our ISN to the synproxy hook to initialize - * sequence number translation once a connection tracking entry exists. - */ - nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); - tcp_flag_word(nth) = TCP_FLAG_SYN; - if (opts->options & XT_SYNPROXY_OPT_ECN) - tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; - nth->doff = tcp_hdr_size / 4; - nth->window = th->window; - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, - niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_server_ack(struct net *net, - const struct ip_ct_tcp *state, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->dest; - nth->dest = th->source; - nth->seq = htonl(ntohl(th->ack_seq)); - nth->ack_seq = htonl(ntohl(th->seq) + 1); - tcp_flag_word(nth) = TCP_FLAG_ACK; - nth->doff = tcp_hdr_size / 4; - nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_client_ack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->source; - nth->dest = th->dest; - nth->seq = htonl(ntohl(th->seq) + 1); - nth->ack_seq = th->ack_seq; - tcp_flag_word(nth) = TCP_FLAG_ACK; - nth->doff = tcp_hdr_size / 4; - nth->window = htons(ntohs(th->window) >> opts->wscale); - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), - IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); -} - -static bool -synproxy_recv_client_ack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - struct synproxy_options *opts, u32 recv_seq) -{ - struct synproxy_net *snet = synproxy_pernet(net); - int mss; - - mss = __cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1); - if (mss == 0) { - this_cpu_inc(snet->stats->cookie_invalid); - return false; - } - - this_cpu_inc(snet->stats->cookie_valid); - opts->mss = mss; - opts->options |= XT_SYNPROXY_OPT_MSS; - - if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) - synproxy_check_timestamp_cookie(opts); - - synproxy_send_server_syn(net, skb, th, opts, recv_seq); - return true; -} +#include static unsigned int synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) @@ -307,13 +46,14 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) XT_SYNPROXY_OPT_SACK_PERM | XT_SYNPROXY_OPT_ECN); - synproxy_send_client_synack(net, skb, th, &opts); + synproxy_send_client_synack_ipv6(net, skb, th, &opts); consume_skb(skb); return NF_STOLEN; } else if (th->ack && !(th->fin || th->rst || th->syn)) { /* ACK from client */ - if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq))) { + if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts, + ntohl(th->seq))) { consume_skb(skb); return NF_STOLEN; } else { @@ -324,141 +64,6 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static unsigned int ipv6_synproxy_hook(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *nhs) -{ - struct net *net = nhs->net; - struct synproxy_net *snet = synproxy_pernet(net); - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - struct nf_conn_synproxy *synproxy; - struct synproxy_options opts = {}; - const struct ip_ct_tcp *state; - struct tcphdr *th, _th; - __be16 frag_off; - u8 nexthdr; - int thoff; - - ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) - return NF_ACCEPT; - - synproxy = nfct_synproxy(ct); - if (synproxy == NULL) - return NF_ACCEPT; - - if (nf_is_loopback_packet(skb)) - return NF_ACCEPT; - - nexthdr = ipv6_hdr(skb)->nexthdr; - thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, - &frag_off); - if (thoff < 0 || nexthdr != IPPROTO_TCP) - return NF_ACCEPT; - - th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); - if (th == NULL) - return NF_DROP; - - state = &ct->proto.tcp; - switch (state->state) { - case TCP_CONNTRACK_CLOSE: - if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - - ntohl(th->seq) + 1); - break; - } - - if (!th->syn || th->ack || - CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) - break; - - /* Reopened connection - reset the sequence number and timestamp - * adjustments, they will get initialized once the connection is - * reestablished. - */ - nf_ct_seqadj_init(ct, ctinfo, 0); - synproxy->tsoff = 0; - this_cpu_inc(snet->stats->conn_reopened); - - /* fall through */ - case TCP_CONNTRACK_SYN_SENT: - if (!synproxy_parse_options(skb, thoff, th, &opts)) - return NF_DROP; - - if (!th->syn && th->ack && - CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { - /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, - * therefore we need to add 1 to make the SYN sequence - * number match the one of first SYN. - */ - if (synproxy_recv_client_ack(net, skb, th, &opts, - ntohl(th->seq) + 1)) { - this_cpu_inc(snet->stats->cookie_retrans); - consume_skb(skb); - return NF_STOLEN; - } else { - return NF_DROP; - } - } - - synproxy->isn = ntohl(th->ack_seq); - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) - synproxy->its = opts.tsecr; - - nf_conntrack_event_cache(IPCT_SYNPROXY, ct); - break; - case TCP_CONNTRACK_SYN_RECV: - if (!th->syn || !th->ack) - break; - - if (!synproxy_parse_options(skb, thoff, th, &opts)) - return NF_DROP; - - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) { - synproxy->tsoff = opts.tsval - synproxy->its; - nf_conntrack_event_cache(IPCT_SYNPROXY, ct); - } - - opts.options &= ~(XT_SYNPROXY_OPT_MSS | - XT_SYNPROXY_OPT_WSCALE | - XT_SYNPROXY_OPT_SACK_PERM); - - swap(opts.tsval, opts.tsecr); - synproxy_send_server_ack(net, state, skb, th, &opts); - - nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); - nf_conntrack_event_cache(IPCT_SEQADJ, ct); - - swap(opts.tsval, opts.tsecr); - synproxy_send_client_ack(net, skb, th, &opts); - - consume_skb(skb); - return NF_STOLEN; - default: - break; - } - - synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); - return NF_ACCEPT; -} - -static const struct nf_hook_ops ipv6_synproxy_ops[] = { - { - .hook = ipv6_synproxy_hook, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, - { - .hook = ipv6_synproxy_hook, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, -}; - static int synproxy_tg6_check(const struct xt_tgchk_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); @@ -474,16 +79,12 @@ static int synproxy_tg6_check(const struct xt_tgchk_param *par) if (err) return err; - if (snet->hook_ref6 == 0) { - err = nf_register_net_hooks(par->net, ipv6_synproxy_ops, - ARRAY_SIZE(ipv6_synproxy_ops)); - if (err) { - nf_ct_netns_put(par->net, par->family); - return err; - } + err = nf_synproxy_ipv6_init(snet, par->net); + if (err) { + nf_ct_netns_put(par->net, par->family); + return err; } - snet->hook_ref6++; return err; } @@ -491,10 +92,7 @@ static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); - snet->hook_ref6--; - if (snet->hook_ref6 == 0) - nf_unregister_net_hooks(par->net, ipv6_synproxy_ops, - ARRAY_SIZE(ipv6_synproxy_ops)); + nf_synproxy_ipv6_fini(snet, par->net); nf_ct_netns_put(par->net, par->family); } diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 3d58a9e93e5a..50677285f82e 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -13,16 +13,16 @@ #include #include -#include -#include -#include -#include +#include +#include #include +#include #include #include #include #include +#include unsigned int synproxy_net_id; EXPORT_SYMBOL_GPL(synproxy_net_id); @@ -60,7 +60,7 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, case TCPOPT_MSS: if (opsize == TCPOLEN_MSS) { opts->mss = get_unaligned_be16(ptr); - opts->options |= XT_SYNPROXY_OPT_MSS; + opts->options |= NF_SYNPROXY_OPT_MSS; } break; case TCPOPT_WINDOW: @@ -68,19 +68,19 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, opts->wscale = *ptr; if (opts->wscale > TCP_MAX_WSCALE) opts->wscale = TCP_MAX_WSCALE; - opts->options |= XT_SYNPROXY_OPT_WSCALE; + opts->options |= NF_SYNPROXY_OPT_WSCALE; } break; case TCPOPT_TIMESTAMP: if (opsize == TCPOLEN_TIMESTAMP) { opts->tsval = get_unaligned_be32(ptr); opts->tsecr = get_unaligned_be32(ptr + 4); - opts->options |= XT_SYNPROXY_OPT_TIMESTAMP; + opts->options |= NF_SYNPROXY_OPT_TIMESTAMP; } break; case TCPOPT_SACK_PERM: if (opsize == TCPOLEN_SACK_PERM) - opts->options |= XT_SYNPROXY_OPT_SACK_PERM; + opts->options |= NF_SYNPROXY_OPT_SACK_PERM; break; } @@ -92,36 +92,36 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, } EXPORT_SYMBOL_GPL(synproxy_parse_options); -unsigned int synproxy_options_size(const struct synproxy_options *opts) +static unsigned int +synproxy_options_size(const struct synproxy_options *opts) { unsigned int size = 0; - if (opts->options & XT_SYNPROXY_OPT_MSS) + if (opts->options & NF_SYNPROXY_OPT_MSS) size += TCPOLEN_MSS_ALIGNED; - if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) + if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) size += TCPOLEN_TSTAMP_ALIGNED; - else if (opts->options & XT_SYNPROXY_OPT_SACK_PERM) + else if (opts->options & NF_SYNPROXY_OPT_SACK_PERM) size += TCPOLEN_SACKPERM_ALIGNED; - if (opts->options & XT_SYNPROXY_OPT_WSCALE) + if (opts->options & NF_SYNPROXY_OPT_WSCALE) size += TCPOLEN_WSCALE_ALIGNED; return size; } -EXPORT_SYMBOL_GPL(synproxy_options_size); -void +static void synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts) { __be32 *ptr = (__be32 *)(th + 1); u8 options = opts->options; - if (options & XT_SYNPROXY_OPT_MSS) + if (options & NF_SYNPROXY_OPT_MSS) *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | opts->mss); - if (options & XT_SYNPROXY_OPT_TIMESTAMP) { - if (options & XT_SYNPROXY_OPT_SACK_PERM) + if (options & NF_SYNPROXY_OPT_TIMESTAMP) { + if (options & NF_SYNPROXY_OPT_SACK_PERM) *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | (TCPOPT_TIMESTAMP << 8) | @@ -134,58 +134,56 @@ synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts) *ptr++ = htonl(opts->tsval); *ptr++ = htonl(opts->tsecr); - } else if (options & XT_SYNPROXY_OPT_SACK_PERM) + } else if (options & NF_SYNPROXY_OPT_SACK_PERM) *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); - if (options & XT_SYNPROXY_OPT_WSCALE) + if (options & NF_SYNPROXY_OPT_WSCALE) *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | opts->wscale); } -EXPORT_SYMBOL_GPL(synproxy_build_options); -void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info, +void synproxy_init_timestamp_cookie(const struct nf_synproxy_info *info, struct synproxy_options *opts) { opts->tsecr = opts->tsval; opts->tsval = tcp_time_stamp_raw() & ~0x3f; - if (opts->options & XT_SYNPROXY_OPT_WSCALE) { + if (opts->options & NF_SYNPROXY_OPT_WSCALE) { opts->tsval |= opts->wscale; opts->wscale = info->wscale; } else opts->tsval |= 0xf; - if (opts->options & XT_SYNPROXY_OPT_SACK_PERM) + if (opts->options & NF_SYNPROXY_OPT_SACK_PERM) opts->tsval |= 1 << 4; - if (opts->options & XT_SYNPROXY_OPT_ECN) + if (opts->options & NF_SYNPROXY_OPT_ECN) opts->tsval |= 1 << 5; } EXPORT_SYMBOL_GPL(synproxy_init_timestamp_cookie); -void synproxy_check_timestamp_cookie(struct synproxy_options *opts) +static void +synproxy_check_timestamp_cookie(struct synproxy_options *opts) { opts->wscale = opts->tsecr & 0xf; if (opts->wscale != 0xf) - opts->options |= XT_SYNPROXY_OPT_WSCALE; + opts->options |= NF_SYNPROXY_OPT_WSCALE; - opts->options |= opts->tsecr & (1 << 4) ? XT_SYNPROXY_OPT_SACK_PERM : 0; + opts->options |= opts->tsecr & (1 << 4) ? NF_SYNPROXY_OPT_SACK_PERM : 0; - opts->options |= opts->tsecr & (1 << 5) ? XT_SYNPROXY_OPT_ECN : 0; + opts->options |= opts->tsecr & (1 << 5) ? NF_SYNPROXY_OPT_ECN : 0; } -EXPORT_SYMBOL_GPL(synproxy_check_timestamp_cookie); -unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, - unsigned int protoff, - struct tcphdr *th, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - const struct nf_conn_synproxy *synproxy) +static unsigned int +synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff, + struct tcphdr *th, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct nf_conn_synproxy *synproxy) { unsigned int optoff, optend; __be32 *ptr, old; @@ -235,7 +233,6 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, } return 1; } -EXPORT_SYMBOL_GPL(synproxy_tstamp_adjust); static struct nf_ct_ext_type nf_ct_synproxy_extend __read_mostly = { .len = sizeof(struct nf_conn_synproxy), @@ -416,5 +413,830 @@ static void __exit synproxy_core_exit(void) module_init(synproxy_core_init); module_exit(synproxy_core_exit); +static struct iphdr * +synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr, + __be32 daddr) +{ + struct iphdr *iph; + + skb_reset_network_header(skb); + iph = skb_put(skb, sizeof(*iph)); + iph->version = 4; + iph->ihl = sizeof(*iph) / 4; + iph->tos = 0; + iph->id = 0; + iph->frag_off = htons(IP_DF); + iph->ttl = net->ipv4.sysctl_ip_default_ttl; + iph->protocol = IPPROTO_TCP; + iph->check = 0; + iph->saddr = saddr; + iph->daddr = daddr; + + return iph; +} + +static void +synproxy_send_tcp(struct net *net, + const struct sk_buff *skb, struct sk_buff *nskb, + struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, + struct iphdr *niph, struct tcphdr *nth, + unsigned int tcp_hdr_size) +{ + nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (unsigned char *)nth - nskb->head; + nskb->csum_offset = offsetof(struct tcphdr, check); + + skb_dst_set_noref(nskb, skb_dst(skb)); + nskb->protocol = htons(ETH_P_IP); + if (ip_route_me_harder(net, nskb, RTN_UNSPEC)) + goto free_nskb; + + if (nfct) { + nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); + nf_conntrack_get(nfct); + } + + ip_local_out(net, nskb->sk, nskb); + return; + +free_nskb: + kfree_skb(nskb); +} + +void +synproxy_send_client_synack(struct net *net, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + u16 mss = opts->mss; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; + if (opts->options & NF_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE; + nth->doff = tcp_hdr_size / 4; + nth->window = 0; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), + IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); +} +EXPORT_SYMBOL_GPL(synproxy_send_client_synack); + +static void +synproxy_send_server_syn(struct net *net, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts, u32 recv_seq) +{ + struct synproxy_net *snet = synproxy_pernet(net); + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(recv_seq - 1); + /* ack_seq is used to relay our ISN to the synproxy hook to initialize + * sequence number translation once a connection tracking entry exists. + */ + nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); + tcp_flag_word(nth) = TCP_FLAG_SYN; + if (opts->options & NF_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; + nth->doff = tcp_hdr_size / 4; + nth->window = th->window; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, + niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_ack(struct net *net, + const struct ip_ct_tcp *state, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(ntohl(th->ack_seq)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_client_ack(struct net *net, + const struct sk_buff *skb, const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct iphdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ip_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(ntohl(th->seq) + 1); + nth->ack_seq = th->ack_seq; + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(ntohs(th->window) >> opts->wscale); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), + IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); +} + +bool +synproxy_recv_client_ack(struct net *net, + const struct sk_buff *skb, const struct tcphdr *th, + struct synproxy_options *opts, u32 recv_seq) +{ + struct synproxy_net *snet = synproxy_pernet(net); + int mss; + + mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1); + if (mss == 0) { + this_cpu_inc(snet->stats->cookie_invalid); + return false; + } + + this_cpu_inc(snet->stats->cookie_valid); + opts->mss = mss; + opts->options |= NF_SYNPROXY_OPT_MSS; + + if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) + synproxy_check_timestamp_cookie(opts); + + synproxy_send_server_syn(net, skb, th, opts, recv_seq); + return true; +} +EXPORT_SYMBOL_GPL(synproxy_recv_client_ack); + +unsigned int +ipv4_synproxy_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *nhs) +{ + struct net *net = nhs->net; + struct synproxy_net *snet = synproxy_pernet(net); + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + struct nf_conn_synproxy *synproxy; + struct synproxy_options opts = {}; + const struct ip_ct_tcp *state; + struct tcphdr *th, _th; + unsigned int thoff; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return NF_ACCEPT; + + synproxy = nfct_synproxy(ct); + if (!synproxy) + return NF_ACCEPT; + + if (nf_is_loopback_packet(skb) || + ip_hdr(skb)->protocol != IPPROTO_TCP) + return NF_ACCEPT; + + thoff = ip_hdrlen(skb); + th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); + if (!th) + return NF_DROP; + + state = &ct->proto.tcp; + switch (state->state) { + case TCP_CONNTRACK_CLOSE: + if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - + ntohl(th->seq) + 1); + break; + } + + if (!th->syn || th->ack || + CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + break; + + /* Reopened connection - reset the sequence number and timestamp + * adjustments, they will get initialized once the connection is + * reestablished. + */ + nf_ct_seqadj_init(ct, ctinfo, 0); + synproxy->tsoff = 0; + this_cpu_inc(snet->stats->conn_reopened); + + /* fall through */ + case TCP_CONNTRACK_SYN_SENT: + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + + if (!th->syn && th->ack && + CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, + * therefore we need to add 1 to make the SYN sequence + * number match the one of first SYN. + */ + if (synproxy_recv_client_ack(net, skb, th, &opts, + ntohl(th->seq) + 1)) { + this_cpu_inc(snet->stats->cookie_retrans); + consume_skb(skb); + return NF_STOLEN; + } else { + return NF_DROP; + } + } + + synproxy->isn = ntohl(th->ack_seq); + if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) + synproxy->its = opts.tsecr; + + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); + break; + case TCP_CONNTRACK_SYN_RECV: + if (!th->syn || !th->ack) + break; + + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + + if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) { + synproxy->tsoff = opts.tsval - synproxy->its; + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); + } + + opts.options &= ~(NF_SYNPROXY_OPT_MSS | + NF_SYNPROXY_OPT_WSCALE | + NF_SYNPROXY_OPT_SACK_PERM); + + swap(opts.tsval, opts.tsecr); + synproxy_send_server_ack(net, state, skb, th, &opts); + + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + nf_conntrack_event_cache(IPCT_SEQADJ, ct); + + swap(opts.tsval, opts.tsecr); + synproxy_send_client_ack(net, skb, th, &opts); + + consume_skb(skb); + return NF_STOLEN; + default: + break; + } + + synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); + return NF_ACCEPT; +} +EXPORT_SYMBOL_GPL(ipv4_synproxy_hook); + +static const struct nf_hook_ops ipv4_synproxy_ops[] = { + { + .hook = ipv4_synproxy_hook, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, + { + .hook = ipv4_synproxy_hook, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, +}; + +int nf_synproxy_ipv4_init(struct synproxy_net *snet, struct net *net) +{ + int err; + + if (snet->hook_ref4 == 0) { + err = nf_register_net_hooks(net, ipv4_synproxy_ops, + ARRAY_SIZE(ipv4_synproxy_ops)); + if (err) + return err; + } + + snet->hook_ref4++; + return err; +} +EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_init); + +void nf_synproxy_ipv4_fini(struct synproxy_net *snet, struct net *net) +{ + snet->hook_ref4--; + if (snet->hook_ref4 == 0) + nf_unregister_net_hooks(net, ipv4_synproxy_ops, + ARRAY_SIZE(ipv4_synproxy_ops)); +} +EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_fini); + +#if IS_ENABLED(CONFIG_IPV6) +static struct ipv6hdr * +synproxy_build_ip_ipv6(struct net *net, struct sk_buff *skb, + const struct in6_addr *saddr, + const struct in6_addr *daddr) +{ + struct ipv6hdr *iph; + + skb_reset_network_header(skb); + iph = skb_put(skb, sizeof(*iph)); + ip6_flow_hdr(iph, 0, 0); + iph->hop_limit = net->ipv6.devconf_all->hop_limit; + iph->nexthdr = IPPROTO_TCP; + iph->saddr = *saddr; + iph->daddr = *daddr; + + return iph; +} + +static void +synproxy_send_tcp_ipv6(struct net *net, + const struct sk_buff *skb, struct sk_buff *nskb, + struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, + struct ipv6hdr *niph, struct tcphdr *nth, + unsigned int tcp_hdr_size) +{ + struct dst_entry *dst; + struct flowi6 fl6; + int err; + + nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0); + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (unsigned char *)nth - nskb->head; + nskb->csum_offset = offsetof(struct tcphdr, check); + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.saddr = niph->saddr; + fl6.daddr = niph->daddr; + fl6.fl6_sport = nth->source; + fl6.fl6_dport = nth->dest; + security_skb_classify_flow((struct sk_buff *)skb, + flowi6_to_flowi(&fl6)); + err = nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false); + if (err) { + goto free_nskb; + } + + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) + goto free_nskb; + + skb_dst_set(nskb, dst); + + if (nfct) { + nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); + nf_conntrack_get(nfct); + } + + ip6_local_out(net, nskb->sk, nskb); + return; + +free_nskb: + kfree_skb(nskb); +} + +void +synproxy_send_client_synack_ipv6(struct net *net, + const struct sk_buff *skb, + const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + u16 mss = opts->mss; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip_ipv6(net, nskb, &iph->daddr, &iph->saddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(nf_ipv6_cookie_init_sequence(iph, th, &mss)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; + if (opts->options & NF_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE; + nth->doff = tcp_hdr_size / 4; + nth->window = 0; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp_ipv6(net, skb, nskb, skb_nfct(skb), + IP_CT_ESTABLISHED_REPLY, niph, nth, + tcp_hdr_size); +} +EXPORT_SYMBOL_GPL(synproxy_send_client_synack_ipv6); + +static void +synproxy_send_server_syn_ipv6(struct net *net, const struct sk_buff *skb, + const struct tcphdr *th, + const struct synproxy_options *opts, u32 recv_seq) +{ + struct synproxy_net *snet = synproxy_pernet(net); + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip_ipv6(net, nskb, &iph->saddr, &iph->daddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(recv_seq - 1); + /* ack_seq is used to relay our ISN to the synproxy hook to initialize + * sequence number translation once a connection tracking entry exists. + */ + nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); + tcp_flag_word(nth) = TCP_FLAG_SYN; + if (opts->options & NF_SYNPROXY_OPT_ECN) + tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; + nth->doff = tcp_hdr_size / 4; + nth->window = th->window; + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp_ipv6(net, skb, nskb, &snet->tmpl->ct_general, + IP_CT_NEW, niph, nth, tcp_hdr_size); +} + +static void +synproxy_send_server_ack_ipv6(struct net *net, const struct ip_ct_tcp *state, + const struct sk_buff *skb, + const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip_ipv6(net, nskb, &iph->daddr, &iph->saddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->dest; + nth->dest = th->source; + nth->seq = htonl(ntohl(th->ack_seq)); + nth->ack_seq = htonl(ntohl(th->seq) + 1); + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp_ipv6(net, skb, nskb, NULL, 0, niph, nth, + tcp_hdr_size); +} + +static void +synproxy_send_client_ack_ipv6(struct net *net, const struct sk_buff *skb, + const struct tcphdr *th, + const struct synproxy_options *opts) +{ + struct sk_buff *nskb; + struct ipv6hdr *iph, *niph; + struct tcphdr *nth; + unsigned int tcp_hdr_size; + + iph = ipv6_hdr(skb); + + tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); + nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, + GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, MAX_TCP_HEADER); + + niph = synproxy_build_ip_ipv6(net, nskb, &iph->saddr, &iph->daddr); + + skb_reset_transport_header(nskb); + nth = skb_put(nskb, tcp_hdr_size); + nth->source = th->source; + nth->dest = th->dest; + nth->seq = htonl(ntohl(th->seq) + 1); + nth->ack_seq = th->ack_seq; + tcp_flag_word(nth) = TCP_FLAG_ACK; + nth->doff = tcp_hdr_size / 4; + nth->window = htons(ntohs(th->window) >> opts->wscale); + nth->check = 0; + nth->urg_ptr = 0; + + synproxy_build_options(nth, opts); + + synproxy_send_tcp_ipv6(net, skb, nskb, skb_nfct(skb), + IP_CT_ESTABLISHED_REPLY, niph, nth, + tcp_hdr_size); +} + +bool +synproxy_recv_client_ack_ipv6(struct net *net, + const struct sk_buff *skb, + const struct tcphdr *th, + struct synproxy_options *opts, u32 recv_seq) +{ + struct synproxy_net *snet = synproxy_pernet(net); + int mss; + + mss = __cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1); + if (mss == 0) { + this_cpu_inc(snet->stats->cookie_invalid); + return false; + } + + this_cpu_inc(snet->stats->cookie_valid); + opts->mss = mss; + opts->options |= NF_SYNPROXY_OPT_MSS; + + if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) + synproxy_check_timestamp_cookie(opts); + + synproxy_send_server_syn_ipv6(net, skb, th, opts, recv_seq); + return true; +} +EXPORT_SYMBOL_GPL(synproxy_recv_client_ack_ipv6); + +unsigned int +ipv6_synproxy_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *nhs) +{ + struct net *net = nhs->net; + struct synproxy_net *snet = synproxy_pernet(net); + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + struct nf_conn_synproxy *synproxy; + struct synproxy_options opts = {}; + const struct ip_ct_tcp *state; + struct tcphdr *th, _th; + __be16 frag_off; + u8 nexthdr; + int thoff; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return NF_ACCEPT; + + synproxy = nfct_synproxy(ct); + if (!synproxy) + return NF_ACCEPT; + + if (nf_is_loopback_packet(skb)) + return NF_ACCEPT; + + nexthdr = ipv6_hdr(skb)->nexthdr; + thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (thoff < 0 || nexthdr != IPPROTO_TCP) + return NF_ACCEPT; + + th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); + if (!th) + return NF_DROP; + + state = &ct->proto.tcp; + switch (state->state) { + case TCP_CONNTRACK_CLOSE: + if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - + ntohl(th->seq) + 1); + break; + } + + if (!th->syn || th->ack || + CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + break; + + /* Reopened connection - reset the sequence number and timestamp + * adjustments, they will get initialized once the connection is + * reestablished. + */ + nf_ct_seqadj_init(ct, ctinfo, 0); + synproxy->tsoff = 0; + this_cpu_inc(snet->stats->conn_reopened); + + /* fall through */ + case TCP_CONNTRACK_SYN_SENT: + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + + if (!th->syn && th->ack && + CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, + * therefore we need to add 1 to make the SYN sequence + * number match the one of first SYN. + */ + if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts, + ntohl(th->seq) + 1)) { + this_cpu_inc(snet->stats->cookie_retrans); + consume_skb(skb); + return NF_STOLEN; + } else { + return NF_DROP; + } + } + + synproxy->isn = ntohl(th->ack_seq); + if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) + synproxy->its = opts.tsecr; + + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); + break; + case TCP_CONNTRACK_SYN_RECV: + if (!th->syn || !th->ack) + break; + + if (!synproxy_parse_options(skb, thoff, th, &opts)) + return NF_DROP; + + if (opts.options & NF_SYNPROXY_OPT_TIMESTAMP) { + synproxy->tsoff = opts.tsval - synproxy->its; + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); + } + + opts.options &= ~(NF_SYNPROXY_OPT_MSS | + NF_SYNPROXY_OPT_WSCALE | + NF_SYNPROXY_OPT_SACK_PERM); + + swap(opts.tsval, opts.tsecr); + synproxy_send_server_ack_ipv6(net, state, skb, th, &opts); + + nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + nf_conntrack_event_cache(IPCT_SEQADJ, ct); + + swap(opts.tsval, opts.tsecr); + synproxy_send_client_ack_ipv6(net, skb, th, &opts); + + consume_skb(skb); + return NF_STOLEN; + default: + break; + } + + synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); + return NF_ACCEPT; +} +EXPORT_SYMBOL_GPL(ipv6_synproxy_hook); + +static const struct nf_hook_ops ipv6_synproxy_ops[] = { + { + .hook = ipv6_synproxy_hook, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, + { + .hook = ipv6_synproxy_hook, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, +}; + +int +nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net) +{ + int err; + + if (snet->hook_ref6 == 0) { + err = nf_register_net_hooks(net, ipv6_synproxy_ops, + ARRAY_SIZE(ipv6_synproxy_ops)); + if (err) + return err; + } + + snet->hook_ref6++; + return err; +} +EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_init); + +void +nf_synproxy_ipv6_fini(struct synproxy_net *snet, struct net *net) +{ + snet->hook_ref6--; + if (snet->hook_ref6 == 0) + nf_unregister_net_hooks(net, ipv6_synproxy_ops, + ARRAY_SIZE(ipv6_synproxy_ops)); +} +EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_fini); +#endif /* CONFIG_IPV6 */ + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); -- cgit v1.2.3 From 72c5e11854afb842e157353be0291d65b91725f5 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 18 Jun 2019 15:22:44 +0100 Subject: netfilter: synproxy: ensure zero is returned on non-error return path Currently functions nf_synproxy_{ipc4|ipv6}_init return an uninitialized garbage value in variable ret on a successful return. Fix this by returning zero on success. Addresses-Coverity: ("Uninitialized scalar variable") Fixes: d7f9b2f18eae ("netfilter: synproxy: extract SYNPROXY infrastructure from {ipt, ip6t}_SYNPROXY") Signed-off-by: Colin Ian King Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_synproxy_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 50677285f82e..7bf5202e3222 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -798,7 +798,7 @@ int nf_synproxy_ipv4_init(struct synproxy_net *snet, struct net *net) } snet->hook_ref4++; - return err; + return 0; } EXPORT_SYMBOL_GPL(nf_synproxy_ipv4_init); @@ -1223,7 +1223,7 @@ nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net) } snet->hook_ref6++; - return err; + return 0; } EXPORT_SYMBOL_GPL(nf_synproxy_ipv6_init); -- cgit v1.2.3 From 2f0513d487d2619c751fd08f5b7c64e759435ff4 Mon Sep 17 00:00:00 2001 From: Stéphane Veyret Date: Wed, 19 Jun 2019 09:03:14 +0200 Subject: netfilter: nft_ct: fix null pointer in ct expectations support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nf_ct_helper_ext_add may return null, which must then be checked. Fixes: 857b46027d6f ("netfilter: nft_ct: add ct expectations support") Reported-by: Colin Ian King Signed-off-by: Stéphane Veyret Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_ct.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/netfilter') diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 06b52c894573..77dab1bdb3ca 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -1232,6 +1232,10 @@ static void nft_ct_expect_obj_eval(struct nft_object *obj, help = nfct_help(ct); if (!help) help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); + if (!help) { + regs->verdict.code = NF_DROP; + return; + } if (help->expecting[NF_CT_EXPECT_CLASS_DEFAULT] >= priv->size) { regs->verdict.code = NFT_BREAK; -- cgit v1.2.3 From 79ebb5bb4e38a58ca796dd242b855a4982e101d7 Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Tue, 18 Jun 2019 11:11:02 +0200 Subject: netfilter: nf_tables: enable set expiration time for set elements Currently, the expiration of every element in a set or map is a read-only parameter generated at kernel side. This change will permit to set a certain expiration date per element that will be required, for example, during stateful replication among several nodes. This patch handles the NFTA_SET_ELEM_EXPIRATION in order to configure the expiration parameter per element, or will use the timeout in the case that the expiration is not set. Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- net/netfilter/nf_tables_api.c | 26 ++++++++++++++++++++------ net/netfilter/nft_dynset.c | 2 +- 3 files changed, 22 insertions(+), 8 deletions(-) (limited to 'net/netfilter') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 5b8624ae4a27..9e8493aad49d 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -636,7 +636,7 @@ static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext) void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const u32 *key, const u32 *data, - u64 timeout, gfp_t gfp); + u64 timeout, u64 expiration, gfp_t gfp); void nft_set_elem_destroy(const struct nft_set *set, void *elem, bool destroy_expr); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d444405211c5..412bb85e9d29 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3873,6 +3873,7 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = { [NFTA_SET_ELEM_DATA] = { .type = NLA_NESTED }, [NFTA_SET_ELEM_FLAGS] = { .type = NLA_U32 }, [NFTA_SET_ELEM_TIMEOUT] = { .type = NLA_U64 }, + [NFTA_SET_ELEM_EXPIRATION] = { .type = NLA_U64 }, [NFTA_SET_ELEM_USERDATA] = { .type = NLA_BINARY, .len = NFT_USERDATA_MAXLEN }, [NFTA_SET_ELEM_EXPR] = { .type = NLA_NESTED }, @@ -4326,7 +4327,7 @@ static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const u32 *key, const u32 *data, - u64 timeout, gfp_t gfp) + u64 timeout, u64 expiration, gfp_t gfp) { struct nft_set_ext *ext; void *elem; @@ -4341,9 +4342,11 @@ void *nft_set_elem_init(const struct nft_set *set, memcpy(nft_set_ext_key(ext), key, set->klen); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) memcpy(nft_set_ext_data(ext), data, set->dlen); - if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) - *nft_set_ext_expiration(ext) = - get_jiffies_64() + timeout; + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { + *nft_set_ext_expiration(ext) = get_jiffies_64() + expiration; + if (expiration == 0) + *nft_set_ext_expiration(ext) += timeout; + } if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) *nft_set_ext_timeout(ext) = timeout; @@ -4408,6 +4411,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_trans *trans; u32 flags = 0; u64 timeout; + u64 expiration; u8 ulen; int err; @@ -4451,6 +4455,16 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, timeout = set->timeout; } + expiration = 0; + if (nla[NFTA_SET_ELEM_EXPIRATION] != NULL) { + if (!(set->flags & NFT_SET_TIMEOUT)) + return -EINVAL; + err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_EXPIRATION], + &expiration); + if (err) + return err; + } + err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &d1, nla[NFTA_SET_ELEM_KEY]); if (err < 0) @@ -4533,7 +4547,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -ENOMEM; elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, data.data, - timeout, GFP_KERNEL); + timeout, expiration, GFP_KERNEL); if (elem.priv == NULL) goto err3; @@ -4735,7 +4749,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, err = -ENOMEM; elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0, - GFP_KERNEL); + 0, GFP_KERNEL); if (elem.priv == NULL) goto err2; diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 8394560aa695..bfb9f7463b03 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -60,7 +60,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, elem = nft_set_elem_init(set, &priv->tmpl, ®s->data[priv->sreg_key], ®s->data[priv->sreg_data], - timeout, GFP_ATOMIC); + timeout, 0, GFP_ATOMIC); if (elem == NULL) goto err1; -- cgit v1.2.3 From 22f2efd337761dd03e79b8ddf988653bdb5c20f9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 19 Jun 2019 18:30:37 +0200 Subject: netfilter: synproxy: use nf_cookie_v6_check() from core This helper function is never used and it is intended to avoid a direct dependency with the ipv6 module. Fixes: d7f9b2f18eae ("netfilter: synproxy: extract SYNPROXY infrastructure from {ipt, ip6t}_SYNPROXY") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_synproxy_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 7bf5202e3222..24d3e564403f 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -1056,7 +1056,7 @@ synproxy_recv_client_ack_ipv6(struct net *net, struct synproxy_net *snet = synproxy_pernet(net); int mss; - mss = __cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1); + mss = nf_cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1); if (mss == 0) { this_cpu_inc(snet->stats->cookie_invalid); return false; -- cgit v1.2.3 From cf47a0b882a4e5f6b34c7949d7b293e9287f1972 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 4 Jun 2019 21:56:35 +0300 Subject: ipvs: defer hook registration to avoid leaks syzkaller reports for memory leak when registering hooks [1] As we moved the nf_unregister_net_hooks() call into __ip_vs_dev_cleanup(), defer the nf_register_net_hooks() call, so that hooks are allocated and freed from same pernet_operations (ipvs_core_dev_ops). [1] BUG: memory leak unreferenced object 0xffff88810acd8a80 (size 96): comm "syz-executor073", pid 7254, jiffies 4294950560 (age 22.250s) hex dump (first 32 bytes): 02 00 00 00 00 00 00 00 50 8b bb 82 ff ff ff ff ........P....... 00 00 00 00 00 00 00 00 00 77 bb 82 ff ff ff ff .........w...... backtrace: [<0000000013db61f1>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline] [<0000000013db61f1>] slab_post_alloc_hook mm/slab.h:439 [inline] [<0000000013db61f1>] slab_alloc_node mm/slab.c:3269 [inline] [<0000000013db61f1>] kmem_cache_alloc_node_trace+0x15b/0x2a0 mm/slab.c:3597 [<000000001a27307d>] __do_kmalloc_node mm/slab.c:3619 [inline] [<000000001a27307d>] __kmalloc_node+0x38/0x50 mm/slab.c:3627 [<0000000025054add>] kmalloc_node include/linux/slab.h:590 [inline] [<0000000025054add>] kvmalloc_node+0x4a/0xd0 mm/util.c:431 [<0000000050d1bc00>] kvmalloc include/linux/mm.h:637 [inline] [<0000000050d1bc00>] kvzalloc include/linux/mm.h:645 [inline] [<0000000050d1bc00>] allocate_hook_entries_size+0x3b/0x60 net/netfilter/core.c:61 [<00000000e8abe142>] nf_hook_entries_grow+0xae/0x270 net/netfilter/core.c:128 [<000000004b94797c>] __nf_register_net_hook+0x9a/0x170 net/netfilter/core.c:337 [<00000000d1545cbc>] nf_register_net_hook+0x34/0xc0 net/netfilter/core.c:464 [<00000000876c9b55>] nf_register_net_hooks+0x53/0xc0 net/netfilter/core.c:480 [<000000002ea868e0>] __ip_vs_init+0xe8/0x170 net/netfilter/ipvs/ip_vs_core.c:2280 [<000000002eb2d451>] ops_init+0x4c/0x140 net/core/net_namespace.c:130 [<000000000284ec48>] setup_net+0xde/0x230 net/core/net_namespace.c:316 [<00000000a70600fa>] copy_net_ns+0xf0/0x1e0 net/core/net_namespace.c:439 [<00000000ff26c15e>] create_new_namespaces+0x141/0x2a0 kernel/nsproxy.c:107 [<00000000b103dc79>] copy_namespaces+0xa1/0xe0 kernel/nsproxy.c:165 [<000000007cc008a2>] copy_process.part.0+0x11fd/0x2150 kernel/fork.c:2035 [<00000000c344af7c>] copy_process kernel/fork.c:1800 [inline] [<00000000c344af7c>] _do_fork+0x121/0x4f0 kernel/fork.c:2369 Reported-by: syzbot+722da59ccb264bc19910@syzkaller.appspotmail.com Fixes: 719c7d563c17 ("ipvs: Fix use-after-free in ip_vs_in") Signed-off-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_core.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 7138556b206b..d5103a9eb302 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -2245,7 +2245,6 @@ static const struct nf_hook_ops ip_vs_ops[] = { static int __net_init __ip_vs_init(struct net *net) { struct netns_ipvs *ipvs; - int ret; ipvs = net_generic(net, ip_vs_net_id); if (ipvs == NULL) @@ -2277,17 +2276,11 @@ static int __net_init __ip_vs_init(struct net *net) if (ip_vs_sync_net_init(ipvs) < 0) goto sync_fail; - ret = nf_register_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); - if (ret < 0) - goto hook_fail; - return 0; /* * Error handling */ -hook_fail: - ip_vs_sync_net_cleanup(ipvs); sync_fail: ip_vs_conn_net_cleanup(ipvs); conn_fail: @@ -2317,6 +2310,19 @@ static void __net_exit __ip_vs_cleanup(struct net *net) net->ipvs = NULL; } +static int __net_init __ip_vs_dev_init(struct net *net) +{ + int ret; + + ret = nf_register_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + if (ret < 0) + goto hook_fail; + return 0; + +hook_fail: + return ret; +} + static void __net_exit __ip_vs_dev_cleanup(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -2336,6 +2342,7 @@ static struct pernet_operations ipvs_core_ops = { }; static struct pernet_operations ipvs_core_dev_ops = { + .init = __ip_vs_dev_init, .exit = __ip_vs_dev_cleanup, }; -- cgit v1.2.3 From dbb5281a1f84b2f93032d4864c211ce8a20811a7 Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Thu, 20 Jun 2019 12:19:59 -0400 Subject: netfilter: nf_tables: add support for matching IPv4 options This is the kernel change for the overall changes with this description: Add capability to have rules matching IPv4 options. This is developed mainly to support dropping of IP packets with loose and/or strict source route route options. Signed-off-by: Stephen Suryaputra Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 + net/ipv4/ip_options.c | 1 + net/netfilter/nft_exthdr.c | 133 +++++++++++++++++++++++++++++++ 3 files changed, 136 insertions(+) (limited to 'net/netfilter') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 31a6b8f7ff73..c6c8ec5c7c00 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -730,10 +730,12 @@ enum nft_exthdr_flags { * * @NFT_EXTHDR_OP_IPV6: match against ipv6 extension headers * @NFT_EXTHDR_OP_TCP: match against tcp options + * @NFT_EXTHDR_OP_IPV4: match against ipv4 options */ enum nft_exthdr_op { NFT_EXTHDR_OP_IPV6, NFT_EXTHDR_OP_TCPOPT, + NFT_EXTHDR_OP_IPV4, __NFT_EXTHDR_OP_MAX }; #define NFT_EXTHDR_OP_MAX (__NFT_EXTHDR_OP_MAX - 1) diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 3db31bb9df50..ddaa01ec2bce 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -473,6 +473,7 @@ error: *info = htonl((pp_ptr-iph)<<24); return -EINVAL; } +EXPORT_SYMBOL(__ip_options_compile); int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb) diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 45c8a6c07783..8032b2937c7f 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -62,6 +62,103 @@ err: regs->verdict.code = NFT_BREAK; } +/* find the offset to specified option. + * + * If target header is found, its offset is set in *offset and return option + * number. Otherwise, return negative error. + * + * If the first fragment doesn't contain the End of Options it is considered + * invalid. + */ +static int ipv4_find_option(struct net *net, struct sk_buff *skb, + unsigned int *offset, int target) +{ + unsigned char optbuf[sizeof(struct ip_options) + 40]; + struct ip_options *opt = (struct ip_options *)optbuf; + struct iphdr *iph, _iph; + unsigned int start; + bool found = false; + __be32 info; + int optlen; + + iph = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); + if (!iph) + return -EBADMSG; + start = sizeof(struct iphdr); + + optlen = iph->ihl * 4 - (int)sizeof(struct iphdr); + if (optlen <= 0) + return -ENOENT; + + memset(opt, 0, sizeof(struct ip_options)); + /* Copy the options since __ip_options_compile() modifies + * the options. + */ + if (skb_copy_bits(skb, start, opt->__data, optlen)) + return -EBADMSG; + opt->optlen = optlen; + + if (__ip_options_compile(net, opt, NULL, &info)) + return -EBADMSG; + + switch (target) { + case IPOPT_SSRR: + case IPOPT_LSRR: + if (!opt->srr) + break; + found = target == IPOPT_SSRR ? opt->is_strictroute : + !opt->is_strictroute; + if (found) + *offset = opt->srr + start; + break; + case IPOPT_RR: + if (!opt->rr) + break; + *offset = opt->rr + start; + found = true; + break; + case IPOPT_RA: + if (!opt->router_alert) + break; + *offset = opt->router_alert + start; + found = true; + break; + default: + return -EOPNOTSUPP; + } + return found ? target : -ENOENT; +} + +static void nft_exthdr_ipv4_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + u32 *dest = ®s->data[priv->dreg]; + struct sk_buff *skb = pkt->skb; + unsigned int offset; + int err; + + if (skb->protocol != htons(ETH_P_IP)) + goto err; + + err = ipv4_find_option(nft_net(pkt), skb, &offset, priv->type); + if (priv->flags & NFT_EXTHDR_F_PRESENT) { + *dest = (err >= 0); + return; + } else if (err < 0) { + goto err; + } + offset += priv->offset; + + dest[priv->len / NFT_REG32_SIZE] = 0; + if (skb_copy_bits(pkt->skb, offset, dest, priv->len) < 0) + goto err; + return; +err: + regs->verdict.code = NFT_BREAK; +} + static void * nft_tcp_header_pointer(const struct nft_pktinfo *pkt, unsigned int len, void *buffer, unsigned int *tcphdr_len) @@ -315,6 +412,28 @@ static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx, return nft_validate_register_load(priv->sreg, priv->len); } +static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + int err = nft_exthdr_init(ctx, expr, tb); + + if (err < 0) + return err; + + switch (priv->type) { + case IPOPT_SSRR: + case IPOPT_LSRR: + case IPOPT_RR: + case IPOPT_RA: + break; + default: + return -EOPNOTSUPP; + } + return 0; +} + static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv) { if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type)) @@ -361,6 +480,14 @@ static const struct nft_expr_ops nft_exthdr_ipv6_ops = { .dump = nft_exthdr_dump, }; +static const struct nft_expr_ops nft_exthdr_ipv4_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_ipv4_eval, + .init = nft_exthdr_ipv4_init, + .dump = nft_exthdr_dump, +}; + static const struct nft_expr_ops nft_exthdr_tcp_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), @@ -401,6 +528,12 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_ipv6_ops; break; + case NFT_EXTHDR_OP_IPV4: + if (ctx->family != NFPROTO_IPV6) { + if (tb[NFTA_EXTHDR_DREG]) + return &nft_exthdr_ipv4_ops; + } + break; } return ERR_PTR(-EOPNOTSUPP); -- cgit v1.2.3 From 5db7c8b9f9fc2aeec671ae3ca6375752c162e0e7 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 18 Jun 2019 23:07:36 +0300 Subject: ipvs: fix tinfo memory leak in start_sync_thread syzkaller reports for memory leak in start_sync_thread [1] As Eric points out, kthread may start and stop before the threadfn function is called, so there is no chance the data (tinfo in our case) to be released in thread. Fix this by releasing tinfo in the controlling code instead. [1] BUG: memory leak unreferenced object 0xffff8881206bf700 (size 32): comm "syz-executor761", pid 7268, jiffies 4294943441 (age 20.470s) hex dump (first 32 bytes): 00 40 7c 09 81 88 ff ff 80 45 b8 21 81 88 ff ff .@|......E.!.... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000057619e23>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline] [<0000000057619e23>] slab_post_alloc_hook mm/slab.h:439 [inline] [<0000000057619e23>] slab_alloc mm/slab.c:3326 [inline] [<0000000057619e23>] kmem_cache_alloc_trace+0x13d/0x280 mm/slab.c:3553 [<0000000086ce5479>] kmalloc include/linux/slab.h:547 [inline] [<0000000086ce5479>] start_sync_thread+0x5d2/0xe10 net/netfilter/ipvs/ip_vs_sync.c:1862 [<000000001a9229cc>] do_ip_vs_set_ctl+0x4c5/0x780 net/netfilter/ipvs/ip_vs_ctl.c:2402 [<00000000ece457c8>] nf_sockopt net/netfilter/nf_sockopt.c:106 [inline] [<00000000ece457c8>] nf_setsockopt+0x4c/0x80 net/netfilter/nf_sockopt.c:115 [<00000000942f62d4>] ip_setsockopt net/ipv4/ip_sockglue.c:1258 [inline] [<00000000942f62d4>] ip_setsockopt+0x9b/0xb0 net/ipv4/ip_sockglue.c:1238 [<00000000a56a8ffd>] udp_setsockopt+0x4e/0x90 net/ipv4/udp.c:2616 [<00000000fa895401>] sock_common_setsockopt+0x38/0x50 net/core/sock.c:3130 [<0000000095eef4cf>] __sys_setsockopt+0x98/0x120 net/socket.c:2078 [<000000009747cf88>] __do_sys_setsockopt net/socket.c:2089 [inline] [<000000009747cf88>] __se_sys_setsockopt net/socket.c:2086 [inline] [<000000009747cf88>] __x64_sys_setsockopt+0x26/0x30 net/socket.c:2086 [<00000000ded8ba80>] do_syscall_64+0x76/0x1a0 arch/x86/entry/common.c:301 [<00000000893b4ac8>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reported-by: syzbot+7e2e50c8adfccd2e5041@syzkaller.appspotmail.com Suggested-by: Eric Biggers Fixes: 998e7a76804b ("ipvs: Use kthread_run() instead of doing a double-fork via kernel_thread()") Signed-off-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 6 +- net/netfilter/ipvs/ip_vs_ctl.c | 4 -- net/netfilter/ipvs/ip_vs_sync.c | 134 +++++++++++++++++++++------------------- 3 files changed, 76 insertions(+), 68 deletions(-) (limited to 'net/netfilter') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 2ac40135b576..b36a1df93e7c 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -808,11 +808,12 @@ struct ipvs_master_sync_state { struct ip_vs_sync_buff *sync_buff; unsigned long sync_queue_len; unsigned int sync_queue_delay; - struct task_struct *master_thread; struct delayed_work master_wakeup_work; struct netns_ipvs *ipvs; }; +struct ip_vs_sync_thread_data; + /* How much time to keep dests in trash */ #define IP_VS_DEST_TRASH_PERIOD (120 * HZ) @@ -943,7 +944,8 @@ struct netns_ipvs { spinlock_t sync_lock; struct ipvs_master_sync_state *ms; spinlock_t sync_buff_lock; - struct task_struct **backup_threads; + struct ip_vs_sync_thread_data *master_tinfo; + struct ip_vs_sync_thread_data *backup_tinfo; int threads_mask; volatile int sync_state; struct mutex sync_mutex; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 776c87ed4813..741d91aa4a8d 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2396,9 +2396,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) cfg.syncid = dm->syncid; ret = start_sync_thread(ipvs, &cfg, dm->state); } else { - mutex_lock(&ipvs->sync_mutex); ret = stop_sync_thread(ipvs, dm->state); - mutex_unlock(&ipvs->sync_mutex); } goto out_dec; } @@ -3515,10 +3513,8 @@ static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) if (!attrs[IPVS_DAEMON_ATTR_STATE]) return -EINVAL; - mutex_lock(&ipvs->sync_mutex); ret = stop_sync_thread(ipvs, nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); - mutex_unlock(&ipvs->sync_mutex); return ret; } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 2526be6b3d90..a4a78c4b06de 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -195,6 +195,7 @@ union ip_vs_sync_conn { #define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1)) struct ip_vs_sync_thread_data { + struct task_struct *task; struct netns_ipvs *ipvs; struct socket *sock; char *buf; @@ -374,8 +375,11 @@ static inline void sb_queue_tail(struct netns_ipvs *ipvs, max(IPVS_SYNC_SEND_DELAY, 1)); ms->sync_queue_len++; list_add_tail(&sb->list, &ms->sync_queue); - if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) - wake_up_process(ms->master_thread); + if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) { + int id = (int)(ms - ipvs->ms); + + wake_up_process(ipvs->master_tinfo[id].task); + } } else ip_vs_sync_buff_release(sb); spin_unlock(&ipvs->sync_lock); @@ -1636,8 +1640,10 @@ static void master_wakeup_work_handler(struct work_struct *work) spin_lock_bh(&ipvs->sync_lock); if (ms->sync_queue_len && ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) { + int id = (int)(ms - ipvs->ms); + ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE; - wake_up_process(ms->master_thread); + wake_up_process(ipvs->master_tinfo[id].task); } spin_unlock_bh(&ipvs->sync_lock); } @@ -1703,10 +1709,6 @@ done: if (sb) ip_vs_sync_buff_release(sb); - /* release the sending multicast socket */ - sock_release(tinfo->sock); - kfree(tinfo); - return 0; } @@ -1740,11 +1742,6 @@ static int sync_thread_backup(void *data) } } - /* release the sending multicast socket */ - sock_release(tinfo->sock); - kfree(tinfo->buf); - kfree(tinfo); - return 0; } @@ -1752,8 +1749,8 @@ static int sync_thread_backup(void *data) int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, int state) { - struct ip_vs_sync_thread_data *tinfo = NULL; - struct task_struct **array = NULL, *task; + struct ip_vs_sync_thread_data *ti = NULL, *tinfo; + struct task_struct *task; struct net_device *dev; char *name; int (*threadfn)(void *data); @@ -1822,7 +1819,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, threadfn = sync_thread_master; } else if (state == IP_VS_STATE_BACKUP) { result = -EEXIST; - if (ipvs->backup_threads) + if (ipvs->backup_tinfo) goto out_early; ipvs->bcfg = *c; @@ -1849,28 +1846,22 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, master_wakeup_work_handler); ms->ipvs = ipvs; } - } else { - array = kcalloc(count, sizeof(struct task_struct *), - GFP_KERNEL); - result = -ENOMEM; - if (!array) - goto out; } + result = -ENOMEM; + ti = kcalloc(count, sizeof(struct ip_vs_sync_thread_data), + GFP_KERNEL); + if (!ti) + goto out; for (id = 0; id < count; id++) { - result = -ENOMEM; - tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); - if (!tinfo) - goto out; + tinfo = &ti[id]; tinfo->ipvs = ipvs; - tinfo->sock = NULL; if (state == IP_VS_STATE_BACKUP) { + result = -ENOMEM; tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen, GFP_KERNEL); if (!tinfo->buf) goto out; - } else { - tinfo->buf = NULL; } tinfo->id = id; if (state == IP_VS_STATE_MASTER) @@ -1885,17 +1876,15 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, result = PTR_ERR(task); goto out; } - tinfo = NULL; - if (state == IP_VS_STATE_MASTER) - ipvs->ms[id].master_thread = task; - else - array[id] = task; + tinfo->task = task; } /* mark as active */ - if (state == IP_VS_STATE_BACKUP) - ipvs->backup_threads = array; + if (state == IP_VS_STATE_MASTER) + ipvs->master_tinfo = ti; + else + ipvs->backup_tinfo = ti; spin_lock_bh(&ipvs->sync_buff_lock); ipvs->sync_state |= state; spin_unlock_bh(&ipvs->sync_buff_lock); @@ -1910,29 +1899,31 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, out: /* We do not need RTNL lock anymore, release it here so that - * sock_release below and in the kthreads can use rtnl_lock - * to leave the mcast group. + * sock_release below can use rtnl_lock to leave the mcast group. */ rtnl_unlock(); - count = id; - while (count-- > 0) { - if (state == IP_VS_STATE_MASTER) - kthread_stop(ipvs->ms[count].master_thread); - else - kthread_stop(array[count]); + id = min(id, count - 1); + if (ti) { + for (tinfo = ti + id; tinfo >= ti; tinfo--) { + if (tinfo->task) + kthread_stop(tinfo->task); + } } if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { kfree(ipvs->ms); ipvs->ms = NULL; } mutex_unlock(&ipvs->sync_mutex); - if (tinfo) { - if (tinfo->sock) - sock_release(tinfo->sock); - kfree(tinfo->buf); - kfree(tinfo); + + /* No more mutexes, release socks */ + if (ti) { + for (tinfo = ti + id; tinfo >= ti; tinfo--) { + if (tinfo->sock) + sock_release(tinfo->sock); + kfree(tinfo->buf); + } + kfree(ti); } - kfree(array); return result; out_early: @@ -1944,15 +1935,18 @@ out_early: int stop_sync_thread(struct netns_ipvs *ipvs, int state) { - struct task_struct **array; + struct ip_vs_sync_thread_data *ti, *tinfo; int id; int retc = -EINVAL; IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); + mutex_lock(&ipvs->sync_mutex); if (state == IP_VS_STATE_MASTER) { + retc = -ESRCH; if (!ipvs->ms) - return -ESRCH; + goto err; + ti = ipvs->master_tinfo; /* * The lock synchronizes with sb_queue_tail(), so that we don't @@ -1971,38 +1965,56 @@ int stop_sync_thread(struct netns_ipvs *ipvs, int state) struct ipvs_master_sync_state *ms = &ipvs->ms[id]; int ret; + tinfo = &ti[id]; pr_info("stopping master sync thread %d ...\n", - task_pid_nr(ms->master_thread)); + task_pid_nr(tinfo->task)); cancel_delayed_work_sync(&ms->master_wakeup_work); - ret = kthread_stop(ms->master_thread); + ret = kthread_stop(tinfo->task); if (retc >= 0) retc = ret; } kfree(ipvs->ms); ipvs->ms = NULL; + ipvs->master_tinfo = NULL; } else if (state == IP_VS_STATE_BACKUP) { - if (!ipvs->backup_threads) - return -ESRCH; + retc = -ESRCH; + if (!ipvs->backup_tinfo) + goto err; + ti = ipvs->backup_tinfo; ipvs->sync_state &= ~IP_VS_STATE_BACKUP; - array = ipvs->backup_threads; retc = 0; for (id = ipvs->threads_mask; id >= 0; id--) { int ret; + tinfo = &ti[id]; pr_info("stopping backup sync thread %d ...\n", - task_pid_nr(array[id])); - ret = kthread_stop(array[id]); + task_pid_nr(tinfo->task)); + ret = kthread_stop(tinfo->task); if (retc >= 0) retc = ret; } - kfree(array); - ipvs->backup_threads = NULL; + ipvs->backup_tinfo = NULL; + } else { + goto err; } + id = ipvs->threads_mask; + mutex_unlock(&ipvs->sync_mutex); + + /* No more mutexes, release socks */ + for (tinfo = ti + id; tinfo >= ti; tinfo--) { + if (tinfo->sock) + sock_release(tinfo->sock); + kfree(tinfo->buf); + } + kfree(ti); /* decrease the module use count */ ip_vs_use_count_dec(); + return retc; +err: + mutex_unlock(&ipvs->sync_mutex); return retc; } @@ -2021,7 +2033,6 @@ void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs) { int retc; - mutex_lock(&ipvs->sync_mutex); retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER); if (retc && retc != -ESRCH) pr_err("Failed to stop Master Daemon\n"); @@ -2029,5 +2040,4 @@ void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs) retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP); if (retc && retc != -ESRCH) pr_err("Failed to stop Backup Daemon\n"); - mutex_unlock(&ipvs->sync_mutex); } -- cgit v1.2.3 From e7600865db32b69deb0109b8254244dca592adcf Mon Sep 17 00:00:00 2001 From: Felix Kaechele Date: Tue, 25 Jun 2019 16:48:59 -0400 Subject: netfilter: ctnetlink: Fix regression in conntrack entry deletion Commit f8e608982022 ("netfilter: ctnetlink: Resolve conntrack L3-protocol flush regression") introduced a regression in which deletion of conntrack entries would fail because the L3 protocol information is replaced by AF_UNSPEC. As a result the search for the entry to be deleted would turn up empty due to the tuple used to perform the search is now different from the tuple used to initially set up the entry. For flushing the conntrack table we do however want to keep the option for nfgenmsg->version to have a non-zero value to allow for newer user-space tools to request treatment under the new behavior. With that it is possible to independently flush tables for a defined L3 protocol. This was introduced with the enhancements in in commit 59c08c69c278 ("netfilter: ctnetlink: Support L3 protocol-filter on flush"). Older user-space tools will retain the behavior of flushing all tables regardless of defined L3 protocol. Fixes: f8e608982022 ("netfilter: ctnetlink: Resolve conntrack L3-protocol flush regression") Suggested-by: Pablo Neira Ayuso Signed-off-by: Felix Kaechele Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 7db79c1b8084..1b77444d5b52 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1256,7 +1256,6 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, struct nf_conntrack_tuple tuple; struct nf_conn *ct; struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u_int8_t u3 = nfmsg->version ? nfmsg->nfgen_family : AF_UNSPEC; struct nf_conntrack_zone zone; int err; @@ -1266,11 +1265,13 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, if (cda[CTA_TUPLE_ORIG]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, - u3, &zone); + nfmsg->nfgen_family, &zone); else if (cda[CTA_TUPLE_REPLY]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, - u3, &zone); + nfmsg->nfgen_family, &zone); else { + u_int8_t u3 = nfmsg->version ? nfmsg->nfgen_family : AF_UNSPEC; + return ctnetlink_flush_conntrack(net, cda, NETLINK_CB(skb).portid, nlmsg_report(nlh), u3); -- cgit v1.2.3 From 5d1549847c76b1ffcf8e388ef4d0f229bdd1d7e8 Mon Sep 17 00:00:00 2001 From: He Zhe Date: Mon, 24 Jun 2019 11:17:38 +0800 Subject: netfilter: Fix remainder of pseudo-header protocol 0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since v5.1-rc1, some types of packets do not get unreachable reply with the following iptables setting. Fox example, $ iptables -A INPUT -p icmp --icmp-type 8 -j REJECT $ ping 127.0.0.1 -c 1 PING 127.0.0.1 (127.0.0.1) 56(84) bytes of data. — 127.0.0.1 ping statistics — 1 packets transmitted, 0 received, 100% packet loss, time 0ms We should have got the following reply from command line, but we did not. From 127.0.0.1 icmp_seq=1 Destination Port Unreachable Yi Zhao reported it and narrowed it down to: 7fc38225363d ("netfilter: reject: skip csum verification for protocols that don't support it"), This is because nf_ip_checksum still expects pseudo-header protocol type 0 for packets that are of neither TCP or UDP, and thus ICMP packets are mistakenly treated as TCP/UDP. This patch corrects the conditions in nf_ip_checksum and all other places that still call it with protocol 0. Fixes: 7fc38225363d ("netfilter: reject: skip csum verification for protocols that don't support it") Reported-by: Yi Zhao Signed-off-by: He Zhe Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_icmp.c | 2 +- net/netfilter/nf_nat_proto.c | 2 +- net/netfilter/utils.c | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index 9becac953587..71a84a0517f3 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -221,7 +221,7 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, /* See ip_conntrack_proto_tcp.c */ if (state->net->ct.sysctl_checksum && state->hook == NF_INET_PRE_ROUTING && - nf_ip_checksum(skb, state->hook, dataoff, 0)) { + nf_ip_checksum(skb, state->hook, dataoff, IPPROTO_ICMP)) { icmp_error_log(skb, state, "bad hw icmp checksum"); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index 84f5c90a7f21..9f3e52ebd3b8 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -567,7 +567,7 @@ int nf_nat_icmp_reply_translation(struct sk_buff *skb, if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) return 0; - if (nf_ip_checksum(skb, hooknum, hdrlen, 0)) + if (nf_ip_checksum(skb, hooknum, hdrlen, IPPROTO_ICMP)) return 0; inside = (void *)skb->data + hdrlen; diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c index 06dc55590441..51b454d8fa9c 100644 --- a/net/netfilter/utils.c +++ b/net/netfilter/utils.c @@ -17,7 +17,8 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, case CHECKSUM_COMPLETE: if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) break; - if ((protocol == 0 && !csum_fold(skb->csum)) || + if ((protocol != IPPROTO_TCP && protocol != IPPROTO_UDP && + !csum_fold(skb->csum)) || !csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len - dataoff, protocol, skb->csum)) { @@ -26,7 +27,7 @@ __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, } /* fall through */ case CHECKSUM_NONE: - if (protocol == 0) + if (protocol != IPPROTO_TCP && protocol != IPPROTO_UDP) skb->csum = 0; else skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, -- cgit v1.2.3 From b60a77386b1d4868f72f6353d35dabe5fbe981f2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 26 Jun 2019 20:40:45 +0200 Subject: net: make skb_dst_force return true when dst is refcounted netfilter did not expect that skb_dst_force() can cause skb to lose its dst entry. I got a bug report with a skb->dst NULL dereference in netfilter output path. The backtrace contains nf_reinject(), so the dst might have been cleared when skb got queued to userspace. Other users were fixed via if (skb_dst(skb)) { skb_dst_force(skb); if (!skb_dst(skb)) goto handle_err; } But I think its preferable to make the 'dst might be cleared' part of the function explicit. In netfilter case, skb with a null dst is expected when queueing in prerouting hook, so drop skb for the other hooks. v2: v1 of this patch returned true in case skb had no dst entry. Eric said: Say if we have two skb_dst_force() calls for some reason on the same skb, only the first one will return false. This now returns false even when skb had no dst, as per Erics suggestion, so callers might need to check skb_dst() first before skb_dst_force(). Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/dst.h | 5 ++++- net/netfilter/nf_queue.c | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'net/netfilter') diff --git a/include/net/dst.h b/include/net/dst.h index 12b31c602cb0..f8206d3fed2f 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -302,8 +302,9 @@ static inline bool dst_hold_safe(struct dst_entry *dst) * @skb: buffer * * If dst is not yet refcounted and not destroyed, grab a ref on it. + * Returns true if dst is refcounted. */ -static inline void skb_dst_force(struct sk_buff *skb) +static inline bool skb_dst_force(struct sk_buff *skb) { if (skb_dst_is_noref(skb)) { struct dst_entry *dst = skb_dst(skb); @@ -314,6 +315,8 @@ static inline void skb_dst_force(struct sk_buff *skb) skb->_skb_refdst = (unsigned long)dst; } + + return skb->_skb_refdst != 0UL; } diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index b5b2be55ca82..2c440015ff0c 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -190,6 +190,11 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, goto err; } + if (!skb_dst_force(skb) && state->hook != NF_INET_PRE_ROUTING) { + status = -ENETDOWN; + goto err; + } + *entry = (struct nf_queue_entry) { .skb = skb, .state = *state, @@ -198,7 +203,6 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, }; nf_queue_entry_get_refs(entry); - skb_dst_force(skb); switch (entry->state.pf) { case AF_INET: -- cgit v1.2.3 From f0c1aab2bd1ad131d9d7528b9dcbf9253a74e5da Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 21 Jun 2019 17:37:48 +0200 Subject: netfilter: rename nf_SYNPROXY.h to nf_synproxy.h Uppercase is a reminiscence from the iptables infrastructure, rename this header before this is included in stable kernels. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_SYNPROXY.h | 19 ------------------- include/uapi/linux/netfilter/nf_synproxy.h | 19 +++++++++++++++++++ include/uapi/linux/netfilter/xt_SYNPROXY.h | 2 +- net/netfilter/nf_synproxy_core.c | 2 +- 4 files changed, 21 insertions(+), 21 deletions(-) delete mode 100644 include/uapi/linux/netfilter/nf_SYNPROXY.h create mode 100644 include/uapi/linux/netfilter/nf_synproxy.h (limited to 'net/netfilter') diff --git a/include/uapi/linux/netfilter/nf_SYNPROXY.h b/include/uapi/linux/netfilter/nf_SYNPROXY.h deleted file mode 100644 index 068d1b3a6f06..000000000000 --- a/include/uapi/linux/netfilter/nf_SYNPROXY.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NF_SYNPROXY_H -#define _NF_SYNPROXY_H - -#include - -#define NF_SYNPROXY_OPT_MSS 0x01 -#define NF_SYNPROXY_OPT_WSCALE 0x02 -#define NF_SYNPROXY_OPT_SACK_PERM 0x04 -#define NF_SYNPROXY_OPT_TIMESTAMP 0x08 -#define NF_SYNPROXY_OPT_ECN 0x10 - -struct nf_synproxy_info { - __u8 options; - __u8 wscale; - __u16 mss; -}; - -#endif /* _NF_SYNPROXY_H */ diff --git a/include/uapi/linux/netfilter/nf_synproxy.h b/include/uapi/linux/netfilter/nf_synproxy.h new file mode 100644 index 000000000000..068d1b3a6f06 --- /dev/null +++ b/include/uapi/linux/netfilter/nf_synproxy.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NF_SYNPROXY_H +#define _NF_SYNPROXY_H + +#include + +#define NF_SYNPROXY_OPT_MSS 0x01 +#define NF_SYNPROXY_OPT_WSCALE 0x02 +#define NF_SYNPROXY_OPT_SACK_PERM 0x04 +#define NF_SYNPROXY_OPT_TIMESTAMP 0x08 +#define NF_SYNPROXY_OPT_ECN 0x10 + +struct nf_synproxy_info { + __u8 options; + __u8 wscale; + __u16 mss; +}; + +#endif /* _NF_SYNPROXY_H */ diff --git a/include/uapi/linux/netfilter/xt_SYNPROXY.h b/include/uapi/linux/netfilter/xt_SYNPROXY.h index 4d5611d647df..19c04ed86172 100644 --- a/include/uapi/linux/netfilter/xt_SYNPROXY.h +++ b/include/uapi/linux/netfilter/xt_SYNPROXY.h @@ -2,7 +2,7 @@ #ifndef _XT_SYNPROXY_H #define _XT_SYNPROXY_H -#include +#include #define XT_SYNPROXY_OPT_MSS NF_SYNPROXY_OPT_MSS #define XT_SYNPROXY_OPT_WSCALE NF_SYNPROXY_OPT_WSCALE diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 409722d23302..b101f187eda8 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -11,7 +11,7 @@ #include #include -#include +#include #include #include -- cgit v1.2.3 From eca27f14b1168a8962023571b29d2ec593788b61 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 2 Jul 2019 20:06:30 +0200 Subject: netfilter: nf_log: Replace a seq_printf() call by seq_puts() in seq_show() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A string which did not contain a data format specification should be put into a sequence. Thus use the corresponding function “seq_puts”. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 3574a212bdc2..bb25d4c794c7 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -374,7 +374,7 @@ static int seq_show(struct seq_file *s, void *v) continue; logger = nft_log_dereference(loggers[*pos][i]); - seq_printf(s, "%s", logger->name); + seq_puts(s, logger->name); if (i == 0 && loggers[*pos][i + 1] != NULL) seq_puts(s, ","); -- cgit v1.2.3 From 0d9cb300acad29f25ea23d2592e69970bc61f14c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 2 Jul 2019 20:41:14 +0200 Subject: netfilter: nf_queue: remove unused hook entries pointer Its not used anywhere, so remove this. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_queue.h | 3 +-- net/bridge/br_input.c | 2 +- net/netfilter/core.c | 2 +- net/netfilter/nf_queue.c | 8 +++----- 4 files changed, 6 insertions(+), 9 deletions(-) (limited to 'net/netfilter') diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 7239105d9d2e..3cb6dcf53a4e 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -120,6 +120,5 @@ nfqueue_hash(const struct sk_buff *skb, u16 queue, u16 queues_total, u8 family, } int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, - const struct nf_hook_entries *entries, unsigned int index, - unsigned int verdict); + unsigned int index, unsigned int verdict); #endif /* _NF_QUEUE_H */ diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 21b74e7a7b2f..512383d5e53f 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -234,7 +234,7 @@ static int nf_hook_bridge_pre(struct sk_buff *skb, struct sk_buff **pskb) kfree_skb(skb); return RX_HANDLER_CONSUMED; case NF_QUEUE: - ret = nf_queue(skb, &state, e, i, verdict); + ret = nf_queue(skb, &state, i, verdict); if (ret == 1) continue; return RX_HANDLER_CONSUMED; diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 817a9e5d16e4..5d5bdf450091 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -520,7 +520,7 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, ret = -EPERM; return ret; case NF_QUEUE: - ret = nf_queue(skb, state, e, s, verdict); + ret = nf_queue(skb, state, s, verdict); if (ret == 1) continue; return ret; diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index b5b2be55ca82..c72a5bdd123f 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -156,7 +156,6 @@ static void nf_ip6_saveroute(const struct sk_buff *skb, } static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, - const struct nf_hook_entries *entries, unsigned int index, unsigned int queuenum) { int status = -ENOENT; @@ -225,12 +224,11 @@ err: /* Packets leaving via this function must come back through nf_reinject(). */ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, - const struct nf_hook_entries *entries, unsigned int index, - unsigned int verdict) + unsigned int index, unsigned int verdict) { int ret; - ret = __nf_queue(skb, state, entries, index, verdict >> NF_VERDICT_QBITS); + ret = __nf_queue(skb, state, index, verdict >> NF_VERDICT_QBITS); if (ret < 0) { if (ret == -ESRCH && (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) @@ -336,7 +334,7 @@ next_hook: local_bh_enable(); break; case NF_QUEUE: - err = nf_queue(skb, &entry->state, hooks, i, verdict); + err = nf_queue(skb, &entry->state, i, verdict); if (err == 1) goto next_hook; break; -- cgit v1.2.3 From 6f7b841bc939e7c811ad32427b58d54edbcfa6ed Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Mon, 1 Jul 2019 19:49:34 +0300 Subject: ipvs: allow tunneling with gre encapsulation windows real servers can handle gre tunnels, this patch allows gre encapsulation with the tunneling method, thereby letting ipvs be load balancer for windows-based services Signed-off-by: Vadim Fedorenko Acked-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/ip_vs.h | 1 + net/netfilter/ipvs/ip_vs_ctl.c | 1 + net/netfilter/ipvs/ip_vs_xmit.c | 66 +++++++++++++++++++++++++++++++++++++++-- 3 files changed, 65 insertions(+), 3 deletions(-) (limited to 'net/netfilter') diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h index e4f18061a4fd..4102ddcb4e14 100644 --- a/include/uapi/linux/ip_vs.h +++ b/include/uapi/linux/ip_vs.h @@ -128,6 +128,7 @@ enum { IP_VS_CONN_F_TUNNEL_TYPE_IPIP = 0, /* IPIP */ IP_VS_CONN_F_TUNNEL_TYPE_GUE, /* GUE */ + IP_VS_CONN_F_TUNNEL_TYPE_GRE, /* GRE */ IP_VS_CONN_F_TUNNEL_TYPE_MAX, }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 84384d896e29..998353bec74f 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -525,6 +525,7 @@ static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) port = dest->tun_port; break; case IP_VS_CONN_F_TUNNEL_TYPE_IPIP: + case IP_VS_CONN_F_TUNNEL_TYPE_GRE: port = 0; break; default: diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 71fc6d63a67f..9c464d24beec 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -29,6 +29,7 @@ #include /* for tcphdr */ #include #include +#include #include /* for csum_tcpudp_magic */ #include #include /* for icmp_send */ @@ -388,6 +389,12 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && skb->ip_summed == CHECKSUM_PARTIAL) mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + __be16 tflags = 0; + + if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + mtu -= gre_calc_hlen(tflags); } if (mtu < 68) { IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); @@ -548,6 +555,12 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && skb->ip_summed == CHECKSUM_PARTIAL) mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; + } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + __be16 tflags = 0; + + if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + mtu -= gre_calc_hlen(tflags); } if (mtu < IPV6_MIN_MTU) { IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, @@ -1079,6 +1092,24 @@ ipvs_gue_encap(struct net *net, struct sk_buff *skb, return 0; } +static void +ipvs_gre_encap(struct net *net, struct sk_buff *skb, + struct ip_vs_conn *cp, __u8 *next_protocol) +{ + __be16 proto = *next_protocol == IPPROTO_IPIP ? + htons(ETH_P_IP) : htons(ETH_P_IPV6); + __be16 tflags = 0; + size_t hdrlen; + + if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + + hdrlen = gre_calc_hlen(tflags); + gre_build_header(skb, hdrlen, tflags, proto, 0, 0); + + *next_protocol = IPPROTO_GRE; +} + /* * IP Tunneling transmitter * @@ -1151,6 +1182,15 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, gue_hdrlen = sizeof(struct guehdr) + gue_optlen; max_headroom += sizeof(struct udphdr) + gue_hdrlen; + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + size_t gre_hdrlen; + __be16 tflags = 0; + + if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + gre_hdrlen = gre_calc_hlen(tflags); + + max_headroom += gre_hdrlen; } /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */ @@ -1172,6 +1212,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, skb->ip_summed == CHECKSUM_PARTIAL) { gso_type |= SKB_GSO_TUNNEL_REMCSUM; } + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + gso_type |= SKB_GSO_GRE_CSUM; + else + gso_type |= SKB_GSO_GRE; } if (iptunnel_handle_offloads(skb, gso_type)) @@ -1192,8 +1237,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, check = true; udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len); - } - + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) + ipvs_gre_encap(net, skb, cp, &next_protocol); skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); @@ -1287,6 +1332,15 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, gue_hdrlen = sizeof(struct guehdr) + gue_optlen; max_headroom += sizeof(struct udphdr) + gue_hdrlen; + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + size_t gre_hdrlen; + __be16 tflags = 0; + + if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + tflags |= TUNNEL_CSUM; + gre_hdrlen = gre_calc_hlen(tflags); + + max_headroom += gre_hdrlen; } skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom, @@ -1306,6 +1360,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb->ip_summed == CHECKSUM_PARTIAL) { gso_type |= SKB_GSO_TUNNEL_REMCSUM; } + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) + gso_type |= SKB_GSO_GRE_CSUM; + else + gso_type |= SKB_GSO_GRE; } if (iptunnel_handle_offloads(skb, gso_type)) @@ -1326,7 +1385,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, check = true; udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len); - } + } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) + ipvs_gre_encap(net, skb, cp, &next_protocol); skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); -- cgit v1.2.3 From ad49d86e07a497e834cb06f2b151dccd75f8e148 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 26 Jun 2019 12:59:19 +0200 Subject: netfilter: nf_tables: Add synproxy support Add synproxy support for nf_tables. This behaves like the iptables synproxy target but it is structured in a way that allows us to propose improvements in the future. Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_synproxy.h | 1 + include/net/netfilter/nf_synproxy.h | 5 + include/uapi/linux/netfilter/nf_synproxy.h | 4 + include/uapi/linux/netfilter/nf_tables.h | 16 ++ net/netfilter/Kconfig | 11 + net/netfilter/Makefile | 1 + net/netfilter/nft_synproxy.c | 287 ++++++++++++++++++++++++++ 7 files changed, 325 insertions(+) create mode 100644 net/netfilter/nft_synproxy.c (limited to 'net/netfilter') diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h index c5659dcf5b1a..8f00125b06f4 100644 --- a/include/net/netfilter/nf_conntrack_synproxy.h +++ b/include/net/netfilter/nf_conntrack_synproxy.h @@ -2,6 +2,7 @@ #ifndef _NF_CONNTRACK_SYNPROXY_H #define _NF_CONNTRACK_SYNPROXY_H +#include #include struct nf_conn_synproxy { diff --git a/include/net/netfilter/nf_synproxy.h b/include/net/netfilter/nf_synproxy.h index 3e8b3f03b687..87d73fb5279d 100644 --- a/include/net/netfilter/nf_synproxy.h +++ b/include/net/netfilter/nf_synproxy.h @@ -39,6 +39,11 @@ unsigned int ipv6_synproxy_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *nhs); int nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net); void nf_synproxy_ipv6_fini(struct synproxy_net *snet, struct net *net); +#else +static inline int +nf_synproxy_ipv6_init(struct synproxy_net *snet, struct net *net) { return 0; } +static inline void +nf_synproxy_ipv6_fini(struct synproxy_net *snet, struct net *net) {}; #endif /* CONFIG_IPV6 */ #endif /* _NF_SYNPROXY_SHARED_H */ diff --git a/include/uapi/linux/netfilter/nf_synproxy.h b/include/uapi/linux/netfilter/nf_synproxy.h index 068d1b3a6f06..6f3791c8946f 100644 --- a/include/uapi/linux/netfilter/nf_synproxy.h +++ b/include/uapi/linux/netfilter/nf_synproxy.h @@ -9,6 +9,10 @@ #define NF_SYNPROXY_OPT_SACK_PERM 0x04 #define NF_SYNPROXY_OPT_TIMESTAMP 0x08 #define NF_SYNPROXY_OPT_ECN 0x10 +#define NF_SYNPROXY_OPT_MASK (NF_SYNPROXY_OPT_MSS | \ + NF_SYNPROXY_OPT_WSCALE | \ + NF_SYNPROXY_OPT_SACK_PERM | \ + NF_SYNPROXY_OPT_TIMESTAMP) struct nf_synproxy_info { __u8 options; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index c6c8ec5c7c00..c53d581643fe 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1551,6 +1551,22 @@ enum nft_osf_flags { NFT_OSF_F_VERSION = (1 << 0), }; +/** + * enum nft_synproxy_attributes - nf_tables synproxy expression netlink attributes + * + * @NFTA_SYNPROXY_MSS: mss value sent to the backend (NLA_U16) + * @NFTA_SYNPROXY_WSCALE: wscale value sent to the backend (NLA_U8) + * @NFTA_SYNPROXY_FLAGS: flags (NLA_U32) + */ +enum nft_synproxy_attributes { + NFTA_SYNPROXY_UNSPEC, + NFTA_SYNPROXY_MSS, + NFTA_SYNPROXY_WSCALE, + NFTA_SYNPROXY_FLAGS, + __NFTA_SYNPROXY_MAX, +}; +#define NFTA_SYNPROXY_MAX (__NFTA_SYNPROXY_MAX - 1) + /** * enum nft_device_attributes - nf_tables device netlink attributes * diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 21025c2c605b..d59742408d9b 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -651,6 +651,17 @@ config NFT_TPROXY help This makes transparent proxy support available in nftables. +config NFT_SYNPROXY + tristate "Netfilter nf_tables SYNPROXY expression support" + depends on NF_CONNTRACK && NETFILTER_ADVANCED + select NETFILTER_SYNPROXY + select SYN_COOKIES + help + The SYNPROXY expression allows you to intercept TCP connections and + establish them using syncookies before they are passed on to the + server. This allows to avoid conntrack and server resource usage + during SYN-flood attacks. + if NF_TABLES_NETDEV config NF_DUP_NETDEV diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 72cca6b48960..deada20975ff 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -110,6 +110,7 @@ obj-$(CONFIG_NFT_SOCKET) += nft_socket.o obj-$(CONFIG_NFT_OSF) += nft_osf.o obj-$(CONFIG_NFT_TPROXY) += nft_tproxy.o obj-$(CONFIG_NFT_XFRM) += nft_xfrm.o +obj-$(CONFIG_NFT_SYNPROXY) += nft_synproxy.o obj-$(CONFIG_NFT_NAT) += nft_chain_nat.o diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c new file mode 100644 index 000000000000..80060ade8a5b --- /dev/null +++ b/net/netfilter/nft_synproxy.c @@ -0,0 +1,287 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct nft_synproxy { + struct nf_synproxy_info info; +}; + +static const struct nla_policy nft_synproxy_policy[NFTA_SYNPROXY_MAX + 1] = { + [NFTA_SYNPROXY_MSS] = { .type = NLA_U16 }, + [NFTA_SYNPROXY_WSCALE] = { .type = NLA_U8 }, + [NFTA_SYNPROXY_FLAGS] = { .type = NLA_U32 }, +}; + +static void nft_synproxy_tcp_options(struct synproxy_options *opts, + const struct tcphdr *tcp, + struct synproxy_net *snet, + struct nf_synproxy_info *info, + struct nft_synproxy *priv) +{ + this_cpu_inc(snet->stats->syn_received); + if (tcp->ece && tcp->cwr) + opts->options |= NF_SYNPROXY_OPT_ECN; + + opts->options &= priv->info.options; + if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) + synproxy_init_timestamp_cookie(info, opts); + else + opts->options &= ~(NF_SYNPROXY_OPT_WSCALE | + NF_SYNPROXY_OPT_SACK_PERM | + NF_SYNPROXY_OPT_ECN); +} + +static void nft_synproxy_eval_v4(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt, + const struct tcphdr *tcp, + struct tcphdr *_tcph, + struct synproxy_options *opts) +{ + struct nft_synproxy *priv = nft_expr_priv(expr); + struct nf_synproxy_info info = priv->info; + struct net *net = nft_net(pkt); + struct synproxy_net *snet = synproxy_pernet(net); + struct sk_buff *skb = pkt->skb; + + if (tcp->syn) { + /* Initial SYN from client */ + nft_synproxy_tcp_options(opts, tcp, snet, &info, priv); + synproxy_send_client_synack(net, skb, tcp, opts); + consume_skb(skb); + regs->verdict.code = NF_STOLEN; + } else if (tcp->ack) { + /* ACK from client */ + if (synproxy_recv_client_ack(net, skb, tcp, opts, + ntohl(tcp->seq))) { + consume_skb(skb); + regs->verdict.code = NF_STOLEN; + } else { + regs->verdict.code = NF_DROP; + } + } +} + +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) +static void nft_synproxy_eval_v6(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt, + const struct tcphdr *tcp, + struct tcphdr *_tcph, + struct synproxy_options *opts) +{ + struct nft_synproxy *priv = nft_expr_priv(expr); + struct nf_synproxy_info info = priv->info; + struct net *net = nft_net(pkt); + struct synproxy_net *snet = synproxy_pernet(net); + struct sk_buff *skb = pkt->skb; + + if (tcp->syn) { + /* Initial SYN from client */ + nft_synproxy_tcp_options(opts, tcp, snet, &info, priv); + synproxy_send_client_synack_ipv6(net, skb, tcp, opts); + consume_skb(skb); + regs->verdict.code = NF_STOLEN; + } else if (tcp->ack) { + /* ACK from client */ + if (synproxy_recv_client_ack_ipv6(net, skb, tcp, opts, + ntohl(tcp->seq))) { + consume_skb(skb); + regs->verdict.code = NF_STOLEN; + } else { + regs->verdict.code = NF_DROP; + } + } +} +#endif /* CONFIG_NF_TABLES_IPV6*/ + +static void nft_synproxy_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct synproxy_options opts = {}; + struct sk_buff *skb = pkt->skb; + int thoff = pkt->xt.thoff; + const struct tcphdr *tcp; + struct tcphdr _tcph; + + if (pkt->tprot != IPPROTO_TCP) { + regs->verdict.code = NFT_BREAK; + return; + } + + if (nf_ip_checksum(skb, nft_hook(pkt), thoff, IPPROTO_TCP)) { + regs->verdict.code = NF_DROP; + return; + } + + tcp = skb_header_pointer(skb, pkt->xt.thoff, + sizeof(struct tcphdr), + &_tcph); + if (!tcp) { + regs->verdict.code = NF_DROP; + return; + } + + if (!synproxy_parse_options(skb, thoff, tcp, &opts)) { + regs->verdict.code = NF_DROP; + return; + } + + switch (skb->protocol) { + case htons(ETH_P_IP): + nft_synproxy_eval_v4(expr, regs, pkt, tcp, &_tcph, &opts); + return; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case htons(ETH_P_IPV6): + nft_synproxy_eval_v6(expr, regs, pkt, tcp, &_tcph, &opts); + return; +#endif + } + regs->verdict.code = NFT_BREAK; +} + +static int nft_synproxy_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct synproxy_net *snet = synproxy_pernet(ctx->net); + struct nft_synproxy *priv = nft_expr_priv(expr); + u32 flags; + int err; + + if (tb[NFTA_SYNPROXY_MSS]) + priv->info.mss = ntohs(nla_get_be16(tb[NFTA_SYNPROXY_MSS])); + if (tb[NFTA_SYNPROXY_WSCALE]) + priv->info.wscale = nla_get_u8(tb[NFTA_SYNPROXY_WSCALE]); + if (tb[NFTA_SYNPROXY_FLAGS]) { + flags = ntohl(nla_get_be32(tb[NFTA_SYNPROXY_FLAGS])); + if (flags & ~NF_SYNPROXY_OPT_MASK) + return -EOPNOTSUPP; + priv->info.options = flags; + } + + err = nf_ct_netns_get(ctx->net, ctx->family); + if (err) + return err; + + switch (ctx->family) { + case NFPROTO_IPV4: + err = nf_synproxy_ipv4_init(snet, ctx->net); + if (err) + goto nf_ct_failure; + break; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case NFPROTO_IPV6: + err = nf_synproxy_ipv6_init(snet, ctx->net); + if (err) + goto nf_ct_failure; + break; +#endif + case NFPROTO_INET: + case NFPROTO_BRIDGE: + err = nf_synproxy_ipv4_init(snet, ctx->net); + if (err) + goto nf_ct_failure; + err = nf_synproxy_ipv6_init(snet, ctx->net); + if (err) + goto nf_ct_failure; + break; + } + + return 0; + +nf_ct_failure: + nf_ct_netns_put(ctx->net, ctx->family); + return err; +} + +static void nft_synproxy_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct synproxy_net *snet = synproxy_pernet(ctx->net); + + switch (ctx->family) { + case NFPROTO_IPV4: + nf_synproxy_ipv4_fini(snet, ctx->net); + break; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case NFPROTO_IPV6: + nf_synproxy_ipv6_fini(snet, ctx->net); + break; +#endif + case NFPROTO_INET: + case NFPROTO_BRIDGE: + nf_synproxy_ipv4_fini(snet, ctx->net); + nf_synproxy_ipv6_fini(snet, ctx->net); + break; + } + nf_ct_netns_put(ctx->net, ctx->family); +} + +static int nft_synproxy_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_synproxy *priv = nft_expr_priv(expr); + + if (nla_put_be16(skb, NFTA_SYNPROXY_MSS, htons(priv->info.mss)) || + nla_put_u8(skb, NFTA_SYNPROXY_WSCALE, priv->info.wscale) || + nla_put_be32(skb, NFTA_SYNPROXY_FLAGS, htonl(priv->info.options))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static int nft_synproxy_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD)); +} + +static struct nft_expr_type nft_synproxy_type; +static const struct nft_expr_ops nft_synproxy_ops = { + .eval = nft_synproxy_eval, + .size = NFT_EXPR_SIZE(sizeof(struct nft_synproxy)), + .init = nft_synproxy_init, + .destroy = nft_synproxy_destroy, + .dump = nft_synproxy_dump, + .type = &nft_synproxy_type, + .validate = nft_synproxy_validate, +}; + +static struct nft_expr_type nft_synproxy_type __read_mostly = { + .ops = &nft_synproxy_ops, + .name = "synproxy", + .owner = THIS_MODULE, + .policy = nft_synproxy_policy, + .maxattr = NFTA_SYNPROXY_MAX, +}; + +static int __init nft_synproxy_module_init(void) +{ + return nft_register_expr(&nft_synproxy_type); +} + +static void __exit nft_synproxy_module_exit(void) +{ + return nft_unregister_expr(&nft_synproxy_type); +} + +module_init(nft_synproxy_module_init); +module_exit(nft_synproxy_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Fernando Fernandez "); +MODULE_ALIAS_NFT_EXPR("synproxy"); -- cgit v1.2.3 From 6aedd14b25dbcf6cfdf2da8569153d45b3e5d9fd Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Wed, 3 Jul 2019 21:38:09 +0300 Subject: ipvs: strip gre tunnel headers from icmp errors Recognize GRE tunnels in received ICMP errors and properly strip the tunnel headers. Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_core.c | 46 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index e8651fd621ef..dd4727a5d6ec 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -35,6 +35,7 @@ #include #include /* for icmp_send */ #include +#include #include #include #include /* net_generic() */ @@ -1610,6 +1611,38 @@ unk: return 0; } +/* Check the GRE tunnel and return its header length */ +static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, + unsigned int offset, __u16 af, + const union nf_inet_addr *daddr, __u8 *proto) +{ + struct gre_base_hdr _greh, *greh; + struct ip_vs_dest *dest; + + greh = skb_header_pointer(skb, offset, sizeof(_greh), &_greh); + if (!greh) + goto unk; + dest = ip_vs_find_tunnel(ipvs, af, daddr, 0); + if (!dest) + goto unk; + if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { + __be16 type; + + /* Only support version 0 and C (csum) */ + if ((greh->flags & ~GRE_CSUM) != 0) + goto unk; + type = greh->protocol; + /* Later we can support also IPPROTO_IPV6 */ + if (type != htons(ETH_P_IP)) + goto unk; + *proto = IPPROTO_IPIP; + return gre_calc_hlen(gre_flags_to_tnl_flags(greh->flags)); + } + +unk: + return 0; +} + /* * Handle ICMP messages in the outside-to-inside direction (incoming). * Find any that might be relevant, check against existing connections, @@ -1689,7 +1722,8 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ ipip = true; - } else if (cih->protocol == IPPROTO_UDP && /* Can be UDP encap */ + } else if ((cih->protocol == IPPROTO_UDP || /* Can be UDP encap */ + cih->protocol == IPPROTO_GRE) && /* Can be GRE encap */ /* Error for our tunnel must arrive at LOCAL_IN */ (skb_rtable(skb)->rt_flags & RTCF_LOCAL)) { __u8 iproto; @@ -1699,10 +1733,14 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, if (unlikely(cih->frag_off & htons(IP_OFFSET))) return NF_ACCEPT; offset2 = offset + cih->ihl * 4; - ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET, raddr, - &iproto); + if (cih->protocol == IPPROTO_UDP) + ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET, + raddr, &iproto); + else + ulen = ipvs_gre_decap(ipvs, skb, offset2, AF_INET, + raddr, &iproto); if (ulen > 0) { - /* Skip IP and UDP tunnel headers */ + /* Skip IP and UDP/GRE tunnel headers */ offset = offset2 + ulen; /* Now we should be at the original IP header */ cih = skb_header_pointer(skb, offset, sizeof(_ciph), -- cgit v1.2.3 From 30e103fe24debce6f35f2e53cc763ed7be292df3 Mon Sep 17 00:00:00 2001 From: wenxu Date: Fri, 5 Jul 2019 21:16:32 +0800 Subject: netfilter: nft_meta: move bridge meta keys into nft_meta_bridge Separate bridge meta key from nft_meta to meta_bridge to avoid a dependency between the bridge module and nft_meta when using the bridge API available through include/linux/if_bridge.h Signed-off-by: wenxu Reviewed-by: Nikolay Aleksandrov Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_meta.h | 44 ++++++++++++ net/bridge/netfilter/Kconfig | 6 ++ net/bridge/netfilter/Makefile | 1 + net/bridge/netfilter/nft_meta_bridge.c | 127 +++++++++++++++++++++++++++++++++ net/netfilter/nf_tables_core.c | 1 + net/netfilter/nft_meta.c | 81 ++++++++------------- 6 files changed, 207 insertions(+), 53 deletions(-) create mode 100644 include/net/netfilter/nft_meta.h create mode 100644 net/bridge/netfilter/nft_meta_bridge.c (limited to 'net/netfilter') diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h new file mode 100644 index 000000000000..5c69e9b09388 --- /dev/null +++ b/include/net/netfilter/nft_meta.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NFT_META_H_ +#define _NFT_META_H_ + +struct nft_meta { + enum nft_meta_keys key:8; + union { + enum nft_registers dreg:8; + enum nft_registers sreg:8; + }; +}; + +extern const struct nla_policy nft_meta_policy[]; + +int nft_meta_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]); + +int nft_meta_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]); + +int nft_meta_get_dump(struct sk_buff *skb, + const struct nft_expr *expr); + +int nft_meta_set_dump(struct sk_buff *skb, + const struct nft_expr *expr); + +void nft_meta_get_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt); + +void nft_meta_set_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt); + +void nft_meta_set_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr); + +int nft_meta_set_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data); + +#endif diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index f4fb0b9b927d..fbc708508360 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -9,6 +9,12 @@ menuconfig NF_TABLES_BRIDGE bool "Ethernet Bridge nf_tables support" if NF_TABLES_BRIDGE + +config NFT_BRIDGE_META + tristate "Netfilter nf_table bridge meta support" + help + Add support for bridge dedicated meta key. + config NFT_BRIDGE_REJECT tristate "Netfilter nf_tables bridge reject support" depends on NFT_REJECT && NFT_REJECT_IPV4 && NFT_REJECT_IPV6 diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile index 9d7767322a64..8e2c5759d964 100644 --- a/net/bridge/netfilter/Makefile +++ b/net/bridge/netfilter/Makefile @@ -3,6 +3,7 @@ # Makefile for the netfilter modules for Link Layer filtering on a bridge. # +obj-$(CONFIG_NFT_BRIDGE_META) += nft_meta_bridge.o obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o # connection tracking diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c new file mode 100644 index 000000000000..dde8651254ac --- /dev/null +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../br_private.h" + +static void nft_meta_bridge_get_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); + u32 *dest = ®s->data[priv->dreg]; + const struct net_bridge_port *p; + + switch (priv->key) { + case NFT_META_BRI_IIFNAME: + if (in == NULL || (p = br_port_get_rcu(in)) == NULL) + goto err; + break; + case NFT_META_BRI_OIFNAME: + if (out == NULL || (p = br_port_get_rcu(out)) == NULL) + goto err; + break; + default: + goto out; + } + + strncpy((char *)dest, p->br->dev->name, IFNAMSIZ); + return; +out: + return nft_meta_get_eval(expr, regs, pkt); +err: + regs->verdict.code = NFT_BREAK; +} + +static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_meta *priv = nft_expr_priv(expr); + unsigned int len; + + priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); + switch (priv->key) { + case NFT_META_BRI_IIFNAME: + case NFT_META_BRI_OIFNAME: + len = IFNAMSIZ; + break; + default: + return nft_meta_get_init(ctx, expr, tb); + } + + priv->dreg = nft_parse_register(tb[NFTA_META_DREG]); + return nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, len); +} + +static struct nft_expr_type nft_meta_bridge_type; +static const struct nft_expr_ops nft_meta_bridge_get_ops = { + .type = &nft_meta_bridge_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), + .eval = nft_meta_bridge_get_eval, + .init = nft_meta_bridge_get_init, + .dump = nft_meta_get_dump, +}; + +static const struct nft_expr_ops nft_meta_bridge_set_ops = { + .type = &nft_meta_bridge_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), + .eval = nft_meta_set_eval, + .init = nft_meta_set_init, + .destroy = nft_meta_set_destroy, + .dump = nft_meta_set_dump, + .validate = nft_meta_set_validate, +}; + +static const struct nft_expr_ops * +nft_meta_bridge_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (tb[NFTA_META_KEY] == NULL) + return ERR_PTR(-EINVAL); + + if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG]) + return ERR_PTR(-EINVAL); + + if (tb[NFTA_META_DREG]) + return &nft_meta_bridge_get_ops; + + if (tb[NFTA_META_SREG]) + return &nft_meta_bridge_set_ops; + + return ERR_PTR(-EINVAL); +} + +static struct nft_expr_type nft_meta_bridge_type __read_mostly = { + .family = NFPROTO_BRIDGE, + .name = "meta", + .select_ops = nft_meta_bridge_select_ops, + .policy = nft_meta_policy, + .maxattr = NFTA_META_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_meta_bridge_module_init(void) +{ + return nft_register_expr(&nft_meta_bridge_type); +} + +static void __exit nft_meta_bridge_module_exit(void) +{ + nft_unregister_expr(&nft_meta_bridge_type); +} + +module_init(nft_meta_bridge_module_init); +module_exit(nft_meta_bridge_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("wenxu "); +MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "meta"); diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index b950cd31348b..96c74c4c7176 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -19,6 +19,7 @@ #include #include #include +#include static noinline void __nft_trace_packet(struct nft_traceinfo *info, const struct nft_chain *chain, diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index a54329b8634a..18a848b01759 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -21,23 +21,12 @@ #include /* for TCP_TIME_WAIT */ #include #include +#include #include /* NF_BR_PRE_ROUTING */ -struct nft_meta { - enum nft_meta_keys key:8; - union { - enum nft_registers dreg:8; - enum nft_registers sreg:8; - }; -}; - static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state); -#ifdef CONFIG_NF_TABLES_BRIDGE -#include "../bridge/br_private.h" -#endif - void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -47,9 +36,6 @@ void nft_meta_get_eval(const struct nft_expr *expr, const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); struct sock *sk; u32 *dest = ®s->data[priv->dreg]; -#ifdef CONFIG_NF_TABLES_BRIDGE - const struct net_bridge_port *p; -#endif switch (priv->key) { case NFT_META_LEN: @@ -228,18 +214,6 @@ void nft_meta_get_eval(const struct nft_expr *expr, case NFT_META_SECPATH: nft_reg_store8(dest, secpath_exists(skb)); break; -#endif -#ifdef CONFIG_NF_TABLES_BRIDGE - case NFT_META_BRI_IIFNAME: - if (in == NULL || (p = br_port_get_rcu(in)) == NULL) - goto err; - strncpy((char *)dest, p->br->dev->name, IFNAMSIZ); - return; - case NFT_META_BRI_OIFNAME: - if (out == NULL || (p = br_port_get_rcu(out)) == NULL) - goto err; - strncpy((char *)dest, p->br->dev->name, IFNAMSIZ); - return; #endif case NFT_META_IIFKIND: if (in == NULL || in->rtnl_link_ops == NULL) @@ -260,10 +234,11 @@ void nft_meta_get_eval(const struct nft_expr *expr, err: regs->verdict.code = NFT_BREAK; } +EXPORT_SYMBOL_GPL(nft_meta_get_eval); -static void nft_meta_set_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +void nft_meta_set_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { const struct nft_meta *meta = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; @@ -300,16 +275,18 @@ static void nft_meta_set_eval(const struct nft_expr *expr, WARN_ON(1); } } +EXPORT_SYMBOL_GPL(nft_meta_set_eval); -static const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { +const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { [NFTA_META_DREG] = { .type = NLA_U32 }, [NFTA_META_KEY] = { .type = NLA_U32 }, [NFTA_META_SREG] = { .type = NLA_U32 }, }; +EXPORT_SYMBOL_GPL(nft_meta_policy); -static int nft_meta_get_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +int nft_meta_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; @@ -359,14 +336,6 @@ static int nft_meta_get_init(const struct nft_ctx *ctx, case NFT_META_SECPATH: len = sizeof(u8); break; -#endif -#ifdef CONFIG_NF_TABLES_BRIDGE - case NFT_META_BRI_IIFNAME: - case NFT_META_BRI_OIFNAME: - if (ctx->family != NFPROTO_BRIDGE) - return -EOPNOTSUPP; - len = IFNAMSIZ; - break; #endif default: return -EOPNOTSUPP; @@ -376,6 +345,7 @@ static int nft_meta_get_init(const struct nft_ctx *ctx, return nft_validate_register_store(ctx, priv->dreg, NULL, NFT_DATA_VALUE, len); } +EXPORT_SYMBOL_GPL(nft_meta_get_init); static int nft_meta_get_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, @@ -409,9 +379,9 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx, #endif } -static int nft_meta_set_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) +int nft_meta_set_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int hooks; @@ -437,10 +407,11 @@ static int nft_meta_set_validate(const struct nft_ctx *ctx, return nft_chain_validate_hooks(ctx->chain, hooks); } +EXPORT_SYMBOL_GPL(nft_meta_set_validate); -static int nft_meta_set_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +int nft_meta_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; @@ -475,9 +446,10 @@ static int nft_meta_set_init(const struct nft_ctx *ctx, return 0; } +EXPORT_SYMBOL_GPL(nft_meta_set_init); -static int nft_meta_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) +int nft_meta_get_dump(struct sk_buff *skb, + const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -490,8 +462,9 @@ static int nft_meta_get_dump(struct sk_buff *skb, nla_put_failure: return -1; } +EXPORT_SYMBOL_GPL(nft_meta_get_dump); -static int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr) +int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -505,15 +478,17 @@ static int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr) nla_put_failure: return -1; } +EXPORT_SYMBOL_GPL(nft_meta_set_dump); -static void nft_meta_set_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr) +void nft_meta_set_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); if (priv->key == NFT_META_NFTRACE) static_branch_dec(&nft_trace_enabled); } +EXPORT_SYMBOL_GPL(nft_meta_set_destroy); static const struct nft_expr_ops nft_meta_get_ops = { .type = &nft_meta_type, -- cgit v1.2.3 From b9c04ae7907f09c5e873e7c9a8feea2ce41e15b3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 5 Jul 2019 23:38:46 +0200 Subject: netfilter: nf_tables: add nft_expr_type_request_module() This helper function makes sure the family specific extension is loaded. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index cae5c46e2dd4..582f4e475d67 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2019,6 +2019,19 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family, return NULL; } +#ifdef CONFIG_MODULES +static int nft_expr_type_request_module(struct net *net, u8 family, + struct nlattr *nla) +{ + nft_request_module(net, "nft-expr-%u-%.*s", family, + nla_len(nla), (char *)nla_data(nla)); + if (__nft_expr_type_get(family, nla)) + return -EAGAIN; + + return 0; +} +#endif + static const struct nft_expr_type *nft_expr_type_get(struct net *net, u8 family, struct nlattr *nla) @@ -2035,9 +2048,7 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net, lockdep_nfnl_nft_mutex_not_held(); #ifdef CONFIG_MODULES if (type == NULL) { - nft_request_module(net, "nft-expr-%u-%.*s", family, - nla_len(nla), (char *)nla_data(nla)); - if (__nft_expr_type_get(family, nla)) + if (nft_expr_type_request_module(net, family, nla) == -EAGAIN) return ERR_PTR(-EAGAIN); nft_request_module(net, "nft-expr-%.*s", -- cgit v1.2.3 From 9cff126f73a7025bcb0883189b2bed90010a57d4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 5 Jul 2019 22:59:05 +0200 Subject: netfilter: nf_tables: __nft_expr_type_get() selects specific family type In case that there are two types, prefer the family specify extension. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 582f4e475d67..5e97bf64975a 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2009,14 +2009,17 @@ EXPORT_SYMBOL_GPL(nft_unregister_expr); static const struct nft_expr_type *__nft_expr_type_get(u8 family, struct nlattr *nla) { - const struct nft_expr_type *type; + const struct nft_expr_type *type, *candidate = NULL; list_for_each_entry(type, &nf_tables_expressions, list) { - if (!nla_strcmp(nla, type->name) && - (!type->family || type->family == family)) - return type; + if (!nla_strcmp(nla, type->name)) { + if (!type->family && !candidate) + candidate = type; + else if (type->family == family) + candidate = type; + } } - return NULL; + return candidate; } #ifdef CONFIG_MODULES -- cgit v1.2.3 From 0ef1efd1354d732d040f29b2005420f83fcdd8f4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 5 Jul 2019 23:38:54 +0200 Subject: netfilter: nf_tables: force module load in case select_ops() returns -EAGAIN nft_meta needs to pull in the nft_meta_bridge module in case that this is a bridge family rule from the select_ops() path. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 ++++++ net/netfilter/nft_meta.c | 4 ++++ 2 files changed, 10 insertions(+) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 5e97bf64975a..d22d00ca78c1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2144,6 +2144,12 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, (const struct nlattr * const *)info->tb); if (IS_ERR(ops)) { err = PTR_ERR(ops); +#ifdef CONFIG_MODULES + if (err == -EAGAIN) + nft_expr_type_request_module(ctx->net, + ctx->family, + tb[NFTA_EXPR_NAME]); +#endif goto err1; } } else diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 18a848b01759..417f8d32e9a3 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -519,6 +519,10 @@ nft_meta_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG]) return ERR_PTR(-EINVAL); +#ifdef CONFIG_NF_TABLES_BRIDGE + if (ctx->family == NFPROTO_BRIDGE) + return ERR_PTR(-EAGAIN); +#endif if (tb[NFTA_META_DREG]) return &nft_meta_get_ops; -- cgit v1.2.3 From c9626a2cbdb20e26587b3fad99960520a023432b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Jul 2019 23:00:43 +0200 Subject: netfilter: nf_tables: add hardware offload support This patch adds hardware offload support for nftables through the existing netdev_ops->ndo_setup_tc() interface, the TC_SETUP_CLSFLOWER classifier and the flow rule API. This hardware offload support is available for the NFPROTO_NETDEV family and the ingress hook. Each nftables expression has a new ->offload interface, that is used to populate the flow rule object that is attached to the transaction object. There is a new per-table NFT_TABLE_F_HW flag, that is set on to offload an entire table, including all of its chains. This patch supports for basic metadata (layer 3 and 4 protocol numbers), 5-tuple payload matching and the accept/drop actions; this also includes basechain hardware offload only. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_tables.h | 14 ++ include/net/netfilter/nf_tables_offload.h | 76 +++++++++ include/uapi/linux/netfilter/nf_tables.h | 2 + net/netfilter/Makefile | 2 +- net/netfilter/nf_tables_api.c | 39 ++++- net/netfilter/nf_tables_offload.c | 267 ++++++++++++++++++++++++++++++ net/netfilter/nft_cmp.c | 53 ++++++ net/netfilter/nft_immediate.c | 31 ++++ net/netfilter/nft_meta.c | 27 +++ net/netfilter/nft_payload.c | 187 +++++++++++++++++++++ 10 files changed, 691 insertions(+), 7 deletions(-) create mode 100644 include/net/netfilter/nf_tables_offload.h create mode 100644 net/netfilter/nf_tables_offload.c (limited to 'net/netfilter') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 9e8493aad49d..35dfdd9f69b3 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -161,6 +161,7 @@ struct nft_ctx { const struct nlattr * const *nla; u32 portid; u32 seq; + u16 flags; u8 family; u8 level; bool report; @@ -735,6 +736,9 @@ enum nft_trans_phase { NFT_TRANS_RELEASE }; +struct nft_flow_rule; +struct nft_offload_ctx; + /** * struct nft_expr_ops - nf_tables expression operations * @@ -777,6 +781,10 @@ struct nft_expr_ops { const struct nft_data **data); bool (*gc)(struct net *net, const struct nft_expr *expr); + int (*offload)(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr); + u32 offload_flags; const struct nft_expr_type *type; void *data; }; @@ -859,6 +867,7 @@ static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) enum nft_chain_flags { NFT_BASE_CHAIN = 0x1, + NFT_CHAIN_HW_OFFLOAD = 0x2, }; /** @@ -942,6 +951,7 @@ struct nft_stats { * @stats: per-cpu chain stats * @chain: the chain * @dev_name: device name that this base chain is attached to (if any) + * @cb_list: list of flow block callbacks (for hardware offload) */ struct nft_base_chain { struct nf_hook_ops ops; @@ -951,6 +961,7 @@ struct nft_base_chain { struct nft_stats __percpu *stats; struct nft_chain chain; char dev_name[IFNAMSIZ]; + struct list_head cb_list; }; static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chain) @@ -1322,11 +1333,14 @@ struct nft_trans { struct nft_trans_rule { struct nft_rule *rule; + struct nft_flow_rule *flow; u32 rule_id; }; #define nft_trans_rule(trans) \ (((struct nft_trans_rule *)trans->data)->rule) +#define nft_trans_flow_rule(trans) \ + (((struct nft_trans_rule *)trans->data)->flow) #define nft_trans_rule_id(trans) \ (((struct nft_trans_rule *)trans->data)->rule_id) diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h new file mode 100644 index 000000000000..3196663a10e3 --- /dev/null +++ b/include/net/netfilter/nf_tables_offload.h @@ -0,0 +1,76 @@ +#ifndef _NET_NF_TABLES_OFFLOAD_H +#define _NET_NF_TABLES_OFFLOAD_H + +#include +#include + +struct nft_offload_reg { + u32 key; + u32 len; + u32 base_offset; + u32 offset; + struct nft_data mask; +}; + +enum nft_offload_dep_type { + NFT_OFFLOAD_DEP_UNSPEC = 0, + NFT_OFFLOAD_DEP_NETWORK, + NFT_OFFLOAD_DEP_TRANSPORT, +}; + +struct nft_offload_ctx { + struct { + enum nft_offload_dep_type type; + __be16 l3num; + u8 protonum; + } dep; + unsigned int num_actions; + struct nft_offload_reg regs[NFT_REG32_15 + 1]; +}; + +void nft_offload_set_dependency(struct nft_offload_ctx *ctx, + enum nft_offload_dep_type type); +void nft_offload_update_dependency(struct nft_offload_ctx *ctx, + const void *data, u32 len); + +struct nft_flow_key { + struct flow_dissector_key_basic basic; + union { + struct flow_dissector_key_ipv4_addrs ipv4; + struct flow_dissector_key_ipv6_addrs ipv6; + }; + struct flow_dissector_key_ports tp; + struct flow_dissector_key_ip ip; + struct flow_dissector_key_vlan vlan; + struct flow_dissector_key_eth_addrs eth_addrs; +} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ + +struct nft_flow_match { + struct flow_dissector dissector; + struct nft_flow_key key; + struct nft_flow_key mask; +}; + +struct nft_flow_rule { + __be16 proto; + struct nft_flow_match match; + struct flow_rule *rule; +}; + +#define NFT_OFFLOAD_F_ACTION (1 << 0) + +struct nft_rule; +struct nft_flow_rule *nft_flow_rule_create(const struct nft_rule *rule); +void nft_flow_rule_destroy(struct nft_flow_rule *flow); +int nft_flow_rule_offload_commit(struct net *net); + +#define NFT_OFFLOAD_MATCH(__key, __base, __field, __len, __reg) \ + (__reg)->base_offset = \ + offsetof(struct nft_flow_key, __base); \ + (__reg)->offset = \ + offsetof(struct nft_flow_key, __base.__field); \ + (__reg)->len = __len; \ + (__reg)->key = __key; \ + memset(&(__reg)->mask, 0xff, (__reg)->len); + +#endif diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 0e3462dfb182..82abaa183fc3 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -192,6 +192,7 @@ enum nft_table_attributes { * @NFTA_CHAIN_USE: number of references to this chain (NLA_U32) * @NFTA_CHAIN_TYPE: type name of the string (NLA_NUL_STRING) * @NFTA_CHAIN_COUNTERS: counter specification of the chain (NLA_NESTED: nft_counter_attributes) + * @NFTA_CHAIN_FLAGS: chain flags */ enum nft_chain_attributes { NFTA_CHAIN_UNSPEC, @@ -204,6 +205,7 @@ enum nft_chain_attributes { NFTA_CHAIN_TYPE, NFTA_CHAIN_COUNTERS, NFTA_CHAIN_PAD, + NFTA_CHAIN_FLAGS, __NFTA_CHAIN_MAX }; #define NFTA_CHAIN_MAX (__NFTA_CHAIN_MAX - 1) diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index deada20975ff..9270a7fae484 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -78,7 +78,7 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o \ - nft_chain_route.o + nft_chain_route.o nf_tables_offload.o nf_tables_set-objs := nf_tables_set_core.o \ nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d22d00ca78c1..ed17a7c29b86 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -97,6 +98,7 @@ static void nft_ctx_init(struct nft_ctx *ctx, ctx->nla = nla; ctx->portid = NETLINK_CB(skb).portid; ctx->report = nlmsg_report(nlh); + ctx->flags = nlh->nlmsg_flags; ctx->seq = nlh->nlmsg_seq; } @@ -1169,6 +1171,7 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { [NFTA_CHAIN_POLICY] = { .type = NLA_U32 }, [NFTA_CHAIN_TYPE] = { .type = NLA_STRING }, [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED }, + [NFTA_CHAIN_FLAGS] = { .type = NLA_U32 }, }; static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = { @@ -1603,7 +1606,7 @@ static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *cha } static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, - u8 policy) + u8 policy, u32 flags) { const struct nlattr * const *nla = ctx->nla; struct nft_table *table = ctx->table; @@ -1657,8 +1660,9 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, ops->hook = hook.type->hooks[ops->hooknum]; ops->dev = hook.dev; - chain->flags |= NFT_BASE_CHAIN; + chain->flags |= NFT_BASE_CHAIN | flags; basechain->policy = NF_ACCEPT; + INIT_LIST_HEAD(&basechain->cb_list); } else { chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (chain == NULL) @@ -1718,7 +1722,8 @@ err1: return err; } -static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy) +static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, + u32 flags) { const struct nlattr * const *nla = ctx->nla; struct nft_table *table = ctx->table; @@ -1730,6 +1735,9 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy) struct nft_trans *trans; int err; + if (chain->flags ^ flags) + return -EOPNOTSUPP; + if (nla[NFTA_CHAIN_HOOK]) { if (!nft_is_base_chain(chain)) return -EBUSY; @@ -1835,6 +1843,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, u8 policy = NF_ACCEPT; struct nft_ctx ctx; u64 handle = 0; + u32 flags = 0; lockdep_assert_held(&net->nft.commit_mutex); @@ -1889,6 +1898,9 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, } } + if (nla[NFTA_CHAIN_FLAGS]) + flags = ntohl(nla_get_be32(nla[NFTA_CHAIN_FLAGS])); + nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); if (chain != NULL) { @@ -1899,10 +1911,10 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - return nf_tables_updchain(&ctx, genmask, policy); + return nf_tables_updchain(&ctx, genmask, policy, flags); } - return nf_tables_addchain(&ctx, family, genmask, policy); + return nf_tables_addchain(&ctx, family, genmask, policy, flags); } static int nf_tables_delchain(struct net *net, struct sock *nlsk, @@ -2658,6 +2670,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, u8 genmask = nft_genmask_next(net); struct nft_expr_info *info = NULL; int family = nfmsg->nfgen_family; + struct nft_flow_rule *flow; struct nft_table *table; struct nft_chain *chain; struct nft_rule *rule, *old_rule = NULL; @@ -2804,7 +2817,8 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, list_add_tail_rcu(&rule->list, &old_rule->list); } else { - if (nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule) == NULL) { + trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); + if (!trans) { err = -ENOMEM; goto err2; } @@ -2827,6 +2841,14 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, if (net->nft.validate_state == NFT_VALIDATE_DO) return nft_table_validate(net, table); + if (chain->flags & NFT_CHAIN_HW_OFFLOAD) { + flow = nft_flow_rule_create(rule); + if (IS_ERR(flow)) + return PTR_ERR(flow); + + nft_trans_flow_rule(trans) = flow; + } + return 0; err2: nf_tables_rule_release(&ctx, rule); @@ -6624,6 +6646,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) struct nft_trans_elem *te; struct nft_chain *chain; struct nft_table *table; + int err; if (list_empty(&net->nft.commit_list)) { mutex_unlock(&net->nft.commit_mutex); @@ -6634,6 +6657,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) if (nf_tables_validate(net) < 0) return -EAGAIN; + err = nft_flow_rule_offload_commit(net); + if (err < 0) + return err; + /* 1. Allocate space for next generation rules_gen_X[] */ list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { int ret; diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c new file mode 100644 index 000000000000..2c3302845f67 --- /dev/null +++ b/net/netfilter/nf_tables_offload.c @@ -0,0 +1,267 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include +#include +#include +#include + +static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions) +{ + struct nft_flow_rule *flow; + + flow = kzalloc(sizeof(struct nft_flow_rule), GFP_KERNEL); + if (!flow) + return NULL; + + flow->rule = flow_rule_alloc(num_actions); + if (!flow->rule) { + kfree(flow); + return NULL; + } + + flow->rule->match.dissector = &flow->match.dissector; + flow->rule->match.mask = &flow->match.mask; + flow->rule->match.key = &flow->match.key; + + return flow; +} + +struct nft_flow_rule *nft_flow_rule_create(const struct nft_rule *rule) +{ + struct nft_offload_ctx ctx = { + .dep = { + .type = NFT_OFFLOAD_DEP_UNSPEC, + }, + }; + struct nft_flow_rule *flow; + int num_actions = 0, err; + struct nft_expr *expr; + + expr = nft_expr_first(rule); + while (expr->ops && expr != nft_expr_last(rule)) { + if (expr->ops->offload_flags & NFT_OFFLOAD_F_ACTION) + num_actions++; + + expr = nft_expr_next(expr); + } + + flow = nft_flow_rule_alloc(num_actions); + if (!flow) + return ERR_PTR(-ENOMEM); + + expr = nft_expr_first(rule); + while (expr->ops && expr != nft_expr_last(rule)) { + if (!expr->ops->offload) { + err = -EOPNOTSUPP; + goto err_out; + } + err = expr->ops->offload(&ctx, flow, expr); + if (err < 0) + goto err_out; + + expr = nft_expr_next(expr); + } + flow->proto = ctx.dep.l3num; + + return flow; +err_out: + nft_flow_rule_destroy(flow); + + return ERR_PTR(err); +} + +void nft_flow_rule_destroy(struct nft_flow_rule *flow) +{ + kfree(flow->rule); + kfree(flow); +} + +void nft_offload_set_dependency(struct nft_offload_ctx *ctx, + enum nft_offload_dep_type type) +{ + ctx->dep.type = type; +} + +void nft_offload_update_dependency(struct nft_offload_ctx *ctx, + const void *data, u32 len) +{ + switch (ctx->dep.type) { + case NFT_OFFLOAD_DEP_NETWORK: + WARN_ON(len != sizeof(__u16)); + memcpy(&ctx->dep.l3num, data, sizeof(__u16)); + break; + case NFT_OFFLOAD_DEP_TRANSPORT: + WARN_ON(len != sizeof(__u8)); + memcpy(&ctx->dep.protonum, data, sizeof(__u8)); + break; + default: + break; + } + ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC; +} + +static void nft_flow_offload_common_init(struct flow_cls_common_offload *common, + __be16 proto, + struct netlink_ext_ack *extack) +{ + common->protocol = proto; + common->extack = extack; +} + +static int nft_setup_cb_call(struct nft_base_chain *basechain, + enum tc_setup_type type, void *type_data) +{ + struct flow_block_cb *block_cb; + int err; + + list_for_each_entry(block_cb, &basechain->cb_list, list) { + err = block_cb->cb(type, type_data, block_cb->cb_priv); + if (err < 0) + return err; + } + return 0; +} + +static int nft_flow_offload_rule(struct nft_trans *trans, + enum flow_cls_command command) +{ + struct nft_flow_rule *flow = nft_trans_flow_rule(trans); + struct nft_rule *rule = nft_trans_rule(trans); + struct flow_cls_offload cls_flow = {}; + struct nft_base_chain *basechain; + struct netlink_ext_ack extack; + __be16 proto = ETH_P_ALL; + + if (!nft_is_base_chain(trans->ctx.chain)) + return -EOPNOTSUPP; + + basechain = nft_base_chain(trans->ctx.chain); + + if (flow) + proto = flow->proto; + + nft_flow_offload_common_init(&cls_flow.common, proto, &extack); + cls_flow.command = command; + cls_flow.cookie = (unsigned long) rule; + if (flow) + cls_flow.rule = flow->rule; + + return nft_setup_cb_call(basechain, TC_SETUP_CLSFLOWER, &cls_flow); +} + +static int nft_flow_offload_bind(struct flow_block_offload *bo, + struct nft_base_chain *basechain) +{ + list_splice(&bo->cb_list, &basechain->cb_list); + return 0; +} + +static int nft_flow_offload_unbind(struct flow_block_offload *bo, + struct nft_base_chain *basechain) +{ + struct flow_block_cb *block_cb, *next; + + list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { + list_del(&block_cb->list); + flow_block_cb_free(block_cb); + } + + return 0; +} + +#define FLOW_SETUP_BLOCK TC_SETUP_BLOCK + +static int nft_flow_offload_chain(struct nft_trans *trans, + enum flow_block_command cmd) +{ + struct nft_chain *chain = trans->ctx.chain; + struct netlink_ext_ack extack = {}; + struct flow_block_offload bo = {}; + struct nft_base_chain *basechain; + struct net_device *dev; + int err; + + if (!nft_is_base_chain(chain)) + return -EOPNOTSUPP; + + basechain = nft_base_chain(chain); + dev = basechain->ops.dev; + if (!dev || !dev->netdev_ops->ndo_setup_tc) + return -EOPNOTSUPP; + + /* Only default policy to accept is supported for now. */ + if (cmd == FLOW_BLOCK_BIND && + nft_trans_chain_policy(trans) != -1 && + nft_trans_chain_policy(trans) != NF_ACCEPT) + return -EOPNOTSUPP; + + bo.command = cmd; + bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; + bo.extack = &extack; + INIT_LIST_HEAD(&bo.cb_list); + + err = dev->netdev_ops->ndo_setup_tc(dev, FLOW_SETUP_BLOCK, &bo); + if (err < 0) + return err; + + switch (cmd) { + case FLOW_BLOCK_BIND: + err = nft_flow_offload_bind(&bo, basechain); + break; + case FLOW_BLOCK_UNBIND: + err = nft_flow_offload_unbind(&bo, basechain); + break; + } + + return err; +} + +int nft_flow_rule_offload_commit(struct net *net) +{ + struct nft_trans *trans; + int err = 0; + + list_for_each_entry(trans, &net->nft.commit_list, list) { + if (trans->ctx.family != NFPROTO_NETDEV) + continue; + + switch (trans->msg_type) { + case NFT_MSG_NEWCHAIN: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + err = nft_flow_offload_chain(trans, FLOW_BLOCK_BIND); + break; + case NFT_MSG_DELCHAIN: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + err = nft_flow_offload_chain(trans, FLOW_BLOCK_UNBIND); + break; + case NFT_MSG_NEWRULE: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + if (trans->ctx.flags & NLM_F_REPLACE || + !(trans->ctx.flags & NLM_F_APPEND)) + return -EOPNOTSUPP; + + err = nft_flow_offload_rule(trans, FLOW_CLS_REPLACE); + nft_flow_rule_destroy(nft_trans_flow_rule(trans)); + break; + case NFT_MSG_DELRULE: + if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) + continue; + + err = nft_flow_offload_rule(trans, FLOW_CLS_DESTROY); + break; + } + + if (err) + return err; + } + + return err; +} diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 411c0cf741e3..bd173b1824c6 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -12,6 +12,7 @@ #include #include #include +#include #include struct nft_cmp_expr { @@ -107,12 +108,44 @@ nla_put_failure: return -1; } +static int __nft_cmp_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_cmp_expr *priv) +{ + struct nft_offload_reg *reg = &ctx->regs[priv->sreg]; + u8 *mask = (u8 *)&flow->match.mask; + u8 *key = (u8 *)&flow->match.key; + + if (priv->op != NFT_CMP_EQ) + return -EOPNOTSUPP; + + memcpy(key + reg->offset, &priv->data, priv->len); + memcpy(mask + reg->offset, ®->mask, priv->len); + + flow->match.dissector.used_keys |= BIT(reg->key); + flow->match.dissector.offset[reg->key] = reg->base_offset; + + nft_offload_update_dependency(ctx, &priv->data, priv->len); + + return 0; +} + +static int nft_cmp_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_cmp_expr *priv = nft_expr_priv(expr); + + return __nft_cmp_offload(ctx, flow, priv); +} + static const struct nft_expr_ops nft_cmp_ops = { .type = &nft_cmp_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp_expr)), .eval = nft_cmp_eval, .init = nft_cmp_init, .dump = nft_cmp_dump, + .offload = nft_cmp_offload, }; static int nft_cmp_fast_init(const struct nft_ctx *ctx, @@ -143,6 +176,25 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx, return 0; } +static int nft_cmp_fast_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); + struct nft_cmp_expr cmp = { + .data = { + .data = { + [0] = priv->data, + }, + }, + .sreg = priv->sreg, + .len = priv->len / BITS_PER_BYTE, + .op = NFT_CMP_EQ, + }; + + return __nft_cmp_offload(ctx, flow, &cmp); +} + static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); @@ -169,6 +221,7 @@ const struct nft_expr_ops nft_cmp_fast_ops = { .eval = NULL, /* inlined */ .init = nft_cmp_fast_init, .dump = nft_cmp_fast_dump, + .offload = nft_cmp_fast_offload, }; static const struct nft_expr_ops * diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index cb8547f97220..ca2ae4b95a8d 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -13,6 +13,7 @@ #include #include #include +#include void nft_immediate_eval(const struct nft_expr *expr, struct nft_regs *regs, @@ -124,6 +125,34 @@ static int nft_immediate_validate(const struct nft_ctx *ctx, return 0; } +static int nft_immediate_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + struct flow_action_entry *entry; + const struct nft_data *data; + + if (priv->dreg != NFT_REG_VERDICT) + return -EOPNOTSUPP; + + entry = &flow->rule->action.entries[ctx->num_actions++]; + + data = &priv->data; + switch (data->verdict.code) { + case NF_ACCEPT: + entry->id = FLOW_ACTION_ACCEPT; + break; + case NF_DROP: + entry->id = FLOW_ACTION_DROP; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + static const struct nft_expr_ops nft_imm_ops = { .type = &nft_imm_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)), @@ -133,6 +162,8 @@ static const struct nft_expr_ops nft_imm_ops = { .deactivate = nft_immediate_deactivate, .dump = nft_immediate_dump, .validate = nft_immediate_validate, + .offload = nft_immediate_offload, + .offload_flags = NFT_OFFLOAD_F_ACTION, }; struct nft_expr_type nft_imm_type __read_mostly = { diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 417f8d32e9a3..76866f77e343 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -22,6 +22,7 @@ #include #include #include +#include #include /* NF_BR_PRE_ROUTING */ @@ -490,6 +491,31 @@ void nft_meta_set_destroy(const struct nft_ctx *ctx, } EXPORT_SYMBOL_GPL(nft_meta_set_destroy); +static int nft_meta_get_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; + + switch (priv->key) { + case NFT_META_PROTOCOL: + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, n_proto, + sizeof(__u16), reg); + nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); + break; + case NFT_META_L4PROTO: + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, + sizeof(__u8), reg); + nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + static const struct nft_expr_ops nft_meta_get_ops = { .type = &nft_meta_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), @@ -497,6 +523,7 @@ static const struct nft_expr_ops nft_meta_get_ops = { .init = nft_meta_get_init, .dump = nft_meta_get_dump, .validate = nft_meta_get_validate, + .offload = nft_meta_get_offload, }; static const struct nft_expr_ops nft_meta_set_ops = { diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 1260f78a034d..22a80eb60222 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -15,10 +15,13 @@ #include #include #include +#include /* For layer 4 checksum field offset. */ #include #include #include +#include +#include /* add vlan header into the user buffer for if tag was removed by offloads */ static bool @@ -150,12 +153,195 @@ nla_put_failure: return -1; } +static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_payload *priv) +{ + struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; + + switch (priv->offset) { + case offsetof(struct ethhdr, h_source): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs, + src, ETH_ALEN, reg); + break; + case offsetof(struct ethhdr, h_dest): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs, + dst, ETH_ALEN, reg); + break; + } + + return 0; +} + +static int nft_payload_offload_ip(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_payload *priv) +{ + struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; + + switch (priv->offset) { + case offsetof(struct iphdr, saddr): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, src, + sizeof(struct in_addr), reg); + break; + case offsetof(struct iphdr, daddr): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, dst, + sizeof(struct in_addr), reg); + break; + case offsetof(struct iphdr, protocol): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, + sizeof(__u8), reg); + nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int nft_payload_offload_ip6(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_payload *priv) +{ + struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; + + switch (priv->offset) { + case offsetof(struct ipv6hdr, saddr): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, src, + sizeof(struct in6_addr), reg); + break; + case offsetof(struct ipv6hdr, daddr): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, dst, + sizeof(struct in6_addr), reg); + break; + case offsetof(struct ipv6hdr, nexthdr): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, + sizeof(__u8), reg); + nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int nft_payload_offload_nh(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_payload *priv) +{ + int err; + + switch (ctx->dep.l3num) { + case htons(ETH_P_IP): + err = nft_payload_offload_ip(ctx, flow, priv); + break; + case htons(ETH_P_IPV6): + err = nft_payload_offload_ip6(ctx, flow, priv); + break; + default: + return -EOPNOTSUPP; + } + + return err; +} + +static int nft_payload_offload_tcp(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_payload *priv) +{ + struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; + + switch (priv->offset) { + case offsetof(struct tcphdr, source): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src, + sizeof(__be16), reg); + break; + case offsetof(struct tcphdr, dest): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst, + sizeof(__be16), reg); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int nft_payload_offload_udp(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_payload *priv) +{ + struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; + + switch (priv->offset) { + case offsetof(struct udphdr, source): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src, + sizeof(__be16), reg); + break; + case offsetof(struct udphdr, dest): + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst, + sizeof(__be16), reg); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int nft_payload_offload_th(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_payload *priv) +{ + int err; + + switch (ctx->dep.protonum) { + case IPPROTO_TCP: + err = nft_payload_offload_tcp(ctx, flow, priv); + break; + case IPPROTO_UDP: + err = nft_payload_offload_udp(ctx, flow, priv); + break; + default: + return -EOPNOTSUPP; + } + + return err; +} + +static int nft_payload_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + const struct nft_payload *priv = nft_expr_priv(expr); + int err; + + switch (priv->base) { + case NFT_PAYLOAD_LL_HEADER: + err = nft_payload_offload_ll(ctx, flow, priv); + break; + case NFT_PAYLOAD_NETWORK_HEADER: + err = nft_payload_offload_nh(ctx, flow, priv); + break; + case NFT_PAYLOAD_TRANSPORT_HEADER: + err = nft_payload_offload_th(ctx, flow, priv); + break; + default: + err = -EOPNOTSUPP; + break; + } + return err; +} + static const struct nft_expr_ops nft_payload_ops = { .type = &nft_payload_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)), .eval = nft_payload_eval, .init = nft_payload_init, .dump = nft_payload_dump, + .offload = nft_payload_offload, }; const struct nft_expr_ops nft_payload_fast_ops = { @@ -164,6 +350,7 @@ const struct nft_expr_ops nft_payload_fast_ops = { .eval = nft_payload_eval, .init = nft_payload_init, .dump = nft_payload_dump, + .offload = nft_payload_offload, }; static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum) -- cgit v1.2.3