From 0e47079b6cd7d26e2dab95398927b1184a9a733c Mon Sep 17 00:00:00 2001 From: Wadim Egorov Date: Wed, 14 Feb 2018 17:07:12 +0100 Subject: net: phy: dp83867: Add documentation for CLK_OUT pin muxing Add documentation of ti,clk-output-sel which can be used to select a specific clock for CLK_OUT. Signed-off-by: Wadim Egorov Signed-off-by: Daniel Schultz Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/ti,dp83867.txt | 2 ++ 1 file changed, 2 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/ti,dp83867.txt b/Documentation/devicetree/bindings/net/ti,dp83867.txt index 02c4353b5cf2..9ef9338aaee1 100644 --- a/Documentation/devicetree/bindings/net/ti,dp83867.txt +++ b/Documentation/devicetree/bindings/net/ti,dp83867.txt @@ -25,6 +25,8 @@ Optional property: software needs to take when this pin is strapped in these modes. See data manual for details. + - ti,clk-output-sel - Muxing option for CLK_OUT pin - see dt-bindings/net/ti-dp83867.h + for applicable values. Note: ti,min-output-impedance and ti,max-output-impedance are mutually exclusive. When both properties are present ti,max-output-impedance -- cgit v1.2.3 From 1f74edf9a5b12447b6be96130b334e8bb98d7e69 Mon Sep 17 00:00:00 2001 From: Xue Liu Date: Tue, 20 Feb 2018 12:55:56 +0100 Subject: ieee802154: Add device tree documentation for MCR20A Signed-off-by: Xue Liu Signed-off-by: Stefan Schmidt --- .../devicetree/bindings/net/ieee802154/mcr20a.txt | 23 ++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/ieee802154/mcr20a.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/ieee802154/mcr20a.txt b/Documentation/devicetree/bindings/net/ieee802154/mcr20a.txt new file mode 100644 index 000000000000..2aaef567c5be --- /dev/null +++ b/Documentation/devicetree/bindings/net/ieee802154/mcr20a.txt @@ -0,0 +1,23 @@ +* MCR20A IEEE 802.15.4 * + +Required properties: + - compatible: should be "nxp,mcr20a" + - spi-max-frequency: maximal bus speed, should be set to a frequency + lower than 9000000 depends sync or async operation mode + - reg: the chipselect index + - interrupts: the interrupt generated by the device. Non high-level + can occur deadlocks while handling isr. + +Optional properties: + - rst_b-gpio: GPIO spec for the RST_B pin + +Example: + + mcr20a@0 { + compatible = "nxp,mcr20a"; + spi-max-frequency = <9000000>; + reg = <0>; + interrupts = <17 2>; + interrupt-parent = <&gpio>; + rst_b-gpio = <&gpio 27 1> + }; -- cgit v1.2.3 From 66f5325ce93a2e6fd726fca31fea91baf36a392e Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 27 Feb 2018 15:53:07 +0000 Subject: dt-bindings: add maximum power level to SFP binding Add the new maximum power level property to the SFP binding. Signed-off-by: Russell King Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/sff,sfp.txt | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/sff,sfp.txt b/Documentation/devicetree/bindings/net/sff,sfp.txt index f1c441bedf68..929591d52ed6 100644 --- a/Documentation/devicetree/bindings/net/sff,sfp.txt +++ b/Documentation/devicetree/bindings/net/sff,sfp.txt @@ -33,6 +33,10 @@ Optional Properties: Select (AKA RS1) output gpio signal (SFP+ only), low: low Tx rate, high: high Tx rate. Must not be present for SFF modules +- maximum-power-milliwatt : Maximum module power consumption + Specifies the maximum power consumption allowable by a module in the + slot, in milli-Watts. Presently, modules can be up to 1W, 1.5W or 2W. + Example #1: Direct serdes to SFP connection sfp_eth3: sfp-eth3 { @@ -40,6 +44,7 @@ sfp_eth3: sfp-eth3 { i2c-bus = <&sfp_1g_i2c>; los-gpios = <&cpm_gpio2 22 GPIO_ACTIVE_HIGH>; mod-def0-gpios = <&cpm_gpio2 21 GPIO_ACTIVE_LOW>; + maximum-power-milliwatt = <1000>; pinctrl-names = "default"; pinctrl-0 = <&cpm_sfp_1g_pins &cps_sfp_1g_pins>; tx-disable-gpios = <&cps_gpio1 24 GPIO_ACTIVE_HIGH>; -- cgit v1.2.3 From b4bac172e90ce4a93df8adf44eb70d91b9d611eb Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 2 Mar 2018 08:32:18 -0800 Subject: net/ipv6: Add support for path selection using hash of 5-tuple Some operators prefer IPv6 path selection to use a standard 5-tuple hash rather than just an L3 hash with the flow the label. To that end add support to IPv6 for multipath hash policy similar to bf4e0a3db97eb ("net: ipv4: add support for ECMP hash policy choice"). The default is still L3 which covers source and destination addresses along with flow label and IPv6 protocol. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 7 ++++ include/net/ip6_route.h | 4 +- include/net/netevent.h | 1 + include/net/netns/ipv6.h | 1 + net/ipv6/icmp.c | 2 +- net/ipv6/route.c | 68 ++++++++++++++++++++++++++-------- net/ipv6/sysctl_net_ipv6.c | 27 ++++++++++++++ 7 files changed, 91 insertions(+), 19 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index a553d4e4a0fb..783675a730e5 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1363,6 +1363,13 @@ flowlabel_reflect - BOOLEAN FALSE: disabled Default: FALSE +fib_multipath_hash_policy - INTEGER + Controls which hash policy to use for multipath routes. + Default: 0 (Layer 3) + Possible values: + 0 - Layer 3 (source and destination addresses plus flow label) + 1 - Layer 4 (standard 5-tuple) + anycast_src_echo_reply - BOOLEAN Controls the use of anycast addresses as source addresses for ICMPv6 echo reply diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 9594f9317952..ce2abc0ff102 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -130,8 +130,8 @@ static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr, int oif, const struct sk_buff *skb, int flags); -u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb, - struct flow_keys *hkeys); +u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, + const struct sk_buff *skb, struct flow_keys *hkeys); struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); diff --git a/include/net/netevent.h b/include/net/netevent.h index baee605a94ab..d9918261701c 100644 --- a/include/net/netevent.h +++ b/include/net/netevent.h @@ -27,6 +27,7 @@ enum netevent_notif_type { NETEVENT_REDIRECT, /* arg is struct netevent_redirect ptr */ NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */ NETEVENT_IPV4_MPATH_HASH_UPDATE, /* arg is struct net ptr */ + NETEVENT_IPV6_MPATH_HASH_UPDATE, /* arg is struct net ptr */ }; int register_netevent_notifier(struct notifier_block *nb); diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index e286fda09fcf..5b51110435fc 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -28,6 +28,7 @@ struct netns_sysctl_ipv6 { int ip6_rt_gc_elasticity; int ip6_rt_mtu_expires; int ip6_rt_min_advmss; + int multipath_hash_policy; int flowlabel_consistency; int auto_flowlabels; int icmpv6_time; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index a5d929223820..6f84668be6ea 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -522,7 +522,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, fl6.fl6_icmp_type = type; fl6.fl6_icmp_code = code; fl6.flowi6_uid = sock_net_uid(net, NULL); - fl6.mp_hash = rt6_multipath_hash(&fl6, skb, NULL); + fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); sk = icmpv6_xmit_lock(net); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d2b8368663cb..f0ae58424c45 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -450,7 +450,8 @@ static bool rt6_check_expired(const struct rt6_info *rt) return false; } -static struct rt6_info *rt6_multipath_select(struct rt6_info *match, +static struct rt6_info *rt6_multipath_select(const struct net *net, + struct rt6_info *match, struct flowi6 *fl6, int oif, const struct sk_buff *skb, int strict) @@ -461,7 +462,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, * case it will always be non-zero. Otherwise now is the time to do it. */ if (!fl6->mp_hash) - fl6->mp_hash = rt6_multipath_hash(fl6, skb, NULL); + fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound)) return match; @@ -932,7 +933,7 @@ restart: rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) - rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, + rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif, skb, flags); } if (rt == net->ipv6.ip6_null_entry) { @@ -1674,7 +1675,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, redo_rt6_select: rt = rt6_select(net, fn, oif, strict); if (rt->rt6i_nsiblings) - rt = rt6_multipath_select(rt, fl6, oif, skb, strict); + rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict); if (rt == net->ipv6.ip6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) @@ -1839,21 +1840,56 @@ out: } /* if skb is set it will be used and fl6 can be NULL */ -u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb, - struct flow_keys *flkeys) +u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, + const struct sk_buff *skb, struct flow_keys *flkeys) { struct flow_keys hash_keys; u32 mhash; - memset(&hash_keys, 0, sizeof(hash_keys)); - hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; - if (skb) { - ip6_multipath_l3_keys(skb, &hash_keys, flkeys); - } else { - hash_keys.addrs.v6addrs.src = fl6->saddr; - hash_keys.addrs.v6addrs.dst = fl6->daddr; - hash_keys.tags.flow_label = (__force u32)fl6->flowlabel; - hash_keys.basic.ip_proto = fl6->flowi6_proto; + switch (net->ipv6.sysctl.multipath_hash_policy) { + case 0: + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + if (skb) { + ip6_multipath_l3_keys(skb, &hash_keys, flkeys); + } else { + hash_keys.addrs.v6addrs.src = fl6->saddr; + hash_keys.addrs.v6addrs.dst = fl6->daddr; + hash_keys.tags.flow_label = (__force u32)fl6->flowlabel; + hash_keys.basic.ip_proto = fl6->flowi6_proto; + } + break; + case 1: + if (skb) { + unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; + struct flow_keys keys; + + /* short-circuit if we already have L4 hash present */ + if (skb->l4_hash) + return skb_get_hash_raw(skb) >> 1; + + memset(&hash_keys, 0, sizeof(hash_keys)); + + if (!flkeys) { + skb_flow_dissect_flow_keys(skb, &keys, flag); + flkeys = &keys; + } + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; + hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; + hash_keys.ports.src = flkeys->ports.src; + hash_keys.ports.dst = flkeys->ports.dst; + hash_keys.basic.ip_proto = flkeys->basic.ip_proto; + } else { + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + hash_keys.addrs.v6addrs.src = fl6->saddr; + hash_keys.addrs.v6addrs.dst = fl6->daddr; + hash_keys.ports.src = fl6->fl6_sport; + hash_keys.ports.dst = fl6->fl6_dport; + hash_keys.basic.ip_proto = fl6->flowi6_proto; + } + break; } mhash = flow_hash_from_keys(&hash_keys); @@ -1884,7 +1920,7 @@ void ip6_route_input(struct sk_buff *skb) flkeys = &_flkeys; if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) - fl6.mp_hash = rt6_multipath_hash(&fl6, skb, flkeys); + fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); skb_dst_drop(skb); skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 262f791f1b9b..966c42af92f4 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -16,14 +16,31 @@ #include #include #include +#include #ifdef CONFIG_NETLABEL #include #endif +static int zero; static int one = 1; static int auto_flowlabels_min; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; +static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct net *net; + int ret; + + net = container_of(table->data, struct net, + ipv6.sysctl.multipath_hash_policy); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net); + + return ret; +} static struct ctl_table ipv6_table_template[] = { { @@ -126,6 +143,15 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "fib_multipath_hash_policy", + .data = &init_net.ipv6.sysctl.multipath_hash_policy, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_rt6_multipath_hash_policy, + .extra1 = &zero, + .extra2 = &one, + }, { } }; @@ -190,6 +216,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[11].data = &net->ipv6.sysctl.max_hbh_opts_cnt; ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len; ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len; + ipv6_table[14].data = &net->ipv6.sysctl.multipath_hash_policy, ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) -- cgit v1.2.3 From 22936c3e6e8cdd9641fdf1362dc9369dd35246c8 Mon Sep 17 00:00:00 2001 From: Brandon Streiff Date: Mon, 5 Mar 2018 16:05:22 -0600 Subject: dt-bindings: net: dsa: marvell: describe compatibility string There are two compatibility strings for mv88e6xxx, but it isn't clear from the documentation why only those two exist when the mv88e6xxx driver supports more than the 6085 and 6190. Briefly describe how the compatible property is used, and provide guidance on which to use. The model list comes from looking at port_base_addr values (0x0 vs 0x10) in drivers/net/dsa/mv88e6xxx/chip.c. Signed-off-by: Brandon Streiff Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/dsa/marvell.txt | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/dsa/marvell.txt b/Documentation/devicetree/bindings/net/dsa/marvell.txt index 1d4d0f49c9d0..caf71e2fe24a 100644 --- a/Documentation/devicetree/bindings/net/dsa/marvell.txt +++ b/Documentation/devicetree/bindings/net/dsa/marvell.txt @@ -13,9 +13,18 @@ placed as a child node of an mdio device. The properties described here are those specific to Marvell devices. Additional required and optional properties can be found in dsa.txt. +The compatibility string is used only to find an identification register, +which is at a different MDIO base address in different switch families. +- "marvell,mv88e6085" : Switch has base address 0x10. Use with models: + 6085, 6095, 6097, 6123, 6131, 6141, 6161, 6165, + 6171, 6172, 6175, 6176, 6185, 6240, 6320, 6321, + 6341, 6350, 6351, 6352 +- "marvell,mv88e6190" : Switch has base address 0x00. Use with models: + 6190, 6190X, 6191, 6290, 6390, 6390X + Required properties: - compatible : Should be one of "marvell,mv88e6085" or - "marvell,mv88e6190" + "marvell,mv88e6190" as indicated above - reg : Address on the MII bus for the switch. Optional properties: -- cgit v1.2.3 From 79134e6ce2c9d1a00eab4d98cb48f975dd2474cb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Mar 2018 12:51:41 -0800 Subject: net: do not create fallback tunnels for non-default namespaces fallback tunnels (like tunl0, gre0, gretap0, erspan0, sit0, ip6tnl0, ip6gre0) are automatically created when the corresponding module is loaded. These tunnels are also automatically created when a new network namespace is created, at a great cost. In many cases, netns are used for isolation purposes, and these extra network devices are a waste of resources. We are using thousands of netns per host, and hit the netns creation/delete bottleneck a lot. (Many thanks to Kirill for recent work on this) Add a new sysctl so that we can opt-out from this automatic creation. Note that these tunnels are still created for the initial namespace, to be the least intrusive for typical setups. Tested: lpk43:~# cat add_del_unshare.sh for i in `seq 1 40` do (for j in `seq 1 100` ; do unshare -n /bin/true >/dev/null ; done) & done wait lpk43:~# echo 0 >/proc/sys/net/core/fb_tunnels_only_for_init_net lpk43:~# time ./add_del_unshare.sh real 0m37.521s user 0m0.886s sys 7m7.084s lpk43:~# echo 1 >/proc/sys/net/core/fb_tunnels_only_for_init_net lpk43:~# time ./add_del_unshare.sh real 0m4.761s user 0m0.851s sys 1m8.343s lpk43:~# Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/sysctl/net.txt | 12 ++++++++++++ include/linux/netdevice.h | 7 +++++++ include/net/ip_tunnels.h | 2 ++ net/core/sysctl_net_core.c | 12 ++++++++++++ net/ipv4/ip_tunnel.c | 20 ++++++++++++-------- net/ipv6/ip6_gre.c | 4 +++- net/ipv6/ip6_tunnel.c | 2 ++ net/ipv6/sit.c | 5 ++++- 8 files changed, 54 insertions(+), 10 deletions(-) (limited to 'Documentation') diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index 35c62f522754..5992602469d8 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -270,6 +270,18 @@ optmem_max Maximum ancillary buffer size allowed per socket. Ancillary data is a sequence of struct cmsghdr structures with appended data. +fb_tunnels_only_for_init_net +---------------------------- + +Controls if fallback tunnels (like tunl0, gre0, gretap0, erspan0, +sit0, ip6tnl0, ip6gre0) are automatically created when a new +network namespace is created, if corresponding tunnel is present +in initial network namespace. +If set to 1, these devices are not automatically created, and +user space is responsible for creating them if needed. + +Default : 0 (for compatibility reasons) + 2. /proc/sys/net/unix - Parameters for Unix domain sockets ------------------------------------------------------- diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 95a613a7cc1c..9711108c3916 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -585,6 +585,13 @@ struct netdev_queue { #endif } ____cacheline_aligned_in_smp; +extern int sysctl_fb_tunnels_only_for_init_net; + +static inline bool net_has_fallback_tunnels(const struct net *net) +{ + return net == &init_net || !sysctl_fb_tunnels_only_for_init_net; +} + static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) { #if defined(CONFIG_XPS) && defined(CONFIG_NUMA) diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index cbe5addb9293..540a4b4417bf 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -180,8 +180,10 @@ struct tnl_ptk_info { struct ip_tunnel_net { struct net_device *fb_tunnel_dev; + struct rtnl_link_ops *rtnl_link_ops; struct hlist_head tunnels[IP_TNL_HASH_SIZE]; struct ip_tunnel __rcu *collect_md_tun; + int type; }; static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index d714f65782b7..4f47f92459cc 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -32,6 +32,9 @@ static int max_skb_frags = MAX_SKB_FRAGS; static int net_msg_warn; /* Unused, but still a sysctl */ +int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0; +EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); + #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -513,6 +516,15 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &zero, }, + { + .procname = "fb_tunnels_only_for_init_net", + .data = &sysctl_fb_tunnels_only_for_init_net, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, { } }; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 602597dfc395..5fcb17cb426b 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -347,8 +347,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net, struct net_device *dev; int t_hlen; - BUG_ON(!itn->fb_tunnel_dev); - dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms); + dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms); if (IS_ERR(dev)) return ERR_CAST(dev); @@ -822,7 +821,6 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) struct net *net = t->net; struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id); - BUG_ON(!itn->fb_tunnel_dev); switch (cmd) { case SIOCGETTUNNEL: if (dev == itn->fb_tunnel_dev) { @@ -847,7 +845,7 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) p->o_key = 0; } - t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); + t = ip_tunnel_find(itn, p, itn->type); if (cmd == SIOCADDTUNNEL) { if (!t) { @@ -991,10 +989,15 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct ip_tunnel_parm parms; unsigned int i; + itn->rtnl_link_ops = ops; for (i = 0; i < IP_TNL_HASH_SIZE; i++) INIT_HLIST_HEAD(&itn->tunnels[i]); - if (!ops) { + if (!ops || !net_has_fallback_tunnels(net)) { + struct ip_tunnel_net *it_init_net; + + it_init_net = net_generic(&init_net, ip_tnl_net_id); + itn->type = it_init_net->type; itn->fb_tunnel_dev = NULL; return 0; } @@ -1012,6 +1015,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); + itn->type = itn->fb_tunnel_dev->type; } rtnl_unlock(); @@ -1019,10 +1023,10 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, } EXPORT_SYMBOL_GPL(ip_tunnel_init_net); -static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head, +static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn, + struct list_head *head, struct rtnl_link_ops *ops) { - struct net *net = dev_net(itn->fb_tunnel_dev); struct net_device *dev, *aux; int h; @@ -1054,7 +1058,7 @@ void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { itn = net_generic(net, id); - ip_tunnel_destroy(itn, &list, ops); + ip_tunnel_destroy(net, itn, &list, ops); } unregister_netdevice_many(&list); rtnl_unlock(); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 18a3dfbd0300..7d8775c9570d 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -236,7 +236,7 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, return t; dev = ign->fb_tunnel_dev; - if (dev->flags & IFF_UP) + if (dev && dev->flags & IFF_UP) return netdev_priv(dev); return NULL; @@ -1472,6 +1472,8 @@ static int __net_init ip6gre_init_net(struct net *net) struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); int err; + if (!net_has_fallback_tunnels(net)) + return 0; ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0", NET_NAME_UNKNOWN, ip6gre_tunnel_setup); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 56c4967f1868..5c045fa407da 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -2205,6 +2205,8 @@ static int __net_init ip6_tnl_init_net(struct net *net) ip6n->tnls[0] = ip6n->tnls_wc; ip6n->tnls[1] = ip6n->tnls_r_l; + if (!net_has_fallback_tunnels(net)) + return 0; err = -ENOMEM; ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0", NET_NAME_UNKNOWN, ip6_tnl_dev_setup); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index a9c4ac6efe22..8a4f8fddd812 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -182,7 +182,7 @@ static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn) #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel *t = netdev_priv(dev); - if (dev == sitn->fb_tunnel_dev) { + if (dev == sitn->fb_tunnel_dev || !sitn->fb_tunnel_dev) { ipv6_addr_set(&t->ip6rd.prefix, htonl(0x20020000), 0, 0, 0); t->ip6rd.relay_prefix = 0; t->ip6rd.prefixlen = 16; @@ -1835,6 +1835,9 @@ static int __net_init sit_init_net(struct net *net) sitn->tunnels[2] = sitn->tunnels_r; sitn->tunnels[3] = sitn->tunnels_r_l; + if (!net_has_fallback_tunnels(net)) + return 0; + sitn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0", NET_NAME_UNKNOWN, ipip6_tunnel_setup); -- cgit v1.2.3 From ced68234b6a244355aeab07c2681bc49f695eaed Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 14 Mar 2018 12:49:19 -0400 Subject: sock: remove zerocopy sockopt restriction on closed tcp state Socket option SO_ZEROCOPY determines whether the kernel ignores or processes flag MSG_ZEROCOPY on subsequent send calls. This to avoid changing behavior for legacy processes. Limiting the state change to closed sockets is annoying with passive sockets and not necessary for correctness. Once created, zerocopy skbs are processed based on their private state, not this socket flag. Remove the constraint. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- Documentation/networking/msg_zerocopy.rst | 5 ----- net/core/sock.c | 2 -- 2 files changed, 7 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/msg_zerocopy.rst b/Documentation/networking/msg_zerocopy.rst index 291a01264967..fe46d4867e2d 100644 --- a/Documentation/networking/msg_zerocopy.rst +++ b/Documentation/networking/msg_zerocopy.rst @@ -72,11 +72,6 @@ this flag, a process must first signal intent by setting a socket option: if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one))) error(1, errno, "setsockopt zerocopy"); -Setting the socket option only works when the socket is in its initial -(TCP_CLOSED) state. Trying to set the option for a socket returned by accept(), -for example, will lead to an EBUSY error. In this case, the option should be set -to the listening socket and it will be inherited by the accepted sockets. - Transmission ------------ diff --git a/net/core/sock.c b/net/core/sock.c index 27f218bba43f..a8962d912895 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1052,8 +1052,6 @@ set_rcvbuf: if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { if (sk->sk_protocol != IPPROTO_TCP) ret = -ENOTSUPP; - else if (sk->sk_state != TCP_CLOSE) - ret = -EBUSY; } else if (sk->sk_family != PF_RDS) { ret = -ENOTSUPP; } -- cgit v1.2.3 From 2b221d20db7583fa8f36d4545554786fcf93ca8e Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 13 Mar 2018 12:24:19 -0700 Subject: doc: remove out of date links and info from packet mmap The packet_mmap documentation had links to no longer existing web sites; replace with other site which has similar example. Support for packet mmap has been in mainline versions of libpcap for several years. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- Documentation/networking/packet_mmap.txt | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt index bf654845556e..999eb41da81d 100644 --- a/Documentation/networking/packet_mmap.txt +++ b/Documentation/networking/packet_mmap.txt @@ -7,15 +7,12 @@ socket interface on 2.4/2.6/3.x kernels. This type of sockets is used for i) capture network traffic with utilities like tcpdump, ii) transmit network traffic, or any other that needs raw access to network interface. -You can find the latest version of this document at: - http://wiki.ipxwarzone.com/index.php5?title=Linux_packet_mmap - Howto can be found at: - http://wiki.gnu-log.net (packet_mmap) + https://sites.google.com/site/packetmmap/ Please send your comments to Ulisses Alonso CamarĂ³ - Johann Baudy + Johann Baudy ------------------------------------------------------------------------------- + Why use PACKET_MMAP @@ -51,17 +48,8 @@ From the user standpoint, you should use the higher level libpcap library, which is a de facto standard, portable across nearly all operating systems including Win32. -Said that, at time of this writing, official libpcap 0.8.1 is out and doesn't include -support for PACKET_MMAP, and also probably the libpcap included in your distribution. - -I'm aware of two implementations of PACKET_MMAP in libpcap: - - http://wiki.ipxwarzone.com/ (by Simon Patarin, based on libpcap 0.6.2) - http://public.lanl.gov/cpw/ (by Phil Wood, based on lastest libpcap) - -The rest of this document is intended for people who want to understand -the low level details or want to improve libpcap by including PACKET_MMAP -support. +Packet MMAP support was integrated into libpcap around the time of version 1.3.0; +TPACKET_V3 support was added in version 1.5.0 -------------------------------------------------------------------------------- + How to use mmap() directly to improve capture process @@ -174,7 +162,7 @@ As capture, each frame contains two parts: /* bind socket to eth0 */ bind(this->socket, (struct sockaddr *)&my_addr, sizeof(struct sockaddr_ll)); - A complete tutorial is available at: http://wiki.gnu-log.net/ + A complete tutorial is available at: https://sites.google.com/site/packetmmap/ By default, the user should put data at : frame base + TPACKET_HDRLEN - sizeof(struct sockaddr_ll) -- cgit v1.2.3 From f3b249e652792290cea1db45fa58e51c69cdd68e Mon Sep 17 00:00:00 2001 From: Brad Mouring Date: Tue, 13 Mar 2018 16:32:16 -0500 Subject: Documentation: macb: Document phy-handle binding Document the existence of the optional binding, directing to the general ethernet document that describes this binding. Signed-off-by: Brad Mouring Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/macb.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/macb.txt b/Documentation/devicetree/bindings/net/macb.txt index 27966ae741e0..457d5ae16f23 100644 --- a/Documentation/devicetree/bindings/net/macb.txt +++ b/Documentation/devicetree/bindings/net/macb.txt @@ -29,6 +29,7 @@ Optional properties for PHY child node: - reset-gpios : Should specify the gpio for phy reset - magic-packet : If present, indicates that the hardware supports waking up via magic packet. +- phy-handle : see ethernet.txt file in the same directory Examples: -- cgit v1.2.3 From 320bd6de79ef0de1ece7c184469a722de690ccb0 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Tue, 13 Mar 2018 21:57:17 -0700 Subject: doc: Change the udp/sctp rmem/wmem default value. The SK_MEM_QUANTUM was changed from PAGE_SIZE to 4096. Signed-off-by: Tonghao Zhang Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 783675a730e5..1d1120753ae8 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -755,13 +755,13 @@ udp_rmem_min - INTEGER Minimal size of receive buffer used by UDP sockets in moderation. Each UDP socket is able to use the size for receiving data, even if total pages of UDP sockets exceed udp_mem pressure. The unit is byte. - Default: 1 page + Default: 4K udp_wmem_min - INTEGER Minimal size of send buffer used by UDP sockets in moderation. Each UDP socket is able to use the size for sending data, even if total pages of UDP sockets exceed udp_mem pressure. The unit is byte. - Default: 1 page + Default: 4K CIPSOv4 Variables: @@ -2101,7 +2101,7 @@ sctp_rmem - vector of 3 INTEGERs: min, default, max It is guaranteed to each SCTP socket (but not association) even under moderate memory pressure. - Default: 1 page + Default: 4K sctp_wmem - vector of 3 INTEGERs: min, default, max Currently this tunable has no effect. -- cgit v1.2.3 From 78262f4575c29f185947fe58952cd1beabc74f82 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 20 Mar 2018 00:21:15 +0100 Subject: bpf, doc: add description wrt native/bpf clang target and pointer size As this recently came up on netdev [0], lets add it to the BPF devel doc. [0] https://www.spinics.net/lists/netdev/msg489612.html Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- Documentation/bpf/bpf_devel_QA.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'Documentation') diff --git a/Documentation/bpf/bpf_devel_QA.txt b/Documentation/bpf/bpf_devel_QA.txt index 84cbb302f2b5..1a0b704e1a38 100644 --- a/Documentation/bpf/bpf_devel_QA.txt +++ b/Documentation/bpf/bpf_devel_QA.txt @@ -539,6 +539,18 @@ A: Although LLVM IR generation and optimization try to stay architecture The clang option "-fno-jump-tables" can be used to disable switch table generation. + - For clang -target bpf, it is guaranteed that pointer or long / + unsigned long types will always have a width of 64 bit, no matter + whether underlying clang binary or default target (or kernel) is + 32 bit. However, when native clang target is used, then it will + compile these types based on the underlying architecture's conventions, + meaning in case of 32 bit architecture, pointer or long / unsigned + long types e.g. in BPF context structure will have width of 32 bit + while the BPF LLVM back end still operates in 64 bit. The native + target is mostly needed in tracing for the case of walking pt_regs + or other kernel structures where CPU's register width matters. + Otherwise, clang -target bpf is generally recommended. + You should use default target when: - Your program includes a header file, e.g., ptrace.h, which eventually -- cgit v1.2.3 From faf4db008163607e993279b7f7aa456cb4c11e9a Mon Sep 17 00:00:00 2001 From: Tal Gilboa Date: Wed, 21 Mar 2018 20:33:45 +0200 Subject: Documentation/networking: Add net DIM documentation Net DIM is a generic algorithm, purposed for dynamically optimizing network devices interrupt moderation. This document describes how it works and how to use it. Signed-off-by: Tal Gilboa Reviewed-by: Randy Dunlap Signed-off-by: David S. Miller --- Documentation/networking/net_dim.txt | 174 +++++++++++++++++++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100644 Documentation/networking/net_dim.txt (limited to 'Documentation') diff --git a/Documentation/networking/net_dim.txt b/Documentation/networking/net_dim.txt new file mode 100644 index 000000000000..9cb31c5e2dcd --- /dev/null +++ b/Documentation/networking/net_dim.txt @@ -0,0 +1,174 @@ +Net DIM - Generic Network Dynamic Interrupt Moderation +====================================================== + +Author: + Tal Gilboa + + +Contents +========= + +- Assumptions +- Introduction +- The Net DIM Algorithm +- Registering a Network Device to DIM +- Example + +Part 0: Assumptions +====================== + +This document assumes the reader has basic knowledge in network drivers +and in general interrupt moderation. + + +Part I: Introduction +====================== + +Dynamic Interrupt Moderation (DIM) (in networking) refers to changing the +interrupt moderation configuration of a channel in order to optimize packet +processing. The mechanism includes an algorithm which decides if and how to +change moderation parameters for a channel, usually by performing an analysis on +runtime data sampled from the system. Net DIM is such a mechanism. In each +iteration of the algorithm, it analyses a given sample of the data, compares it +to the previous sample and if required, it can decide to change some of the +interrupt moderation configuration fields. The data sample is composed of data +bandwidth, the number of packets and the number of events. The time between +samples is also measured. Net DIM compares the current and the previous data and +returns an adjusted interrupt moderation configuration object. In some cases, +the algorithm might decide not to change anything. The configuration fields are +the minimum duration (microseconds) allowed between events and the maximum +number of wanted packets per event. The Net DIM algorithm ascribes importance to +increase bandwidth over reducing interrupt rate. + + +Part II: The Net DIM Algorithm +=============================== + +Each iteration of the Net DIM algorithm follows these steps: +1. Calculates new data sample. +2. Compares it to previous sample. +3. Makes a decision - suggests interrupt moderation configuration fields. +4. Applies a schedule work function, which applies suggested configuration. + +The first two steps are straightforward, both the new and the previous data are +supplied by the driver registered to Net DIM. The previous data is the new data +supplied to the previous iteration. The comparison step checks the difference +between the new and previous data and decides on the result of the last step. +A step would result as "better" if bandwidth increases and as "worse" if +bandwidth reduces. If there is no change in bandwidth, the packet rate is +compared in a similar fashion - increase == "better" and decrease == "worse". +In case there is no change in the packet rate as well, the interrupt rate is +compared. Here the algorithm tries to optimize for lower interrupt rate so an +increase in the interrupt rate is considered "worse" and a decrease is +considered "better". Step #2 has an optimization for avoiding false results: it +only considers a difference between samples as valid if it is greater than a +certain percentage. Also, since Net DIM does not measure anything by itself, it +assumes the data provided by the driver is valid. + +Step #3 decides on the suggested configuration based on the result from step #2 +and the internal state of the algorithm. The states reflect the "direction" of +the algorithm: is it going left (reducing moderation), right (increasing +moderation) or standing still. Another optimization is that if a decision +to stay still is made multiple times, the interval between iterations of the +algorithm would increase in order to reduce calculation overhead. Also, after +"parking" on one of the most left or most right decisions, the algorithm may +decide to verify this decision by taking a step in the other direction. This is +done in order to avoid getting stuck in a "deep sleep" scenario. Once a +decision is made, an interrupt moderation configuration is selected from +the predefined profiles. + +The last step is to notify the registered driver that it should apply the +suggested configuration. This is done by scheduling a work function, defined by +the Net DIM API and provided by the registered driver. + +As you can see, Net DIM itself does not actively interact with the system. It +would have trouble making the correct decisions if the wrong data is supplied to +it and it would be useless if the work function would not apply the suggested +configuration. This does, however, allow the registered driver some room for +manoeuvre as it may provide partial data or ignore the algorithm suggestion +under some conditions. + + +Part III: Registering a Network Device to DIM +============================================== + +Net DIM API exposes the main function net_dim(struct net_dim *dim, +struct net_dim_sample end_sample). This function is the entry point to the Net +DIM algorithm and has to be called every time the driver would like to check if +it should change interrupt moderation parameters. The driver should provide two +data structures: struct net_dim and struct net_dim_sample. Struct net_dim +describes the state of DIM for a specific object (RX queue, TX queue, +other queues, etc.). This includes the current selected profile, previous data +samples, the callback function provided by the driver and more. +Struct net_dim_sample describes a data sample, which will be compared to the +data sample stored in struct net_dim in order to decide on the algorithm's next +step. The sample should include bytes, packets and interrupts, measured by +the driver. + +In order to use Net DIM from a networking driver, the driver needs to call the +main net_dim() function. The recommended method is to call net_dim() on each +interrupt. Since Net DIM has a built-in moderation and it might decide to skip +iterations under certain conditions, there is no need to moderate the net_dim() +calls as well. As mentioned above, the driver needs to provide an object of type +struct net_dim to the net_dim() function call. It is advised for each entity +using Net DIM to hold a struct net_dim as part of its data structure and use it +as the main Net DIM API object. The struct net_dim_sample should hold the latest +bytes, packets and interrupts count. No need to perform any calculations, just +include the raw data. + +The net_dim() call itself does not return anything. Instead Net DIM relies on +the driver to provide a callback function, which is called when the algorithm +decides to make a change in the interrupt moderation parameters. This callback +will be scheduled and run in a separate thread in order not to add overhead to +the data flow. After the work is done, Net DIM algorithm needs to be set to +the proper state in order to move to the next iteration. + + +Part IV: Example +================= + +The following code demonstrates how to register a driver to Net DIM. The actual +usage is not complete but it should make the outline of the usage clear. + +my_driver.c: + +#include + +/* Callback for net DIM to schedule on a decision to change moderation */ +void my_driver_do_dim_work(struct work_struct *work) +{ + /* Get struct net_dim from struct work_struct */ + struct net_dim *dim = container_of(work, struct net_dim, + work); + /* Do interrupt moderation related stuff */ + ... + + /* Signal net DIM work is done and it should move to next iteration */ + dim->state = NET_DIM_START_MEASURE; +} + +/* My driver's interrupt handler */ +int my_driver_handle_interrupt(struct my_driver_entity *my_entity, ...) +{ + ... + /* A struct to hold current measured data */ + struct net_dim_sample dim_sample; + ... + /* Initiate data sample struct with current data */ + net_dim_sample(my_entity->events, + my_entity->packets, + my_entity->bytes, + &dim_sample); + /* Call net DIM */ + net_dim(&my_entity->dim, dim_sample); + ... +} + +/* My entity's initialization function (my_entity was already allocated) */ +int my_driver_init_my_entity(struct my_driver_entity *my_entity, ...) +{ + ... + /* Initiate struct work_struct with my driver's callback function */ + INIT_WORK(&my_entity->dim.work, my_driver_do_dim_work); + ... +} -- cgit v1.2.3 From b6c535b1638b32ff7817d78956b7232100570b5d Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Thu, 22 Mar 2018 10:10:44 -0700 Subject: tls: Add receive path documentation Add documentation on rx path setup and cmsg interface. Signed-off-by: Dave Watson Signed-off-by: David S. Miller --- Documentation/networking/tls.txt | 66 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 64 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/tls.txt b/Documentation/networking/tls.txt index 77ed00631c12..58b5ef75f1b7 100644 --- a/Documentation/networking/tls.txt +++ b/Documentation/networking/tls.txt @@ -48,6 +48,9 @@ the transmit and the receive into the kernel. setsockopt(sock, SOL_TLS, TLS_TX, &crypto_info, sizeof(crypto_info)); +Transmit and receive are set separately, but the setup is the same, using either +TLS_TX or TLS_RX. + Sending TLS application data ---------------------------- @@ -79,6 +82,28 @@ for memory), or the encryption will always succeed. If send() returns -ENOMEM and some data was left on the socket buffer from a previous call using MSG_MORE, the MSG_MORE data is left on the socket buffer. +Receiving TLS application data +------------------------------ + +After setting the TLS_RX socket option, all recv family socket calls +are decrypted using TLS parameters provided. A full TLS record must +be received before decryption can happen. + + char buffer[16384]; + recv(sock, buffer, 16384); + +Received data is decrypted directly in to the user buffer if it is +large enough, and no additional allocations occur. If the userspace +buffer is too small, data is decrypted in the kernel and copied to +userspace. + +EINVAL is returned if the TLS version in the received message does not +match the version passed in setsockopt. + +EMSGSIZE is returned if the received message is too big. + +EBADMSG is returned if decryption failed for any other reason. + Send TLS control messages ------------------------- @@ -118,6 +143,43 @@ using a record of type @record_type. Control message data should be provided unencrypted, and will be encrypted by the kernel. +Receiving TLS control messages +------------------------------ + +TLS control messages are passed in the userspace buffer, with message +type passed via cmsg. If no cmsg buffer is provided, an error is +returned if a control message is received. Data messages may be +received without a cmsg buffer set. + + char buffer[16384]; + char cmsg[CMSG_SPACE(sizeof(unsigned char))]; + struct msghdr msg = {0}; + msg.msg_control = cmsg; + msg.msg_controllen = sizeof(cmsg); + + struct iovec msg_iov; + msg_iov.iov_base = buffer; + msg_iov.iov_len = 16384; + + msg.msg_iov = &msg_iov; + msg.msg_iovlen = 1; + + int ret = recvmsg(sock, &msg, 0 /* flags */); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + if (cmsg->cmsg_level == SOL_TLS && + cmsg->cmsg_type == TLS_GET_RECORD_TYPE) { + int record_type = *((unsigned char *)CMSG_DATA(cmsg)); + // Do something with record_type, and control message data in + // buffer. + // + // Note that record_type may be == to application data (23). + } else { + // Buffer contains application data. + } + +recv will never return data from mixed types of TLS records. + Integrating in to userspace TLS library --------------------------------------- @@ -126,10 +188,10 @@ layer of a userspace TLS library. A patchset to OpenSSL to use ktls as the record layer is here: -https://github.com/Mellanox/tls-openssl +https://github.com/Mellanox/openssl/commits/tls_rx2 An example of calling send directly after a handshake using gnutls. Since it doesn't implement a full record layer, control messages are not supported: -https://github.com/Mellanox/tls-af_ktls_tool +https://github.com/ktls/af_ktls-tool/commits/RX -- cgit v1.2.3 From 837f08fdecbe4b2ffc7725624342e73b886665a8 Mon Sep 17 00:00:00 2001 From: Anirudh Venkataramanan Date: Tue, 20 Mar 2018 07:58:05 -0700 Subject: ice: Add basic driver framework for Intel(R) E800 Series This patch adds a basic driver framework for the Intel(R) E800 Ethernet Series of network devices. There is no functionality right now other than the ability to load. Signed-off-by: Anirudh Venkataramanan Tested-by: Tony Brelinski Signed-off-by: Jeff Kirsher --- Documentation/networking/ice.txt | 39 +++++++ MAINTAINERS | 1 + drivers/net/ethernet/intel/Kconfig | 14 +++ drivers/net/ethernet/intel/Makefile | 1 + drivers/net/ethernet/intel/ice/Makefile | 10 ++ drivers/net/ethernet/intel/ice/ice.h | 34 ++++++ drivers/net/ethernet/intel/ice/ice_devids.h | 19 ++++ drivers/net/ethernet/intel/ice/ice_main.c | 158 ++++++++++++++++++++++++++++ drivers/net/ethernet/intel/ice/ice_type.h | 28 +++++ 9 files changed, 304 insertions(+) create mode 100644 Documentation/networking/ice.txt create mode 100644 drivers/net/ethernet/intel/ice/Makefile create mode 100644 drivers/net/ethernet/intel/ice/ice.h create mode 100644 drivers/net/ethernet/intel/ice/ice_devids.h create mode 100644 drivers/net/ethernet/intel/ice/ice_main.c create mode 100644 drivers/net/ethernet/intel/ice/ice_type.h (limited to 'Documentation') diff --git a/Documentation/networking/ice.txt b/Documentation/networking/ice.txt new file mode 100644 index 000000000000..6261c46378e1 --- /dev/null +++ b/Documentation/networking/ice.txt @@ -0,0 +1,39 @@ +Intel(R) Ethernet Connection E800 Series Linux Driver +=================================================================== + +Intel ice Linux driver. +Copyright(c) 2018 Intel Corporation. + +Contents +======== +- Enabling the driver +- Support + +The driver in this release supports Intel's E800 Series of products. For +more information, visit Intel's support page at http://support.intel.com. + +Enabling the driver +=================== + +The driver is enabled via the standard kernel configuration system, +using the make command: + + Make oldconfig/silentoldconfig/menuconfig/etc. + +The driver is located in the menu structure at: + + -> Device Drivers + -> Network device support (NETDEVICES [=y]) + -> Ethernet driver support + -> Intel devices + -> Intel(R) Ethernet Connection E800 Series Support + +Support +======= + +For general information, go to the Intel support website at: + + http://support.intel.com + +If an issue is identified with the released source code, please email +the maintainer listed in the MAINTAINERS file. diff --git a/MAINTAINERS b/MAINTAINERS index b3ea844cf228..9107d9241564 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7063,6 +7063,7 @@ F: Documentation/networking/ixgbe.txt F: Documentation/networking/ixgbevf.txt F: Documentation/networking/i40e.txt F: Documentation/networking/i40evf.txt +F: Documentation/networking/ice.txt F: drivers/net/ethernet/intel/ F: drivers/net/ethernet/intel/*/ F: include/linux/avf/virtchnl.h diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig index 1feb54b6d92e..14d287bed33c 100644 --- a/drivers/net/ethernet/intel/Kconfig +++ b/drivers/net/ethernet/intel/Kconfig @@ -251,6 +251,20 @@ config I40EVF will be called i40evf. MSI-X interrupt support is required for this driver to work correctly. +config ICE + tristate "Intel(R) Ethernet Connection E800 Series Support" + default n + depends on PCI_MSI + ---help--- + This driver supports Intel(R) Ethernet Connection E800 Series of + devices. For more information on how to identify your adapter, go + to the Adapter & Driver ID Guide that can be located at: + + + + To compile this driver as a module, choose M here. The module + will be called ice. + config FM10K tristate "Intel(R) FM10000 Ethernet Switch Host Interface Support" default n diff --git a/drivers/net/ethernet/intel/Makefile b/drivers/net/ethernet/intel/Makefile index 90af7757a885..807a4f8c7e4e 100644 --- a/drivers/net/ethernet/intel/Makefile +++ b/drivers/net/ethernet/intel/Makefile @@ -14,3 +14,4 @@ obj-$(CONFIG_I40E) += i40e/ obj-$(CONFIG_IXGB) += ixgb/ obj-$(CONFIG_I40EVF) += i40evf/ obj-$(CONFIG_FM10K) += fm10k/ +obj-$(CONFIG_ICE) += ice/ diff --git a/drivers/net/ethernet/intel/ice/Makefile b/drivers/net/ethernet/intel/ice/Makefile new file mode 100644 index 000000000000..7ea6295d58fb --- /dev/null +++ b/drivers/net/ethernet/intel/ice/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2018, Intel Corporation. + +# +# Makefile for the Intel(R) Ethernet Connection E800 Series Linux Driver +# + +obj-$(CONFIG_ICE) += ice.o + +ice-y := ice_main.o diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h new file mode 100644 index 000000000000..e3c1672d4976 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2018, Intel Corporation. */ + +#ifndef _ICE_H_ +#define _ICE_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ice_devids.h" +#include "ice_type.h" + +#define ICE_BAR0 0 + +#define ICE_DFLT_NETIF_M (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK) + +enum ice_state { + __ICE_DOWN, + __ICE_STATE_NBITS /* must be last */ +}; + +struct ice_pf { + struct pci_dev *pdev; + DECLARE_BITMAP(state, __ICE_STATE_NBITS); + u32 msg_enable; + struct ice_hw hw; +}; +#endif /* _ICE_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_devids.h b/drivers/net/ethernet/intel/ice/ice_devids.h new file mode 100644 index 000000000000..0e14d7215a6e --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_devids.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2018, Intel Corporation. */ + +#ifndef _ICE_DEVIDS_H_ +#define _ICE_DEVIDS_H_ + +/* Device IDs */ +/* Intel(R) Ethernet Controller C810 for backplane */ +#define ICE_DEV_ID_C810_BACKPLANE 0x1591 +/* Intel(R) Ethernet Controller C810 for QSFP */ +#define ICE_DEV_ID_C810_QSFP 0x1592 +/* Intel(R) Ethernet Controller C810 for SFP */ +#define ICE_DEV_ID_C810_SFP 0x1593 +/* Intel(R) Ethernet Controller C810/X557-AT 10GBASE-T */ +#define ICE_DEV_ID_C810_10G_BASE_T 0x1594 +/* Intel(R) Ethernet Controller C810 1GbE */ +#define ICE_DEV_ID_C810_SGMII 0x1595 + +#endif /* _ICE_DEVIDS_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c new file mode 100644 index 000000000000..3ada230a3fc7 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2018, Intel Corporation. */ + +/* Intel(R) Ethernet Connection E800 Series Linux Driver */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "ice.h" + +#define DRV_VERSION "ice-0.0.1-k" +#define DRV_SUMMARY "Intel(R) Ethernet Connection E800 Series Linux Driver" +static const char ice_drv_ver[] = DRV_VERSION; +static const char ice_driver_string[] = DRV_SUMMARY; +static const char ice_copyright[] = "Copyright (c) 2018, Intel Corporation."; + +MODULE_AUTHOR("Intel Corporation, "); +MODULE_DESCRIPTION(DRV_SUMMARY); +MODULE_LICENSE("GPL"); +MODULE_VERSION(DRV_VERSION); + +static int debug = -1; +module_param(debug, int, 0644); +MODULE_PARM_DESC(debug, "netif message level (0=none,...,0x7FFF=all)"); + +/** + * ice_probe - Device initialization routine + * @pdev: PCI device information struct + * @ent: entry in ice_pci_tbl + * + * Returns 0 on success, negative on failure + */ +static int ice_probe(struct pci_dev *pdev, + const struct pci_device_id __always_unused *ent) +{ + struct ice_pf *pf; + struct ice_hw *hw; + int err; + + /* this driver uses devres, see Documentation/driver-model/devres.txt */ + err = pcim_enable_device(pdev); + if (err) + return err; + + err = pcim_iomap_regions(pdev, BIT(ICE_BAR0), pci_name(pdev)); + if (err) { + dev_err(&pdev->dev, "I/O map error %d\n", err); + return err; + } + + pf = devm_kzalloc(&pdev->dev, sizeof(*pf), GFP_KERNEL); + if (!pf) + return -ENOMEM; + + /* set up for high or low dma */ + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (err) + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "DMA configuration failed: 0x%x\n", err); + return err; + } + + pci_enable_pcie_error_reporting(pdev); + pci_set_master(pdev); + + pf->pdev = pdev; + pci_set_drvdata(pdev, pf); + set_bit(__ICE_DOWN, pf->state); + + hw = &pf->hw; + hw->hw_addr = pcim_iomap_table(pdev)[ICE_BAR0]; + hw->back = pf; + hw->vendor_id = pdev->vendor; + hw->device_id = pdev->device; + pci_read_config_byte(pdev, PCI_REVISION_ID, &hw->revision_id); + hw->subsystem_vendor_id = pdev->subsystem_vendor; + hw->subsystem_device_id = pdev->subsystem_device; + hw->bus.device = PCI_SLOT(pdev->devfn); + hw->bus.func = PCI_FUNC(pdev->devfn); + pf->msg_enable = netif_msg_init(debug, ICE_DFLT_NETIF_M); + + return 0; +} + +/** + * ice_remove - Device removal routine + * @pdev: PCI device information struct + */ +static void ice_remove(struct pci_dev *pdev) +{ + struct ice_pf *pf = pci_get_drvdata(pdev); + + if (!pf) + return; + + set_bit(__ICE_DOWN, pf->state); + pci_disable_pcie_error_reporting(pdev); +} + +/* ice_pci_tbl - PCI Device ID Table + * + * Wildcard entries (PCI_ANY_ID) should come last + * Last entry must be all 0s + * + * { Vendor ID, Device ID, SubVendor ID, SubDevice ID, + * Class, Class Mask, private data (not used) } + */ +static const struct pci_device_id ice_pci_tbl[] = { + { PCI_VDEVICE(INTEL, ICE_DEV_ID_C810_BACKPLANE), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_C810_QSFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_C810_SFP), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_C810_10G_BASE_T), 0 }, + { PCI_VDEVICE(INTEL, ICE_DEV_ID_C810_SGMII), 0 }, + /* required last entry */ + { 0, } +}; +MODULE_DEVICE_TABLE(pci, ice_pci_tbl); + +static struct pci_driver ice_driver = { + .name = KBUILD_MODNAME, + .id_table = ice_pci_tbl, + .probe = ice_probe, + .remove = ice_remove, +}; + +/** + * ice_module_init - Driver registration routine + * + * ice_module_init is the first routine called when the driver is + * loaded. All it does is register with the PCI subsystem. + */ +static int __init ice_module_init(void) +{ + int status; + + pr_info("%s - version %s\n", ice_driver_string, ice_drv_ver); + pr_info("%s\n", ice_copyright); + + status = pci_register_driver(&ice_driver); + if (status) + pr_err("failed to register pci driver, err %d\n", status); + + return status; +} +module_init(ice_module_init); + +/** + * ice_module_exit - Driver exit cleanup routine + * + * ice_module_exit is called just before the driver is removed + * from memory. + */ +static void __exit ice_module_exit(void) +{ + pci_unregister_driver(&ice_driver); + pr_info("module unloaded\n"); +} +module_exit(ice_module_exit); diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h new file mode 100644 index 000000000000..cd018f81b5c7 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_type.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2018, Intel Corporation. */ + +#ifndef _ICE_TYPE_H_ +#define _ICE_TYPE_H_ + +/* Bus parameters */ +struct ice_bus_info { + u16 device; + u8 func; +}; + +/* Port hardware description */ +struct ice_hw { + u8 __iomem *hw_addr; + void *back; + + /* pci info */ + u16 device_id; + u16 vendor_id; + u16 subsystem_device_id; + u16 subsystem_vendor_id; + u8 revision_id; + + struct ice_bus_info bus; +}; + +#endif /* _ICE_TYPE_H_ */ -- cgit v1.2.3 From 8396764154c6365bfeb9cc851b9746511a151255 Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Fri, 23 Mar 2018 21:30:36 +0900 Subject: dt-bindings: net: ave: add PXs3 support Add a compatible string for ethernet controller on UniPhier PXs3 SoC. Signed-off-by: Kunihiko Hayashi Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt b/Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt index 270ea4efff13..96398cc2982f 100644 --- a/Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt +++ b/Documentation/devicetree/bindings/net/socionext,uniphier-ave4.txt @@ -9,6 +9,7 @@ Required properties: - "socionext,uniphier-pxs2-ave4" : for PXs2 SoC - "socionext,uniphier-ld11-ave4" : for LD11 SoC - "socionext,uniphier-ld20-ave4" : for LD20 SoC + - "socionext,uniphier-pxs3-ave4" : for PXs3 SoC - reg: Address where registers are mapped and size of region. - interrupts: Should contain the MAC interrupt. - phy-mode: See ethernet.txt in the same directory. Allow to choose -- cgit v1.2.3 From 93e9ad985038f269838c4c5deab4c84d87c7b5a7 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Fri, 23 Mar 2018 21:24:02 -0700 Subject: ptp: Fix documentation to match code. Ever since commit 3a06c7ac24f9 ("posix-clocks: Remove interval timer facility and mmap/fasync callbacks") the possibility of PHC based posix timers has been removed. In addition it will probably never make sense to implement this functionality. This patch removes the misleading text which seems to suggest that posix timers for PHC devices will ever be a thing. Signed-off-by: Richard Cochran Signed-off-by: David S. Miller --- Documentation/ptp/ptp.txt | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/ptp/ptp.txt b/Documentation/ptp/ptp.txt index ae8fef86b832..11e904ee073f 100644 --- a/Documentation/ptp/ptp.txt +++ b/Documentation/ptp/ptp.txt @@ -18,7 +18,6 @@ - Adjust clock frequency + Ancillary clock features - - One short or periodic alarms, with signal delivery to user program - Time stamp external events - Period output signals configurable from user space - Synchronization of the Linux system time via the PPS subsystem @@ -48,9 +47,7 @@ User space programs may control the clock using standardized ioctls. A program may query, enable, configure, and disable the ancillary clock features. User space can receive time stamped - events via blocking read() and poll(). One shot and periodic - signals may be configured via the POSIX timer_settime() system - call. + events via blocking read() and poll(). ** Writing clock drivers -- cgit v1.2.3 From 75530a78de21267f0dc76fbc67d2253e89233d54 Mon Sep 17 00:00:00 2001 From: Moritz Fischer Date: Tue, 27 Mar 2018 14:43:14 -0700 Subject: dt-bindings: net: Add bindings for National Instruments XGE netdev This adds bindings for the NI XGE 1G/10G network device. Reviewed-by: Rob Herring Signed-off-by: Moritz Fischer Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/nixge.txt | 32 +++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/nixge.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/nixge.txt b/Documentation/devicetree/bindings/net/nixge.txt new file mode 100644 index 000000000000..e55af7f0881a --- /dev/null +++ b/Documentation/devicetree/bindings/net/nixge.txt @@ -0,0 +1,32 @@ +* NI XGE Ethernet controller + +Required properties: +- compatible: Should be "ni,xge-enet-2.00" +- reg: Address and length of the register set for the device +- interrupts: Should contain tx and rx interrupt +- interrupt-names: Should be "rx" and "tx" +- phy-mode: See ethernet.txt file in the same directory. +- phy-handle: See ethernet.txt file in the same directory. +- nvmem-cells: Phandle of nvmem cell containing the MAC address +- nvmem-cell-names: Should be "address" + +Examples (10G generic PHY): + nixge0: ethernet@40000000 { + compatible = "ni,xge-enet-2.00"; + reg = <0x40000000 0x6000>; + + nvmem-cells = <ð1_addr>; + nvmem-cell-names = "address"; + + interrupts = <0 29 IRQ_TYPE_LEVEL_HIGH>, <0 30 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "rx", "tx"; + interrupt-parent = <&intc>; + + phy-mode = "xgmii"; + phy-handle = <ðernet_phy1>; + + ethernet_phy1: ethernet-phy@4 { + compatible = "ethernet-phy-ieee802.3-c45"; + reg = <4>; + }; + }; -- cgit v1.2.3 From 19b351f16fd940092f2daa75773b0320f0785de9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 28 Mar 2018 15:00:43 +0200 Subject: netfilter: add flowtable documentation This patch adds initial documentation for the Netfilter flowtable infrastructure. Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- Documentation/networking/nf_flowtable.txt | 112 ++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 Documentation/networking/nf_flowtable.txt (limited to 'Documentation') diff --git a/Documentation/networking/nf_flowtable.txt b/Documentation/networking/nf_flowtable.txt new file mode 100644 index 000000000000..54128c50d508 --- /dev/null +++ b/Documentation/networking/nf_flowtable.txt @@ -0,0 +1,112 @@ +Netfilter's flowtable infrastructure +==================================== + +This documentation describes the software flowtable infrastructure available in +Netfilter since Linux kernel 4.16. + +Overview +-------- + +Initial packets follow the classic forwarding path, once the flow enters the +established state according to the conntrack semantics (ie. we have seen traffic +in both directions), then you can decide to offload the flow to the flowtable +from the forward chain via the 'flow offload' action available in nftables. + +Packets that find an entry in the flowtable (ie. flowtable hit) are sent to the +output netdevice via neigh_xmit(), hence, they bypass the classic forwarding +path (the visible effect is that you do not see these packets from any of the +netfilter hooks coming after the ingress). In case of flowtable miss, the packet +follows the classic forward path. + +The flowtable uses a resizable hashtable, lookups are based on the following +7-tuple selectors: source, destination, layer 3 and layer 4 protocols, source +and destination ports and the input interface (useful in case there are several +conntrack zones in place). + +Flowtables are populated via the 'flow offload' nftables action, so the user can +selectively specify what flows are placed into the flow table. Hence, packets +follow the classic forwarding path unless the user explicitly instruct packets +to use this new alternative forwarding path via nftables policy. + +This is represented in Fig.1, which describes the classic forwarding path +including the Netfilter hooks and the flowtable fastpath bypass. + + userspace process + ^ | + | | + _____|____ ____\/___ + / \ / \ + | input | | output | + \__________/ \_________/ + ^ | + | | + _________ __________ --------- _____\/_____ + / \ / \ |Routing | / \ + --> ingress ---> prerouting ---> |decision| | postrouting |--> neigh_xmit + \_________/ \__________/ ---------- \____________/ ^ + | ^ | | ^ | + flowtable | | ____\/___ | | + | | | / \ | | + __\/___ | --------->| forward |------------ | + |-----| | \_________/ | + |-----| | 'flow offload' rule | + |-----| | adds entry to | + |_____| | flowtable | + | | | + / \ | | + /hit\_no_| | + \ ? / | + \ / | + |__yes_________________fastpath bypass ____________________________| + + Fig.1 Netfilter hooks and flowtable interactions + +The flowtable entry also stores the NAT configuration, so all packets are +mangled according to the NAT policy that matches the initial packets that went +through the classic forwarding path. The TTL is decremented before calling +neigh_xmit(). Fragmented traffic is passed up to follow the classic forwarding +path given that the transport selectors are missing, therefore flowtable lookup +is not possible. + +Example configuration +--------------------- + +Enabling the flowtable bypass is relatively easy, you only need to create a +flowtable and add one rule to your forward chain. + + table inet x { + flowtable f { + hook ingress priority 0 devices = { eth0, eth1 }; + } + chain y { + type filter hook forward priority 0; policy accept; + ip protocol tcp flow offload @f + counter packets 0 bytes 0 + } + } + +This example adds the flowtable 'f' to the ingress hook of the eth0 and eth1 +netdevices. You can create as many flowtables as you want in case you need to +perform resource partitioning. The flowtable priority defines the order in which +hooks are run in the pipeline, this is convenient in case you already have a +nftables ingress chain (make sure the flowtable priority is smaller than the +nftables ingress chain hence the flowtable runs before in the pipeline). + +The 'flow offload' action from the forward chain 'y' adds an entry to the +flowtable for the TCP syn-ack packet coming in the reply direction. Once the +flow is offloaded, you will observe that the counter rule in the example above +does not get updated for the packets that are being forwarded through the +forwarding bypass. + +More reading +------------ + +This documentation is based on the LWN.net articles [1][2]. Rafal Milecki also +made a very complete and comprehensive summary called "A state of network +acceleration" that describes how things were before this infrastructure was +mailined [3] and it also makes a rough summary of this work [4]. + +[1] https://lwn.net/Articles/738214/ +[2] https://lwn.net/Articles/742164/ +[3] http://lists.infradead.org/pipermail/lede-dev/2018-January/010830.html +[4] http://lists.infradead.org/pipermail/lede-dev/2018-January/010829.html -- cgit v1.2.3 From 9217e566bdee4583d0a9ea4879c8f5e004886eac Mon Sep 17 00:00:00 2001 From: Mike Looijmans Date: Thu, 29 Mar 2018 07:29:48 +0200 Subject: of_net: Implement of_get_nvmem_mac_address helper It's common practice to store MAC addresses for network interfaces into nvmem devices. However the code to actually do this in the kernel lacks, so this patch adds of_get_nvmem_mac_address() for drivers to obtain the address from an nvmem cell provider. This is particulary useful on devices where the ethernet interface cannot be configured by the bootloader, for example because it's in an FPGA. Signed-off-by: Mike Looijmans Reviewed-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/ethernet.txt | 2 ++ drivers/of/of_net.c | 40 ++++++++++++++++++++++ include/linux/of_net.h | 6 ++++ 3 files changed, 48 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/ethernet.txt b/Documentation/devicetree/bindings/net/ethernet.txt index 2974e63ba311..cfc376bc977a 100644 --- a/Documentation/devicetree/bindings/net/ethernet.txt +++ b/Documentation/devicetree/bindings/net/ethernet.txt @@ -10,6 +10,8 @@ Documentation/devicetree/bindings/phy/phy-bindings.txt. the boot program; should be used in cases where the MAC address assigned to the device by the boot program is different from the "local-mac-address" property; +- nvmem-cells: phandle, reference to an nvmem node for the MAC address; +- nvmem-cell-names: string, should be "mac-address" if nvmem is to be used; - max-speed: number, specifies maximum speed in Mbit/s supported by the device; - max-frame-size: number, maximum transfer unit (IEEE defined MTU), rather than the maximum frame size (there's contradiction in the Devicetree diff --git a/drivers/of/of_net.c b/drivers/of/of_net.c index d820f3edd431..53189d4022a6 100644 --- a/drivers/of/of_net.c +++ b/drivers/of/of_net.c @@ -7,6 +7,7 @@ */ #include #include +#include #include #include #include @@ -80,3 +81,42 @@ const void *of_get_mac_address(struct device_node *np) return of_get_mac_addr(np, "address"); } EXPORT_SYMBOL(of_get_mac_address); + +/** + * Obtain the MAC address from an nvmem provider named 'mac-address' through + * device tree. + * On success, copies the new address into memory pointed to by addr and + * returns 0. Returns a negative error code otherwise. + * @np: Device tree node containing the nvmem-cells phandle + * @addr: Pointer to receive the MAC address using ether_addr_copy() + */ +int of_get_nvmem_mac_address(struct device_node *np, void *addr) +{ + struct nvmem_cell *cell; + const void *mac; + size_t len; + int ret; + + cell = of_nvmem_cell_get(np, "mac-address"); + if (IS_ERR(cell)) + return PTR_ERR(cell); + + mac = nvmem_cell_read(cell, &len); + + nvmem_cell_put(cell); + + if (IS_ERR(mac)) + return PTR_ERR(mac); + + if (len < ETH_ALEN || !is_valid_ether_addr(mac)) { + ret = -EINVAL; + } else { + ether_addr_copy(addr, mac); + ret = 0; + } + + kfree(mac); + + return ret; +} +EXPORT_SYMBOL(of_get_nvmem_mac_address); diff --git a/include/linux/of_net.h b/include/linux/of_net.h index 9cd72aab76fe..90d81ee9e6a0 100644 --- a/include/linux/of_net.h +++ b/include/linux/of_net.h @@ -13,6 +13,7 @@ struct net_device; extern int of_get_phy_mode(struct device_node *np); extern const void *of_get_mac_address(struct device_node *np); +extern int of_get_nvmem_mac_address(struct device_node *np, void *addr); extern struct net_device *of_find_net_device_by_node(struct device_node *np); #else static inline int of_get_phy_mode(struct device_node *np) @@ -25,6 +26,11 @@ static inline const void *of_get_mac_address(struct device_node *np) return NULL; } +static inline int of_get_nvmem_mac_address(struct device_node *np, void *addr) +{ + return -ENODEV; +} + static inline struct net_device *of_find_net_device_by_node(struct device_node *np) { return NULL; -- cgit v1.2.3 From 2f0aaf7fb11c90645bbda447c1d26f5b0b04e984 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 29 Mar 2018 11:02:25 +0200 Subject: Documentation: ip-sysctl.txt: clarify disable_ipv6 Clarify that when disable_ipv6 is enabled even the ipv6 routes are deleted for the selected interface and from now it will not be possible to add addresses/routes to that interface Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 1d1120753ae8..33f35f049ad5 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1703,7 +1703,9 @@ disable_ipv6 - BOOLEAN interface and start Duplicate Address Detection, if necessary. When this value is changed from 0 to 1 (IPv6 is being disabled), - it will dynamically delete all address on the given interface. + it will dynamically delete all addresses and routes on the given + interface. From now on it will not possible to add addresses/routes + to the selected interface. accept_dad - INTEGER Whether to accept DAD (Duplicate Address Detection). -- cgit v1.2.3 From 9b85756341c53fd13b4b5e25104c22849274cd0d Mon Sep 17 00:00:00 2001 From: Biju Das Date: Thu, 29 Mar 2018 11:02:55 +0100 Subject: dt-bindings: net: renesas-ravb: Add support for r8a77470 SoC Add a new compatible string for the RZ/G1C (R8A77470) SoC. Signed-off-by: Biju Das Reviewed-by: Fabrizio Castro Acked-by: Sergei Shtylyov Reviewed-by: Geert Uytterhoeven Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/renesas,ravb.txt | 1 + 1 file changed, 1 insertion(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/renesas,ravb.txt b/Documentation/devicetree/bindings/net/renesas,ravb.txt index b4dc455eb155..c306f55d335b 100644 --- a/Documentation/devicetree/bindings/net/renesas,ravb.txt +++ b/Documentation/devicetree/bindings/net/renesas,ravb.txt @@ -7,6 +7,7 @@ Required properties: - compatible: Must contain one or more of the following: - "renesas,etheravb-r8a7743" for the R8A7743 SoC. - "renesas,etheravb-r8a7745" for the R8A7745 SoC. + - "renesas,etheravb-r8a77470" for the R8A77470 SoC. - "renesas,etheravb-r8a7790" for the R8A7790 SoC. - "renesas,etheravb-r8a7791" for the R8A7791 SoC. - "renesas,etheravb-r8a7792" for the R8A7792 SoC. -- cgit v1.2.3 From a5af1fb94ffcad1aff3bc7f8a78dae7ee266f31c Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Fri, 30 Mar 2018 01:00:34 +0200 Subject: dt-bindings: net: meson-dwmac: add support for the Meson8m2 SoC The Meson8m2 SoC uses a similar (potentially even identical) register layout for the dwmac glue as Meson8b and GXBB. Unfortunately there is no documentation available. Testing shows that both, RMII and RGMII PHYs are working if they are configured as on Meson8b. Add a new compatible string to the documentation so differences (if there are any) between Meson8m2 and the other SoCs can be taken care of within the driver. Signed-off-by: Martin Blumenstingl Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/meson-dwmac.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/meson-dwmac.txt b/Documentation/devicetree/bindings/net/meson-dwmac.txt index 354dd9896bb5..61cada22ae6c 100644 --- a/Documentation/devicetree/bindings/net/meson-dwmac.txt +++ b/Documentation/devicetree/bindings/net/meson-dwmac.txt @@ -9,6 +9,7 @@ Required properties on all platforms: - compatible: Depending on the platform this should be one of: - "amlogic,meson6-dwmac" - "amlogic,meson8b-dwmac" + - "amlogic,meson8m2-dwmac" - "amlogic,meson-gxbb-dwmac" Additionally "snps,dwmac" and any applicable more detailed version number described in net/stmmac.txt @@ -19,13 +20,13 @@ Required properties on all platforms: configuration (for example the PRG_ETHERNET register range on Meson8b and newer) -Required properties on Meson8b and newer: +Required properties on Meson8b, Meson8m2, GXBB and newer: - clock-names: Should contain the following: - "stmmaceth" - see stmmac.txt - "clkin0" - first parent clock of the internal mux - "clkin1" - second parent clock of the internal mux -Optional properties on Meson8b and newer: +Optional properties on Meson8b, Meson8m2, GXBB and newer: - amlogic,tx-delay-ns: The internal RGMII TX clock delay (provided by this driver) in nanoseconds. Allowed values are: 0ns, 2ns, 4ns, 6ns. -- cgit v1.2.3 From 648700f76b03b7e8149d13cc2bdb3355035258a9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:49 -0700 Subject: inet: frags: use rhashtables for reassembly units Some applications still rely on IP fragmentation, and to be fair linux reassembly unit is not working under any serious load. It uses static hash tables of 1024 buckets, and up to 128 items per bucket (!!!) A work queue is supposed to garbage collect items when host is under memory pressure, and doing a hash rebuild, changing seed used in hash computations. This work queue blocks softirqs for up to 25 ms when doing a hash rebuild, occurring every 5 seconds if host is under fire. Then there is the problem of sharing this hash table for all netns. It is time to switch to rhashtables, and allocate one of them per netns to speedup netns dismantle, since this is a critical metric these days. Lookup is now using RCU. A followup patch will even remove the refcount hold/release left from prior implementation and save a couple of atomic operations. Before this patch, 16 cpus (16 RX queue NIC) could not handle more than 1 Mpps frags DDOS. After the patch, I reach 9 Mpps without any tuning, and can use up to 2GB of storage for the fragments (exact number depends on frags being evicted after timeout) $ grep FRAG /proc/net/sockstat FRAG: inuse 1966916 memory 2140004608 A followup patch will change the limits for 64bit arches. Signed-off-by: Eric Dumazet Cc: Kirill Tkhai Cc: Herbert Xu Cc: Florian Westphal Cc: Jesper Dangaard Brouer Cc: Alexander Aring Cc: Stefan Schmidt Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 7 +- include/net/inet_frag.h | 81 ++++---- include/net/ipv6.h | 16 +- net/ieee802154/6lowpan/6lowpan_i.h | 26 +-- net/ieee802154/6lowpan/reassembly.c | 91 ++++----- net/ipv4/inet_fragment.c | 344 ++++++-------------------------- net/ipv4/ip_fragment.c | 112 +++++------ net/ipv6/netfilter/nf_conntrack_reasm.c | 51 ++--- net/ipv6/reassembly.c | 110 +++++----- 9 files changed, 265 insertions(+), 573 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 33f35f049ad5..6f2a3670e44b 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -134,13 +134,10 @@ min_adv_mss - INTEGER IP Fragmentation: ipfrag_high_thresh - INTEGER - Maximum memory used to reassemble IP fragments. When - ipfrag_high_thresh bytes of memory is allocated for this purpose, - the fragment handler will toss packets until ipfrag_low_thresh - is reached. This also serves as a maximum limit to namespaces - different from the initial one. + Maximum memory used to reassemble IP fragments. ipfrag_low_thresh - INTEGER + (Obsolete since linux-4.17) Maximum memory used to reassemble IP fragments before the kernel begins to remove incomplete fragment queues to free up resources. The kernel still accepts new fragments for defragmentation. diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 69e531ed8189..3fec0d3a0d01 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -2,7 +2,11 @@ #ifndef __NET_FRAG_H__ #define __NET_FRAG_H__ +#include + struct netns_frags { + struct rhashtable rhashtable ____cacheline_aligned_in_smp; + /* Keep atomic mem on separate cachelines in structs that include it */ atomic_t mem ____cacheline_aligned_in_smp; /* sysctls */ @@ -26,12 +30,30 @@ enum { INET_FRAG_COMPLETE = BIT(2), }; +struct frag_v4_compare_key { + __be32 saddr; + __be32 daddr; + u32 user; + u32 vif; + __be16 id; + u16 protocol; +}; + +struct frag_v6_compare_key { + struct in6_addr saddr; + struct in6_addr daddr; + u32 user; + __be32 id; + u32 iif; +}; + /** * struct inet_frag_queue - fragment queue * - * @lock: spinlock protecting the queue + * @node: rhash node + * @key: keys identifying this frag. * @timer: queue expiration timer - * @list: hash bucket list + * @lock: spinlock protecting this frag * @refcnt: reference count of the queue * @fragments: received fragments head * @fragments_tail: received fragments tail @@ -41,12 +63,16 @@ enum { * @flags: fragment queue flags * @max_size: maximum received fragment size * @net: namespace that this frag belongs to - * @list_evictor: list of queues to forcefully evict (e.g. due to low memory) + * @rcu: rcu head for freeing deferall */ struct inet_frag_queue { - spinlock_t lock; + struct rhash_head node; + union { + struct frag_v4_compare_key v4; + struct frag_v6_compare_key v6; + } key; struct timer_list timer; - struct hlist_node list; + spinlock_t lock; refcount_t refcnt; struct sk_buff *fragments; struct sk_buff *fragments_tail; @@ -55,51 +81,20 @@ struct inet_frag_queue { int meat; __u8 flags; u16 max_size; - struct netns_frags *net; - struct hlist_node list_evictor; -}; - -#define INETFRAGS_HASHSZ 1024 - -/* averaged: - * max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ / - * rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or - * struct frag_queue)) - */ -#define INETFRAGS_MAXDEPTH 128 - -struct inet_frag_bucket { - struct hlist_head chain; - spinlock_t chain_lock; + struct netns_frags *net; + struct rcu_head rcu; }; struct inet_frags { - struct inet_frag_bucket hash[INETFRAGS_HASHSZ]; - - struct work_struct frags_work; - unsigned int next_bucket; - unsigned long last_rebuild_jiffies; - bool rebuild; - - /* The first call to hashfn is responsible to initialize - * rnd. This is best done with net_get_random_once. - * - * rnd_seqlock is used to let hash insertion detect - * when it needs to re-lookup the hash chain to use. - */ - u32 rnd; - seqlock_t rnd_seqlock; unsigned int qsize; - unsigned int (*hashfn)(const struct inet_frag_queue *); - bool (*match)(const struct inet_frag_queue *q, - const void *arg); void (*constructor)(struct inet_frag_queue *q, const void *arg); void (*destructor)(struct inet_frag_queue *); void (*frag_expire)(struct timer_list *t); struct kmem_cache *frags_cachep; const char *frags_cache_name; + struct rhashtable_params rhash_params; }; int inet_frags_init(struct inet_frags *); @@ -108,15 +103,13 @@ void inet_frags_fini(struct inet_frags *); static inline int inet_frags_init_net(struct netns_frags *nf) { atomic_set(&nf->mem, 0); - return 0; + return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params); } void inet_frags_exit_net(struct netns_frags *nf); void inet_frag_kill(struct inet_frag_queue *q); void inet_frag_destroy(struct inet_frag_queue *q); -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, - struct inet_frags *f, void *key, unsigned int hash); - +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key); void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, const char *prefix); @@ -128,7 +121,7 @@ static inline void inet_frag_put(struct inet_frag_queue *q) static inline bool inet_frag_evicting(struct inet_frag_queue *q) { - return !hlist_unhashed(&q->list_evictor); + return false; } /* Memory Tracking Functions. */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 57b7fe43d2ab..6fa9a2bc5896 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -579,17 +579,8 @@ enum ip6_defrag_users { __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, }; -struct ip6_create_arg { - __be32 id; - u32 user; - const struct in6_addr *src; - const struct in6_addr *dst; - int iif; - u8 ecn; -}; - void ip6_frag_init(struct inet_frag_queue *q, const void *a); -bool ip6_frag_match(const struct inet_frag_queue *q, const void *a); +extern const struct rhashtable_params ip6_rhash_params; /* * Equivalent of ipv4 struct ip @@ -597,11 +588,6 @@ bool ip6_frag_match(const struct inet_frag_queue *q, const void *a); struct frag_queue { struct inet_frag_queue q; - __be32 id; /* fragment id */ - u32 user; - struct in6_addr saddr; - struct in6_addr daddr; - int iif; __u16 nhoffset; u8 ecn; diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h index d8de3bcfb103..b8d95cb71c25 100644 --- a/net/ieee802154/6lowpan/6lowpan_i.h +++ b/net/ieee802154/6lowpan/6lowpan_i.h @@ -17,37 +17,19 @@ typedef unsigned __bitwise lowpan_rx_result; #define LOWPAN_DISPATCH_FRAG1 0xc0 #define LOWPAN_DISPATCH_FRAGN 0xe0 -struct lowpan_create_arg { +struct frag_lowpan_compare_key { u16 tag; u16 d_size; - const struct ieee802154_addr *src; - const struct ieee802154_addr *dst; + const struct ieee802154_addr src; + const struct ieee802154_addr dst; }; -/* Equivalent of ipv4 struct ip +/* Equivalent of ipv4 struct ipq */ struct lowpan_frag_queue { struct inet_frag_queue q; - - u16 tag; - u16 d_size; - struct ieee802154_addr saddr; - struct ieee802154_addr daddr; }; -static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a) -{ - switch (a->mode) { - case IEEE802154_ADDR_LONG: - return (((__force u64)a->extended_addr) >> 32) ^ - (((__force u64)a->extended_addr) & 0xffffffff); - case IEEE802154_ADDR_SHORT: - return (__force u32)(a->short_addr + (a->pan_id << 16)); - default: - return 0; - } -} - int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type); void lowpan_net_frag_exit(void); int lowpan_net_frag_init(void); diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index ddada12a044d..0fa0121f85d4 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -37,47 +37,15 @@ static struct inet_frags lowpan_frags; static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev, struct net_device *ldev); -static unsigned int lowpan_hash_frag(u16 tag, u16 d_size, - const struct ieee802154_addr *saddr, - const struct ieee802154_addr *daddr) -{ - net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd)); - return jhash_3words(ieee802154_addr_hash(saddr), - ieee802154_addr_hash(daddr), - (__force u32)(tag + (d_size << 16)), - lowpan_frags.rnd); -} - -static unsigned int lowpan_hashfn(const struct inet_frag_queue *q) -{ - const struct lowpan_frag_queue *fq; - - fq = container_of(q, struct lowpan_frag_queue, q); - return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr); -} - -static bool lowpan_frag_match(const struct inet_frag_queue *q, const void *a) -{ - const struct lowpan_frag_queue *fq; - const struct lowpan_create_arg *arg = a; - - fq = container_of(q, struct lowpan_frag_queue, q); - return fq->tag == arg->tag && fq->d_size == arg->d_size && - ieee802154_addr_equal(&fq->saddr, arg->src) && - ieee802154_addr_equal(&fq->daddr, arg->dst); -} - static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) { - const struct lowpan_create_arg *arg = a; + const struct frag_lowpan_compare_key *key = a; struct lowpan_frag_queue *fq; fq = container_of(q, struct lowpan_frag_queue, q); - fq->tag = arg->tag; - fq->d_size = arg->d_size; - fq->saddr = *arg->src; - fq->daddr = *arg->dst; + BUILD_BUG_ON(sizeof(*key) > sizeof(q->key)); + memcpy(&q->key, key, sizeof(*key)); } static void lowpan_frag_expire(struct timer_list *t) @@ -105,21 +73,17 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb, const struct ieee802154_addr *src, const struct ieee802154_addr *dst) { - struct inet_frag_queue *q; - struct lowpan_create_arg arg; - unsigned int hash; struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); + struct frag_lowpan_compare_key key = { + .tag = cb->d_tag, + .d_size = cb->d_size, + .src = *src, + .dst = *dst, + }; + struct inet_frag_queue *q; - arg.tag = cb->d_tag; - arg.d_size = cb->d_size; - arg.src = src; - arg.dst = dst; - - hash = lowpan_hash_frag(cb->d_tag, cb->d_size, src, dst); - - q = inet_frag_find(&ieee802154_lowpan->frags, - &lowpan_frags, &arg, hash); + q = inet_frag_find(&ieee802154_lowpan->frags, &key); if (IS_ERR_OR_NULL(q)) { inet_frag_maybe_warn_overflow(q, pr_fmt()); return NULL; @@ -611,17 +575,46 @@ static struct pernet_operations lowpan_frags_ops = { .exit = lowpan_frags_exit_net, }; +static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed) +{ + return jhash2(data, + sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed); +} + +static u32 lowpan_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct inet_frag_queue *fq = data; + + return jhash2((const u32 *)&fq->key, + sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed); +} + +static int lowpan_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) +{ + const struct frag_lowpan_compare_key *key = arg->key; + const struct inet_frag_queue *fq = ptr; + + return !!memcmp(&fq->key, key, sizeof(*key)); +} + +static const struct rhashtable_params lowpan_rhash_params = { + .head_offset = offsetof(struct inet_frag_queue, node), + .hashfn = lowpan_key_hashfn, + .obj_hashfn = lowpan_obj_hashfn, + .obj_cmpfn = lowpan_obj_cmpfn, + .automatic_shrinking = true, +}; + int __init lowpan_net_frag_init(void) { int ret; - lowpan_frags.hashfn = lowpan_hashfn; lowpan_frags.constructor = lowpan_frag_init; lowpan_frags.destructor = NULL; lowpan_frags.qsize = sizeof(struct frag_queue); - lowpan_frags.match = lowpan_frag_match; lowpan_frags.frag_expire = lowpan_frag_expire; lowpan_frags.frags_cache_name = lowpan_frags_cache_name; + lowpan_frags.rhash_params = lowpan_rhash_params; ret = inet_frags_init(&lowpan_frags); if (ret) goto out; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 1ac69f65d0de..ebb8f411e0db 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -25,12 +25,6 @@ #include #include -#define INETFRAGS_EVICT_BUCKETS 128 -#define INETFRAGS_EVICT_MAX 512 - -/* don't rebuild inetfrag table with new secret more often than this */ -#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ) - /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements * Value : 0xff if frame should be dropped. * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field @@ -52,157 +46,8 @@ const u8 ip_frag_ecn_table[16] = { }; EXPORT_SYMBOL(ip_frag_ecn_table); -static unsigned int -inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q) -{ - return f->hashfn(q) & (INETFRAGS_HASHSZ - 1); -} - -static bool inet_frag_may_rebuild(struct inet_frags *f) -{ - return time_after(jiffies, - f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL); -} - -static void inet_frag_secret_rebuild(struct inet_frags *f) -{ - int i; - - write_seqlock_bh(&f->rnd_seqlock); - - if (!inet_frag_may_rebuild(f)) - goto out; - - get_random_bytes(&f->rnd, sizeof(u32)); - - for (i = 0; i < INETFRAGS_HASHSZ; i++) { - struct inet_frag_bucket *hb; - struct inet_frag_queue *q; - struct hlist_node *n; - - hb = &f->hash[i]; - spin_lock(&hb->chain_lock); - - hlist_for_each_entry_safe(q, n, &hb->chain, list) { - unsigned int hval = inet_frag_hashfn(f, q); - - if (hval != i) { - struct inet_frag_bucket *hb_dest; - - hlist_del(&q->list); - - /* Relink to new hash chain. */ - hb_dest = &f->hash[hval]; - - /* This is the only place where we take - * another chain_lock while already holding - * one. As this will not run concurrently, - * we cannot deadlock on hb_dest lock below, if its - * already locked it will be released soon since - * other caller cannot be waiting for hb lock - * that we've taken above. - */ - spin_lock_nested(&hb_dest->chain_lock, - SINGLE_DEPTH_NESTING); - hlist_add_head(&q->list, &hb_dest->chain); - spin_unlock(&hb_dest->chain_lock); - } - } - spin_unlock(&hb->chain_lock); - } - - f->rebuild = false; - f->last_rebuild_jiffies = jiffies; -out: - write_sequnlock_bh(&f->rnd_seqlock); -} - -static bool inet_fragq_should_evict(const struct inet_frag_queue *q) -{ - if (!hlist_unhashed(&q->list_evictor)) - return false; - - return q->net->low_thresh == 0 || - frag_mem_limit(q->net) >= q->net->low_thresh; -} - -static unsigned int -inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb) -{ - struct inet_frag_queue *fq; - struct hlist_node *n; - unsigned int evicted = 0; - HLIST_HEAD(expired); - - spin_lock(&hb->chain_lock); - - hlist_for_each_entry_safe(fq, n, &hb->chain, list) { - if (!inet_fragq_should_evict(fq)) - continue; - - if (!del_timer(&fq->timer)) - continue; - - hlist_add_head(&fq->list_evictor, &expired); - ++evicted; - } - - spin_unlock(&hb->chain_lock); - - hlist_for_each_entry_safe(fq, n, &expired, list_evictor) - f->frag_expire(&fq->timer); - - return evicted; -} - -static void inet_frag_worker(struct work_struct *work) -{ - unsigned int budget = INETFRAGS_EVICT_BUCKETS; - unsigned int i, evicted = 0; - struct inet_frags *f; - - f = container_of(work, struct inet_frags, frags_work); - - BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ); - - local_bh_disable(); - - for (i = READ_ONCE(f->next_bucket); budget; --budget) { - evicted += inet_evict_bucket(f, &f->hash[i]); - i = (i + 1) & (INETFRAGS_HASHSZ - 1); - if (evicted > INETFRAGS_EVICT_MAX) - break; - } - - f->next_bucket = i; - - local_bh_enable(); - - if (f->rebuild && inet_frag_may_rebuild(f)) - inet_frag_secret_rebuild(f); -} - -static void inet_frag_schedule_worker(struct inet_frags *f) -{ - if (unlikely(!work_pending(&f->frags_work))) - schedule_work(&f->frags_work); -} - int inet_frags_init(struct inet_frags *f) { - int i; - - INIT_WORK(&f->frags_work, inet_frag_worker); - - for (i = 0; i < INETFRAGS_HASHSZ; i++) { - struct inet_frag_bucket *hb = &f->hash[i]; - - spin_lock_init(&hb->chain_lock); - INIT_HLIST_HEAD(&hb->chain); - } - - seqlock_init(&f->rnd_seqlock); - f->last_rebuild_jiffies = 0; f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, NULL); if (!f->frags_cachep) @@ -214,66 +59,42 @@ EXPORT_SYMBOL(inet_frags_init); void inet_frags_fini(struct inet_frags *f) { - cancel_work_sync(&f->frags_work); + /* We must wait that all inet_frag_destroy_rcu() have completed. */ + rcu_barrier(); + kmem_cache_destroy(f->frags_cachep); + f->frags_cachep = NULL; } EXPORT_SYMBOL(inet_frags_fini); -void inet_frags_exit_net(struct netns_frags *nf) -{ - struct inet_frags *f =nf->f; - unsigned int seq; - int i; - - nf->low_thresh = 0; - -evict_again: - local_bh_disable(); - seq = read_seqbegin(&f->rnd_seqlock); - - for (i = 0; i < INETFRAGS_HASHSZ ; i++) - inet_evict_bucket(f, &f->hash[i]); - - local_bh_enable(); - cond_resched(); - - if (read_seqretry(&f->rnd_seqlock, seq) || - sum_frag_mem_limit(nf)) - goto evict_again; -} -EXPORT_SYMBOL(inet_frags_exit_net); - -static struct inet_frag_bucket * -get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f) -__acquires(hb->chain_lock) +static void inet_frags_free_cb(void *ptr, void *arg) { - struct inet_frag_bucket *hb; - unsigned int seq, hash; + struct inet_frag_queue *fq = ptr; - restart: - seq = read_seqbegin(&f->rnd_seqlock); - - hash = inet_frag_hashfn(f, fq); - hb = &f->hash[hash]; + /* If we can not cancel the timer, it means this frag_queue + * is already disappearing, we have nothing to do. + * Otherwise, we own a refcount until the end of this function. + */ + if (!del_timer(&fq->timer)) + return; - spin_lock(&hb->chain_lock); - if (read_seqretry(&f->rnd_seqlock, seq)) { - spin_unlock(&hb->chain_lock); - goto restart; + spin_lock_bh(&fq->lock); + if (!(fq->flags & INET_FRAG_COMPLETE)) { + fq->flags |= INET_FRAG_COMPLETE; + refcount_dec(&fq->refcnt); } + spin_unlock_bh(&fq->lock); - return hb; + inet_frag_put(fq); } -static inline void fq_unlink(struct inet_frag_queue *fq) +void inet_frags_exit_net(struct netns_frags *nf) { - struct inet_frag_bucket *hb; + nf->low_thresh = 0; /* prevent creation of new frags */ - hb = get_frag_bucket_locked(fq, fq->net->f); - hlist_del(&fq->list); - fq->flags |= INET_FRAG_COMPLETE; - spin_unlock(&hb->chain_lock); + rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL); } +EXPORT_SYMBOL(inet_frags_exit_net); void inet_frag_kill(struct inet_frag_queue *fq) { @@ -281,12 +102,26 @@ void inet_frag_kill(struct inet_frag_queue *fq) refcount_dec(&fq->refcnt); if (!(fq->flags & INET_FRAG_COMPLETE)) { - fq_unlink(fq); + struct netns_frags *nf = fq->net; + + fq->flags |= INET_FRAG_COMPLETE; + rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params); refcount_dec(&fq->refcnt); } } EXPORT_SYMBOL(inet_frag_kill); +static void inet_frag_destroy_rcu(struct rcu_head *head) +{ + struct inet_frag_queue *q = container_of(head, struct inet_frag_queue, + rcu); + struct inet_frags *f = q->net->f; + + if (f->destructor) + f->destructor(q); + kmem_cache_free(f->frags_cachep, q); +} + void inet_frag_destroy(struct inet_frag_queue *q) { struct sk_buff *fp; @@ -310,59 +145,20 @@ void inet_frag_destroy(struct inet_frag_queue *q) } sum = sum_truesize + f->qsize; - if (f->destructor) - f->destructor(q); - kmem_cache_free(f->frags_cachep, q); + call_rcu(&q->rcu, inet_frag_destroy_rcu); sub_frag_mem_limit(nf, sum); } EXPORT_SYMBOL(inet_frag_destroy); -static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, - struct inet_frag_queue *qp_in, - struct inet_frags *f, - void *arg) -{ - struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f); - struct inet_frag_queue *qp; - -#ifdef CONFIG_SMP - /* With SMP race we have to recheck hash table, because - * such entry could have been created on other cpu before - * we acquired hash bucket lock. - */ - hlist_for_each_entry(qp, &hb->chain, list) { - if (qp->net == nf && f->match(qp, arg)) { - refcount_inc(&qp->refcnt); - spin_unlock(&hb->chain_lock); - qp_in->flags |= INET_FRAG_COMPLETE; - inet_frag_put(qp_in); - return qp; - } - } -#endif - qp = qp_in; - if (!mod_timer(&qp->timer, jiffies + nf->timeout)) - refcount_inc(&qp->refcnt); - - refcount_inc(&qp->refcnt); - hlist_add_head(&qp->list, &hb->chain); - - spin_unlock(&hb->chain_lock); - - return qp; -} - static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, struct inet_frags *f, void *arg) { struct inet_frag_queue *q; - if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) { - inet_frag_schedule_worker(f); + if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) return NULL; - } q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); if (!q) @@ -374,59 +170,52 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); - refcount_set(&q->refcnt, 1); + refcount_set(&q->refcnt, 3); return q; } static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, - struct inet_frags *f, void *arg) { + struct inet_frags *f = nf->f; struct inet_frag_queue *q; + int err; q = inet_frag_alloc(nf, f, arg); if (!q) return NULL; - return inet_frag_intern(nf, q, f, arg); + mod_timer(&q->timer, jiffies + nf->timeout); + + err = rhashtable_insert_fast(&nf->rhashtable, &q->node, + f->rhash_params); + if (err < 0) { + q->flags |= INET_FRAG_COMPLETE; + inet_frag_kill(q); + inet_frag_destroy(q); + return NULL; + } + return q; } -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, - struct inet_frags *f, void *key, - unsigned int hash) +/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) { - struct inet_frag_bucket *hb; - struct inet_frag_queue *q; - int depth = 0; - - if (frag_mem_limit(nf) > nf->low_thresh) - inet_frag_schedule_worker(f); - - hash &= (INETFRAGS_HASHSZ - 1); - hb = &f->hash[hash]; - - spin_lock(&hb->chain_lock); - hlist_for_each_entry(q, &hb->chain, list) { - if (q->net == nf && f->match(q, key)) { - refcount_inc(&q->refcnt); - spin_unlock(&hb->chain_lock); - return q; - } - depth++; - } - spin_unlock(&hb->chain_lock); + struct inet_frag_queue *fq; - if (depth <= INETFRAGS_MAXDEPTH) - return inet_frag_create(nf, f, key); + rcu_read_lock(); - if (inet_frag_may_rebuild(f)) { - if (!f->rebuild) - f->rebuild = true; - inet_frag_schedule_worker(f); + fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); + if (fq) { + if (!refcount_inc_not_zero(&fq->refcnt)) + fq = NULL; + rcu_read_unlock(); + return fq; } + rcu_read_unlock(); - return ERR_PTR(-ENOBUFS); + return inet_frag_create(nf, key); } EXPORT_SYMBOL(inet_frag_find); @@ -434,8 +223,7 @@ void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, const char *prefix) { static const char msg[] = "inet_frag_find: Fragment hash bucket" - " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH) - ". Dropping fragment.\n"; + " list length grew over limit. Dropping fragment.\n"; if (PTR_ERR(q) == -ENOBUFS) net_dbg_ratelimited("%s%s", prefix, msg); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 1a3bc85d6f5e..4021820db6f2 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -69,15 +69,9 @@ struct ipfrag_skb_cb struct ipq { struct inet_frag_queue q; - u32 user; - __be32 saddr; - __be32 daddr; - __be16 id; - u8 protocol; u8 ecn; /* RFC3168 support */ u16 max_df_size; /* largest frag with DF set seen */ int iif; - int vif; /* L3 master device index */ unsigned int rid; struct inet_peer *peer; }; @@ -97,41 +91,6 @@ int ip_frag_mem(struct net *net) static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, struct net_device *dev); -struct ip4_create_arg { - struct iphdr *iph; - u32 user; - int vif; -}; - -static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) -{ - net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd)); - return jhash_3words((__force u32)id << 16 | prot, - (__force u32)saddr, (__force u32)daddr, - ip4_frags.rnd); -} - -static unsigned int ip4_hashfn(const struct inet_frag_queue *q) -{ - const struct ipq *ipq; - - ipq = container_of(q, struct ipq, q); - return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); -} - -static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a) -{ - const struct ipq *qp; - const struct ip4_create_arg *arg = a; - - qp = container_of(q, struct ipq, q); - return qp->id == arg->iph->id && - qp->saddr == arg->iph->saddr && - qp->daddr == arg->iph->daddr && - qp->protocol == arg->iph->protocol && - qp->user == arg->user && - qp->vif == arg->vif; -} static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { @@ -140,17 +99,12 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a) frags); struct net *net = container_of(ipv4, struct net, ipv4); - const struct ip4_create_arg *arg = a; + const struct frag_v4_compare_key *key = a; - qp->protocol = arg->iph->protocol; - qp->id = arg->iph->id; - qp->ecn = ip4_frag_ecn(arg->iph->tos); - qp->saddr = arg->iph->saddr; - qp->daddr = arg->iph->daddr; - qp->vif = arg->vif; - qp->user = arg->user; + q->key.v4 = *key; + qp->ecn = 0; qp->peer = q->net->max_dist ? - inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) : + inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) : NULL; } @@ -234,7 +188,7 @@ static void ip_expire(struct timer_list *t) /* Only an end host needs to send an ICMP * "Fragment Reassembly Timeout" message, per RFC792. */ - if (frag_expire_skip_icmp(qp->user) && + if (frag_expire_skip_icmp(qp->q.key.v4.user) && (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out; @@ -262,17 +216,17 @@ out_rcu_unlock: static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user, int vif) { + struct frag_v4_compare_key key = { + .saddr = iph->saddr, + .daddr = iph->daddr, + .user = user, + .vif = vif, + .id = iph->id, + .protocol = iph->protocol, + }; struct inet_frag_queue *q; - struct ip4_create_arg arg; - unsigned int hash; - - arg.iph = iph; - arg.user = user; - arg.vif = vif; - hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); - - q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); + q = inet_frag_find(&net->ipv4.frags, &key); if (IS_ERR_OR_NULL(q)) { inet_frag_maybe_warn_overflow(q, pr_fmt()); return NULL; @@ -656,7 +610,7 @@ out_nomem: err = -ENOMEM; goto out_fail; out_oversize: - net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr); + net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr); out_fail: __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); return err; @@ -894,15 +848,47 @@ static struct pernet_operations ip4_frags_ops = { .exit = ipv4_frags_exit_net, }; + +static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed) +{ + return jhash2(data, + sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); +} + +static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct inet_frag_queue *fq = data; + + return jhash2((const u32 *)&fq->key.v4, + sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); +} + +static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) +{ + const struct frag_v4_compare_key *key = arg->key; + const struct inet_frag_queue *fq = ptr; + + return !!memcmp(&fq->key, key, sizeof(*key)); +} + +static const struct rhashtable_params ip4_rhash_params = { + .head_offset = offsetof(struct inet_frag_queue, node), + .key_offset = offsetof(struct inet_frag_queue, key), + .key_len = sizeof(struct frag_v4_compare_key), + .hashfn = ip4_key_hashfn, + .obj_hashfn = ip4_obj_hashfn, + .obj_cmpfn = ip4_obj_cmpfn, + .automatic_shrinking = true, +}; + void __init ipfrag_init(void) { - ip4_frags.hashfn = ip4_hashfn; ip4_frags.constructor = ip4_frag_init; ip4_frags.destructor = ip4_frag_free; ip4_frags.qsize = sizeof(struct ipq); - ip4_frags.match = ip4_frag_match; ip4_frags.frag_expire = ip_expire; ip4_frags.frags_cache_name = ip_frag_cache_name; + ip4_frags.rhash_params = ip4_rhash_params; if (inet_frags_init(&ip4_frags)) panic("IP: failed to allocate ip4_frags cache\n"); ip4_frags_ctl_register(); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index c4b40fdee838..0ad3df551d98 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -152,23 +152,6 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); } -static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr, - const struct in6_addr *daddr) -{ - net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd)); - return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), - (__force u32)id, nf_frags.rnd); -} - - -static unsigned int nf_hashfn(const struct inet_frag_queue *q) -{ - const struct frag_queue *nq; - - nq = container_of(q, struct frag_queue, q); - return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr); -} - static void nf_ct_frag6_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); @@ -182,26 +165,19 @@ static void nf_ct_frag6_expire(struct timer_list *t) } /* Creation primitives. */ -static inline struct frag_queue *fq_find(struct net *net, __be32 id, - u32 user, struct in6_addr *src, - struct in6_addr *dst, int iif, u8 ecn) +static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, + const struct ipv6hdr *hdr, int iif) { + struct frag_v6_compare_key key = { + .id = id, + .saddr = hdr->saddr, + .daddr = hdr->daddr, + .user = user, + .iif = iif, + }; struct inet_frag_queue *q; - struct ip6_create_arg arg; - unsigned int hash; - - arg.id = id; - arg.user = user; - arg.src = src; - arg.dst = dst; - arg.iif = iif; - arg.ecn = ecn; - - local_bh_disable(); - hash = nf_hash_frag(id, src, dst); - q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash); - local_bh_enable(); + q = inet_frag_find(&net->nf_frag.frags, &key); if (IS_ERR_OR_NULL(q)) { inet_frag_maybe_warn_overflow(q, pr_fmt()); return NULL; @@ -593,8 +569,8 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) fhdr = (struct frag_hdr *)skb_transport_header(skb); skb_orphan(skb); - fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, - skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); + fq = fq_find(net, fhdr->identification, user, hdr, + skb->dev ? skb->dev->ifindex : 0); if (fq == NULL) { pr_debug("Can't find and can't create new queue\n"); return -ENOMEM; @@ -660,13 +636,12 @@ int nf_ct_frag6_init(void) { int ret = 0; - nf_frags.hashfn = nf_hashfn; nf_frags.constructor = ip6_frag_init; nf_frags.destructor = NULL; nf_frags.qsize = sizeof(struct frag_queue); - nf_frags.match = ip6_frag_match; nf_frags.frag_expire = nf_ct_frag6_expire; nf_frags.frags_cache_name = nf_frags_cache_name; + nf_frags.rhash_params = ip6_rhash_params; ret = inet_frags_init(&nf_frags); if (ret) goto out; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index f0071b113a92..3fc853e4492a 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -79,52 +79,13 @@ static struct inet_frags ip6_frags; static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev); -/* - * callers should be careful not to use the hash value outside the ipfrag_lock - * as doing so could race with ipfrag_hash_rnd being recalculated. - */ -static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr, - const struct in6_addr *daddr) -{ - net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd)); - return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), - (__force u32)id, ip6_frags.rnd); -} - -static unsigned int ip6_hashfn(const struct inet_frag_queue *q) -{ - const struct frag_queue *fq; - - fq = container_of(q, struct frag_queue, q); - return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr); -} - -bool ip6_frag_match(const struct inet_frag_queue *q, const void *a) -{ - const struct frag_queue *fq; - const struct ip6_create_arg *arg = a; - - fq = container_of(q, struct frag_queue, q); - return fq->id == arg->id && - fq->user == arg->user && - ipv6_addr_equal(&fq->saddr, arg->src) && - ipv6_addr_equal(&fq->daddr, arg->dst) && - (arg->iif == fq->iif || - !(ipv6_addr_type(arg->dst) & (IPV6_ADDR_MULTICAST | - IPV6_ADDR_LINKLOCAL))); -} -EXPORT_SYMBOL(ip6_frag_match); - void ip6_frag_init(struct inet_frag_queue *q, const void *a) { struct frag_queue *fq = container_of(q, struct frag_queue, q); - const struct ip6_create_arg *arg = a; + const struct frag_v6_compare_key *key = a; - fq->id = arg->id; - fq->user = arg->user; - fq->saddr = *arg->src; - fq->daddr = *arg->dst; - fq->ecn = arg->ecn; + q->key.v6 = *key; + fq->ecn = 0; } EXPORT_SYMBOL(ip6_frag_init); @@ -182,23 +143,22 @@ static void ip6_frag_expire(struct timer_list *t) } static struct frag_queue * -fq_find(struct net *net, __be32 id, const struct in6_addr *src, - const struct in6_addr *dst, int iif, u8 ecn) +fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) { + struct frag_v6_compare_key key = { + .id = id, + .saddr = hdr->saddr, + .daddr = hdr->daddr, + .user = IP6_DEFRAG_LOCAL_DELIVER, + .iif = iif, + }; struct inet_frag_queue *q; - struct ip6_create_arg arg; - unsigned int hash; - - arg.id = id; - arg.user = IP6_DEFRAG_LOCAL_DELIVER; - arg.src = src; - arg.dst = dst; - arg.iif = iif; - arg.ecn = ecn; - hash = inet6_hash_frag(id, src, dst); + if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST | + IPV6_ADDR_LINKLOCAL))) + key.iif = 0; - q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash); + q = inet_frag_find(&net->ipv6.frags, &key); if (IS_ERR_OR_NULL(q)) { inet_frag_maybe_warn_overflow(q, pr_fmt()); return NULL; @@ -530,6 +490,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) struct frag_queue *fq; const struct ipv6hdr *hdr = ipv6_hdr(skb); struct net *net = dev_net(skb_dst(skb)->dev); + int iif; if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) goto fail_hdr; @@ -558,13 +519,14 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return 1; } - fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, - skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); + iif = skb->dev ? skb->dev->ifindex : 0; + fq = fq_find(net, fhdr->identification, hdr, iif); if (fq) { int ret; spin_lock(&fq->q.lock); + fq->iif = iif; ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff); spin_unlock(&fq->q.lock); @@ -738,17 +700,47 @@ static struct pernet_operations ip6_frags_ops = { .exit = ipv6_frags_exit_net, }; +static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed) +{ + return jhash2(data, + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); +} + +static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct inet_frag_queue *fq = data; + + return jhash2((const u32 *)&fq->key.v6, + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); +} + +static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) +{ + const struct frag_v6_compare_key *key = arg->key; + const struct inet_frag_queue *fq = ptr; + + return !!memcmp(&fq->key, key, sizeof(*key)); +} + +const struct rhashtable_params ip6_rhash_params = { + .head_offset = offsetof(struct inet_frag_queue, node), + .hashfn = ip6_key_hashfn, + .obj_hashfn = ip6_obj_hashfn, + .obj_cmpfn = ip6_obj_cmpfn, + .automatic_shrinking = true, +}; +EXPORT_SYMBOL(ip6_rhash_params); + int __init ipv6_frag_init(void) { int ret; - ip6_frags.hashfn = ip6_hashfn; ip6_frags.constructor = ip6_frag_init; ip6_frags.destructor = NULL; ip6_frags.qsize = sizeof(struct frag_queue); - ip6_frags.match = ip6_frag_match; ip6_frags.frag_expire = ip6_frag_expire; ip6_frags.frags_cache_name = ip6_frag_cache_name; + ip6_frags.rhash_params = ip6_rhash_params; ret = inet_frags_init(&ip6_frags); if (ret) goto out; -- cgit v1.2.3 From 3e67f106f619dcfaf6f4e2039599bdb69848c714 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:53 -0700 Subject: inet: frags: break the 2GB limit for frags storage Some users are willing to provision huge amounts of memory to be able to perform reassembly reasonnably well under pressure. Current memory tracking is using one atomic_t and integers. Switch to atomic_long_t so that 64bit arches can use more than 2GB, without any cost for 32bit arches. Note that this patch avoids an overflow error, if high_thresh was set to ~2GB, since this test in inet_frag_alloc() was never true : if (... || frag_mem_limit(nf) > nf->high_thresh) Tested: $ echo 16000000000 >/proc/sys/net/ipv4/ipfrag_high_thresh $ grep FRAG /proc/net/sockstat FRAG: inuse 14705885 memory 16000002880 $ nstat -n ; sleep 1 ; nstat | grep Reas IpReasmReqds 3317150 0.0 IpReasmFails 3317112 0.0 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 4 ++-- include/net/inet_frag.h | 20 ++++++++++---------- net/ieee802154/6lowpan/reassembly.c | 10 +++++----- net/ipv4/ip_fragment.c | 10 +++++----- net/ipv4/proc.c | 2 +- net/ipv6/netfilter/nf_conntrack_reasm.c | 10 +++++----- net/ipv6/proc.c | 2 +- net/ipv6/reassembly.c | 6 +++--- 8 files changed, 32 insertions(+), 32 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 6f2a3670e44b..5dc1a040a2f1 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -133,10 +133,10 @@ min_adv_mss - INTEGER IP Fragmentation: -ipfrag_high_thresh - INTEGER +ipfrag_high_thresh - LONG INTEGER Maximum memory used to reassemble IP fragments. -ipfrag_low_thresh - INTEGER +ipfrag_low_thresh - LONG INTEGER (Obsolete since linux-4.17) Maximum memory used to reassemble IP fragments before the kernel begins to remove incomplete fragment queues to free up resources. diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 95e353e3305b..a52e7273e7a5 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -8,11 +8,11 @@ struct netns_frags { struct rhashtable rhashtable ____cacheline_aligned_in_smp; /* Keep atomic mem on separate cachelines in structs that include it */ - atomic_t mem ____cacheline_aligned_in_smp; + atomic_long_t mem ____cacheline_aligned_in_smp; /* sysctls */ + long high_thresh; + long low_thresh; int timeout; - int high_thresh; - int low_thresh; int max_dist; struct inet_frags *f; }; @@ -102,7 +102,7 @@ void inet_frags_fini(struct inet_frags *); static inline int inet_frags_init_net(struct netns_frags *nf) { - atomic_set(&nf->mem, 0); + atomic_long_set(&nf->mem, 0); return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params); } void inet_frags_exit_net(struct netns_frags *nf); @@ -119,19 +119,19 @@ static inline void inet_frag_put(struct inet_frag_queue *q) /* Memory Tracking Functions. */ -static inline int frag_mem_limit(struct netns_frags *nf) +static inline long frag_mem_limit(const struct netns_frags *nf) { - return atomic_read(&nf->mem); + return atomic_long_read(&nf->mem); } -static inline void sub_frag_mem_limit(struct netns_frags *nf, int i) +static inline void sub_frag_mem_limit(struct netns_frags *nf, long val) { - atomic_sub(i, &nf->mem); + atomic_long_sub(val, &nf->mem); } -static inline void add_frag_mem_limit(struct netns_frags *nf, int i) +static inline void add_frag_mem_limit(struct netns_frags *nf, long val) { - atomic_add(i, &nf->mem); + atomic_long_add(val, &nf->mem); } /* RFC 3168 support : diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 1aec71a3f904..44f148a6bb57 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -411,23 +411,23 @@ err: } #ifdef CONFIG_SYSCTL -static int zero; +static long zero; static struct ctl_table lowpan_frags_ns_ctl_table[] = { { .procname = "6lowpanfrag_high_thresh", .data = &init_net.ieee802154_lowpan.frags.high_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &init_net.ieee802154_lowpan.frags.low_thresh }, { .procname = "6lowpanfrag_low_thresh", .data = &init_net.ieee802154_lowpan.frags.low_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &zero, .extra2 = &init_net.ieee802154_lowpan.frags.high_thresh }, diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index b0366224f314..053869f2c49b 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -678,23 +678,23 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) EXPORT_SYMBOL(ip_check_defrag); #ifdef CONFIG_SYSCTL -static int zero; +static long zero; static struct ctl_table ip4_frags_ns_ctl_table[] = { { .procname = "ipfrag_high_thresh", .data = &init_net.ipv4.frags.high_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &init_net.ipv4.frags.low_thresh }, { .procname = "ipfrag_low_thresh", .data = &init_net.ipv4.frags.low_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &zero, .extra2 = &init_net.ipv4.frags.high_thresh }, diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index aacfce0d7d82..a058de677e94 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -71,7 +71,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &udplite_prot)); seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(net, &raw_prot)); - seq_printf(seq, "FRAG: inuse %u memory %u\n", + seq_printf(seq, "FRAG: inuse %u memory %lu\n", atomic_read(&net->ipv4.frags.rhashtable.nelems), frag_mem_limit(&net->ipv4.frags)); return 0; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index d866412b8f6c..603a39592859 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -63,7 +63,7 @@ struct nf_ct_frag6_skb_cb static struct inet_frags nf_frags; #ifdef CONFIG_SYSCTL -static int zero; +static long zero; static struct ctl_table nf_ct_frag6_sysctl_table[] = { { @@ -76,18 +76,18 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_low_thresh", .data = &init_net.nf_frag.frags.low_thresh, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &zero, .extra2 = &init_net.nf_frag.frags.high_thresh }, { .procname = "nf_conntrack_frag6_high_thresh", .data = &init_net.nf_frag.frags.high_thresh, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &init_net.nf_frag.frags.low_thresh }, { } diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 8befeb91e071..a85f7e0b14b1 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -47,7 +47,7 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &udplitev6_prot)); seq_printf(seq, "RAW6: inuse %d\n", sock_prot_inuse_get(net, &rawv6_prot)); - seq_printf(seq, "FRAG6: inuse %u memory %u\n", + seq_printf(seq, "FRAG6: inuse %u memory %lu\n", atomic_read(&net->ipv6.frags.rhashtable.nelems), frag_mem_limit(&net->ipv6.frags)); return 0; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 2a77fda5e3bc..905a8aee2671 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -552,15 +552,15 @@ static struct ctl_table ip6_frags_ns_ctl_table[] = { { .procname = "ip6frag_high_thresh", .data = &init_net.ipv6.frags.high_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &init_net.ipv6.frags.low_thresh }, { .procname = "ip6frag_low_thresh", .data = &init_net.ipv6.frags.low_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, -- cgit v1.2.3