From 57b354e66b67c4c72468a26d4313d1217ef32e17 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 16 May 2013 23:36:32 +0000 Subject: dev: remove duplicate 'skb->dev = dev' in dev_forward_skb() This was added by commit 59b9997baba5 (Revert "net: maintain namespace isolation between vlan and real device"). In fact, before the initial commit - the one that is reverted -, this statement was not present. 'skb->dev = dev' is already done in eth_type_trans(), which is called just after. Spotted-by: Alain Ritoux Signed-off-by: Nicolas Dichtel Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index fc1e289397f5..18e9730cc4be 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1629,7 +1629,6 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) return NET_RX_DROP; } skb->skb_iif = 0; - skb->dev = dev; skb_dst_drop(skb); skb->tstamp.tv64 = 0; skb->pkt_type = PACKET_HOST; -- cgit v1.2.3 From 99bbc70741903c063b3ccad90a3e06fc55df9245 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 20 May 2013 04:02:32 +0000 Subject: rps: selective flow shedding during softnet overflow A cpu executing the network receive path sheds packets when its input queue grows to netdev_max_backlog. A single high rate flow (such as a spoofed source DoS) can exceed a single cpu processing rate and will degrade throughput of other flows hashed onto the same cpu. This patch adds a more fine grained hashtable. If the netdev backlog is above a threshold, IRQ cpus track the ratio of total traffic of each flow (using 4096 buckets, configurable). The ratio is measured by counting the number of packets per flow over the last 256 packets from the source cpu. Any flow that occupies a large fraction of this (set at 50%) will see packet drop while above the threshold. 
Tested: Setup is a multi-threaded UDP echo server with network rx IRQ on cpu0, kernel receive (RPS) on cpu0 and application threads on cpus 2--7 each handling 20k req/s. Throughput halves when hit with a 400 kpps antagonist storm. With this patch applied, antagonist overload is dropped and the server processes its complete load. The patch is effective when kernel receive processing is the bottleneck. The above RPS scenario is an extreme, but the same is reached with RFS and sufficient kernel processing (iptables, packet socket tap, ..). Signed-off-by: Willem de Bruijn Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 17 ++++++++ net/Kconfig | 12 ++++++ net/core/dev.c | 48 ++++++++++++++++++++- net/core/net-procfs.c | 16 ++++++- net/core/sysctl_net_core.c | 104 +++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 194 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a94a5a0ab122..7dd535d4b41e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1778,6 +1778,19 @@ static inline int unregister_gifconf(unsigned int family) return register_gifconf(family, NULL); } +#ifdef CONFIG_NET_FLOW_LIMIT +#define FLOW_LIMIT_HISTORY (1 << 8) /* must be ^2 */ +struct sd_flow_limit { + u64 count; + unsigned int num_buckets; + unsigned int history_head; + u16 history[FLOW_LIMIT_HISTORY]; + u8 buckets[]; +}; + +extern int netdev_flow_limit_table_len; +#endif /* CONFIG_NET_FLOW_LIMIT */ + /* * Incoming packets are placed on per-cpu queues */ @@ -1807,6 +1820,10 @@ struct softnet_data { unsigned int dropped; struct sk_buff_head input_pkt_queue; struct napi_struct backlog; + +#ifdef CONFIG_NET_FLOW_LIMIT + struct sd_flow_limit *flow_limit; +#endif }; static inline void input_queue_head_incr(struct softnet_data *sd) diff --git a/net/Kconfig b/net/Kconfig index 2ddc9046868e..08de901415ee 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -259,6 +259,18 @@ 
config BPF_JIT packet sniffing (libpcap/tcpdump). Note : Admin should enable this feature changing /proc/sys/net/core/bpf_jit_enable +config NET_FLOW_LIMIT + boolean + depends on RPS + default y + ---help--- + The network stack has to drop packets when a receive processing CPU's + backlog reaches netdev_max_backlog. If a few out of many active flows + generate the vast majority of load, drop their traffic earlier to + maintain capacity for the other flows. This feature provides servers + with many clients some protection against DoS by a single (spoofed) + flow that greatly exceeds average workload. + menu "Network testing" config NET_PKTGEN diff --git a/net/core/dev.c b/net/core/dev.c index 18e9730cc4be..7229bc30e509 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3064,6 +3064,46 @@ static int rps_ipi_queued(struct softnet_data *sd) return 0; } +#ifdef CONFIG_NET_FLOW_LIMIT +int netdev_flow_limit_table_len __read_mostly = (1 << 12); +#endif + +static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) +{ +#ifdef CONFIG_NET_FLOW_LIMIT + struct sd_flow_limit *fl; + struct softnet_data *sd; + unsigned int old_flow, new_flow; + + if (qlen < (netdev_max_backlog >> 1)) + return false; + + sd = &__get_cpu_var(softnet_data); + + rcu_read_lock(); + fl = rcu_dereference(sd->flow_limit); + if (fl) { + new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1); + old_flow = fl->history[fl->history_head]; + fl->history[fl->history_head] = new_flow; + + fl->history_head++; + fl->history_head &= FLOW_LIMIT_HISTORY - 1; + + if (likely(fl->buckets[old_flow])) + fl->buckets[old_flow]--; + + if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { + fl->count++; + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); +#endif + return false; +} + /* * enqueue_to_backlog is called to queue an skb to a per CPU backlog * queue (may be a remote CPU queue). 
@@ -3073,13 +3113,15 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, { struct softnet_data *sd; unsigned long flags; + unsigned int qlen; sd = &per_cpu(softnet_data, cpu); local_irq_save(flags); rps_lock(sd); - if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) { + qlen = skb_queue_len(&sd->input_pkt_queue); + if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { if (skb_queue_len(&sd->input_pkt_queue)) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); @@ -6269,6 +6311,10 @@ static int __init net_dev_init(void) sd->backlog.weight = weight_p; sd->backlog.gro_list = NULL; sd->backlog.gro_count = 0; + +#ifdef CONFIG_NET_FLOW_LIMIT + sd->flow_limit = NULL; +#endif } dev_boot_phase = 0; diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 569d355fec3e..2bf83299600a 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -146,11 +146,23 @@ static void softnet_seq_stop(struct seq_file *seq, void *v) static int softnet_seq_show(struct seq_file *seq, void *v) { struct softnet_data *sd = v; + unsigned int flow_limit_count = 0; - seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", +#ifdef CONFIG_NET_FLOW_LIMIT + struct sd_flow_limit *fl; + + rcu_read_lock(); + fl = rcu_dereference(sd->flow_limit); + if (fl) + flow_limit_count = fl->count; + rcu_read_unlock(); +#endif + + seq_printf(seq, + "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", sd->processed, sd->dropped, sd->time_squeeze, 0, 0, 0, 0, 0, /* was fastroute */ - sd->cpu_collision, sd->received_rps); + sd->cpu_collision, sd->received_rps, flow_limit_count); return 0; } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index cfdb46ab3a7f..741db5fc7806 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -87,6 +87,96 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write, } #endif /* CONFIG_RPS */ +#ifdef CONFIG_NET_FLOW_LIMIT +static DEFINE_MUTEX(flow_limit_update_mutex); + 
+static int flow_limit_cpu_sysctl(ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct sd_flow_limit *cur; + struct softnet_data *sd; + cpumask_var_t mask; + int i, len, ret = 0; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + if (write) { + ret = cpumask_parse_user(buffer, *lenp, mask); + if (ret) + goto done; + + mutex_lock(&flow_limit_update_mutex); + len = sizeof(*cur) + netdev_flow_limit_table_len; + for_each_possible_cpu(i) { + sd = &per_cpu(softnet_data, i); + cur = rcu_dereference_protected(sd->flow_limit, + lockdep_is_held(&flow_limit_update_mutex)); + if (cur && !cpumask_test_cpu(i, mask)) { + RCU_INIT_POINTER(sd->flow_limit, NULL); + synchronize_rcu(); + kfree(cur); + } else if (!cur && cpumask_test_cpu(i, mask)) { + cur = kzalloc(len, GFP_KERNEL); + if (!cur) { + /* not unwinding previous changes */ + ret = -ENOMEM; + goto write_unlock; + } + cur->num_buckets = netdev_flow_limit_table_len; + rcu_assign_pointer(sd->flow_limit, cur); + } + } +write_unlock: + mutex_unlock(&flow_limit_update_mutex); + } else { + if (*ppos || !*lenp) { + *lenp = 0; + goto done; + } + + cpumask_clear(mask); + rcu_read_lock(); + for_each_possible_cpu(i) { + sd = &per_cpu(softnet_data, i); + if (rcu_dereference(sd->flow_limit)) + cpumask_set_cpu(i, mask); + } + rcu_read_unlock(); + + len = cpumask_scnprintf(buffer, *lenp, mask); + *lenp = len + 1; + *ppos += len + 1; + } + +done: + free_cpumask_var(mask); + return ret; +} + +static int flow_limit_table_len_sysctl(ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + unsigned int old, *ptr; + int ret; + + mutex_lock(&flow_limit_update_mutex); + + ptr = table->data; + old = *ptr; + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (!ret && write && !is_power_of_2(*ptr)) { + *ptr = old; + ret = -EINVAL; + } + + mutex_unlock(&flow_limit_update_mutex); + return ret; +} +#endif /* CONFIG_NET_FLOW_LIMIT */ + static struct ctl_table 
net_core_table[] = { #ifdef CONFIG_NET { @@ -180,6 +270,20 @@ static struct ctl_table net_core_table[] = { .proc_handler = rps_sock_flow_sysctl }, #endif +#ifdef CONFIG_NET_FLOW_LIMIT + { + .procname = "flow_limit_cpu_bitmap", + .mode = 0644, + .proc_handler = flow_limit_cpu_sysctl + }, + { + .procname = "flow_limit_table_len", + .data = &netdev_flow_limit_table_len, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = flow_limit_table_len_sysctl + }, +#endif /* CONFIG_NET_FLOW_LIMIT */ #endif /* CONFIG_NET */ { .procname = "netdev_budget", -- cgit v1.2.3 From 1cdbcb7957cf9e5f841dbcde9b38fd18a804208b Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sun, 19 May 2013 15:46:49 +0000 Subject: net: Loosen constraints for recalculating checksum in skb_segment() This is a generic solution to resolve a specific problem that I have observed. If the encapsulation of an skb changes then the ability to offload checksums may also change. In particular it may be necessary to perform checksumming in software. An example of such a case is where a non-GRE packet is received but is to be encapsulated and transmitted as GRE. Another example relates to my proposed support for packets that are non-MPLS when received but MPLS when transmitted. The cost of this change is that the value of the csum variable may be checked when it previously was not. In the case where the csum variable is true this is pure overhead. In the case where the csum variable is false it leads to software checksumming, which I believe also leads to correct checksums in transmitted packets for the cases described above. Further analysis: This patch relies on the return value of can_checksum_protocol() being correct and in turn the return value of skb_network_protocol(), used to provide the protocol parameter of can_checksum_protocol(), being correct. It also relies on the features passed to skb_segment() and in turn to can_checksum_protocol() being correct. 
I believe that this problem has not been observed for VLANs because it appears that almost all drivers, the exception being xgbe, set vlan_features such that the checksum offload support for VLAN packets is greater than or equal to that of non-VLAN packets. I wonder if the code in xgbe may be an oversight and the hardware does support checksumming of VLAN packets. If so it may be worth updating the vlan_features of the driver as this patch will force such checksums to be performed in software rather than hardware. Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- net/core/skbuff.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index af9185d0be6a..d6298914f4e7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2853,7 +2853,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) doffset + tnl_hlen); if (fskb != skb_shinfo(skb)->frag_list) - continue; + goto perform_csum_check; if (!sg) { nskb->ip_summed = CHECKSUM_NONE; @@ -2917,6 +2917,7 @@ skip_fraglist: nskb->len += nskb->data_len; nskb->truesize += nskb->data_len; +perform_csum_check: if (!csum) { nskb->csum = skb_checksum(nskb, doffset, nskb->len - doffset, 0); -- cgit v1.2.3 From 42e52bf9e3ae80fd44b21ddfcd64c54e6db2ff76 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 25 May 2013 04:12:10 +0000 Subject: net: add netnotifier event for upper device change Now when upper device is changed, event is not propagated via RT Netlink to userspace. Userspace might never know about the change. Fix this by adding upper-device-change notifier event. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0ebd63ae2cc8..ea7b6bce9ea0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1593,6 +1593,7 @@ struct packet_offload { #define NETDEV_RELEASE 0x0012 #define NETDEV_NOTIFY_PEERS 0x0013 #define NETDEV_JOIN 0x0014 +#define NETDEV_CHANGEUPPER 0x0015 extern int register_netdevice_notifier(struct notifier_block *nb); extern int unregister_netdevice_notifier(struct notifier_block *nb); diff --git a/net/core/dev.c b/net/core/dev.c index 7229bc30e509..50c02ded1d69 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4411,7 +4411,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, else list_add_tail_rcu(&upper->list, &dev->upper_dev_list); dev_hold(upper_dev); - + call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); return 0; } @@ -4471,6 +4471,7 @@ void netdev_upper_dev_unlink(struct net_device *dev, list_del_rcu(&upper->list); dev_put(upper_dev); kfree_rcu(upper, rcu); + call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); } EXPORT_SYMBOL(netdev_upper_dev_unlink); -- cgit v1.2.3 From 0d89d2035fe063461a5ddb609b2c12e7fb006e44 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 23 May 2013 21:02:52 +0000 Subject: MPLS: Add limited GSO support In the case where a non-MPLS packet is received and an MPLS stack is added it may well be the case that the original skb is GSO but the NIC used for transmit does not support GSO of MPLS packets. The aim of this code is to provide GSO in software for MPLS packets whose skbs are GSO. SKB Usage: When an implementation adds an MPLS stack to a non-MPLS packet it should do the following to skb metadata: * Set skb->inner_protocol to the old non-MPLS ethertype of the packet. skb->inner_protocol is added by this patch. * Set skb->protocol to the new MPLS ethertype of the packet. 
* Set skb->network_header to correspond to the end of the L3 header, including the MPLS label stack. I have posted a patch, "[PATCH v3.29] datapath: Add basic MPLS support to kernel" which adds MPLS support to the kernel datapath of Open vSwitch. That patch sets the above requirements in datapath/actions.c:push_mpls() and was used to exercise this code. The datapath patch is against the Open vSwitch tree but it is intended that it be added to the Open vSwitch code present in the mainline Linux kernel at some point. Features: I believe that the approach that I have taken is at least partially consistent with the handling of other protocols. Jesse, I understand that you have some ideas here. I am more than happy to change my implementation. This patch adds dev->mpls_features which may be used by devices to advertise features supported for MPLS packets. A new NETIF_F_GSO_MPLS feature is added for devices which support hardware MPLS GSO offload. Currently no devices support this and MPLS GSO always falls back to software. Alternate Implementation: One possible alternate implementation is to teach netif_skb_features() and skb_network_protocol() about MPLS, in a similar way to their understanding of VLANs. I believe this would avoid the need for net/mpls/mpls_gso.c and in particular the calls to __skb_push() and __skb_push() in mpls_gso_segment(). I have decided on the implementation in this patch as it should not introduce any overhead in the case where mpls_gso is not compiled into the kernel or inserted as a module. MPLS GSO suggested by Jesse Gross. Based in part on "v4 GRE: Add TCP segmentation offload for GRE" by Pravin B Shelar. Cc: Jesse Gross Cc: Pravin B Shelar Signed-off-by: Simon Horman Signed-off-by: David S. 
Miller --- include/linux/netdev_features.h | 4 +- include/linux/netdevice.h | 2 + include/linux/skbuff.h | 4 ++ net/Kconfig | 1 + net/Makefile | 1 + net/core/dev.c | 4 ++ net/core/ethtool.c | 1 + net/ipv4/af_inet.c | 1 + net/ipv4/tcp.c | 1 + net/ipv4/udp.c | 2 +- net/ipv6/ip6_offload.c | 1 + net/ipv6/udp_offload.c | 3 +- net/mpls/Kconfig | 9 ++++ net/mpls/Makefile | 4 ++ net/mpls/mpls_gso.c | 108 ++++++++++++++++++++++++++++++++++++++++ 15 files changed, 143 insertions(+), 3 deletions(-) create mode 100644 net/mpls/Kconfig create mode 100644 net/mpls/Makefile create mode 100644 net/mpls/mpls_gso.c (limited to 'net/core') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 09906b7ca47d..a2a89a5c7be5 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -43,8 +43,9 @@ enum { NETIF_F_FSO_BIT, /* ... FCoE segmentation */ NETIF_F_GSO_GRE_BIT, /* ... GRE with TSO */ NETIF_F_GSO_UDP_TUNNEL_BIT, /* ... UDP TUNNEL with TSO */ + NETIF_F_GSO_MPLS_BIT, /* ... MPLS segmentation */ /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ - NETIF_F_GSO_UDP_TUNNEL_BIT, + NETIF_F_GSO_MPLS_BIT, NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */ NETIF_F_SCTP_CSUM_BIT, /* SCTP checksum offload */ @@ -107,6 +108,7 @@ enum { #define NETIF_F_RXALL __NETIF_F(RXALL) #define NETIF_F_GSO_GRE __NETIF_F(GSO_GRE) #define NETIF_F_GSO_UDP_TUNNEL __NETIF_F(GSO_UDP_TUNNEL) +#define NETIF_F_GSO_MPLS __NETIF_F(GSO_MPLS) #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER) #define NETIF_F_HW_VLAN_STAG_RX __NETIF_F(HW_VLAN_STAG_RX) #define NETIF_F_HW_VLAN_STAG_TX __NETIF_F(HW_VLAN_STAG_TX) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ea7b6bce9ea0..6b2bb460d1d7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1088,6 +1088,8 @@ struct net_device { * need to set them appropriately. 
*/ netdev_features_t hw_enc_features; + /* mask of fetures inheritable by MPLS */ + netdev_features_t mpls_features; /* Interface index. Unique device identifier */ int ifindex; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5663e3592784..8f2b830772a8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -319,6 +319,8 @@ enum { SKB_GSO_GRE = 1 << 6, SKB_GSO_UDP_TUNNEL = 1 << 7, + + SKB_GSO_MPLS = 1 << 8, }; #if BITS_PER_LONG > 32 @@ -389,6 +391,7 @@ typedef unsigned char *sk_buff_data_t; * @dropcount: total number of sk_receive_queue overflows * @vlan_proto: vlan encapsulation protocol * @vlan_tci: vlan tag control information + * @inner_protocol: Protocol (encapsulation) * @inner_transport_header: Inner transport layer header (encapsulation) * @inner_network_header: Network layer header (encapsulation) * @inner_mac_header: Link layer header (encapsulation) @@ -509,6 +512,7 @@ struct sk_buff { __u32 reserved_tailroom; }; + __be16 inner_protocol; __u16 inner_transport_header; __u16 inner_network_header; __u16 inner_mac_header; diff --git a/net/Kconfig b/net/Kconfig index 08de901415ee..523e43e6da1b 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -218,6 +218,7 @@ source "net/batman-adv/Kconfig" source "net/openvswitch/Kconfig" source "net/vmw_vsock/Kconfig" source "net/netlink/Kconfig" +source "net/mpls/Kconfig" config RPS boolean diff --git a/net/Makefile b/net/Makefile index 091e7b04f301..9492e8cb64e9 100644 --- a/net/Makefile +++ b/net/Makefile @@ -70,3 +70,4 @@ obj-$(CONFIG_BATMAN_ADV) += batman-adv/ obj-$(CONFIG_NFC) += nfc/ obj-$(CONFIG_OPENVSWITCH) += openvswitch/ obj-$(CONFIG_VSOCKETS) += vmw_vsock/ +obj-$(CONFIG_NET_MPLS_GSO) += mpls/ diff --git a/net/core/dev.c b/net/core/dev.c index 50c02ded1d69..2f09cb29cc95 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5277,6 +5277,10 @@ int register_netdevice(struct net_device *dev) */ dev->hw_enc_features |= NETIF_F_SG; + /* Make NETIF_F_SG inheritable to MPLS. 
+ */ + dev->mpls_features |= NETIF_F_SG; + ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 22efdaa76ebf..4e6f63ade741 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -82,6 +82,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation", [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", + [NETIF_F_GSO_MPLS_BIT] = "tx-mpls-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp", diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d01be2a3ae53..b05ae96aec44 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1295,6 +1295,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, SKB_GSO_GRE | SKB_GSO_TCPV6 | SKB_GSO_UDP_TUNNEL | + SKB_GSO_MPLS | 0))) goto out; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d87ce72ca8aa..ba4186e1dca9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2917,6 +2917,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, SKB_GSO_TCP_ECN | SKB_GSO_TCPV6 | SKB_GSO_GRE | + SKB_GSO_MPLS | SKB_GSO_UDP_TUNNEL | 0) || !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0bf5d399a03c..aa5eff46d137 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2381,7 +2381,7 @@ struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_UDP_TUNNEL | - SKB_GSO_GRE) || + SKB_GSO_GRE | SKB_GSO_MPLS) || !(type & (SKB_GSO_UDP)))) goto out; diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 71b766ee821d..a263b990ee11 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -98,6 +98,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, SKB_GSO_TCP_ECN | SKB_GSO_GRE | SKB_GSO_UDP_TUNNEL | + SKB_GSO_MPLS | 
SKB_GSO_TCPV6 | 0))) goto out; diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 3bb3a891a424..76d401a93c7a 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -63,7 +63,8 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_UDP_TUNNEL | - SKB_GSO_GRE) || + SKB_GSO_GRE | + SKB_GSO_MPLS) || !(type & (SKB_GSO_UDP)))) goto out; diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig new file mode 100644 index 000000000000..37421db88965 --- /dev/null +++ b/net/mpls/Kconfig @@ -0,0 +1,9 @@ +# +# MPLS configuration +# +config NET_MPLS_GSO + tristate "MPLS: GSO support" + help + This is helper module to allow segmentation of non-MPLS GSO packets + that have had MPLS stack entries pushed onto them and thus + become MPLS GSO packets. diff --git a/net/mpls/Makefile b/net/mpls/Makefile new file mode 100644 index 000000000000..0a3c171be537 --- /dev/null +++ b/net/mpls/Makefile @@ -0,0 +1,4 @@ +# +# Makefile for MPLS. +# +obj-y += mpls_gso.o diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c new file mode 100644 index 000000000000..1bec1219ab81 --- /dev/null +++ b/net/mpls/mpls_gso.c @@ -0,0 +1,108 @@ +/* + * MPLS GSO Support + * + * Authors: Simon Horman (horms@verge.net.au) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Based on: GSO portions of net/ipv4/gre.c + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include + +static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + netdev_features_t mpls_features; + __be16 mpls_protocol; + + if (unlikely(skb_shinfo(skb)->gso_type & + ~(SKB_GSO_TCPV4 | + SKB_GSO_TCPV6 | + SKB_GSO_UDP | + SKB_GSO_DODGY | + SKB_GSO_TCP_ECN | + SKB_GSO_GRE | + SKB_GSO_MPLS))) + goto out; + + /* Setup inner SKB. */ + mpls_protocol = skb->protocol; + skb->protocol = skb->inner_protocol; + + /* Push back the mac header that skb_mac_gso_segment() has pulled. + * It will be re-pulled by the call to skb_mac_gso_segment() below + */ + __skb_push(skb, skb->mac_len); + + /* Segment inner packet. */ + mpls_features = skb->dev->mpls_features & netif_skb_features(skb); + segs = skb_mac_gso_segment(skb, mpls_features); + + + /* Restore outer protocol. */ + skb->protocol = mpls_protocol; + + /* Re-pull the mac header that the call to skb_mac_gso_segment() + * above pulled. It will be re-pushed after returning + * skb_mac_gso_segment(), an indirect caller of this function. 
+ */ + __skb_push(skb, skb->data - skb_mac_header(skb)); + +out: + return segs; +} + +static int mpls_gso_send_check(struct sk_buff *skb) +{ + return 0; +} + +static struct packet_offload mpls_mc_offload = { + .type = cpu_to_be16(ETH_P_MPLS_MC), + .callbacks = { + .gso_send_check = mpls_gso_send_check, + .gso_segment = mpls_gso_segment, + }, +}; + +static struct packet_offload mpls_uc_offload = { + .type = cpu_to_be16(ETH_P_MPLS_UC), + .callbacks = { + .gso_send_check = mpls_gso_send_check, + .gso_segment = mpls_gso_segment, + }, +}; + +static int __init mpls_gso_init(void) +{ + pr_info("MPLS GSO support\n"); + + dev_add_offload(&mpls_uc_offload); + dev_add_offload(&mpls_mc_offload); + + return 0; +} + +static void __exit mpls_gso_exit(void) +{ + dev_remove_offload(&mpls_uc_offload); + dev_remove_offload(&mpls_mc_offload); +} + +module_init(mpls_gso_init); +module_exit(mpls_gso_exit); + +MODULE_DESCRIPTION("MPLS GSO support"); +MODULE_AUTHOR("Simon Horman (horms@verge.net.au)"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From da6e378ba918cd0feeb90eeb84d8b42148bb0c82 Mon Sep 17 00:00:00 2001 From: dingtianhong Date: Mon, 27 May 2013 19:53:31 +0000 Subject: netpoll: remove return value from netpoll_rx_disable() The netpoll_rx_disable() will always return 0, it is no use and looks wordy, so remove the unnecessary code and get rid of it in _dev_open and _dev_close. Signed-off-by: Ding Tianhong Signed-off-by: David S. 
Miller --- include/linux/netpoll.h | 4 ++-- net/core/dev.c | 15 ++++----------- net/core/netpoll.c | 3 +-- 3 files changed, 7 insertions(+), 15 deletions(-) (limited to 'net/core') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index fa2cb76a7029..f3c7c24bec1c 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -53,10 +53,10 @@ struct netpoll_info { }; #ifdef CONFIG_NETPOLL -extern int netpoll_rx_disable(struct net_device *dev); +extern void netpoll_rx_disable(struct net_device *dev); extern void netpoll_rx_enable(struct net_device *dev); #else -static inline int netpoll_rx_disable(struct net_device *dev) { return 0; } +static inline void netpoll_rx_disable(struct net_device *dev) { return; } static inline void netpoll_rx_enable(struct net_device *dev) { return; } #endif diff --git a/net/core/dev.c b/net/core/dev.c index 2f09cb29cc95..5f747974ac58 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1198,9 +1198,7 @@ static int __dev_open(struct net_device *dev) * If we don't do this there is a chance ndo_poll_controller * or ndo_poll may be running while we open the device */ - ret = netpoll_rx_disable(dev); - if (ret) - return ret; + netpoll_rx_disable(dev); ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); ret = notifier_to_errno(ret); @@ -1309,9 +1307,7 @@ static int __dev_close(struct net_device *dev) LIST_HEAD(single); /* Temporarily disable netpoll until the interface is down */ - retval = netpoll_rx_disable(dev); - if (retval) - return retval; + netpoll_rx_disable(dev); list_add(&dev->unreg_list, &single); retval = __dev_close_many(&single); @@ -1353,14 +1349,11 @@ static int dev_close_many(struct list_head *head) */ int dev_close(struct net_device *dev) { - int ret = 0; if (dev->flags & IFF_UP) { LIST_HEAD(single); /* Block netpoll rx while the interface is going down */ - ret = netpoll_rx_disable(dev); - if (ret) - return ret; + netpoll_rx_disable(dev); list_add(&dev->unreg_list, &single); dev_close_many(&single); @@ 
-1368,7 +1361,7 @@ int dev_close(struct net_device *dev) netpoll_rx_enable(dev); } - return ret; + return 0; } EXPORT_SYMBOL(dev_close); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index cec074be8c43..37deedd48bcc 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -247,7 +247,7 @@ static void netpoll_poll_dev(struct net_device *dev) zap_completion_queue(); } -int netpoll_rx_disable(struct net_device *dev) +void netpoll_rx_disable(struct net_device *dev) { struct netpoll_info *ni; int idx; @@ -257,7 +257,6 @@ int netpoll_rx_disable(struct net_device *dev) if (ni) down(&ni->dev_lock); srcu_read_unlock(&netpoll_srcu, idx); - return 0; } EXPORT_SYMBOL(netpoll_rx_disable); -- cgit v1.2.3 From 351638e7deeed2ec8ce451b53d33921b3da68f83 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 28 May 2013 01:30:21 +0000 Subject: net: pass info struct via netdevice notifier So far, only net_device * could be passed along with netdevice notifier event. This patch provides a possibility to pass custom structure able to provide info that event listener needs to know. Signed-off-by: Jiri Pirko v2->v3: fix typo on simeth shortened dev_getter shortened notifier_info struct name v1->v2: fix notifier_call parameter in call_netdevice_notifier() Signed-off-by: David S. 
Miller --- arch/ia64/hp/sim/simeth.c | 2 +- arch/mips/txx9/generic/setup_tx4939.c | 3 +- drivers/infiniband/core/cma.c | 4 +- drivers/infiniband/hw/mlx4/main.c | 2 +- drivers/net/bonding/bond_main.c | 2 +- drivers/net/can/led.c | 4 +- drivers/net/ethernet/broadcom/cnic.c | 2 +- drivers/net/ethernet/marvell/skge.c | 2 +- drivers/net/ethernet/marvell/sky2.c | 2 +- .../net/ethernet/qlogic/netxen/netxen_nic_main.c | 2 +- drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c | 2 +- drivers/net/ethernet/sfc/efx.c | 2 +- drivers/net/hamradio/bpqether.c | 7 +-- drivers/net/macvlan.c | 2 +- drivers/net/macvtap.c | 2 +- drivers/net/netconsole.c | 5 +- drivers/net/ppp/pppoe.c | 2 +- drivers/net/team/team.c | 2 +- drivers/net/wan/dlci.c | 2 +- drivers/net/wan/hdlc.c | 2 +- drivers/net/wan/lapbether.c | 2 +- drivers/scsi/fcoe/fcoe.c | 2 +- drivers/scsi/fcoe/fcoe_transport.c | 2 +- drivers/staging/csr/netdev.c | 2 +- drivers/staging/ft1000/ft1000-pcmcia/ft1000_proc.c | 2 +- drivers/staging/ft1000/ft1000-usb/ft1000_proc.c | 2 +- drivers/staging/silicom/bpctl_mod.c | 2 +- include/linux/netdevice.h | 13 +++++ net/8021q/vlan.c | 2 +- net/appletalk/aarp.c | 2 +- net/appletalk/ddp.c | 2 +- net/atm/clip.c | 4 +- net/atm/mpc.c | 6 +-- net/ax25/af_ax25.c | 6 +-- net/batman-adv/hard-interface.c | 2 +- net/bridge/br_notify.c | 2 +- net/caif/caif_dev.c | 4 +- net/caif/caif_usb.c | 4 +- net/can/af_can.c | 4 +- net/can/bcm.c | 4 +- net/can/gw.c | 4 +- net/can/raw.c | 4 +- net/core/dev.c | 56 ++++++++++++++++++---- net/core/drop_monitor.c | 4 +- net/core/dst.c | 2 +- net/core/fib_rules.c | 4 +- net/core/netprio_cgroup.c | 2 +- net/core/pktgen.c | 2 +- net/core/rtnetlink.c | 2 +- net/decnet/af_decnet.c | 4 +- net/ieee802154/6lowpan.c | 5 +- net/ipv4/arp.c | 2 +- net/ipv4/devinet.c | 2 +- net/ipv4/fib_frontend.c | 2 +- net/ipv4/ipmr.c | 2 +- net/ipv4/netfilter/ipt_MASQUERADE.c | 2 +- net/ipv6/addrconf.c | 4 +- net/ipv6/ip6mr.c | 2 +- net/ipv6/ndisc.c | 2 +- net/ipv6/netfilter/ip6t_MASQUERADE.c | 2 +- 
net/ipv6/route.c | 4 +- net/ipx/af_ipx.c | 2 +- net/iucv/af_iucv.c | 2 +- net/mac80211/iface.c | 5 +- net/netfilter/ipvs/ip_vs_ctl.c | 4 +- net/netfilter/nfnetlink_queue_core.c | 2 +- net/netfilter/xt_TEE.c | 2 +- net/netlabel/netlabel_unlabeled.c | 7 ++- net/netrom/af_netrom.c | 2 +- net/openvswitch/dp_notify.c | 2 +- net/packet/af_packet.c | 5 +- net/phonet/pn_dev.c | 4 +- net/rose/af_rose.c | 6 +-- net/sched/act_mirred.c | 2 +- net/tipc/eth_media.c | 4 +- net/tipc/ib_media.c | 4 +- net/wireless/core.c | 5 +- net/x25/af_x25.c | 2 +- net/xfrm/xfrm_policy.c | 2 +- security/selinux/netif.c | 2 +- 80 files changed, 172 insertions(+), 127 deletions(-) (limited to 'net/core') diff --git a/arch/ia64/hp/sim/simeth.c b/arch/ia64/hp/sim/simeth.c index c13064e422df..d1b04c4c95e3 100644 --- a/arch/ia64/hp/sim/simeth.c +++ b/arch/ia64/hp/sim/simeth.c @@ -268,7 +268,7 @@ static __inline__ int dev_is_ethdev(struct net_device *dev) static int simeth_device_event(struct notifier_block *this,unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct simeth_local *local; struct in_device *in_dev; struct in_ifaddr **ifap = NULL; diff --git a/arch/mips/txx9/generic/setup_tx4939.c b/arch/mips/txx9/generic/setup_tx4939.c index 729a50991780..b7eccbd17bf7 100644 --- a/arch/mips/txx9/generic/setup_tx4939.c +++ b/arch/mips/txx9/generic/setup_tx4939.c @@ -331,7 +331,8 @@ static int tx4939_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + if (event == NETDEV_CHANGE && netif_carrier_ok(dev)) { __u64 bit = 0; if (dev->irq == TXX9_IRQ_BASE + TX4939_IR_ETH(0)) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 71c2c7116802..34fbc2f60a09 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -3269,9 +3269,9 @@ static int cma_netdev_change(struct 
net_device *ndev, struct rdma_id_private *id } static int cma_netdev_callback(struct notifier_block *self, unsigned long event, - void *ctx) + void *ptr) { - struct net_device *ndev = (struct net_device *)ctx; + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct cma_device *cma_dev; struct rdma_id_private *id_priv; int ret = NOTIFY_DONE; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 23d734349d8e..a188d3178559 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1161,7 +1161,7 @@ static void netdev_removed(struct mlx4_ib_dev *dev, int port) static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct mlx4_ib_dev *ibdev; struct net_device *oldnd; struct mlx4_ib_iboe *iboe; diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 29b846cbfb48..f4489d65bf33 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3277,7 +3277,7 @@ static int bond_slave_netdev_event(unsigned long event, static int bond_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *event_dev = (struct net_device *)ptr; + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); pr_debug("event_dev: %s, event: %lx\n", event_dev ? 
event_dev->name : "None", diff --git a/drivers/net/can/led.c b/drivers/net/can/led.c index f27fca65dc4a..a3d99a8fd2d1 100644 --- a/drivers/net/can/led.c +++ b/drivers/net/can/led.c @@ -88,9 +88,9 @@ EXPORT_SYMBOL_GPL(devm_can_led_init); /* NETDEV rename notifier to rename the associated led triggers too */ static int can_led_notifier(struct notifier_block *nb, unsigned long msg, - void *data) + void *ptr) { - struct net_device *netdev = data; + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct can_priv *priv = safe_candev_priv(netdev); char name[CAN_LED_NAME_SZ]; diff --git a/drivers/net/ethernet/broadcom/cnic.c b/drivers/net/ethernet/broadcom/cnic.c index 6b0dc131b20e..d78d4cf140ed 100644 --- a/drivers/net/ethernet/broadcom/cnic.c +++ b/drivers/net/ethernet/broadcom/cnic.c @@ -5622,7 +5622,7 @@ static void cnic_rcv_netevent(struct cnic_local *cp, unsigned long event, static int cnic_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *netdev = ptr; + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct cnic_dev *dev; int new_dev = 0; diff --git a/drivers/net/ethernet/marvell/skge.c b/drivers/net/ethernet/marvell/skge.c index 171f4b3dda07..c896079728e1 100644 --- a/drivers/net/ethernet/marvell/skge.c +++ b/drivers/net/ethernet/marvell/skge.c @@ -3706,7 +3706,7 @@ static const struct file_operations skge_debug_fops = { static int skge_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct skge_port *skge; struct dentry *d; diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index d175bbd3ffd3..e09a8c6f8536 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -4642,7 +4642,7 @@ static const struct file_operations sky2_debug_fops = { static int sky2_device_event(struct notifier_block *unused, 
unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct sky2_port *sky2 = netdev_priv(dev); if (dev->netdev_ops->ndo_open != sky2_open || !sky2_debug) diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c index af951f343ff6..51e13d92761e 100644 --- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c @@ -3311,7 +3311,7 @@ static int netxen_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netxen_adapter *adapter; - struct net_device *dev = (struct net_device *)ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net_device *orig_dev = dev; struct net_device *slave; diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c index da82f2eb73b4..6bb56d43614b 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -3530,7 +3530,7 @@ static int qlcnic_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct qlcnic_adapter *adapter; - struct net_device *dev = (struct net_device *)ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); recheck: if (dev == NULL) diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 39e4cb39de29..46cc11d5e205 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -2120,7 +2120,7 @@ static void efx_update_name(struct efx_nic *efx) static int efx_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *net_dev = ptr; + struct net_device *net_dev = netdev_notifier_info_to_dev(ptr); if (net_dev->netdev_ops == &efx_netdev_ops && event == NETDEV_CHANGENAME) diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c index 
02de6c891670..f91bf0ddf031 100644 --- a/drivers/net/hamradio/bpqether.c +++ b/drivers/net/hamradio/bpqether.c @@ -103,7 +103,7 @@ static struct packet_type bpq_packet_type __read_mostly = { }; static struct notifier_block bpq_dev_notifier = { - .notifier_call =bpq_device_event, + .notifier_call = bpq_device_event, }; @@ -544,9 +544,10 @@ static void bpq_free_device(struct net_device *ndev) /* * Handle device status changes. */ -static int bpq_device_event(struct notifier_block *this,unsigned long event, void *ptr) +static int bpq_device_event(struct notifier_block *this, + unsigned long event, void *ptr) { - struct net_device *dev = (struct net_device *)ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 1c502bb0c916..edfddc5f61b4 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -921,7 +921,7 @@ static struct rtnl_link_ops macvlan_link_ops = { static int macvlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct macvlan_dev *vlan, *next; struct macvlan_port *port; LIST_HEAD(list_kill); diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 59e9605de316..68efb91a5633 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -1053,7 +1053,7 @@ EXPORT_SYMBOL_GPL(macvtap_get_socket); static int macvtap_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct macvlan_dev *vlan; struct device *classdev; dev_t devt; diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c index 59ac143dec25..1d1d0a12765c 100644 --- a/drivers/net/netconsole.c +++ b/drivers/net/netconsole.c @@ -653,12 +653,11 @@ static struct configfs_subsystem netconsole_subsys = { /* 
Handle network interface device notifications */ static int netconsole_netdev_event(struct notifier_block *this, - unsigned long event, - void *ptr) + unsigned long event, void *ptr) { unsigned long flags; struct netconsole_target *nt; - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); bool stopped = false; if (!(event == NETDEV_CHANGENAME || event == NETDEV_UNREGISTER || diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index bb07ba94c3aa..5f66e30d9823 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -338,7 +338,7 @@ static void pppoe_flush_dev(struct net_device *dev) static int pppoe_device_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = (struct net_device *)ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); /* Only look at sockets that are using this specific device. */ switch (event) { diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 7c43261975bd..9273f48a512b 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -2647,7 +2647,7 @@ static void team_port_change_check(struct team_port *port, bool linkup) static int team_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = (struct net_device *) ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct team_port *port; port = team_port_get_rtnl(dev); diff --git a/drivers/net/wan/dlci.c b/drivers/net/wan/dlci.c index 147614ed86aa..70ac59929f80 100644 --- a/drivers/net/wan/dlci.c +++ b/drivers/net/wan/dlci.c @@ -477,7 +477,7 @@ static void dlci_setup(struct net_device *dev) static int dlci_dev_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = (struct net_device *) ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (dev_net(dev) != &init_net) return NOTIFY_DONE; diff --git a/drivers/net/wan/hdlc.c 
b/drivers/net/wan/hdlc.c index a0a932c63d0a..9c33ca918e19 100644 --- a/drivers/net/wan/hdlc.c +++ b/drivers/net/wan/hdlc.c @@ -99,7 +99,7 @@ static inline void hdlc_proto_stop(struct net_device *dev) static int hdlc_device_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); hdlc_device *hdlc; unsigned long flags; int on; diff --git a/drivers/net/wan/lapbether.c b/drivers/net/wan/lapbether.c index a73b49eb87e3..a33a46fa88dd 100644 --- a/drivers/net/wan/lapbether.c +++ b/drivers/net/wan/lapbether.c @@ -370,7 +370,7 @@ static int lapbeth_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct lapbethdev *lapbeth; - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (dev_net(dev) != &init_net) return NOTIFY_DONE; diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c index 292b24f9bf93..ee721b6cbcdf 100644 --- a/drivers/scsi/fcoe/fcoe.c +++ b/drivers/scsi/fcoe/fcoe.c @@ -1975,7 +1975,7 @@ static int fcoe_device_notification(struct notifier_block *notifier, { struct fcoe_ctlr_device *cdev; struct fc_lport *lport = NULL; - struct net_device *netdev = ptr; + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct fcoe_ctlr *ctlr; struct fcoe_interface *fcoe; struct fcoe_port *port; diff --git a/drivers/scsi/fcoe/fcoe_transport.c b/drivers/scsi/fcoe/fcoe_transport.c index f3a5a53e8631..01adbe0ec53b 100644 --- a/drivers/scsi/fcoe/fcoe_transport.c +++ b/drivers/scsi/fcoe/fcoe_transport.c @@ -704,7 +704,7 @@ static struct net_device *fcoe_if_to_netdev(const char *buffer) static int libfcoe_device_notification(struct notifier_block *notifier, ulong event, void *ptr) { - struct net_device *netdev = ptr; + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_UNREGISTER: diff --git a/drivers/staging/csr/netdev.c b/drivers/staging/csr/netdev.c 
index a0177d998978..d49cdf84a496 100644 --- a/drivers/staging/csr/netdev.c +++ b/drivers/staging/csr/netdev.c @@ -2891,7 +2891,7 @@ void uf_net_get_name(struct net_device *dev, char *name, int len) */ static int uf_netdev_event(struct notifier_block *notif, unsigned long event, void* ptr) { - struct net_device *netdev = ptr; + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); netInterface_priv_t *interfacePriv = (netInterface_priv_t *)netdev_priv(netdev); unifi_priv_t *priv = NULL; static const CsrWifiMacAddress broadcast_address = {{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}}; diff --git a/drivers/staging/ft1000/ft1000-pcmcia/ft1000_proc.c b/drivers/staging/ft1000/ft1000-pcmcia/ft1000_proc.c index 94e426e4d98b..b2330f1df7e7 100644 --- a/drivers/staging/ft1000/ft1000-pcmcia/ft1000_proc.c +++ b/drivers/staging/ft1000/ft1000-pcmcia/ft1000_proc.c @@ -164,7 +164,7 @@ static const struct file_operations ft1000_proc_fops = { static int ft1000NotifyProc(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct ft1000_info *info; info = netdev_priv(dev); diff --git a/drivers/staging/ft1000/ft1000-usb/ft1000_proc.c b/drivers/staging/ft1000/ft1000-usb/ft1000_proc.c index eca6f0292b4b..5ead942be680 100644 --- a/drivers/staging/ft1000/ft1000-usb/ft1000_proc.c +++ b/drivers/staging/ft1000/ft1000-usb/ft1000_proc.c @@ -166,7 +166,7 @@ static const struct file_operations ft1000_proc_fops = { static int ft1000NotifyProc(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct ft1000_info *info; struct proc_dir_entry *ft1000_proc_file; diff --git a/drivers/staging/silicom/bpctl_mod.c b/drivers/staging/silicom/bpctl_mod.c index b7e570ccb759..c8ddb99e8526 100644 --- a/drivers/staging/silicom/bpctl_mod.c +++ b/drivers/staging/silicom/bpctl_mod.c @@ -133,7 +133,7 @@ static 
unsigned long str_to_hex(char *p); static int bp_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); static bpctl_dev_t *pbpctl_dev = NULL, *pbpctl_dev_m = NULL; int dev_num = 0, ret = 0, ret_d = 0, time_left = 0; /* printk("BP_PROC_SUPPORT event =%d %s %d\n", event,dev->name, dev->ifindex ); */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6b2bb460d1d7..13a34848b5e1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1599,6 +1599,19 @@ struct packet_offload { extern int register_netdevice_notifier(struct notifier_block *nb); extern int unregister_netdevice_notifier(struct notifier_block *nb); + +struct netdev_notifier_info { + struct net_device *dev; +}; + +static inline struct net_device * +netdev_notifier_info_to_dev(const struct netdev_notifier_info *info) +{ + return info->dev; +} + +extern int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev, + struct netdev_notifier_info *info); extern int call_netdevice_notifiers(unsigned long val, struct net_device *dev); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 9424f3718ea7..2fb2d88e8c2e 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -341,7 +341,7 @@ static void __vlan_device_event(struct net_device *dev, unsigned long event) static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vlan_group *grp; struct vlan_info *vlan_info; int i, flgs; diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index 173a2e82f486..690356fa52b9 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -332,7 +332,7 @@ static void aarp_expire_timeout(unsigned long unused) static int aarp_device_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = 
ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); int ct; if (!net_eq(dev_net(dev), &init_net)) diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index ef12839a7cfe..7fee50d637f9 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -644,7 +644,7 @@ static inline void atalk_dev_down(struct net_device *dev) static int ddp_device_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; diff --git a/net/atm/clip.c b/net/atm/clip.c index 8ae3a7879335..cce241eb01d9 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -539,9 +539,9 @@ static int clip_create(int number) } static int clip_device_event(struct notifier_block *this, unsigned long event, - void *arg) + void *ptr) { - struct net_device *dev = arg; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; diff --git a/net/atm/mpc.c b/net/atm/mpc.c index d4cc1be5c364..3af12755cd04 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -998,14 +998,12 @@ int msg_to_mpoad(struct k_message *mesg, struct mpoa_client *mpc) } static int mpoa_event_listener(struct notifier_block *mpoa_notifier, - unsigned long event, void *dev_ptr) + unsigned long event, void *ptr) { - struct net_device *dev; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct mpoa_client *mpc; struct lec_priv *priv; - dev = dev_ptr; - if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index e277e38f736b..4b4d2b779ec1 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -111,9 +111,9 @@ again: * Handle device status changes. 
*/ static int ax25_device_event(struct notifier_block *this, unsigned long event, - void *ptr) + void *ptr) { - struct net_device *dev = (struct net_device *)ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; @@ -1974,7 +1974,7 @@ static struct packet_type ax25_packet_type __read_mostly = { }; static struct notifier_block ax25_dev_notifier = { - .notifier_call =ax25_device_event, + .notifier_call = ax25_device_event, }; static int __init ax25_init(void) diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 522243aff2f3..b6504eac0ed8 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -595,7 +595,7 @@ void batadv_hardif_remove_interfaces(void) static int batadv_hard_if_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *net_dev = ptr; + struct net_device *net_dev = netdev_notifier_info_to_dev(ptr); struct batadv_hard_iface *hard_iface; struct batadv_hard_iface *primary_if = NULL; struct batadv_priv *bat_priv; diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c index 1644b3e1f947..3a3f371b2841 100644 --- a/net/bridge/br_notify.c +++ b/net/bridge/br_notify.c @@ -31,7 +31,7 @@ struct notifier_block br_device_notifier = { */ static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net_bridge_port *p; struct net_bridge *br; bool changed_addr; diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 1f9ece1a9c34..4dca159435cf 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -352,9 +352,9 @@ EXPORT_SYMBOL(caif_enroll_dev); /* notify Caif of device events */ static int caif_device_notify(struct notifier_block *me, unsigned long what, - void *arg) + void *ptr) { - struct net_device *dev = arg; + struct net_device *dev = 
netdev_notifier_info_to_dev(ptr); struct caif_device_entry *caifd = NULL; struct caif_dev_common *caifdev; struct cfcnfg *cfg; diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c index 942e00a425fd..75ed04b78fa4 100644 --- a/net/caif/caif_usb.c +++ b/net/caif/caif_usb.c @@ -121,9 +121,9 @@ static struct packet_type caif_usb_type __read_mostly = { }; static int cfusbl_device_notify(struct notifier_block *me, unsigned long what, - void *arg) + void *ptr) { - struct net_device *dev = arg; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct caif_dev_common common; struct cflayer *layer, *link_support; struct usbnet *usbnet; diff --git a/net/can/af_can.c b/net/can/af_can.c index c4e50852c9f4..3ab8dd2e1282 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -794,9 +794,9 @@ EXPORT_SYMBOL(can_proto_unregister); * af_can notifier to create/remove CAN netdevice specific structs */ static int can_notifier(struct notifier_block *nb, unsigned long msg, - void *data) + void *ptr) { - struct net_device *dev = (struct net_device *)data; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct dev_rcv_lists *d; if (!net_eq(dev_net(dev), &init_net)) diff --git a/net/can/bcm.c b/net/can/bcm.c index 8f113e6ff327..46f20bfafc0e 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1350,9 +1350,9 @@ static int bcm_sendmsg(struct kiocb *iocb, struct socket *sock, * notification handler for netdevice status changes */ static int bcm_notifier(struct notifier_block *nb, unsigned long msg, - void *data) + void *ptr) { - struct net_device *dev = (struct net_device *)data; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct bcm_sock *bo = container_of(nb, struct bcm_sock, notifier); struct sock *sk = &bo->sk; struct bcm_op *op; diff --git a/net/can/gw.c b/net/can/gw.c index 3ee690e8c7d3..2f291f961a17 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -445,9 +445,9 @@ static inline void cgw_unregister_filter(struct cgw_job *gwj) } static int 
cgw_notifier(struct notifier_block *nb, - unsigned long msg, void *data) + unsigned long msg, void *ptr) { - struct net_device *dev = (struct net_device *)data; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; diff --git a/net/can/raw.c b/net/can/raw.c index 1085e65f848e..641e1c895123 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -239,9 +239,9 @@ static int raw_enable_allfilters(struct net_device *dev, struct sock *sk) } static int raw_notifier(struct notifier_block *nb, - unsigned long msg, void *data) + unsigned long msg, void *ptr) { - struct net_device *dev = (struct net_device *)data; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct raw_sock *ro = container_of(nb, struct raw_sock, notifier); struct sock *sk = &ro->sk; diff --git a/net/core/dev.c b/net/core/dev.c index 5f747974ac58..54fce6006a83 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1391,6 +1391,20 @@ void dev_disable_lro(struct net_device *dev) } EXPORT_SYMBOL(dev_disable_lro); +static void netdev_notifier_info_init(struct netdev_notifier_info *info, + struct net_device *dev) +{ + info->dev = dev; +} + +static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, + struct net_device *dev) +{ + struct netdev_notifier_info info; + + netdev_notifier_info_init(&info, dev); + return nb->notifier_call(nb, val, &info); +} static int dev_boot_phase = 1; @@ -1423,7 +1437,7 @@ int register_netdevice_notifier(struct notifier_block *nb) goto unlock; for_each_net(net) { for_each_netdev(net, dev) { - err = nb->notifier_call(nb, NETDEV_REGISTER, dev); + err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); err = notifier_to_errno(err); if (err) goto rollback; @@ -1431,7 +1445,7 @@ int register_netdevice_notifier(struct notifier_block *nb) if (!(dev->flags & IFF_UP)) continue; - nb->notifier_call(nb, NETDEV_UP, dev); + call_netdevice_notifier(nb, NETDEV_UP, dev); } } @@ -1447,10 +1461,11 @@ 
rollback: goto outroll; if (dev->flags & IFF_UP) { - nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); - nb->notifier_call(nb, NETDEV_DOWN, dev); + call_netdevice_notifier(nb, NETDEV_GOING_DOWN, + dev); + call_netdevice_notifier(nb, NETDEV_DOWN, dev); } - nb->notifier_call(nb, NETDEV_UNREGISTER, dev); + call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } } @@ -1488,10 +1503,11 @@ int unregister_netdevice_notifier(struct notifier_block *nb) for_each_net(net) { for_each_netdev(net, dev) { if (dev->flags & IFF_UP) { - nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); - nb->notifier_call(nb, NETDEV_DOWN, dev); + call_netdevice_notifier(nb, NETDEV_GOING_DOWN, + dev); + call_netdevice_notifier(nb, NETDEV_DOWN, dev); } - nb->notifier_call(nb, NETDEV_UNREGISTER, dev); + call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } } unlock: @@ -1500,6 +1516,25 @@ unlock: } EXPORT_SYMBOL(unregister_netdevice_notifier); +/** + * call_netdevice_notifiers_info - call all network notifier blocks + * @val: value passed unmodified to notifier function + * @dev: net_device pointer passed unmodified to notifier function + * @info: notifier information data + * + * Call all network notifier blocks. Parameters and return value + * are as for raw_notifier_call_chain(). 
+ */ + +int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev, + struct netdev_notifier_info *info) +{ + ASSERT_RTNL(); + netdev_notifier_info_init(info, dev); + return raw_notifier_call_chain(&netdev_chain, val, info); +} +EXPORT_SYMBOL(call_netdevice_notifiers_info); + /** * call_netdevice_notifiers - call all network notifier blocks * @val: value passed unmodified to notifier function @@ -1511,8 +1546,9 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - ASSERT_RTNL(); - return raw_notifier_call_chain(&netdev_chain, val, dev); + struct netdev_notifier_info info; + + return call_netdevice_notifiers_info(val, dev, &info); } EXPORT_SYMBOL(call_netdevice_notifiers); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index d23b6682f4e9..5e78d44333b9 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -295,9 +295,9 @@ static int net_dm_cmd_trace(struct sk_buff *skb, } static int dropmon_net_event(struct notifier_block *ev_block, - unsigned long event, void *ptr) + unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct dm_hw_stat_delta *new_stat = NULL; struct dm_hw_stat_delta *tmp; diff --git a/net/core/dst.c b/net/core/dst.c index df9cc810ec8e..ca4231ec7347 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -372,7 +372,7 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct dst_entry *dst, *last = NULL; switch (event) { diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index d5a9f8ead0d8..21735440c44a 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -705,9 +705,9 @@ static void detach_rules(struct list_head *rules, struct net_device *dev) 
static int fib_rules_event(struct notifier_block *this, unsigned long event, - void *ptr) + void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct fib_rules_ops *ops; diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 0777d0aa18c3..e533259dce3c 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -261,7 +261,7 @@ struct cgroup_subsys net_prio_subsys = { static int netprio_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netprio_map *old; /* diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 11f2704c3810..795498fd4587 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1921,7 +1921,7 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d static int pktgen_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct pktgen_net *pn = net_generic(dev_net(dev), pg_net_id); if (pn->pktgen_exiting) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a08bd2b7fe3f..49c14451d8ab 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2667,7 +2667,7 @@ static void rtnetlink_rcv(struct sk_buff *skb) static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_UP: diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index c21f200eed93..dd4d506ef923 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -2078,9 +2078,9 @@ out_err: } static int dn_device_event(struct notifier_block *this, unsigned long event, - void *ptr) + void *ptr) { - struct net_device *dev = (struct 
net_device *)ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c index 55e1fd5b3e56..3b9d5f20bd1c 100644 --- a/net/ieee802154/6lowpan.c +++ b/net/ieee802154/6lowpan.c @@ -1352,10 +1352,9 @@ static inline void lowpan_netlink_fini(void) } static int lowpan_device_event(struct notifier_block *unused, - unsigned long event, - void *ptr) + unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); LIST_HEAD(del_list); struct lowpan_dev_record *entry, *tmp; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 247ec1951c35..bf574029a183 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1234,7 +1234,7 @@ out: static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_CHANGEADDR: diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index dfc39d4d48b7..b047e2d8a614 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1333,7 +1333,7 @@ static void inetdev_send_gratuitous_arp(struct net_device *dev, static int inetdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct in_device *in_dev = __in_dev_get_rtnl(dev); ASSERT_RTNL(); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index c7629a209f9d..05a4888dede9 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1038,7 +1038,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct in_device *in_dev; struct 
net *net = dev_net(dev); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 9d9610ae7855..f975399f3522 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1609,7 +1609,7 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct mr_table *mrt; struct vif_device *v; diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 5d5d4d1be9c2..dd5508bde799 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -108,7 +108,7 @@ static int masq_device_event(struct notifier_block *this, unsigned long event, void *ptr) { - const struct net_device *dev = ptr; + const struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (event == NETDEV_DOWN) { diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 432e084b6b62..bce073b4bbd4 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2826,9 +2826,9 @@ static void addrconf_ip6_tnl_config(struct net_device *dev) } static int addrconf_notify(struct notifier_block *this, unsigned long event, - void *data) + void *ptr) { - struct net_device *dev = (struct net_device *) data; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct inet6_dev *idev = __in6_dev_get(dev); int run_pending = 0; int err; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 241fb8ad9fcf..583e8d435f9a 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1319,7 +1319,7 @@ static int ip6mr_mfc_delete(struct mr6_table *mrt, struct mf6cctl *mfc, static int ip6mr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct mr6_table *mrt; 
struct mif_device *v; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 2712ab22a174..a0962697a257 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1568,7 +1568,7 @@ int ndisc_rcv(struct sk_buff *skb) static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct inet6_dev *idev; diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c index 60e9053bab05..b76257cd7e1e 100644 --- a/net/ipv6/netfilter/ip6t_MASQUERADE.c +++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c @@ -71,7 +71,7 @@ static int device_cmp(struct nf_conn *ct, void *ifindex) static int masq_device_event(struct notifier_block *this, unsigned long event, void *ptr) { - const struct net_device *dev = ptr; + const struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (event == NETDEV_DOWN) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ad0aa6b0b86a..194c3cde1536 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2681,9 +2681,9 @@ errout: } static int ip6_route_dev_notify(struct notifier_block *this, - unsigned long event, void *data) + unsigned long event, void *ptr) { - struct net_device *dev = (struct net_device *)data; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) { diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index f547a47d381c..7a1e0fc1bd4d 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -330,7 +330,7 @@ static __inline__ void __ipxitf_put(struct ipx_interface *intrfc) static int ipxitf_device_event(struct notifier_block *notifier, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct ipx_interface *i, *tmp; if (!net_eq(dev_net(dev), &init_net)) diff 
--git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index ae691651b721..168aff5e60de 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -2293,7 +2293,7 @@ out_unlock: static int afiucv_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *event_dev = (struct net_device *)ptr; + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); struct sock *sk; struct iucv_sock *iucv; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 60f1ce5e5e52..d2c3fd178dbe 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1717,10 +1717,9 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local) } static int netdev_notify(struct notifier_block *nb, - unsigned long state, - void *ndev) + unsigned long state, void *ptr) { - struct net_device *dev = ndev; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct ieee80211_sub_if_data *sdata; if (state != NETDEV_CHANGENAME) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 5b142fb16480..7c3ed429789e 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1487,9 +1487,9 @@ ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev) * Currently only NETDEV_DOWN is handled to release refs to cached dsts */ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, - void *ptr) + void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_service *svc; diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 4e27fa035814..0f2ac8f2e7b7 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -800,7 +800,7 @@ static int nfqnl_rcv_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = 
netdev_notifier_info_to_dev(ptr); /* Drop any packets associated with the downed device */ if (event == NETDEV_DOWN) diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index bd93e51d30ac..292934d23482 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -200,7 +200,7 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) static int tee_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct xt_tee_priv *priv; priv = container_of(this, struct xt_tee_priv, notifier); diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 8a6c6ea466d8..af3531926ee0 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -708,7 +708,7 @@ unlhsh_remove_return: * netlbl_unlhsh_netdev_handler - Network device notification handler * @this: notifier block * @event: the event - * @ptr: the network device (cast to void) + * @ptr: the netdevice notifier info (cast to void) * * Description: * Handle network device events, although at present all we care about is a @@ -717,10 +717,9 @@ unlhsh_remove_return: * */ static int netlbl_unlhsh_netdev_handler(struct notifier_block *this, - unsigned long event, - void *ptr) + unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netlbl_unlhsh_iface *iface = NULL; if (!net_eq(dev_net(dev), &init_net)) diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index ec0c80fde69f..698814bfa7ad 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -117,7 +117,7 @@ static void nr_kill_by_device(struct net_device *dev) */ static int nr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = (struct net_device *)ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); if 
(!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c index ef4feec6cd84..c3235675f359 100644 --- a/net/openvswitch/dp_notify.c +++ b/net/openvswitch/dp_notify.c @@ -78,7 +78,7 @@ static int dp_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct ovs_net *ovs_net; - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vport *vport = NULL; if (!ovs_is_internal_dev(dev)) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8ec1bca7f859..79fe63246b27 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3331,10 +3331,11 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, } -static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data) +static int packet_notifier(struct notifier_block *this, + unsigned long msg, void *ptr) { struct sock *sk; - struct net_device *dev = data; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); rcu_read_lock(); diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 45a7df6575de..56a6146ac94b 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -292,9 +292,9 @@ static void phonet_route_autodel(struct net_device *dev) /* notify Phonet of device events */ static int phonet_device_notify(struct notifier_block *me, unsigned long what, - void *arg) + void *ptr) { - struct net_device *dev = arg; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (what) { case NETDEV_REGISTER: diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 9c8347451597..e98fcfbe6007 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -202,10 +202,10 @@ static void rose_kill_by_device(struct net_device *dev) /* * Handle device status changes. 
*/ -static int rose_device_event(struct notifier_block *this, unsigned long event, - void *ptr) +static int rose_device_event(struct notifier_block *this, + unsigned long event, void *ptr) { - struct net_device *dev = (struct net_device *)ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 5d676edc22a6..977c10e0631b 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -243,7 +243,7 @@ nla_put_failure: static int mirred_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct tcf_mirred *m; if (event == NETDEV_UNREGISTER) diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c index 120a676a3360..fc60bea63169 100644 --- a/net/tipc/eth_media.c +++ b/net/tipc/eth_media.c @@ -251,9 +251,9 @@ static void disable_bearer(struct tipc_bearer *tb_ptr) * specified device. */ static int recv_notification(struct notifier_block *nb, unsigned long evt, - void *dv) + void *ptr) { - struct net_device *dev = (struct net_device *)dv; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct eth_bearer *eb_ptr = &eth_bearers[0]; struct eth_bearer *stop = &eth_bearers[MAX_ETH_BEARERS]; diff --git a/net/tipc/ib_media.c b/net/tipc/ib_media.c index 2a2864c25e15..baa9df4327d9 100644 --- a/net/tipc/ib_media.c +++ b/net/tipc/ib_media.c @@ -244,9 +244,9 @@ static void disable_bearer(struct tipc_bearer *tb_ptr) * specified device.
*/ static int recv_notification(struct notifier_block *nb, unsigned long evt, - void *dv) + void *ptr) { - struct net_device *dev = (struct net_device *)dv; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct ib_bearer *ib_ptr = &ib_bearers[0]; struct ib_bearer *stop = &ib_bearers[MAX_IB_BEARERS]; diff --git a/net/wireless/core.c b/net/wireless/core.c index 73405e00c800..01e41191f1bf 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -886,10 +886,9 @@ void cfg80211_leave(struct cfg80211_registered_device *rdev, } static int cfg80211_netdev_notifier_call(struct notifier_block *nb, - unsigned long state, - void *ndev) + unsigned long state, void *ptr) { - struct net_device *dev = ndev; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev; int ret; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 37ca9694aabe..1d964e23853f 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -224,7 +224,7 @@ static void x25_kill_by_device(struct net_device *dev) static int x25_device_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct x25_neigh *nb; if (!net_eq(dev_net(dev), &init_net)) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 23cea0f74336..536ccc95de89 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2784,7 +2784,7 @@ static void __net_init xfrm_dst_ops_init(struct net *net) static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_DOWN: diff --git a/security/selinux/netif.c b/security/selinux/netif.c index 47a49d1a6f6a..694e9e43855f 100644 --- a/security/selinux/netif.c +++ b/security/selinux/netif.c @@ -264,7 +264,7 @@ static int 
sel_netif_avc_callback(u32 event) static int sel_netif_netdev_notifier_handler(struct notifier_block *this, unsigned long event, void *ptr) { - struct net_device *dev = ptr; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (dev_net(dev) != &init_net) return NOTIFY_DONE; -- cgit v1.2.3 From be9efd3653284f2827fd82861e8e9db9a8f726e1 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 28 May 2013 01:30:22 +0000 Subject: net: pass changed flags along with NETDEV_CHANGE event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use new netdevice notifier infrastructure to pass along changed flags. Signed-off-by: Timo Teräs Signed-off-by: Jiri Pirko v2->v3: shortened notifier_info struct name Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ net/core/dev.c | 9 +++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 13a34848b5e1..850271809a9e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1604,6 +1604,11 @@ struct netdev_notifier_info { struct net_device *dev; }; +struct netdev_notifier_change_info { + struct netdev_notifier_info info; /* must be first */ + unsigned int flags_changed; +}; + static inline struct net_device * netdev_notifier_info_to_dev(const struct netdev_notifier_info *info) { diff --git a/net/core/dev.c b/net/core/dev.c index 54fce6006a83..6eb621cc3b81 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4771,8 +4771,13 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) } if (dev->flags & IFF_UP && - (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) - call_netdevice_notifiers(NETDEV_CHANGE, dev); + (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { + struct netdev_notifier_change_info change_info; + + change_info.flags_changed = changes; + call_netdevice_notifiers_info(NETDEV_CHANGE, dev, + 
&change_info.info); + } } /** -- cgit v1.2.3 From 06ecf24bdf2b7afc6c8fd13de6dba2a96dd331b6 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 28 May 2013 13:15:50 -0700 Subject: net: Fix build warnings after mac_header and transport_header became __u16. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit net/core/skbuff.c: In function ‘__alloc_skb_head’: net/core/skbuff.c:203:2: warning: large integer implicitly truncated to unsigned type [-Woverflow] net/core/skbuff.c: In function ‘__alloc_skb’: net/core/skbuff.c:279:2: warning: large integer implicitly truncated to unsigned type [-Woverflow] net/core/skbuff.c:280:2: warning: large integer implicitly truncated to unsigned type [-Woverflow] net/core/skbuff.c: In function ‘build_skb’: net/core/skbuff.c:348:2: warning: large integer implicitly truncated to unsigned type [-Woverflow] net/core/skbuff.c:349:2: warning: large integer implicitly truncated to unsigned type [-Woverflow] Signed-off-by: David S. 
Miller --- net/core/skbuff.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d6298914f4e7..f45de077ab9e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -200,7 +200,7 @@ struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node) atomic_set(&skb->users, 1); #ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->mac_header = ~0U; + skb->mac_header = (__u16) ~0U; #endif out: return skb; @@ -276,8 +276,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb_reset_tail_pointer(skb); skb->end = skb->tail + size; #ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->mac_header = ~0U; - skb->transport_header = ~0U; + skb->mac_header = (__u16) ~0U; + skb->transport_header = (__u16) ~0U; #endif /* make sure we initialize shinfo sequentially */ @@ -345,8 +345,8 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) skb_reset_tail_pointer(skb); skb->end = skb->tail + size; #ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->mac_header = ~0U; - skb->transport_header = ~0U; + skb->mac_header = (__u16) ~0U; + skb->transport_header = (__u16) ~0U; #endif /* make sure we initialize shinfo sequentially */ -- cgit v1.2.3 From 75538c2b85cf22eb9af6adfaf26ed7219025adeb Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 29 May 2013 11:30:50 +0800 Subject: net: always pass struct netdev_notifier_info to netdevice notifiers commit 351638e7deeed2ec8ce451b53d3 (net: pass info struct via netdevice notifier) breaks booting of my KVM guest, this is due to we still forget to pass struct netdev_notifier_info in several places. This patch completes it. Cc: Jiri Pirko Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 6 ++++++ net/atm/clip.c | 4 +++- net/core/dev.c | 6 ------ net/ipv4/netfilter/ipt_MASQUERADE.c | 5 ++++- net/ipv6/addrconf.c | 7 +++++-- net/ipv6/netfilter/ip6t_MASQUERADE.c | 4 +++- 6 files changed, 21 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 850271809a9e..8f967e34142b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1609,6 +1609,12 @@ struct netdev_notifier_change_info { unsigned int flags_changed; }; +static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, + struct net_device *dev) +{ + info->dev = dev; +} + static inline struct net_device * netdev_notifier_info_to_dev(const struct netdev_notifier_info *info) { diff --git a/net/atm/clip.c b/net/atm/clip.c index cce241eb01d9..8215f7cb170b 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -575,6 +575,7 @@ static int clip_inet_event(struct notifier_block *this, unsigned long event, void *ifa) { struct in_device *in_dev; + struct netdev_notifier_info info; in_dev = ((struct in_ifaddr *)ifa)->ifa_dev; /* @@ -583,7 +584,8 @@ static int clip_inet_event(struct notifier_block *this, unsigned long event, */ if (event != NETDEV_UP) return NOTIFY_DONE; - return clip_device_event(this, NETDEV_CHANGE, in_dev->dev); + netdev_notifier_info_init(&info, in_dev->dev); + return clip_device_event(this, NETDEV_CHANGE, &info); } static struct notifier_block clip_dev_notifier = { diff --git a/net/core/dev.c b/net/core/dev.c index 6eb621cc3b81..b2e9057be3bf 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1391,12 +1391,6 @@ void dev_disable_lro(struct net_device *dev) } EXPORT_SYMBOL(dev_disable_lro); -static void netdev_notifier_info_init(struct netdev_notifier_info *info, - struct net_device *dev) -{ - info->dev = dev; -} - static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { diff --git 
a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index dd5508bde799..30e4de940567 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -129,7 +129,10 @@ static int masq_inet_event(struct notifier_block *this, void *ptr) { struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; - return masq_device_event(this, event, dev); + struct netdev_notifier_info info; + + netdev_notifier_info_init(&info, dev); + return masq_device_event(this, event, &info); } static struct notifier_block masq_dev_notifier = { diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index bce073b4bbd4..7b34f06af344 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4645,13 +4645,16 @@ int addrconf_sysctl_forward(ctl_table *ctl, int write, static void dev_disable_change(struct inet6_dev *idev) { + struct netdev_notifier_info info; + if (!idev || !idev->dev) return; + netdev_notifier_info_init(&info, idev->dev); if (idev->cnf.disable_ipv6) - addrconf_notify(NULL, NETDEV_DOWN, idev->dev); + addrconf_notify(NULL, NETDEV_DOWN, &info); else - addrconf_notify(NULL, NETDEV_UP, idev->dev); + addrconf_notify(NULL, NETDEV_UP, &info); } static void addrconf_disable_change(struct net *net, __s32 newf) diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c index b76257cd7e1e..47bff6107519 100644 --- a/net/ipv6/netfilter/ip6t_MASQUERADE.c +++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c @@ -89,8 +89,10 @@ static int masq_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { struct inet6_ifaddr *ifa = ptr; + struct netdev_notifier_info info; - return masq_device_event(this, event, ifa->idev->dev); + netdev_notifier_info_init(&info, ifa->idev->dev); + return masq_device_event(this, event, &info); } static struct notifier_block masq_inet_notifier = { -- cgit v1.2.3 From ced14f6804a979d1972415bc23f2f8ddb18595dd Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 28 May 2013 
20:34:25 +0000 Subject: net: Correct comparisons and calculations using skb->tail and skb->transport_header This corrects a regression introduced by "net: Use 16bits for *_headers fields of struct skbuff" when NET_SKBUFF_DATA_USES_OFFSET is not set. In that case skb->tail will be a pointer whereas skb->transport_header will be an offset from head. This is corrected by using wrappers that ensure that comparisons and calculations are always made using pointers. Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- include/net/inet_ecn.h | 6 ++++-- net/core/dev.c | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index aab73757bc4d..3bd22795c3e2 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -134,12 +134,14 @@ static inline int INET_ECN_set_ce(struct sk_buff *skb) { switch (skb->protocol) { case cpu_to_be16(ETH_P_IP): - if (skb->network_header + sizeof(struct iphdr) <= skb->tail) + if (skb_network_header(skb) + sizeof(struct iphdr) <= + skb_tail_pointer(skb)) return IP_ECN_set_ce(ip_hdr(skb)); break; case cpu_to_be16(ETH_P_IPV6): - if (skb->network_header + sizeof(struct ipv6hdr) <= skb->tail) + if (skb_network_header(skb) + sizeof(struct ipv6hdr) <= + skb_tail_pointer(skb)) return IP6_ECN_set_ce(ipv6_hdr(skb)); break; } diff --git a/net/core/dev.c b/net/core/dev.c index b2e9057be3bf..d4d874a25e45 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1724,7 +1724,7 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) skb_reset_mac_header(skb2); if (skb_network_header(skb2) < skb2->data || - skb2->network_header > skb2->tail) { + skb_network_header(skb2) > skb_tail_pointer(skb2)) { net_crit_ratelimited("protocol %04x is buggy, dev %s\n", ntohs(skb2->protocol), dev->name); @@ -3892,7 +3892,7 @@ static void skb_gro_reset_offset(struct sk_buff *skb) NAPI_GRO_CB(skb)->frag0 = NULL; NAPI_GRO_CB(skb)->frag0_len = 0; - if
(skb->mac_header == skb->tail && + if (skb_mac_header(skb) == skb_tail_pointer(skb) && pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); -- cgit v1.2.3 From 7cc461900549fc480eb133948649a1edb7eaaa6f Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 28 May 2013 20:34:29 +0000 Subject: net, ipv4, ipv6: Correct assignment of skb->network_header to skb->tail This corrects a regression introduced by "net: Use 16bits for *_headers fields of struct skbuff" when NET_SKBUFF_DATA_USES_OFFSET is not set. In that case skb->tail will be a pointer however skb->network_header is now an offset. This patch corrects the problem by adding a wrapper to return skb tail as an offset regardless of the value of NET_SKBUFF_DATA_USES_OFFSET. It seems that the skb->tail offset may be more than 64k and some care has been taken to treat such cases as an error. Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/skbuff.h | 9 +++++++++ net/core/netpoll.c | 9 ++++++++- net/core/pktgen.c | 16 ++++++++++++++-- net/ipv4/ipmr.c | 8 +++++++- 4 files changed, 38 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8f2b830772a8..5f931191cf57 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1391,6 +1391,11 @@ static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset) skb_reset_tail_pointer(skb); skb->tail += offset; } + +static inline unsigned long skb_tail_offset(const struct sk_buff *skb) +{ + return skb->tail; +} #else /* NET_SKBUFF_DATA_USES_OFFSET */ static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb) { @@ -1407,6 +1412,10 @@ static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset) skb->tail = skb->data + offset; } +static inline unsigned long skb_tail_offset(const struct sk_buff *skb) +{ + return skb->tail - skb->head; +} #endif /*
NET_SKBUFF_DATA_USES_OFFSET */ /* diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 37deedd48bcc..688517c7ff17 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -676,6 +676,8 @@ static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo spin_lock_irqsave(&npinfo->rx_lock, flags); list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { + unsigned long tail_offset; + if (!ipv6_addr_equal(daddr, &np->local_ip.in6)) continue; @@ -700,7 +702,12 @@ static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo hdr->saddr = *saddr; hdr->daddr = *daddr; - send_skb->transport_header = send_skb->tail; + tail_offset = skb_tail_offset(skb); + if (tail_offset > 0xffff) { + kfree_skb(send_skb); + continue; + } + skb_set_network_header(send_skb, tail_offset); skb_put(send_skb, size); icmp6h = (struct icmp6hdr *)skb_transport_header(skb); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 795498fd4587..d2ede89662be 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2642,6 +2642,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, __be16 *svlan_tci = NULL; /* Encapsulates priority and SVLAN ID */ __be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */ u16 queue_map; + unsigned long tail_offset; if (pkt_dev->nr_labels) protocol = htons(ETH_P_MPLS_UC); @@ -2708,7 +2709,12 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, *vlan_encapsulated_proto = htons(ETH_P_IP); } - skb->network_header = skb->tail; + tail_offset = skb_tail_offset(skb); + if (tail_offset > 0xffff) { + kfree_skb(skb); + return NULL; + } + skb_set_network_header(skb, tail_offset); skb->transport_header = skb->network_header + sizeof(struct iphdr); skb_put(skb, sizeof(struct iphdr) + sizeof(struct udphdr)); skb_set_queue_mapping(skb, queue_map); @@ -2775,6 +2781,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, __be16 *svlan_tci = NULL; /* Encapsulates 
priority and SVLAN ID */ __be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */ u16 queue_map; + unsigned long tail_offset; if (pkt_dev->nr_labels) protocol = htons(ETH_P_MPLS_UC); @@ -2822,7 +2829,12 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, *vlan_encapsulated_proto = htons(ETH_P_IPV6); } - skb->network_header = skb->tail; + tail_offset = skb_tail_offset(skb); + if (tail_offset > 0xffff) { + kfree_skb(skb); + return NULL; + } + skb_set_network_header(skb, tail_offset); skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); skb_put(skb, sizeof(struct ipv6hdr) + sizeof(struct udphdr)); skb_set_queue_mapping(skb, queue_map); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index f975399f3522..df97f0ac1a1c 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -945,6 +945,7 @@ static int ipmr_cache_report(struct mr_table *mrt, struct igmpmsg *msg; struct sock *mroute_sk; int ret; + unsigned long tail_offset; #ifdef CONFIG_IP_PIMSM if (assert == IGMPMSG_WHOLEPKT) @@ -980,7 +981,12 @@ static int ipmr_cache_report(struct mr_table *mrt, /* Copy the IP header */ - skb->network_header = skb->tail; + tail_offset = skb_tail_offset(skb); + if (tail_offset > 0xffff) { + kfree_skb(skb); + return -EINVAL; + } + skb_set_network_header(skb, tail_offset); skb_put(skb, ihl); skb_copy_to_linear_data(skb, pkt->data, ihl); ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */ -- cgit v1.2.3 From 35d0461061f27eeb62de63174959edbbb9e434de Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 29 May 2013 15:16:05 +0800 Subject: net: clean up skb headers code commit 1a37e412a0225fcba5587 (net: Use 16bits for *_headers fields of struct skbuff) converts skb->*_header to u16, some #if NET_SKBUFF_DATA_USES_OFFSET are now useless, and to be safe, we could just use "X = (typeof(X)) ~0U;" as suggested by David. Cc: David S. Miller Cc: Simon Horman Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 4 ++-- net/core/skbuff.c | 16 +++++----------- 2 files changed, 7 insertions(+), 13 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5f931191cf57..b9997907a0f1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1593,7 +1593,7 @@ static inline void skb_set_inner_mac_header(struct sk_buff *skb, } static inline bool skb_transport_header_was_set(const struct sk_buff *skb) { - return skb->transport_header != ~0U; + return skb->transport_header != (typeof(skb->transport_header))~0U; } static inline unsigned char *skb_transport_header(const struct sk_buff *skb) @@ -1636,7 +1636,7 @@ static inline unsigned char *skb_mac_header(const struct sk_buff *skb) static inline int skb_mac_header_was_set(const struct sk_buff *skb) { - return skb->mac_header != ~0U; + return skb->mac_header != (typeof(skb->mac_header))~0U; } static inline void skb_reset_mac_header(struct sk_buff *skb) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f45de077ab9e..6b1b52c5593b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -199,9 +199,7 @@ struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node) skb->truesize = sizeof(struct sk_buff); atomic_set(&skb->users, 1); -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->mac_header = (__u16) ~0U; -#endif + skb->mac_header = (typeof(skb->mac_header))~0U; out: return skb; } @@ -275,10 +273,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb->data = data; skb_reset_tail_pointer(skb); skb->end = skb->tail + size; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->mac_header = (__u16) ~0U; - skb->transport_header = (__u16) ~0U; -#endif + skb->mac_header = (typeof(skb->mac_header))~0U; + skb->transport_header = (typeof(skb->transport_header))~0U; /* make sure we initialize shinfo sequentially */ shinfo = skb_shinfo(skb); @@ -344,10 +340,8 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) skb->data = data; 
skb_reset_tail_pointer(skb); skb->end = skb->tail + size; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->mac_header = (__u16) ~0U; - skb->transport_header = (__u16) ~0U; -#endif + skb->mac_header = (typeof(skb->mac_header))~0U; + skb->transport_header = (typeof(skb->transport_header))~0U; /* make sure we initialize shinfo sequentially */ shinfo = skb_shinfo(skb); -- cgit v1.2.3 From 430f03cde2fb9596d8b562824471e298a8080df9 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Sun, 2 Jun 2013 20:43:55 +0000 Subject: net: mark netdev_create_hash __net_init netdev_create_hash() is only called from netdev_init() which is marked __net_init. Signed-off-by: Baruch Siach Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index d4d874a25e45..9c18557f93c6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6088,7 +6088,7 @@ netdev_features_t netdev_increment_features(netdev_features_t all, } EXPORT_SYMBOL(netdev_increment_features); -static struct hlist_head *netdev_create_hash(void) +static struct hlist_head * __net_init netdev_create_hash(void) { int i; struct hlist_head *hash; -- cgit v1.2.3 From 600fed5e97afca10356952e334f362e82fc71466 Mon Sep 17 00:00:00 2001 From: Yan Burman Date: Mon, 3 Jun 2013 02:03:34 +0000 Subject: net/ethtool: Fix comment regarding location of dev_ethtool() call Signed-off-by: Yan Burman Signed-off-by: David S. Miller --- net/core/ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 4e6f63ade741..cd23d314d68a 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1414,7 +1414,7 @@ static int ethtool_get_module_eeprom(struct net_device *dev, modinfo.eeprom_len); } -/* The main entry point in this file. Called from net/core/dev.c */ +/* The main entry point in this file. 
Called from net/core/dev_ioctl.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) { -- cgit v1.2.3 From 525cebedb32a87fa48584bc44e14170beb2c10d1 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Mon, 3 Jun 2013 11:49:23 +0000 Subject: pktgen: Fix position of ip and udp header skb_set_network_header() expects an offset based on the data pointer whereas skb_tail_offset() also includes the headroom. This resulted in the ip header being written in a wrong location. Use return values of skb_put() directly and rely on skb->len to set mac, network, and transport header. Cc: Simon Horman Cc: Daniel Borkmann Assisted-by: Daniel Borkmann Signed-off-by: Thomas Graf Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/pktgen.c | 39 ++++++++++++--------------------------- 1 file changed, 12 insertions(+), 27 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index d2ede89662be..303412d8332b 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2642,7 +2642,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, __be16 *svlan_tci = NULL; /* Encapsulates priority and SVLAN ID */ __be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */ u16 queue_map; - unsigned long tail_offset; if (pkt_dev->nr_labels) protocol = htons(ETH_P_MPLS_UC); @@ -2709,20 +2708,15 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, *vlan_encapsulated_proto = htons(ETH_P_IP); } - tail_offset = skb_tail_offset(skb); - if (tail_offset > 0xffff) { - kfree_skb(skb); - return NULL; - } - skb_set_network_header(skb, tail_offset); - skb->transport_header = skb->network_header + sizeof(struct iphdr); - skb_put(skb, sizeof(struct iphdr) + sizeof(struct udphdr)); + skb_set_mac_header(skb, 0); + skb_set_network_header(skb, skb->len); + iph = (struct iphdr *) skb_put(skb, sizeof(struct iphdr)); + + skb_set_transport_header(skb, skb->len); + udph = (struct udphdr *) skb_put(skb, sizeof(struct 
udphdr)); skb_set_queue_mapping(skb, queue_map); skb->priority = pkt_dev->skb_priority; - iph = ip_hdr(skb); - udph = udp_hdr(skb); - memcpy(eth, pkt_dev->hh, 12); *(__be16 *) & eth[12] = protocol; @@ -2752,8 +2746,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, iph->check = 0; iph->check = ip_fast_csum((void *)iph, iph->ihl); skb->protocol = protocol; - skb->mac_header = (skb->network_header - ETH_HLEN - - pkt_dev->pkt_overhead); skb->dev = odev; skb->pkt_type = PACKET_HOST; pktgen_finalize_skb(pkt_dev, skb, datalen); @@ -2781,7 +2773,6 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, __be16 *svlan_tci = NULL; /* Encapsulates priority and SVLAN ID */ __be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */ u16 queue_map; - unsigned long tail_offset; if (pkt_dev->nr_labels) protocol = htons(ETH_P_MPLS_UC); @@ -2829,18 +2820,14 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, *vlan_encapsulated_proto = htons(ETH_P_IPV6); } - tail_offset = skb_tail_offset(skb); - if (tail_offset > 0xffff) { - kfree_skb(skb); - return NULL; - } - skb_set_network_header(skb, tail_offset); - skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); - skb_put(skb, sizeof(struct ipv6hdr) + sizeof(struct udphdr)); + skb_set_mac_header(skb, 0); + skb_set_network_header(skb, skb->len); + iph = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr)); + + skb_set_transport_header(skb, skb->len); + udph = (struct udphdr *) skb_put(skb, sizeof(struct udphdr)); skb_set_queue_mapping(skb, queue_map); skb->priority = pkt_dev->skb_priority; - iph = ipv6_hdr(skb); - udph = udp_hdr(skb); memcpy(eth, pkt_dev->hh, 12); *(__be16 *) &eth[12] = protocol; @@ -2875,8 +2862,6 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, iph->daddr = pkt_dev->cur_in6_daddr; iph->saddr = pkt_dev->cur_in6_saddr; - skb->mac_header = (skb->network_header - ETH_HLEN - - pkt_dev->pkt_overhead); skb->protocol
= protocol; skb->dev = odev; skb->pkt_type = PACKET_HOST; -- cgit v1.2.3 From 00f97da17a0c8d656d0c9a60b1d7f38735f69817 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Mon, 3 Jun 2013 16:31:36 +0000 Subject: netpoll: fix position of network header Similar to the problem in pktgen, netpoll uses skb_tail_offset() too, as the code is copied from pktgen. Also use return values of skb_put() directly, this will simplify the code. Reported-by: Thomas Graf Cc: Thomas Graf Cc: Daniel Borkmann Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/netpoll.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) (limited to 'net/core') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 688517c7ff17..03c8ec3edc72 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -676,8 +676,6 @@ static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo spin_lock_irqsave(&npinfo->rx_lock, flags); list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { - unsigned long tail_offset; - if (!ipv6_addr_equal(daddr, &np->local_ip.in6)) continue; @@ -691,30 +689,20 @@ static void netpoll_neigh_reply(struct sk_buff *skb, struct netpoll_info *npinfo send_skb->dev = skb->dev; skb_reset_network_header(send_skb); - skb_put(send_skb, sizeof(struct ipv6hdr)); - hdr = ipv6_hdr(send_skb); - + hdr = (struct ipv6hdr *) skb_put(send_skb, sizeof(struct ipv6hdr)); *(__be32*)hdr = htonl(0x60000000); - hdr->payload_len = htons(size); hdr->nexthdr = IPPROTO_ICMPV6; hdr->hop_limit = 255; hdr->saddr = *saddr; hdr->daddr = *daddr; - tail_offset = skb_tail_offset(skb); - if (tail_offset > 0xffff) { - kfree_skb(send_skb); - continue; - } - skb_set_network_header(send_skb, tail_offset); - skb_put(send_skb, size); - - icmp6h = (struct icmp6hdr *)skb_transport_header(skb); + icmp6h = (struct icmp6hdr *) skb_put(send_skb, sizeof(struct icmp6hdr)); icmp6h->icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; icmp6h->icmp6_router = 0;
icmp6h->icmp6_solicited = 1; - target = (struct in6_addr *)(skb_transport_header(send_skb) + sizeof(struct icmp6hdr)); + + target = (struct in6_addr *) skb_put(send_skb, sizeof(struct in6_addr)); *target = msg->target; icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, size, IPPROTO_ICMPV6, -- cgit v1.2.3 From af12fa6e46aa651e7b86a4c4117b562518fef184 Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Mon, 10 Jun 2013 11:39:41 +0300 Subject: net: add napi_id and hash Adds a napi_id and a hashing mechanism to lookup a napi by id. This will be used by subsequent patches to implement low latency Ethernet device polling. Based on a code sample by Eric Dumazet. Signed-off-by: Eliezer Tamir Signed-off-by: Eric Dumazet Tested-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/netdevice.h | 29 +++++++++++++++++++++++ net/core/dev.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8f967e34142b..39bbd462d68e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -324,12 +324,15 @@ struct napi_struct { struct sk_buff *gro_list; struct sk_buff *skb; struct list_head dev_list; + struct hlist_node napi_hash_node; + unsigned int napi_id; }; enum { NAPI_STATE_SCHED, /* Poll is scheduled */ NAPI_STATE_DISABLE, /* Disable pending */ NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ + NAPI_STATE_HASHED, /* In NAPI hash */ }; enum gro_result { @@ -445,6 +448,32 @@ static inline bool napi_reschedule(struct napi_struct *napi) extern void __napi_complete(struct napi_struct *n); extern void napi_complete(struct napi_struct *n); +/** + * napi_by_id - lookup a NAPI by napi_id + * @napi_id: hashed napi_id + * + * lookup @napi_id in napi_hash table + * must be called under rcu_read_lock() + */ +extern struct napi_struct *napi_by_id(unsigned int napi_id); + +/** + * napi_hash_add - add a NAPI to global 
hashtable + * @napi: napi context + * + * generate a new napi_id and store a @napi under it in napi_hash + */ +extern void napi_hash_add(struct napi_struct *napi); + +/** + * napi_hash_del - remove a NAPI from global table + * @napi: napi context + * + * Warning: caller must observe rcu grace period + * before freeing memory containing @napi + */ +extern void napi_hash_del(struct napi_struct *napi); + /** * napi_disable - prevent NAPI from scheduling * @n: napi context diff --git a/net/core/dev.c b/net/core/dev.c index 9c18557f93c6..fa007dba6beb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -129,6 +129,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -166,6 +167,12 @@ static struct list_head offload_base __read_mostly; DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); +/* protects napi_hash addition/deletion and napi_gen_id */ +static DEFINE_SPINLOCK(napi_hash_lock); + +static unsigned int napi_gen_id; +static DEFINE_HASHTABLE(napi_hash, 8); + seqcount_t devnet_rename_seq; static inline void dev_base_seq_inc(struct net *net) @@ -4136,6 +4143,58 @@ void napi_complete(struct napi_struct *n) } EXPORT_SYMBOL(napi_complete); +/* must be called under rcu_read_lock(), as we dont take a reference */ +struct napi_struct *napi_by_id(unsigned int napi_id) +{ + unsigned int hash = napi_id % HASH_SIZE(napi_hash); + struct napi_struct *napi; + + hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) + if (napi->napi_id == napi_id) + return napi; + + return NULL; +} +EXPORT_SYMBOL_GPL(napi_by_id); + +void napi_hash_add(struct napi_struct *napi) +{ + if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) { + + spin_lock(&napi_hash_lock); + + /* 0 is not a valid id, we also skip an id that is taken + * we expect both events to be extremely rare + */ + napi->napi_id = 0; + while (!napi->napi_id) { + napi->napi_id = ++napi_gen_id; + if (napi_by_id(napi->napi_id)) + napi->napi_id = 0; + } + + hlist_add_head_rcu(&napi->napi_hash_node, + 
&napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + + spin_unlock(&napi_hash_lock); + } +} +EXPORT_SYMBOL_GPL(napi_hash_add); + +/* Warning : caller is responsible to make sure rcu grace period + * is respected before freeing memory containing @napi + */ +void napi_hash_del(struct napi_struct *napi) +{ + spin_lock(&napi_hash_lock); + + if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) + hlist_del_rcu(&napi->napi_hash_node); + + spin_unlock(&napi_hash_lock); +} +EXPORT_SYMBOL_GPL(napi_hash_del); + void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { -- cgit v1.2.3 From 060212928670593fb89243640bf05cf89560b023 Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Mon, 10 Jun 2013 11:39:50 +0300 Subject: net: add low latency socket poll Adds an ndo_ll_poll method and the code that supports it. This method can be used by low latency applications to busy-poll Ethernet device queues directly from the socket code. sysctl_net_ll_poll controls how many microseconds to poll. Default is zero (disabled). Individual protocol support will be added by subsequent patches. Signed-off-by: Alexander Duyck Signed-off-by: Jesse Brandeburg Signed-off-by: Eliezer Tamir Acked-by: Eric Dumazet Tested-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- Documentation/sysctl/net.txt | 7 ++ include/linux/netdevice.h | 3 + include/linux/skbuff.h | 8 ++- include/net/ll_poll.h | 148 +++++++++++++++++++++++++++++++++++++++++++ include/net/sock.h | 4 ++ include/uapi/linux/snmp.h | 1 + net/Kconfig | 12 ++++ net/core/skbuff.c | 4 ++ net/core/sock.c | 6 ++ net/core/sysctl_net_core.c | 10 +++ net/ipv4/proc.c | 1 + net/socket.c | 6 ++ 12 files changed, 208 insertions(+), 2 deletions(-) create mode 100644 include/net/ll_poll.h (limited to 'net/core') diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index c1f8640c2fc8..85ab72dcdc3c 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -50,6 +50,13 @@ The maximum number of packets that kernel can handle on a NAPI interrupt, it's a Per-CPU variable. Default: 64 +low_latency_poll +---------------- +Low latency busy poll timeout. (needs CONFIG_NET_LL_RX_POLL) +Approximate time in us to spin waiting for packets on the device queue. +Recommended value is 50. May increase power usage. 
+Default: 0 (off) + rmem_default ------------ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 39bbd462d68e..2ecb96d9a1e5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -971,6 +971,9 @@ struct net_device_ops { struct netpoll_info *info, gfp_t gfp); void (*ndo_netpoll_cleanup)(struct net_device *dev); +#endif +#ifdef CONFIG_NET_LL_RX_POLL + int (*ndo_ll_poll)(struct napi_struct *dev); #endif int (*ndo_set_vf_mac)(struct net_device *dev, int queue, u8 *mac); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9995834d2cb6..400d82ae2b03 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -386,6 +386,7 @@ typedef unsigned char *sk_buff_data_t; * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS * @dma_cookie: a cookie to one of several possible DMA operations * done by skb DMA functions + * @napi_id: id of the NAPI struct this skb came from * @secmark: security marking * @mark: Generic packet mark * @dropcount: total number of sk_receive_queue overflows @@ -500,8 +501,11 @@ struct sk_buff { /* 7/9 bit hole (depending on ndisc_nodetype presence) */ kmemcheck_bitfield_end(flags2); -#ifdef CONFIG_NET_DMA - dma_cookie_t dma_cookie; +#if defined CONFIG_NET_DMA || defined CONFIG_NET_LL_RX_POLL + union { + unsigned int napi_id; + dma_cookie_t dma_cookie; + }; #endif #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; diff --git a/include/net/ll_poll.h b/include/net/ll_poll.h new file mode 100644 index 000000000000..bc262f88173f --- /dev/null +++ b/include/net/ll_poll.h @@ -0,0 +1,148 @@ +/* + * Low Latency Sockets + * Copyright(c) 2013 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + * Author: Eliezer Tamir + * + * Contact Information: + * e1000-devel Mailing List + */ + +/* + * For now this depends on CONFIG_X86_TSC + */ + +#ifndef _LINUX_NET_LL_POLL_H +#define _LINUX_NET_LL_POLL_H + +#include +#include + +#ifdef CONFIG_NET_LL_RX_POLL + +struct napi_struct; +extern unsigned long sysctl_net_ll_poll __read_mostly; + +/* return values from ndo_ll_poll */ +#define LL_FLUSH_FAILED -1 +#define LL_FLUSH_BUSY -2 + +/* we don't mind a ~2.5% imprecision */ +#define TSC_MHZ (tsc_khz >> 10) + +static inline cycles_t ll_end_time(void) +{ + return TSC_MHZ * ACCESS_ONCE(sysctl_net_ll_poll) + get_cycles(); +} + +static inline bool sk_valid_ll(struct sock *sk) +{ + return sysctl_net_ll_poll && sk->sk_napi_id && + !need_resched() && !signal_pending(current); +} + +static inline bool can_poll_ll(cycles_t end_time) +{ + return !time_after((unsigned long)get_cycles(), + (unsigned long)end_time); +} + +static inline bool sk_poll_ll(struct sock *sk, int nonblock) +{ + cycles_t end_time = ll_end_time(); + const struct net_device_ops *ops; + struct napi_struct *napi; + int rc = false; + + /* + * rcu read lock for napi hash + * bh so we don't race with net_rx_action + */ + rcu_read_lock_bh(); + + napi = napi_by_id(sk->sk_napi_id); + if (!napi) + goto out; + + ops = napi->dev->netdev_ops; + if (!ops->ndo_ll_poll) + goto out; + + do { + + rc = ops->ndo_ll_poll(napi); + + if (rc == LL_FLUSH_FAILED) + break; /* permanent failure */ + + if (rc > 0) + /* local bh are disabled so it is ok to use _BH */ + 
NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_LOWLATENCYRXPACKETS, rc); + + } while (skb_queue_empty(&sk->sk_receive_queue) + && can_poll_ll(end_time) && !nonblock); + + rc = !skb_queue_empty(&sk->sk_receive_queue); +out: + rcu_read_unlock_bh(); + return rc; +} + +/* used in the NIC receive handler to mark the skb */ +static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi) +{ + skb->napi_id = napi->napi_id; +} + +/* used in the protocol hanlder to propagate the napi_id to the socket */ +static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb) +{ + sk->sk_napi_id = skb->napi_id; +} + +#else /* CONFIG_NET_LL_RX_POLL */ + +static inline cycles_t ll_end_time(void) +{ + return 0; +} + +static inline bool sk_valid_ll(struct sock *sk) +{ + return false; +} + +static inline bool sk_poll_ll(struct sock *sk, int nonblock) +{ + return false; +} + +static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi) +{ +} + +static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb) +{ +} + +static inline bool can_poll_ll(cycles_t end_time) +{ + return false; +} + +#endif /* CONFIG_NET_LL_RX_POLL */ +#endif /* _LINUX_NET_LL_POLL_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 66772cf8c3c5..ac8e1818380c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -229,6 +229,7 @@ struct cg_proto; * @sk_omem_alloc: "o" is "option" or "other" * @sk_wmem_queued: persistent queue size * @sk_forward_alloc: space allocated forward + * @sk_napi_id: id of the last napi context to receive data for sk * @sk_allocation: allocation mode * @sk_sndbuf: size of send buffer in bytes * @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, @@ -324,6 +325,9 @@ struct sock { int sk_forward_alloc; #ifdef CONFIG_RPS __u32 sk_rxhash; +#endif +#ifdef CONFIG_NET_LL_RX_POLL + unsigned int sk_napi_id; #endif atomic_t sk_drops; int sk_rcvbuf; diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 
df2e8b4f9c03..26cbf76f8058 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -253,6 +253,7 @@ enum LINUX_MIB_TCPFASTOPENLISTENOVERFLOW, /* TCPFastOpenListenOverflow */ LINUX_MIB_TCPFASTOPENCOOKIEREQD, /* TCPFastOpenCookieReqd */ LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES, /* TCPSpuriousRtxHostQueues */ + LINUX_MIB_LOWLATENCYRXPACKETS, /* LowLatencyRxPackets */ __LINUX_MIB_MAX }; diff --git a/net/Kconfig b/net/Kconfig index 523e43e6da1b..d6a9ce6e1800 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -243,6 +243,18 @@ config NETPRIO_CGROUP Cgroup subsystem for use in assigning processes to network priorities on a per-interface basis +config NET_LL_RX_POLL + bool "Low Latency Receive Poll" + depends on X86_TSC + default n + ---help--- + Support Low Latency Receive Queue Poll. + (For network card drivers which support this option.) + When waiting for data in read or poll call directly into the the device driver + to flush packets which may be pending on the device queues into the stack. + + If unsure, say N. 
+ config BQL boolean depends on SYSFS diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 73f57a0e1523..4a4181e16c1a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -733,6 +733,10 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->vlan_tci = old->vlan_tci; skb_copy_secmark(new, old); + +#ifdef CONFIG_NET_LL_RX_POLL + new->napi_id = old->napi_id; +#endif } /* diff --git a/net/core/sock.c b/net/core/sock.c index 88868a9d21da..788c0da5eed1 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -139,6 +139,8 @@ #include #endif +#include + static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); @@ -2284,6 +2286,10 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_stamp = ktime_set(-1L, 0); +#ifdef CONFIG_NET_LL_RX_POLL + sk->sk_napi_id = 0; +#endif + /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 741db5fc7806..4b48f39582b0 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -19,6 +19,7 @@ #include #include #include +#include static int one = 1; @@ -284,6 +285,15 @@ static struct ctl_table net_core_table[] = { .proc_handler = flow_limit_table_len_sysctl }, #endif /* CONFIG_NET_FLOW_LIMIT */ +#ifdef CONFIG_NET_LL_RX_POLL + { + .procname = "low_latency_poll", + .data = &sysctl_net_ll_poll, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax + }, +#endif #endif /* CONFIG_NET */ { .procname = "netdev_budget", diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 2a5bf86d2415..6577a1149a47 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -273,6 +273,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), 
SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), + SNMP_MIB_ITEM("LowLatencyRxPackets", LINUX_MIB_LOWLATENCYRXPACKETS), SNMP_MIB_SENTINEL }; diff --git a/net/socket.c b/net/socket.c index 3ebdcb805c51..21fd29f63ed2 100644 --- a/net/socket.c +++ b/net/socket.c @@ -104,6 +104,12 @@ #include #include #include +#include + +#ifdef CONFIG_NET_LL_RX_POLL +unsigned long sysctl_net_ll_poll __read_mostly; +EXPORT_SYMBOL_GPL(sysctl_net_ll_poll); +#endif static int sock_no_open(struct inode *irrelevant, struct file *dontcare); static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, -- cgit v1.2.3 From a5b50476f77a8fcc8055c955720d05a7c2d9c532 Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Mon, 10 Jun 2013 11:40:00 +0300 Subject: udp: add low latency socket poll support Add support for busy-polling on UDP sockets. In __udp[46]_lib_rcv add a call to sk_mark_ll() to copy the napi_id from the skb into the sk. This is done at the earliest possible moment, right after we identify which socket this skb is for. In __skb_recv_datagram When there is no data and the user tries to read we busy poll. Signed-off-by: Alexander Duyck Signed-off-by: Jesse Brandeburg Signed-off-by: Eliezer Tamir Acked-by: Eric Dumazet Tested-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/datagram.c | 4 ++++ net/ipv4/udp.c | 6 +++++- net/ipv6/udp.c | 6 +++++- 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index b71423db7785..9cbaba98ce4c 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -56,6 +56,7 @@ #include #include #include +#include /* * Is a socket 'connection oriented' ?
@@ -207,6 +208,9 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, } spin_unlock_irqrestore(&queue->lock, cpu_flags); + if (sk_valid_ll(sk) && sk_poll_ll(sk, flags & MSG_DONTWAIT)) + continue; + /* User doesn't want to wait */ error = -EAGAIN; if (!timeo) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c7338ec79cc0..2955b25aee6d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -109,6 +109,7 @@ #include #include #include +#include #include "udp_impl.h" struct udp_table udp_table __read_mostly; @@ -1709,7 +1710,10 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk != NULL) { - int ret = udp_queue_rcv_skb(sk, skb); + int ret; + + sk_mark_ll(sk, skb); + ret = udp_queue_rcv_skb(sk, skb); sock_put(sk); /* a return value > 0 means to resubmit the input, but diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b5808539cd5c..f77e34c5a0e2 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -841,7 +842,10 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, */ sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk != NULL) { - int ret = udpv6_queue_rcv_skb(sk, skb); + int ret; + + sk_mark_ll(sk, skb); + ret = udpv6_queue_rcv_skb(sk, skb); sock_put(sk); /* a return value > 0 means to resubmit the input, but -- cgit v1.2.3 From b41abb42bf62a85a32c41dab873220598a6ee266 Mon Sep 17 00:00:00 2001 From: "Peter Pan(潘卫平)" Date: Thu, 6 Jun 2013 21:27:21 +0800 Subject: net: pass correct parameter to skb_headers_offset_update() Since commit 1a37e412a022(net: Use 16bits for *_headers fields of struct skbuff), skb->*_header are relative to skb->head, so copy_skb_header() should not call skb_headers_offset_update() now, and we should pass correct parameter to skb_headers_offset_update() in pskb_expand_head() and skb_copy_expand(). 
Signed-off-by: Weiping Pan Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/core/skbuff.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4a4181e16c1a..edf37578e21e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -909,18 +909,8 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off) static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) { -#ifndef NET_SKBUFF_DATA_USES_OFFSET - /* - * Shift between the two data areas in bytes - */ - unsigned long offset = new->data - old->data; -#endif - __copy_skb_header(new, old); -#ifndef NET_SKBUFF_DATA_USES_OFFSET - skb_headers_offset_update(new, offset); -#endif skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; @@ -1112,7 +1102,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->end = skb->head + size; #endif skb->tail += off; - skb_headers_offset_update(skb, off); + skb_headers_offset_update(skb, nhead); /* Only adjust this if it actually is csum_start rather than csum */ if (skb->ip_summed == CHECKSUM_PARTIAL) skb->csum_start += nhead; @@ -1207,9 +1197,8 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, off = newheadroom - oldheadroom; if (n->ip_summed == CHECKSUM_PARTIAL) n->csum_start += off; -#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb_headers_offset_update(n, off); -#endif return n; } -- cgit v1.2.3 From 45203a3b380cee28f570475c0d28c169f908c209 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 Jun 2013 08:43:22 -0700 Subject: net_sched: add 64bit rate estimators struct gnet_stats_rate_est contains u32 fields, so the bytes per second field can wrap at 34360Mbit. Add a new gnet_stats_rate_est64 structure to get 64bit bps/pps fields, and switch the kernel to use this structure natively. 
This structure is dumped to user space as a new attribute : TCA_STATS_RATE_EST64 Old tc command will now display the capped bps (to 34360Mbit), instead of wrapped values, and updated tc command will display correct information. Old tc command output, after patch : eric:~# tc -s -d qd sh dev lo qdisc pfifo 8001: root refcnt 2 limit 1000p Sent 80868245400 bytes 1978837 pkt (dropped 0, overlimits 0 requeues 0) rate 34360Mbit 189696pps backlog 0b 0p requeues 0 This patch carefully reorganizes "struct Qdisc" layout to get optimal performance on SMP. Signed-off-by: Eric Dumazet Cc: Ben Hutchings Signed-off-by: David S. Miller --- include/net/act_api.h | 2 +- include/net/gen_stats.h | 10 +++++----- include/net/netfilter/xt_rateest.h | 2 +- include/net/sch_generic.h | 13 +++++++------ include/uapi/linux/gen_stats.h | 11 +++++++++++ net/core/gen_estimator.c | 12 ++++++------ net/core/gen_stats.c | 22 +++++++++++++++++----- net/netfilter/xt_rateest.c | 2 +- net/sched/sch_cbq.c | 2 +- net/sched/sch_drr.c | 2 +- net/sched/sch_hfsc.c | 2 +- net/sched/sch_htb.c | 2 +- net/sched/sch_qfq.c | 2 +- 13 files changed, 54 insertions(+), 30 deletions(-) (limited to 'net/core') diff --git a/include/net/act_api.h b/include/net/act_api.h index 06ef7e926a66..b8ffac7b6bab 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -18,7 +18,7 @@ struct tcf_common { struct tcf_t tcfc_tm; struct gnet_stats_basic_packed tcfc_bstats; struct gnet_stats_queue tcfc_qstats; - struct gnet_stats_rate_est tcfc_rate_est; + struct gnet_stats_rate_est64 tcfc_rate_est; spinlock_t tcfc_lock; struct rcu_head tcfc_rcu; }; diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index a79b6cfb02a8..cf8439ba4d11 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -30,7 +30,7 @@ extern int gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b); extern int gnet_stats_copy_rate_est(struct gnet_dump *d, const struct gnet_stats_basic_packed *b, - struct 
gnet_stats_rate_est *r); + struct gnet_stats_rate_est64 *r); extern int gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q); extern int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len); @@ -38,13 +38,13 @@ extern int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len); extern int gnet_stats_finish_copy(struct gnet_dump *d); extern int gen_new_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est *rate_est, + struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt); extern void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est *rate_est); + struct gnet_stats_rate_est64 *rate_est); extern int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est *rate_est, + struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt); extern bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats, - const struct gnet_stats_rate_est *rate_est); + const struct gnet_stats_rate_est64 *rate_est); #endif diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h index 5a2978d1cb22..495c71f66e7e 100644 --- a/include/net/netfilter/xt_rateest.h +++ b/include/net/netfilter/xt_rateest.h @@ -6,7 +6,7 @@ struct xt_rateest { struct gnet_stats_basic_packed bstats; spinlock_t lock; /* keep rstats and lock on same cache line to speedup xt_rateest_mt() */ - struct gnet_stats_rate_est rstats; + struct gnet_stats_rate_est64 rstats; /* following fields not accessed in hot path */ struct hlist_node list; diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index e7f4e21cc3e1..df5676029827 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -58,14 +58,12 @@ struct Qdisc { * multiqueue device. 
*/ #define TCQ_F_WARN_NONWC (1 << 16) - int padded; + u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; struct list_head list; u32 handle; u32 parent; - atomic_t refcnt; - struct gnet_stats_rate_est rate_est; int (*reshape_fail)(struct sk_buff *skb, struct Qdisc *q); @@ -76,8 +74,9 @@ struct Qdisc { */ struct Qdisc *__parent; struct netdev_queue *dev_queue; - struct Qdisc *next_sched; + struct gnet_stats_rate_est64 rate_est; + struct Qdisc *next_sched; struct sk_buff *gso_skb; /* * For performance sake on SMP, we put highly modified fields at the end @@ -88,8 +87,10 @@ struct Qdisc { unsigned int __state; struct gnet_stats_queue qstats; struct rcu_head rcu_head; - spinlock_t busylock; - u32 limit; + int padded; + atomic_t refcnt; + + spinlock_t busylock ____cacheline_aligned_in_smp; }; static inline bool qdisc_is_running(const struct Qdisc *qdisc) diff --git a/include/uapi/linux/gen_stats.h b/include/uapi/linux/gen_stats.h index 552c8a0a12d1..6487317ea619 100644 --- a/include/uapi/linux/gen_stats.h +++ b/include/uapi/linux/gen_stats.h @@ -9,6 +9,7 @@ enum { TCA_STATS_RATE_EST, TCA_STATS_QUEUE, TCA_STATS_APP, + TCA_STATS_RATE_EST64, __TCA_STATS_MAX, }; #define TCA_STATS_MAX (__TCA_STATS_MAX - 1) @@ -37,6 +38,16 @@ struct gnet_stats_rate_est { __u32 pps; }; +/** + * struct gnet_stats_rate_est64 - rate estimator + * @bps: current byte rate + * @pps: current packet rate + */ +struct gnet_stats_rate_est64 { + __u64 bps; + __u64 pps; +}; + /** * struct gnet_stats_queue - queuing statistics * @qlen: queue length diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index d9d198aa9fed..6b5b6e7013ca 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -82,7 +82,7 @@ struct gen_estimator { struct list_head list; struct gnet_stats_basic_packed *bstats; - struct gnet_stats_rate_est *rate_est; + struct gnet_stats_rate_est64 *rate_est; spinlock_t *stats_lock; int ewma_log; u64 last_bytes; @@ -167,7 +167,7 @@ static void 
gen_add_node(struct gen_estimator *est) static struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats, - const struct gnet_stats_rate_est *rate_est) + const struct gnet_stats_rate_est64 *rate_est) { struct rb_node *p = est_root.rb_node; @@ -203,7 +203,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats * */ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est *rate_est, + struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt) { @@ -258,7 +258,7 @@ EXPORT_SYMBOL(gen_new_estimator); * Note : Caller should respect an RCU grace period before freeing stats_lock */ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est *rate_est) + struct gnet_stats_rate_est64 *rate_est) { struct gen_estimator *e; @@ -290,7 +290,7 @@ EXPORT_SYMBOL(gen_kill_estimator); * Returns 0 on success or a negative error code. */ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est *rate_est, + struct gnet_stats_rate_est64 *rate_est, spinlock_t *stats_lock, struct nlattr *opt) { gen_kill_estimator(bstats, rate_est); @@ -306,7 +306,7 @@ EXPORT_SYMBOL(gen_replace_estimator); * Returns true if estimator is active, and false if not. 
*/ bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats, - const struct gnet_stats_rate_est *rate_est) + const struct gnet_stats_rate_est64 *rate_est) { bool res; diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index ddedf211e588..9d3d9e78397b 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -143,18 +143,30 @@ EXPORT_SYMBOL(gnet_stats_copy_basic); int gnet_stats_copy_rate_est(struct gnet_dump *d, const struct gnet_stats_basic_packed *b, - struct gnet_stats_rate_est *r) + struct gnet_stats_rate_est64 *r) { + struct gnet_stats_rate_est est; + int res; + if (b && !gen_estimator_active(b, r)) return 0; + est.bps = min_t(u64, UINT_MAX, r->bps); + /* we have some time before reaching 2^32 packets per second */ + est.pps = r->pps; + if (d->compat_tc_stats) { - d->tc_stats.bps = r->bps; - d->tc_stats.pps = r->pps; + d->tc_stats.bps = est.bps; + d->tc_stats.pps = est.pps; } - if (d->tail) - return gnet_stats_copy(d, TCA_STATS_RATE_EST, r, sizeof(*r)); + if (d->tail) { + res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est)); + if (res < 0 || est.bps == r->bps) + return res; + /* emit 64bit stats only if needed */ + return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r)); + } return 0; } diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c index ed0db15ab00e..7720b036d76a 100644 --- a/net/netfilter/xt_rateest.c +++ b/net/netfilter/xt_rateest.c @@ -18,7 +18,7 @@ static bool xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_rateest_match_info *info = par->matchinfo; - struct gnet_stats_rate_est *r; + struct gnet_stats_rate_est64 *r; u_int32_t bps1, bps2, pps1, pps2; bool ret = true; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 1bc210ffcba2..71a568862557 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -130,7 +130,7 @@ struct cbq_class { psched_time_t penalized; struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - 
struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; struct tc_cbq_xstats xstats; struct tcf_proto *filter_list; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 759b308d1a8d..8302717ea303 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -25,7 +25,7 @@ struct drr_class { struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; struct list_head alist; struct Qdisc *qdisc; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 9facea03faeb..c4075610502c 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -114,7 +114,7 @@ struct hfsc_class { struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; unsigned int level; /* class level in hierarchy */ struct tcf_proto *filter_list; /* filter list */ unsigned int filter_cnt; /* filter count */ diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index adaedd79389c..162fb800754c 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -78,7 +78,7 @@ struct htb_class { /* general class parameters */ struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; struct tc_htb_xstats xstats; /* our special stats */ int refcnt; /* usage count of this class */ diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index d51852bba01c..7c195d972bf0 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -138,7 +138,7 @@ struct qfq_class { struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; - struct gnet_stats_rate_est rate_est; + struct gnet_stats_rate_est64 rate_est; struct Qdisc *qdisc; struct list_head alist; /* Link for active-classes list. */ struct qfq_aggregate *agg; /* Parent aggregate. 
*/ -- cgit v1.2.3 From 7a6e288d2745611bef5b614acf19644283765732 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 8 Jun 2013 14:18:16 +0200 Subject: pktgen: ipv6: numa: consolidate skb allocation to pktgen_alloc_skb We currently allow for numa-node aware skb allocation only within the fill_packet_ipv4() path, but not in fill_packet_ipv6(). Consolidate that code to a common allocation helper to enable numa-node aware skb allocation for ipv6, and use it in both paths. This also makes both functions a bit more readable. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/pktgen.c | 52 +++++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 25 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 303412d8332b..9640972ec50e 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2627,6 +2627,29 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, pgh->tv_usec = htonl(timestamp.tv_usec); } +static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, + struct pktgen_dev *pkt_dev, + unsigned int extralen) +{ + struct sk_buff *skb = NULL; + unsigned int size = pkt_dev->cur_pkt_size + 64 + extralen + + pkt_dev->pkt_overhead; + + if (pkt_dev->flags & F_NODE) { + int node = pkt_dev->node >= 0 ? 
pkt_dev->node : numa_node_id(); + + skb = __alloc_skb(NET_SKB_PAD + size, GFP_NOWAIT, 0, node); + if (likely(skb)) { + skb_reserve(skb, NET_SKB_PAD); + skb->dev = dev; + } + } else { + skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); + } + + return skb; +} + static struct sk_buff *fill_packet_ipv4(struct net_device *odev, struct pktgen_dev *pkt_dev) { @@ -2657,32 +2680,13 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, datalen = (odev->hard_header_len + 16) & ~0xf; - if (pkt_dev->flags & F_NODE) { - int node; - - if (pkt_dev->node >= 0) - node = pkt_dev->node; - else - node = numa_node_id(); - - skb = __alloc_skb(NET_SKB_PAD + pkt_dev->cur_pkt_size + 64 - + datalen + pkt_dev->pkt_overhead, GFP_NOWAIT, 0, node); - if (likely(skb)) { - skb_reserve(skb, NET_SKB_PAD); - skb->dev = odev; - } - } - else - skb = __netdev_alloc_skb(odev, - pkt_dev->cur_pkt_size + 64 - + datalen + pkt_dev->pkt_overhead, GFP_NOWAIT); - + skb = pktgen_alloc_skb(odev, pkt_dev, datalen); if (!skb) { sprintf(pkt_dev->result, "No memory"); return NULL; } - prefetchw(skb->data); + prefetchw(skb->data); skb_reserve(skb, datalen); /* Reserve for ethernet and IP header */ @@ -2786,15 +2790,13 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, mod_cur_headers(pkt_dev); queue_map = pkt_dev->cur_queue_map; - skb = __netdev_alloc_skb(odev, - pkt_dev->cur_pkt_size + 64 - + 16 + pkt_dev->pkt_overhead, GFP_NOWAIT); + skb = pktgen_alloc_skb(odev, pkt_dev, 16); if (!skb) { sprintf(pkt_dev->result, "No memory"); return NULL; } - prefetchw(skb->data); + prefetchw(skb->data); skb_reserve(skb, 16); /* Reserve for ethernet and IP header */ -- cgit v1.2.3 From 194f4a6df2a92c3d0bc65a85facfbc2433b25d06 Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Tue, 11 Jun 2013 23:09:29 +0200 Subject: net: make all team port device link events urgent Since team functionality relies heavily on userspace daemon, we need to deliver event to userspace via Netlink as quick as possible. 
So make all team port device link events urgent. Signed-off-by: Flavio Leitner Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/link_watch.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core') diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 8f82a5cc3851..9c3a839322ba 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -92,6 +92,9 @@ static bool linkwatch_urgent_event(struct net_device *dev) if (dev->ifindex != dev->iflink) return true; + if (dev->priv_flags & IFF_TEAM_PORT) + return true; + return netif_carrier_ok(dev) && qdisc_tx_changing(dev); } -- cgit v1.2.3 From fe2c6338fd2c6f383c4d4164262f35c8f3708e1f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 11 Jun 2013 23:04:25 -0700 Subject: net: Convert uses of typedef ctl_table to struct ctl_table Reduce the uses of this unnecessary typedef. Done via perl script: $ git grep --name-only -w ctl_table net | \ xargs perl -p -i -e '\ sub trim { my ($local) = @_; $local =~ s/(^\s+|\s+$)//g; return $local; } \ s/\b(?<!struct\s)ctl_table\b(\s*\*\s*|\s+\w+)/"struct ctl_table " . trim($1)/ge' Reflow the modified lines that now exceed 80 columns. Signed-off-by: Joe Perches Signed-off-by: David S.
Miller --- net/ax25/sysctl_net_ax25.c | 2 +- net/bridge/br_netfilter.c | 4 ++-- net/core/neighbour.c | 6 ++--- net/core/sysctl_net_core.c | 8 +++---- net/decnet/dn_dev.c | 6 ++--- net/decnet/sysctl_net_decnet.c | 6 ++--- net/ipv4/devinet.c | 6 ++--- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 2 +- net/ipv4/route.c | 4 ++-- net/ipv4/sysctl_net_ipv4.c | 31 +++++++++++++------------- net/ipv6/addrconf.c | 10 ++++----- net/ipv6/icmp.c | 2 +- net/ipv6/route.c | 4 ++-- net/ipv6/sysctl_net_ipv6.c | 4 ++-- net/irda/irsysctl.c | 6 ++--- net/netfilter/ipvs/ip_vs_ctl.c | 8 +++---- net/netfilter/ipvs/ip_vs_lblc.c | 2 +- net/netfilter/ipvs/ip_vs_lblcr.c | 2 +- net/netfilter/nf_conntrack_standalone.c | 4 ++-- net/netfilter/nf_log.c | 2 +- net/netrom/sysctl_net_netrom.c | 2 +- net/phonet/sysctl.c | 4 ++-- net/rds/ib_sysctl.c | 2 +- net/rds/iw_sysctl.c | 2 +- net/rds/sysctl.c | 2 +- net/rose/sysctl_net_rose.c | 2 +- net/sctp/sysctl.c | 10 ++++----- net/sunrpc/sysctl.c | 10 ++++----- net/sunrpc/xprtrdma/svc_rdma.c | 8 +++---- net/sunrpc/xprtrdma/transport.c | 4 ++-- net/sunrpc/xprtsock.c | 4 ++-- net/unix/sysctl_net_unix.c | 2 +- 32 files changed, 86 insertions(+), 85 deletions(-) (limited to 'net/core') diff --git a/net/ax25/sysctl_net_ax25.c b/net/ax25/sysctl_net_ax25.c index d5744b752511..919a5ce47515 100644 --- a/net/ax25/sysctl_net_ax25.c +++ b/net/ax25/sysctl_net_ax25.c @@ -29,7 +29,7 @@ static int min_proto[1], max_proto[] = { AX25_PROTO_MAX }; static int min_ds_timeout[1], max_ds_timeout[] = {65535000}; #endif -static const ctl_table ax25_param_table[] = { +static const struct ctl_table ax25_param_table[] = { { .procname = "ip_default_mode", .maxlen = sizeof(int), diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 1ed75bfd8d1d..f87736270eaa 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -992,7 +992,7 @@ static struct nf_hook_ops br_nf_ops[] __read_mostly = { #ifdef CONFIG_SYSCTL static -int 
brnf_sysctl_call_tables(ctl_table * ctl, int write, +int brnf_sysctl_call_tables(struct ctl_table *ctl, int write, void __user * buffer, size_t * lenp, loff_t * ppos) { int ret; @@ -1004,7 +1004,7 @@ int brnf_sysctl_call_tables(ctl_table * ctl, int write, return ret; } -static ctl_table brnf_table[] = { +static struct ctl_table brnf_table[] = { { .procname = "bridge-nf-call-arptables", .data = &brnf_call_arptables, diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 5c56b217b999..decaa4b9db2f 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2765,11 +2765,11 @@ EXPORT_SYMBOL(neigh_app_ns); static int zero; static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); -static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) +static int proc_unres_qlen(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { int size, ret; - ctl_table tmp = *ctl; + struct ctl_table tmp = *ctl; tmp.extra1 = &zero; tmp.extra2 = &unres_qlen_max; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 4b48f39582b0..637a42e5d589 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -24,12 +24,12 @@ static int one = 1; #ifdef CONFIG_RPS -static int rps_sock_flow_sysctl(ctl_table *table, int write, +static int rps_sock_flow_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { unsigned int orig_size, size; int ret, i; - ctl_table tmp = { + struct ctl_table tmp = { .data = &size, .maxlen = sizeof(size), .mode = table->mode @@ -91,7 +91,7 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write, #ifdef CONFIG_NET_FLOW_LIMIT static DEFINE_MUTEX(flow_limit_update_mutex); -static int flow_limit_cpu_sysctl(ctl_table *table, int write, +static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -156,7 +156,7 @@ done: return ret; } -static 
int flow_limit_table_len_sysctl(ctl_table *table, int write, +static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 7d9197063ebb..dd0dfb25f4b1 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -158,11 +158,11 @@ static int max_t3[] = { 8191 }; /* Must fit in 16 bits when multiplied by BCT3MU static int min_priority[1]; static int max_priority[] = { 127 }; /* From DECnet spec */ -static int dn_forwarding_proc(ctl_table *, int, +static int dn_forwarding_proc(struct ctl_table *, int, void __user *, size_t *, loff_t *); static struct dn_dev_sysctl_table { struct ctl_table_header *sysctl_header; - ctl_table dn_dev_vars[5]; + struct ctl_table dn_dev_vars[5]; } dn_dev_sysctl = { NULL, { @@ -242,7 +242,7 @@ static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms) } } -static int dn_forwarding_proc(ctl_table *table, int write, +static int dn_forwarding_proc(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c index a55eeccaa72f..5325b541c526 100644 --- a/net/decnet/sysctl_net_decnet.c +++ b/net/decnet/sysctl_net_decnet.c @@ -132,7 +132,7 @@ static int parse_addr(__le16 *addr, char *str) return 0; } -static int dn_node_address_handler(ctl_table *table, int write, +static int dn_node_address_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -183,7 +183,7 @@ static int dn_node_address_handler(ctl_table *table, int write, return 0; } -static int dn_def_dev_handler(ctl_table *table, int write, +static int dn_def_dev_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -246,7 +246,7 @@ static int dn_def_dev_handler(ctl_table *table, int write, return 0; } -static ctl_table dn_table[] = { +static struct ctl_table dn_table[] = { { 
.procname = "node_address", .maxlen = 7, diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 3469506c106d..8d48c392adcc 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1942,7 +1942,7 @@ static void inet_forward_change(struct net *net) } } -static int devinet_conf_proc(ctl_table *ctl, int write, +static int devinet_conf_proc(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -1985,7 +1985,7 @@ static int devinet_conf_proc(ctl_table *ctl, int write, return ret; } -static int devinet_sysctl_forward(ctl_table *ctl, int write, +static int devinet_sysctl_forward(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -2028,7 +2028,7 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write, return ret; } -static int ipv4_doint_and_flush(ctl_table *ctl, int write, +static int ipv4_doint_and_flush(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 567d84168bd2..0a2e0e3e95ba 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -223,7 +223,7 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = { static int log_invalid_proto_min = 0; static int log_invalid_proto_max = 255; -static ctl_table ip_ct_sysctl_table[] = { +static struct ctl_table ip_ct_sysctl_table[] = { { .procname = "ip_conntrack_max", .maxlen = sizeof(int), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 198ea596f2d9..f3fa42eac461 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2448,7 +2448,7 @@ static int ip_rt_gc_interval __read_mostly = 60 * HZ; static int ip_rt_gc_min_interval __read_mostly = HZ / 2; static int ip_rt_gc_elasticity __read_mostly = 8; -static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, +static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int 
write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -2463,7 +2463,7 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, return -EINVAL; } -static ctl_table ipv4_route_table[] = { +static struct ctl_table ipv4_route_table[] = { { .procname = "gc_thresh", .data = &ipv4_dst_ops.gc_thresh, diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index fa2f63fc453b..b2c123c44d69 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -49,13 +49,13 @@ static void set_local_port_range(int range[2]) } /* Validate changes from /proc interface. */ -static int ipv4_local_port_range(ctl_table *table, int write, +static int ipv4_local_port_range(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; int range[2]; - ctl_table tmp = { + struct ctl_table tmp = { .data = &range, .maxlen = sizeof(range), .mode = table->mode, @@ -100,7 +100,7 @@ static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t hig } /* Validate changes from /proc interface. 
*/ -static int ipv4_ping_group_range(ctl_table *table, int write, +static int ipv4_ping_group_range(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -108,7 +108,7 @@ static int ipv4_ping_group_range(ctl_table *table, int write, int ret; gid_t urange[2]; kgid_t low, high; - ctl_table tmp = { + struct ctl_table tmp = { .data = &urange, .maxlen = sizeof(urange), .mode = table->mode, @@ -135,11 +135,11 @@ static int ipv4_ping_group_range(ctl_table *table, int write, return ret; } -static int proc_tcp_congestion_control(ctl_table *ctl, int write, +static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { char val[TCP_CA_NAME_MAX]; - ctl_table tbl = { + struct ctl_table tbl = { .data = val, .maxlen = TCP_CA_NAME_MAX, }; @@ -153,12 +153,12 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, return ret; } -static int proc_tcp_available_congestion_control(ctl_table *ctl, +static int proc_tcp_available_congestion_control(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, }; + struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, }; int ret; tbl.data = kmalloc(tbl.maxlen, GFP_USER); @@ -170,12 +170,12 @@ static int proc_tcp_available_congestion_control(ctl_table *ctl, return ret; } -static int proc_allowed_congestion_control(ctl_table *ctl, +static int proc_allowed_congestion_control(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX }; + struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX }; int ret; tbl.data = kmalloc(tbl.maxlen, GFP_USER); @@ -190,7 +190,7 @@ static int proc_allowed_congestion_control(ctl_table *ctl, return ret; } -static int ipv4_tcp_mem(ctl_table *ctl, int write, +static int ipv4_tcp_mem(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -201,7 
+201,7 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write, struct mem_cgroup *memcg; #endif - ctl_table tmp = { + struct ctl_table tmp = { .data = &vec, .maxlen = sizeof(vec), .mode = ctl->mode, @@ -233,10 +233,11 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write, return 0; } -static int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer, - size_t *lenp, loff_t *ppos) +static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) { - ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; + struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; struct tcp_fastopen_context *ctxt; int ret; u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */ diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 21010fddb203..80449121afa2 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4620,13 +4620,13 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) #ifdef CONFIG_SYSCTL static -int addrconf_sysctl_forward(ctl_table *ctl, int write, +int addrconf_sysctl_forward(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; loff_t pos = *ppos; - ctl_table lctl; + struct ctl_table lctl; int ret; /* @@ -4705,13 +4705,13 @@ static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf) } static -int addrconf_sysctl_disable(ctl_table *ctl, int write, +int addrconf_sysctl_disable(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; loff_t pos = *ppos; - ctl_table lctl; + struct ctl_table lctl; int ret; /* @@ -4733,7 +4733,7 @@ int addrconf_sysctl_disable(ctl_table *ctl, int write, static struct addrconf_sysctl_table { struct ctl_table_header *sysctl_header; - ctl_table addrconf_vars[DEVCONF_MAX+1]; + struct ctl_table addrconf_vars[DEVCONF_MAX+1]; } addrconf_sysctl __read_mostly = { 
.sysctl_header = NULL, .addrconf_vars = { diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 4b4890bbe16d..7cfc8d284870 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -976,7 +976,7 @@ int icmpv6_err_convert(u8 type, u8 code, int *err) EXPORT_SYMBOL(icmpv6_err_convert); #ifdef CONFIG_SYSCTL -ctl_table ipv6_icmp_table_template[] = { +struct ctl_table ipv6_icmp_table_template[] = { { .procname = "ratelimit", .data = &init_net.ipv6.sysctl.icmpv6_time, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2b874185ebb2..7ca87b37c0ef 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2790,7 +2790,7 @@ static const struct file_operations rt6_stats_seq_fops = { #ifdef CONFIG_SYSCTL static -int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, +int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct net *net; @@ -2805,7 +2805,7 @@ int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, return 0; } -ctl_table ipv6_route_table_template[] = { +struct ctl_table ipv6_route_table_template[] = { { .procname = "flush", .data = &init_net.ipv6.sysctl.flush_delay, diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index e85c48bd404f..107b2f1d90ae 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -16,7 +16,7 @@ #include #include -static ctl_table ipv6_table_template[] = { +static struct ctl_table ipv6_table_template[] = { { .procname = "bindv6only", .data = &init_net.ipv6.sysctl.bindv6only, @@ -27,7 +27,7 @@ static ctl_table ipv6_table_template[] = { { } }; -static ctl_table ipv6_rotable[] = { +static struct ctl_table ipv6_rotable[] = { { .procname = "mld_max_msf", .data = &sysctl_mld_max_msf, diff --git a/net/irda/irsysctl.c b/net/irda/irsysctl.c index de73f6496db5..d6a59651767a 100644 --- a/net/irda/irsysctl.c +++ b/net/irda/irsysctl.c @@ -73,7 +73,7 @@ static int min_lap_keepalive_time = 100; /* 100us */ /* For other sysctl, I've no idea of the range. 
Maybe Dag could help * us on that - Jean II */ -static int do_devname(ctl_table *table, int write, +static int do_devname(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -90,7 +90,7 @@ static int do_devname(ctl_table *table, int write, } -static int do_discovery(ctl_table *table, int write, +static int do_discovery(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -111,7 +111,7 @@ static int do_discovery(ctl_table *table, int write, } /* One file */ -static ctl_table irda_table[] = { +static struct ctl_table irda_table[] = { { .procname = "discovery", .data = &sysctl_discovery, diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index df05c1c276f0..edb88fbcb1bd 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1575,7 +1575,7 @@ static int zero; static int three = 3; static int -proc_do_defense_mode(ctl_table *table, int write, +proc_do_defense_mode(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; @@ -1596,7 +1596,7 @@ proc_do_defense_mode(ctl_table *table, int write, } static int -proc_do_sync_threshold(ctl_table *table, int write, +proc_do_sync_threshold(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; @@ -1616,7 +1616,7 @@ proc_do_sync_threshold(ctl_table *table, int write, } static int -proc_do_sync_mode(ctl_table *table, int write, +proc_do_sync_mode(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; @@ -1634,7 +1634,7 @@ proc_do_sync_mode(ctl_table *table, int write, } static int -proc_do_sync_ports(ctl_table *table, int write, +proc_do_sync_ports(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; diff --git a/net/netfilter/ipvs/ip_vs_lblc.c 
b/net/netfilter/ipvs/ip_vs_lblc.c index 5ea26bd87743..44595b8ae37f 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -118,7 +118,7 @@ struct ip_vs_lblc_table { * IPVS LBLC sysctl table */ #ifdef CONFIG_SYSCTL -static ctl_table vs_vars_table[] = { +static struct ctl_table vs_vars_table[] = { { .procname = "lblc_expiration", .data = NULL, diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 50123c2ab484..876937db0bf4 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -299,7 +299,7 @@ struct ip_vs_lblcr_table { * IPVS LBLCR sysctl table */ -static ctl_table vs_vars_table[] = { +static struct ctl_table vs_vars_table[] = { { .procname = "lblcr_expiration", .data = NULL, diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index bd700b4013c1..f641751dba9d 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -408,7 +408,7 @@ static int log_invalid_proto_max = 255; static struct ctl_table_header *nf_ct_netfilter_header; -static ctl_table nf_ct_sysctl_table[] = { +static struct ctl_table nf_ct_sysctl_table[] = { { .procname = "nf_conntrack_max", .data = &nf_conntrack_max, @@ -458,7 +458,7 @@ static ctl_table nf_ct_sysctl_table[] = { #define NET_NF_CONNTRACK_MAX 2089 -static ctl_table nf_ct_netfilter_table[] = { +static struct ctl_table nf_ct_netfilter_table[] = { { .procname = "nf_conntrack_max", .data = &nf_conntrack_max, diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 4b60a87b7596..85296d4eac0e 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -245,7 +245,7 @@ static const struct file_operations nflog_file_ops = { static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3]; static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1]; -static int nf_log_proc_dostring(ctl_table *table, int write, +static int 
nf_log_proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { const struct nf_logger *logger; diff --git a/net/netrom/sysctl_net_netrom.c b/net/netrom/sysctl_net_netrom.c index 42f630b9a698..ba1c368b3f18 100644 --- a/net/netrom/sysctl_net_netrom.c +++ b/net/netrom/sysctl_net_netrom.c @@ -34,7 +34,7 @@ static int min_reset[] = {0}, max_reset[] = {1}; static struct ctl_table_header *nr_table_header; -static ctl_table nr_table[] = { +static struct ctl_table nr_table[] = { { .procname = "default_path_quality", .data = &sysctl_netrom_default_path_quality, diff --git a/net/phonet/sysctl.c b/net/phonet/sysctl.c index d6bbbbd0af18..c02a8c4bc11f 100644 --- a/net/phonet/sysctl.c +++ b/net/phonet/sysctl.c @@ -61,13 +61,13 @@ void phonet_get_local_port_range(int *min, int *max) } while (read_seqretry(&local_port_range_lock, seq)); } -static int proc_local_port_range(ctl_table *table, int write, +static int proc_local_port_range(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; int range[2] = {local_port_range[0], local_port_range[1]}; - ctl_table tmp = { + struct ctl_table tmp = { .data = &range, .maxlen = sizeof(range), .mode = table->mode, diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c index 7e643bafb4af..e4e41b3afce7 100644 --- a/net/rds/ib_sysctl.c +++ b/net/rds/ib_sysctl.c @@ -61,7 +61,7 @@ static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64; */ unsigned int rds_ib_sysctl_flow_control = 0; -static ctl_table rds_ib_sysctl_table[] = { +static struct ctl_table rds_ib_sysctl_table[] = { { .procname = "max_send_wr", .data = &rds_ib_sysctl_max_send_wr, diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c index 5d5ebd576f3f..89c91515ed0c 100644 --- a/net/rds/iw_sysctl.c +++ b/net/rds/iw_sysctl.c @@ -55,7 +55,7 @@ static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL; unsigned int rds_iw_sysctl_flow_control = 1; -static ctl_table rds_iw_sysctl_table[] = { +static 
struct ctl_table rds_iw_sysctl_table[] = { { .procname = "max_send_wr", .data = &rds_iw_sysctl_max_send_wr, diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c index 907214b4c4d0..b5cb2aa08f33 100644 --- a/net/rds/sysctl.c +++ b/net/rds/sysctl.c @@ -49,7 +49,7 @@ unsigned int rds_sysctl_max_unacked_bytes = (16 << 20); unsigned int rds_sysctl_ping_enable = 1; -static ctl_table rds_sysctl_rds_table[] = { +static struct ctl_table rds_sysctl_rds_table[] = { { .procname = "reconnect_min_delay_ms", .data = &rds_sysctl_reconnect_min_jiffies, diff --git a/net/rose/sysctl_net_rose.c b/net/rose/sysctl_net_rose.c index 94ca9c2ccd69..89a9278795a9 100644 --- a/net/rose/sysctl_net_rose.c +++ b/net/rose/sysctl_net_rose.c @@ -24,7 +24,7 @@ static int min_window[] = {1}, max_window[] = {7}; static struct ctl_table_header *rose_table_header; -static ctl_table rose_table[] = { +static struct ctl_table rose_table[] = { { .procname = "restart_request_timeout", .data = &sysctl_rose_restart_request_timeout, diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index bf3c6e8fc401..9a5c4c9eddaf 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -62,12 +62,12 @@ extern long sysctl_sctp_mem[3]; extern int sysctl_sctp_rmem[3]; extern int sysctl_sctp_wmem[3]; -static int proc_sctp_do_hmac_alg(ctl_table *ctl, +static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -static ctl_table sctp_table[] = { +static struct ctl_table sctp_table[] = { { .procname = "sctp_mem", .data = &sysctl_sctp_mem, @@ -93,7 +93,7 @@ static ctl_table sctp_table[] = { { /* sentinel */ } }; -static ctl_table sctp_net_table[] = { +static struct ctl_table sctp_net_table[] = { { .procname = "rto_initial", .data = &init_net.sctp.rto_initial, @@ -300,14 +300,14 @@ static ctl_table sctp_net_table[] = { { /* sentinel */ } }; -static int proc_sctp_do_hmac_alg(ctl_table *ctl, +static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, void __user *buffer, 
size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; char tmp[8]; - ctl_table tbl; + struct ctl_table tbl; int ret; int changed = 0; char *none = "none"; diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index af7d339add9d..c99c58e2ee66 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -40,7 +40,7 @@ EXPORT_SYMBOL_GPL(nlm_debug); #ifdef RPC_DEBUG static struct ctl_table_header *sunrpc_table_header; -static ctl_table sunrpc_table[]; +static struct ctl_table sunrpc_table[]; void rpc_register_sysctl(void) @@ -58,7 +58,7 @@ rpc_unregister_sysctl(void) } } -static int proc_do_xprt(ctl_table *table, int write, +static int proc_do_xprt(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { char tmpbuf[256]; @@ -73,7 +73,7 @@ static int proc_do_xprt(ctl_table *table, int write, } static int -proc_dodebug(ctl_table *table, int write, +proc_dodebug(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { char tmpbuf[20], c, *s; @@ -135,7 +135,7 @@ done: } -static ctl_table debug_table[] = { +static struct ctl_table debug_table[] = { { .procname = "rpc_debug", .data = &rpc_debug, @@ -173,7 +173,7 @@ static ctl_table debug_table[] = { { } }; -static ctl_table sunrpc_table[] = { +static struct ctl_table sunrpc_table[] = { { .procname = "sunrpc", .mode = 0555, diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 8343737e85f4..c1b6270262c2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -84,7 +84,7 @@ struct workqueue_struct *svc_rdma_wq; * resets the associated statistic to zero. Any read returns it's * current value. 
*/ -static int read_reset_stat(ctl_table *table, int write, +static int read_reset_stat(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -119,7 +119,7 @@ static int read_reset_stat(ctl_table *table, int write, } static struct ctl_table_header *svcrdma_table_header; -static ctl_table svcrdma_parm_table[] = { +static struct ctl_table svcrdma_parm_table[] = { { .procname = "max_requests", .data = &svcrdma_max_requests, @@ -214,7 +214,7 @@ static ctl_table svcrdma_parm_table[] = { { }, }; -static ctl_table svcrdma_table[] = { +static struct ctl_table svcrdma_table[] = { { .procname = "svc_rdma", .mode = 0555, @@ -223,7 +223,7 @@ static ctl_table svcrdma_table[] = { { }, }; -static ctl_table svcrdma_root_table[] = { +static struct ctl_table svcrdma_root_table[] = { { .procname = "sunrpc", .mode = 0555, diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 794312f22b9b..285dc0884115 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -86,7 +86,7 @@ static unsigned int max_memreg = RPCRDMA_LAST - 1; static struct ctl_table_header *sunrpc_table_header; -static ctl_table xr_tunables_table[] = { +static struct ctl_table xr_tunables_table[] = { { .procname = "rdma_slot_table_entries", .data = &xprt_rdma_slot_table_entries, @@ -138,7 +138,7 @@ static ctl_table xr_tunables_table[] = { { }, }; -static ctl_table sunrpc_table[] = { +static struct ctl_table sunrpc_table[] = { { .procname = "sunrpc", .mode = 0555, diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index ffd50348a509..412de7cfcc80 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -87,7 +87,7 @@ static struct ctl_table_header *sunrpc_table_header; * FIXME: changing the UDP slot table size should also resize the UDP * socket buffers for existing UDP transports */ -static ctl_table xs_tunables_table[] = { +static struct ctl_table xs_tunables_table[] = { { .procname = "udp_slot_table_entries", 
.data = &xprt_udp_slot_table_entries, @@ -143,7 +143,7 @@ static ctl_table xs_tunables_table[] = { { }, }; -static ctl_table sunrpc_table[] = { +static struct ctl_table sunrpc_table[] = { { .procname = "sunrpc", .mode = 0555, diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c index 8800604c93f4..b3d515021b74 100644 --- a/net/unix/sysctl_net_unix.c +++ b/net/unix/sysctl_net_unix.c @@ -15,7 +15,7 @@ #include -static ctl_table unix_table[] = { +static struct ctl_table unix_table[] = { { .procname = "max_dgram_qlen", .data = &init_net.unx.sysctl_max_dgram_qlen, -- cgit v1.2.3 From 5f121b9a83b499a61ed44e5ba619c7de8f7271ad Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 13 Jun 2013 15:29:38 -0400 Subject: net-rps: fixes for rps flow limit Caught by sparse: - __rcu: missing annotation to sd->flow_limit - __user: direct access in cpumask_scnprintf Also - add endline character when printing bitmap if room in buffer - avoid bucket overflow by reducing FLOW_LIMIT_HISTORY The last item warrants some explanation. The hashtable buckets are subject to overflow if FLOW_LIMIT_HISTORY is larger than or equal to bucket size, since all packets may end up in a single bucket. The current (rather arbitrary) history value of 256 happens to match the buffer size (u8). As a result, with a single flow, the first 128 packets are accepted (correct), the second 128 packets dropped (correct) and then the history[] array has filled, so that each subsequent new packet causes an increment in the bucket for new_flow plus a decrement for old_flow: a steady state. This is fine if packets are dropped, as the steady state goes away as soon as a mix of traffic reappears. But, because the 256th packet overflowed the bucket to 0: no packets are dropped. Instead of explicitly adding an overflow check, this patch changes FLOW_LIMIT_HISTORY to never be able to overflow a single bucket. Reported-by: Fengguang Wu (first item) Signed-off-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 4 ++-- net/core/sysctl_net_core.c | 19 ++++++++++++++++--- 2 files changed, 18 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e5d65573b4d6..8c9fcc42502a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1840,7 +1840,7 @@ static inline int unregister_gifconf(unsigned int family) } #ifdef CONFIG_NET_FLOW_LIMIT -#define FLOW_LIMIT_HISTORY (1 << 8) /* must be ^2 */ +#define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ struct sd_flow_limit { u64 count; unsigned int num_buckets; @@ -1883,7 +1883,7 @@ struct softnet_data { struct napi_struct backlog; #ifdef CONFIG_NET_FLOW_LIMIT - struct sd_flow_limit *flow_limit; + struct sd_flow_limit __rcu *flow_limit; #endif }; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 637a42e5d589..78c746e016ae 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -132,6 +132,8 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, write_unlock: mutex_unlock(&flow_limit_update_mutex); } else { + char kbuf[128]; + if (*ppos || !*lenp) { *lenp = 0; goto done; @@ -146,9 +148,20 @@ write_unlock: } rcu_read_unlock(); - len = cpumask_scnprintf(buffer, *lenp, mask); - *lenp = len + 1; - *ppos += len + 1; + len = min(sizeof(kbuf) - 1, *lenp); + len = cpumask_scnprintf(kbuf, len, mask); + if (!len) { + *lenp = 0; + goto done; + } + if (len < *lenp) + kbuf[len++] = '\n'; + if (copy_to_user(buffer, kbuf, len)) { + ret = -EFAULT; + goto done; + } + *lenp = len; + *ppos += len; } done: -- cgit v1.2.3 From 1d8faf48c74b8329a0322dc4b2a2030ae5003c86 Mon Sep 17 00:00:00 2001 From: Rony Efraim Date: Thu, 13 Jun 2013 13:19:10 +0300 Subject: net/core: Add VF link state control Add netlink directives and ndo entry to allow for controlling VF link, which can be in one of three states: Auto - VF link state reflects the PF link state
(default) Up - VF link state is up, traffic from VF to VF works even if the actual PF link is down Down - VF link state is down, no traffic from/to this VF, can be of use while configuring the VF Signed-off-by: Rony Efraim Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- include/linux/if_link.h | 1 + include/linux/netdevice.h | 3 +++ include/uapi/linux/if_link.h | 13 +++++++++++++ net/core/rtnetlink.c | 22 ++++++++++++++++++++-- 4 files changed, 37 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/include/linux/if_link.h b/include/linux/if_link.h index c3f817c3eb45..a86784dec3d3 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -12,5 +12,6 @@ struct ifla_vf_info { __u32 qos; __u32 tx_rate; __u32 spoofchk; + __u32 linkstate; }; #endif /* _LINUX_IF_LINK_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8c9fcc42502a..09b4188c1ea7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -829,6 +829,7 @@ struct netdev_fcoe_hbainfo { * int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting); * int (*ndo_get_vf_config)(struct net_device *dev, * int vf, struct ifla_vf_info *ivf); + * int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int link_state); * int (*ndo_set_vf_port)(struct net_device *dev, int vf, * struct nlattr *port[]); * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb); @@ -986,6 +987,8 @@ struct net_device_ops { int (*ndo_get_vf_config)(struct net_device *dev, int vf, struct ifla_vf_info *ivf); + int (*ndo_set_vf_link_state)(struct net_device *dev, + int vf, int link_state); int (*ndo_set_vf_port)(struct net_device *dev, int vf, struct nlattr *port[]); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index da05a2698cb5..03f6170ab337 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -338,6 +338,7 @@ enum { IFLA_VF_VLAN, IFLA_VF_TX_RATE, /* TX Bandwidth 
Allocation */ IFLA_VF_SPOOFCHK, /* Spoof Checking on/off switch */ + IFLA_VF_LINK_STATE, /* link state enable/disable/auto switch */ __IFLA_VF_MAX, }; @@ -364,6 +365,18 @@ struct ifla_vf_spoofchk { __u32 setting; }; +enum { + IFLA_VF_LINK_STATE_AUTO, /* link state of the uplink */ + IFLA_VF_LINK_STATE_ENABLE, /* link always up */ + IFLA_VF_LINK_STATE_DISABLE, /* link always down */ + __IFLA_VF_LINK_STATE_MAX, +}; + +struct ifla_vf_link_state { + __u32 vf; + __u32 link_state; +}; + /* VF ports management section * * Nested layout of set/get msg is: diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 49c14451d8ab..9007533867f0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -947,6 +947,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct ifla_vf_vlan vf_vlan; struct ifla_vf_tx_rate vf_tx_rate; struct ifla_vf_spoofchk vf_spoofchk; + struct ifla_vf_link_state vf_linkstate; /* * Not all SR-IOV capable drivers support the @@ -956,18 +957,24 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, */ ivi.spoofchk = -1; memset(ivi.mac, 0, sizeof(ivi.mac)); + /* The default value for VF link state is "auto" + * IFLA_VF_LINK_STATE_AUTO which equals zero + */ + ivi.linkstate = 0; if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi)) break; vf_mac.vf = vf_vlan.vf = vf_tx_rate.vf = - vf_spoofchk.vf = ivi.vf; + vf_spoofchk.vf = + vf_linkstate.vf = ivi.vf; memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; vf_tx_rate.rate = ivi.tx_rate; vf_spoofchk.setting = ivi.spoofchk; + vf_linkstate.link_state = ivi.linkstate; vf = nla_nest_start(skb, IFLA_VF_INFO); if (!vf) { nla_nest_cancel(skb, vfinfo); @@ -978,7 +985,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate) || nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), - &vf_spoofchk)) + &vf_spoofchk) || + nla_put(skb, 
IFLA_VF_LINK_STATE, sizeof(vf_linkstate), + &vf_linkstate)) goto nla_put_failure; nla_nest_end(skb, vf); } @@ -1238,6 +1247,15 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr) ivs->setting); break; } + case IFLA_VF_LINK_STATE: { + struct ifla_vf_link_state *ivl; + ivl = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_link_state) + err = ops->ndo_set_vf_link_state(dev, ivl->vf, + ivl->link_state); + break; + } default: err = -EINVAL; break; -- cgit v1.2.3 From eb6db622825b2028df74f490b8c36887cf3c2f50 Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Fri, 14 Jun 2013 16:33:25 +0300 Subject: net: change sysctl_net_ll_poll into an unsigned int There is no reason for sysctl_net_ll_poll to be an unsigned long. Change it into an unsigned int. Fix the proc handler. Signed-off-by: Eliezer Tamir Signed-off-by: David S. Miller --- include/net/ll_poll.h | 5 +++-- net/core/sysctl_net_core.c | 4 ++-- net/socket.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/include/net/ll_poll.h b/include/net/ll_poll.h index bc262f88173f..44e2f707cb9f 100644 --- a/include/net/ll_poll.h +++ b/include/net/ll_poll.h @@ -34,7 +34,7 @@ #ifdef CONFIG_NET_LL_RX_POLL struct napi_struct; -extern unsigned long sysctl_net_ll_poll __read_mostly; +extern unsigned int sysctl_net_ll_poll __read_mostly; /* return values from ndo_ll_poll */ #define LL_FLUSH_FAILED -1 @@ -45,7 +45,8 @@ extern unsigned long sysctl_net_ll_poll __read_mostly; static inline cycles_t ll_end_time(void) { - return TSC_MHZ * ACCESS_ONCE(sysctl_net_ll_poll) + get_cycles(); + return (cycles_t)TSC_MHZ * ACCESS_ONCE(sysctl_net_ll_poll) + + get_cycles(); } static inline bool sk_valid_ll(struct sock *sk) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 78c746e016ae..62702c2053de 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -302,9 +302,9 @@ static struct ctl_table net_core_table[] = { { .procname = 
"low_latency_poll", .data = &sysctl_net_ll_poll, - .maxlen = sizeof(unsigned long), + .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_doulongvec_minmax + .proc_handler = proc_dointvec }, #endif #endif /* CONFIG_NET */ diff --git a/net/socket.c b/net/socket.c index 21fd29f63ed2..caaffa14e87e 100644 --- a/net/socket.c +++ b/net/socket.c @@ -107,7 +107,7 @@ #include #ifdef CONFIG_NET_LL_RX_POLL -unsigned long sysctl_net_ll_poll __read_mostly; +unsigned int sysctl_net_ll_poll __read_mostly; EXPORT_SYMBOL_GPL(sysctl_net_ll_poll); #endif -- cgit v1.2.3 From dafcc4380deec21d160c31411f33c8813f67f517 Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Fri, 14 Jun 2013 16:33:57 +0300 Subject: net: add socket option for low latency polling adds a socket option for low latency polling. This allows overriding the global sysctl value with a per-socket one. Unexport sysctl_net_ll_poll since for now it's not needed in modules. Signed-off-by: Eliezer Tamir Signed-off-by: David S. Miller --- arch/alpha/include/uapi/asm/socket.h | 2 ++ arch/avr32/include/uapi/asm/socket.h | 2 ++ arch/cris/include/uapi/asm/socket.h | 2 ++ arch/frv/include/uapi/asm/socket.h | 2 ++ arch/h8300/include/uapi/asm/socket.h | 2 ++ arch/ia64/include/uapi/asm/socket.h | 2 ++ arch/m32r/include/uapi/asm/socket.h | 2 ++ arch/mips/include/uapi/asm/socket.h | 2 ++ arch/mn10300/include/uapi/asm/socket.h | 2 ++ arch/parisc/include/uapi/asm/socket.h | 2 ++ arch/powerpc/include/uapi/asm/socket.h | 2 ++ arch/s390/include/uapi/asm/socket.h | 2 ++ arch/sparc/include/uapi/asm/socket.h | 2 ++ arch/xtensa/include/uapi/asm/socket.h | 2 ++ include/net/ll_poll.h | 12 ++++++------ include/net/sock.h | 2 ++ include/uapi/asm-generic/socket.h | 2 ++ net/core/sock.c | 20 ++++++++++++++++++++ net/socket.c | 1 - 19 files changed, 58 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index eee6ea76bdaf..4885825e498d 100644 --- 
a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -81,4 +81,6 @@ #define SO_SELECT_ERR_QUEUE 45 +#define SO_LL 46 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h index 37401f535126..79b61798ebf8 100644 --- a/arch/avr32/include/uapi/asm/socket.h +++ b/arch/avr32/include/uapi/asm/socket.h @@ -74,4 +74,6 @@ #define SO_SELECT_ERR_QUEUE 45 +#define SO_LL 46 + #endif /* __ASM_AVR32_SOCKET_H */ diff --git a/arch/cris/include/uapi/asm/socket.h b/arch/cris/include/uapi/asm/socket.h index ba409c9947bc..47b1ec55092d 100644 --- a/arch/cris/include/uapi/asm/socket.h +++ b/arch/cris/include/uapi/asm/socket.h @@ -76,6 +76,8 @@ #define SO_SELECT_ERR_QUEUE 45 +#define SO_LL 46 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h index 31dbb5d8e13d..dbc08520f22c 100644 --- a/arch/frv/include/uapi/asm/socket.h +++ b/arch/frv/include/uapi/asm/socket.h @@ -74,5 +74,7 @@ #define SO_SELECT_ERR_QUEUE 45 +#define SO_LL 46 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/h8300/include/uapi/asm/socket.h b/arch/h8300/include/uapi/asm/socket.h index 5d1c6d0870e6..a38d38a6520b 100644 --- a/arch/h8300/include/uapi/asm/socket.h +++ b/arch/h8300/include/uapi/asm/socket.h @@ -74,4 +74,6 @@ #define SO_SELECT_ERR_QUEUE 45 +#define SO_LL 46 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h index 6b4329f18b29..d3358b760681 100644 --- a/arch/ia64/include/uapi/asm/socket.h +++ b/arch/ia64/include/uapi/asm/socket.h @@ -83,4 +83,6 @@ #define SO_SELECT_ERR_QUEUE 45 +#define SO_LL 46 + #endif /* _ASM_IA64_SOCKET_H */ diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h index 2a3b59e0e171..44aaf4639a4a 100644 --- a/arch/m32r/include/uapi/asm/socket.h +++ b/arch/m32r/include/uapi/asm/socket.h @@ -74,4 +74,6 @@ #define SO_SELECT_ERR_QUEUE 45 
+#define SO_LL 46 + #endif /* _ASM_M32R_SOCKET_H */ diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 3b211507be7f..6a07992ba6c6 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -92,4 +92,6 @@ #define SO_SELECT_ERR_QUEUE 45 +#define SO_LL 46 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h index b4ce844c9391..db80fd3e398b 100644 --- a/arch/mn10300/include/uapi/asm/socket.h +++ b/arch/mn10300/include/uapi/asm/socket.h @@ -74,4 +74,6 @@ #define SO_SELECT_ERR_QUEUE 45 +#define SO_LL 46 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 70c512a386f7..f866fff9a004 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -73,6 +73,8 @@ #define SO_SELECT_ERR_QUEUE 0x4026 +#define SO_LL 0x4027 + /* O_NONBLOCK clashes with the bits used for socket types. Therefore we * have to define SOCK_NONBLOCK to a different value here. 
*/ diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h index a36daf3c6f9a..405fb09bda94 100644 --- a/arch/powerpc/include/uapi/asm/socket.h +++ b/arch/powerpc/include/uapi/asm/socket.h @@ -81,4 +81,6 @@ #define SO_SELECT_ERR_QUEUE 45 +#define SO_LL 46 + #endif /* _ASM_POWERPC_SOCKET_H */ diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h index 2dacb306835c..0c5105fbaaf3 100644 --- a/arch/s390/include/uapi/asm/socket.h +++ b/arch/s390/include/uapi/asm/socket.h @@ -80,4 +80,6 @@ #define SO_SELECT_ERR_QUEUE 45 +#define SO_LL 46 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 89f49b68a21c..b46c3fa0b265 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -70,6 +70,8 @@ #define SO_SELECT_ERR_QUEUE 0x0029 +#define SO_LL 0x0030 + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h index a8f44f50e651..b21ace4fc9ba 100644 --- a/arch/xtensa/include/uapi/asm/socket.h +++ b/arch/xtensa/include/uapi/asm/socket.h @@ -85,4 +85,6 @@ #define SO_SELECT_ERR_QUEUE 45 +#define SO_LL 46 + #endif /* _XTENSA_SOCKET_H */ diff --git a/include/net/ll_poll.h b/include/net/ll_poll.h index 6930cbd943e2..fcc7c365cee5 100644 --- a/include/net/ll_poll.h +++ b/include/net/ll_poll.h @@ -39,12 +39,12 @@ extern unsigned int sysctl_net_ll_poll __read_mostly; /* we can use sched_clock() because we don't care much about precision * we only care that the average is bounded */ -static inline u64 ll_end_time(void) +static inline u64 ll_end_time(struct sock *sk) { - u64 end_time = ACCESS_ONCE(sysctl_net_ll_poll); + u64 end_time = ACCESS_ONCE(sk->sk_ll_usec); /* we don't mind a ~2.5% imprecision - * sysctl_net_ll_poll is a 
u_int so this can't overflow + * sk->sk_ll_usec is a u_int so this can't overflow */ end_time = (end_time << 10) + sched_clock(); @@ -53,7 +53,7 @@ static inline u64 ll_end_time(void) static inline bool sk_valid_ll(struct sock *sk) { - return sysctl_net_ll_poll && sk->sk_napi_id && + return sk->sk_ll_usec && sk->sk_napi_id && !need_resched() && !signal_pending(current); } @@ -65,7 +65,7 @@ static inline bool can_poll_ll(u64 end_time) static inline bool sk_poll_ll(struct sock *sk, int nonblock) { const struct net_device_ops *ops; - u64 end_time = ll_end_time(); + u64 end_time = ll_end_time(sk); struct napi_struct *napi; int rc = false; @@ -118,7 +118,7 @@ static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb) #else /* CONFIG_NET_LL_RX_POLL */ -static inline u64 ll_end_time(void) +static inline u64 ll_end_time(struct sock *sk) { return 0; } diff --git a/include/net/sock.h b/include/net/sock.h index ac8e1818380c..21db792bffa5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -230,6 +230,7 @@ struct cg_proto; * @sk_wmem_queued: persistent queue size * @sk_forward_alloc: space allocated forward * @sk_napi_id: id of the last napi context to receive data for sk + * @sk_ll_usec: usecs to busypoll when there is no data * @sk_allocation: allocation mode * @sk_sndbuf: size of send buffer in bytes * @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, @@ -328,6 +329,7 @@ struct sock { #endif #ifdef CONFIG_NET_LL_RX_POLL unsigned int sk_napi_id; + unsigned int sk_ll_usec; #endif atomic_t sk_drops; int sk_rcvbuf; diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index c5d2e3a1cf68..ca3a20d772ac 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -76,4 +76,6 @@ #define SO_SELECT_ERR_QUEUE 45 +#define SO_LL 46 + #endif /* __ASM_GENERIC_SOCKET_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 788c0da5eed1..1e744b12fda3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ 
-913,6 +913,19 @@ set_rcvbuf: sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); break; +#ifdef CONFIG_NET_LL_RX_POLL + case SO_LL: + /* allow unprivileged users to decrease the value */ + if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) + ret = -EPERM; + else { + if (val < 0) + ret = -EINVAL; + else + sk->sk_ll_usec = val; + } + break; +#endif default: ret = -ENOPROTOOPT; break; @@ -1170,6 +1183,12 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); break; +#ifdef CONFIG_NET_LL_RX_POLL + case SO_LL: + v.val = sk->sk_ll_usec; + break; +#endif + default: return -ENOPROTOOPT; } @@ -2288,6 +2307,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) #ifdef CONFIG_NET_LL_RX_POLL sk->sk_napi_id = 0; + sk->sk_ll_usec = sysctl_net_ll_poll; #endif /* diff --git a/net/socket.c b/net/socket.c index caaffa14e87e..3eec3f76b49c 100644 --- a/net/socket.c +++ b/net/socket.c @@ -108,7 +108,6 @@ #ifdef CONFIG_NET_LL_RX_POLL unsigned int sysctl_net_ll_poll __read_mostly; -EXPORT_SYMBOL_GPL(sysctl_net_ll_poll); #endif static int sock_no_open(struct inode *irrelevant, struct file *dontcare); -- cgit v1.2.3 From cf89d6b2803ab99ac596f95d585c3057d2be645c Mon Sep 17 00:00:00 2001 From: Gao feng Date: Thu, 20 Jun 2013 10:01:32 +0800 Subject: neigh: no need to call lookup_neigh_parms in neigh_parms_alloc neigh_table.parms always exist and is initialized,kmemdup can use it to create new neigh_parms, actually lookup_neigh_parms here will return neigh_table.parms too. Signed-off-by: Gao feng Signed-off-by: David S. 
Miller --- net/core/neighbour.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index decaa4b9db2f..53eab513955a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1429,15 +1429,11 @@ static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl, struct neigh_parms *neigh_parms_alloc(struct net_device *dev, struct neigh_table *tbl) { - struct neigh_parms *p, *ref; + struct neigh_parms *p; struct net *net = dev_net(dev); const struct net_device_ops *ops = dev->netdev_ops; - ref = lookup_neigh_parms(tbl, net, 0); - if (!ref) - return NULL; - - p = kmemdup(ref, sizeof(*p), GFP_KERNEL); + p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL); if (p) { p->tbl = tbl; atomic_set(&p->refcnt, 1); -- cgit v1.2.3 From 170d6f99541600ec7512f1d2b0b0c349009098d2 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Thu, 20 Jun 2013 10:01:33 +0800 Subject: neigh: only allow init_net to change the default neigh_parms Though we don't export the /proc/sys/net/ipv[4,6]/neigh/default/ directory to the un-init_net, we can still use a command such as "ip ntable change name arp_cache locktime 129" to change the locktime of default neigh_parms. This patch disallows the un-init_net to find out the neigh_table.parms. So the un-init_net will fail to influence the init_net. Signed-off-by: Gao feng Signed-off-by: David S.
Miller --- net/core/neighbour.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 53eab513955a..86f9b165bbba 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1419,7 +1419,7 @@ static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl, for (p = &tbl->parms; p; p = p->next) { if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) || - (!p->dev && !ifindex)) + (!p->dev && !ifindex && net_eq(net, &init_net))) return p; } -- cgit v1.2.3 From dc25c676f54addb10e598daa9da9b8dd4fd487ab Mon Sep 17 00:00:00 2001 From: Gao feng Date: Thu, 20 Jun 2013 10:01:34 +0800 Subject: neigh: disallow un-init_net to change thresh of neigh thresh and interval are global resources, only init net can change them. Signed-off-by: Gao feng Signed-off-by: David S. Miller --- net/core/neighbour.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 86f9b165bbba..2569ab2cafbe 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2049,6 +2049,12 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) } } + err = -ENOENT; + if ((tb[NDTA_THRESH1] || tb[NDTA_THRESH2] || + tb[NDTA_THRESH3] || tb[NDTA_GC_INTERVAL]) && + !net_eq(net, &init_net)) + goto errout_tbl_lock; + if (tb[NDTA_THRESH1]) tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]); -- cgit v1.2.3 From 60877a32bce00041528576e6b8df5abe9251fa73 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 20 Jun 2013 01:15:51 -0700 Subject: net: allow large number of tx queues netif_alloc_netdev_queues() uses kcalloc() to allocate memory for the "struct netdev_queue *_tx" array. For large number of tx queues, kcalloc() might fail, so this patch does a fallback to vzalloc(). As vmalloc() adds overhead on a critical network path, add __GFP_REPEAT to kzalloc() flags to do this fallback only when really needed. 
Signed-off-by: Eric Dumazet Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- net/core/dev.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index fa007dba6beb..722f633926e0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -130,6 +130,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -5253,17 +5254,28 @@ static void netdev_init_one_queue(struct net_device *dev, #endif } +static void netif_free_tx_queues(struct net_device *dev) +{ + if (is_vmalloc_addr(dev->_tx)) + vfree(dev->_tx); + else + kfree(dev->_tx); +} + static int netif_alloc_netdev_queues(struct net_device *dev) { unsigned int count = dev->num_tx_queues; struct netdev_queue *tx; + size_t sz = count * sizeof(*tx); - BUG_ON(count < 1); - - tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL); - if (!tx) - return -ENOMEM; + BUG_ON(count < 1 || count > 0xffff); + tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + if (!tx) { + tx = vzalloc(sz); + if (!tx) + return -ENOMEM; + } dev->_tx = tx; netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); @@ -5811,7 +5823,7 @@ free_all: free_pcpu: free_percpu(dev->pcpu_refcnt); - kfree(dev->_tx); + netif_free_tx_queues(dev); #ifdef CONFIG_RPS kfree(dev->_rx); #endif @@ -5836,7 +5848,7 @@ void free_netdev(struct net_device *dev) release_net(dev_net(dev)); - kfree(dev->_tx); + netif_free_tx_queues(dev); #ifdef CONFIG_RPS kfree(dev->_rx); #endif -- cgit v1.2.3 From aeb193ea6cef28e33589de05ef932424f8e19bde Mon Sep 17 00:00:00 2001 From: Wedson Almeida Filho Date: Sun, 23 Jun 2013 23:33:48 -0700 Subject: net: Unmap fragment page once iterator is done Callers of skb_seq_read() are currently forced to call skb_abort_seq_read() even when consuming all the data because the last call to skb_seq_read (the one that returns 0 to indicate the end) fails to unmap the last fragment page. 
With this patch callers will be allowed to traverse the SKB data by calling skb_prepare_seq_read() once and repeatedly calling skb_seq_read() as originally intended (and documented in the original commit 677e90eda), that is, only call skb_abort_seq_read() if the sequential read is actually aborted. Signed-off-by: Wedson Almeida Filho Signed-off-by: David S. Miller --- drivers/scsi/libiscsi_tcp.c | 1 - net/batman-adv/main.c | 1 - net/core/skbuff.c | 7 ++++++- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/drivers/scsi/libiscsi_tcp.c b/drivers/scsi/libiscsi_tcp.c index 552e8a2b6f5f..448eae850b9c 100644 --- a/drivers/scsi/libiscsi_tcp.c +++ b/drivers/scsi/libiscsi_tcp.c @@ -906,7 +906,6 @@ int iscsi_tcp_recv_skb(struct iscsi_conn *conn, struct sk_buff *skb, ISCSI_DBG_TCP(conn, "no more data avail. Consumed %d\n", consumed); *status = ISCSI_TCP_SKB_DONE; - skb_abort_seq_read(&seq); goto skb_done; } BUG_ON(segment->copied >= segment->size); diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 51aafd669cbb..08125f3f6064 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -473,7 +473,6 @@ __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr) crc = crc32c(crc, data, len); consumed += len; } - skb_abort_seq_read(&st); return htonl(crc); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index edf37578e21e..9f73eca29fbe 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2541,8 +2541,13 @@ unsigned int skb_seq_read(unsigned int consumed, const u8 **data, unsigned int block_limit, abs_offset = consumed + st->lower_offset; skb_frag_t *frag; - if (unlikely(abs_offset >= st->upper_offset)) + if (unlikely(abs_offset >= st->upper_offset)) { + if (st->frag_data) { + kunmap_atomic(st->frag_data); + st->frag_data = NULL; + } return 0; + } next_skb: block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; -- cgit v1.2.3 From f693dff7107063f0ce08502052b78c4d4feb0e87 Mon Sep 17 00:00:00 2001 From: Mike 
Rapoport Date: Tue, 25 Jun 2013 16:01:55 +0300 Subject: rtnetlink: allow using zero MAC address in rtnl_fdb_{add,del} This is required for multiple default destinations management in VXLAN Signed-off-by: Mike Rapoport Signed-off-by: Stephen Hemminger --- net/core/rtnetlink.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9007533867f0..3de740834d1f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2109,10 +2109,6 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) } addr = nla_data(tb[NDA_LLADDR]); - if (is_zero_ether_addr(addr)) { - pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid ether address\n"); - return -EINVAL; - } err = -EOPNOTSUPP; @@ -2210,10 +2206,6 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) } addr = nla_data(tb[NDA_LLADDR]); - if (is_zero_ether_addr(addr)) { - pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid ether address\n"); - return -EINVAL; - } err = -EOPNOTSUPP; -- cgit v1.2.3 From 2d48d67fa8cd129ea85ea02d91b4a793286866f8 Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Mon, 24 Jun 2013 10:28:03 +0300 Subject: net: poll/select low latency socket support select/poll busy-poll support. Split sysctl value into two separate ones, one for read and one for poll. updated Documentation/sysctl/net.txt Add a new poll flag POLL_LL. When this flag is set, sock_poll will call sk_poll_ll if possible. sock_poll sets this flag in its return value to indicate to select/poll when a socket that can busy poll is found. When poll/select have nothing to report, call the low-level sock_poll again until we are out of time or we find something. Once the system call finds something, it stops setting POLL_LL, so it can return the result to the user ASAP. Signed-off-by: Eliezer Tamir Signed-off-by: David S. 
Miller --- Documentation/sysctl/net.txt | 18 ++++++++++++++++-- fs/select.c | 34 +++++++++++++++++++++++++++++----- include/net/ll_poll.h | 35 ++++++++++++++++++++++------------- include/uapi/asm-generic/poll.h | 2 ++ net/core/sock.c | 2 +- net/core/sysctl_net_core.c | 8 ++++++++ net/socket.c | 14 +++++++++++++- 7 files changed, 91 insertions(+), 22 deletions(-) (limited to 'net/core') diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index 5369879eafe2..e658bbfb641f 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -50,13 +50,27 @@ The maximum number of packets that kernel can handle on a NAPI interrupt, it's a Per-CPU variable. Default: 64 -low_latency_poll +low_latency_read ---------------- -Low latency busy poll timeout. (needs CONFIG_NET_LL_RX_POLL) +Low latency busy poll timeout for socket reads. (needs CONFIG_NET_LL_RX_POLL) Approximate time in us to spin waiting for packets on the device queue. +This sets the default value of the SO_LL socket option. +Can be set or overridden per socket by setting socket option SO_LL. Recommended value is 50. May increase power usage. Default: 0 (off) +low_latency_poll +---------------- +Low latency busy poll timeout for poll and select. (needs CONFIG_NET_LL_RX_POLL) +Approximate time in us to spin waiting for packets on the device queue. +Recommended value depends on the number of sockets you poll on. +For several sockets 50, for several hundreds 100. +For more than that you probably want to use epoll. +Note that only sockets with SO_LL set will be busy polled, so you want to either +selectively set SO_LL on those sockets or set sysctl.net.low_latency_read globally. +May increase power usage. 
+Default: 0 (off) + rmem_default ------------ diff --git a/fs/select.c b/fs/select.c index 8c1c96c27062..79b876eb91da 100644 --- a/fs/select.c +++ b/fs/select.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -384,9 +385,10 @@ get_max: #define POLLEX_SET (POLLPRI) static inline void wait_key_set(poll_table *wait, unsigned long in, - unsigned long out, unsigned long bit) + unsigned long out, unsigned long bit, + unsigned int ll_flag) { - wait->_key = POLLEX_SET; + wait->_key = POLLEX_SET | ll_flag; if (in & bit) wait->_key |= POLLIN_SET; if (out & bit) @@ -400,6 +402,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) poll_table *wait; int retval, i, timed_out = 0; unsigned long slack = 0; + unsigned int ll_flag = POLL_LL; + u64 ll_time = ll_end_time(); rcu_read_lock(); retval = max_select_fd(n, fds); @@ -422,6 +426,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) retval = 0; for (;;) { unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; + bool can_ll = false; inp = fds->in; outp = fds->out; exp = fds->ex; rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; @@ -449,7 +454,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) f_op = f.file->f_op; mask = DEFAULT_POLLMASK; if (f_op && f_op->poll) { - wait_key_set(wait, in, out, bit); + wait_key_set(wait, in, out, + bit, ll_flag); mask = (*f_op->poll)(f.file, wait); } fdput(f); @@ -468,6 +474,11 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) retval++; wait->_qproc = NULL; } + if (mask & POLL_LL) + can_ll = true; + /* got something, stop busy polling */ + if (retval) + ll_flag = 0; } } if (res_in) @@ -486,6 +497,9 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) break; } + if (can_ll && can_poll_ll(ll_time)) + continue; + /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to @@ -717,7 +731,8 @@ struct poll_list { * pwait poll_table will be 
used by the fd-provided poll handler for waiting, * if pwait->_qproc is non-NULL. */ -static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) +static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait, + bool *can_ll, unsigned int ll_flag) { unsigned int mask; int fd; @@ -731,7 +746,10 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) mask = DEFAULT_POLLMASK; if (f.file->f_op && f.file->f_op->poll) { pwait->_key = pollfd->events|POLLERR|POLLHUP; + pwait->_key |= ll_flag; mask = f.file->f_op->poll(f.file, pwait); + if (mask & POLL_LL) + *can_ll = true; } /* Mask out unneeded events. */ mask &= pollfd->events | POLLERR | POLLHUP; @@ -750,6 +768,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list, ktime_t expire, *to = NULL; int timed_out = 0, count = 0; unsigned long slack = 0; + unsigned int ll_flag = POLL_LL; + u64 ll_time = ll_end_time(); /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { @@ -762,6 +782,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list, for (;;) { struct poll_list *walk; + bool can_ll = false; for (walk = list; walk != NULL; walk = walk->next) { struct pollfd * pfd, * pfd_end; @@ -776,9 +797,10 @@ static int do_poll(unsigned int nfds, struct poll_list *list, * this. They'll get immediately deregistered * when we break out and return. 
*/ - if (do_pollfd(pfd, pt)) { + if (do_pollfd(pfd, pt, &can_ll, ll_flag)) { count++; pt->_qproc = NULL; + ll_flag = 0; } } } @@ -795,6 +817,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list, if (count || timed_out) break; + if (can_ll && can_poll_ll(ll_time)) + continue; /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to diff --git a/include/net/ll_poll.h b/include/net/ll_poll.h index fcc7c365cee5..5bf2b3a6129e 100644 --- a/include/net/ll_poll.h +++ b/include/net/ll_poll.h @@ -30,6 +30,7 @@ #ifdef CONFIG_NET_LL_RX_POLL struct napi_struct; +extern unsigned int sysctl_net_ll_read __read_mostly; extern unsigned int sysctl_net_ll_poll __read_mostly; /* return values from ndo_ll_poll */ @@ -38,17 +39,18 @@ extern unsigned int sysctl_net_ll_poll __read_mostly; /* we can use sched_clock() because we don't care much about precision * we only care that the average is bounded + * we don't mind a ~2.5% imprecision so <<10 instead of *1000 + * sk->sk_ll_usec is a u_int so this can't overflow */ -static inline u64 ll_end_time(struct sock *sk) +static inline u64 ll_sk_end_time(struct sock *sk) { - u64 end_time = ACCESS_ONCE(sk->sk_ll_usec); - - /* we don't mind a ~2.5% imprecision - * sk->sk_ll_usec is a u_int so this can't overflow - */ - end_time = (end_time << 10) + sched_clock(); + return ((u64)ACCESS_ONCE(sk->sk_ll_usec) << 10) + sched_clock(); +} - return end_time; +/* in poll/select we use the global sysctl_net_ll_poll value */ +static inline u64 ll_end_time(void) +{ + return ((u64)ACCESS_ONCE(sysctl_net_ll_poll) << 10) + sched_clock(); } static inline bool sk_valid_ll(struct sock *sk) @@ -62,10 +64,13 @@ static inline bool can_poll_ll(u64 end_time) return !time_after64(sched_clock(), end_time); } +/* when used in sock_poll() nonblock is known at compile time to be true + * so the loop and end_time will be optimized out + */ static inline bool sk_poll_ll(struct sock *sk, int nonblock) { + u64 end_time = 
nonblock ? 0 : ll_sk_end_time(sk); const struct net_device_ops *ops; - u64 end_time = ll_end_time(sk); struct napi_struct *napi; int rc = false; @@ -84,7 +89,6 @@ static inline bool sk_poll_ll(struct sock *sk, int nonblock) goto out; do { - rc = ops->ndo_ll_poll(napi); if (rc == LL_FLUSH_FAILED) @@ -95,8 +99,8 @@ static inline bool sk_poll_ll(struct sock *sk, int nonblock) NET_ADD_STATS_BH(sock_net(sk), LINUX_MIB_LOWLATENCYRXPACKETS, rc); - } while (skb_queue_empty(&sk->sk_receive_queue) - && can_poll_ll(end_time) && !nonblock); + } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && + can_poll_ll(end_time)); rc = !skb_queue_empty(&sk->sk_receive_queue); out: @@ -118,7 +122,12 @@ static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb) #else /* CONFIG_NET_LL_RX_POLL */ -static inline u64 ll_end_time(struct sock *sk) +static inline u64 sk_ll_end_time(struct sock *sk) +{ + return 0; +} + +static inline u64 ll_end_time(void) { return 0; } diff --git a/include/uapi/asm-generic/poll.h b/include/uapi/asm-generic/poll.h index 9ce7f44aebd2..4aee586979ca 100644 --- a/include/uapi/asm-generic/poll.h +++ b/include/uapi/asm-generic/poll.h @@ -30,6 +30,8 @@ #define POLLFREE 0x4000 /* currently only for epoll */ +#define POLL_LL 0x8000 + struct pollfd { int fd; short events; diff --git a/net/core/sock.c b/net/core/sock.c index 1e744b12fda3..b6c619f4d47b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2307,7 +2307,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) #ifdef CONFIG_NET_LL_RX_POLL sk->sk_napi_id = 0; - sk->sk_ll_usec = sysctl_net_ll_poll; + sk->sk_ll_usec = sysctl_net_ll_read; #endif /* diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 62702c2053de..afc677eadd93 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -306,6 +306,14 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "low_latency_read", + .data = 
&sysctl_net_ll_read, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +# #endif #endif /* CONFIG_NET */ { diff --git a/net/socket.c b/net/socket.c index 3eec3f76b49c..4da14cbd49b6 100644 --- a/net/socket.c +++ b/net/socket.c @@ -107,6 +107,7 @@ #include #ifdef CONFIG_NET_LL_RX_POLL +unsigned int sysctl_net_ll_read __read_mostly; unsigned int sysctl_net_ll_poll __read_mostly; #endif @@ -1147,13 +1148,24 @@ EXPORT_SYMBOL(sock_create_lite); /* No kernel lock held - perfect */ static unsigned int sock_poll(struct file *file, poll_table *wait) { + unsigned int ll_flag = 0; struct socket *sock; /* * We can't return errors to poll, so it's either yes or no. */ sock = file->private_data; - return sock->ops->poll(file, sock, wait); + + if (sk_valid_ll(sock->sk)) { + /* this socket can poll_ll so tell the system call */ + ll_flag = POLL_LL; + + /* once, only if requested by syscall */ + if (wait && (wait->_key & POLL_LL)) + sk_poll_ll(sock->sk, 1); + } + + return ll_flag | sock->ops->poll(file, sock, wait); } static int sock_mmap(struct file *file, struct vm_area_struct *vma) -- cgit v1.2.3 From 621e84d6f373dcb273ebfd772638b8e7dc3c2c48 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 26 Jun 2013 16:11:27 +0200 Subject: dev: introduce skb_scrub_packet() The goal of this new function is to perform all needed cleanup before sending an skb into another netns. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 1 + net/core/dev.c | 11 +---------- net/core/skbuff.c | 23 +++++++++++++++++++++++ 3 files changed, 25 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a7393adea0b5..6b06023e8a08 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2384,6 +2384,7 @@ extern void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); extern int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen); +extern void skb_scrub_packet(struct sk_buff *skb); extern struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); diff --git a/net/core/dev.c b/net/core/dev.c index 722f633926e0..370354a9c5f6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1652,22 +1652,13 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) } } - skb_orphan(skb); - if (unlikely(!is_skb_forwardable(dev, skb))) { atomic_long_inc(&dev->rx_dropped); kfree_skb(skb); return NET_RX_DROP; } - skb->skb_iif = 0; - skb_dst_drop(skb); - skb->tstamp.tv64 = 0; - skb->pkt_type = PACKET_HOST; + skb_scrub_packet(skb); skb->protocol = eth_type_trans(skb, dev); - skb->mark = 0; - secpath_reset(skb); - nf_reset(skb); - nf_reset_trace(skb); return netif_rx(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9f73eca29fbe..b1fcb8727e56 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3492,3 +3492,26 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, return true; } EXPORT_SYMBOL(skb_try_coalesce); + +/** + * skb_scrub_packet - scrub an skb before sending it to another netns + * + * @skb: buffer to clean + * + * skb_scrub_packet can be used to clean an skb before injecting it in + * another namespace. We have to clear all information in the skb that + * could impact namespace isolation. 
+ */ +void skb_scrub_packet(struct sk_buff *skb) +{ + skb_orphan(skb); + skb->tstamp.tv64 = 0; + skb->pkt_type = PACKET_HOST; + skb->skb_iif = 0; + skb_dst_drop(skb); + skb->mark = 0; + secpath_reset(skb); + nf_reset(skb); + nf_reset_trace(skb); +} +EXPORT_SYMBOL_GPL(skb_scrub_packet); -- cgit v1.2.3 From c9ab4d85de222f3390c67aedc9c18a50e767531e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 28 Jun 2013 02:37:42 -0700 Subject: neighbour: fix a race in neigh_destroy() There is a race in neighbour code, because neigh_destroy() uses skb_queue_purge(&neigh->arp_queue) without holding neighbour lock, while other parts of the code assume neighbour rwlock is what protects arp_queue Convert all skb_queue_purge() calls to the __skb_queue_purge() variant Use __skb_queue_head_init() instead of skb_queue_head_init() to make clear we do not use arp_queue.lock And hold neigh->lock in neigh_destroy() to close the race. Reported-by: Joe Jin Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/neighbour.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 2569ab2cafbe..b7de821f98df 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -231,7 +231,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) we must kill timers etc. and move it to safe state. 
*/ - skb_queue_purge(&n->arp_queue); + __skb_queue_purge(&n->arp_queue); n->arp_queue_len_bytes = 0; n->output = neigh_blackhole; if (n->nud_state & NUD_VALID) @@ -286,7 +286,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device if (!n) goto out_entries; - skb_queue_head_init(&n->arp_queue); + __skb_queue_head_init(&n->arp_queue); rwlock_init(&n->lock); seqlock_init(&n->ha_lock); n->updated = n->used = now; @@ -708,7 +708,9 @@ void neigh_destroy(struct neighbour *neigh) if (neigh_del_timer(neigh)) pr_warn("Impossible event\n"); - skb_queue_purge(&neigh->arp_queue); + write_lock_bh(&neigh->lock); + __skb_queue_purge(&neigh->arp_queue); + write_unlock_bh(&neigh->lock); neigh->arp_queue_len_bytes = 0; if (dev->netdev_ops->ndo_neigh_destroy) @@ -858,7 +860,7 @@ static void neigh_invalidate(struct neighbour *neigh) neigh->ops->error_report(neigh, skb); write_lock(&neigh->lock); } - skb_queue_purge(&neigh->arp_queue); + __skb_queue_purge(&neigh->arp_queue); neigh->arp_queue_len_bytes = 0; } @@ -1210,7 +1212,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, write_lock_bh(&neigh->lock); } - skb_queue_purge(&neigh->arp_queue); + __skb_queue_purge(&neigh->arp_queue); neigh->arp_queue_len_bytes = 0; } out: -- cgit v1.2.3 From c590b5e2f05b5e98e614382582b7ae4cddb37599 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Mon, 1 Jul 2013 17:23:30 +0200 Subject: ethtool: make .get_dump_data() harder to misuse by drivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As the patch "bnx2x: remove zeroing of dump data buffer" showed, it is too easy to implement .get_dump_data incorrectly in a driver. Let's make sure drivers cannot get confused by userspace requesting too big a dump. Also WARN if the driver sets dump->len to something weird and make sure the length reported to userspace is the actual length of data copied to userspace.
Signed-off-by: Michal Schmidt Reviewed-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/ethtool.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 9255bbdf81ff..ab5fa6336c84 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1320,10 +1320,19 @@ static int ethtool_get_dump_data(struct net_device *dev, if (ret) return ret; - len = (tmp.len > dump.len) ? dump.len : tmp.len; + len = min(tmp.len, dump.len); if (!len) return -EFAULT; + /* Don't ever let the driver think there's more space available + * than it requested with .get_dump_flag(). + */ + dump.len = len; + + /* Always allocate enough space to hold the whole thing so that the + * driver does not need to check the length and bother with partial + * dumping. + */ data = vzalloc(tmp.len); if (!data) return -ENOMEM; @@ -1331,6 +1340,16 @@ static int ethtool_get_dump_data(struct net_device *dev, if (ret) goto out; + /* There are two sane possibilities: + * 1. The driver's .get_dump_data() does not touch dump.len. + * 2. Or it may set dump.len to how much it really writes, which + * should be tmp.len (or len if it can do a partial dump). + * In any case respond to userspace with the actual length of data + * it's receiving. + */ + WARN_ON(dump.len != len && dump.len != tmp.len); + dump.len = len; + if (copy_to_user(useraddr, &dump, sizeof(dump))) { ret = -EFAULT; goto out; -- cgit v1.2.3 From 06a23fe31ca3992863721f21bdb0307af93da807 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Tue, 2 Jul 2013 20:30:10 +0900 Subject: core/dev: set pkt_type after eth_type_trans() in dev_forward_skb() The dev_forward_skb() assignment of pkt_type should be done after the call to eth_type_trans(). ip-encapsulated packets can be handled by localhost. But skb->pkt_type can be PACKET_OTHERHOST when packet comes via veth into ip tunnel device. In that case, the packet is dropped by ip_rcv(). 
Although this example uses gretap, l2tp-eth also has the same issue. For the l2tp-eth case, add a dummy device for the ip address and ip l2tp command. netns A | root netns | netns B veth<->veth=bridge=gretap <-loop back-> gretap=bridge=veth<->veth arp packet -> pkt_type BROADCAST------------>ip_rcv()------------------------> <- arp reply pkt_type ip_rcv()<-----------------OTHERHOST drop sample operations ip link add tapa type gretap remote 172.17.107.4 local 172.17.107.3 ip link add tapb type gretap remote 172.17.107.3 local 172.17.107.4 ip link set tapa up ip link set tapb up ip address add 172.17.107.3 dev tapa ip address add 172.17.107.4 dev tapb ip route get 172.17.107.3 > local 172.17.107.3 dev lo src 172.17.107.3 > cache ip route get 172.17.107.4 > local 172.17.107.4 dev lo src 172.17.107.4 > cache ip link add vetha type veth peer name vetha-peer ip link add vethb type veth peer name vethb-peer brctl addbr bra brctl addbr brb brctl addif bra tapa brctl addif bra vetha-peer brctl addif brb tapb brctl addif brb vethb-peer brctl show > bridge name bridge id STP enabled interfaces > bra 8000.6ea21e758ff1 no tapa > vetha-peer > brb 8000.420020eb92d5 no tapb > vethb-peer ip link set vetha-peer up ip link set vethb-peer up ip link set bra up ip link set brb up ip netns add a ip netns add b ip link set vetha netns a ip link set vethb netns b ip netns exec a ip address add 10.0.0.3/24 dev vetha ip netns exec b ip address add 10.0.0.4/24 dev vethb ip netns exec a ip link set vetha up ip netns exec b ip link set vethb up ip netns exec a arping -I vetha 10.0.0.4 ARPING 10.0.0.4 from 10.0.0.3 vetha ^CSent 2 probes (2 broadcast(s)) Received 0 response(s) Cc: Jason Wang Cc: "Michael S. Tsirkin" Cc: Eric Dumazet Cc: Patrick McHardy Cc: Hong Zhiguo Cc: Rami Rosen Cc: Tom Parkin Cc: Cong Wang Cc: Pravin B Shelar Cc: Jesse Gross Cc: dev@openvswitch.org Signed-off-by: Isaku Yamahata Signed-off-by: David S.
Miller --- net/core/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 370354a9c5f6..6a93cd8cd264 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1659,6 +1659,12 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) } skb_scrub_packet(skb); skb->protocol = eth_type_trans(skb, dev); + + /* eth_type_trans() can set pkt_type. + * clear pkt_type _after_ calling eth_type_trans() + */ + skb->pkt_type = PACKET_HOST; + return netif_rx(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); -- cgit v1.2.3 From 4bc41b84e9b4d904f68cba2dbe0c60a5428c27c4 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Wed, 3 Jul 2013 16:04:25 +0900 Subject: core: Copy inner_protocol in copy_skb_header() inner_protocol was added to struct sk_buff in 0d89d2035fe063461a5ddb609b2c12e7fb006e44 ("MPLS: Add limited GSO support"), which is scheduled to be included in v3.11. That patch did not update __copy_skb_header to copy the inner_protocol. Signed-off-by: Joe Stringer Signed-off-by: Simon Horman Signed-off-by: David S. 
Miller --- net/core/skbuff.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 77971a35d6e1..724bb7cb173f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -697,6 +697,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->transport_header = old->transport_header; new->network_header = old->network_header; new->mac_header = old->mac_header; + new->inner_protocol = old->inner_protocol; new->inner_transport_header = old->inner_transport_header; new->inner_network_header = old->inner_network_header; new->inner_mac_header = old->inner_mac_header; -- cgit v1.2.3 From cbf55001b2ddb814329735641be5d29b08c82b08 Mon Sep 17 00:00:00 2001 From: Eliezer Tamir Date: Mon, 8 Jul 2013 16:20:34 +0300 Subject: net: rename low latency sockets functions to busy poll Rename functions in include/net/ll_poll.h to busy wait. Clarify documentation about expected power use increase. Rename POLL_LL to POLL_BUSY_LOOP. Add need_resched() testing to poll/select busy loops. Note, that in select and poll can_busy_poll is dynamic and is updated continuously to reflect the existence of supported sockets with valid queue information. Signed-off-by: Eliezer Tamir Signed-off-by: David S. Miller --- Documentation/sysctl/net.txt | 12 +++++---- fs/select.c | 60 +++++++++++++++++++++++++---------------- include/net/ll_poll.h | 46 ++++++++++++++++--------------- include/uapi/asm-generic/poll.h | 2 +- net/core/datagram.c | 3 ++- net/ipv4/tcp.c | 6 ++--- net/socket.c | 12 ++++----- 7 files changed, 80 insertions(+), 61 deletions(-) (limited to 'net/core') diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index e658bbfb641f..7323b88e26be 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -53,22 +53,24 @@ Default: 64 low_latency_read ---------------- Low latency busy poll timeout for socket reads. 
(needs CONFIG_NET_LL_RX_POLL) -Approximate time in us to spin waiting for packets on the device queue. +Approximate time in us to busy loop waiting for packets on the device queue. This sets the default value of the SO_LL socket option. -Can be set or overridden per socket by setting socket option SO_LL. -Recommended value is 50. May increase power usage. +Can be set or overridden per socket by setting socket option SO_LL, which is +the preferred method of enabling. +If you need to enable the feature globally via sysctl, a value of 50 is recommended. +Will increase power usage. Default: 0 (off) low_latency_poll ---------------- Low latency busy poll timeout for poll and select. (needs CONFIG_NET_LL_RX_POLL) -Approximate time in us to spin waiting for packets on the device queue. +Approximate time in us to busy loop waiting for events. Recommended value depends on the number of sockets you poll on. For several sockets 50, for several hundreds 100. For more than that you probably want to use epoll. Note that only sockets with SO_LL set will be busy polled, so you want to either selectively set SO_LL on those sockets or set sysctl.net.low_latency_read globally. -May increase power usage. +Will increase power usage. Default: 0 (off) rmem_default diff --git a/fs/select.c b/fs/select.c index f28a58592725..25cac5faf6d6 100644 --- a/fs/select.c +++ b/fs/select.c @@ -402,9 +402,9 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) poll_table *wait; int retval, i, timed_out = 0; unsigned long slack = 0; - unsigned int ll_flag = ll_get_flag(); - u64 ll_start = ll_start_time(ll_flag); - u64 ll_time = ll_run_time(); + unsigned int busy_flag = net_busy_loop_on() ? 
POLL_BUSY_LOOP : 0; + u64 busy_start = busy_loop_start_time(busy_flag); + u64 busy_end = busy_loop_end_time(); rcu_read_lock(); retval = max_select_fd(n, fds); @@ -427,7 +427,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) retval = 0; for (;;) { unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; - bool can_ll = false; + bool can_busy_loop = false; inp = fds->in; outp = fds->out; exp = fds->ex; rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; @@ -456,7 +456,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) mask = DEFAULT_POLLMASK; if (f_op && f_op->poll) { wait_key_set(wait, in, out, - bit, ll_flag); + bit, busy_flag); mask = (*f_op->poll)(f.file, wait); } fdput(f); @@ -475,11 +475,18 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) retval++; wait->_qproc = NULL; } - if (mask & POLL_LL) - can_ll = true; /* got something, stop busy polling */ - if (retval) - ll_flag = 0; + if (retval) { + can_busy_loop = false; + busy_flag = 0; + + /* + * only remember a returned + * POLL_BUSY_LOOP if we asked for it + */ + } else if (busy_flag & mask) + can_busy_loop = true; + } } if (res_in) @@ -498,8 +505,9 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) break; } - /* only if on, have sockets with POLL_LL and not out of time */ - if (ll_flag && can_ll && can_poll_ll(ll_start, ll_time)) + /* only if found POLL_BUSY_LOOP sockets && not out of time */ + if (!need_resched() && can_busy_loop && + busy_loop_range(busy_start, busy_end)) continue; /* @@ -734,7 +742,8 @@ struct poll_list { * if pwait->_qproc is non-NULL. 
*/ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait, - bool *can_ll, unsigned int ll_flag) + bool *can_busy_poll, + unsigned int busy_flag) { unsigned int mask; int fd; @@ -748,10 +757,10 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait, mask = DEFAULT_POLLMASK; if (f.file->f_op && f.file->f_op->poll) { pwait->_key = pollfd->events|POLLERR|POLLHUP; - pwait->_key |= ll_flag; + pwait->_key |= busy_flag; mask = f.file->f_op->poll(f.file, pwait); - if (mask & POLL_LL) - *can_ll = true; + if (mask & busy_flag) + *can_busy_poll = true; } /* Mask out unneeded events. */ mask &= pollfd->events | POLLERR | POLLHUP; @@ -770,9 +779,10 @@ static int do_poll(unsigned int nfds, struct poll_list *list, ktime_t expire, *to = NULL; int timed_out = 0, count = 0; unsigned long slack = 0; - unsigned int ll_flag = ll_get_flag(); - u64 ll_start = ll_start_time(ll_flag); - u64 ll_time = ll_run_time(); + unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; + u64 busy_start = busy_loop_start_time(busy_flag); + u64 busy_end = busy_loop_end_time(); + /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { @@ -785,7 +795,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list, for (;;) { struct poll_list *walk; - bool can_ll = false; + bool can_busy_loop = false; for (walk = list; walk != NULL; walk = walk->next) { struct pollfd * pfd, * pfd_end; @@ -800,10 +810,13 @@ static int do_poll(unsigned int nfds, struct poll_list *list, * this. They'll get immediately deregistered * when we break out and return. 
*/ - if (do_pollfd(pfd, pt, &can_ll, ll_flag)) { + if (do_pollfd(pfd, pt, &can_busy_loop, + busy_flag)) { count++; pt->_qproc = NULL; - ll_flag = 0; + /* found something, stop busy polling */ + busy_flag = 0; + can_busy_loop = false; } } } @@ -820,8 +833,9 @@ static int do_poll(unsigned int nfds, struct poll_list *list, if (count || timed_out) break; - /* only if on, have sockets with POLL_LL and not out of time */ - if (ll_flag && can_ll && can_poll_ll(ll_start, ll_time)) + /* only if found POLL_BUSY_LOOP sockets && not out of time */ + if (!need_resched() && can_busy_loop && + busy_loop_range(busy_start, busy_end)) continue; /* diff --git a/include/net/ll_poll.h b/include/net/ll_poll.h index 0d620ba19bc5..f14dd88dafc8 100644 --- a/include/net/ll_poll.h +++ b/include/net/ll_poll.h @@ -37,9 +37,9 @@ extern unsigned int sysctl_net_ll_poll __read_mostly; #define LL_FLUSH_FAILED -1 #define LL_FLUSH_BUSY -2 -static inline unsigned int ll_get_flag(void) +static inline bool net_busy_loop_on(void) { - return sysctl_net_ll_poll ? 
POLL_LL : 0; + return sysctl_net_ll_poll; } /* a wrapper to make debug_smp_processor_id() happy @@ -47,7 +47,7 @@ static inline unsigned int ll_get_flag(void) * we only care that the average is bounded */ #ifdef CONFIG_DEBUG_PREEMPT -static inline u64 ll_sched_clock(void) +static inline u64 busy_loop_sched_clock(void) { u64 rc; @@ -58,7 +58,7 @@ static inline u64 ll_sched_clock(void) return rc; } #else /* CONFIG_DEBUG_PREEMPT */ -static inline u64 ll_sched_clock(void) +static inline u64 busy_loop_sched_clock(void) { return sched_clock(); } @@ -67,7 +67,7 @@ static inline u64 ll_sched_clock(void) /* we don't mind a ~2.5% imprecision so <<10 instead of *1000 * sk->sk_ll_usec is a u_int so this can't overflow */ -static inline u64 ll_sk_run_time(struct sock *sk) +static inline u64 sk_busy_loop_end_time(struct sock *sk) { return (u64)ACCESS_ONCE(sk->sk_ll_usec) << 10; } @@ -75,27 +75,29 @@ static inline u64 ll_sk_run_time(struct sock *sk) /* in poll/select we use the global sysctl_net_ll_poll value * only call sched_clock() if enabled */ -static inline u64 ll_run_time(void) +static inline u64 busy_loop_end_time(void) { return (u64)ACCESS_ONCE(sysctl_net_ll_poll) << 10; } -/* if flag is not set we don't need to know the time */ -static inline u64 ll_start_time(unsigned int flag) +/* if flag is not set we don't need to know the time + * so we want to avoid a potentially expensive sched_clock() + */ +static inline u64 busy_loop_start_time(unsigned int flag) { - return flag ? ll_sched_clock() : 0; + return flag ? busy_loop_sched_clock() : 0; } -static inline bool sk_valid_ll(struct sock *sk) +static inline bool sk_can_busy_loop(struct sock *sk) { return sk->sk_ll_usec && sk->sk_napi_id && !need_resched() && !signal_pending(current); } /* careful! 
time_in_range64 will evaluate now twice */ -static inline bool can_poll_ll(u64 start_time, u64 run_time) +static inline bool busy_loop_range(u64 start_time, u64 run_time) { - u64 now = ll_sched_clock(); + u64 now = busy_loop_sched_clock(); return time_in_range64(now, start_time, start_time + run_time); } @@ -103,10 +105,10 @@ static inline bool can_poll_ll(u64 start_time, u64 run_time) /* when used in sock_poll() nonblock is known at compile time to be true * so the loop and end_time will be optimized out */ -static inline bool sk_poll_ll(struct sock *sk, int nonblock) +static inline bool sk_busy_loop(struct sock *sk, int nonblock) { - u64 start_time = ll_start_time(!nonblock); - u64 run_time = ll_sk_run_time(sk); + u64 start_time = busy_loop_start_time(!nonblock); + u64 end_time = sk_busy_loop_end_time(sk); const struct net_device_ops *ops; struct napi_struct *napi; int rc = false; @@ -137,7 +139,7 @@ static inline bool sk_poll_ll(struct sock *sk, int nonblock) LINUX_MIB_LOWLATENCYRXPACKETS, rc); } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && - can_poll_ll(start_time, run_time)); + busy_loop_range(start_time, end_time)); rc = !skb_queue_empty(&sk->sk_receive_queue); out: @@ -158,27 +160,27 @@ static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb) } #else /* CONFIG_NET_LL_RX_POLL */ -static inline unsigned long ll_get_flag(void) +static inline unsigned long net_busy_loop_on(void) { return 0; } -static inline u64 ll_start_time(unsigned int flag) +static inline u64 busy_loop_start_time(unsigned int flag) { return 0; } -static inline u64 ll_run_time(void) +static inline u64 busy_loop_end_time(void) { return 0; } -static inline bool sk_valid_ll(struct sock *sk) +static inline bool sk_can_busy_loop(struct sock *sk) { return false; } -static inline bool sk_poll_ll(struct sock *sk, int nonblock) +static inline bool sk_busy_poll(struct sock *sk, int nonblock) { return false; } @@ -191,7 +193,7 @@ static inline void sk_mark_ll(struct sock *sk, 
struct sk_buff *skb) { } -static inline bool can_poll_ll(u64 start_time, u64 run_time) +static inline bool busy_loop_range(u64 start_time, u64 run_time) { return false; } diff --git a/include/uapi/asm-generic/poll.h b/include/uapi/asm-generic/poll.h index 4aee586979ca..a9694982689f 100644 --- a/include/uapi/asm-generic/poll.h +++ b/include/uapi/asm-generic/poll.h @@ -30,7 +30,7 @@ #define POLLFREE 0x4000 /* currently only for epoll */ -#define POLL_LL 0x8000 +#define POLL_BUSY_LOOP 0x8000 struct pollfd { int fd; diff --git a/net/core/datagram.c b/net/core/datagram.c index 9cbaba98ce4c..6e9ab31e457e 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -208,7 +208,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags, } spin_unlock_irqrestore(&queue->lock, cpu_flags); - if (sk_valid_ll(sk) && sk_poll_ll(sk, flags & MSG_DONTWAIT)) + if (sk_can_busy_loop(sk) && + sk_busy_loop(sk, flags & MSG_DONTWAIT)) continue; /* User doesn't want to wait */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 46ed9afd1f5e..15cbfa94bd8e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1554,9 +1554,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct sk_buff *skb; u32 urg_hole = 0; - if (sk_valid_ll(sk) && skb_queue_empty(&sk->sk_receive_queue) - && (sk->sk_state == TCP_ESTABLISHED)) - sk_poll_ll(sk, nonblock); + if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) && + (sk->sk_state == TCP_ESTABLISHED)) + sk_busy_loop(sk, nonblock); lock_sock(sk); diff --git a/net/socket.c b/net/socket.c index 4da14cbd49b6..45afa648364a 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1148,7 +1148,7 @@ EXPORT_SYMBOL(sock_create_lite); /* No kernel lock held - perfect */ static unsigned int sock_poll(struct file *file, poll_table *wait) { - unsigned int ll_flag = 0; + unsigned int busy_flag = 0; struct socket *sock; /* @@ -1156,16 +1156,16 @@ static unsigned int sock_poll(struct file *file, poll_table *wait) */ sock = 
file->private_data; - if (sk_valid_ll(sock->sk)) { + if (sk_can_busy_loop(sock->sk)) { /* this socket can poll_ll so tell the system call */ - ll_flag = POLL_LL; + busy_flag = POLL_BUSY_LOOP; /* once, only if requested by syscall */ - if (wait && (wait->_key & POLL_LL)) - sk_poll_ll(sock->sk, 1); + if (wait && (wait->_key & POLL_BUSY_LOOP)) + sk_busy_loop(sock->sk, 1); } - return ll_flag | sock->ops->poll(file, sock, wait); + return busy_flag | sock->ops->poll(file, sock, wait); } static int sock_mmap(struct file *file, struct vm_area_struct *vma) -- cgit v1.2.3