diff options
Diffstat (limited to 'net/core')
| -rw-r--r-- | net/core/Makefile | 6 | ||||
| -rw-r--r-- | net/core/datagram.c | 24 | ||||
| -rw-r--r-- | net/core/dev.c | 523 | ||||
| -rw-r--r-- | net/core/dev_addr_lists.c | 24 | ||||
| -rw-r--r-- | net/core/dst.c | 15 | ||||
| -rw-r--r-- | net/core/ethtool.c | 732 | ||||
| -rw-r--r-- | net/core/fib_rules.c | 5 | ||||
| -rw-r--r-- | net/core/filter.c | 4 | ||||
| -rw-r--r-- | net/core/flow.c | 26 | ||||
| -rw-r--r-- | net/core/flow_dissector.c | 143 | ||||
| -rw-r--r-- | net/core/kmap_skb.h | 2 | ||||
| -rw-r--r-- | net/core/link_watch.c | 9 | ||||
| -rw-r--r-- | net/core/neighbour.c | 277 | ||||
| -rw-r--r-- | net/core/net-sysfs.c | 339 | ||||
| -rw-r--r-- | net/core/net-traces.c | 1 | ||||
| -rw-r--r-- | net/core/net_namespace.c | 1 | ||||
| -rw-r--r-- | net/core/netevent.c | 1 | ||||
| -rw-r--r-- | net/core/netpoll.c | 13 | ||||
| -rw-r--r-- | net/core/netprio_cgroup.c | 344 | ||||
| -rw-r--r-- | net/core/pktgen.c | 46 | ||||
| -rw-r--r-- | net/core/request_sock.c | 7 | ||||
| -rw-r--r-- | net/core/rtnetlink.c | 59 | ||||
| -rw-r--r-- | net/core/scm.c | 10 | ||||
| -rw-r--r-- | net/core/secure_seq.c | 10 | ||||
| -rw-r--r-- | net/core/skbuff.c | 264 | ||||
| -rw-r--r-- | net/core/sock.c | 236 | ||||
| -rw-r--r-- | net/core/sock_diag.c | 192 | ||||
| -rw-r--r-- | net/core/sysctl_net_core.c | 9 | ||||
| -rw-r--r-- | net/core/timestamping.c | 13 | ||||
| -rw-r--r-- | net/core/user_dma.c | 7 |
30 files changed, 2139 insertions, 1203 deletions
diff --git a/net/core/Makefile b/net/core/Makefile index 0d357b1c4e5..674641b13ae 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -3,12 +3,13 @@ # obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \ - gen_stats.o gen_estimator.o net_namespace.o secure_seq.o + gen_stats.o gen_estimator.o net_namespace.o secure_seq.o flow_dissector.o obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ - neighbour.o rtnetlink.o utils.o link_watch.o filter.o + neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ + sock_diag.o obj-$(CONFIG_XFRM) += flow.o obj-y += net-sysfs.o @@ -19,3 +20,4 @@ obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o +obj-$(CONFIG_NETPRIO_CGROUP) += netprio_cgroup.o diff --git a/net/core/datagram.c b/net/core/datagram.c index 18ac112ea7a..68bbf9f65cb 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -324,15 +324,15 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, /* Copy paged appendix. Hmm... why does this look so complicated? */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); - end = start + skb_shinfo(skb)->frags[i].size; + end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { int err; u8 *vaddr; - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - struct page *page = frag->page; + struct page *page = skb_frag_page(frag); if (copy > len) copy = len; @@ -410,15 +410,15 @@ int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset, /* Copy paged appendix. Hmm... why does this look so complicated? */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); - end = start + skb_shinfo(skb)->frags[i].size; + end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { int err; u8 *vaddr; - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - struct page *page = frag->page; + struct page *page = skb_frag_page(frag); if (copy > len) copy = len; @@ -500,15 +500,15 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, /* Copy paged appendix. Hmm... why does this look so complicated? */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); - end = start + skb_shinfo(skb)->frags[i].size; + end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { int err; u8 *vaddr; - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - struct page *page = frag->page; + struct page *page = skb_frag_page(frag); if (copy > len) copy = len; @@ -585,16 +585,16 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); - end = start + skb_shinfo(skb)->frags[i].size; + end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { __wsum csum2; int err = 0; u8 *vaddr; - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - struct page *page = frag->page; + struct page *page = skb_frag_page(frag); if (copy > len) copy = len; diff --git a/net/core/dev.c b/net/core/dev.c index b10ff0a7185..f494675471a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -133,6 +133,9 @@ #include <linux/pci.h> #include <linux/inetdevice.h> #include <linux/cpu_rmap.h> +#include <linux/net_tstamp.h> +#include <linux/jump_label.h> +#include <net/flow_keys.h> #include "net-sysfs.h" @@ -1316,8 +1319,6 @@ EXPORT_SYMBOL(dev_close); */ void dev_disable_lro(struct net_device *dev) { - u32 flags; - /* * If we're trying to disable lro on a vlan device * use the underlying physical device instead @@ -1325,15 +1326,9 @@ void dev_disable_lro(struct net_device *dev) if (is_vlan_dev(dev)) dev = vlan_dev_real_dev(dev); - if (dev->ethtool_ops && dev->ethtool_ops->get_flags) - flags = dev->ethtool_ops->get_flags(dev); - else - flags = ethtool_op_get_flags(dev); - - if (!(flags & ETH_FLAG_LRO)) - return; + dev->wanted_features &= ~NETIF_F_LRO; + netdev_update_features(dev); - __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO); if (unlikely(dev->features & NETIF_F_LRO)) netdev_WARN(dev, "failed to disable LRO!\n"); } @@ -1392,7 +1387,7 @@ rollback: for_each_net(net) { for_each_netdev(net, dev) { if (dev == last) - break; + goto outroll; if (dev->flags & IFF_UP) { nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); @@ -1403,6 +1398,7 @@ rollback: } } +outroll: raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } @@ -1445,33 +1441,105 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) } EXPORT_SYMBOL(call_netdevice_notifiers); -/* When > 0 there are consumers of rx skb time stamps */ -static atomic_t netstamp_needed = ATOMIC_INIT(0); +static struct jump_label_key netstamp_needed __read_mostly; +#ifdef HAVE_JUMP_LABEL +/* We are not allowed to call jump_label_dec() from irq context + * If net_disable_timestamp() is called from irq context, defer the + * jump_label_dec() calls. + */ +static atomic_t netstamp_needed_deferred; +#endif void net_enable_timestamp(void) { - atomic_inc(&netstamp_needed); +#ifdef HAVE_JUMP_LABEL + int deferred = atomic_xchg(&netstamp_needed_deferred, 0); + + if (deferred) { + while (--deferred) + jump_label_dec(&netstamp_needed); + return; + } +#endif + WARN_ON(in_interrupt()); + jump_label_inc(&netstamp_needed); } EXPORT_SYMBOL(net_enable_timestamp); void net_disable_timestamp(void) { - atomic_dec(&netstamp_needed); +#ifdef HAVE_JUMP_LABEL + if (in_interrupt()) { + atomic_inc(&netstamp_needed_deferred); + return; + } +#endif + jump_label_dec(&netstamp_needed); } EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { - if (atomic_read(&netstamp_needed)) + skb->tstamp.tv64 = 0; + if (static_branch(&netstamp_needed)) __net_timestamp(skb); - else - skb->tstamp.tv64 = 0; } -static inline void net_timestamp_check(struct sk_buff *skb) +#define net_timestamp_check(COND, SKB) \ + if (static_branch(&netstamp_needed)) { \ + if ((COND) && !(SKB)->tstamp.tv64) \ + __net_timestamp(SKB); \ + } \ + +static int net_hwtstamp_validate(struct ifreq *ifr) { - if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed)) - __net_timestamp(skb); + struct hwtstamp_config cfg; + enum hwtstamp_tx_types tx_type; + enum hwtstamp_rx_filters rx_filter; + int tx_type_valid = 0; + int rx_filter_valid = 0; + + if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) + return -EFAULT; + + if (cfg.flags) /* reserved for future extensions */ + return -EINVAL; + + tx_type = cfg.tx_type; + rx_filter = cfg.rx_filter; + + switch (tx_type) { + case HWTSTAMP_TX_OFF: + case HWTSTAMP_TX_ON: + case HWTSTAMP_TX_ONESTEP_SYNC: + tx_type_valid = 1; + break; + } + + switch (rx_filter) { + case HWTSTAMP_FILTER_NONE: + case HWTSTAMP_FILTER_ALL: + case HWTSTAMP_FILTER_SOME: + case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + rx_filter_valid = 1; + break; + } + + if (!tx_type_valid || !rx_filter_valid) + return -ERANGE; + + return 0; } static inline bool is_skb_forwardable(struct net_device *dev, @@ -1868,7 +1936,8 @@ EXPORT_SYMBOL(skb_checksum_help); * It may return NULL if the skb requires no segmentation. This is * only possible when GSO is used for verifying header integrity. */ -struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features) +struct sk_buff *skb_gso_segment(struct sk_buff *skb, + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); struct packet_type *ptype; @@ -1898,9 +1967,9 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features) if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo) dev->ethtool_ops->get_drvinfo(dev, &info); - WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n", - info.driver, dev ? dev->features : 0L, - skb->sk ? skb->sk->sk_route_caps : 0L, + WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d ip_summed=%d\n", + info.driver, dev ? &dev->features : NULL, + skb->sk ? &skb->sk->sk_route_caps : NULL, skb->len, skb->data_len, skb->ip_summed); if (skb_header_cloned(skb) && @@ -1955,9 +2024,11 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) #ifdef CONFIG_HIGHMEM int i; if (!(dev->features & NETIF_F_HIGHDMA)) { - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - if (PageHighMem(skb_shinfo(skb)->frags[i].page)) + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + if (PageHighMem(skb_frag_page(frag))) return 1; + } } if (PCI_DMA_BUS_IS_PHYS) { @@ -1966,7 +2037,8 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) if (!pdev) return 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page); + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + dma_addr_t addr = page_to_phys(skb_frag_page(frag)); if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) return 1; } @@ -2006,7 +2078,7 @@ static void dev_gso_skb_destructor(struct sk_buff *skb) * This function segments the given skb and stores the list of segments * in skb->next. */ -static int dev_gso_segment(struct sk_buff *skb, int features) +static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs; @@ -2045,7 +2117,7 @@ static inline void skb_orphan_try(struct sk_buff *skb) } } -static bool can_checksum_protocol(unsigned long features, __be16 protocol) +static bool can_checksum_protocol(netdev_features_t features, __be16 protocol) { return ((features & NETIF_F_GEN_CSUM) || ((features & NETIF_F_V4_CSUM) && @@ -2056,7 +2128,8 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol) protocol == htons(ETH_P_FCOE))); } -static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features) +static netdev_features_t harmonize_features(struct sk_buff *skb, + __be16 protocol, netdev_features_t features) { if (!can_checksum_protocol(features, protocol)) { features &= ~NETIF_F_ALL_CSUM; @@ -2068,10 +2141,10 @@ static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features return features; } -u32 netif_skb_features(struct sk_buff *skb) +netdev_features_t netif_skb_features(struct sk_buff *skb) { __be16 protocol = skb->protocol; - u32 features = skb->dev->features; + netdev_features_t features = skb->dev->features; if (protocol == htons(ETH_P_8021Q)) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; @@ -2117,7 +2190,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, unsigned int skb_len; if (likely(!skb->next)) { - u32 features; + netdev_features_t features; /* * If device doesn't need skb->dst, release it right now while @@ -2198,7 +2271,7 @@ gso: return rc; } txq_trans_update(txq); - if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) + if (unlikely(netif_xmit_stopped(txq) && skb->next)) return NETDEV_TX_BUSY; } while (skb->next); @@ -2398,6 +2471,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, return rc; } +#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) +static void skb_update_prio(struct sk_buff *skb) +{ + struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); + + if ((!skb->priority) && (skb->sk) && map) + skb->priority = map->priomap[skb->sk->sk_cgrp_prioidx]; +} +#else +#define skb_update_prio(skb) +#endif + static DEFINE_PER_CPU(int, xmit_recursion); #define RECURSION_LIMIT 10 @@ -2438,6 +2523,8 @@ int dev_queue_xmit(struct sk_buff *skb) */ rcu_read_lock_bh(); + skb_update_prio(skb); + txq = dev_pick_tx(dev, skb); q = rcu_dereference_bh(txq->qdisc); @@ -2472,7 +2559,7 @@ int dev_queue_xmit(struct sk_buff *skb) HARD_TX_LOCK(dev, txq, cpu); - if (!netif_tx_queue_stopped(txq)) { + if (!netif_xmit_stopped(txq)) { __this_cpu_inc(xmit_recursion); rc = dev_hard_start_xmit(skb, dev, txq); __this_cpu_dec(xmit_recursion); @@ -2527,72 +2614,35 @@ static inline void ____napi_schedule(struct softnet_data *sd, /* * __skb_get_rxhash: calculate a flow hash based on src/dst addresses - * and src/dst port numbers. Returns a non-zero hash number on success - * and 0 on failure. + * and src/dst port numbers. Sets rxhash in skb to non-zero hash value + * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb + * if hash is a canonical 4-tuple hash over transport ports. */ -__u32 __skb_get_rxhash(struct sk_buff *skb) -{ - int nhoff, hash = 0, poff; - const struct ipv6hdr *ip6; - const struct iphdr *ip; - u8 ip_proto; - u32 addr1, addr2, ihl; - union { - u32 v32; - u16 v16[2]; - } ports; - - nhoff = skb_network_offset(skb); - - switch (skb->protocol) { - case __constant_htons(ETH_P_IP): - if (!pskb_may_pull(skb, sizeof(*ip) + nhoff)) - goto done; +void __skb_get_rxhash(struct sk_buff *skb) +{ + struct flow_keys keys; + u32 hash; - ip = (const struct iphdr *) (skb->data + nhoff); - if (ip_is_fragment(ip)) - ip_proto = 0; - else - ip_proto = ip->protocol; - addr1 = (__force u32) ip->saddr; - addr2 = (__force u32) ip->daddr; - ihl = ip->ihl; - break; - case __constant_htons(ETH_P_IPV6): - if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff)) - goto done; + if (!skb_flow_dissect(skb, &keys)) + return; - ip6 = (const struct ipv6hdr *) (skb->data + nhoff); - ip_proto = ip6->nexthdr; - addr1 = (__force u32) ip6->saddr.s6_addr32[3]; - addr2 = (__force u32) ip6->daddr.s6_addr32[3]; - ihl = (40 >> 2); - break; - default: - goto done; - } - - ports.v32 = 0; - poff = proto_ports_offset(ip_proto); - if (poff >= 0) { - nhoff += ihl * 4 + poff; - if (pskb_may_pull(skb, nhoff + 4)) { - ports.v32 = * (__force u32 *) (skb->data + nhoff); - if (ports.v16[1] < ports.v16[0]) - swap(ports.v16[0], ports.v16[1]); - } + if (keys.ports) { + if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]) + swap(keys.port16[0], keys.port16[1]); + skb->l4_rxhash = 1; } /* get a consistent hash (same value on both flow directions) */ - if (addr2 < addr1) - swap(addr1, addr2); + if ((__force u32)keys.dst < (__force u32)keys.src) + swap(keys.dst, keys.src); - hash = jhash_3words(addr1, addr2, ports.v32, hashrnd); + hash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src, + (__force u32)keys.ports, hashrnd); if (!hash) hash = 1; -done: - return hash; + skb->rxhash = hash; } EXPORT_SYMBOL(__skb_get_rxhash); @@ -2602,14 +2652,13 @@ EXPORT_SYMBOL(__skb_get_rxhash); struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; EXPORT_SYMBOL(rps_sock_flow_table); +struct jump_label_key rps_needed __read_mostly; + static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) { - u16 tcpu; - - tcpu = rflow->cpu = next_cpu; - if (tcpu != RPS_NO_CPU) { + if (next_cpu != RPS_NO_CPU) { #ifdef CONFIG_RFS_ACCEL struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; @@ -2637,16 +2686,16 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, goto out; old_rflow = rflow; rflow = &flow_table->flows[flow_id]; - rflow->cpu = next_cpu; rflow->filter = rc; if (old_rflow->filter == rflow->filter) old_rflow->filter = RPS_NO_FILTER; out: #endif rflow->last_qtail = - per_cpu(softnet_data, tcpu).input_queue_head; + per_cpu(softnet_data, next_cpu).input_queue_head; } + rflow->cpu = next_cpu; return rflow; } @@ -2681,13 +2730,13 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, map = rcu_dereference(rxqueue->rps_map); if (map) { if (map->len == 1 && - !rcu_dereference_raw(rxqueue->rps_flow_table)) { + !rcu_access_pointer(rxqueue->rps_flow_table)) { tcpu = map->cpus[0]; if (cpu_online(tcpu)) cpu = tcpu; goto done; } - } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) { + } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) { goto done; } @@ -2884,12 +2933,11 @@ int netif_rx(struct sk_buff *skb) if (netpoll_rx(skb)) return NET_RX_DROP; - if (netdev_tstamp_prequeue) - net_timestamp_check(skb); + net_timestamp_check(netdev_tstamp_prequeue, skb); trace_netif_rx(skb); #ifdef CONFIG_RPS - { + if (static_branch(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; @@ -2904,14 +2952,13 @@ int netif_rx(struct sk_buff *skb) rcu_read_unlock(); preempt_enable(); - } -#else + } else +#endif { unsigned int qtail; ret = enqueue_to_backlog(skb, get_cpu(), &qtail); put_cpu(); } -#endif return ret; } EXPORT_SYMBOL(netif_rx); @@ -3102,8 +3149,8 @@ void netdev_rx_handler_unregister(struct net_device *dev) { ASSERT_RTNL(); - rcu_assign_pointer(dev->rx_handler, NULL); - rcu_assign_pointer(dev->rx_handler_data, NULL); + RCU_INIT_POINTER(dev->rx_handler, NULL); + RCU_INIT_POINTER(dev->rx_handler_data, NULL); } EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); @@ -3117,8 +3164,7 @@ static int __netif_receive_skb(struct sk_buff *skb) int ret = NET_RX_DROP; __be16 type; - if (!netdev_tstamp_prequeue) - net_timestamp_check(skb); + net_timestamp_check(!netdev_tstamp_prequeue, skb); trace_netif_receive_skb(skb); @@ -3171,6 +3217,17 @@ ncls: #endif rx_handler = rcu_dereference(skb->dev->rx_handler); + if (vlan_tx_tag_present(skb)) { + if (pt_prev) { + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = NULL; + } + if (vlan_do_receive(&skb, !rx_handler)) + goto another_round; + else if (unlikely(!skb)) + goto out; + } + if (rx_handler) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); @@ -3190,18 +3247,6 @@ ncls: } } - if (vlan_tx_tag_present(skb)) { - if (pt_prev) { - ret = deliver_skb(skb, pt_prev, orig_dev); - pt_prev = NULL; - } - if (vlan_do_receive(&skb)) { - ret = __netif_receive_skb(skb); - goto out; - } else if (unlikely(!skb)) - goto out; - } - /* deliver only exact match when indicated */ null_or_dev = deliver_exact ? skb->dev : NULL; @@ -3250,14 +3295,13 @@ out: */ int netif_receive_skb(struct sk_buff *skb) { - if (netdev_tstamp_prequeue) - net_timestamp_check(skb); + net_timestamp_check(netdev_tstamp_prequeue, skb); if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; #ifdef CONFIG_RPS - { + if (static_branch(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu, ret; @@ -3268,16 +3312,12 @@ int netif_receive_skb(struct sk_buff *skb) if (cpu >= 0) { ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); - } else { - rcu_read_unlock(); - ret = __netif_receive_skb(skb); + return ret; } - - return ret; + rcu_read_unlock(); } -#else - return __netif_receive_skb(skb); #endif + return __netif_receive_skb(skb); } EXPORT_SYMBOL(netif_receive_skb); @@ -3429,10 +3469,10 @@ pull: skb->data_len -= grow; skb_shinfo(skb)->frags[0].page_offset += grow; - skb_shinfo(skb)->frags[0].size -= grow; + skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow); - if (unlikely(!skb_shinfo(skb)->frags[0].size)) { - put_page(skb_shinfo(skb)->frags[0].page); + if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) { + skb_frag_unref(skb, 0); memmove(skb_shinfo(skb)->frags, skb_shinfo(skb)->frags + 1, --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); @@ -3496,11 +3536,10 @@ void skb_gro_reset_offset(struct sk_buff *skb) NAPI_GRO_CB(skb)->frag0_len = 0; if (skb->mac_header == skb->tail && - !PageHighMem(skb_shinfo(skb)->frags[0].page)) { + !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) { NAPI_GRO_CB(skb)->frag0 = - page_address(skb_shinfo(skb)->frags[0].page) + - skb_shinfo(skb)->frags[0].page_offset; - NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size; + skb_frag_address(&skb_shinfo(skb)->frags[0]); + NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]); } } EXPORT_SYMBOL(skb_gro_reset_offset); @@ -3982,6 +4021,60 @@ static int dev_ifconf(struct net *net, char __user *arg) } #ifdef CONFIG_PROC_FS + +#define BUCKET_SPACE (32 - NETDEV_HASHBITS) + +struct dev_iter_state { + struct seq_net_private p; + unsigned int pos; /* bucket << BUCKET_SPACE + offset */ +}; + +#define get_bucket(x) ((x) >> BUCKET_SPACE) +#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1)) +#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) + +static inline struct net_device *dev_from_same_bucket(struct seq_file *seq) +{ + struct dev_iter_state *state = seq->private; + struct net *net = seq_file_net(seq); + struct net_device *dev; + struct hlist_node *p; + struct hlist_head *h; + unsigned int count, bucket, offset; + + bucket = get_bucket(state->pos); + offset = get_offset(state->pos); + h = &net->dev_name_head[bucket]; + count = 0; + hlist_for_each_entry_rcu(dev, p, h, name_hlist) { + if (count++ == offset) { + state->pos = set_bucket_offset(bucket, count); + return dev; + } + } + + return NULL; +} + +static inline struct net_device *dev_from_new_bucket(struct seq_file *seq) +{ + struct dev_iter_state *state = seq->private; + struct net_device *dev; + unsigned int bucket; + + bucket = get_bucket(state->pos); + do { + dev = dev_from_same_bucket(seq); + if (dev) + return dev; + + bucket++; + state->pos = set_bucket_offset(bucket, 0); + } while (bucket < NETDEV_HASHENTRIES); + + return NULL; +} + /* * This is invoked by the /proc filesystem handler to display a device * in detail. @@ -3989,33 +4082,33 @@ static int dev_ifconf(struct net *net, char __user *arg) void *dev_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { - struct net *net = seq_file_net(seq); - loff_t off; - struct net_device *dev; + struct dev_iter_state *state = seq->private; rcu_read_lock(); if (!*pos) return SEQ_START_TOKEN; - off = 1; - for_each_netdev_rcu(net, dev) - if (off++ == *pos) - return dev; + /* check for end of the hash */ + if (state->pos == 0 && *pos > 1) + return NULL; - return NULL; + return dev_from_new_bucket(seq); } void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct net_device *dev = v; + struct net_device *dev; + + ++*pos; if (v == SEQ_START_TOKEN) - dev = first_net_device_rcu(seq_file_net(seq)); - else - dev = next_net_device_rcu(dev); + return dev_from_new_bucket(seq); - ++*pos; - return dev; + dev = dev_from_same_bucket(seq); + if (dev) + return dev; + + return dev_from_new_bucket(seq); } void dev_seq_stop(struct seq_file *seq, void *v) @@ -4114,7 +4207,13 @@ static const struct seq_operations dev_seq_ops = { static int dev_seq_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &dev_seq_ops, - sizeof(struct seq_net_private)); + sizeof(struct dev_iter_state)); +} + +int dev_seq_open_ops(struct inode *inode, struct file *file, + const struct seq_operations *ops) +{ + return seq_open_net(inode, file, ops, sizeof(struct dev_iter_state)); } static const struct file_operations dev_seq_fops = { @@ -4367,7 +4466,7 @@ static void dev_change_rx_flags(struct net_device *dev, int flags) static int __dev_set_promiscuity(struct net_device *dev, int inc) { - unsigned short old_flags = dev->flags; + unsigned int old_flags = dev->flags; uid_t uid; gid_t gid; @@ -4424,7 +4523,7 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc) */ int dev_set_promiscuity(struct net_device *dev, int inc) { - unsigned short old_flags = dev->flags; + unsigned int old_flags = dev->flags; int err; err = __dev_set_promiscuity(dev, inc); @@ -4451,7 +4550,7 @@ EXPORT_SYMBOL(dev_set_promiscuity); int dev_set_allmulti(struct net_device *dev, int inc) { - unsigned short old_flags = dev->flags; + unsigned int old_flags = dev->flags; ASSERT_RTNL(); @@ -4497,9 +4596,7 @@ void __dev_set_rx_mode(struct net_device *dev) if (!netif_device_present(dev)) return; - if (ops->ndo_set_rx_mode) - ops->ndo_set_rx_mode(dev); - else { + if (!(dev->priv_flags & IFF_UNICAST_FLT)) { /* Unicast addresses changes may only happen under the rtnl, * therefore calling __dev_set_promiscuity here is safe. */ @@ -4510,10 +4607,10 @@ void __dev_set_rx_mode(struct net_device *dev) __dev_set_promiscuity(dev, -1); dev->uc_promisc = false; } - - if (ops->ndo_set_multicast_list) - ops->ndo_set_multicast_list(dev); } + + if (ops->ndo_set_rx_mode) + ops->ndo_set_rx_mode(dev); } void dev_set_rx_mode(struct net_device *dev) @@ -4524,30 +4621,6 @@ void dev_set_rx_mode(struct net_device *dev) } /** - * dev_ethtool_get_settings - call device's ethtool_ops::get_settings() - * @dev: device - * @cmd: memory area for ethtool_ops::get_settings() result - * - * The cmd arg is initialized properly (cleared and - * ethtool_cmd::cmd field set to ETHTOOL_GSET). - * - * Return device's ethtool_ops::get_settings() result value or - * -EOPNOTSUPP when device doesn't expose - * ethtool_ops::get_settings() operation. - */ -int dev_ethtool_get_settings(struct net_device *dev, - struct ethtool_cmd *cmd) -{ - if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings) - return -EOPNOTSUPP; - - memset(cmd, 0, sizeof(struct ethtool_cmd)); - cmd->cmd = ETHTOOL_GSET; - return dev->ethtool_ops->get_settings(dev, cmd); -} -EXPORT_SYMBOL(dev_ethtool_get_settings); - -/** * dev_get_flags - get flags reported to userspace * @dev: device * @@ -4580,7 +4653,7 @@ EXPORT_SYMBOL(dev_get_flags); int __dev_change_flags(struct net_device *dev, unsigned int flags) { - int old_flags = dev->flags; + unsigned int old_flags = dev->flags; int ret; ASSERT_RTNL(); @@ -4663,10 +4736,10 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) * Change settings on device based state flags. The flags are * in the userspace exported format. */ -int dev_change_flags(struct net_device *dev, unsigned flags) +int dev_change_flags(struct net_device *dev, unsigned int flags) { - int ret, changes; - int old_flags = dev->flags; + int ret; + unsigned int changes, old_flags = dev->flags; ret = __dev_change_flags(dev, flags); if (ret < 0) @@ -4863,7 +4936,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) return -EOPNOTSUPP; case SIOCADDMULTI: - if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || + if (!ops->ndo_set_rx_mode || ifr->ifr_hwaddr.sa_family != AF_UNSPEC) return -EINVAL; if (!netif_device_present(dev)) @@ -4871,7 +4944,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); case SIOCDELMULTI: - if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || + if (!ops->ndo_set_rx_mode || ifr->ifr_hwaddr.sa_family != AF_UNSPEC) return -EINVAL; if (!netif_device_present(dev)) @@ -4888,6 +4961,12 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) ifr->ifr_newname[IFNAMSIZ-1] = '\0'; return dev_change_name(dev, ifr->ifr_newname); + case SIOCSHWTSTAMP: + err = net_hwtstamp_validate(ifr); + if (err) + return err; + /* fall through */ + /* * Unknown or private ioctl */ @@ -5202,7 +5281,7 @@ static void rollback_registered_many(struct list_head *head) dev = list_first_entry(head, struct net_device, unreg_list); call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); - rcu_barrier(); + synchronize_net(); list_for_each_entry(dev, head, unreg_list) dev_put(dev); @@ -5217,7 +5296,8 @@ static void rollback_registered(struct net_device *dev) list_del(&single); } -static u32 netdev_fix_features(struct net_device *dev, u32 features) +static netdev_features_t netdev_fix_features(struct net_device *dev, + netdev_features_t features) { /* Fix illegal checksum combinations */ if ((features & NETIF_F_HW_CSUM) && @@ -5226,12 +5306,6 @@ static u32 netdev_fix_features(struct net_device *dev, u32 features) features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); } - if ((features & NETIF_F_NO_CSUM) && - (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { - netdev_warn(dev, "mixed no checksumming and other settings.\n"); - features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM); - } - /* Fix illegal SG+CSUM combinations. */ if ((features & NETIF_F_SG) && !(features & NETIF_F_ALL_CSUM)) { @@ -5279,7 +5353,7 @@ static u32 netdev_fix_features(struct net_device *dev, u32 features) int __netdev_update_features(struct net_device *dev) { - u32 features; + netdev_features_t features; int err = 0; ASSERT_RTNL(); @@ -5295,16 +5369,16 @@ int __netdev_update_features(struct net_device *dev) if (dev->features == features) return 0; - netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n", - dev->features, features); + netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", + &dev->features, &features); if (dev->netdev_ops->ndo_set_features) err = dev->netdev_ops->ndo_set_features(dev, features); if (unlikely(err < 0)) { netdev_err(dev, - "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n", - err, features, dev->features); + "set_features() failed (%d); wanted %pNF, left %pNF\n", + err, &features, &dev->features); return -1; } @@ -5403,6 +5477,9 @@ static void netdev_init_one_queue(struct net_device *dev, queue->xmit_lock_owner = -1; netdev_queue_numa_node_write(queue, NUMA_NO_NODE); queue->dev = dev; +#ifdef CONFIG_BQL + dql_init(&queue->dql, HZ); +#endif } static int netif_alloc_netdev_queues(struct net_device *dev) @@ -5488,11 +5565,12 @@ int register_netdevice(struct net_device *dev) dev->wanted_features = dev->features & dev->hw_features; /* Turn on no cache copy if HW is doing checksum */ - dev->hw_features |= NETIF_F_NOCACHE_COPY; - if ((dev->features & NETIF_F_ALL_CSUM) && - !(dev->features & NETIF_F_NO_CSUM)) { - dev->wanted_features |= NETIF_F_NOCACHE_COPY; - dev->features |= NETIF_F_NOCACHE_COPY; + if (!(dev->flags & IFF_LOOPBACK)) { + dev->hw_features |= NETIF_F_NOCACHE_COPY; + if (dev->features & NETIF_F_ALL_CSUM) { + dev->wanted_features |= NETIF_F_NOCACHE_COPY; + dev->features |= NETIF_F_NOCACHE_COPY; + } } /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. @@ -5715,6 +5793,12 @@ void netdev_run_todo(void) __rtnl_unlock(); + /* Wait for rcu callbacks to finish before attempting to drain + * the device list. This usually avoids a 250ms wait. + */ + if (!list_empty(&list)) + rcu_barrier(); + while (!list_empty(&list)) { struct net_device *dev = list_first_entry(&list, struct net_device, todo_list); @@ -5735,8 +5819,8 @@ void netdev_run_todo(void) /* paranoia */ BUG_ON(netdev_refcnt_read(dev)); - WARN_ON(rcu_dereference_raw(dev->ip_ptr)); - WARN_ON(rcu_dereference_raw(dev->ip6_ptr)); + WARN_ON(rcu_access_pointer(dev->ip_ptr)); + WARN_ON(rcu_access_pointer(dev->ip6_ptr)); WARN_ON(dev->dn_ptr); if (dev->destructor) @@ -5940,7 +6024,7 @@ void free_netdev(struct net_device *dev) kfree(dev->_rx); #endif - kfree(rcu_dereference_raw(dev->ingress_queue)); + kfree(rcu_dereference_protected(dev->ingress_queue, 1)); /* Flush device addresses */ dev_addr_flush(dev); @@ -6115,6 +6199,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); /* * Flush the unicast and multicast chains @@ -6221,7 +6306,8 @@ static int dev_cpu_callback(struct notifier_block *nfb, * @one to the master device with current feature set @all. Will not * enable anything that is off in @mask. Returns the new feature set. */ -u32 netdev_increment_features(u32 all, u32 one, u32 mask) +netdev_features_t netdev_increment_features(netdev_features_t all, + netdev_features_t one, netdev_features_t mask) { if (mask & NETIF_F_GEN_CSUM) mask |= NETIF_F_ALL_CSUM; @@ -6230,10 +6316,6 @@ u32 netdev_increment_features(u32 all, u32 one, u32 mask) all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask; all &= one | ~NETIF_F_ALL_FOR_ALL; - /* If device needs checksumming, downgrade to it. */ - if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM)) - all &= ~NETIF_F_NO_CSUM; - /* If one device supports hw checksumming, set for all. */ if (all & NETIF_F_GEN_CSUM) all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); @@ -6298,7 +6380,7 @@ const char *netdev_drivername(const struct net_device *dev) return empty; } -static int __netdev_printk(const char *level, const struct net_device *dev, +int __netdev_printk(const char *level, const struct net_device *dev, struct va_format *vaf) { int r; @@ -6313,6 +6395,7 @@ static int __netdev_printk(const char *level, const struct net_device *dev, return r; } +EXPORT_SYMBOL(__netdev_printk); int netdev_printk(const char *level, const struct net_device *dev, const char *format, ...) diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index e2e66939ed0..29c07fef922 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -13,6 +13,7 @@ #include <linux/netdevice.h> #include <linux/rtnetlink.h> +#include <linux/export.h> #include <linux/list.h> #include <linux/proc_fs.h> @@ -426,7 +427,7 @@ EXPORT_SYMBOL(dev_uc_del); * * Add newly added addresses to the destination device and release * addresses that have no users left. The source device must be - * locked by netif_tx_lock_bh. + * locked by netif_addr_lock_bh. * * This function is intended to be called from the dev->set_rx_mode * function of layered software devices. @@ -438,11 +439,11 @@ int dev_uc_sync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock_bh(to); + netif_addr_lock_nested(to); err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); - netif_addr_unlock_bh(to); + netif_addr_unlock(to); return err; } EXPORT_SYMBOL(dev_uc_sync); @@ -462,7 +463,7 @@ void dev_uc_unsync(struct net_device *to, struct net_device *from) return; netif_addr_lock_bh(from); - netif_addr_lock(to); + netif_addr_lock_nested(to); __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); @@ -589,10 +590,10 @@ EXPORT_SYMBOL(dev_mc_del_global); * * Add newly added addresses to the destination device and release * addresses that have no users left. The source device must be - * locked by netif_tx_lock_bh. + * locked by netif_addr_lock_bh. * - * This function is intended to be called from the dev->set_multicast_list - * or dev->set_rx_mode function of layered software devices. + * This function is intended to be called from the ndo_set_rx_mode + * function of layered software devices. */ int dev_mc_sync(struct net_device *to, struct net_device *from) { @@ -601,11 +602,11 @@ int dev_mc_sync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock_bh(to); + netif_addr_lock_nested(to); err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); - netif_addr_unlock_bh(to); + netif_addr_unlock(to); return err; } EXPORT_SYMBOL(dev_mc_sync); @@ -625,7 +626,7 @@ void dev_mc_unsync(struct net_device *to, struct net_device *from) return; netif_addr_lock_bh(from); - netif_addr_lock(to); + netif_addr_lock_nested(to); __hw_addr_unsync(&to->mc, &from->mc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); @@ -695,8 +696,7 @@ static const struct seq_operations dev_mc_seq_ops = { static int dev_mc_seq_open(struct inode *inode, struct file *file) { - return seq_open_net(inode, file, &dev_mc_seq_ops, - sizeof(struct seq_net_private)); + return dev_seq_open_ops(inode, file, &dev_mc_seq_ops); } static const struct file_operations dev_mc_seq_fops = { diff --git a/net/core/dst.c b/net/core/dst.c index 14b33baf073..43d94cedbf7 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -171,7 +171,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, dst_init_metrics(dst, dst_default_metrics, true); dst->expires = 0UL; dst->path = dst; - dst->_neighbour = NULL; + RCU_INIT_POINTER(dst->_neighbour, NULL); #ifdef CONFIG_XFRM dst->xfrm = NULL; #endif @@ -229,11 +229,11 @@ struct dst_entry *dst_destroy(struct dst_entry * dst) smp_rmb(); again: - neigh = dst->_neighbour; + neigh = rcu_dereference_protected(dst->_neighbour, 1); child = dst->child; if (neigh) { - dst->_neighbour = NULL; + RCU_INIT_POINTER(dst->_neighbour, NULL); neigh_release(neigh); } @@ -360,14 +360,19 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, if (!unregister) { dst->input = dst->output = dst_discard; } else { + struct neighbour *neigh; + dst->dev = dev_net(dst->dev)->loopback_dev; dev_hold(dst->dev); dev_put(dev); - if (dst->_neighbour && dst->_neighbour->dev == dev) { - dst->_neighbour->dev = dst->dev; + rcu_read_lock(); + neigh = dst_get_neighbour_noref(dst); + if (neigh && neigh->dev == dev) { + neigh->dev = dst->dev; dev_hold(dst->dev); dev_put(dev); } + rcu_read_unlock(); } } diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 6cdba5fc2be..921aa2b4b41 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -36,235 +36,44 @@ u32 ethtool_op_get_link(struct net_device *dev) } EXPORT_SYMBOL(ethtool_op_get_link); -u32 ethtool_op_get_tx_csum(struct net_device *dev) -{ - return (dev->features & NETIF_F_ALL_CSUM) != 0; -} -EXPORT_SYMBOL(ethtool_op_get_tx_csum); - -int ethtool_op_set_tx_csum(struct net_device *dev, u32 data) -{ - if (data) - dev->features |= NETIF_F_IP_CSUM; - else - dev->features &= ~NETIF_F_IP_CSUM; - - return 0; -} -EXPORT_SYMBOL(ethtool_op_set_tx_csum); - -int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data) -{ - if (data) - dev->features |= NETIF_F_HW_CSUM; - else - dev->features &= ~NETIF_F_HW_CSUM; - - return 0; -} -EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum); - -int ethtool_op_set_tx_ipv6_csum(struct net_device *dev, u32 data) -{ - if (data) - dev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; - else - dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); - - return 0; -} -EXPORT_SYMBOL(ethtool_op_set_tx_ipv6_csum); - -u32 ethtool_op_get_sg(struct net_device *dev) -{ - return (dev->features & NETIF_F_SG) != 0; -} -EXPORT_SYMBOL(ethtool_op_get_sg); - -int ethtool_op_set_sg(struct net_device *dev, u32 data) -{ - if (data) - dev->features |= NETIF_F_SG; - else - dev->features &= ~NETIF_F_SG; - - return 0; -} -EXPORT_SYMBOL(ethtool_op_set_sg); - -u32 ethtool_op_get_tso(struct net_device *dev) -{ - return (dev->features & NETIF_F_TSO) != 0; -} -EXPORT_SYMBOL(ethtool_op_get_tso); - -int ethtool_op_set_tso(struct net_device *dev, u32 data) -{ - if (data) - dev->features |= NETIF_F_TSO; - else - dev->features &= ~NETIF_F_TSO; - - return 0; -} -EXPORT_SYMBOL(ethtool_op_set_tso); - -u32 ethtool_op_get_ufo(struct net_device *dev) -{ - return (dev->features & NETIF_F_UFO) != 0; -} -EXPORT_SYMBOL(ethtool_op_get_ufo); - -int ethtool_op_set_ufo(struct net_device *dev, u32 data) -{ - if (data) - dev->features |= NETIF_F_UFO; - else - dev->features &= ~NETIF_F_UFO; - return 0; -} -EXPORT_SYMBOL(ethtool_op_set_ufo); - -/* the following list of flags are the same as their associated - * NETIF_F_xxx values in include/linux/netdevice.h - */ -static const u32 flags_dup_features = - (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | ETH_FLAG_NTUPLE | - ETH_FLAG_RXHASH); - -u32 ethtool_op_get_flags(struct net_device *dev) -{ - /* in the future, this function will probably contain additional - * handling for flags which are not so easily handled - * by a simple masking operation - */ - - return dev->features & flags_dup_features; -} -EXPORT_SYMBOL(ethtool_op_get_flags); - -/* Check if device can enable (or disable) particular feature coded in "data" - * argument. Flags "supported" describe features that can be toggled by device. - * If feature can not be toggled, it state (enabled or disabled) must match - * hardcoded device features state, otherwise flags are marked as invalid. - */ -bool ethtool_invalid_flags(struct net_device *dev, u32 data, u32 supported) -{ - u32 features = dev->features & flags_dup_features; - /* "data" can contain only flags_dup_features bits, - * see __ethtool_set_flags */ - - return (features & ~supported) != (data & ~supported); -} -EXPORT_SYMBOL(ethtool_invalid_flags); - -int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported) -{ - if (ethtool_invalid_flags(dev, data, supported)) - return -EINVAL; - - dev->features = ((dev->features & ~flags_dup_features) | - (data & flags_dup_features)); - return 0; -} -EXPORT_SYMBOL(ethtool_op_set_flags); - /* Handlers for each ethtool command */ -#define ETHTOOL_DEV_FEATURE_WORDS 1 - -static void ethtool_get_features_compat(struct net_device *dev, - struct ethtool_get_features_block *features) -{ - if (!dev->ethtool_ops) - return; - - /* getting RX checksum */ - if (dev->ethtool_ops->get_rx_csum) - if (dev->ethtool_ops->get_rx_csum(dev)) - features[0].active |= NETIF_F_RXCSUM; - - /* mark legacy-changeable features */ - if (dev->ethtool_ops->set_sg) - features[0].available |= NETIF_F_SG; - if (dev->ethtool_ops->set_tx_csum) - features[0].available |= NETIF_F_ALL_CSUM; - if (dev->ethtool_ops->set_tso) - features[0].available |= NETIF_F_ALL_TSO; - if (dev->ethtool_ops->set_rx_csum) - features[0].available |= NETIF_F_RXCSUM; - if (dev->ethtool_ops->set_flags) - features[0].available |= flags_dup_features; -} - -static int ethtool_set_feature_compat(struct net_device *dev, - int (*legacy_set)(struct net_device *, u32), - struct ethtool_set_features_block *features, u32 mask) -{ - u32 do_set; - - if (!legacy_set) - return 0; - - if (!(features[0].valid & mask)) - return 0; - - features[0].valid &= ~mask; - - do_set = !!(features[0].requested & mask); - - if (legacy_set(dev, do_set) < 0) - netdev_info(dev, - "Legacy feature change (%s) failed for 0x%08x\n", - do_set ? "set" : "clear", mask); - - return 1; -} - -static int ethtool_set_flags_compat(struct net_device *dev, - int (*legacy_set)(struct net_device *, u32), - struct ethtool_set_features_block *features, u32 mask) -{ - u32 value; - - if (!legacy_set) - return 0; - - if (!(features[0].valid & mask)) - return 0; - - value = dev->features & ~features[0].valid; - value |= features[0].requested; - - features[0].valid &= ~mask; - - if (legacy_set(dev, value & mask) < 0) - netdev_info(dev, "Legacy flags change failed\n"); - - return 1; -} - -static int ethtool_set_features_compat(struct net_device *dev, - struct ethtool_set_features_block *features) -{ - int compat; - - if (!dev->ethtool_ops) - return 0; - - compat = ethtool_set_feature_compat(dev, dev->ethtool_ops->set_sg, - features, NETIF_F_SG); - compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_tx_csum, - features, NETIF_F_ALL_CSUM); - compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_tso, - features, NETIF_F_ALL_TSO); - compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_rx_csum, - features, NETIF_F_RXCSUM); - compat |= ethtool_set_flags_compat(dev, dev->ethtool_ops->set_flags, - features, flags_dup_features); - - return compat; -} +#define ETHTOOL_DEV_FEATURE_WORDS ((NETDEV_FEATURE_COUNT + 31) / 32) + +static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { + [NETIF_F_SG_BIT] = "tx-scatter-gather", + [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4", + [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic", + [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6", + [NETIF_F_HIGHDMA_BIT] = "highdma", + [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist", + [NETIF_F_HW_VLAN_TX_BIT] = "tx-vlan-hw-insert", + + [NETIF_F_HW_VLAN_RX_BIT] = "rx-vlan-hw-parse", + [NETIF_F_HW_VLAN_FILTER_BIT] = "rx-vlan-filter", + [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged", + [NETIF_F_GSO_BIT] = "tx-generic-segmentation", + [NETIF_F_LLTX_BIT] = "tx-lockless", + [NETIF_F_NETNS_LOCAL_BIT] = "netns-local", + [NETIF_F_GRO_BIT] = "rx-gro", + [NETIF_F_LRO_BIT] = "rx-lro", + + [NETIF_F_TSO_BIT] = "tx-tcp-segmentation", + [NETIF_F_UFO_BIT] = "tx-udp-fragmentation", + [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust", + [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation", + [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation", + [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", + + [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", + [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp", + [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu", + [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter", + [NETIF_F_RXHASH_BIT] = "rx-hashing", + [NETIF_F_RXCSUM_BIT] = "rx-checksum", + [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy", + [NETIF_F_LOOPBACK_BIT] = "loopback", +}; static int ethtool_get_features(struct net_device *dev, void __user *useraddr) { @@ -272,18 +81,21 @@ static int ethtool_get_features(struct net_device *dev, void __user *useraddr) .cmd = ETHTOOL_GFEATURES, .size = ETHTOOL_DEV_FEATURE_WORDS, }; - struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS] = { - { - .available = dev->hw_features, - .requested = dev->wanted_features, - .active = dev->features, - .never_changed = NETIF_F_NEVER_CHANGE, - }, - }; + struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; u32 __user *sizeaddr; u32 copy_size; + int i; + + /* in case feature bits run out again */ + BUILD_BUG_ON(ETHTOOL_DEV_FEATURE_WORDS * sizeof(u32) > sizeof(netdev_features_t)); - ethtool_get_features_compat(dev, features); + for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { + features[i].available = (u32)(dev->hw_features >> (32 * i)); + features[i].requested = (u32)(dev->wanted_features >> (32 * i)); + features[i].active = (u32)(dev->features >> (32 * i)); + features[i].never_changed = + (u32)(NETIF_F_NEVER_CHANGE >> (32 * i)); + } sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size); if (get_user(copy_size, sizeaddr)) @@ -305,7 +117,8 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr) { struct ethtool_sfeatures cmd; struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; - int ret = 0; + netdev_features_t wanted = 0, valid = 0; + int i, ret = 0; if (copy_from_user(&cmd, useraddr, sizeof(cmd))) return -EFAULT; @@ -317,65 +130,29 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr) if (copy_from_user(features, useraddr, sizeof(features))) return -EFAULT; - if (features[0].valid & ~NETIF_F_ETHTOOL_BITS) - return -EINVAL; + for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { + valid |= (netdev_features_t)features[i].valid << (32 * i); + wanted |= (netdev_features_t)features[i].requested << (32 * i); + } - if (ethtool_set_features_compat(dev, features)) - ret |= ETHTOOL_F_COMPAT; + if (valid & ~NETIF_F_ETHTOOL_BITS) + return -EINVAL; - if (features[0].valid & ~dev->hw_features) { - features[0].valid &= dev->hw_features; + if (valid & ~dev->hw_features) { + valid &= dev->hw_features; ret |= ETHTOOL_F_UNSUPPORTED; } - dev->wanted_features &= ~features[0].valid; - dev->wanted_features |= features[0].valid & features[0].requested; + dev->wanted_features &= ~valid; + dev->wanted_features |= wanted & valid; __netdev_update_features(dev); - if ((dev->wanted_features ^ dev->features) & features[0].valid) + if ((dev->wanted_features ^ dev->features) & valid) ret |= ETHTOOL_F_WISH; return ret; } -static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GSTRING_LEN] = { - /* NETIF_F_SG */ "tx-scatter-gather", - /* NETIF_F_IP_CSUM */ "tx-checksum-ipv4", - /* NETIF_F_NO_CSUM */ "tx-checksum-unneeded", - /* NETIF_F_HW_CSUM */ "tx-checksum-ip-generic", - /* NETIF_F_IPV6_CSUM */ "tx-checksum-ipv6", - /* NETIF_F_HIGHDMA */ "highdma", - /* NETIF_F_FRAGLIST */ "tx-scatter-gather-fraglist", - /* NETIF_F_HW_VLAN_TX */ "tx-vlan-hw-insert", - - /* NETIF_F_HW_VLAN_RX */ "rx-vlan-hw-parse", - /* NETIF_F_HW_VLAN_FILTER */ "rx-vlan-filter", - /* NETIF_F_VLAN_CHALLENGED */ "vlan-challenged", - /* NETIF_F_GSO */ "tx-generic-segmentation", - /* NETIF_F_LLTX */ "tx-lockless", - /* NETIF_F_NETNS_LOCAL */ "netns-local", - /* NETIF_F_GRO */ "rx-gro", - /* NETIF_F_LRO */ "rx-lro", - - /* NETIF_F_TSO */ "tx-tcp-segmentation", - /* NETIF_F_UFO */ "tx-udp-fragmentation", - /* NETIF_F_GSO_ROBUST */ "tx-gso-robust", - /* NETIF_F_TSO_ECN */ "tx-tcp-ecn-segmentation", - /* NETIF_F_TSO6 */ "tx-tcp6-segmentation", - /* NETIF_F_FSO */ "tx-fcoe-segmentation", - "", - "", - - /* NETIF_F_FCOE_CRC */ "tx-checksum-fcoe-crc", - /* NETIF_F_SCTP_CSUM */ "tx-checksum-sctp", - /* NETIF_F_FCOE_MTU */ "fcoe-mtu", - /* NETIF_F_NTUPLE */ "rx-ntuple-filter", - /* NETIF_F_RXHASH */ "rx-hashing", - /* NETIF_F_RXCSUM */ "rx-checksum", - /* NETIF_F_NOCACHE_COPY */ "tx-nocache-copy", - /* NETIF_F_LOOPBACK */ "loopback", -}; - static int __ethtool_get_sset_count(struct net_device *dev, int sset) { const struct ethtool_ops *ops = dev->ethtool_ops; @@ -402,7 +179,7 @@ static void __ethtool_get_strings(struct net_device *dev, ops->get_strings(dev, stringset, data); } -static u32 ethtool_get_feature_mask(u32 eth_cmd) +static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd) { /* feature masks of legacy discrete ethtool ops */ @@ -433,151 +210,107 @@ static u32 ethtool_get_feature_mask(u32 eth_cmd) } } -static void *__ethtool_get_one_feature_actor(struct net_device *dev, u32 ethcmd) -{ - const struct ethtool_ops *ops = dev->ethtool_ops; - - if (!ops) - return NULL; - - switch (ethcmd) { - case ETHTOOL_GTXCSUM: - return ops->get_tx_csum; - case ETHTOOL_GRXCSUM: - return ops->get_rx_csum; - case ETHTOOL_SSG: - return ops->get_sg; - case ETHTOOL_STSO: - return ops->get_tso; - case ETHTOOL_SUFO: - return ops->get_ufo; - default: - return NULL; - } -} - -static u32 __ethtool_get_rx_csum_oldbug(struct net_device *dev) -{ - return !!(dev->features & NETIF_F_ALL_CSUM); -} - static int ethtool_get_one_feature(struct net_device *dev, char __user *useraddr, u32 ethcmd) { - u32 mask = ethtool_get_feature_mask(ethcmd); + netdev_features_t mask = ethtool_get_feature_mask(ethcmd); struct ethtool_value edata = { .cmd = ethcmd, .data = !!(dev->features & mask), }; - /* compatibility with discrete get_ ops */ - if (!(dev->hw_features & mask)) { - u32 (*actor)(struct net_device *); - - actor = __ethtool_get_one_feature_actor(dev, ethcmd); - - /* bug compatibility with old get_rx_csum */ - if (ethcmd == ETHTOOL_GRXCSUM && !actor) - actor = __ethtool_get_rx_csum_oldbug; - - if (actor) - edata.data = actor(dev); - } - if (copy_to_user(useraddr, &edata, sizeof(edata))) return -EFAULT; return 0; } -static int __ethtool_set_tx_csum(struct net_device *dev, u32 data); -static int __ethtool_set_rx_csum(struct net_device *dev, u32 data); -static int __ethtool_set_sg(struct net_device *dev, u32 data); -static int __ethtool_set_tso(struct net_device *dev, u32 data); -static int __ethtool_set_ufo(struct net_device *dev, u32 data); - static int ethtool_set_one_feature(struct net_device *dev, void __user *useraddr, u32 ethcmd) { struct ethtool_value edata; - u32 mask; + netdev_features_t mask; if (copy_from_user(&edata, useraddr, sizeof(edata))) return -EFAULT; mask = ethtool_get_feature_mask(ethcmd); mask &= dev->hw_features; - if (mask) { - if (edata.data) - dev->wanted_features |= mask; - else - dev->wanted_features &= ~mask; + if (!mask) + return -EOPNOTSUPP; - __netdev_update_features(dev); - return 0; - } + if (edata.data) + dev->wanted_features |= mask; + else + dev->wanted_features &= ~mask; - /* Driver is not converted to ndo_fix_features or does not - * support changing this offload. In the latter case it won't - * have corresponding ethtool_ops field set. - * - * Following part is to be removed after all drivers advertise - * their changeable features in netdev->hw_features and stop - * using discrete offload setting ops. - */ + __netdev_update_features(dev); - switch (ethcmd) { - case ETHTOOL_STXCSUM: - return __ethtool_set_tx_csum(dev, edata.data); - case ETHTOOL_SRXCSUM: - return __ethtool_set_rx_csum(dev, edata.data); - case ETHTOOL_SSG: - return __ethtool_set_sg(dev, edata.data); - case ETHTOOL_STSO: - return __ethtool_set_tso(dev, edata.data); - case ETHTOOL_SUFO: - return __ethtool_set_ufo(dev, edata.data); - default: - return -EOPNOTSUPP; - } + return 0; } -int __ethtool_set_flags(struct net_device *dev, u32 data) +#define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \ + ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH) +#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_RX | \ + NETIF_F_HW_VLAN_TX | NETIF_F_NTUPLE | NETIF_F_RXHASH) + +static u32 __ethtool_get_flags(struct net_device *dev) { - u32 changed; + u32 flags = 0; + + if (dev->features & NETIF_F_LRO) flags |= ETH_FLAG_LRO; + if (dev->features & NETIF_F_HW_VLAN_RX) flags |= ETH_FLAG_RXVLAN; + if (dev->features & NETIF_F_HW_VLAN_TX) flags |= ETH_FLAG_TXVLAN; + if (dev->features & NETIF_F_NTUPLE) flags |= ETH_FLAG_NTUPLE; + if (dev->features & NETIF_F_RXHASH) flags |= ETH_FLAG_RXHASH; - if (data & ~flags_dup_features) + return flags; +} + +static int __ethtool_set_flags(struct net_device *dev, u32 data) +{ + netdev_features_t features = 0, changed; + + if (data & ~ETH_ALL_FLAGS) return -EINVAL; - /* legacy set_flags() op */ - if (dev->ethtool_ops->set_flags) { - if (unlikely(dev->hw_features & flags_dup_features)) - netdev_warn(dev, - "driver BUG: mixed hw_features and set_flags()\n"); - return dev->ethtool_ops->set_flags(dev, data); - } + if (data & ETH_FLAG_LRO) features |= NETIF_F_LRO; + if (data & ETH_FLAG_RXVLAN) features |= NETIF_F_HW_VLAN_RX; + if (data & ETH_FLAG_TXVLAN) features |= NETIF_F_HW_VLAN_TX; + if (data & ETH_FLAG_NTUPLE) features |= NETIF_F_NTUPLE; + if (data & ETH_FLAG_RXHASH) features |= NETIF_F_RXHASH; /* allow changing only bits set in hw_features */ - changed = (data ^ dev->features) & flags_dup_features; + changed = (features ^ dev->features) & ETH_ALL_FEATURES; if (changed & ~dev->hw_features) return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP; dev->wanted_features = - (dev->wanted_features & ~changed) | (data & dev->hw_features); + (dev->wanted_features & ~changed) | (features & changed); __netdev_update_features(dev); return 0; } -static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) +int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) { - struct ethtool_cmd cmd = { .cmd = ETHTOOL_GSET }; - int err; + ASSERT_RTNL(); - if (!dev->ethtool_ops->get_settings) + if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings) return -EOPNOTSUPP; - err = dev->ethtool_ops->get_settings(dev, &cmd); + memset(cmd, 0, sizeof(struct ethtool_cmd)); + cmd->cmd = ETHTOOL_GSET; + return dev->ethtool_ops->get_settings(dev, cmd); +} +EXPORT_SYMBOL(__ethtool_get_settings); + +static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) +{ + int err; + struct ethtool_cmd cmd; + + err = __ethtool_get_settings(dev, &cmd); if (err < 0) return err; @@ -706,6 +439,7 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, { struct ethtool_rxnfc info; size_t info_size = sizeof(info); + int rc; if (!dev->ethtool_ops->set_rxnfc) return -EOPNOTSUPP; @@ -721,7 +455,15 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, if (copy_from_user(&info, useraddr, info_size)) return -EFAULT; - return dev->ethtool_ops->set_rxnfc(dev, &info); + rc = dev->ethtool_ops->set_rxnfc(dev, &info); + if (rc) + return rc; + + if (cmd == ETHTOOL_SRXCLSRLINS && + copy_to_user(useraddr, &info, info_size)) + return -EFAULT; + + return 0; } static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, @@ -782,34 +524,44 @@ err_out: static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, void __user *useraddr) { - struct ethtool_rxfh_indir *indir; - u32 table_size; - size_t full_size; + u32 user_size, dev_size; + u32 *indir; int ret; - if (!dev->ethtool_ops->get_rxfh_indir) + if (!dev->ethtool_ops->get_rxfh_indir_size || + !dev->ethtool_ops->get_rxfh_indir) + return -EOPNOTSUPP; + dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); + if (dev_size == 0) return -EOPNOTSUPP; - if (copy_from_user(&table_size, + if (copy_from_user(&user_size, useraddr + offsetof(struct ethtool_rxfh_indir, size), - sizeof(table_size))) + sizeof(user_size))) return -EFAULT; - if (table_size > - (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index)) - return -ENOMEM; - full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size; - indir = kzalloc(full_size, GFP_USER); + if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, size), + &dev_size, sizeof(dev_size))) + return -EFAULT; + + /* If the user buffer size is 0, this is just a query for the + * device table size. Otherwise, if it's smaller than the + * device table size it's an error. + */ + if (user_size < dev_size) + return user_size == 0 ? 0 : -EINVAL; + + indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); if (!indir) return -ENOMEM; - indir->cmd = ETHTOOL_GRXFHINDIR; - indir->size = table_size; ret = dev->ethtool_ops->get_rxfh_indir(dev, indir); if (ret) goto out; - if (copy_to_user(useraddr, indir, full_size)) + if (copy_to_user(useraddr + + offsetof(struct ethtool_rxfh_indir, ring_index[0]), + indir, dev_size * sizeof(indir[0]))) ret = -EFAULT; out: @@ -820,30 +572,56 @@ out: static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, void __user *useraddr) { - struct ethtool_rxfh_indir *indir; - u32 table_size; - size_t full_size; + struct ethtool_rxnfc rx_rings; + u32 user_size, dev_size, i; + u32 *indir; int ret; - if (!dev->ethtool_ops->set_rxfh_indir) + if (!dev->ethtool_ops->get_rxfh_indir_size || + !dev->ethtool_ops->set_rxfh_indir || + !dev->ethtool_ops->get_rxnfc) + return -EOPNOTSUPP; + dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); + if (dev_size == 0) return -EOPNOTSUPP; - if (copy_from_user(&table_size, + if (copy_from_user(&user_size, useraddr + offsetof(struct ethtool_rxfh_indir, size), - sizeof(table_size))) + sizeof(user_size))) return -EFAULT; - if (table_size > - (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index)) - return -ENOMEM; - full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size; - indir = kmalloc(full_size, GFP_USER); + if (user_size != 0 && user_size != dev_size) + return -EINVAL; + + indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); if (!indir) return -ENOMEM; - if (copy_from_user(indir, useraddr, full_size)) { - ret = -EFAULT; + rx_rings.cmd = ETHTOOL_GRXRINGS; + ret = dev->ethtool_ops->get_rxnfc(dev, &rx_rings, NULL); + if (ret) goto out; + + if (user_size == 0) { + for (i = 0; i < dev_size; i++) + indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); + } else { + if (copy_from_user(indir, + useraddr + + offsetof(struct ethtool_rxfh_indir, + ring_index[0]), + dev_size * sizeof(indir[0]))) { + ret = -EFAULT; + goto out; + } + + /* Validate ring indices */ + for (i = 0; i < dev_size; i++) { + if (indir[i] >= rx_rings.data) { + ret = -EINVAL; + goto out; + } + } } ret = dev->ethtool_ops->set_rxfh_indir(dev, indir); @@ -853,58 +631,6 @@ out: return ret; } -/* - * ethtool does not (or did not) set masks for flow parameters that are - * not specified, so if both value and mask are 0 then this must be - * treated as equivalent to a mask with all bits set. Implement that - * here rather than in drivers. - */ -static void rx_ntuple_fix_masks(struct ethtool_rx_ntuple_flow_spec *fs) -{ - struct ethtool_tcpip4_spec *entry = &fs->h_u.tcp_ip4_spec; - struct ethtool_tcpip4_spec *mask = &fs->m_u.tcp_ip4_spec; - - if (fs->flow_type != TCP_V4_FLOW && - fs->flow_type != UDP_V4_FLOW && - fs->flow_type != SCTP_V4_FLOW) - return; - - if (!(entry->ip4src | mask->ip4src)) - mask->ip4src = htonl(0xffffffff); - if (!(entry->ip4dst | mask->ip4dst)) - mask->ip4dst = htonl(0xffffffff); - if (!(entry->psrc | mask->psrc)) - mask->psrc = htons(0xffff); - if (!(entry->pdst | mask->pdst)) - mask->pdst = htons(0xffff); - if (!(entry->tos | mask->tos)) - mask->tos = 0xff; - if (!(fs->vlan_tag | fs->vlan_tag_mask)) - fs->vlan_tag_mask = 0xffff; - if (!(fs->data | fs->data_mask)) - fs->data_mask = 0xffffffffffffffffULL; -} - -static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev, - void __user *useraddr) -{ - struct ethtool_rx_ntuple cmd; - const struct ethtool_ops *ops = dev->ethtool_ops; - - if (!ops->set_rx_ntuple) - return -EOPNOTSUPP; - - if (!(dev->features & NETIF_F_NTUPLE)) - return -EINVAL; - - if (copy_from_user(&cmd, useraddr, sizeof(cmd))) - return -EFAULT; - - rx_ntuple_fix_masks(&cmd.fs); - - return ops->set_rx_ntuple(dev, &cmd); -} - static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) { struct ethtool_regs regs; @@ -1221,81 +947,6 @@ static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr) return dev->ethtool_ops->set_pauseparam(dev, &pauseparam); } -static int __ethtool_set_sg(struct net_device *dev, u32 data) -{ - int err; - - if (!dev->ethtool_ops->set_sg) - return -EOPNOTSUPP; - - if (data && !(dev->features & NETIF_F_ALL_CSUM)) - return -EINVAL; - - if (!data && dev->ethtool_ops->set_tso) { - err = dev->ethtool_ops->set_tso(dev, 0); - if (err) - return err; - } - - if (!data && dev->ethtool_ops->set_ufo) { - err = dev->ethtool_ops->set_ufo(dev, 0); - if (err) - return err; - } - return dev->ethtool_ops->set_sg(dev, data); -} - -static int __ethtool_set_tx_csum(struct net_device *dev, u32 data) -{ - int err; - - if (!dev->ethtool_ops->set_tx_csum) - return -EOPNOTSUPP; - - if (!data && dev->ethtool_ops->set_sg) { - err = __ethtool_set_sg(dev, 0); - if (err) - return err; - } - - return dev->ethtool_ops->set_tx_csum(dev, data); -} - -static int __ethtool_set_rx_csum(struct net_device *dev, u32 data) -{ - if (!dev->ethtool_ops->set_rx_csum) - return -EOPNOTSUPP; - - if (!data) - dev->features &= ~NETIF_F_GRO; - - return dev->ethtool_ops->set_rx_csum(dev, data); -} - -static int __ethtool_set_tso(struct net_device *dev, u32 data) -{ - if (!dev->ethtool_ops->set_tso) - return -EOPNOTSUPP; - - if (data && !(dev->features & NETIF_F_SG)) - return -EINVAL; - - return dev->ethtool_ops->set_tso(dev, data); -} - -static int __ethtool_set_ufo(struct net_device *dev, u32 data) -{ - if (!dev->ethtool_ops->set_ufo) - return -EOPNOTSUPP; - if (data && !(dev->features & NETIF_F_SG)) - return -EINVAL; - if (data && !((dev->features & NETIF_F_GEN_CSUM) || - (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) - == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) - return -EINVAL; - return dev->ethtool_ops->set_ufo(dev, data); -} - static int ethtool_self_test(struct net_device *dev, char __user *useraddr) { struct ethtool_test test; @@ -1761,9 +1412,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) break; case ETHTOOL_GFLAGS: rc = ethtool_get_value(dev, useraddr, ethcmd, - (dev->ethtool_ops->get_flags ? - dev->ethtool_ops->get_flags : - ethtool_op_get_flags)); + __ethtool_get_flags); break; case ETHTOOL_SFLAGS: rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags); @@ -1794,9 +1443,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_RESET: rc = ethtool_reset(dev, useraddr); break; - case ETHTOOL_SRXNTUPLE: - rc = ethtool_set_rx_ntuple(dev, useraddr); - break; case ETHTOOL_GSSET_INFO: rc = ethtool_get_sset_info(dev, useraddr); break; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 27071ee2a4e..c02e63c908d 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -12,6 +12,7 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/list.h> +#include <linux/module.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/fib_rules.h> @@ -490,7 +491,7 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (ops->nr_goto_rules > 0) { list_for_each_entry(tmp, &ops->rules_list, list) { if (rtnl_dereference(tmp->ctarget) == rule) { - rcu_assign_pointer(tmp->ctarget, NULL); + RCU_INIT_POINTER(tmp->ctarget, NULL); ops->unresolved_rules++; } } @@ -548,7 +549,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, frh->flags = rule->flags; if (rule->action == FR_ACT_GOTO && - rcu_dereference_raw(rule->ctarget) == NULL) + rcu_access_pointer(rule->ctarget) == NULL) frh->flags |= FIB_RULE_UNRESOLVED; if (rule->iifname[0]) { diff --git a/net/core/filter.c b/net/core/filter.c index 36f975fa87c..5dea4527921 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -436,7 +436,7 @@ error: * * Returns 0 if the rule set is legal or -EINVAL if not. */ -int sk_chk_filter(struct sock_filter *filter, int flen) +int sk_chk_filter(struct sock_filter *filter, unsigned int flen) { /* * Valid instructions are initialized to non-0. @@ -645,7 +645,7 @@ int sk_detach_filter(struct sock *sk) filter = rcu_dereference_protected(sk->sk_filter, sock_owned_by_user(sk)); if (filter) { - rcu_assign_pointer(sk->sk_filter, NULL); + RCU_INIT_POINTER(sk->sk_filter, NULL); sk_filter_uncharge(sk, filter); ret = 0; } diff --git a/net/core/flow.c b/net/core/flow.c index 555a456efb0..e318c7e9804 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -358,6 +358,18 @@ void flow_cache_flush(void) put_online_cpus(); } +static void flow_cache_flush_task(struct work_struct *work) +{ + flow_cache_flush(); +} + +static DECLARE_WORK(flow_cache_flush_work, flow_cache_flush_task); + +void flow_cache_flush_deferred(void) +{ + schedule_work(&flow_cache_flush_work); +} + static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu) { struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); @@ -413,7 +425,7 @@ static int __init flow_cache_init(struct flow_cache *fc) for_each_online_cpu(i) { if (flow_cache_cpu_prepare(fc, i)) - return -ENOMEM; + goto err; } fc->hotcpu_notifier = (struct notifier_block){ .notifier_call = flow_cache_cpu, @@ -426,6 +438,18 @@ static int __init flow_cache_init(struct flow_cache *fc) add_timer(&fc->rnd_timer); return 0; + +err: + for_each_possible_cpu(i) { + struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i); + kfree(fcp->hash_table); + fcp->hash_table = NULL; + } + + free_percpu(fc->percpu); + fc->percpu = NULL; + + return -ENOMEM; } static int __init flow_cache_init_global(void) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c new file mode 100644 index 00000000000..0985b9b14b8 --- /dev/null +++ b/net/core/flow_dissector.c @@ -0,0 +1,143 @@ +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/if_vlan.h> +#include <net/ip.h> +#include <linux/if_tunnel.h> +#include <linux/if_pppox.h> +#include <linux/ppp_defs.h> +#include <net/flow_keys.h> + +/* copy saddr & daddr, possibly using 64bit load/store + * Equivalent to : flow->src = iph->saddr; + * flow->dst = iph->daddr; + */ +static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph) +{ + BUILD_BUG_ON(offsetof(typeof(*flow), dst) != + offsetof(typeof(*flow), src) + sizeof(flow->src)); + memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst)); +} + +bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) +{ + int poff, nhoff = skb_network_offset(skb); + u8 ip_proto; + __be16 proto = skb->protocol; + + memset(flow, 0, sizeof(*flow)); + +again: + switch (proto) { + case __constant_htons(ETH_P_IP): { + const struct iphdr *iph; + struct iphdr _iph; +ip: + iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); + if (!iph) + return false; + + if (ip_is_fragment(iph)) + ip_proto = 0; + else + ip_proto = iph->protocol; + iph_to_flow_copy_addrs(flow, iph); + nhoff += iph->ihl * 4; + break; + } + case __constant_htons(ETH_P_IPV6): { + const struct ipv6hdr *iph; + struct ipv6hdr _iph; +ipv6: + iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); + if (!iph) + return false; + + ip_proto = iph->nexthdr; + flow->src = iph->saddr.s6_addr32[3]; + flow->dst = iph->daddr.s6_addr32[3]; + nhoff += sizeof(struct ipv6hdr); + break; + } + case __constant_htons(ETH_P_8021Q): { + const struct vlan_hdr *vlan; + struct vlan_hdr _vlan; + + vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan); + if (!vlan) + return false; + + proto = vlan->h_vlan_encapsulated_proto; + nhoff += sizeof(*vlan); + goto again; + } + case __constant_htons(ETH_P_PPP_SES): { + struct { + struct pppoe_hdr hdr; + __be16 proto; + } *hdr, _hdr; + hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); + if (!hdr) + return false; + proto = hdr->proto; + nhoff += PPPOE_SES_HLEN; + switch (proto) { + case __constant_htons(PPP_IP): + goto ip; + case __constant_htons(PPP_IPV6): + goto ipv6; + default: + return false; + } + } + default: + return false; + } + + switch (ip_proto) { + case IPPROTO_GRE: { + struct gre_hdr { + __be16 flags; + __be16 proto; + } *hdr, _hdr; + + hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); + if (!hdr) + return false; + /* + * Only look inside GRE if version zero and no + * routing + */ + if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) { + proto = hdr->proto; + nhoff += 4; + if (hdr->flags & GRE_CSUM) + nhoff += 4; + if (hdr->flags & GRE_KEY) + nhoff += 4; + if (hdr->flags & GRE_SEQ) + nhoff += 4; + goto again; + } + break; + } + case IPPROTO_IPIP: + goto again; + default: + break; + } + + flow->ip_proto = ip_proto; + poff = proto_ports_offset(ip_proto); + if (poff >= 0) { + __be32 *ports, _ports; + + nhoff += poff; + ports = skb_header_pointer(skb, nhoff, sizeof(_ports), &_ports); + if (ports) + flow->ports = *ports; + } + + return true; +} +EXPORT_SYMBOL(skb_flow_dissect); diff --git a/net/core/kmap_skb.h b/net/core/kmap_skb.h index 283c2b993fb..81e1ed7c838 100644 --- a/net/core/kmap_skb.h +++ b/net/core/kmap_skb.h @@ -7,7 +7,7 @@ static inline void *kmap_skb_frag(const skb_frag_t *frag) local_bh_disable(); #endif - return kmap_atomic(frag->page, KM_SKB_DATA_SOFTIRQ); + return kmap_atomic(skb_frag_page(frag), KM_SKB_DATA_SOFTIRQ); } static inline void kunmap_skb_frag(void *vaddr) diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 357bd4ee4ba..c3519c6d1b1 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -78,8 +78,13 @@ static void rfc2863_policy(struct net_device *dev) static bool linkwatch_urgent_event(struct net_device *dev) { - return netif_running(dev) && netif_carrier_ok(dev) && - qdisc_tx_changing(dev); + if (!netif_running(dev)) + return false; + + if (dev->ifindex != dev->iflink) + return true; + + return netif_carrier_ok(dev) && qdisc_tx_changing(dev); } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 1334d7e56f0..e287346e093 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -238,6 +238,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) it to safe state. */ skb_queue_purge(&n->arp_queue); + n->arp_queue_len_bytes = 0; n->output = neigh_blackhole; if (n->nud_state & NUD_VALID) n->nud_state = NUD_NOARP; @@ -272,7 +273,7 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) } EXPORT_SYMBOL(neigh_ifdown); -static struct neighbour *neigh_alloc(struct neigh_table *tbl) +static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev) { struct neighbour *n = NULL; unsigned long now = jiffies; @@ -287,7 +288,15 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl) goto out_entries; } - n = kmem_cache_zalloc(tbl->kmem_cachep, GFP_ATOMIC); + if (tbl->entry_size) + n = kzalloc(tbl->entry_size, GFP_ATOMIC); + else { + int sz = sizeof(*n) + tbl->key_len; + + sz = ALIGN(sz, NEIGH_PRIV_ALIGN); + sz += dev->neigh_priv_len; + n = kzalloc(sz, GFP_ATOMIC); + } if (!n) goto out_entries; @@ -313,11 +322,18 @@ out_entries: goto out; } +static void neigh_get_hash_rnd(u32 *x) +{ + get_random_bytes(x, sizeof(*x)); + *x |= 1; +} + static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) { size_t size = (1 << shift) * sizeof(struct neighbour *); struct neigh_hash_table *ret; struct neighbour __rcu **buckets; + int i; ret = kmalloc(sizeof(*ret), GFP_ATOMIC); if (!ret) @@ -334,8 +350,8 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) } ret->hash_buckets = buckets; ret->hash_shift = shift; - get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd)); - ret->hash_rnd |= 1; + for (i = 0; i < NEIGH_NUM_HASH_RND; i++) + neigh_get_hash_rnd(&ret->hash_rnd[i]); return ret; } @@ -462,7 +478,7 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, u32 hash_val; int key_len = tbl->key_len; int error; - struct neighbour *n1, *rc, *n = neigh_alloc(tbl); + struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev); struct neigh_hash_table *nht; if (!n) { @@ -480,6 +496,14 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, goto out_neigh_release; } + if (dev->netdev_ops->ndo_neigh_construct) { + error = dev->netdev_ops->ndo_neigh_construct(n); + if (error < 0) { + rc = ERR_PTR(error); + goto out_neigh_release; + } + } + /* Device specific setup. */ if (n->parms->neigh_setup && (error = n->parms->neigh_setup(n)) < 0) { @@ -677,18 +701,14 @@ static inline void neigh_parms_put(struct neigh_parms *parms) neigh_parms_destroy(parms); } -static void neigh_destroy_rcu(struct rcu_head *head) -{ - struct neighbour *neigh = container_of(head, struct neighbour, rcu); - - kmem_cache_free(neigh->tbl->kmem_cachep, neigh); -} /* * neighbour must already be out of the table; * */ void neigh_destroy(struct neighbour *neigh) { + struct net_device *dev = neigh->dev; + NEIGH_CACHE_STAT_INC(neigh->tbl, destroys); if (!neigh->dead) { @@ -702,14 +722,18 @@ void neigh_destroy(struct neighbour *neigh) printk(KERN_WARNING "Impossible event.\n"); skb_queue_purge(&neigh->arp_queue); + neigh->arp_queue_len_bytes = 0; - dev_put(neigh->dev); + if (dev->netdev_ops->ndo_neigh_destroy) + dev->netdev_ops->ndo_neigh_destroy(neigh); + + dev_put(dev); neigh_parms_put(neigh->parms); NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); atomic_dec(&neigh->tbl->entries); - call_rcu(&neigh->rcu, neigh_destroy_rcu); + kfree_rcu(neigh, rcu); } EXPORT_SYMBOL(neigh_destroy); @@ -842,6 +866,20 @@ static void neigh_invalidate(struct neighbour *neigh) write_lock(&neigh->lock); } skb_queue_purge(&neigh->arp_queue); + neigh->arp_queue_len_bytes = 0; +} + +static void neigh_probe(struct neighbour *neigh) + __releases(neigh->lock) +{ + struct sk_buff *skb = skb_peek(&neigh->arp_queue); + /* keep skb alive even if arp_queue overflows */ + if (skb) + skb = skb_copy(skb, GFP_ATOMIC); + write_unlock(&neigh->lock); + neigh->ops->solicit(neigh, skb); + atomic_inc(&neigh->probes); + kfree_skb(skb); } /* Called when a timer expires for a neighbour entry. */ @@ -859,12 +897,8 @@ static void neigh_timer_handler(unsigned long arg) now = jiffies; next = now + HZ; - if (!(state & NUD_IN_TIMER)) { -#ifndef CONFIG_SMP - printk(KERN_WARNING "neigh: timer & !nud_in_timer\n"); -#endif + if (!(state & NUD_IN_TIMER)) goto out; - } if (state & NUD_REACHABLE) { if (time_before_eq(now, @@ -920,14 +954,7 @@ static void neigh_timer_handler(unsigned long arg) neigh_hold(neigh); } if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) { - struct sk_buff *skb = skb_peek(&neigh->arp_queue); - /* keep skb alive even if arp_queue overflows */ - if (skb) - skb = skb_copy(skb, GFP_ATOMIC); - write_unlock(&neigh->lock); - neigh->ops->solicit(neigh, skb); - atomic_inc(&neigh->probes); - kfree_skb(skb); + neigh_probe(neigh); } else { out: write_unlock(&neigh->lock); @@ -942,7 +969,7 @@ out: int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) { int rc; - unsigned long now; + bool immediate_probe = false; write_lock_bh(&neigh->lock); @@ -950,14 +977,16 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)) goto out_unlock_bh; - now = jiffies; - if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { if (neigh->parms->mcast_probes + neigh->parms->app_probes) { + unsigned long next, now = jiffies; + atomic_set(&neigh->probes, neigh->parms->ucast_probes); neigh->nud_state = NUD_INCOMPLETE; - neigh->updated = jiffies; - neigh_add_timer(neigh, now + 1); + neigh->updated = now; + next = now + max(neigh->parms->retrans_time, HZ/2); + neigh_add_timer(neigh, next); + immediate_probe = true; } else { neigh->nud_state = NUD_FAILED; neigh->updated = jiffies; @@ -976,20 +1005,29 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) if (neigh->nud_state == NUD_INCOMPLETE) { if (skb) { - if (skb_queue_len(&neigh->arp_queue) >= - neigh->parms->queue_len) { + while (neigh->arp_queue_len_bytes + skb->truesize > + neigh->parms->queue_len_bytes) { struct sk_buff *buff; + buff = __skb_dequeue(&neigh->arp_queue); + if (!buff) + break; + neigh->arp_queue_len_bytes -= buff->truesize; kfree_skb(buff); NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); } skb_dst_force(skb); __skb_queue_tail(&neigh->arp_queue, skb); + neigh->arp_queue_len_bytes += skb->truesize; } rc = 1; } out_unlock_bh: - write_unlock_bh(&neigh->lock); + if (immediate_probe) + neigh_probe(neigh); + else + write_unlock(&neigh->lock); + local_bh_enable(); return rc; } EXPORT_SYMBOL(__neigh_event_send); @@ -1156,13 +1194,18 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, struct dst_entry *dst = skb_dst(skb); struct neighbour *n2, *n1 = neigh; write_unlock_bh(&neigh->lock); + + rcu_read_lock(); /* On shaper/eql skb->dst->neighbour != neigh :( */ - if (dst && (n2 = dst_get_neighbour(dst)) != NULL) + if (dst && (n2 = dst_get_neighbour_noref(dst)) != NULL) n1 = n2; n1->output(n1, skb); + rcu_read_unlock(); + write_lock_bh(&neigh->lock); } skb_queue_purge(&neigh->arp_queue); + neigh->arp_queue_len_bytes = 0; } out: if (update_isrouter) { @@ -1465,11 +1508,6 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl) tbl->parms.reachable_time = neigh_rand_reach_time(tbl->parms.base_reachable_time); - if (!tbl->kmem_cachep) - tbl->kmem_cachep = - kmem_cache_create(tbl->id, tbl->entry_size, 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, - NULL); tbl->stats = alloc_percpu(struct neigh_statistics); if (!tbl->stats) panic("cannot create neighbour cache statistics"); @@ -1554,9 +1592,6 @@ int neigh_table_clear(struct neigh_table *tbl) free_percpu(tbl->stats); tbl->stats = NULL; - kmem_cache_destroy(tbl->kmem_cachep); - tbl->kmem_cachep = NULL; - return 0; } EXPORT_SYMBOL(neigh_table_clear); @@ -1735,7 +1770,11 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) NLA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex); NLA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt)); - NLA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len); + NLA_PUT_U32(skb, NDTPA_QUEUE_LENBYTES, parms->queue_len_bytes); + /* approximative value for deprecated QUEUE_LEN (in packets) */ + NLA_PUT_U32(skb, NDTPA_QUEUE_LEN, + DIV_ROUND_UP(parms->queue_len_bytes, + SKB_TRUESIZE(ETH_FRAME_LEN))); NLA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen); NLA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes); NLA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes); @@ -1796,7 +1835,7 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, rcu_read_lock_bh(); nht = rcu_dereference_bh(tbl->nht); - ndc.ndtc_hash_rnd = nht->hash_rnd; + ndc.ndtc_hash_rnd = nht->hash_rnd[0]; ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1); rcu_read_unlock_bh(); @@ -1962,7 +2001,11 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) switch (i) { case NDTPA_QUEUE_LEN: - p->queue_len = nla_get_u32(tbp[i]); + p->queue_len_bytes = nla_get_u32(tbp[i]) * + SKB_TRUESIZE(ETH_FRAME_LEN); + break; + case NDTPA_QUEUE_LENBYTES: + p->queue_len_bytes = nla_get_u32(tbp[i]); break; case NDTPA_PROXY_QLEN: p->proxy_qlen = nla_get_u32(tbp[i]); @@ -2385,7 +2428,10 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, struct net *net = seq_file_net(seq); struct neigh_table *tbl = state->tbl; - pn = pn->next; + do { + pn = pn->next; + } while (pn && !net_eq(pneigh_net(pn), net)); + while (!pn) { if (++state->bucket > PNEIGH_HASHMASK) break; @@ -2623,117 +2669,158 @@ EXPORT_SYMBOL(neigh_app_ns); #ifdef CONFIG_SYSCTL -#define NEIGH_VARS_MAX 19 +static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int size, ret; + ctl_table tmp = *ctl; + + tmp.data = &size; + size = DIV_ROUND_UP(*(int *)ctl->data, SKB_TRUESIZE(ETH_FRAME_LEN)); + ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); + if (write && !ret) + *(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN); + return ret; +} + +enum { + NEIGH_VAR_MCAST_PROBE, + NEIGH_VAR_UCAST_PROBE, + NEIGH_VAR_APP_PROBE, + NEIGH_VAR_RETRANS_TIME, + NEIGH_VAR_BASE_REACHABLE_TIME, + NEIGH_VAR_DELAY_PROBE_TIME, + NEIGH_VAR_GC_STALETIME, + NEIGH_VAR_QUEUE_LEN, + NEIGH_VAR_QUEUE_LEN_BYTES, + NEIGH_VAR_PROXY_QLEN, + NEIGH_VAR_ANYCAST_DELAY, + NEIGH_VAR_PROXY_DELAY, + NEIGH_VAR_LOCKTIME, + NEIGH_VAR_RETRANS_TIME_MS, + NEIGH_VAR_BASE_REACHABLE_TIME_MS, + NEIGH_VAR_GC_INTERVAL, + NEIGH_VAR_GC_THRESH1, + NEIGH_VAR_GC_THRESH2, + NEIGH_VAR_GC_THRESH3, + NEIGH_VAR_MAX +}; static struct neigh_sysctl_table { struct ctl_table_header *sysctl_header; - struct ctl_table neigh_vars[NEIGH_VARS_MAX]; + struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1]; char *dev_name; } neigh_sysctl_template __read_mostly = { .neigh_vars = { - { + [NEIGH_VAR_MCAST_PROBE] = { .procname = "mcast_solicit", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, - { + [NEIGH_VAR_UCAST_PROBE] = { .procname = "ucast_solicit", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, - { + [NEIGH_VAR_APP_PROBE] = { .procname = "app_solicit", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, - { + [NEIGH_VAR_RETRANS_TIME] = { .procname = "retrans_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_userhz_jiffies, }, - { + [NEIGH_VAR_BASE_REACHABLE_TIME] = { .procname = "base_reachable_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { + [NEIGH_VAR_DELAY_PROBE_TIME] = { .procname = "delay_first_probe_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { + [NEIGH_VAR_GC_STALETIME] = { .procname = "gc_stale_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { + [NEIGH_VAR_QUEUE_LEN] = { .procname = "unres_qlen", .maxlen = sizeof(int), .mode = 0644, + .proc_handler = proc_unres_qlen, + }, + [NEIGH_VAR_QUEUE_LEN_BYTES] = { + .procname = "unres_qlen_bytes", + .maxlen = sizeof(int), + .mode = 0644, .proc_handler = proc_dointvec, }, - { + [NEIGH_VAR_PROXY_QLEN] = { .procname = "proxy_qlen", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, - { + [NEIGH_VAR_ANYCAST_DELAY] = { .procname = "anycast_delay", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_userhz_jiffies, }, - { + [NEIGH_VAR_PROXY_DELAY] = { .procname = "proxy_delay", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_userhz_jiffies, }, - { + [NEIGH_VAR_LOCKTIME] = { .procname = "locktime", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_userhz_jiffies, }, - { + [NEIGH_VAR_RETRANS_TIME_MS] = { .procname = "retrans_time_ms", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, - { + [NEIGH_VAR_BASE_REACHABLE_TIME_MS] = { .procname = "base_reachable_time_ms", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, - { + [NEIGH_VAR_GC_INTERVAL] = { .procname = "gc_interval", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { + [NEIGH_VAR_GC_THRESH1] = { .procname = "gc_thresh1", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, - { + [NEIGH_VAR_GC_THRESH2] = { .procname = "gc_thresh2", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, - { + [NEIGH_VAR_GC_THRESH3] = { .procname = "gc_thresh3", .maxlen = sizeof(int), .mode = 0644, @@ -2766,47 +2853,49 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, if (!t) goto err; - t->neigh_vars[0].data = &p->mcast_probes; - t->neigh_vars[1].data = &p->ucast_probes; - t->neigh_vars[2].data = &p->app_probes; - t->neigh_vars[3].data = &p->retrans_time; - t->neigh_vars[4].data = &p->base_reachable_time; - t->neigh_vars[5].data = &p->delay_probe_time; - t->neigh_vars[6].data = &p->gc_staletime; - t->neigh_vars[7].data = &p->queue_len; - t->neigh_vars[8].data = &p->proxy_qlen; - t->neigh_vars[9].data = &p->anycast_delay; - t->neigh_vars[10].data = &p->proxy_delay; - t->neigh_vars[11].data = &p->locktime; - t->neigh_vars[12].data = &p->retrans_time; - t->neigh_vars[13].data = &p->base_reachable_time; + t->neigh_vars[NEIGH_VAR_MCAST_PROBE].data = &p->mcast_probes; + t->neigh_vars[NEIGH_VAR_UCAST_PROBE].data = &p->ucast_probes; + t->neigh_vars[NEIGH_VAR_APP_PROBE].data = &p->app_probes; + t->neigh_vars[NEIGH_VAR_RETRANS_TIME].data = &p->retrans_time; + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].data = &p->base_reachable_time; + t->neigh_vars[NEIGH_VAR_DELAY_PROBE_TIME].data = &p->delay_probe_time; + t->neigh_vars[NEIGH_VAR_GC_STALETIME].data = &p->gc_staletime; + t->neigh_vars[NEIGH_VAR_QUEUE_LEN].data = &p->queue_len_bytes; + t->neigh_vars[NEIGH_VAR_QUEUE_LEN_BYTES].data = &p->queue_len_bytes; + t->neigh_vars[NEIGH_VAR_PROXY_QLEN].data = &p->proxy_qlen; + t->neigh_vars[NEIGH_VAR_ANYCAST_DELAY].data = &p->anycast_delay; + t->neigh_vars[NEIGH_VAR_PROXY_DELAY].data = &p->proxy_delay; + t->neigh_vars[NEIGH_VAR_LOCKTIME].data = &p->locktime; + t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].data = &p->retrans_time; + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].data = &p->base_reachable_time; if (dev) { dev_name_source = dev->name; /* Terminate the table early */ - memset(&t->neigh_vars[14], 0, sizeof(t->neigh_vars[14])); + memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, + sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL])); } else { dev_name_source = neigh_path[NEIGH_CTL_PATH_DEV].procname; - t->neigh_vars[14].data = (int *)(p + 1); - t->neigh_vars[15].data = (int *)(p + 1) + 1; - t->neigh_vars[16].data = (int *)(p + 1) + 2; - t->neigh_vars[17].data = (int *)(p + 1) + 3; + t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = (int *)(p + 1); + t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = (int *)(p + 1) + 1; + t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = (int *)(p + 1) + 2; + t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = (int *)(p + 1) + 3; } if (handler) { /* RetransTime */ - t->neigh_vars[3].proc_handler = handler; - t->neigh_vars[3].extra1 = dev; + t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler; + t->neigh_vars[NEIGH_VAR_RETRANS_TIME].extra1 = dev; /* ReachableTime */ - t->neigh_vars[4].proc_handler = handler; - t->neigh_vars[4].extra1 = dev; + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler; + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].extra1 = dev; /* RetransTime (in milliseconds)*/ - t->neigh_vars[12].proc_handler = handler; - t->neigh_vars[12].extra1 = dev; + t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler; + t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].extra1 = dev; /* ReachableTime (in milliseconds) */ - t->neigh_vars[13].proc_handler = handler; - t->neigh_vars[13].extra1 = dev; + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler; + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].extra1 = dev; } t->dev_name = kstrdup(dev_name_source, GFP_KERNEL); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 1683e5db2f2..f3dbd4f596a 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -20,6 +20,8 @@ #include <linux/rtnetlink.h> #include <linux/wireless.h> #include <linux/vmalloc.h> +#include <linux/export.h> +#include <linux/jiffies.h> #include <net/wext.h> #include "net-sysfs.h" @@ -147,7 +149,7 @@ static ssize_t show_speed(struct device *dev, if (netif_running(netdev)) { struct ethtool_cmd cmd; - if (!dev_ethtool_get_settings(netdev, &cmd)) + if (!__ethtool_get_settings(netdev, &cmd)) ret = sprintf(buf, fmt_udec, ethtool_cmd_speed(&cmd)); } rtnl_unlock(); @@ -165,7 +167,7 @@ static ssize_t show_duplex(struct device *dev, if (netif_running(netdev)) { struct ethtool_cmd cmd; - if (!dev_ethtool_get_settings(netdev, &cmd)) + if (!__ethtool_get_settings(netdev, &cmd)) ret = sprintf(buf, "%s\n", cmd.duplex ? "full" : "half"); } @@ -605,9 +607,12 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, rcu_assign_pointer(queue->rps_map, map); spin_unlock(&rps_map_lock); - if (old_map) + if (map) + jump_label_inc(&rps_needed); + if (old_map) { kfree_rcu(old_map, rcu); - + jump_label_dec(&rps_needed); + } free_cpumask_var(mask); return len; } @@ -617,15 +622,15 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, char *buf) { struct rps_dev_flow_table *flow_table; - unsigned int val = 0; + unsigned long val = 0; rcu_read_lock(); flow_table = rcu_dereference(queue->rps_flow_table); if (flow_table) - val = flow_table->mask + 1; + val = (unsigned long)flow_table->mask + 1; rcu_read_unlock(); - return sprintf(buf, "%u\n", val); + return sprintf(buf, "%lu\n", val); } static void rps_dev_flow_table_release_work(struct work_struct *work) @@ -649,33 +654,46 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, struct rx_queue_attribute *attr, const char *buf, size_t len) { - unsigned int count; - char *endp; + unsigned long mask, count; struct rps_dev_flow_table *table, *old_table; static DEFINE_SPINLOCK(rps_dev_flow_lock); + int rc; if (!capable(CAP_NET_ADMIN)) return -EPERM; - count = simple_strtoul(buf, &endp, 0); - if (endp == buf) - return -EINVAL; + rc = kstrtoul(buf, 0, &count); + if (rc < 0) + return rc; if (count) { - int i; - - if (count > 1<<30) { + mask = count - 1; + /* mask = roundup_pow_of_two(count) - 1; + * without overflows... + */ + while ((mask | (mask >> 1)) != mask) + mask |= (mask >> 1); + /* On 64 bit arches, must check mask fits in table->mask (u32), + * and on 32bit arches, must check RPS_DEV_FLOW_TABLE_SIZE(mask + 1) + * doesnt overflow. + */ +#if BITS_PER_LONG > 32 + if (mask > (unsigned long)(u32)mask) + return -EINVAL; +#else + if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1)) + / sizeof(struct rps_dev_flow)) { /* Enforce a limit to prevent overflow */ return -EINVAL; } - count = roundup_pow_of_two(count); - table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count)); +#endif + table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1)); if (!table) return -ENOMEM; - table->mask = count - 1; - for (i = 0; i < count; i++) - table->flows[i].cpu = RPS_NO_CPU; + table->mask = mask; + for (count = 0; count <= mask; count++) + table->flows[count].cpu = RPS_NO_CPU; } else table = NULL; @@ -712,13 +730,13 @@ static void rx_queue_release(struct kobject *kobj) struct rps_dev_flow_table *flow_table; - map = rcu_dereference_raw(queue->rps_map); + map = rcu_dereference_protected(queue->rps_map, 1); if (map) { RCU_INIT_POINTER(queue->rps_map, NULL); kfree_rcu(map, rcu); } - flow_table = rcu_dereference_raw(queue->rps_flow_table); + flow_table = rcu_dereference_protected(queue->rps_flow_table, 1); if (flow_table) { RCU_INIT_POINTER(queue->rps_flow_table, NULL); call_rcu(&flow_table->rcu, rps_dev_flow_table_release); @@ -779,7 +797,7 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) #endif } -#ifdef CONFIG_XPS +#ifdef CONFIG_SYSFS /* * netdev_queue sysfs structures and functions. */ @@ -825,6 +843,133 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = { .store = netdev_queue_attr_store, }; +static ssize_t show_trans_timeout(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, + char *buf) +{ + unsigned long trans_timeout; + + spin_lock_irq(&queue->_xmit_lock); + trans_timeout = queue->trans_timeout; + spin_unlock_irq(&queue->_xmit_lock); + + return sprintf(buf, "%lu", trans_timeout); +} + +static struct netdev_queue_attribute queue_trans_timeout = + __ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL); + +#ifdef CONFIG_BQL +/* + * Byte queue limits sysfs structures and functions. + */ +static ssize_t bql_show(char *buf, unsigned int value) +{ + return sprintf(buf, "%u\n", value); +} + +static ssize_t bql_set(const char *buf, const size_t count, + unsigned int *pvalue) +{ + unsigned int value; + int err; + + if (!strcmp(buf, "max") || !strcmp(buf, "max\n")) + value = DQL_MAX_LIMIT; + else { + err = kstrtouint(buf, 10, &value); + if (err < 0) + return err; + if (value > DQL_MAX_LIMIT) + return -EINVAL; + } + + *pvalue = value; + + return count; +} + +static ssize_t bql_show_hold_time(struct netdev_queue *queue, + struct netdev_queue_attribute *attr, + char *buf) +{ + struct dql *dql = &queue->dql; + + return sprintf(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); +} + +static ssize_t bql_set_hold_time(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, + const char *buf, size_t len) +{ + struct dql *dql = &queue->dql; + unsigned value; + int err; + + err = kstrtouint(buf, 10, &value); + if (err < 0) + return err; + + dql->slack_hold_time = msecs_to_jiffies(value); + + return len; +} + +static struct netdev_queue_attribute bql_hold_time_attribute = + __ATTR(hold_time, S_IRUGO | S_IWUSR, bql_show_hold_time, + bql_set_hold_time); + +static ssize_t bql_show_inflight(struct netdev_queue *queue, + struct netdev_queue_attribute *attr, + char *buf) +{ + struct dql *dql = &queue->dql; + + return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed); +} + +static struct netdev_queue_attribute bql_inflight_attribute = + __ATTR(inflight, S_IRUGO | S_IWUSR, bql_show_inflight, NULL); + +#define BQL_ATTR(NAME, FIELD) \ +static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \ + struct netdev_queue_attribute *attr, \ + char *buf) \ +{ \ + return bql_show(buf, queue->dql.FIELD); \ +} \ + \ +static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \ + struct netdev_queue_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + return bql_set(buf, len, &queue->dql.FIELD); \ +} \ + \ +static struct netdev_queue_attribute bql_ ## NAME ## _attribute = \ + __ATTR(NAME, S_IRUGO | S_IWUSR, bql_show_ ## NAME, \ + bql_set_ ## NAME); + +BQL_ATTR(limit, limit) +BQL_ATTR(limit_max, max_limit) +BQL_ATTR(limit_min, min_limit) + +static struct attribute *dql_attrs[] = { + &bql_limit_attribute.attr, + &bql_limit_max_attribute.attr, + &bql_limit_min_attribute.attr, + &bql_hold_time_attribute.attr, + &bql_inflight_attribute.attr, + NULL +}; + +static struct attribute_group dql_group = { + .name = "byte_queue_limits", + .attrs = dql_attrs, +}; +#endif /* CONFIG_BQL */ + +#ifdef CONFIG_XPS static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue) { struct net_device *dev = queue->dev; @@ -889,6 +1034,52 @@ static DEFINE_MUTEX(xps_map_mutex); #define xmap_dereference(P) \ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) +static void xps_queue_release(struct netdev_queue *queue) +{ + struct net_device *dev = queue->dev; + struct xps_dev_maps *dev_maps; + struct xps_map *map; + unsigned long index; + int i, pos, nonempty = 0; + + index = get_netdev_queue_index(queue); + + mutex_lock(&xps_map_mutex); + dev_maps = xmap_dereference(dev->xps_maps); + + if (dev_maps) { + for_each_possible_cpu(i) { + map = xmap_dereference(dev_maps->cpu_map[i]); + if (!map) + continue; + + for (pos = 0; pos < map->len; pos++) + if (map->queues[pos] == index) + break; + + if (pos < map->len) { + if (map->len > 1) + map->queues[pos] = + map->queues[--map->len]; + else { + RCU_INIT_POINTER(dev_maps->cpu_map[i], + NULL); + kfree_rcu(map, rcu); + map = NULL; + } + } + if (map) + nonempty = 1; + } + + if (!nonempty) { + RCU_INIT_POINTER(dev->xps_maps, NULL); + kfree_rcu(dev_maps, rcu); + } + } + mutex_unlock(&xps_map_mutex); +} + static ssize_t store_xps_map(struct netdev_queue *queue, struct netdev_queue_attribute *attribute, const char *buf, size_t len) @@ -900,7 +1091,7 @@ static ssize_t store_xps_map(struct netdev_queue *queue, struct xps_map *map, *new_map; struct xps_dev_maps *dev_maps, *new_dev_maps; int nonempty = 0; - int numa_node = -2; + int numa_node_id = -2; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -943,10 +1134,10 @@ static ssize_t store_xps_map(struct netdev_queue *queue, need_set = cpumask_test_cpu(cpu, mask) && cpu_online(cpu); #ifdef CONFIG_NUMA if (need_set) { - if (numa_node == -2) - numa_node = cpu_to_node(cpu); - else if (numa_node != cpu_to_node(cpu)) - numa_node = -1; + if (numa_node_id == -2) + numa_node_id = cpu_to_node(cpu); + else if (numa_node_id != cpu_to_node(cpu)) + numa_node_id = -1; } #endif if (need_set && pos >= map_len) { @@ -986,17 +1177,17 @@ static ssize_t store_xps_map(struct netdev_queue *queue, nonempty = 1; } - if (nonempty) + if (nonempty) { rcu_assign_pointer(dev->xps_maps, new_dev_maps); - else { + } else { kfree(new_dev_maps); - rcu_assign_pointer(dev->xps_maps, NULL); + RCU_INIT_POINTER(dev->xps_maps, NULL); } if (dev_maps) kfree_rcu(dev_maps, rcu); - netdev_queue_numa_node_write(queue, (numa_node >= 0) ? numa_node : + netdev_queue_numa_node_write(queue, (numa_node_id >= 0) ? numa_node_id : NUMA_NO_NODE); mutex_unlock(&xps_map_mutex); @@ -1019,58 +1210,23 @@ error: static struct netdev_queue_attribute xps_cpus_attribute = __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map); +#endif /* CONFIG_XPS */ static struct attribute *netdev_queue_default_attrs[] = { + &queue_trans_timeout.attr, +#ifdef CONFIG_XPS &xps_cpus_attribute.attr, +#endif NULL }; static void netdev_queue_release(struct kobject *kobj) { struct netdev_queue *queue = to_netdev_queue(kobj); - struct net_device *dev = queue->dev; - struct xps_dev_maps *dev_maps; - struct xps_map *map; - unsigned long index; - int i, pos, nonempty = 0; - - index = get_netdev_queue_index(queue); - - mutex_lock(&xps_map_mutex); - dev_maps = xmap_dereference(dev->xps_maps); - - if (dev_maps) { - for_each_possible_cpu(i) { - map = xmap_dereference(dev_maps->cpu_map[i]); - if (!map) - continue; - for (pos = 0; pos < map->len; pos++) - if (map->queues[pos] == index) - break; - - if (pos < map->len) { - if (map->len > 1) - map->queues[pos] = - map->queues[--map->len]; - else { - RCU_INIT_POINTER(dev_maps->cpu_map[i], - NULL); - kfree_rcu(map, rcu); - map = NULL; - } - } - if (map) - nonempty = 1; - } - - if (!nonempty) { - RCU_INIT_POINTER(dev->xps_maps, NULL); - kfree_rcu(dev_maps, rcu); - } - } - - mutex_unlock(&xps_map_mutex); +#ifdef CONFIG_XPS + xps_queue_release(queue); +#endif memset(kobj, 0, sizeof(*kobj)); dev_put(queue->dev); @@ -1091,22 +1247,29 @@ static int netdev_queue_add_kobject(struct net_device *net, int index) kobj->kset = net->queues_kset; error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, "tx-%u", index); - if (error) { - kobject_put(kobj); - return error; - } + if (error) + goto exit; + +#ifdef CONFIG_BQL + error = sysfs_create_group(kobj, &dql_group); + if (error) + goto exit; +#endif kobject_uevent(kobj, KOBJ_ADD); dev_hold(queue->dev); + return 0; +exit: + kobject_put(kobj); return error; } -#endif /* CONFIG_XPS */ +#endif /* CONFIG_SYSFS */ int netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) { -#ifdef CONFIG_XPS +#ifdef CONFIG_SYSFS int i; int error = 0; @@ -1118,20 +1281,26 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) } } - while (--i >= new_num) - kobject_put(&net->_tx[i].kobj); + while (--i >= new_num) { + struct netdev_queue *queue = net->_tx + i; + +#ifdef CONFIG_BQL + sysfs_remove_group(&queue->kobj, &dql_group); +#endif + kobject_put(&queue->kobj); + } return error; #else return 0; -#endif +#endif /* CONFIG_SYSFS */ } static int register_queue_kobjects(struct net_device *net) { int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0; -#if defined(CONFIG_RPS) || defined(CONFIG_XPS) +#ifdef CONFIG_SYSFS net->queues_kset = kset_create_and_add("queues", NULL, &net->dev.kobj); if (!net->queues_kset) @@ -1172,7 +1341,7 @@ static void remove_queue_kobjects(struct net_device *net) net_rx_queue_update_kobjects(net, real_rx, 0); netdev_queue_update_kobjects(net, real_tx, 0); -#if defined(CONFIG_RPS) || defined(CONFIG_XPS) +#ifdef CONFIG_SYSFS kset_unregister(net->queues_kset); #endif } diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 52380b1d552..ba3c0120786 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -11,6 +11,7 @@ #include <linux/inetdevice.h> #include <linux/inet.h> #include <linux/interrupt.h> +#include <linux/export.h> #include <linux/netpoll.h> #include <linux/sched.h> #include <linux/delay.h> diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 5bbdbf0d366..aefcd7acbff 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -10,6 +10,7 @@ #include <linux/nsproxy.h> #include <linux/proc_fs.h> #include <linux/file.h> +#include <linux/export.h> #include <net/net_namespace.h> #include <net/netns/generic.h> diff --git a/net/core/netevent.c b/net/core/netevent.c index 865f0ceb81f..f17ccd291d3 100644 --- a/net/core/netevent.c +++ b/net/core/netevent.c @@ -15,6 +15,7 @@ #include <linux/rtnetlink.h> #include <linux/notifier.h> +#include <linux/export.h> #include <net/netevent.h> static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 52622517e0d..556b0829866 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -23,6 +23,7 @@ #include <linux/rcupdate.h> #include <linux/workqueue.h> #include <linux/slab.h> +#include <linux/export.h> #include <net/tcp.h> #include <net/udp.h> #include <asm/unaligned.h> @@ -75,7 +76,7 @@ static void queue_process(struct work_struct *work) local_irq_save(flags); __netif_tx_lock(txq, smp_processor_id()); - if (netif_tx_queue_frozen_or_stopped(txq) || + if (netif_xmit_frozen_or_stopped(txq) || ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) { skb_queue_head(&npinfo->txq, skb); __netif_tx_unlock(txq); @@ -316,7 +317,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; tries > 0; --tries) { if (__netif_tx_trylock(txq)) { - if (!netif_tx_queue_stopped(txq)) { + if (!netif_xmit_stopped(txq)) { status = ops->ndo_start_xmit(skb, dev); if (status == NETDEV_TX_OK) txq_trans_update(txq); @@ -421,6 +422,7 @@ static void arp_reply(struct sk_buff *skb) struct sk_buff *send_skb; struct netpoll *np, *tmp; unsigned long flags; + int hlen, tlen; int hits = 0; if (list_empty(&npinfo->rx_np)) @@ -478,8 +480,9 @@ static void arp_reply(struct sk_buff *skb) if (tip != np->local_ip) continue; - send_skb = find_skb(np, size + LL_ALLOCATED_SPACE(np->dev), - LL_RESERVED_SPACE(np->dev)); + hlen = LL_RESERVED_SPACE(np->dev); + tlen = np->dev->needed_tailroom; + send_skb = find_skb(np, size + hlen + tlen, hlen); if (!send_skb) continue; @@ -903,7 +906,7 @@ void __netpoll_cleanup(struct netpoll *np) if (ops->ndo_netpoll_cleanup) ops->ndo_netpoll_cleanup(np->dev); - rcu_assign_pointer(np->dev->npinfo, NULL); + RCU_INIT_POINTER(np->dev->npinfo, NULL); /* avoid racing with NAPI reading npinfo */ synchronize_rcu_bh(); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c new file mode 100644 index 00000000000..3a9fd4826b7 --- /dev/null +++ b/net/core/netprio_cgroup.c @@ -0,0 +1,344 @@ +/* + * net/core/netprio_cgroup.c Priority Control Group + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Neil Horman <nhorman@tuxdriver.com> + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/cgroup.h> +#include <linux/rcupdate.h> +#include <linux/atomic.h> +#include <net/rtnetlink.h> +#include <net/pkt_cls.h> +#include <net/sock.h> +#include <net/netprio_cgroup.h> + +static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, + struct cgroup *cgrp); +static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); +static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp); + +struct cgroup_subsys net_prio_subsys = { + .name = "net_prio", + .create = cgrp_create, + .destroy = cgrp_destroy, + .populate = cgrp_populate, +#ifdef CONFIG_NETPRIO_CGROUP + .subsys_id = net_prio_subsys_id, +#endif + .module = THIS_MODULE +}; + +#define PRIOIDX_SZ 128 + +static unsigned long prioidx_map[PRIOIDX_SZ]; +static DEFINE_SPINLOCK(prioidx_map_lock); +static atomic_t max_prioidx = ATOMIC_INIT(0); + +static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, net_prio_subsys_id), + struct cgroup_netprio_state, css); +} + +static int get_prioidx(u32 *prio) +{ + unsigned long flags; + u32 prioidx; + + spin_lock_irqsave(&prioidx_map_lock, flags); + prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ); + set_bit(prioidx, prioidx_map); + spin_unlock_irqrestore(&prioidx_map_lock, flags); + if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) + return -ENOSPC; + + atomic_set(&max_prioidx, prioidx); + *prio = prioidx; + return 0; +} + +static void put_prioidx(u32 idx) +{ + unsigned long flags; + + spin_lock_irqsave(&prioidx_map_lock, flags); + clear_bit(idx, prioidx_map); + spin_unlock_irqrestore(&prioidx_map_lock, flags); +} + +static void extend_netdev_table(struct net_device *dev, u32 new_len) +{ + size_t new_size = sizeof(struct netprio_map) + + ((sizeof(u32) * new_len)); + struct netprio_map *new_priomap = kzalloc(new_size, GFP_KERNEL); + struct netprio_map *old_priomap; + int i; + + old_priomap = rtnl_dereference(dev->priomap); + + if (!new_priomap) { + printk(KERN_WARNING "Unable to alloc new priomap!\n"); + return; + } + + for (i = 0; + old_priomap && (i < old_priomap->priomap_len); + i++) + new_priomap->priomap[i] = old_priomap->priomap[i]; + + new_priomap->priomap_len = new_len; + + rcu_assign_pointer(dev->priomap, new_priomap); + if (old_priomap) + kfree_rcu(old_priomap, rcu); +} + +static void update_netdev_tables(void) +{ + struct net_device *dev; + u32 max_len = atomic_read(&max_prioidx); + struct netprio_map *map; + + rtnl_lock(); + for_each_netdev(&init_net, dev) { + map = rtnl_dereference(dev->priomap); + if ((!map) || + (map->priomap_len < max_len)) + extend_netdev_table(dev, max_len); + } + rtnl_unlock(); +} + +static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, + struct cgroup *cgrp) +{ + struct cgroup_netprio_state *cs; + int ret; + + cs = kzalloc(sizeof(*cs), GFP_KERNEL); + if (!cs) + return ERR_PTR(-ENOMEM); + + if (cgrp->parent && cgrp_netprio_state(cgrp->parent)->prioidx) { + kfree(cs); + return ERR_PTR(-EINVAL); + } + + ret = get_prioidx(&cs->prioidx); + if (ret != 0) { + printk(KERN_WARNING "No space in priority index array\n"); + kfree(cs); + return ERR_PTR(ret); + } + + return &cs->css; +} + +static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + struct cgroup_netprio_state *cs; + struct net_device *dev; + struct netprio_map *map; + + cs = cgrp_netprio_state(cgrp); + rtnl_lock(); + for_each_netdev(&init_net, dev) { + map = rtnl_dereference(dev->priomap); + if (map) + map->priomap[cs->prioidx] = 0; + } + rtnl_unlock(); + put_prioidx(cs->prioidx); + kfree(cs); +} + +static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft) +{ + return (u64)cgrp_netprio_state(cgrp)->prioidx; +} + +static int read_priomap(struct cgroup *cont, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct net_device *dev; + u32 prioidx = cgrp_netprio_state(cont)->prioidx; + u32 priority; + struct netprio_map *map; + + rcu_read_lock(); + for_each_netdev_rcu(&init_net, dev) { + map = rcu_dereference(dev->priomap); + priority = map ? map->priomap[prioidx] : 0; + cb->fill(cb, dev->name, priority); + } + rcu_read_unlock(); + return 0; +} + +static int write_priomap(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) +{ + char *devname = kstrdup(buffer, GFP_KERNEL); + int ret = -EINVAL; + u32 prioidx = cgrp_netprio_state(cgrp)->prioidx; + unsigned long priority; + char *priostr; + struct net_device *dev; + struct netprio_map *map; + + if (!devname) + return -ENOMEM; + + /* + * Minimally sized valid priomap string + */ + if (strlen(devname) < 3) + goto out_free_devname; + + priostr = strstr(devname, " "); + if (!priostr) + goto out_free_devname; + + /* + *Separate the devname from the associated priority + *and advance the priostr poitner to the priority value + */ + *priostr = '\0'; + priostr++; + + /* + * If the priostr points to NULL, we're at the end of the passed + * in string, and its not a valid write + */ + if (*priostr == '\0') + goto out_free_devname; + + ret = kstrtoul(priostr, 10, &priority); + if (ret < 0) + goto out_free_devname; + + ret = -ENODEV; + + dev = dev_get_by_name(&init_net, devname); + if (!dev) + goto out_free_devname; + + update_netdev_tables(); + ret = 0; + rcu_read_lock(); + map = rcu_dereference(dev->priomap); + if (map) + map->priomap[prioidx] = priority; + rcu_read_unlock(); + dev_put(dev); + +out_free_devname: + kfree(devname); + return ret; +} + +static struct cftype ss_files[] = { + { + .name = "prioidx", + .read_u64 = read_prioidx, + }, + { + .name = "ifpriomap", + .read_map = read_priomap, + .write_string = write_priomap, + }, +}; + +static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files)); +} + +static int netprio_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct netprio_map *old; + u32 max_len = atomic_read(&max_prioidx); + + /* + * Note this is called with rtnl_lock held so we have update side + * protection on our rcu assignments + */ + + switch (event) { + + case NETDEV_REGISTER: + if (max_len) + extend_netdev_table(dev, max_len); + break; + case NETDEV_UNREGISTER: + old = rtnl_dereference(dev->priomap); + RCU_INIT_POINTER(dev->priomap, NULL); + if (old) + kfree_rcu(old, rcu); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block netprio_device_notifier = { + .notifier_call = netprio_device_event +}; + +static int __init init_cgroup_netprio(void) +{ + int ret; + + ret = cgroup_load_subsys(&net_prio_subsys); + if (ret) + goto out; +#ifndef CONFIG_NETPRIO_CGROUP + smp_wmb(); + net_prio_subsys_id = net_prio_subsys.subsys_id; +#endif + + register_netdevice_notifier(&netprio_device_notifier); + +out: + return ret; +} + +static void __exit exit_cgroup_netprio(void) +{ + struct netprio_map *old; + struct net_device *dev; + + unregister_netdevice_notifier(&netprio_device_notifier); + + cgroup_unload_subsys(&net_prio_subsys); + +#ifndef CONFIG_NETPRIO_CGROUP + net_prio_subsys_id = -1; + synchronize_rcu(); +#endif + + rtnl_lock(); + for_each_netdev(&init_net, dev) { + old = rtnl_dereference(dev->priomap); + RCU_INIT_POINTER(dev->priomap, NULL); + if (old) + kfree_rcu(old, rcu); + } + rtnl_unlock(); +} + +module_init(init_cgroup_netprio); +module_exit(exit_cgroup_netprio); +MODULE_LICENSE("GPL v2"); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index e35a6fbb811..65f80c7b165 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1304,7 +1304,7 @@ static ssize_t pktgen_if_write(struct file *file, scan_ip6(buf, pkt_dev->in6_daddr.s6_addr); snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_daddr); - ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->in6_daddr); + pkt_dev->cur_in6_daddr = pkt_dev->in6_daddr; if (debug) printk(KERN_DEBUG "pktgen: dst6 set to: %s\n", buf); @@ -1327,8 +1327,7 @@ static ssize_t pktgen_if_write(struct file *file, scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr); snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_daddr); - ipv6_addr_copy(&pkt_dev->cur_in6_daddr, - &pkt_dev->min_in6_daddr); + pkt_dev->cur_in6_daddr = pkt_dev->min_in6_daddr; if (debug) printk(KERN_DEBUG "pktgen: dst6_min set to: %s\n", buf); @@ -1371,7 +1370,7 @@ static ssize_t pktgen_if_write(struct file *file, scan_ip6(buf, pkt_dev->in6_saddr.s6_addr); snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_saddr); - ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &pkt_dev->in6_saddr); + pkt_dev->cur_in6_saddr = pkt_dev->in6_saddr; if (debug) printk(KERN_DEBUG "pktgen: src6 set to: %s\n", buf); @@ -2025,13 +2024,13 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) pr_warning("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq, pkt_dev->odevname); - pkt_dev->queue_map_min = ntxq - 1; + pkt_dev->queue_map_min = (ntxq ?: 1) - 1; } if (pkt_dev->queue_map_max >= ntxq) { pr_warning("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq, pkt_dev->odevname); - pkt_dev->queue_map_max = ntxq - 1; + pkt_dev->queue_map_max = (ntxq ?: 1) - 1; } /* Default to the interface's mac if not explicitly set. */ @@ -2079,9 +2078,7 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) ifp = ifp->if_next) { if (ifp->scope == IFA_LINK && !(ifp->flags & IFA_F_TENTATIVE)) { - ipv6_addr_copy(&pkt_dev-> - cur_in6_saddr, - &ifp->addr); + pkt_dev->cur_in6_saddr = ifp->addr; err = 0; break; } @@ -2145,9 +2142,12 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) } start_time = ktime_now(); - if (remaining < 100000) - ndelay(remaining); /* really small just spin */ - else { + if (remaining < 100000) { + /* for small delays (<100us), just loop until limit is reached */ + do { + end_time = ktime_now(); + } while (ktime_lt(end_time, spin_until)); + } else { /* see do_nanosleep */ hrtimer_init_sleeper(&t, current); do { @@ -2162,8 +2162,8 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) hrtimer_cancel(&t.timer); } while (t.task && pkt_dev->running && !signal_pending(current)); __set_current_state(TASK_RUNNING); + end_time = ktime_now(); } - end_time = ktime_now(); pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time)); pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay); @@ -2602,18 +2602,18 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, if (!pkt_dev->page) break; } - skb_shinfo(skb)->frags[i].page = pkt_dev->page; get_page(pkt_dev->page); + skb_frag_set_page(skb, i, pkt_dev->page); skb_shinfo(skb)->frags[i].page_offset = 0; /*last fragment, fill rest of data*/ if (i == (frags - 1)) - skb_shinfo(skb)->frags[i].size = - (datalen < PAGE_SIZE ? datalen : PAGE_SIZE); + skb_frag_size_set(&skb_shinfo(skb)->frags[i], + (datalen < PAGE_SIZE ? datalen : PAGE_SIZE)); else - skb_shinfo(skb)->frags[i].size = frag_len; - datalen -= skb_shinfo(skb)->frags[i].size; - skb->len += skb_shinfo(skb)->frags[i].size; - skb->data_len += skb_shinfo(skb)->frags[i].size; + skb_frag_size_set(&skb_shinfo(skb)->frags[i], frag_len); + datalen -= skb_frag_size(&skb_shinfo(skb)->frags[i]); + skb->len += skb_frag_size(&skb_shinfo(skb)->frags[i]); + skb->data_len += skb_frag_size(&skb_shinfo(skb)->frags[i]); i++; skb_shinfo(skb)->nr_frags = i; } @@ -2955,8 +2955,8 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, iph->payload_len = htons(sizeof(struct udphdr) + datalen); iph->nexthdr = IPPROTO_UDP; - ipv6_addr_copy(&iph->daddr, &pkt_dev->cur_in6_daddr); - ipv6_addr_copy(&iph->saddr, &pkt_dev->cur_in6_saddr); + iph->daddr = pkt_dev->cur_in6_daddr; + iph->saddr = pkt_dev->cur_in6_saddr; skb->mac_header = (skb->network_header - ETH_HLEN - pkt_dev->pkt_overhead); @@ -3342,7 +3342,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) __netif_tx_lock_bh(txq); - if (unlikely(netif_tx_queue_frozen_or_stopped(txq))) { + if (unlikely(netif_xmit_frozen_or_stopped(txq))) { ret = NETDEV_TX_BUSY; pkt_dev->last_ok = 0; goto unlock; diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 182236b2510..9b570a6a33c 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -26,10 +26,11 @@ * but then some measure against one socket starving all other sockets * would be needed. * - * It was 128 by default. Experiments with real servers show, that + * The minimum value of it is 128. Experiments with real servers show that * it is absolutely not enough even at 100conn/sec. 256 cures most - * of problems. This value is adjusted to 128 for very small machines - * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb). + * of problems. + * This value is adjusted to 128 for low memory machines, + * and it will increase in proportion to the memory of machine. * Note : Dont forget somaxconn that may limit backlog too. */ int sysctl_max_syn_backlog = 256; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d3a62819671..f16444bc6cb 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -273,6 +273,17 @@ EXPORT_SYMBOL_GPL(rtnl_unregister_all); static LIST_HEAD(link_ops); +static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) +{ + const struct rtnl_link_ops *ops; + + list_for_each_entry(ops, &link_ops, list) { + if (!strcmp(ops->kind, kind)) + return ops; + } + return NULL; +} + /** * __rtnl_link_register - Register rtnl_link_ops with rtnetlink. * @ops: struct rtnl_link_ops * to register @@ -285,6 +296,9 @@ static LIST_HEAD(link_ops); */ int __rtnl_link_register(struct rtnl_link_ops *ops) { + if (rtnl_link_ops_get(ops->kind)) + return -EEXIST; + if (!ops->dellink) ops->dellink = unregister_netdevice_queue; @@ -351,17 +365,6 @@ void rtnl_link_unregister(struct rtnl_link_ops *ops) } EXPORT_SYMBOL_GPL(rtnl_link_unregister); -static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) -{ - const struct rtnl_link_ops *ops; - - list_for_each_entry(ops, &link_ops, list) { - if (!strcmp(ops->kind, kind)) - return ops; - } - return NULL; -} - static size_t rtnl_link_get_size(const struct net_device *dev) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; @@ -731,7 +734,8 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev) size += num_vfs * (nla_total_size(sizeof(struct ifla_vf_mac)) + nla_total_size(sizeof(struct ifla_vf_vlan)) + - nla_total_size(sizeof(struct ifla_vf_tx_rate))); + nla_total_size(sizeof(struct ifla_vf_tx_rate)) + + nla_total_size(sizeof(struct ifla_vf_spoofchk))); return size; } else return 0; @@ -954,13 +958,27 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct ifla_vf_mac vf_mac; struct ifla_vf_vlan vf_vlan; struct ifla_vf_tx_rate vf_tx_rate; + struct ifla_vf_spoofchk vf_spoofchk; + + /* + * Not all SR-IOV capable drivers support the + * spoofcheck query. Preset to -1 so the user + * space tool can detect that the driver didn't + * report anything. + */ + ivi.spoofchk = -1; if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi)) break; - vf_mac.vf = vf_vlan.vf = vf_tx_rate.vf = ivi.vf; + vf_mac.vf = + vf_vlan.vf = + vf_tx_rate.vf = + vf_spoofchk.vf = ivi.vf; + memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; vf_tx_rate.rate = ivi.tx_rate; + vf_spoofchk.setting = ivi.spoofchk; vf = nla_nest_start(skb, IFLA_VF_INFO); if (!vf) { nla_nest_cancel(skb, vfinfo); @@ -968,7 +986,10 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, } NLA_PUT(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac); NLA_PUT(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan); - NLA_PUT(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate); + NLA_PUT(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), + &vf_tx_rate); + NLA_PUT(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), + &vf_spoofchk); nla_nest_end(skb, vf); } nla_nest_end(skb, vfinfo); @@ -1202,6 +1223,15 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr) ivt->rate); break; } + case IFLA_VF_SPOOFCHK: { + struct ifla_vf_spoofchk *ivs; + ivs = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_spoofchk) + err = ops->ndo_set_vf_spoofchk(dev, ivs->vf, + ivs->setting); + break; + } default: err = -EINVAL; break; @@ -1604,7 +1634,6 @@ struct net_device *rtnl_create_link(struct net *src_net, struct net *net, dev_net_set(dev, net); dev->rtnl_link_ops = ops; dev->rtnl_link_state = RTNL_LINK_INITIALIZING; - dev->real_num_tx_queues = real_num_queues; if (tb[IFLA_MTU]) dev->mtu = nla_get_u32(tb[IFLA_MTU]); diff --git a/net/core/scm.c b/net/core/scm.c index 811b53fb330..ff52ad0a515 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -173,7 +173,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) if (err) goto error; - if (pid_vnr(p->pid) != p->creds.pid) { + if (!p->pid || pid_vnr(p->pid) != p->creds.pid) { struct pid *pid; err = -ESRCH; pid = find_get_pid(p->creds.pid); @@ -183,8 +183,9 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) p->pid = pid; } - if ((p->cred->euid != p->creds.uid) || - (p->cred->egid != p->creds.gid)) { + if (!p->cred || + (p->cred->euid != p->creds.uid) || + (p->cred->egid != p->creds.gid)) { struct cred *cred; err = -ENOMEM; cred = prepare_creds(); @@ -193,7 +194,8 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) cred->uid = cred->euid = p->creds.uid; cred->gid = cred->egid = p->creds.gid; - put_cred(p->cred); + if (p->cred) + put_cred(p->cred); p->cred = cred; } break; diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 45329d7c9dd..6fd44606fdd 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -19,6 +19,7 @@ static int __init net_secret_init(void) } late_initcall(net_secret_init); +#ifdef CONFIG_INET static u32 seq_scale(u32 seq) { /* @@ -33,9 +34,10 @@ static u32 seq_scale(u32 seq) */ return seq + (ktime_to_ns(ktime_get_real()) >> 6); } +#endif -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -__u32 secure_tcpv6_sequence_number(__be32 *saddr, __be32 *daddr, +#if IS_ENABLED(CONFIG_IPV6) +__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, __be16 sport, __be16 dport) { u32 secret[MD5_MESSAGE_BYTES / 4]; @@ -132,7 +134,7 @@ u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); #endif -#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) +#if IS_ENABLED(CONFIG_IP_DCCP) u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport) { @@ -154,7 +156,7 @@ u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, } EXPORT_SYMBOL(secure_dccp_sequence_number); -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if IS_ENABLED(CONFIG_IPV6) u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, __be16 sport, __be16 dport) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 387703f56fc..da0c97f2fab 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -184,11 +184,21 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, goto out; prefetchw(skb); + /* We do our best to align skb_shared_info on a separate cache + * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives + * aligned memory blocks, unless SLUB/SLAB debug is enabled. + * Both skb->head and skb_shared_info are cache line aligned. + */ size = SKB_DATA_ALIGN(size); - data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info), - gfp_mask, node); + size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + data = kmalloc_node_track_caller(size, gfp_mask, node); if (!data) goto nodata; + /* kmalloc(size) might give us more room than requested. + * Put skb_shared_info exactly at the end of allocated zone, + * to allow max possible filling before reallocation. + */ + size = SKB_WITH_OVERHEAD(ksize(data)); prefetchw(data + size); /* @@ -197,7 +207,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, * the tail pointer in struct sk_buff! */ memset(skb, 0, offsetof(struct sk_buff, tail)); - skb->truesize = size + sizeof(struct sk_buff); + /* Account for allocated memory : skb + skb->head */ + skb->truesize = SKB_TRUESIZE(size); atomic_set(&skb->users, 1); skb->head = data; skb->data = data; @@ -234,6 +245,55 @@ nodata: EXPORT_SYMBOL(__alloc_skb); /** + * build_skb - build a network buffer + * @data: data buffer provided by caller + * + * Allocate a new &sk_buff. Caller provides space holding head and + * skb_shared_info. @data must have been allocated by kmalloc() + * The return is the new skb buffer. + * On a failure the return is %NULL, and @data is not freed. + * Notes : + * Before IO, driver allocates only data buffer where NIC put incoming frame + * Driver should add room at head (NET_SKB_PAD) and + * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)) + * After IO, driver calls build_skb(), to allocate sk_buff and populate it + * before giving packet to stack. + * RX rings only contains data buffers, not full skbs. + */ +struct sk_buff *build_skb(void *data) +{ + struct skb_shared_info *shinfo; + struct sk_buff *skb; + unsigned int size; + + skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); + if (!skb) + return NULL; + + size = ksize(data) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + memset(skb, 0, offsetof(struct sk_buff, tail)); + skb->truesize = SKB_TRUESIZE(size); + atomic_set(&skb->users, 1); + skb->head = data; + skb->data = data; + skb_reset_tail_pointer(skb); + skb->end = skb->tail + size; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->mac_header = ~0U; +#endif + + /* make sure we initialize shinfo sequentially */ + shinfo = skb_shinfo(skb); + memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); + atomic_set(&shinfo->dataref, 1); + kmemcheck_annotate_variable(shinfo->destructor_arg); + + return skb; +} +EXPORT_SYMBOL(build_skb); + +/** * __netdev_alloc_skb - allocate an skbuff for rx on a specific device * @dev: network device to receive on * @length: length to allocate @@ -326,7 +386,7 @@ static void skb_release_data(struct sk_buff *skb) if (skb_shinfo(skb)->nr_frags) { int i; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - put_page(skb_shinfo(skb)->frags[i].page); + skb_frag_unref(skb, i); } /* @@ -392,7 +452,7 @@ static void skb_release_head_state(struct sk_buff *skb) WARN_ON(in_irq()); skb->destructor(skb); } -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_conntrack_put(skb->nfct); #endif #ifdef NET_SKBUFF_NF_DEFRAG_NEEDED @@ -475,6 +535,30 @@ void consume_skb(struct sk_buff *skb) EXPORT_SYMBOL(consume_skb); /** + * skb_recycle - clean up an skb for reuse + * @skb: buffer + * + * Recycles the skb to be reused as a receive buffer. This + * function does any necessary reference count dropping, and + * cleans up the skbuff as if it just came from __alloc_skb(). + */ +void skb_recycle(struct sk_buff *skb) +{ + struct skb_shared_info *shinfo; + + skb_release_head_state(skb); + + shinfo = skb_shinfo(skb); + memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); + atomic_set(&shinfo->dataref, 1); + + memset(skb, 0, offsetof(struct sk_buff, tail)); + skb->data = skb->head + NET_SKB_PAD; + skb_reset_tail_pointer(skb); +} +EXPORT_SYMBOL(skb_recycle); + +/** * skb_recycle_check - check if skb can be reused for receive * @skb: buffer * @skb_size: minimum receive buffer size @@ -488,33 +572,10 @@ EXPORT_SYMBOL(consume_skb); */ bool skb_recycle_check(struct sk_buff *skb, int skb_size) { - struct skb_shared_info *shinfo; - - if (irqs_disabled()) + if (!skb_is_recycleable(skb, skb_size)) return false; - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) - return false; - - if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE) - return false; - - skb_size = SKB_DATA_ALIGN(skb_size + NET_SKB_PAD); - if (skb_end_pointer(skb) - skb->head < skb_size) - return false; - - if (skb_shared(skb) || skb_cloned(skb)) - return false; - - skb_release_head_state(skb); - - shinfo = skb_shinfo(skb); - memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); - atomic_set(&shinfo->dataref, 1); - - memset(skb, 0, offsetof(struct sk_buff, tail)); - skb->data = skb->head + NET_SKB_PAD; - skb_reset_tail_pointer(skb); + skb_recycle(skb); return true; } @@ -529,6 +590,8 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->mac_header = old->mac_header; skb_dst_copy(new, old); new->rxhash = old->rxhash; + new->ooo_okay = old->ooo_okay; + new->l4_rxhash = old->l4_rxhash; #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); #endif @@ -539,15 +602,14 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->ip_summed = old->ip_summed; skb_copy_queue_mapping(new, old); new->priority = old->priority; -#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) +#if IS_ENABLED(CONFIG_IP_VS) new->ipvs_property = old->ipvs_property; #endif new->protocol = old->protocol; new->mark = old->mark; new->skb_iif = old->skb_iif; __nf_copy(new, old); -#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ - defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) new->nf_trace = old->nf_trace; #endif #ifdef CONFIG_NET_SCHED @@ -647,7 +709,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) } vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]); memcpy(page_address(page), - vaddr + f->page_offset, f->size); + vaddr + f->page_offset, skb_frag_size(f)); kunmap_skb_frag(vaddr); page->private = (unsigned long)head; head = page; @@ -655,14 +717,14 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) /* skb frags release userspace buffers */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - put_page(skb_shinfo(skb)->frags[i].page); + skb_frag_unref(skb, i); uarg->callback(uarg); /* skb frags point to kernel buffers */ for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) { - skb_shinfo(skb)->frags[i - 1].page_offset = 0; - skb_shinfo(skb)->frags[i - 1].page = head; + __skb_fill_page_desc(skb, i-1, head, 0, + skb_shinfo(skb)->frags[i - 1].size); head = (struct page *)head->private; } @@ -777,8 +839,9 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) EXPORT_SYMBOL(skb_copy); /** - * pskb_copy - create copy of an sk_buff with private head. + * __pskb_copy - create copy of an sk_buff with private head. * @skb: buffer to copy + * @headroom: headroom of new skb * @gfp_mask: allocation priority * * Make a copy of both an &sk_buff and part of its data, located @@ -789,16 +852,16 @@ EXPORT_SYMBOL(skb_copy); * The returned buffer has a reference count of 1. */ -struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) +struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) { - unsigned int size = skb_end_pointer(skb) - skb->head; + unsigned int size = skb_headlen(skb) + headroom; struct sk_buff *n = alloc_skb(size, gfp_mask); if (!n) goto out; /* Set the data pointer */ - skb_reserve(n, skb_headroom(skb)); + skb_reserve(n, headroom); /* Set the tail pointer and length */ skb_put(n, skb_headlen(skb)); /* Copy the bytes */ @@ -820,7 +883,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; - get_page(skb_shinfo(n)->frags[i].page); + skb_frag_ref(skb, i); } skb_shinfo(n)->nr_frags = i; } @@ -834,7 +897,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) out: return n; } -EXPORT_SYMBOL(pskb_copy); +EXPORT_SYMBOL(__pskb_copy); /** * pskb_expand_head - reallocate header of &sk_buff @@ -911,7 +974,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, goto nofrags; } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - get_page(skb_shinfo(skb)->frags[i].page); + skb_frag_ref(skb, i); if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); @@ -1178,20 +1241,20 @@ int ___pskb_trim(struct sk_buff *skb, unsigned int len) goto drop_pages; for (; i < nfrags; i++) { - int end = offset + skb_shinfo(skb)->frags[i].size; + int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]); if (end < len) { offset = end; continue; } - skb_shinfo(skb)->frags[i++].size = len - offset; + skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset); drop_pages: skb_shinfo(skb)->nr_frags = i; for (; i < nfrags; i++) - put_page(skb_shinfo(skb)->frags[i].page); + skb_frag_unref(skb, i); if (skb_has_frag_list(skb)) skb_drop_fraglist(skb); @@ -1294,9 +1357,11 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) /* Estimate size of pulled pages. */ eat = delta; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - if (skb_shinfo(skb)->frags[i].size >= eat) + int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); + + if (size >= eat) goto pull_pages; - eat -= skb_shinfo(skb)->frags[i].size; + eat -= size; } /* If we need update frag list, we are in troubles. @@ -1359,14 +1424,16 @@ pull_pages: eat = delta; k = 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - if (skb_shinfo(skb)->frags[i].size <= eat) { - put_page(skb_shinfo(skb)->frags[i].page); - eat -= skb_shinfo(skb)->frags[i].size; + int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); + + if (size <= eat) { + skb_frag_unref(skb, i); + eat -= size; } else { skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; if (eat) { skb_shinfo(skb)->frags[k].page_offset += eat; - skb_shinfo(skb)->frags[k].size -= eat; + skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat); eat = 0; } k++; @@ -1421,7 +1488,7 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) WARN_ON(start > offset + len); - end = start + skb_shinfo(skb)->frags[i].size; + end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); if ((copy = end - offset) > 0) { u8 *vaddr; @@ -1619,7 +1686,8 @@ static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; - if (__splice_segment(f->page, f->page_offset, f->size, + if (__splice_segment(skb_frag_page(f), + f->page_offset, skb_frag_size(f), offset, len, skb, spd, 0, sk, pipe)) return 1; } @@ -1729,7 +1797,7 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) WARN_ON(start > offset + len); - end = start + frag->size; + end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { u8 *vaddr; @@ -1802,7 +1870,7 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, WARN_ON(start > offset + len); - end = start + skb_shinfo(skb)->frags[i].size; + end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); if ((copy = end - offset) > 0) { __wsum csum2; u8 *vaddr; @@ -1877,7 +1945,7 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, WARN_ON(start > offset + len); - end = start + skb_shinfo(skb)->frags[i].size; + end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); if ((copy = end - offset) > 0) { __wsum csum2; u8 *vaddr; @@ -2150,7 +2218,7 @@ static inline void skb_split_no_header(struct sk_buff *skb, skb->data_len = len - pos; for (i = 0; i < nfrags; i++) { - int size = skb_shinfo(skb)->frags[i].size; + int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); if (pos + size > len) { skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; @@ -2164,10 +2232,10 @@ static inline void skb_split_no_header(struct sk_buff *skb, * where splitting is expensive. * 2. Split is accurately. We make this. */ - get_page(skb_shinfo(skb)->frags[i].page); + skb_frag_ref(skb, i); skb_shinfo(skb1)->frags[0].page_offset += len - pos; - skb_shinfo(skb1)->frags[0].size -= len - pos; - skb_shinfo(skb)->frags[i].size = len - pos; + skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); + skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); skb_shinfo(skb)->nr_frags++; } k++; @@ -2211,7 +2279,7 @@ static int skb_prepare_for_shift(struct sk_buff *skb) * @shiftlen: shift up to this many bytes * * Attempts to shift up to shiftlen worth of bytes, which may be less than - * the length of the skb, from tgt to skb. Returns number bytes shifted. + * the length of the skb, from skb to tgt. Returns number bytes shifted. * It's up to caller to free skb if everything was shifted. * * If @tgt runs out of frags, the whole operation is aborted. @@ -2239,12 +2307,13 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) * commit all, so that we don't have to undo partial changes */ if (!to || - !skb_can_coalesce(tgt, to, fragfrom->page, fragfrom->page_offset)) { + !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), + fragfrom->page_offset)) { merge = -1; } else { merge = to - 1; - todo -= fragfrom->size; + todo -= skb_frag_size(fragfrom); if (todo < 0) { if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) @@ -2254,8 +2323,8 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) fragfrom = &skb_shinfo(skb)->frags[from]; fragto = &skb_shinfo(tgt)->frags[merge]; - fragto->size += shiftlen; - fragfrom->size -= shiftlen; + skb_frag_size_add(fragto, shiftlen); + skb_frag_size_sub(fragfrom, shiftlen); fragfrom->page_offset += shiftlen; goto onlymerged; @@ -2279,20 +2348,20 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) fragfrom = &skb_shinfo(skb)->frags[from]; fragto = &skb_shinfo(tgt)->frags[to]; - if (todo >= fragfrom->size) { + if (todo >= skb_frag_size(fragfrom)) { *fragto = *fragfrom; - todo -= fragfrom->size; + todo -= skb_frag_size(fragfrom); from++; to++; } else { - get_page(fragfrom->page); + __skb_frag_ref(fragfrom); fragto->page = fragfrom->page; fragto->page_offset = fragfrom->page_offset; - fragto->size = todo; + skb_frag_size_set(fragto, todo); fragfrom->page_offset += todo; - fragfrom->size -= todo; + skb_frag_size_sub(fragfrom, todo); todo = 0; to++; @@ -2307,8 +2376,8 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) fragfrom = &skb_shinfo(skb)->frags[0]; fragto = &skb_shinfo(tgt)->frags[merge]; - fragto->size += fragfrom->size; - put_page(fragfrom->page); + skb_frag_size_add(fragto, skb_frag_size(fragfrom)); + __skb_frag_unref(fragfrom); } /* Reposition in the original skb */ @@ -2405,7 +2474,7 @@ next_skb: while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; - block_limit = frag->size + st->stepped_offset; + block_limit = skb_frag_size(frag) + st->stepped_offset; if (abs_offset < block_limit) { if (!st->frag_data) @@ -2423,7 +2492,7 @@ next_skb: } st->frag_idx++; - st->stepped_offset += frag->size; + st->stepped_offset += skb_frag_size(frag); } if (st->frag_data) { @@ -2553,14 +2622,13 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, left = PAGE_SIZE - frag->page_offset; copy = (length > left)? left : length; - ret = getfrag(from, (page_address(frag->page) + - frag->page_offset + frag->size), + ret = getfrag(from, skb_frag_address(frag) + skb_frag_size(frag), offset, copy, 0, skb); if (ret < 0) return -EFAULT; /* copy was successful so update the size parameters */ - frag->size += copy; + skb_frag_size_add(frag, copy); skb->len += copy; skb->data_len += copy; offset += copy; @@ -2602,7 +2670,7 @@ EXPORT_SYMBOL_GPL(skb_pull_rcsum); * a pointer to the first in a list of new skbs for the segments. * In case of error it returns ERR_PTR(err). */ -struct sk_buff *skb_segment(struct sk_buff *skb, u32 features) +struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = NULL; struct sk_buff *tail = NULL; @@ -2706,12 +2774,12 @@ struct sk_buff *skb_segment(struct sk_buff *skb, u32 features) while (pos < offset + len && i < nfrags) { *frag = skb_shinfo(skb)->frags[i]; - get_page(frag->page); - size = frag->size; + __skb_frag_ref(frag); + size = skb_frag_size(frag); if (pos < offset) { frag->page_offset += offset - pos; - frag->size -= offset - pos; + skb_frag_size_sub(frag, offset - pos); } skb_shinfo(nskb)->nr_frags++; @@ -2720,7 +2788,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, u32 features) i++; pos += size; } else { - frag->size -= pos + size - (offset + len); + skb_frag_size_sub(frag, pos + size - (offset + len)); goto skip_fraglist; } @@ -2800,7 +2868,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) } while (--i); frag->page_offset += offset; - frag->size -= offset; + skb_frag_size_sub(frag, offset); skb->truesize -= skb->data_len; skb->len -= skb->data_len; @@ -2852,7 +2920,7 @@ merge: unsigned int eat = offset - headlen; skbinfo->frags[0].page_offset += eat; - skbinfo->frags[0].size -= eat; + skb_frag_size_sub(&skbinfo->frags[0], eat); skb->data_len -= eat; skb->len -= eat; offset = headlen; @@ -2923,13 +2991,13 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) WARN_ON(start > offset + len); - end = start + skb_shinfo(skb)->frags[i].size; + end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); if ((copy = end - offset) > 0) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; if (copy > len) copy = len; - sg_set_page(&sg[elt], frag->page, copy, + sg_set_page(&sg[elt], skb_frag_page(frag), copy, frag->page_offset+offset-start); elt++; if (!(len -= copy)) @@ -3150,6 +3218,26 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, } EXPORT_SYMBOL_GPL(skb_tstamp_tx); +void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) +{ + struct sock *sk = skb->sk; + struct sock_exterr_skb *serr; + int err; + + skb->wifi_acked_valid = 1; + skb->wifi_acked = acked; + + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; + + err = sock_queue_err_skb(sk, skb); + if (err) + kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); + /** * skb_partial_csum_set - set up and verify partial csum values for packet diff --git a/net/core/sock.c b/net/core/sock.c index bc745d00ea4..5c5af9988f9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -111,6 +111,8 @@ #include <linux/init.h> #include <linux/highmem.h> #include <linux/user_namespace.h> +#include <linux/jump_label.h> +#include <linux/memcontrol.h> #include <asm/uaccess.h> #include <asm/system.h> @@ -125,6 +127,7 @@ #include <net/xfrm.h> #include <linux/ipsec.h> #include <net/cls_cgroup.h> +#include <net/netprio_cgroup.h> #include <linux/filter.h> @@ -134,6 +137,46 @@ #include <net/tcp.h> #endif +static DEFINE_MUTEX(proto_list_mutex); +static LIST_HEAD(proto_list); + +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss) +{ + struct proto *proto; + int ret = 0; + + mutex_lock(&proto_list_mutex); + list_for_each_entry(proto, &proto_list, node) { + if (proto->init_cgroup) { + ret = proto->init_cgroup(cgrp, ss); + if (ret) + goto out; + } + } + + mutex_unlock(&proto_list_mutex); + return ret; +out: + list_for_each_entry_continue_reverse(proto, &proto_list, node) + if (proto->destroy_cgroup) + proto->destroy_cgroup(cgrp, ss); + mutex_unlock(&proto_list_mutex); + return ret; +} + +void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss) +{ + struct proto *proto; + + mutex_lock(&proto_list_mutex); + list_for_each_entry_reverse(proto, &proto_list, node) + if (proto->destroy_cgroup) + proto->destroy_cgroup(cgrp, ss); + mutex_unlock(&proto_list_mutex); +} +#endif + /* * Each address family might have different locking rules, so we have * one slock key per address family: @@ -141,6 +184,9 @@ static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; +struct jump_label_key memcg_socket_limit_enabled; +EXPORT_SYMBOL(memcg_socket_limit_enabled); + /* * Make lock validator output more readable. (we pre-construct these * strings build-time, so that runtime initialization of socket @@ -207,7 +253,7 @@ static struct lock_class_key af_callback_keys[AF_MAX]; * not depend upon such differences. */ #define _SK_MEM_PACKETS 256 -#define _SK_MEM_OVERHEAD (sizeof(struct sk_buff) + 256) +#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) #define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) #define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) @@ -221,10 +267,16 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); EXPORT_SYMBOL(sysctl_optmem_max); -#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP) +#if defined(CONFIG_CGROUPS) +#if !defined(CONFIG_NET_CLS_CGROUP) int net_cls_subsys_id = -1; EXPORT_SYMBOL_GPL(net_cls_subsys_id); #endif +#if !defined(CONFIG_NETPRIO_CGROUP) +int net_prio_subsys_id = -1; +EXPORT_SYMBOL_GPL(net_prio_subsys_id); +#endif +#endif static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) { @@ -269,14 +321,14 @@ static void sock_warn_obsolete_bsdism(const char *name) } } -static void sock_disable_timestamp(struct sock *sk, int flag) +#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) + +static void sock_disable_timestamp(struct sock *sk, unsigned long flags) { - if (sock_flag(sk, flag)) { - sock_reset_flag(sk, flag); - if (!sock_flag(sk, SOCK_TIMESTAMP) && - !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) { + if (sk->sk_flags & flags) { + sk->sk_flags &= ~flags; + if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP)) net_disable_timestamp(); - } } } @@ -288,11 +340,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; - /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces - number of warnings when compiling with -W --ANK - */ - if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= - (unsigned)sk->sk_rcvbuf) { + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { atomic_inc(&sk->sk_drops); trace_sock_rcvqueue_full(sk, skb); return -ENOMEM; @@ -387,7 +435,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { sk_tx_queue_clear(sk); - rcu_assign_pointer(sk->sk_dst_cache, NULL); + RCU_INIT_POINTER(sk->sk_dst_cache, NULL); dst_release(dst); return NULL; } @@ -682,7 +730,7 @@ set_rcvbuf: SOCK_TIMESTAMPING_RX_SOFTWARE); else sock_disable_timestamp(sk, - SOCK_TIMESTAMPING_RX_SOFTWARE); + (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE, val & SOF_TIMESTAMPING_SOFTWARE); sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE, @@ -738,11 +786,13 @@ set_rcvbuf: /* We implement the SO_SNDLOWAT etc to not be settable (1003.1g 5.3) */ case SO_RXQ_OVFL: - if (valbool) - sock_set_flag(sk, SOCK_RXQ_OVFL); - else - sock_reset_flag(sk, SOCK_RXQ_OVFL); + sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); break; + + case SO_WIFI_STATUS: + sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); + break; + default: ret = -ENOPROTOOPT; break; @@ -964,6 +1014,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = !!sock_flag(sk, SOCK_RXQ_OVFL); break; + case SO_WIFI_STATUS: + v.val = !!sock_flag(sk, SOCK_WIFI_STATUS); + break; + default: return -ENOPROTOOPT; } @@ -1114,6 +1168,18 @@ void sock_update_classid(struct sock *sk) sk->sk_classid = classid; } EXPORT_SYMBOL(sock_update_classid); + +void sock_update_netprioidx(struct sock *sk) +{ + struct cgroup_netprio_state *state; + if (in_interrupt()) + return; + rcu_read_lock(); + state = task_netprio_state(current); + sk->sk_cgrp_prioidx = state ? state->prioidx : 0; + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(sock_update_netprioidx); #endif /** @@ -1141,6 +1207,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, atomic_set(&sk->sk_wmem_alloc, 1); sock_update_classid(sk); + sock_update_netprioidx(sk); } return sk; @@ -1158,11 +1225,10 @@ static void __sk_free(struct sock *sk) atomic_read(&sk->sk_wmem_alloc) == 0); if (filter) { sk_filter_uncharge(sk, filter); - rcu_assign_pointer(sk->sk_filter, NULL); + RCU_INIT_POINTER(sk->sk_filter, NULL); } - sock_disable_timestamp(sk, SOCK_TIMESTAMP); - sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); + sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); if (atomic_read(&sk->sk_omem_alloc)) printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n", @@ -1207,7 +1273,20 @@ void sk_release_kernel(struct sock *sk) } EXPORT_SYMBOL(sk_release_kernel); -struct sock *sk_clone(const struct sock *sk, const gfp_t priority) +static void sk_update_clone(const struct sock *sk, struct sock *newsk) +{ + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) + sock_update_memcg(newsk); +} + +/** + * sk_clone_lock - clone a socket, and lock its clone + * @sk: the socket to clone + * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * + * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) + */ +struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) { struct sock *newsk; @@ -1260,6 +1339,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) /* It is still raw copy of parent, so invalidate * destructor and make plain sk_free() */ newsk->sk_destruct = NULL; + bh_unlock_sock(newsk); sk_free(newsk); newsk = NULL; goto out; @@ -1289,17 +1369,18 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) sk_set_socket(newsk, NULL); newsk->sk_wq = NULL; + sk_update_clone(sk, newsk); + if (newsk->sk_prot->sockets_allocated) - percpu_counter_inc(newsk->sk_prot->sockets_allocated); + sk_sockets_allocated_inc(newsk); - if (sock_flag(newsk, SOCK_TIMESTAMP) || - sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE)) + if (newsk->sk_flags & SK_FLAGS_TIMESTAMP) net_enable_timestamp(); } out: return newsk; } -EXPORT_SYMBOL_GPL(sk_clone); +EXPORT_SYMBOL_GPL(sk_clone_lock); void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { @@ -1533,7 +1614,6 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, skb_shinfo(skb)->nr_frags = npages; for (i = 0; i < npages; i++) { struct page *page; - skb_frag_t *frag; page = alloc_pages(sk->sk_allocation, 0); if (!page) { @@ -1543,12 +1623,11 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, goto failure; } - frag = &skb_shinfo(skb)->frags[i]; - frag->page = page; - frag->page_offset = 0; - frag->size = (data_len >= PAGE_SIZE ? - PAGE_SIZE : - data_len); + __skb_fill_page_desc(skb, i, + page, 0, + (data_len >= PAGE_SIZE ? + PAGE_SIZE : + data_len)); data_len -= PAGE_SIZE; } @@ -1681,30 +1760,34 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) struct proto *prot = sk->sk_prot; int amt = sk_mem_pages(size); long allocated; + int parent_status = UNDER_LIMIT; sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; - allocated = atomic_long_add_return(amt, prot->memory_allocated); + + allocated = sk_memory_allocated_add(sk, amt, &parent_status); /* Under limit. */ - if (allocated <= prot->sysctl_mem[0]) { - if (prot->memory_pressure && *prot->memory_pressure) - *prot->memory_pressure = 0; + if (parent_status == UNDER_LIMIT && + allocated <= sk_prot_mem_limits(sk, 0)) { + sk_leave_memory_pressure(sk); return 1; } - /* Under pressure. */ - if (allocated > prot->sysctl_mem[1]) - if (prot->enter_memory_pressure) - prot->enter_memory_pressure(sk); + /* Under pressure. (we or our parents) */ + if ((parent_status > SOFT_LIMIT) || + allocated > sk_prot_mem_limits(sk, 1)) + sk_enter_memory_pressure(sk); - /* Over hard limit. */ - if (allocated > prot->sysctl_mem[2]) + /* Over hard limit (we or our parents) */ + if ((parent_status == OVER_LIMIT) || + (allocated > sk_prot_mem_limits(sk, 2))) goto suppress_allocation; /* guarantee minimum buffer size under pressure */ if (kind == SK_MEM_RECV) { if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0]) return 1; + } else { /* SK_MEM_SEND */ if (sk->sk_type == SOCK_STREAM) { if (sk->sk_wmem_queued < prot->sysctl_wmem[0]) @@ -1714,13 +1797,13 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) return 1; } - if (prot->memory_pressure) { + if (sk_has_memory_pressure(sk)) { int alloc; - if (!*prot->memory_pressure) + if (!sk_under_memory_pressure(sk)) return 1; - alloc = percpu_counter_read_positive(prot->sockets_allocated); - if (prot->sysctl_mem[2] > alloc * + alloc = sk_sockets_allocated_read_positive(sk); + if (sk_prot_mem_limits(sk, 2) > alloc * sk_mem_pages(sk->sk_wmem_queued + atomic_read(&sk->sk_rmem_alloc) + sk->sk_forward_alloc)) @@ -1743,7 +1826,9 @@ suppress_allocation: /* Alas. Undo changes. */ sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; - atomic_long_sub(amt, prot->memory_allocated); + + sk_memory_allocated_sub(sk, amt, parent_status); + return 0; } EXPORT_SYMBOL(__sk_mem_schedule); @@ -1754,15 +1839,13 @@ EXPORT_SYMBOL(__sk_mem_schedule); */ void __sk_mem_reclaim(struct sock *sk) { - struct proto *prot = sk->sk_prot; - - atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, - prot->memory_allocated); + sk_memory_allocated_sub(sk, + sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, 0); sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; - if (prot->memory_pressure && *prot->memory_pressure && - (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0])) - *prot->memory_pressure = 0; + if (sk_under_memory_pressure(sk) && + (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) + sk_leave_memory_pressure(sk); } EXPORT_SYMBOL(__sk_mem_reclaim); @@ -2133,16 +2216,15 @@ EXPORT_SYMBOL(sock_get_timestampns); void sock_enable_timestamp(struct sock *sk, int flag) { if (!sock_flag(sk, flag)) { + unsigned long previous_flags = sk->sk_flags; + sock_set_flag(sk, flag); /* * we just set one of the two flags which require net * time stamping, but time stamping might have been on * already because of the other one */ - if (!sock_flag(sk, - flag == SOCK_TIMESTAMP ? - SOCK_TIMESTAMPING_RX_SOFTWARE : - SOCK_TIMESTAMP)) + if (!(previous_flags & SK_FLAGS_TIMESTAMP)) net_enable_timestamp(); } } @@ -2254,9 +2336,6 @@ void sk_common_release(struct sock *sk) } EXPORT_SYMBOL(sk_common_release); -static DEFINE_RWLOCK(proto_list_lock); -static LIST_HEAD(proto_list); - #ifdef CONFIG_PROC_FS #define PROTO_INUSE_NR 64 /* should be enough for the first time */ struct prot_inuse { @@ -2405,10 +2484,10 @@ int proto_register(struct proto *prot, int alloc_slab) } } - write_lock(&proto_list_lock); + mutex_lock(&proto_list_mutex); list_add(&prot->node, &proto_list); assign_proto_idx(prot); - write_unlock(&proto_list_lock); + mutex_unlock(&proto_list_mutex); return 0; out_free_timewait_sock_slab_name: @@ -2431,10 +2510,10 @@ EXPORT_SYMBOL(proto_register); void proto_unregister(struct proto *prot) { - write_lock(&proto_list_lock); + mutex_lock(&proto_list_mutex); release_proto_idx(prot); list_del(&prot->node); - write_unlock(&proto_list_lock); + mutex_unlock(&proto_list_mutex); if (prot->slab != NULL) { kmem_cache_destroy(prot->slab); @@ -2457,9 +2536,9 @@ EXPORT_SYMBOL(proto_unregister); #ifdef CONFIG_PROC_FS static void *proto_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(proto_list_lock) + __acquires(proto_list_mutex) { - read_lock(&proto_list_lock); + mutex_lock(&proto_list_mutex); return seq_list_start_head(&proto_list, *pos); } @@ -2469,25 +2548,36 @@ static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void proto_seq_stop(struct seq_file *seq, void *v) - __releases(proto_list_lock) + __releases(proto_list_mutex) { - read_unlock(&proto_list_lock); + mutex_unlock(&proto_list_mutex); } static char proto_method_implemented(const void *method) { return method == NULL ? 'n' : 'y'; } +static long sock_prot_memory_allocated(struct proto *proto) +{ + return proto->memory_allocated != NULL ? proto_memory_allocated(proto): -1L; +} + +static char *sock_prot_memory_pressure(struct proto *proto) +{ + return proto->memory_pressure != NULL ? + proto_memory_pressure(proto) ? "yes" : "no" : "NI"; +} static void proto_seq_printf(struct seq_file *seq, struct proto *proto) { + seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", proto->name, proto->obj_size, sock_prot_inuse_get(seq_file_net(seq), proto), - proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L, - proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI", + sock_prot_memory_allocated(proto), + sock_prot_memory_pressure(proto), proto->max_header, proto->slab == NULL ? "no" : "yes", module_name(proto->owner), diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c new file mode 100644 index 00000000000..b9868e1fd62 --- /dev/null +++ b/net/core/sock_diag.c @@ -0,0 +1,192 @@ +#include <linux/mutex.h> +#include <linux/socket.h> +#include <linux/skbuff.h> +#include <net/netlink.h> +#include <net/net_namespace.h> +#include <linux/module.h> +#include <linux/rtnetlink.h> +#include <net/sock.h> + +#include <linux/inet_diag.h> +#include <linux/sock_diag.h> + +static struct sock_diag_handler *sock_diag_handlers[AF_MAX]; +static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); +static DEFINE_MUTEX(sock_diag_table_mutex); + +int sock_diag_check_cookie(void *sk, __u32 *cookie) +{ + if ((cookie[0] != INET_DIAG_NOCOOKIE || + cookie[1] != INET_DIAG_NOCOOKIE) && + ((u32)(unsigned long)sk != cookie[0] || + (u32)((((unsigned long)sk) >> 31) >> 1) != cookie[1])) + return -ESTALE; + else + return 0; +} +EXPORT_SYMBOL_GPL(sock_diag_check_cookie); + +void sock_diag_save_cookie(void *sk, __u32 *cookie) +{ + cookie[0] = (u32)(unsigned long)sk; + cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); +} +EXPORT_SYMBOL_GPL(sock_diag_save_cookie); + +int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype) +{ + __u32 *mem; + + mem = RTA_DATA(__RTA_PUT(skb, attrtype, SK_MEMINFO_VARS * sizeof(__u32))); + + mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); + mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; + mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); + mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; + mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; + mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; + mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); + + return 0; + +rtattr_failure: + return -EMSGSIZE; +} +EXPORT_SYMBOL_GPL(sock_diag_put_meminfo); + +void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) +{ + mutex_lock(&sock_diag_table_mutex); + inet_rcv_compat = fn; + mutex_unlock(&sock_diag_table_mutex); +} +EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat); + +void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) +{ + mutex_lock(&sock_diag_table_mutex); + inet_rcv_compat = NULL; + mutex_unlock(&sock_diag_table_mutex); +} +EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat); + +int sock_diag_register(struct sock_diag_handler *hndl) +{ + int err = 0; + + if (hndl->family >= AF_MAX) + return -EINVAL; + + mutex_lock(&sock_diag_table_mutex); + if (sock_diag_handlers[hndl->family]) + err = -EBUSY; + else + sock_diag_handlers[hndl->family] = hndl; + mutex_unlock(&sock_diag_table_mutex); + + return err; +} +EXPORT_SYMBOL_GPL(sock_diag_register); + +void sock_diag_unregister(struct sock_diag_handler *hnld) +{ + int family = hnld->family; + + if (family >= AF_MAX) + return; + + mutex_lock(&sock_diag_table_mutex); + BUG_ON(sock_diag_handlers[family] != hnld); + sock_diag_handlers[family] = NULL; + mutex_unlock(&sock_diag_table_mutex); +} +EXPORT_SYMBOL_GPL(sock_diag_unregister); + +static inline struct sock_diag_handler *sock_diag_lock_handler(int family) +{ + if (sock_diag_handlers[family] == NULL) + request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, family); + + mutex_lock(&sock_diag_table_mutex); + return sock_diag_handlers[family]; +} + +static inline void sock_diag_unlock_handler(struct sock_diag_handler *h) +{ + mutex_unlock(&sock_diag_table_mutex); +} + +static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + int err; + struct sock_diag_req *req = NLMSG_DATA(nlh); + struct sock_diag_handler *hndl; + + if (nlmsg_len(nlh) < sizeof(*req)) + return -EINVAL; + + hndl = sock_diag_lock_handler(req->sdiag_family); + if (hndl == NULL) + err = -ENOENT; + else + err = hndl->dump(skb, nlh); + sock_diag_unlock_handler(hndl); + + return err; +} + +static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + int ret; + + switch (nlh->nlmsg_type) { + case TCPDIAG_GETSOCK: + case DCCPDIAG_GETSOCK: + if (inet_rcv_compat == NULL) + request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, AF_INET); + + mutex_lock(&sock_diag_table_mutex); + if (inet_rcv_compat != NULL) + ret = inet_rcv_compat(skb, nlh); + else + ret = -EOPNOTSUPP; + mutex_unlock(&sock_diag_table_mutex); + + return ret; + case SOCK_DIAG_BY_FAMILY: + return __sock_diag_rcv_msg(skb, nlh); + default: + return -EINVAL; + } +} + +static DEFINE_MUTEX(sock_diag_mutex); + +static void sock_diag_rcv(struct sk_buff *skb) +{ + mutex_lock(&sock_diag_mutex); + netlink_rcv_skb(skb, &sock_diag_rcv_msg); + mutex_unlock(&sock_diag_mutex); +} + +struct sock *sock_diag_nlsk; +EXPORT_SYMBOL_GPL(sock_diag_nlsk); + +static int __init sock_diag_init(void) +{ + sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG, 0, + sock_diag_rcv, NULL, THIS_MODULE); + return sock_diag_nlsk == NULL ? -ENOMEM : 0; +} + +static void __exit sock_diag_exit(void) +{ + netlink_kernel_release(sock_diag_nlsk); +} + +module_init(sock_diag_init); +module_exit(sock_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_SOCK_DIAG); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 77a65f03148..d05559d4d9c 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -68,8 +68,13 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write, if (sock_table != orig_sock_table) { rcu_assign_pointer(rps_sock_flow_table, sock_table); - synchronize_rcu(); - vfree(orig_sock_table); + if (sock_table) + jump_label_inc(&rps_needed); + if (orig_sock_table) { + jump_label_dec(&rps_needed); + synchronize_rcu(); + vfree(orig_sock_table); + } } } diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 98a52640e7c..661b5a40ec1 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -21,6 +21,7 @@ #include <linux/phy.h> #include <linux/ptp_classify.h> #include <linux/skbuff.h> +#include <linux/export.h> static struct sock_filter ptp_filter[] = { PTP_FILTER @@ -57,9 +58,13 @@ void skb_clone_tx_timestamp(struct sk_buff *skb) case PTP_CLASS_V2_VLAN: phydev = skb->dev->phydev; if (likely(phydev->drv->txtstamp)) { + if (!atomic_inc_not_zero(&sk->sk_refcnt)) + return; clone = skb_clone(skb, GFP_ATOMIC); - if (!clone) + if (!clone) { + sock_put(sk); return; + } clone->sk = sk; phydev->drv->txtstamp(phydev, clone, type); } @@ -77,8 +82,11 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, struct sock_exterr_skb *serr; int err; - if (!hwtstamps) + if (!hwtstamps) { + sock_put(sk); + kfree_skb(skb); return; + } *skb_hwtstamps(skb) = *hwtstamps; serr = SKB_EXT_ERR(skb); @@ -87,6 +95,7 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; skb->sk = NULL; err = sock_queue_err_skb(sk, skb); + sock_put(sk); if (err) kfree_skb(skb); } diff --git a/net/core/user_dma.c b/net/core/user_dma.c index 25d717ebc92..1b5fefdb819 100644 --- a/net/core/user_dma.c +++ b/net/core/user_dma.c @@ -27,6 +27,7 @@ #include <linux/dmaengine.h> #include <linux/socket.h> +#include <linux/export.h> #include <net/tcp.h> #include <net/netdma.h> @@ -71,14 +72,14 @@ int dma_skb_copy_datagram_iovec(struct dma_chan *chan, /* Copy paged appendix. Hmm... why does this look so complicated? */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON(start > offset + len); - end = start + skb_shinfo(skb)->frags[i].size; + end = start + skb_frag_size(frag); copy = end - offset; if (copy > 0) { - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - struct page *page = frag->page; + struct page *page = skb_frag_page(frag); if (copy > len) copy = len; |
